From e056416ddbd41aabdeba111a62c0f92b25a8c0ed Mon Sep 17 00:00:00 2001 From: Joshie Date: Mon, 22 Mar 2021 10:57:25 +0000 Subject: [PATCH] Add mm-soft-dirty patchset from Paul Gofman (#206) Needed for D3D12 APITracing under Wine --- PKGBUILD | 14 +- linux-tkg-config/prepare | 6 + ...t-soft-dirty-flag-reset-for-VA-range.patch | 244 ++++++++++++ ...port-soft-dirty-flag-read-with-reset.patch | 363 ++++++++++++++++++ ...t-soft-dirty-flag-reset-for-VA-range.patch | 244 ++++++++++++ ...port-soft-dirty-flag-read-with-reset.patch | 363 ++++++++++++++++++ 6 files changed, 1232 insertions(+), 2 deletions(-) create mode 100644 linux-tkg-patches/5.11/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch create mode 100644 linux-tkg-patches/5.11/0002-mm-Support-soft-dirty-flag-read-with-reset.patch create mode 100644 linux-tkg-patches/5.12/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch create mode 100644 linux-tkg-patches/5.12/0002-mm-Support-soft-dirty-flag-read-with-reset.patch diff --git a/PKGBUILD b/PKGBUILD index 3bf6443..c257111 100644 --- a/PKGBUILD +++ b/PKGBUILD @@ -368,6 +368,9 @@ case $_basever in 0009-prjc_v5.11-r2.patch #0012-linux-hardened.patch 0012-misc-additions.patch + # MM Dirty Soft for WRITE_WATCH support in Wine + 0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch + 0002-mm-Support-soft-dirty-flag-read-with-reset.patch ) sha256sums=('04f07b54f0d40adfab02ee6cbd2a942c96728d87c1ef9e120d0cb9ba3fe067b4' 'dffcabb33733c58344243d722f77f0b38616796b7ecb287892ccb975c91de077' @@ -393,7 +396,9 @@ case $_basever in '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' 'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' 'e394d4b7721f55837a8364c8311cb06cb5a59484de8aa8731e38d1aff2b7014e' - '7fb1104c167edb79ec8fbdcde97940ed0f806aa978bdd14d0c665a1d76d25c24') + '7fb1104c167edb79ec8fbdcde97940ed0f806aa978bdd14d0c665a1d76d25c24' + 'b1c6599d0e1ac9b66898d652ed99dae3fb8676d840a43ffa920a78d96e0521be' + 'b0319a7dff9c48b2f3e3d3597ee154bf92223149a633a8b7ce4026252db86da6') ;; 512) opt_ver="5.8%2B" @@ -426,6 +431,9 @@ case $_basever in #0009-prjc_v5.12-r0.patch #0012-linux-hardened.patch 0012-misc-additions.patch + # MM Dirty Soft for WRITE_WATCH support in Wine + 0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch + 0002-mm-Support-soft-dirty-flag-read-with-reset.patch ) sha256sums=('cb251b82131e29c575db3820c9ad62c044d07bcf8bf0bf38482ba55d52dc95b7' 'SKIP' @@ -439,7 +447,9 @@ case $_basever in '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' 'b302ba6c5bbe8ed19b20207505d513208fae1e678cf4d8e7ac0b154e5fe3f456' 'fc0a3274e3285278e925f4b3bfe803e5e610344bebe5bba063ba202dbaff49c8' - '7fb1104c167edb79ec8fbdcde97940ed0f806aa978bdd14d0c665a1d76d25c24') + '7fb1104c167edb79ec8fbdcde97940ed0f806aa978bdd14d0c665a1d76d25c24' + 'b1c6599d0e1ac9b66898d652ed99dae3fb8676d840a43ffa920a78d96e0521be' + 'b0319a7dff9c48b2f3e3d3597ee154bf92223149a633a8b7ce4026252db86da6') ;; esac diff --git a/linux-tkg-config/prepare b/linux-tkg-config/prepare index cbea872..f41c65e 100644 --- a/linux-tkg-config/prepare +++ b/linux-tkg-config/prepare @@ -405,6 +405,12 @@ _tkg_srcprep() { tkgpatch="$srcdir/0012-misc-additions.patch" && _tkg_patcher fi + if [ "$_basever" = "511" ] || [ "$_basever" = "512" ]; then + msg2 "Applying patches for WRITE_WATCH support in Wine" + tkgpatch="$srcdir/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch" && _tkg_patcher + tkgpatch="$srcdir/0002-mm-Support-soft-dirty-flag-read-with-reset.patch" && _tkg_patcher + fi + # prjc/bmq patch rev if [ 
"$_basever" = "58" ] || [ "$_basever" = "57" ]; then rev=3 diff --git a/linux-tkg-patches/5.11/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch b/linux-tkg-patches/5.11/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch new file mode 100644 index 0000000..ac03fd1 --- /dev/null +++ b/linux-tkg-patches/5.11/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch @@ -0,0 +1,244 @@ +From 5ae86c8436b83762bc6cf46bea1da6ace2d3f50e Mon Sep 17 00:00:00 2001 +From: Paul Gofman +Date: Wed, 6 May 2020 14:37:44 +0300 +Subject: [PATCH 1/2] mm: Support soft dirty flag reset for VA range. + +--- + fs/proc/task_mmu.c | 129 ++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 103 insertions(+), 26 deletions(-) + +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 3cec6fbef725..7c7865028f10 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1032,6 +1032,8 @@ enum clear_refs_types { + + struct clear_refs_private { + enum clear_refs_types type; ++ unsigned long start, end; ++ bool clear_range; + }; + + #ifdef CONFIG_MEM_SOFT_DIRTY +@@ -1125,6 +1127,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + spinlock_t *ptl; + struct page *page; + ++ BUG_ON(addr < cp->start || end > cp->end); ++ + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { +@@ -1181,9 +1185,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + +- if (vma->vm_flags & VM_PFNMAP) ++ if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) + return 1; + ++ BUG_ON(start < cp->start || end > cp->end); ++ + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. 
+@@ -1206,10 +1212,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { + struct task_struct *task; +- char buffer[PROC_NUMBUF]; ++ char buffer[18]; + struct mm_struct *mm; + struct vm_area_struct *vma; + enum clear_refs_types type; ++ unsigned long start, end; ++ bool clear_range; + int itype; + int rv; + +@@ -1218,12 +1226,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; +- rv = kstrtoint(strstrip(buffer), 10, &itype); +- if (rv < 0) +- return rv; +- type = (enum clear_refs_types)itype; +- if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) +- return -EINVAL; ++ ++ if (buffer[0] == '6') ++ { ++ static int once; ++ ++ if (!once++) ++ printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); ++ ++ if (count != 17) ++ return -EINVAL; ++ ++ type = CLEAR_REFS_SOFT_DIRTY; ++ start = *(unsigned long *)(buffer + 1); ++ end = *(unsigned long *)(buffer + 1 + 8); ++ } ++ else ++ { ++ rv = kstrtoint(strstrip(buffer), 10, &itype); ++ if (rv < 0) ++ return rv; ++ type = (enum clear_refs_types)itype; ++ ++ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) ++ return -EINVAL; ++ ++ start = 0; ++ end = -1UL; ++ } + + task = get_proc_task(file_inode(file)); + if (!task) +@@ -1235,41 +1265,87 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, + .type = type, + }; + +- if (mmap_write_lock_killable(mm)) { +- count = -EINTR; +- goto out_mm; ++ if (start || end != -1UL) ++ { ++ start = min(start, mm->highest_vm_end) & PAGE_MASK; ++ end = min(end, mm->highest_vm_end) & PAGE_MASK; ++ ++ if (start >= end) ++ { ++ count = -EINVAL; ++ goto out_mm; ++ } ++ clear_range = true; + } ++ else ++ { ++ clear_range = false; ++ } ++ ++ cp.start = start; ++ cp.end = end; ++ cp.clear_range = clear_range; ++ + if (type == CLEAR_REFS_MM_HIWATER_RSS) { ++ if (mmap_write_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; ++ } ++ + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. + */ + reset_mm_hiwater_rss(mm); +- goto out_unlock; ++ mmap_write_unlock(mm); ++ goto out_mm; + } + + if (type == CLEAR_REFS_SOFT_DIRTY) { +- for (vma = mm->mmap; vma; vma = vma->vm_next) { +- if (!(vma->vm_flags & VM_SOFTDIRTY)) +- continue; +- vma->vm_flags &= ~VM_SOFTDIRTY; +- vma_set_page_prot(vma); ++ if (mmap_read_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; + } +- ++ if (!clear_range) ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ if (!(vma->vm_flags & VM_SOFTDIRTY)) ++ continue; ++ mmap_read_unlock(mm); ++ if (mmap_write_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; ++ } ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ vma->vm_flags &= ~VM_SOFTDIRTY; ++ vma_set_page_prot(vma); ++ } ++ mmap_write_downgrade(mm); ++ break; ++ } + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, +- 0, NULL, mm, 0, -1UL); ++ 0, NULL, mm, start, end); + mmu_notifier_invalidate_range_start(&range); + } +- walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, ++ else ++ { ++ if (mmap_write_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; ++ } ++ } ++ walk_page_range(mm, start, end == -1UL ? 
mm->highest_vm_end : end, &clear_refs_walk_ops, + &cp); + if (type == CLEAR_REFS_SOFT_DIRTY) { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); ++ mmap_read_unlock(mm); ++ } ++ else ++ { ++ mmap_write_unlock(mm); + } +-out_unlock: +- mmap_write_unlock(mm); + out_mm: + mmput(mm); + } +@@ -1301,6 +1377,7 @@ struct pagemapread { + #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) + #define PM_SOFT_DIRTY BIT_ULL(55) + #define PM_MMAP_EXCLUSIVE BIT_ULL(56) ++#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) + #define PM_FILE BIT_ULL(61) + #define PM_SWAP BIT_ULL(62) + #define PM_PRESENT BIT_ULL(63) +@@ -1373,11 +1450,11 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, + flags |= PM_PRESENT; + page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + } else if (is_swap_pte(pte)) { + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + entry = pte_to_swp_entry(pte); + if (pm->show_pfn) + frame = swp_type(entry) | +@@ -1424,7 +1501,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + + flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); +@@ -1442,7 +1519,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + } + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + page = migration_entry_to_page(entry); + } +-- +2.30.2 + diff --git a/linux-tkg-patches/5.11/0002-mm-Support-soft-dirty-flag-read-with-reset.patch b/linux-tkg-patches/5.11/0002-mm-Support-soft-dirty-flag-read-with-reset.patch new file mode 100644 index 0000000..7dfc910 --- /dev/null +++ b/linux-tkg-patches/5.11/0002-mm-Support-soft-dirty-flag-read-with-reset.patch @@ -0,0 +1,363 @@ +From 9c85113cf4019e7b277a44e72bda8b78347aa72f Mon Sep 17 00:00:00 2001 +From: Paul Gofman +Date: Thu, 7 May 2020 14:05:31 +0300 +Subject: [PATCH 2/2] mm: Support soft dirty flag read with reset. 
+ +--- + fs/proc/base.c | 3 + + fs/proc/internal.h | 1 + + fs/proc/task_mmu.c | 144 +++++++++++++++++++++++++++++++++++++++------ + 3 files changed, 130 insertions(+), 18 deletions(-) + +diff --git a/fs/proc/base.c b/fs/proc/base.c +index b3422cda2a91..8199ae2411ca 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -3202,6 +3202,9 @@ static const struct pid_entry tgid_base_stuff[] = { + REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), ++#ifdef CONFIG_MEM_SOFT_DIRTY ++ REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), ++#endif + #endif + #ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +diff --git a/fs/proc/internal.h b/fs/proc/internal.h +index f60b379dcdc7..36a901cf0e7f 100644 +--- a/fs/proc/internal.h ++++ b/fs/proc/internal.h +@@ -303,6 +303,7 @@ extern const struct file_operations proc_pid_smaps_operations; + extern const struct file_operations proc_pid_smaps_rollup_operations; + extern const struct file_operations proc_clear_refs_operations; + extern const struct file_operations proc_pagemap_operations; ++extern const struct file_operations proc_pagemap_reset_operations; + + extern unsigned long task_vsize(struct mm_struct *); + extern unsigned long task_statm(struct mm_struct *, +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 7c7865028f10..a21694967915 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1056,8 +1056,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, + return page_maybe_dma_pinned(page); + } + +-static inline void clear_soft_dirty(struct vm_area_struct *vma, +- unsigned long addr, pte_t *pte) ++static inline bool clear_soft_dirty(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *pte) + { + /* + * The soft-dirty tracker uses #PF-s to catch writes +@@ -1066,37 +1066,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, + * of how soft-dirty works. 
+ */ + pte_t ptent = *pte; ++ bool ret = false; + + if (pte_present(ptent)) { + pte_t old_pte; + + if (pte_is_pinned(vma, addr, ptent)) +- return; ++ return ret; + old_pte = ptep_modify_prot_start(vma, addr, pte); ++ ret = pte_soft_dirty(old_pte); + ptent = pte_wrprotect(old_pte); + ptent = pte_clear_soft_dirty(ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (is_swap_pte(ptent)) { ++ ret = pte_swp_soft_dirty(ptent); + ptent = pte_swp_clear_soft_dirty(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } ++ return ret; + } + #else +-static inline void clear_soft_dirty(struct vm_area_struct *vma, ++static inline bool clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) + { ++ return false; + } + #endif + + #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, ++static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { + pmd_t old, pmd = *pmdp; ++ bool ret = false; + + if (pmd_present(pmd)) { + /* See comment in change_huge_pmd() */ + old = pmdp_invalidate(vma, addr, pmdp); ++ ++ ret = pmd_soft_dirty(old); ++ + if (pmd_dirty(old)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(old)) +@@ -1107,14 +1116,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { ++ ret = pmd_swp_soft_dirty(pmd); + pmd = pmd_swp_clear_soft_dirty(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } ++ return ret; + } + #else +-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, ++static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { ++ return false; + } + #endif + +@@ -1367,6 +1379,7 @@ struct pagemapread { + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ + pagemap_entry_t *buffer; + bool show_pfn; ++ bool reset; + }; + + #define PAGEMAP_WALK_SIZE (PMD_SIZE) +@@ -1398,6 +1411,14 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, + return 0; + } + ++static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) ++{ ++ ((unsigned long *)pm->buffer)[pm->pos++] = addr; ++ if (pm->pos >= pm->len) ++ return PM_END_OF_BUFFER; ++ return 0; ++} ++ + static int pagemap_pte_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) + { +@@ -1405,6 +1426,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, + unsigned long addr = start; + int err = 0; + ++ if (pm->reset) ++ goto out; ++ + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(0, 0); +@@ -1439,8 +1463,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, + } + + static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, +- struct vm_area_struct *vma, unsigned long addr, pte_t pte) ++ struct vm_area_struct *vma, unsigned long addr, pte_t *pte_addr) + { ++ pte_t pte = *pte_addr; + u64 frame = 0, flags = 0; + struct page *page = NULL; + +@@ -1493,6 +1518,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + pmd_t pmd = *pmdp; + struct page *page = NULL; + ++ if (pm->reset) ++ { ++ if (clear_soft_dirty_pmd(vma, addr, pmdp)) ++ { ++ for (; addr != end; addr += PAGE_SIZE) ++ { ++ err = add_addr_to_pagemap(addr, pm); ++ if (err) ++ break; ++ } ++ } ++ goto trans_huge_done; ++ } ++ + 
if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; + +@@ -1541,6 +1580,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + frame += (1 << MAX_SWAPFILES_SHIFT); + } + } ++trans_huge_done: + spin_unlock(ptl); + return err; + } +@@ -1555,10 +1595,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + */ + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + for (; addr < end; pte++, addr += PAGE_SIZE) { +- pagemap_entry_t pme; ++ if (pm->reset) ++ { ++ if (clear_soft_dirty(vma, addr, pte)) ++ err = add_addr_to_pagemap(addr, pm); ++ } ++ else ++ { ++ pagemap_entry_t pme; + +- pme = pte_to_pagemap_entry(pm, vma, addr, *pte); +- err = add_to_pagemap(addr, &pme, pm); ++ pme = pte_to_pagemap_entry(pm, vma, addr, pte); ++ err = add_to_pagemap(addr, &pme, pm); ++ } + if (err) + break; + } +@@ -1650,8 +1698,8 @@ static const struct mm_walk_ops pagemap_ops = { + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. + */ +-static ssize_t pagemap_read(struct file *file, char __user *buf, +- size_t count, loff_t *ppos) ++static ssize_t do_pagemap_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos, bool reset) + { + struct mm_struct *mm = file->private_data; + struct pagemapread pm; +@@ -1660,6 +1708,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + unsigned long start_vaddr; + unsigned long end_vaddr; + int ret = 0, copied = 0; ++ struct mmu_notifier_range range; ++ size_t buffer_len; + + if (!mm || !mmget_not_zero(mm)) + goto out; +@@ -1675,19 +1725,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + + /* do not disclose physical addresses: attack vector */ + pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); ++ pm.reset = reset; + +- pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); +- pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ++ buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); ++ ++ pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); + ret = -ENOMEM; + if (!pm.buffer) + goto out_mm; + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; +- end_vaddr = mm->task_size; ++ ++ start_vaddr = svpfn << PAGE_SHIFT; ++ ++ if (reset) ++ { ++ if (count < sizeof(end_vaddr)) ++ { ++ ret = -EINVAL; ++ goto out_mm; ++ } ++ if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) ++ return -EFAULT; ++ end_vaddr = min(end_vaddr, mm->task_size); ++ } ++ else ++ { ++ end_vaddr = mm->task_size; ++ start_vaddr = end_vaddr; ++ } + + /* watch out for wraparound */ +- start_vaddr = end_vaddr; + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) + start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + +@@ -1707,18 +1776,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + unsigned long end; + + pm.pos = 0; +- end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; ++ pm.len = min(buffer_len, count / PM_ENTRY_BYTES); ++ ++ end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); + /* overflow ? 
*/ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; ++ + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; ++ ++ if (reset) ++ { ++ inc_tlb_flush_pending(mm); ++ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, ++ 0, NULL, mm, start_vaddr, end); ++ mmu_notifier_invalidate_range_start(&range); ++ } + ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); ++ if (reset) ++ { ++ mmu_notifier_invalidate_range_end(&range); ++ flush_tlb_mm(mm); ++ dec_tlb_flush_pending(mm); ++ } + mmap_read_unlock(mm); +- start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); ++ BUG_ON(ret && ret != PM_END_OF_BUFFER); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_free; +@@ -1726,6 +1812,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + copied += len; + buf += len; + count -= len; ++ ++ start_vaddr = reset && pm.pos == pm.len ? ((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end; + } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) +@@ -1739,6 +1827,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + return ret; + } + ++static ssize_t pagemap_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ return do_pagemap_read(file, buf, count, ppos, false); ++} ++ ++static ssize_t pagemap_reset_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ return do_pagemap_read(file, buf, count, ppos, true); ++} ++ + static int pagemap_open(struct inode *inode, struct file *file) + { + struct mm_struct *mm; +@@ -1765,6 +1865,14 @@ const struct file_operations proc_pagemap_operations = { + .open = pagemap_open, + .release = pagemap_release, + }; ++ ++const struct file_operations proc_pagemap_reset_operations = { ++ .llseek = mem_lseek, /* borrow this */ ++ .read = pagemap_reset_read, ++ .open = pagemap_open, ++ .release = pagemap_release, ++}; ++ + #endif /* CONFIG_PROC_PAGE_MONITOR */ + + #ifdef CONFIG_NUMA +-- +2.30.2 + diff --git a/linux-tkg-patches/5.12/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch b/linux-tkg-patches/5.12/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch new file mode 100644 index 0000000..ac03fd1 --- /dev/null +++ b/linux-tkg-patches/5.12/0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch @@ -0,0 +1,244 @@ +From 5ae86c8436b83762bc6cf46bea1da6ace2d3f50e Mon Sep 17 00:00:00 2001 +From: Paul Gofman +Date: Wed, 6 May 2020 14:37:44 +0300 +Subject: [PATCH 1/2] mm: Support soft dirty flag reset for VA range. 
+ +--- + fs/proc/task_mmu.c | 129 ++++++++++++++++++++++++++++++++++++--------- + 1 file changed, 103 insertions(+), 26 deletions(-) + +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 3cec6fbef725..7c7865028f10 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1032,6 +1032,8 @@ enum clear_refs_types { + + struct clear_refs_private { + enum clear_refs_types type; ++ unsigned long start, end; ++ bool clear_range; + }; + + #ifdef CONFIG_MEM_SOFT_DIRTY +@@ -1125,6 +1127,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + spinlock_t *ptl; + struct page *page; + ++ BUG_ON(addr < cp->start || end > cp->end); ++ + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { +@@ -1181,9 +1185,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + +- if (vma->vm_flags & VM_PFNMAP) ++ if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP)) + return 1; + ++ BUG_ON(start < cp->start || end > cp->end); ++ + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. +@@ -1206,10 +1212,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { + struct task_struct *task; +- char buffer[PROC_NUMBUF]; ++ char buffer[18]; + struct mm_struct *mm; + struct vm_area_struct *vma; + enum clear_refs_types type; ++ unsigned long start, end; ++ bool clear_range; + int itype; + int rv; + +@@ -1218,12 +1226,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; +- rv = kstrtoint(strstrip(buffer), 10, &itype); +- if (rv < 0) +- return rv; +- type = (enum clear_refs_types)itype; +- if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) +- return -EINVAL; ++ ++ if (buffer[0] == '6') ++ { ++ static int once; ++ ++ if (!once++) ++ printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n"); ++ ++ if (count != 17) ++ return -EINVAL; ++ ++ type = CLEAR_REFS_SOFT_DIRTY; ++ start = *(unsigned long *)(buffer + 1); ++ end = *(unsigned long *)(buffer + 1 + 8); ++ } ++ else ++ { ++ rv = kstrtoint(strstrip(buffer), 10, &itype); ++ if (rv < 0) ++ return rv; ++ type = (enum clear_refs_types)itype; ++ ++ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST) ++ return -EINVAL; ++ ++ start = 0; ++ end = -1UL; ++ } + + task = get_proc_task(file_inode(file)); + if (!task) +@@ -1235,41 +1265,87 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, + .type = type, + }; + +- if (mmap_write_lock_killable(mm)) { +- count = -EINTR; +- goto out_mm; ++ if (start || end != -1UL) ++ { ++ start = min(start, mm->highest_vm_end) & PAGE_MASK; ++ end = min(end, mm->highest_vm_end) & PAGE_MASK; ++ ++ if (start >= end) ++ { ++ count = -EINVAL; ++ goto out_mm; ++ } ++ clear_range = true; + } ++ else ++ { ++ clear_range = false; ++ } ++ ++ cp.start = start; ++ cp.end = end; ++ cp.clear_range = clear_range; ++ + if (type == CLEAR_REFS_MM_HIWATER_RSS) { ++ if (mmap_write_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; ++ } ++ + /* + * Writing 5 to /proc/pid/clear_refs resets the peak + * resident set size to this mm's current rss value. 
+ */ + reset_mm_hiwater_rss(mm); +- goto out_unlock; ++ mmap_write_unlock(mm); ++ goto out_mm; + } + + if (type == CLEAR_REFS_SOFT_DIRTY) { +- for (vma = mm->mmap; vma; vma = vma->vm_next) { +- if (!(vma->vm_flags & VM_SOFTDIRTY)) +- continue; +- vma->vm_flags &= ~VM_SOFTDIRTY; +- vma_set_page_prot(vma); ++ if (mmap_read_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; + } +- ++ if (!clear_range) ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ if (!(vma->vm_flags & VM_SOFTDIRTY)) ++ continue; ++ mmap_read_unlock(mm); ++ if (mmap_write_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; ++ } ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ vma->vm_flags &= ~VM_SOFTDIRTY; ++ vma_set_page_prot(vma); ++ } ++ mmap_write_downgrade(mm); ++ break; ++ } + inc_tlb_flush_pending(mm); + mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, +- 0, NULL, mm, 0, -1UL); ++ 0, NULL, mm, start, end); + mmu_notifier_invalidate_range_start(&range); + } +- walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, ++ else ++ { ++ if (mmap_write_lock_killable(mm)) { ++ count = -EINTR; ++ goto out_mm; ++ } ++ } ++ walk_page_range(mm, start, end == -1UL ? mm->highest_vm_end : end, &clear_refs_walk_ops, + &cp); + if (type == CLEAR_REFS_SOFT_DIRTY) { + mmu_notifier_invalidate_range_end(&range); + flush_tlb_mm(mm); + dec_tlb_flush_pending(mm); ++ mmap_read_unlock(mm); ++ } ++ else ++ { ++ mmap_write_unlock(mm); + } +-out_unlock: +- mmap_write_unlock(mm); + out_mm: + mmput(mm); + } +@@ -1301,6 +1377,7 @@ struct pagemapread { + #define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0) + #define PM_SOFT_DIRTY BIT_ULL(55) + #define PM_MMAP_EXCLUSIVE BIT_ULL(56) ++#define PM_SOFT_DIRTY_PAGE BIT_ULL(57) + #define PM_FILE BIT_ULL(61) + #define PM_SWAP BIT_ULL(62) + #define PM_PRESENT BIT_ULL(63) +@@ -1373,11 +1450,11 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, + flags |= PM_PRESENT; + page = vm_normal_page(vma, addr, pte); + if (pte_soft_dirty(pte)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + } else if (is_swap_pte(pte)) { + swp_entry_t entry; + if (pte_swp_soft_dirty(pte)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + entry = pte_to_swp_entry(pte); + if (pm->show_pfn) + frame = swp_type(entry) | +@@ -1424,7 +1501,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + + flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); +@@ -1442,7 +1519,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + } + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) +- flags |= PM_SOFT_DIRTY; ++ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + page = migration_entry_to_page(entry); + } +-- +2.30.2 + diff --git a/linux-tkg-patches/5.12/0002-mm-Support-soft-dirty-flag-read-with-reset.patch b/linux-tkg-patches/5.12/0002-mm-Support-soft-dirty-flag-read-with-reset.patch new file mode 100644 index 0000000..7dfc910 --- /dev/null +++ b/linux-tkg-patches/5.12/0002-mm-Support-soft-dirty-flag-read-with-reset.patch @@ -0,0 +1,363 @@ +From 9c85113cf4019e7b277a44e72bda8b78347aa72f Mon Sep 17 00:00:00 2001 +From: Paul Gofman +Date: Thu, 7 May 2020 14:05:31 +0300 +Subject: [PATCH 2/2] mm: Support soft dirty flag read with reset. 
+ +--- + fs/proc/base.c | 3 + + fs/proc/internal.h | 1 + + fs/proc/task_mmu.c | 144 +++++++++++++++++++++++++++++++++++++++------ + 3 files changed, 130 insertions(+), 18 deletions(-) + +diff --git a/fs/proc/base.c b/fs/proc/base.c +index b3422cda2a91..8199ae2411ca 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -3202,6 +3202,9 @@ static const struct pid_entry tgid_base_stuff[] = { + REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), ++#ifdef CONFIG_MEM_SOFT_DIRTY ++ REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations), ++#endif + #endif + #ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), +diff --git a/fs/proc/internal.h b/fs/proc/internal.h +index f60b379dcdc7..36a901cf0e7f 100644 +--- a/fs/proc/internal.h ++++ b/fs/proc/internal.h +@@ -303,6 +303,7 @@ extern const struct file_operations proc_pid_smaps_operations; + extern const struct file_operations proc_pid_smaps_rollup_operations; + extern const struct file_operations proc_clear_refs_operations; + extern const struct file_operations proc_pagemap_operations; ++extern const struct file_operations proc_pagemap_reset_operations; + + extern unsigned long task_vsize(struct mm_struct *); + extern unsigned long task_statm(struct mm_struct *, +diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c +index 7c7865028f10..a21694967915 100644 +--- a/fs/proc/task_mmu.c ++++ b/fs/proc/task_mmu.c +@@ -1056,8 +1056,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, + return page_maybe_dma_pinned(page); + } + +-static inline void clear_soft_dirty(struct vm_area_struct *vma, +- unsigned long addr, pte_t *pte) ++static inline bool clear_soft_dirty(struct vm_area_struct *vma, ++ unsigned long addr, pte_t *pte) + { + /* + * The soft-dirty tracker uses #PF-s to catch writes +@@ -1066,37 +1066,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, + * of how soft-dirty works. 
+ */ + pte_t ptent = *pte; ++ bool ret = false; + + if (pte_present(ptent)) { + pte_t old_pte; + + if (pte_is_pinned(vma, addr, ptent)) +- return; ++ return ret; + old_pte = ptep_modify_prot_start(vma, addr, pte); ++ ret = pte_soft_dirty(old_pte); + ptent = pte_wrprotect(old_pte); + ptent = pte_clear_soft_dirty(ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (is_swap_pte(ptent)) { ++ ret = pte_swp_soft_dirty(ptent); + ptent = pte_swp_clear_soft_dirty(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } ++ return ret; + } + #else +-static inline void clear_soft_dirty(struct vm_area_struct *vma, ++static inline bool clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) + { ++ return false; + } + #endif + + #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, ++static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { + pmd_t old, pmd = *pmdp; ++ bool ret = false; + + if (pmd_present(pmd)) { + /* See comment in change_huge_pmd() */ + old = pmdp_invalidate(vma, addr, pmdp); ++ ++ ret = pmd_soft_dirty(old); ++ + if (pmd_dirty(old)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(old)) +@@ -1107,14 +1116,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { ++ ret = pmd_swp_soft_dirty(pmd); + pmd = pmd_swp_clear_soft_dirty(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } ++ return ret; + } + #else +-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, ++static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) + { ++ return false; + } + #endif + +@@ -1367,6 +1379,7 @@ struct pagemapread { + int pos, len; /* units: PM_ENTRY_BYTES, not bytes */ + pagemap_entry_t *buffer; + bool show_pfn; ++ bool reset; + }; + + #define PAGEMAP_WALK_SIZE (PMD_SIZE) +@@ -1398,6 +1411,14 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme, + return 0; + } + ++static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm) ++{ ++ ((unsigned long *)pm->buffer)[pm->pos++] = addr; ++ if (pm->pos >= pm->len) ++ return PM_END_OF_BUFFER; ++ return 0; ++} ++ + static int pagemap_pte_hole(unsigned long start, unsigned long end, + __always_unused int depth, struct mm_walk *walk) + { +@@ -1405,6 +1426,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, + unsigned long addr = start; + int err = 0; + ++ if (pm->reset) ++ goto out; ++ + while (addr < end) { + struct vm_area_struct *vma = find_vma(walk->mm, addr); + pagemap_entry_t pme = make_pme(0, 0); +@@ -1439,8 +1463,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, + } + + static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, +- struct vm_area_struct *vma, unsigned long addr, pte_t pte) ++ struct vm_area_struct *vma, unsigned long addr, pte_t *pte_addr) + { ++ pte_t pte = *pte_addr; + u64 frame = 0, flags = 0; + struct page *page = NULL; + +@@ -1493,6 +1518,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + pmd_t pmd = *pmdp; + struct page *page = NULL; + ++ if (pm->reset) ++ { ++ if (clear_soft_dirty_pmd(vma, addr, pmdp)) ++ { ++ for (; addr != end; addr += PAGE_SIZE) ++ { ++ err = add_addr_to_pagemap(addr, pm); ++ if (err) ++ break; ++ } ++ } ++ goto trans_huge_done; ++ } ++ + 
if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; + +@@ -1541,6 +1580,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + frame += (1 << MAX_SWAPFILES_SHIFT); + } + } ++trans_huge_done: + spin_unlock(ptl); + return err; + } +@@ -1555,10 +1595,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + */ + orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl); + for (; addr < end; pte++, addr += PAGE_SIZE) { +- pagemap_entry_t pme; ++ if (pm->reset) ++ { ++ if (clear_soft_dirty(vma, addr, pte)) ++ err = add_addr_to_pagemap(addr, pm); ++ } ++ else ++ { ++ pagemap_entry_t pme; + +- pme = pte_to_pagemap_entry(pm, vma, addr, *pte); +- err = add_to_pagemap(addr, &pme, pm); ++ pme = pte_to_pagemap_entry(pm, vma, addr, pte); ++ err = add_to_pagemap(addr, &pme, pm); ++ } + if (err) + break; + } +@@ -1650,8 +1698,8 @@ static const struct mm_walk_ops pagemap_ops = { + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. + */ +-static ssize_t pagemap_read(struct file *file, char __user *buf, +- size_t count, loff_t *ppos) ++static ssize_t do_pagemap_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos, bool reset) + { + struct mm_struct *mm = file->private_data; + struct pagemapread pm; +@@ -1660,6 +1708,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + unsigned long start_vaddr; + unsigned long end_vaddr; + int ret = 0, copied = 0; ++ struct mmu_notifier_range range; ++ size_t buffer_len; + + if (!mm || !mmget_not_zero(mm)) + goto out; +@@ -1675,19 +1725,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + + /* do not disclose physical addresses: attack vector */ + pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); ++ pm.reset = reset; + +- pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); +- pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL); ++ buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES); ++ ++ pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL); + ret = -ENOMEM; + if (!pm.buffer) + goto out_mm; + + src = *ppos; + svpfn = src / PM_ENTRY_BYTES; +- end_vaddr = mm->task_size; ++ ++ start_vaddr = svpfn << PAGE_SHIFT; ++ ++ if (reset) ++ { ++ if (count < sizeof(end_vaddr)) ++ { ++ ret = -EINVAL; ++ goto out_mm; ++ } ++ if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) ++ return -EFAULT; ++ end_vaddr = min(end_vaddr, mm->task_size); ++ } ++ else ++ { ++ end_vaddr = mm->task_size; ++ start_vaddr = end_vaddr; ++ } + + /* watch out for wraparound */ +- start_vaddr = end_vaddr; + if (svpfn <= (ULONG_MAX >> PAGE_SHIFT)) + start_vaddr = untagged_addr(svpfn << PAGE_SHIFT); + +@@ -1707,18 +1776,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + unsigned long end; + + pm.pos = 0; +- end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK; ++ pm.len = min(buffer_len, count / PM_ENTRY_BYTES); ++ ++ end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT)); + /* overflow ? 
*/ + if (end < start_vaddr || end > end_vaddr) + end = end_vaddr; ++ + ret = mmap_read_lock_killable(mm); + if (ret) + goto out_free; ++ ++ if (reset) ++ { ++ inc_tlb_flush_pending(mm); ++ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, ++ 0, NULL, mm, start_vaddr, end); ++ mmu_notifier_invalidate_range_start(&range); ++ } + ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm); ++ if (reset) ++ { ++ mmu_notifier_invalidate_range_end(&range); ++ flush_tlb_mm(mm); ++ dec_tlb_flush_pending(mm); ++ } + mmap_read_unlock(mm); +- start_vaddr = end; + + len = min(count, PM_ENTRY_BYTES * pm.pos); ++ BUG_ON(ret && ret != PM_END_OF_BUFFER); + if (copy_to_user(buf, pm.buffer, len)) { + ret = -EFAULT; + goto out_free; +@@ -1726,6 +1812,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + copied += len; + buf += len; + count -= len; ++ ++ start_vaddr = reset && pm.pos == pm.len ? ((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end; + } + *ppos += copied; + if (!ret || ret == PM_END_OF_BUFFER) +@@ -1739,6 +1827,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, + return ret; + } + ++static ssize_t pagemap_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ return do_pagemap_read(file, buf, count, ppos, false); ++} ++ ++static ssize_t pagemap_reset_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ return do_pagemap_read(file, buf, count, ppos, true); ++} ++ + static int pagemap_open(struct inode *inode, struct file *file) + { + struct mm_struct *mm; +@@ -1765,6 +1865,14 @@ const struct file_operations proc_pagemap_operations = { + .open = pagemap_open, + .release = pagemap_release, + }; ++ ++const struct file_operations proc_pagemap_reset_operations = { ++ .llseek = mem_lseek, /* borrow this */ ++ .read = pagemap_reset_read, ++ .open = pagemap_open, ++ .release = pagemap_release, ++}; ++ + #endif /* CONFIG_PROC_PAGE_MONITOR */ + + #ifdef CONFIG_NUMA +-- +2.30.2 +
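
For reference, a minimal userspace sketch of the range-limited soft-dirty reset that the first patch adds to /proc/<pid>/clear_refs. The command byte '6', the 17-byte write size, and the two raw unsigned longs (start, end) mirror the parsing in clear_refs_write(); the helper name, the error handling, and the assumption of a 64-bit native unsigned long are illustrative only, and the interface itself is the proof-of-concept described by the patch's printk, not a mainline ABI.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

/* Reset soft-dirty bits for [start, end) in the target process. */
static int reset_soft_dirty_range(pid_t pid, uint64_t start, uint64_t end)
{
    char path[64];
    unsigned char buf[17];            /* '6' + 8-byte start + 8-byte end */
    int fd, ret = -1;

    snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
    fd = open(path, O_WRONLY);
    if (fd < 0)
        return -1;

    buf[0] = '6';                     /* POC range-reset command per clear_refs_write() */
    memcpy(buf + 1, &start, 8);       /* raw native unsigned long, 64-bit assumed */
    memcpy(buf + 9, &end, 8);

    if (write(fd, buf, sizeof(buf)) == (ssize_t)sizeof(buf))
        ret = 0;

    close(fd);
    return ret;
}

And a sketch of the read-with-reset path added by the second patch. Per do_pagemap_read() with reset == true, the file offset of /proc/<pid>/pagemap_reset encodes the start address ((start / page_size) * 8, one 8-byte entry per page) while the first 8 bytes of the read buffer carry the end address; the kernel then clears the soft-dirty bits in that range and fills the buffer with the addresses of the pages that were dirty, one unsigned long per entry. Only the offset and buffer conventions come from the patch; the function name and error handling below are illustrative, and Wine's actual WRITE_WATCH plumbing may differ.

/* Clear soft-dirty in [start, end) and collect the addresses that were dirty.
 * addrs[] must hold at least one entry (the end address is passed through it).
 * Returns the number of addresses stored in addrs[], or -1 on error. */
static ssize_t read_and_reset_dirty(pid_t pid, uint64_t start, uint64_t end,
                                    uint64_t *addrs, size_t max_addrs)
{
    long page_size = sysconf(_SC_PAGESIZE);
    char path[64];
    ssize_t n;
    int fd;

    snprintf(path, sizeof(path), "/proc/%d/pagemap_reset", (int)pid);
    fd = open(path, O_RDONLY);
    if (fd < 0)
        return -1;

    /* The offset selects the start page: offset / 8 == start virtual PFN. */
    if (lseek(fd, (off_t)(start / (uint64_t)page_size) * 8, SEEK_SET) < 0) {
        close(fd);
        return -1;
    }

    addrs[0] = end;                   /* end address in, dirty page addresses out */
    n = read(fd, addrs, max_addrs * sizeof(*addrs));
    close(fd);

    return n < 0 ? -1 : n / (ssize_t)sizeof(*addrs);
}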