diff --git a/PKGBUILD b/PKGBUILD index 88b02d4..b0e2ba5 100644 --- a/PKGBUILD +++ b/PKGBUILD @@ -91,7 +91,7 @@ case $_basever in 0012-linux-hardened.patch ) sha256sums=('bf338980b1670bca287f9994b7441c2361907635879169c64ae78364efc5f491' - '73b34babbf3e1bf143b2d3496c8b9765b1a3bb0179dd30cf0604a6af464f4a37' + '69e2883eb909bb3bd2537236806b78f5c0ab093b56c03e0108c84476b60564ec' 'SKIP' '55dd5117c1da17c9ec38d7bc995958958bcc8b7ebcfd81de1d4c7650b85537ab' '1f4a20d6eaaa0d969af93152a65191492400c6aa838fc1c290b0dd29bb6019d8' @@ -260,7 +260,7 @@ case $_basever in 0012-misc-additions.patch ) sha256sums=('3239a4ee1250bf2048be988cc8cb46c487b2c8a0de5b1b032d38394d5c6b1a06' - 'fb324619e9785bd2aaf10ad428482046738440e7888865fe9eef3095151f74ac' + 'a0f35d02672add81985b2b3e899efafb119cb3a4587e0bfbe5a0ad19b739e993' 'SKIP' '958333f18de79c19ccf9eccb4e16e2a217a0619a1d96c2c65ccba23628815bab' '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' @@ -278,7 +278,7 @@ case $_basever in 'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320' '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' 'b302ba6c5bbe8ed19b20207505d513208fae1e678cf4d8e7ac0b154e5fe3f456' - '7aba0a625404ed78e73c57860871af3b52610ae5196407286811322e3cd76aa3' + '14a261f1940a2b21b6b14df7391fc2c6274694bcfabfac3d0e985a67285dbfe7' '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' 'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' '11d2343174e5486e8ea1a1e98f9f6f1a1625043f6547484f5a729a83f94336eb' diff --git a/linux-tkg-config/prepare b/linux-tkg-config/prepare index 4315eee..4f89bff 100644 --- a/linux-tkg-config/prepare +++ b/linux-tkg-config/prepare @@ -1,9 +1,9 @@ #!/bin/bash -ver54=79 +ver54=80 ver57=19 ver58=18 -ver59=10 +ver59=11 ver510=rc5 _tkg_initscript() { diff --git a/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch b/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch index 6c9d1fe..5f620c2 100644 --- a/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch +++ b/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch @@ -777,10 +777,10 @@ index 1c7b0e3f6daa..8afa8e3bc14f 100644 obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000000..5594af719b2a +index 000000000000..57c5d58c2d87 --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,50 @@ +@@ -0,0 +1,51 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support" @@ -803,6 +803,7 @@ index 000000000000..5594af719b2a + select SIXLOCKS + select RAID6_PQ + select XOR_BLOCKS ++ select SRCU + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. 
@@ -833,10 +834,10 @@ index 000000000000..5594af719b2a + Include some unit and performance tests for the core btree code diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 -index 000000000000..d85ced62c0dd +index 000000000000..2fbf978424ed --- /dev/null +++ b/fs/bcachefs/Makefile -@@ -0,0 +1,59 @@ +@@ -0,0 +1,60 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + @@ -895,6 +896,7 @@ index 000000000000..d85ced62c0dd + tests.o \ + trace.o \ + util.o \ ++ varint.o \ + xattr.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 @@ -1357,7 +1359,7 @@ index 000000000000..cb62d502a7ff +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000000..97508de9f721 +index 000000000000..2dd8a37f29e7 --- /dev/null +++ b/fs/bcachefs/alloc_background.c @@ -0,0 +1,1477 @@ @@ -2819,7 +2821,7 @@ index 000000000000..97508de9f721 + return 0; + + p = kthread_create(bch2_allocator_thread, ca, -+ "bch_alloc[%s]", ca->name); ++ "bch-alloc/%s", ca->name); + if (IS_ERR(p)) + return PTR_ERR(p); + @@ -2840,7 +2842,7 @@ index 000000000000..97508de9f721 +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000000..cbaff56f7473 +index 000000000000..d10ff56e4de1 --- /dev/null +++ b/fs/bcachefs/alloc_background.h @@ -0,0 +1,105 @@ @@ -2922,7 +2924,7 @@ index 000000000000..cbaff56f7473 +static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) +{ -+ if (expensive_debug_checks(c)) { ++ if (bch2_expensive_debug_checks) { + size_t iter; + long i; + unsigned j; @@ -4210,10 +4212,10 @@ index 000000000000..20705460bb0a +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000000..29f411635f29 +index 000000000000..6d54defcee58 --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,882 @@ +@@ -0,0 +1,898 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -4409,6 +4411,7 @@ index 000000000000..29f411635f29 +#include +#include +#include ++#include +#include +#include +#include @@ -4481,6 +4484,8 @@ index 000000000000..29f411635f29 + BCH_DEBUG_PARAM(debug_check_bkeys, \ + "Run bkey_debugcheck (primarily checking GC/allocation "\ + "information) when iterating over keys") \ ++ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ ++ "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ @@ -4511,6 +4516,16 @@ index 000000000000..29f411635f29 +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() +#endif + ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#ifndef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++#endif ++ +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ @@ -4745,6 +4760,10 @@ index 000000000000..29f411635f29 + u64 journal_seq_base; +}; + ++struct btree_iter_buf { ++ struct btree_iter *iter; ++}; ++ +struct bch_fs { + struct closure cl; + @@ -4840,13 +4859,15 @@ index 000000000000..29f411635f29 + struct mutex btree_trans_lock; + struct list_head btree_trans_list; + mempool_t btree_iters_pool; ++ struct btree_iter_buf __percpu 
*btree_iters_bufs; ++ ++ struct srcu_struct btree_trans_barrier; + + struct btree_key_cache btree_key_cache; + + struct workqueue_struct *wq; + /* copygc needs its own workqueue for index updates.. */ + struct workqueue_struct *copygc_wq; -+ struct workqueue_struct *journal_reclaim_wq; + + /* ALLOCATION */ + struct delayed_work pd_controllers_update; @@ -5017,7 +5038,8 @@ index 000000000000..29f411635f29 + struct mutex verify_lock; +#endif + -+ u64 unused_inode_hint; ++ u64 *unused_inode_hints; ++ unsigned inode_shard_bits; + + /* + * A btree node on disk could have too many bsets for an iterator to fit @@ -5042,10 +5064,6 @@ index 000000000000..29f411635f29 + unsigned copy_gc_enabled:1; + bool promote_whole_extents; + -+#define BCH_DEBUG_PARAM(name, description) bool name; -+ BCH_DEBUG_PARAMS_ALL() -+#undef BCH_DEBUG_PARAM -+ + struct time_stats times[BCH_TIME_STAT_NR]; +}; + @@ -5098,10 +5116,10 @@ index 000000000000..29f411635f29 +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000000..2926c648a17f +index 000000000000..94b5418587e3 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,1680 @@ +@@ -0,0 +1,1683 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -5773,10 +5791,10 @@ index 000000000000..2926c648a17f +} __attribute__((packed, aligned(8))); + +#define BCH_INODE_FIELDS() \ -+ x(bi_atime, 64) \ -+ x(bi_ctime, 64) \ -+ x(bi_mtime, 64) \ -+ x(bi_otime, 64) \ ++ x(bi_atime, 96) \ ++ x(bi_ctime, 96) \ ++ x(bi_mtime, 96) \ ++ x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ @@ -5843,7 +5861,8 @@ index 000000000000..2926c648a17f +#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); ++LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +/* Dirents */ + @@ -6434,13 +6453,15 @@ index 000000000000..2926c648a17f + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) \ + x(btree_updates_journalled, 13) \ -+ x(reflink_inline_data, 14) ++ x(reflink_inline_data, 14) \ ++ x(new_varint, 15) + +#define BCH_SB_FEATURES_ALL \ + ((1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ -+ (1ULL << BCH_FEATURE_extents_above_btree_updates)) ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ ++ (1ULL << BCH_FEATURE_new_varint))\ + +enum bch_sb_feature { +#define x(f, n) BCH_FEATURE_##f, @@ -6784,10 +6805,10 @@ index 000000000000..2926c648a17f +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 -index 000000000000..d71157a3e073 +index 000000000000..0e626b098d91 --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h -@@ -0,0 +1,332 @@ +@@ -0,0 +1,346 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IOCTL_H +#define _BCACHEFS_IOCTL_H @@ -6863,6 +6884,7 @@ index 000000000000..d71157a3e073 +#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) +#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) +#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) + +/* ioctl below act on a particular file, not the 
filesystem as a whole: */ + @@ -7119,10 +7141,23 @@ index 000000000000..d71157a3e073 + __u64 nbuckets; +}; + ++/* ++ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize_journal { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 -index 000000000000..4d0c9129cd4a +index 000000000000..c06d0a965be1 --- /dev/null +++ b/fs/bcachefs/bkey.c @@ -0,0 +1,1154 @@ @@ -7539,7 +7574,7 @@ index 000000000000..4d0c9129cd4a + + if ((*p & mask) != mask) { + *p += 1ULL << offset; -+ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); ++ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); + return true; + } + @@ -8182,9 +8217,9 @@ index 000000000000..4d0c9129cd4a +} + +__pure __flatten -+int __bch2_bkey_cmp_packed(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) ++int bch2_bkey_cmp_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) +{ + struct bkey unpacked; + @@ -8282,10 +8317,10 @@ index 000000000000..4d0c9129cd4a +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 -index 000000000000..80ea488d57b0 +index 000000000000..2d2c640305e2 --- /dev/null +++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,606 @@ +@@ -0,0 +1,565 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H @@ -8355,13 +8390,6 @@ index 000000000000..80ea488d57b0 +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) + -+#define bkey_packed_typecheck(_k) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ -+ !type_is(_k, struct bkey_packed *)); \ -+ type_is(_k, struct bkey_packed *); \ -+}) -+ +enum bkey_lr_packed { + BKEY_PACKED_BOTH, + BKEY_PACKED_RIGHT, @@ -8369,9 +8397,6 @@ index 000000000000..80ea488d57b0 + BKEY_PACKED_NONE, +}; + -+#define bkey_lr_packed_typecheck(_l, _r) \ -+ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) -+ +#define bkey_lr_packed(_l, _r) \ + ((_l)->format + ((_r)->format << 1)) + @@ -8420,9 +8445,9 @@ index 000000000000..80ea488d57b0 + const struct bpos *); + +__pure -+int __bch2_bkey_cmp_packed(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); ++int bch2_bkey_cmp_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); + +__pure +int __bch2_bkey_cmp_left_packed(const struct btree *, @@ -8448,37 +8473,6 @@ index 000000000000..80ea488d57b0 + return bkey_cmp_left_packed(b, l, &r); +} + -+/* -+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to -+ * skip dispatching on k->format: -+ */ -+#define bkey_cmp_packed(_b, _l, _r) \ -+({ \ -+ int _cmp; \ -+ \ -+ switch (bkey_lr_packed_typecheck(_l, _r)) { \ -+ case BKEY_PACKED_NONE: \ -+ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ -+ ((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_LEFT: \ -+ _cmp = bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_l), \ -+ &((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_RIGHT: \ -+ _cmp = -bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_r), \ -+ &((struct bkey *) (_l))->p); \ -+ break; \ -+ case BKEY_PACKED_BOTH: \ -+ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ -+ (void *) (_r), (_b)); \ -+ break; \ -+ } \ -+ _cmp; \ -+}) -+ +#if 1 +static __always_inline int 
bkey_cmp(struct bpos l, struct bpos r) +{ @@ -8894,10 +8888,10 @@ index 000000000000..80ea488d57b0 +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000000..32849229801d +index 000000000000..f5779795a4b2 --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,357 @@ +@@ -0,0 +1,365 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -9081,8 +9075,12 @@ index 000000000000..32849229801d +void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) +{ + if (k) { -+ pr_buf(out, "u64s %u type %s ", k->u64s, -+ bch2_bkey_types[k->type]); ++ pr_buf(out, "u64s %u type ", k->u64s); ++ ++ if (k->type < KEY_TYPE_MAX) ++ pr_buf(out, "%s ", bch2_bkey_types[k->type]); ++ else ++ pr_buf(out, "%u ", k->type); + + bch2_bpos_to_text(out, k->p); + @@ -9096,10 +9094,14 @@ index 000000000000..32849229801d +void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ if (k.k->type < KEY_TYPE_MAX) { ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; + -+ if (likely(ops->val_to_text)) -+ ops->val_to_text(out, c, k); ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++ } else { ++ pr_buf(out, "(invalid type %u)", k.k->type); ++ } +} + +void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, @@ -9136,7 +9138,7 @@ index 000000000000..32849229801d + const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; + enum merge_result ret; + -+ if (key_merging_disabled(c) || ++ if (bch2_key_merging_disabled || + !ops->key_merge || + l.k->type != r.k->type || + bversion_cmp(l.k->version, r.k->version) || @@ -9394,7 +9396,7 @@ index 000000000000..f607a0cb37ed +#endif /* _BCACHEFS_BKEY_ON_STACK_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 -index 000000000000..839e78d1dc35 +index 000000000000..99e0a4011fae --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,515 @@ @@ -9486,7 +9488,7 @@ index 000000000000..839e78d1dc35 + struct bkey_packed *l, + struct bkey_packed *r) +{ -+ return bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed(b, l, r) ?: + cmp_int((unsigned long) l, (unsigned long) r); +} + @@ -9498,7 +9500,7 @@ index 000000000000..839e78d1dc35 + * and should be dropped. 
+ */ + return iter->used >= 2 && -+ !bkey_cmp_packed(iter->b, ++ !bch2_bkey_cmp_packed(iter->b, + iter->data[0].k, + iter->data[1].k); +} @@ -9623,7 +9625,7 @@ index 000000000000..839e78d1dc35 + struct bkey_packed *l, + struct bkey_packed *r) +{ -+ return bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} @@ -9645,7 +9647,7 @@ index 000000000000..839e78d1dc35 + continue; + + while ((next = sort_iter_peek(iter)) && -+ !bkey_cmp_packed(iter->b, in, next)) { ++ !bch2_bkey_cmp_packed(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + needs_whiteout |= in->needs_whiteout; @@ -9806,7 +9808,7 @@ index 000000000000..839e78d1dc35 + struct bkey_packed *l, + struct bkey_packed *r) +{ -+ return bkey_cmp_packed(b, l, r) ?: ++ return bch2_bkey_cmp_packed(b, l, r) ?: + (int) bkey_deleted(l) - (int) bkey_deleted(r); +} + @@ -9978,10 +9980,10 @@ index 000000000000..458a051fdac5 +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000000..f7c2841ed8a7 +index 000000000000..1c7318c6e46f --- /dev/null +++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1742 @@ +@@ -0,0 +1,1738 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a @@ -10353,10 +10355,10 @@ index 000000000000..f7c2841ed8a7 + return ro_aux_tree_base(b, t)->f + idx; +} + -+static void bset_aux_tree_verify(struct btree *b) ++static void bset_aux_tree_verify(const struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bset_tree *t; ++ const struct bset_tree *t; + + for_each_bset(b, t) { + if (t->aux_data_offset == U16_MAX) @@ -10372,15 +10374,13 @@ index 000000000000..f7c2841ed8a7 +#endif +} + -+void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) ++void bch2_btree_keys_init(struct btree *b) +{ + unsigned i; + + b->nsets = 0; + memset(&b->nr, 0, sizeof(b->nr)); -+#ifdef CONFIG_BCACHEFS_DEBUG -+ b->expensive_debug_checks = expensive_debug_checks; -+#endif ++ + for (i = 0; i < MAX_BSETS; i++) + b->set[i].data_offset = U16_MAX; + @@ -10506,7 +10506,7 @@ index 000000000000..f7c2841ed8a7 + struct bkey_packed *k = btree_bkey_first(b, t); + unsigned j = 0; + -+ if (!btree_keys_expensive_checks(b)) ++ if (!bch2_expensive_debug_checks) + return; + + BUG_ON(bset_has_ro_aux_tree(t)); @@ -10590,53 +10590,23 @@ index 000000000000..f7c2841ed8a7 + return (u16) v; +} + -+static void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) ++__always_inline ++static inline void __make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); -+ struct bkey_packed *l, *r; ++ struct bkey_packed *l = is_power_of_2(j) ++ ? min_key ++ : tree_to_prev_bkey(b, t, j >> ffs(j)); ++ struct bkey_packed *r = is_power_of_2(j + 1) ++ ? 
max_key ++ : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + unsigned mantissa; + int shift, exponent, high_bit; + -+ if (is_power_of_2(j)) { -+ l = min_key; -+ -+ if (!l->u64s) { -+ if (!bkey_pack_pos(l, b->data->min_key, b)) { -+ struct bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = b->data->min_key; -+ bkey_copy(l, &tmp); -+ } -+ } -+ } else { -+ l = tree_to_prev_bkey(b, t, j >> ffs(j)); -+ -+ EBUG_ON(m < l); -+ } -+ -+ if (is_power_of_2(j + 1)) { -+ r = max_key; -+ -+ if (!r->u64s) { -+ if (!bkey_pack_pos(r, t->max_key, b)) { -+ struct bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = t->max_key; -+ bkey_copy(r, &tmp); -+ } -+ } -+ } else { -+ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); -+ -+ EBUG_ON(m > r); -+ } -+ + /* + * for failed bfloats, the lookup code falls back to comparing against + * the original key. @@ -10693,26 +10663,50 @@ index 000000000000..f7c2841ed8a7 + f->mantissa = mantissa; +} + ++static void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_i *k; ++ ++ if (is_power_of_2(j) && ++ !min_key->u64s) { ++ k = (void *) min_key; ++ bkey_init(&k->k); ++ k->k.p = b->data->min_key; ++ } ++ ++ if (is_power_of_2(j + 1) && ++ !max_key->u64s) { ++ k = (void *) max_key; ++ bkey_init(&k->k); ++ k->k.p = t->max_key; ++ } ++ ++ __make_bfloat(b, t, j, min_key, max_key); ++} ++ +/* bytes remaining - only valid for last bset: */ -+static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) ++static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + bset_aux_tree_verify(b); + + return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); +} + -+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) ++static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / + (sizeof(struct bkey_float) + sizeof(u8)); +} + -+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) ++static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); +} + -+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *k; + @@ -10731,15 +10725,12 @@ index 000000000000..f7c2841ed8a7 + } +} + -+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); -+ struct bkey_packed min_key, max_key; ++ struct bkey_i min_key, max_key; + unsigned j, cacheline = 1; + -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ + t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), + bset_ro_tree_capacity(b, t)); +retry: @@ -10775,9 +10766,16 @@ index 000000000000..f7c2841ed8a7 + + t->max_key = bkey_unpack_pos(b, prev); + ++ bkey_init(&min_key.k); ++ min_key.k.p = b->data->min_key; ++ bkey_init(&max_key.k); ++ max_key.k.p = t->max_key; ++ + /* Then we build the tree */ + eytzinger1_for_each(j, t->size) -+ make_bfloat(b, t, j, &min_key, &max_key); ++ __make_bfloat(b, t, j, ++ bkey_to_packed(&min_key), ++ bkey_to_packed(&max_key)); +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) @@ -10906,7 +10904,7 @@ index 000000000000..f7c2841ed8a7 + k = 
p; + } + -+ if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + BUG_ON(ret >= orig_k); + + for (i = ret @@ -11211,8 +11209,8 @@ index 000000000000..f7c2841ed8a7 + +__flatten +static struct bkey_packed *bset_search_tree(const struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, ++ const struct bset_tree *t, ++ const struct bpos *search, + const struct bkey_packed *packed_search) +{ + struct ro_aux_tree *base = ro_aux_tree_base(b, t); @@ -11329,7 +11327,7 @@ index 000000000000..f7c2841ed8a7 + bkey_iter_pos_cmp(b, m, search) < 0) + m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); + -+ if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && @@ -11585,7 +11583,7 @@ index 000000000000..f7c2841ed8a7 +void bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ -+ if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); + } @@ -11604,7 +11602,7 @@ index 000000000000..f7c2841ed8a7 + struct bset_tree *t; + unsigned end = 0; + -+ if (btree_keys_expensive_checks(b)) ++ if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { @@ -11640,7 +11638,7 @@ index 000000000000..f7c2841ed8a7 + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; + -+ if (btree_keys_expensive_checks(b)) ++ if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + return prev; +} @@ -11726,10 +11724,10 @@ index 000000000000..f7c2841ed8a7 +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 -index 000000000000..5921cf689105 +index 000000000000..469294cc716c --- /dev/null +++ b/fs/bcachefs/bset.h -@@ -0,0 +1,661 @@ +@@ -0,0 +1,650 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H @@ -11737,7 +11735,7 @@ index 000000000000..5921cf689105 +#include +#include + -+#include "bcachefs_format.h" ++#include "bcachefs.h" +#include "bkey.h" +#include "bkey_methods.h" +#include "btree_types.h" @@ -11879,17 +11877,6 @@ index 000000000000..5921cf689105 + * first key in that range of bytes again. 
+ */ + -+extern bool bch2_expensive_debug_checks; -+ -+static inline bool btree_keys_expensive_checks(const struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ return bch2_expensive_debug_checks || *b->expensive_debug_checks; -+#else -+ return false; -+#endif -+} -+ +enum bset_aux_tree_type { + BSET_NO_AUX_TREE, + BSET_RO_AUX_TREE, @@ -11933,17 +11920,17 @@ index 000000000000..5921cf689105 + +#define BSET_CACHELINE 128 + -+static inline size_t btree_keys_cachelines(struct btree *b) ++static inline size_t btree_keys_cachelines(const struct btree *b) +{ + return (1U << b->byte_order) / BSET_CACHELINE; +} + -+static inline size_t btree_aux_data_bytes(struct btree *b) ++static inline size_t btree_aux_data_bytes(const struct btree *b) +{ + return btree_keys_cachelines(b) * 8; +} + -+static inline size_t btree_aux_data_u64s(struct btree *b) ++static inline size_t btree_aux_data_u64s(const struct btree *b) +{ + return btree_aux_data_bytes(b) / sizeof(u64); +} @@ -11960,7 +11947,7 @@ index 000000000000..5921cf689105 + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + -+ if (btree_keys_expensive_checks(b)) { ++ if (bch2_expensive_debug_checks) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); @@ -12098,7 +12085,7 @@ index 000000000000..5921cf689105 + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); +} + -+void bch2_btree_keys_init(struct btree *, bool *); ++void bch2_btree_keys_init(struct btree *); + +void bch2_bset_init_first(struct btree *, struct bset *); +void bch2_bset_init_next(struct bch_fs *, struct btree *, @@ -12209,7 +12196,7 @@ index 000000000000..5921cf689105 + const struct bkey_packed *l, + const struct bkey_packed *r) +{ -+ return bkey_cmp_packed(b, l, r) ++ return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) + ?: cmp_int(l, r); +} @@ -12386,17 +12373,17 @@ index 000000000000..5921cf689105 + +static inline void bch2_verify_btree_nr_keys(struct btree *b) +{ -+ if (btree_keys_expensive_checks(b)) ++ if (bch2_debug_check_btree_accounting) + __bch2_verify_btree_nr_keys(b); +} + +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000000..45d44c8785bd +index 000000000000..09774f56f11c --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1063 @@ +@@ -0,0 +1,1072 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -12610,7 +12597,7 @@ index 000000000000..45d44c8785bd + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ -+ if (verify_btree_ondisk(c)) ++ if (bch2_verify_btree_ondisk) + bch2_btree_node_write(c, b, SIX_LOCK_intent); + else + __bch2_btree_node_write(c, b, SIX_LOCK_read); @@ -12653,7 +12640,7 @@ index 000000000000..45d44c8785bd + unsigned long freed = 0; + unsigned i, flags; + -+ if (btree_shrinker_disabled(c)) ++ if (bch2_btree_shrinker_disabled) + return SHRINK_STOP; + + /* Return -1 if we can't do anything right now */ @@ -12727,9 +12714,9 @@ index 000000000000..45d44c8785bd + clear_btree_node_accessed(b); + } + -+ memalloc_nofs_restore(flags); + mutex_unlock(&bc->lock); +out: ++ memalloc_nofs_restore(flags); + return (unsigned long) freed * btree_pages(c); +} + @@ -12740,7 +12727,7 @@ index 000000000000..45d44c8785bd + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; + -+ if (btree_shrinker_disabled(c)) ++ if (bch2_btree_shrinker_disabled) + return 0; + + return btree_cache_can_free(bc) * 
btree_pages(c); @@ -12780,11 +12767,13 @@ index 000000000000..45d44c8785bd + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); -+ clear_btree_node_dirty(b); ++ clear_btree_node_dirty(c, b); + + btree_node_data_free(c, b); + } + ++ BUG_ON(atomic_read(&c->btree_cache.dirty)); ++ + while (!list_empty(&bc->freed)) { + b = list_first_entry(&bc->freed, struct btree, list); + list_del(&b->list); @@ -12844,7 +12833,7 @@ index 000000000000..45d44c8785bd + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.seeks = 4; + bc->shrink.batch = btree_pages(c) * 2; -+ register_shrinker(&bc->shrink); ++ ret = register_shrinker(&bc->shrink); +out: + pr_verbose_init(c->opts, "ret %i", ret); + return ret; @@ -12989,7 +12978,7 @@ index 000000000000..45d44c8785bd + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; -+ bch2_btree_keys_init(b, &c->expensive_debug_checks); ++ bch2_btree_keys_init(b); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); @@ -13104,7 +13093,8 @@ index 000000000000..45d44c8785bd + */ +struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, + const struct bkey_i *k, unsigned level, -+ enum six_lock_type lock_type) ++ enum six_lock_type lock_type, ++ unsigned long trace_ip) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; @@ -13166,7 +13156,7 @@ index 000000000000..45d44c8785bd + btree_node_unlock(iter, level + 1); + + if (!btree_node_lock(b, k->k.p, level, iter, lock_type, -+ lock_node_check_fn, (void *) k)) { ++ lock_node_check_fn, (void *) k, trace_ip)) { + if (b->hash_val != btree_ptr_hash_val(k)) + goto retry; + return ERR_PTR(-EINTR); @@ -13334,7 +13324,7 @@ index 000000000000..45d44c8785bd + bch2_bkey_unpack(parent, &tmp.k, k); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); ++ SIX_LOCK_intent, _THIS_IP_); + + if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { + struct btree_iter *linked; @@ -13354,7 +13344,7 @@ index 000000000000..45d44c8785bd + btree_node_unlock(iter, level); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); ++ SIX_LOCK_intent, _THIS_IP_); + + /* + * before btree_iter_relock() calls btree_iter_verify_locks(): @@ -13460,12 +13450,18 @@ index 000000000000..45d44c8785bd + stats.floats, + stats.failed); +} ++ ++void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ pr_buf(out, "nr nodes:\t%u\n", c->btree_cache.used); ++ pr_buf(out, "nr dirty:\t%u\n", atomic_read(&c->btree_cache.dirty)); ++} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000000..d0d3a85bb8be +index 000000000000..e766ef552ce7 --- /dev/null +++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,104 @@ +@@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H @@ -13491,7 +13487,7 @@ index 000000000000..d0d3a85bb8be + +struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned, -+ enum six_lock_type); ++ enum six_lock_type, unsigned long); + +struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, + enum btree_id, unsigned); @@ -13568,11 +13564,12 @@ index 000000000000..d0d3a85bb8be + +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + struct btree *); ++void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c 
b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000000..e8c1e752a25d +index 000000000000..ac81c9b9a06a --- /dev/null +++ b/fs/bcachefs/btree_gc.c @@ -0,0 +1,1438 @@ @@ -13679,7 +13676,7 @@ index 000000000000..e8c1e752a25d + int ret = 0; + + if (initial) { -+ BUG_ON(journal_seq_verify(c) && ++ BUG_ON(bch2_journal_seq_verify && + k.k->version.lo > journal_cur_seq(&c->journal)); + + /* XXX change to fsck check */ @@ -13787,7 +13784,7 @@ index 000000000000..e8c1e752a25d + struct btree_iter *iter; + struct btree *b; + unsigned depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 ++ : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; @@ -13814,8 +13811,8 @@ index 000000000000..e8c1e752a25d + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOWAIT| + BTREE_INSERT_GC_LOCK_HELD); -+ else if (!btree_gc_rewrite_disabled(c) && -+ (btree_gc_always_rewrite(c) || max_stale > 16)) ++ else if (!bch2_btree_gc_rewrite_disabled && ++ (bch2_btree_gc_always_rewrite || max_stale > 16)) + bch2_btree_node_rewrite(c, iter, + b->data->keys.seq, + BTREE_INSERT_NOWAIT| @@ -13906,7 +13903,7 @@ index 000000000000..e8c1e752a25d +{ + struct btree *b; + unsigned target_depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 ++ : bch2_expensive_debug_checks ? 0 + : !btree_node_type_needs_gc(btree_id) ? 1 + : 0; + u8 max_stale = 0; @@ -14413,7 +14410,7 @@ index 000000000000..e8c1e752a25d +out: + if (!ret && + (test_bit(BCH_FS_FIXED_GENS, &c->flags) || -+ (!iter && test_restart_gc(c)))) { ++ (!iter && bch2_test_restart_gc))) { + /* + * XXX: make sure gens we fixed got saved + */ @@ -15005,7 +15002,7 @@ index 000000000000..e8c1e752a25d + + BUG_ON(c->gc_thread); + -+ p = kthread_create(bch2_gc_thread, c, "bch_gc"); ++ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) + return PTR_ERR(p); + @@ -15143,10 +15140,10 @@ index 000000000000..3694a3df62a8 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000000..682f599cbef5 +index 000000000000..2406745fb365 --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,1838 @@ +@@ -0,0 +1,1845 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -15191,7 +15188,7 @@ index 000000000000..682f599cbef5 + BUG_ON(extents + ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 + : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); -+ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); ++ //BUG_ON(bch2_bkey_cmp_packed(&b->format, p, k) >= 0); + } +#endif +} @@ -15251,14 +15248,14 @@ index 000000000000..682f599cbef5 + break; + + for (b = a; c = 2 * b + 1, (d = c + 1) < n;) -+ b = bkey_cmp_packed(bt, ++ b = bch2_bkey_cmp_packed(bt, + ptrs[c], + ptrs[d]) >= 0 ? 
c : d; + if (d == n) + b = c; + + while (b != a && -+ bkey_cmp_packed(bt, ++ bch2_bkey_cmp_packed(bt, + ptrs[a], + ptrs[b]) >= 0) + b = (b - 1) / 2; @@ -16193,7 +16190,7 @@ index 000000000000..682f599cbef5 + const char *invalid = bch2_bkey_val_invalid(c, u.s_c); + + if (invalid || -+ (inject_invalid_keys(c) && ++ (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { + char buf[160]; + @@ -16591,8 +16588,10 @@ index 000000000000..682f599cbef5 + + ret = validate_bset(c, b, i, sectors, WRITE, false) ?: + validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); -+ if (ret) ++ if (ret) { + bch2_inconsistent_error(c); ++ dump_stack(); ++ } + + return ret; +} @@ -16647,6 +16646,8 @@ index 000000000000..682f599cbef5 + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); + ++ atomic_dec(&c->btree_cache.dirty); ++ + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + @@ -16679,6 +16680,9 @@ index 000000000000..682f599cbef5 + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + ++ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ ++ bytes += 8; ++ + data = btree_bounce_alloc(c, bytes, &used_mempool); + + if (!b->written) { @@ -16987,10 +16991,10 @@ index 000000000000..682f599cbef5 +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000000..626d0f071b70 +index 000000000000..1a4b11e99cc4 --- /dev/null +++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,220 @@ +@@ -0,0 +1,237 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H @@ -17007,6 +17011,23 @@ index 000000000000..626d0f071b70 +struct btree; +struct btree_iter; + ++static inline bool btree_node_dirty(struct btree *b) ++{ ++ return test_bit(BTREE_NODE_dirty, &b->flags); ++} ++ ++static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b) ++{ ++ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_inc(&c->btree_cache.dirty); ++} ++ ++static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b) ++{ ++ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) ++ atomic_dec(&c->btree_cache.dirty); ++} ++ +struct btree_read_bio { + struct bch_fs *c; + u64 start_time; @@ -17213,10 +17234,10 @@ index 000000000000..626d0f071b70 +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000000..ec831dcd6a12 +index 000000000000..96cc5394295e --- /dev/null +++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,2445 @@ +@@ -0,0 +1,2506 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -17416,13 +17437,13 @@ index 000000000000..ec831dcd6a12 +bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, + unsigned level, struct btree_iter *iter, + enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, -+ void *p) ++ six_lock_should_sleep_fn should_sleep_fn, void *p, ++ unsigned long ip) +{ + struct btree_trans *trans = iter->trans; -+ struct btree_iter *linked; ++ struct btree_iter *linked, *deadlock_iter = NULL; + u64 start_time = local_clock(); -+ bool ret = true; ++ unsigned reason = 9; + + /* Check if it's safe to block: */ + trans_for_each_iter(trans, linked) { @@ -17447,42 +17468,64 @@ index 000000000000..ec831dcd6a12 + linked->locks_want = max_t(unsigned, + linked->locks_want, + __fls(linked->nodes_locked) + 1); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; ++ if (!btree_iter_get_locks(linked, true, 
false)) { ++ deadlock_iter = linked; ++ reason = 1; ++ } + } else { -+ ret = false; ++ deadlock_iter = linked; ++ reason = 2; + } + } + ++ if (linked->btree_id != iter->btree_id) { ++ if (linked->btree_id > iter->btree_id) { ++ deadlock_iter = linked; ++ reason = 3; ++ } ++ continue; ++ } ++ ++ /* ++ * Within the same btree, cached iterators come before non ++ * cached iterators: ++ */ ++ if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) { ++ if (btree_iter_is_cached(iter)) { ++ deadlock_iter = linked; ++ reason = 4; ++ } ++ continue; ++ } ++ + /* + * Interior nodes must be locked before their descendants: if + * another iterator has possible descendants locked of the node + * we're about to lock, it must have the ancestors locked too: + */ -+ if (linked->btree_id == iter->btree_id && -+ level > __fls(linked->nodes_locked)) { ++ if (level > __fls(linked->nodes_locked)) { + if (!(trans->nounlock)) { + linked->locks_want = + max(level + 1, max_t(unsigned, + linked->locks_want, + iter->locks_want)); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; ++ if (!btree_iter_get_locks(linked, true, false)) { ++ deadlock_iter = linked; ++ reason = 5; ++ } + } else { -+ ret = false; ++ deadlock_iter = linked; ++ reason = 6; + } + } + + /* Must lock btree nodes in key order: */ -+ if ((cmp_int(iter->btree_id, linked->btree_id) ?: -+ -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) -+ ret = false; -+ -+ if (iter->btree_id == linked->btree_id && -+ btree_node_locked(linked, level) && ++ if (btree_node_locked(linked, level) && + bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, -+ btree_iter_type(linked))) <= 0) -+ ret = false; ++ btree_iter_type(linked))) <= 0) { ++ deadlock_iter = linked; ++ reason = 7; ++ } + + /* + * Recheck if this is a node we already have locked - since one @@ -17496,8 +17539,13 @@ index 000000000000..ec831dcd6a12 + } + } + -+ if (unlikely(!ret)) { -+ trace_trans_restart_would_deadlock(iter->trans->ip); ++ if (unlikely(deadlock_iter)) { ++ trace_trans_restart_would_deadlock(iter->trans->ip, ip, ++ reason, ++ deadlock_iter->btree_id, ++ btree_iter_type(deadlock_iter), ++ iter->btree_id, ++ btree_iter_type(iter)); + return false; + } + @@ -17690,7 +17738,7 @@ index 000000000000..ec831dcd6a12 + char buf1[100], buf2[100]; + const char *msg; + -+ if (!debug_check_iterators(iter->trans->c)) ++ if (!bch2_debug_check_iterators) + return; + + if (btree_iter_type(iter) == BTREE_ITER_CACHED) { @@ -17786,7 +17834,7 @@ index 000000000000..ec831dcd6a12 +{ + struct btree_iter *iter; + -+ if (!debug_check_iterators(trans->c)) ++ if (!bch2_debug_check_iterators) + return; + + trans_for_each_iter_with_node(trans, b, iter) @@ -17958,7 +18006,7 @@ index 000000000000..ec831dcd6a12 + __bch2_btree_node_iter_fix(iter, b, node_iter, t, + where, clobber_u64s, new_u64s); + -+ if (debug_check_iterators(iter->trans->c)) ++ if (bch2_debug_check_iterators) + bch2_btree_node_iter_verify(node_iter, b); + } + @@ -17988,7 +18036,7 @@ index 000000000000..ec831dcd6a12 + + ret = bkey_disassemble(l->b, k, u); + -+ if (debug_check_bkeys(iter->trans->c)) ++ if (bch2_debug_check_bkeys) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + + return ret; @@ -18164,7 +18212,8 @@ index 000000000000..ec831dcd6a12 +} + +static inline int btree_iter_lock_root(struct btree_iter *iter, -+ unsigned depth_want) ++ unsigned depth_want, ++ unsigned long trace_ip) +{ + struct bch_fs *c = iter->trans->c; + struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; @@ -18193,7 +18242,8 
@@ index 000000000000..ec831dcd6a12 + lock_type = __btree_lock_want(iter, iter->level); + if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, + iter, lock_type, -+ lock_root_check_fn, rootp))) ++ lock_root_check_fn, rootp, ++ trace_ip))) + return -EINTR; + + if (likely(b == READ_ONCE(*rootp) && @@ -18265,7 +18315,8 @@ index 000000000000..ec831dcd6a12 + btree_node_unlock(iter, plevel); +} + -+static __always_inline int btree_iter_down(struct btree_iter *iter) ++static __always_inline int btree_iter_down(struct btree_iter *iter, ++ unsigned long trace_ip) +{ + struct bch_fs *c = iter->trans->c; + struct btree_iter_level *l = &iter->l[iter->level]; @@ -18279,7 +18330,7 @@ index 000000000000..ec831dcd6a12 + bch2_bkey_unpack(l->b, &tmp.k, + bch2_btree_node_iter_peek(&l->iter, l->b)); + -+ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); ++ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip); + if (unlikely(IS_ERR(b))) + return PTR_ERR(b); + @@ -18303,7 +18354,7 @@ index 000000000000..ec831dcd6a12 + btree_node_unlock(iter, iter->level++); +} + -+static int btree_iter_traverse_one(struct btree_iter *); ++static int btree_iter_traverse_one(struct btree_iter *, unsigned long); + +static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) +{ @@ -18328,6 +18379,7 @@ index 000000000000..ec831dcd6a12 + bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); +#undef btree_iter_cmp_by_idx + bch2_trans_unlock(trans); ++ cond_resched(); + + if (unlikely(ret == -ENOMEM)) { + struct closure cl; @@ -18358,7 +18410,7 @@ index 000000000000..ec831dcd6a12 + if (!(trans->iters_linked & (1ULL << idx))) + continue; + -+ ret = btree_iter_traverse_one(&trans->iters[idx]); ++ ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_); + if (ret) + goto retry_all; + } @@ -18421,7 +18473,8 @@ index 000000000000..ec831dcd6a12 + * On error, caller (peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_trans_exit(). + */ -+static int btree_iter_traverse_one(struct btree_iter *iter) ++static int btree_iter_traverse_one(struct btree_iter *iter, ++ unsigned long trace_ip) +{ + unsigned depth_want = iter->level; + @@ -18468,8 +18521,8 @@ index 000000000000..ec831dcd6a12 + */ + while (iter->level > depth_want) { + int ret = btree_iter_node(iter, iter->level) -+ ? btree_iter_down(iter) -+ : btree_iter_lock_root(iter, depth_want); ++ ? 
btree_iter_down(iter, trace_ip) ++ : btree_iter_lock_root(iter, depth_want, trace_ip); + if (unlikely(ret)) { + if (ret == 1) + return 0; @@ -18500,7 +18553,7 @@ index 000000000000..ec831dcd6a12 + int ret; + + ret = bch2_trans_cond_resched(trans) ?: -+ btree_iter_traverse_one(iter); ++ btree_iter_traverse_one(iter, _RET_IP_); + if (unlikely(ret)) + ret = __btree_iter_traverse_all(trans, ret); + @@ -18764,13 +18817,13 @@ index 000000000000..ec831dcd6a12 + + ret.v = bkeyp_val(&l->b->format, _k); + -+ if (debug_check_iterators(iter->trans->c)) { ++ if (bch2_debug_check_iterators) { + struct bkey k = bkey_unpack_key(l->b, _k); + + BUG_ON(memcmp(&k, &iter->k, sizeof(k))); + } + -+ if (debug_check_bkeys(iter->trans->c)) ++ if (bch2_debug_check_bkeys) + bch2_bkey_debugcheck(iter->trans->c, l->b, ret); + } + @@ -19189,6 +19242,7 @@ index 000000000000..ec831dcd6a12 + return bch2_trans_iter_put(trans, iter); +} + ++#if 0 +static int bch2_trans_realloc_iters(struct btree_trans *trans, + unsigned new_size) +{ @@ -19237,8 +19291,7 @@ index 000000000000..ec831dcd6a12 + sizeof(struct btree_iter) * trans->nr_iters + + sizeof(struct btree_insert_entry) * trans->nr_iters); + -+ if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); ++ kfree(trans->iters); + + trans->iters = new_iters; + trans->updates = new_updates; @@ -19252,6 +19305,7 @@ index 000000000000..ec831dcd6a12 + + return 0; +} ++#endif + +static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) +{ @@ -19261,28 +19315,27 @@ index 000000000000..ec831dcd6a12 + goto got_slot; + + if (trans->nr_iters == trans->size) { -+ int ret; ++ struct btree_iter *iter; + -+ if (trans->nr_iters >= BTREE_ITER_MAX) { -+ struct btree_iter *iter; ++ BUG_ON(trans->size < BTREE_ITER_MAX); + -+ trans_for_each_iter(trans, iter) { -+ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", -+ bch2_btree_ids[iter->btree_id], -+ iter->pos.inode, -+ iter->pos.offset, -+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", -+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", -+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "", -+ (void *) iter->ip_allocated); -+ } -+ -+ panic("trans iter oveflow\n"); ++ trans_for_each_iter(trans, iter) { ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", ++ (void *) iter->ip_allocated); + } + ++ panic("trans iter oveflow\n"); ++#if 0 + ret = bch2_trans_realloc_iters(trans, trans->size * 2); + if (ret) + return ERR_PTR(ret); ++#endif + } + + idx = trans->nr_iters++; @@ -19524,31 +19577,47 @@ index 000000000000..ec831dcd6a12 + bch2_btree_iter_traverse_all(trans); +} + ++static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c) ++{ ++ unsigned new_size = BTREE_ITER_MAX; ++ size_t iters_bytes = sizeof(struct btree_iter) * new_size; ++ size_t updates_bytes = sizeof(struct btree_insert_entry) * new_size; ++ void *p = NULL; ++ ++ BUG_ON(trans->used_mempool); ++ ++#ifdef __KERNEL__ ++ p = this_cpu_xchg(c->btree_iters_bufs->iter, NULL); ++#endif ++ if (!p) ++ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ ++ trans->iters = p; p += iters_bytes; ++ trans->updates = p; p += updates_bytes; ++ trans->updates2 = p; p += updates_bytes; ++ trans->size = new_size; ++} ++ +void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, + unsigned expected_nr_iters, + size_t expected_mem_bytes) +{ -+ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); ++ memset(trans, 0, sizeof(*trans)); ++ trans->c = c; ++ trans->ip = _RET_IP_; + + /* + * reallocating iterators currently completely breaks -+ * bch2_trans_iter_put(): ++ * bch2_trans_iter_put(), we always allocate the max: + */ -+ expected_nr_iters = BTREE_ITER_MAX; ++ bch2_trans_alloc_iters(trans, c); + -+ trans->c = c; -+ trans->ip = _RET_IP_; -+ trans->size = ARRAY_SIZE(trans->iters_onstack); -+ trans->iters = trans->iters_onstack; -+ trans->updates = trans->updates_onstack; -+ trans->updates2 = trans->updates2_onstack; -+ trans->fs_usage_deltas = NULL; ++ if (expected_mem_bytes) { ++ trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes); ++ trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL); ++ } + -+ if (expected_nr_iters > trans->size) -+ bch2_trans_realloc_iters(trans, expected_nr_iters); -+ -+ if (expected_mem_bytes) -+ bch2_trans_preload_mem(trans, expected_mem_bytes); ++ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + +#ifdef CONFIG_BCACHEFS_DEBUG + trans->pid = current->pid; @@ -19560,6 +19629,8 @@ index 000000000000..ec831dcd6a12 + +int bch2_trans_exit(struct btree_trans *trans) +{ ++ struct bch_fs *c = trans->c; ++ + bch2_trans_unlock(trans); + +#ifdef CONFIG_BCACHEFS_DEBUG @@ -19568,23 +19639,32 @@ index 000000000000..ec831dcd6a12 + mutex_unlock(&trans->c->btree_trans_lock); +#endif + ++ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); ++ + bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); + + kfree(trans->fs_usage_deltas); + kfree(trans->mem); -+ if (trans->used_mempool) ++ ++#ifdef __KERNEL__ ++ /* ++ * Userspace doesn't have a real percpu implementation: ++ */ ++ trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters); ++#endif ++ if (trans->iters) + mempool_free(trans->iters, &trans->c->btree_iters_pool); -+ else if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); ++ + trans->mem = (void *) 0x1; + trans->iters = (void *) 0x1; + + return trans->error ? 
-EIO : 0; +} + -+static void bch2_btree_iter_node_to_text(struct printbuf *out, -+ struct btree_bkey_cached_common *_b, -+ enum btree_iter_type type) ++static void __maybe_unused ++bch2_btree_iter_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) +{ + pr_buf(out, " %px l=%u %s:", + _b, _b->level, bch2_btree_ids[_b->btree_id]); @@ -19648,6 +19728,7 @@ index 000000000000..ec831dcd6a12 +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ + mempool_exit(&c->btree_iters_pool); ++ cleanup_srcu_struct(&c->btree_trans_barrier); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) @@ -19657,14 +19738,15 @@ index 000000000000..ec831dcd6a12 + INIT_LIST_HEAD(&c->btree_trans_list); + mutex_init(&c->btree_trans_lock); + -+ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ return init_srcu_struct(&c->btree_trans_barrier) ?: ++ mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, + sizeof(struct btree_iter) * nr + + sizeof(struct btree_insert_entry) * nr + + sizeof(struct btree_insert_entry) * nr); +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000000..f80e09255f68 +index 000000000000..f7a73619c85b --- /dev/null +++ b/fs/bcachefs/btree_iter.h @@ -0,0 +1,315 @@ @@ -19852,7 +19934,7 @@ index 000000000000..f80e09255f68 + const struct btree_iter *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: -+ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: ++ -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r)) ?: + bkey_cmp(l->pos, r->pos); +} + @@ -19985,10 +20067,10 @@ index 000000000000..f80e09255f68 +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 000000000000..61662750dfc0 +index 000000000000..a8d05b4739b7 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,519 @@ +@@ -0,0 +1,651 @@ + +#include "bcachefs.h" +#include "btree_cache.h" @@ -20000,8 +20082,11 @@ index 000000000000..61662750dfc0 +#include "journal.h" +#include "journal_reclaim.h" + ++#include +#include + ++static struct kmem_cache *bch2_key_cache; ++ +static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ @@ -20020,8 +20105,8 @@ index 000000000000..61662750dfc0 +}; + +__flatten -+static inline struct bkey_cached * -+btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++inline struct bkey_cached * ++bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +{ + struct bkey_cached_key key = { + .btree_id = btree_id, @@ -20057,12 +20142,22 @@ index 000000000000..61662750dfc0 + BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params)); + memset(&ck->key, ~0, sizeof(ck->key)); ++ ++ c->nr_keys--; +} + -+static void bkey_cached_free(struct btree_key_cache *c, ++static void bkey_cached_free(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ -+ list_move(&ck->list, &c->freed); ++ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); ++ ++ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); ++ ++ ck->btree_trans_barrier_seq = ++ start_poll_synchronize_srcu(&c->btree_trans_barrier); ++ ++ list_move_tail(&ck->list, &bc->freed); ++ bc->nr_freed++; + + kfree(ck->k); + ck->k = NULL; @@ -20077,9 +20172,20 @@ index 000000000000..61662750dfc0 +{ + struct bkey_cached *ck; + -+ list_for_each_entry(ck, &c->freed, list) -+ if (bkey_cached_lock_for_evict(ck)) ++ list_for_each_entry_reverse(ck, &c->freed, list) 
++ if (bkey_cached_lock_for_evict(ck)) { ++ c->nr_freed--; + return ck; ++ } ++ ++ ck = kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO); ++ if (likely(ck)) { ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ return ck; ++ } + + list_for_each_entry(ck, &c->clean, list) + if (bkey_cached_lock_for_evict(ck)) { @@ -20087,16 +20193,7 @@ index 000000000000..61662750dfc0 + return ck; + } + -+ ck = kzalloc(sizeof(*ck), GFP_NOFS); -+ if (!ck) -+ return NULL; -+ -+ INIT_LIST_HEAD(&ck->list); -+ six_lock_init(&ck->c.lock); -+ BUG_ON(!six_trylock_intent(&ck->c.lock)); -+ BUG_ON(!six_trylock_write(&ck->c.lock)); -+ -+ return ck; ++ return NULL; +} + +static struct bkey_cached * @@ -20115,8 +20212,7 @@ index 000000000000..61662750dfc0 + ck->key.btree_id = btree_id; + ck->key.pos = pos; + ck->valid = false; -+ -+ BUG_ON(ck->flags); ++ ck->flags = 1U << BKEY_CACHED_ACCESSED; + + if (rhashtable_lookup_insert_fast(&c->table, + &ck->hash, @@ -20126,6 +20222,8 @@ index 000000000000..61662750dfc0 + return NULL; + } + ++ c->nr_keys++; ++ + list_move(&ck->list, &c->clean); + six_unlock_write(&ck->c.lock); + @@ -20195,6 +20293,7 @@ index 000000000000..61662750dfc0 + !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; +} + ++__flatten +int bch2_btree_iter_traverse_cached(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; @@ -20209,7 +20308,7 @@ index 000000000000..61662750dfc0 + goto fill; + } +retry: -+ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); ++ ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos); + if (!ck) { + if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { + iter->l[0].b = NULL; @@ -20233,7 +20332,7 @@ index 000000000000..61662750dfc0 + enum six_lock_type lock_want = __btree_lock_want(iter, 0); + + if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, -+ bkey_cached_check_fn, iter)) { ++ bkey_cached_check_fn, iter, _THIS_IP_)) { + if (ck->key.btree_id != iter->btree_id || + bkey_cmp(ck->key.pos, iter->pos)) { + goto retry; @@ -20270,6 +20369,9 @@ index 000000000000..61662750dfc0 + goto err; + } + ++ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ set_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ + iter->uptodate = BTREE_ITER_NEED_PEEK; + bch2_btree_iter_downgrade(iter); + return ret; @@ -20345,10 +20447,14 @@ index 000000000000..61662750dfc0 + + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); -+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + + if (!evict) { + mutex_lock(&c->btree_key_cache.lock); ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ c->btree_key_cache.nr_dirty--; ++ } ++ + list_move_tail(&ck->list, &c->btree_key_cache.clean); + mutex_unlock(&c->btree_key_cache.lock); + } else { @@ -20361,6 +20467,11 @@ index 000000000000..61662750dfc0 + six_lock_write(&ck->c.lock, NULL, NULL); + + mutex_lock(&c->btree_key_cache.lock); ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ c->btree_key_cache.nr_dirty--; ++ } ++ + bkey_cached_evict(&c->btree_key_cache, ck); + bkey_cached_free(&c->btree_key_cache, ck); + mutex_unlock(&c->btree_key_cache.lock); @@ -20381,19 +20492,23 @@ index 000000000000..61662750dfc0 + struct bkey_cached_key key; + struct btree_trans trans; + ++ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); ++ + six_lock_read(&ck->c.lock, NULL, NULL); + key = ck->key; + + if (ck->journal.seq != seq || + 
!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_read(&ck->c.lock); -+ return; ++ goto unlock; + } + six_unlock_read(&ck->c.lock); + + bch2_trans_init(&trans, c, 0, 0); + btree_key_cache_flush_pos(&trans, key, seq, false); + bch2_trans_exit(&trans); ++unlock: ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); +} + +/* @@ -20406,7 +20521,7 @@ index 000000000000..61662750dfc0 + struct bkey_cached_key key = { id, pos }; + + /* Fastpath - assume it won't be found: */ -+ if (!btree_key_cache_find(c, id, pos)) ++ if (!bch2_btree_key_cache_find(c, id, pos)) + return 0; + + return btree_key_cache_flush_pos(trans, key, 0, true); @@ -20418,6 +20533,7 @@ index 000000000000..61662750dfc0 +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) iter->l[0].b; ++ bool kick_reclaim = false; + + BUG_ON(insert->u64s > ck->u64s); + @@ -20438,14 +20554,22 @@ index 000000000000..61662750dfc0 + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + mutex_lock(&c->btree_key_cache.lock); -+ list_del_init(&ck->list); ++ list_move(&ck->list, &c->btree_key_cache.dirty); + + set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ c->btree_key_cache.nr_dirty++; ++ ++ if (bch2_nr_btree_keys_need_flush(c)) ++ kick_reclaim = true; ++ + mutex_unlock(&c->btree_key_cache.lock); + } + + bch2_journal_pin_update(&c->journal, trans->journal_res.seq, + &ck->journal, btree_key_cache_journal_flush); ++ ++ if (kick_reclaim) ++ journal_reclaim_kick(&c->journal); + return true; +} + @@ -20453,24 +20577,110 @@ index 000000000000..61662750dfc0 +void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, + enum btree_id id, struct bpos pos) +{ -+ BUG_ON(btree_key_cache_find(trans->c, id, pos)); ++ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos)); +} +#endif + -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) ++static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) +{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ struct bkey_cached *ck, *t; ++ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; ++ unsigned flags; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ return -1; ++ ++ flags = memalloc_nofs_save(); ++ ++ /* ++ * Newest freed entries are at the end of the list - once we hit one ++ * that's too new to be freed, we can bail out: ++ */ ++ list_for_each_entry_safe(ck, t, &bc->freed, list) { ++ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, ++ ck->btree_trans_barrier_seq)) ++ break; ++ ++ list_del(&ck->list); ++ kmem_cache_free(bch2_key_cache, ck); ++ bc->nr_freed--; ++ scanned++; ++ freed++; ++ } ++ ++ if (scanned >= nr) ++ goto out; ++ ++ list_for_each_entry_safe(ck, t, &bc->clean, list) { ++ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) ++ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); ++ else if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(bc, ck); ++ bkey_cached_free(bc, ck); ++ } ++ ++ scanned++; ++ if (scanned >= nr) { ++ if (&t->list != &bc->clean) ++ list_move_tail(&bc->clean, &t->list); ++ goto out; ++ } ++ } ++out: ++ memalloc_nofs_restore(flags); ++ mutex_unlock(&bc->lock); ++ ++ return freed; ++} ++ ++static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_key_cache.shrink); ++ struct btree_key_cache *bc = 
&c->btree_key_cache;
++
++	return bc->nr_keys;
++}
++
++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
++{
++	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
 	struct bkey_cached *ck, *n;
 
-+	mutex_lock(&c->lock);
-+	list_for_each_entry_safe(ck, n, &c->clean, list) {
-+		kfree(ck->k);
-+		kfree(ck);
-+	}
-+	list_for_each_entry_safe(ck, n, &c->freed, list)
-+		kfree(ck);
-+	mutex_unlock(&c->lock);
++	if (bc->shrink.list.next)
++		unregister_shrinker(&bc->shrink);
 
-+	rhashtable_destroy(&c->table);
++	mutex_lock(&bc->lock);
++	list_splice(&bc->dirty, &bc->clean);
++
++	list_for_each_entry_safe(ck, n, &bc->clean, list) {
++		bch2_journal_pin_drop(&c->journal, &ck->journal);
++		bch2_journal_preres_put(&c->journal, &ck->res);
++
++		kfree(ck->k);
++		list_del(&ck->list);
++		kmem_cache_free(bch2_key_cache, ck);
++		bc->nr_keys--;
++	}
++
++	BUG_ON(bc->nr_dirty && !bch2_journal_error(&c->journal));
++	BUG_ON(bc->nr_keys);
++
++	list_for_each_entry_safe(ck, n, &bc->freed, list) {
++		list_del(&ck->list);
++		kmem_cache_free(bch2_key_cache, ck);
++	}
++	mutex_unlock(&bc->lock);
++
++	rhashtable_destroy(&bc->table);
 }
 
 void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
@@ -20478,45 +20688,70 @@ index 000000000000..61662750dfc0
 {
 	mutex_init(&c->lock);
 	INIT_LIST_HEAD(&c->freed);
 	INIT_LIST_HEAD(&c->clean);
++	INIT_LIST_HEAD(&c->dirty);
 }
 
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
 {
-+	return rhashtable_init(&c->table, &bch2_btree_key_cache_params);
++	c->shrink.seeks = 1;
++	c->shrink.count_objects = bch2_btree_key_cache_count;
++	c->shrink.scan_objects = bch2_btree_key_cache_scan;
++
++	return register_shrinker(&c->shrink) ?:
++		rhashtable_init(&c->table, &bch2_btree_key_cache_params);
 }
 
 void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
 {
-+	struct bucket_table *tbl;
-+	struct bkey_cached *ck;
-+	struct rhash_head *pos;
-+	size_t i;
++	pr_buf(out, "nr_freed:\t%zu\n", c->nr_freed);
++	pr_buf(out, "nr_keys:\t%zu\n", c->nr_keys);
++	pr_buf(out, "nr_dirty:\t%zu\n", c->nr_dirty);
++}
 
-+	mutex_lock(&c->lock);
-+	tbl = rht_dereference_rcu(c->table.tbl, &c->table);
++void bch2_btree_key_cache_exit(void)
++{
++	if (bch2_key_cache)
++		kmem_cache_destroy(bch2_key_cache);
++}
 
-+	for (i = 0; i < tbl->size; i++) {
-+		rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
-+			pr_buf(out, "%s:",
-+			       bch2_btree_ids[ck->key.btree_id]);
-+			bch2_bpos_to_text(out, ck->key.pos);
++int __init bch2_btree_key_cache_init(void)
++{
++	bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
++	if (!bch2_key_cache)
++		return -ENOMEM;
 
-+			if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
-+				pr_buf(out, " journal seq %llu", ck->journal.seq);
-+			pr_buf(out, "\n");
-+		}
-+	}
-+	mutex_unlock(&c->lock);
++	return 0;
 }
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
new file mode 100644
-index 000000000000..b1756c6c622c
+index 000000000000..d7d31a0662c3
--- /dev/null
+++ b/fs/bcachefs/btree_key_cache.h
-@@ -0,0 +1,25 @@
+@@ -0,0 +1,49 @@
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
+#define _BCACHEFS_BTREE_KEY_CACHE_H
+
++static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
++{
++	size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
++	size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
++	size_t max_dirty = 4096 + nr_keys / 2;
++
++	return max_t(ssize_t, 0, nr_dirty - max_dirty);
++}
++
++static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
++{
++	size_t nr_dirty = READ_ONCE(c->btree_key_cache.nr_dirty);
++	size_t nr_keys = READ_ONCE(c->btree_key_cache.nr_keys);
++	size_t max_dirty = 4096 + (nr_keys * 3) / 4;
++
++	return nr_dirty > max_dirty;
++}
++
++struct bkey_cached *
++bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
++
+int bch2_btree_iter_traverse_cached(struct btree_iter *);
+
+bool bch2_btree_insert_key_cached(struct btree_trans *,
@@ -20538,13 +20773,16 @@ index 000000000000..b1756c6c622c
+
+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
+
++void bch2_btree_key_cache_exit(void);
++int __init bch2_btree_key_cache_init(void);
++
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
new file mode 100644
-index 000000000000..81fbf3e18647
+index 000000000000..38323e32731f
--- /dev/null
+++ b/fs/bcachefs/btree_locking.h
-@@ -0,0 +1,257 @@
+@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_LOCKING_H
+#define _BCACHEFS_BTREE_LOCKING_H
@@ -20723,13 +20961,15 @@
+
+bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
+			    struct btree_iter *, enum six_lock_type,
-+			    six_lock_should_sleep_fn, void *);
++			    six_lock_should_sleep_fn, void *,
++			    unsigned long);
+
+static inline bool btree_node_lock(struct btree *b,
+				   struct bpos pos, unsigned level,
+				   struct btree_iter *iter,
+				   enum six_lock_type type,
-+				   six_lock_should_sleep_fn should_sleep_fn, void *p)
++				   six_lock_should_sleep_fn should_sleep_fn, void *p,
++				   unsigned long ip)
+{
+	struct btree_trans *trans = iter->trans;
+	bool ret;
@@ -20747,7 +20987,7 @@
+	ret = likely(six_trylock_type(&b->c.lock, type)) ||
+		btree_node_lock_increment(trans, b, level, type) ||
+		__bch2_btree_node_lock(b, pos, level, iter, type,
-+				should_sleep_fn, p);
++				should_sleep_fn, p, ip);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	trans->locking = NULL;
@@ -20804,10 +21044,10 @@
+
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
new file mode 100644
-index 000000000000..cc01baeec138
+index 000000000000..2d142ef601e1
--- /dev/null
+++ b/fs/bcachefs/btree_types.h
-@@ -0,0 +1,663 @@
+@@ -0,0 +1,670 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_TYPES_H
+#define _BCACHEFS_BTREE_TYPES_H
@@ -20940,10 +21180,6 @@
+
+	struct btree_write writes[2];
+
-+#ifdef CONFIG_BCACHEFS_DEBUG
-+	bool *expensive_debug_checks;
-+#endif
-+
+	/* Key/pointer for this btree node */
+	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
@@ -20972,6 +21208,7 @@
+	/* Number of elements in live + freeable lists */
+	unsigned used;
+	unsigned reserve;
++	atomic_t dirty;
+	struct shrinker shrink;
+
+	/*
@@ -21093,6 +21330,11 @@
+	return iter->flags & BTREE_ITER_TYPE;
+}
+
++static inline bool btree_iter_is_cached(const struct btree_iter *iter)
++{
++	return btree_iter_type(iter) == BTREE_ITER_CACHED;
++}
++
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
+{
+	return iter->l + iter->level;
@@ -21103,6 +21345,12 @@
+	struct rhashtable table;
+	struct list_head freed;
+	struct list_head clean;
++	struct list_head dirty;
++	struct shrinker shrink;
++
++	size_t nr_freed;
++	size_t nr_keys;
++	size_t nr_dirty;
+};
+
+struct bkey_cached_key {
+	struct bpos pos;
+}
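/*
 * Editorial sketch, not part of the patch: the two functions in the new
 * btree_key_cache.h above implement a soft and a hard watermark on dirty
 * cached keys. The constants (4096, 1/2, 3/4) are taken straight from
 * bch2_nr_btree_keys_need_flush() and bch2_btree_key_cache_must_wait();
 * the rest of this userspace model is illustrative only.
 */
#include <stdio.h>
#include <stddef.h>

static size_t need_flush(size_t nr_dirty, size_t nr_keys)
{
	size_t soft = 4096 + nr_keys / 2;	/* journal reclaim starts flushing here */

	return nr_dirty > soft ? nr_dirty - soft : 0;
}

static int must_wait(size_t nr_dirty, size_t nr_keys)
{
	size_t hard = 4096 + (nr_keys * 3) / 4;	/* inserts stall here */

	return nr_dirty > hard;
}

int main(void)
{
	/* with 100000 cached keys: flushing starts at 54096 dirty, stalls at 79096 */
	printf("excess=%zu stall=%d\n",
	       need_flush(60000, 100000), must_wait(60000, 100000));
	return 0;
}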
__attribute__((packed, aligned(4))); + -+#define BKEY_CACHED_DIRTY 0 ++#define BKEY_CACHED_ACCESSED 0 ++#define BKEY_CACHED_DIRTY 1 + +struct bkey_cached { + struct btree_bkey_cached_common c; @@ -21118,6 +21367,7 @@ index 000000000000..cc01baeec138 + unsigned long flags; + u8 u64s; + bool valid; ++ u32 btree_trans_barrier_seq; + struct bkey_cached_key key; + + struct rhash_head hash; @@ -21154,6 +21404,7 @@ index 000000000000..cc01baeec138 + pid_t pid; +#endif + unsigned long ip; ++ int srcu_idx; + + u64 iters_linked; + u64 iters_live; @@ -21190,10 +21441,6 @@ index 000000000000..cc01baeec138 + unsigned journal_u64s; + unsigned journal_preres_u64s; + struct replicas_delta_list *fs_usage_deltas; -+ -+ struct btree_iter iters_onstack[2]; -+ struct btree_insert_entry updates_onstack[2]; -+ struct btree_insert_entry updates2_onstack[2]; +}; + +#define BTREE_FLAG(flag) \ @@ -21224,7 +21471,6 @@ index 000000000000..cc01baeec138 + +BTREE_FLAG(read_in_flight); +BTREE_FLAG(read_error); -+BTREE_FLAG(dirty); +BTREE_FLAG(need_write); +BTREE_FLAG(noevict); +BTREE_FLAG(write_idx); @@ -21453,6 +21699,7 @@ index 000000000000..cc01baeec138 + BTREE_INSERT_ENOSPC, + BTREE_INSERT_NEED_MARK_REPLICAS, + BTREE_INSERT_NEED_JOURNAL_RES, ++ BTREE_INSERT_NEED_JOURNAL_RECLAIM, +}; + +enum btree_gc_coalesce_fail_reason { @@ -21473,7 +21720,7 @@ index 000000000000..cc01baeec138 +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000000..e0b1bde37484 +index 000000000000..adb07043cbb3 --- /dev/null +++ b/fs/bcachefs/btree_update.h @@ -0,0 +1,144 @@ @@ -21546,8 +21793,8 @@ index 000000000000..e0b1bde37484 +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, u64 *, int flags); + -+int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, -+ struct bpos, u64 *); ++int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, ++ struct bpos, struct bpos, u64 *); +int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, u64 *); + @@ -21623,10 +21870,10 @@ index 000000000000..e0b1bde37484 +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000000..a2604b0ce2d8 +index 000000000000..5143896e1b29 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2075 @@ +@@ -0,0 +1,2105 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -21640,6 +21887,7 @@ index 000000000000..a2604b0ce2d8 +#include "btree_iter.h" +#include "btree_locking.h" +#include "buckets.h" ++#include "error.h" +#include "extents.h" +#include "journal.h" +#include "journal_reclaim.h" @@ -21677,12 +21925,27 @@ index 000000000000..a2604b0ce2d8 + break; + bp = bkey_s_c_to_btree_ptr_v2(k); + -+ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); ++ if (bkey_cmp(next_node, bp.v->min_key)) { ++ bch2_dump_btree_node(c, b); ++ panic("expected next min_key %llu:%llu got %llu:%llu\n", ++ next_node.inode, ++ next_node.offset, ++ bp.v->min_key.inode, ++ bp.v->min_key.offset); ++ } + + bch2_btree_node_iter_advance(&iter, b); + + if (bch2_btree_node_iter_end(&iter)) { -+ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); ++ ++ if (bkey_cmp(k.k->p, b->key.k.p)) { ++ bch2_dump_btree_node(c, b); ++ panic("expected end %llu:%llu got %llu:%llu\n", ++ b->key.k.p.inode, ++ b->key.k.p.offset, ++ k.k->p.inode, ++ k.k->p.offset); ++ } + break; + } + @@ -21778,7 +22041,7 @@ index 
000000000000..a2604b0ce2d8 + + b->ob.nr = 0; + -+ clear_btree_node_dirty(b); ++ clear_btree_node_dirty(c, b); + + btree_node_lock_type(c, b, SIX_LOCK_write); + __btree_node_free(c, b); @@ -21893,7 +22156,7 @@ index 000000000000..a2604b0ce2d8 + b = as->prealloc_nodes[--as->nr_prealloc_nodes]; + + set_btree_node_accessed(b); -+ set_btree_node_dirty(b); ++ set_btree_node_dirty(c, b); + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); @@ -22152,6 +22415,7 @@ index 000000000000..a2604b0ce2d8 +{ + struct bch_fs *c = as->c; + struct btree *b = as->b; ++ struct btree_trans trans; + u64 journal_seq = 0; + unsigned i; + int ret; @@ -22169,14 +22433,16 @@ index 000000000000..a2604b0ce2d8 + * journal reclaim does btree updates when flushing bkey_cached entries, + * which may require allocations as well. + */ -+ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED, -+ btree_update_nodes_written_trans(&trans, as)); ++ bch2_trans_init(&trans, c, 0, 512); ++ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ BTREE_INSERT_JOURNAL_RESERVED, ++ btree_update_nodes_written_trans(&trans, as)); ++ bch2_trans_exit(&trans); + BUG_ON(ret && !bch2_journal_error(&c->journal)); + + if (b) { @@ -22456,7 +22722,7 @@ index 000000000000..a2604b0ce2d8 + closure_wake_up(&c->btree_interior_update_wait); + } + -+ clear_btree_node_dirty(b); ++ clear_btree_node_dirty(c, b); + clear_btree_node_need_write(b); + + /* @@ -22647,7 +22913,19 @@ index 000000000000..a2604b0ce2d8 + struct bkey_i *insert, + struct btree_node_iter *node_iter) +{ ++ struct bch_fs *c = as->c; + struct bkey_packed *k; ++ const char *invalid; ++ ++ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert)); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert)); ++ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid); ++ dump_stack(); ++ } + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); @@ -22663,7 +22941,7 @@ index 000000000000..a2604b0ce2d8 + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(iter, b, node_iter, insert); -+ set_btree_node_dirty(b); ++ set_btree_node_dirty(c, b); + set_btree_node_need_write(b); +} + @@ -22942,7 +23220,7 @@ index 000000000000..a2604b0ce2d8 + * the node the iterator points to: + */ + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && -+ (bkey_cmp_packed(b, k, &insert->k) >= 0)) ++ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) + ; + + for_each_keylist_key(keys, insert) @@ -22982,9 +23260,6 @@ index 000000000000..a2604b0ce2d8 + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + -+ if (as->must_rewrite) -+ goto split; -+ + bch2_btree_node_lock_for_insert(c, b, iter); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { @@ -22992,6 +23267,8 @@ index 000000000000..a2604b0ce2d8 + goto split; + } + ++ btree_node_interior_verify(c, b); ++ + bch2_btree_insert_keys_interior(as, b, iter, keys); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; @@ -23704,10 +23981,10 @@ index 000000000000..a2604b0ce2d8 +} diff --git 
a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 -index 000000000000..7668225e72c6 +index 000000000000..45d212730fd7 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,331 @@ +@@ -0,0 +1,333 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H @@ -23757,7 +24034,6 @@ index 000000000000..7668225e72c6 + BTREE_INTERIOR_UPDATING_AS, + } mode; + -+ unsigned must_rewrite:1; + unsigned nodes_written:1; + + enum btree_id btree_id; @@ -23947,6 +24223,9 @@ index 000000000000..7668225e72c6 + b->whiteout_u64s; + ssize_t total = c->opts.btree_node_size << 6; + ++ /* Always leave one extra u64 for bch2_varint_decode: */ ++ used++; ++ + return total - used; +} + @@ -24041,10 +24320,10 @@ index 000000000000..7668225e72c6 +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000000..852cece6d4a5 +index 000000000000..bbc6d5124275 --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1179 @@ +@@ -0,0 +1,1181 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -24119,7 +24398,7 @@ index 000000000000..852cece6d4a5 + EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); + + k = bch2_btree_node_iter_peek_all(node_iter, b); -+ if (k && bkey_cmp_packed(b, k, &insert->k)) ++ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ @@ -24238,7 +24517,7 @@ index 000000000000..852cece6d4a5 + bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); + + if (unlikely(!btree_node_dirty(b))) -+ set_btree_node_dirty(b); ++ set_btree_node_dirty(c, b); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; @@ -24267,7 +24546,7 @@ index 000000000000..852cece6d4a5 + struct bch_fs *c = trans->c; + + BUG_ON(bkey_cmp(insert->k.p, iter->pos)); -+ BUG_ON(debug_check_bkeys(c) && ++ BUG_ON(bch2_debug_check_bkeys && + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + __btree_node_type(iter->level, iter->btree_id))); +} @@ -24333,6 +24612,10 @@ index 000000000000..852cece6d4a5 + + BUG_ON(iter->level); + ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && ++ bch2_btree_key_cache_must_wait(trans->c)) ++ return BTREE_INSERT_NEED_JOURNAL_RECLAIM; ++ + if (u64s <= ck->u64s) + return BTREE_INSERT_OK; + @@ -24487,10 +24770,10 @@ index 000000000000..852cece6d4a5 + */ + + if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { -+ if (journal_seq_verify(c)) ++ if (bch2_journal_seq_verify) + trans_for_each_update2(trans, i) + i->k->k.version.lo = trans->journal_res.seq; -+ else if (inject_invalid_keys(c)) ++ else if (bch2_inject_invalid_keys) + trans_for_each_update2(trans, i) + i->k->k.version = MAX_VERSION; + } @@ -24689,22 +24972,26 @@ index 000000000000..852cece6d4a5 + trace_trans_restart_journal_res_get(trans->ip); + ret = -EINTR; + break; ++ case BTREE_INSERT_NEED_JOURNAL_RECLAIM: ++ bch2_trans_unlock(trans); ++ ++ while (bch2_btree_key_cache_must_wait(c)) { ++ mutex_lock(&c->journal.reclaim_lock); ++ bch2_journal_reclaim(&c->journal); ++ mutex_unlock(&c->journal.reclaim_lock); ++ } ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_journal_reclaim(trans->ip); ++ ret = -EINTR; ++ break; + default: + BUG_ON(ret >= 0); + break; + } + -+ if (ret == -EINTR) { -+ int ret2 = bch2_btree_iter_traverse_all(trans); -+ -+ if (ret2) { -+ 
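/*
 * Editorial sketch of the BTREE_INSERT_NEED_JOURNAL_RECLAIM case added just
 * above: the commit path may never block on journal reclaim while holding
 * node locks, so it drops them, does the blocking work, and then either
 * resumes (relock succeeded) or restarts with -EINTR. All names below are
 * invented stubs; only the control flow mirrors the patch.
 */
#include <stdio.h>

enum { OK = 0, RESTART = -4 };			/* -4 standing in for -EINTR */

static int too_many_dirty = 1;

static void unlock(void)  { printf("unlock\n"); }
static void reclaim(void) { printf("reclaim\n"); too_many_dirty = 0; }
static int  relock(void)  { return 1; }		/* may fail if state changed */

static int handle_need_reclaim(void)
{
	unlock();				/* never block holding node locks */

	while (too_many_dirty)
		reclaim();			/* flush cached keys to the btree */

	if (relock())
		return OK;			/* locks reacquired: retry commit */

	return RESTART;				/* lost our locks: restart trans */
}

int main(void)
{
	printf("-> %d\n", handle_need_reclaim());
	return 0;
}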
trace_trans_restart_traverse(trans->ip); -+ return ret2; -+ } -+ -+ trace_trans_restart_atomic(trans->ip); -+ } -+ + return ret; +} + @@ -25123,13 +25410,32 @@ index 000000000000..852cece6d4a5 + __bch2_btree_insert(&trans, id, k)); +} + -+int bch2_btree_delete_at_range(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end, -+ u64 *journal_seq) ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) +{ ++ struct bkey_i k; ++ ++ bkey_init(&k.k); ++ k.k.p = iter->pos; ++ ++ bch2_trans_update(trans, iter, &k, 0); ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE|flags); ++} ++ ++int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ++ struct bpos start, struct bpos end, ++ u64 *journal_seq) ++{ ++ struct btree_iter *iter; + struct bkey_s_c k; + int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; +retry: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && @@ -25141,6 +25447,10 @@ index 000000000000..852cece6d4a5 + bkey_init(&delete.k); + + /* ++ * This could probably be more efficient for extents: ++ */ ++ ++ /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos @@ -25179,22 +25489,8 @@ index 000000000000..852cece6d4a5 + goto retry; + } + ++ bch2_trans_iter_put(trans, iter); + return ret; -+ -+} -+ -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned flags) -+{ -+ struct bkey_i k; -+ -+ bkey_init(&k.k); -+ k.k.p = iter->pos; -+ -+ bch2_trans_update(trans, iter, &k, 0); -+ return bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE|flags); +} + +/* @@ -25206,30 +25502,15 @@ index 000000000000..852cece6d4a5 + struct bpos start, struct bpos end, + u64 *journal_seq) +{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret = 0; -+ -+ /* -+ * XXX: whether we need mem/more iters depends on whether this btree id -+ * has triggers -+ */ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); -+ -+ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ BUG_ON(ret == -EINTR); -+ return ret; ++ return bch2_trans_do(c, NULL, journal_seq, 0, ++ bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq)); +} diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000000..82f1cc4ca693 +index 000000000000..f7bdb14372f8 --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2257 @@ +@@ -0,0 +1,2247 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. 
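/*
 * Editorial sketch, not part of the patch: the control flow of the new
 * bch2_btree_delete_range_trans() above, reduced to a userspace model - walk
 * the keys in [start, end), replace each with a deletion, commit as you go,
 * and jump back to the top on a transient restart (-EINTR in the patch).
 * All names here are illustrative.
 */
#include <stdio.h>

static int commit_delete(int key)
{
	static int transient_failures = 1;

	if (key == 3 && transient_failures-- > 0)
		return -1;			/* pretend we raced and must retry */
	printf("deleted %d\n", key);
	return 0;
}

static void delete_range(int start, int end)
{
	int pos = start;
retry:
	while (pos < end)
		if (commit_delete(pos))
			goto retry;		/* restart without advancing */
		else
			pos++;
}

int main(void)
{
	delete_range(0, 5);
	return 0;
}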
@@ -25555,7 +25836,7 @@ index 000000000000..82f1cc4ca693 + +static u64 avail_factor(u64 r) +{ -+ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); ++ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); +} + +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) @@ -27276,16 +27557,6 @@ index 000000000000..82f1cc4ca693 + return avail_factor(__bch2_fs_usage_read_short(c).free); +} + -+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) -+{ -+ percpu_down_read(&c->mark_lock); -+ this_cpu_sub(c->usage[0]->online_reserved, -+ res->sectors); -+ percpu_up_read(&c->mark_lock); -+ -+ res->sectors = 0; -+} -+ +#define SECTORS_CACHE 1024 + +int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, @@ -27489,10 +27760,10 @@ index 000000000000..82f1cc4ca693 +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000000..a3873becbb70 +index 000000000000..856dc5a8c8a3 --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,318 @@ +@@ -0,0 +1,316 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. @@ -27767,13 +28038,11 @@ index 000000000000..a3873becbb70 + +/* disk reservations: */ + -+void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); -+ +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ -+ if (res->sectors) -+ __bch2_disk_reservation_put(c, res); ++ this_cpu_sub(c->usage[0]->online_reserved, res->sectors); ++ res->sectors = 0; +} + +#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) @@ -27956,10 +28225,10 @@ index 000000000000..d6057d22b18e +#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 -index 000000000000..0377f9018d27 +index 000000000000..e7c8969aaad1 --- /dev/null +++ b/fs/bcachefs/chardev.c -@@ -0,0 +1,704 @@ +@@ -0,0 +1,728 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_CHARDEV + @@ -27967,6 +28236,7 @@ index 000000000000..0377f9018d27 +#include "bcachefs_ioctl.h" +#include "buckets.h" +#include "chardev.h" ++#include "journal.h" +#include "move.h" +#include "replicas.h" +#include "super.h" @@ -28302,7 +28572,8 @@ index 000000000000..0377f9018d27 + ctx->c = c; + ctx->arg = arg; + -+ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); ++ ctx->thread = kthread_create(bch2_data_thread, ctx, ++ "bch-data/%s", c->name); + if (IS_ERR(ctx->thread)) { + ret = PTR_ERR(ctx->thread); + goto err; @@ -28525,6 +28796,26 @@ index 000000000000..0377f9018d27 + return ret; +} + ++static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, ++ struct bch_ioctl_disk_resize_journal arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ +#define BCH_IOCTL(_name, _argtype) \ +do { \ + _argtype i; \ @@ -28581,6 +28872,8 @@ index 000000000000..0377f9018d27 + BCH_IOCTL(data, struct bch_ioctl_data); + case BCH_IOCTL_DISK_RESIZE: + BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ case BCH_IOCTL_DISK_RESIZE_JOURNAL: ++ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); + + default: + return -ENOTTY; @@ -29819,7 +30112,7 @@ index 000000000000..92c740a47565 +#endif /* _BCACHEFS_CLOCK_TYPES_H 
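/*
 * Editorial note on the avail_factor() change at the top of this hunk: on
 * 32-bit kernels a plain 64-bit division compiles to a libgcc helper call
 * (__udivdi3) that the kernel does not provide, so div_u64() must be used
 * instead. Userspace model below; RESERVE_FACTOR = 6 is assumed for
 * illustration, and div_u64() is a stand-in for the kernel helper.
 */
#include <stdint.h>
#include <stdio.h>

#define RESERVE_FACTOR	6

static uint64_t div_u64(uint64_t dividend, uint32_t divisor)
{
	return dividend / divisor;	/* the kernel version avoids __udivdi3 */
}

static uint64_t avail_factor(uint64_t r)
{
	/* report 64/65ths of the space: r * 2^6 / (2^6 + 1) */
	return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
}

int main(void)
{
	printf("%llu\n", (unsigned long long) avail_factor(65 * 100)); /* 6400 */
	return 0;
}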
*/ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000000..b50d2b0d5fd3 +index 000000000000..aebf46bb1d21 --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,629 @@ @@ -29895,7 +30188,7 @@ index 000000000000..b50d2b0d5fd3 + + BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); + -+ if (!IS_ENABLED(CONFIG_HIGHMEM) && ++ if (!PageHighMem(bio_iter_page(bio, start)) && + bio_phys_contig(bio, start)) + return (struct bbuf) { + .b = page_address(bio_iter_page(bio, start)) + @@ -30478,7 +30771,7 @@ index 000000000000..4bab1f61b3b5 +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 -index 000000000000..aa10591a3b1a +index 000000000000..bbe3fefa2651 --- /dev/null +++ b/fs/bcachefs/debug.c @@ -0,0 +1,432 @@ @@ -30538,7 +30831,7 @@ index 000000000000..aa10591a3b1a + v->written = 0; + v->c.level = b->c.level; + v->c.btree_id = b->c.btree_id; -+ bch2_btree_keys_init(v, &c->expensive_debug_checks); ++ bch2_btree_keys_init(v); + + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick) <= 0) @@ -30916,10 +31209,10 @@ index 000000000000..aa10591a3b1a +} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h new file mode 100644 -index 000000000000..56c2d1ab5f63 +index 000000000000..7ac1615e9447 --- /dev/null +++ b/fs/bcachefs/debug.h -@@ -0,0 +1,63 @@ +@@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DEBUG_H +#define _BCACHEFS_DEBUG_H @@ -30930,44 +31223,15 @@ index 000000000000..56c2d1ab5f63 +struct btree; +struct bch_fs; + -+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_ALWAYS() -+#undef BCH_DEBUG_PARAM -+ +#ifdef CONFIG_BCACHEFS_DEBUG -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ +void __bch2_btree_verify(struct bch_fs *, struct btree *); -+ -+#define bypass_torture_test(d) ((d)->bypass_torture_test) -+ -+#else /* DEBUG */ -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) { return false; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ ++#else +static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} -+ -+#define bypass_torture_test(d) 0 -+ +#endif + +static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ -+ if (verify_btree_ondisk(c)) ++ if (bch2_verify_btree_ondisk) + __bch2_btree_verify(c, b); +} + @@ -32034,7 +32298,7 @@ index 000000000000..3d84f23c34ed +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000000..e4a4805ef218 +index 000000000000..d7ba0e7fc3b3 --- /dev/null +++ b/fs/bcachefs/ec.c @@ -0,0 +1,1661 @@ @@ -33626,7 +33890,7 @@ index 000000000000..e4a4805ef218 + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); -+ for (i = 0; i < min(h->used, 20UL); i++) { ++ for (i = 0; i < min_t(size_t, h->used, 20); i++) { + m = genradix_ptr(&c->stripes[0], h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, @@ -34575,7 +34839,7 @@ index 000000000000..38dc084627d2 +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 
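/*
 * Editorial sketch of the pattern the reworked debug.h above relies on:
 * in non-debug builds each BCH_DEBUG_PARAM becomes a static const false, so
 * every "if (bch2_foo)" branch is deleted by the compiler with no #ifdef at
 * the call sites; in debug builds it is a runtime-tunable global. Minimal
 * standalone version (DEBUG_BUILD is an invented macro name):
 */
#include <stdio.h>

#ifdef DEBUG_BUILD
int check_everything = 1;		/* runtime-tunable in debug builds */
#else
static const int check_everything = 0;	/* constant: branch compiles away */
#endif

static void expensive_check(void)
{
	puts("running expensive consistency checks");
}

int main(void)
{
	if (check_everything)		/* dead code unless DEBUG_BUILD */
		expensive_check();
	puts("fast path done");
	return 0;
}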
000000000000..88297b30f622 +index 000000000000..7fae6a4ba26f --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,1260 @@ @@ -34670,7 +34934,7 @@ index 000000000000..88297b30f622 + return bch2_rand_range(l1 + l2) > l1; + } + -+ if (force_reconstruct_read(c)) ++ if (bch2_force_reconstruct_read) + return p1.idx > p2.idx; + + return p1.idx < p2.idx; @@ -34718,7 +34982,7 @@ index 000000000000..88297b30f622 + !bch2_dev_is_readable(ca)) + p.idx++; + -+ if (force_reconstruct_read(c) && ++ if (bch2_force_reconstruct_read && + !p.idx && p.has_ec) + p.idx++; + @@ -36946,10 +37210,10 @@ index 000000000000..cdb272708a4b +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 -index 000000000000..878419d40992 +index 000000000000..503ce1920f39 --- /dev/null +++ b/fs/bcachefs/fs-common.c -@@ -0,0 +1,317 @@ +@@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -36986,9 +37250,7 @@ index 000000000000..878419d40992 + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + -+ ret = bch2_inode_create(trans, new_inode, -+ BLOCKDEV_INODE_MAX, 0, -+ &c->unused_inode_hint); ++ ret = bch2_inode_create(trans, new_inode); + if (ret) + goto err; + @@ -37312,10 +37574,10 @@ index 000000000000..2273b7961c9b +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000000..3aed2ca4dced +index 000000000000..389f23ee6f91 --- /dev/null +++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3141 @@ +@@ -0,0 +1,3135 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -37353,6 +37615,22 @@ index 000000000000..3aed2ca4dced +#include +#include + ++static inline struct address_space *faults_disabled_mapping(void) ++{ ++ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); ++} ++ ++static inline void set_fdm_dropped_locks(void) ++{ ++ current->faults_disabled_mapping = ++ (void *) (((unsigned long) current->faults_disabled_mapping)|1); ++} ++ ++static inline bool fdm_dropped_locks(void) ++{ ++ return ((unsigned long) current->faults_disabled_mapping) & 1; ++} ++ +struct quota_res { + u64 sectors; +}; @@ -37583,28 +37861,13 @@ index 000000000000..3aed2ca4dced +/* for newly allocated pages: */ +static void __bch2_page_state_release(struct page *page) +{ -+ struct bch_page_state *s = __bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); ++ kfree(detach_page_private(page)); +} + +static void bch2_page_state_release(struct page *page) +{ -+ struct bch_page_state *s = bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); ++ EBUG_ON(!PageLocked(page)); ++ __bch2_page_state_release(page); +} + +/* for newly allocated pages: */ @@ -37618,13 +37881,7 @@ index 000000000000..3aed2ca4dced + return NULL; + + spin_lock_init(&s->lock); -+ /* -+ * migrate_page_move_mapping() assumes that pages with private data -+ * have their count elevated by 1. 
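/*
 * Editorial sketch, not part of the patch: the faults_disabled_mapping() /
 * set_fdm_dropped_locks() helpers added near the top of fs-io.c in this hunk
 * use the classic tagged-pointer trick - the pointed-to object is at least
 * 2-byte aligned, so bit 0 of the pointer is free to carry a flag and is
 * masked off to recover the address. Userspace model:
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void *tag_set(void *p)   { return (void *) ((uintptr_t) p | 1UL); }
static int   tag_test(void *p)  { return (uintptr_t) p & 1UL; }
static void *tag_clear(void *p) { return (void *) ((uintptr_t) p & ~1UL); }

int main(void)
{
	static int object;		/* aligned, so bit 0 of &object is 0 */
	void *p = tag_set(&object);

	assert(tag_test(p));
	assert(tag_clear(p) == (void *) &object);
	printf("flag=%d ptr=%p\n", tag_test(p), tag_clear(p));
	return 0;
}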
-+ */ -+ get_page(page); -+ set_page_private(page, (unsigned long) s); -+ SetPagePrivate(page); ++ attach_page_private(page, s); + return s; +} + @@ -37832,10 +38089,35 @@ index 000000000000..3aed2ca4dced +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; ++ struct address_space *mapping = file->f_mapping; ++ struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); + int ret; + ++ if (fdm == mapping) ++ return VM_FAULT_SIGBUS; ++ ++ /* Lock ordering: */ ++ if (fdm > mapping) { ++ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); ++ ++ if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) ++ goto got_lock; ++ ++ bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); ++ ++ /* Signal that lock has been dropped: */ ++ set_fdm_dropped_locks(); ++ return VM_FAULT_SIGBUS; ++ } ++ + bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++got_lock: + ret = filemap_fault(vmf); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + @@ -37926,14 +38208,8 @@ index 000000000000..3aed2ca4dced + if (ret != MIGRATEPAGE_SUCCESS) + return ret; + -+ if (PagePrivate(page)) { -+ ClearPagePrivate(page); -+ get_page(newpage); -+ set_page_private(newpage, page_private(page)); -+ set_page_private(page, 0); -+ put_page(page); -+ SetPagePrivate(newpage); -+ } ++ if (PagePrivate(page)) ++ attach_page_private(newpage, detach_page_private(page)); + + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); @@ -37965,41 +38241,33 @@ index 000000000000..3aed2ca4dced + bio_put(bio); +} + -+static inline void page_state_init_for_read(struct page *page) -+{ -+ SetPagePrivate(page); -+ page->private = 0; -+} -+ +struct readpages_iter { + struct address_space *mapping; + struct page **pages; + unsigned nr_pages; -+ unsigned nr_added; + unsigned idx; + pgoff_t offset; +}; + +static int readpages_iter_init(struct readpages_iter *iter, -+ struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) ++ struct readahead_control *ractl) +{ ++ unsigned i, nr_pages = readahead_count(ractl); ++ + memset(iter, 0, sizeof(*iter)); + -+ iter->mapping = mapping; -+ iter->offset = list_last_entry(pages, struct page, lru)->index; ++ iter->mapping = ractl->mapping; ++ iter->offset = readahead_index(ractl); ++ iter->nr_pages = nr_pages; + + iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!iter->pages) + return -ENOMEM; + -+ while (!list_empty(pages)) { -+ struct page *page = list_last_entry(pages, struct page, lru); -+ -+ __bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ iter->pages[iter->nr_pages++] = page; -+ list_del(&page->lru); ++ __readahead_batch(ractl, iter->pages, nr_pages); ++ for (i = 0; i < nr_pages; i++) { ++ __bch2_page_state_create(iter->pages[i], __GFP_NOFAIL); ++ put_page(iter->pages[i]); + } + + return 0; @@ -38007,41 +38275,9 @@ index 000000000000..3aed2ca4dced + +static inline struct page *readpage_iter_next(struct readpages_iter *iter) +{ -+ struct page *page; -+ unsigned i; -+ int ret; ++ if (iter->idx >= iter->nr_pages) ++ return NULL; + -+ BUG_ON(iter->idx > iter->nr_added); -+ BUG_ON(iter->nr_added > iter->nr_pages); -+ -+ if (iter->idx < iter->nr_added) -+ goto out; -+ -+ while (1) { -+ if (iter->idx == iter->nr_pages) -+ return NULL; -+ -+ ret = add_to_page_cache_lru_vec(iter->mapping, 
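/*
 * Editorial sketch of the idea behind the "if (fdm > mapping)" comparison in
 * bch2_page_fault() above: when one thread can take two objects' locks in
 * either order, picking a single global order (here, by address) rules out
 * ABBA deadlock. Minimal pthread model; lock_pair() is an invented name.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if ((uintptr_t) a > (uintptr_t) b) {	/* lower address always first */
		pthread_mutex_t *t = a;
		a = b;
		b = t;
	}
	pthread_mutex_lock(a);
	pthread_mutex_lock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	lock_pair(&m1, &m2);
	pthread_mutex_unlock(&m1);
	pthread_mutex_unlock(&m2);

	lock_pair(&m2, &m1);		/* same lock order despite swapped args */
	pthread_mutex_unlock(&m1);
	pthread_mutex_unlock(&m2);

	puts("ordered");
	return 0;
}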
-+ iter->pages + iter->nr_added, -+ iter->nr_pages - iter->nr_added, -+ iter->offset + iter->nr_added, -+ GFP_NOFS); -+ if (ret > 0) -+ break; -+ -+ page = iter->pages[iter->nr_added]; -+ iter->idx++; -+ iter->nr_added++; -+ -+ __bch2_page_state_release(page); -+ put_page(page); -+ } -+ -+ iter->nr_added += ret; -+ -+ for (i = iter->idx; i < iter->nr_added; i++) -+ put_page(iter->pages[i]); -+out: + EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); + + return iter->pages[iter->idx]; @@ -38207,10 +38443,9 @@ index 000000000000..3aed2ca4dced + bkey_on_stack_exit(&sk, c); +} + -+int bch2_readpages(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) ++void bch2_readahead(struct readahead_control *ractl) +{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts = io_opts(c, &inode->ei_inode); + struct btree_trans trans; @@ -38219,7 +38454,7 @@ index 000000000000..3aed2ca4dced + struct readpages_iter readpages_iter; + int ret; + -+ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); ++ ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); @@ -38254,8 +38489,6 @@ index 000000000000..3aed2ca4dced + + bch2_trans_exit(&trans); + kfree(readpages_iter.pages); -+ -+ return 0; +} + +static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, @@ -39130,14 +39363,16 @@ index 000000000000..3aed2ca4dced + struct bio *bio = &dio->op.wbio.bio; + struct bvec_iter_all iter; + struct bio_vec *bv; -+ unsigned unaligned; -+ bool sync = dio->sync; ++ unsigned unaligned, iter_count; ++ bool sync = dio->sync, dropped_locks; + long ret; + + if (dio->loop) + goto loop; + + while (1) { ++ iter_count = dio->iter.count; ++ + if (kthread) + kthread_use_mm(dio->mm); + BUG_ON(current->faults_disabled_mapping); @@ -39145,13 +39380,34 @@ index 000000000000..3aed2ca4dced + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + ++ dropped_locks = fdm_dropped_locks(); ++ + current->faults_disabled_mapping = NULL; + if (kthread) + kthread_unuse_mm(dio->mm); + ++ /* ++ * If the fault handler returned an error but also signalled ++ * that it dropped & retook ei_pagecache_lock, we just need to ++ * re-shoot down the page cache and retry: ++ */ ++ if (dropped_locks && ret) ++ ret = 0; ++ + if (unlikely(ret < 0)) + goto err; + ++ if (unlikely(dropped_locks)) { ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter_count - 1); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (!bio->bi_iter.bi_size) ++ continue; ++ } ++ + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); @@ -40459,10 +40715,10 @@ index 000000000000..3aed2ca4dced +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 -index 000000000000..7063556d289b +index 000000000000..2537a3d25ede --- /dev/null +++ b/fs/bcachefs/fs-io.h -@@ -0,0 +1,57 @@ +@@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_H +#define _BCACHEFS_FS_IO_H @@ -40484,8 +40740,7 @@ index 000000000000..7063556d289b +int bch2_readpage(struct file *, struct page *); + +int bch2_writepages(struct address_space *, struct writeback_control *); -+int bch2_readpages(struct file *, struct address_space *, -+ struct list_head *, unsigned); 
++void bch2_readahead(struct readahead_control *); + +int bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **); @@ -40927,10 +41182,10 @@ index 000000000000..f201980ef2c3 +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000000..1d66acaca33c +index 000000000000..f3f6fe6c776a --- /dev/null +++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1628 @@ +@@ -0,0 +1,1658 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -40975,6 +41230,11 @@ index 000000000000..1d66acaca33c + struct bch_inode_info *dst, + u64 journal_seq) +{ ++ /* ++ * atomic64_cmpxchg has a fallback for archs that don't support it, ++ * cmpxchg does not: ++ */ ++ atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; + u64 old, v = READ_ONCE(dst->ei_journal_seq); + + do { @@ -40982,7 +41242,7 @@ index 000000000000..1d66acaca33c + + if (old >= journal_seq) + break; -+ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); ++ } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); + + bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); +} @@ -41019,6 +41279,11 @@ index 000000000000..1d66acaca33c + __pagecache_lock_put(lock, 1); +} + ++bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) ++{ ++ return __pagecache_lock_tryget(lock, 1); ++} ++ +void bch2_pagecache_add_get(struct pagecache_lock *lock) +{ + __pagecache_lock_get(lock, 1); @@ -41158,6 +41423,13 @@ index 000000000000..1d66acaca33c + return &inode->v; +} + ++static int inum_test(struct inode *inode, void *p) ++{ ++ unsigned long *ino = p; ++ ++ return *ino == inode->i_ino; ++} ++ +static struct bch_inode_info * +__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, bool tmpfile) @@ -41192,7 +41464,8 @@ index 000000000000..1d66acaca33c + if (!tmpfile) + mutex_lock(&dir->ei_update_lock); + -+ bch2_trans_init(&trans, c, 8, 1024); ++ bch2_trans_init(&trans, c, 8, ++ 2048 + (!tmpfile ? 
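/*
 * Editorial sketch, not part of the patch: journal_seq_copy() in the fs.c
 * hunk below advances a shared sequence number only forward, losing no
 * concurrent update - the reason for atomic64_cmpxchg is spelled out in the
 * comment the patch adds (it has a generic fallback on architectures without
 * native 64-bit cmpxchg). C11 rendering of the same compare-exchange loop:
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static void seq_advance(_Atomic uint64_t *seq, uint64_t new_seq)
{
	uint64_t old = atomic_load(seq);

	do {
		if (old >= new_seq)	/* someone already got further: done */
			return;
	} while (!atomic_compare_exchange_weak(seq, &old, new_seq));
}

int main(void)
{
	_Atomic uint64_t seq = 10;

	seq_advance(&seq, 15);
	seq_advance(&seq, 12);		/* no-op: never moves backwards */
	printf("%llu\n", (unsigned long long) atomic_load(&seq));
	return 0;
}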
dentry->d_name.len : 0)); +retry: + bch2_trans_begin(&trans); + @@ -41237,8 +41510,12 @@ index 000000000000..1d66acaca33c + * thread pulling the inode in and modifying it: + */ + -+ old = to_bch_ei(insert_inode_locked2(&inode->v)); -+ if (unlikely(old)) { ++ inode->v.i_state |= I_CREATING; ++ old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, ++ inum_test, NULL, &inode->v.i_ino)); ++ BUG_ON(!old); ++ ++ if (unlikely(old != inode)) { + /* + * We raced, another process pulled the new inode into cache + * before us: @@ -41740,7 +42017,7 @@ index 000000000000..1d66acaca33c + struct fiemap_extent_info *info, + struct bkey_s_c k, unsigned flags) +{ -+ if (bkey_extent_is_data(k.k)) { ++ if (bkey_extent_is_direct_data(k.k)) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -41771,6 +42048,12 @@ index 000000000000..1d66acaca33c + } + + return 0; ++ } else if (bkey_extent_is_inline_data(k.k)) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DATA_INLINE); + } else if (k.k->type == KEY_TYPE_reservation) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, @@ -41824,9 +42107,7 @@ index 000000000000..1d66acaca33c + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + -+ bkey_on_stack_realloc(&cur, c, k.k->u64s); -+ bkey_on_stack_realloc(&prev, c, k.k->u64s); -+ bkey_reassemble(cur.k, k); ++ bkey_on_stack_reassemble(&cur, c, k); + + ret = bch2_read_indirect_extent(&trans, + &offset_into_extent, &cur); @@ -41834,14 +42115,14 @@ index 000000000000..1d66acaca33c + break; + + k = bkey_i_to_s_c(cur.k); ++ bkey_on_stack_realloc(&prev, c, k.k->u64s); + + sectors = min(sectors, k.k->size - offset_into_extent); + -+ if (offset_into_extent) -+ bch2_cut_front(POS(k.k->p.inode, -+ bkey_start_offset(k.k) + -+ offset_into_extent), -+ cur.k); ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ cur.k); + bch2_key_resize(&cur.k->k, sectors); + cur.k->k.p = iter->pos; + cur.k->k.p.offset += cur.k->k.size; @@ -41856,10 +42137,8 @@ index 000000000000..1d66acaca33c + bkey_copy(prev.k, cur.k); + have_extent = true; + -+ if (k.k->type == KEY_TYPE_reflink_v) -+ bch2_btree_iter_set_pos(iter, k.k->p); -+ else -+ bch2_btree_iter_next(iter); ++ bch2_btree_iter_set_pos(iter, ++ POS(iter->pos.inode, iter->pos.offset + sectors)); + } + + if (ret == -EINTR) @@ -41995,7 +42274,7 @@ index 000000000000..1d66acaca33c + .writepage = bch2_writepage, + .readpage = bch2_readpage, + .writepages = bch2_writepages, -+ .readpages = bch2_readpages, ++ .readahead = bch2_readahead, + .set_page_dirty = __set_page_dirty_nobuffers, + .write_begin = bch2_write_begin, + .write_end = bch2_write_end, @@ -42161,7 +42440,7 @@ index 000000000000..1d66acaca33c + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); -+ bch2_inode_rm(c, inode->v.i_ino); ++ bch2_inode_rm(c, inode->v.i_ino, true); + } +} + @@ -42171,6 +42450,11 @@ index 000000000000..1d66acaca33c + struct bch_fs *c = sb->s_fs_info; + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); + unsigned shift = sb->s_blocksize_bits - 9; ++ /* ++ * this assumes inodes take up 64 bytes, which is a decent average ++ * number: ++ */ ++ u64 avail_inodes = ((usage.capacity - usage.used) << 3); + u64 fsid; + + buf->f_type = BCACHEFS_STATFS_MAGIC; @@ -42178,8 +42462,9 @@ index 000000000000..1d66acaca33c + buf->f_blocks = usage.capacity >> 
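/*
 * Editorial note on the "<< 3" in the bch2_statfs() change above: usage is
 * counted in 512-byte sectors (an assumption consistent with the adjacent
 * "<< shift" code, where shift = blocksize_bits - 9), and the estimate
 * charges 64 bytes per inode, so free_sectors * 512 / 64 == free_sectors << 3.
 * Worked example:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t free_sectors = 1 << 20;		/* 512 MiB free */
	uint64_t by_division  = free_sectors * 512 / 64;
	uint64_t by_shift     = free_sectors << 3;

	printf("%llu == %llu\n",
	       (unsigned long long) by_division,
	       (unsigned long long) by_shift);
	return 0;
}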
shift; + buf->f_bfree = (usage.capacity - usage.used) >> shift; + buf->f_bavail = buf->f_bfree; -+ buf->f_files = 0; -+ buf->f_ffree = 0; ++ ++ buf->f_files = usage.nr_inodes + avail_inodes; ++ buf->f_ffree = avail_inodes; + + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); @@ -42561,10 +42846,10 @@ index 000000000000..1d66acaca33c +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000000..eda903a45325 +index 000000000000..4ee1ac994420 --- /dev/null +++ b/fs/bcachefs/fs.h -@@ -0,0 +1,174 @@ +@@ -0,0 +1,175 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H @@ -42593,6 +42878,7 @@ index 000000000000..eda903a45325 +} + +void bch2_pagecache_add_put(struct pagecache_lock *); ++bool bch2_pagecache_add_tryget(struct pagecache_lock *); +void bch2_pagecache_add_get(struct pagecache_lock *); +void bch2_pagecache_block_put(struct pagecache_lock *); +void bch2_pagecache_block_get(struct pagecache_lock *); @@ -42741,10 +43027,10 @@ index 000000000000..eda903a45325 +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000000..5a6df3d1973a +index 000000000000..09ce6c29b88c --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,1502 @@ +@@ -0,0 +1,1489 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -43284,7 +43570,7 @@ index 000000000000..5a6df3d1973a + + bch2_trans_unlock(&trans); + -+ bch2_inode_pack(&p, &w.inode); ++ bch2_inode_pack(c, &p, &w.inode); + + ret = bch2_btree_insert(c, BTREE_ID_INODES, + &p.inode.k_i, NULL, NULL, @@ -43555,7 +43841,7 @@ index 000000000000..5a6df3d1973a + 0, NULL); + root_inode->bi_inum = BCACHEFS_ROOT_INO; + -+ bch2_inode_pack(&packed, root_inode); ++ bch2_inode_pack(c, &packed, root_inode); + + return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, + NULL, NULL, @@ -43613,36 +43899,22 @@ index 000000000000..5a6df3d1973a + return ret; +} + -+struct inode_bitmap { -+ unsigned long *bits; -+ size_t size; -+}; ++typedef GENRADIX(unsigned long) inode_bitmap; + -+static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) ++static inline bool inode_bitmap_test(inode_bitmap *b, size_t nr) +{ -+ return nr < b->size ? test_bit(nr, b->bits) : false; ++ unsigned long *w = genradix_ptr(b, nr / BITS_PER_LONG); ++ return w ? 
test_bit(nr & (BITS_PER_LONG - 1), w) : false; +} + -+static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) ++static inline int inode_bitmap_set(inode_bitmap *b, size_t nr) +{ -+ if (nr >= b->size) { -+ size_t new_size = max_t(size_t, max_t(size_t, -+ PAGE_SIZE * 8, -+ b->size * 2), -+ nr + 1); -+ void *n; ++ unsigned long *w = genradix_ptr_alloc(b, nr / BITS_PER_LONG, GFP_KERNEL); + -+ new_size = roundup_pow_of_two(new_size); -+ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); -+ if (!n) { -+ return -ENOMEM; -+ } ++ if (!w) ++ return -ENOMEM; + -+ b->bits = n; -+ b->size = new_size; -+ } -+ -+ __set_bit(nr, b->bits); ++ *w |= 1UL << (nr & (BITS_PER_LONG - 1)); + return 0; +} + @@ -43681,7 +43953,7 @@ index 000000000000..5a6df3d1973a +static int check_directory_structure(struct bch_fs *c, + struct bch_inode_unpacked *lostfound_inode) +{ -+ struct inode_bitmap dirs_done = { NULL, 0 }; ++ inode_bitmap dirs_done; + struct pathbuf path = { 0, 0, NULL }; + struct pathbuf_entry *e; + struct btree_trans trans; @@ -43698,6 +43970,7 @@ index 000000000000..5a6df3d1973a + + /* DFS: */ +restart_dfs: ++ genradix_init(&dirs_done); + had_unreachable = false; + + ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); @@ -43804,7 +44077,7 @@ index 000000000000..5a6df3d1973a + + if (had_unreachable) { + bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); -+ kfree(dirs_done.bits); ++ genradix_free(&dirs_done); + kfree(path.entries); + memset(&dirs_done, 0, sizeof(dirs_done)); + memset(&path, 0, sizeof(path)); @@ -43813,7 +44086,7 @@ index 000000000000..5a6df3d1973a +err: +fsck_err: + ret = bch2_trans_exit(&trans) ?: ret; -+ kfree(dirs_done.bits); ++ genradix_free(&dirs_done); + kfree(path.entries); + return ret; +} @@ -44014,7 +44287,7 @@ index 000000000000..5a6df3d1973a + + bch2_fs_lazy_rw(c); + -+ ret = bch2_inode_rm(c, u.bi_inum); ++ ret = bch2_inode_rm(c, u.bi_inum, false); + if (ret) + bch_err(c, "error in fsck: error %i while deleting inode", ret); + return ret; @@ -44073,7 +44346,7 @@ index 000000000000..5a6df3d1973a + if (do_update) { + struct bkey_inode_buf p; + -+ bch2_inode_pack(&p, &u); ++ bch2_inode_pack(c, &p, &u); + + ret = __bch2_trans_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| @@ -44264,19 +44537,21 @@ index 000000000000..9e4af02bde1e +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000000..28edc0834a92 +index 000000000000..82099e5a48d8 --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,556 @@ +@@ -0,0 +1,664 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" ++#include "btree_key_cache.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "str_hash.h" ++#include "varint.h" + +#include + @@ -44358,22 +44633,17 @@ index 000000000000..28edc0834a92 + return bytes; +} + -+void bch2_inode_pack(struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) ++static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) +{ -+ u8 *out = packed->inode.v.fields; ++ struct bkey_i_inode *k = &packed->inode; ++ u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + -+ bkey_inode_init(&packed->inode.k_i); -+ packed->inode.k.p.offset = inode->bi_inum; -+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; -+ 
packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); -+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); -+ -+#define x(_name, _bits) \ ++#define x(_name, _bits) \ + out += inode_encode_field(out, end, 0, inode->_name); \ + nr_fields++; \ + \ @@ -44392,7 +44662,69 @@ index 000000000000..28edc0834a92 + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + -+ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); ++ SET_INODE_NR_FIELDS(&k->v, nr_fields); ++} ++ ++static void bch2_inode_pack_v2(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ struct bkey_i_inode *k = &packed->inode; ++ u8 *out = k->v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ int ret; ++ ++#define x(_name, _bits) \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ ret = bch2_varint_encode(out, inode->_name); \ ++ out += ret; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } else { \ ++ *out++ = 0; \ ++ \ ++ if (_bits > 64) \ ++ *out++ = 0; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ BUG_ON(out > end); ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODE_NR_FIELDS(&k->v, nr_fields); ++} ++ ++void bch2_inode_pack(struct bch_fs *c, ++ struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ bkey_inode_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) { ++ SET_INODE_NEW_VARINT(&packed->inode.v, true); ++ bch2_inode_pack_v2(packed, inode); ++ } else { ++ bch2_inode_pack_v1(packed, inode); ++ } + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; @@ -44404,26 +44736,23 @@ index 000000000000..28edc0834a92 + BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); + BUG_ON(unpacked.bi_mode != inode->bi_mode); + -+#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); ++#define x(_name, _bits) if (unpacked._name != inode->_name) \ ++ panic("unpacked %llu should be %llu", \ ++ (u64) unpacked._name, (u64) inode->_name); + BCH_INODE_FIELDS() +#undef x + } +} + -+int bch2_inode_unpack(struct bkey_s_c_inode inode, -+ struct bch_inode_unpacked *unpacked) ++static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; -+ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); ++ const u8 *end = bkey_val_end(inode); + u64 field[2]; + unsigned fieldnr = 0, field_bits; + int ret; + -+ unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_hash_seed = inode.v->bi_hash_seed; -+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); -+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); -+ +#define x(_name, _bits) \ + if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ + memset(&unpacked->_name, 0, \ @@ -44446,6 +44775,62 @@ index 000000000000..28edc0834a92 +#undef x + + /* XXX: signal if there were more fields than expected? 
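/*
 * Editorial sketch, not part of the patch: bch2_inode_pack_v2() above stores
 * inode fields with bch2_varint_encode() (implemented in the varint.c this
 * patch adds), so small field values - the common case - shrink to a byte or
 * two. The real bcachefs byte layout may differ; plain LEB128 is shown here
 * only to illustrate the idea.
 */
#include <stdint.h>
#include <stdio.h>

static int leb128_encode(uint8_t *out, uint64_t v)
{
	int bytes = 0;

	do {
		out[bytes] = v & 0x7f;
		v >>= 7;
		if (v)
			out[bytes] |= 0x80;	/* continuation bit */
		bytes++;
	} while (v);
	return bytes;
}

static int leb128_decode(const uint8_t *in, uint64_t *v)
{
	int bytes = 0, shift = 0;

	*v = 0;
	do {
		*v |= (uint64_t) (in[bytes] & 0x7f) << shift;
		shift += 7;
	} while (in[bytes++] & 0x80);
	return bytes;
}

int main(void)
{
	uint8_t buf[10];
	uint64_t v;
	int n = leb128_encode(buf, 300);
	int m = leb128_decode(buf, &v);

	printf("300 -> %d bytes -> %llu (%d bytes read)\n",
	       n, (unsigned long long) v, m);
	return 0;
}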
*/ ++ return 0; ++} ++ ++static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = bkey_val_end(inode); ++ unsigned fieldnr = 0; ++ int ret; ++ u64 v[2]; ++ ++#define x(_name, _bits) \ ++ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \ ++ ret = bch2_varint_decode(in, end, &v[0]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ \ ++ if (_bits > 64) { \ ++ ret = bch2_varint_decode(in, end, &v[1]); \ ++ if (ret < 0) \ ++ return ret; \ ++ in += ret; \ ++ } else { \ ++ v[1] = 0; \ ++ } \ ++ } else { \ ++ v[0] = v[1] = 0; \ ++ } \ ++ \ ++ unpacked->_name = v[0]; \ ++ if (v[1] || v[0] != unpacked->_name) \ ++ return -1; \ ++ fieldnr++; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ return 0; ++} ++ ++int bch2_inode_unpack(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++ if (INODE_NEW_VARINT(inode.v)) { ++ return bch2_inode_unpack_v2(inode, unpacked); ++ } else { ++ return bch2_inode_unpack_v1(inode, unpacked); ++ } + + return 0; +} @@ -44459,11 +44844,11 @@ index 000000000000..28edc0834a92 + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), -+ BTREE_ITER_SLOTS|flags); ++ BTREE_ITER_CACHED|flags); + if (IS_ERR(iter)) + return iter; + -+ k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); + if (ret) + goto err; @@ -44492,7 +44877,7 @@ index 000000000000..28edc0834a92 + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + -+ bch2_inode_pack(inode_p, inode); ++ bch2_inode_pack(trans->c, inode_p, inode); + bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); + return 0; +} @@ -44631,20 +45016,24 @@ index 000000000000..28edc0834a92 +} + +int bch2_inode_create(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ u64 min, u64 max, u64 *hint) ++ struct bch_inode_unpacked *inode_u) +{ ++ struct bch_fs *c = trans->c; + struct bkey_inode_buf *inode_p; + struct btree_iter *iter = NULL; + struct bkey_s_c k; -+ u64 start; ++ u64 min, max, start, *hint; + int ret; + -+ if (!max) -+ max = ULLONG_MAX; ++ unsigned cpu = raw_smp_processor_id(); ++ unsigned bits = (c->opts.inodes_32bit ++ ? 31 : 63) - c->inode_shard_bits; + -+ if (trans->c->opts.inodes_32bit) -+ max = min_t(u64, max, U32_MAX); ++ min = (cpu << bits); ++ max = (cpu << bits) | ~(ULLONG_MAX << bits); ++ ++ min = max_t(u64, min, BLOCKDEV_INODE_MAX); ++ hint = c->unused_inode_hints + cpu; + + start = READ_ONCE(*hint); + @@ -44660,7 +45049,17 @@ index 000000000000..28edc0834a92 + if (bkey_cmp(iter->pos, POS(0, max)) > 0) + break; + -+ if (k.k->type != KEY_TYPE_inode) ++ /* ++ * There's a potential cache coherency issue with the btree key ++ * cache code here - we're iterating over the btree, skipping ++ * that cache. 
We should never see an empty slot that isn't ++ * actually empty due to a pending update in the key cache ++ * because the update that creates the inode isn't done with a ++ * cached iterator, but - better safe than sorry, check the ++ * cache before using a slot: ++ */ ++ if (k.k->type != KEY_TYPE_inode && ++ !bch2_btree_key_cache_find(c, BTREE_ID_INODES, iter->pos)) + goto found_slot; + } + @@ -44681,21 +45080,24 @@ index 000000000000..28edc0834a92 + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); + -+ bch2_inode_pack(inode_p, inode_u); -+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ ret = bch2_inode_write(trans, iter, inode_u); + bch2_trans_iter_put(trans, iter); -+ return 0; ++ return ret; +} + -+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) ++int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_i_inode_generation delete; + struct bpos start = POS(inode_nr, 0); + struct bpos end = POS(inode_nr + 1, 0); ++ struct bkey_s_c k; ++ u64 bi_generation; + int ret; + ++ bch2_trans_init(&trans, c, 0, 0); ++ + /* + * If this was a directory, there shouldn't be any real dirents left - + * but there could be whiteouts (from hash collisions) that we should @@ -44704,61 +45106,69 @@ index 000000000000..28edc0834a92 + * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ -+ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, -+ start, end, NULL); ++ ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_EXTENTS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range_trans(&trans, BTREE_ID_XATTRS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range_trans(&trans, BTREE_ID_DIRENTS, ++ start, end, NULL); + if (ret) -+ return ret; ++ goto err; ++retry: ++ bch2_trans_begin(&trans); + -+ bch2_trans_init(&trans, c, 0, 0); ++ bi_generation = 0; + -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ do { -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ u32 bi_generation = 0; ++ if (cached) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ BTREE_ITER_CACHED|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_cached(iter); ++ } else { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ } + -+ ret = bkey_err(k); -+ if (ret) -+ break; ++ ret = bkey_err(k); ++ if (ret) ++ goto err; + -+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, -+ "inode %llu not found when deleting", -+ inode_nr); ++ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, trans.c, ++ "inode %llu not found when deleting", ++ inode_nr); + -+ switch (k.k->type) { -+ case KEY_TYPE_inode: { -+ struct bch_inode_unpacked inode_u; ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct bch_inode_unpacked inode_u; + -+ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) -+ bi_generation = inode_u.bi_generation + 1; -+ break; -+ } -+ case KEY_TYPE_inode_generation: { -+ struct bkey_s_c_inode_generation g = -+ bkey_s_c_to_inode_generation(k); -+ bi_generation = le32_to_cpu(g.v->bi_generation); -+ break; -+ } -+ } ++ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) ++ bi_generation = inode_u.bi_generation + 1; ++ break; ++ } ++ case 
KEY_TYPE_inode_generation: { ++ struct bkey_s_c_inode_generation g = ++ bkey_s_c_to_inode_generation(k); ++ bi_generation = le32_to_cpu(g.v->bi_generation); ++ break; ++ } ++ } + -+ if (!bi_generation) { -+ bkey_init(&delete.k); -+ delete.k.p.offset = inode_nr; -+ } else { -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p.offset = inode_nr; -+ delete.v.bi_generation = cpu_to_le32(bi_generation); -+ } ++ if (!bi_generation) { ++ bkey_init(&delete.k); ++ delete.k.p.offset = inode_nr; ++ } else { ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p.offset = inode_nr; ++ delete.v.bi_generation = cpu_to_le32(bi_generation); ++ } + -+ bch2_trans_update(&trans, iter, &delete.k_i, 0); ++ bch2_trans_update(&trans, iter, &delete.k_i, 0); + -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ } while (ret == -EINTR); ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ if (ret == -EINTR) ++ goto retry; + + bch2_trans_exit(&trans); + return ret; @@ -44772,11 +45182,11 @@ index 000000000000..28edc0834a92 + int ret; + + iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, -+ POS(0, inode_nr), BTREE_ITER_SLOTS); ++ POS(0, inode_nr), BTREE_ITER_CACHED); + if (IS_ERR(iter)) + return PTR_ERR(iter); + -+ k = bch2_btree_iter_peek_slot(iter); ++ k = bch2_btree_iter_peek_cached(iter); + ret = bkey_err(k); + if (ret) + goto err; @@ -44795,41 +45205,12 @@ index 000000000000..28edc0834a92 + return bch2_trans_do(c, NULL, NULL, 0, + bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); +} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void) -+{ -+ struct bch_inode_unpacked *u, test_inodes[] = { -+ { -+ .bi_atime = U64_MAX, -+ .bi_ctime = U64_MAX, -+ .bi_mtime = U64_MAX, -+ .bi_otime = U64_MAX, -+ .bi_size = U64_MAX, -+ .bi_sectors = U64_MAX, -+ .bi_uid = U32_MAX, -+ .bi_gid = U32_MAX, -+ .bi_nlink = U32_MAX, -+ .bi_generation = U32_MAX, -+ .bi_dev = U32_MAX, -+ }, -+ }; -+ -+ for (u = test_inodes; -+ u < test_inodes + ARRAY_SIZE(test_inodes); -+ u++) { -+ struct bkey_inode_buf p; -+ -+ bch2_inode_pack(&p, u); -+ } -+} -+#endif diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000000..bb759a46dc41 +index 000000000000..dbdfcf63d079 --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,177 @@ +@@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H @@ -44856,6 +45237,14 @@ index 000000000000..bb759a46dc41 + .val_to_text = bch2_inode_generation_to_text, \ +} + ++#if 0 ++typedef struct { ++ u64 lo; ++ u32 hi; ++} __packed __aligned(4) u96; ++#endif ++typedef u64 u96; ++ +struct bch_inode_unpacked { + u64 bi_inum; + __le64 bi_hash_seed; @@ -44875,7 +45264,8 @@ index 000000000000..bb759a46dc41 +#undef x +} __attribute__((packed, aligned(8))); + -+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); ++void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *, ++ const struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); + +struct btree_iter *bch2_inode_peek(struct btree_trans *, @@ -44892,11 +45282,9 @@ index 000000000000..bb759a46dc41 + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + -+int bch2_inode_create(struct btree_trans *, -+ struct bch_inode_unpacked *, -+ u64, u64, u64 *); ++int bch2_inode_create(struct btree_trans *, struct bch_inode_unpacked *); + -+int bch2_inode_rm(struct bch_fs *, u64); ++int bch2_inode_rm(struct bch_fs *, u64, bool); + 
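A minimal standalone sketch of the per-cpu inode number sharding that the reworked bch2_inode_create() above now relies on, assuming 8 possible CPUs and the inodes_32bit case (numbers are illustrative, not from the patch; shard_bits mirrors c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())) set up in super.c):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned shard_bits = 3;		/* 8 possible cpus */
	unsigned bits = 31 - shard_bits;	/* 63 - shard_bits without inodes_32bit */
	unsigned cpu;

	for (cpu = 0; cpu < 8; cpu++) {
		/* same formulas as bch2_inode_create() in inode.c: */
		uint64_t min = (uint64_t) cpu << bits;
		uint64_t max = min | ~(UINT64_MAX << bits);

		/* the real code additionally clamps min up to BLOCKDEV_INODE_MAX */
		printf("cpu %u: inode range [%llu, %llu]\n", cpu,
		       (unsigned long long) min,
		       (unsigned long long) max);
	}
	return 0;
}

Each CPU searches a disjoint slice of the inode number space, with its own allocation hint in c->unused_inode_hints, so concurrent creates on different CPUs don't contend on the same region of the inodes btree.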
+int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *); @@ -45000,16 +45388,10 @@ index 000000000000..bb759a46dc41 + } +} + -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void); -+#else -+static inline void bch2_inode_pack_test(void) {} -+#endif -+ +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000000..8add8ccd129d +index 000000000000..21087d1193dc --- /dev/null +++ b/fs/bcachefs/io.c @@ -0,0 +1,2392 @@ @@ -45186,7 +45568,7 @@ index 000000000000..8add8ccd129d + + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); -+ unsigned len = min(PAGE_SIZE, size); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); + + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; @@ -45316,7 +45698,7 @@ index 000000000000..8add8ccd129d + inode_u.bi_sectors += delta; + + if (delta || new_i_size) { -+ bch2_inode_pack(&inode_p, &inode_u); ++ bch2_inode_pack(trans->c, &inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i, 0); + } @@ -47736,10 +48118,10 @@ index 000000000000..b23727d212b9 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000000..c2cafd3892a4 +index 000000000000..5874a9ff2204 --- /dev/null +++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1265 @@ +@@ -0,0 +1,1186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions @@ -47760,7 +48142,19 @@ index 000000000000..c2cafd3892a4 + +#include + -+static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); ++static u64 last_unwritten_seq(struct journal *j) ++{ ++ union journal_res_state s = READ_ONCE(j->reservations); ++ ++ lockdep_assert_held(&j->lock); ++ ++ return journal_cur_seq(j) - s.prev_buf_unwritten; ++} ++ ++static inline bool journal_seq_unwritten(struct journal *j, u64 seq) ++{ ++ return seq >= last_unwritten_seq(j); ++} + +static bool __journal_entry_is_open(union journal_res_state state) +{ @@ -47772,6 +48166,22 @@ index 000000000000..c2cafd3892a4 + return __journal_entry_is_open(j->reservations); +} + ++static inline struct journal_buf * ++journal_seq_to_buf(struct journal *j, u64 seq) ++{ ++ struct journal_buf *buf = NULL; ++ ++ EBUG_ON(seq > journal_cur_seq(j)); ++ EBUG_ON(seq == journal_cur_seq(j) && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); ++ ++ if (journal_seq_unwritten(j, seq)) { ++ buf = j->buf + (seq & 1); ++ EBUG_ON(le64_to_cpu(buf->data->seq) != seq); ++ } ++ return buf; ++} ++ +static void journal_pin_new_entry(struct journal *j, int count) +{ + struct journal_entry_pin_list *p; @@ -47793,6 +48203,8 @@ index 000000000000..c2cafd3892a4 +{ + struct journal_buf *buf = journal_cur_buf(j); + ++ bkey_extent_init(&buf->key); ++ + memset(buf->has_inode, 0, sizeof(buf->has_inode)); + + memset(buf->data, 0, sizeof(*buf->data)); @@ -47814,6 +48226,7 @@ index 000000000000..c2cafd3892a4 + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + ++ j->err_seq = journal_cur_seq(j); + journal_wake(j); + closure_wake_up(&journal_cur_buf(j)->wait); +} @@ -47881,8 +48294,6 @@ index 000000000000..c2cafd3892a4 + BUG_ON(sectors > buf->sectors); + buf->sectors = sectors; + -+ bkey_extent_init(&buf->key); -+ + /* + * We have to set last_seq here, _before_ opening a new journal entry: + * @@ -47904,11 +48315,6 @@ index 000000000000..c2cafd3892a4 + */ + buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); 
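/*
 * Illustration (not from the patch) of the invariant established just
 * above, with made-up sequence numbers:
 *
 *	journal pin FIFO:	{ 5, 6, 7 }	journal_last_seq(j) == 5
 *	entry being closed:	seq 7
 *
 * Entry 7 goes to disk with last_seq == 5: on recovery, replay must
 * start no later than 5, because the btree updates journalled in
 * entries 5 and 6 are still pinned, i.e. not yet fully written into
 * their btree nodes.
 */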
+ -+ if (journal_entry_empty(buf->data)) -+ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ else -+ set_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ + journal_pin_new_entry(j, 1); + + bch2_journal_buf_init(j); @@ -47944,16 +48350,19 @@ index 000000000000..c2cafd3892a4 + */ +static int journal_entry_open(struct journal *j) +{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; + int u64s; + u64 v; + ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); + + if (j->blocked) -+ return -EAGAIN; ++ return cur_entry_blocked; + + if (j->cur_entry_error) + return j->cur_entry_error; @@ -47969,7 +48378,7 @@ index 000000000000..c2cafd3892a4 + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= le32_to_cpu(buf->data->u64s)) -+ return -ENOSPC; ++ return cur_entry_journal_full; + + /* + * Must be set before marking the journal entry as open: @@ -47981,7 +48390,7 @@ index 000000000000..c2cafd3892a4 + old.v = new.v = v; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return -EROFS; ++ return cur_entry_insufficient_devices; + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); @@ -48094,7 +48503,7 @@ index 000000000000..c2cafd3892a4 + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ -+ ret = -ENOSPC; ++ ret = cur_entry_journal_full; + goto unlock; + } + @@ -48117,14 +48526,16 @@ index 000000000000..c2cafd3892a4 + * there's still a previous one in flight: + */ + trace_journal_entry_full(c); -+ ret = -EAGAIN; ++ ret = cur_entry_blocked; + } else { + ret = journal_entry_open(j); + } +unlock: -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) ++ if ((ret && ret != cur_entry_insufficient_devices) && ++ !j->res_get_blocked_start) { + j->res_get_blocked_start = local_clock() ?: 1; ++ trace_journal_full(c); ++ } + + can_discard = j->can_discard; + spin_unlock(&j->lock); @@ -48132,32 +48543,39 @@ index 000000000000..c2cafd3892a4 + if (!ret) + goto retry; + -+ if (ret == -ENOSPC) { -+ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), -+ "JOURNAL_RES_GET_RESERVED set but journal full"); ++ if (WARN_ONCE(ret == cur_entry_journal_full && ++ !can_discard && ++ (flags & JOURNAL_RES_GET_RESERVED), ++ "JOURNAL_RES_GET_RESERVED set but journal full")) { ++ char *buf; + -+ /* -+ * Journal is full - can't rely on reclaim from work item due to -+ * freezing: -+ */ -+ trace_journal_full(c); -+ -+ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { -+ if (can_discard) { -+ bch2_journal_do_discards(j); -+ goto retry; -+ } -+ -+ if (mutex_trylock(&j->reclaim_lock)) { -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+ } ++ buf = kmalloc(4096, GFP_NOFS); ++ if (buf) { ++ bch2_journal_debug_to_text(&_PBUF(buf, 4096), j); ++ pr_err("\n%s", buf); ++ kfree(buf); + } -+ -+ ret = -EAGAIN; + } + -+ return ret; ++ /* ++ * Journal is full - can't rely on reclaim from work item due to ++ * freezing: ++ */ ++ if ((ret == cur_entry_journal_full || ++ ret == cur_entry_journal_pin_full) && ++ !(flags & JOURNAL_RES_GET_NONBLOCK)) { ++ if (can_discard) { ++ bch2_journal_do_discards(j); ++ goto retry; ++ } ++ ++ if (mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } ++ } ++ ++ return ret == cur_entry_insufficient_devices ? 
-EROFS : -EAGAIN; +} + +/* @@ -48190,8 +48608,10 @@ index 000000000000..c2cafd3892a4 +{ + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); + -+ if (!ret) -+ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ if (!ret && mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } + + return ret; +} @@ -48245,147 +48665,37 @@ index 000000000000..c2cafd3892a4 + +/* journal flushing: */ + -+u64 bch2_journal_last_unwritten_seq(struct journal *j) -+{ -+ u64 seq; -+ -+ spin_lock(&j->lock); -+ seq = journal_cur_seq(j); -+ if (j->reservations.prev_buf_unwritten) -+ seq--; -+ spin_unlock(&j->lock); -+ -+ return seq; -+} -+ -+/** -+ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't -+ * open yet, or wait if we cannot -+ * -+ * used by the btree interior update machinery, when it needs to write a new -+ * btree root - every journal entry contains the roots of all the btrees, so it -+ * doesn't need to bother with getting a journal reservation -+ */ -+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ int ret; -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * Can't try to open more than one sequence number ahead: -+ */ -+ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); -+ -+ if (journal_cur_seq(j) > seq || -+ journal_entry_is_open(j)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if (journal_cur_seq(j) < seq && -+ !__journal_entry_close(j)) { -+ /* haven't finished writing out the previous one: */ -+ trace_journal_entry_full(c); -+ ret = -EAGAIN; -+ } else { -+ BUG_ON(journal_cur_seq(j) != seq); -+ -+ ret = journal_entry_open(j); -+ } -+ -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) -+ j->res_get_blocked_start = local_clock() ?: 1; -+ -+ if (ret == -EAGAIN || ret == -ENOSPC) -+ closure_wait(&j->async_wait, cl); -+ -+ spin_unlock(&j->lock); -+ -+ if (ret == -ENOSPC) { -+ trace_journal_full(c); -+ bch2_journal_reclaim_work(&j->reclaim_work.work); -+ ret = -EAGAIN; -+ } -+ -+ return ret; -+} -+ -+static int journal_seq_error(struct journal *j, u64 seq) -+{ -+ union journal_res_state state = READ_ONCE(j->reservations); -+ -+ if (seq == journal_cur_seq(j)) -+ return bch2_journal_error(j); -+ -+ if (seq + 1 == journal_cur_seq(j) && -+ !state.prev_buf_unwritten && -+ seq > j->seq_ondisk) -+ return -EIO; -+ -+ return 0; -+} -+ -+static inline struct journal_buf * -+journal_seq_to_buf(struct journal *j, u64 seq) -+{ -+ /* seq should be for a journal entry that has been opened: */ -+ BUG_ON(seq > journal_cur_seq(j)); -+ BUG_ON(seq == journal_cur_seq(j) && -+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); -+ -+ if (seq == journal_cur_seq(j)) -+ return journal_cur_buf(j); -+ if (seq + 1 == journal_cur_seq(j) && -+ j->reservations.prev_buf_unwritten) -+ return journal_prev_buf(j); -+ return NULL; -+} -+ -+/** -+ * bch2_journal_wait_on_seq - wait for a journal entry to be written -+ * -+ * does _not_ cause @seq to be written immediately - if there is no other -+ * activity to cause the relevant journal entry to be filled up or flushed it -+ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is -+ * configurable). 
-+ */ -+void bch2_journal_wait_on_seq(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if ((buf = journal_seq_to_buf(j, seq))) { -+ if (!closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ if (seq == journal_cur_seq(j)) { -+ smp_mb(); -+ if (bch2_journal_error(j)) -+ closure_wake_up(&buf->wait); -+ } -+ } -+ -+ spin_unlock(&j->lock); -+} -+ +/** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary + */ -+void bch2_journal_flush_seq_async(struct journal *j, u64 seq, -+ struct closure *parent) ++int bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++ struct closure *parent) +{ + struct journal_buf *buf; ++ int ret = 0; ++ ++ if (seq <= j->err_seq) ++ return -EIO; ++ ++ if (seq <= j->seq_ondisk) ++ return 1; + + spin_lock(&j->lock); + ++ /* Recheck under lock: */ ++ if (seq <= j->err_seq) { ++ ret = -EIO; ++ goto out; ++ } ++ ++ if (seq <= j->seq_ondisk) { ++ ret = 1; ++ goto out; ++ } ++ + if (parent && + (buf = journal_seq_to_buf(j, seq))) + if (!closure_wait(&buf->wait, parent)) @@ -48393,20 +48703,8 @@ index 000000000000..c2cafd3892a4 + + if (seq == journal_cur_seq(j)) + __journal_entry_close(j); ++out: + spin_unlock(&j->lock); -+} -+ -+static int journal_seq_flushed(struct journal *j, u64 seq) -+{ -+ int ret; -+ -+ spin_lock(&j->lock); -+ ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); -+ -+ if (seq == journal_cur_seq(j)) -+ __journal_entry_close(j); -+ spin_unlock(&j->lock); -+ + return ret; +} + @@ -48415,28 +48713,13 @@ index 000000000000..c2cafd3892a4 + u64 start_time = local_clock(); + int ret, ret2; + -+ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); ++ ret = wait_event_killable(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + + bch2_time_stats_update(j->flush_seq_time, start_time); + + return ret ?: ret2 < 0 ? ret2 : 0; +} + -+/** -+ * bch2_journal_meta_async - force a journal entry to be written -+ */ -+void bch2_journal_meta_async(struct journal *j, struct closure *parent) -+{ -+ struct journal_res res; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ bch2_journal_res_put(j, &res); -+ -+ bch2_journal_flush_seq_async(j, res.seq, parent); -+} -+ +int bch2_journal_meta(struct journal *j) +{ + struct journal_res res; @@ -48532,16 +48815,19 @@ index 000000000000..c2cafd3892a4 + if (nr <= ja->nr) + return 0; + -+ ret = -ENOMEM; + new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); + new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ if (!new_buckets || !new_bucket_seq) ++ if (!new_buckets || !new_bucket_seq) { ++ ret = -ENOMEM; + goto err; ++ } + + journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, -+ nr + sizeof(*journal_buckets) / sizeof(u64)); -+ if (!journal_buckets) ++ nr + sizeof(*journal_buckets) / sizeof(u64)); ++ if (!journal_buckets) { ++ ret = -ENOSPC; + goto err; ++ } + + /* + * We may be called from the device add path, before the new device has @@ -48570,8 +48856,10 @@ index 000000000000..c2cafd3892a4 + goto err; + } + } else { ++ rcu_read_lock(); + ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, + false, cl); ++ rcu_read_unlock(); + if (IS_ERR(ob)) { + ret = cl ? 
-EAGAIN : -ENOSPC; + goto err; @@ -48585,6 +48873,12 @@ index 000000000000..c2cafd3892a4 + spin_lock(&c->journal.lock); + } + ++ /* ++ * XXX ++ * For resize at runtime, we should be writing the new ++ * superblock before inserting into the journal array ++ */ ++ + pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; + __array_insert_item(ja->buckets, ja->nr, pos); + __array_insert_item(ja->bucket_seq, ja->nr, pos); @@ -48617,9 +48911,9 @@ index 000000000000..c2cafd3892a4 + if (!new_fs) + bch2_open_bucket_put(c, ob); + } -+ -+ ret = 0; +err: ++ bch2_sb_resize_journal(&ca->disk_sb, ++ ja->nr + sizeof(*journal_buckets) / sizeof(u64)); + kfree(new_bucket_seq); + kfree(new_buckets); + @@ -48731,10 +49025,11 @@ index 000000000000..c2cafd3892a4 + journal_quiesce(j); + + BUG_ON(!bch2_journal_error(j) && -+ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); ++ (journal_entry_is_open(j) || ++ j->last_empty_seq + 1 != journal_cur_seq(j))); + + cancel_delayed_work_sync(&j->write_work); -+ cancel_delayed_work_sync(&j->reclaim_work); ++ bch2_journal_reclaim_stop(j); +} + +int bch2_fs_journal_start(struct journal *j, u64 cur_seq, @@ -48789,6 +49084,9 @@ index 000000000000..c2cafd3892a4 + set_bit(JOURNAL_STARTED, &j->flags); + + journal_pin_new_entry(j, 1); ++ ++ j->reservations.idx = journal_cur_seq(j); ++ + bch2_journal_buf_init(j); + + c->last_bucket_seq_cleanup = journal_cur_seq(j); @@ -48859,7 +49157,6 @@ index 000000000000..c2cafd3892a4 + spin_lock_init(&j->err_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); -+ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); + init_waitqueue_head(&j->pin_flush_wait); + mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); @@ -48911,7 +49208,10 @@ index 000000000000..c2cafd3892a4 + "last_seq:\t\t%llu\n" + "last_seq_ondisk:\t%llu\n" + "prereserved:\t\t%u/%u\n" ++ "nr direct reclaim:\t%llu\n" ++ "nr background reclaim:\t%llu\n" + "current entry sectors:\t%u\n" ++ "current entry error:\t%u\n" + "current entry:\t\t", + fifo_used(&j->pin), + journal_cur_seq(j), @@ -48919,7 +49219,10 @@ index 000000000000..c2cafd3892a4 + j->last_seq_ondisk, + j->prereserved.reserved, + j->prereserved.remaining, -+ j->cur_entry_sectors); ++ j->nr_direct_reclaim, ++ j->nr_background_reclaim, ++ j->cur_entry_sectors, ++ j->cur_entry_error); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: @@ -49007,10 +49310,10 @@ index 000000000000..c2cafd3892a4 +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 -index 000000000000..f60bc964ee1f +index 000000000000..25c6876765ac --- /dev/null +++ b/fs/bcachefs/journal.h -@@ -0,0 +1,520 @@ +@@ -0,0 +1,515 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H @@ -49477,13 +49780,8 @@ index 000000000000..f60bc964ee1f + struct journal_entry_res *, + unsigned); + -+u64 bch2_journal_last_unwritten_seq(struct journal *); -+int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); -+ -+void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); -+void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +void bch2_journal_flush_async(struct journal *, struct closure *); -+void bch2_journal_meta_async(struct journal *, struct closure *); + +int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush(struct journal *); @@ -49533,10 +49831,10 @@ index 000000000000..f60bc964ee1f 
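Since bch2_journal_flush_seq_async() now returns an int instead of void, a caller has three cases to handle. A sketch of the convention, as implied by the journal.c rework above (example_flush_async() is a hypothetical caller, not part of the patch):

static void example_flush_async(struct journal *j, u64 seq,
				struct closure *cl)
{
	int ret = bch2_journal_flush_seq_async(j, seq, cl);

	if (ret > 0)
		return;		/* seq already on disk; @cl never enqueued */

	if (ret < 0)
		return;		/* seq <= j->err_seq: that entry hit an error */

	/*
	 * ret == 0: @cl is on the journal buf's waitlist and, if seq is
	 * the current entry, a write has been kicked off; closure_sync()
	 * or continue_at() to wait for completion.
	 */
}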
+#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000000..bd0e6b371701 +index 000000000000..d1367cf067d3 --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1183 @@ +@@ -0,0 +1,1210 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_foreground.h" @@ -49700,6 +49998,8 @@ index 000000000000..bd0e6b371701 +#define journal_entry_err_on(cond, c, msg, ...) \ + ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) + ++#define FSCK_DELETED_KEY 5 ++ +static int journal_validate_key(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, + unsigned level, enum btree_id btree_id, @@ -49712,28 +50012,42 @@ index 000000000000..bd0e6b371701 + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, -+ "invalid %s in journal: k->u64s 0", type)) { ++ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); -+ return 0; ++ return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, -+ "invalid %s in journal: extends past end of journal entry", -+ type)) { ++ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s))) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); -+ return 0; ++ return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, -+ "invalid %s in journal: bad format %u", -+ type, k->k.format)) { -+ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s), ++ k->k.format)) { ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); -+ return 0; ++ return FSCK_DELETED_KEY; + } + + if (!write) @@ -49747,13 +50061,18 @@ index 000000000000..bd0e6b371701 + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); -+ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", -+ type, invalid, buf); ++ mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s", ++ type, le64_to_cpu(jset->seq), ++ (u64 *) entry - jset->_data, ++ le32_to_cpu(jset->u64s), ++ (u64 *) k - entry->_data, ++ le16_to_cpu(entry->u64s), ++ invalid, buf); + -+ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); -+ return 0; ++ return FSCK_DELETED_KEY; + } + + if (write) @@ -49769,15 +50088,17 @@ index 000000000000..bd0e6b371701 + struct jset_entry *entry, + int write) +{ -+ struct bkey_i *k; ++ struct bkey_i *k = entry->start; + -+ vstruct_for_each(entry, k) { ++ while (k != vstruct_last(entry)) { + int ret = journal_validate_key(c, jset, entry, + entry->level, + entry->btree_id, + k, "key", 
write); -+ if (ret) -+ return ret; ++ if (ret == FSCK_DELETED_KEY) ++ continue; ++ ++ k = bkey_next(k); + } + + return 0; @@ -49971,46 +50292,45 @@ index 000000000000..bd0e6b371701 + "%s sector %llu seq %llu: unknown journal entry version %u", + ca->name, sector, le64_to_cpu(jset->seq), + version)) { -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; ++ /* don't try to continue: */ ++ return EINVAL; + } + ++ if (bytes > (sectors_read << 9) && ++ sectors_read < bucket_sectors_left) ++ return JOURNAL_ENTRY_REREAD; ++ + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca->name, sector, le64_to_cpu(jset->seq), bytes)) { -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; ++ ret = JOURNAL_ENTRY_BAD; ++ le32_add_cpu(&jset->u64s, ++ -((bytes - (bucket_sectors_left << 9)) / 8)); + } + -+ if (bytes > sectors_read << 9) -+ return JOURNAL_ENTRY_REREAD; -+ + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca->name, sector, le64_to_cpu(jset->seq), -+ JSET_CSUM_TYPE(jset))) -+ return JOURNAL_ENTRY_BAD; ++ JSET_CSUM_TYPE(jset))) { ++ ret = JOURNAL_ENTRY_BAD; ++ goto bad_csum_type; ++ } + + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + "%s sector %llu seq %llu: journal checksum bad", -+ ca->name, sector, le64_to_cpu(jset->seq))) { -+ /* XXX: retry IO, when we start retrying checksum errors */ -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } ++ ca->name, sector, le64_to_cpu(jset->seq))) ++ ret = JOURNAL_ENTRY_BAD; + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); -+ ++bad_csum_type: + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq")) { + jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } -+ -+ return 0; +fsck_err: + return ret; +} @@ -50478,24 +50798,29 @@ index 000000000000..bd0e6b371701 + struct bch_replicas_padded replicas; + u64 seq = le64_to_cpu(w->data->seq); + u64 last_seq = le64_to_cpu(w->data->last_seq); ++ int err = 0; + + bch2_time_stats_update(j->write_time, j->write_start_time); + + if (!devs.nr) { + bch_err(c, "unable to write journal to sufficient devices"); -+ goto err; ++ err = -EIO; ++ } else { ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); ++ if (bch2_mark_replicas(c, &replicas.e)) ++ err = -EIO; + } + -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); -+ -+ if (bch2_mark_replicas(c, &replicas.e)) -+ goto err; ++ if (err) ++ bch2_fatal_error(c); + + spin_lock(&j->lock); + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = devs; + + j->seq_ondisk = seq; ++ if (err && (!j->err_seq || seq < j->err_seq)) ++ j->err_seq = seq; + j->last_seq_ondisk = last_seq; + bch2_journal_space_available(j); + @@ -50506,8 +50831,8 @@ index 000000000000..bd0e6b371701 + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ -+ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); -+out: ++ journal_reclaim_kick(&c->journal); ++ + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + @@ -50521,11 +50846,6 @@ index 000000000000..bd0e6b371701 + if 
(test_bit(JOURNAL_NEED_WRITE, &j->flags)) + mod_delayed_work(system_freezable_wq, &j->write_work, 0); + spin_unlock(&j->lock); -+ return; -+err: -+ bch2_fatal_error(c); -+ spin_lock(&j->lock); -+ goto out; +} + +static void journal_write_endio(struct bio *bio) @@ -50562,6 +50882,8 @@ index 000000000000..bd0e6b371701 + unsigned i, sectors, bytes, u64s; + int ret; + ++ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ + bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); + + journal_buf_realloc(j, w); @@ -50606,6 +50928,9 @@ index 000000000000..bd0e6b371701 + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + ++ if (journal_entry_empty(jset)) ++ j->last_empty_seq = le64_to_cpu(jset->seq); ++ + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + @@ -50772,19 +51097,24 @@ index 000000000000..6958ee0f8cf2 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000000..57591983eebd +index 000000000000..2fa87c7dab7a --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,644 @@ +@@ -0,0 +1,756 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" ++#include "btree_key_cache.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "super.h" + ++#include ++#include ++#include ++ +/* Free space calculations: */ + +static unsigned journal_space_from(struct journal_device *ja, @@ -50942,12 +51272,12 @@ index 000000000000..57591983eebd + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { -+ ret = -EROFS; ++ ret = cur_entry_insufficient_devices; + goto out; + } + + if (!fifo_free(&j->pin)) { -+ ret = -ENOSPC; ++ ret = cur_entry_journal_pin_full; + goto out; + } + @@ -50958,7 +51288,7 @@ index 000000000000..57591983eebd + clean = __journal_space_available(j, nr_devs_want, journal_space_clean); + + if (!discarded.next_entry) -+ ret = -ENOSPC; ++ ret = cur_entry_journal_full; + + overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * + journal_entry_overhead(j); @@ -51041,6 +51371,7 @@ index 000000000000..57591983eebd + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } @@ -51209,7 +51540,6 @@ index 000000000000..57591983eebd + list_move(&ret->list, &pin_list->flushed); + BUG_ON(j->flush_in_progress); + j->flush_in_progress = ret; -+ j->last_flushed = jiffies; + } + + spin_unlock(&j->lock); @@ -51218,17 +51548,24 @@ index 000000000000..57591983eebd +} + +/* returns true if we did work */ -+static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, -+ unsigned min_nr) ++static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_nr) +{ + struct journal_entry_pin *pin; -+ bool ret = false; -+ u64 seq; ++ u64 seq, ret = 0; + + lockdep_assert_held(&j->reclaim_lock); + -+ while ((pin = journal_get_next_pin(j, min_nr -+ ? U64_MAX : seq_to_flush, &seq))) { ++ while (1) { ++ cond_resched(); ++ ++ j->last_flushed = jiffies; ++ ++ pin = journal_get_next_pin(j, min_nr ++ ? 
U64_MAX : seq_to_flush, &seq); ++ if (!pin) ++ break; ++ + if (min_nr) + min_nr--; + @@ -51237,40 +51574,18 @@ index 000000000000..57591983eebd + BUG_ON(j->flush_in_progress != pin); + j->flush_in_progress = NULL; + wake_up(&j->pin_flush_wait); -+ ret = true; ++ ret++; + } + + return ret; +} + -+/** -+ * bch2_journal_reclaim - free up journal buckets -+ * -+ * Background journal reclaim writes out btree nodes. It should be run -+ * early enough so that we never completely run out of journal buckets. -+ * -+ * High watermarks for triggering background reclaim: -+ * - FIFO has fewer than 512 entries left -+ * - fewer than 25% journal buckets free -+ * -+ * Background reclaim runs until low watermarks are reached: -+ * - FIFO has more than 1024 entries left -+ * - more than 50% journal buckets free -+ * -+ * As long as a reclaim can complete in the time it takes to fill up -+ * 512 journal entries or 25% of all journal buckets, then -+ * journal_next_bucket() should not stall. -+ */ -+void bch2_journal_reclaim(struct journal *j) ++static u64 journal_seq_to_flush(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; -+ unsigned iter, min_nr = 0; + u64 seq_to_flush = 0; -+ -+ lockdep_assert_held(&j->reclaim_lock); -+ -+ bch2_journal_do_discards(j); ++ unsigned iter; + + spin_lock(&j->lock); + @@ -51302,34 +51617,156 @@ index 000000000000..57591983eebd + (j->pin.size >> 1)); + spin_unlock(&j->lock); + -+ /* -+ * If it's been longer than j->reclaim_delay_ms since we last flushed, -+ * make sure to flush at least one journal pin: -+ */ -+ if (time_after(jiffies, j->last_flushed + -+ msecs_to_jiffies(j->reclaim_delay_ms))) -+ min_nr = 1; -+ -+ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { -+ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); -+ min_nr = 1; -+ } -+ -+ journal_flush_pins(j, seq_to_flush, min_nr); -+ -+ if (!bch2_journal_error(j)) -+ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, -+ msecs_to_jiffies(j->reclaim_delay_ms)); ++ return seq_to_flush; +} + -+void bch2_journal_reclaim_work(struct work_struct *work) ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. 
++ */ ++static void __bch2_journal_reclaim(struct journal *j, bool direct) +{ -+ struct journal *j = container_of(to_delayed_work(work), -+ struct journal, reclaim_work); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ u64 seq_to_flush, nr_flushed = 0; ++ size_t min_nr; ++ unsigned flags; + -+ mutex_lock(&j->reclaim_lock); -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); ++ /* ++ * We can't invoke memory reclaim while holding the reclaim_lock - ++ * journal reclaim is required to make progress for memory reclaim ++ * (cleaning the caches), so we can't get stuck in memory reclaim while ++ * we're holding the reclaim lock: ++ */ ++ lockdep_assert_held(&j->reclaim_lock); ++ flags = memalloc_noreclaim_save(); ++ ++ do { ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ bch2_journal_do_discards(j); ++ ++ seq_to_flush = journal_seq_to_flush(j); ++ min_nr = 0; ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(j->reclaim_delay_ms))) ++ min_nr = 1; ++ ++ if (j->prereserved.reserved * 2 > j->prereserved.remaining) ++ min_nr = 1; ++ ++ if (atomic_read(&c->btree_cache.dirty) * 4 > ++ c->btree_cache.used * 3) ++ min_nr = 1; ++ ++ min_nr = max(min_nr, bch2_nr_btree_keys_need_flush(c)); ++ ++ trace_journal_reclaim_start(c, ++ min_nr, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ atomic_read(&c->btree_cache.dirty), ++ c->btree_cache.used, ++ c->btree_key_cache.nr_dirty, ++ c->btree_key_cache.nr_keys); ++ ++ nr_flushed = journal_flush_pins(j, seq_to_flush, min_nr); ++ ++ if (direct) ++ j->nr_direct_reclaim += nr_flushed; ++ else ++ j->nr_background_reclaim += nr_flushed; ++ trace_journal_reclaim_finish(c, nr_flushed); ++ } while (min_nr); ++ ++ memalloc_noreclaim_restore(flags); ++} ++ ++void bch2_journal_reclaim(struct journal *j) ++{ ++ __bch2_journal_reclaim(j, true); ++} ++ ++static int bch2_journal_reclaim_thread(void *arg) ++{ ++ struct journal *j = arg; ++ unsigned long next; ++ ++ while (!kthread_should_stop()) { ++ j->reclaim_kicked = false; ++ ++ mutex_lock(&j->reclaim_lock); ++ __bch2_journal_reclaim(j, false); ++ mutex_unlock(&j->reclaim_lock); ++ ++ next = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) ++ break; ++ if (j->reclaim_kicked) ++ break; ++ if (time_after_eq(jiffies, next)) ++ break; ++ schedule_timeout(next - jiffies); ++ ++ } ++ __set_current_state(TASK_RUNNING); ++ } ++ ++ return 0; ++} ++ ++void bch2_journal_reclaim_stop(struct journal *j) ++{ ++ struct task_struct *p = j->reclaim_thread; ++ ++ j->reclaim_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_journal_reclaim_start(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct task_struct *p; ++ ++ if (j->reclaim_thread) ++ return 0; ++ ++ p = kthread_create(bch2_journal_reclaim_thread, j, ++ "bch-reclaim/%s", c->name); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ j->reclaim_thread = p; ++ wake_up_process(p); ++ return 0; +} + +static int journal_flush_done(struct journal *j, u64 seq_to_flush, @@ -51343,7 +51780,7 @@ index 000000000000..57591983eebd + + mutex_lock(&j->reclaim_lock); + -+ *did_work = journal_flush_pins(j, seq_to_flush, 0); ++ *did_work = 
journal_flush_pins(j, seq_to_flush, 0) != 0; + + spin_lock(&j->lock); + /* @@ -51422,10 +51859,10 @@ index 000000000000..57591983eebd +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h new file mode 100644 -index 000000000000..8128907a7623 +index 000000000000..bae2c9210db8 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.h -@@ -0,0 +1,69 @@ +@@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_RECLAIM_H +#define _BCACHEFS_JOURNAL_RECLAIM_H @@ -51438,6 +51875,17 @@ index 000000000000..8128907a7623 + journal_space_clean, +}; + ++static inline void journal_reclaim_kick(struct journal *j) ++{ ++ struct task_struct *p = READ_ONCE(j->reclaim_thread); ++ ++ if (p && !j->reclaim_kicked) { ++ j->reclaim_kicked = true; ++ if (p) ++ wake_up_process(p); ++ } ++} ++ +unsigned bch2_journal_dev_buckets_available(struct journal *, + struct journal_device *, + enum journal_space_from); @@ -51483,7 +51931,9 @@ index 000000000000..8128907a7623 + +void bch2_journal_do_discards(struct journal *); +void bch2_journal_reclaim(struct journal *); -+void bch2_journal_reclaim_work(struct work_struct *); ++ ++void bch2_journal_reclaim_stop(struct journal *); ++int bch2_journal_reclaim_start(struct journal *); + +bool bch2_journal_flush_pins(struct journal *, u64); + @@ -51840,10 +52290,10 @@ index 000000000000..afb886ec8e25 +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 -index 000000000000..154b51b891d3 +index 000000000000..4640bb8687cc --- /dev/null +++ b/fs/bcachefs/journal_types.h -@@ -0,0 +1,277 @@ +@@ -0,0 +1,288 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_TYPES_H +#define _BCACHEFS_JOURNAL_TYPES_H @@ -51973,7 +52423,6 @@ index 000000000000..154b51b891d3 + JOURNAL_STARTED, + JOURNAL_RECLAIM_STARTED, + JOURNAL_NEED_WRITE, -+ JOURNAL_NOT_EMPTY, + JOURNAL_MAY_GET_UNRESERVED, +}; + @@ -51993,7 +52442,13 @@ index 000000000000..154b51b891d3 + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ -+ int cur_entry_error; ++ enum { ++ cur_entry_ok, ++ cur_entry_blocked, ++ cur_entry_journal_full, ++ cur_entry_journal_pin_full, ++ cur_entry_insufficient_devices, ++ } cur_entry_error; + + union journal_preres_state prereserved; + @@ -52027,6 +52482,8 @@ index 000000000000..154b51b891d3 + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; + u64 last_seq_ondisk; ++ u64 err_seq; ++ u64 last_empty_seq; + + /* + * FIFO of journal entries whose btree updates have not yet been @@ -52055,8 +52512,12 @@ index 000000000000..154b51b891d3 + struct write_point wp; + spinlock_t err_lock; + -+ struct delayed_work reclaim_work; + struct mutex reclaim_lock; ++ struct task_struct *reclaim_thread; ++ bool reclaim_kicked; ++ u64 nr_direct_reclaim; ++ u64 nr_background_reclaim; ++ + unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + wait_queue_head_t pin_flush_wait; @@ -53417,7 +53878,7 @@ index 000000000000..fc0de165af9f +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 -index 000000000000..ddfda1ef8a79 +index 000000000000..4834f41f48ed --- /dev/null +++ b/fs/bcachefs/movinggc.c @@ -0,0 +1,364 @@ @@ -53768,7 +54229,7 @@ index 000000000000..ddfda1ef8a79 + if (bch2_fs_init_fault("copygc_start")) + return -ENOMEM; + -+ t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); ++ t = 
kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); + if (IS_ERR(t)) + return PTR_ERR(t); + @@ -55607,7 +56068,7 @@ index 000000000000..6a136083d389 +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000000..44d2651be970 +index 000000000000..c3373c48fa81 --- /dev/null +++ b/fs/bcachefs/rebalance.c @@ -0,0 +1,332 @@ @@ -55927,7 +56388,7 @@ index 000000000000..44d2651be970 + if (c->opts.nochanges) + return 0; + -+ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); ++ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); + if (IS_ERR(p)) + return PTR_ERR(p); + @@ -56012,10 +56473,10 @@ index 000000000000..192c6be20ced +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000000..32fed6b81a52 +index 000000000000..6750063663b5 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1366 @@ +@@ -0,0 +1,1369 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -56474,6 +56935,7 @@ index 000000000000..32fed6b81a52 + __bch2_btree_iter_set_pos(split_iter, split->k.p, false); + bch2_trans_update(&trans, split_iter, split, + BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(&trans, split_iter); + + bch2_btree_iter_set_pos(iter, split->k.p); + @@ -56499,6 +56961,8 @@ index 000000000000..32fed6b81a52 + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_JOURNAL_REPLAY); +err: ++ bch2_trans_iter_put(&trans, iter); ++ + if (ret == -EINTR) + goto retry; + @@ -57338,7 +57802,7 @@ index 000000000000..32fed6b81a52 + bch2_inode_init(c, &root_inode, 0, 0, + S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; -+ bch2_inode_pack(&packed_inode, &root_inode); ++ bch2_inode_pack(c, &packed_inode, &root_inode); + + err = "error creating root directory"; + ret = bch2_btree_insert(c, BTREE_ID_INODES, @@ -60958,10 +61422,10 @@ index 000000000000..7a068158efca +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000000..015bbd9f21fd +index 000000000000..e3bbd0b0d698 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,2037 @@ +@@ -0,0 +1,2049 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -61013,7 +61477,6 @@ index 000000000000..015bbd9f21fd +#include +#include +#include -+#include +#include +#include +#include @@ -61223,7 +61686,7 @@ index 000000000000..015bbd9f21fd +void bch2_fs_read_only(struct bch_fs *c) +{ + if (!test_bit(BCH_FS_RW, &c->flags)) { -+ cancel_delayed_work_sync(&c->journal.reclaim_work); ++ BUG_ON(c->journal.reclaim_thread); + return; + } + @@ -61381,6 +61844,12 @@ index 000000000000..015bbd9f21fd + + set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); + ++ ret = bch2_journal_reclaim_start(&c->journal); ++ if (ret) { ++ bch_err(c, "error starting journal reclaim: %i", ret); ++ return ret; ++ } ++ + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) @@ -61389,9 +61858,6 @@ index 000000000000..015bbd9f21fd + + percpu_ref_reinit(&c->writes); + set_bit(BCH_FS_RW, &c->flags); -+ -+ queue_delayed_work(c->journal_reclaim_wq, -+ &c->journal.reclaim_work, 0); + return 0; +err: + __bch2_fs_read_only(c); @@ -61415,6 +61881,7 @@ index 000000000000..015bbd9f21fd +static void __bch2_fs_free(struct bch_fs *c) +{ + unsigned i; ++ int cpu; + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); @@ -61439,6 
+61906,12 @@ index 000000000000..015bbd9f21fd + free_percpu(c->usage[1]); + free_percpu(c->usage[0]); + kfree(c->usage_base); ++ ++ if (c->btree_iters_bufs) ++ for_each_possible_cpu(cpu) ++ kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter); ++ ++ free_percpu(c->btree_iters_bufs); + free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); + mempool_exit(&c->btree_bounce_pool); @@ -61449,10 +61922,9 @@ index 000000000000..015bbd9f21fd + kfree(c->replicas_gc.entries); + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); ++ kfree(c->unused_inode_hints); + free_heap(&c->copygc_heap); + -+ if (c->journal_reclaim_wq) -+ destroy_workqueue(c->journal_reclaim_wq); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); + if (c->wq) @@ -61700,12 +62172,12 @@ index 000000000000..015bbd9f21fd + (btree_blocks(c) + 1) * 2 * + sizeof(struct sort_iter_set); + ++ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); ++ + if (!(c->wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->copygc_wq = alloc_workqueue("bcache_copygc", ++ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || @@ -61714,9 +62186,12 @@ index 000000000000..015bbd9f21fd + offsetof(struct btree_write_bio, wbio.bio)), + BIOSET_NEED_BVECS) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || ++ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, ++ sizeof(u64), GFP_KERNEL)) || + bch2_io_clock_init(&c->io_clock[READ]) || + bch2_io_clock_init(&c->io_clock[WRITE]) || + bch2_fs_journal_init(&c->journal) || @@ -62969,6 +63444,7 @@ index 000000000000..015bbd9f21fd + bch2_debug_exit(); + bch2_vfs_exit(); + bch2_chardev_exit(); ++ bch2_btree_key_cache_exit(); + if (bcachefs_kset) + kset_unregister(bcachefs_kset); +} @@ -62976,9 +63452,9 @@ index 000000000000..015bbd9f21fd +static int __init bcachefs_init(void) +{ + bch2_bkey_pack_test(); -+ bch2_inode_pack_test(); + + if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_btree_key_cache_init() || + bch2_chardev_init() || + bch2_vfs_init() || + bch2_debug_init()) @@ -63305,10 +63781,10 @@ index 000000000000..20406ebd6f5b +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000000..0cb29f43d99d +index 000000000000..900eda88a5dc --- /dev/null +++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1074 @@ +@@ -0,0 +1,1062 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces @@ -63476,6 +63952,7 @@ index 000000000000..0cb29f43d99d +read_attribute(journal_pins); +read_attribute(btree_updates); +read_attribute(dirty_btree_nodes); ++read_attribute(btree_cache); +read_attribute(btree_key_cache); +read_attribute(btree_transactions); +read_attribute(stripes_heap); @@ -63519,12 +63996,6 @@ index 000000000000..0cb29f43d99d +write_attribute(perf_test); +#endif /* CONFIG_BCACHEFS_TESTS */ + -+#define BCH_DEBUG_PARAM(name, description) \ -+ rw_attribute(name); -+ -+ BCH_DEBUG_PARAMS() 
-+#undef BCH_DEBUG_PARAM -+ +#define x(_name) \ + static struct attribute sysfs_time_stat_##_name = \ + { .name = #_name, .mode = S_IRUGO }; @@ -63691,6 +64162,11 @@ index 000000000000..0cb29f43d99d + return out.pos - buf; + } + ++ if (attr == &sysfs_btree_cache) { ++ bch2_btree_cache_to_text(&out, c); ++ return out.pos - buf; ++ } ++ + if (attr == &sysfs_btree_key_cache) { + bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); + return out.pos - buf; @@ -63725,10 +64201,6 @@ index 000000000000..0cb29f43d99d + return out.pos - buf; + } + -+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ + return 0; +} + @@ -63773,17 +64245,13 @@ index 000000000000..0cb29f43d99d + + /* Debugging: */ + -+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EPERM; + + /* Debugging: */ + + if (attr == &sysfs_trigger_journal_flush) -+ bch2_journal_meta_async(&c->journal, NULL); ++ bch2_journal_meta(&c->journal); + + if (attr == &sysfs_trigger_btree_coalesce) + bch2_coalesce(c); @@ -63875,6 +64343,7 @@ index 000000000000..0cb29f43d99d + &sysfs_journal_pins, + &sysfs_btree_updates, + &sysfs_dirty_btree_nodes, ++ &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_btree_transactions, + &sysfs_stripes_heap, @@ -63901,11 +64370,6 @@ index 000000000000..0cb29f43d99d + &sysfs_io_timers_write, + + &sysfs_internal_uuid, -+ -+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ + NULL +}; + @@ -65205,7 +65669,7 @@ index 000000000000..59e8dfa3d245 +#include diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000000..fd4044a6a08f +index 000000000000..2709163e02b5 --- /dev/null +++ b/fs/bcachefs/util.c @@ -0,0 +1,907 @@ @@ -65731,7 +66195,7 @@ index 000000000000..fd4044a6a08f +{ + while (size) { + struct page *page = alloc_page(gfp_mask); -+ unsigned len = min(PAGE_SIZE, size); ++ unsigned len = min_t(size_t, PAGE_SIZE, size); + + if (!page) + return -ENOMEM; @@ -66118,10 +66582,10 @@ index 000000000000..fd4044a6a08f +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000000..f48c6380684f +index 000000000000..6e5335440b4b --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,761 @@ +@@ -0,0 +1,750 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -66161,17 +66625,6 @@ index 000000000000..f48c6380684f +#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) +#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) + -+#define memcpy(dst, src, len) \ -+({ \ -+ void *_dst = (dst); \ -+ const void *_src = (src); \ -+ size_t _len = (len); \ -+ \ -+ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ -+ (void *) (_dst) + (_len) <= (void *) (_src))); \ -+ memcpy(_dst, _src, _len); \ -+}) -+ +#else /* DEBUG */ + +#define EBUG_ON(cond) @@ -66883,6 +67336,68 @@ index 000000000000..f48c6380684f +#define cmp_int(l, r) ((l > r) - (l < r)) + +#endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +new file mode 100644 +index 000000000000..a3d252c741c8 +--- /dev/null ++++ b/fs/bcachefs/varint.c +@@ -0,0 +1,42 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++ ++#include "varint.h" ++ ++int bch2_varint_encode(u8 *out, u64 v) ++{ ++ unsigned bits = fls64(v|1); ++ unsigned bytes = 
DIV_ROUND_UP(bits, 7); ++ ++ if (likely(bytes < 9)) { ++ v <<= bytes; ++ v |= ~(~0 << (bytes - 1)); ++ } else { ++ *out++ = 255; ++ bytes = 9; ++ } ++ ++ put_unaligned_le64(v, out); ++ return bytes; ++} ++ ++int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) ++{ ++ u64 v = get_unaligned_le64(in); ++ unsigned bytes = ffz(v & 255) + 1; ++ ++ if (unlikely(in + bytes > end)) ++ return -1; ++ ++ if (likely(bytes < 9)) { ++ v >>= bytes; ++ v &= ~(~0ULL << (7 * bytes)); ++ } else { ++ v = get_unaligned_le64(++in); ++ } ++ ++ *out = v; ++ return bytes; ++} +diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h +new file mode 100644 +index 000000000000..8daf813576b7 +--- /dev/null ++++ b/fs/bcachefs/varint.h +@@ -0,0 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_VARINT_H ++#define _BCACHEFS_VARINT_H ++ ++int bch2_varint_encode(u8 *, u64); ++int bch2_varint_decode(const u8 *, const u8 *, u64 *); ++ ++#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 index 000000000000..c099cdc0605f @@ -67670,57 +68185,6 @@ index ea0485861d93..b4d6e3e86285 100644 d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); -diff --git a/fs/inode.c b/fs/inode.c -index 72c4c347afb7..e70ad3d2d01c 100644 ---- a/fs/inode.c -+++ b/fs/inode.c -@@ -1578,6 +1578,46 @@ int insert_inode_locked(struct inode *inode) - } - EXPORT_SYMBOL(insert_inode_locked); - -+struct inode *insert_inode_locked2(struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ ino_t ino = inode->i_ino; -+ struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ -+ while (1) { -+ struct inode *old = NULL; -+ spin_lock(&inode_hash_lock); -+ hlist_for_each_entry(old, head, i_hash) { -+ if (old->i_ino != ino) -+ continue; -+ if (old->i_sb != sb) -+ continue; -+ spin_lock(&old->i_lock); -+ if (old->i_state & (I_FREEING|I_WILL_FREE)) { -+ spin_unlock(&old->i_lock); -+ continue; -+ } -+ break; -+ } -+ if (likely(!old)) { -+ spin_lock(&inode->i_lock); -+ inode->i_state |= I_NEW | I_CREATING; -+ hlist_add_head(&inode->i_hash, head); -+ spin_unlock(&inode->i_lock); -+ spin_unlock(&inode_hash_lock); -+ return NULL; -+ } -+ __iget(old); -+ spin_unlock(&old->i_lock); -+ spin_unlock(&inode_hash_lock); -+ wait_on_inode(old); -+ if (unlikely(!inode_unhashed(old))) -+ return old; -+ iput(old); -+ } -+} -+EXPORT_SYMBOL(insert_inode_locked2); -+ - int insert_inode_locked4(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) - { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 868e11face00..d9e3b7b0175e 100644 --- a/include/linux/blkdev.h @@ -68164,18 +68628,6 @@ index 65d975bf9390..008573618071 100644 extern void d_tmpfile(struct dentry *, struct inode *); extern struct dentry *d_find_alias(struct inode *); -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 7519ae003a08..305d316f01f3 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -2953,6 +2953,7 @@ extern struct inode *find_inode_rcu(struct super_block *, unsigned long, - extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); - extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); - extern int insert_inode_locked(struct inode *); -+extern struct inode *insert_inode_locked2(struct inode *); - #ifdef CONFIG_DEBUG_LOCK_ALLOC - extern void lockdep_annotate_inode_mutex_key(struct inode *inode); - #else diff --git a/include/linux/pagemap.h 
b/include/linux/pagemap.h index 434c9c34aeb6..620535006624 100644 --- a/include/linux/pagemap.h @@ -68221,6 +68673,19 @@ index 434c9c34aeb6..620535006624 100644 /** * struct readahead_control - Describes a readahead request. * +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index d15d46db61f7..750554aba7f9 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -33,6 +33,8 @@ + #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) + #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) + #define ulong2long(a) (*(long *)(&(a))) ++#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) ++#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) + + /* Exported common interfaces */ + void call_rcu(struct rcu_head *head, rcu_callback_t func); diff --git a/include/linux/sched.h b/include/linux/sched.h index afe01e232935..793b07788062 100644 --- a/include/linux/sched.h @@ -68436,6 +68901,42 @@ index 000000000000..a16e94f482e9 +void six_lock_wakeup_all(struct six_lock *); + +#endif /* _LINUX_SIX_H */ +diff --git a/include/linux/srcu.h b/include/linux/srcu.h +index e432cc92c73d..a0895bbf71ce 100644 +--- a/include/linux/srcu.h ++++ b/include/linux/srcu.h +@@ -60,6 +60,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp); + int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); + void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); + void synchronize_srcu(struct srcu_struct *ssp); ++unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); ++unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); ++bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + +diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h +index 5a5a1941ca15..fed4a2d9d0b6 100644 +--- a/include/linux/srcutiny.h ++++ b/include/linux/srcutiny.h +@@ -15,7 +15,7 @@ + + struct srcu_struct { + short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ +- short srcu_idx; /* Current reader array element. */ ++ unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ + u8 srcu_gp_running; /* GP workqueue running? */ + u8 srcu_gp_waiting; /* GP waiting for readers? 
*/ + struct swait_queue_head srcu_wq; +@@ -59,7 +59,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) + { + int idx; + +- idx = READ_ONCE(ssp->srcu_idx); ++ idx = (READ_ONCE(ssp->srcu_idx) & 0x2) / 2; + WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1); + return idx; + } diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 0221f852a7e1..f81f60d891ac 100644 --- a/include/linux/vmalloc.h @@ -68450,10 +68951,10 @@ index 0221f852a7e1..f81f60d891ac 100644 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h new file mode 100644 -index 000000000000..9b4e8295ed75 +index 000000000000..d4cb7a298cc2 --- /dev/null +++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,664 @@ +@@ -0,0 +1,760 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs @@ -68577,6 +69078,65 @@ index 000000000000..9b4e8295ed75 + TP_ARGS(bio) +); + ++TRACE_EVENT(journal_reclaim_start, ++ TP_PROTO(struct bch_fs *c, u64 min_nr, ++ u64 prereserved, u64 prereserved_total, ++ u64 btree_cache_dirty, u64 btree_cache_total, ++ u64 btree_key_cache_dirty, u64 btree_key_cache_total), ++ TP_ARGS(c, min_nr, prereserved, prereserved_total, ++ btree_cache_dirty, btree_cache_total, ++ btree_key_cache_dirty, btree_key_cache_total), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, min_nr ) ++ __field(u64, prereserved ) ++ __field(u64, prereserved_total ) ++ __field(u64, btree_cache_dirty ) ++ __field(u64, btree_cache_total ) ++ __field(u64, btree_key_cache_dirty ) ++ __field(u64, btree_key_cache_total ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->min_nr = min_nr; ++ __entry->prereserved = prereserved; ++ __entry->prereserved_total = prereserved_total; ++ __entry->btree_cache_dirty = btree_cache_dirty; ++ __entry->btree_cache_total = btree_cache_total; ++ __entry->btree_key_cache_dirty = btree_key_cache_dirty; ++ __entry->btree_key_cache_total = btree_key_cache_total; ++ ), ++ ++ TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", ++ __entry->uuid, ++ __entry->min_nr, ++ __entry->prereserved, ++ __entry->prereserved_total, ++ __entry->btree_cache_dirty, ++ __entry->btree_cache_total, ++ __entry->btree_key_cache_dirty, ++ __entry->btree_key_cache_total) ++); ++ ++TRACE_EVENT(journal_reclaim_finish, ++ TP_PROTO(struct bch_fs *c, u64 nr_flushed), ++ TP_ARGS(c, nr_flushed), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, nr_flushed ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->nr_flushed = nr_flushed; ++ ), ++ ++ TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed) ++); ++ +/* bset.c: */ + +DEFINE_EVENT(bpos, bkey_pack_pos_fail, @@ -68969,7 +69529,7 @@ index 000000000000..9b4e8295ed75 + __entry->ip = ip; + ), + -+ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) ++ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip) +); + +DECLARE_EVENT_CLASS(transaction_restart, @@ -68984,7 +69544,7 @@ index 000000000000..9b4e8295ed75 + __entry->ip = ip; + ), + -+ TP_printk("%pf", (void *) __entry->ip) ++ TP_printk("%ps", (void *) __entry->ip) +); + +DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, @@ -68992,9 +69552,46 @@ index 000000000000..9b4e8295ed75 + TP_ARGS(ip) +); + -+DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, -+ TP_PROTO(unsigned 
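For orientation, the srcutiny changes repurpose srcu_idx as a small grace-period sequence counter: srcu_drive_gp() bumps it once when a grace period starts and once when it ends, so bit 0 means "grace period in flight" and bit 1 selects the srcu_lock_nesting[] slot current readers use. Restated as two illustrative helpers (the kernel open-codes these expressions; they are not part of the patch):

/* Which reader slot is current: the same (idx & 0x2) / 2 expression
 * used by __srcu_read_lock() and srcu_drive_gp(). */
static inline int srcu_reader_slot(unsigned short srcu_idx)
{
	return (srcu_idx & 0x2) / 2;
}

/* The counter is odd exactly while a grace period is being driven. */
static inline int srcu_gp_in_flight(unsigned short srcu_idx)
{
	return srcu_idx & 0x1;
}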
long ip), -+ TP_ARGS(ip) ++TRACE_EVENT(trans_restart_would_deadlock, ++ TP_PROTO(unsigned long trans_ip, ++ unsigned long caller_ip, ++ unsigned reason, ++ enum btree_id have_btree_id, ++ unsigned have_iter_type, ++ enum btree_id want_btree_id, ++ unsigned want_iter_type), ++ TP_ARGS(trans_ip, caller_ip, reason, ++ have_btree_id, have_iter_type, ++ want_btree_id, want_iter_type), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, trans_ip ) ++ __field(unsigned long, caller_ip ) ++ __field(u8, reason ) ++ __field(u8, have_btree_id ) ++ __field(u8, have_iter_type ) ++ __field(u8, want_btree_id ) ++ __field(u8, want_iter_type ) ++ ), ++ ++ TP_fast_assign( ++ __entry->trans_ip = trans_ip; ++ __entry->caller_ip = caller_ip; ++ __entry->reason = reason; ++ __entry->have_btree_id = have_btree_id; ++ __entry->have_iter_type = have_iter_type; ++ __entry->want_btree_id = want_btree_id; ++ __entry->want_iter_type = want_iter_type; ++ ), ++ ++ TP_printk("%ps %pS because %u have %u:%u want %u:%u", ++ (void *) __entry->trans_ip, ++ (void *) __entry->caller_ip, ++ __entry->reason, ++ __entry->have_btree_id, ++ __entry->have_iter_type, ++ __entry->want_btree_id, ++ __entry->want_iter_type) +); + +TRACE_EVENT(trans_restart_iters_realloced, @@ -69011,7 +69608,7 @@ index 000000000000..9b4e8295ed75 + __entry->nr = nr; + ), + -+ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) ++ TP_printk("%ps nr %u", (void *) __entry->ip, __entry->nr) +); + +TRACE_EVENT(trans_restart_mem_realloced, @@ -69028,7 +69625,7 @@ index 000000000000..9b4e8295ed75 + __entry->bytes = bytes; + ), + -+ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) ++ TP_printk("%ps bytes %lu", (void *) __entry->ip, __entry->bytes) +); + +DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, @@ -69041,6 +69638,11 @@ index 000000000000..9b4e8295ed75 + TP_ARGS(ip) +); + ++DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ +DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, + TP_PROTO(unsigned long ip), + TP_ARGS(ip) @@ -69076,11 +69678,6 @@ index 000000000000..9b4e8295ed75 + TP_ARGS(ip) +); + -+DEFINE_EVENT(transaction_restart, trans_restart_atomic, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ +DECLARE_EVENT_CLASS(node_lock_fail, + TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), + TP_ARGS(level, iter_seq, node, node_seq), @@ -69746,6 +70343,270 @@ index 1c5cff34d9f2..8f9f37b0bfaa 100644 } bool __weak module_init_section(const char *name) +diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c +index 6208c1dae5c9..26a5122b9923 100644 +--- a/kernel/rcu/srcutiny.c ++++ b/kernel/rcu/srcutiny.c +@@ -114,7 +114,8 @@ void srcu_drive_gp(struct work_struct *wp) + struct srcu_struct *ssp; + + ssp = container_of(wp, struct srcu_struct, srcu_work); +- if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) ++ // if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) ++ if (ssp->srcu_gp_running) + return; /* Already running or nothing to do. */ + + /* Remove recently arrived callbacks and wait for readers. */ +@@ -124,11 +125,12 @@ void srcu_drive_gp(struct work_struct *wp) + ssp->srcu_cb_head = NULL; + ssp->srcu_cb_tail = &ssp->srcu_cb_head; + local_irq_enable(); +- idx = ssp->srcu_idx; +- WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); ++ idx = (ssp->srcu_idx & 0x2) / 2; ++ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! 
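The journal_reclaim_start/finish events above are designed to bracket one reclaim pass and record how much work it had to do. A hypothetical call site follows — the counters and the surrounding loop are stand-ins, not the patch's reclaim code:

#include <trace/events/bcachefs.h>

/* Hypothetical sketch: where the two new events would fire around a
 * reclaim pass. All eight counters are stand-ins for the caller's
 * real bookkeeping. */
static u64 journal_reclaim_pass(struct bch_fs *c, u64 min_nr)
{
	u64 nr_flushed = 0;

	trace_journal_reclaim_start(c, min_nr,
				    0, 0,	/* prereserved, prereserved_total */
				    0, 0,	/* btree_cache dirty/total */
				    0, 0);	/* btree_key_cache dirty/total */

	/* ... flush dirty nodes and cached keys, incrementing nr_flushed ... */

	trace_journal_reclaim_finish(c, nr_flushed);
	return nr_flushed;
}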
*/ + swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); + WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ ++ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + + /* Invoke the callbacks we removed above. */ + while (lh) { +@@ -151,6 +153,16 @@ void srcu_drive_gp(struct work_struct *wp) + } + EXPORT_SYMBOL_GPL(srcu_drive_gp); + ++static void srcu_gp_start_if_needed(struct srcu_struct *ssp) ++{ ++ if (!READ_ONCE(ssp->srcu_gp_running)) { ++ if (likely(srcu_init_done)) ++ schedule_work(&ssp->srcu_work); ++ else if (list_empty(&ssp->srcu_work.entry)) ++ list_add(&ssp->srcu_work.entry, &srcu_boot_list); ++ } ++} ++ + /* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. +@@ -166,12 +178,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + *ssp->srcu_cb_tail = rhp; + ssp->srcu_cb_tail = &rhp->next; + local_irq_restore(flags); +- if (!READ_ONCE(ssp->srcu_gp_running)) { +- if (likely(srcu_init_done)) +- schedule_work(&ssp->srcu_work); +- else if (list_empty(&ssp->srcu_work.entry)) +- list_add(&ssp->srcu_work.entry, &srcu_boot_list); +- } ++ srcu_gp_start_if_needed(ssp); + } + EXPORT_SYMBOL_GPL(call_srcu); + +@@ -190,6 +197,48 @@ void synchronize_srcu(struct srcu_struct *ssp) + } + EXPORT_SYMBOL_GPL(synchronize_srcu); + ++/* ++ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie ++ */ ++unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) ++{ ++ unsigned long ret; ++ ++ barrier(); ++ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; ++ barrier(); ++ return ret & USHRT_MAX; ++} ++EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); ++ ++/* ++ * start_poll_synchronize_srcu - Provide cookie and start grace period ++ * ++ * The difference between this and get_state_synchronize_srcu() is that ++ * this function ensures that the poll_state_synchronize_srcu() will ++ * eventually return the value true. ++ */ ++unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) ++{ ++ unsigned long ret = get_state_synchronize_srcu(ssp); ++ ++ srcu_gp_start_if_needed(ssp); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); ++ ++/* ++ * poll_state_synchronize_srcu - Has cookie's grace period ended? ++ */ ++bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) ++{ ++ bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); ++ ++ barrier(); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); ++ + /* Lockdep diagnostics. */ + void __init rcu_scheduler_starting(void) + { +diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c +index c100acf332ed..1256de299a96 100644 +--- a/kernel/rcu/srcutree.c ++++ b/kernel/rcu/srcutree.c +@@ -818,6 +818,45 @@ static void srcu_leak_callback(struct rcu_head *rhp) + { + } + ++/* ++ * Start an SRCU grace period, and also queue the callback if non-NULL. 
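Together, the three new entry points enable a poll-instead-of-block pattern: take a cookie when an object is retired, and free it only once poll_state_synchronize_srcu() says its grace period has elapsed. A minimal sketch of such a caller, with hypothetical names — this is the shape of use the new API targets, not code from the patch:

#include <linux/slab.h>
#include <linux/srcu.h>

struct deferred_free {
	unsigned long	srcu_seq;	/* cookie from start_poll_synchronize_srcu() */
	void		*obj;
};

/* Retire an object without blocking: snapshot a cookie and make sure
 * a grace period is underway. */
static void retire_obj(struct srcu_struct *ssp, struct deferred_free *d, void *obj)
{
	d->obj = obj;
	d->srcu_seq = start_poll_synchronize_srcu(ssp);
}

/* Opportunistically reclaim: free only if the cookie's grace period
 * has elapsed, otherwise report "not yet" so the caller retries later. */
static bool try_reclaim_obj(struct srcu_struct *ssp, struct deferred_free *d)
{
	if (!poll_state_synchronize_srcu(ssp, d->srcu_seq))
		return false;		/* readers may still see d->obj */

	kfree(d->obj);
	d->obj = NULL;
	return true;
}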
++ */ ++static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, ++ struct rcu_head *rhp, bool do_norm) ++{ ++ unsigned long flags; ++ int idx; ++ bool needexp = false; ++ bool needgp = false; ++ unsigned long s; ++ struct srcu_data *sdp; ++ ++ idx = srcu_read_lock(ssp); ++ sdp = raw_cpu_ptr(ssp->sda); ++ spin_lock_irqsave_rcu_node(sdp, flags); ++ if (rhp) ++ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); ++ rcu_segcblist_advance(&sdp->srcu_cblist, ++ rcu_seq_current(&ssp->srcu_gp_seq)); ++ s = rcu_seq_snap(&ssp->srcu_gp_seq); ++ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); ++ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { ++ sdp->srcu_gp_seq_needed = s; ++ needgp = true; ++ } ++ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { ++ sdp->srcu_gp_seq_needed_exp = s; ++ needexp = true; ++ } ++ spin_unlock_irqrestore_rcu_node(sdp, flags); ++ if (needgp) ++ srcu_funnel_gp_start(ssp, sdp, s, do_norm); ++ else if (needexp) ++ srcu_funnel_exp_start(ssp, sdp->mynode, s); ++ srcu_read_unlock(ssp, idx); ++ return s; ++} ++ + /* + * Enqueue an SRCU callback on the srcu_data structure associated with + * the current CPU and the specified srcu_struct structure, initiating +@@ -849,13 +888,6 @@ static void srcu_leak_callback(struct rcu_head *rhp) + static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + rcu_callback_t func, bool do_norm) + { +- unsigned long flags; +- int idx; +- bool needexp = false; +- bool needgp = false; +- unsigned long s; +- struct srcu_data *sdp; +- + check_init_srcu_struct(ssp); + if (debug_rcu_head_queue(rhp)) { + /* Probable double call_srcu(), so leak the callback. */ +@@ -864,28 +896,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + return; + } + rhp->func = func; +- idx = srcu_read_lock(ssp); +- sdp = raw_cpu_ptr(ssp->sda); +- spin_lock_irqsave_rcu_node(sdp, flags); +- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); +- rcu_segcblist_advance(&sdp->srcu_cblist, +- rcu_seq_current(&ssp->srcu_gp_seq)); +- s = rcu_seq_snap(&ssp->srcu_gp_seq); +- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); +- if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { +- sdp->srcu_gp_seq_needed = s; +- needgp = true; +- } +- if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { +- sdp->srcu_gp_seq_needed_exp = s; +- needexp = true; +- } +- spin_unlock_irqrestore_rcu_node(sdp, flags); +- if (needgp) +- srcu_funnel_gp_start(ssp, sdp, s, do_norm); +- else if (needexp) +- srcu_funnel_exp_start(ssp, sdp->mynode, s); +- srcu_read_unlock(ssp, idx); ++ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm); + } + + /** +@@ -1014,6 +1025,60 @@ void synchronize_srcu(struct srcu_struct *ssp) + } + EXPORT_SYMBOL_GPL(synchronize_srcu); + ++/** ++ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie ++ * @ssp: srcu_struct to provide cookie for. ++ * ++ * This function returns a cookie that can be passed to ++ * poll_state_synchronize_srcu(), which will return true if a full grace ++ * period has elapsed in the meantime. It is the caller's responsibility ++ * to make sure that grace period happens, for example, by invoking ++ * call_srcu() after return from get_state_synchronize_srcu(). ++ */ ++unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) ++{ ++ // Any prior manipulation of SRCU-protected data must happen ++ // before the load from ->srcu_gp_seq. 
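On the tree side the cookie is simply rcu_seq_snap() of ->srcu_gp_seq, and poll_state checks it with rcu_seq_done(); those helpers live in kernel/rcu/rcu.h. Schematically, with the low two bits of the sequence holding the grace-period phase (a restatement for reference, not new kernel code):

#include <limits.h>

#define RCU_SEQ_CTR_SHIFT	2			/* as in kernel/rcu/rcu.h */
#define RCU_SEQ_STATE_MASK	((1 << RCU_SEQ_CTR_SHIFT) - 1)

/* rcu_seq_snap(): the counter value ->srcu_gp_seq must reach before a
 * grace period starting after this call is known to have completed. */
static unsigned long seq_snap(unsigned long seq)
{
	return (seq + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;
}

/* rcu_seq_done(): wraparound-safe "seq has reached snap". */
static int seq_done(unsigned long seq, unsigned long snap)
{
	return ULONG_MAX / 2 >= seq - snap;	/* ULONG_CMP_GE(seq, snap) */
}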
++ smp_mb(); ++ return rcu_seq_snap(&ssp->srcu_gp_seq); ++} ++EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); ++ ++/** ++ * start_poll_synchronize_srcu - Provide cookie and start grace period ++ * @ssp: srcu_struct to provide cookie for. ++ * ++ * This function returns a cookie that can be passed to ++ * poll_state_synchronize_srcu(), which will return true if a full grace ++ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(), ++ * this function also ensures that any needed SRCU grace period will be ++ * started. This convenience does come at a cost in terms of CPU overhead. ++ */ ++unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) ++{ ++ return srcu_gp_start_if_needed(ssp, NULL, true); ++} ++EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); ++ ++/** ++ * poll_state_synchronize_srcu - Has cookie's grace period ended? ++ * @ssp: srcu_struct to provide cookie for. ++ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu(). ++ * ++ * This function takes the cookie that was returned from either ++ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and ++ * returns @true if an SRCU grace period elapsed since the time that the ++ * cookie was created. ++ */ ++bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) ++{ ++ if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) ++ return false; ++ smp_mb(); // ^^^ ++ return true; ++} ++EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); ++ + /* + * Callback function for srcu_barrier() use. + */ diff --git a/lib/Kconfig b/lib/Kconfig index b4b98a03ff98..7ec0b400c545 100644 --- a/lib/Kconfig @@ -70581,7 +71442,7 @@ index 99c49eeae71b..5b724e5b4b89 100644 /** * generic_file_buffered_read - generic file read routine * @iocb: the iocb to read -@@ -2158,284 +2509,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, +@@ -2158,294 +2509,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, struct iov_iter *iter, ssize_t written) { struct file *filp = iocb->ki_filp; @@ -70808,10 +71669,15 @@ index 99c49eeae71b..5b724e5b4b89 100644 - -page_not_up_to_date: - /* Get exclusive access to the page ... */ -- if (iocb->ki_flags & IOCB_WAITQ) +- if (iocb->ki_flags & IOCB_WAITQ) { +- if (written) { +- put_page(page); +- goto out; +- } - error = lock_page_async(page, iocb->ki_waitq); -- else +- } else { - error = lock_page_killable(page); +- } - if (unlikely(error)) - goto readpage_error; - @@ -70865,10 +71731,15 @@ index 99c49eeae71b..5b724e5b4b89 100644 + ra->prev_pos = iocb->ki_pos; - if (!PageUptodate(page)) { -- if (iocb->ki_flags & IOCB_WAITQ) +- if (iocb->ki_flags & IOCB_WAITQ) { +- if (written) { +- put_page(page); +- goto out; +- } - error = lock_page_async(page, iocb->ki_waitq); -- else +- } else { - error = lock_page_killable(page); +- } - - if (unlikely(error)) - goto readpage_error; @@ -70941,24 +71812,6 @@ index 99c49eeae71b..5b724e5b4b89 100644 return written ? 
written : error; } EXPORT_SYMBOL_GPL(generic_file_buffered_read); -diff --git a/mm/gup.c b/mm/gup.c -index e869c634cc9a..9bfb3e933deb 100644 ---- a/mm/gup.c -+++ b/mm/gup.c -@@ -1085,6 +1085,13 @@ static long __get_user_pages(struct mm_struct *mm, - } - cond_resched(); - -+ if (current->faults_disabled_mapping && -+ vma->vm_file && -+ vma->vm_file->f_mapping == current->faults_disabled_mapping) { -+ ret = -EFAULT; -+ goto out; -+ } -+ - page = follow_page_mask(vma, start, foll_flags, &ctx); - if (!page) { - ret = faultin_page(vma, start, &foll_flags, locked); diff --git a/mm/nommu.c b/mm/nommu.c index 75a327149af1..fe0a77d01656 100644 --- a/mm/nommu.c
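Finally, the mm/gup.c hunk above lets a task mark one mapping as fault-disabled so that get_user_pages() fails with -EFAULT instead of recursing into the filesystem's fault path — useful for direct IO when userspace hands in a buffer mmapped from the very file being read or written. A sketch of the intended usage pattern (an assumption drawn from the hunk, not code from the patch):

#include <linux/sched.h>
#include <linux/uio.h>

/* Sketch: pin a user buffer while holding locks the fault path would
 * also need. If the buffer overlaps `mapping`, gup now returns -EFAULT
 * instead of deadlocking, and the caller can fall back to a path that
 * faults the pages in before retaking its locks. */
static ssize_t pin_user_buffer(struct address_space *mapping,
			       struct iov_iter *iter,
			       struct page **pages, unsigned maxpages)
{
	size_t start;
	ssize_t ret;

	current->faults_disabled_mapping = mapping;
	ret = iov_iter_get_pages(iter, pages, iov_iter_count(iter),
				 maxpages, &start);
	current->faults_disabled_mapping = NULL;

	return ret;	/* -EFAULT: overlap detected, take the fallback path */
}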