72893 lines
1.8 MiB
72893 lines
1.8 MiB
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
|
|
index d4c9a016074b..4ca66f8b2591 100644
|
|
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
|
|
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
|
|
@@ -2600,6 +2600,24 @@ also includes ``DEFINE_SRCU()``, ``DEFINE_STATIC_SRCU()``, and
|
|
``init_srcu_struct()`` APIs for defining and initializing
|
|
``srcu_struct`` structures.
|
|
|
|
+More recently, the SRCU API has added polling interfaces:
|
|
+
|
|
+#. start_poll_synchronize_srcu() returns a cookie identifying
|
|
+ the completion of a future SRCU grace period and ensures
|
|
+ that this grace period will be started.
|
|
+#. poll_state_synchronize_srcu() returns ``true`` iff the
|
|
+ specified cookie corresponds to an already-completed
|
|
+ SRCU grace period.
|
|
+#. get_state_synchronize_srcu() returns a cookie just like
|
|
+ start_poll_synchronize_srcu() does, but differs in that
|
|
+ it does nothing to ensure that any future SRCU grace period
|
|
+ will be started.
|
|
+
|
|
+These functions are used to avoid unnecessary SRCU grace periods in
|
|
+certain types of buffer-cache algorithms having multi-stage age-out
|
|
+mechanisms. The idea is that by the time the block has aged completely
|
|
+out of the cache, an SRCU grace period will be very likely to have elapsed.
|
|
+
|
|
Tasks RCU
|
|
~~~~~~~~~
|
|
|
|
diff --git a/block/bio.c b/block/bio.c
|
|
index 1f2cc1fbe283..e640b0763b1c 100644
|
|
--- a/block/bio.c
|
|
+++ b/block/bio.c
|
|
@@ -1321,6 +1321,7 @@ void bio_set_pages_dirty(struct bio *bio)
|
|
set_page_dirty_lock(bvec->bv_page);
|
|
}
|
|
}
|
|
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
|
|
|
|
/*
|
|
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
|
|
@@ -1380,6 +1381,7 @@ void bio_check_pages_dirty(struct bio *bio)
|
|
spin_unlock_irqrestore(&bio_dirty_lock, flags);
|
|
schedule_work(&bio_dirty_work);
|
|
}
|
|
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
|
|
|
|
static inline bool bio_remaining_done(struct bio *bio)
|
|
{
|
|
diff --git a/block/blk-core.c b/block/blk-core.c
|
|
index 7663a9b94b80..e27b4ba513ce 100644
|
|
--- a/block/blk-core.c
|
|
+++ b/block/blk-core.c
|
|
@@ -218,18 +218,23 @@ int blk_status_to_errno(blk_status_t status)
|
|
}
|
|
EXPORT_SYMBOL_GPL(blk_status_to_errno);
|
|
|
|
-static void print_req_error(struct request *req, blk_status_t status,
|
|
- const char *caller)
|
|
+const char *blk_status_to_str(blk_status_t status)
|
|
{
|
|
int idx = (__force int)status;
|
|
|
|
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
|
|
- return;
|
|
+ return "(invalid error)";
|
|
+ return blk_errors[idx].name;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(blk_status_to_str);
|
|
|
|
+static void print_req_error(struct request *req, blk_status_t status,
|
|
+ const char *caller)
|
|
+{
|
|
printk_ratelimited(KERN_ERR
|
|
"%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
|
|
"phys_seg %u prio class %u\n",
|
|
- caller, blk_errors[idx].name,
|
|
+ caller, blk_status_to_str(status),
|
|
req->rq_disk ? req->rq_disk->disk_name : "?",
|
|
blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
|
|
req->cmd_flags & ~REQ_OP_MASK,
|
|
diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
|
|
index d1ca4d059c20..e63646b103c4 100644
|
|
--- a/drivers/md/bcache/Kconfig
|
|
+++ b/drivers/md/bcache/Kconfig
|
|
@@ -3,6 +3,7 @@
|
|
config BCACHE
|
|
tristate "Block device as cache"
|
|
select CRC64
|
|
+ select CLOSURES
|
|
help
|
|
Allows a block device to be used as cache for other devices; uses
|
|
a btree for indexing and the layout is optimized for SSDs.
|
|
@@ -18,15 +19,6 @@ config BCACHE_DEBUG
|
|
Enables extra debugging tools, allows expensive runtime checks to be
|
|
turned on.
|
|
|
|
-config BCACHE_CLOSURES_DEBUG
|
|
- bool "Debug closures"
|
|
- depends on BCACHE
|
|
- select DEBUG_FS
|
|
- help
|
|
- Keeps all active closures in a linked list and provides a debugfs
|
|
- interface to list them, which makes it possible to see asynchronous
|
|
- operations that get stuck.
|
|
-
|
|
config BCACHE_ASYNC_REGISTRATION
|
|
bool "Asynchronous device registration (EXPERIMENTAL)"
|
|
depends on BCACHE
|
|
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
|
|
index 5b87e59676b8..054e8a33a7ab 100644
|
|
--- a/drivers/md/bcache/Makefile
|
|
+++ b/drivers/md/bcache/Makefile
|
|
@@ -2,6 +2,6 @@
|
|
|
|
obj-$(CONFIG_BCACHE) += bcache.o
|
|
|
|
-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\
|
|
- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
|
|
+bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\
|
|
+ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
|
|
util.o writeback.o features.o
|
|
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
|
|
index 1d57f48307e6..45b296310f43 100644
|
|
--- a/drivers/md/bcache/bcache.h
|
|
+++ b/drivers/md/bcache/bcache.h
|
|
@@ -180,6 +180,7 @@
|
|
|
|
#include <linux/bcache.h>
|
|
#include <linux/bio.h>
|
|
+#include <linux/closure.h>
|
|
#include <linux/kobject.h>
|
|
#include <linux/list.h>
|
|
#include <linux/mutex.h>
|
|
@@ -192,7 +193,6 @@
|
|
|
|
#include "bset.h"
|
|
#include "util.h"
|
|
-#include "closure.h"
|
|
|
|
struct bucket {
|
|
atomic_t pin;
|
|
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
|
|
deleted file mode 100644
|
|
index d8d9394a6beb..000000000000
|
|
--- a/drivers/md/bcache/closure.c
|
|
+++ /dev/null
|
|
@@ -1,207 +0,0 @@
|
|
-// SPDX-License-Identifier: GPL-2.0
|
|
-/*
|
|
- * Asynchronous refcounty things
|
|
- *
|
|
- * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
- * Copyright 2012 Google, Inc.
|
|
- */
|
|
-
|
|
-#include <linux/debugfs.h>
|
|
-#include <linux/module.h>
|
|
-#include <linux/seq_file.h>
|
|
-#include <linux/sched/debug.h>
|
|
-
|
|
-#include "closure.h"
|
|
-
|
|
-static inline void closure_put_after_sub(struct closure *cl, int flags)
|
|
-{
|
|
- int r = flags & CLOSURE_REMAINING_MASK;
|
|
-
|
|
- BUG_ON(flags & CLOSURE_GUARD_MASK);
|
|
- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
|
|
-
|
|
- if (!r) {
|
|
- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
|
|
- atomic_set(&cl->remaining,
|
|
- CLOSURE_REMAINING_INITIALIZER);
|
|
- closure_queue(cl);
|
|
- } else {
|
|
- struct closure *parent = cl->parent;
|
|
- closure_fn *destructor = cl->fn;
|
|
-
|
|
- closure_debug_destroy(cl);
|
|
-
|
|
- if (destructor)
|
|
- destructor(cl);
|
|
-
|
|
- if (parent)
|
|
- closure_put(parent);
|
|
- }
|
|
- }
|
|
-}
|
|
-
|
|
-/* For clearing flags with the same atomic op as a put */
|
|
-void closure_sub(struct closure *cl, int v)
|
|
-{
|
|
- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
|
|
-}
|
|
-
|
|
-/*
|
|
- * closure_put - decrement a closure's refcount
|
|
- */
|
|
-void closure_put(struct closure *cl)
|
|
-{
|
|
- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
|
|
-}
|
|
-
|
|
-/*
|
|
- * closure_wake_up - wake up all closures on a wait list, without memory barrier
|
|
- */
|
|
-void __closure_wake_up(struct closure_waitlist *wait_list)
|
|
-{
|
|
- struct llist_node *list;
|
|
- struct closure *cl, *t;
|
|
- struct llist_node *reverse = NULL;
|
|
-
|
|
- list = llist_del_all(&wait_list->list);
|
|
-
|
|
- /* We first reverse the list to preserve FIFO ordering and fairness */
|
|
- reverse = llist_reverse_order(list);
|
|
-
|
|
- /* Then do the wakeups */
|
|
- llist_for_each_entry_safe(cl, t, reverse, list) {
|
|
- closure_set_waiting(cl, 0);
|
|
- closure_sub(cl, CLOSURE_WAITING + 1);
|
|
- }
|
|
-}
|
|
-
|
|
-/**
|
|
- * closure_wait - add a closure to a waitlist
|
|
- * @waitlist: will own a ref on @cl, which will be released when
|
|
- * closure_wake_up() is called on @waitlist.
|
|
- * @cl: closure pointer.
|
|
- *
|
|
- */
|
|
-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
|
|
-{
|
|
- if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
|
|
- return false;
|
|
-
|
|
- closure_set_waiting(cl, _RET_IP_);
|
|
- atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
|
|
- llist_add(&cl->list, &waitlist->list);
|
|
-
|
|
- return true;
|
|
-}
|
|
-
|
|
-struct closure_syncer {
|
|
- struct task_struct *task;
|
|
- int done;
|
|
-};
|
|
-
|
|
-static void closure_sync_fn(struct closure *cl)
|
|
-{
|
|
- struct closure_syncer *s = cl->s;
|
|
- struct task_struct *p;
|
|
-
|
|
- rcu_read_lock();
|
|
- p = READ_ONCE(s->task);
|
|
- s->done = 1;
|
|
- wake_up_process(p);
|
|
- rcu_read_unlock();
|
|
-}
|
|
-
|
|
-void __sched __closure_sync(struct closure *cl)
|
|
-{
|
|
- struct closure_syncer s = { .task = current };
|
|
-
|
|
- cl->s = &s;
|
|
- continue_at(cl, closure_sync_fn, NULL);
|
|
-
|
|
- while (1) {
|
|
- set_current_state(TASK_UNINTERRUPTIBLE);
|
|
- if (s.done)
|
|
- break;
|
|
- schedule();
|
|
- }
|
|
-
|
|
- __set_current_state(TASK_RUNNING);
|
|
-}
|
|
-
|
|
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
|
-
|
|
-static LIST_HEAD(closure_list);
|
|
-static DEFINE_SPINLOCK(closure_list_lock);
|
|
-
|
|
-void closure_debug_create(struct closure *cl)
|
|
-{
|
|
- unsigned long flags;
|
|
-
|
|
- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
|
|
- cl->magic = CLOSURE_MAGIC_ALIVE;
|
|
-
|
|
- spin_lock_irqsave(&closure_list_lock, flags);
|
|
- list_add(&cl->all, &closure_list);
|
|
- spin_unlock_irqrestore(&closure_list_lock, flags);
|
|
-}
|
|
-
|
|
-void closure_debug_destroy(struct closure *cl)
|
|
-{
|
|
- unsigned long flags;
|
|
-
|
|
- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
|
|
- cl->magic = CLOSURE_MAGIC_DEAD;
|
|
-
|
|
- spin_lock_irqsave(&closure_list_lock, flags);
|
|
- list_del(&cl->all);
|
|
- spin_unlock_irqrestore(&closure_list_lock, flags);
|
|
-}
|
|
-
|
|
-static struct dentry *closure_debug;
|
|
-
|
|
-static int debug_show(struct seq_file *f, void *data)
|
|
-{
|
|
- struct closure *cl;
|
|
-
|
|
- spin_lock_irq(&closure_list_lock);
|
|
-
|
|
- list_for_each_entry(cl, &closure_list, all) {
|
|
- int r = atomic_read(&cl->remaining);
|
|
-
|
|
- seq_printf(f, "%p: %pS -> %pS p %p r %i ",
|
|
- cl, (void *) cl->ip, cl->fn, cl->parent,
|
|
- r & CLOSURE_REMAINING_MASK);
|
|
-
|
|
- seq_printf(f, "%s%s\n",
|
|
- test_bit(WORK_STRUCT_PENDING_BIT,
|
|
- work_data_bits(&cl->work)) ? "Q" : "",
|
|
- r & CLOSURE_RUNNING ? "R" : "");
|
|
-
|
|
- if (r & CLOSURE_WAITING)
|
|
- seq_printf(f, " W %pS\n",
|
|
- (void *) cl->waiting_on);
|
|
-
|
|
- seq_printf(f, "\n");
|
|
- }
|
|
-
|
|
- spin_unlock_irq(&closure_list_lock);
|
|
- return 0;
|
|
-}
|
|
-
|
|
-DEFINE_SHOW_ATTRIBUTE(debug);
|
|
-
|
|
-void __init closure_debug_init(void)
|
|
-{
|
|
- if (!IS_ERR_OR_NULL(bcache_debug))
|
|
- /*
|
|
- * it is unnecessary to check return value of
|
|
- * debugfs_create_file(), we should not care
|
|
- * about this.
|
|
- */
|
|
- closure_debug = debugfs_create_file(
|
|
- "closures", 0400, bcache_debug, NULL, &debug_fops);
|
|
-}
|
|
-#endif
|
|
-
|
|
-MODULE_AUTHOR("Kent Overstreet <koverstreet@google.com>");
|
|
-MODULE_LICENSE("GPL");
|
|
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
|
|
deleted file mode 100644
|
|
index c88cdc4ae4ec..000000000000
|
|
--- a/drivers/md/bcache/closure.h
|
|
+++ /dev/null
|
|
@@ -1,378 +0,0 @@
|
|
-/* SPDX-License-Identifier: GPL-2.0 */
|
|
-#ifndef _LINUX_CLOSURE_H
|
|
-#define _LINUX_CLOSURE_H
|
|
-
|
|
-#include <linux/llist.h>
|
|
-#include <linux/sched.h>
|
|
-#include <linux/sched/task_stack.h>
|
|
-#include <linux/workqueue.h>
|
|
-
|
|
-/*
|
|
- * Closure is perhaps the most overused and abused term in computer science, but
|
|
- * since I've been unable to come up with anything better you're stuck with it
|
|
- * again.
|
|
- *
|
|
- * What are closures?
|
|
- *
|
|
- * They embed a refcount. The basic idea is they count "things that are in
|
|
- * progress" - in flight bios, some other thread that's doing something else -
|
|
- * anything you might want to wait on.
|
|
- *
|
|
- * The refcount may be manipulated with closure_get() and closure_put().
|
|
- * closure_put() is where many of the interesting things happen, when it causes
|
|
- * the refcount to go to 0.
|
|
- *
|
|
- * Closures can be used to wait on things both synchronously and asynchronously,
|
|
- * and synchronous and asynchronous use can be mixed without restriction. To
|
|
- * wait synchronously, use closure_sync() - you will sleep until your closure's
|
|
- * refcount hits 1.
|
|
- *
|
|
- * To wait asynchronously, use
|
|
- * continue_at(cl, next_function, workqueue);
|
|
- *
|
|
- * passing it, as you might expect, the function to run when nothing is pending
|
|
- * and the workqueue to run that function out of.
|
|
- *
|
|
- * continue_at() also, critically, requires a 'return' immediately following the
|
|
- * location where this macro is referenced, to return to the calling function.
|
|
- * There's good reason for this.
|
|
- *
|
|
- * To use safely closures asynchronously, they must always have a refcount while
|
|
- * they are running owned by the thread that is running them. Otherwise, suppose
|
|
- * you submit some bios and wish to have a function run when they all complete:
|
|
- *
|
|
- * foo_endio(struct bio *bio)
|
|
- * {
|
|
- * closure_put(cl);
|
|
- * }
|
|
- *
|
|
- * closure_init(cl);
|
|
- *
|
|
- * do_stuff();
|
|
- * closure_get(cl);
|
|
- * bio1->bi_endio = foo_endio;
|
|
- * bio_submit(bio1);
|
|
- *
|
|
- * do_more_stuff();
|
|
- * closure_get(cl);
|
|
- * bio2->bi_endio = foo_endio;
|
|
- * bio_submit(bio2);
|
|
- *
|
|
- * continue_at(cl, complete_some_read, system_wq);
|
|
- *
|
|
- * If closure's refcount started at 0, complete_some_read() could run before the
|
|
- * second bio was submitted - which is almost always not what you want! More
|
|
- * importantly, it wouldn't be possible to say whether the original thread or
|
|
- * complete_some_read()'s thread owned the closure - and whatever state it was
|
|
- * associated with!
|
|
- *
|
|
- * So, closure_init() initializes a closure's refcount to 1 - and when a
|
|
- * closure_fn is run, the refcount will be reset to 1 first.
|
|
- *
|
|
- * Then, the rule is - if you got the refcount with closure_get(), release it
|
|
- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
|
|
- * on a closure because you called closure_init() or you were run out of a
|
|
- * closure - _always_ use continue_at(). Doing so consistently will help
|
|
- * eliminate an entire class of particularly pernicious races.
|
|
- *
|
|
- * Lastly, you might have a wait list dedicated to a specific event, and have no
|
|
- * need for specifying the condition - you just want to wait until someone runs
|
|
- * closure_wake_up() on the appropriate wait list. In that case, just use
|
|
- * closure_wait(). It will return either true or false, depending on whether the
|
|
- * closure was already on a wait list or not - a closure can only be on one wait
|
|
- * list at a time.
|
|
- *
|
|
- * Parents:
|
|
- *
|
|
- * closure_init() takes two arguments - it takes the closure to initialize, and
|
|
- * a (possibly null) parent.
|
|
- *
|
|
- * If parent is non null, the new closure will have a refcount for its lifetime;
|
|
- * a closure is considered to be "finished" when its refcount hits 0 and the
|
|
- * function to run is null. Hence
|
|
- *
|
|
- * continue_at(cl, NULL, NULL);
|
|
- *
|
|
- * returns up the (spaghetti) stack of closures, precisely like normal return
|
|
- * returns up the C stack. continue_at() with non null fn is better thought of
|
|
- * as doing a tail call.
|
|
- *
|
|
- * All this implies that a closure should typically be embedded in a particular
|
|
- * struct (which its refcount will normally control the lifetime of), and that
|
|
- * struct can very much be thought of as a stack frame.
|
|
- */
|
|
-
|
|
-struct closure;
|
|
-struct closure_syncer;
|
|
-typedef void (closure_fn) (struct closure *);
|
|
-extern struct dentry *bcache_debug;
|
|
-
|
|
-struct closure_waitlist {
|
|
- struct llist_head list;
|
|
-};
|
|
-
|
|
-enum closure_state {
|
|
- /*
|
|
- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
|
|
- * the thread that owns the closure, and cleared by the thread that's
|
|
- * waking up the closure.
|
|
- *
|
|
- * The rest are for debugging and don't affect behaviour:
|
|
- *
|
|
- * CLOSURE_RUNNING: Set when a closure is running (i.e. by
|
|
- * closure_init() and when closure_put() runs then next function), and
|
|
- * must be cleared before remaining hits 0. Primarily to help guard
|
|
- * against incorrect usage and accidentally transferring references.
|
|
- * continue_at() and closure_return() clear it for you, if you're doing
|
|
- * something unusual you can use closure_set_dead() which also helps
|
|
- * annotate where references are being transferred.
|
|
- */
|
|
-
|
|
- CLOSURE_BITS_START = (1U << 26),
|
|
- CLOSURE_DESTRUCTOR = (1U << 26),
|
|
- CLOSURE_WAITING = (1U << 28),
|
|
- CLOSURE_RUNNING = (1U << 30),
|
|
-};
|
|
-
|
|
-#define CLOSURE_GUARD_MASK \
|
|
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
|
|
-
|
|
-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
|
|
-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
|
|
-
|
|
-struct closure {
|
|
- union {
|
|
- struct {
|
|
- struct workqueue_struct *wq;
|
|
- struct closure_syncer *s;
|
|
- struct llist_node list;
|
|
- closure_fn *fn;
|
|
- };
|
|
- struct work_struct work;
|
|
- };
|
|
-
|
|
- struct closure *parent;
|
|
-
|
|
- atomic_t remaining;
|
|
-
|
|
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
|
-#define CLOSURE_MAGIC_DEAD 0xc054dead
|
|
-#define CLOSURE_MAGIC_ALIVE 0xc054a11e
|
|
-
|
|
- unsigned int magic;
|
|
- struct list_head all;
|
|
- unsigned long ip;
|
|
- unsigned long waiting_on;
|
|
-#endif
|
|
-};
|
|
-
|
|
-void closure_sub(struct closure *cl, int v);
|
|
-void closure_put(struct closure *cl);
|
|
-void __closure_wake_up(struct closure_waitlist *list);
|
|
-bool closure_wait(struct closure_waitlist *list, struct closure *cl);
|
|
-void __closure_sync(struct closure *cl);
|
|
-
|
|
-/**
|
|
- * closure_sync - sleep until a closure a closure has nothing left to wait on
|
|
- *
|
|
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
|
|
- * the last refcount.
|
|
- */
|
|
-static inline void closure_sync(struct closure *cl)
|
|
-{
|
|
- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
|
|
- __closure_sync(cl);
|
|
-}
|
|
-
|
|
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
|
-
|
|
-void closure_debug_init(void);
|
|
-void closure_debug_create(struct closure *cl);
|
|
-void closure_debug_destroy(struct closure *cl);
|
|
-
|
|
-#else
|
|
-
|
|
-static inline void closure_debug_init(void) {}
|
|
-static inline void closure_debug_create(struct closure *cl) {}
|
|
-static inline void closure_debug_destroy(struct closure *cl) {}
|
|
-
|
|
-#endif
|
|
-
|
|
-static inline void closure_set_ip(struct closure *cl)
|
|
-{
|
|
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
|
- cl->ip = _THIS_IP_;
|
|
-#endif
|
|
-}
|
|
-
|
|
-static inline void closure_set_ret_ip(struct closure *cl)
|
|
-{
|
|
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
|
- cl->ip = _RET_IP_;
|
|
-#endif
|
|
-}
|
|
-
|
|
-static inline void closure_set_waiting(struct closure *cl, unsigned long f)
|
|
-{
|
|
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
|
- cl->waiting_on = f;
|
|
-#endif
|
|
-}
|
|
-
|
|
-static inline void closure_set_stopped(struct closure *cl)
|
|
-{
|
|
- atomic_sub(CLOSURE_RUNNING, &cl->remaining);
|
|
-}
|
|
-
|
|
-static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
|
|
- struct workqueue_struct *wq)
|
|
-{
|
|
- closure_set_ip(cl);
|
|
- cl->fn = fn;
|
|
- cl->wq = wq;
|
|
- /* between atomic_dec() in closure_put() */
|
|
- smp_mb__before_atomic();
|
|
-}
|
|
-
|
|
-static inline void closure_queue(struct closure *cl)
|
|
-{
|
|
- struct workqueue_struct *wq = cl->wq;
|
|
- /**
|
|
- * Changes made to closure, work_struct, or a couple of other structs
|
|
- * may cause work.func not pointing to the right location.
|
|
- */
|
|
- BUILD_BUG_ON(offsetof(struct closure, fn)
|
|
- != offsetof(struct work_struct, func));
|
|
- if (wq) {
|
|
- INIT_WORK(&cl->work, cl->work.func);
|
|
- BUG_ON(!queue_work(wq, &cl->work));
|
|
- } else
|
|
- cl->fn(cl);
|
|
-}
|
|
-
|
|
-/**
|
|
- * closure_get - increment a closure's refcount
|
|
- */
|
|
-static inline void closure_get(struct closure *cl)
|
|
-{
|
|
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
|
|
- BUG_ON((atomic_inc_return(&cl->remaining) &
|
|
- CLOSURE_REMAINING_MASK) <= 1);
|
|
-#else
|
|
- atomic_inc(&cl->remaining);
|
|
-#endif
|
|
-}
|
|
-
|
|
-/**
|
|
- * closure_init - Initialize a closure, setting the refcount to 1
|
|
- * @cl: closure to initialize
|
|
- * @parent: parent of the new closure. cl will take a refcount on it for its
|
|
- * lifetime; may be NULL.
|
|
- */
|
|
-static inline void closure_init(struct closure *cl, struct closure *parent)
|
|
-{
|
|
- memset(cl, 0, sizeof(struct closure));
|
|
- cl->parent = parent;
|
|
- if (parent)
|
|
- closure_get(parent);
|
|
-
|
|
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
|
|
-
|
|
- closure_debug_create(cl);
|
|
- closure_set_ip(cl);
|
|
-}
|
|
-
|
|
-static inline void closure_init_stack(struct closure *cl)
|
|
-{
|
|
- memset(cl, 0, sizeof(struct closure));
|
|
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
|
|
-}
|
|
-
|
|
-/**
|
|
- * closure_wake_up - wake up all closures on a wait list,
|
|
- * with memory barrier
|
|
- */
|
|
-static inline void closure_wake_up(struct closure_waitlist *list)
|
|
-{
|
|
- /* Memory barrier for the wait list */
|
|
- smp_mb();
|
|
- __closure_wake_up(list);
|
|
-}
|
|
-
|
|
-/**
|
|
- * continue_at - jump to another function with barrier
|
|
- *
|
|
- * After @cl is no longer waiting on anything (i.e. all outstanding refs have
|
|
- * been dropped with closure_put()), it will resume execution at @fn running out
|
|
- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
|
|
- *
|
|
- * This is because after calling continue_at() you no longer have a ref on @cl,
|
|
- * and whatever @cl owns may be freed out from under you - a running closure fn
|
|
- * has a ref on its own closure which continue_at() drops.
|
|
- *
|
|
- * Note you are expected to immediately return after using this macro.
|
|
- */
|
|
-#define continue_at(_cl, _fn, _wq) \
|
|
-do { \
|
|
- set_closure_fn(_cl, _fn, _wq); \
|
|
- closure_sub(_cl, CLOSURE_RUNNING + 1); \
|
|
-} while (0)
|
|
-
|
|
-/**
|
|
- * closure_return - finish execution of a closure
|
|
- *
|
|
- * This is used to indicate that @cl is finished: when all outstanding refs on
|
|
- * @cl have been dropped @cl's ref on its parent closure (as passed to
|
|
- * closure_init()) will be dropped, if one was specified - thus this can be
|
|
- * thought of as returning to the parent closure.
|
|
- */
|
|
-#define closure_return(_cl) continue_at((_cl), NULL, NULL)
|
|
-
|
|
-/**
|
|
- * continue_at_nobarrier - jump to another function without barrier
|
|
- *
|
|
- * Causes @fn to be executed out of @cl, in @wq context (or called directly if
|
|
- * @wq is NULL).
|
|
- *
|
|
- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
|
|
- * thus it's not safe to touch anything protected by @cl after a
|
|
- * continue_at_nobarrier().
|
|
- */
|
|
-#define continue_at_nobarrier(_cl, _fn, _wq) \
|
|
-do { \
|
|
- set_closure_fn(_cl, _fn, _wq); \
|
|
- closure_queue(_cl); \
|
|
-} while (0)
|
|
-
|
|
-/**
|
|
- * closure_return_with_destructor - finish execution of a closure,
|
|
- * with destructor
|
|
- *
|
|
- * Works like closure_return(), except @destructor will be called when all
|
|
- * outstanding refs on @cl have been dropped; @destructor may be used to safely
|
|
- * free the memory occupied by @cl, and it is called with the ref on the parent
|
|
- * closure still held - so @destructor could safely return an item to a
|
|
- * freelist protected by @cl's parent.
|
|
- */
|
|
-#define closure_return_with_destructor(_cl, _destructor) \
|
|
-do { \
|
|
- set_closure_fn(_cl, _destructor, NULL); \
|
|
- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
|
|
-} while (0)
|
|
-
|
|
-/**
|
|
- * closure_call - execute @fn out of a new, uninitialized closure
|
|
- *
|
|
- * Typically used when running out of one closure, and we want to run @fn
|
|
- * asynchronously out of a new closure - @parent will then wait for @cl to
|
|
- * finish.
|
|
- */
|
|
-static inline void closure_call(struct closure *cl, closure_fn fn,
|
|
- struct workqueue_struct *wq,
|
|
- struct closure *parent)
|
|
-{
|
|
- closure_init(cl, parent);
|
|
- continue_at_nobarrier(cl, fn, wq);
|
|
-}
|
|
-
|
|
-#endif /* _LINUX_CLOSURE_H */
|
|
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
|
|
index 2047a9cccdb5..b5b01f1093df 100644
|
|
--- a/drivers/md/bcache/super.c
|
|
+++ b/drivers/md/bcache/super.c
|
|
@@ -2893,7 +2893,6 @@ static int __init bcache_init(void)
|
|
goto err;
|
|
|
|
bch_debug_init();
|
|
- closure_debug_init();
|
|
|
|
bcache_is_reboot = false;
|
|
|
|
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
|
|
index c029f7443190..59093f9f1793 100644
|
|
--- a/drivers/md/bcache/util.h
|
|
+++ b/drivers/md/bcache/util.h
|
|
@@ -4,6 +4,7 @@
|
|
#define _BCACHE_UTIL_H
|
|
|
|
#include <linux/blkdev.h>
|
|
+#include <linux/closure.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/kernel.h>
|
|
#include <linux/sched/clock.h>
|
|
@@ -13,8 +14,6 @@
|
|
#include <linux/workqueue.h>
|
|
#include <linux/crc64.h>
|
|
|
|
-#include "closure.h"
|
|
-
|
|
#define PAGE_SECTORS (PAGE_SIZE / 512)
|
|
|
|
struct closure;
|
|
diff --git a/fs/Kconfig b/fs/Kconfig
|
|
index da524c4d7b7e..faeb7a1a442d 100644
|
|
--- a/fs/Kconfig
|
|
+++ b/fs/Kconfig
|
|
@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig"
|
|
source "fs/btrfs/Kconfig"
|
|
source "fs/nilfs2/Kconfig"
|
|
source "fs/f2fs/Kconfig"
|
|
+source "fs/bcachefs/Kconfig"
|
|
source "fs/zonefs/Kconfig"
|
|
|
|
config FS_DAX
|
|
diff --git a/fs/Makefile b/fs/Makefile
|
|
index 999d1a23f036..6aeadd8cf9c0 100644
|
|
--- a/fs/Makefile
|
|
+++ b/fs/Makefile
|
|
@@ -130,6 +130,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
|
|
obj-$(CONFIG_BTRFS_FS) += btrfs/
|
|
obj-$(CONFIG_GFS2_FS) += gfs2/
|
|
obj-$(CONFIG_F2FS_FS) += f2fs/
|
|
+obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
|
|
obj-$(CONFIG_CEPH_FS) += ceph/
|
|
obj-$(CONFIG_PSTORE) += pstore/
|
|
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
|
|
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
|
|
new file mode 100644
|
|
index 000000000000..57c5d58c2d87
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/Kconfig
|
|
@@ -0,0 +1,51 @@
|
|
+
|
|
+config BCACHEFS_FS
|
|
+ tristate "bcachefs filesystem support"
|
|
+ depends on BLOCK
|
|
+ select EXPORTFS
|
|
+ select CLOSURES
|
|
+ select LIBCRC32C
|
|
+ select CRC64
|
|
+ select FS_POSIX_ACL
|
|
+ select LZ4_COMPRESS
|
|
+ select LZ4_DECOMPRESS
|
|
+ select ZLIB_DEFLATE
|
|
+ select ZLIB_INFLATE
|
|
+ select ZSTD_COMPRESS
|
|
+ select ZSTD_DECOMPRESS
|
|
+ select CRYPTO_SHA256
|
|
+ select CRYPTO_CHACHA20
|
|
+ select CRYPTO_POLY1305
|
|
+ select KEYS
|
|
+ select SIXLOCKS
|
|
+ select RAID6_PQ
|
|
+ select XOR_BLOCKS
|
|
+ select SRCU
|
|
+ help
|
|
+ The bcachefs filesystem - a modern, copy on write filesystem, with
|
|
+ support for multiple devices, compression, checksumming, etc.
|
|
+
|
|
+config BCACHEFS_QUOTA
|
|
+ bool "bcachefs quota support"
|
|
+ depends on BCACHEFS_FS
|
|
+ select QUOTACTL
|
|
+
|
|
+config BCACHEFS_POSIX_ACL
|
|
+ bool "bcachefs POSIX ACL support"
|
|
+ depends on BCACHEFS_FS
|
|
+ select FS_POSIX_ACL
|
|
+
|
|
+config BCACHEFS_DEBUG
|
|
+ bool "bcachefs debugging"
|
|
+ depends on BCACHEFS_FS
|
|
+ help
|
|
+ Enables many extra debugging checks and assertions.
|
|
+
|
|
+ The resulting code will be significantly slower than normal; you
|
|
+ probably shouldn't select this option unless you're a developer.
|
|
+
|
|
+config BCACHEFS_TESTS
|
|
+ bool "bcachefs unit and performance tests"
|
|
+ depends on BCACHEFS_FS
|
|
+ help
|
|
+ Include some unit and performance tests for the core btree code.
|
|
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
|
|
new file mode 100644
|
|
index 000000000000..2fbf978424ed
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/Makefile
|
|
@@ -0,0 +1,60 @@
|
|
+
|
|
+obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
|
|
+
|
|
+bcachefs-y := \
|
|
+ acl.o \
|
|
+ alloc_background.o \
|
|
+ alloc_foreground.o \
|
|
+ bkey.o \
|
|
+ bkey_methods.o \
|
|
+ bkey_sort.o \
|
|
+ bset.o \
|
|
+ btree_cache.o \
|
|
+ btree_gc.o \
|
|
+ btree_io.o \
|
|
+ btree_iter.o \
|
|
+ btree_key_cache.o \
|
|
+ btree_update_interior.o \
|
|
+ btree_update_leaf.o \
|
|
+ buckets.o \
|
|
+ chardev.o \
|
|
+ checksum.o \
|
|
+ clock.o \
|
|
+ compress.o \
|
|
+ debug.o \
|
|
+ dirent.o \
|
|
+ disk_groups.o \
|
|
+ ec.o \
|
|
+ error.o \
|
|
+ extents.o \
|
|
+ extent_update.o \
|
|
+ fs.o \
|
|
+ fs-common.o \
|
|
+ fs-ioctl.o \
|
|
+ fs-io.o \
|
|
+ fsck.o \
|
|
+ inode.o \
|
|
+ io.o \
|
|
+ journal.o \
|
|
+ journal_io.o \
|
|
+ journal_reclaim.o \
|
|
+ journal_seq_blacklist.o \
|
|
+ keylist.o \
|
|
+ migrate.o \
|
|
+ move.o \
|
|
+ movinggc.o \
|
|
+ opts.o \
|
|
+ quota.o \
|
|
+ rebalance.o \
|
|
+ recovery.o \
|
|
+ reflink.o \
|
|
+ replicas.o \
|
|
+ siphash.o \
|
|
+ super.o \
|
|
+ super-io.o \
|
|
+ sysfs.o \
|
|
+ tests.o \
|
|
+ trace.o \
|
|
+ util.o \
|
|
+ varint.o \
|
|
+ xattr.o
|
|
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
|
|
new file mode 100644
|
|
index 000000000000..0f2d7437c740
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/acl.c
|
|
@@ -0,0 +1,395 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+
|
|
+#include "bcachefs.h"
|
|
+
|
|
+#include <linux/fs.h>
|
|
+#include <linux/posix_acl.h>
|
|
+#include <linux/posix_acl_xattr.h>
|
|
+#include <linux/sched.h>
|
|
+#include <linux/slab.h>
|
|
+
|
|
+#include "acl.h"
|
|
+#include "fs.h"
|
|
+#include "xattr.h"
|
|
+
|
|
+/*
+ * Size in bytes of an on-disk ACL holding @nr_short short entries and
+ * @nr_long full entries, including the leading bch_acl_header.
+ */
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{
+	size_t bytes = sizeof(bch_acl_header);
+
+	bytes += sizeof(bch_acl_entry_short) * nr_short;
+	bytes += sizeof(bch_acl_entry) * nr_long;
+
+	return bytes;
+}
|
|
+
|
|
+/*
+ * Map a VFS ACL type (ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT) to the xattr index
+ * the ACL is stored under.  Any other type is treated as a caller bug.
+ */
+static inline int acl_to_xattr_type(int type)
+{
+	if (type == ACL_TYPE_ACCESS)
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS;
+	if (type == ACL_TYPE_DEFAULT)
+		return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT;
+
+	BUG();
+}
|
|
+
|
|
+/*
+ * On-disk size of an ACL entry with the given tag: ACL_USER and ACL_GROUP
+ * entries carry an id and use the full bch_acl_entry, the other known tags use
+ * bch_acl_entry_short.  Returns 0 for an unrecognised tag.
+ */
+static size_t bch2_acl_entry_bytes(unsigned tag)
+{
+	switch (tag) {
+	case ACL_USER_OBJ:
+	case ACL_GROUP_OBJ:
+	case ACL_MASK:
+	case ACL_OTHER:
+		return sizeof(bch_acl_entry_short);
+	case ACL_USER:
+	case ACL_GROUP:
+		return sizeof(bch_acl_entry);
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Convert from filesystem to in-memory representation.
+ *
+ * @value/@size describe the raw xattr value.  Returns NULL when @value is NULL
+ * or the ACL holds no entries, a newly allocated posix_acl on success,
+ * ERR_PTR(-ENOMEM) if the allocation fails, and ERR_PTR(-EINVAL) when the
+ * header version, an entry tag or the overall length is not valid.
+ */
+static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
+{
+	const void *pos, *end = value + size;
+	struct posix_acl *acl;
+	struct posix_acl_entry *dst;
+	unsigned nr = 0;
+	size_t bytes;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(bch_acl_header) ||
+	    ((bch_acl_header *)value)->a_version !=
+	    cpu_to_le32(BCH_ACL_VERSION))
+		goto invalid;
+
+	/* First pass: validate each entry's tag and count the entries. */
+	for (pos = value + sizeof(bch_acl_header); pos < end; pos += bytes) {
+		const bch_acl_entry *entry = pos;
+
+		/* The tag must be readable before we look at it. */
+		if (pos + sizeof(bch_acl_entry_short) > end)
+			goto invalid;
+
+		bytes = bch2_acl_entry_bytes(le16_to_cpu(entry->e_tag));
+		if (!bytes)
+			goto invalid;
+
+		nr++;
+	}
+
+	/* The last entry must not run past the end of the value. */
+	if (pos > end)
+		goto invalid;
+
+	if (!nr)
+		return NULL;
+
+	acl = posix_acl_alloc(nr, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	/* Second pass: decode the (already validated) entries. */
+	dst = acl->a_entries;
+	for (pos = value + sizeof(bch_acl_header); pos < end;
+	     pos += bytes, dst++) {
+		const bch_acl_entry *src = pos;
+
+		dst->e_tag	= le16_to_cpu(src->e_tag);
+		dst->e_perm	= le16_to_cpu(src->e_perm);
+
+		if (dst->e_tag == ACL_USER)
+			dst->e_uid = make_kuid(&init_user_ns,
+					       le32_to_cpu(src->e_id));
+		else if (dst->e_tag == ACL_GROUP)
+			dst->e_gid = make_kgid(&init_user_ns,
+					       le32_to_cpu(src->e_id));
+
+		bytes = bch2_acl_entry_bytes(dst->e_tag);
+	}
+
+	BUG_ON(dst != acl->a_entries + acl->a_count);
+
+	return acl;
+invalid:
+	pr_err("invalid acl entry");
+	return ERR_PTR(-EINVAL);
+}
|
|
+
|
|
+/*
+ * Iterate @acl_e over every entry of the in-memory posix ACL @acl, in array
+ * order.  Arguments are parenthesized so that arbitrary expressions may be
+ * passed safely.
+ */
+#define acl_for_each_entry(acl, acl_e)				\
+	for ((acl_e) = (acl)->a_entries;			\
+	     (acl_e) < (acl)->a_entries + (acl)->a_count;	\
+	     (acl_e)++)
|
|
+
|
|
+/*
+ * Convert from in-memory to filesystem representation.
+ *
+ * Encodes @acl as the value of an xattr key with an empty name and the xattr
+ * index selected by @type.  The key is allocated with bch2_trans_kmalloc().
+ *
+ * Returns the new key, or an ERR_PTR(): -EINVAL if @acl contains an entry with
+ * an unknown tag, -E2BIG if the key would need more than U8_MAX u64s, or the
+ * error returned by bch2_trans_kmalloc().
+ */
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+		  const struct posix_acl *acl,
+		  int type)
+{
+	struct bkey_i_xattr *xattr;
+	bch_acl_header *acl_header;
+	const struct posix_acl_entry *acl_e;
+	void *outptr;
+	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+
+	/* Count entries by encoding so the value can be sized up front. */
+	acl_for_each_entry(acl, acl_e) {
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			nr_long++;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			nr_short++;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+		}
+	}
+
+	acl_len = bch2_acl_size(nr_short, nr_long);
+	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+
+	/* k.u64s must fit the limit checked here. */
+	if (u64s > U8_MAX)
+		return ERR_PTR(-E2BIG);
+
+	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+	if (IS_ERR(xattr))
+		return xattr;
+
+	bkey_xattr_init(&xattr->k_i);
+	xattr->k.u64s		= u64s;
+	xattr->v.x_type		= acl_to_xattr_type(type);
+	xattr->v.x_name_len	= 0;
+	xattr->v.x_val_len	= cpu_to_le16(acl_len);
+
+	acl_header = xattr_val(&xattr->v);
+	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+
+	outptr = (void *) acl_header + sizeof(*acl_header);
+
+	acl_for_each_entry(acl, acl_e) {
+		bch_acl_entry *entry = outptr;
+
+		entry->e_tag = cpu_to_le16(acl_e->e_tag);
+		entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns, acl_e->e_uid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns, acl_e->e_gid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			/* No id is stored for these tags. */
+			outptr += sizeof(bch_acl_entry_short);
+			break;
+		}
+	}
+
+	/* The encoder must have produced exactly the size computed above. */
+	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+
+	return xattr;
+}
|
|
+
|
|
+/*
+ * Look up the ACL of the given @type for @vinode.
+ *
+ * Returns NULL when no ACL xattr exists (lookup fails with -ENOENT) or the
+ * stored ACL has no entries, the decoded ACL on success, or an ERR_PTR() on
+ * lookup/decode failure.  A successfully decoded result (including NULL from
+ * the decoder) is stored with set_cached_acl().  -EINTR from the btree code
+ * restarts the transaction.
+ */
+struct posix_acl *bch2_get_acl(struct inode *vinode, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_xattr xattr;
+	struct posix_acl *acl = NULL;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+			&hash, inode->v.i_ino,
+			&X_SEARCH(acl_to_xattr_type(type), "", 0),
+			0);
+	ret = PTR_ERR_OR_ZERO(iter);
+	if (ret) {
+		if (ret == -EINTR)
+			goto retry;
+
+		/* A missing xattr simply means "no ACL". */
+		if (ret != -ENOENT)
+			acl = ERR_PTR(ret);
+		goto out;
+	}
+
+	/*
+	 * peek_slot() can itself fail; don't dereference an error key in
+	 * bkey_s_c_to_xattr().
+	 */
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret) {
+		bch2_trans_iter_put(&trans, iter);
+		if (ret == -EINTR)
+			goto retry;
+
+		acl = ERR_PTR(ret);
+		goto out;
+	}
+
+	xattr = bkey_s_c_to_xattr(k);
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+
+	if (!IS_ERR(acl))
+		set_cached_acl(&inode->v, type, acl);
+	bch2_trans_iter_put(&trans, iter);
+out:
+	bch2_trans_exit(&trans);
+	return acl;
+}
|
|
+
|
|
+/*
+ * Within @trans, store @acl as the ACL of the given @type for the inode
+ * described by @inode_u, or delete the stored ACL when @acl is NULL.
+ *
+ * A default ACL on a non-directory is refused with -EACCES (removing one is a
+ * no-op).  -ENOENT from the hash code is reported as success.
+ */
+int bch2_set_acl_trans(struct btree_trans *trans,
+		       struct bch_inode_unpacked *inode_u,
+		       const struct bch_hash_info *hash_info,
+		       struct posix_acl *acl, int type)
+{
+	int ret;
+
+	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode_u->bi_mode))
+		return acl ? -EACCES : 0;
+
+	if (!acl) {
+		struct xattr_search_key search =
+			X_SEARCH(acl_to_xattr_type(type), "", 0);
+
+		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
+				       inode_u->bi_inum, &search);
+	} else {
+		struct bkey_i_xattr *xattr =
+			bch2_acl_to_xattr(trans, acl, type);
+
+		if (IS_ERR(xattr))
+			return PTR_ERR(xattr);
+
+		ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+				    inode_u->bi_inum, &xattr->k_i, 0);
+	}
+
+	if (ret == -ENOENT)
+		ret = 0;
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Set the ACL of the given @type on @vinode, updating the inode's mode (for
+ * access ACLs, via posix_acl_update_mode()) and ctime in the same
+ * transaction.  Runs under ei_update_lock and restarts the transaction
+ * whenever the btree code returns -EINTR.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(vinode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *inode_iter;
+	struct bch_inode_unpacked inode_u;
+	struct bch_hash_info hash_info;
+	struct posix_acl *acl;
+	umode_t mode;
+	int ret;
+
+	mutex_lock(&inode->ei_update_lock);
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for (;;) {
+		bch2_trans_begin(&trans);
+
+		/*
+		 * posix_acl_update_mode() may replace acl, so start every
+		 * attempt from the caller's pointer.
+		 */
+		acl = _acl;
+
+		inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
+					     BTREE_ITER_INTENT);
+		ret = PTR_ERR_OR_ZERO(inode_iter);
+		if (ret)
+			goto put_iter;
+
+		mode = inode_u.bi_mode;
+
+		if (type == ACL_TYPE_ACCESS) {
+			ret = posix_acl_update_mode(&inode->v, &mode, &acl);
+			if (ret)
+				goto put_iter;
+		}
+
+		hash_info = bch2_hash_info_init(c, &inode_u);
+
+		ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info,
+					 acl, type);
+		if (ret)
+			goto put_iter;
+
+		inode_u.bi_ctime	= bch2_current_time(c);
+		inode_u.bi_mode		= mode;
+
+		ret = bch2_inode_write(&trans, inode_iter, &inode_u);
+		if (!ret)
+			ret = bch2_trans_commit(&trans, NULL,
+						&inode->ei_journal_seq,
+						BTREE_INSERT_NOUNLOCK);
+put_iter:
+		bch2_trans_iter_put(&trans, inode_iter);
+
+		if (ret != -EINTR)
+			break;
+	}
+
+	if (likely(!ret)) {
+		bch2_inode_update_after_write(c, inode, &inode_u,
+					      ATTR_CTIME|ATTR_MODE);
+
+		set_cached_acl(&inode->v, type, acl);
+	}
+
+	bch2_trans_exit(&trans);
+	mutex_unlock(&inode->ei_update_lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Within @trans, rewrite @inode's access ACL (if it has one) to reflect the
+ * new @mode, using __posix_acl_chmod().
+ *
+ * On success with an ACL present, the updated in-memory ACL is handed to the
+ * caller through @new_acl.  If the inode has no access ACL (lookup returns
+ * -ENOENT, or the stored ACL is empty) this returns 0 and leaves @new_acl
+ * untouched.  Otherwise returns a negative error code.
+ */
+int bch2_acl_chmod(struct btree_trans *trans,
+		   struct bch_inode_unpacked *inode,
+		   umode_t mode,
+		   struct posix_acl **new_acl)
+{
+	struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode);
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_xattr xattr;
+	struct bkey_i_xattr *new;
+	struct posix_acl *acl = NULL;
+	int ret;
+
+	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+			&hash_info, inode->bi_inum,
+			&X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+			BTREE_ITER_INTENT);
+	ret = PTR_ERR_OR_ZERO(iter);
+	if (ret)
+		return ret == -ENOENT ? 0 : ret;
+
+	/* Don't hand an error key to bkey_s_c_to_xattr(). */
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	xattr = bkey_s_c_to_xattr(k);
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+	ret = PTR_ERR_OR_ZERO(acl);
+	if (ret || !acl)
+		goto err;
+
+	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+	if (ret)
+		goto err;
+
+	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto err;
+	}
+
+	new->k.p = iter->pos;
+	ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+	if (ret)
+		goto err;
+
+	/* Ownership of acl passes to the caller. */
+	*new_acl = acl;
+	acl = NULL;
+err:
+	bch2_trans_iter_put(trans, iter);
+	/* acl may hold an ERR_PTR() from bch2_acl_from_disk(); never free that. */
+	if (!IS_ERR_OR_NULL(acl))
+		kfree(acl);
+	return ret;
+}
|
|
+
|
|
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
|
|
diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h
|
|
new file mode 100644
|
|
index 000000000000..ba210c26d5c1
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/acl.h
|
|
@@ -0,0 +1,59 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ACL_H
|
|
+#define _BCACHEFS_ACL_H
|
|
+
|
|
+struct bch_inode_unpacked;
|
|
+struct bch_hash_info;
|
|
+struct bch_inode_info;
|
|
+struct posix_acl;
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+
|
|
+#define BCH_ACL_VERSION 0x0001
|
|
+
|
|
+/* On-disk ACL entry that carries an id (little endian fields). */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} bch_acl_entry;
+
+/* On-disk ACL entry without an id; same leading layout as bch_acl_entry. */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} bch_acl_entry_short;
+
+/* Header preceding the entries of an on-disk ACL. */
+typedef struct {
+	__le32		a_version;
+} bch_acl_header;
|
|
+
|
|
+struct posix_acl *bch2_get_acl(struct inode *, int);
|
|
+
|
|
+int bch2_set_acl_trans(struct btree_trans *,
|
|
+ struct bch_inode_unpacked *,
|
|
+ const struct bch_hash_info *,
|
|
+ struct posix_acl *, int);
|
|
+int bch2_set_acl(struct inode *, struct posix_acl *, int);
|
|
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *,
|
|
+ umode_t, struct posix_acl **);
|
|
+
|
|
+#else
|
|
+
|
|
+/* Stub used when CONFIG_BCACHEFS_POSIX_ACL is not set: does nothing. */
+static inline int bch2_set_acl_trans(struct btree_trans *trans,
+				     struct bch_inode_unpacked *inode_u,
+				     const struct bch_hash_info *hash_info,
+				     struct posix_acl *acl, int type)
+{
+	return 0;
+}
|
|
+
|
|
+/*
+ * Stub used when CONFIG_BCACHEFS_POSIX_ACL is not set: does nothing and
+ * leaves *new_acl untouched.
+ */
+static inline int bch2_acl_chmod(struct btree_trans *trans,
+				 struct bch_inode_unpacked *inode,
+				 umode_t mode,
+				 struct posix_acl **new_acl)
+{
+	return 0;
+}
|
|
+
|
|
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
|
|
+
|
|
+#endif /* _BCACHEFS_ACL_H */
|
|
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
|
|
new file mode 100644
|
|
index 000000000000..3ac8b03029f8
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/alloc_background.c
|
|
@@ -0,0 +1,1228 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_background.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "btree_gc.h"
|
|
+#include "buckets.h"
|
|
+#include "clock.h"
|
|
+#include "debug.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "recovery.h"
|
|
+#include "varint.h"
|
|
+
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/math64.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/rculist.h>
|
|
+#include <linux/rcupdate.h>
|
|
+#include <linux/sched/task.h>
|
|
+#include <linux/sort.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/*
+ * Name of each entry of ALLOC_THREAD_STATES(), in declaration order,
+ * terminated by NULL.
+ */
+const char * const bch2_allocator_states[] = {
+#define x(n)	#n,
+	ALLOC_THREAD_STATES()
+#undef x
+	NULL
+};
|
|
+
|
|
+/* Width in bytes of each v1 alloc field, indexed by BCH_ALLOC_FIELD_V1_*. */
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits)	[BCH_ALLOC_FIELD_V1_##name] = (bits) / 8,
+	BCH_ALLOC_FIELDS_V1()
+#undef x
+};
|
|
+
|
|
+/* Persistent alloc info: */
|
|
+
|
|
+/*
+ * Read v1 alloc field number @field from the packed data at *@p.
+ *
+ * A field whose bit is clear in a->fields occupies no space: 0 is returned and
+ * *@p is left alone.  Otherwise the little endian value is returned and *@p is
+ * advanced past it.
+ */
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+				      const void **p, unsigned field)
+{
+	unsigned nbytes = BCH_ALLOC_V1_FIELD_BYTES[field];
+	const void *src = *p;
+	u64 val;
+
+	if (!(a->fields & (1 << field)))
+		return 0;
+
+	if (nbytes == 1)
+		val = *((const u8 *) src);
+	else if (nbytes == 2)
+		val = le16_to_cpup(src);
+	else if (nbytes == 4)
+		val = le32_to_cpup(src);
+	else if (nbytes == 8)
+		val = le64_to_cpup(src);
+	else
+		BUG();
+
+	*p = src + nbytes;
+	return val;
+}
|
|
+
|
|
+/*
+ * Append v1 alloc field number @field with value @v to the packed data at
+ * *@p.
+ *
+ * Zero values are not stored at all.  Otherwise the field's bit is set in
+ * a->v.fields, the value is written little endian and *@p is advanced past it.
+ */
+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
+				      unsigned field, u64 v)
+{
+	unsigned nbytes = BCH_ALLOC_V1_FIELD_BYTES[field];
+	void *dst = *p;
+
+	if (!v)
+		return;
+
+	a->v.fields |= 1 << field;
+
+	if (nbytes == 1)
+		*((u8 *) dst) = v;
+	else if (nbytes == 2)
+		*((__le16 *) dst) = cpu_to_le16(v);
+	else if (nbytes == 4)
+		*((__le32 *) dst) = cpu_to_le32(v);
+	else if (nbytes == 8)
+		*((__le64 *) dst) = cpu_to_le64(v);
+	else
+		BUG();
+
+	*p = dst + nbytes;
+}
|
|
+
|
|
+/*
+ * Unpack a v1 alloc key: copy the generation, then read every
+ * BCH_ALLOC_FIELDS_V1() field in declaration order (absent fields read as 0).
+ */
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+				 struct bkey_s_c k)
+{
+	const struct bch_alloc *v1 = bkey_s_c_to_alloc(k).v;
+	const void *cursor = v1->data;
+	unsigned field = 0;
+
+	out->gen = v1->gen;
+
+#define x(_name, _bits)	out->_name = alloc_field_v1_get(v1, &cursor, field++);
+	BCH_ALLOC_FIELDS_V1()
+#undef x
+}
|
|
+
|
|
+/*
+ * Unpack a v2 alloc key.
+ *
+ * gen, oldest_gen and data_type are copied directly; each
+ * BCH_ALLOC_FIELDS_V2() field is then varint-decoded in declaration order.
+ * Fields beyond the key's nr_fields read as 0.
+ *
+ * Returns 0 on success, the (negative) error from bch2_varint_decode(), or -1
+ * if a decoded value does not fit the destination member.
+ */
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+				struct bkey_s_c k)
+{
+	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+	const u8 *pos = a.v->data;
+	const u8 *val_end = bkey_val_end(a);
+	unsigned nr = 0;
+	u64 val;
+	int len;
+
+	out->gen	= a.v->gen;
+	out->oldest_gen	= a.v->oldest_gen;
+	out->data_type	= a.v->data_type;
+
+#define x(_name, _bits)						\
+	val = 0;						\
+	if (nr++ < a.v->nr_fields) {				\
+		len = bch2_varint_decode(pos, val_end, &val);	\
+		if (len < 0)					\
+			return len;				\
+		pos += len;					\
+	}							\
+	out->_name = val;					\
+	/* reject values truncated by the store above */	\
+	if (out->_name != val)					\
+		return -1;
+
+	BCH_ALLOC_FIELDS_V2()
+#undef x
+	return 0;
+}
|
|
+
|
|
+/*
+ * Pack @src into @dst as a v2 alloc key.
+ *
+ * Every BCH_ALLOC_FIELDS_V2() field is varint-encoded in declaration order
+ * (a zero field as a single 0 byte); trailing zero fields are then dropped by
+ * cutting the value back to the end of the last nonzero field and recording
+ * that field's number in nr_fields.
+ */
+static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+			       const struct bkey_alloc_unpacked src)
+{
+	struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+	u8 *pos = a->v.data;
+	u8 *buf_end = (void *) &dst[1];
+	u8 *last_nonzero_end = pos;
+	unsigned nr = 0, last_nonzero_nr = 0;
+	unsigned val_bytes;
+
+	a->k.p		= POS(src.dev, src.bucket);
+	a->v.gen	= src.gen;
+	a->v.oldest_gen	= src.oldest_gen;
+	a->v.data_type	= src.data_type;
+
+#define x(_name, _bits)						\
+	nr++;							\
+								\
+	if (!src._name) {					\
+		*pos++ = 0;					\
+	} else {						\
+		pos += bch2_varint_encode(pos, src._name);	\
+								\
+		last_nonzero_end = pos;				\
+		last_nonzero_nr = nr;				\
+	}
+
+	BCH_ALLOC_FIELDS_V2()
+#undef x
+	BUG_ON(pos > buf_end);
+
+	/* Trim trailing zero fields. */
+	pos = last_nonzero_end;
+	a->v.nr_fields = last_nonzero_nr;
+
+	val_bytes = (u8 *) pos - (u8 *) &a->v;
+	set_bkey_val_bytes(&a->k, val_bytes);
+	memset_u64s_tail(&a->v, 0, val_bytes);
+}
|
|
+
|
|
+/*
+ * Unpack an alloc key of either version.  dev and bucket come from the key's
+ * position; keys of any other type yield an otherwise zeroed result.
+ */
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+	struct bkey_alloc_unpacked ret = {
+		.dev	= k.k->p.inode,
+		.bucket	= k.k->p.offset,
+		.gen	= 0,
+	};
+
+	switch (k.k->type) {
+	case KEY_TYPE_alloc_v2:
+		bch2_alloc_unpack_v2(&ret, k);
+		break;
+	case KEY_TYPE_alloc:
+		bch2_alloc_unpack_v1(&ret, k);
+		break;
+	default:
+		break;
+	}
+
+	return ret;
+}
|
|
+
|
|
+/* Pack @src into @dst; keys are always written in the v2 format. */
+void bch2_alloc_pack(struct bch_fs *c,
+		     struct bkey_alloc_buf *dst,
+		     const struct bkey_alloc_unpacked src)
+{
+	bch2_alloc_pack_v2(dst, src);
+}
|
|
+
|
|
+/*
+ * Number of u64s a v1 alloc value needs for the fields flagged in a->fields,
+ * rounded up.
+ */
+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
+{
+	unsigned nbytes = offsetof(struct bch_alloc, data);
+	unsigned field;
+
+	for (field = 0; field < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); field++) {
+		if (!(a->fields & (1 << field)))
+			continue;
+
+		nbytes += BCH_ALLOC_V1_FIELD_BYTES[field];
+	}
+
+	return DIV_ROUND_UP(nbytes, sizeof(u64));
+}
|
|
+
|
|
+/*
+ * Validate a v1 alloc key.  Returns NULL if it is acceptable, otherwise a
+ * string describing the problem.
+ */
+const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
+	u64 dev = k.k->p.inode;
+
+	if (dev >= c->sb.nr_devices)
+		return "invalid device";
+	if (!c->devs[dev])
+		return "invalid device";
+
+	/* allow for unknown fields */
+	if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v))
+		return "incorrect value size";
+
+	return NULL;
+}
|
|
+
|
|
+/*
+ * Validate a v2 alloc key.  Returns NULL if it is acceptable, otherwise a
+ * string describing the problem.
+ */
+const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_alloc_unpacked unpacked;
+	u64 dev = k.k->p.inode;
+
+	if (dev >= c->sb.nr_devices)
+		return "invalid device";
+	if (!c->devs[dev])
+		return "invalid device";
+
+	/* The key is valid only if it can actually be decoded. */
+	if (bch2_alloc_unpack_v2(&unpacked, k) != 0)
+		return "unpack error";
+
+	return NULL;
+}
|
|
+
|
|
+/*
+ * Print an alloc key to @out: the generations and data type first, then every
+ * BCH_ALLOC_FIELDS_V2() field as "name value".
+ */
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+			struct bkey_s_c k)
+{
+	struct bkey_alloc_unpacked unpacked = bch2_alloc_unpack(k);
+
+	pr_buf(out, "gen %u oldest_gen %u data_type %s",
+	       unpacked.gen,
+	       unpacked.oldest_gen,
+	       bch2_data_types[unpacked.data_type]);
+
+#define x(_name, ...)	pr_buf(out, " " #_name " %llu", (u64) unpacked._name);
+	BCH_ALLOC_FIELDS_V2()
+#undef x
+}
|
|
+
|
|
+/*
+ * Per-key callback for bch2_alloc_read(): load one alloc key into the
+ * in-memory bucket it describes.  Keys of other types are skipped.  Always
+ * returns 0.
+ */
+static int bch2_alloc_read_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_alloc_unpacked unpacked;
+	struct bch_dev *ca;
+	struct bucket *g;
+
+	if (!(k.k->type == KEY_TYPE_alloc ||
+	      k.k->type == KEY_TYPE_alloc_v2))
+		return 0;
+
+	ca		= bch_dev_bkey_exists(c, k.k->p.inode);
+	g		= bucket(ca, k.k->p.offset);
+	unpacked	= bch2_alloc_unpack(k);
+
+	g->_mark.gen		= unpacked.gen;
+	g->_mark.data_type	= unpacked.data_type;
+	g->_mark.dirty_sectors	= unpacked.dirty_sectors;
+	g->_mark.cached_sectors	= unpacked.cached_sectors;
+	g->io_time[READ]	= unpacked.read_time;
+	g->io_time[WRITE]	= unpacked.write_time;
+	g->oldest_gen		= unpacked.oldest_gen;
+	g->gen_valid		= 1;
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Populate the in-memory buckets by walking the alloc btree together with the
+ * journal, holding gc_lock for read.  Logs and returns the error on failure,
+ * 0 on success.
+ */
+int bch2_alloc_read(struct bch_fs *c)
+{
+	int ret;
+
+	down_read(&c->gc_lock);
+	ret = bch2_btree_and_journal_walk(c, BTREE_ID_alloc, bch2_alloc_read_fn);
+	up_read(&c->gc_lock);
+
+	if (ret)
+		bch_err(c, "error reading alloc info: %i", ret);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Write the in-memory state of the bucket at iter->pos back to the alloc
+ * btree, if it differs from what the btree currently holds.
+ *
+ * The key cache entry for the position is flushed first so that the btree
+ * key read afterwards is current.  The transaction is restarted on -EINTR.
+ *
+ * Returns 0 on success (including "nothing to write") or a negative error.
+ */
+static int bch2_alloc_write_key(struct btree_trans *trans,
+				struct btree_iter *iter,
+				unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	struct bch_dev *ca;
+	struct bucket *g;
+	struct bucket_mark m;
+	struct bkey_alloc_unpacked old_u, new_u;
+	struct bkey_alloc_buf a;
+	int ret;
+retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_btree_key_cache_flush(trans,
+			BTREE_ID_alloc, iter->pos);
+	if (ret)
+		goto err;
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	old_u = bch2_alloc_unpack(k);
+
+	/* Snapshot the in-memory bucket under mark_lock. */
+	percpu_down_read(&c->mark_lock);
+	ca	= bch_dev_bkey_exists(c, iter->pos.inode);
+	g	= bucket(ca, iter->pos.offset);
+	m	= READ_ONCE(g->mark);
+	new_u	= alloc_mem_to_key(iter, g, m);
+	percpu_up_read(&c->mark_lock);
+
+	if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+		return 0;
+
+	bch2_alloc_pack(c, &a, new_u);
+	/* Don't commit if queueing the update itself failed. */
+	ret   = bch2_trans_update(trans, iter, &a.k,
+				  BTREE_TRIGGER_NORUN) ?:
+		bch2_trans_commit(trans, NULL, NULL,
+				  BTREE_INSERT_NOFAIL|flags);
+err:
+	if (ret == -EINTR)
+		goto retry;
+	return ret;
+}
|
|
+
|
|
+/*
+ * Write back the in-memory bucket state of every bucket on every member
+ * device, via bch2_alloc_write_key().
+ *
+ * @flags is passed through to the transaction commit.  Stops at the first
+ * error and returns it; returns 0 on success.
+ */
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bch_dev *ca;
+	unsigned i;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	for_each_member_device(ca, c, i) {
+		bch2_btree_iter_set_pos(iter,
+			POS(ca->dev_idx, ca->mi.first_bucket));
+
+		while (iter->pos.offset < ca->mi.nbuckets) {
+			bch2_trans_cond_resched(&trans);
+
+			ret = bch2_alloc_write_key(&trans, iter, flags);
+			if (ret) {
+				/*
+				 * Breaking out of for_each_member_device()
+				 * early: drop the reference the iteration
+				 * holds on the current device, which is
+				 * ca->ref (io_ref is what
+				 * for_each_online_member() takes).
+				 */
+				percpu_ref_put(&ca->ref);
+				goto err;
+			}
+			bch2_btree_iter_next_slot(iter);
+		}
+	}
+err:
+	bch2_trans_iter_put(&trans, iter);
+	bch2_trans_exit(&trans);
+	return ret;
+}
|
|
+
|
|
+/* Bucket IO clocks: */
|
|
+
|
|
+/*
+ * Set the read or write time (selected by @rw) of bucket @bucket_nr on device
+ * @dev to the current value of the corresponding IO clock, and commit the
+ * updated alloc key.  Nothing is written if the stored time already equals
+ * the clock.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+			      size_t bucket_nr, int rw)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
+	struct btree_iter *iter;
+	struct bkey_alloc_buf *buf;
+	struct bkey_alloc_unpacked unpacked;
+	struct bucket *g;
+	u64 *stamp, now;
+	int ret = 0;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
+				   BTREE_ITER_CACHED|
+				   BTREE_ITER_CACHED_NOFILL|
+				   BTREE_ITER_INTENT);
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		goto out;
+
+	buf = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+	ret = PTR_ERR_OR_ZERO(buf);
+	if (ret)
+		goto out;
+
+	/* Snapshot the in-memory bucket under mark_lock. */
+	percpu_down_read(&c->mark_lock);
+	g = bucket(ca, bucket_nr);
+	unpacked = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
+	percpu_up_read(&c->mark_lock);
+
+	if (rw == READ)
+		stamp = &unpacked.read_time;
+	else
+		stamp = &unpacked.write_time;
+
+	now = atomic64_read(&c->io_clock[rw].now);
+	if (*stamp == now)
+		goto out;
+
+	*stamp = now;
+
+	bch2_alloc_pack(c, buf, unpacked);
+	ret = bch2_trans_update(trans, iter, &buf->k, 0);
+	if (!ret)
+		ret = bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/* Background allocator thread: */
|
|
+
|
|
+/*
|
|
+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
|
|
+ * (marking them as invalidated on disk), then optionally issues discard
|
|
+ * commands to the newly free buckets, then puts them on the various freelists.
|
|
+ */
|
|
+
|
|
+/*
+ * Can bucket @b (with mark @m) be invalidated and reused right now?
+ *
+ * Buckets that are unavailable, owned by the allocator or flagged in
+ * buckets_nouse are rejected outright.  For the rest, the device's
+ * inc_gen_needs_gc / inc_gen_really_needs_gc counters are bumped when the
+ * bucket's gc gen has reached half of / all of BUCKET_GC_GEN_MAX, and the
+ * bucket is usable only while its gc gen is below BUCKET_GC_GEN_MAX.
+ */
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+				       struct bucket_mark m)
+{
+	u8 gc_gen;
+
+	if (!is_available_bucket(m) || m.owned_by_allocator)
+		return false;
+
+	if (ca->buckets_nouse && test_bit(b, ca->buckets_nouse))
+		return false;
+
+	gc_gen = bucket_gc_gen(bucket(ca, b));
+
+	if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
+		ca->inc_gen_needs_gc++;
+
+	if (gc_gen >= BUCKET_GC_GEN_MAX) {
+		ca->inc_gen_really_needs_gc++;
+		return false;
+	}
+
+	return true;
+}
|
|
+
|
|
+/*
+ * Determines what order we're going to reuse buckets, smallest
+ * bucket_sort_key() first.
+ *
+ * @g/@m:		the bucket and a snapshot of its mark
+ * @now:		current value of the read IO clock
+ * @last_seq_ondisk:	passed through to bucket_needs_journal_commit()
+ */
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+				u64 now, u64 last_seq_ondisk)
+{
+	unsigned used = bucket_sectors_used(m);
+
+	if (used) {
+		/*
+		 * Prefer to keep buckets that have been read more recently, and
+		 * buckets that have more data in them:
+		 */
+		u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+		/*
+		 * Clamp into u32 range before narrowing.  (Taking the max with
+		 * U32_MAX, as this used to, always produced U32_MAX and so the
+		 * same key for every in-use bucket.)  The lower bound of 1
+		 * keeps a very recently read bucket from negating to 0, the
+		 * smallest - i.e. first reused - key.
+		 */
+		u32 last_read_scaled = clamp_t(u64, div_u64(last_read, used),
+					       1, U32_MAX);
+
+		/* Older/emptier buckets negate to smaller keys. */
+		return -last_read_scaled;
+	} else {
+		/*
+		 * Prefer to use buckets with smaller gc_gen so that we don't
+		 * have to walk the btree and recalculate oldest_gen - but shift
+		 * off the low bits so that buckets will still have equal sort
+		 * keys when there's only a small difference, so that we can
+		 * keep sequential buckets together:
+		 */
+		return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+			(bucket_gc_gen(g) >> 4);
+	}
+}
|
|
+
|
|
+/*
+ * Heap ordering for reclaim candidates: compare by sort key, then by run
+ * length with the operands swapped (r.nr against l.nr), then by starting
+ * bucket.
+ */
+static inline int bucket_alloc_cmp(alloc_heap *h,
+				   struct alloc_heap_entry l,
+				   struct alloc_heap_entry r)
+{
+	int ret = cmp_int(l.key, r.key);
+
+	if (!ret)
+		ret = cmp_int(r.nr, l.nr);
+	if (!ret)
+		ret = cmp_int(l.bucket, r.bucket);
+
+	return ret;
+}
|
|
+
|
|
+/* sort() comparison callback: order alloc_heap_entry items by bucket index. */
+static inline int bucket_idx_cmp(const void *_l, const void *_r)
+{
+	const struct alloc_heap_entry *lhs = _l;
+	const struct alloc_heap_entry *rhs = _r;
+
+	return cmp_int(lhs->bucket, rhs->bucket);
+}
|
|
+
|
|
+/*
+ * Fill ca->alloc_heap with the buckets that should be reused first according
+ * to bucket_sort_key().
+ *
+ * Runs of adjacent invalidatable buckets with equal sort keys are merged into
+ * a single heap entry (.bucket = first bucket, .nr = run length).  Afterwards
+ * entries are popped off the top of the heap for as long as the remaining
+ * entries still cover at least ALLOC_SCAN_BATCH(ca) buckets.
+ */
+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bucket_array *buckets;
+	struct alloc_heap_entry e = { 0 };
+	u64 now, last_seq_ondisk;
+	size_t b, i, nr = 0;
+
+	down_read(&ca->bucket_lock);
+
+	buckets = bucket_array(ca);
+	ca->alloc_heap.used = 0;
+	now = atomic64_read(&c->io_clock[READ].now);
+	last_seq_ondisk = c->journal.last_seq_ondisk;
+
+	/*
+	 * Find buckets with lowest read priority, by building a maxheap sorted
+	 * by read priority and repeatedly replacing the maximum element until
+	 * all buckets have been visited.
+	 */
+	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
+		struct bucket *g = &buckets->b[b];
+		struct bucket_mark m = READ_ONCE(g->mark);
+		unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+
+		cond_resched();
+
+		if (!bch2_can_invalidate_bucket(ca, b, m))
+			continue;
+
+		if (e.nr && e.bucket + e.nr == b && e.key == key) {
+			/* Extends the current run. */
+			e.nr++;
+		} else {
+			/* Flush the finished run and start a new one at b. */
+			if (e.nr)
+				heap_add_or_replace(&ca->alloc_heap, e,
+					-bucket_alloc_cmp, NULL);
+
+			e = (struct alloc_heap_entry) {
+				.bucket = b,
+				.nr	= 1,
+				.key	= key,
+			};
+		}
+	}
+
+	if (e.nr)
+		heap_add_or_replace(&ca->alloc_heap, e,
+				-bucket_alloc_cmp, NULL);
+
+	for (i = 0; i < ca->alloc_heap.used; i++)
+		nr += ca->alloc_heap.data[i].nr;
+
+	/*
+	 * data[0] is only meaningful while the heap is non-empty: with an
+	 * empty heap it holds whatever an earlier scan left behind, and
+	 * nr - data[0].nr would wrap around.
+	 */
+	while (ca->alloc_heap.used &&
+	       nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
+		nr -= ca->alloc_heap.data[0].nr;
+		heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
+	}
+
+	up_read(&ca->bucket_lock);
+}
|
|
+
|
|
+/*
+ * Collect reclaim candidates in bucket order, resuming just after
+ * ca->fifo_last_bucket and wrapping from the last bucket back to the first.
+ * Stops once the heap is full or the scan is back where it started.
+ */
+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bucket_array *buckets = bucket_array(ca);
+	size_t first, cur;
+
+	/* Restart from the beginning if the saved position is out of range. */
+	if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
+	    ca->fifo_last_bucket >= ca->mi.nbuckets)
+		ca->fifo_last_bucket = ca->mi.first_bucket;
+
+	first = ca->fifo_last_bucket;
+
+	for (;;) {
+		struct bucket_mark m;
+
+		if (++ca->fifo_last_bucket == ca->mi.nbuckets)
+			ca->fifo_last_bucket = ca->mi.first_bucket;
+
+		cur = ca->fifo_last_bucket;
+		m = READ_ONCE(buckets->b[cur].mark);
+
+		if (bch2_can_invalidate_bucket(ca, cur, m)) {
+			struct alloc_heap_entry e = { .bucket = cur, .nr = 1, };
+
+			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+			if (heap_full(&ca->alloc_heap))
+				break;
+		}
+
+		cond_resched();
+
+		if (ca->fifo_last_bucket == first)
+			break;
+	}
+}
|
|
+
|
|
+/*
+ * Collect reclaim candidates by probing random buckets: at most nbuckets / 2
+ * probes, stopping early once the heap is full.  The entries are then sorted
+ * by bucket index and, of any two adjacent entries naming the same bucket,
+ * the first has its nr set to 0.
+ */
+static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bucket_array *buckets = bucket_array(ca);
+	size_t probes = 0, i;
+
+	while (probes < ca->mi.nbuckets / 2) {
+		size_t b = bch2_rand_range(ca->mi.nbuckets -
+					   ca->mi.first_bucket) +
+			ca->mi.first_bucket;
+		struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+
+		if (bch2_can_invalidate_bucket(ca, b, m)) {
+			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };
+
+			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+			if (heap_full(&ca->alloc_heap))
+				break;
+		}
+
+		cond_resched();
+		probes++;
+	}
+
+	sort(ca->alloc_heap.data,
+	     ca->alloc_heap.used,
+	     sizeof(ca->alloc_heap.data[0]),
+	     bucket_idx_cmp, NULL);
+
+	/* remove duplicates: */
+	for (i = 0; i + 1 < ca->alloc_heap.used; i++) {
+		struct alloc_heap_entry *this = &ca->alloc_heap.data[i];
+		struct alloc_heap_entry *next = &ca->alloc_heap.data[i + 1];
+
+		if (this->bucket == next->bucket)
+			this->nr = 0;
+	}
+}
|
|
+
|
|
+/*
+ * Rebuild ca->alloc_heap using the device's configured cache replacement
+ * policy, after resetting the inc_gen_*needs_gc counters.
+ *
+ * Returns the total number of buckets covered by the heap entries.
+ */
+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+{
+	size_t total = 0;
+	size_t idx;
+
+	ca->inc_gen_needs_gc		= 0;
+	ca->inc_gen_really_needs_gc	= 0;
+
+	switch (ca->mi.replacement) {
+	case BCH_CACHE_REPLACEMENT_random:
+		find_reclaimable_buckets_random(c, ca);
+		break;
+	case BCH_CACHE_REPLACEMENT_fifo:
+		find_reclaimable_buckets_fifo(c, ca);
+		break;
+	case BCH_CACHE_REPLACEMENT_lru:
+		find_reclaimable_buckets_lru(c, ca);
+		break;
+	}
+
+	/* The scanners leave the heap in their own order; re-sort it. */
+	heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
+
+	for (idx = 0; idx < ca->alloc_heap.used; idx++)
+		total += ca->alloc_heap.data[idx].nr;
+
+	return total;
+}
|
|
+
|
|
+/*
+ * returns sequence number of most recent journal entry that updated this
+ * bucket, or 0 if the mark holds no valid journal sequence number.
+ *
+ * The mark only stores the low 16 bits; the upper bits are taken from the
+ * current journal sequence number, stepping back by 1 << 16 if that would
+ * yield a number greater than the current one.
+ */
+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
+{
+	u64 cur_seq, seq;
+
+	if (!m.journal_seq_valid)
+		return 0;
+
+	cur_seq = atomic64_read(&c->journal.seq);
+
+	seq = (cur_seq & ~((u64) U16_MAX)) | m.journal_seq;
+	if (seq > cur_seq)
+		seq -= 1 << 16;
+
+	return seq;
+}
|
|
+
|
|
+/*
+ * Queue, in @trans, an alloc key update that invalidates bucket @b on @ca:
+ * the generation is incremented, data type and sector counts are cleared and
+ * both IO times are reset to the current clocks.  The update is queued with
+ * BTREE_TRIGGER_BUCKET_INVALIDATE; this function does not commit it.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+static int bucket_invalidate_btree(struct btree_trans *trans,
+				   struct bch_dev *ca, u64 b)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_alloc_buf *a;
+	struct bkey_alloc_unpacked u;
+	struct bucket *g;
+	struct bucket_mark m;
+	struct btree_iter *iter =
+		bch2_trans_get_iter(trans, BTREE_ID_alloc,
+				    POS(ca->dev_idx, b),
+				    BTREE_ITER_CACHED|
+				    BTREE_ITER_CACHED_NOFILL|
+				    BTREE_ITER_INTENT);
+	int ret;
+
+	a = bch2_trans_kmalloc(trans, sizeof(*a));
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		goto err;
+
+	ret = bch2_btree_iter_traverse(iter);
+	if (ret)
+		goto err;
+
+	/* Snapshot the in-memory bucket under mark_lock. */
+	percpu_down_read(&c->mark_lock);
+	g = bucket(ca, b);
+	m = READ_ONCE(g->mark);
+	u = alloc_mem_to_key(iter, g, m);
+	percpu_up_read(&c->mark_lock);
+
+	u.gen++;
+	u.data_type	= 0;
+	u.dirty_sectors	= 0;
+	u.cached_sectors = 0;
+	u.read_time	= atomic64_read(&c->io_clock[READ].now);
+	u.write_time	= atomic64_read(&c->io_clock[WRITE].now);
+
+	bch2_alloc_pack(c, a, u);
+	/* Report a failure to queue the update instead of dropping it. */
+	ret = bch2_trans_update(trans, iter, &a->k,
+				BTREE_TRIGGER_BUCKET_INVALIDATE);
+err:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
|
|
+ u64 *journal_seq, unsigned flags)
|
|
+{
|
|
+ struct bucket *g;
|
|
+ struct bucket_mark m;
|
|
+ size_t b;
|
|
+ int ret = 0;
|
|
+
|
|
+ BUG_ON(!ca->alloc_heap.used ||
|
|
+ !ca->alloc_heap.data[0].nr);
|
|
+ b = ca->alloc_heap.data[0].bucket;
|
|
+
|
|
+ /* first, put on free_inc and mark as owned by allocator: */
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ g = bucket(ca, b);
|
|
+ m = READ_ONCE(g->mark);
|
|
+
|
|
+ BUG_ON(m.dirty_sectors);
|
|
+
|
|
+ bch2_mark_alloc_bucket(c, ca, b, true);
|
|
+
|
|
+ spin_lock(&c->freelist_lock);
|
|
+ verify_not_on_freelist(c, ca, b);
|
|
+ BUG_ON(!fifo_push(&ca->free_inc, b));
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+
|
|
+ /*
|
|
+ * If we're not invalidating cached data, we only increment the bucket
|
|
+ * gen in memory here, the incremented gen will be updated in the btree
|
|
+ * by bch2_trans_mark_pointer():
|
|
+ */
|
|
+ if (!m.cached_sectors &&
|
|
+ !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
|
|
+ BUG_ON(m.data_type);
|
|
+ bucket_cmpxchg(g, m, m.gen++);
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ /*
|
|
+ * If the read-only path is trying to shut down, we can't be generating
|
|
+ * new btree updates:
|
|
+ */
|
|
+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {
|
|
+ ret = 1;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_do(c, NULL, journal_seq,
|
|
+ BTREE_INSERT_NOCHECK_RW|
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_JOURNAL_RESERVED|
|
|
+ flags,
|
|
+ bucket_invalidate_btree(&trans, ca, b));
|
|
+out:
|
|
+ if (!ret) {
|
|
+ /* remove from alloc_heap: */
|
|
+ struct alloc_heap_entry e, *top = ca->alloc_heap.data;
|
|
+
|
|
+ top->bucket++;
|
|
+ top->nr--;
|
|
+
|
|
+ if (!top->nr)
|
|
+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
|
|
+
|
|
+ /*
|
|
+ * Make sure we flush the last journal entry that updated this
|
|
+ * bucket (i.e. deleting the last reference) before writing to
|
|
+ * this bucket again:
|
|
+ */
|
|
+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
|
|
+ } else {
|
|
+ size_t b2;
|
|
+
|
|
+ /* remove from free_inc: */
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ spin_lock(&c->freelist_lock);
|
|
+
|
|
+ bch2_mark_alloc_bucket(c, ca, b, false);
|
|
+
|
|
+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
|
|
+ BUG_ON(b != b2);
|
|
+
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+ }
|
|
+
|
|
+ return ret < 0 ? ret : 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
|
|
+ */
|
|
+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ u64 journal_seq = 0;
|
|
+ int ret = 0;
|
|
+
|
|
+ /* Only use nowait if we've already invalidated at least one bucket: */
|
|
+ while (!ret &&
|
|
+ !fifo_full(&ca->free_inc) &&
|
|
+ ca->alloc_heap.used) {
|
|
+ ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
|
|
+ (!fifo_empty(&ca->free_inc)
|
|
+ ? BTREE_INSERT_NOWAIT : 0));
|
|
+ /*
|
|
+ * We only want to batch up invalidates when they're going to
|
|
+ * require flushing the journal:
|
|
+ */
|
|
+ if (!journal_seq)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* If we used NOWAIT, don't return the error: */
|
|
+ if (!fifo_empty(&ca->free_inc))
|
|
+ ret = 0;
|
|
+ if (ret) {
|
|
+ bch_err(ca, "error invalidating buckets: %i", ret);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ if (journal_seq)
|
|
+ ret = bch2_journal_flush_seq(&c->journal, journal_seq);
|
|
+ if (ret) {
|
|
+ bch_err(ca, "journal error: %i", ret);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
|
|
+{
|
|
+ if (ca->allocator_state != new_state) {
|
|
+ ca->allocator_state = new_state;
|
|
+ closure_wake_up(&ca->fs->freelist_wait);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
|
|
+{
|
|
+ unsigned i;
|
|
+ int ret = 0;
|
|
+
|
|
+ spin_lock(&c->freelist_lock);
|
|
+ for (i = 0; i < RESERVE_NR; i++) {
|
|
+ /*
|
|
+ * Don't strand buckets on the copygc freelist until
|
|
+ * after recovery is finished:
|
|
+ */
|
|
+ if (i == RESERVE_MOVINGGC &&
|
|
+ !test_bit(BCH_FS_STARTED, &c->flags))
|
|
+ continue;
|
|
+
|
|
+ if (fifo_push(&ca->free[i], b)) {
|
|
+ fifo_pop(&ca->free_inc, b);
|
|
+ ret = 1;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+
|
|
+ ca->allocator_state = ret
|
|
+ ? ALLOCATOR_running
|
|
+ : ALLOCATOR_blocked_full;
|
|
+ closure_wake_up(&c->freelist_wait);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
|
|
+{
|
|
+ if (ca->mi.discard &&
|
|
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
|
|
+ blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
|
|
+ ca->mi.bucket_size, GFP_NOFS, 0);
|
|
+}
|
|
+
|
|
+static bool allocator_thread_running(struct bch_dev *ca)
|
|
+{
|
|
+ unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
|
|
+ test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
|
|
+ ? ALLOCATOR_running
|
|
+ : ALLOCATOR_stopped;
|
|
+ alloc_thread_set_state(ca, state);
|
|
+ return state == ALLOCATOR_running;
|
|
+}
|
|
+
|
|
+static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
|
|
+{
|
|
+ s64 available = dev_buckets_reclaimable(ca) -
|
|
+ (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
|
|
+ bool ret = available > 0;
|
|
+
|
|
+ alloc_thread_set_state(ca, ret
|
|
+ ? ALLOCATOR_running
|
|
+ : ALLOCATOR_blocked);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_allocator_thread - move buckets from free_inc to reserves
|
|
+ *
|
|
+ * The free_inc FIFO is populated by bch2_invalidate_buckets(), and
|
|
+ * the reserves are depleted by bucket allocation. When we run out
|
|
+ * of free_inc, try to invalidate some buckets and write out
|
|
+ * prios and gens.
|
|
+ */
|
|
+static int bch2_allocator_thread(void *arg)
|
|
+{
|
|
+ struct bch_dev *ca = arg;
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ unsigned long gc_count = c->gc_count;
|
|
+ size_t nr;
|
|
+ int ret;
|
|
+
|
|
+ set_freezable();
|
|
+
|
|
+ while (1) {
|
|
+ ret = kthread_wait_freezable(allocator_thread_running(ca));
|
|
+ if (ret)
|
|
+ goto stop;
|
|
+
|
|
+ while (!ca->alloc_heap.used) {
|
|
+ cond_resched();
|
|
+
|
|
+ ret = kthread_wait_freezable(buckets_available(ca, gc_count));
|
|
+ if (ret)
|
|
+ goto stop;
|
|
+
|
|
+ gc_count = c->gc_count;
|
|
+ nr = find_reclaimable_buckets(c, ca);
|
|
+
|
|
+ trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
|
|
+ ca->inc_gen_really_needs_gc);
|
|
+
|
|
+ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
|
|
+ ca->inc_gen_really_needs_gc) &&
|
|
+ c->gc_thread) {
|
|
+ atomic_inc(&c->kick_gc);
|
|
+ wake_up_process(c->gc_thread);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ret = bch2_invalidate_buckets(c, ca);
|
|
+ if (ret)
|
|
+ goto stop;
|
|
+
|
|
+ while (!fifo_empty(&ca->free_inc)) {
|
|
+ u64 b = fifo_peek(&ca->free_inc);
|
|
+
|
|
+ discard_one_bucket(c, ca, b);
|
|
+
|
|
+ ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
|
|
+ if (ret)
|
|
+ goto stop;
|
|
+ }
|
|
+ }
|
|
+stop:
|
|
+ alloc_thread_set_state(ca, ALLOCATOR_stopped);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Startup/shutdown (ro/rw): */
|
|
+
|
|
+void bch2_recalc_capacity(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ u64 capacity = 0, reserved_sectors = 0, gc_reserve;
|
|
+ unsigned bucket_size_max = 0;
|
|
+ unsigned long ra_pages = 0;
|
|
+ unsigned i, j;
|
|
+
|
|
+ lockdep_assert_held(&c->state_lock);
|
|
+
|
|
+ for_each_online_member(ca, c, i) {
|
|
+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi;
|
|
+
|
|
+ ra_pages += bdi->ra_pages;
|
|
+ }
|
|
+
|
|
+ bch2_set_ra_pages(c, ra_pages);
|
|
+
|
|
+ for_each_rw_member(ca, c, i) {
|
|
+ u64 dev_reserve = 0;
|
|
+
|
|
+ /*
|
|
+ * We need to reserve buckets (from the number
|
|
+ * of currently available buckets) against
|
|
+ * foreground writes so that mainly copygc can
|
|
+ * make forward progress.
|
|
+ *
|
|
+ * We need enough to refill the various reserves
|
|
+ * from scratch - copygc will use its entire
|
|
+ * reserve all at once, then run again when
|
|
+ * its reserve is refilled (from the formerly
|
|
+ * available buckets).
|
|
+ *
|
|
+ * This reserve is just used when considering if
|
|
+ * allocations for foreground writes must wait -
|
|
+ * not -ENOSPC calculations.
|
|
+ */
|
|
+ for (j = 0; j < RESERVE_NONE; j++)
|
|
+ dev_reserve += ca->free[j].size;
|
|
+
|
|
+ dev_reserve += 1; /* btree write point */
|
|
+ dev_reserve += 1; /* copygc write point */
|
|
+ dev_reserve += 1; /* rebalance write point */
|
|
+
|
|
+ dev_reserve *= ca->mi.bucket_size;
|
|
+
|
|
+ capacity += bucket_to_sector(ca, ca->mi.nbuckets -
|
|
+ ca->mi.first_bucket);
|
|
+
|
|
+ reserved_sectors += dev_reserve * 2;
|
|
+
|
|
+ bucket_size_max = max_t(unsigned, bucket_size_max,
|
|
+ ca->mi.bucket_size);
|
|
+ }
|
|
+
|
|
+ gc_reserve = c->opts.gc_reserve_bytes
|
|
+ ? c->opts.gc_reserve_bytes >> 9
|
|
+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100);
|
|
+
|
|
+ reserved_sectors = max(gc_reserve, reserved_sectors);
|
|
+
|
|
+ reserved_sectors = min(reserved_sectors, capacity);
|
|
+
|
|
+ c->capacity = capacity - reserved_sectors;
|
|
+
|
|
+ c->bucket_size_max = bucket_size_max;
|
|
+
|
|
+ /* Wake up in case someone was waiting for buckets */
|
|
+ closure_wake_up(&c->freelist_wait);
|
|
+}
|
|
+
|
|
+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ bool ret = false;
|
|
+
|
|
+ for (ob = c->open_buckets;
|
|
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
|
|
+ ob++) {
|
|
+ spin_lock(&ob->lock);
|
|
+ if (ob->valid && !ob->on_partial_list &&
|
|
+ ob->ptr.dev == ca->dev_idx)
|
|
+ ret = true;
|
|
+ spin_unlock(&ob->lock);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* device goes ro: */
|
|
+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ BUG_ON(ca->alloc_thread);
|
|
+
|
|
+ /* First, remove device from allocation groups: */
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
|
+ clear_bit(ca->dev_idx, c->rw_devs[i].d);
|
|
+
|
|
+ /*
|
|
+ * Capacity is calculated based off of devices in allocation groups:
|
|
+ */
|
|
+ bch2_recalc_capacity(c);
|
|
+
|
|
+ /* Next, close write points that point to this device... */
|
|
+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
|
|
+ bch2_writepoint_stop(c, ca, &c->write_points[i]);
|
|
+
|
|
+ bch2_writepoint_stop(c, ca, &c->copygc_write_point);
|
|
+ bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
|
|
+ bch2_writepoint_stop(c, ca, &c->btree_write_point);
|
|
+
|
|
+ mutex_lock(&c->btree_reserve_cache_lock);
|
|
+ while (c->btree_reserve_cache_nr) {
|
|
+ struct btree_alloc *a =
|
|
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
|
|
+
|
|
+ bch2_open_buckets_put(c, &a->ob);
|
|
+ }
|
|
+ mutex_unlock(&c->btree_reserve_cache_lock);
|
|
+
|
|
+ while (1) {
|
|
+ struct open_bucket *ob;
|
|
+
|
|
+ spin_lock(&c->freelist_lock);
|
|
+ if (!ca->open_buckets_partial_nr) {
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+ break;
|
|
+ }
|
|
+ ob = c->open_buckets +
|
|
+ ca->open_buckets_partial[--ca->open_buckets_partial_nr];
|
|
+ ob->on_partial_list = false;
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+
|
|
+ bch2_open_bucket_put(c, ob);
|
|
+ }
|
|
+
|
|
+ bch2_ec_stop_dev(c, ca);
|
|
+
|
|
+ /*
|
|
+ * Wake up threads that were blocked on allocation, so they can notice
|
|
+ * the device is no longer allocatable and the capacity has changed:
|
|
+ */
|
|
+ closure_wake_up(&c->freelist_wait);
|
|
+
|
|
+ /*
|
|
+ * journal_res_get() can block waiting for free space in the journal -
|
|
+ * it needs to notice there may not be devices to allocate from anymore:
|
|
+ */
|
|
+ wake_up(&c->journal.wait);
|
|
+
|
|
+ /* Now wait for any in flight writes: */
|
|
+
|
|
+ closure_wait_event(&c->open_buckets_wait,
|
|
+ !bch2_dev_has_open_write_point(c, ca));
|
|
+}
|
|
+
|
|
+/* device goes rw: */
|
|
+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
|
|
+ if (ca->mi.data_allowed & (1 << i))
|
|
+ set_bit(ca->dev_idx, c->rw_devs[i].d);
|
|
+}
|
|
+
|
|
+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ if (ca->alloc_thread)
|
|
+ closure_wait_event(&c->freelist_wait,
|
|
+ ca->allocator_state != ALLOCATOR_running);
|
|
+}
|
|
+
|
|
+/* stop allocator thread: */
|
|
+void bch2_dev_allocator_stop(struct bch_dev *ca)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+
|
|
+ p = rcu_dereference_protected(ca->alloc_thread, 1);
|
|
+ ca->alloc_thread = NULL;
|
|
+
|
|
+ /*
|
|
+ * We need an rcu barrier between setting ca->alloc_thread = NULL and
|
|
+ * the thread shutting down to avoid bch2_wake_allocator() racing:
|
|
+ *
|
|
+ * XXX: it would be better to have the rcu barrier be asynchronous
|
|
+ * instead of blocking us here
|
|
+ */
|
|
+ synchronize_rcu();
|
|
+
|
|
+ if (p) {
|
|
+ kthread_stop(p);
|
|
+ put_task_struct(p);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* start allocator thread: */
|
|
+int bch2_dev_allocator_start(struct bch_dev *ca)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+
|
|
+ /*
|
|
+ * allocator thread already started?
|
|
+ */
|
|
+ if (ca->alloc_thread)
|
|
+ return 0;
|
|
+
|
|
+ p = kthread_create(bch2_allocator_thread, ca,
|
|
+ "bch-alloc/%s", ca->name);
|
|
+ if (IS_ERR(p)) {
|
|
+ bch_err(ca->fs, "error creating allocator thread: %li",
|
|
+ PTR_ERR(p));
|
|
+ return PTR_ERR(p);
|
|
+ }
|
|
+
|
|
+ get_task_struct(p);
|
|
+ rcu_assign_pointer(ca->alloc_thread, p);
|
|
+ wake_up_process(p);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_fs_allocator_background_init(struct bch_fs *c)
|
|
+{
|
|
+ spin_lock_init(&c->freelist_lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
|
|
new file mode 100644
|
|
index 000000000000..9cadfdb5b83d
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/alloc_background.h
|
|
@@ -0,0 +1,135 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H
|
|
+#define _BCACHEFS_ALLOC_BACKGROUND_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_types.h"
|
|
+#include "debug.h"
|
|
+
|
|
+extern const char * const bch2_allocator_states[];
|
|
+
|
|
+struct bkey_alloc_unpacked {
|
|
+ u64 bucket;
|
|
+ u8 dev;
|
|
+ u8 gen;
|
|
+ u8 oldest_gen;
|
|
+ u8 data_type;
|
|
+#define x(_name, _bits) u##_bits _name;
|
|
+ BCH_ALLOC_FIELDS_V2()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+struct bkey_alloc_buf {
|
|
+ struct bkey_i k;
|
|
+
|
|
+ union {
|
|
+ struct {
|
|
+#define x(_name, _bits) + _bits / 8
|
|
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
|
|
+#undef x
|
|
+ } _v1;
|
|
+ struct {
|
|
+#define x(_name, _bits) + 8 + _bits / 8
|
|
+ u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
|
|
+#undef x
|
|
+ } _v2;
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/* How out of date a pointer gen is allowed to be: */
|
|
+#define BUCKET_GC_GEN_MAX 96U
|
|
+
|
|
+/* returns true if not equal */
|
|
+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
|
|
+ struct bkey_alloc_unpacked r)
|
|
+{
|
|
+ return l.gen != r.gen ||
|
|
+ l.oldest_gen != r.oldest_gen ||
|
|
+ l.data_type != r.data_type
|
|
+#define x(_name, ...) || l._name != r._name
|
|
+ BCH_ALLOC_FIELDS_V2()
|
|
+#undef x
|
|
+ ;
|
|
+}
|
|
+
|
|
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
|
|
+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
|
|
+ const struct bkey_alloc_unpacked);
|
|
+
|
|
+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
|
|
+
|
|
+static inline struct bkey_alloc_unpacked
|
|
+alloc_mem_to_key(struct btree_iter *iter,
|
|
+ struct bucket *g, struct bucket_mark m)
|
|
+{
|
|
+ return (struct bkey_alloc_unpacked) {
|
|
+ .dev = iter->pos.inode,
|
|
+ .bucket = iter->pos.offset,
|
|
+ .gen = m.gen,
|
|
+ .oldest_gen = g->oldest_gen,
|
|
+ .data_type = m.data_type,
|
|
+ .dirty_sectors = m.dirty_sectors,
|
|
+ .cached_sectors = m.cached_sectors,
|
|
+ .read_time = g->io_time[READ],
|
|
+ .write_time = g->io_time[WRITE],
|
|
+ };
|
|
+}
|
|
+
|
|
+#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
|
|
+
|
|
+const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+#define bch2_bkey_ops_alloc (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_alloc_v1_invalid, \
|
|
+ .val_to_text = bch2_alloc_to_text, \
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_alloc_v2_invalid, \
|
|
+ .val_to_text = bch2_alloc_to_text, \
|
|
+}
|
|
+
|
|
+int bch2_alloc_read(struct bch_fs *);
|
|
+
|
|
+static inline void bch2_wake_allocator(struct bch_dev *ca)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ p = rcu_dereference(ca->alloc_thread);
|
|
+ if (p)
|
|
+ wake_up_process(p);
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
|
|
+ size_t bucket)
|
|
+{
|
|
+ if (bch2_expensive_debug_checks) {
|
|
+ size_t iter;
|
|
+ long i;
|
|
+ unsigned j;
|
|
+
|
|
+ for (j = 0; j < RESERVE_NR; j++)
|
|
+ fifo_for_each_entry(i, &ca->free[j], iter)
|
|
+ BUG_ON(i == bucket);
|
|
+ fifo_for_each_entry(i, &ca->free_inc, iter)
|
|
+ BUG_ON(i == bucket);
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_recalc_capacity(struct bch_fs *);
|
|
+
|
|
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
|
|
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
|
|
+
|
|
+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
|
|
+void bch2_dev_allocator_stop(struct bch_dev *);
|
|
+int bch2_dev_allocator_start(struct bch_dev *);
|
|
+
|
|
+int bch2_alloc_write(struct bch_fs *, unsigned);
|
|
+void bch2_fs_allocator_background_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
|
|
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
|
|
new file mode 100644
|
|
index 000000000000..412fed479482
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/alloc_foreground.c
|
|
@@ -0,0 +1,960 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ *
|
|
+ * Foreground allocator code: allocate buckets from freelist, and allocate in
|
|
+ * sector granularity from writepoints.
|
|
+ *
|
|
+ * bch2_bucket_alloc() allocates a single bucket from a specific device.
|
|
+ *
|
|
+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices
|
|
+ * in a given filesystem.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_background.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "btree_gc.h"
|
|
+#include "buckets.h"
|
|
+#include "clock.h"
|
|
+#include "debug.h"
|
|
+#include "disk_groups.h"
|
|
+#include "ec.h"
|
|
+#include "io.h"
|
|
+
|
|
+#include <linux/math64.h>
|
|
+#include <linux/rculist.h>
|
|
+#include <linux/rcupdate.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/*
|
|
+ * Open buckets represent a bucket that's currently being allocated from. They
|
|
+ * serve two purposes:
|
|
+ *
|
|
+ * - They track buckets that have been partially allocated, allowing for
|
|
+ * sub-bucket sized allocations - they're used by the sector allocator below
|
|
+ *
|
|
+ * - They provide a reference to the buckets they own that mark and sweep GC
|
|
+ * can find, until the new allocation has a pointer to it inserted into the
|
|
+ * btree
|
|
+ *
|
|
+ * When allocating some space with the sector allocator, the allocation comes
|
|
+ * with a reference to an open bucket - the caller is required to put that
|
|
+ * reference _after_ doing the index update that makes its allocation reachable.
|
|
+ */
|
|
+
|
|
+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
|
|
+{
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
|
|
+
|
|
+ if (ob->ec) {
|
|
+ bch2_ec_bucket_written(c, ob);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ spin_lock(&ob->lock);
|
|
+
|
|
+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), false);
|
|
+ ob->valid = false;
|
|
+ ob->type = 0;
|
|
+
|
|
+ spin_unlock(&ob->lock);
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ spin_lock(&c->freelist_lock);
|
|
+ ob->freelist = c->open_buckets_freelist;
|
|
+ c->open_buckets_freelist = ob - c->open_buckets;
|
|
+
|
|
+ c->open_buckets_nr_free++;
|
|
+ ca->nr_open_buckets--;
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+
|
|
+ closure_wake_up(&c->open_buckets_wait);
|
|
+}
|
|
+
|
|
+void bch2_open_bucket_write_error(struct bch_fs *c,
|
|
+ struct open_buckets *obs,
|
|
+ unsigned dev)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, obs, ob, i)
|
|
+ if (ob->ptr.dev == dev &&
|
|
+ ob->ec)
|
|
+ bch2_ec_bucket_cancel(c, ob);
|
|
+}
|
|
+
|
|
+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+
|
|
+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free);
|
|
+
|
|
+ ob = c->open_buckets + c->open_buckets_freelist;
|
|
+ c->open_buckets_freelist = ob->freelist;
|
|
+ atomic_set(&ob->pin, 1);
|
|
+ ob->type = 0;
|
|
+
|
|
+ c->open_buckets_nr_free--;
|
|
+ return ob;
|
|
+}
|
|
+
|
|
+static void open_bucket_free_unused(struct bch_fs *c,
|
|
+ struct write_point *wp,
|
|
+ struct open_bucket *ob)
|
|
+{
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
|
|
+ bool may_realloc = wp->type == BCH_DATA_user;
|
|
+
|
|
+ BUG_ON(ca->open_buckets_partial_nr >
|
|
+ ARRAY_SIZE(ca->open_buckets_partial));
|
|
+
|
|
+ if (ca->open_buckets_partial_nr <
|
|
+ ARRAY_SIZE(ca->open_buckets_partial) &&
|
|
+ may_realloc) {
|
|
+ spin_lock(&c->freelist_lock);
|
|
+ ob->on_partial_list = true;
|
|
+ ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
|
|
+ ob - c->open_buckets;
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+
|
|
+ closure_wake_up(&c->open_buckets_wait);
|
|
+ closure_wake_up(&c->freelist_wait);
|
|
+ } else {
|
|
+ bch2_open_bucket_put(c, ob);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs)
|
|
+{
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, obs, ob, i) {
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
|
|
+
|
|
+ BUG_ON(ptr_stale(ca, &ob->ptr));
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+/* _only_ for allocating the journal on a new device: */
|
|
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
|
|
+{
|
|
+ struct bucket_array *buckets;
|
|
+ ssize_t b;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ buckets = bucket_array(ca);
|
|
+
|
|
+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++)
|
|
+ if (is_available_bucket(buckets->b[b].mark) &&
|
|
+ !buckets->b[b].mark.owned_by_allocator)
|
|
+ goto success;
|
|
+ b = -1;
|
|
+success:
|
|
+ rcu_read_unlock();
|
|
+ return b;
|
|
+}
|
|
+
|
|
+static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
|
|
+{
|
|
+ switch (reserve) {
|
|
+ case RESERVE_BTREE:
|
|
+ case RESERVE_BTREE_MOVINGGC:
|
|
+ return 0;
|
|
+ case RESERVE_MOVINGGC:
|
|
+ return OPEN_BUCKETS_COUNT / 4;
|
|
+ default:
|
|
+ return OPEN_BUCKETS_COUNT / 2;
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_bucket_alloc - allocate a single bucket from a specific device
|
|
+ *
|
|
+ * Returns an open_bucket on success, an ERR_PTR() on failure
|
|
+ * */
|
|
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
|
|
+ enum alloc_reserve reserve,
|
|
+ bool may_alloc_partial,
|
|
+ struct closure *cl)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ long b = 0;
|
|
+
|
|
+ spin_lock(&c->freelist_lock);
|
|
+
|
|
+ if (may_alloc_partial) {
|
|
+ int i;
|
|
+
|
|
+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
|
|
+ ob = c->open_buckets + ca->open_buckets_partial[i];
|
|
+
|
|
+ if (reserve <= ob->alloc_reserve) {
|
|
+ array_remove_item(ca->open_buckets_partial,
|
|
+ ca->open_buckets_partial_nr,
|
|
+ i);
|
|
+ ob->on_partial_list = false;
|
|
+ ob->alloc_reserve = reserve;
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+ return ob;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
|
|
+ if (cl)
|
|
+ closure_wait(&c->open_buckets_wait, cl);
|
|
+
|
|
+ if (!c->blocked_allocate_open_bucket)
|
|
+ c->blocked_allocate_open_bucket = local_clock();
|
|
+
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+ trace_open_bucket_alloc_fail(ca, reserve);
|
|
+ return ERR_PTR(-OPEN_BUCKETS_EMPTY);
|
|
+ }
|
|
+
|
|
+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
|
|
+ goto out;
|
|
+
|
|
+ switch (reserve) {
|
|
+ case RESERVE_BTREE_MOVINGGC:
|
|
+ case RESERVE_MOVINGGC:
|
|
+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
|
|
+ goto out;
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (cl)
|
|
+ closure_wait(&c->freelist_wait, cl);
|
|
+
|
|
+ if (!c->blocked_allocate)
|
|
+ c->blocked_allocate = local_clock();
|
|
+
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+
|
|
+ trace_bucket_alloc_fail(ca, reserve);
|
|
+ return ERR_PTR(-FREELIST_EMPTY);
|
|
+out:
|
|
+ verify_not_on_freelist(c, ca, b);
|
|
+
|
|
+ ob = bch2_open_bucket_alloc(c);
|
|
+
|
|
+ spin_lock(&ob->lock);
|
|
+
|
|
+ ob->valid = true;
|
|
+ ob->sectors_free = ca->mi.bucket_size;
|
|
+ ob->alloc_reserve = reserve;
|
|
+ ob->ptr = (struct bch_extent_ptr) {
|
|
+ .type = 1 << BCH_EXTENT_ENTRY_ptr,
|
|
+ .gen = bucket(ca, b)->mark.gen,
|
|
+ .offset = bucket_to_sector(ca, b),
|
|
+ .dev = ca->dev_idx,
|
|
+ };
|
|
+
|
|
+ spin_unlock(&ob->lock);
|
|
+
|
|
+ if (c->blocked_allocate_open_bucket) {
|
|
+ bch2_time_stats_update(
|
|
+ &c->times[BCH_TIME_blocked_allocate_open_bucket],
|
|
+ c->blocked_allocate_open_bucket);
|
|
+ c->blocked_allocate_open_bucket = 0;
|
|
+ }
|
|
+
|
|
+ if (c->blocked_allocate) {
|
|
+ bch2_time_stats_update(
|
|
+ &c->times[BCH_TIME_blocked_allocate],
|
|
+ c->blocked_allocate);
|
|
+ c->blocked_allocate = 0;
|
|
+ }
|
|
+
|
|
+ ca->nr_open_buckets++;
|
|
+ spin_unlock(&c->freelist_lock);
|
|
+
|
|
+ bch2_wake_allocator(ca);
|
|
+
|
|
+ trace_bucket_alloc(ca, reserve);
|
|
+ return ob;
|
|
+}
|
|
+
|
|
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
|
|
+ unsigned l, unsigned r)
|
|
+{
|
|
+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
|
|
+ (stripe->next_alloc[l] < stripe->next_alloc[r]));
|
|
+}
|
|
+
|
|
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
|
|
+
|
|
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
|
|
+ struct dev_stripe_state *stripe,
|
|
+ struct bch_devs_mask *devs)
|
|
+{
|
|
+ struct dev_alloc_list ret = { .nr = 0 };
|
|
+ unsigned i;
|
|
+
|
|
+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
|
|
+ ret.devs[ret.nr++] = i;
|
|
+
|
|
+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_dev_stripe_increment(struct bch_dev *ca,
|
|
+ struct dev_stripe_state *stripe)
|
|
+{
|
|
+ u64 *v = stripe->next_alloc + ca->dev_idx;
|
|
+ u64 free_space = dev_buckets_available(ca);
|
|
+ u64 free_space_inv = free_space
|
|
+ ? div64_u64(1ULL << 48, free_space)
|
|
+ : 1ULL << 48;
|
|
+ u64 scale = *v / 4;
|
|
+
|
|
+ if (*v + free_space_inv >= *v)
|
|
+ *v += free_space_inv;
|
|
+ else
|
|
+ *v = U64_MAX;
|
|
+
|
|
+ for (v = stripe->next_alloc;
|
|
+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
|
|
+ *v = *v < scale ? 0 : *v - scale;
|
|
+}
|
|
+
|
|
+#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0)
|
|
+#define BUCKET_ALLOC_USE_DURABILITY (1 << 1)
|
|
+
|
|
+static void add_new_bucket(struct bch_fs *c,
|
|
+ struct open_buckets *ptrs,
|
|
+ struct bch_devs_mask *devs_may_alloc,
|
|
+ unsigned *nr_effective,
|
|
+ bool *have_cache,
|
|
+ unsigned flags,
|
|
+ struct open_bucket *ob)
|
|
+{
|
|
+ unsigned durability =
|
|
+ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability;
|
|
+
|
|
+ __clear_bit(ob->ptr.dev, devs_may_alloc->d);
|
|
+ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY)
|
|
+ ? durability : 1;
|
|
+ *have_cache |= !durability;
|
|
+
|
|
+ ob_push(c, ptrs, ob);
|
|
+}
|
|
+
|
|
+enum bucket_alloc_ret
|
|
+bch2_bucket_alloc_set(struct bch_fs *c,
|
|
+ struct open_buckets *ptrs,
|
|
+ struct dev_stripe_state *stripe,
|
|
+ struct bch_devs_mask *devs_may_alloc,
|
|
+ unsigned nr_replicas,
|
|
+ unsigned *nr_effective,
|
|
+ bool *have_cache,
|
|
+ enum alloc_reserve reserve,
|
|
+ unsigned flags,
|
|
+ struct closure *cl)
|
|
+{
|
|
+ struct dev_alloc_list devs_sorted =
|
|
+ bch2_dev_alloc_list(c, stripe, devs_may_alloc);
|
|
+ struct bch_dev *ca;
|
|
+ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES;
|
|
+ unsigned i;
|
|
+
|
|
+ BUG_ON(*nr_effective >= nr_replicas);
|
|
+
|
|
+ for (i = 0; i < devs_sorted.nr; i++) {
|
|
+ struct open_bucket *ob;
|
|
+
|
|
+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
|
|
+ if (!ca)
|
|
+ continue;
|
|
+
|
|
+ if (!ca->mi.durability && *have_cache)
|
|
+ continue;
|
|
+
|
|
+ ob = bch2_bucket_alloc(c, ca, reserve,
|
|
+ flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
|
|
+ if (IS_ERR(ob)) {
|
|
+ ret = -PTR_ERR(ob);
|
|
+
|
|
+ if (cl)
|
|
+ return ret;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ add_new_bucket(c, ptrs, devs_may_alloc,
|
|
+ nr_effective, have_cache, flags, ob);
|
|
+
|
|
+ bch2_dev_stripe_increment(ca, stripe);
|
|
+
|
|
+ if (*nr_effective >= nr_replicas)
|
|
+ return ALLOC_SUCCESS;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Allocate from stripes: */
|
|
+
|
|
+/*
|
|
+ * if we can't allocate a new stripe because there are already too many
|
|
+ * partially filled stripes, force allocating from an existing stripe even when
|
|
+ * it's to a device we don't want:
|
|
+ */
|
|
+
|
|
+static enum bucket_alloc_ret
|
|
+bucket_alloc_from_stripe(struct bch_fs *c,
|
|
+ struct open_buckets *ptrs,
|
|
+ struct write_point *wp,
|
|
+ struct bch_devs_mask *devs_may_alloc,
|
|
+ u16 target,
|
|
+ unsigned erasure_code,
|
|
+ unsigned nr_replicas,
|
|
+ unsigned *nr_effective,
|
|
+ bool *have_cache,
|
|
+ unsigned flags,
|
|
+ struct closure *cl)
|
|
+{
|
|
+ struct dev_alloc_list devs_sorted;
|
|
+ struct ec_stripe_head *h;
|
|
+ struct open_bucket *ob;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i, ec_idx;
|
|
+
|
|
+ if (!erasure_code)
|
|
+ return 0;
|
|
+
|
|
+ if (nr_replicas < 2)
|
|
+ return 0;
|
|
+
|
|
+ if (ec_open_bucket(c, ptrs))
|
|
+ return 0;
|
|
+
|
|
+ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1,
|
|
+ wp == &c->copygc_write_point,
|
|
+ cl);
|
|
+ if (IS_ERR(h))
|
|
+ return -PTR_ERR(h);
|
|
+ if (!h)
|
|
+ return 0;
|
|
+
|
|
+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
|
|
+
|
|
+ for (i = 0; i < devs_sorted.nr; i++)
|
|
+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) {
|
|
+ if (!h->s->blocks[ec_idx])
|
|
+ continue;
|
|
+
|
|
+ ob = c->open_buckets + h->s->blocks[ec_idx];
|
|
+ if (ob->ptr.dev == devs_sorted.devs[i] &&
|
|
+ !test_and_set_bit(ec_idx, h->s->blocks_allocated))
|
|
+ goto got_bucket;
|
|
+ }
|
|
+ goto out_put_head;
|
|
+got_bucket:
|
|
+ ca = bch_dev_bkey_exists(c, ob->ptr.dev);
|
|
+
|
|
+ ob->ec_idx = ec_idx;
|
|
+ ob->ec = h->s;
|
|
+
|
|
+ add_new_bucket(c, ptrs, devs_may_alloc,
|
|
+ nr_effective, have_cache, flags, ob);
|
|
+ atomic_inc(&h->s->pin);
|
|
+out_put_head:
|
|
+ bch2_ec_stripe_head_put(c, h);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Sector allocator */
|
|
+
|
|
+static void get_buckets_from_writepoint(struct bch_fs *c,
|
|
+ struct open_buckets *ptrs,
|
|
+ struct write_point *wp,
|
|
+ struct bch_devs_mask *devs_may_alloc,
|
|
+ unsigned nr_replicas,
|
|
+ unsigned *nr_effective,
|
|
+ bool *have_cache,
|
|
+ unsigned flags,
|
|
+ bool need_ec)
|
|
+{
|
|
+ struct open_buckets ptrs_skip = { .nr = 0 };
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
|
|
+
|
|
+ if (*nr_effective < nr_replicas &&
|
|
+ test_bit(ob->ptr.dev, devs_may_alloc->d) &&
|
|
+ (ca->mi.durability ||
|
|
+ (wp->type == BCH_DATA_user && !*have_cache)) &&
|
|
+ (ob->ec || !need_ec)) {
|
|
+ add_new_bucket(c, ptrs, devs_may_alloc,
|
|
+ nr_effective, have_cache,
|
|
+ flags, ob);
|
|
+ } else {
|
|
+ ob_push(c, &ptrs_skip, ob);
|
|
+ }
|
|
+ }
|
|
+ wp->ptrs = ptrs_skip;
|
|
+}
|
|
+
|
|
+static enum bucket_alloc_ret
|
|
+open_bucket_add_buckets(struct bch_fs *c,
|
|
+ struct open_buckets *ptrs,
|
|
+ struct write_point *wp,
|
|
+ struct bch_devs_list *devs_have,
|
|
+ u16 target,
|
|
+ unsigned erasure_code,
|
|
+ unsigned nr_replicas,
|
|
+ unsigned *nr_effective,
|
|
+ bool *have_cache,
|
|
+ enum alloc_reserve reserve,
|
|
+ unsigned flags,
|
|
+ struct closure *_cl)
|
|
+{
|
|
+ struct bch_devs_mask devs;
|
|
+ struct open_bucket *ob;
|
|
+ struct closure *cl = NULL;
|
|
+ enum bucket_alloc_ret ret;
|
|
+ unsigned i;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ devs = target_rw_devs(c, wp->type, target);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ /* Don't allocate from devices we already have pointers to: */
|
|
+ for (i = 0; i < devs_have->nr; i++)
|
|
+ __clear_bit(devs_have->devs[i], devs.d);
|
|
+
|
|
+ open_bucket_for_each(c, ptrs, ob, i)
|
|
+ __clear_bit(ob->ptr.dev, devs.d);
|
|
+
|
|
+ if (erasure_code) {
|
|
+ if (!ec_open_bucket(c, ptrs)) {
|
|
+ get_buckets_from_writepoint(c, ptrs, wp, &devs,
|
|
+ nr_replicas, nr_effective,
|
|
+ have_cache, flags, true);
|
|
+ if (*nr_effective >= nr_replicas)
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (!ec_open_bucket(c, ptrs)) {
|
|
+ ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs,
|
|
+ target, erasure_code,
|
|
+ nr_replicas, nr_effective,
|
|
+ have_cache, flags, _cl);
|
|
+ if (ret == FREELIST_EMPTY ||
|
|
+ ret == OPEN_BUCKETS_EMPTY)
|
|
+ return ret;
|
|
+ if (*nr_effective >= nr_replicas)
|
|
+ return 0;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ get_buckets_from_writepoint(c, ptrs, wp, &devs,
|
|
+ nr_replicas, nr_effective,
|
|
+ have_cache, flags, false);
|
|
+ if (*nr_effective >= nr_replicas)
|
|
+ return 0;
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ rcu_read_lock();
|
|
+
|
|
+retry_blocking:
|
|
+ /*
|
|
+ * Try nonblocking first, so that if one device is full we'll try from
|
|
+ * other devices:
|
|
+ */
|
|
+ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
|
|
+ nr_replicas, nr_effective, have_cache,
|
|
+ reserve, flags, cl);
|
|
+ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) {
|
|
+ cl = _cl;
|
|
+ goto retry_blocking;
|
|
+ }
|
|
+
|
|
+ rcu_read_unlock();
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
|
|
+ struct open_buckets *obs)
|
|
+{
|
|
+ struct open_buckets ptrs = { .nr = 0 };
|
|
+ struct open_bucket *ob, *ob2;
|
|
+ unsigned i, j;
|
|
+
|
|
+ open_bucket_for_each(c, obs, ob, i) {
|
|
+ bool drop = !ca || ob->ptr.dev == ca->dev_idx;
|
|
+
|
|
+ if (!drop && ob->ec) {
|
|
+ mutex_lock(&ob->ec->lock);
|
|
+ for (j = 0; j < ob->ec->new_stripe.key.v.nr_blocks; j++) {
|
|
+ if (!ob->ec->blocks[j])
|
|
+ continue;
|
|
+
|
|
+ ob2 = c->open_buckets + ob->ec->blocks[j];
|
|
+ drop |= ob2->ptr.dev == ca->dev_idx;
|
|
+ }
|
|
+ mutex_unlock(&ob->ec->lock);
|
|
+ }
|
|
+
|
|
+ if (drop)
|
|
+ bch2_open_bucket_put(c, ob);
|
|
+ else
|
|
+ ob_push(c, &ptrs, ob);
|
|
+ }
|
|
+
|
|
+ *obs = ptrs;
|
|
+}
|
|
+
|
|
+void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
|
|
+ struct write_point *wp)
|
|
+{
|
|
+ mutex_lock(&wp->lock);
|
|
+ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs);
|
|
+ mutex_unlock(&wp->lock);
|
|
+}
|
|
+
|
|
+static inline struct hlist_head *writepoint_hash(struct bch_fs *c,
|
|
+ unsigned long write_point)
|
|
+{
|
|
+ unsigned hash =
|
|
+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash)));
|
|
+
|
|
+ return &c->write_points_hash[hash];
|
|
+}
|
|
+
|
|
+static struct write_point *__writepoint_find(struct hlist_head *head,
|
|
+ unsigned long write_point)
|
|
+{
|
|
+ struct write_point *wp;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ hlist_for_each_entry_rcu(wp, head, node)
|
|
+ if (wp->write_point == write_point)
|
|
+ goto out;
|
|
+ wp = NULL;
|
|
+out:
|
|
+ rcu_read_unlock();
|
|
+ return wp;
|
|
+}
|
|
+
|
|
+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
|
|
+{
|
|
+ u64 stranded = c->write_points_nr * c->bucket_size_max;
|
|
+ u64 free = bch2_fs_usage_read_short(c).free;
|
|
+
|
|
+ return stranded * factor > free;
|
|
+}
|
|
+
|
|
+static bool try_increase_writepoints(struct bch_fs *c)
|
|
+{
|
|
+ struct write_point *wp;
|
|
+
|
|
+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) ||
|
|
+ too_many_writepoints(c, 32))
|
|
+ return false;
|
|
+
|
|
+ wp = c->write_points + c->write_points_nr++;
|
|
+ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point));
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool try_decrease_writepoints(struct bch_fs *c,
|
|
+ unsigned old_nr)
|
|
+{
|
|
+ struct write_point *wp;
|
|
+
|
|
+ mutex_lock(&c->write_points_hash_lock);
|
|
+ if (c->write_points_nr < old_nr) {
|
|
+ mutex_unlock(&c->write_points_hash_lock);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ if (c->write_points_nr == 1 ||
|
|
+ !too_many_writepoints(c, 8)) {
|
|
+ mutex_unlock(&c->write_points_hash_lock);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ wp = c->write_points + --c->write_points_nr;
|
|
+
|
|
+ hlist_del_rcu(&wp->node);
|
|
+ mutex_unlock(&c->write_points_hash_lock);
|
|
+
|
|
+ bch2_writepoint_stop(c, NULL, wp);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static struct write_point *writepoint_find(struct bch_fs *c,
|
|
+ unsigned long write_point)
|
|
+{
|
|
+ struct write_point *wp, *oldest;
|
|
+ struct hlist_head *head;
|
|
+
|
|
+ if (!(write_point & 1UL)) {
|
|
+ wp = (struct write_point *) write_point;
|
|
+ mutex_lock(&wp->lock);
|
|
+ return wp;
|
|
+ }
|
|
+
|
|
+ head = writepoint_hash(c, write_point);
|
|
+restart_find:
|
|
+ wp = __writepoint_find(head, write_point);
|
|
+ if (wp) {
|
|
+lock_wp:
|
|
+ mutex_lock(&wp->lock);
|
|
+ if (wp->write_point == write_point)
|
|
+ goto out;
|
|
+ mutex_unlock(&wp->lock);
|
|
+ goto restart_find;
|
|
+ }
|
|
+restart_find_oldest:
|
|
+ oldest = NULL;
|
|
+ for (wp = c->write_points;
|
|
+ wp < c->write_points + c->write_points_nr; wp++)
|
|
+ if (!oldest || time_before64(wp->last_used, oldest->last_used))
|
|
+ oldest = wp;
|
|
+
|
|
+ mutex_lock(&oldest->lock);
|
|
+ mutex_lock(&c->write_points_hash_lock);
|
|
+ if (oldest >= c->write_points + c->write_points_nr ||
|
|
+ try_increase_writepoints(c)) {
|
|
+ mutex_unlock(&c->write_points_hash_lock);
|
|
+ mutex_unlock(&oldest->lock);
|
|
+ goto restart_find_oldest;
|
|
+ }
|
|
+
|
|
+ wp = __writepoint_find(head, write_point);
|
|
+ if (wp && wp != oldest) {
|
|
+ mutex_unlock(&c->write_points_hash_lock);
|
|
+ mutex_unlock(&oldest->lock);
|
|
+ goto lock_wp;
|
|
+ }
|
|
+
|
|
+ wp = oldest;
|
|
+ hlist_del_rcu(&wp->node);
|
|
+ wp->write_point = write_point;
|
|
+ hlist_add_head_rcu(&wp->node, head);
|
|
+ mutex_unlock(&c->write_points_hash_lock);
|
|
+out:
|
|
+ wp->last_used = sched_clock();
|
|
+ return wp;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Get us a write_point we can allocate from, return with it locked:
|
|
+ */
|
|
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
|
|
+ unsigned target,
|
|
+ unsigned erasure_code,
|
|
+ struct write_point_specifier write_point,
|
|
+ struct bch_devs_list *devs_have,
|
|
+ unsigned nr_replicas,
|
|
+ unsigned nr_replicas_required,
|
|
+ enum alloc_reserve reserve,
|
|
+ unsigned flags,
|
|
+ struct closure *cl)
|
|
+{
|
|
+ struct write_point *wp;
|
|
+ struct open_bucket *ob;
|
|
+ struct open_buckets ptrs;
|
|
+ unsigned nr_effective, write_points_nr;
|
|
+ unsigned ob_flags = 0;
|
|
+ bool have_cache;
|
|
+ enum bucket_alloc_ret ret;
|
|
+ int i;
|
|
+
|
|
+ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
|
|
+ ob_flags |= BUCKET_ALLOC_USE_DURABILITY;
|
|
+
|
|
+ BUG_ON(!nr_replicas || !nr_replicas_required);
|
|
+retry:
|
|
+ ptrs.nr = 0;
|
|
+ nr_effective = 0;
|
|
+ write_points_nr = c->write_points_nr;
|
|
+ have_cache = false;
|
|
+
|
|
+ wp = writepoint_find(c, write_point.v);
|
|
+
|
|
+ if (wp->type == BCH_DATA_user)
|
|
+ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
|
|
+
|
|
+ /* metadata may not allocate on cache devices: */
|
|
+ if (wp->type != BCH_DATA_user)
|
|
+ have_cache = true;
|
|
+
|
|
+ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
|
|
+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
|
|
+ target, erasure_code,
|
|
+ nr_replicas, &nr_effective,
|
|
+ &have_cache, reserve,
|
|
+ ob_flags, cl);
|
|
+ } else {
|
|
+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
|
|
+ target, erasure_code,
|
|
+ nr_replicas, &nr_effective,
|
|
+ &have_cache, reserve,
|
|
+ ob_flags, NULL);
|
|
+ if (!ret)
|
|
+ goto alloc_done;
|
|
+
|
|
+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
|
|
+ 0, erasure_code,
|
|
+ nr_replicas, &nr_effective,
|
|
+ &have_cache, reserve,
|
|
+ ob_flags, cl);
|
|
+ }
|
|
+alloc_done:
|
|
+ BUG_ON(!ret && nr_effective < nr_replicas);
|
|
+
|
|
+ if (erasure_code && !ec_open_bucket(c, &ptrs))
|
|
+ pr_debug("failed to get ec bucket: ret %u", ret);
|
|
+
|
|
+ if (ret == INSUFFICIENT_DEVICES &&
|
|
+ nr_effective >= nr_replicas_required)
|
|
+ ret = 0;
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ /* Free buckets we didn't use: */
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
+ open_bucket_free_unused(c, wp, ob);
|
|
+
|
|
+ wp->ptrs = ptrs;
|
|
+
|
|
+ wp->sectors_free = UINT_MAX;
|
|
+
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free);
|
|
+
|
|
+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX);
|
|
+
|
|
+ verify_not_stale(c, &wp->ptrs);
|
|
+
|
|
+ return wp;
|
|
+err:
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
+ if (ptrs.nr < ARRAY_SIZE(ptrs.v))
|
|
+ ob_push(c, &ptrs, ob);
|
|
+ else
|
|
+ open_bucket_free_unused(c, wp, ob);
|
|
+ wp->ptrs = ptrs;
|
|
+
|
|
+ mutex_unlock(&wp->lock);
|
|
+
|
|
+ if (ret == FREELIST_EMPTY &&
|
|
+ try_decrease_writepoints(c, write_points_nr))
|
|
+ goto retry;
|
|
+
|
|
+ switch (ret) {
|
|
+ case OPEN_BUCKETS_EMPTY:
|
|
+ case FREELIST_EMPTY:
|
|
+ return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
|
|
+ case INSUFFICIENT_DEVICES:
|
|
+ return ERR_PTR(-EROFS);
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Append pointers to the space we just allocated to @k, and mark @sectors space
|
|
+ * as allocated out of @ob
|
|
+ */
|
|
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
|
|
+ struct bkey_i *k, unsigned sectors)
|
|
+
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ BUG_ON(sectors > wp->sectors_free);
|
|
+ wp->sectors_free -= sectors;
|
|
+
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
|
|
+ struct bch_extent_ptr tmp = ob->ptr;
|
|
+
|
|
+ tmp.cached = !ca->mi.durability &&
|
|
+ wp->type == BCH_DATA_user;
|
|
+
|
|
+ tmp.offset += ca->mi.bucket_size - ob->sectors_free;
|
|
+ bch2_bkey_append_ptr(k, tmp);
|
|
+
|
|
+ BUG_ON(sectors > ob->sectors_free);
|
|
+ ob->sectors_free -= sectors;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Finish an allocation from @wp: keep the open buckets that still have free
|
|
+ * sectors, put the ones that are full, and drop the write point's lock
|
|
+ */
|
|
+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
|
|
+{
|
|
+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 };
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
|
|
+ wp->ptrs = keep;
|
|
+
|
|
+ mutex_unlock(&wp->lock);
|
|
+
|
|
+ bch2_open_buckets_put(c, &ptrs);
|
|
+}
|
|
+
|
|
+static inline void writepoint_init(struct write_point *wp,
|
|
+ enum bch_data_type type)
|
|
+{
|
|
+ mutex_init(&wp->lock);
|
|
+ wp->type = type;
|
|
+}
|
|
+
|
|
+void bch2_fs_allocator_foreground_init(struct bch_fs *c)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ struct write_point *wp;
|
|
+
|
|
+ mutex_init(&c->write_points_hash_lock);
|
|
+ c->write_points_nr = ARRAY_SIZE(c->write_points);
|
|
+
|
|
+ /* open bucket 0 is a sentinel NULL: */
|
|
+ spin_lock_init(&c->open_buckets[0].lock);
|
|
+
|
|
+ for (ob = c->open_buckets + 1;
|
|
+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) {
|
|
+ spin_lock_init(&ob->lock);
|
|
+ c->open_buckets_nr_free++;
|
|
+
|
|
+ ob->freelist = c->open_buckets_freelist;
|
|
+ c->open_buckets_freelist = ob - c->open_buckets;
|
|
+ }
|
|
+
|
|
+ writepoint_init(&c->btree_write_point, BCH_DATA_btree);
|
|
+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
|
|
+ writepoint_init(&c->copygc_write_point, BCH_DATA_user);
|
|
+
|
|
+ for (wp = c->write_points;
|
|
+ wp < c->write_points + c->write_points_nr; wp++) {
|
|
+ writepoint_init(wp, BCH_DATA_user);
|
|
+
|
|
+ wp->last_used = sched_clock();
|
|
+ wp->write_point = (unsigned long) wp;
|
|
+ hlist_add_head_rcu(&wp->node,
|
|
+ writepoint_hash(c, wp->write_point));
|
|
+ }
|
|
+}
|
|
diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h
|
|
new file mode 100644
|
|
index 000000000000..c658295cb8e0
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/alloc_foreground.h
|
|
@@ -0,0 +1,138 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H
|
|
+#define _BCACHEFS_ALLOC_FOREGROUND_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_types.h"
|
|
+
|
|
+#include <linux/hash.h>
|
|
+
|
|
+struct bkey;
|
|
+struct bch_dev;
|
|
+struct bch_fs;
|
|
+struct bch_devs_list;
|
|
+
|
|
+enum bucket_alloc_ret {
|
|
+ ALLOC_SUCCESS,
|
|
+ OPEN_BUCKETS_EMPTY,
|
|
+ FREELIST_EMPTY, /* Allocator thread not keeping up */
|
|
+ INSUFFICIENT_DEVICES,
|
|
+};
|
|
+
|
|
+struct dev_alloc_list {
|
|
+ unsigned nr;
|
|
+ u8 devs[BCH_SB_MEMBERS_MAX];
|
|
+};
|
|
+
|
|
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
|
|
+ struct dev_stripe_state *,
|
|
+ struct bch_devs_mask *);
|
|
+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
|
|
+
|
|
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
|
|
+
|
|
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *,
|
|
+ enum alloc_reserve, bool,
|
|
+ struct closure *);
|
|
+
|
|
+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
|
|
+ struct open_bucket *ob)
|
|
+{
|
|
+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v));
|
|
+
|
|
+ obs->v[obs->nr++] = ob - c->open_buckets;
|
|
+}
|
|
+
|
|
+#define open_bucket_for_each(_c, _obs, _ob, _i) \
|
|
+ for ((_i) = 0; \
|
|
+ (_i) < (_obs)->nr && \
|
|
+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \
|
|
+ (_i)++)
|
|
+
|
|
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
|
|
+ struct open_buckets *obs)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, obs, ob, i)
|
|
+ if (ob->ec)
|
|
+ return ob;
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_open_bucket_write_error(struct bch_fs *,
|
|
+ struct open_buckets *, unsigned);
|
|
+
|
|
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
|
|
+
|
|
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
|
|
+{
|
|
+ if (atomic_dec_and_test(&ob->pin))
|
|
+ __bch2_open_bucket_put(c, ob);
|
|
+}
|
|
+
|
|
+static inline void bch2_open_buckets_put(struct bch_fs *c,
|
|
+ struct open_buckets *ptrs)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, ptrs, ob, i)
|
|
+ bch2_open_bucket_put(c, ob);
|
|
+ ptrs->nr = 0;
|
|
+}
|
|
+
|
|
+static inline void bch2_open_bucket_get(struct bch_fs *c,
|
|
+ struct write_point *wp,
|
|
+ struct open_buckets *ptrs)
|
|
+{
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
|
|
+ ob->type = wp->type;
|
|
+ atomic_inc(&ob->pin);
|
|
+ ob_push(c, ptrs, ob);
|
|
+ }
|
|
+}
|
|
+
|
|
+enum bucket_alloc_ret
|
|
+bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
|
|
+ struct dev_stripe_state *, struct bch_devs_mask *,
|
|
+ unsigned, unsigned *, bool *, enum alloc_reserve,
|
|
+ unsigned, struct closure *);
|
|
+
|
|
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
|
|
+ unsigned, unsigned,
|
|
+ struct write_point_specifier,
|
|
+ struct bch_devs_list *,
|
|
+ unsigned, unsigned,
|
|
+ enum alloc_reserve,
|
|
+ unsigned,
|
|
+ struct closure *);
|
|
+
|
|
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
|
|
+ struct bkey_i *, unsigned);
|
|
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
|
|
+
|
|
+void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
|
|
+ struct open_buckets *);
|
|
+
|
|
+void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
|
|
+ struct write_point *);
|
|
+
|
|
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
|
|
+{
|
|
+ return (struct write_point_specifier) { .v = v | 1 };
|
|
+}
|
|
+
|
|
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
|
|
+{
|
|
+ return (struct write_point_specifier) { .v = (unsigned long) wp };
|
|
+}
|
|
+
|
|
+void bch2_fs_allocator_foreground_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
|
|
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
|
|
new file mode 100644
|
|
index 000000000000..4a1cd8b73d16
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/alloc_types.h
|
|
@@ -0,0 +1,98 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ALLOC_TYPES_H
|
|
+#define _BCACHEFS_ALLOC_TYPES_H
|
|
+
|
|
+#include <linux/mutex.h>
|
|
+#include <linux/spinlock.h>
|
|
+
|
|
+#include "clock_types.h"
|
|
+#include "fifo.h"
|
|
+
|
|
+struct ec_bucket_buf;
|
|
+
|
|
+#define ALLOC_THREAD_STATES() \
|
|
+ x(stopped) \
|
|
+ x(running) \
|
|
+ x(blocked) \
|
|
+ x(blocked_full)
|
|
+
|
|
+enum allocator_states {
|
|
+#define x(n) ALLOCATOR_##n,
|
|
+ ALLOC_THREAD_STATES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+enum alloc_reserve {
|
|
+ RESERVE_BTREE_MOVINGGC = -2,
|
|
+ RESERVE_BTREE = -1,
|
|
+ RESERVE_MOVINGGC = 0,
|
|
+ RESERVE_NONE = 1,
|
|
+ RESERVE_NR = 2,
|
|
+};
|
|
+
|
|
+typedef FIFO(long) alloc_fifo;
|
|
+
|
|
+#define OPEN_BUCKETS_COUNT 1024
|
|
+
|
|
+#define WRITE_POINT_HASH_NR 32
|
|
+#define WRITE_POINT_MAX 32
|
|
+
|
|
+typedef u16 open_bucket_idx_t;
|
|
+
|
|
+struct open_bucket {
|
|
+ spinlock_t lock;
|
|
+ atomic_t pin;
|
|
+ open_bucket_idx_t freelist;
|
|
+
|
|
+ /*
|
|
+ * When an open bucket has an ec_stripe attached, this is the index of
|
|
+ * the block in the stripe this open_bucket corresponds to:
|
|
+ */
|
|
+ u8 ec_idx;
|
|
+ u8 type;
|
|
+ unsigned valid:1;
|
|
+ unsigned on_partial_list:1;
|
|
+ int alloc_reserve:3;
|
|
+ unsigned sectors_free;
|
|
+ struct bch_extent_ptr ptr;
|
|
+ struct ec_stripe_new *ec;
|
|
+};
|
|
+
|
|
+#define OPEN_BUCKET_LIST_MAX 15
|
|
+
|
|
+struct open_buckets {
|
|
+ open_bucket_idx_t nr;
|
|
+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX];
|
|
+};
|
|
+
|
|
+struct dev_stripe_state {
|
|
+ u64 next_alloc[BCH_SB_MEMBERS_MAX];
|
|
+};
|
|
+
|
|
+struct write_point {
|
|
+ struct hlist_node node;
|
|
+ struct mutex lock;
|
|
+ u64 last_used;
|
|
+ unsigned long write_point;
|
|
+ enum bch_data_type type;
|
|
+
|
|
+ /* calculated based on how many pointers we're actually going to use: */
|
|
+ unsigned sectors_free;
|
|
+
|
|
+ struct open_buckets ptrs;
|
|
+ struct dev_stripe_state stripe;
|
|
+};
|
|
+
|
|
+struct write_point_specifier {
|
|
+ unsigned long v;
|
|
+};
|
|
+
|
|
+struct alloc_heap_entry {
|
|
+ size_t bucket;
|
|
+ size_t nr;
|
|
+ unsigned long key;
|
|
+};
|
|
+
|
|
+typedef HEAP(struct alloc_heap_entry) alloc_heap;
|
|
+
|
|
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
|
|
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
|
|
new file mode 100644
|
|
index 000000000000..24aa2cc7d965
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bcachefs.h
|
|
@@ -0,0 +1,909 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_H
|
|
+#define _BCACHEFS_H
|
|
+
|
|
+/*
|
|
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
|
|
+ *
|
|
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
|
|
+ *
|
|
+ * Support for multiple cache devices hasn't quite been finished off yet, but
|
|
+ * it's about 95% plumbed through. A cache set and its cache devices is sort of
|
|
+ * like a md raid array and its component devices. Most of the code doesn't care
|
|
+ * about individual cache devices, the main abstraction is the cache set.
|
|
+ *
|
|
+ * Multiple cache devices is intended to give us the ability to mirror dirty
|
|
+ * cached data and metadata, without mirroring clean cached data.
|
|
+ *
|
|
+ * Backing devices are different, in that they have a lifetime independent of a
|
|
+ * cache set. When you register a newly formatted backing device it'll come up
|
|
+ * in passthrough mode, and then you can attach and detach a backing device from
|
|
+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly
|
|
+ * invalidates any cached data for that backing device.
|
|
+ *
|
|
+ * A cache set can have multiple (many) backing devices attached to it.
|
|
+ *
|
|
+ * There's also flash only volumes - this is the reason for the distinction
|
|
+ * between struct cached_dev and struct bcache_device. A flash only volume
|
|
+ * works much like a bcache device that has a backing device, except the
|
|
+ * "cached" data is always dirty. The end result is that we get thin
|
|
+ * provisioning with very little additional code.
|
|
+ *
|
|
+ * Flash only volumes work but they're not production ready because the moving
|
|
+ * garbage collector needs more work. More on that later.
|
|
+ *
|
|
+ * BUCKETS/ALLOCATION:
|
|
+ *
|
|
+ * Bcache is primarily designed for caching, which means that in normal
|
|
+ * operation all of our available space will be allocated. Thus, we need an
|
|
+ * efficient way of deleting things from the cache so we can write new things to
|
|
+ * it.
|
|
+ *
|
|
+ * To do this, we first divide the cache device up into buckets. A bucket is the
|
|
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
|
|
+ * works efficiently.
|
|
+ *
|
|
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
|
|
+ * it. The gens and priorities for all the buckets are stored contiguously and
|
|
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
|
|
+ * of bcache's metadata is stored in buckets).
|
|
+ *
|
|
+ * The priority is used to implement an LRU. We reset a bucket's priority when
|
|
+ * we allocate it or on cache hit, and every so often we decrement the priority
|
|
+ * of each bucket. It could be used to implement something more sophisticated,
|
|
+ * if anyone ever gets around to it.
|
|
+ *
|
|
+ * The generation is used for invalidating buckets. Each pointer also has an 8
|
|
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
|
|
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
|
|
+ * we have to do is increment its gen (and write its new gen to disk; we batch
|
|
+ * this up).
|
|
+ *
|
|
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
|
|
+ * contain metadata (including btree nodes).
|
|
+ *
|
|
+ * THE BTREE:
|
|
+ *
|
|
+ * Bcache is in large part designed around the btree.
|
|
+ *
|
|
+ * At a high level, the btree is just an index of key -> ptr tuples.
|
|
+ *
|
|
+ * Keys represent extents, and thus have a size field. Keys also have a variable
|
|
+ * number of pointers attached to them (potentially zero, which is handy for
|
|
+ * invalidating the cache).
|
|
+ *
|
|
+ * The key itself is an inode:offset pair. The inode number corresponds to a
|
|
+ * backing device or a flash only volume. The offset is the ending offset of the
|
|
+ * extent within the inode - not the starting offset; this makes lookups
|
|
+ * slightly more convenient.
|
|
+ *
|
|
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
|
|
+ * generation number. More on the gen later.
|
|
+ *
|
|
+ * Index lookups are not fully abstracted - cache lookups in particular are
|
|
+ * still somewhat mixed in with the btree code, but things are headed in that
|
|
+ * direction.
|
|
+ *
|
|
+ * Updates are fairly well abstracted, though. There are two different ways of
|
|
+ * updating the btree; insert and replace.
|
|
+ *
|
|
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
|
|
+ * overwriting (possibly only partially) any extents they overlap with. This is
|
|
+ * used to update the index after a write.
|
|
+ *
|
|
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
|
|
+ * overwriting a key that matches another given key. This is used for inserting
|
|
+ * data into the cache after a cache miss, and for background writeback, and for
|
|
+ * the moving garbage collector.
|
|
+ *
|
|
+ * There is no "delete" operation; deleting things from the index is
|
|
+ * accomplished either by invalidating pointers (by incrementing a bucket's
|
|
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
|
|
+ * previously present at that location in the index.
|
|
+ *
|
|
+ * This means that there are always stale/invalid keys in the btree. They're
|
|
+ * filtered out by the code that iterates through a btree node, and removed when
|
|
+ * a btree node is rewritten.
|
|
+ *
|
|
+ * BTREE NODES:
|
|
+ *
|
|
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
|
|
+ * free smaller than a bucket - so, that's how big our btree nodes are.
|
|
+ *
|
|
+ * (If buckets are really big we'll only use part of the bucket for a btree node
|
|
+ * - no less than 1/4th - but a bucket still contains no more than a single
|
|
+ * btree node. I'd actually like to change this, but for now we rely on the
|
|
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
|
|
+ *
|
|
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
|
|
+ * btree implementation.
|
|
+ *
|
|
+ * The way this is solved is that btree nodes are internally log structured; we
|
|
+ * can append new keys to an existing btree node without rewriting it. This
|
|
+ * means each set of keys we write is sorted, but the node is not.
|
|
+ *
|
|
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
|
|
+ * be expensive, and we have to distinguish between the keys we have written and
|
|
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
|
|
+ * each sorted set. But we do merge written sets together lazily, so the cost of
|
|
+ * these extra searches is quite low (normally most of the keys in a btree node
|
|
+ * will be in one big set, and then there'll be one or two sets that are much
|
|
+ * smaller).
|
|
+ *
|
|
+ * This log structure makes bcache's btree more of a hybrid between a
|
|
+ * conventional btree and a compacting data structure, with some of the
|
|
+ * advantages of both.
|
|
+ *
|
|
+ * GARBAGE COLLECTION:
|
|
+ *
|
|
+ * We can't just invalidate any bucket - it might contain dirty data or
|
|
+ * metadata. If it once contained dirty data, other writes might overwrite it
|
|
+ * later, leaving no valid pointers into that bucket in the index.
|
|
+ *
|
|
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
|
|
+ * It also counts how much valid data each bucket currently contains, so that
|
|
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
|
|
+ *
|
|
+ * It also does some things that are really internal to the btree
|
|
+ * implementation. If a btree node contains pointers that are stale by more than
|
|
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
|
|
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
|
|
+ *
|
|
+ * THE JOURNAL:
|
|
+ *
|
|
+ * Bcache's journal is not necessary for consistency; we always strictly
|
|
+ * order metadata writes so that the btree and everything else is consistent on
|
|
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
|
|
+ * caching (with recovery from unclean shutdown) before journalling was
|
|
+ * implemented.
|
|
+ *
|
|
+ * Rather, the journal is purely a performance optimization; we can't complete a
|
|
+ * write until we've updated the index on disk, otherwise the cache would be
|
|
+ * inconsistent in the event of an unclean shutdown. This means that without the
|
|
+ * journal, on random write workloads we constantly have to update all the leaf
|
|
+ * nodes in the btree, and those writes will be mostly empty (appending at most
|
|
+ * a few keys each) - highly inefficient in terms of amount of metadata writes,
|
|
+ * and it puts more strain on the various btree resorting/compacting code.
|
|
+ *
|
|
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
|
|
+ * all the keys in the open journal entries. That means that when we're updating
|
|
+ * a node in the btree, we can wait until a 4k block of keys fills up before
|
|
+ * writing them out.
|
|
+ *
|
|
+ * For simplicity, we only journal updates to leaf nodes; updates to parent
|
|
+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth
|
|
+ * the complexity to deal with journalling them (in particular, journal replay)
|
|
+ * - updates to non leaf nodes just happen synchronously (see btree_split()).
|
|
+ */
|
|
+
|
|
+#undef pr_fmt
|
|
+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__
|
|
+
|
|
+#include <linux/bug.h>
|
|
+#include <linux/bio.h>
|
|
+#include <linux/closure.h>
|
|
+#include <linux/kobject.h>
|
|
+#include <linux/list.h>
|
|
+#include <linux/math64.h>
|
|
+#include <linux/mutex.h>
|
|
+#include <linux/percpu-refcount.h>
|
|
+#include <linux/percpu-rwsem.h>
|
|
+#include <linux/rhashtable.h>
|
|
+#include <linux/rwsem.h>
|
|
+#include <linux/semaphore.h>
|
|
+#include <linux/seqlock.h>
|
|
+#include <linux/shrinker.h>
|
|
+#include <linux/srcu.h>
|
|
+#include <linux/types.h>
|
|
+#include <linux/workqueue.h>
|
|
+#include <linux/zstd.h>
|
|
+
|
|
+#include "bcachefs_format.h"
|
|
+#include "fifo.h"
|
|
+#include "opts.h"
|
|
+#include "util.h"
|
|
+
|
|
+#define dynamic_fault(...) 0
|
|
+#define race_fault(...) 0
|
|
+
|
|
+#define bch2_fs_init_fault(name) \
|
|
+ dynamic_fault("bcachefs:bch_fs_init:" name)
|
|
+#define bch2_meta_read_fault(name) \
|
|
+ dynamic_fault("bcachefs:meta:read:" name)
|
|
+#define bch2_meta_write_fault(name) \
|
|
+ dynamic_fault("bcachefs:meta:write:" name)
|
|
+
|
|
+#ifdef __KERNEL__
|
|
+#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name)
|
|
+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum)
|
|
+#else
|
|
+#define bch2_fmt(_c, fmt) fmt "\n"
|
|
+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum)
|
|
+#endif
|
|
+
|
|
+#define bch_info(c, fmt, ...) \
|
|
+ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
|
|
+#define bch_notice(c, fmt, ...) \
|
|
+ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
|
|
+#define bch_warn(c, fmt, ...) \
|
|
+ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
|
|
+#define bch_warn_ratelimited(c, fmt, ...) \
|
|
+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
|
|
+#define bch_err(c, fmt, ...) \
|
|
+ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
|
|
+
|
|
+#define bch_err_ratelimited(c, fmt, ...) \
|
|
+ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
|
|
+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
|
|
+ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
|
|
+
|
|
+#define bch_verbose(c, fmt, ...) \
|
|
+do { \
|
|
+ if ((c)->opts.verbose) \
|
|
+ bch_info(c, fmt, ##__VA_ARGS__); \
|
|
+} while (0)
|
|
+
|
|
+#define pr_verbose_init(opts, fmt, ...) \
|
|
+do { \
|
|
+ if (opt_get(opts, verbose)) \
|
|
+ pr_info(fmt, ##__VA_ARGS__); \
|
|
+} while (0)
|
|
+
|
|
+/* Parameters that are useful for debugging, but should always be compiled in: */
|
|
+#define BCH_DEBUG_PARAMS_ALWAYS() \
|
|
+ BCH_DEBUG_PARAM(key_merging_disabled, \
|
|
+ "Disables merging of extents") \
|
|
+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \
|
|
+ "Causes mark and sweep to compact and rewrite every " \
|
|
+ "btree node it traverses") \
|
|
+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \
|
|
+ "Disables rewriting of btree nodes during mark and sweep")\
|
|
+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \
|
|
+ "Disables the shrinker callback for the btree node cache")\
|
|
+ BCH_DEBUG_PARAM(verify_btree_ondisk, \
|
|
+ "Reread btree nodes at various points to verify the " \
|
|
+ "mergesort in the read path against modifications " \
|
|
+ "done in memory")
|
|
+
|
|
+/* Parameters that should only be compiled in in debug mode: */
|
|
+#define BCH_DEBUG_PARAMS_DEBUG() \
|
|
+ BCH_DEBUG_PARAM(expensive_debug_checks, \
|
|
+ "Enables various runtime debugging checks that " \
|
|
+ "significantly affect performance") \
|
|
+ BCH_DEBUG_PARAM(debug_check_iterators, \
|
|
+ "Enables extra verification for btree iterators") \
|
|
+ BCH_DEBUG_PARAM(debug_check_bkeys, \
|
|
+ "Run bkey_debugcheck (primarily checking GC/allocation "\
|
|
+ "information) when iterating over keys") \
|
|
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
|
|
+ "Verify btree accounting for keys within a node") \
|
|
+ BCH_DEBUG_PARAM(journal_seq_verify, \
|
|
+ "Store the journal sequence number in the version " \
|
|
+ "number of every btree key, and verify that btree " \
|
|
+ "update ordering is preserved during recovery") \
|
|
+ BCH_DEBUG_PARAM(inject_invalid_keys, /* NOTE(review): text below is a verbatim copy of journal_seq_verify's; needs its own description */ \
|
|
+ "Store the journal sequence number in the version " \
|
|
+ "number of every btree key, and verify that btree " \
|
|
+ "update ordering is preserved during recovery") \
|
|
+ BCH_DEBUG_PARAM(test_alloc_startup, \
|
|
+ "Force allocator startup to use the slowpath where it " \
|
|
+ "can't find enough free buckets without invalidating " \
|
|
+ "cached data") \
|
|
+ BCH_DEBUG_PARAM(force_reconstruct_read, \
|
|
+ "Force reads to use the reconstruct path, when reading " \
|
|
+ "from erasure coded extents") \
|
|
+ BCH_DEBUG_PARAM(test_restart_gc, \
|
|
+ "Test restarting mark and sweep gc when bucket gens change")
|
|
+
|
|
+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL()
|
|
+#else
|
|
+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
|
|
+#endif
|
|
+
|
|
+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
|
|
+BCH_DEBUG_PARAMS()
|
|
+#undef BCH_DEBUG_PARAM
|
|
+
|
|
+#ifndef CONFIG_BCACHEFS_DEBUG
|
|
+#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name;
|
|
+BCH_DEBUG_PARAMS_DEBUG()
|
|
+#undef BCH_DEBUG_PARAM
|
|
+#endif
|
|
+
|
|
+#define BCH_TIME_STATS() \
|
|
+ x(btree_node_mem_alloc) \
|
|
+ x(btree_node_split) \
|
|
+ x(btree_node_sort) \
|
|
+ x(btree_node_read) \
|
|
+ x(btree_gc) \
|
|
+ x(btree_lock_contended_read) \
|
|
+ x(btree_lock_contended_intent) \
|
|
+ x(btree_lock_contended_write) \
|
|
+ x(data_write) \
|
|
+ x(data_read) \
|
|
+ x(data_promote) \
|
|
+ x(journal_write) \
|
|
+ x(journal_delay) \
|
|
+ x(journal_flush_seq) \
|
|
+ x(blocked_journal) \
|
|
+ x(blocked_allocate) \
|
|
+ x(blocked_allocate_open_bucket)
|
|
+
|
|
+enum bch_time_stats {
|
|
+#define x(name) BCH_TIME_##name,
|
|
+ BCH_TIME_STATS()
|
|
+#undef x
|
|
+ BCH_TIME_STAT_NR
|
|
+};
|
|
+
|
|
+#include "alloc_types.h"
|
|
+#include "btree_types.h"
|
|
+#include "buckets_types.h"
|
|
+#include "clock_types.h"
|
|
+#include "ec_types.h"
|
|
+#include "journal_types.h"
|
|
+#include "keylist_types.h"
|
|
+#include "quota_types.h"
|
|
+#include "rebalance_types.h"
|
|
+#include "replicas_types.h"
|
|
+#include "super_types.h"
|
|
+
|
|
+/* Number of nodes btree coalesce will try to coalesce at once */
|
|
+#define GC_MERGE_NODES 4U
|
|
+
|
|
+/* Maximum number of nodes we might need to allocate atomically: */
|
|
+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1))
|
|
+
|
|
+/* Size of the freelist we allocate btree nodes from: */
|
|
+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4)
|
|
+
|
|
+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX)
|
|
+
|
|
+struct btree;
|
|
+
|
|
+enum gc_phase {
|
|
+ GC_PHASE_NOT_RUNNING,
|
|
+ GC_PHASE_START,
|
|
+ GC_PHASE_SB,
|
|
+
|
|
+ GC_PHASE_BTREE_stripes,
|
|
+ GC_PHASE_BTREE_extents,
|
|
+ GC_PHASE_BTREE_inodes,
|
|
+ GC_PHASE_BTREE_dirents,
|
|
+ GC_PHASE_BTREE_xattrs,
|
|
+ GC_PHASE_BTREE_alloc,
|
|
+ GC_PHASE_BTREE_quotas,
|
|
+ GC_PHASE_BTREE_reflink,
|
|
+
|
|
+ GC_PHASE_PENDING_DELETE,
|
|
+};
|
|
+
|
|
+struct gc_pos {
|
|
+ enum gc_phase phase;
|
|
+ struct bpos pos;
|
|
+ unsigned level;
|
|
+};
|
|
+
|
|
+struct io_count {
|
|
+ u64 sectors[2][BCH_DATA_NR];
|
|
+};
|
|
+
|
|
+struct bch_dev {
|
|
+ struct kobject kobj;
|
|
+ struct percpu_ref ref;
|
|
+ struct completion ref_completion;
|
|
+ struct percpu_ref io_ref;
|
|
+ struct completion io_ref_completion;
|
|
+
|
|
+ struct bch_fs *fs;
|
|
+
|
|
+ u8 dev_idx;
|
|
+ /*
|
|
+ * Cached version of this device's member info from superblock
|
|
+ * Committed by bch2_write_super() -> bch_fs_mi_update()
|
|
+ */
|
|
+ struct bch_member_cpu mi;
|
|
+ uuid_le uuid;
|
|
+ char name[BDEVNAME_SIZE];
|
|
+
|
|
+ struct bch_sb_handle disk_sb;
|
|
+ struct bch_sb *sb_read_scratch;
|
|
+ int sb_write_error;
|
|
+
|
|
+ struct bch_devs_mask self;
|
|
+
|
|
+ /* biosets used in cloned bios for writing multiple replicas */
|
|
+ struct bio_set replica_set;
|
|
+
|
|
+ /*
|
|
+ * Buckets:
|
|
+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and
|
|
+ * gc_lock, for device resize - holding any is sufficient for access:
|
|
+ * Or rcu_read_lock(), but only for ptr_stale():
|
|
+ */
|
|
+ struct bucket_array __rcu *buckets[2];
|
|
+ unsigned long *buckets_nouse;
|
|
+ struct rw_semaphore bucket_lock;
|
|
+
|
|
+ struct bch_dev_usage *usage_base;
|
|
+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
|
|
+ struct bch_dev_usage __percpu *usage_gc;
|
|
+
|
|
+ /* Allocator: */
|
|
+ struct task_struct __rcu *alloc_thread;
|
|
+
|
|
+ /*
|
|
+ * free: Buckets that are ready to be used
|
|
+ *
|
|
+ * free_inc: Incoming buckets - these are buckets that currently have
|
|
+ * cached data in them, and we can't reuse them until after we write
|
|
+ * their new gen to disk. After prio_write() finishes writing the new
|
|
+ * gens/prios, they'll be moved to the free list (and possibly discarded
|
|
+ * in the process)
|
|
+ */
|
|
+ alloc_fifo free[RESERVE_NR];
|
|
+ alloc_fifo free_inc;
|
|
+ unsigned nr_open_buckets;
|
|
+
|
|
+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT];
|
|
+ open_bucket_idx_t open_buckets_partial_nr;
|
|
+
|
|
+ size_t fifo_last_bucket;
|
|
+
|
|
+ size_t inc_gen_needs_gc;
|
|
+ size_t inc_gen_really_needs_gc;
|
|
+
|
|
+ enum allocator_states allocator_state;
|
|
+
|
|
+ alloc_heap alloc_heap;
|
|
+
|
|
+ atomic64_t rebalance_work;
|
|
+
|
|
+ struct journal_device journal;
|
|
+ u64 prev_journal_sector;
|
|
+
|
|
+ struct work_struct io_error_work;
|
|
+
|
|
+ /* The rest of this all shows up in sysfs */
|
|
+ atomic64_t cur_latency[2];
|
|
+ struct time_stats io_latency[2];
|
|
+
|
|
+#define CONGESTED_MAX 1024
|
|
+ atomic_t congested;
|
|
+ u64 congested_last;
|
|
+
|
|
+ struct io_count __percpu *io_done;
|
|
+};
|
|
+
|
|
+enum {
|
|
+ /* startup: */
|
|
+ BCH_FS_ALLOC_READ_DONE,
|
|
+ BCH_FS_ALLOC_CLEAN,
|
|
+ BCH_FS_ALLOCATOR_RUNNING,
|
|
+ BCH_FS_ALLOCATOR_STOPPING,
|
|
+ BCH_FS_INITIAL_GC_DONE,
|
|
+ BCH_FS_INITIAL_GC_UNFIXED,
|
|
+ BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
|
|
+ BCH_FS_FSCK_DONE,
|
|
+ BCH_FS_STARTED,
|
|
+ BCH_FS_RW,
|
|
+ BCH_FS_WAS_RW,
|
|
+
|
|
+ /* shutdown: */
|
|
+ BCH_FS_STOPPING,
|
|
+ BCH_FS_EMERGENCY_RO,
|
|
+ BCH_FS_WRITE_DISABLE_COMPLETE,
|
|
+
|
|
+ /* errors: */
|
|
+ BCH_FS_ERROR,
|
|
+ BCH_FS_TOPOLOGY_ERROR,
|
|
+ BCH_FS_ERRORS_FIXED,
|
|
+ BCH_FS_ERRORS_NOT_FIXED,
|
|
+
|
|
+ /* misc: */
|
|
+ BCH_FS_NEED_ANOTHER_GC,
|
|
+ BCH_FS_DELETED_NODES,
|
|
+ BCH_FS_NEED_ALLOC_WRITE,
|
|
+ BCH_FS_REBUILD_REPLICAS,
|
|
+ BCH_FS_HOLD_BTREE_WRITES,
|
|
+};
|
|
+
|
|
+struct btree_debug {
|
|
+ unsigned id;
|
|
+ struct dentry *btree;
|
|
+ struct dentry *btree_format;
|
|
+ struct dentry *failed;
|
|
+};
|
|
+
|
|
+struct bch_fs_pcpu {
|
|
+ u64 sectors_available;
|
|
+};
|
|
+
|
|
+struct journal_seq_blacklist_table {
|
|
+ size_t nr; /* presumably the number of elements in entries[] - confirm at the allocation site */
|
|
+ struct journal_seq_blacklist_table_entry {
|
|
+ u64 start;
|
|
+ u64 end;
|
|
+ bool dirty;
|
|
+ } entries[]; /* C99 flexible array member (replaces the GNU zero-length array) */
|
|
+};
|
|
+
|
|
+struct journal_keys {
|
|
+ struct journal_key {
|
|
+ enum btree_id btree_id:8;
|
|
+ unsigned level:8;
|
|
+ bool allocated;
|
|
+ struct bkey_i *k;
|
|
+ u32 journal_seq;
|
|
+ u32 journal_offset;
|
|
+ } *d;
|
|
+ size_t nr;
|
|
+ size_t size;
|
|
+ u64 journal_seq_base;
|
|
+};
|
|
+
|
|
+struct btree_iter_buf {
|
|
+ struct btree_iter *iter;
|
|
+};
|
|
+
|
|
+#define REPLICAS_DELTA_LIST_MAX (1U << 16)
|
|
+
|
|
+struct bch_fs {
|
|
+ struct closure cl;
|
|
+
|
|
+ struct list_head list;
|
|
+ struct kobject kobj;
|
|
+ struct kobject internal;
|
|
+ struct kobject opts_dir;
|
|
+ struct kobject time_stats;
|
|
+ unsigned long flags;
|
|
+
|
|
+ int minor;
|
|
+ struct device *chardev;
|
|
+ struct super_block *vfs_sb;
|
|
+ char name[40];
|
|
+
|
|
+ /* ro/rw, add/remove/resize devices: */
|
|
+ struct rw_semaphore state_lock;
|
|
+
|
|
+ /* Counts outstanding writes, for clean transition to read-only */
|
|
+ struct percpu_ref writes;
|
|
+ struct work_struct read_only_work;
|
|
+
|
|
+ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
|
|
+
|
|
+ struct bch_replicas_cpu replicas;
|
|
+ struct bch_replicas_cpu replicas_gc;
|
|
+ struct mutex replicas_gc_lock;
|
|
+ mempool_t replicas_delta_pool;
|
|
+
|
|
+ struct journal_entry_res btree_root_journal_res;
|
|
+ struct journal_entry_res replicas_journal_res;
|
|
+ struct journal_entry_res clock_journal_res;
|
|
+ struct journal_entry_res dev_usage_journal_res;
|
|
+
|
|
+ struct bch_disk_groups_cpu __rcu *disk_groups;
|
|
+
|
|
+ struct bch_opts opts;
|
|
+
|
|
+ /* Updated by bch2_sb_update():*/
|
|
+ struct {
|
|
+ uuid_le uuid;
|
|
+ uuid_le user_uuid;
|
|
+
|
|
+ u16 version;
|
|
+ u16 version_min;
|
|
+ u16 encoded_extent_max;
|
|
+
|
|
+ u8 nr_devices;
|
|
+ u8 clean;
|
|
+
|
|
+ u8 encryption_type;
|
|
+
|
|
+ u64 time_base_lo;
|
|
+ u32 time_base_hi;
|
|
+ unsigned time_units_per_sec;
|
|
+ unsigned nsec_per_time_unit;
|
|
+ u64 features;
|
|
+ u64 compat;
|
|
+ } sb;
|
|
+
|
|
+
|
|
+ struct bch_sb_handle disk_sb;
|
|
+
|
|
+ unsigned short block_bits; /* ilog2(block_size) */
|
|
+
|
|
+ u16 btree_foreground_merge_threshold;
|
|
+
|
|
+ struct closure sb_write;
|
|
+ struct mutex sb_lock;
|
|
+
|
|
+ /* BTREE CACHE */
|
|
+ struct bio_set btree_bio;
|
|
+
|
|
+ struct btree_root btree_roots[BTREE_ID_NR];
|
|
+ struct mutex btree_root_lock;
|
|
+
|
|
+ struct btree_cache btree_cache;
|
|
+
|
|
+ /*
|
|
+ * Cache of allocated btree nodes - if we allocate a btree node and
|
|
+ * don't use it, if we free it that space can't be reused until going
|
|
+ * _all_ the way through the allocator (which exposes us to a livelock
|
|
+ * when allocating btree reserves fail halfway through) - instead, we
|
|
+ * can stick them here:
|
|
+ */
|
|
+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2];
|
|
+ unsigned btree_reserve_cache_nr;
|
|
+ struct mutex btree_reserve_cache_lock;
|
|
+
|
|
+ mempool_t btree_interior_update_pool;
|
|
+ struct list_head btree_interior_update_list;
|
|
+ struct list_head btree_interior_updates_unwritten;
|
|
+ struct mutex btree_interior_update_lock;
|
|
+ struct closure_waitlist btree_interior_update_wait;
|
|
+
|
|
+ struct workqueue_struct *btree_interior_update_worker;
|
|
+ struct work_struct btree_interior_update_work;
|
|
+
|
|
+ /* btree_iter.c: */
|
|
+ struct mutex btree_trans_lock;
|
|
+ struct list_head btree_trans_list;
|
|
+ mempool_t btree_iters_pool;
|
|
+ mempool_t btree_trans_mem_pool;
|
|
+ struct btree_iter_buf __percpu *btree_iters_bufs;
|
|
+
|
|
+ struct srcu_struct btree_trans_barrier;
|
|
+
|
|
+ struct btree_key_cache btree_key_cache;
|
|
+
|
|
+ struct workqueue_struct *wq;
|
|
+ /* copygc needs its own workqueue for index updates.. */
|
|
+ struct workqueue_struct *copygc_wq;
|
|
+
|
|
+ /* ALLOCATION */
|
|
+ struct bch_devs_mask rw_devs[BCH_DATA_NR];
|
|
+
|
|
+ u64 capacity; /* sectors */
|
|
+
|
|
+ /*
|
|
+ * When capacity _decreases_ (due to a disk being removed), we
|
|
+ * increment capacity_gen - this invalidates outstanding reservations
|
|
+ * and forces them to be revalidated
|
|
+ */
|
|
+ u32 capacity_gen;
|
|
+ unsigned bucket_size_max;
|
|
+
|
|
+ atomic64_t sectors_available;
|
|
+ struct mutex sectors_available_lock;
|
|
+
|
|
+ struct bch_fs_pcpu __percpu *pcpu;
|
|
+
|
|
+ struct percpu_rw_semaphore mark_lock;
|
|
+
|
|
+ seqcount_t usage_lock;
|
|
+ struct bch_fs_usage *usage_base;
|
|
+ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR];
|
|
+ struct bch_fs_usage __percpu *usage_gc;
|
|
+ u64 __percpu *online_reserved;
|
|
+
|
|
+ /* single element mempool: */
|
|
+ struct mutex usage_scratch_lock;
|
|
+ struct bch_fs_usage_online *usage_scratch;
|
|
+
|
|
+ struct io_clock io_clock[2];
|
|
+
|
|
+ /* JOURNAL SEQ BLACKLIST */
|
|
+ struct journal_seq_blacklist_table *
|
|
+ journal_seq_blacklist_table;
|
|
+ struct work_struct journal_seq_blacklist_gc_work;
|
|
+
|
|
+ /* ALLOCATOR */
|
|
+ spinlock_t freelist_lock;
|
|
+ struct closure_waitlist freelist_wait;
|
|
+ u64 blocked_allocate;
|
|
+ u64 blocked_allocate_open_bucket;
|
|
+ open_bucket_idx_t open_buckets_freelist;
|
|
+ open_bucket_idx_t open_buckets_nr_free;
|
|
+ struct closure_waitlist open_buckets_wait;
|
|
+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
|
|
+
|
|
+ struct write_point btree_write_point;
|
|
+ struct write_point rebalance_write_point;
|
|
+
|
|
+ struct write_point write_points[WRITE_POINT_MAX];
|
|
+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR];
|
|
+ struct mutex write_points_hash_lock;
|
|
+ unsigned write_points_nr;
|
|
+
|
|
+ /* GARBAGE COLLECTION */
|
|
+ struct task_struct *gc_thread;
|
|
+ atomic_t kick_gc;
|
|
+ unsigned long gc_count;
|
|
+
|
|
+ enum btree_id gc_gens_btree;
|
|
+ struct bpos gc_gens_pos;
|
|
+
|
|
+ /*
|
|
+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos]
|
|
+ * has been marked by GC.
|
|
+ *
|
|
+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.)
|
|
+ *
|
|
+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
|
|
+ * can read without a lock.
|
|
+ */
|
|
+ seqcount_t gc_pos_lock;
|
|
+ struct gc_pos gc_pos;
|
|
+
|
|
+ /*
|
|
+ * The allocation code needs gc_mark in struct bucket to be correct, but
|
|
+ * it's not while a gc is in progress.
|
|
+ */
|
|
+ struct rw_semaphore gc_lock;
|
|
+
|
|
+ /* IO PATH */
|
|
+ struct semaphore io_in_flight;
|
|
+ struct bio_set bio_read;
|
|
+ struct bio_set bio_read_split;
|
|
+ struct bio_set bio_write;
|
|
+ struct mutex bio_bounce_pages_lock;
|
|
+ mempool_t bio_bounce_pages;
|
|
+ struct rhashtable promote_table;
|
|
+
|
|
+ mempool_t compression_bounce[2];
|
|
+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR];
|
|
+ mempool_t decompress_workspace;
|
|
+ ZSTD_parameters zstd_params;
|
|
+
|
|
+ struct crypto_shash *sha256;
|
|
+ struct crypto_sync_skcipher *chacha20;
|
|
+ struct crypto_shash *poly1305;
|
|
+
|
|
+ atomic64_t key_version;
|
|
+
|
|
+ mempool_t large_bkey_pool;
|
|
+
|
|
+ /* REBALANCE */
|
|
+ struct bch_fs_rebalance rebalance;
|
|
+
|
|
+ /* COPYGC */
|
|
+ struct task_struct *copygc_thread;
|
|
+ copygc_heap copygc_heap;
|
|
+ struct write_point copygc_write_point;
|
|
+ s64 copygc_wait;
|
|
+
|
|
+ /* STRIPES: */
|
|
+ GENRADIX(struct stripe) stripes[2];
|
|
+
|
|
+ ec_stripes_heap ec_stripes_heap;
|
|
+ spinlock_t ec_stripes_heap_lock;
|
|
+
|
|
+ /* ERASURE CODING */
|
|
+ struct list_head ec_stripe_head_list;
|
|
+ struct mutex ec_stripe_head_lock;
|
|
+
|
|
+ struct list_head ec_stripe_new_list;
|
|
+ struct mutex ec_stripe_new_lock;
|
|
+
|
|
+ struct work_struct ec_stripe_create_work;
|
|
+ u64 ec_stripe_hint;
|
|
+
|
|
+ struct bio_set ec_bioset;
|
|
+
|
|
+ struct work_struct ec_stripe_delete_work;
|
|
+ struct llist_head ec_stripe_delete_list;
|
|
+
|
|
+ /* REFLINK */
|
|
+ u64 reflink_hint;
|
|
+
|
|
+ /* VFS IO PATH - fs-io.c */
|
|
+ struct bio_set writepage_bioset;
|
|
+ struct bio_set dio_write_bioset;
|
|
+ struct bio_set dio_read_bioset;
|
|
+
|
|
+
|
|
+ atomic64_t btree_writes_nr;
|
|
+ atomic64_t btree_writes_sectors;
|
|
+ struct bio_list btree_write_error_list;
|
|
+ struct work_struct btree_write_error_work;
|
|
+ spinlock_t btree_write_error_lock;
|
|
+
|
|
+ /* ERRORS */
|
|
+ struct list_head fsck_errors;
|
|
+ struct mutex fsck_error_lock;
|
|
+ bool fsck_alloc_err;
|
|
+
|
|
+ /* QUOTAS */
|
|
+ struct bch_memquota_type quotas[QTYP_NR];
|
|
+
|
|
+ /* DEBUG JUNK */
|
|
+ struct dentry *debug;
|
|
+ struct btree_debug btree_debug[BTREE_ID_NR];
|
|
+ struct btree *verify_data;
|
|
+ struct btree_node *verify_ondisk;
|
|
+ struct mutex verify_lock;
|
|
+
|
|
+ u64 *unused_inode_hints;
|
|
+ unsigned inode_shard_bits;
|
|
+
|
|
+ /*
|
|
+ * A btree node on disk could have too many bsets for an iterator to fit
|
|
+ * on the stack - have to dynamically allocate them
|
|
+ */
|
|
+ mempool_t fill_iter;
|
|
+
|
|
+ mempool_t btree_bounce_pool;
|
|
+
|
|
+ struct journal journal;
|
|
+ struct list_head journal_entries;
|
|
+ struct journal_keys journal_keys;
|
|
+ struct list_head journal_iters;
|
|
+
|
|
+ u64 last_bucket_seq_cleanup;
|
|
+
|
|
+ /* The rest of this all shows up in sysfs */
|
|
+ atomic_long_t read_realloc_races;
|
|
+ atomic_long_t extent_migrate_done;
|
|
+ atomic_long_t extent_migrate_raced;
|
|
+
|
|
+ unsigned btree_gc_periodic:1;
|
|
+ unsigned copy_gc_enabled:1;
|
|
+ bool promote_whole_extents;
|
|
+
|
|
+ struct time_stats times[BCH_TIME_STAT_NR];
|
|
+};
|
|
+
|
|
+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
|
|
+{
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+ if (c->vfs_sb)
|
|
+ c->vfs_sb->s_bdi->ra_pages = ra_pages;
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline unsigned bucket_bytes(const struct bch_dev *ca)
|
|
+{
|
|
+ return ca->mi.bucket_size << 9;
|
|
+}
|
|
+
|
|
+static inline unsigned block_bytes(const struct bch_fs *c)
|
|
+{
|
|
+ return c->opts.block_size << 9;
|
|
+}
|
|
+
|
|
+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, s64 time)
|
|
+{
|
|
+ struct timespec64 t;
|
|
+ s32 rem;
|
|
+ /* add back the superblock time base that timespec_to_bch2_time() subtracts */
|
|
+ time += c->sb.time_base_lo;
|
|
+ /* rem takes time's sign: widen to s64 so rem * unsigned cannot wrap, then normalize tv_nsec */
|
|
+ t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem);
|
|
+ set_normalized_timespec64(&t, t.tv_sec, rem * (s64) c->sb.nsec_per_time_unit);
|
|
+ return t;
|
|
+}
|
|
+
|
|
+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts)
|
|
+{
|
|
+ return (ts.tv_sec * c->sb.time_units_per_sec +
|
|
+ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo;
|
|
+}
|
|
+
|
|
+static inline s64 bch2_current_time(struct bch_fs *c)
|
|
+{
|
|
+ struct timespec64 now;
|
|
+
|
|
+ ktime_get_coarse_real_ts64(&now);
|
|
+ return timespec_to_bch2_time(c, now);
|
|
+}
|
|
+
|
|
+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
|
|
+{
|
|
+ return dev < c->sb.nr_devices && c->devs[dev];
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_H */
|
|
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
|
|
new file mode 100644
|
|
index 000000000000..d640a3115adc
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bcachefs_format.h
|
|
@@ -0,0 +1,1783 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_FORMAT_H
|
|
+#define _BCACHEFS_FORMAT_H
|
|
+
|
|
+/*
|
|
+ * bcachefs on disk data structures
|
|
+ *
|
|
+ * OVERVIEW:
|
|
+ *
|
|
+ * There are three main types of on disk data structures in bcachefs (this is
|
|
+ * reduced from 5 in bcache)
|
|
+ *
|
|
+ * - superblock
|
|
+ * - journal
|
|
+ * - btree
|
|
+ *
|
|
+ * The btree is the primary structure; most metadata exists as keys in the
|
|
+ * various btrees. There are only a small number of btrees, they're not
|
|
+ * sharded - we have one btree for extents, another for inodes, et cetera.
|
|
+ *
|
|
+ * SUPERBLOCK:
|
|
+ *
|
|
+ * The superblock contains the location of the journal, the list of devices in
|
|
+ * the filesystem, and in general any metadata we need in order to decide
|
|
+ * whether we can start a filesystem, or that we need before reading the
|
|
+ * journal/btree roots.
|
|
+ *
|
|
+ * The superblock is extensible, and most of the contents of the superblock are
|
|
+ * in variable length, type tagged fields; see struct bch_sb_field.
|
|
+ *
|
|
+ * Backup superblocks do not reside in a fixed location; also, superblocks do
|
|
+ * not have a fixed size. To locate backup superblocks we have struct
|
|
+ * bch_sb_layout; we store a copy of this inside every superblock, and also
|
|
+ * before the first superblock.
|
|
+ *
|
|
+ * JOURNAL:
|
|
+ *
|
|
+ * The journal primarily records btree updates in the order they occurred;
|
|
+ * journal replay consists of just iterating over all the keys in the open
|
|
+ * journal entries and re-inserting them into the btrees.
|
|
+ *
|
|
+ * The journal also contains entry types for the btree roots, and blacklisted
|
|
+ * journal sequence numbers (see journal_seq_blacklist.c).
|
|
+ *
|
|
+ * BTREE:
|
|
+ *
|
|
+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically
|
|
+ * 128k-256k) and log structured. We use struct btree_node for writing the first
|
|
+ * entry in a given node (offset 0), and struct btree_node_entry for all
|
|
+ * subsequent writes.
|
|
+ *
|
|
+ * After the header, btree node entries contain a list of keys in sorted order.
|
|
+ * Values are stored inline with the keys; since values are variable length (and
|
|
+ * keys effectively are variable length too, due to packing) we can't do random
|
|
+ * access without building up additional in memory tables in the btree node read
|
|
+ * path.
|
|
+ *
|
|
+ * BTREE KEYS (struct bkey):
|
|
+ *
|
|
+ * The various btrees share a common format for the key - so as to avoid
|
|
+ * switching in fastpath lookup/comparison code - but define their own
|
|
+ * structures for the key values.
|
|
+ *
|
|
+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max
|
|
+ * size is just under 2k. The common part also contains a type tag for the
|
|
+ * value, and a format field indicating whether the key is packed or not (and
|
|
+ * also meant to allow adding new key fields in the future, if desired).
|
|
+ *
|
|
+ * bkeys, when stored within a btree node, may also be packed. In that case, the
|
|
+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can
|
|
+ * be generous with field sizes in the common part of the key format (64 bit
|
|
+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost.
|
|
+ */
|
|
+
|
|
+#include <asm/types.h>
|
|
+#include <asm/byteorder.h>
|
|
+#include <linux/kernel.h>
|
|
+#include <linux/uuid.h>
|
|
+
|
|
+#define LE_BITMASK(_bits, name, type, field, offset, end) \
|
|
+static const unsigned name##_OFFSET = offset; \
|
|
+static const unsigned name##_BITS = (end - offset); \
|
|
+static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \
|
|
+ \
|
|
+static inline __u64 name(const type *k) \
|
|
+{ \
|
|
+ return (__le##_bits##_to_cpu(k->field) >> offset) & \
|
|
+ ~(~0ULL << (end - offset)); \
|
|
+} \
|
|
+ \
|
|
+static inline void SET_##name(type *k, __u64 v) \
|
|
+{ \
|
|
+ __u##_bits new = __le##_bits##_to_cpu(k->field); \
|
|
+ \
|
|
+ new &= ~(~(~0ULL << (end - offset)) << offset); \
|
|
+ new |= (v & ~(~0ULL << (end - offset))) << offset; \
|
|
+ k->field = __cpu_to_le##_bits(new); \
|
|
+}
|
|
+
|
|
+#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e)
|
|
+#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e)
|
|
+#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e)
|
|
+
|
|
+struct bkey_format {
|
|
+ __u8 key_u64s;
|
|
+ __u8 nr_fields;
|
|
+ /* One unused slot for now: */
|
|
+ __u8 bits_per_field[6];
|
|
+ __le64 field_offset[6];
|
|
+};
|
|
+
|
|
+/* Btree keys - all units are in sectors */
|
|
+
|
|
+struct bpos {
|
|
+ /*
|
|
+ * Word order matches machine byte order - btree code treats a bpos as a
|
|
+ * single large integer, for search/comparison purposes
|
|
+ *
|
|
+ * Note that wherever a bpos is embedded in another on disk data
|
|
+ * structure, it has to be byte swabbed when reading in metadata that
|
|
+ * wasn't written in native endian order:
|
|
+ */
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+ __u32 snapshot;
|
|
+ __u64 offset;
|
|
+ __u64 inode;
|
|
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
+ __u64 inode;
|
|
+ __u64 offset; /* Points to end of extent - sectors */
|
|
+ __u32 snapshot;
|
|
+#else
|
|
+#error edit for your odd byteorder.
|
|
+#endif
|
|
+} __attribute__((packed, aligned(4)));
|
|
+
|
|
+#define KEY_INODE_MAX ((__u64)~0ULL)
|
|
+#define KEY_OFFSET_MAX ((__u64)~0ULL)
|
|
+#define KEY_SNAPSHOT_MAX ((__u32)~0U)
|
|
+#define KEY_SIZE_MAX ((__u32)~0U)
|
|
+
|
|
+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot)
|
|
+{
|
|
+ return (struct bpos) {
|
|
+ .inode = inode,
|
|
+ .offset = offset,
|
|
+ .snapshot = snapshot,
|
|
+ };
|
|
+}
|
|
+
|
|
+#define POS_MIN SPOS(0, 0, 0)
|
|
+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX)
|
|
+#define POS(_inode, _offset) SPOS(_inode, _offset, 0)
|
|
+
|
|
+/* Empty placeholder struct, for container_of() */
|
|
+struct bch_val {
|
|
+ __u64 __nothing[0];
|
|
+};
|
|
+
|
|
+struct bversion {
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+ __u64 lo;
|
|
+ __u32 hi;
|
|
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
+ __u32 hi;
|
|
+ __u64 lo;
|
|
+#endif
|
|
+} __attribute__((packed, aligned(4)));
|
|
+
|
|
+struct bkey {
|
|
+ /* Size of combined key and value, in u64s */
|
|
+ __u8 u64s;
|
|
+
|
|
+ /* Format of key (0 for format local to btree node) */
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 format:7,
|
|
+ needs_whiteout:1;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 needs_whiteout:1,
|
|
+ format:7;
|
|
+#else
|
|
+#error edit for your odd byteorder.
|
|
+#endif
|
|
+
|
|
+ /* Type of the value */
|
|
+ __u8 type;
|
|
+
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+ __u8 pad[1];
|
|
+
|
|
+ struct bversion version;
|
|
+ __u32 size; /* extent size, in sectors */
|
|
+ struct bpos p;
|
|
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
+ struct bpos p;
|
|
+ __u32 size; /* extent size, in sectors */
|
|
+ struct bversion version;
|
|
+
|
|
+ __u8 pad[1];
|
|
+#endif
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bkey_packed {
|
|
+ __u64 _data[0];
|
|
+
|
|
+ /* Size of combined key and value, in u64s */
|
|
+ __u8 u64s;
|
|
+
|
|
+ /* Format of key (0 for format local to btree node) */
|
|
+
|
|
+ /*
|
|
+ * XXX: next incompat on disk format change, switch format and
|
|
+ * needs_whiteout - bkey_packed() will be cheaper if format is the high
|
|
+ * bits of the bitfield
|
|
+ */
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u8 format:7,
|
|
+ needs_whiteout:1;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u8 needs_whiteout:1,
|
|
+ format:7;
|
|
+#endif
|
|
+
|
|
+ /* Type of the value */
|
|
+ __u8 type;
|
|
+ __u8 key_start[0];
|
|
+
|
|
+ /*
|
|
+ * We copy bkeys with struct assignment in various places, and while
|
|
+ * that shouldn't be done with packed bkeys we can't disallow it in C,
|
|
+ * and it's legal to cast a bkey to a bkey_packed - so padding it out
|
|
+ * to the same size as struct bkey should hopefully be safest.
|
|
+ */
|
|
+ __u8 pad[sizeof(struct bkey) - 3];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64))
|
|
+#define BKEY_U64s_MAX U8_MAX
|
|
+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s)
|
|
+
|
|
+#define KEY_PACKED_BITS_START 24
|
|
+
|
|
+#define KEY_FORMAT_LOCAL_BTREE 0
|
|
+#define KEY_FORMAT_CURRENT 1
|
|
+
|
|
+enum bch_bkey_fields {
|
|
+ BKEY_FIELD_INODE,
|
|
+ BKEY_FIELD_OFFSET,
|
|
+ BKEY_FIELD_SNAPSHOT,
|
|
+ BKEY_FIELD_SIZE,
|
|
+ BKEY_FIELD_VERSION_HI,
|
|
+ BKEY_FIELD_VERSION_LO,
|
|
+ BKEY_NR_FIELDS,
|
|
+};
|
|
+
|
|
+#define bkey_format_field(name, field) \
|
|
+ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8)
|
|
+
|
|
+#define BKEY_FORMAT_CURRENT \
|
|
+((struct bkey_format) { \
|
|
+ .key_u64s = BKEY_U64s, \
|
|
+ .nr_fields = BKEY_NR_FIELDS, \
|
|
+ .bits_per_field = { \
|
|
+ bkey_format_field(INODE, p.inode), \
|
|
+ bkey_format_field(OFFSET, p.offset), \
|
|
+ bkey_format_field(SNAPSHOT, p.snapshot), \
|
|
+ bkey_format_field(SIZE, size), \
|
|
+ bkey_format_field(VERSION_HI, version.hi), \
|
|
+ bkey_format_field(VERSION_LO, version.lo), \
|
|
+ }, \
|
|
+})
|
|
+
|
|
+/* bkey with inline value */
|
|
+struct bkey_i {
|
|
+ __u64 _data[0];
|
|
+
|
|
+ union {
|
|
+ struct {
|
|
+ /* Size of combined key and value, in u64s */
|
|
+ __u8 u64s;
|
|
+ };
|
|
+ struct {
|
|
+ struct bkey k;
|
|
+ struct bch_val v;
|
|
+ };
|
|
+ };
|
|
+};
|
|
+
|
|
+#define KEY(_inode, _offset, _size) \
|
|
+((struct bkey) { \
|
|
+ .u64s = BKEY_U64s, \
|
|
+ .format = KEY_FORMAT_CURRENT, \
|
|
+ .p = POS(_inode, _offset), \
|
|
+ .size = _size, \
|
|
+})
|
|
+
|
|
+static inline void bkey_init(struct bkey *k)
|
|
+{
|
|
+ *k = KEY(0, 0, 0);
|
|
+}
|
|
+
|
|
+#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64))
|
|
+
|
|
+#define __BKEY_PADDED(key, pad) \
|
|
+ struct { struct bkey_i key; __u64 key ## _pad[pad]; }
|
|
+
|
|
+/*
|
|
+ * - DELETED keys are used internally to mark keys that should be ignored but
|
|
+ * override keys in composition order. Their version number is ignored.
|
|
+ *
|
|
+ * - DISCARDED keys indicate that the data is all 0s because it has been
|
|
+ * discarded. DISCARDs may have a version; if the version is nonzero the key
|
|
+ * will be persistent, otherwise the key will be dropped whenever the btree
|
|
+ * node is rewritten (like DELETED keys).
|
|
+ *
|
|
+ * - ERROR: any read of the data returns a read error, as the data was lost due
|
|
+ * to a failing device. Like DISCARDED keys, they can be removed (overridden)
|
|
+ * by new writes or cluster-wide GC. Node repair can also overwrite them with
|
|
+ * the same or a more recent version number, but not with an older version
|
|
+ * number.
|
|
+ *
|
|
+ * - WHITEOUT: for hash table btrees
|
|
+*/
|
|
+#define BCH_BKEY_TYPES() \
|
|
+ x(deleted, 0) \
|
|
+ x(discard, 1) \
|
|
+ x(error, 2) \
|
|
+ x(cookie, 3) \
|
|
+ x(hash_whiteout, 4) \
|
|
+ x(btree_ptr, 5) \
|
|
+ x(extent, 6) \
|
|
+ x(reservation, 7) \
|
|
+ x(inode, 8) \
|
|
+ x(inode_generation, 9) \
|
|
+ x(dirent, 10) \
|
|
+ x(xattr, 11) \
|
|
+ x(alloc, 12) \
|
|
+ x(quota, 13) \
|
|
+ x(stripe, 14) \
|
|
+ x(reflink_p, 15) \
|
|
+ x(reflink_v, 16) \
|
|
+ x(inline_data, 17) \
|
|
+ x(btree_ptr_v2, 18) \
|
|
+ x(indirect_inline_data, 19) \
|
|
+ x(alloc_v2, 20)
|
|
+
|
|
+enum bch_bkey_type {
|
|
+#define x(name, nr) KEY_TYPE_##name = nr,
|
|
+ BCH_BKEY_TYPES()
|
|
+#undef x
|
|
+ KEY_TYPE_MAX,
|
|
+};
|
|
+
|
|
+struct bch_deleted {
|
|
+ struct bch_val v;
|
|
+};
|
|
+
|
|
+struct bch_discard {
|
|
+ struct bch_val v;
|
|
+};
|
|
+
|
|
+struct bch_error {
|
|
+ struct bch_val v;
|
|
+};
|
|
+
|
|
+struct bch_cookie {
|
|
+ struct bch_val v;
|
|
+ __le64 cookie;
|
|
+};
|
|
+
|
|
+struct bch_hash_whiteout {
|
|
+ struct bch_val v;
|
|
+};
|
|
+
|
|
+/* Extents */
|
|
+
|
|
+/*
|
|
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
|
|
+ * preceded by checksum/compression information (bch_extent_crc32 or
|
|
+ * bch_extent_crc64).
|
|
+ *
|
|
+ * One major determining factor in the format of extents is how we handle and
|
|
+ * represent extents that have been partially overwritten and thus trimmed:
|
|
+ *
|
|
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
|
|
+ * don't have to remember the extent we originally allocated and wrote: we can
|
|
+ * merely adjust ptr->offset to point to the start of the data that is currently
|
|
+ * live. The size field in struct bkey records the current (live) size of the
|
|
+ * extent, and is also used to mean "size of region on disk that we point to" in
|
|
+ * this case.
|
|
+ *
|
|
+ * Thus an extent that is not checksummed or compressed will consist only of a
|
|
+ * list of bch_extent_ptrs, with none of the fields in
|
|
+ * bch_extent_crc32/bch_extent_crc64.
|
|
+ *
|
|
+ * When an extent is checksummed or compressed, it's not possible to read only
|
|
+ * the data that is currently live: we have to read the entire extent that was
|
|
+ * originally written, and then return only the part of the extent that is
|
|
+ * currently live.
|
|
+ *
|
|
+ * Thus, in addition to the current size of the extent in struct bkey, we need
|
|
+ * to store the size of the originally allocated space - this is the
|
|
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
|
|
+ * when the extent is trimmed, instead of modifying the offset field of the
|
|
+ * pointer, we keep a second smaller offset field - "offset into the original
|
|
+ * extent of the currently live region".
|
|
+ *
|
|
+ * The other major determining factor is replication and data migration:
|
|
+ *
|
|
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
|
|
+ * write, we will initially write all the replicas in the same format, with the
|
|
+ * same checksum type and compression format - however, when copygc runs later (or
|
|
+ * tiering/cache promotion, anything that moves data), it is not in general
|
|
+ * going to rewrite all the pointers at once - one of the replicas may be in a
|
|
+ * bucket on one device that has very little fragmentation while another lives
|
|
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
|
|
+ * sooner than the rest.
|
|
+ *
|
|
+ * Thus it will only move a subset of the pointers (or in the case of
|
|
+ * tiering/cache promotion perhaps add a single pointer without dropping any
|
|
+ * current pointers), and if the extent has been partially overwritten it must
|
|
+ * write only the currently live portion (or copygc would not be able to reduce
|
|
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
|
|
+ * the new pointer.
|
|
+ *
|
|
+ * But in the interests of space efficiency, we don't want to store one
|
|
+ * bch_extent_crc for each pointer if we don't have to.
|
|
+ *
|
|
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
|
|
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
|
|
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
|
|
+ * type, not a size), encoding the type in the position of the first set bit:
|
|
+ *
|
|
+ * bch_extent_crc32 - 0b10
|
|
+ * bch_extent_ptr - 0b1
|
|
+ * bch_extent_crc64 - 0b100
|
|
+ *
|
|
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
|
|
+ * bch_extent_crc64 is the least constrained).
|
|
+ *
|
|
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
|
|
+ * until the next bch_extent_crc32/64.
|
|
+ *
|
|
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
|
|
+ * is neither checksummed nor compressed.
|
|
+ */
|
|
+
|
|
+/* 128 bits, sufficient for cryptographic MACs: */
|
|
+struct bch_csum {
|
|
+ __le64 lo;
|
|
+ __le64 hi;
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define BCH_EXTENT_ENTRY_TYPES() \
|
|
+ x(ptr, 0) \
|
|
+ x(crc32, 1) \
|
|
+ x(crc64, 2) \
|
|
+ x(crc128, 3) \
|
|
+ x(stripe_ptr, 4)
|
|
+#define BCH_EXTENT_ENTRY_MAX 5
|
|
+
|
|
+enum bch_extent_entry_type {
|
|
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
|
|
+ BCH_EXTENT_ENTRY_TYPES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+/* Compressed/uncompressed size are stored biased by 1: */
|
|
+struct bch_extent_crc32 {
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u32 type:2,
|
|
+ _compressed_size:7,
|
|
+ _uncompressed_size:7,
|
|
+ offset:7,
|
|
+ _unused:1,
|
|
+ csum_type:4,
|
|
+ compression_type:4;
|
|
+ __u32 csum;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u32 csum;
|
|
+ __u32 compression_type:4,
|
|
+ csum_type:4,
|
|
+ _unused:1,
|
|
+ offset:7,
|
|
+ _uncompressed_size:7,
|
|
+ _compressed_size:7,
|
|
+ type:2;
|
|
+#endif
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define CRC32_SIZE_MAX (1U << 7)
|
|
+#define CRC32_NONCE_MAX 0
|
|
+
|
|
+struct bch_extent_crc64 {
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u64 type:3,
|
|
+ _compressed_size:9,
|
|
+ _uncompressed_size:9,
|
|
+ offset:9,
|
|
+ nonce:10,
|
|
+ csum_type:4,
|
|
+ compression_type:4,
|
|
+ csum_hi:16;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u64 csum_hi:16,
|
|
+ compression_type:4,
|
|
+ csum_type:4,
|
|
+ nonce:10,
|
|
+ offset:9,
|
|
+ _uncompressed_size:9,
|
|
+ _compressed_size:9,
|
|
+ type:3;
|
|
+#endif
|
|
+ __u64 csum_lo;
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define CRC64_SIZE_MAX (1U << 9)
|
|
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
|
|
+
|
|
+struct bch_extent_crc128 {
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u64 type:4,
|
|
+ _compressed_size:13,
|
|
+ _uncompressed_size:13,
|
|
+ offset:13,
|
|
+ nonce:13,
|
|
+ csum_type:4,
|
|
+ compression_type:4;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u64 compression_type:4,
|
|
+ csum_type:4,
|
|
+ nonce:13,
|
|
+ offset:13,
|
|
+ _uncompressed_size:13,
|
|
+ _compressed_size:13,
|
|
+ type:4;
|
|
+#endif
|
|
+ struct bch_csum csum;
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define CRC128_SIZE_MAX (1U << 13)
|
|
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
|
|
+
|
|
+/*
|
|
+ * @reservation - pointer hasn't been written to, just reserved
|
|
+ */
|
|
+struct bch_extent_ptr {
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u64 type:1,
|
|
+ cached:1,
|
|
+ unused:1,
|
|
+ reservation:1,
|
|
+ offset:44, /* 8 petabytes */
|
|
+ dev:8,
|
|
+ gen:8;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u64 gen:8,
|
|
+ dev:8,
|
|
+ offset:44,
|
|
+ reservation:1,
|
|
+ unused:1,
|
|
+ cached:1,
|
|
+ type:1;
|
|
+#endif
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bch_extent_stripe_ptr {
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u64 type:5,
|
|
+ block:8,
|
|
+ redundancy:4,
|
|
+ idx:47;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u64 idx:47,
|
|
+ redundancy:4,
|
|
+ block:8,
|
|
+ type:5;
|
|
+#endif
|
|
+};
|
|
+
|
|
+struct bch_extent_reservation {
|
|
+#if defined(__LITTLE_ENDIAN_BITFIELD)
|
|
+ __u64 type:6,
|
|
+ unused:22,
|
|
+ replicas:4,
|
|
+ generation:32;
|
|
+#elif defined (__BIG_ENDIAN_BITFIELD)
|
|
+ __u64 generation:32,
|
|
+ replicas:4,
|
|
+ unused:22,
|
|
+ type:6;
|
|
+#endif
|
|
+};
|
|
+
|
|
+union bch_extent_entry {
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
|
|
+ unsigned long type;
|
|
+#elif __BITS_PER_LONG == 32
|
|
+ struct {
|
|
+ unsigned long pad;
|
|
+ unsigned long type;
|
|
+ };
|
|
+#else
|
|
+#error edit for your odd byteorder.
|
|
+#endif
|
|
+
|
|
+#define x(f, n) struct bch_extent_##f f;
|
|
+ BCH_EXTENT_ENTRY_TYPES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+struct bch_btree_ptr {
|
|
+ struct bch_val v;
|
|
+
|
|
+ struct bch_extent_ptr start[0];
|
|
+ __u64 _data[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bch_btree_ptr_v2 {
|
|
+ struct bch_val v;
|
|
+
|
|
+ __u64 mem_ptr;
|
|
+ __le64 seq;
|
|
+ __le16 sectors_written;
|
|
+ __le16 flags;
|
|
+ struct bpos min_key;
|
|
+ struct bch_extent_ptr start[0];
|
|
+ __u64 _data[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
|
|
+
|
|
+struct bch_extent {
|
|
+ struct bch_val v;
|
|
+
|
|
+ union bch_extent_entry start[0];
|
|
+ __u64 _data[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bch_reservation {
|
|
+ struct bch_val v;
|
|
+
|
|
+ __le32 generation;
|
|
+ __u8 nr_replicas;
|
|
+ __u8 pad[3];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/* Maximum size (in u64s) a single pointer could be: */
|
|
+#define BKEY_EXTENT_PTR_U64s_MAX\
|
|
+ ((sizeof(struct bch_extent_crc128) + \
|
|
+ sizeof(struct bch_extent_ptr)) / sizeof(u64))
|
|
+
|
|
+/* Maximum possible size of an entire extent value: */
|
|
+#define BKEY_EXTENT_VAL_U64s_MAX \
|
|
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
|
|
+
|
|
+/* Maximum possible size of an entire extent, key + value: */
|
|
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
|
|
+
|
|
+/* Btree pointers don't carry around checksums: */
|
|
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
|
|
+ ((sizeof(struct bch_btree_ptr_v2) + \
|
|
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64))
|
|
+#define BKEY_BTREE_PTR_U64s_MAX \
|
|
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
|
|
+
|
|
+/* Inodes */
|
|
+
|
|
+#define BLOCKDEV_INODE_MAX 4096
|
|
+
|
|
+#define BCACHEFS_ROOT_INO 4096
|
|
+
|
|
+struct bch_inode {
|
|
+ struct bch_val v;
|
|
+
|
|
+ __le64 bi_hash_seed;
|
|
+ __le32 bi_flags;
|
|
+ __le16 bi_mode;
|
|
+ __u8 fields[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bch_inode_generation {
|
|
+ struct bch_val v;
|
|
+
|
|
+ __le32 bi_generation;
|
|
+ __le32 pad;
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define BCH_INODE_FIELDS() \
|
|
+ x(bi_atime, 96) \
|
|
+ x(bi_ctime, 96) \
|
|
+ x(bi_mtime, 96) \
|
|
+ x(bi_otime, 96) \
|
|
+ x(bi_size, 64) \
|
|
+ x(bi_sectors, 64) \
|
|
+ x(bi_uid, 32) \
|
|
+ x(bi_gid, 32) \
|
|
+ x(bi_nlink, 32) \
|
|
+ x(bi_generation, 32) \
|
|
+ x(bi_dev, 32) \
|
|
+ x(bi_data_checksum, 8) \
|
|
+ x(bi_compression, 8) \
|
|
+ x(bi_project, 32) \
|
|
+ x(bi_background_compression, 8) \
|
|
+ x(bi_data_replicas, 8) \
|
|
+ x(bi_promote_target, 16) \
|
|
+ x(bi_foreground_target, 16) \
|
|
+ x(bi_background_target, 16) \
|
|
+ x(bi_erasure_code, 16) \
|
|
+ x(bi_fields_set, 16) \
|
|
+ x(bi_dir, 64) \
|
|
+ x(bi_dir_offset, 64)
|
|
+
|
|
+/* subset of BCH_INODE_FIELDS */
|
|
+#define BCH_INODE_OPTS() \
|
|
+ x(data_checksum, 8) \
|
|
+ x(compression, 8) \
|
|
+ x(project, 32) \
|
|
+ x(background_compression, 8) \
|
|
+ x(data_replicas, 8) \
|
|
+ x(promote_target, 16) \
|
|
+ x(foreground_target, 16) \
|
|
+ x(background_target, 16) \
|
|
+ x(erasure_code, 16)
|
|
+
|
|
+enum inode_opt_id {
|
|
+#define x(name, ...) \
|
|
+ Inode_opt_##name,
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+ Inode_opt_nr,
|
|
+};
|
|
+
|
|
+enum {
|
|
+ /*
|
|
+ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL
|
|
+ * flags)
|
|
+ */
|
|
+ __BCH_INODE_SYNC = 0,
|
|
+ __BCH_INODE_IMMUTABLE = 1,
|
|
+ __BCH_INODE_APPEND = 2,
|
|
+ __BCH_INODE_NODUMP = 3,
|
|
+ __BCH_INODE_NOATIME = 4,
|
|
+
|
|
+ __BCH_INODE_I_SIZE_DIRTY= 5,
|
|
+ __BCH_INODE_I_SECTORS_DIRTY= 6,
|
|
+ __BCH_INODE_UNLINKED = 7,
|
|
+ __BCH_INODE_BACKPTR_UNTRUSTED = 8,
|
|
+
|
|
+ /* bits 20+ reserved for packed fields below: */
|
|
+};
|
|
+
|
|
+#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC)
|
|
+#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE)
|
|
+#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND)
|
|
+#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP)
|
|
+#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME)
|
|
+#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY)
|
|
+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY)
|
|
+#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED)
|
|
+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED)
|
|
+
|
|
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
|
|
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
|
|
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
|
|
+
|
|
+/* Dirents */
|
|
+
|
|
+/*
|
|
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
|
|
+ * doesn't support arbitrary length strings for the key, we instead index by a
|
|
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
|
|
+ * field of the key - using linear probing to resolve hash collisions. This also
|
|
+ * provides us with the readdir cookie posix requires.
|
|
+ *
|
|
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
|
|
+ * collision:
|
|
+ */
|
|
+
|
|
+struct bch_dirent {
|
|
+ struct bch_val v;
|
|
+
|
|
+ /* Target inode number: */
|
|
+ __le64 d_inum;
|
|
+
|
|
+ /*
|
|
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
|
|
+ * the filetype without having to do a stat()
|
|
+ */
|
|
+ __u8 d_type;
|
|
+
|
|
+ __u8 d_name[];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \
|
|
+ sizeof(struct bkey) - \
|
|
+ offsetof(struct bch_dirent, d_name))
|
|
+
|
|
+
|
|
+/* Xattrs */
|
|
+
|
|
+#define KEY_TYPE_XATTR_INDEX_USER 0
|
|
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
|
|
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
|
|
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
|
|
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
|
|
+
|
|
+struct bch_xattr {
|
|
+ struct bch_val v;
|
|
+ __u8 x_type;
|
|
+ __u8 x_name_len;
|
|
+ __le16 x_val_len;
|
|
+ __u8 x_name[];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/* Bucket/allocation information: */
|
|
+
|
|
+struct bch_alloc {
|
|
+ struct bch_val v;
|
|
+ __u8 fields;
|
|
+ __u8 gen;
|
|
+ __u8 data[];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define BCH_ALLOC_FIELDS_V1() \
|
|
+ x(read_time, 16) \
|
|
+ x(write_time, 16) \
|
|
+ x(data_type, 8) \
|
|
+ x(dirty_sectors, 16) \
|
|
+ x(cached_sectors, 16) \
|
|
+ x(oldest_gen, 8) \
|
|
+ x(stripe, 32) \
|
|
+ x(stripe_redundancy, 8)
|
|
+
|
|
+struct bch_alloc_v2 {
|
|
+ struct bch_val v;
|
|
+ __u8 nr_fields;
|
|
+ __u8 gen;
|
|
+ __u8 oldest_gen;
|
|
+ __u8 data_type;
|
|
+ __u8 data[];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define BCH_ALLOC_FIELDS_V2() \
|
|
+ x(read_time, 64) \
|
|
+ x(write_time, 64) \
|
|
+ x(dirty_sectors, 16) \
|
|
+ x(cached_sectors, 16) \
|
|
+ x(stripe, 32) \
|
|
+ x(stripe_redundancy, 8)
|
|
+
|
|
+enum {
|
|
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
|
|
+ BCH_ALLOC_FIELDS_V1()
|
|
+#undef x
|
|
+ BCH_ALLOC_FIELD_NR
|
|
+};
|
|
+
|
|
+/* Quotas: */
|
|
+
|
|
+enum quota_types {
|
|
+ QTYP_USR = 0,
|
|
+ QTYP_GRP = 1,
|
|
+ QTYP_PRJ = 2,
|
|
+ QTYP_NR = 3,
|
|
+};
|
|
+
|
|
+enum quota_counters {
|
|
+ Q_SPC = 0,
|
|
+ Q_INO = 1,
|
|
+ Q_COUNTERS = 2,
|
|
+};
|
|
+
|
|
+struct bch_quota_counter {
|
|
+ __le64 hardlimit;
|
|
+ __le64 softlimit;
|
|
+};
|
|
+
|
|
+struct bch_quota {
|
|
+ struct bch_val v;
|
|
+ struct bch_quota_counter c[Q_COUNTERS];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/* Erasure coding */
|
|
+
|
|
+struct bch_stripe {
|
|
+ struct bch_val v;
|
|
+ __le16 sectors;
|
|
+ __u8 algorithm;
|
|
+ __u8 nr_blocks;
|
|
+ __u8 nr_redundant;
|
|
+
|
|
+ __u8 csum_granularity_bits;
|
|
+ __u8 csum_type;
|
|
+ __u8 pad;
|
|
+
|
|
+ struct bch_extent_ptr ptrs[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/* Reflink: */
|
|
+
|
|
+struct bch_reflink_p {
|
|
+ struct bch_val v;
|
|
+ __le64 idx;
|
|
+
|
|
+ __le32 reservation_generation;
|
|
+ __u8 nr_replicas;
|
|
+ __u8 pad[3];
|
|
+};
|
|
+
|
|
+struct bch_reflink_v {
|
|
+ struct bch_val v;
|
|
+ __le64 refcount;
|
|
+ union bch_extent_entry start[0];
|
|
+ __u64 _data[0];
|
|
+};
|
|
+
|
|
+struct bch_indirect_inline_data {
|
|
+ struct bch_val v;
|
|
+ __le64 refcount;
|
|
+ u8 data[0];
|
|
+};
|
|
+
|
|
+/* Inline data */
|
|
+
|
|
+struct bch_inline_data {
|
|
+ struct bch_val v;
|
|
+ u8 data[0];
|
|
+};
|
|
+
|
|
+/* Optional/variable size superblock sections: */
|
|
+
|
|
+struct bch_sb_field {
|
|
+ __u64 _data[0];
|
|
+ __le32 u64s;
|
|
+ __le32 type;
|
|
+};
|
|
+
|
|
+#define BCH_SB_FIELDS() \
|
|
+ x(journal, 0) \
|
|
+ x(members, 1) \
|
|
+ x(crypt, 2) \
|
|
+ x(replicas_v0, 3) \
|
|
+ x(quota, 4) \
|
|
+ x(disk_groups, 5) \
|
|
+ x(clean, 6) \
|
|
+ x(replicas, 7) \
|
|
+ x(journal_seq_blacklist, 8)
|
|
+
|
|
+enum bch_sb_field_type {
|
|
+#define x(f, nr) BCH_SB_FIELD_##f = nr,
|
|
+ BCH_SB_FIELDS()
|
|
+#undef x
|
|
+ BCH_SB_FIELD_NR
|
|
+};
|
|
+
|
|
+/* BCH_SB_FIELD_journal: */
|
|
+
|
|
+struct bch_sb_field_journal {
|
|
+ struct bch_sb_field field;
|
|
+ __le64 buckets[0];
|
|
+};
|
|
+
|
|
+/* BCH_SB_FIELD_members: */
|
|
+
|
|
+#define BCH_MIN_NR_NBUCKETS (1 << 6)
|
|
+
|
|
+struct bch_member {
|
|
+ uuid_le uuid;
|
|
+ __le64 nbuckets; /* device size */
|
|
+ __le16 first_bucket; /* index of first bucket used */
|
|
+ __le16 bucket_size; /* sectors */
|
|
+ __le32 pad;
|
|
+ __le64 last_mount; /* time_t */
|
|
+
|
|
+ __le64 flags[2];
|
|
+};
|
|
+
|
|
+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4)
|
|
+/* 4-10 unused, was TIER, HAS_(META)DATA */
|
|
+LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14)
|
|
+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15)
|
|
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20)
|
|
+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28)
|
|
+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30)
|
|
+
|
|
+#define BCH_TIER_MAX 4U
|
|
+
|
|
+#if 0
|
|
+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
|
|
+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
|
|
+#endif
|
|
+
|
|
+#define BCH_MEMBER_STATES() \
|
|
+ x(rw, 0) \
|
|
+ x(ro, 1) \
|
|
+ x(failed, 2) \
|
|
+ x(spare, 3)
|
|
+
|
|
+enum bch_member_state {
|
|
+#define x(t, n) BCH_MEMBER_STATE_##t = n,
|
|
+ BCH_MEMBER_STATES()
|
|
+#undef x
|
|
+ BCH_MEMBER_STATE_NR
|
|
+};
|
|
+
|
|
+#define BCH_CACHE_REPLACEMENT_POLICIES() \
|
|
+ x(lru, 0) \
|
|
+ x(fifo, 1) \
|
|
+ x(random, 2)
|
|
+
|
|
+enum bch_cache_replacement_policies {
|
|
+#define x(t, n) BCH_CACHE_REPLACEMENT_##t = n,
|
|
+ BCH_CACHE_REPLACEMENT_POLICIES()
|
|
+#undef x
|
|
+ BCH_CACHE_REPLACEMENT_NR
|
|
+};
|
|
+
|
|
+struct bch_sb_field_members {
|
|
+ struct bch_sb_field field;
|
|
+ struct bch_member members[0];
|
|
+};
|
|
+
|
|
+/* BCH_SB_FIELD_crypt: */
|
|
+
|
|
+struct nonce {
|
|
+ __le32 d[4];
|
|
+};
|
|
+
|
|
+struct bch_key {
|
|
+ __le64 key[4];
|
|
+};
|
|
+
|
|
+#define BCH_KEY_MAGIC \
|
|
+ (((u64) 'b' << 0)|((u64) 'c' << 8)| \
|
|
+ ((u64) 'h' << 16)|((u64) '*' << 24)| \
|
|
+ ((u64) '*' << 32)|((u64) 'k' << 40)| \
|
|
+ ((u64) 'e' << 48)|((u64) 'y' << 56))
|
|
+
|
|
+struct bch_encrypted_key {
|
|
+ __le64 magic;
|
|
+ struct bch_key key;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * If this field is present in the superblock, it stores an encryption key which
|
|
+ * is used to encrypt all other data/metadata. The key will normally be encrypted
|
|
+ * with the key userspace provides, but if encryption has been turned off we'll
|
|
+ * just store the master key unencrypted in the superblock so we can access the
|
|
+ * previously encrypted data.
|
|
+ */
|
|
+struct bch_sb_field_crypt {
|
|
+ struct bch_sb_field field;
|
|
+
|
|
+ __le64 flags;
|
|
+ __le64 kdf_flags;
|
|
+ struct bch_encrypted_key key;
|
|
+};
|
|
+
|
|
+LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4);
|
|
+
|
|
+enum bch_kdf_types {
|
|
+ BCH_KDF_SCRYPT = 0,
|
|
+ BCH_KDF_NR = 1,
|
|
+};
|
|
+
|
|
+/* stored as base 2 log of scrypt params: */
|
|
+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
|
|
+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
|
|
+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
|
|
+
|
|
+/* BCH_SB_FIELD_replicas: */
|
|
+
|
|
+#define BCH_DATA_TYPES() \
|
|
+ x(none, 0) \
|
|
+ x(sb, 1) \
|
|
+ x(journal, 2) \
|
|
+ x(btree, 3) \
|
|
+ x(user, 4) \
|
|
+ x(cached, 5) \
|
|
+ x(parity, 6)
|
|
+
|
|
+enum bch_data_type {
|
|
+#define x(t, n) BCH_DATA_##t,
|
|
+ BCH_DATA_TYPES()
|
|
+#undef x
|
|
+ BCH_DATA_NR
|
|
+};
|
|
+
|
|
+struct bch_replicas_entry_v0 {
|
|
+ __u8 data_type;
|
|
+ __u8 nr_devs;
|
|
+ __u8 devs[0];
|
|
+} __attribute__((packed));
|
|
+
|
|
+struct bch_sb_field_replicas_v0 {
|
|
+ struct bch_sb_field field;
|
|
+ struct bch_replicas_entry_v0 entries[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bch_replicas_entry {
|
|
+ __u8 data_type;
|
|
+ __u8 nr_devs;
|
|
+ __u8 nr_required;
|
|
+ __u8 devs[0];
|
|
+} __attribute__((packed));
|
|
+
|
|
+#define replicas_entry_bytes(_i) \
|
|
+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
|
|
+
|
|
+struct bch_sb_field_replicas {
|
|
+ struct bch_sb_field field;
|
|
+ struct bch_replicas_entry entries[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/* BCH_SB_FIELD_quota: */
|
|
+
|
|
+struct bch_sb_quota_counter {
|
|
+ __le32 timelimit;
|
|
+ __le32 warnlimit;
|
|
+};
|
|
+
|
|
+struct bch_sb_quota_type {
|
|
+ __le64 flags;
|
|
+ struct bch_sb_quota_counter c[Q_COUNTERS];
|
|
+};
|
|
+
|
|
+struct bch_sb_field_quota {
|
|
+ struct bch_sb_field field;
|
|
+ struct bch_sb_quota_type q[QTYP_NR];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/* BCH_SB_FIELD_disk_groups: */
|
|
+
|
|
+#define BCH_SB_LABEL_SIZE 32
|
|
+
|
|
+struct bch_disk_group {
|
|
+ __u8 label[BCH_SB_LABEL_SIZE];
|
|
+ __le64 flags[2];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
|
|
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
|
|
+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
|
|
+
|
|
+struct bch_sb_field_disk_groups {
|
|
+ struct bch_sb_field field;
|
|
+ struct bch_disk_group entries[0];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/*
|
|
+ * On clean shutdown, store btree roots and current journal sequence number in
|
|
+ * the superblock:
|
|
+ */
|
|
+struct jset_entry {
|
|
+ __le16 u64s;
|
|
+ __u8 btree_id;
|
|
+ __u8 level;
|
|
+ __u8 type; /* designates what this jset holds */
|
|
+ __u8 pad[3];
|
|
+
|
|
+ union {
|
|
+ struct bkey_i start[0];
|
|
+ __u64 _data[0];
|
|
+ };
|
|
+};
|
|
+
|
|
+struct bch_sb_field_clean {
|
|
+ struct bch_sb_field field;
|
|
+
|
|
+ __le32 flags;
|
|
+ __le16 _read_clock; /* no longer used */
|
|
+ __le16 _write_clock;
|
|
+ __le64 journal_seq;
|
|
+
|
|
+ union {
|
|
+ struct jset_entry start[0];
|
|
+ __u64 _data[0];
|
|
+ };
|
|
+};
|
|
+
|
|
+struct journal_seq_blacklist_entry {
|
|
+ __le64 start;
|
|
+ __le64 end;
|
|
+};
|
|
+
|
|
+struct bch_sb_field_journal_seq_blacklist {
|
|
+ struct bch_sb_field field;
|
|
+
|
|
+ union {
|
|
+ struct journal_seq_blacklist_entry start[0];
|
|
+ __u64 _data[0];
|
|
+ };
|
|
+};
|
|
+
|
|
+/* Superblock: */
|
|
+
|
|
+/*
|
|
+ * New versioning scheme:
|
|
+ * One common version number for all on disk data structures - superblock, btree
|
|
+ * nodes, journal entries
|
|
+ */
|
|
+#define BCH_JSET_VERSION_OLD 2
|
|
+#define BCH_BSET_VERSION_OLD 3
|
|
+
|
|
+enum bcachefs_metadata_version {
|
|
+ bcachefs_metadata_version_min = 9,
|
|
+ bcachefs_metadata_version_new_versioning = 10,
|
|
+ bcachefs_metadata_version_bkey_renumber = 10,
|
|
+ bcachefs_metadata_version_inode_btree_change = 11,
|
|
+ bcachefs_metadata_version_snapshot = 12,
|
|
+ bcachefs_metadata_version_inode_backpointers = 13,
|
|
+ bcachefs_metadata_version_max = 14,
|
|
+};
|
|
+
|
|
+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
|
|
+
|
|
+#define BCH_SB_SECTOR 8
|
|
+#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
|
|
+
|
|
+struct bch_sb_layout {
|
|
+ uuid_le magic; /* bcachefs superblock UUID */
|
|
+ __u8 layout_type;
|
|
+ __u8 sb_max_size_bits; /* base 2 log of 512 byte sectors */
|
|
+ __u8 nr_superblocks;
|
|
+ __u8 pad[5];
|
|
+ __le64 sb_offset[61];
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#define BCH_SB_LAYOUT_SECTOR 7
|
|
+
|
|
+/*
|
|
+ * @offset - sector where this sb was written
|
|
+ * @version - on disk format version
|
|
+ * @version_min - Oldest metadata version this filesystem contains; so we can
|
|
+ * safely drop compatibility code and refuse to mount filesystems
|
|
+ * we'd need it for
|
|
+ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC)
|
|
+ * @seq - incremented each time superblock is written
|
|
+ * @uuid - used for generating various magic numbers and identifying
|
|
+ * member devices, never changes
|
|
+ * @user_uuid - user visible UUID, may be changed
|
|
+ * @label - filesystem label
|
|
+ * @seq - identifies most recent superblock, incremented each time
|
|
+ * superblock is written
|
|
+ * @features - enabled incompatible features
|
|
+ */
|
|
+struct bch_sb {
|
|
+ struct bch_csum csum;
|
|
+ __le16 version;
|
|
+ __le16 version_min;
|
|
+ __le16 pad[2];
|
|
+ uuid_le magic;
|
|
+ uuid_le uuid;
|
|
+ uuid_le user_uuid;
|
|
+ __u8 label[BCH_SB_LABEL_SIZE];
|
|
+ __le64 offset;
|
|
+ __le64 seq;
|
|
+
|
|
+ __le16 block_size;
|
|
+ __u8 dev_idx;
|
|
+ __u8 nr_devices;
|
|
+ __le32 u64s;
|
|
+
|
|
+ __le64 time_base_lo;
|
|
+ __le32 time_base_hi;
|
|
+ __le32 time_precision;
|
|
+
|
|
+ __le64 flags[8];
|
|
+ __le64 features[2];
|
|
+ __le64 compat[2];
|
|
+
|
|
+ struct bch_sb_layout layout;
|
|
+
|
|
+ union {
|
|
+ struct bch_sb_field start[0];
|
|
+ __le64 _data[0];
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+/*
|
|
+ * Flags:
|
|
+ * BCH_SB_INITIALIZED - set on first mount
|
|
+ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect
|
|
+ * behaviour of mount/recovery path:
|
|
+ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits
|
|
+ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80
|
|
+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides
|
|
+ * DATA/META_CSUM_TYPE. Also indicates encryption
|
|
+ * algorithm in use, if/when we get more than one
|
|
+ */
|
|
+
|
|
+LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1);
|
|
+LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2);
|
|
+LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8);
|
|
+LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33);
|
|
+LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44);
|
|
+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52);
|
|
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57);
|
|
+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58);
|
|
+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59);
|
|
+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61);
|
|
+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4);
|
|
+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8);
|
|
+LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10);
|
|
+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14);
|
|
+
|
|
+/*
|
|
+ * Max size of an extent that may require bouncing to read or write
|
|
+ * (checksummed, compressed): 64k
|
|
+ */
|
|
+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
|
|
+ struct bch_sb, flags[1], 14, 20);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
|
|
+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
|
|
+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
|
|
+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
|
|
+ struct bch_sb, flags[2], 0, 4);
|
|
+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
|
|
+
|
|
+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
|
|
+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
|
|
+
|
|
+/*
|
|
+ * Features:
|
|
+ *
|
|
+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist
|
|
+ * reflink: gates KEY_TYPE_reflink
|
|
+ * inline_data: gates KEY_TYPE_inline_data
|
|
+ * new_siphash: gates BCH_STR_HASH_SIPHASH
|
|
+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE
|
|
+ */
|
|
+#define BCH_SB_FEATURES() \
|
|
+ x(lz4, 0) \
|
|
+ x(gzip, 1) \
|
|
+ x(zstd, 2) \
|
|
+ x(atomic_nlink, 3) \
|
|
+ x(ec, 4) \
|
|
+ x(journal_seq_blacklist_v3, 5) \
|
|
+ x(reflink, 6) \
|
|
+ x(new_siphash, 7) \
|
|
+ x(inline_data, 8) \
|
|
+ x(new_extent_overwrite, 9) \
|
|
+ x(incompressible, 10) \
|
|
+ x(btree_ptr_v2, 11) \
|
|
+ x(extents_above_btree_updates, 12) \
|
|
+ x(btree_updates_journalled, 13) \
|
|
+ x(reflink_inline_data, 14) \
|
|
+ x(new_varint, 15) \
|
|
+ x(journal_no_flush, 16) \
|
|
+ x(alloc_v2, 17) \
|
|
+ x(extents_across_btree_nodes, 18)
|
|
+
|
|
+#define BCH_SB_FEATURES_ALWAYS \
|
|
+ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \
|
|
+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
|
|
+ (1ULL << BCH_FEATURE_btree_updates_journalled)|\
|
|
+ (1ULL << BCH_FEATURE_alloc_v2)|\
|
|
+ (1ULL << BCH_FEATURE_extents_across_btree_nodes))
|
|
+
|
|
+#define BCH_SB_FEATURES_ALL \
|
|
+ (BCH_SB_FEATURES_ALWAYS| \
|
|
+ (1ULL << BCH_FEATURE_new_siphash)| \
|
|
+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \
|
|
+ (1ULL << BCH_FEATURE_new_varint)| \
|
|
+ (1ULL << BCH_FEATURE_journal_no_flush))
|
|
+
|
|
+enum bch_sb_feature {
|
|
+#define x(f, n) BCH_FEATURE_##f,
|
|
+ BCH_SB_FEATURES()
|
|
+#undef x
|
|
+ BCH_FEATURE_NR,
|
|
+};
|
|
+
|
|
+#define BCH_SB_COMPAT() \
|
|
+ x(alloc_info, 0) \
|
|
+ x(alloc_metadata, 1) \
|
|
+ x(extents_above_btree_updates_done, 2) \
|
|
+ x(bformat_overflow_done, 3)
|
|
+
|
|
+enum bch_sb_compat {
|
|
+#define x(f, n) BCH_COMPAT_##f,
|
|
+ BCH_SB_COMPAT()
|
|
+#undef x
|
|
+ BCH_COMPAT_NR,
|
|
+};
|
|
+
|
|
+/* options: */
|
|
+
|
|
+#define BCH_REPLICAS_MAX 4U
|
|
+
|
|
+#define BCH_BKEY_PTRS_MAX 16U
|
|
+
|
|
+#define BCH_ERROR_ACTIONS() \
|
|
+ x(continue, 0) \
|
|
+ x(ro, 1) \
|
|
+ x(panic, 2)
|
|
+
|
|
+enum bch_error_actions {
|
|
+#define x(t, n) BCH_ON_ERROR_##t = n,
|
|
+ BCH_ERROR_ACTIONS()
|
|
+#undef x
|
|
+ BCH_ON_ERROR_NR
|
|
+};
|
|
+
|
|
+enum bch_str_hash_type {
|
|
+ BCH_STR_HASH_CRC32C = 0,
|
|
+ BCH_STR_HASH_CRC64 = 1,
|
|
+ BCH_STR_HASH_SIPHASH_OLD = 2,
|
|
+ BCH_STR_HASH_SIPHASH = 3,
|
|
+ BCH_STR_HASH_NR = 4,
|
|
+};
|
|
+
|
|
+#define BCH_STR_HASH_OPTS() \
|
|
+ x(crc32c, 0) \
|
|
+ x(crc64, 1) \
|
|
+ x(siphash, 2)
|
|
+
|
|
+enum bch_str_hash_opts {
|
|
+#define x(t, n) BCH_STR_HASH_OPT_##t = n,
|
|
+ BCH_STR_HASH_OPTS()
|
|
+#undef x
|
|
+ BCH_STR_HASH_OPT_NR
|
|
+};
|
|
+
|
|
+enum bch_csum_type {
|
|
+ BCH_CSUM_NONE = 0,
|
|
+ BCH_CSUM_CRC32C_NONZERO = 1,
|
|
+ BCH_CSUM_CRC64_NONZERO = 2,
|
|
+ BCH_CSUM_CHACHA20_POLY1305_80 = 3,
|
|
+ BCH_CSUM_CHACHA20_POLY1305_128 = 4,
|
|
+ BCH_CSUM_CRC32C = 5,
|
|
+ BCH_CSUM_CRC64 = 6,
|
|
+ BCH_CSUM_NR = 7,
|
|
+};
|
|
+
|
|
+static const unsigned bch_crc_bytes[] = {
|
|
+ [BCH_CSUM_NONE] = 0,
|
|
+ [BCH_CSUM_CRC32C_NONZERO] = 4,
|
|
+ [BCH_CSUM_CRC32C] = 4,
|
|
+ [BCH_CSUM_CRC64_NONZERO] = 8,
|
|
+ [BCH_CSUM_CRC64] = 8,
|
|
+ [BCH_CSUM_CHACHA20_POLY1305_80] = 10,
|
|
+ [BCH_CSUM_CHACHA20_POLY1305_128] = 16,
|
|
+};
|
|
+
|
|
+/*
+ * Report whether @type is one of the two ChaCha20/Poly1305 checksum types;
+ * every other value yields false.
+ */
+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
+{
+	return type == BCH_CSUM_CHACHA20_POLY1305_80 ||
+	       type == BCH_CSUM_CHACHA20_POLY1305_128;
+}
|
|
+
|
|
+#define BCH_CSUM_OPTS() \
|
|
+ x(none, 0) \
|
|
+ x(crc32c, 1) \
|
|
+ x(crc64, 2)
|
|
+
|
|
+enum bch_csum_opts {
|
|
+#define x(t, n) BCH_CSUM_OPT_##t = n,
|
|
+ BCH_CSUM_OPTS()
|
|
+#undef x
|
|
+ BCH_CSUM_OPT_NR
|
|
+};
|
|
+
|
|
+#define BCH_COMPRESSION_TYPES() \
|
|
+ x(none, 0) \
|
|
+ x(lz4_old, 1) \
|
|
+ x(gzip, 2) \
|
|
+ x(lz4, 3) \
|
|
+ x(zstd, 4) \
|
|
+ x(incompressible, 5)
|
|
+
|
|
+enum bch_compression_type {
|
|
+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n,
|
|
+ BCH_COMPRESSION_TYPES()
|
|
+#undef x
|
|
+ BCH_COMPRESSION_TYPE_NR
|
|
+};
|
|
+
|
|
+#define BCH_COMPRESSION_OPTS() \
|
|
+ x(none, 0) \
|
|
+ x(lz4, 1) \
|
|
+ x(gzip, 2) \
|
|
+ x(zstd, 3)
|
|
+
|
|
+enum bch_compression_opts {
|
|
+#define x(t, n) BCH_COMPRESSION_OPT_##t = n,
|
|
+ BCH_COMPRESSION_OPTS()
|
|
+#undef x
|
|
+ BCH_COMPRESSION_OPT_NR
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Magic numbers
|
|
+ *
|
|
+ * The various other data structures have their own magic numbers, which are
|
|
+ * xored with the first part of the cache set's UUID
|
|
+ */
|
|
+
|
|
+#define BCACHE_MAGIC \
|
|
+ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \
|
|
+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
|
|
+
|
|
+#define BCACHEFS_STATFS_MAGIC 0xca451a4e
|
|
+
|
|
+#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL)
|
|
+#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL)
|
|
+
|
|
+/*
+ * Copy the leading sizeof(__le64) bytes of the superblock's uuid field out as
+ * an __le64; this is the value the per-structure magic numbers are XORed with.
+ */
+static inline __le64 __bch2_sb_magic(struct bch_sb *sb)
+{
+	__le64 uuid_prefix;
+
+	memcpy(&uuid_prefix, &sb->uuid, sizeof(uuid_prefix));
+
+	return uuid_prefix;
+}
|
|
+
|
|
+/* Journal set magic: superblock UUID prefix XOR JSET_MAGIC, in CPU byte order. */
+static inline __u64 __jset_magic(struct bch_sb *sb)
+{
+	__le64 magic = __bch2_sb_magic(sb) ^ JSET_MAGIC;
+
+	return __le64_to_cpu(magic);
+}
|
|
+
|
|
+/* Bset magic: superblock UUID prefix XOR BSET_MAGIC, in CPU byte order. */
+static inline __u64 __bset_magic(struct bch_sb *sb)
+{
+	__le64 magic = __bch2_sb_magic(sb) ^ BSET_MAGIC;
+
+	return __le64_to_cpu(magic);
+}
|
|
+
|
|
+/* Journal */
|
|
+
|
|
+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
|
|
+
|
|
+#define BCH_JSET_ENTRY_TYPES() \
|
|
+ x(btree_keys, 0) \
|
|
+ x(btree_root, 1) \
|
|
+ x(prio_ptrs, 2) \
|
|
+ x(blacklist, 3) \
|
|
+ x(blacklist_v2, 4) \
|
|
+ x(usage, 5) \
|
|
+ x(data_usage, 6) \
|
|
+ x(clock, 7) \
|
|
+ x(dev_usage, 8)
|
|
+
|
|
+enum {
|
|
+#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
|
|
+ BCH_JSET_ENTRY_TYPES()
|
|
+#undef x
|
|
+ BCH_JSET_ENTRY_NR
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Journal sequence numbers can be blacklisted: bsets record the max sequence
|
|
+ * number of all the journal entries they contain updates for, so that on
|
|
+ * recovery we can ignore those bsets that contain index updates newer than what
|
|
+ * made it into the journal.
|
|
+ *
|
|
+ * This means that we can't reuse that journal_seq - we have to skip it, and
|
|
+ * then record that we skipped it so that the next time we crash and recover we
|
|
+ * don't think there was a missing journal entry.
|
|
+ */
|
|
+struct jset_entry_blacklist {
|
|
+ struct jset_entry entry;
|
|
+ __le64 seq;
|
|
+};
|
|
+
|
|
+struct jset_entry_blacklist_v2 {
|
|
+ struct jset_entry entry;
|
|
+ __le64 start;
|
|
+ __le64 end;
|
|
+};
|
|
+
|
|
+enum {
|
|
+ FS_USAGE_RESERVED = 0,
|
|
+ FS_USAGE_INODES = 1,
|
|
+ FS_USAGE_KEY_VERSION = 2,
|
|
+ FS_USAGE_NR = 3
|
|
+};
|
|
+
|
|
+struct jset_entry_usage {
|
|
+ struct jset_entry entry;
|
|
+ __le64 v;
|
|
+} __attribute__((packed));
|
|
+
|
|
+struct jset_entry_data_usage {
|
|
+ struct jset_entry entry;
|
|
+ __le64 v;
|
|
+ struct bch_replicas_entry r;
|
|
+} __attribute__((packed));
|
|
+
|
|
+struct jset_entry_clock {
|
|
+ struct jset_entry entry;
|
|
+ __u8 rw;
|
|
+ __u8 pad[7];
|
|
+ __le64 time;
|
|
+} __attribute__((packed));
|
|
+
|
|
+struct jset_entry_dev_usage_type {
|
|
+ __le64 buckets;
|
|
+ __le64 sectors;
|
|
+ __le64 fragmented;
|
|
+} __attribute__((packed));
|
|
+
|
|
+struct jset_entry_dev_usage {
|
|
+ struct jset_entry entry;
|
|
+ __le32 dev;
|
|
+ __u32 pad;
|
|
+
|
|
+ __le64 buckets_ec;
|
|
+ __le64 buckets_unavailable;
|
|
+
|
|
+ struct jset_entry_dev_usage_type d[];
|
|
+} __attribute__((packed));
|
|
+
|
|
+/*
|
|
+ * On disk format for a journal entry:
|
|
+ * seq is monotonically increasing; every journal entry has its own unique
|
|
+ * sequence number.
|
|
+ *
|
|
+ * last_seq is the oldest journal entry that still has keys the btree hasn't
|
|
+ * flushed to disk yet.
|
|
+ *
|
|
+ * version is for on disk format changes.
|
|
+ */
|
|
+struct jset {
|
|
+ struct bch_csum csum;
|
|
+
|
|
+ __le64 magic;
|
|
+ __le64 seq;
|
|
+ __le32 version;
|
|
+ __le32 flags;
|
|
+
|
|
+ __le32 u64s; /* size of d[] in u64s */
|
|
+
|
|
+ __u8 encrypted_start[0];
|
|
+
|
|
+ __le16 _read_clock; /* no longer used */
|
|
+ __le16 _write_clock;
|
|
+
|
|
+ /* Sequence number of oldest dirty journal entry */
|
|
+ __le64 last_seq;
|
|
+
|
|
+
|
|
+ union {
|
|
+ struct jset_entry start[0];
|
|
+ __u64 _data[0];
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4);
|
|
+LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5);
|
|
+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6);
|
|
+
|
|
+#define BCH_JOURNAL_BUCKETS_MIN 8
|
|
+
|
|
+/* Btree: */
|
|
+
|
|
+#define BCH_BTREE_IDS() \
|
|
+ x(extents, 0) \
|
|
+ x(inodes, 1) \
|
|
+ x(dirents, 2) \
|
|
+ x(xattrs, 3) \
|
|
+ x(alloc, 4) \
|
|
+ x(quotas, 5) \
|
|
+ x(stripes, 6) \
|
|
+ x(reflink, 7)
|
|
+
|
|
+enum btree_id {
|
|
+#define x(kwd, val) BTREE_ID_##kwd = val,
|
|
+ BCH_BTREE_IDS()
|
|
+#undef x
|
|
+ BTREE_ID_NR
|
|
+};
|
|
+
|
|
+#define BTREE_MAX_DEPTH 4U
|
|
+
|
|
+/* Btree nodes */
|
|
+
|
|
+/*
|
|
+ * Btree nodes
|
|
+ *
|
|
+ * On disk a btree node is a list/log of these; within each set the keys are
|
|
+ * sorted
|
|
+ */
|
|
+struct bset {
|
|
+ __le64 seq;
|
|
+
|
|
+ /*
|
|
+ * Highest journal entry this bset contains keys for.
|
|
+ * If on recovery we don't see that journal entry, this bset is ignored:
|
|
+ * this allows us to preserve the order of all index updates after a
|
|
+ * crash, since the journal records a total order of all index updates
|
|
+ * and anything that didn't make it to the journal doesn't get used.
|
|
+ */
|
|
+ __le64 journal_seq;
|
|
+
|
|
+ __le32 flags;
|
|
+ __le16 version;
|
|
+ __le16 u64s; /* count of d[] in u64s */
|
|
+
|
|
+ union {
|
|
+ struct bkey_packed start[0];
|
|
+ __u64 _data[0];
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4);
|
|
+
|
|
+LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5);
|
|
+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS,
|
|
+ struct bset, flags, 5, 6);
|
|
+
|
|
+struct btree_node {
|
|
+ struct bch_csum csum;
|
|
+ __le64 magic;
|
|
+
|
|
+ /* this flags field is encrypted, unlike bset->flags: */
|
|
+ __le64 flags;
|
|
+
|
|
+ /* Closed interval: */
|
|
+ struct bpos min_key;
|
|
+ struct bpos max_key;
|
|
+ struct bch_extent_ptr _ptr; /* not used anymore */
|
|
+ struct bkey_format format;
|
|
+
|
|
+ union {
|
|
+ struct bset keys;
|
|
+ struct {
|
|
+ __u8 pad[22];
|
|
+ __le16 u64s;
|
|
+ __u64 _data[0];
|
|
+
|
|
+ };
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
|
|
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
|
|
+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE,
|
|
+ struct btree_node, flags, 8, 9);
|
|
+/* 9-32 unused */
|
|
+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
|
|
+
|
|
+struct btree_node_entry {
|
|
+ struct bch_csum csum;
|
|
+
|
|
+ union {
|
|
+ struct bset keys;
|
|
+ struct {
|
|
+ __u8 pad[22];
|
|
+ __le16 u64s;
|
|
+ __u64 _data[0];
|
|
+
|
|
+ };
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+#endif /* _BCACHEFS_FORMAT_H */
|
|
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
|
|
new file mode 100644
|
|
index 000000000000..f679fc2151bc
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bcachefs_ioctl.h
|
|
@@ -0,0 +1,352 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_IOCTL_H
|
|
+#define _BCACHEFS_IOCTL_H
|
|
+
|
|
+#include <linux/uuid.h>
|
|
+#include <asm/ioctl.h>
|
|
+#include "bcachefs_format.h"
|
|
+
|
|
+/*
|
|
+ * Flags common to multiple ioctls:
|
|
+ */
|
|
+#define BCH_FORCE_IF_DATA_LOST (1 << 0)
|
|
+#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
|
|
+#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
|
|
+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
|
|
+
|
|
+#define BCH_FORCE_IF_LOST \
|
|
+ (BCH_FORCE_IF_DATA_LOST| \
|
|
+ BCH_FORCE_IF_METADATA_LOST)
|
|
+#define BCH_FORCE_IF_DEGRADED \
|
|
+ (BCH_FORCE_IF_DATA_DEGRADED| \
|
|
+ BCH_FORCE_IF_METADATA_DEGRADED)
|
|
+
|
|
+/*
|
|
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
|
|
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
|
|
+ * filesystem:
|
|
+ */
|
|
+#define BCH_BY_INDEX (1 << 4)
|
|
+
|
|
+/*
|
|
+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
|
|
+ * wide superblock:
|
|
+ */
|
|
+#define BCH_READ_DEV (1 << 5)
|
|
+
|
|
+/* global control dev: */
|
|
+
|
|
+/* These are currently broken, and probably unnecessary: */
|
|
+#if 0
|
|
+#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
|
|
+#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
|
|
+
|
|
+struct bch_ioctl_assemble {
|
|
+ __u32 flags;
|
|
+ __u32 nr_devs;
|
|
+ __u64 pad;
|
|
+ __u64 devs[];
|
|
+};
|
|
+
|
|
+struct bch_ioctl_incremental {
|
|
+ __u32 flags;
|
|
+ __u64 pad;
|
|
+ __u64 dev;
|
|
+};
|
|
+#endif
|
|
+
|
|
+/* filesystem ioctls: */
|
|
+
|
|
+#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
|
|
+
|
|
+/* These only make sense when we also have incremental assembly */
|
|
+#if 0
|
|
+#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
|
|
+#define BCH_IOCTL_STOP _IO(0xbc, 3)
|
|
+#endif
|
|
+
|
|
+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
|
|
+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
|
|
+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
|
|
+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
|
|
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
|
|
+#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
|
|
+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage)
|
|
+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage)
|
|
+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
|
|
+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
|
|
+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize)
|
|
+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal)
|
|
+
|
|
+/* ioctls below act on a particular file, not the filesystem as a whole: */
|
|
+
|
|
+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
|
|
+ *
|
|
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
|
|
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
|
|
+ * this UUID.
|
|
+ */
|
|
+struct bch_ioctl_query_uuid {
|
|
+ uuid_le uuid;
|
|
+};
|
|
+
|
|
+#if 0
|
|
+struct bch_ioctl_start {
|
|
+ __u32 flags;
|
|
+ __u32 pad;
|
|
+};
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
|
|
+ *
|
|
+ * The specified device must not be open or in use. On success, the new device
|
|
+ * will be an online member of the filesystem just like any other member.
|
|
+ *
|
|
+ * The device must first be prepared by userspace by formatting with a bcachefs
|
|
+ * superblock, which is only used for passing in superblock options/parameters
|
|
+ * for that device (in struct bch_member). The new device's superblock should
|
|
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
|
|
+ * ignored.
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
|
|
+ *
|
|
+ * Any data present on @dev will be permanently deleted, and @dev will be
|
|
+ * removed from its slot in the filesystem's list of member devices. The device
|
|
+ * may be either online or offline.
|
|
+ *
|
|
+ * Will fail if removing @dev would leave us with insufficient read write devices
|
|
+ * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
|
|
+ * set.
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
|
|
+ * but is not open (e.g. because we started in degraded mode), bring it online
|
|
+ *
|
|
+ * all existing data on @dev will be available once the device is online,
|
|
+ * exactly as if @dev was present when the filesystem was first mounted
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
|
|
+ * block device, without removing it from the filesystem (so it can be brought
|
|
+ * back online later)
|
|
+ *
|
|
+ * Data present on @dev will be unavailable while @dev is offline (unless
|
|
+ * replicated), but will still be intact and untouched if @dev is brought back
|
|
+ * online
|
|
+ *
|
|
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
|
|
+ * leave us with insufficient read write devices or degraded/unavailable data,
|
|
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
|
|
+ */
|
|
+
|
|
+struct bch_ioctl_disk {
|
|
+ __u32 flags;
|
|
+ __u32 pad;
|
|
+ __u64 dev;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
|
|
+ *
|
|
+ * @new_state - one of the bch_member_state states (rw, ro, failed,
|
|
+ * spare)
|
|
+ *
|
|
+ * Will refuse to change member state if we would then have insufficient devices
|
|
+ * to write to, or if it would result in degraded data (when @new_state is
|
|
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
|
|
+ */
|
|
+struct bch_ioctl_disk_set_state {
|
|
+ __u32 flags;
|
|
+ __u8 new_state;
|
|
+ __u8 pad[3];
|
|
+ __u64 dev;
|
|
+};
|
|
+
|
|
+enum bch_data_ops {
|
|
+ BCH_DATA_OP_SCRUB = 0,
|
|
+ BCH_DATA_OP_REREPLICATE = 1,
|
|
+ BCH_DATA_OP_MIGRATE = 2,
|
|
+ BCH_DATA_OP_REWRITE_OLD_NODES = 3,
|
|
+ BCH_DATA_OP_NR = 4,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
|
|
+ * scrub, rereplicate, migrate).
|
|
+ *
|
|
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
|
|
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
|
|
+ * indicating current progress, and closing the file descriptor will stop the
|
|
+ * job. The file descriptor is O_CLOEXEC.
|
|
+ */
|
|
+struct bch_ioctl_data {
|
|
+ __u16 op;
|
|
+ __u8 start_btree;
|
|
+ __u8 end_btree;
|
|
+ __u32 flags;
|
|
+
|
|
+ struct bpos start_pos;
|
|
+ struct bpos end_pos;
|
|
+
|
|
+ union {
|
|
+ struct {
|
|
+ __u32 dev;
|
|
+ __u32 pad;
|
|
+ } migrate;
|
|
+ struct {
|
|
+ __u64 pad[8];
|
|
+ };
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+enum bch_data_event {
|
|
+ BCH_DATA_EVENT_PROGRESS = 0,
|
|
+ /* XXX: add an event for reporting errors */
|
|
+ BCH_DATA_EVENT_NR = 1,
|
|
+};
|
|
+
|
|
+struct bch_ioctl_data_progress {
|
|
+ __u8 data_type;
|
|
+ __u8 btree_id;
|
|
+ __u8 pad[2];
|
|
+ struct bpos pos;
|
|
+
|
|
+ __u64 sectors_done;
|
|
+ __u64 sectors_total;
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bch_ioctl_data_event {
|
|
+ __u8 type;
|
|
+ __u8 pad[7];
|
|
+ union {
|
|
+ struct bch_ioctl_data_progress p;
|
|
+ __u64 pad2[15];
|
|
+ };
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+struct bch_replicas_usage {
|
|
+ __u64 sectors;
|
|
+ struct bch_replicas_entry r;
|
|
+} __attribute__((packed));
|
|
+
|
|
+/*
+ * Advance from one variable-length usage record to the one that follows it:
+ * step over the leading sectors counter, then over the replicas entry
+ * (its fixed header plus nr_devs bytes of device indices).
+ */
+static inline struct bch_replicas_usage *
+replicas_usage_next(struct bch_replicas_usage *u)
+{
+	return (void *) u + sizeof(u->sectors) + replicas_entry_bytes(&u->r);
+}
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage
|
|
+ *
|
|
+ * Returns disk space usage broken out by data type, number of replicas, and
|
|
+ * by component device
|
|
+ *
|
|
+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries
|
|
+ *
|
|
+ * On success, @replica_entries_bytes will be changed to indicate the number of
|
|
+ * bytes actually used.
|
|
+ *
|
|
+ * Returns -ERANGE if @replica_entries_bytes was too small
|
|
+ */
|
|
+struct bch_ioctl_fs_usage {
|
|
+ __u64 capacity;
|
|
+ __u64 used;
|
|
+ __u64 online_reserved;
|
|
+ __u64 persistent_reserved[BCH_REPLICAS_MAX];
|
|
+
|
|
+ __u32 replica_entries_bytes;
|
|
+ __u32 pad;
|
|
+
|
|
+ struct bch_replicas_usage replicas[0];
|
|
+};
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DEV_USAGE: query device disk space usage
|
|
+ *
|
|
+ * Returns disk space usage broken out by data type - both by buckets and
|
|
+ * sectors.
|
|
+ */
|
|
+struct bch_ioctl_dev_usage {
|
|
+ __u64 dev;
|
|
+ __u32 flags;
|
|
+ __u8 state;
|
|
+ __u8 pad[7];
|
|
+
|
|
+ __u32 bucket_size;
|
|
+ __u64 nr_buckets;
|
|
+ __u64 available_buckets;
|
|
+
|
|
+ __u64 buckets[BCH_DATA_NR];
|
|
+ __u64 sectors[BCH_DATA_NR];
|
|
+
|
|
+ __u64 ec_buckets;
|
|
+ __u64 ec_sectors;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
|
|
+ *
|
|
+ * Equivalent to reading the superblock directly from the block device, except
|
|
+ * avoids racing with the kernel writing the superblock or having to figure out
|
|
+ * which block device to read
|
|
+ *
|
|
+ * @sb - buffer to read into
|
|
+ * @size - size of userspace allocated buffer
|
|
+ * @dev - device to read superblock for, if BCH_READ_DEV flag is
|
|
+ * specified
|
|
+ *
|
|
+ * Returns -ERANGE if buffer provided is too small
|
|
+ */
|
|
+struct bch_ioctl_read_super {
|
|
+ __u32 flags;
|
|
+ __u32 pad;
|
|
+ __u64 dev;
|
|
+ __u64 size;
|
|
+ __u64 sb;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_GET_IDX: given a path to a block device, query filesystem to
|
|
+ * determine if disk is a (online) member - if so, returns device's index
|
|
+ *
|
|
+ * Returns -ENOENT if not found
|
|
+ */
|
|
+struct bch_ioctl_disk_get_idx {
|
|
+ __u64 dev;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
|
|
+ *
|
|
+ * @dev - member to resize
|
|
+ * @nbuckets - new number of buckets
|
|
+ */
|
|
+struct bch_ioctl_disk_resize {
|
|
+ __u32 flags;
|
|
+ __u32 pad;
|
|
+ __u64 dev;
|
|
+ __u64 nbuckets;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device
|
|
+ *
|
|
+ * @dev - member to resize
|
|
+ * @nbuckets - new number of buckets
|
|
+ */
|
|
+struct bch_ioctl_disk_resize_journal {
|
|
+ __u32 flags;
|
|
+ __u32 pad;
|
|
+ __u64 dev;
|
|
+ __u64 nbuckets;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_IOCTL_H */
|
|
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
|
|
new file mode 100644
|
|
index 000000000000..3af56062601f
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bkey.c
|
|
@@ -0,0 +1,1164 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "bset.h"
|
|
+#include "util.h"
|
|
+
|
|
+#undef EBUG_ON
|
|
+
|
|
+#ifdef DEBUG_BKEYS
|
|
+#define EBUG_ON(cond) BUG_ON(cond)
|
|
+#else
|
|
+#define EBUG_ON(cond)
|
|
+#endif
|
|
+
|
|
+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT;
|
|
+
|
|
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
|
|
+ const struct bkey_packed *);
|
|
+
|
|
+void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits)
|
|
+{
|
|
+ unsigned bit = high_bit_offset, done = 0;
|
|
+
|
|
+ while (1) {
|
|
+ while (bit < 64) {
|
|
+ if (done && !(done % 8))
|
|
+ *out++ = ' ';
|
|
+ *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0';
|
|
+ bit++;
|
|
+ done++;
|
|
+ if (done == nr_bits) {
|
|
+ *out++ = '\0';
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ p = next_word(p);
|
|
+ bit = 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+
|
|
+static void bch2_bkey_pack_verify(const struct bkey_packed *packed,
|
|
+ const struct bkey *unpacked,
|
|
+ const struct bkey_format *format)
|
|
+{
|
|
+ struct bkey tmp;
|
|
+
|
|
+ BUG_ON(bkeyp_val_u64s(format, packed) !=
|
|
+ bkey_val_u64s(unpacked));
|
|
+
|
|
+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed));
|
|
+
|
|
+ tmp = __bch2_bkey_unpack_key(format, packed);
|
|
+
|
|
+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) {
|
|
+ char buf1[160], buf2[160];
|
|
+ char buf3[160], buf4[160];
|
|
+
|
|
+ bch2_bkey_to_text(&PBUF(buf1), unpacked);
|
|
+ bch2_bkey_to_text(&PBUF(buf2), &tmp);
|
|
+ bch2_to_binary(buf3, (void *) unpacked, 80);
|
|
+ bch2_to_binary(buf4, high_word(format, packed), 80);
|
|
+
|
|
+ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n",
|
|
+ format->key_u64s,
|
|
+ format->bits_per_field[0],
|
|
+ format->bits_per_field[1],
|
|
+ format->bits_per_field[2],
|
|
+ format->bits_per_field[3],
|
|
+ format->bits_per_field[4],
|
|
+ buf1, buf2, buf3, buf4);
|
|
+ }
|
|
+}
|
|
+
|
|
+#else
|
|
+static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed,
|
|
+ const struct bkey *unpacked,
|
|
+ const struct bkey_format *format) {}
|
|
+#endif
|
|
+
|
|
+struct pack_state {
|
|
+ const struct bkey_format *format;
|
|
+ unsigned bits; /* bits remaining in current word */
|
|
+ u64 w; /* current word */
|
|
+ u64 *p; /* pointer to next word */
|
|
+};
|
|
+
|
|
+__always_inline
|
|
+static struct pack_state pack_state_init(const struct bkey_format *format,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ u64 *p = high_word(format, k);
|
|
+
|
|
+ return (struct pack_state) {
|
|
+ .format = format,
|
|
+ .bits = 64 - high_bit_offset,
|
|
+ .w = 0,
|
|
+ .p = p,
|
|
+ };
|
|
+}
|
|
+
|
|
+__always_inline
|
|
+static void pack_state_finish(struct pack_state *state,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ EBUG_ON(state->p < k->_data);
|
|
+ EBUG_ON(state->p >= k->_data + state->format->key_u64s);
|
|
+
|
|
+ *state->p = state->w;
|
|
+}
|
|
+
|
|
+struct unpack_state {
|
|
+ const struct bkey_format *format;
|
|
+ unsigned bits; /* bits remaining in current word */
|
|
+ u64 w; /* current word */
|
|
+ const u64 *p; /* pointer to next word */
|
|
+};
|
|
+
|
|
+__always_inline
|
|
+static struct unpack_state unpack_state_init(const struct bkey_format *format,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ const u64 *p = high_word(format, k);
|
|
+
|
|
+ return (struct unpack_state) {
|
|
+ .format = format,
|
|
+ .bits = 64 - high_bit_offset,
|
|
+ .w = *p << high_bit_offset,
|
|
+ .p = p,
|
|
+ };
|
|
+}
|
|
+
|
|
+__always_inline
|
|
+static u64 get_inc_field(struct unpack_state *state, unsigned field)
|
|
+{
|
|
+ unsigned bits = state->format->bits_per_field[field];
|
|
+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]);
|
|
+
|
|
+ if (bits >= state->bits) {
|
|
+ v = state->w >> (64 - bits);
|
|
+ bits -= state->bits;
|
|
+
|
|
+ state->p = next_word(state->p);
|
|
+ state->w = *state->p;
|
|
+ state->bits = 64;
|
|
+ }
|
|
+
|
|
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
|
|
+ v |= (state->w >> 1) >> (63 - bits);
|
|
+ state->w <<= bits;
|
|
+ state->bits -= bits;
|
|
+
|
|
+ return v + offset;
|
|
+}
|
|
+
|
|
+__always_inline
|
|
+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v)
|
|
+{
|
|
+ unsigned bits = state->format->bits_per_field[field];
|
|
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
|
|
+
|
|
+ if (v < offset)
|
|
+ return false;
|
|
+
|
|
+ v -= offset;
|
|
+
|
|
+ if (fls64(v) > bits)
|
|
+ return false;
|
|
+
|
|
+ if (bits > state->bits) {
|
|
+ bits -= state->bits;
|
|
+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */
|
|
+ state->w |= (v >> 1) >> (bits - 1);
|
|
+
|
|
+ *state->p = state->w;
|
|
+ state->p = next_word(state->p);
|
|
+ state->w = 0;
|
|
+ state->bits = 64;
|
|
+ }
|
|
+
|
|
+ state->bits -= bits;
|
|
+ state->w |= v << state->bits;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Note: does NOT set out->format (we don't know what it should be here!)
|
|
+ *
|
|
+ * Also: doesn't work on extents - it doesn't preserve the invariant that
|
|
+ * if k is packed bkey_start_pos(k) will successfully pack
|
|
+ */
|
|
+static bool bch2_bkey_transform_key(const struct bkey_format *out_f,
|
|
+ struct bkey_packed *out,
|
|
+ const struct bkey_format *in_f,
|
|
+ const struct bkey_packed *in)
|
|
+{
|
|
+ struct pack_state out_s = pack_state_init(out_f, out);
|
|
+ struct unpack_state in_s = unpack_state_init(in_f, in);
|
|
+ unsigned i;
|
|
+
|
|
+ out->_data[0] = 0;
|
|
+
|
|
+ for (i = 0; i < BKEY_NR_FIELDS; i++)
|
|
+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i)))
|
|
+ return false;
|
|
+
|
|
+ /* Can't happen because the val would be too big to unpack: */
|
|
+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX);
|
|
+
|
|
+ pack_state_finish(&out_s, out);
|
|
+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s;
|
|
+ out->needs_whiteout = in->needs_whiteout;
|
|
+ out->type = in->type;
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+bool bch2_bkey_transform(const struct bkey_format *out_f,
|
|
+ struct bkey_packed *out,
|
|
+ const struct bkey_format *in_f,
|
|
+ const struct bkey_packed *in)
|
|
+{
|
|
+ if (!bch2_bkey_transform_key(out_f, out, in_f, in))
|
|
+ return false;
|
|
+
|
|
+ memcpy_u64s((u64 *) out + out_f->key_u64s,
|
|
+ (u64 *) in + in_f->key_u64s,
|
|
+ (in->u64s - in_f->key_u64s));
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#define bkey_fields() \
|
|
+ x(BKEY_FIELD_INODE, p.inode) \
|
|
+ x(BKEY_FIELD_OFFSET, p.offset) \
|
|
+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \
|
|
+ x(BKEY_FIELD_SIZE, size) \
|
|
+ x(BKEY_FIELD_VERSION_HI, version.hi) \
|
|
+ x(BKEY_FIELD_VERSION_LO, version.lo)
|
|
+
|
|
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format,
|
|
+ const struct bkey_packed *in)
|
|
+{
|
|
+ struct unpack_state state = unpack_state_init(format, in);
|
|
+ struct bkey out;
|
|
+
|
|
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
|
|
+ EBUG_ON(in->u64s < format->key_u64s);
|
|
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
|
|
+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX);
|
|
+
|
|
+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s;
|
|
+ out.format = KEY_FORMAT_CURRENT;
|
|
+ out.needs_whiteout = in->needs_whiteout;
|
|
+ out.type = in->type;
|
|
+ out.pad[0] = 0;
|
|
+
|
|
+#define x(id, field) out.field = get_inc_field(&state, id);
|
|
+ bkey_fields()
|
|
+#undef x
|
|
+
|
|
+ return out;
|
|
+}
|
|
+
|
|
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
|
|
+struct bpos __bkey_unpack_pos(const struct bkey_format *format,
|
|
+ const struct bkey_packed *in)
|
|
+{
|
|
+ struct unpack_state state = unpack_state_init(format, in);
|
|
+ struct bpos out;
|
|
+
|
|
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
|
|
+ EBUG_ON(in->u64s < format->key_u64s);
|
|
+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE);
|
|
+
|
|
+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE);
|
|
+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET);
|
|
+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT);
|
|
+
|
|
+ return out;
|
|
+}
|
|
+#endif
|
|
+
|
|
+/**
|
|
+ * bch2_bkey_pack_key -- pack just the key, not the value
|
|
+ */
|
|
+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in,
|
|
+ const struct bkey_format *format)
|
|
+{
|
|
+ struct pack_state state = pack_state_init(format, out);
|
|
+
|
|
+ EBUG_ON((void *) in == (void *) out);
|
|
+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS);
|
|
+ EBUG_ON(in->format != KEY_FORMAT_CURRENT);
|
|
+
|
|
+ out->_data[0] = 0;
|
|
+
|
|
+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false;
|
|
+ bkey_fields()
|
|
+#undef x
|
|
+
|
|
+ /*
|
|
+ * Extents - we have to guarantee that if an extent is packed, a trimmed
|
|
+ * version will also pack:
|
|
+ */
|
|
+ if (bkey_start_offset(in) <
|
|
+ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET]))
|
|
+ return false;
|
|
+
|
|
+ pack_state_finish(&state, out);
|
|
+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s;
|
|
+ out->format = KEY_FORMAT_LOCAL_BTREE;
|
|
+ out->needs_whiteout = in->needs_whiteout;
|
|
+ out->type = in->type;
|
|
+
|
|
+ bch2_bkey_pack_verify(out, in, format);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_bkey_unpack -- unpack the key and the value
|
|
+ */
|
|
+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst,
|
|
+ const struct bkey_packed *src)
|
|
+{
|
|
+ __bkey_unpack_key(b, &dst->k, src);
|
|
+
|
|
+ memcpy_u64s(&dst->v,
|
|
+ bkeyp_val(&b->format, src),
|
|
+ bkeyp_val_u64s(&b->format, src));
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_bkey_pack -- pack the key and the value
|
|
+ */
|
|
+bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in,
|
|
+ const struct bkey_format *format)
|
|
+{
|
|
+ struct bkey_packed tmp;
|
|
+
|
|
+ if (!bch2_bkey_pack_key(&tmp, &in->k, format))
|
|
+ return false;
|
|
+
|
|
+ memmove_u64s((u64 *) out + format->key_u64s,
|
|
+ &in->v,
|
|
+ bkey_val_u64s(&in->k));
|
|
+ memcpy_u64s(out, &tmp, format->key_u64s);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+__always_inline
|
|
+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v)
|
|
+{
|
|
+ unsigned bits = state->format->bits_per_field[field];
|
|
+ u64 offset = le64_to_cpu(state->format->field_offset[field]);
|
|
+ bool ret = true;
|
|
+
|
|
+ EBUG_ON(v < offset);
|
|
+ v -= offset;
|
|
+
|
|
+ if (fls64(v) > bits) {
|
|
+ v = ~(~0ULL << bits);
|
|
+ ret = false;
|
|
+ }
|
|
+
|
|
+ if (bits > state->bits) {
|
|
+ bits -= state->bits;
|
|
+ state->w |= (v >> 1) >> (bits - 1);
|
|
+
|
|
+ *state->p = state->w;
|
|
+ state->p = next_word(state->p);
|
|
+ state->w = 0;
|
|
+ state->bits = 64;
|
|
+ }
|
|
+
|
|
+ state->bits -= bits;
|
|
+ state->w |= v << state->bits;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+static bool bkey_packed_successor(struct bkey_packed *out,
|
|
+ const struct btree *b,
|
|
+ struct bkey_packed k)
|
|
+{
|
|
+ const struct bkey_format *f = &b->format;
|
|
+ unsigned nr_key_bits = b->nr_key_bits;
|
|
+ unsigned first_bit, offset;
|
|
+ u64 *p;
|
|
+
|
|
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
|
|
+
|
|
+ if (!nr_key_bits)
|
|
+ return false;
|
|
+
|
|
+ *out = k;
|
|
+
|
|
+ first_bit = high_bit_offset + nr_key_bits - 1;
|
|
+ p = nth_word(high_word(f, out), first_bit >> 6);
|
|
+ offset = 63 - (first_bit & 63);
|
|
+
|
|
+ while (nr_key_bits) {
|
|
+ unsigned bits = min(64 - offset, nr_key_bits);
|
|
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
|
|
+
|
|
+ if ((*p & mask) != mask) {
|
|
+ *p += 1ULL << offset;
|
|
+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ *p &= ~mask;
|
|
+ p = prev_word(p);
|
|
+ nr_key_bits -= bits;
|
|
+ offset = 0;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * Returns a packed key that compares <= in
|
|
+ *
|
|
+ * This is used in bset_search_tree(), where we need a packed pos in order to be
|
|
+ * able to compare against the keys in the auxiliary search tree - and it's
|
|
+ * legal to use a packed pos that isn't equivalent to the original pos,
|
|
+ * _provided_ it compares <= to the original pos.
|
|
+ */
|
|
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out,
|
|
+ struct bpos in,
|
|
+ const struct btree *b)
|
|
+{
|
|
+ const struct bkey_format *f = &b->format;
|
|
+ struct pack_state state = pack_state_init(f, out);
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ struct bpos orig = in;
|
|
+#endif
|
|
+ bool exact = true;
|
|
+
|
|
+ out->_data[0] = 0;
|
|
+
|
|
+ if (unlikely(in.snapshot <
|
|
+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) {
|
|
+ if (!in.offset-- &&
|
|
+ !in.inode--)
|
|
+ return BKEY_PACK_POS_FAIL;
|
|
+ in.snapshot = KEY_SNAPSHOT_MAX;
|
|
+ exact = false;
|
|
+ }
|
|
+
|
|
+ if (unlikely(in.offset <
|
|
+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) {
|
|
+ if (!in.inode--)
|
|
+ return BKEY_PACK_POS_FAIL;
|
|
+ in.offset = KEY_OFFSET_MAX;
|
|
+ in.snapshot = KEY_SNAPSHOT_MAX;
|
|
+ exact = false;
|
|
+ }
|
|
+
|
|
+ if (unlikely(in.inode <
|
|
+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE])))
|
|
+ return BKEY_PACK_POS_FAIL;
|
|
+
|
|
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) {
|
|
+ in.offset = KEY_OFFSET_MAX;
|
|
+ in.snapshot = KEY_SNAPSHOT_MAX;
|
|
+ exact = false;
|
|
+ }
|
|
+
|
|
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) {
|
|
+ in.snapshot = KEY_SNAPSHOT_MAX;
|
|
+ exact = false;
|
|
+ }
|
|
+
|
|
+ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))
|
|
+ exact = false;
|
|
+
|
|
+ pack_state_finish(&state, out);
|
|
+ out->u64s = f->key_u64s;
|
|
+ out->format = KEY_FORMAT_LOCAL_BTREE;
|
|
+ out->type = KEY_TYPE_deleted;
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ if (exact) {
|
|
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig));
|
|
+ } else {
|
|
+ struct bkey_packed successor;
|
|
+
|
|
+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0);
|
|
+ BUG_ON(bkey_packed_successor(&successor, b, *out) &&
|
|
+ bkey_cmp_left_packed(b, &successor, &orig) < 0);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER;
|
|
+}
|
|
+
|
|
+void bch2_bkey_format_init(struct bkey_format_state *s)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++)
|
|
+ s->field_min[i] = U64_MAX;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++)
|
|
+ s->field_max[i] = 0;
|
|
+
|
|
+ /* Make sure we can store a size of 0: */
|
|
+ s->field_min[BKEY_FIELD_SIZE] = 0;
|
|
+}
|
|
+
|
|
+static void __bkey_format_add(struct bkey_format_state *s,
|
|
+ unsigned field, u64 v)
|
|
+{
|
|
+ s->field_min[field] = min(s->field_min[field], v);
|
|
+ s->field_max[field] = max(s->field_max[field], v);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Updates format state @s so that @k can be packed with the resulting format
|
|
+ */
|
|
+void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k)
|
|
+{
|
|
+#define x(id, field) __bkey_format_add(s, id, k->field);
|
|
+ bkey_fields()
|
|
+#undef x
|
|
+ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k));
|
|
+}
|
|
+
|
|
+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p)
|
|
+{
|
|
+ unsigned field = 0;
|
|
+
|
|
+ __bkey_format_add(s, field++, p.inode);
|
|
+ __bkey_format_add(s, field++, p.offset);
|
|
+ __bkey_format_add(s, field++, p.snapshot);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * We don't want it to be possible for the packed format to represent fields
|
|
+ * bigger than a u64... that will cause confusion and issues (like with
|
|
+ * bkey_packed_successor())
|
|
+ */
|
|
+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i,
|
|
+ unsigned bits, u64 offset)
|
|
+{
|
|
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
|
|
+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
|
|
+
|
|
+ bits = min(bits, unpacked_bits);
|
|
+
|
|
+ offset = bits == unpacked_bits ? 0 : min(offset, unpacked_max - ((1ULL << bits) - 1));
|
|
+
|
|
+ f->bits_per_field[i] = bits;
|
|
+ f->field_offset[i] = cpu_to_le64(offset);
|
|
+}
|
|
+
|
|
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s)
|
|
+{
|
|
+ unsigned i, bits = KEY_PACKED_BITS_START;
|
|
+ struct bkey_format ret = {
|
|
+ .nr_fields = BKEY_NR_FIELDS,
|
|
+ };
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) {
|
|
+ s->field_min[i] = min(s->field_min[i], s->field_max[i]);
|
|
+
|
|
+ set_format_field(&ret, i,
|
|
+ fls64(s->field_max[i] - s->field_min[i]),
|
|
+ s->field_min[i]);
|
|
+
|
|
+ bits += ret.bits_per_field[i];
|
|
+ }
|
|
+
|
|
+ /* allow for extent merging: */
|
|
+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) {
|
|
+ ret.bits_per_field[BKEY_FIELD_SIZE] += 4;
|
|
+ bits += 4;
|
|
+ }
|
|
+
|
|
+ ret.key_u64s = DIV_ROUND_UP(bits, 64);
|
|
+
|
|
+ /* if we have enough spare bits, round fields up to nearest byte */
|
|
+ bits = ret.key_u64s * 64 - bits;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) {
|
|
+ unsigned r = round_up(ret.bits_per_field[i], 8) -
|
|
+ ret.bits_per_field[i];
|
|
+
|
|
+ if (r <= bits) {
|
|
+ set_format_field(&ret, i,
|
|
+ ret.bits_per_field[i] + r,
|
|
+ le64_to_cpu(ret.field_offset[i]));
|
|
+ bits -= r;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ EBUG_ON(bch2_bkey_format_validate(&ret));
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+const char *bch2_bkey_format_validate(struct bkey_format *f)
|
|
+{
|
|
+ unsigned i, bits = KEY_PACKED_BITS_START;
|
|
+
|
|
+ if (f->nr_fields != BKEY_NR_FIELDS)
|
|
+ return "incorrect number of fields";
|
|
+
|
|
+ for (i = 0; i < f->nr_fields; i++) {
|
|
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
|
|
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
|
|
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
|
|
+
|
|
+ if (f->bits_per_field[i] > unpacked_bits)
|
|
+ return "field too large";
|
|
+
|
|
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
|
|
+ return "offset + bits overflow";
|
|
+
|
|
+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) &
|
|
+ unpacked_mask) <
|
|
+ field_offset)
|
|
+ return "offset + bits overflow";
|
|
+
|
|
+ bits += f->bits_per_field[i];
|
|
+ }
|
|
+
|
|
+ if (f->key_u64s != DIV_ROUND_UP(bits, 64))
|
|
+ return "incorrect key_u64s";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Most significant differing bit
|
|
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
|
|
+ */
|
|
+__pure
|
|
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b,
|
|
+ const struct bkey_packed *l_k,
|
|
+ const struct bkey_packed *r_k)
|
|
+{
|
|
+ const u64 *l = high_word(&b->format, l_k);
|
|
+ const u64 *r = high_word(&b->format, r_k);
|
|
+ unsigned nr_key_bits = b->nr_key_bits;
|
|
+ unsigned word_bits = 64 - high_bit_offset;
|
|
+ u64 l_v, r_v;
|
|
+
|
|
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
|
|
+
|
|
+ /* for big endian, skip past header */
|
|
+ l_v = *l & (~0ULL >> high_bit_offset);
|
|
+ r_v = *r & (~0ULL >> high_bit_offset);
|
|
+
|
|
+ while (nr_key_bits) {
|
|
+ if (nr_key_bits < word_bits) {
|
|
+ l_v >>= word_bits - nr_key_bits;
|
|
+ r_v >>= word_bits - nr_key_bits;
|
|
+ nr_key_bits = 0;
|
|
+ } else {
|
|
+ nr_key_bits -= word_bits;
|
|
+ }
|
|
+
|
|
+ if (l_v != r_v)
|
|
+ return fls64(l_v ^ r_v) - 1 + nr_key_bits;
|
|
+
|
|
+ l = next_word(l);
|
|
+ r = next_word(r);
|
|
+
|
|
+ l_v = *l;
|
|
+ r_v = *r;
|
|
+ word_bits = 64;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * First set bit
|
|
+ * Bits are indexed from 0 - return is [0, nr_key_bits)
|
|
+ */
|
|
+__pure
|
|
+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k)
|
|
+{
|
|
+ const u64 *p = high_word(&b->format, k);
|
|
+ unsigned nr_key_bits = b->nr_key_bits;
|
|
+ unsigned ret = 0, offset;
|
|
+
|
|
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format));
|
|
+
|
|
+ offset = nr_key_bits;
|
|
+ while (offset > 64) {
|
|
+ p = next_word(p);
|
|
+ offset -= 64;
|
|
+ }
|
|
+
|
|
+ offset = 64 - offset;
|
|
+
|
|
+ while (nr_key_bits) {
|
|
+ unsigned bits = nr_key_bits + offset < 64
|
|
+ ? nr_key_bits
|
|
+ : 64 - offset;
|
|
+
|
|
+ u64 mask = (~0ULL >> (64 - bits)) << offset;
|
|
+
|
|
+ if (*p & mask)
|
|
+ return ret + __ffs64(*p & mask) - offset;
|
|
+
|
|
+ p = prev_word(p);
|
|
+ nr_key_bits -= bits;
|
|
+ ret += bits;
|
|
+ offset = 0;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_X86_64
|
|
+
|
|
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
|
|
+ unsigned nr_key_bits)
|
|
+{
|
|
+ long d0, d1, d2, d3;
|
|
+ int cmp;
|
|
+
|
|
+ /* we shouldn't need asm for this, but gcc doesn't generate good code here: */
|
|
+
|
|
+ asm(".intel_syntax noprefix;"
|
|
+ "xor eax, eax;"
|
|
+ "xor edx, edx;"
|
|
+ "1:;"
|
|
+ "mov r8, [rdi];"
|
|
+ "mov r9, [rsi];"
|
|
+ "sub ecx, 64;"
|
|
+ "jl 2f;"
|
|
+
|
|
+ "cmp r8, r9;"
|
|
+ "jnz 3f;"
|
|
+
|
|
+ "lea rdi, [rdi - 8];"
|
|
+ "lea rsi, [rsi - 8];"
|
|
+ "jmp 1b;"
|
|
+
|
|
+ "2:;"
|
|
+ "not ecx;"
|
|
+ "shr r8, 1;"
|
|
+ "shr r9, 1;"
|
|
+ "shr r8, cl;"
|
|
+ "shr r9, cl;"
|
|
+ "cmp r8, r9;"
|
|
+
|
|
+ "3:\n"
|
|
+ "seta al;"
|
|
+ "setb dl;"
|
|
+ "sub eax, edx;"
|
|
+ ".att_syntax prefix;"
|
|
+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp)
|
|
+ : "0" (l), "1" (r), "3" (nr_key_bits)
|
|
+ : "r8", "r9", "cc", "memory");
|
|
+
|
|
+ return cmp;
|
|
+}
|
|
+
|
|
+#define I(_x) (*(out)++ = (_x))
|
|
+#define I1(i0) I(i0)
|
|
+#define I2(i0, i1) (I1(i0), I(i1))
|
|
+#define I3(i0, i1, i2) (I2(i0, i1), I(i2))
|
|
+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3))
|
|
+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4))
|
|
+
|
|
+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out,
|
|
+ enum bch_bkey_fields field,
|
|
+ unsigned dst_offset, unsigned dst_size,
|
|
+ bool *eax_zeroed)
|
|
+{
|
|
+ unsigned bits = format->bits_per_field[field];
|
|
+ u64 offset = le64_to_cpu(format->field_offset[field]);
|
|
+ unsigned i, byte, bit_offset, align, shl, shr;
|
|
+
|
|
+ if (!bits && !offset) {
|
|
+ if (!*eax_zeroed) {
|
|
+ /* xor eax, eax */
|
|
+ I2(0x31, 0xc0);
|
|
+ }
|
|
+
|
|
+ *eax_zeroed = true;
|
|
+ goto set_field;
|
|
+ }
|
|
+
|
|
+ if (!bits) {
|
|
+ /* just return offset: */
|
|
+
|
|
+ switch (dst_size) {
|
|
+ case 8:
|
|
+ if (offset > S32_MAX) {
|
|
+ /* mov [rdi + dst_offset], offset */
|
|
+ I3(0xc7, 0x47, dst_offset);
|
|
+ memcpy(out, &offset, 4);
|
|
+ out += 4;
|
|
+
|
|
+ I3(0xc7, 0x47, dst_offset + 4);
|
|
+ memcpy(out, (void *) &offset + 4, 4);
|
|
+ out += 4;
|
|
+ } else {
|
|
+ /* mov [rdi + dst_offset], offset */
|
|
+ /* sign extended */
|
|
+ I4(0x48, 0xc7, 0x47, dst_offset);
|
|
+ memcpy(out, &offset, 4);
|
|
+ out += 4;
|
|
+ }
|
|
+ break;
|
|
+ case 4:
|
|
+ /* mov [rdi + dst_offset], offset */
|
|
+ I3(0xc7, 0x47, dst_offset);
|
|
+ memcpy(out, &offset, 4);
|
|
+ out += 4;
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ return out;
|
|
+ }
|
|
+
|
|
+ bit_offset = format->key_u64s * 64;
|
|
+ for (i = 0; i <= field; i++)
|
|
+ bit_offset -= format->bits_per_field[i];
|
|
+
|
|
+ byte = bit_offset / 8;
|
|
+ bit_offset -= byte * 8;
|
|
+
|
|
+ *eax_zeroed = false;
|
|
+
|
|
+ if (bit_offset == 0 && bits == 8) {
|
|
+ /* movzx eax, BYTE PTR [rsi + imm8] */
|
|
+ I4(0x0f, 0xb6, 0x46, byte);
|
|
+ } else if (bit_offset == 0 && bits == 16) {
|
|
+ /* movzx eax, WORD PTR [rsi + imm8] */
|
|
+ I4(0x0f, 0xb7, 0x46, byte);
|
|
+ } else if (bit_offset + bits <= 32) {
|
|
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
|
|
+ byte -= align;
|
|
+ bit_offset += align * 8;
|
|
+
|
|
+ BUG_ON(bit_offset + bits > 32);
|
|
+
|
|
+ /* mov eax, [rsi + imm8] */
|
|
+ I3(0x8b, 0x46, byte);
|
|
+
|
|
+ if (bit_offset) {
|
|
+ /* shr eax, imm8 */
|
|
+ I3(0xc1, 0xe8, bit_offset);
|
|
+ }
|
|
+
|
|
+ if (bit_offset + bits < 32) {
|
|
+ unsigned mask = ~0U >> (32 - bits);
|
|
+
|
|
+ /* and eax, imm32 */
|
|
+ I1(0x25);
|
|
+ memcpy(out, &mask, 4);
|
|
+ out += 4;
|
|
+ }
|
|
+ } else if (bit_offset + bits <= 64) {
|
|
+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7);
|
|
+ byte -= align;
|
|
+ bit_offset += align * 8;
|
|
+
|
|
+ BUG_ON(bit_offset + bits > 64);
|
|
+
|
|
+ /* mov rax, [rsi + imm8] */
|
|
+ I4(0x48, 0x8b, 0x46, byte);
|
|
+
|
|
+ shl = 64 - bit_offset - bits;
|
|
+ shr = bit_offset + shl;
|
|
+
|
|
+ if (shl) {
|
|
+ /* shl rax, imm8 */
|
|
+ I4(0x48, 0xc1, 0xe0, shl);
|
|
+ }
|
|
+
|
|
+ if (shr) {
|
|
+ /* shr rax, imm8 */
|
|
+ I4(0x48, 0xc1, 0xe8, shr);
|
|
+ }
|
|
+ } else {
|
|
+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3);
|
|
+ byte -= align;
|
|
+ bit_offset += align * 8;
|
|
+
|
|
+ BUG_ON(bit_offset + bits > 96);
|
|
+
|
|
+ /* mov rax, [rsi + byte] */
|
|
+ I4(0x48, 0x8b, 0x46, byte);
|
|
+
|
|
+ /* mov edx, [rsi + byte + 8] */
|
|
+ I3(0x8b, 0x56, byte + 8);
|
|
+
|
|
+ /* bits from next word: */
|
|
+ shr = bit_offset + bits - 64;
|
|
+ BUG_ON(shr > bit_offset);
|
|
+
|
|
+ /* shr rax, imm8 */
|
|
+ I4(0x48, 0xc1, 0xe8, shr);
|
|
+
|
|
+ /* shl rdx, imm8 */
|
|
+ I4(0x48, 0xc1, 0xe2, 64 - shr);
|
|
+
|
|
+ /* or rax, rdx */
|
|
+ I3(0x48, 0x09, 0xd0);
|
|
+
|
|
+ shr = bit_offset - shr;
|
|
+
|
|
+ if (shr) {
|
|
+ /* shr rax, imm8 */
|
|
+ I4(0x48, 0xc1, 0xe8, shr);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* rax += offset: */
|
|
+ if (offset > S32_MAX) {
|
|
+ /* mov rdx, imm64 */
|
|
+ I2(0x48, 0xba);
|
|
+ memcpy(out, &offset, 8);
|
|
+ out += 8;
|
|
+ /* add %rdx, %rax */
|
|
+ I3(0x48, 0x01, 0xd0);
|
|
+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) {
|
|
+ /* add rax, imm32 */
|
|
+ I2(0x48, 0x05);
|
|
+ memcpy(out, &offset, 4);
|
|
+ out += 4;
|
|
+ } else if (offset) {
|
|
+ /* add eax, imm32 */
|
|
+ I1(0x05);
|
|
+ memcpy(out, &offset, 4);
|
|
+ out += 4;
|
|
+ }
|
|
+set_field:
|
|
+ switch (dst_size) {
|
|
+ case 8:
|
|
+ /* mov [rdi + dst_offset], rax */
|
|
+ I4(0x48, 0x89, 0x47, dst_offset);
|
|
+ break;
|
|
+ case 4:
|
|
+ /* mov [rdi + dst_offset], eax */
|
|
+ I3(0x89, 0x47, dst_offset);
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ return out;
|
|
+}
|
|
+
|
|
+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out)
|
|
+{
|
|
+ bool eax_zeroed = false;
|
|
+ u8 *out = _out;
|
|
+
|
|
+ /*
|
|
+ * rdi: dst - unpacked key
|
|
+ * rsi: src - packed key
|
|
+ */
|
|
+
|
|
+ /* k->u64s, k->format, k->type */
|
|
+
|
|
+ /* mov eax, [rsi] */
|
|
+ I2(0x8b, 0x06);
|
|
+
|
|
+ /* add eax, BKEY_U64s - format->key_u64s */
|
|
+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0);
|
|
+
|
|
+ /* and eax, imm32: mask out k->pad: */
|
|
+ I5(0x25, 0xff, 0xff, 0xff, 0);
|
|
+
|
|
+ /* mov [rdi], eax */
|
|
+ I2(0x89, 0x07);
|
|
+
|
|
+#define x(id, field) \
|
|
+ out = compile_bkey_field(format, out, id, \
|
|
+ offsetof(struct bkey, field), \
|
|
+ sizeof(((struct bkey *) NULL)->field), \
|
|
+ &eax_zeroed);
|
|
+ bkey_fields()
|
|
+#undef x
|
|
+
|
|
+ /* retq */
|
|
+ I1(0xc3);
|
|
+
|
|
+ return (void *) out - _out;
|
|
+}
|
|
+
|
|
+#else
|
|
+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r,
|
|
+ unsigned nr_key_bits)
|
|
+{
|
|
+ u64 l_v, r_v;
|
|
+
|
|
+ if (!nr_key_bits)
|
|
+ return 0;
|
|
+
|
|
+ /* for big endian, skip past header */
|
|
+ nr_key_bits += high_bit_offset;
|
|
+ l_v = *l & (~0ULL >> high_bit_offset);
|
|
+ r_v = *r & (~0ULL >> high_bit_offset);
|
|
+
|
|
+ while (1) {
|
|
+ if (nr_key_bits < 64) {
|
|
+ l_v >>= 64 - nr_key_bits;
|
|
+ r_v >>= 64 - nr_key_bits;
|
|
+ nr_key_bits = 0;
|
|
+ } else {
|
|
+ nr_key_bits -= 64;
|
|
+ }
|
|
+
|
|
+ if (!nr_key_bits || l_v != r_v)
|
|
+ break;
|
|
+
|
|
+ l = next_word(l);
|
|
+ r = next_word(r);
|
|
+
|
|
+ l_v = *l;
|
|
+ r_v = *r;
|
|
+ }
|
|
+
|
|
+ return cmp_int(l_v, r_v);
|
|
+}
|
|
+#endif
|
|
+
|
|
+__pure
|
|
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l,
|
|
+ const struct bkey_packed *r,
|
|
+ const struct btree *b)
|
|
+{
|
|
+ const struct bkey_format *f = &b->format;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r));
|
|
+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f));
|
|
+
|
|
+ ret = __bkey_cmp_bits(high_word(f, l),
|
|
+ high_word(f, r),
|
|
+ b->nr_key_bits);
|
|
+
|
|
+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l),
|
|
+ bkey_unpack_pos(b, r)));
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+__pure __flatten
|
|
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ const struct bpos *r)
|
|
+{
|
|
+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r);
|
|
+}
|
|
+
|
|
+__pure __flatten
|
|
+int bch2_bkey_cmp_packed(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ const struct bkey_packed *r)
|
|
+{
|
|
+ struct bkey unpacked;
|
|
+
|
|
+ if (likely(bkey_packed(l) && bkey_packed(r)))
|
|
+ return __bch2_bkey_cmp_packed_format_checked(l, r, b);
|
|
+
|
|
+ if (bkey_packed(l)) {
|
|
+ __bkey_unpack_key_format_checked(b, &unpacked, l);
|
|
+ l = (void*) &unpacked;
|
|
+ } else if (bkey_packed(r)) {
|
|
+ __bkey_unpack_key_format_checked(b, &unpacked, r);
|
|
+ r = (void*) &unpacked;
|
|
+ }
|
|
+
|
|
+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p);
|
|
+}
|
|
+
|
|
+__pure __flatten
|
|
+int __bch2_bkey_cmp_left_packed(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ const struct bpos *r)
|
|
+{
|
|
+ const struct bkey *l_unpacked;
|
|
+
|
|
+ return unlikely(l_unpacked = packed_to_bkey_c(l))
|
|
+ ? bpos_cmp(l_unpacked->p, *r)
|
|
+ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
|
|
+}
|
|
+
|
|
+void bch2_bpos_swab(struct bpos *p)
|
|
+{
|
|
+ u8 *l = (u8 *) p;
|
|
+ u8 *h = ((u8 *) &p[1]) - 1;
|
|
+
|
|
+ while (l < h) {
|
|
+ swap(*l, *h);
|
|
+ l++;
|
|
+ --h;
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k)
|
|
+{
|
|
+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current;
|
|
+ u8 *l = k->key_start;
|
|
+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1;
|
|
+
|
|
+ while (l < h) {
|
|
+ swap(*l, *h);
|
|
+ l++;
|
|
+ --h;
|
|
+ }
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_bkey_pack_test(void)
|
|
+{
|
|
+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0);
|
|
+ struct bkey_packed p;
|
|
+
|
|
+ struct bkey_format test_format = {
|
|
+ .key_u64s = 3,
|
|
+ .nr_fields = BKEY_NR_FIELDS,
|
|
+ .bits_per_field = {
|
|
+ 13,
|
|
+ 64,
|
|
+ 32,
|
|
+ },
|
|
+ };
|
|
+
|
|
+ struct unpack_state in_s =
|
|
+ unpack_state_init(&bch2_bkey_format_current, (void *) &t);
|
|
+ struct pack_state out_s = pack_state_init(&test_format, &p);
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < out_s.format->nr_fields; i++) {
|
|
+ u64 a, v = get_inc_field(&in_s, i);
|
|
+
|
|
+ switch (i) {
|
|
+#define x(id, field) case id: a = t.field; break;
|
|
+ bkey_fields()
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ if (a != v)
|
|
+ panic("got %llu actual %llu i %u\n", v, a, i);
|
|
+
|
|
+ if (!set_inc_field(&out_s, i, v))
|
|
+ panic("failed at %u\n", i);
|
|
+ }
|
|
+
|
|
+ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format));
|
|
+}
|
|
+#endif
|
|
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
|
|
new file mode 100644
|
|
index 000000000000..2e45d88fab03
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bkey.h
|
|
@@ -0,0 +1,597 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BKEY_H
|
|
+#define _BCACHEFS_BKEY_H
|
|
+
|
|
+#include <linux/bug.h>
|
|
+#include "bcachefs_format.h"
|
|
+
|
|
+#include "util.h"
|
|
+#include "vstructs.h"
|
|
+
|
|
+#ifdef CONFIG_X86_64
|
|
+#define HAVE_BCACHEFS_COMPILED_UNPACK 1
|
|
+#endif
|
|
+
|
|
+void bch2_to_binary(char *, const u64 *, unsigned);
|
|
+
|
|
+/* bkey with split value, const */
|
|
+struct bkey_s_c {
|
|
+ const struct bkey *k;
|
|
+ const struct bch_val *v;
|
|
+};
|
|
+
|
|
+/* bkey with split value */
|
|
+struct bkey_s {
|
|
+ union {
|
|
+ struct {
|
|
+ struct bkey *k;
|
|
+ struct bch_val *v;
|
|
+ };
|
|
+ struct bkey_s_c s_c;
|
|
+ };
|
|
+};
|
|
+
|
|
+#define bkey_next(_k) vstruct_next(_k)
|
|
+
|
|
+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s)
|
|
+
|
|
+static inline size_t bkey_val_bytes(const struct bkey *k)
|
|
+{
|
|
+ return bkey_val_u64s(k) * sizeof(u64);
|
|
+}
|
|
+
|
|
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
|
|
+{
|
|
+ k->u64s = BKEY_U64s + val_u64s;
|
|
+}
|
|
+
|
|
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
|
|
+{
|
|
+ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64));
|
|
+}
|
|
+
|
|
+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
|
|
+
|
|
+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted)
|
|
+
|
|
+#define bkey_whiteout(_k) \
|
|
+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard)
|
|
+
|
|
+enum bkey_lr_packed {
|
|
+ BKEY_PACKED_BOTH,
|
|
+ BKEY_PACKED_RIGHT,
|
|
+ BKEY_PACKED_LEFT,
|
|
+ BKEY_PACKED_NONE,
|
|
+};
|
|
+
|
|
+#define bkey_lr_packed(_l, _r) \
|
|
+ ((_l)->format + ((_r)->format << 1))
|
|
+
|
|
+#define bkey_copy(_dst, _src) \
|
|
+do { \
|
|
+ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \
|
|
+ !type_is(_dst, struct bkey_packed *)); \
|
|
+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \
|
|
+ !type_is(_src, struct bkey_packed *)); \
|
|
+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \
|
|
+ (u64 *) (_dst) < (u64 *) (_src) + \
|
|
+ ((struct bkey *) (_src))->u64s); \
|
|
+ \
|
|
+ memcpy_u64s_small((_dst), (_src), \
|
|
+ ((struct bkey *) (_src))->u64s); \
|
|
+} while (0)
|
|
+
|
|
+struct btree;
|
|
+
|
|
+struct bkey_format_state {
|
|
+ u64 field_min[BKEY_NR_FIELDS];
|
|
+ u64 field_max[BKEY_NR_FIELDS];
|
|
+};
|
|
+
|
|
+void bch2_bkey_format_init(struct bkey_format_state *);
|
|
+void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *);
|
|
+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos);
|
|
+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *);
|
|
+const char *bch2_bkey_format_validate(struct bkey_format *);
|
|
+
|
|
+__pure
|
|
+unsigned bch2_bkey_greatest_differing_bit(const struct btree *,
|
|
+ const struct bkey_packed *,
|
|
+ const struct bkey_packed *);
|
|
+__pure
|
|
+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *);
|
|
+
|
|
+__pure
|
|
+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *,
|
|
+ const struct bkey_packed *,
|
|
+ const struct btree *);
|
|
+
|
|
+__pure
|
|
+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *,
|
|
+ const struct bkey_packed *,
|
|
+ const struct bpos *);
|
|
+
|
|
+__pure
|
|
+int bch2_bkey_cmp_packed(const struct btree *,
|
|
+ const struct bkey_packed *,
|
|
+ const struct bkey_packed *);
|
|
+
|
|
+__pure
|
|
+int __bch2_bkey_cmp_left_packed(const struct btree *,
|
|
+ const struct bkey_packed *,
|
|
+ const struct bpos *);
|
|
+
|
|
+static inline __pure
|
|
+int bkey_cmp_left_packed(const struct btree *b,
|
|
+ const struct bkey_packed *l, const struct bpos *r)
|
|
+{
|
|
+ return __bch2_bkey_cmp_left_packed(b, l, r);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to
|
|
+ * pass it by val... as much as I hate c++, const ref would be nice here:
|
|
+ */
|
|
+__pure __flatten
|
|
+static inline int bkey_cmp_left_packed_byval(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ struct bpos r)
|
|
+{
|
|
+ return bkey_cmp_left_packed(b, l, &r);
|
|
+}
|
|
+
|
|
+static __always_inline int bpos_cmp(struct bpos l, struct bpos r)
|
|
+{
|
|
+ return cmp_int(l.inode, r.inode) ?:
|
|
+ cmp_int(l.offset, r.offset) ?:
|
|
+ cmp_int(l.snapshot, r.snapshot);
|
|
+}
|
|
+
|
|
+static __always_inline int bkey_cmp(struct bpos l, struct bpos r)
|
|
+{
|
|
+ return cmp_int(l.inode, r.inode) ?:
|
|
+ cmp_int(l.offset, r.offset);
|
|
+}
|
|
+
|
|
+static inline struct bpos bpos_min(struct bpos l, struct bpos r)
|
|
+{
|
|
+ return bpos_cmp(l, r) < 0 ? l : r;
|
|
+}
|
|
+
|
|
+static inline struct bpos bpos_max(struct bpos l, struct bpos r)
|
|
+{
|
|
+ return bpos_cmp(l, r) > 0 ? l : r;
|
|
+}
|
|
+
|
|
+#define sbb(a, b, borrow) \
|
|
+do { \
|
|
+ typeof(a) d1, d2; \
|
|
+ \
|
|
+ d1 = a - borrow; \
|
|
+ borrow = d1 > a; \
|
|
+ \
|
|
+ d2 = d1 - b; \
|
|
+ borrow += d2 > d1; \
|
|
+ a = d2; \
|
|
+} while (0)
|
|
+
|
|
+/* returns a - b: */
|
|
+static inline struct bpos bpos_sub(struct bpos a, struct bpos b)
|
|
+{
|
|
+ int borrow = 0;
|
|
+
|
|
+ sbb(a.snapshot, b.snapshot, borrow);
|
|
+ sbb(a.offset, b.offset, borrow);
|
|
+ sbb(a.inode, b.inode, borrow);
|
|
+ return a;
|
|
+}
|
|
+
|
|
+static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
|
|
+{
|
|
+ if (bpos_cmp(l, r) > 0)
|
|
+ swap(l, r);
|
|
+
|
|
+ return bpos_sub(r, l);
|
|
+}
|
|
+
|
|
+void bch2_bpos_swab(struct bpos *);
|
|
+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *);
|
|
+
|
|
+static __always_inline int bversion_cmp(struct bversion l, struct bversion r)
|
|
+{
|
|
+ return cmp_int(l.hi, r.hi) ?:
|
|
+ cmp_int(l.lo, r.lo);
|
|
+}
|
|
+
|
|
+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 })
|
|
+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL })
|
|
+
|
|
+static __always_inline int bversion_zero(struct bversion v)
|
|
+{
|
|
+ return !bversion_cmp(v, ZERO_VERSION);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+/* statement expressions confusing unlikely()? */
|
|
+#define bkey_packed(_k) \
|
|
+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \
|
|
+ (_k)->format != KEY_FORMAT_CURRENT; })
|
|
+#else
|
|
+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT)
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse
|
|
+ */
|
|
+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k)
|
|
+{
|
|
+ return (struct bkey_packed *) k;
|
|
+}
|
|
+
|
|
+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k)
|
|
+{
|
|
+ return (const struct bkey_packed *) k;
|
|
+}
|
|
+
|
|
+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k)
|
|
+{
|
|
+ return bkey_packed(k) ? NULL : (struct bkey_i *) k;
|
|
+}
|
|
+
|
|
+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k)
|
|
+{
|
|
+ return bkey_packed(k) ? NULL : (const struct bkey *) k;
|
|
+}
|
|
+
|
|
+static inline unsigned bkey_format_key_bits(const struct bkey_format *format)
|
|
+{
|
|
+ return format->bits_per_field[BKEY_FIELD_INODE] +
|
|
+ format->bits_per_field[BKEY_FIELD_OFFSET] +
|
|
+ format->bits_per_field[BKEY_FIELD_SNAPSHOT];
|
|
+}
|
|
+
|
|
+static inline struct bpos bpos_successor(struct bpos p)
|
|
+{
|
|
+ if (!++p.snapshot &&
|
|
+ !++p.offset &&
|
|
+ !++p.inode)
|
|
+ BUG();
|
|
+
|
|
+ return p;
|
|
+}
|
|
+
|
|
+static inline struct bpos bpos_predecessor(struct bpos p)
|
|
+{
|
|
+ if (!p.snapshot-- &&
|
|
+ !p.offset-- &&
|
|
+ !p.inode--)
|
|
+ BUG();
|
|
+
|
|
+ return p;
|
|
+}
|
|
+
|
|
+static inline struct bpos bpos_nosnap_successor(struct bpos p)
|
|
+{
|
|
+ p.snapshot = 0;
|
|
+
|
|
+ if (!++p.offset &&
|
|
+ !++p.inode)
|
|
+ BUG();
|
|
+
|
|
+ return p;
|
|
+}
|
|
+
|
|
+static inline struct bpos bpos_nosnap_predecessor(struct bpos p)
|
|
+{
|
|
+ p.snapshot = 0;
|
|
+
|
|
+ if (!p.offset-- &&
|
|
+ !p.inode--)
|
|
+ BUG();
|
|
+
|
|
+ return p;
|
|
+}
|
|
+
|
|
+static inline u64 bkey_start_offset(const struct bkey *k)
|
|
+{
|
|
+ return k->p.offset - k->size;
|
|
+}
|
|
+
|
|
+static inline struct bpos bkey_start_pos(const struct bkey *k)
|
|
+{
|
|
+ return (struct bpos) {
|
|
+ .inode = k->p.inode,
|
|
+ .offset = bkey_start_offset(k),
|
|
+ .snapshot = k->p.snapshot,
|
|
+ };
|
|
+}
|
|
+
|
|
+/* Packed helpers */
|
|
+
|
|
+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
|
|
+
|
|
+ EBUG_ON(k->u64s < ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ return bkeyp_key_u64s(format, k) * sizeof(u64);
|
|
+}
|
|
+
|
|
+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ return k->u64s - bkeyp_key_u64s(format, k);
|
|
+}
|
|
+
|
|
+static inline size_t bkeyp_val_bytes(const struct bkey_format *format,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ return bkeyp_val_u64s(format, k) * sizeof(u64);
|
|
+}
|
|
+
|
|
+static inline void set_bkeyp_val_u64s(const struct bkey_format *format,
|
|
+ struct bkey_packed *k, unsigned val_u64s)
|
|
+{
|
|
+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s;
|
|
+}
|
|
+
|
|
+#define bkeyp_val(_format, _k) \
|
|
+ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k)))
|
|
+
|
|
+extern const struct bkey_format bch2_bkey_format_current;
|
|
+
|
|
+bool bch2_bkey_transform(const struct bkey_format *,
|
|
+ struct bkey_packed *,
|
|
+ const struct bkey_format *,
|
|
+ const struct bkey_packed *);
|
|
+
|
|
+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *,
|
|
+ const struct bkey_packed *);
|
|
+
|
|
+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK
|
|
+struct bpos __bkey_unpack_pos(const struct bkey_format *,
|
|
+ const struct bkey_packed *);
|
|
+#endif
|
|
+
|
|
+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *,
|
|
+ const struct bkey_format *);
|
|
+
|
|
+enum bkey_pack_pos_ret {
|
|
+ BKEY_PACK_POS_EXACT,
|
|
+ BKEY_PACK_POS_SMALLER,
|
|
+ BKEY_PACK_POS_FAIL,
|
|
+};
|
|
+
|
|
+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos,
|
|
+ const struct btree *);
|
|
+
|
|
+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in,
|
|
+ const struct btree *b)
|
|
+{
|
|
+ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT;
|
|
+}
|
|
+
|
|
+void bch2_bkey_unpack(const struct btree *, struct bkey_i *,
|
|
+ const struct bkey_packed *);
|
|
+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *,
|
|
+ const struct bkey_format *);
|
|
+
|
|
+static inline u64 bkey_field_max(const struct bkey_format *f,
|
|
+ enum bch_bkey_fields nr)
|
|
+{
|
|
+ return f->bits_per_field[nr] < 64
|
|
+ ? (le64_to_cpu(f->field_offset[nr]) +
|
|
+ ~(~0ULL << f->bits_per_field[nr]))
|
|
+ : U64_MAX;
|
|
+}
|
|
+
|
|
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
|
+
|
|
+int bch2_compile_bkey_format(const struct bkey_format *, void *);
|
|
+
|
|
+#else
|
|
+
|
|
+static inline int bch2_compile_bkey_format(const struct bkey_format *format,
|
|
+ void *out) { return 0; }
|
|
+
|
|
+#endif
|
|
+
|
|
+static inline void bkey_reassemble(struct bkey_i *dst,
|
|
+ struct bkey_s_c src)
|
|
+{
|
|
+ dst->k = *src.k;
|
|
+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
|
|
+}
|
|
+
|
|
+#define bkey_s_null ((struct bkey_s) { .k = NULL })
|
|
+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL })
|
|
+
|
|
+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) })
|
|
+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) })
|
|
+
|
|
+static inline struct bkey_s bkey_to_s(struct bkey *k)
|
|
+{
|
|
+ return (struct bkey_s) { .k = k, .v = NULL };
|
|
+}
|
|
+
|
|
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
|
|
+{
|
|
+ return (struct bkey_s_c) { .k = k, .v = NULL };
|
|
+}
|
|
+
|
|
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
|
|
+{
|
|
+ return (struct bkey_s) { .k = &k->k, .v = &k->v };
|
|
+}
|
|
+
|
|
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
|
|
+{
|
|
+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
|
|
+}
|
|
+
|
|
+/*
|
|
+ * For a given type of value (e.g. struct bch_extent), generates the types for
|
|
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
|
|
+ * functions, which also check that the value is of the correct type.
|
|
+ *
|
|
+ * We use anonymous unions for upcasting - e.g. converting from e.g. a
|
|
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
|
|
+ * functions.
|
|
+ */
|
|
+#define x(name, ...) \
|
|
+struct bkey_i_##name { \
|
|
+ union { \
|
|
+ struct bkey k; \
|
|
+ struct bkey_i k_i; \
|
|
+ }; \
|
|
+ struct bch_##name v; \
|
|
+}; \
|
|
+ \
|
|
+struct bkey_s_c_##name { \
|
|
+ union { \
|
|
+ struct { \
|
|
+ const struct bkey *k; \
|
|
+ const struct bch_##name *v; \
|
|
+ }; \
|
|
+ struct bkey_s_c s_c; \
|
|
+ }; \
|
|
+}; \
|
|
+ \
|
|
+struct bkey_s_##name { \
|
|
+ union { \
|
|
+ struct { \
|
|
+ struct bkey *k; \
|
|
+ struct bch_##name *v; \
|
|
+ }; \
|
|
+ struct bkey_s_c_##name c; \
|
|
+ struct bkey_s s; \
|
|
+ struct bkey_s_c s_c; \
|
|
+ }; \
|
|
+}; \
|
|
+ \
|
|
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \
|
|
+{ \
|
|
+ EBUG_ON(k->k.type != KEY_TYPE_##name); \
|
|
+ return container_of(&k->k, struct bkey_i_##name, k); \
|
|
+} \
|
|
+ \
|
|
+static inline const struct bkey_i_##name * \
|
|
+bkey_i_to_##name##_c(const struct bkey_i *k) \
|
|
+{ \
|
|
+ EBUG_ON(k->k.type != KEY_TYPE_##name); \
|
|
+ return container_of(&k->k, struct bkey_i_##name, k); \
|
|
+} \
|
|
+ \
|
|
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \
|
|
+{ \
|
|
+ EBUG_ON(k.k->type != KEY_TYPE_##name); \
|
|
+ return (struct bkey_s_##name) { \
|
|
+ .k = k.k, \
|
|
+ .v = container_of(k.v, struct bch_##name, v), \
|
|
+ }; \
|
|
+} \
|
|
+ \
|
|
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
|
|
+{ \
|
|
+ EBUG_ON(k.k->type != KEY_TYPE_##name); \
|
|
+ return (struct bkey_s_c_##name) { \
|
|
+ .k = k.k, \
|
|
+ .v = container_of(k.v, struct bch_##name, v), \
|
|
+ }; \
|
|
+} \
|
|
+ \
|
|
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
|
|
+{ \
|
|
+ return (struct bkey_s_##name) { \
|
|
+ .k = &k->k, \
|
|
+ .v = &k->v, \
|
|
+ }; \
|
|
+} \
|
|
+ \
|
|
+static inline struct bkey_s_c_##name \
|
|
+name##_i_to_s_c(const struct bkey_i_##name *k) \
|
|
+{ \
|
|
+ return (struct bkey_s_c_##name) { \
|
|
+ .k = &k->k, \
|
|
+ .v = &k->v, \
|
|
+ }; \
|
|
+} \
|
|
+ \
|
|
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \
|
|
+{ \
|
|
+ EBUG_ON(k->k.type != KEY_TYPE_##name); \
|
|
+ return (struct bkey_s_##name) { \
|
|
+ .k = &k->k, \
|
|
+ .v = container_of(&k->v, struct bch_##name, v), \
|
|
+ }; \
|
|
+} \
|
|
+ \
|
|
+static inline struct bkey_s_c_##name \
|
|
+bkey_i_to_s_c_##name(const struct bkey_i *k) \
|
|
+{ \
|
|
+ EBUG_ON(k->k.type != KEY_TYPE_##name); \
|
|
+ return (struct bkey_s_c_##name) { \
|
|
+ .k = &k->k, \
|
|
+ .v = container_of(&k->v, struct bch_##name, v), \
|
|
+ }; \
|
|
+} \
|
|
+ \
|
|
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
|
|
+{ \
|
|
+ struct bkey_i_##name *k = \
|
|
+ container_of(&_k->k, struct bkey_i_##name, k); \
|
|
+ \
|
|
+ bkey_init(&k->k); \
|
|
+ memset(&k->v, 0, sizeof(k->v)); \
|
|
+ k->k.type = KEY_TYPE_##name; \
|
|
+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \
|
|
+ \
|
|
+ return k; \
|
|
+}
|
|
+
|
|
+BCH_BKEY_TYPES();
|
|
+#undef x
|
|
+
|
|
+/* byte order helpers */
|
|
+
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+
|
|
+static inline unsigned high_word_offset(const struct bkey_format *f)
|
|
+{
|
|
+ return f->key_u64s - 1;
|
|
+}
|
|
+
|
|
+#define high_bit_offset 0
|
|
+#define nth_word(p, n) ((p) - (n))
|
|
+
|
|
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
|
+
|
|
+static inline unsigned high_word_offset(const struct bkey_format *f)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+#define high_bit_offset KEY_PACKED_BITS_START
|
|
+#define nth_word(p, n) ((p) + (n))
|
|
+
|
|
+#else
|
|
+#error edit for your odd byteorder.
|
|
+#endif
|
|
+
|
|
+#define high_word(f, k) ((k)->_data + high_word_offset(f))
|
|
+#define next_word(p) nth_word(p, 1)
|
|
+#define prev_word(p) nth_word(p, -1)
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_bkey_pack_test(void);
|
|
+#else
|
|
+static inline void bch2_bkey_pack_test(void) {}
|
|
+#endif
|
|
+
|
|
+#endif /* _BCACHEFS_BKEY_H */
|
|
diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h
|
|
new file mode 100644
|
|
index 000000000000..0d7c67a959af
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bkey_buf.h
|
|
@@ -0,0 +1,60 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BKEY_BUF_H
|
|
+#define _BCACHEFS_BKEY_BUF_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+
|
|
+struct bkey_buf {
|
|
+ struct bkey_i *k;
|
|
+ u64 onstack[12];
|
|
+};
|
|
+
|
|
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
|
|
+ struct bch_fs *c, unsigned u64s)
|
|
+{
|
|
+ if (s->k == (void *) s->onstack &&
|
|
+ u64s > ARRAY_SIZE(s->onstack)) {
|
|
+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
|
|
+ memcpy(s->k, s->onstack, sizeof(s->onstack));
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
|
|
+ struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ bch2_bkey_buf_realloc(s, c, k.k->u64s);
|
|
+ bkey_reassemble(s->k, k);
|
|
+}
|
|
+
|
|
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
|
|
+ struct bch_fs *c,
|
|
+ struct bkey_i *src)
|
|
+{
|
|
+ bch2_bkey_buf_realloc(s, c, src->k.u64s);
|
|
+ bkey_copy(s->k, src);
|
|
+}
|
|
+
|
|
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
|
|
+ struct bch_fs *c,
|
|
+ struct btree *b,
|
|
+ struct bkey_packed *src)
|
|
+{
|
|
+ bch2_bkey_buf_realloc(s, c, BKEY_U64s +
|
|
+ bkeyp_val_u64s(&b->format, src));
|
|
+ bch2_bkey_unpack(b, s->k, src);
|
|
+}
|
|
+
|
|
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
|
|
+{
|
|
+ s->k = (void *) s->onstack;
|
|
+}
|
|
+
|
|
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
|
|
+{
|
|
+ if (s->k != (void *) s->onstack)
|
|
+ mempool_free(s->k, &c->large_bkey_pool);
|
|
+ s->k = NULL;
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_BKEY_BUF_H */
|
|
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
|
|
new file mode 100644
|
|
index 000000000000..9f869bed9f1c
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bkey_methods.c
|
|
@@ -0,0 +1,440 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_types.h"
|
|
+#include "alloc_background.h"
|
|
+#include "dirent.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "inode.h"
|
|
+#include "quota.h"
|
|
+#include "reflink.h"
|
|
+#include "xattr.h"
|
|
+
|
|
+const char * const bch2_bkey_types[] = {
|
|
+#define x(name, nr) #name,
|
|
+ BCH_BKEY_TYPES()
|
|
+#undef x
|
|
+ NULL
|
|
+};
|
|
+
|
|
+static const char *deleted_key_invalid(const struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_deleted (struct bkey_ops) { \
|
|
+ .key_invalid = deleted_key_invalid, \
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_discard (struct bkey_ops) { \
|
|
+ .key_invalid = deleted_key_invalid, \
|
|
+}
|
|
+
|
|
+static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ if (bkey_val_bytes(k.k))
|
|
+ return "value size should be zero";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_error (struct bkey_ops) { \
|
|
+ .key_invalid = empty_val_key_invalid, \
|
|
+}
|
|
+
|
|
+static const char *key_type_cookie_invalid(const struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie))
|
|
+ return "incorrect value size";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_cookie (struct bkey_ops) { \
|
|
+ .key_invalid = key_type_cookie_invalid, \
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_hash_whiteout (struct bkey_ops) { \
|
|
+ .key_invalid = empty_val_key_invalid, \
|
|
+}
|
|
+
|
|
+static const char *key_type_inline_data_invalid(const struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
|
|
+ unsigned datalen = bkey_inline_data_bytes(k.k);
|
|
+
|
|
+ pr_buf(out, "datalen %u: %*phN",
|
|
+ datalen, min(datalen, 32U), d.v->data);
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_inline_data (struct bkey_ops) { \
|
|
+ .key_invalid = key_type_inline_data_invalid, \
|
|
+ .val_to_text = key_type_inline_data_to_text, \
|
|
+}
|
|
+
|
|
+static const struct bkey_ops bch2_bkey_ops[] = {
|
|
+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
|
|
+ BCH_BKEY_TYPES()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ if (k.k->type >= KEY_TYPE_MAX)
|
|
+ return "invalid type";
|
|
+
|
|
+ return bch2_bkey_ops[k.k->type].key_invalid(c, k);
|
|
+}
|
|
+
|
|
+static unsigned bch2_key_types_allowed[] = {
|
|
+ [BKEY_TYPE_extents] =
|
|
+ (1U << KEY_TYPE_error)|
|
|
+ (1U << KEY_TYPE_extent)|
|
|
+ (1U << KEY_TYPE_reservation)|
|
|
+ (1U << KEY_TYPE_reflink_p)|
|
|
+ (1U << KEY_TYPE_inline_data),
|
|
+ [BKEY_TYPE_inodes] =
|
|
+ (1U << KEY_TYPE_inode)|
|
|
+ (1U << KEY_TYPE_inode_generation),
|
|
+ [BKEY_TYPE_dirents] =
|
|
+ (1U << KEY_TYPE_hash_whiteout)|
|
|
+ (1U << KEY_TYPE_dirent),
|
|
+ [BKEY_TYPE_xattrs] =
|
|
+ (1U << KEY_TYPE_hash_whiteout)|
|
|
+ (1U << KEY_TYPE_xattr),
|
|
+ [BKEY_TYPE_alloc] =
|
|
+ (1U << KEY_TYPE_alloc)|
|
|
+ (1U << KEY_TYPE_alloc_v2),
|
|
+ [BKEY_TYPE_quotas] =
|
|
+ (1U << KEY_TYPE_quota),
|
|
+ [BKEY_TYPE_stripes] =
|
|
+ (1U << KEY_TYPE_stripe),
|
|
+ [BKEY_TYPE_reflink] =
|
|
+ (1U << KEY_TYPE_reflink_v)|
|
|
+ (1U << KEY_TYPE_indirect_inline_data),
|
|
+ [BKEY_TYPE_btree] =
|
|
+ (1U << KEY_TYPE_btree_ptr)|
|
|
+ (1U << KEY_TYPE_btree_ptr_v2),
|
|
+};
|
|
+
|
|
+/*
+ * Structural checks on the header of @k for a btree node of type @type.
+ * The value itself is not inspected here; bch2_bkey_invalid() chains this
+ * with bch2_bkey_val_invalid() for that.
+ *
+ * Returns a string describing the first problem found, or NULL if none.
+ */
+const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
+				enum btree_node_type type)
+{
+	/* deleted keys are permitted in every btree */
+	unsigned key_types_allowed = (1U << KEY_TYPE_deleted)|
+		bch2_key_types_allowed[type];
+
+	if (k.k->u64s < BKEY_U64s)
+		return "u64s too small";
+
+	/*
+	 * Range check before shifting: a type that does not fit in the mask
+	 * can never be allowed, and shifting by >= the width of the mask is
+	 * undefined (it may alias onto an allowed bit).
+	 */
+	if (k.k->type >= sizeof(key_types_allowed) * 8 ||
+	    !(key_types_allowed & (1U << k.k->type)))
+		return "invalid key type for this btree";
+
+	if (type == BKEY_TYPE_btree &&
+	    bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+		return "value too big";
+
+	if (btree_node_type_is_extents(type)) {
+		/* in extent btrees, size is zero iff the key is deleted */
+		if ((k.k->size == 0) != bkey_deleted(k.k))
+			return "bad size field";
+
+		if (k.k->size > k.k->p.offset)
+			return "size greater than offset";
+	} else {
+		if (k.k->size)
+			return "nonzero size field";
+	}
+
+	if (type != BKEY_TYPE_btree &&
+	    !btree_type_has_snapshots(type) &&
+	    k.k->p.snapshot)
+		return "nonzero snapshot";
+
+	if (type != BKEY_TYPE_btree &&
+	    btree_type_has_snapshots(type) &&
+	    k.k->p.snapshot != U32_MAX)
+		return "invalid snapshot field";
+
+	if (type != BKEY_TYPE_btree &&
+	    !bkey_cmp(k.k->p, POS_MAX))
+		return "POS_MAX key";
+
+	return NULL;
+}
|
|
+
|
|
+const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
|
|
+ enum btree_node_type type)
|
|
+{
|
|
+ return __bch2_bkey_invalid(c, k, type) ?:
|
|
+ bch2_bkey_val_invalid(c, k);
|
|
+}
|
|
+
|
|
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
|
|
+{
|
|
+ if (bpos_cmp(k.k->p, b->data->min_key) < 0)
|
|
+ return "key before start of btree node";
|
|
+
|
|
+ if (bpos_cmp(k.k->p, b->data->max_key) > 0)
|
|
+ return "key past end of btree node";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
|
|
+{
|
|
+ const char *invalid;
|
|
+
|
|
+ BUG_ON(!k.k->u64s);
|
|
+
|
|
+ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?:
|
|
+ bch2_bkey_in_btree_node(b, k);
|
|
+ if (invalid) {
|
|
+ char buf[160];
|
|
+
|
|
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
|
|
+ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid);
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos)
|
|
+{
|
|
+ if (!bpos_cmp(pos, POS_MIN))
|
|
+ pr_buf(out, "POS_MIN");
|
|
+ else if (!bpos_cmp(pos, POS_MAX))
|
|
+ pr_buf(out, "POS_MAX");
|
|
+ else {
|
|
+ if (pos.inode == U64_MAX)
|
|
+ pr_buf(out, "U64_MAX");
|
|
+ else
|
|
+ pr_buf(out, "%llu", pos.inode);
|
|
+ pr_buf(out, ":");
|
|
+ if (pos.offset == U64_MAX)
|
|
+ pr_buf(out, "U64_MAX");
|
|
+ else
|
|
+ pr_buf(out, "%llu", pos.offset);
|
|
+ pr_buf(out, ":");
|
|
+ if (pos.snapshot == U32_MAX)
|
|
+ pr_buf(out, "U32_MAX");
|
|
+ else
|
|
+ pr_buf(out, "%u", pos.snapshot);
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k)
|
|
+{
|
|
+ if (k) {
|
|
+ pr_buf(out, "u64s %u type ", k->u64s);
|
|
+
|
|
+ if (k->type < KEY_TYPE_MAX)
|
|
+ pr_buf(out, "%s ", bch2_bkey_types[k->type]);
|
|
+ else
|
|
+ pr_buf(out, "%u ", k->type);
|
|
+
|
|
+ bch2_bpos_to_text(out, k->p);
|
|
+
|
|
+ pr_buf(out, " len %u ver %llu", k->size, k->version.lo);
|
|
+ } else {
|
|
+ pr_buf(out, "(null)");
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ if (k.k->type < KEY_TYPE_MAX) {
|
|
+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type];
|
|
+
|
|
+ if (likely(ops->val_to_text))
|
|
+ ops->val_to_text(out, c, k);
|
|
+ } else {
|
|
+ pr_buf(out, "(invalid type %u)", k.k->type);
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ bch2_bkey_to_text(out, k.k);
|
|
+
|
|
+ if (k.k) {
|
|
+ pr_buf(out, ": ");
|
|
+ bch2_val_to_text(out, c, k);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
+ * Byte-swap the value of @k by dispatching to the ->swab method of its key
+ * type, if that type defines one.
+ *
+ * Types at or above KEY_TYPE_MAX have no entry in bch2_bkey_ops[] (the same
+ * bound bch2_bkey_val_invalid() and bch2_val_to_text() check), so they are
+ * left untouched rather than indexing past the end of the table.
+ */
+void bch2_bkey_swab_val(struct bkey_s k)
+{
+	const struct bkey_ops *ops;
+
+	if (k.k->type >= KEY_TYPE_MAX)
+		return;
+
+	ops = &bch2_bkey_ops[k.k->type];
+	if (ops->swab)
+		ops->swab(k);
+}
|
|
+
|
|
+/*
+ * Run the ->key_normalize method for @k's key type, if it has one.
+ *
+ * Returns whatever ->key_normalize returns; false when the type defines no
+ * such method or is not a known key type (>= KEY_TYPE_MAX, i.e. it has no
+ * entry in bch2_bkey_ops[]).
+ */
+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k)
+{
+	const struct bkey_ops *ops;
+
+	if (k.k->type >= KEY_TYPE_MAX)
+		return false;
+
+	ops = &bch2_bkey_ops[k.k->type];
+
+	return ops->key_normalize
+		? ops->key_normalize(c, k)
+		: false;
+}
|
|
+
|
|
+/*
+ * Try to merge @r into @l using the ->key_merge method of their key type.
+ *
+ * No merge is attempted (BCH_MERGE_NOMERGE) when merging is disabled, the
+ * types differ or are unknown, the type has no ->key_merge, the versions
+ * differ, or @l does not end exactly where @r starts.
+ *
+ * On any result other than BCH_MERGE_NOMERGE, @l inherits @r's
+ * needs_whiteout flag.
+ */
+enum merge_result bch2_bkey_merge(struct bch_fs *c,
+				  struct bkey_s l, struct bkey_s r)
+{
+	const struct bkey_ops *ops;
+	enum merge_result ret;
+
+	/* bound the type before it is used to index bch2_bkey_ops[] */
+	if (bch2_key_merging_disabled ||
+	    l.k->type >= KEY_TYPE_MAX ||
+	    l.k->type != r.k->type)
+		return BCH_MERGE_NOMERGE;
+
+	ops = &bch2_bkey_ops[l.k->type];
+
+	if (!ops->key_merge ||
+	    bversion_cmp(l.k->version, r.k->version) ||
+	    bpos_cmp(l.k->p, bkey_start_pos(r.k)))
+		return BCH_MERGE_NOMERGE;
+
+	ret = ops->key_merge(c, l, r);
+
+	if (ret != BCH_MERGE_NOMERGE)
+		l.k->needs_whiteout |= r.k->needs_whiteout;
+	return ret;
+}
|
|
+
|
|
+static const struct old_bkey_type {
|
|
+ u8 btree_node_type;
|
|
+ u8 old;
|
|
+ u8 new;
|
|
+} bkey_renumber_table[] = {
|
|
+ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr },
|
|
+ {BKEY_TYPE_extents, 128, KEY_TYPE_extent },
|
|
+ {BKEY_TYPE_extents, 129, KEY_TYPE_extent },
|
|
+ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation },
|
|
+ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode },
|
|
+ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation },
|
|
+ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent },
|
|
+ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout },
|
|
+ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr },
|
|
+ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout },
|
|
+ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc },
|
|
+ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota },
|
|
+};
|
|
+
|
|
+void bch2_bkey_renumber(enum btree_node_type btree_node_type,
|
|
+ struct bkey_packed *k,
|
|
+ int write)
|
|
+{
|
|
+ const struct old_bkey_type *i;
|
|
+
|
|
+ for (i = bkey_renumber_table;
|
|
+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table);
|
|
+ i++)
|
|
+ if (btree_node_type == i->btree_node_type &&
|
|
+ k->type == (write ? i->new : i->old)) {
|
|
+ k->type = write ? i->old : i->new;
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
+ * Convert the packed key @k (described by format @f) between an older or
+ * foreign-endian representation and the current one.
+ *
+ * Five steps are applied, in order when @write is zero and in reverse order
+ * otherwise:
+ *   0: byte-swap the key if @big_endian differs from CPU_BIG_ENDIAN
+ *   1: renumber key types for versions before bkey_renumber
+ *   2: swap the inode and offset fields of inodes-btree keys for versions
+ *      before inode_btree_change
+ *   3: set the snapshot field for versions before snapshot
+ *   4: byte-swap the value and run the key type's ->compat method
+ */
+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id,
+			unsigned version, unsigned big_endian,
+			int write,
+			struct bkey_format *f,
+			struct bkey_packed *k)
+{
+	const struct bkey_ops *ops;
+	struct bkey uk;
+	struct bkey_s u;
+	unsigned nr_compat = 5;
+	int i;
+
+	/*
+	 * Do these operations in reverse order in the write path:
+	 */
+
+	for (i = 0; i < nr_compat; i++)
+	switch (!write ? i : nr_compat - 1 - i) {
+	case 0:
+		if (big_endian != CPU_BIG_ENDIAN)
+			bch2_bkey_swab_key(f, k);
+		break;
+	case 1:
+		if (version < bcachefs_metadata_version_bkey_renumber)
+			bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write);
+		break;
+	case 2:
+		if (version < bcachefs_metadata_version_inode_btree_change &&
+		    btree_id == BTREE_ID_inodes) {
+			if (!bkey_packed(k)) {
+				struct bkey_i *u = packed_to_bkey(k);
+				swap(u->k.p.inode, u->k.p.offset);
+			} else if (f->bits_per_field[BKEY_FIELD_INODE] &&
+				   f->bits_per_field[BKEY_FIELD_OFFSET]) {
+				struct bkey_format tmp = *f, *in = f, *out = &tmp;
+
+				/* tmp is @f with the inode/offset fields exchanged */
+				swap(tmp.bits_per_field[BKEY_FIELD_INODE],
+				     tmp.bits_per_field[BKEY_FIELD_OFFSET]);
+				swap(tmp.field_offset[BKEY_FIELD_INODE],
+				     tmp.field_offset[BKEY_FIELD_OFFSET]);
+
+				if (!write)
+					swap(in, out);
+
+				uk = __bch2_bkey_unpack_key(in, k);
+				swap(uk.p.inode, uk.p.offset);
+				BUG_ON(!bch2_bkey_pack_key(k, &uk, out));
+			}
+		}
+		break;
+	case 3:
+		if (version < bcachefs_metadata_version_snapshot &&
+		    (level || btree_type_has_snapshots(btree_id))) {
+			struct bkey_i *u = packed_to_bkey(k);
+
+			if (u) {
+				u->k.p.snapshot = write
+					? 0 : U32_MAX;
+			} else {
+				/*
+				 * field_offset is stored little-endian (see
+				 * bkey_field_max()), so convert before doing
+				 * arithmetic on it; bkey_field_max() also
+				 * avoids shifting by 64.
+				 */
+				u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]);
+				u64 max_packed = bkey_field_max(f, BKEY_FIELD_SNAPSHOT);
+
+				uk = __bch2_bkey_unpack_key(f, k);
+				uk.p.snapshot = write
+					? min_packed : min_t(u64, U32_MAX, max_packed);
+
+				BUG_ON(!bch2_bkey_pack_key(k, &uk, f));
+			}
+		}
+
+		break;
+	case 4:
+		/*
+		 * Types >= KEY_TYPE_MAX have no entry in bch2_bkey_ops[]:
+		 * there is nothing to swab or convert, and indexing the
+		 * table with them would read past its end.
+		 */
+		if (k->type >= KEY_TYPE_MAX)
+			break;
+
+		if (!bkey_packed(k)) {
+			u = bkey_i_to_s(packed_to_bkey(k));
+		} else {
+			uk = __bch2_bkey_unpack_key(f, k);
+			u.k = &uk;
+			u.v = bkeyp_val(f, k);
+		}
+
+		if (big_endian != CPU_BIG_ENDIAN)
+			bch2_bkey_swab_val(u);
+
+		ops = &bch2_bkey_ops[k->type];
+
+		if (ops->compat)
+			ops->compat(btree_id, version, big_endian, write, u);
+		break;
+	default:
+		BUG();
+	}
+}
|
|
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
|
|
new file mode 100644
|
|
index 000000000000..bfa6f112aeed
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bkey_methods.h
|
|
@@ -0,0 +1,81 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BKEY_METHODS_H
|
|
+#define _BCACHEFS_BKEY_METHODS_H
|
|
+
|
|
+#include "bkey.h"
|
|
+
|
|
+struct bch_fs;
|
|
+struct btree;
|
|
+struct bkey;
|
|
+enum btree_node_type;
|
|
+
|
|
+extern const char * const bch2_bkey_types[];
|
|
+
|
|
+enum merge_result {
|
|
+ BCH_MERGE_NOMERGE,
|
|
+
|
|
+ /*
|
|
+ * The keys were mergeable, but would have overflowed size - so instead
|
|
+ * l was changed to the maximum size, and both keys were modified:
|
|
+ */
|
|
+ BCH_MERGE_PARTIAL,
|
|
+ BCH_MERGE_MERGE,
|
|
+};
|
|
+
|
|
+struct bkey_ops {
|
|
+ /* Returns reason for being invalid if invalid, else NULL: */
|
|
+ const char * (*key_invalid)(const struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+ void (*val_to_text)(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+ void (*swab)(struct bkey_s);
|
|
+ bool (*key_normalize)(struct bch_fs *, struct bkey_s);
|
|
+ enum merge_result (*key_merge)(struct bch_fs *,
|
|
+ struct bkey_s, struct bkey_s);
|
|
+ void (*compat)(enum btree_id id, unsigned version,
|
|
+ unsigned big_endian, int write,
|
|
+ struct bkey_s);
|
|
+};
|
|
+
|
|
+const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c);
|
|
+const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
|
|
+ enum btree_node_type);
|
|
+const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c,
|
|
+ enum btree_node_type);
|
|
+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c);
|
|
+
|
|
+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
|
|
+
|
|
+void bch2_bpos_to_text(struct printbuf *, struct bpos);
|
|
+void bch2_bkey_to_text(struct printbuf *, const struct bkey *);
|
|
+void bch2_val_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+
|
|
+void bch2_bkey_swab_val(struct bkey_s);
|
|
+
|
|
+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s);
|
|
+
|
|
+enum merge_result bch2_bkey_merge(struct bch_fs *,
|
|
+ struct bkey_s, struct bkey_s);
|
|
+
|
|
+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
|
|
+
|
|
+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned,
|
|
+ int, struct bkey_format *, struct bkey_packed *);
|
|
+
|
|
+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id,
|
|
+ unsigned version, unsigned big_endian,
|
|
+ int write,
|
|
+ struct bkey_format *f,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ if (version < bcachefs_metadata_version_current ||
|
|
+ big_endian != CPU_BIG_ENDIAN)
|
|
+ __bch2_bkey_compat(level, btree_id, version,
|
|
+ big_endian, write, f, k);
|
|
+
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_BKEY_METHODS_H */
|
|
diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c
|
|
new file mode 100644
|
|
index 000000000000..537ab7919e88
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bkey_sort.c
|
|
@@ -0,0 +1,253 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "bkey_sort.h"
|
|
+#include "bset.h"
|
|
+#include "extents.h"
|
|
+
|
|
+typedef int (*sort_cmp_fn)(struct btree *,
|
|
+ struct bkey_packed *,
|
|
+ struct bkey_packed *);
|
|
+
|
|
+static inline bool sort_iter_end(struct sort_iter *iter)
|
|
+{
|
|
+ return !iter->used;
|
|
+}
|
|
+
|
|
+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from,
|
|
+ sort_cmp_fn cmp)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = from;
|
|
+ i + 1 < iter->used &&
|
|
+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0;
|
|
+ i++)
|
|
+ swap(iter->data[i], iter->data[i + 1]);
|
|
+}
|
|
+
|
|
+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp)
|
|
+{
|
|
+ unsigned i = iter->used;
|
|
+
|
|
+ while (i--)
|
|
+ sort_iter_sift(iter, i, cmp);
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
|
|
+{
|
|
+ return !sort_iter_end(iter) ? iter->data->k : NULL;
|
|
+}
|
|
+
|
|
+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
|
|
+{
|
|
+ struct sort_iter_set *i = iter->data;
|
|
+
|
|
+ BUG_ON(!iter->used);
|
|
+
|
|
+ i->k = bkey_next(i->k);
|
|
+
|
|
+ BUG_ON(i->k > i->end);
|
|
+
|
|
+ if (i->k == i->end)
|
|
+ array_remove_item(iter->data, iter->used, 0);
|
|
+ else
|
|
+ sort_iter_sift(iter, 0, cmp);
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter,
|
|
+ sort_cmp_fn cmp)
|
|
+{
|
|
+ struct bkey_packed *ret = sort_iter_peek(iter);
|
|
+
|
|
+ if (ret)
|
|
+ sort_iter_advance(iter, cmp);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If keys compare equal, compare by pointer order:
|
|
+ */
|
|
+static inline int key_sort_fix_overlapping_cmp(struct btree *b,
|
|
+ struct bkey_packed *l,
|
|
+ struct bkey_packed *r)
|
|
+{
|
|
+ return bch2_bkey_cmp_packed(b, l, r) ?:
|
|
+ cmp_int((unsigned long) l, (unsigned long) r);
|
|
+}
|
|
+
|
|
+static inline bool should_drop_next_key(struct sort_iter *iter)
|
|
+{
|
|
+ /*
|
|
+ * key_sort_fix_overlapping_cmp() ensures that when keys compare equal the older key
|
|
+ * comes first; so if l->k compares equal to r->k then l->k is older
|
|
+ * and should be dropped.
|
|
+ */
|
|
+ return iter->used >= 2 &&
|
|
+ !bch2_bkey_cmp_packed(iter->b,
|
|
+ iter->data[0].k,
|
|
+ iter->data[1].k);
|
|
+}
|
|
+
|
|
+struct btree_nr_keys
|
|
+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
|
|
+ struct sort_iter *iter)
|
|
+{
|
|
+ struct bkey_packed *out = dst->start;
|
|
+ struct bkey_packed *k;
|
|
+ struct btree_nr_keys nr;
|
|
+
|
|
+ memset(&nr, 0, sizeof(nr));
|
|
+
|
|
+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp);
|
|
+
|
|
+ while ((k = sort_iter_peek(iter))) {
|
|
+ if (!bkey_deleted(k) &&
|
|
+ !should_drop_next_key(iter)) {
|
|
+ bkey_copy(out, k);
|
|
+ btree_keys_account_key_add(&nr, 0, out);
|
|
+ out = bkey_next(out);
|
|
+ }
|
|
+
|
|
+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp);
|
|
+ }
|
|
+
|
|
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
|
|
+ return nr;
|
|
+}
|
|
+
|
|
+static void extent_sort_append(struct bch_fs *c,
|
|
+ struct bkey_format *f,
|
|
+ struct btree_nr_keys *nr,
|
|
+ struct bkey_packed **out,
|
|
+ struct bkey_s k)
|
|
+{
|
|
+ if (!bkey_deleted(k.k)) {
|
|
+ if (!bch2_bkey_pack_key(*out, k.k, f))
|
|
+ memcpy_u64s_small(*out, k.k, BKEY_U64s);
|
|
+
|
|
+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
|
|
+
|
|
+ btree_keys_account_key_add(nr, 0, *out);
|
|
+ *out = bkey_next(*out);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Sort + repack in a new format: */
|
|
+struct btree_nr_keys
|
|
+bch2_sort_repack(struct bset *dst, struct btree *src,
|
|
+ struct btree_node_iter *src_iter,
|
|
+ struct bkey_format *out_f,
|
|
+ bool filter_whiteouts)
|
|
+{
|
|
+ struct bkey_format *in_f = &src->format;
|
|
+ struct bkey_packed *in, *out = vstruct_last(dst);
|
|
+ struct btree_nr_keys nr;
|
|
+
|
|
+ memset(&nr, 0, sizeof(nr));
|
|
+
|
|
+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) {
|
|
+ if (filter_whiteouts && bkey_deleted(in))
|
|
+ continue;
|
|
+
|
|
+ if (bch2_bkey_transform(out_f, out, bkey_packed(in)
|
|
+ ? in_f : &bch2_bkey_format_current, in))
|
|
+ out->format = KEY_FORMAT_LOCAL_BTREE;
|
|
+ else
|
|
+ bch2_bkey_unpack(src, (void *) out, in);
|
|
+
|
|
+ btree_keys_account_key_add(&nr, 0, out);
|
|
+ out = bkey_next(out);
|
|
+ }
|
|
+
|
|
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
|
|
+ return nr;
|
|
+}
|
|
+
|
|
+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
|
|
+struct btree_nr_keys
|
|
+bch2_sort_repack_merge(struct bch_fs *c,
|
|
+ struct bset *dst, struct btree *src,
|
|
+ struct btree_node_iter *iter,
|
|
+ struct bkey_format *out_f,
|
|
+ bool filter_whiteouts)
|
|
+{
|
|
+ struct bkey_packed *out = vstruct_last(dst), *k_packed;
|
|
+ struct bkey_buf k;
|
|
+ struct btree_nr_keys nr;
|
|
+
|
|
+ memset(&nr, 0, sizeof(nr));
|
|
+ bch2_bkey_buf_init(&k);
|
|
+
|
|
+ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
|
|
+ if (filter_whiteouts && bkey_deleted(k_packed))
|
|
+ continue;
|
|
+
|
|
+ /*
|
|
+ * NOTE:
|
|
+ * bch2_bkey_normalize may modify the key we pass it (dropping
|
|
+ * stale pointers) and we don't have a write lock on the src
|
|
+ * node; we have to make a copy of the entire key before calling
|
|
+ * normalize
|
|
+ */
|
|
+ bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
|
|
+ bch2_bkey_unpack(src, k.k, k_packed);
|
|
+
|
|
+ if (filter_whiteouts &&
|
|
+ bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
|
|
+ continue;
|
|
+
|
|
+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
|
|
+ }
|
|
+
|
|
+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
|
|
+ bch2_bkey_buf_exit(&k, c);
|
|
+ return nr;
|
|
+}
|
|
+
|
|
+static inline int sort_keys_cmp(struct btree *b,
|
|
+ struct bkey_packed *l,
|
|
+ struct bkey_packed *r)
|
|
+{
|
|
+ return bch2_bkey_cmp_packed(b, l, r) ?:
|
|
+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?:
|
|
+ (int) l->needs_whiteout - (int) r->needs_whiteout;
|
|
+}
|
|
+
|
|
+unsigned bch2_sort_keys(struct bkey_packed *dst,
|
|
+ struct sort_iter *iter,
|
|
+ bool filter_whiteouts)
|
|
+{
|
|
+ const struct bkey_format *f = &iter->b->format;
|
|
+ struct bkey_packed *in, *next, *out = dst;
|
|
+
|
|
+ sort_iter_sort(iter, sort_keys_cmp);
|
|
+
|
|
+ while ((in = sort_iter_next(iter, sort_keys_cmp))) {
|
|
+ bool needs_whiteout = false;
|
|
+
|
|
+ if (bkey_deleted(in) &&
|
|
+ (filter_whiteouts || !in->needs_whiteout))
|
|
+ continue;
|
|
+
|
|
+ while ((next = sort_iter_peek(iter)) &&
|
|
+ !bch2_bkey_cmp_packed(iter->b, in, next)) {
|
|
+ BUG_ON(in->needs_whiteout &&
|
|
+ next->needs_whiteout);
|
|
+ needs_whiteout |= in->needs_whiteout;
|
|
+ in = sort_iter_next(iter, sort_keys_cmp);
|
|
+ }
|
|
+
|
|
+ if (bkey_deleted(in)) {
|
|
+ memcpy_u64s(out, in, bkeyp_key_u64s(f, in));
|
|
+ set_bkeyp_val_u64s(f, out, 0);
|
|
+ } else {
|
|
+ bkey_copy(out, in);
|
|
+ }
|
|
+ out->needs_whiteout |= needs_whiteout;
|
|
+ out = bkey_next(out);
|
|
+ }
|
|
+
|
|
+ return (u64 *) out - (u64 *) dst;
|
|
+}
|
|
diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h
|
|
new file mode 100644
|
|
index 000000000000..1059996dac78
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bkey_sort.h
|
|
@@ -0,0 +1,49 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BKEY_SORT_H
|
|
+#define _BCACHEFS_BKEY_SORT_H
|
|
+
|
|
+struct sort_iter {
|
|
+ struct btree *b;
|
|
+ unsigned used;
|
|
+ unsigned size;
|
|
+
|
|
+ struct sort_iter_set {
|
|
+ struct bkey_packed *k, *end;
|
|
+ } data[MAX_BSETS + 1];
|
|
+};
|
|
+
|
|
+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b)
|
|
+{
|
|
+ iter->b = b;
|
|
+ iter->used = 0;
|
|
+ iter->size = ARRAY_SIZE(iter->data);
|
|
+}
|
|
+
|
|
+static inline void sort_iter_add(struct sort_iter *iter,
|
|
+ struct bkey_packed *k,
|
|
+ struct bkey_packed *end)
|
|
+{
|
|
+ BUG_ON(iter->used >= iter->size);
|
|
+
|
|
+ if (k != end)
|
|
+ iter->data[iter->used++] = (struct sort_iter_set) { k, end };
|
|
+}
|
|
+
|
|
+struct btree_nr_keys
|
|
+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *,
|
|
+ struct sort_iter *);
|
|
+
|
|
+struct btree_nr_keys
|
|
+bch2_sort_repack(struct bset *, struct btree *,
|
|
+ struct btree_node_iter *,
|
|
+ struct bkey_format *, bool);
|
|
+struct btree_nr_keys
|
|
+bch2_sort_repack_merge(struct bch_fs *,
|
|
+ struct bset *, struct btree *,
|
|
+ struct btree_node_iter *,
|
|
+ struct bkey_format *, bool);
|
|
+
|
|
+unsigned bch2_sort_keys(struct bkey_packed *,
|
|
+ struct sort_iter *, bool);
|
|
+
|
|
+#endif /* _BCACHEFS_BKEY_SORT_H */
|
|
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
|
|
new file mode 100644
|
|
index 000000000000..f92a757f953d
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bset.c
|
|
@@ -0,0 +1,1712 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Code for working with individual keys, and sorted sets of keys within a
|
|
+ * btree node
|
|
+ *
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_cache.h"
|
|
+#include "bset.h"
|
|
+#include "eytzinger.h"
|
|
+#include "util.h"
|
|
+
|
|
+#include <asm/unaligned.h>
|
|
+#include <linux/console.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/prefetch.h>
|
|
+
|
|
+/* hack.. */
|
|
+#include "alloc_types.h"
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *,
|
|
+ struct btree *);
|
|
+
|
|
+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter)
|
|
+{
|
|
+ unsigned n = ARRAY_SIZE(iter->data);
|
|
+
|
|
+ while (n && __btree_node_iter_set_end(iter, n - 1))
|
|
+ --n;
|
|
+
|
|
+ return n;
|
|
+}
|
|
+
|
|
+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k)
|
|
+{
|
|
+ unsigned offset = __btree_node_key_to_offset(b, k);
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ for_each_bset(b, t)
|
|
+ if (offset <= t->end_offset) {
|
|
+ EBUG_ON(offset < btree_bkey_first_offset(t));
|
|
+ return t;
|
|
+ }
|
|
+
|
|
+ BUG();
|
|
+}
|
|
+
|
|
+/*
|
|
+ * There are never duplicate live keys in the btree - but including keys that
|
|
+ * have been flagged as deleted (and will be cleaned up later) we _will_ see
|
|
+ * duplicates.
|
|
+ *
|
|
+ * Thus the sort order is: usual key comparison first, but for keys that compare
|
|
+ * equal the deleted key(s) come first, and the (at most one) live version comes
|
|
+ * last.
|
|
+ *
|
|
+ * The main reason for this is insertion: to handle overwrites, we first iterate
|
|
+ * over keys that compare equal to our insert key, and then insert immediately
|
|
+ * prior to the first key greater than the key we're inserting - our insert
|
|
+ * position will be after all keys that compare equal to our insert key, which
|
|
+ * by the time we actually do the insert will all be deleted.
|
|
+ */
|
|
+
|
|
+void bch2_dump_bset(struct bch_fs *c, struct btree *b,
|
|
+ struct bset *i, unsigned set)
|
|
+{
|
|
+ struct bkey_packed *_k, *_n;
|
|
+ struct bkey uk, n;
|
|
+ struct bkey_s_c k;
|
|
+ char buf[200];
|
|
+
|
|
+ if (!i->u64s)
|
|
+ return;
|
|
+
|
|
+ for (_k = i->start;
|
|
+ _k < vstruct_last(i);
|
|
+ _k = _n) {
|
|
+ _n = bkey_next(_k);
|
|
+
|
|
+ k = bkey_disassemble(b, _k, &uk);
|
|
+ if (c)
|
|
+ bch2_bkey_val_to_text(&PBUF(buf), c, k);
|
|
+ else
|
|
+ bch2_bkey_to_text(&PBUF(buf), k.k);
|
|
+ printk(KERN_ERR "block %u key %5zu: %s\n", set,
|
|
+ _k->_data - i->_data, buf);
|
|
+
|
|
+ if (_n == vstruct_last(i))
|
|
+ continue;
|
|
+
|
|
+ n = bkey_unpack_key(b, _n);
|
|
+
|
|
+ if (bpos_cmp(n.p, k.k->p) < 0) {
|
|
+ printk(KERN_ERR "Key skipped backwards\n");
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (!bkey_deleted(k.k) &&
|
|
+ !bpos_cmp(n.p, k.k->p))
|
|
+ printk(KERN_ERR "Duplicate keys\n");
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ console_lock();
|
|
+ for_each_bset(b, t)
|
|
+ bch2_dump_bset(c, b, bset(b, t), t - b->set);
|
|
+ console_unlock();
|
|
+}
|
|
+
|
|
+void bch2_dump_btree_node_iter(struct btree *b,
|
|
+ struct btree_node_iter *iter)
|
|
+{
|
|
+ struct btree_node_iter_set *set;
|
|
+
|
|
+ printk(KERN_ERR "btree node iter with %u/%u sets:\n",
|
|
+ __btree_node_iter_used(iter), b->nsets);
|
|
+
|
|
+ btree_node_iter_for_each(iter, set) {
|
|
+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
|
|
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
|
|
+ struct bkey uk = bkey_unpack_key(b, k);
|
|
+ char buf[100];
|
|
+
|
|
+ bch2_bkey_to_text(&PBUF(buf), &uk);
|
|
+ printk(KERN_ERR "set %zu key %u: %s\n",
|
|
+ t - b->set, set->k, buf);
|
|
+ }
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+
|
|
+void __bch2_verify_btree_nr_keys(struct btree *b)
|
|
+{
|
|
+ struct bset_tree *t;
|
|
+ struct bkey_packed *k;
|
|
+ struct btree_nr_keys nr = { 0 };
|
|
+
|
|
+ for_each_bset(b, t)
|
|
+ bset_tree_for_each_key(b, t, k)
|
|
+ if (!bkey_deleted(k))
|
|
+ btree_keys_account_key_add(&nr, t - b->set, k);
|
|
+
|
|
+ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
|
|
+}
|
|
+
|
|
+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ struct btree_node_iter iter = *_iter;
|
|
+ const struct bkey_packed *k, *n;
|
|
+
|
|
+ k = bch2_btree_node_iter_peek_all(&iter, b);
|
|
+ __bch2_btree_node_iter_advance(&iter, b);
|
|
+ n = bch2_btree_node_iter_peek_all(&iter, b);
|
|
+
|
|
+ bkey_unpack_key(b, k);
|
|
+
|
|
+ if (n &&
|
|
+ bkey_iter_cmp(b, k, n) > 0) {
|
|
+ struct btree_node_iter_set *set;
|
|
+ struct bkey ku = bkey_unpack_key(b, k);
|
|
+ struct bkey nu = bkey_unpack_key(b, n);
|
|
+ char buf1[80], buf2[80];
|
|
+
|
|
+ bch2_dump_btree_node(NULL, b);
|
|
+ bch2_bkey_to_text(&PBUF(buf1), &ku);
|
|
+ bch2_bkey_to_text(&PBUF(buf2), &nu);
|
|
+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n",
|
|
+ buf1, buf2);
|
|
+ printk(KERN_ERR "iter was:");
|
|
+
|
|
+ btree_node_iter_for_each(_iter, set) {
|
|
+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k);
|
|
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
|
|
+ printk(" [%zi %zi]", t - b->set,
|
|
+ k->_data - bset(b, t)->_data);
|
|
+ }
|
|
+ panic("\n");
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ struct btree_node_iter_set *set, *s2;
|
|
+ struct bkey_packed *k, *p;
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ if (bch2_btree_node_iter_end(iter))
|
|
+ return;
|
|
+
|
|
+ /* Verify no duplicates: */
|
|
+ btree_node_iter_for_each(iter, set)
|
|
+ btree_node_iter_for_each(iter, s2)
|
|
+ BUG_ON(set != s2 && set->end == s2->end);
|
|
+
|
|
+ /* Verify that set->end is correct: */
|
|
+ btree_node_iter_for_each(iter, set) {
|
|
+ for_each_bset(b, t)
|
|
+ if (set->end == t->end_offset)
|
|
+ goto found;
|
|
+ BUG();
|
|
+found:
|
|
+ BUG_ON(set->k < btree_bkey_first_offset(t) ||
|
|
+ set->k >= t->end_offset);
|
|
+ }
|
|
+
|
|
+ /* Verify iterator is sorted: */
|
|
+ btree_node_iter_for_each(iter, set)
|
|
+ BUG_ON(set != iter->data &&
|
|
+ btree_node_iter_cmp(b, set[-1], set[0]) > 0);
|
|
+
|
|
+ k = bch2_btree_node_iter_peek_all(iter, b);
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ if (iter->data[0].end == t->end_offset)
|
|
+ continue;
|
|
+
|
|
+ p = bch2_bkey_prev_all(b, t,
|
|
+ bch2_btree_node_iter_bset_pos(iter, b, t));
|
|
+
|
|
+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0);
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where,
|
|
+ struct bkey_packed *insert, unsigned clobber_u64s)
|
|
+{
|
|
+ struct bset_tree *t = bch2_bkey_to_bset(b, where);
|
|
+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where);
|
|
+ struct bkey_packed *next = (void *) (where->_data + clobber_u64s);
|
|
+#if 0
|
|
+ BUG_ON(prev &&
|
|
+ bkey_iter_cmp(b, prev, insert) > 0);
|
|
+#else
|
|
+ if (prev &&
|
|
+ bkey_iter_cmp(b, prev, insert) > 0) {
|
|
+ struct bkey k1 = bkey_unpack_key(b, prev);
|
|
+ struct bkey k2 = bkey_unpack_key(b, insert);
|
|
+ char buf1[100];
|
|
+ char buf2[100];
|
|
+
|
|
+ bch2_dump_btree_node(NULL, b);
|
|
+ bch2_bkey_to_text(&PBUF(buf1), &k1);
|
|
+ bch2_bkey_to_text(&PBUF(buf2), &k2);
|
|
+
|
|
+ panic("prev > insert:\n"
|
|
+ "prev key %s\n"
|
|
+ "insert key %s\n",
|
|
+ buf1, buf2);
|
|
+ }
|
|
+#endif
|
|
+#if 0
|
|
+ BUG_ON(next != btree_bkey_last(b, t) &&
|
|
+ bkey_iter_cmp(b, insert, next) > 0);
|
|
+#else
|
|
+ if (next != btree_bkey_last(b, t) &&
|
|
+ bkey_iter_cmp(b, insert, next) > 0) {
|
|
+ struct bkey k1 = bkey_unpack_key(b, insert);
|
|
+ struct bkey k2 = bkey_unpack_key(b, next);
|
|
+ char buf1[100];
|
|
+ char buf2[100];
|
|
+
|
|
+ bch2_dump_btree_node(NULL, b);
|
|
+ bch2_bkey_to_text(&PBUF(buf1), &k1);
|
|
+ bch2_bkey_to_text(&PBUF(buf2), &k2);
|
|
+
|
|
+ panic("insert > next:\n"
|
|
+ "insert key %s\n"
|
|
+ "next key %s\n",
|
|
+ buf1, buf2);
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+#else
|
|
+
|
|
+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter,
|
|
+ struct btree *b) {}
|
|
+
|
|
+#endif
|
|
+
|
|
+/* Auxiliary search trees */
|
|
+
|
|
+#define BFLOAT_FAILED_UNPACKED U8_MAX
|
|
+#define BFLOAT_FAILED U8_MAX
|
|
+
|
|
+struct bkey_float {
|
|
+ u8 exponent;
|
|
+ u8 key_offset;
|
|
+ u16 mantissa;
|
|
+};
|
|
+#define BKEY_MANTISSA_BITS 16
|
|
+
|
|
+static unsigned bkey_float_byte_offset(unsigned idx)
|
|
+{
|
|
+ return idx * sizeof(struct bkey_float);
|
|
+}
|
|
+
|
|
+struct ro_aux_tree {
|
|
+ struct bkey_float f[0];
|
|
+};
|
|
+
|
|
+struct rw_aux_tree {
|
|
+ u16 offset;
|
|
+ struct bpos k;
|
|
+};
|
|
+
|
|
+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
|
|
+{
|
|
+ BUG_ON(t->aux_data_offset == U16_MAX);
|
|
+
|
|
+ switch (bset_aux_tree_type(t)) {
|
|
+ case BSET_NO_AUX_TREE:
|
|
+ return t->aux_data_offset;
|
|
+ case BSET_RO_AUX_TREE:
|
|
+ return t->aux_data_offset +
|
|
+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) +
|
|
+ t->size * sizeof(u8), 8);
|
|
+ case BSET_RW_AUX_TREE:
|
|
+ return t->aux_data_offset +
|
|
+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8);
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static unsigned bset_aux_tree_buf_start(const struct btree *b,
|
|
+ const struct bset_tree *t)
|
|
+{
|
|
+ return t == b->set
|
|
+ ? DIV_ROUND_UP(b->unpack_fn_len, 8)
|
|
+ : bset_aux_tree_buf_end(t - 1);
|
|
+}
|
|
+
|
|
+static void *__aux_tree_base(const struct btree *b,
|
|
+ const struct bset_tree *t)
|
|
+{
|
|
+ return b->aux_data + t->aux_data_offset * 8;
|
|
+}
|
|
+
|
|
+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b,
|
|
+ const struct bset_tree *t)
|
|
+{
|
|
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
|
|
+
|
|
+ return __aux_tree_base(b, t);
|
|
+}
|
|
+
|
|
+static u8 *ro_aux_tree_prev(const struct btree *b,
|
|
+ const struct bset_tree *t)
|
|
+{
|
|
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
|
|
+
|
|
+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size);
|
|
+}
|
|
+
|
|
+static struct bkey_float *bkey_float(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ unsigned idx)
|
|
+{
|
|
+ return ro_aux_tree_base(b, t)->f + idx;
|
|
+}
|
|
+
|
|
+static void bset_aux_tree_verify(const struct btree *b)
|
|
+{
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ const struct bset_tree *t;
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ if (t->aux_data_offset == U16_MAX)
|
|
+ continue;
|
|
+
|
|
+ BUG_ON(t != b->set &&
|
|
+ t[-1].aux_data_offset == U16_MAX);
|
|
+
|
|
+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t));
|
|
+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b));
|
|
+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b));
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+void bch2_btree_keys_init(struct btree *b)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ b->nsets = 0;
|
|
+ memset(&b->nr, 0, sizeof(b->nr));
|
|
+
|
|
+ for (i = 0; i < MAX_BSETS; i++)
|
|
+ b->set[i].data_offset = U16_MAX;
|
|
+
|
|
+ bch2_bset_set_no_aux_tree(b, b->set);
|
|
+}
|
|
+
|
|
+/* Binary tree stuff for auxiliary search trees */
|
|
+
|
|
+/*
|
|
+ * Cacheline/offset <-> bkey pointer arithmetic:
|
|
+ *
|
|
+ * t->tree is a binary search tree in an array; each node corresponds to a key
|
|
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
|
|
+ *
|
|
+ * This means we don't have to store the full index of the key that a node in
|
|
+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and
|
|
+ * then bkey_float->key_offset gives us the offset within that cacheline, in units of 8
|
|
+ * bytes.
|
|
+ *
|
|
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
|
|
+ * make this work.
|
|
+ *
|
|
+ * To construct the bfloat for an arbitrary key we need to know what the key
|
|
+ * immediately preceding it is: we have to check if the two keys differ in the
|
|
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
|
|
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
|
|
+ */
|
|
+
|
|
+static inline void *bset_cacheline(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ unsigned cacheline)
|
|
+{
|
|
+ return (void *) round_down((unsigned long) btree_bkey_first(b, t),
|
|
+ L1_CACHE_BYTES) +
|
|
+ cacheline * BSET_CACHELINE;
|
|
+}
|
|
+
|
|
+static struct bkey_packed *cacheline_to_bkey(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ unsigned cacheline,
|
|
+ unsigned offset)
|
|
+{
|
|
+ return bset_cacheline(b, t, cacheline) + offset * 8;
|
|
+}
|
|
+
|
|
+static unsigned bkey_to_cacheline(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE;
|
|
+}
|
|
+
|
|
+static ssize_t __bkey_to_cacheline_offset(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ unsigned cacheline,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline);
|
|
+}
|
|
+
|
|
+static unsigned bkey_to_cacheline_offset(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ unsigned cacheline,
|
|
+ const struct bkey_packed *k)
|
|
+{
|
|
+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k);
|
|
+
|
|
+ EBUG_ON(m > U8_MAX);
|
|
+ return m;
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *tree_to_bkey(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ unsigned j)
|
|
+{
|
|
+ return cacheline_to_bkey(b, t,
|
|
+ __eytzinger1_to_inorder(j, t->size, t->extra),
|
|
+ bkey_float(b, t, j)->key_offset);
|
|
+}
|
|
+
|
|
+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ unsigned j)
|
|
+{
|
|
+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j];
|
|
+
|
|
+ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s);
|
|
+}
|
|
+
|
|
+static struct rw_aux_tree *rw_aux_tree(const struct btree *b,
|
|
+ const struct bset_tree *t)
|
|
+{
|
|
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
|
|
+
|
|
+ return __aux_tree_base(b, t);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * For the write set - the one we're currently inserting keys into - we don't
|
|
+ * maintain a full search tree, we just keep a simple lookup table (the rw_aux_tree array).
|
|
+ */
|
|
+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ unsigned j)
|
|
+{
|
|
+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset);
|
|
+}
|
|
+
|
|
+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t,
|
|
+ unsigned j, struct bkey_packed *k)
|
|
+{
|
|
+ EBUG_ON(k >= btree_bkey_last(b, t));
|
|
+
|
|
+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) {
|
|
+ .offset = __btree_node_key_to_offset(b, k),
|
|
+ .k = bkey_unpack_pos(b, k),
|
|
+ };
|
|
+}
|
|
+
|
|
+static void bch2_bset_verify_rw_aux_tree(struct btree *b,
|
|
+ struct bset_tree *t)
|
|
+{
|
|
+ struct bkey_packed *k = btree_bkey_first(b, t);
|
|
+ unsigned j = 0;
|
|
+
|
|
+ if (!bch2_expensive_debug_checks)
|
|
+ return;
|
|
+
|
|
+ BUG_ON(bset_has_ro_aux_tree(t));
|
|
+
|
|
+ if (!bset_has_rw_aux_tree(t))
|
|
+ return;
|
|
+
|
|
+ BUG_ON(t->size < 1);
|
|
+ BUG_ON(rw_aux_to_bkey(b, t, j) != k);
|
|
+
|
|
+ goto start;
|
|
+ while (1) {
|
|
+ if (rw_aux_to_bkey(b, t, j) == k) {
|
|
+ BUG_ON(bpos_cmp(rw_aux_tree(b, t)[j].k,
|
|
+ bkey_unpack_pos(b, k)));
|
|
+start:
|
|
+ if (++j == t->size)
|
|
+ break;
|
|
+
|
|
+ BUG_ON(rw_aux_tree(b, t)[j].offset <=
|
|
+ rw_aux_tree(b, t)[j - 1].offset);
|
|
+ }
|
|
+
|
|
+ k = bkey_next(k);
|
|
+ BUG_ON(k >= btree_bkey_last(b, t));
|
|
+ }
|
|
+}
|
|
+
|
|
+/* returns idx of first entry >= offset: */
|
|
+static unsigned rw_aux_tree_bsearch(struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ unsigned offset)
|
|
+{
|
|
+ unsigned bset_offs = offset - btree_bkey_first_offset(t);
|
|
+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t);
|
|
+ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0;
|
|
+
|
|
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE);
|
|
+ EBUG_ON(!t->size);
|
|
+ EBUG_ON(idx > t->size);
|
|
+
|
|
+ while (idx < t->size &&
|
|
+ rw_aux_tree(b, t)[idx].offset < offset)
|
|
+ idx++;
|
|
+
|
|
+ while (idx &&
|
|
+ rw_aux_tree(b, t)[idx - 1].offset >= offset)
|
|
+ idx--;
|
|
+
|
|
+ EBUG_ON(idx < t->size &&
|
|
+ rw_aux_tree(b, t)[idx].offset < offset);
|
|
+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset);
|
|
+ EBUG_ON(idx + 1 < t->size &&
|
|
+ rw_aux_tree(b, t)[idx].offset ==
|
|
+ rw_aux_tree(b, t)[idx + 1].offset);
|
|
+
|
|
+ return idx;
|
|
+}
|
|
+
|
|
+static inline unsigned bkey_mantissa(const struct bkey_packed *k,
|
|
+ const struct bkey_float *f,
|
|
+ unsigned idx)
|
|
+{
|
|
+ u64 v;
|
|
+
|
|
+ EBUG_ON(!bkey_packed(k));
|
|
+
|
|
+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
|
|
+
|
|
+ /*
|
|
+ * In little endian, we're shifting off low bits (and then the bits we
|
|
+ * want are at the low end), in big endian we're shifting off high bits
|
|
+ * (and then the bits we want are at the high end, so we shift them
|
|
+ * back down):
|
|
+ */
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+ v >>= f->exponent & 7;
|
|
+#else
|
|
+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS;
|
|
+#endif
|
|
+ return (u16) v;
|
|
+}
|
|
+
|
|
+__always_inline
|
|
+static inline void __make_bfloat(struct btree *b, struct bset_tree *t,
|
|
+ unsigned j,
|
|
+ struct bkey_packed *min_key,
|
|
+ struct bkey_packed *max_key)
|
|
+{
|
|
+ struct bkey_float *f = bkey_float(b, t, j);
|
|
+ struct bkey_packed *m = tree_to_bkey(b, t, j);
|
|
+ struct bkey_packed *l = is_power_of_2(j)
|
|
+ ? min_key
|
|
+ : tree_to_prev_bkey(b, t, j >> ffs(j));
|
|
+ struct bkey_packed *r = is_power_of_2(j + 1)
|
|
+ ? max_key
|
|
+ : tree_to_bkey(b, t, j >> (ffz(j) + 1));
|
|
+ unsigned mantissa;
|
|
+ int shift, exponent, high_bit;
|
|
+
|
|
+ /*
|
|
+ * for failed bfloats, the lookup code falls back to comparing against
|
|
+ * the original key.
|
|
+ */
|
|
+
|
|
+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) ||
|
|
+ !b->nr_key_bits) {
|
|
+ f->exponent = BFLOAT_FAILED_UNPACKED;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * The greatest differing bit of l and r is the first bit we must
|
|
+ * include in the bfloat mantissa we're creating in order to do
|
|
+ * comparisons - that bit always becomes the high bit of
|
|
+ * bfloat->mantissa, and thus the exponent we're calculating here is
|
|
+ * the position of what will become the low bit in bfloat->mantissa:
|
|
+ *
|
|
+ * Note that this may be negative - we may be running off the low end
|
|
+ * of the key: we handle this later:
|
|
+ */
|
|
+ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r),
|
|
+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1);
|
|
+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1);
|
|
+
|
|
+ /*
|
|
+ * Then we calculate the actual shift value, from the start of the key
|
|
+ * (k->_data), to get the key bits starting at exponent:
|
|
+ */
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent;
|
|
+
|
|
+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64);
|
|
+#else
|
|
+ shift = high_bit_offset +
|
|
+ b->nr_key_bits -
|
|
+ exponent -
|
|
+ BKEY_MANTISSA_BITS;
|
|
+
|
|
+ EBUG_ON(shift < KEY_PACKED_BITS_START);
|
|
+#endif
|
|
+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED);
|
|
+
|
|
+ f->exponent = shift;
|
|
+ mantissa = bkey_mantissa(m, f, j);
|
|
+
|
|
+ /*
|
|
+ * If we've got garbage bits, set them to all 1s - it's legal for the
|
|
+ * bfloat to compare larger than the original key, but not smaller:
|
|
+ */
|
|
+ if (exponent < 0)
|
|
+ mantissa |= ~(~0U << -exponent);
|
|
+
|
|
+ f->mantissa = mantissa;
|
|
+}
|
|
+
|
|
+static void make_bfloat(struct btree *b, struct bset_tree *t,
|
|
+ unsigned j,
|
|
+ struct bkey_packed *min_key,
|
|
+ struct bkey_packed *max_key)
|
|
+{
|
|
+ struct bkey_i *k;
|
|
+
|
|
+ if (is_power_of_2(j) &&
|
|
+ !min_key->u64s) {
|
|
+ if (!bkey_pack_pos(min_key, b->data->min_key, b)) {
|
|
+ k = (void *) min_key;
|
|
+ bkey_init(&k->k);
|
|
+ k->k.p = b->data->min_key;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (is_power_of_2(j + 1) &&
|
|
+ !max_key->u64s) {
|
|
+ if (!bkey_pack_pos(max_key, b->data->max_key, b)) {
|
|
+ k = (void *) max_key;
|
|
+ bkey_init(&k->k);
|
|
+ k->k.p = b->data->max_key;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ __make_bfloat(b, t, j, min_key, max_key);
|
|
+}
|
|
+
|
|
+/* bytes remaining - only valid for last bset: */
|
|
+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t)
|
|
+{
|
|
+ bset_aux_tree_verify(b);
|
|
+
|
|
+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64);
|
|
+}
|
|
+
|
|
+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t)
|
|
+{
|
|
+ return __bset_tree_capacity(b, t) /
|
|
+ (sizeof(struct bkey_float) + sizeof(u8));
|
|
+}
|
|
+
|
|
+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t)
|
|
+{
|
|
+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree);
|
|
+}
|
|
+
|
|
+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
|
|
+{
|
|
+ struct bkey_packed *k;
|
|
+
|
|
+ t->size = 1;
|
|
+ t->extra = BSET_RW_AUX_TREE_VAL;
|
|
+ rw_aux_tree(b, t)[0].offset =
|
|
+ __btree_node_key_to_offset(b, btree_bkey_first(b, t));
|
|
+
|
|
+ bset_tree_for_each_key(b, t, k) {
|
|
+ if (t->size == bset_rw_tree_capacity(b, t))
|
|
+ break;
|
|
+
|
|
+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) >
|
|
+ L1_CACHE_BYTES)
|
|
+ rw_aux_tree_set(b, t, t->size++, k);
|
|
+ }
|
|
+}
|
|
+
|
|
+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
|
|
+{
|
|
+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
|
|
+ struct bkey_i min_key, max_key;
|
|
+ unsigned j, cacheline = 1;
|
|
+
|
|
+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
|
|
+ bset_ro_tree_capacity(b, t));
|
|
+retry:
|
|
+ if (t->size < 2) {
|
|
+ t->size = 0;
|
|
+ t->extra = BSET_NO_AUX_TREE_VAL;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1;
|
|
+
|
|
+ /* First we figure out where the first key in each cacheline is */
|
|
+ eytzinger1_for_each(j, t->size) {
|
|
+ while (bkey_to_cacheline(b, t, k) < cacheline)
|
|
+ prev = k, k = bkey_next(k);
|
|
+
|
|
+ if (k >= btree_bkey_last(b, t)) {
|
|
+ /* XXX: this path sucks */
|
|
+ t->size--;
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ ro_aux_tree_prev(b, t)[j] = prev->u64s;
|
|
+ bkey_float(b, t, j)->key_offset =
|
|
+ bkey_to_cacheline_offset(b, t, cacheline++, k);
|
|
+
|
|
+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev);
|
|
+ EBUG_ON(tree_to_bkey(b, t, j) != k);
|
|
+ }
|
|
+
|
|
+ while (k != btree_bkey_last(b, t))
|
|
+ prev = k, k = bkey_next(k);
|
|
+
|
|
+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) {
|
|
+ bkey_init(&min_key.k);
|
|
+ min_key.k.p = b->data->min_key;
|
|
+ }
|
|
+
|
|
+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) {
|
|
+ bkey_init(&max_key.k);
|
|
+ max_key.k.p = b->data->max_key;
|
|
+ }
|
|
+
|
|
+ /* Then we build the tree */
|
|
+ eytzinger1_for_each(j, t->size)
|
|
+ __make_bfloat(b, t, j,
|
|
+ bkey_to_packed(&min_key),
|
|
+ bkey_to_packed(&max_key));
|
|
+}
|
|
+
|
|
+static void bset_alloc_tree(struct btree *b, struct bset_tree *t)
|
|
+{
|
|
+ struct bset_tree *i;
|
|
+
|
|
+ for (i = b->set; i != t; i++)
|
|
+ BUG_ON(bset_has_rw_aux_tree(i));
|
|
+
|
|
+ bch2_bset_set_no_aux_tree(b, t);
|
|
+
|
|
+ /* round up to next cacheline: */
|
|
+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t),
|
|
+ SMP_CACHE_BYTES / sizeof(u64));
|
|
+
|
|
+ bset_aux_tree_verify(b);
|
|
+}
|
|
+
|
|
+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t,
|
|
+ bool writeable)
|
|
+{
|
|
+ if (writeable
|
|
+ ? bset_has_rw_aux_tree(t)
|
|
+ : bset_has_ro_aux_tree(t))
|
|
+ return;
|
|
+
|
|
+ bset_alloc_tree(b, t);
|
|
+
|
|
+ if (!__bset_tree_capacity(b, t))
|
|
+ return;
|
|
+
|
|
+ if (writeable)
|
|
+ __build_rw_aux_tree(b, t);
|
|
+ else
|
|
+ __build_ro_aux_tree(b, t);
|
|
+
|
|
+ bset_aux_tree_verify(b);
|
|
+}
|
|
+
|
|
+void bch2_bset_init_first(struct btree *b, struct bset *i)
|
|
+{
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ BUG_ON(b->nsets);
|
|
+
|
|
+ memset(i, 0, sizeof(*i));
|
|
+ get_random_bytes(&i->seq, sizeof(i->seq));
|
|
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
|
|
+
|
|
+ t = &b->set[b->nsets++];
|
|
+ set_btree_bset(b, t, i);
|
|
+}
|
|
+
|
|
+void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
|
|
+ struct btree_node_entry *bne)
|
|
+{
|
|
+ struct bset *i = &bne->keys;
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
|
|
+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
|
|
+ BUG_ON(b->nsets >= MAX_BSETS);
|
|
+
|
|
+ memset(i, 0, sizeof(*i));
|
|
+ i->seq = btree_bset_first(b)->seq;
|
|
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
|
|
+
|
|
+ t = &b->set[b->nsets++];
|
|
+ set_btree_bset(b, t, i);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * find _some_ key in the same bset as @k that precedes @k - not necessarily the
|
|
+ * immediate predecessor:
|
|
+ */
|
|
+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ struct bkey_packed *p;
|
|
+ unsigned offset;
|
|
+ int j;
|
|
+
|
|
+ EBUG_ON(k < btree_bkey_first(b, t) ||
|
|
+ k > btree_bkey_last(b, t));
|
|
+
|
|
+ if (k == btree_bkey_first(b, t))
|
|
+ return NULL;
|
|
+
|
|
+ switch (bset_aux_tree_type(t)) {
|
|
+ case BSET_NO_AUX_TREE:
|
|
+ p = btree_bkey_first(b, t);
|
|
+ break;
|
|
+ case BSET_RO_AUX_TREE:
|
|
+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k));
|
|
+
|
|
+ do {
|
|
+ p = j ? tree_to_bkey(b, t,
|
|
+ __inorder_to_eytzinger1(j--,
|
|
+ t->size, t->extra))
|
|
+ : btree_bkey_first(b, t);
|
|
+ } while (p >= k);
|
|
+ break;
|
|
+ case BSET_RW_AUX_TREE:
|
|
+ offset = __btree_node_key_to_offset(b, k);
|
|
+ j = rw_aux_tree_bsearch(b, t, offset);
|
|
+ p = j ? rw_aux_to_bkey(b, t, j - 1)
|
|
+ : btree_bkey_first(b, t);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return p;
|
|
+}
|
|
+
|
|
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bkey_packed *k,
|
|
+ unsigned min_key_type)
|
|
+{
|
|
+ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
|
|
+
|
|
+ while ((p = __bkey_prev(b, t, k)) && !ret) {
|
|
+ for (i = p; i != k; i = bkey_next(i))
|
|
+ if (i->type >= min_key_type)
|
|
+ ret = i;
|
|
+
|
|
+ k = p;
|
|
+ }
|
|
+
|
|
+ if (bch2_expensive_debug_checks) {
|
|
+ BUG_ON(ret >= orig_k);
|
|
+
|
|
+ for (i = ret
|
|
+ ? bkey_next(ret)
|
|
+ : btree_bkey_first(b, t);
|
|
+ i != orig_k;
|
|
+ i = bkey_next(i))
|
|
+ BUG_ON(i->type >= min_key_type);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Insert */
|
|
+
|
|
+static void rw_aux_tree_fix_invalidated_key(struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ unsigned offset = __btree_node_key_to_offset(b, k);
|
|
+ unsigned j = rw_aux_tree_bsearch(b, t, offset);
|
|
+
|
|
+ if (j < t->size &&
|
|
+ rw_aux_tree(b, t)[j].offset == offset)
|
|
+ rw_aux_tree_set(b, t, j, k);
|
|
+
|
|
+ bch2_bset_verify_rw_aux_tree(b, t);
|
|
+}
|
|
+
|
|
+static void ro_aux_tree_fix_invalidated_key(struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ struct bkey_packed min_key, max_key;
|
|
+ unsigned inorder, j;
|
|
+
|
|
+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE);
|
|
+
|
|
+ /* signal to make_bfloat() that they're uninitialized: */
|
|
+ min_key.u64s = max_key.u64s = 0;
|
|
+
|
|
+ if (bkey_next(k) == btree_bkey_last(b, t)) {
|
|
+ for (j = 1; j < t->size; j = j * 2 + 1)
|
|
+ make_bfloat(b, t, j, &min_key, &max_key);
|
|
+ }
|
|
+
|
|
+ inorder = bkey_to_cacheline(b, t, k);
|
|
+
|
|
+ if (inorder &&
|
|
+ inorder < t->size) {
|
|
+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
|
|
+
|
|
+ if (k == tree_to_bkey(b, t, j)) {
|
|
+ /* Fix the node this key corresponds to */
|
|
+ make_bfloat(b, t, j, &min_key, &max_key);
|
|
+
|
|
+ /* Children for which this key is the right boundary */
|
|
+ for (j = eytzinger1_left_child(j);
|
|
+ j < t->size;
|
|
+ j = eytzinger1_right_child(j))
|
|
+ make_bfloat(b, t, j, &min_key, &max_key);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (inorder + 1 < t->size) {
|
|
+ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra);
|
|
+
|
|
+ if (k == tree_to_prev_bkey(b, t, j)) {
|
|
+ make_bfloat(b, t, j, &min_key, &max_key);
|
|
+
|
|
+ /* Children for which this key is the left boundary */
|
|
+ for (j = eytzinger1_right_child(j);
|
|
+ j < t->size;
|
|
+ j = eytzinger1_left_child(j))
|
|
+ make_bfloat(b, t, j, &min_key, &max_key);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been
|
|
+ * modified, fix any auxiliary search tree by remaking all the nodes in the
|
|
+ * auxiliary search tree that @k corresponds to
|
|
+ */
|
|
+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k)
|
|
+{
|
|
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
|
|
+
|
|
+ switch (bset_aux_tree_type(t)) {
|
|
+ case BSET_NO_AUX_TREE:
|
|
+ break;
|
|
+ case BSET_RO_AUX_TREE:
|
|
+ ro_aux_tree_fix_invalidated_key(b, t, k);
|
|
+ break;
|
|
+ case BSET_RW_AUX_TREE:
|
|
+ rw_aux_tree_fix_invalidated_key(b, t, k);
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void bch2_bset_fix_lookup_table(struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bkey_packed *_where,
|
|
+ unsigned clobber_u64s,
|
|
+ unsigned new_u64s)
|
|
+{
|
|
+ int shift = new_u64s - clobber_u64s;
|
|
+ unsigned l, j, where = __btree_node_key_to_offset(b, _where);
|
|
+
|
|
+ EBUG_ON(bset_has_ro_aux_tree(t));
|
|
+
|
|
+ if (!bset_has_rw_aux_tree(t))
|
|
+ return;
|
|
+
|
|
+ /* returns first entry >= where */
|
|
+ l = rw_aux_tree_bsearch(b, t, where);
|
|
+
|
|
+ if (!l) /* never delete first entry */
|
|
+ l++;
|
|
+ else if (l < t->size &&
|
|
+ where < t->end_offset &&
|
|
+ rw_aux_tree(b, t)[l].offset == where)
|
|
+ rw_aux_tree_set(b, t, l++, _where);
|
|
+
|
|
+ /* l now > where */
|
|
+
|
|
+ for (j = l;
|
|
+ j < t->size &&
|
|
+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s;
|
|
+ j++)
|
|
+ ;
|
|
+
|
|
+ if (j < t->size &&
|
|
+ rw_aux_tree(b, t)[j].offset + shift ==
|
|
+ rw_aux_tree(b, t)[l - 1].offset)
|
|
+ j++;
|
|
+
|
|
+ memmove(&rw_aux_tree(b, t)[l],
|
|
+ &rw_aux_tree(b, t)[j],
|
|
+ (void *) &rw_aux_tree(b, t)[t->size] -
|
|
+ (void *) &rw_aux_tree(b, t)[j]);
|
|
+ t->size -= j - l;
|
|
+
|
|
+ for (j = l; j < t->size; j++)
|
|
+ rw_aux_tree(b, t)[j].offset += shift;
|
|
+
|
|
+ EBUG_ON(l < t->size &&
|
|
+ rw_aux_tree(b, t)[l].offset ==
|
|
+ rw_aux_tree(b, t)[l - 1].offset);
|
|
+
|
|
+ if (t->size < bset_rw_tree_capacity(b, t) &&
|
|
+ (l < t->size
|
|
+ ? rw_aux_tree(b, t)[l].offset
|
|
+ : t->end_offset) -
|
|
+ rw_aux_tree(b, t)[l - 1].offset >
|
|
+ L1_CACHE_BYTES / sizeof(u64)) {
|
|
+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1);
|
|
+ struct bkey_packed *end = l < t->size
|
|
+ ? rw_aux_to_bkey(b, t, l)
|
|
+ : btree_bkey_last(b, t);
|
|
+ struct bkey_packed *k = start;
|
|
+
|
|
+ while (1) {
|
|
+ k = bkey_next(k);
|
|
+ if (k == end)
|
|
+ break;
|
|
+
|
|
+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) {
|
|
+ memmove(&rw_aux_tree(b, t)[l + 1],
|
|
+ &rw_aux_tree(b, t)[l],
|
|
+ (void *) &rw_aux_tree(b, t)[t->size] -
|
|
+ (void *) &rw_aux_tree(b, t)[l]);
|
|
+ t->size++;
|
|
+ rw_aux_tree_set(b, t, l, k);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_bset_verify_rw_aux_tree(b, t);
|
|
+ bset_aux_tree_verify(b);
|
|
+}
|
|
+
|
|
+void bch2_bset_insert(struct btree *b,
|
|
+ struct btree_node_iter *iter,
|
|
+ struct bkey_packed *where,
|
|
+ struct bkey_i *insert,
|
|
+ unsigned clobber_u64s)
|
|
+{
|
|
+ struct bkey_format *f = &b->format;
|
|
+ struct bset_tree *t = bset_tree_last(b);
|
|
+ struct bkey_packed packed, *src = bkey_to_packed(insert);
|
|
+
|
|
+ bch2_bset_verify_rw_aux_tree(b, t);
|
|
+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s);
|
|
+
|
|
+ if (bch2_bkey_pack_key(&packed, &insert->k, f))
|
|
+ src = &packed;
|
|
+
|
|
+ if (!bkey_deleted(&insert->k))
|
|
+ btree_keys_account_key_add(&b->nr, t - b->set, src);
|
|
+
|
|
+ if (src->u64s != clobber_u64s) {
|
|
+ u64 *src_p = where->_data + clobber_u64s;
|
|
+ u64 *dst_p = where->_data + src->u64s;
|
|
+
|
|
+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) <
|
|
+ (int) clobber_u64s - src->u64s);
|
|
+
|
|
+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
|
|
+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s);
|
|
+ set_btree_bset_end(b, t);
|
|
+ }
|
|
+
|
|
+ memcpy_u64s(where, src,
|
|
+ bkeyp_key_u64s(f, src));
|
|
+ memcpy_u64s(bkeyp_val(f, where), &insert->v,
|
|
+ bkeyp_val_u64s(f, src));
|
|
+
|
|
+ if (src->u64s != clobber_u64s)
|
|
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s);
|
|
+
|
|
+ bch2_verify_btree_nr_keys(b);
|
|
+}
|
|
+
|
|
+void bch2_bset_delete(struct btree *b,
|
|
+ struct bkey_packed *where,
|
|
+ unsigned clobber_u64s)
|
|
+{
|
|
+ struct bset_tree *t = bset_tree_last(b);
|
|
+ u64 *src_p = where->_data + clobber_u64s;
|
|
+ u64 *dst_p = where->_data;
|
|
+
|
|
+ bch2_bset_verify_rw_aux_tree(b, t);
|
|
+
|
|
+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s);
|
|
+
|
|
+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p);
|
|
+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s);
|
|
+ set_btree_bset_end(b, t);
|
|
+
|
|
+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0);
|
|
+}
|
|
+
|
|
+/* Lookup */
|
|
+
|
|
+__flatten
|
|
+static struct bkey_packed *bset_search_write_set(const struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bpos *search)
|
|
+{
|
|
+ unsigned l = 0, r = t->size;
|
|
+
|
|
+ while (l + 1 != r) {
|
|
+ unsigned m = (l + r) >> 1;
|
|
+
|
|
+ if (bpos_cmp(rw_aux_tree(b, t)[m].k, *search) < 0)
|
|
+ l = m;
|
|
+ else
|
|
+ r = m;
|
|
+ }
|
|
+
|
|
+ return rw_aux_to_bkey(b, t, l);
|
|
+}
|
|
+
|
|
+static inline void prefetch_four_cachelines(void *p)
|
|
+{
|
|
+#ifdef CONFIG_X86_64
|
|
+ asm(".intel_syntax noprefix;"
|
|
+ "prefetcht0 [%0 - 127 + 64 * 0];"
|
|
+ "prefetcht0 [%0 - 127 + 64 * 1];"
|
|
+ "prefetcht0 [%0 - 127 + 64 * 2];"
|
|
+ "prefetcht0 [%0 - 127 + 64 * 3];"
|
|
+ ".att_syntax prefix;"
|
|
+ :
|
|
+ : "r" (p + 127));
|
|
+#else
|
|
+ prefetch(p + L1_CACHE_BYTES * 0);
|
|
+ prefetch(p + L1_CACHE_BYTES * 1);
|
|
+ prefetch(p + L1_CACHE_BYTES * 2);
|
|
+ prefetch(p + L1_CACHE_BYTES * 3);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline bool bkey_mantissa_bits_dropped(const struct btree *b,
|
|
+ const struct bkey_float *f,
|
|
+ unsigned idx)
|
|
+{
|
|
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits;
|
|
+
|
|
+ return f->exponent > key_bits_start;
|
|
+#else
|
|
+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits;
|
|
+
|
|
+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end;
|
|
+#endif
|
|
+}
|
|
+
|
|
+__flatten
|
|
+static struct bkey_packed *bset_search_tree(const struct btree *b,
|
|
+ const struct bset_tree *t,
|
|
+ const struct bpos *search,
|
|
+ const struct bkey_packed *packed_search)
|
|
+{
|
|
+ struct ro_aux_tree *base = ro_aux_tree_base(b, t);
|
|
+ struct bkey_float *f;
|
|
+ struct bkey_packed *k;
|
|
+ unsigned inorder, n = 1, l, r;
|
|
+ int cmp;
|
|
+
|
|
+ do {
|
|
+ if (likely(n << 4 < t->size))
|
|
+ prefetch(&base->f[n << 4]);
|
|
+
|
|
+ f = &base->f[n];
|
|
+ if (unlikely(f->exponent >= BFLOAT_FAILED))
|
|
+ goto slowpath;
|
|
+
|
|
+ l = f->mantissa;
|
|
+ r = bkey_mantissa(packed_search, f, n);
|
|
+
|
|
+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n))
|
|
+ goto slowpath;
|
|
+
|
|
+ n = n * 2 + (l < r);
|
|
+ continue;
|
|
+slowpath:
|
|
+ k = tree_to_bkey(b, t, n);
|
|
+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search);
|
|
+ if (!cmp)
|
|
+ return k;
|
|
+
|
|
+ n = n * 2 + (cmp < 0);
|
|
+ } while (n < t->size);
|
|
+
|
|
+ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra);
|
|
+
|
|
+ /*
|
|
+ * n would have been the node we recursed to - the low bit tells us if
|
|
+ * we recursed left or recursed right.
|
|
+ */
|
|
+ if (likely(!(n & 1))) {
|
|
+ --inorder;
|
|
+ if (unlikely(!inorder))
|
|
+ return btree_bkey_first(b, t);
|
|
+
|
|
+ f = &base->f[eytzinger1_prev(n >> 1, t->size)];
|
|
+ }
|
|
+
|
|
+ return cacheline_to_bkey(b, t, inorder, f->key_offset);
|
|
+}
|
|
+
|
|
+static __always_inline __flatten
|
|
+struct bkey_packed *__bch2_bset_search(struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bpos *search,
|
|
+ const struct bkey_packed *lossy_packed_search)
|
|
+{
|
|
+
|
|
+ /*
|
|
+ * First, we search for a cacheline, then lastly we do a linear search
|
|
+ * within that cacheline.
|
|
+ *
|
|
+ * To search for the cacheline, there are three different possibilities:
|
|
+ * * The set is too small to have a search tree, so we just do a linear
|
|
+ * search over the whole set.
|
|
+ * * The set is the one we're currently inserting into; keeping a full
|
|
+ * auxiliary search tree up to date would be too expensive, so we
|
|
+ * use a much simpler lookup table to do a binary search -
|
|
+ * bset_search_write_set().
|
|
+ * * Or we use the auxiliary search tree we constructed earlier -
|
|
+ * bset_search_tree()
|
|
+ */
|
|
+
|
|
+ switch (bset_aux_tree_type(t)) {
|
|
+ case BSET_NO_AUX_TREE:
|
|
+ return btree_bkey_first(b, t);
|
|
+ case BSET_RW_AUX_TREE:
|
|
+ return bset_search_write_set(b, t, search);
|
|
+ case BSET_RO_AUX_TREE:
|
|
+ return bset_search_tree(b, t, search, lossy_packed_search);
|
|
+ default:
|
|
+ unreachable();
|
|
+ }
|
|
+}
|
|
+
|
|
+static __always_inline __flatten
|
|
+struct bkey_packed *bch2_bset_search_linear(struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bpos *search,
|
|
+ struct bkey_packed *packed_search,
|
|
+ const struct bkey_packed *lossy_packed_search,
|
|
+ struct bkey_packed *m)
|
|
+{
|
|
+ if (lossy_packed_search)
|
|
+ while (m != btree_bkey_last(b, t) &&
|
|
+ bkey_iter_cmp_p_or_unp(b, m,
|
|
+ lossy_packed_search, search) < 0)
|
|
+ m = bkey_next(m);
|
|
+
|
|
+ if (!packed_search)
|
|
+ while (m != btree_bkey_last(b, t) &&
|
|
+ bkey_iter_pos_cmp(b, m, search) < 0)
|
|
+ m = bkey_next(m);
|
|
+
|
|
+ if (bch2_expensive_debug_checks) {
|
|
+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
|
|
+
|
|
+ BUG_ON(prev &&
|
|
+ bkey_iter_cmp_p_or_unp(b, prev,
|
|
+ packed_search, search) >= 0);
|
|
+ }
|
|
+
|
|
+ return m;
|
|
+}
|
|
+
|
|
+/* Btree node iterator */
|
|
+
|
|
+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter,
|
|
+ struct btree *b,
|
|
+ const struct bkey_packed *k,
|
|
+ const struct bkey_packed *end)
|
|
+{
|
|
+ if (k != end) {
|
|
+ struct btree_node_iter_set *pos;
|
|
+
|
|
+ btree_node_iter_for_each(iter, pos)
|
|
+ ;
|
|
+
|
|
+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data));
|
|
+ *pos = (struct btree_node_iter_set) {
|
|
+ __btree_node_key_to_offset(b, k),
|
|
+ __btree_node_key_to_offset(b, end)
|
|
+ };
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_btree_node_iter_push(struct btree_node_iter *iter,
|
|
+ struct btree *b,
|
|
+ const struct bkey_packed *k,
|
|
+ const struct bkey_packed *end)
|
|
+{
|
|
+ __bch2_btree_node_iter_push(iter, b, k, end);
|
|
+ bch2_btree_node_iter_sort(iter, b);
|
|
+}
|
|
+
|
|
+noinline __flatten __attribute__((cold))
|
|
+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter,
|
|
+ struct btree *b, struct bpos *search)
|
|
+{
|
|
+ struct bkey_packed *k;
|
|
+
|
|
+ trace_bkey_pack_pos_fail(search);
|
|
+
|
|
+ bch2_btree_node_iter_init_from_start(iter, b);
|
|
+
|
|
+ while ((k = bch2_btree_node_iter_peek(iter, b)) &&
|
|
+ bkey_iter_pos_cmp(b, k, search) < 0)
|
|
+ bch2_btree_node_iter_advance(iter, b);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_node_iter_init - initialize a btree node iterator, starting from a
|
|
+ * given position
|
|
+ *
|
|
+ * Main entry point to the lookup code for individual btree nodes:
|
|
+ *
|
|
+ * NOTE:
|
|
+ *
|
|
+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate
|
|
+ * keys. This doesn't matter for most code, but it does matter for lookups.
|
|
+ *
|
|
+ * Some adjacent keys with a string of equal keys:
|
|
+ * i j k k k k l m
|
|
+ *
|
|
+ * If you search for k, the lookup code isn't guaranteed to return you any
|
|
+ * specific k. The lookup code is conceptually doing a binary search and
|
|
+ * iterating backwards is very expensive so if the pivot happens to land at the
|
|
+ * last k that's what you'll get.
|
|
+ *
|
|
+ * This works out ok, but it's something to be aware of:
|
|
+ *
|
|
+ * - For non extents, we guarantee that the live key comes last - see
|
|
+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't
|
|
+ * see will only be deleted keys you don't care about.
|
|
+ *
|
|
+ * - For extents, deleted keys sort last (see the comment at the top of this
|
|
+ * file). But when you're searching for extents, you actually want the first
|
|
+ * key strictly greater than your search key - an extent that compares equal
|
|
+ * to the search key is going to have 0 sectors after the search key.
|
|
+ *
|
|
+ * But this does mean that we can't just search for
|
|
+ * bpos_successor(start_of_range) to get the first extent that overlaps with
|
|
+ * the range we want - if we're unlucky and there's an extent that ends
|
|
+ * exactly where we searched, then there could be a deleted key at the same
|
|
+ * position and we'd get that when we search instead of the preceding extent
|
|
+ * we needed.
|
|
+ *
|
|
+ * So we've got to search for start_of_range, then after the lookup iterate
|
|
+ * past any extents that compare equal to the position we searched for.
|
|
+ */
|
|
+__flatten
|
|
+void bch2_btree_node_iter_init(struct btree_node_iter *iter,
|
|
+ struct btree *b, struct bpos *search)
|
|
+{
|
|
+ struct bkey_packed p, *packed_search = NULL;
|
|
+ struct btree_node_iter_set *pos = iter->data;
|
|
+ struct bkey_packed *k[MAX_BSETS];
|
|
+ unsigned i;
|
|
+
|
|
+ EBUG_ON(bpos_cmp(*search, b->data->min_key) < 0);
|
|
+ EBUG_ON(bpos_cmp(*search, b->data->max_key) > 0);
|
|
+ bset_aux_tree_verify(b);
|
|
+
|
|
+ memset(iter, 0, sizeof(*iter));
|
|
+
|
|
+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) {
|
|
+ case BKEY_PACK_POS_EXACT:
|
|
+ packed_search = &p;
|
|
+ break;
|
|
+ case BKEY_PACK_POS_SMALLER:
|
|
+ packed_search = NULL;
|
|
+ break;
|
|
+ case BKEY_PACK_POS_FAIL:
|
|
+ btree_node_iter_init_pack_failed(iter, b, search);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < b->nsets; i++) {
|
|
+ k[i] = __bch2_bset_search(b, b->set + i, search, &p);
|
|
+ prefetch_four_cachelines(k[i]);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < b->nsets; i++) {
|
|
+ struct bset_tree *t = b->set + i;
|
|
+ struct bkey_packed *end = btree_bkey_last(b, t);
|
|
+
|
|
+ k[i] = bch2_bset_search_linear(b, t, search,
|
|
+ packed_search, &p, k[i]);
|
|
+ if (k[i] != end)
|
|
+ *pos++ = (struct btree_node_iter_set) {
|
|
+ __btree_node_key_to_offset(b, k[i]),
|
|
+ __btree_node_key_to_offset(b, end)
|
|
+ };
|
|
+ }
|
|
+
|
|
+ bch2_btree_node_iter_sort(iter, b);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ memset(iter, 0, sizeof(*iter));
|
|
+
|
|
+ for_each_bset(b, t)
|
|
+ __bch2_btree_node_iter_push(iter, b,
|
|
+ btree_bkey_first(b, t),
|
|
+ btree_bkey_last(b, t));
|
|
+ bch2_btree_node_iter_sort(iter, b);
|
|
+}
|
|
+
|
|
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct bset_tree *t)
|
|
+{
|
|
+ struct btree_node_iter_set *set;
|
|
+
|
|
+ btree_node_iter_for_each(iter, set)
|
|
+ if (set->end == t->end_offset)
|
|
+ return __btree_node_offset_to_key(b, set->k);
|
|
+
|
|
+ return btree_bkey_last(b, t);
|
|
+}
|
|
+
|
|
+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter,
|
|
+ struct btree *b,
|
|
+ unsigned first)
|
|
+{
|
|
+ bool ret;
|
|
+
|
|
+ if ((ret = (btree_node_iter_cmp(b,
|
|
+ iter->data[first],
|
|
+ iter->data[first + 1]) > 0)))
|
|
+ swap(iter->data[first], iter->data[first + 1]);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_btree_node_iter_sort(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ /* unrolled bubble sort: */
|
|
+
|
|
+ if (!__btree_node_iter_set_end(iter, 2)) {
|
|
+ btree_node_iter_sort_two(iter, b, 0);
|
|
+ btree_node_iter_sort_two(iter, b, 1);
|
|
+ }
|
|
+
|
|
+ if (!__btree_node_iter_set_end(iter, 1))
|
|
+ btree_node_iter_sort_two(iter, b, 0);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter,
|
|
+ struct btree_node_iter_set *set)
|
|
+{
|
|
+ struct btree_node_iter_set *last =
|
|
+ iter->data + ARRAY_SIZE(iter->data) - 1;
|
|
+
|
|
+ memmove(&set[0], &set[1], (void *) last - (void *) set);
|
|
+ *last = (struct btree_node_iter_set) { 0, 0 };
|
|
+}
|
|
+
|
|
+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s;
|
|
+
|
|
+ EBUG_ON(iter->data->k > iter->data->end);
|
|
+
|
|
+ while (!__btree_node_iter_set_end(iter, 0) &&
|
|
+ !__bch2_btree_node_iter_peek_all(iter, b)->u64s)
|
|
+ iter->data->k++;
|
|
+
|
|
+ if (unlikely(__btree_node_iter_set_end(iter, 0))) {
|
|
+ bch2_btree_node_iter_set_drop(iter, iter->data);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (__btree_node_iter_set_end(iter, 1))
|
|
+ return;
|
|
+
|
|
+ if (!btree_node_iter_sort_two(iter, b, 0))
|
|
+ return;
|
|
+
|
|
+ if (__btree_node_iter_set_end(iter, 2))
|
|
+ return;
|
|
+
|
|
+ btree_node_iter_sort_two(iter, b, 1);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_iter_advance(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ if (bch2_expensive_debug_checks) {
|
|
+ bch2_btree_node_iter_verify(iter, b);
|
|
+ bch2_btree_node_iter_next_check(iter, b);
|
|
+ }
|
|
+
|
|
+ __bch2_btree_node_iter_advance(iter, b);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Expensive:
|
|
+ */
|
|
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ struct bkey_packed *k, *prev = NULL;
|
|
+ struct btree_node_iter_set *set;
|
|
+ struct bset_tree *t;
|
|
+ unsigned end = 0;
|
|
+
|
|
+ if (bch2_expensive_debug_checks)
|
|
+ bch2_btree_node_iter_verify(iter, b);
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ k = bch2_bkey_prev_all(b, t,
|
|
+ bch2_btree_node_iter_bset_pos(iter, b, t));
|
|
+ if (k &&
|
|
+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) {
|
|
+ prev = k;
|
|
+ end = t->end_offset;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!prev)
|
|
+ return NULL;
|
|
+
|
|
+ /*
|
|
+ * We're manually memmoving instead of just calling sort() to ensure the
|
|
+ * prev we picked ends up in slot 0 - sort won't necessarily put it
|
|
+ * there because of duplicate deleted keys:
|
|
+ */
|
|
+ btree_node_iter_for_each(iter, set)
|
|
+ if (set->end == end)
|
|
+ goto found;
|
|
+
|
|
+ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]);
|
|
+found:
|
|
+ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data));
|
|
+
|
|
+ memmove(&iter->data[1],
|
|
+ &iter->data[0],
|
|
+ (void *) set - (void *) &iter->data[0]);
|
|
+
|
|
+ iter->data[0].k = __btree_node_key_to_offset(b, prev);
|
|
+ iter->data[0].end = end;
|
|
+
|
|
+ if (bch2_expensive_debug_checks)
|
|
+ bch2_btree_node_iter_verify(iter, b);
|
|
+ return prev;
|
|
+}
|
|
+
|
|
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ struct bkey_packed *prev;
|
|
+
|
|
+ do {
|
|
+ prev = bch2_btree_node_iter_prev_all(iter, b);
|
|
+ } while (prev && bkey_deleted(prev));
|
|
+
|
|
+ return prev;
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct bkey *u)
|
|
+{
|
|
+ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b);
|
|
+
|
|
+ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null;
|
|
+}
|
|
+
|
|
+/* Mergesort */
|
|
+
|
|
+void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats)
|
|
+{
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ enum bset_aux_tree_type type = bset_aux_tree_type(t);
|
|
+ size_t j;
|
|
+
|
|
+ stats->sets[type].nr++;
|
|
+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) *
|
|
+ sizeof(u64);
|
|
+
|
|
+ if (bset_has_ro_aux_tree(t)) {
|
|
+ stats->floats += t->size - 1;
|
|
+
|
|
+ for (j = 1; j < t->size; j++)
|
|
+ stats->failed +=
|
|
+ bkey_float(b, t, j)->exponent ==
|
|
+ BFLOAT_FAILED;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
|
|
+ struct bkey uk;
|
|
+ unsigned j, inorder;
|
|
+
|
|
+ if (out->pos != out->end)
|
|
+ *out->pos = '\0';
|
|
+
|
|
+ if (!bset_has_ro_aux_tree(t))
|
|
+ return;
|
|
+
|
|
+ inorder = bkey_to_cacheline(b, t, k);
|
|
+ if (!inorder || inorder >= t->size)
|
|
+ return;
|
|
+
|
|
+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra);
|
|
+ if (k != tree_to_bkey(b, t, j))
|
|
+ return;
|
|
+
|
|
+ switch (bkey_float(b, t, j)->exponent) {
|
|
+ case BFLOAT_FAILED:
|
|
+ uk = bkey_unpack_key(b, k);
|
|
+ pr_buf(out,
|
|
+ " failed unpacked at depth %u\n"
|
|
+ "\t",
|
|
+ ilog2(j));
|
|
+ bch2_bpos_to_text(out, uk.p);
|
|
+ pr_buf(out, "\n");
|
|
+ break;
|
|
+ }
|
|
+}
|
|
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
|
|
new file mode 100644
|
|
index 000000000000..e42f866cf2ec
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/bset.h
|
|
@@ -0,0 +1,616 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BSET_H
|
|
+#define _BCACHEFS_BSET_H
|
|
+
|
|
+#include <linux/kernel.h>
|
|
+#include <linux/types.h>
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_types.h"
|
|
+#include "util.h" /* for time_stats */
|
|
+#include "vstructs.h"
|
|
+
|
|
+/*
|
|
+ * BKEYS:
|
|
+ *
|
|
+ * A bkey contains a key, a size field, a variable number of pointers, and some
|
|
+ * ancillary flag bits.
|
|
+ *
|
|
+ * We use two different functions for validating bkeys, bkey_invalid and
|
|
+ * bkey_deleted().
|
|
+ *
|
|
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
|
|
+ * that it also filters out keys of size 0 - these are keys that have been
|
|
+ * completely overwritten. It'd be safe to delete these in memory while leaving
|
|
+ * them on disk, just unnecessary work - so we filter them out when resorting
|
|
+ * instead.
|
|
+ *
|
|
+ * We can't filter out stale keys when we're resorting, because garbage
|
|
+ * collection needs to find them to ensure bucket gens don't wrap around -
|
|
+ * unless we're rewriting the btree node those stale keys still exist on disk.
|
|
+ *
|
|
+ * We also implement functions here for removing some number of sectors from the
|
|
+ * front or the back of a bkey - this is mainly used for fixing overlapping
|
|
+ * extents, by removing the overlapping sectors from the older key.
|
|
+ *
|
|
+ * BSETS:
|
|
+ *
|
|
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
|
|
+ * along with a header. A btree node is made up of a number of these, written at
|
|
+ * different times.
|
|
+ *
|
|
+ * There could be many of them on disk, but we never allow there to be more than
|
|
+ * 4 in memory - we lazily resort as needed.
|
|
+ *
|
|
+ * We implement code here for creating and maintaining auxiliary search trees
|
|
+ * (described below) for searching an individual bset, and on top of that we
|
|
+ * implement a btree iterator.
|
|
+ *
|
|
+ * BTREE ITERATOR:
|
|
+ *
|
|
+ * Most of the code in bcache doesn't care about an individual bset - it needs
|
|
+ * to search entire btree nodes and iterate over them in sorted order.
|
|
+ *
|
|
+ * The btree iterator code serves both functions; it iterates through the keys
|
|
+ * in a btree node in sorted order, starting from either keys after a specific
|
|
+ * point (if you pass it a search key) or the start of the btree node.
|
|
+ *
|
|
+ * AUXILIARY SEARCH TREES:
|
|
+ *
|
|
+ * Since keys are variable length, we can't use a binary search on a bset - we
|
|
+ * wouldn't be able to find the start of the next key. But binary searches are
|
|
+ * slow anyways, due to terrible cache behaviour; bcache originally used binary
|
|
+ * searches and that code topped out at under 50k lookups/second.
|
|
+ *
|
|
+ * So we need to construct some sort of lookup table. Since we only insert keys
|
|
+ * into the last (unwritten) set, most of the keys within a given btree node are
|
|
+ * usually in sets that are mostly constant. We use two different types of
|
|
+ * lookup tables to take advantage of this.
|
|
+ *
|
|
+ * Both lookup tables share in common that they don't index every key in the
|
|
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
|
|
+ * is used for the rest.
|
|
+ *
|
|
+ * For sets that have been written to disk and are no longer being inserted
|
|
+ * into, we construct a binary search tree in an array - traversing a binary
|
|
+ * search tree in an array gives excellent locality of reference and is very
|
|
+ * fast, since both children of any node are adjacent to each other in memory
|
|
+ * (and their grandchildren, and great grandchildren...) - this means
|
|
+ * prefetching can be used to great effect.
|
|
+ *
|
|
+ * It's quite useful performance wise to keep these nodes small - not just
|
|
+ * because they're more likely to be in L2, but also because we can prefetch
|
|
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
|
|
+ * when traversing this tree.
|
|
+ *
|
|
+ * Nodes in the auxiliary search tree must contain both a key to compare against
|
|
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
|
|
+ * and a pointer to the key. We use a few tricks to compress both of these.
|
|
+ *
|
|
+ * To compress the pointer, we take advantage of the fact that one node in the
|
|
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
|
|
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
|
|
+ * returns what its index would be in an inorder traversal, so we only have to
|
|
+ * store the low bits of the offset.
|
|
+ *
|
|
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
|
|
+ * compress that, we take advantage of the fact that when we're traversing the
|
|
+ * search tree at every iteration we know that both our search key and the key
|
|
+ * we're looking for lie within some range - bounded by our previous
|
|
+ * comparisons. (We special case the start of a search so that this is true even
|
|
+ * at the root of the tree).
|
|
+ *
|
|
+ * So if the key we're looking for is between a and b, and a and b don't
|
|
+ * differ higher than bit 50, we don't need to check anything higher than bit
|
|
+ * 50.
|
|
+ *
|
|
+ * We don't usually need the rest of the bits, either; we only need enough bits
|
|
+ * to partition the key range we're currently checking. Consider key n - the
|
|
+ * key our auxiliary search tree node corresponds to, and key p, the key
|
|
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
|
|
+ * search tree is the highest bit that differs between n and p.
|
|
+ *
|
|
+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the
|
|
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
|
|
+ * of fixed size.
|
|
+ *
|
|
+ * The solution is to make them fixed size, and when we're constructing a node
|
|
+ * check if p and n differed in the bits we needed them to. If they don't we
|
|
+ * flag that node, and when doing lookups we fallback to comparing against the
|
|
+ * real key. As long as this doesn't happen too often (and it seems to reliably
|
|
+ * happen a bit less than 1% of the time), we win - even on failures, that key
|
|
+ * is then more likely to be in cache than if we were doing binary searches all
|
|
+ * the way, since we're touching so much less memory.
|
|
+ *
|
|
+ * The keys in the auxiliary search tree are stored in (software) floating
|
|
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
|
|
+ * to address all the bits in the original key, but the number of bits in the
|
|
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
|
|
+ *
|
|
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
|
|
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
|
|
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
|
|
+ * search trees take up 3% as much memory as the btree itself.
|
|
+ *
|
|
+ * Constructing these auxiliary search trees is moderately expensive, and we
|
|
+ * don't want to be constantly rebuilding the search tree for the last set
|
|
+ * whenever we insert another key into it. For the unwritten set, we use a much
|
|
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
|
|
+ * corresponds to the i-th range of BSET_CACHELINE bytes in the set. Indexing
|
|
+ * within each byte range works the same as with the auxiliary search trees.
|
|
+ *
|
|
+ * These are much easier to keep up to date when we insert a key - we do it
|
|
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
|
|
+ * to it, only when it would overflow do we go to the trouble of finding the
|
|
+ * first key in that range of bytes again.
|
|
+ */
|
|
+
|
|
+enum bset_aux_tree_type {
|
|
+ BSET_NO_AUX_TREE,
|
|
+ BSET_RO_AUX_TREE,
|
|
+ BSET_RW_AUX_TREE,
|
|
+};
|
|
+
|
|
+#define BSET_TREE_NR_TYPES 3
|
|
+
|
|
+#define BSET_NO_AUX_TREE_VAL (U16_MAX)
|
|
+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
|
|
+
|
|
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
|
|
+{
|
|
+ switch (t->extra) {
|
|
+ case BSET_NO_AUX_TREE_VAL:
|
|
+ EBUG_ON(t->size);
|
|
+ return BSET_NO_AUX_TREE;
|
|
+ case BSET_RW_AUX_TREE_VAL:
|
|
+ EBUG_ON(!t->size);
|
|
+ return BSET_RW_AUX_TREE;
|
|
+ default:
|
|
+ EBUG_ON(!t->size);
|
|
+ return BSET_RO_AUX_TREE;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
|
|
+ * it used to be 64, but I realized the lookup code would touch slightly less
|
|
+ * memory if it was 128 (note: the current definition below is 256).
|
|
+ *
|
|
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
|
|
+ * the auxiliary search tree - when we're done searching the bset_float tree we
|
|
+ * have this many bytes left that we do a linear search over.
|
|
+ *
|
|
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
|
|
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
|
|
+ * cacheline in the linear search - but the linear search might stop before it
|
|
+ * gets to the second cacheline.
|
|
+ */
|
|
+
|
|
+#define BSET_CACHELINE 256
|
|
+
|
|
+static inline size_t btree_keys_cachelines(const struct btree *b)
|
|
+{
|
|
+ return (1U << b->byte_order) / BSET_CACHELINE;
|
|
+}
|
|
+
|
|
+static inline size_t btree_aux_data_bytes(const struct btree *b)
|
|
+{
|
|
+ return btree_keys_cachelines(b) * 8;
|
|
+}
|
|
+
|
|
+static inline size_t btree_aux_data_u64s(const struct btree *b)
|
|
+{
|
|
+ return btree_aux_data_bytes(b) / sizeof(u64);
|
|
+}
|
|
+
|
|
+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *);
|
|
+
|
|
+static inline void
|
|
+__bkey_unpack_key_format_checked(const struct btree *b,
|
|
+ struct bkey *dst,
|
|
+ const struct bkey_packed *src)
|
|
+{
|
|
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
|
+ {
|
|
+ compiled_unpack_fn unpack_fn = b->aux_data;
|
|
+ unpack_fn(dst, src);
|
|
+
|
|
+ if (bch2_expensive_debug_checks) {
|
|
+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src);
|
|
+
|
|
+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst)));
|
|
+ }
|
|
+ }
|
|
+#else
|
|
+ *dst = __bch2_bkey_unpack_key(&b->format, src);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline struct bkey
|
|
+bkey_unpack_key_format_checked(const struct btree *b,
|
|
+ const struct bkey_packed *src)
|
|
+{
|
|
+ struct bkey dst;
|
|
+
|
|
+ __bkey_unpack_key_format_checked(b, &dst, src);
|
|
+ return dst;
|
|
+}
|
|
+
|
|
+static inline void __bkey_unpack_key(const struct btree *b,
|
|
+ struct bkey *dst,
|
|
+ const struct bkey_packed *src)
|
|
+{
|
|
+ if (likely(bkey_packed(src)))
|
|
+ __bkey_unpack_key_format_checked(b, dst, src);
|
|
+ else
|
|
+ *dst = *packed_to_bkey_c(src);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bkey_unpack_key -- unpack just the key, not the value
|
|
+ */
|
|
+static inline struct bkey bkey_unpack_key(const struct btree *b,
|
|
+ const struct bkey_packed *src)
|
|
+{
|
|
+ return likely(bkey_packed(src))
|
|
+ ? bkey_unpack_key_format_checked(b, src)
|
|
+ : *packed_to_bkey_c(src);
|
|
+}
|
|
+
|
|
+static inline struct bpos
|
|
+bkey_unpack_pos_format_checked(const struct btree *b,
|
|
+ const struct bkey_packed *src)
|
|
+{
|
|
+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK
|
|
+ return bkey_unpack_key_format_checked(b, src).p;
|
|
+#else
|
|
+ return __bkey_unpack_pos(&b->format, src);
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline struct bpos bkey_unpack_pos(const struct btree *b,
|
|
+ const struct bkey_packed *src)
|
|
+{
|
|
+ return likely(bkey_packed(src))
|
|
+ ? bkey_unpack_pos_format_checked(b, src)
|
|
+ : packed_to_bkey_c(src)->p;
|
|
+}
|
|
+
|
|
+/* Disassembled bkeys */
|
|
+
|
|
+static inline struct bkey_s_c bkey_disassemble(struct btree *b,
|
|
+ const struct bkey_packed *k,
|
|
+ struct bkey *u)
|
|
+{
|
|
+ __bkey_unpack_key(b, u, k);
|
|
+
|
|
+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), };
|
|
+}
|
|
+
|
|
+/* non const version: */
|
|
+static inline struct bkey_s __bkey_disassemble(struct btree *b,
|
|
+ struct bkey_packed *k,
|
|
+ struct bkey *u)
|
|
+{
|
|
+ __bkey_unpack_key(b, u, k);
|
|
+
|
|
+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
|
|
+}
|
|
+
|
|
+#define for_each_bset(_b, _t) \
|
|
+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
|
|
+
|
|
+#define bset_tree_for_each_key(_b, _t, _k) \
|
|
+ for (_k = btree_bkey_first(_b, _t); \
|
|
+ _k != btree_bkey_last(_b, _t); \
|
|
+ _k = bkey_next(_k))
|
|
+
|
|
+static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
|
|
+{
|
|
+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
|
|
+}
|
|
+
|
|
+static inline bool bset_has_rw_aux_tree(struct bset_tree *t)
|
|
+{
|
|
+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE;
|
|
+}
|
|
+
|
|
+static inline void bch2_bset_set_no_aux_tree(struct btree *b,
|
|
+ struct bset_tree *t)
|
|
+{
|
|
+ BUG_ON(t < b->set);
|
|
+
|
|
+ for (; t < b->set + ARRAY_SIZE(b->set); t++) {
|
|
+ t->size = 0;
|
|
+ t->extra = BSET_NO_AUX_TREE_VAL;
|
|
+ t->aux_data_offset = U16_MAX;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void btree_node_set_format(struct btree *b,
|
|
+ struct bkey_format f)
|
|
+{
|
|
+ int len;
|
|
+
|
|
+ b->format = f;
|
|
+ b->nr_key_bits = bkey_format_key_bits(&f);
|
|
+
|
|
+ len = bch2_compile_bkey_format(&b->format, b->aux_data);
|
|
+ BUG_ON(len < 0 || len > U8_MAX);
|
|
+
|
|
+ b->unpack_fn_len = len;
|
|
+
|
|
+ bch2_bset_set_no_aux_tree(b, b->set);
|
|
+}
|
|
+
|
|
+static inline struct bset *bset_next_set(struct btree *b,
|
|
+ unsigned block_bytes)
|
|
+{
|
|
+ struct bset *i = btree_bset_last(b);
|
|
+
|
|
+ EBUG_ON(!is_power_of_2(block_bytes));
|
|
+
|
|
+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes);
|
|
+}
|
|
+
|
|
+void bch2_btree_keys_init(struct btree *);
|
|
+
|
|
+void bch2_bset_init_first(struct btree *, struct bset *);
|
|
+void bch2_bset_init_next(struct bch_fs *, struct btree *,
|
|
+ struct btree_node_entry *);
|
|
+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
|
|
+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *);
|
|
+
|
|
+void bch2_bset_insert(struct btree *, struct btree_node_iter *,
|
|
+ struct bkey_packed *, struct bkey_i *, unsigned);
|
|
+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned);
|
|
+
|
|
+/* Bkey utility code */
|
|
+
|
|
+/* packed or unpacked */
|
|
+static inline int bkey_cmp_p_or_unp(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ const struct bkey_packed *r_packed,
|
|
+ const struct bpos *r)
|
|
+{
|
|
+ EBUG_ON(r_packed && !bkey_packed(r_packed));
|
|
+
|
|
+ if (unlikely(!bkey_packed(l)))
|
|
+ return bpos_cmp(packed_to_bkey_c(l)->p, *r);
|
|
+
|
|
+ if (likely(r_packed))
|
|
+ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b);
|
|
+
|
|
+ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r);
|
|
+}
|
|
+
|
|
+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *);
|
|
+
|
|
+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *,
|
|
+ struct bkey_packed *, unsigned);
|
|
+
|
|
+static inline struct bkey_packed *
|
|
+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
|
|
+{
|
|
+ return bch2_bkey_prev_filter(b, t, k, 0);
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *
|
|
+bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k)
|
|
+{
|
|
+ return bch2_bkey_prev_filter(b, t, k, 1);
|
|
+}
|
|
+
|
|
+/* Btree key iteration */
|
|
+
|
|
+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *,
|
|
+ const struct bkey_packed *,
|
|
+ const struct bkey_packed *);
|
|
+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *,
|
|
+ struct bpos *);
|
|
+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *,
|
|
+ struct btree *);
|
|
+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *,
|
|
+ struct btree *,
|
|
+ struct bset_tree *);
|
|
+
|
|
+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *);
|
|
+void bch2_btree_node_iter_set_drop(struct btree_node_iter *,
|
|
+ struct btree_node_iter_set *);
|
|
+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *);
|
|
+
|
|
+#define btree_node_iter_for_each(_iter, _set) \
|
|
+ for (_set = (_iter)->data; \
|
|
+ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \
|
|
+ (_set)->k != (_set)->end; \
|
|
+ _set++)
|
|
+
|
|
+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter,
|
|
+ unsigned i)
|
|
+{
|
|
+ return iter->data[i].k == iter->data[i].end;
|
|
+}
|
|
+
|
|
+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter)
|
|
+{
|
|
+ return __btree_node_iter_set_end(iter, 0);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * When keys compare equal, deleted keys compare first:
|
|
+ *
|
|
+ * XXX: only need to compare pointers for keys that are both within a
|
|
+ * btree_node_iterator - we need to break ties for prev() to work correctly
|
|
+ */
|
|
+static inline int bkey_iter_cmp(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ const struct bkey_packed *r)
|
|
+{
|
|
+ return bch2_bkey_cmp_packed(b, l, r)
|
|
+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l)
|
|
+ ?: cmp_int(l, r);
|
|
+}
|
|
+
|
|
+static inline int btree_node_iter_cmp(const struct btree *b,
|
|
+ struct btree_node_iter_set l,
|
|
+ struct btree_node_iter_set r)
|
|
+{
|
|
+ return bkey_iter_cmp(b,
|
|
+ __btree_node_offset_to_key(b, l.k),
|
|
+ __btree_node_offset_to_key(b, r.k));
|
|
+}
|
|
+
|
|
+/* These assume r (the search key) is not a deleted key: */
|
|
+static inline int bkey_iter_pos_cmp(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ const struct bpos *r)
|
|
+{
|
|
+ return bkey_cmp_left_packed(b, l, r)
|
|
+ ?: -((int) bkey_deleted(l));
|
|
+}
|
|
+
|
|
+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b,
|
|
+ const struct bkey_packed *l,
|
|
+ const struct bkey_packed *r_packed,
|
|
+ const struct bpos *r)
|
|
+{
|
|
+ return bkey_cmp_p_or_unp(b, l, r_packed, r)
|
|
+ ?: -((int) bkey_deleted(l));
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *
|
|
+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ return __btree_node_offset_to_key(b, iter->data->k);
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *
|
|
+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b)
|
|
+{
|
|
+ return !bch2_btree_node_iter_end(iter)
|
|
+ ? __btree_node_offset_to_key(b, iter->data->k)
|
|
+ : NULL;
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *
|
|
+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b)
|
|
+{
|
|
+ struct bkey_packed *k;
|
|
+
|
|
+ while ((k = bch2_btree_node_iter_peek_all(iter, b)) &&
|
|
+ bkey_deleted(k))
|
|
+ bch2_btree_node_iter_advance(iter, b);
|
|
+
|
|
+ return k;
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *
|
|
+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b)
|
|
+{
|
|
+ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b);
|
|
+
|
|
+ if (ret)
|
|
+ bch2_btree_node_iter_advance(iter, b);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *,
|
|
+ struct btree *);
|
|
+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *,
|
|
+ struct btree *);
|
|
+
|
|
+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
|
|
+ struct btree *,
|
|
+ struct bkey *);
|
|
+
|
|
+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \
|
|
+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \
|
|
+ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\
|
|
+ bch2_btree_node_iter_advance(iter, b))
|
|
+
|
|
+/* Accounting: */
|
|
+
|
|
+static inline void btree_keys_account_key(struct btree_nr_keys *n,
|
|
+ unsigned bset,
|
|
+ struct bkey_packed *k,
|
|
+ int sign)
|
|
+{
|
|
+ n->live_u64s += k->u64s * sign;
|
|
+ n->bset_u64s[bset] += k->u64s * sign;
|
|
+
|
|
+ if (bkey_packed(k))
|
|
+ n->packed_keys += sign;
|
|
+ else
|
|
+ n->unpacked_keys += sign;
|
|
+}
|
|
+
|
|
+static inline void btree_keys_account_val_delta(struct btree *b,
|
|
+ struct bkey_packed *k,
|
|
+ int delta)
|
|
+{
|
|
+ struct bset_tree *t = bch2_bkey_to_bset(b, k);
|
|
+
|
|
+ b->nr.live_u64s += delta;
|
|
+ b->nr.bset_u64s[t - b->set] += delta;
|
|
+}
|
|
+
|
|
+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \
|
|
+ btree_keys_account_key(_nr, _bset_idx, _k, 1)
|
|
+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \
|
|
+ btree_keys_account_key(_nr, _bset_idx, _k, -1)
|
|
+
|
|
+#define btree_account_key_add(_b, _k) \
|
|
+ btree_keys_account_key(&(_b)->nr, \
|
|
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1)
|
|
+#define btree_account_key_drop(_b, _k) \
|
|
+ btree_keys_account_key(&(_b)->nr, \
|
|
+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1)
|
|
+
|
|
+struct bset_stats {
|
|
+ struct {
|
|
+ size_t nr, bytes;
|
|
+ } sets[BSET_TREE_NR_TYPES];
|
|
+
|
|
+ size_t floats;
|
|
+ size_t failed;
|
|
+};
|
|
+
|
|
+void bch2_btree_keys_stats(struct btree *, struct bset_stats *);
|
|
+void bch2_bfloat_to_text(struct printbuf *, struct btree *,
|
|
+ struct bkey_packed *);
|
|
+
|
|
+/* Debug stuff */
|
|
+
|
|
+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned);
|
|
+void bch2_dump_btree_node(struct bch_fs *, struct btree *);
|
|
+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *);
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+
|
|
+void __bch2_verify_btree_nr_keys(struct btree *);
|
|
+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *);
|
|
+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *,
|
|
+ struct bkey_packed *, unsigned);
|
|
+
|
|
+#else
|
|
+
|
|
+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {}
|
|
+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter,
|
|
+ struct btree *b) {}
|
|
+static inline void bch2_verify_insert_pos(struct btree *b,
|
|
+ struct bkey_packed *where,
|
|
+ struct bkey_packed *insert,
|
|
+ unsigned clobber_u64s) {}
|
|
+#endif
|
|
+
|
|
+static inline void bch2_verify_btree_nr_keys(struct btree *b)
|
|
+{
|
|
+ if (bch2_debug_check_btree_accounting)
|
|
+ __bch2_verify_btree_nr_keys(b);
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_BSET_H */
|
|
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
|
|
new file mode 100644
|
|
index 000000000000..f6adbe8955d7
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_cache.c
|
|
@@ -0,0 +1,1024 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_locking.h"
|
|
+#include "debug.h"
|
|
+#include "error.h"
|
|
+
|
|
+#include <linux/prefetch.h>
|
|
+#include <linux/sched/mm.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+void bch2_recalc_btree_reserve(struct bch_fs *c)
|
|
+{
|
|
+ unsigned i, reserve = 16;
|
|
+
|
|
+ if (!c->btree_roots[0].b)
|
|
+ reserve += 8;
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++)
|
|
+ if (c->btree_roots[i].b)
|
|
+ reserve += min_t(unsigned, 1,
|
|
+ c->btree_roots[i].b->c.level) * 8;
|
|
+
|
|
+ c->btree_cache.reserve = reserve;
|
|
+}
|
|
+
|
|
+static inline unsigned btree_cache_can_free(struct btree_cache *bc)
|
|
+{
|
|
+ return max_t(int, 0, bc->used - bc->reserve);
|
|
+}
|
|
+
|
|
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+
|
|
+ EBUG_ON(btree_node_write_in_flight(b));
|
|
+
|
|
+ kvpfree(b->data, btree_bytes(c));
|
|
+ b->data = NULL;
|
|
+#ifdef __KERNEL__
|
|
+ vfree(b->aux_data);
|
|
+#else
|
|
+ munmap(b->aux_data, btree_aux_data_bytes(b));
|
|
+#endif
|
|
+ b->aux_data = NULL;
|
|
+
|
|
+ bc->used--;
|
|
+ list_move(&b->list, &bc->freed);
|
|
+}
|
|
+
|
|
+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
|
|
+ const void *obj)
|
|
+{
|
|
+ const struct btree *b = obj;
|
|
+ const u64 *v = arg->key;
|
|
+
|
|
+ return b->hash_val == *v ? 0 : 1;
|
|
+}
|
|
+
|
|
+static const struct rhashtable_params bch_btree_cache_params = {
|
|
+ .head_offset = offsetof(struct btree, hash),
|
|
+ .key_offset = offsetof(struct btree, hash_val),
|
|
+ .key_len = sizeof(u64),
|
|
+ .obj_cmpfn = bch2_btree_cache_cmp_fn,
|
|
+};
|
|
+
|
|
+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
|
|
+{
|
|
+ BUG_ON(b->data || b->aux_data);
|
|
+
|
|
+ b->data = kvpmalloc(btree_bytes(c), gfp);
|
|
+ if (!b->data)
|
|
+ return -ENOMEM;
|
|
+#ifdef __KERNEL__
|
|
+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
|
|
+#else
|
|
+ b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
|
|
+ PROT_READ|PROT_WRITE|PROT_EXEC,
|
|
+ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
|
|
+#endif
|
|
+ if (!b->aux_data) {
|
|
+ kvpfree(b->data, btree_bytes(c));
|
|
+ b->data = NULL;
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
|
|
+{
|
|
+ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL);
|
|
+ if (!b)
|
|
+ return NULL;
|
|
+
|
|
+ bkey_btree_ptr_init(&b->key);
|
|
+ six_lock_init(&b->c.lock);
|
|
+ INIT_LIST_HEAD(&b->list);
|
|
+ INIT_LIST_HEAD(&b->write_blocked);
|
|
+ b->byte_order = ilog2(btree_bytes(c));
|
|
+ return b;
|
|
+}
|
|
+
|
|
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ struct btree *b = __btree_node_mem_alloc(c);
|
|
+ if (!b)
|
|
+ return NULL;
|
|
+
|
|
+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) {
|
|
+ kfree(b);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ bc->used++;
|
|
+ list_add(&b->list, &bc->freeable);
|
|
+ return b;
|
|
+}
|
|
+
|
|
+/* Btree in memory cache - hash table */
|
|
+
|
|
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
|
|
+{
|
|
+ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
|
|
+
|
|
+ /* Cause future lookups for this node to fail: */
|
|
+ b->hash_val = 0;
|
|
+
|
|
+ six_lock_wakeup_all(&b->c.lock);
|
|
+}
|
|
+
|
|
+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
|
|
+{
|
|
+ BUG_ON(b->hash_val);
|
|
+ b->hash_val = btree_ptr_hash_val(&b->key);
|
|
+
|
|
+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash,
|
|
+ bch_btree_cache_params);
|
|
+}
|
|
+
|
|
+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b,
|
|
+ unsigned level, enum btree_id id)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ b->c.level = level;
|
|
+ b->c.btree_id = id;
|
|
+
|
|
+ if (level)
|
|
+ six_lock_pcpu_alloc(&b->c.lock);
|
|
+ else
|
|
+ six_lock_pcpu_free_rcu(&b->c.lock);
|
|
+
|
|
+ mutex_lock(&bc->lock);
|
|
+ ret = __bch2_btree_node_hash_insert(bc, b);
|
|
+ if (!ret)
|
|
+ list_add(&b->list, &bc->live);
|
|
+ mutex_unlock(&bc->lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+__flatten
|
|
+static inline struct btree *btree_cache_find(struct btree_cache *bc,
|
|
+ const struct bkey_i *k)
|
|
+{
|
|
+ u64 v = btree_ptr_hash_val(k);
|
|
+
|
|
+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * this version is for btree nodes that have already been freed (we're not
|
|
+ * reaping a real btree node)
|
|
+ */
|
|
+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ int ret = 0;
|
|
+
|
|
+ lockdep_assert_held(&bc->lock);
|
|
+
|
|
+ if (!six_trylock_intent(&b->c.lock))
|
|
+ return -ENOMEM;
|
|
+
|
|
+ if (!six_trylock_write(&b->c.lock))
|
|
+ goto out_unlock_intent;
|
|
+
|
|
+ if (btree_node_noevict(b))
|
|
+ goto out_unlock;
|
|
+
|
|
+ if (!btree_node_may_write(b))
|
|
+ goto out_unlock;
|
|
+
|
|
+ if (btree_node_dirty(b) &&
|
|
+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
|
|
+ goto out_unlock;
|
|
+
|
|
+ if (btree_node_dirty(b) ||
|
|
+ btree_node_write_in_flight(b) ||
|
|
+ btree_node_read_in_flight(b)) {
|
|
+ if (!flush)
|
|
+ goto out_unlock;
|
|
+
|
|
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
|
+ TASK_UNINTERRUPTIBLE);
|
|
+
|
|
+ /*
|
|
+ * Using the underscore version because we don't want to compact
|
|
+ * bsets after the write, since this node is about to be evicted
|
|
+ * - unless btree verify mode is enabled, since it runs out of
|
|
+ * the post write cleanup:
|
|
+ */
|
|
+ if (bch2_verify_btree_ondisk)
|
|
+ bch2_btree_node_write(c, b, SIX_LOCK_intent);
|
|
+ else
|
|
+ __bch2_btree_node_write(c, b);
|
|
+
|
|
+ /* wait for any in flight btree write */
|
|
+ btree_node_wait_on_io(b);
|
|
+ }
|
|
+out:
|
|
+ if (b->hash_val && !ret)
|
|
+ trace_btree_node_reap(c, b);
|
|
+ return ret;
|
|
+out_unlock:
|
|
+ six_unlock_write(&b->c.lock);
|
|
+out_unlock_intent:
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+static int btree_node_reclaim(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ return __btree_node_reclaim(c, b, false);
|
|
+}
|
|
+
|
|
+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ return __btree_node_reclaim(c, b, true);
|
|
+}
|
|
+
|
|
+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
|
|
+ struct shrink_control *sc)
|
|
+{
|
|
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
|
|
+ btree_cache.shrink);
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ struct btree *b, *t;
|
|
+ unsigned long nr = sc->nr_to_scan;
|
|
+ unsigned long can_free;
|
|
+ unsigned long touched = 0;
|
|
+ unsigned long freed = 0;
|
|
+ unsigned i, flags;
|
|
+
|
|
+ if (bch2_btree_shrinker_disabled)
|
|
+ return SHRINK_STOP;
|
|
+
|
|
+ /* Return -1 if we can't do anything right now */
|
|
+ if (sc->gfp_mask & __GFP_FS)
|
|
+ mutex_lock(&bc->lock);
|
|
+ else if (!mutex_trylock(&bc->lock))
|
|
+ return -1;
|
|
+
|
|
+ flags = memalloc_nofs_save();
|
|
+
|
|
+ /*
|
|
+ * It's _really_ critical that we don't free too many btree nodes - we
|
|
+ * have to always leave ourselves a reserve. The reserve is how we
|
|
+ * guarantee that allocating memory for a new btree node can always
|
|
+ * succeed, so that inserting keys into the btree can always succeed and
|
|
+ * IO can always make forward progress:
|
|
+ */
|
|
+ nr /= btree_pages(c);
|
|
+ can_free = btree_cache_can_free(bc);
|
|
+ nr = min_t(unsigned long, nr, can_free);
|
|
+
|
|
+ i = 0;
|
|
+ list_for_each_entry_safe(b, t, &bc->freeable, list) {
|
|
+ touched++;
|
|
+
|
|
+ if (freed >= nr)
|
|
+ break;
|
|
+
|
|
+ if (++i > 3 &&
|
|
+ !btree_node_reclaim(c, b)) {
|
|
+ btree_node_data_free(c, b);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ freed++;
|
|
+ }
|
|
+ }
|
|
+restart:
|
|
+ list_for_each_entry_safe(b, t, &bc->live, list) {
|
|
+ touched++;
|
|
+
|
|
+ if (freed >= nr) {
|
|
+ /* Save position */
|
|
+ if (&t->list != &bc->live)
|
|
+ list_move_tail(&bc->live, &t->list);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (!btree_node_accessed(b) &&
|
|
+ !btree_node_reclaim(c, b)) {
|
|
+ /* can't call bch2_btree_node_hash_remove under lock */
|
|
+ freed++;
|
|
+ if (&t->list != &bc->live)
|
|
+ list_move_tail(&bc->live, &t->list);
|
|
+
|
|
+ btree_node_data_free(c, b);
|
|
+ mutex_unlock(&bc->lock);
|
|
+
|
|
+ bch2_btree_node_hash_remove(bc, b);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+
|
|
+ if (freed >= nr)
|
|
+ goto out;
|
|
+
|
|
+ if (sc->gfp_mask & __GFP_FS)
|
|
+ mutex_lock(&bc->lock);
|
|
+ else if (!mutex_trylock(&bc->lock))
|
|
+ goto out;
|
|
+ goto restart;
|
|
+ } else
|
|
+ clear_btree_node_accessed(b);
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&bc->lock);
|
|
+out:
|
|
+ memalloc_nofs_restore(flags);
|
|
+ return (unsigned long) freed * btree_pages(c);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Shrinker count callback: report, in pages, how much btree_cache_can_free()
|
|
+ * says may be released, or 0 when bch2_btree_shrinker_disabled is set.
|
|
+ */
|
|
+static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
|
|
+					    struct shrink_control *sc)
|
|
+{
|
|
+	struct bch_fs *fs = container_of(shrink, struct bch_fs,
|
|
+					 btree_cache.shrink);
|
|
+
|
|
+	return bch2_btree_shrinker_disabled
|
|
+		? 0
|
|
+		: btree_cache_can_free(&fs->btree_cache) * btree_pages(fs);
|
|
+}
|
|
+
|
|
+void bch2_fs_btree_cache_exit(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ struct btree *b;
|
|
+ unsigned i, flags;
|
|
+
|
|
+ if (bc->shrink.list.next)
|
|
+ unregister_shrinker(&bc->shrink);
|
|
+
|
|
+ /* vfree() can allocate memory: */
|
|
+ flags = memalloc_nofs_save();
|
|
+ mutex_lock(&bc->lock);
|
|
+
|
|
+ if (c->verify_data)
|
|
+ list_move(&c->verify_data->list, &bc->live);
|
|
+
|
|
+ kvpfree(c->verify_ondisk, btree_bytes(c));
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++)
|
|
+ if (c->btree_roots[i].b)
|
|
+ list_add(&c->btree_roots[i].b->list, &bc->live);
|
|
+
|
|
+ list_splice(&bc->freeable, &bc->live);
|
|
+
|
|
+ while (!list_empty(&bc->live)) {
|
|
+ b = list_first_entry(&bc->live, struct btree, list);
|
|
+
|
|
+ BUG_ON(btree_node_read_in_flight(b) ||
|
|
+ btree_node_write_in_flight(b));
|
|
+
|
|
+ if (btree_node_dirty(b))
|
|
+ bch2_btree_complete_write(c, b, btree_current_write(b));
|
|
+ clear_btree_node_dirty(c, b);
|
|
+
|
|
+ btree_node_data_free(c, b);
|
|
+ }
|
|
+
|
|
+ BUG_ON(atomic_read(&c->btree_cache.dirty));
|
|
+
|
|
+ while (!list_empty(&bc->freed)) {
|
|
+ b = list_first_entry(&bc->freed, struct btree, list);
|
|
+ list_del(&b->list);
|
|
+ six_lock_pcpu_free(&b->c.lock);
|
|
+ kfree(b);
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&bc->lock);
|
|
+ memalloc_nofs_restore(flags);
|
|
+
|
|
+ if (bc->table_init_done)
|
|
+ rhashtable_destroy(&bc->table);
|
|
+}
|
|
+
|
|
+int bch2_fs_btree_cache_init(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ unsigned i;
|
|
+ int ret = 0;
|
|
+
|
|
+ pr_verbose_init(c->opts, "");
|
|
+
|
|
+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ bc->table_init_done = true;
|
|
+
|
|
+ bch2_recalc_btree_reserve(c);
|
|
+
|
|
+ for (i = 0; i < bc->reserve; i++)
|
|
+ if (!__bch2_btree_node_mem_alloc(c)) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ list_splice_init(&bc->live, &bc->freeable);
|
|
+
|
|
+ mutex_init(&c->verify_lock);
|
|
+
|
|
+ bc->shrink.count_objects = bch2_btree_cache_count;
|
|
+ bc->shrink.scan_objects = bch2_btree_cache_scan;
|
|
+ bc->shrink.seeks = 4;
|
|
+ bc->shrink.batch = btree_pages(c) * 2;
|
|
+ ret = register_shrinker(&bc->shrink);
|
|
+out:
|
|
+ pr_verbose_init(c->opts, "ret %i", ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Early setup of the btree cache: its three node lists and its mutex. */
|
|
+void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
|
|
+{
|
|
+	INIT_LIST_HEAD(&bc->freed);
|
|
+	INIT_LIST_HEAD(&bc->freeable);
|
|
+	INIT_LIST_HEAD(&bc->live);
|
|
+	mutex_init(&bc->lock);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * We can only have one thread cannibalizing other cached btree nodes at a time,
|
|
+ * or we'll deadlock. We use an open coded mutex (bc->alloc_lock) to ensure
|
|
+ * that, which bch2_btree_cache_cannibalize_lock() takes. This means every
|
|
+ * time we unlock the root of the btree, we need to release this lock if we
|
|
+ * have it held.
|
|
+ */
|
|
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
|
|
+{
|
|
+	struct btree_cache *cache = &c->btree_cache;
|
|
+
|
|
+	/* Nothing to do unless the calling task is the current owner: */
|
|
+	if (cache->alloc_lock != current)
|
|
+		return;
|
|
+
|
|
+	trace_btree_node_cannibalize_unlock(c);
|
|
+	cache->alloc_lock = NULL;
|
|
+	closure_wake_up(&cache->alloc_wait);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Try to make current the owner of bc->alloc_lock.
|
|
+ *
|
|
+ * Returns 0 if current now owns the lock (or already did), -ENOMEM if another
|
|
+ * task owns it and no closure was passed, and -EAGAIN if another task still
|
|
+ * owns it after @cl was added to bc->alloc_wait.
|
|
+ */
|
|
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
|
|
+{
|
|
+	struct btree_cache *cache = &c->btree_cache;
|
|
+	struct task_struct *owner;
|
|
+
|
|
+	owner = cmpxchg(&cache->alloc_lock, NULL, current);
|
|
+	if (owner && owner != current) {
|
|
+		if (!cl) {
|
|
+			trace_btree_node_cannibalize_lock_fail(c);
|
|
+			return -ENOMEM;
|
|
+		}
|
|
+
|
|
+		closure_wait(&cache->alloc_wait, cl);
|
|
+
|
|
+		/* Try again, after adding ourselves to waitlist */
|
|
+		owner = cmpxchg(&cache->alloc_lock, NULL, current);
|
|
+		if (owner && owner != current) {
|
|
+			trace_btree_node_cannibalize_lock_fail(c);
|
|
+			return -EAGAIN;
|
|
+		}
|
|
+
|
|
+		/* We raced */
|
|
+		closure_wake_up(&cache->alloc_wait);
|
|
+	}
|
|
+
|
|
+	trace_btree_node_cannibalize_lock(c);
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Pick a node to reuse, walking the live list from its tail: first try
|
|
+ * btree_node_reclaim() on each node, then loop forever retrying with
|
|
+ * btree_node_write_and_reclaim() until one succeeds.
|
|
+ */
|
|
+static struct btree *btree_node_cannibalize(struct bch_fs *c)
|
|
+{
|
|
+	struct btree_cache *cache = &c->btree_cache;
|
|
+	struct btree *victim;
|
|
+
|
|
+	list_for_each_entry_reverse(victim, &cache->live, list) {
|
|
+		if (btree_node_reclaim(c, victim) == 0)
|
|
+			return victim;
|
|
+	}
|
|
+
|
|
+	for (;;) {
|
|
+		list_for_each_entry_reverse(victim, &cache->live, list) {
|
|
+			if (btree_node_write_and_reclaim(c, victim) == 0)
|
|
+				return victim;
|
|
+		}
|
|
+
|
|
+		/*
|
|
+		 * Rare case: all nodes were intent-locked.
|
|
+		 * Just busy-wait.
|
|
+		 */
|
|
+		WARN_ONCE(1, "btree cache cannibalize failed\n");
|
|
+		cond_resched();
|
|
+	}
|
|
+}
|
|
+
|
|
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ struct btree *b;
|
|
+ u64 start_time = local_clock();
|
|
+ unsigned flags;
|
|
+
|
|
+ flags = memalloc_nofs_save();
|
|
+ mutex_lock(&bc->lock);
|
|
+
|
|
+ /*
|
|
+ * btree_free() doesn't free memory; it sticks the node on the end of
|
|
+ * the list. Check if there's any freed nodes there:
|
|
+ */
|
|
+ list_for_each_entry(b, &bc->freeable, list)
|
|
+ if (!btree_node_reclaim(c, b))
|
|
+ goto got_node;
|
|
+
|
|
+ /*
|
|
+ * We never free struct btree itself, just the memory that holds the on
|
|
+ * disk node. Check the freed list before allocating a new one:
|
|
+ */
|
|
+ list_for_each_entry(b, &bc->freed, list)
|
|
+ if (!btree_node_reclaim(c, b))
|
|
+ goto got_node;
|
|
+
|
|
+ b = NULL;
|
|
+got_node:
|
|
+ if (b)
|
|
+ list_del_init(&b->list);
|
|
+ mutex_unlock(&bc->lock);
|
|
+
|
|
+ if (!b) {
|
|
+ b = __btree_node_mem_alloc(c);
|
|
+ if (!b)
|
|
+ goto err;
|
|
+
|
|
+ BUG_ON(!six_trylock_intent(&b->c.lock));
|
|
+ BUG_ON(!six_trylock_write(&b->c.lock));
|
|
+ }
|
|
+
|
|
+ if (!b->data) {
|
|
+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL))
|
|
+ goto err;
|
|
+
|
|
+ mutex_lock(&bc->lock);
|
|
+ bc->used++;
|
|
+ mutex_unlock(&bc->lock);
|
|
+ }
|
|
+
|
|
+ BUG_ON(btree_node_hashed(b));
|
|
+ BUG_ON(btree_node_write_in_flight(b));
|
|
+out:
|
|
+ b->flags = 0;
|
|
+ b->written = 0;
|
|
+ b->nsets = 0;
|
|
+ b->sib_u64s[0] = 0;
|
|
+ b->sib_u64s[1] = 0;
|
|
+ b->whiteout_u64s = 0;
|
|
+ bch2_btree_keys_init(b);
|
|
+ set_btree_node_accessed(b);
|
|
+
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc],
|
|
+ start_time);
|
|
+
|
|
+ memalloc_nofs_restore(flags);
|
|
+ return b;
|
|
+err:
|
|
+ mutex_lock(&bc->lock);
|
|
+
|
|
+ if (b) {
|
|
+ list_add(&b->list, &bc->freed);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ }
|
|
+
|
|
+ /* Try to cannibalize another cached btree node: */
|
|
+ if (bc->alloc_lock == current) {
|
|
+ b = btree_node_cannibalize(c);
|
|
+ list_del_init(&b->list);
|
|
+ mutex_unlock(&bc->lock);
|
|
+
|
|
+ bch2_btree_node_hash_remove(bc, b);
|
|
+
|
|
+ trace_btree_node_cannibalize(c);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&bc->lock);
|
|
+ memalloc_nofs_restore(flags);
|
|
+ return ERR_PTR(-ENOMEM);
|
|
+}
|
|
+
|
|
+/* Slowpath, don't want it inlined into btree_iter_traverse() */
|
|
+static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
|
|
+ struct btree_iter *iter,
|
|
+ const struct bkey_i *k,
|
|
+ enum btree_id btree_id,
|
|
+ unsigned level,
|
|
+ enum six_lock_type lock_type,
|
|
+ bool sync)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ struct btree *b;
|
|
+
|
|
+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
|
|
+ /*
|
|
+ * Parent node must be locked, else we could read in a btree node that's
|
|
+ * been freed:
|
|
+ */
|
|
+ if (iter && !bch2_btree_node_relock(iter, level + 1))
|
|
+ return ERR_PTR(-EINTR);
|
|
+
|
|
+ b = bch2_btree_node_mem_alloc(c);
|
|
+ if (IS_ERR(b))
|
|
+ return b;
|
|
+
|
|
+ bkey_copy(&b->key, k);
|
|
+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
|
|
+ /* raced with another fill: */
|
|
+
|
|
+ /* mark as unhashed... */
|
|
+ b->hash_val = 0;
|
|
+
|
|
+ mutex_lock(&bc->lock);
|
|
+ list_add(&b->list, &bc->freeable);
|
|
+ mutex_unlock(&bc->lock);
|
|
+
|
|
+ six_unlock_write(&b->c.lock);
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Unlock before doing IO:
|
|
+ *
|
|
+ * XXX: ideally should be dropping all btree node locks here
|
|
+ */
|
|
+ if (iter && btree_node_read_locked(iter, level + 1))
|
|
+ btree_node_unlock(iter, level + 1);
|
|
+
|
|
+ bch2_btree_node_read(c, b, sync);
|
|
+
|
|
+ six_unlock_write(&b->c.lock);
|
|
+
|
|
+ if (!sync) {
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ if (lock_type == SIX_LOCK_read)
|
|
+ six_lock_downgrade(&b->c.lock);
|
|
+
|
|
+ return b;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check callback handed to the six lock functions: @p is the key the caller
|
|
+ * expects the node to have.  Returns 0 while the node's hash_val matches that
|
|
+ * key's hash value, -1 once it no longer does.
|
|
+ */
|
|
+static int lock_node_check_fn(struct six_lock *lock, void *p)
|
|
+{
|
|
+	const struct bkey_i *want = p;
|
|
+	struct btree *node = container_of(lock, struct btree, c.lock);
|
|
+
|
|
+	if (node->hash_val != btree_ptr_hash_val(want))
|
|
+		return -1;
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ char buf1[100], buf2[100], buf3[100], buf4[100];
|
|
+
|
|
+ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
|
|
+ return;
|
|
+
|
|
+ bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2
|
|
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
|
|
+ : POS_MIN);
|
|
+ bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);
|
|
+
|
|
+ bch2_bpos_to_text(&PBUF(buf3), b->key.k.p);
|
|
+ bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
|
|
+ bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
|
|
+ "btree: ptr %u header %llu\n"
|
|
+ "level: ptr %u header %llu\n"
|
|
+ "min ptr %s node header %s\n"
|
|
+ "max ptr %s node header %s",
|
|
+ b->c.btree_id, BTREE_NODE_ID(b->data),
|
|
+ b->c.level, BTREE_NODE_LEVEL(b->data),
|
|
+ buf1, buf2, buf3, buf4);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Compare the node header in b->data (btree id, level, max_key and, for v2
|
|
+ * pointers, min_key) against the node's key; report any mismatch through
|
|
+ * btree_bad_header().
|
|
+ */
|
|
+static inline void btree_check_header(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+	bool mismatch = b->c.btree_id != BTREE_NODE_ID(b->data) ||
|
|
+		b->c.level != BTREE_NODE_LEVEL(b->data) ||
|
|
+		bpos_cmp(b->data->max_key, b->key.k.p);
|
|
+
|
|
+	if (!mismatch && b->key.k.type == KEY_TYPE_btree_ptr_v2)
|
|
+		mismatch = bpos_cmp(b->data->min_key,
|
|
+				    bkey_i_to_btree_ptr_v2(&b->key)->v.min_key);
|
|
+
|
|
+	if (mismatch)
|
|
+		btree_bad_header(c, b);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_node_get - find a btree node in the cache and lock it, reading it
|
|
+ * in from disk if necessary.
|
|
+ *
|
|
+ * Returns ERR_PTR(-EINTR) if a btree node lock could not be taken or retaken,
|
|
+ * ERR_PTR(-EIO) if the node has a read error, or the error from filling it.
|
|
+ *
|
|
+ * The btree node will have either a read or an intent lock held, depending on
|
|
+ * the @lock_type parameter.
|
|
+ */
|
|
+struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
|
|
+ const struct bkey_i *k, unsigned level,
|
|
+ enum six_lock_type lock_type,
|
|
+ unsigned long trace_ip)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ struct btree *b;
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
|
|
+
|
|
+ b = btree_node_mem_ptr(k);
|
|
+ if (b)
|
|
+ goto lock_node;
|
|
+retry:
|
|
+ b = btree_cache_find(bc, k);
|
|
+ if (unlikely(!b)) {
|
|
+ /*
|
|
+ * We must have the parent locked to call bch2_btree_node_fill(),
|
|
+ * else we could read in a btree node from disk that's been
|
|
+ * freed:
|
|
+ */
|
|
+ b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
|
|
+ level, lock_type, true);
|
|
+
|
|
+ /* We raced and found the btree node in the cache */
|
|
+ if (!b)
|
|
+ goto retry;
|
|
+
|
|
+ if (IS_ERR(b))
|
|
+ return b;
|
|
+ } else {
|
|
+lock_node:
|
|
+ /*
|
|
+ * There's a potential deadlock with splits and insertions into
|
|
+ * interior nodes we have to avoid:
|
|
+ *
|
|
+ * The other thread might be holding an intent lock on the node
|
|
+ * we want, and they want to update its parent node so they're
|
|
+ * going to upgrade their intent lock on the parent node to a
|
|
+ * write lock.
|
|
+ *
|
|
+ * But if we're holding a read lock on the parent, and we're
|
|
+ * trying to get the intent lock they're holding, we deadlock.
|
|
+ *
|
|
+ * So to avoid this we drop the read locks on parent nodes when
|
|
+ * we're starting to take intent locks - and handle the race.
|
|
+ *
|
|
+ * The race is that they might be about to free the node we
|
|
+ * want, and dropping our read lock on the parent node lets them
|
|
+ * update the parent marking the node we want as freed, and then
|
|
+ * free it:
|
|
+ *
|
|
+ * To guard against this, btree nodes are evicted from the cache
|
|
+ * when they're freed - and b->hash_val is zeroed out, which we
|
|
+ * check for after we lock the node.
|
|
+ *
|
|
+ * Then, bch2_btree_node_relock() on the parent will fail - because
|
|
+ * the parent was modified, when the pointer to the node we want
|
|
+ * was removed - and we'll bail out:
|
|
+ */
|
|
+ if (btree_node_read_locked(iter, level + 1))
|
|
+ btree_node_unlock(iter, level + 1);
|
|
+
|
|
+ if (!btree_node_lock(b, k->k.p, level, iter, lock_type,
|
|
+ lock_node_check_fn, (void *) k, trace_ip)) {
|
|
+ if (b->hash_val != btree_ptr_hash_val(k))
|
|
+ goto retry;
|
|
+ return ERR_PTR(-EINTR);
|
|
+ }
|
|
+
|
|
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
|
|
+ b->c.level != level ||
|
|
+ race_fault())) {
|
|
+ six_unlock_type(&b->c.lock, lock_type);
|
|
+ if (bch2_btree_node_relock(iter, level + 1))
|
|
+ goto retry;
|
|
+
|
|
+ trace_trans_restart_btree_node_reused(iter->trans->ip);
|
|
+ return ERR_PTR(-EINTR);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* XXX: waiting on IO with btree locks held: */
|
|
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
|
+ TASK_UNINTERRUPTIBLE);
|
|
+
|
|
+ prefetch(b->aux_data);
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
|
+
|
|
+ prefetch(p + L1_CACHE_BYTES * 0);
|
|
+ prefetch(p + L1_CACHE_BYTES * 1);
|
|
+ prefetch(p + L1_CACHE_BYTES * 2);
|
|
+ }
|
|
+
|
|
+ /* avoid atomic set bit if it's not needed: */
|
|
+ if (!btree_node_accessed(b))
|
|
+ set_btree_node_accessed(b);
|
|
+
|
|
+ if (unlikely(btree_node_read_error(b))) {
|
|
+ six_unlock_type(&b->c.lock, lock_type);
|
|
+ return ERR_PTR(-EIO);
|
|
+ }
|
|
+
|
|
+ EBUG_ON(b->c.btree_id != iter->btree_id);
|
|
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
|
|
+ btree_check_header(c, b);
|
|
+
|
|
+ return b;
|
|
+}
|
|
+
|
|
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
|
|
+ const struct bkey_i *k,
|
|
+ enum btree_id btree_id,
|
|
+ unsigned level,
|
|
+ bool nofill)
|
|
+{
|
|
+ struct btree_cache *bc = &c->btree_cache;
|
|
+ struct btree *b;
|
|
+ struct bset_tree *t;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(level >= BTREE_MAX_DEPTH);
|
|
+
|
|
+ b = btree_node_mem_ptr(k);
|
|
+ if (b)
|
|
+ goto lock_node;
|
|
+retry:
|
|
+ b = btree_cache_find(bc, k);
|
|
+ if (unlikely(!b)) {
|
|
+ if (nofill)
|
|
+ goto out;
|
|
+
|
|
+ b = bch2_btree_node_fill(c, NULL, k, btree_id,
|
|
+ level, SIX_LOCK_read, true);
|
|
+
|
|
+ /* We raced and found the btree node in the cache */
|
|
+ if (!b)
|
|
+ goto retry;
|
|
+
|
|
+ if (IS_ERR(b) &&
|
|
+ !bch2_btree_cache_cannibalize_lock(c, NULL))
|
|
+ goto retry;
|
|
+
|
|
+ if (IS_ERR(b))
|
|
+ goto out;
|
|
+ } else {
|
|
+lock_node:
|
|
+ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k);
|
|
+ if (ret)
|
|
+ goto retry;
|
|
+
|
|
+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
|
|
+ b->c.btree_id != btree_id ||
|
|
+ b->c.level != level)) {
|
|
+ six_unlock_read(&b->c.lock);
|
|
+ goto retry;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* XXX: waiting on IO with btree locks held: */
|
|
+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
|
|
+ TASK_UNINTERRUPTIBLE);
|
|
+
|
|
+ prefetch(b->aux_data);
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ void *p = (u64 *) b->aux_data + t->aux_data_offset;
|
|
+
|
|
+ prefetch(p + L1_CACHE_BYTES * 0);
|
|
+ prefetch(p + L1_CACHE_BYTES * 1);
|
|
+ prefetch(p + L1_CACHE_BYTES * 2);
|
|
+ }
|
|
+
|
|
+ /* avoid atomic set bit if it's not needed: */
|
|
+ if (!btree_node_accessed(b))
|
|
+ set_btree_node_accessed(b);
|
|
+
|
|
+ if (unlikely(btree_node_read_error(b))) {
|
|
+ six_unlock_read(&b->c.lock);
|
|
+ b = ERR_PTR(-EIO);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ EBUG_ON(b->c.btree_id != btree_id);
|
|
+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
|
|
+ btree_check_header(c, b);
|
|
+out:
|
|
+ bch2_btree_cache_cannibalize_unlock(c);
|
|
+ return b;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If the node @k points to is not already in the btree cache, start filling
|
|
+ * it with bch2_btree_node_fill(..., sync = false).  When @iter is given, the
|
|
+ * node at @level + 1 must be locked.
|
|
+ */
|
|
+void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
|
|
+			      const struct bkey_i *k,
|
|
+			      enum btree_id btree_id, unsigned level)
|
|
+{
|
|
+	BUG_ON(iter && !btree_node_locked(iter, level + 1));
|
|
+	BUG_ON(level >= BTREE_MAX_DEPTH);
|
|
+
|
|
+	if (!btree_cache_find(&c->btree_cache, k))
|
|
+		bch2_btree_node_fill(c, iter, k, btree_id, level,
|
|
+				     SIX_LOCK_read, false);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Evict the node @k points to from the btree cache, if it is cached: take its
|
|
+ * intent and write locks, wait for an in-flight read, write the node out and
|
|
+ * wait for that IO, then free its data and remove it from the hash table.
|
|
+ */
|
|
+void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
|
|
+{
|
|
+	struct btree_cache *cache = &c->btree_cache;
|
|
+	struct btree *node = btree_cache_find(cache, k);
|
|
+
|
|
+	if (!node)
|
|
+		return;
|
|
+
|
|
+	six_lock_intent(&node->c.lock, NULL, NULL);
|
|
+	six_lock_write(&node->c.lock, NULL, NULL);
|
|
+
|
|
+	wait_on_bit_io(&node->flags, BTREE_NODE_read_in_flight,
|
|
+		       TASK_UNINTERRUPTIBLE);
|
|
+	__bch2_btree_node_write(c, node);
|
|
+
|
|
+	/* wait for any in flight btree write */
|
|
+	btree_node_wait_on_io(node);
|
|
+
|
|
+	BUG_ON(btree_node_dirty(node));
|
|
+
|
|
+	mutex_lock(&cache->lock);
|
|
+	btree_node_data_free(c, node);
|
|
+	bch2_btree_node_hash_remove(cache, node);
|
|
+	mutex_unlock(&cache->lock);
|
|
+
|
|
+	six_unlock_write(&node->c.lock);
|
|
+	six_unlock_intent(&node->c.lock);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct btree *b)
|
|
+{
|
|
+ const struct bkey_format *f = &b->format;
|
|
+ struct bset_stats stats;
|
|
+
|
|
+ memset(&stats, 0, sizeof(stats));
|
|
+
|
|
+ bch2_btree_keys_stats(b, &stats);
|
|
+
|
|
+ pr_buf(out, "l %u ", b->c.level);
|
|
+ bch2_bpos_to_text(out, b->data->min_key);
|
|
+ pr_buf(out, " - ");
|
|
+ bch2_bpos_to_text(out, b->data->max_key);
|
|
+ pr_buf(out, ":\n"
|
|
+ " ptrs: ");
|
|
+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key));
|
|
+
|
|
+ pr_buf(out, "\n"
|
|
+ " format: u64s %u fields %u %u %u %u %u\n"
|
|
+ " unpack fn len: %u\n"
|
|
+ " bytes used %zu/%zu (%zu%% full)\n"
|
|
+ " sib u64s: %u, %u (merge threshold %u)\n"
|
|
+ " nr packed keys %u\n"
|
|
+ " nr unpacked keys %u\n"
|
|
+ " floats %zu\n"
|
|
+ " failed unpacked %zu\n",
|
|
+ f->key_u64s,
|
|
+ f->bits_per_field[0],
|
|
+ f->bits_per_field[1],
|
|
+ f->bits_per_field[2],
|
|
+ f->bits_per_field[3],
|
|
+ f->bits_per_field[4],
|
|
+ b->unpack_fn_len,
|
|
+ b->nr.live_u64s * sizeof(u64),
|
|
+ btree_bytes(c) - sizeof(struct btree_node),
|
|
+ b->nr.live_u64s * 100 / btree_max_u64s(c),
|
|
+ b->sib_u64s[0],
|
|
+ b->sib_u64s[1],
|
|
+ c->btree_foreground_merge_threshold,
|
|
+ b->nr.packed_keys,
|
|
+ b->nr.unpacked_keys,
|
|
+ stats.floats,
|
|
+ stats.failed);
|
|
+}
|
|
+
|
|
+/* Print the btree cache's node count, dirty count and cannibalize lock owner. */
|
|
+void bch2_btree_cache_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+	struct btree_cache *bc = &c->btree_cache;
|
|
+
|
|
+	pr_buf(out, "nr nodes:\t\t%u\n", bc->used);
|
|
+	pr_buf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty));
|
|
+	pr_buf(out, "cannibalize lock:\t%p\n", bc->alloc_lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
|
|
new file mode 100644
|
|
index 000000000000..40dd263a7caa
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_cache.h
|
|
@@ -0,0 +1,103 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_CACHE_H
|
|
+#define _BCACHEFS_BTREE_CACHE_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_types.h"
|
|
+
|
|
+struct btree_iter;
|
|
+
|
|
+void bch2_recalc_btree_reserve(struct bch_fs *);
|
|
+
|
|
+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
|
|
+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
|
|
+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
|
|
+ unsigned, enum btree_id);
|
|
+
|
|
+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
|
|
+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
|
|
+
|
|
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
|
|
+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
|
|
+
|
|
+struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
|
|
+ const struct bkey_i *, unsigned,
|
|
+ enum six_lock_type, unsigned long);
|
|
+
|
|
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
|
|
+ enum btree_id, unsigned, bool);
|
|
+
|
|
+void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
|
|
+ const struct bkey_i *, enum btree_id, unsigned);
|
|
+
|
|
+void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
|
|
+
|
|
+void bch2_fs_btree_cache_exit(struct bch_fs *);
|
|
+int bch2_fs_btree_cache_init(struct bch_fs *);
|
|
+void bch2_fs_btree_cache_init_early(struct btree_cache *);
|
|
+
|
|
+/*
|
|
+ * Value a node's hash_val is compared against for key @k: the first u64 of
|
|
+ * the value for a btree_ptr key, the seq field for a btree_ptr_v2 key, and 0
|
|
+ * for any other key type.
|
|
+ */
|
|
+static inline u64 btree_ptr_hash_val(const struct bkey_i *k)
|
|
+{
|
|
+	if (k->k.type == KEY_TYPE_btree_ptr)
|
|
+		return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start);
|
|
+
|
|
+	if (k->k.type == KEY_TYPE_btree_ptr_v2)
|
|
+		return bkey_i_to_btree_ptr_v2_c(k)->v.seq;
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/* Return the mem_ptr stored in a btree_ptr_v2 key, or NULL for other key types. */
|
|
+static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k)
|
|
+{
|
|
+	if (k->k.type != KEY_TYPE_btree_ptr_v2)
|
|
+		return NULL;
|
|
+
|
|
+	return (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr;
|
|
+}
|
|
+
|
|
+/* Is the btree node in the hash table?  True when its hash_val is nonzero. */
|
|
+static inline bool btree_node_hashed(struct btree *b)
|
|
+{
|
|
+	return !!b->hash_val;
|
|
+}
|
|
+
|
|
+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \
|
|
+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \
|
|
+ &(_c)->btree_cache.table), \
|
|
+ _iter = 0; _iter < (_tbl)->size; _iter++) \
|
|
+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
|
|
+
|
|
+/*
|
|
+ * Size of a btree node in bytes: opts.btree_node_size shifted left by 9
|
|
+ * (presumably a count of 512-byte sectors - confirm against the option).
|
|
+ */
|
|
+static inline size_t btree_bytes(struct bch_fs *c)
|
|
+{
|
|
+ return c->opts.btree_node_size << 9;
|
|
+}
|
|
+
|
|
+/* Number of u64s that fit in a btree node after its struct btree_node header. */
|
|
+static inline size_t btree_max_u64s(struct bch_fs *c)
|
|
+{
|
|
+	size_t payload = btree_bytes(c) - sizeof(struct btree_node);
|
|
+
|
|
+	return payload / sizeof(u64);
|
|
+}
|
|
+
|
|
+/* Size of a btree node in whole pages: btree_bytes() / PAGE_SIZE, rounded down. */
|
|
+static inline size_t btree_pages(struct bch_fs *c)
|
|
+{
|
|
+ return btree_bytes(c) / PAGE_SIZE;
|
|
+}
|
|
+
|
|
+/* Size of a btree node in blocks: opts.btree_node_size shifted right by block_bits. */
|
|
+static inline unsigned btree_blocks(struct bch_fs *c)
|
|
+{
|
|
+ return c->opts.btree_node_size >> c->block_bits;
|
|
+}
|
|
+
|
|
+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3)
|
|
+
|
|
+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3)
|
|
+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \
|
|
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \
|
|
+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2))
|
|
+
|
|
+#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b)
|
|
+
|
|
+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct btree *);
|
|
+void bch2_btree_cache_to_text(struct printbuf *, struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_CACHE_H */
|
|
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
|
|
new file mode 100644
|
|
index 000000000000..24fa279d1cdb
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_gc.c
|
|
@@ -0,0 +1,1598 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ * Copyright (C) 2014 Datera Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_background.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_locking.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_gc.h"
|
|
+#include "buckets.h"
|
|
+#include "clock.h"
|
|
+#include "debug.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "journal.h"
|
|
+#include "keylist.h"
|
|
+#include "move.h"
|
|
+#include "recovery.h"
|
|
+#include "replicas.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/slab.h>
|
|
+#include <linux/bitops.h>
|
|
+#include <linux/freezer.h>
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/preempt.h>
|
|
+#include <linux/rcupdate.h>
|
|
+#include <linux/sched/task.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/*
|
|
+ * Store @new_pos in c->gc_pos inside a gc_pos_lock seqcount write section,
|
|
+ * with preemption disabled around the update.  No ordering check is done
|
|
+ * here; see gc_pos_set() for the checked variant.
|
|
+ */
|
|
+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
|
|
+{
|
|
+ preempt_disable();
|
|
+ write_seqcount_begin(&c->gc_pos_lock);
|
|
+ c->gc_pos = new_pos;
|
|
+ write_seqcount_end(&c->gc_pos_lock);
|
|
+ preempt_enable();
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Advance c->gc_pos to @new_pos.  BUG()s unless @new_pos compares strictly
|
|
+ * greater than the current position according to gc_pos_cmp().
|
|
+ */
|
|
+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
|
|
+{
|
|
+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0);
|
|
+ __gc_pos_set(c, new_pos);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Missing: if an interior btree node is empty, we need to do something -
|
|
+ * perhaps just kill it
|
|
+ */
|
|
+static int bch2_gc_check_topology(struct bch_fs *c,
|
|
+ struct btree *b,
|
|
+ struct bkey_buf *prev,
|
|
+ struct bkey_buf cur,
|
|
+ bool is_last)
|
|
+{
|
|
+ struct bpos node_start = b->data->min_key;
|
|
+ struct bpos node_end = b->data->max_key;
|
|
+ struct bpos expected_start = bkey_deleted(&prev->k->k)
|
|
+ ? node_start
|
|
+ : bpos_successor(prev->k->k.p);
|
|
+ char buf1[200], buf2[200];
|
|
+ int ret = 0;
|
|
+
|
|
+ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
|
|
+
|
|
+ if (bkey_deleted(&prev->k->k)) {
|
|
+ struct printbuf out = PBUF(buf1);
|
|
+ pr_buf(&out, "start of node: ");
|
|
+ bch2_bpos_to_text(&out, node_start);
|
|
+ } else {
|
|
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
|
|
+ }
|
|
+
|
|
+ if (bpos_cmp(expected_start, bp->v.min_key)) {
|
|
+ bch2_topology_error(c);
|
|
+
|
|
+ if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n"
|
|
+ " prev %s\n"
|
|
+ " cur %s",
|
|
+ bch2_btree_ids[b->c.btree_id], b->c.level,
|
|
+ buf1,
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
|
|
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
|
|
+ return FSCK_ERR_START_TOPOLOGY_REPAIR;
|
|
+ } else {
|
|
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
|
|
+ bch2_topology_error(c);
|
|
+
|
|
+ if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n"
|
|
+ " %s\n"
|
|
+ " expected %s",
|
|
+ bch2_btree_ids[b->c.btree_id], b->c.level,
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
|
|
+ (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
|
|
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
|
|
+ return FSCK_ERR_START_TOPOLOGY_REPAIR;
|
|
+ } else {
|
|
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_bkey_buf_copy(prev, c, cur.k);
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Build in @dst a btree_ptr_v2 key describing node @b.
|
|
+ *
|
|
+ * A v2 key is copied as-is.  A v1 key is converted: the key header and the
|
|
+ * pointer data come from the v1 key, seq and min_key come from the node
|
|
+ * itself (b->data), and the remaining v2 fields are zeroed.  Any other key
|
|
+ * type is a BUG().
|
|
+ *
|
|
+ * @dst must be large enough for the v2 value plus the value of b->key; the
|
|
+ * callers in this file allocate BKEY_BTREE_PTR_U64s_MAX u64s for it.
|
|
+ */
|
|
+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
|
|
+{
|
|
+	switch (b->key.k.type) {
|
|
+	case KEY_TYPE_btree_ptr: {
|
|
+		struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
|
|
+
|
|
+		/*
|
|
+		 * @dst may be uninitialized memory (callers kmalloc() it), so
|
|
+		 * start from a copy of the source key header: that leaves no
|
|
+		 * field of dst->k undefined and carries k.p over.  Then
|
|
+		 * retype the key; its size is fixed up below.
|
|
+		 */
|
|
+		dst->k			= src->k;
|
|
+		dst->k.type		= KEY_TYPE_btree_ptr_v2;
|
|
+
|
|
+		dst->v.mem_ptr		= 0;
|
|
+		dst->v.seq		= b->data->keys.seq;
|
|
+		dst->v.sectors_written	= 0;
|
|
+		dst->v.flags		= 0;
|
|
+		dst->v.min_key		= b->data->min_key;
|
|
+		set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
|
|
+		memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
|
|
+		break;
|
|
+	}
|
|
+	case KEY_TYPE_btree_ptr_v2:
|
|
+		bkey_copy(&dst->k_i, &b->key);
|
|
+		break;
|
|
+	default:
|
|
+		BUG();
|
|
+	}
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Move the start of node @b to @new_min: update b->data->min_key, insert a
|
|
+ * btree_ptr_v2 journal key carrying the new min_key at level b->c.level + 1,
|
|
+ * and drop keys in @b that now fall outside the node.
|
|
+ *
|
|
+ * Returns 0 on success, -ENOMEM if the key could not be allocated, or the
|
|
+ * error from bch2_journal_key_insert().
|
|
+ */
|
|
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
|
|
+{
|
|
+	struct bkey_i_btree_ptr_v2 *ptr;
|
|
+	int err;
|
|
+
|
|
+	ptr = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
|
|
+	if (ptr == NULL)
|
|
+		return -ENOMEM;
|
|
+
|
|
+	btree_ptr_to_v2(b, ptr);
|
|
+	b->data->min_key = new_min;
|
|
+	ptr->v.min_key = new_min;
|
|
+	SET_BTREE_PTR_RANGE_UPDATED(&ptr->v, true);
|
|
+
|
|
+	err = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &ptr->k_i);
|
|
+	if (!err) {
|
|
+		bch2_btree_node_drop_keys_outside_node(b);
|
|
+		return 0;
|
|
+	}
|
|
+
|
|
+	/* Insert failed, so the key is still ours to free: */
|
|
+	kfree(ptr);
|
|
+	return err;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Move the end of node @b to @new_max: delete the journal key at the node's
|
|
+ * old position (level b->c.level + 1), insert a btree_ptr_v2 key ending at
|
|
+ * @new_max in its place, drop keys in @b that now fall outside the node, and
|
|
+ * rehash the node under its new key.
|
|
+ *
|
|
+ * Returns 0 on success, -ENOMEM if the new key could not be allocated, or the
|
|
+ * error from the journal key delete/insert.
|
|
+ */
|
|
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
|
|
+{
|
|
+	struct bkey_i_btree_ptr_v2 *new;
|
|
+	int ret;
|
|
+
|
|
+	/*
|
|
+	 * Allocate before touching the journal keys, so that an allocation
|
|
+	 * failure cannot leave the old pointer deleted with no replacement:
|
|
+	 */
|
|
+	new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
|
|
+	if (!new)
|
|
+		return -ENOMEM;
|
|
+
|
|
+	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	btree_ptr_to_v2(b, new);
|
|
+	b->data->max_key	= new_max;
|
|
+	new->k.p		= new_max;
|
|
+	SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
|
|
+
|
|
+	ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	bch2_btree_node_drop_keys_outside_node(b);
|
|
+
|
|
+	/* The node's key changes, so it has to be rehashed under the new one: */
|
|
+	mutex_lock(&c->btree_cache.lock);
|
|
+	bch2_btree_node_hash_remove(&c->btree_cache, b);
|
|
+
|
|
+	bkey_copy(&b->key, &new->k_i);
|
|
+	ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
|
|
+	BUG_ON(ret);
|
|
+	mutex_unlock(&c->btree_cache.lock);
|
|
+	return 0;
|
|
+err:
|
|
+	kfree(new);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+static int btree_repair_node_start(struct bch_fs *c, struct btree *b,
|
|
+ struct btree *prev, struct btree *cur)
|
|
+{
|
|
+ struct bpos expected_start = !prev
|
|
+ ? b->data->min_key
|
|
+ : bpos_successor(prev->key.k.p);
|
|
+ char buf1[200], buf2[200];
|
|
+ int ret = 0;
|
|
+
|
|
+ if (!prev) {
|
|
+ struct printbuf out = PBUF(buf1);
|
|
+ pr_buf(&out, "start of node: ");
|
|
+ bch2_bpos_to_text(&out, b->data->min_key);
|
|
+ } else {
|
|
+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
|
|
+ }
|
|
+
|
|
+ if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
|
|
+ "btree node with incorrect min_key at btree %s level %u:\n"
|
|
+ " prev %s\n"
|
|
+ " cur %s",
|
|
+ bch2_btree_ids[b->c.btree_id], b->c.level,
|
|
+ buf1,
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) {
|
|
+ if (prev &&
|
|
+ bpos_cmp(expected_start, cur->data->min_key) > 0 &&
|
|
+ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data))
|
|
+ ret = set_node_max(c, prev,
|
|
+ bpos_predecessor(cur->data->min_key));
|
|
+ else
|
|
+ ret = set_node_min(c, cur, expected_start);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+/*
|
|
+ * Check that @child (the last child the caller visited) ends at the same
|
|
+ * position as its parent @b; if not, set @child's max to @b's key position
|
|
+ * via set_node_max(). Returns ret (0 when nothing failed).
|
|
+ */
|
|
+static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
|
|
+ struct btree *child)
|
|
+{
|
|
+ char buf1[200], buf2[200];
|
|
+ int ret = 0;
|
|
+ /* bpos_cmp() is nonzero when the two end positions differ */
|
|
+ if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
|
|
+ "btree node with incorrect max_key at btree %s level %u:\n"
|
|
+ " %s\n"
|
|
+ " expected %s",
|
|
+ bch2_btree_ids[b->c.btree_id], b->c.level,
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
|
|
+ (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
|
|
+ ret = set_node_max(c, child, b->key.k.p);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Private return code of bch2_btree_repair_topology_recurse(): the node that
|
|
+ * was passed in has no children left, so the caller should drop the key
|
|
+ * pointing at it.
|
|
+ */
|
|
+#define DROP_THIS_NODE		10
|
|
+
|
|
+/*
|
|
+ * Repair the topology below interior node @b (leaf nodes return 0 at once).
|
|
+ *
|
|
+ * Pass 1 walks @b's children in key order. Unreadable (-EIO) children are
|
|
+ * evicted and their keys deleted through the journal keys; for the others the
|
|
+ * start of each child is checked against its predecessor and the end of the
|
|
+ * last child against @b.
|
|
+ *
|
|
+ * Pass 2 recurses into every child; a child reporting DROP_THIS_NODE is
|
|
+ * evicted and deleted, and the whole procedure is restarted for @b.
|
|
+ *
|
|
+ * Returns 0, DROP_THIS_NODE when @b has no children, or an error code.
|
|
+ */
|
|
+static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+	struct btree_and_journal_iter iter;
|
|
+	struct bkey_s_c k;
|
|
+	struct bkey_buf tmp;
|
|
+	struct btree *prev = NULL, *cur = NULL;
|
|
+	bool have_child, dropped_children = false;
|
|
+	char buf[200];
|
|
+	int ret = 0;
|
|
+
|
|
+	if (!b->c.level)
|
|
+		return 0;
|
|
+again:
|
|
+	have_child = dropped_children = false;
|
|
+	bch2_bkey_buf_init(&tmp);
|
|
+	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
|
|
+
|
|
+	/* Pass 1: fix up boundaries between adjacent children */
|
|
+	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
|
|
+		bch2_btree_and_journal_iter_advance(&iter);
|
|
+		bch2_bkey_buf_reassemble(&tmp, c, k);
|
|
+
|
|
+		cur = bch2_btree_node_get_noiter(c, tmp.k,
|
|
+					b->c.btree_id, b->c.level - 1,
|
|
+					false);
|
|
+		ret = PTR_ERR_OR_ZERO(cur);
|
|
+
|
|
+		if (mustfix_fsck_err_on(ret == -EIO, c,
|
|
+				"Unreadable btree node at btree %s level %u:\n"
|
|
+				" %s",
|
|
+				bch2_btree_ids[b->c.btree_id],
|
|
+				b->c.level - 1,
|
|
+				(bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) {
|
|
+			bch2_btree_node_evict(c, tmp.k);
|
|
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
|
|
+						      b->c.level, tmp.k->k.p);
|
|
+			/*
|
|
+			 * cur holds ERR_PTR(-EIO) here, not a locked node.
|
|
+			 * Clear it so that, if this was the last child, the
|
|
+			 * BUG_ON(cur) after the loop does not trigger.
|
|
+			 */
|
|
+			cur = NULL;
|
|
+			if (ret)
|
|
+				goto err;
|
|
+			continue;
|
|
+		}
|
|
+
|
|
+		if (ret) {
|
|
+			bch_err(c, "%s: error %i getting btree node",
|
|
+				__func__, ret);
|
|
+			break;
|
|
+		}
|
|
+
|
|
+		ret = btree_repair_node_start(c, b, prev, cur);
|
|
+		if (prev)
|
|
+			six_unlock_read(&prev->c.lock);
|
|
+		prev = cur;
|
|
+		cur = NULL;
|
|
+
|
|
+		if (ret)
|
|
+			break;
|
|
+	}
|
|
+
|
|
+	/* prev is now the last readable child: its end must match b's end */
|
|
+	if (!ret && !IS_ERR_OR_NULL(prev)) {
|
|
+		BUG_ON(cur);
|
|
+		ret = btree_repair_node_end(c, b, prev);
|
|
+	}
|
|
+
|
|
+	if (!IS_ERR_OR_NULL(prev))
|
|
+		six_unlock_read(&prev->c.lock);
|
|
+	prev = NULL;
|
|
+	if (!IS_ERR_OR_NULL(cur))
|
|
+		six_unlock_read(&cur->c.lock);
|
|
+	cur = NULL;
|
|
+
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	/* Pass 2: restart the iterator and recurse into each child */
|
|
+	bch2_btree_and_journal_iter_exit(&iter);
|
|
+	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
|
|
+
|
|
+	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
|
|
+		bch2_bkey_buf_reassemble(&tmp, c, k);
|
|
+		bch2_btree_and_journal_iter_advance(&iter);
|
|
+
|
|
+		cur = bch2_btree_node_get_noiter(c, tmp.k,
|
|
+					b->c.btree_id, b->c.level - 1,
|
|
+					false);
|
|
+		ret = PTR_ERR_OR_ZERO(cur);
|
|
+
|
|
+		if (ret) {
|
|
+			bch_err(c, "%s: error %i getting btree node",
|
|
+				__func__, ret);
|
|
+			goto err;
|
|
+		}
|
|
+
|
|
+		ret = bch2_btree_repair_topology_recurse(c, cur);
|
|
+		six_unlock_read(&cur->c.lock);
|
|
+		cur = NULL;
|
|
+
|
|
+		if (ret == DROP_THIS_NODE) {
|
|
+			bch2_btree_node_evict(c, tmp.k);
|
|
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
|
|
+						      b->c.level, tmp.k->k.p);
|
|
+			dropped_children = true;
|
|
+		}
|
|
+
|
|
+		if (ret)
|
|
+			goto err;
|
|
+
|
|
+		have_child = true;
|
|
+	}
|
|
+
|
|
+	if (mustfix_fsck_err_on(!have_child, c,
|
|
+			"empty interior btree node at btree %s level %u\n"
|
|
+			" %s",
|
|
+			bch2_btree_ids[b->c.btree_id],
|
|
+			b->c.level,
|
|
+			(bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
|
|
+		ret = DROP_THIS_NODE;
|
|
+err:
|
|
+fsck_err:
|
|
+	if (!IS_ERR_OR_NULL(prev))
|
|
+		six_unlock_read(&prev->c.lock);
|
|
+	if (!IS_ERR_OR_NULL(cur))
|
|
+		six_unlock_read(&cur->c.lock);
|
|
+
|
|
+	bch2_btree_and_journal_iter_exit(&iter);
|
|
+	bch2_bkey_buf_exit(&tmp, c);
|
|
+
|
|
+	/* children were dropped: boundaries must be rechecked from scratch */
|
|
+	if (!ret && dropped_children)
|
|
+		goto again;
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Run topology repair on every real (non-fake) btree root, stopping at the
|
|
+ * first nonzero result. An empty root cannot be repaired yet and is turned
|
|
+ * into FSCK_ERR_EXIT.
|
|
+ */
|
|
+static int bch2_repair_topology(struct bch_fs *c)
|
|
+{
|
|
+	unsigned id;
|
|
+	int ret = 0;
|
|
+
|
|
+	for (id = 0; id < BTREE_ID_NR; id++) {
|
|
+		struct btree *root = c->btree_roots[id].b;
|
|
+
|
|
+		if (btree_node_fake(root))
|
|
+			continue;
|
|
+
|
|
+		six_lock_read(&root->c.lock, NULL, NULL);
|
|
+		ret = bch2_btree_repair_topology_recurse(c, root);
|
|
+		six_unlock_read(&root->c.lock);
|
|
+
|
|
+		if (ret == DROP_THIS_NODE) {
|
|
+			bch_err(c, "empty btree root - repair unimplemented");
|
|
+			ret = FSCK_ERR_EXIT;
|
|
+		}
|
|
+
|
|
+		if (ret)
|
|
+			break;
|
|
+	}
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check every pointer of *@k against the gc bucket state and the gc stripes,
|
|
+ * reporting problems through fsck_err_on().
|
|
+ *
|
|
+ * Cached pointers with a missing or future gen are fixed by adjusting the
|
|
+ * bucket gens in place. Anything else sets do_update, in which case a
|
|
+ * rewritten copy of the key is inserted through bch2_journal_key_insert() and
|
|
+ * *@k is redirected to that copy. For interior nodes (@level != 0) pointers
|
|
+ * are never dropped, only their gens are refreshed.
|
|
+ *
|
|
+ * Returns 0 on success, -EINVAL when a root key would need updating, -ENOMEM
|
|
+ * on allocation failure, or the error from the journal key insert.
|
|
+ */
|
|
+static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
|
|
+			       unsigned level, bool is_root,
|
|
+			       struct bkey_s_c *k)
|
|
+{
|
|
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
|
|
+	const union bch_extent_entry *entry;
|
|
+	struct extent_ptr_decoded p = { 0 };
|
|
+	bool do_update = false;
|
|
+	int ret = 0;
|
|
+
|
|
+	bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
|
|
+		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
|
|
+		struct bucket *g = PTR_BUCKET(ca, &p.ptr, true);
|
|
+		struct bucket *g2 = PTR_BUCKET(ca, &p.ptr, false);
|
|
+
|
|
+		if (fsck_err_on(!g->gen_valid, c,
|
|
+				"bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
|
|
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
|
|
+				bch2_data_types[ptr_data_type(k->k, &p.ptr)],
|
|
+				p.ptr.gen)) {
|
|
+			if (p.ptr.cached) {
|
|
+				g2->_mark.gen	= g->_mark.gen		= p.ptr.gen;
|
|
+				g2->gen_valid	= g->gen_valid		= true;
|
|
+				set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
|
|
+			} else {
|
|
+				do_update = true;
|
|
+			}
|
|
+		}
|
|
+
|
|
+		if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
|
|
+				"bucket %u:%zu data type %s ptr gen in the future: %u > %u",
|
|
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
|
|
+				bch2_data_types[ptr_data_type(k->k, &p.ptr)],
|
|
+				p.ptr.gen, g->mark.gen)) {
|
|
+			if (p.ptr.cached) {
|
|
+				g2->_mark.gen	= g->_mark.gen		= p.ptr.gen;
|
|
+				g2->gen_valid	= g->gen_valid		= true;
|
|
+				g2->_mark.data_type		= 0;
|
|
+				g2->_mark.dirty_sectors		= 0;
|
|
+				g2->_mark.cached_sectors	= 0;
|
|
+				set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
|
|
+				set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
|
|
+			} else {
|
|
+				do_update = true;
|
|
+			}
|
|
+		}
|
|
+
|
|
+		if (fsck_err_on(!p.ptr.cached &&
|
|
+				gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
|
|
+				"bucket %u:%zu data type %s stale dirty ptr: %u < %u",
|
|
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
|
|
+				bch2_data_types[ptr_data_type(k->k, &p.ptr)],
|
|
+				p.ptr.gen, g->mark.gen))
|
|
+			do_update = true;
|
|
+
|
|
+		if (p.has_ec) {
|
|
+			struct stripe *m = genradix_ptr(&c->stripes[true], p.ec.idx);
|
|
+
|
|
+			if (fsck_err_on(!m || !m->alive, c,
|
|
+					"pointer to nonexistent stripe %llu",
|
|
+					(u64) p.ec.idx))
|
|
+				do_update = true;
|
|
+
|
|
+			/*
|
|
+			 * Only compare against a stripe that exists and is
|
|
+			 * alive: a missing stripe was reported just above and
|
|
+			 * must not be handed to the match helper.
|
|
+			 */
|
|
+			if (fsck_err_on(m && m->alive &&
|
|
+					!bch2_ptr_matches_stripe_m(m, p), c,
|
|
+					"pointer does not match stripe %llu",
|
|
+					(u64) p.ec.idx))
|
|
+				do_update = true;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	if (do_update) {
|
|
+		struct bkey_ptrs ptrs;
|
|
+		union bch_extent_entry *entry;
|
|
+		struct bch_extent_ptr *ptr;
|
|
+		struct bkey_i *new;
|
|
+
|
|
+		if (is_root) {
|
|
+			bch_err(c, "cannot update btree roots yet");
|
|
+			return -EINVAL;
|
|
+		}
|
|
+
|
|
+		new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
|
|
+		if (!new) {
|
|
+			bch_err(c, "%s: error allocating new key", __func__);
|
|
+			return -ENOMEM;
|
|
+		}
|
|
+
|
|
+		bkey_reassemble(new, *k);
|
|
+
|
|
+		if (level) {
|
|
+			/*
|
|
+			 * We don't want to drop btree node pointers - if the
|
|
+			 * btree node isn't there anymore, the read path will
|
|
+			 * sort it out:
|
|
+			 */
|
|
+			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
|
|
+			bkey_for_each_ptr(ptrs, ptr) {
|
|
+				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
|
+				struct bucket *g = PTR_BUCKET(ca, ptr, true);
|
|
+
|
|
+				ptr->gen = g->mark.gen;
|
|
+			}
|
|
+		} else {
|
|
+			/* drop the pointers flagged by the checks above */
|
|
+			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
|
|
+				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
|
+				struct bucket *g = PTR_BUCKET(ca, ptr, true);
|
|
+
|
|
+				(ptr->cached &&
|
|
+				 (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
|
|
+				(!ptr->cached &&
|
|
+				 gen_cmp(ptr->gen, g->mark.gen) < 0);
|
|
+			}));
|
|
+again:
|
|
+			/* dropping an entry invalidates the walk, hence the restart */
|
|
+			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
|
|
+			bkey_extent_entry_for_each(ptrs, entry) {
|
|
+				if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) {
|
|
+					struct stripe *m = genradix_ptr(&c->stripes[true],
|
|
+									entry->stripe_ptr.idx);
|
|
+					union bch_extent_entry *next_ptr;
|
|
+
|
|
+					bkey_extent_entry_for_each_from(ptrs, next_ptr, entry)
|
|
+						if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr)
|
|
+							goto found;
|
|
+					next_ptr = NULL;
|
|
+found:
|
|
+					if (!next_ptr) {
|
|
+						bch_err(c, "aieee, found stripe ptr with no data ptr");
|
|
+						continue;
|
|
+					}
|
|
+
|
|
+					if (!m || !m->alive ||
|
|
+					    !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block],
|
|
+								       &next_ptr->ptr,
|
|
+								       m->sectors)) {
|
|
+						bch2_bkey_extent_entry_drop(new, entry);
|
|
+						goto again;
|
|
+					}
|
|
+				}
|
|
+			}
|
|
+		}
|
|
+
|
|
+		/* on success the new key is kept and *k is pointed at it */
|
|
+		ret = bch2_journal_key_insert(c, btree_id, level, new);
|
|
+		if (ret)
|
|
+			kfree(new);
|
|
+		else
|
|
+			*k = bkey_i_to_s_c(new);
|
|
+	}
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/* marking of btree keys/nodes: */
|
|
+/*
|
|
+ * Mark one key for gc. When @initial is set the key's pointers are first
|
|
+ * checked/repaired (which may redirect *@k to a new key), the filesystem key
|
|
+ * version is raised if the key's version is higher, and replicas are marked.
|
|
+ * In all cases each pointer's bucket oldest_gen and *@max_stale are updated
|
|
+ * and bch2_mark_key() is called with BTREE_TRIGGER_GC. Returns 0 or an error.
|
|
+ */
|
|
+static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
|
|
+ unsigned level, bool is_root,
|
|
+ struct bkey_s_c *k,
|
|
+ u8 *max_stale, bool initial)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs;
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+ unsigned flags =
|
|
+ BTREE_TRIGGER_GC|
|
|
+ (initial ? BTREE_TRIGGER_NOATOMIC : 0);
|
|
+ int ret = 0;
|
|
+
|
|
+ if (initial) {
|
|
+ BUG_ON(bch2_journal_seq_verify &&
|
|
+ k->k->version.lo > journal_cur_seq(&c->journal));
|
|
+ /* may replace *k with a repaired copy */
|
|
+ ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
|
|
+ "key version number higher than recorded: %llu > %llu",
|
|
+ k->k->version.lo,
|
|
+ atomic64_read(&c->key_version)))
|
|
+ atomic64_set(&c->key_version, k->k->version.lo);
|
|
+
|
|
+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
|
|
+ fsck_err_on(!bch2_bkey_replicas_marked(c, *k), c,
|
|
+ "superblock not marked as containing replicas (type %u)",
|
|
+ k->k->type)) {
|
|
+ ret = bch2_mark_bkey_replicas(c, *k);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error marking bkey replicas: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ /* re-read the pointers: *k may have changed above */
|
|
+ ptrs = bch2_bkey_ptrs_c(*k);
|
|
+ bkey_for_each_ptr(ptrs, ptr) {
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
|
+ struct bucket *g = PTR_BUCKET(ca, ptr, true);
|
|
+
|
|
+ if (gen_after(g->oldest_gen, ptr->gen))
|
|
+ g->oldest_gen = ptr->gen;
|
|
+
|
|
+ *max_stale = max(*max_stale, ptr_stale(ca, ptr));
|
|
+ }
|
|
+ /* NOTE(review): the result of bch2_mark_key() is not checked here */
|
|
+ bch2_mark_key(c, *k, 0, k->k->size, NULL, 0, flags);
|
|
+fsck_err:
|
|
+err:
|
|
+ if (ret)
|
|
+ bch_err(c, "%s: ret %i", __func__, ret);
|
|
+ return ret;
|
|
+}
|
|
+/*
|
|
+ * Mark every key in node @b for gc and, for interior nodes, run the topology
|
|
+ * check on each key. *@max_stale is reset to 0 and then raised by
|
|
+ * bch2_gc_mark_key(). Nodes whose type does not need gc are skipped.
|
|
+ * Returns 0 or the first error encountered.
|
|
+ */
|
|
+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
|
|
+ bool initial)
|
|
+{
|
|
+ struct btree_node_iter iter;
|
|
+ struct bkey unpacked;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_buf prev, cur;
|
|
+ int ret = 0;
|
|
+
|
|
+ *max_stale = 0;
|
|
+
|
|
+ if (!btree_node_type_needs_gc(btree_node_type(b)))
|
|
+ return 0;
|
|
+
|
|
+ bch2_btree_node_iter_init_from_start(&iter, b);
|
|
+ bch2_bkey_buf_init(&prev);
|
|
+ bch2_bkey_buf_init(&cur);
|
|
+ bkey_init(&prev.k->k);
|
|
+
|
|
+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
|
|
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
|
|
+ &k, max_stale, initial);
|
|
+ if (ret)
|
|
+ break;
|
|
+ /* advance first so bch2_btree_node_iter_end() below tells whether k was the last key */
|
|
+ bch2_btree_node_iter_advance(&iter, b);
|
|
+
|
|
+ if (b->c.level) {
|
|
+ bch2_bkey_buf_reassemble(&cur, c, k);
|
|
+
|
|
+ ret = bch2_gc_check_topology(c, b, &prev, cur,
|
|
+ bch2_btree_node_iter_end(&iter));
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_bkey_buf_exit(&cur, c);
|
|
+ bch2_bkey_buf_exit(&prev, c);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Mark one btree for gc by walking its nodes with a btree transaction, then
|
|
+ * mark the root key under btree_root_lock. Outside of the initial pass,
|
|
+ * nodes holding sufficiently stale pointers are queued for rewrite.
|
|
+ * Returns 0 or an error.
|
|
+ */
|
|
+static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
|
|
+			 bool initial, bool metadata_only)
|
|
+{
|
|
+	struct btree_trans trans;
|
|
+	struct btree_iter *iter;
|
|
+	struct btree *b;
|
|
+	unsigned depth;
|
|
+	u8 max_stale = 0;
|
|
+	int ret = 0;
|
|
+
|
|
+	/* lowest level to walk: 1 skips leaf nodes */
|
|
+	if (metadata_only)
|
|
+		depth = 1;
|
|
+	else if (bch2_expensive_debug_checks)
|
|
+		depth = 0;
|
|
+	else if (!btree_node_type_needs_gc(btree_id))
|
|
+		depth = 1;
|
|
+	else
|
|
+		depth = 0;
|
|
+
|
|
+	bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+	gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
|
|
+
|
|
+	__for_each_btree_node(&trans, iter, btree_id, POS_MIN,
|
|
+			      0, depth, BTREE_ITER_PREFETCH, b) {
|
|
+		bch2_verify_btree_nr_keys(b);
|
|
+
|
|
+		gc_pos_set(c, gc_pos_btree_node(b));
|
|
+
|
|
+		ret = btree_gc_mark_node(c, b, &max_stale, initial);
|
|
+		if (ret)
|
|
+			break;
|
|
+
|
|
+		/*
|
|
+		 * Above 64 stale gens a rewrite is always attempted; above 16
|
|
+		 * (or always, if so configured) only while rewrites are not
|
|
+		 * disabled.
|
|
+		 */
|
|
+		if (!initial &&
|
|
+		    (max_stale > 64 ||
|
|
+		     (!bch2_btree_gc_rewrite_disabled &&
|
|
+		      (bch2_btree_gc_always_rewrite || max_stale > 16))))
|
|
+			bch2_btree_node_rewrite(c, iter,
|
|
+					b->data->keys.seq,
|
|
+					BTREE_INSERT_NOWAIT|
|
|
+					BTREE_INSERT_GC_LOCK_HELD);
|
|
+
|
|
+		bch2_trans_cond_resched(&trans);
|
|
+	}
|
|
+	bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+	ret = bch2_trans_exit(&trans) ?: ret;
|
|
+	if (ret)
|
|
+		return ret;
|
|
+
|
|
+	mutex_lock(&c->btree_root_lock);
|
|
+	b = c->btree_roots[btree_id].b;
|
|
+	if (!btree_node_fake(b)) {
|
|
+		struct bkey_s_c root_key = bkey_i_to_s_c(&b->key);
|
|
+
|
|
+		ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
|
|
+				       &root_key, &max_stale, initial);
|
|
+	}
|
|
+	gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
|
|
+	mutex_unlock(&c->btree_root_lock);
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+/*
|
|
+ * Initial-gc marking of node @b and, while @b's level is above @target_depth,
|
|
+ * of its children. Keys come from the combined btree and journal iterator.
|
|
+ * First every key in @b is marked (and topology-checked on interior nodes);
|
|
+ * then the iterator is restarted and each child is read and recursed into.
|
|
+ * Returns 0, FSCK_ERR_START_TOPOLOGY_REPAIR, or an error.
|
|
+ */
|
|
+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
|
|
+ unsigned target_depth)
|
|
+{
|
|
+ struct btree_and_journal_iter iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_buf cur, prev;
|
|
+ u8 max_stale = 0;
|
|
+ char buf[200];
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
|
|
+ bch2_bkey_buf_init(&prev);
|
|
+ bch2_bkey_buf_init(&cur);
|
|
+ bkey_init(&prev.k->k);
|
|
+
|
|
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
|
|
+ BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0);
|
|
+ BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0);
|
|
+
|
|
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
|
|
+ &k, &max_stale, true);
|
|
+ if (ret) {
|
|
+ bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
|
|
+ goto fsck_err;
|
|
+ }
|
|
+
|
|
+ if (b->c.level) {
|
|
+ bch2_bkey_buf_reassemble(&cur, c, k);
|
|
+ k = bkey_i_to_s_c(cur.k);
|
|
+ /* advance before the check so the peek below detects the last key */
|
|
+ bch2_btree_and_journal_iter_advance(&iter);
|
|
+
|
|
+ ret = bch2_gc_check_topology(c, b,
|
|
+ &prev, cur,
|
|
+ !bch2_btree_and_journal_iter_peek(&iter).k);
|
|
+ if (ret)
|
|
+ goto fsck_err;
|
|
+ } else {
|
|
+ bch2_btree_and_journal_iter_advance(&iter);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (b->c.level > target_depth) {
|
|
+ bch2_btree_and_journal_iter_exit(&iter);
|
|
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
|
|
+
|
|
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
|
|
+ struct btree *child;
|
|
+
|
|
+ bch2_bkey_buf_reassemble(&cur, c, k);
|
|
+ bch2_btree_and_journal_iter_advance(&iter);
|
|
+
|
|
+ child = bch2_btree_node_get_noiter(c, cur.k,
|
|
+ b->c.btree_id, b->c.level - 1,
|
|
+ false);
|
|
+ ret = PTR_ERR_OR_ZERO(child);
|
|
+
|
|
+ if (ret == -EIO) {
|
|
+ bch2_topology_error(c);
|
|
+
|
|
+ if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n"
|
|
+ " %s",
|
|
+ bch2_btree_ids[b->c.btree_id],
|
|
+ b->c.level - 1,
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) {
|
|
+ ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
|
|
+ bch_info(c, "Halting mark and sweep to start topology repair pass");
|
|
+ goto fsck_err;
|
|
+ } else {
|
|
+ /* Continue marking when opted to not
|
|
+ * fix the error: */
|
|
+ ret = 0;
|
|
+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
|
|
+ continue;
|
|
+ }
|
|
+ } else if (ret) {
|
|
+ bch_err(c, "%s: error %i getting btree node",
|
|
+ __func__, ret);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = bch2_gc_btree_init_recurse(c, child,
|
|
+ target_depth);
|
|
+ six_unlock_read(&child->c.lock);
|
|
+
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+fsck_err:
|
|
+ bch2_bkey_buf_exit(&cur, c);
|
|
+ bch2_bkey_buf_exit(&prev, c);
|
|
+ bch2_btree_and_journal_iter_exit(&iter);
|
|
+ return ret;
|
|
+}
|
|
+/*
|
|
+ * Initial-gc entry point for one btree: verify that the root spans POS_MIN to
|
|
+ * POS_MAX (repair is unimplemented, so a mismatch yields FSCK_ERR_EXIT),
|
|
+ * recurse from the root down to target_depth, then mark the root key itself.
|
|
+ * Fake roots are skipped. Returns 0 or a nonzero fsck/error code.
|
|
+ */
|
|
+static int bch2_gc_btree_init(struct bch_fs *c,
|
|
+ enum btree_id btree_id,
|
|
+ bool metadata_only)
|
|
+{
|
|
+ struct btree *b;
|
|
+ unsigned target_depth = metadata_only ? 1
|
|
+ : bch2_expensive_debug_checks ? 0
|
|
+ : !btree_node_type_needs_gc(btree_id) ? 1
|
|
+ : 0;
|
|
+ u8 max_stale = 0;
|
|
+ char buf[100];
|
|
+ int ret = 0;
|
|
+
|
|
+ b = c->btree_roots[btree_id].b;
|
|
+
|
|
+ if (btree_node_fake(b))
|
|
+ return 0;
|
|
+
|
|
+ six_lock_read(&b->c.lock, NULL, NULL);
|
|
+ if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
|
|
+ "btree root with incorrect min_key: %s",
|
|
+ (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
|
|
+ bch_err(c, "repair unimplemented");
|
|
+ ret = FSCK_ERR_EXIT;
|
|
+ goto fsck_err;
|
|
+ }
|
|
+
|
|
+ if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
|
|
+ "btree root with incorrect max_key: %s",
|
|
+ (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
|
|
+ bch_err(c, "repair unimplemented");
|
|
+ ret = FSCK_ERR_EXIT;
|
|
+ goto fsck_err;
|
|
+ }
|
|
+
|
|
+ if (b->c.level >= target_depth)
|
|
+ ret = bch2_gc_btree_init_recurse(c, b, target_depth);
|
|
+ /* the root key is marked last, and only if everything below succeeded */
|
|
+ if (!ret) {
|
|
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
|
|
+
|
|
+ ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
|
|
+ &k, &max_stale, true);
|
|
+ }
|
|
+fsck_err:
|
|
+ six_unlock_read(&b->c.lock);
|
|
+ /* positive values are not logged here */
|
|
+ if (ret < 0)
|
|
+ bch_err(c, "%s: ret %i", __func__, ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Comparator ordering btree ids by the gc phase they map to. */
|
|
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
|
|
+{
|
|
+	int lhs = (int) btree_id_to_gc_phase(l);
|
|
+	int rhs = (int) btree_id_to_gc_phase(r);
|
|
+
|
|
+	return lhs - rhs;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Run the gc mark pass over all btrees, in gc-phase order, stopping at the
|
|
+ * first nonzero result. Only negative results are logged.
|
|
+ */
|
|
+static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
|
|
+{
|
|
+	enum btree_id order[BTREE_ID_NR];
|
|
+	unsigned idx;
|
|
+	int ret = 0;
|
|
+
|
|
+	for (idx = 0; idx < BTREE_ID_NR; idx++)
|
|
+		order[idx] = idx;
|
|
+	bubble_sort(order, BTREE_ID_NR, btree_id_gc_phase_cmp);
|
|
+
|
|
+	for (idx = 0; idx < BTREE_ID_NR; idx++) {
|
|
+		if (initial)
|
|
+			ret = bch2_gc_btree_init(c, order[idx], metadata_only);
|
|
+		else
|
|
+			ret = bch2_gc_btree(c, order[idx], initial, metadata_only);
|
|
+
|
|
+		if (ret)
|
|
+			break;
|
|
+	}
|
|
+
|
|
+	if (ret < 0)
|
|
+		bch_err(c, "%s: ret %i", __func__, ret);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Mark the sector range [@start, @end) on @ca as metadata of @type, one
|
|
+ * bucket at a time. At least one bucket is always marked.
|
|
+ */
|
|
+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
|
|
+				  u64 start, u64 end,
|
|
+				  enum bch_data_type type,
|
|
+				  unsigned flags)
|
|
+{
|
|
+	u64 bucket = sector_to_bucket(ca, start);
|
|
+
|
|
+	for (;;) {
|
|
+		/* sectors of the range that fall inside this bucket */
|
|
+		unsigned sectors =
|
|
+			min_t(u64, bucket_to_sector(ca, bucket + 1), end) - start;
|
|
+
|
|
+		bch2_mark_metadata_bucket(c, ca, bucket, type, sectors,
|
|
+					  gc_phase(GC_PHASE_SB), flags);
|
|
+		bucket++;
|
|
+		start += sectors;
|
|
+
|
|
+		if (start >= end)
|
|
+			break;
|
|
+	}
|
|
+}
|
|
+/*
|
|
+ * Mark the buckets of @ca that hold superblock copies (BCH_DATA_sb) and
|
|
+ * journal buckets (BCH_DATA_journal). @c may be NULL (see below); when it is
|
|
+ * not, sb_lock must be held and mark_lock is taken for read.
|
|
+ */
|
|
+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
|
|
+ unsigned i;
|
|
+ u64 b;
|
|
+
|
|
+ /*
|
|
+ * This conditional is kind of gross, but we may be called from the
|
|
+ * device add path, before the new device has actually been added to the
|
|
+ * running filesystem:
|
|
+ */
|
|
+ if (c) {
|
|
+ lockdep_assert_held(&c->sb_lock);
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < layout->nr_superblocks; i++) {
|
|
+ u64 offset = le64_to_cpu(layout->sb_offset[i]);
|
|
+ /* for the copy at BCH_SB_SECTOR, also mark sectors [0, BCH_SB_SECTOR) */
|
|
+ if (offset == BCH_SB_SECTOR)
|
|
+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
|
|
+ BCH_DATA_sb, flags);
|
|
+
|
|
+ mark_metadata_sectors(c, ca, offset,
|
|
+ offset + (1 << layout->sb_max_size_bits),
|
|
+ BCH_DATA_sb, flags);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < ca->journal.nr; i++) {
|
|
+ b = ca->journal.buckets[i];
|
|
+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
|
|
+ ca->mi.bucket_size,
|
|
+ gc_phase(GC_PHASE_SB), flags);
|
|
+ }
|
|
+
|
|
+ if (c)
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * gc phase GC_PHASE_SB: mark superblock and journal buckets of every online
|
|
+ * member device, under sb_lock.
|
|
+ */
|
|
+static void bch2_mark_superblocks(struct bch_fs *c)
|
|
+{
|
|
+	struct bch_dev *dev;
|
|
+	unsigned dev_idx;
|
|
+
|
|
+	mutex_lock(&c->sb_lock);
|
|
+	gc_pos_set(c, gc_phase(GC_PHASE_SB));
|
|
+
|
|
+	for_each_online_member(dev, c, dev_idx) {
|
|
+		bch2_mark_dev_superblock(c, dev, BTREE_TRIGGER_GC);
|
|
+	}
|
|
+
|
|
+	mutex_unlock(&c->sb_lock);
|
|
+}
|
|
+
|
|
+#if 0 /* compiled out; the only call site in bch2_gc() is under #if 0 as well */
|
|
+/* Also see bch2_pending_btree_node_free_insert_done() */
|
|
+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_update *as;
|
|
+ struct pending_btree_node_free *d;
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE));
|
|
+ /* mark keys of pending node frees whose index update has completed */
|
|
+ for_each_pending_btree_node_free(c, as, d)
|
|
+ if (d->index_update_done)
|
|
+ bch2_mark_key(c, bkey_i_to_s_c(&d->key),
|
|
+ 0, 0, NULL, 0,
|
|
+ BTREE_TRIGGER_GC);
|
|
+
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+}
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * Release all gc-side state: the gc stripes radix tree, every member
|
|
+ * device's gc bucket array and gc usage counters, and the filesystem-wide
|
|
+ * gc usage counters. Freed pointers are reset to NULL.
|
|
+ */
|
|
+static void bch2_gc_free(struct bch_fs *c)
|
|
+{
|
|
+	struct bch_dev *dev;
|
|
+	unsigned dev_idx;
|
|
+
|
|
+	genradix_free(&c->stripes[1]);
|
|
+
|
|
+	for_each_member_device(dev, c, dev_idx) {
|
|
+		/* same size expression as used when allocating in bch2_gc_start() */
|
|
+		kvpfree(rcu_dereference_protected(dev->buckets[1], 1),
|
|
+			sizeof(struct bucket_array) +
|
|
+			dev->mi.nbuckets * sizeof(struct bucket));
|
|
+		dev->buckets[1] = NULL;
|
|
+
|
|
+		free_percpu(dev->usage_gc);
|
|
+		dev->usage_gc = NULL;
|
|
+	}
|
|
+
|
|
+	free_percpu(c->usage_gc);
|
|
+	c->usage_gc = NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Finish a gc pass: compare the values gc computed (stripes[1], bucket array
|
|
+ * 1, usage_gc) with the live ones (stripes[0], bucket array 0, usage_base)
|
|
+ * and copy the gc values over wherever they differ, setting
|
|
+ * BCH_FS_NEED_ALLOC_WRITE.
|
|
+ *
|
|
+ * Differences are reported through fsck_err() only when verify is true, i.e.
|
|
+ * not for metadata-only passes, and for the initial pass only when the
|
|
+ * superblock has the alloc_info compat bit.
|
|
+ *
|
|
+ * Returns 0, -EINVAL on an unexpected stripe mismatch, -ENOMEM if a stripe
|
|
+ * slot cannot be allocated, or whatever fsck_err() leaves in ret.
|
|
+ */
|
|
+static int bch2_gc_done(struct bch_fs *c,
|
|
+			bool initial, bool metadata_only)
|
|
+{
|
|
+	struct bch_dev *ca;
|
|
+	bool verify = !metadata_only && (!initial ||
|
|
+		       (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
|
|
+	unsigned i, dev;
|
|
+	int ret = 0;
|
|
+
|
|
+	/* The copy_*() helpers expect `dst` and `src` in the enclosing scope. */
|
|
+#define copy_field(_f, _msg, ...)					\
|
|
+	if (dst->_f != src->_f) {					\
|
|
+		if (verify)						\
|
|
+			fsck_err(c, _msg ": got %llu, should be %llu"	\
|
|
+				, ##__VA_ARGS__, dst->_f, src->_f);	\
|
|
+		dst->_f = src->_f;					\
|
|
+		set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);		\
|
|
+	}
|
|
+#define copy_stripe_field(_f, _msg, ...)				\
|
|
+	if (dst->_f != src->_f) {					\
|
|
+		if (verify)						\
|
|
+			fsck_err(c, "stripe %zu has wrong "_msg		\
|
|
+				": got %u, should be %u",		\
|
|
+				iter.pos, ##__VA_ARGS__,		\
|
|
+				dst->_f, src->_f);			\
|
|
+		dst->_f = src->_f;					\
|
|
+		set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);		\
|
|
+	}
|
|
+#define copy_bucket_field(_f)						\
|
|
+	if (dst->b[b].mark._f != src->b[b].mark._f) {			\
|
|
+		if (verify)						\
|
|
+			fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f	\
|
|
+				": got %u, should be %u", dev, b,	\
|
|
+				dst->b[b].mark.gen,			\
|
|
+				bch2_data_types[dst->b[b].mark.data_type],\
|
|
+				dst->b[b].mark._f, src->b[b].mark._f);	\
|
|
+		dst->b[b]._mark._f = src->b[b].mark._f;			\
|
|
+		set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);		\
|
|
+	}
|
|
+#define copy_dev_field(_f, _msg, ...)					\
|
|
+	copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
|
|
+#define copy_fs_field(_f, _msg, ...)					\
|
|
+	copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
|
|
+
|
|
+	if (!metadata_only) {
|
|
+		struct genradix_iter iter = genradix_iter_init(&c->stripes[1], 0);
|
|
+		struct stripe *dst, *src;
|
|
+
|
|
+		while ((src = genradix_iter_peek(&iter, &c->stripes[1]))) {
|
|
+			dst = genradix_ptr_alloc(&c->stripes[0], iter.pos, GFP_KERNEL);
|
|
+			/* the allocation may fail; dst is dereferenced right below */
|
|
+			if (!dst) {
|
|
+				bch_err(c, "error allocating stripe in bch2_gc_done");
|
|
+				ret = -ENOMEM;
|
|
+				goto fsck_err;
|
|
+			}
|
|
+
|
|
+			if (dst->alive		!= src->alive ||
|
|
+			    dst->sectors	!= src->sectors ||
|
|
+			    dst->algorithm	!= src->algorithm ||
|
|
+			    dst->nr_blocks	!= src->nr_blocks ||
|
|
+			    dst->nr_redundant	!= src->nr_redundant) {
|
|
+				bch_err(c, "unexpected stripe inconsistency at bch2_gc_done, confused");
|
|
+				ret = -EINVAL;
|
|
+				goto fsck_err;
|
|
+			}
|
|
+
|
|
+			for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++)
|
|
+				copy_stripe_field(block_sectors[i],
|
|
+						  "block_sectors[%u]", i);
|
|
+
|
|
+			/* recount from the (possibly just corrected) per-block sectors */
|
|
+			dst->blocks_nonempty = 0;
|
|
+			for (i = 0; i < dst->nr_blocks; i++)
|
|
+				dst->blocks_nonempty += dst->block_sectors[i] != 0;
|
|
+
|
|
+			genradix_iter_advance(&iter, &c->stripes[1]);
|
|
+		}
|
|
+	}
|
|
+
|
|
+	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
|
|
+		bch2_fs_usage_acc_to_base(c, i);
|
|
+
|
|
+	for_each_member_device(ca, c, dev) {
|
|
+		struct bucket_array *dst = __bucket_array(ca, 0);
|
|
+		struct bucket_array *src = __bucket_array(ca, 1);
|
|
+		size_t b;
|
|
+
|
|
+		for (b = 0; b < src->nbuckets; b++) {
|
|
+			copy_bucket_field(gen);
|
|
+			copy_bucket_field(data_type);
|
|
+			copy_bucket_field(stripe);
|
|
+			copy_bucket_field(dirty_sectors);
|
|
+			copy_bucket_field(cached_sectors);
|
|
+
|
|
+			dst->b[b].oldest_gen = src->b[b].oldest_gen;
|
|
+		}
|
|
+
|
|
+		{
|
|
+			struct bch_dev_usage *dst = ca->usage_base;
|
|
+			struct bch_dev_usage *src = (void *)
|
|
+				bch2_acc_percpu_u64s((void *) ca->usage_gc,
|
|
+						     dev_usage_u64s());
|
|
+
|
|
+			copy_dev_field(buckets_ec,		"buckets_ec");
|
|
+			copy_dev_field(buckets_unavailable,	"buckets_unavailable");
|
|
+
|
|
+			for (i = 0; i < BCH_DATA_NR; i++) {
|
|
+				copy_dev_field(d[i].buckets,	"%s buckets", bch2_data_types[i]);
|
|
+				copy_dev_field(d[i].sectors,	"%s sectors", bch2_data_types[i]);
|
|
+				copy_dev_field(d[i].fragmented,	"%s fragmented", bch2_data_types[i]);
|
|
+			}
|
|
+		}
|
|
+	}
|
|
+
|
|
+	{
|
|
+		unsigned nr = fs_usage_u64s(c);
|
|
+		struct bch_fs_usage *dst = c->usage_base;
|
|
+		struct bch_fs_usage *src = (void *)
|
|
+			bch2_acc_percpu_u64s((void *) c->usage_gc, nr);
|
|
+
|
|
+		copy_fs_field(hidden,		"hidden");
|
|
+		copy_fs_field(btree,		"btree");
|
|
+
|
|
+		if (!metadata_only) {
|
|
+			copy_fs_field(data,	"data");
|
|
+			copy_fs_field(cached,	"cached");
|
|
+			copy_fs_field(reserved,	"reserved");
|
|
+			copy_fs_field(nr_inodes,"nr_inodes");
|
|
+
|
|
+			for (i = 0; i < BCH_REPLICAS_MAX; i++)
|
|
+				copy_fs_field(persistent_reserved[i],
|
|
+					      "persistent_reserved[%i]", i);
|
|
+		}
|
|
+
|
|
+		for (i = 0; i < c->replicas.nr; i++) {
|
|
+			struct bch_replicas_entry *e =
|
|
+				cpu_replicas_entry(&c->replicas, i);
|
|
+			char buf[80];
|
|
+
|
|
+			/* a metadata-only pass did not recompute user/cached data */
|
|
+			if (metadata_only &&
|
|
+			    (e->data_type == BCH_DATA_user ||
|
|
+			     e->data_type == BCH_DATA_cached))
|
|
+				continue;
|
|
+
|
|
+			bch2_replicas_entry_to_text(&PBUF(buf), e);
|
|
+
|
|
+			copy_fs_field(replicas[i], "%s", buf);
|
|
+		}
|
|
+	}
|
|
+
|
|
+#undef copy_fs_field
|
|
+#undef copy_dev_field
|
|
+#undef copy_bucket_field
|
|
+#undef copy_stripe_field
|
|
+#undef copy_field
|
|
+fsck_err:
|
|
+	if (ret)
|
|
+		bch_err(c, "%s: ret %i", __func__, ret);
|
|
+	return ret;
|
|
+}
|
|
+/*
|
|
+ * Allocate the gc-side state (c->usage_gc, and per device buckets[1] and
|
|
+ * usage_gc, plus ec gc memory), then, under mark_lock held for write, seed
|
|
+ * each gc bucket from the live bucket. Returns 0 or -ENOMEM / the error from
|
|
+ * bch2_ec_mem_alloc(). Nothing is freed here on failure; bch2_gc() calls
|
|
+ * bch2_gc_free() on its exit path.
|
|
+ */
|
|
+static int bch2_gc_start(struct bch_fs *c,
|
|
+ bool metadata_only)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ BUG_ON(c->usage_gc);
|
|
+
|
|
+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
|
|
+ sizeof(u64), GFP_KERNEL);
|
|
+ if (!c->usage_gc) {
|
|
+ bch_err(c, "error allocating c->usage_gc");
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ for_each_member_device(ca, c, i) {
|
|
+ BUG_ON(ca->buckets[1]);
|
|
+ BUG_ON(ca->usage_gc);
|
|
+
|
|
+ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
|
|
+ ca->mi.nbuckets * sizeof(struct bucket),
|
|
+ GFP_KERNEL|__GFP_ZERO);
|
|
+ if (!ca->buckets[1]) {
|
|
+ percpu_ref_put(&ca->ref); /* presumably drops the ref held by the iteration macro - confirm */
|
|
+ bch_err(c, "error allocating ca->buckets[gc]");
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ ca->usage_gc = alloc_percpu(struct bch_dev_usage);
|
|
+ if (!ca->usage_gc) {
|
|
+ bch_err(c, "error allocating ca->usage_gc");
|
|
+ percpu_ref_put(&ca->ref);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ret = bch2_ec_mem_alloc(c, true);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error allocating ec gc mem");
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ percpu_down_write(&c->mark_lock);
|
|
+
|
|
+ /*
|
|
+ * indicate to stripe code that we need to allocate for the gc stripes
|
|
+ * radix tree, too
|
|
+ */
|
|
+ gc_pos_set(c, gc_phase(GC_PHASE_START));
|
|
+
|
|
+ for_each_member_device(ca, c, i) {
|
|
+ struct bucket_array *dst = __bucket_array(ca, 1);
|
|
+ struct bucket_array *src = __bucket_array(ca, 0);
|
|
+ size_t b;
|
|
+
|
|
+ dst->first_bucket = src->first_bucket;
|
|
+ dst->nbuckets = src->nbuckets;
|
|
+
|
|
+ for (b = 0; b < src->nbuckets; b++) {
|
|
+ struct bucket *d = &dst->b[b];
|
|
+ struct bucket *s = &src->b[b];
|
|
+ /* gc starts from the live gen; oldest_gen is lowered later while marking */
|
|
+ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
|
|
+ d->gen_valid = s->gen_valid;
|
|
+ /* metadata-only gc keeps the live mark of user/cached buckets as is */
|
|
+ if (metadata_only &&
|
|
+ (s->mark.data_type == BCH_DATA_user ||
|
|
+ s->mark.data_type == BCH_DATA_cached))
|
|
+ d->_mark = s->mark;
|
|
+ }
|
|
+ };
|
|
+
|
|
+ percpu_up_write(&c->mark_lock);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
|
|
+ *
|
|
+ * Order matters here:
|
|
+ * - Concurrent GC relies on the fact that we have a total ordering for
|
|
+ * everything that GC walks - see gc_will_visit_node(),
|
|
+ * gc_will_visit_root()
|
|
+ *
|
|
+ * - also, references move around in the course of index updates and
|
|
+ * various other crap: everything needs to agree on the ordering
|
|
+ * references are allowed to move around in - e.g., we're allowed to
|
|
+ * start with a reference owned by an open_bucket (the allocator) and
|
|
+ * move it to the btree, but not the reverse.
|
|
+ *
|
|
+ * This is necessary to ensure that gc doesn't miss references that
|
|
+ * move around - if references move backwards in the ordering GC
|
|
+ * uses, GC could skip past them
|
|
+ *
|
|
+ * The caller must hold c->state_lock; gc_lock is taken for write for the
|
|
+ * duration. @initial and @metadata_only are passed through to the mark and
|
|
+ * completion steps. Returns 0 on success or a nonzero error/fsck code.
|
|
+ */
|
|
+int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ u64 start_time = local_clock();
|
|
+ unsigned i, iter = 0;
|
|
+ int ret;
|
|
+
|
|
+ lockdep_assert_held(&c->state_lock);
|
|
+ trace_gc_start(c);
|
|
+
|
|
+ down_write(&c->gc_lock);
|
|
+
|
|
+ /* flush interior btree updates: */
|
|
+ closure_wait_event(&c->btree_interior_update_wait,
|
|
+ !bch2_btree_interior_updates_nr_pending(c));
|
|
+again:
|
|
+ ret = bch2_gc_start(c, metadata_only);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ bch2_mark_superblocks(c);
|
|
+ /* topology repair runs only before the initial gc has completed, and only if fixing is allowed */
|
|
+ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) &&
|
|
+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
|
|
+ c->opts.fix_errors != FSCK_OPT_NO) {
|
|
+ bch_info(c, "starting topology repair pass");
|
|
+ ret = bch2_repair_topology(c);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+ bch_info(c, "topology repair pass done");
|
|
+ }
|
|
+
|
|
+ ret = bch2_gc_btrees(c, initial, metadata_only);
|
|
+ /* a topology error during the initial gc requests another pass instead of failing */
|
|
+ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
|
|
+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
|
|
+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
|
|
+ ret = 0;
|
|
+ }
|
|
+ /* otherwise the request cannot be honoured and becomes a hard exit */
|
|
+ if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR)
|
|
+ ret = FSCK_ERR_EXIT;
|
|
+
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+#if 0
|
|
+ bch2_mark_pending_btree_node_frees(c);
|
|
+#endif
|
|
+ c->gc_count++;
|
|
+
|
|
+ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
|
|
+ (!iter && bch2_test_restart_gc)) {
|
|
+ /*
|
|
+ * XXX: make sure gens we fixed got saved
|
|
+ */
|
|
+ if (iter++ <= 2) { /* allows at most three restarts */
|
|
+ bch_info(c, "Second GC pass needed, restarting:");
|
|
+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
|
|
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
|
|
+ /* drop this pass's gc state; bch2_gc_start() reallocates it after the jump */
|
|
+ percpu_down_write(&c->mark_lock);
|
|
+ bch2_gc_free(c);
|
|
+ percpu_up_write(&c->mark_lock);
|
|
+ /* flush fsck errors, reset counters */
|
|
+ bch2_flush_fsck_errs(c);
|
|
+
|
|
+ goto again;
|
|
+ }
|
|
+
|
|
+ bch_info(c, "Unable to fix bucket gens, looping");
|
|
+ ret = -EINVAL;
|
|
+ }
|
|
+out:
|
|
+ if (!ret) {
|
|
+ bch2_journal_block(&c->journal);
|
|
+
|
|
+ percpu_down_write(&c->mark_lock);
|
|
+ ret = bch2_gc_done(c, initial, metadata_only);
|
|
+
|
|
+ bch2_journal_unblock(&c->journal);
|
|
+ } else {
|
|
+ percpu_down_write(&c->mark_lock);
|
|
+ }
|
|
+ /* both branches above leave mark_lock held for write */
|
|
+ /* Indicates that gc is no longer in progress: */
|
|
+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
|
|
+
|
|
+ bch2_gc_free(c);
|
|
+ percpu_up_write(&c->mark_lock);
|
|
+
|
|
+ up_write(&c->gc_lock);
|
|
+
|
|
+ trace_gc_end(c);
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
|
|
+
|
|
+ /*
|
|
+ * Wake up allocator in case it was waiting for buckets
|
|
+ * because of not being able to inc gens
|
|
+ */
|
|
+ for_each_member_device(ca, c, i)
|
|
+ bch2_wake_allocator(ca);
|
|
+
|
|
+ /*
|
|
+ * At startup, allocations can happen directly instead of via the
|
|
+ * allocator thread - issue wakeup in case they blocked on gc_lock:
|
|
+ */
|
|
+ closure_wake_up(&c->freelist_wait);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check the pointers of key @k against the generations of their buckets.
|
|
+ *
|
|
+ * Returns true as soon as one pointer is found for which
|
|
+ * gen_after(bucket mark gen, pointer gen) exceeds 16; no gc_gen is touched in
|
|
+ * that case.  Otherwise each referenced bucket's gc_gen is set to the
|
|
+ * pointer's gen wherever gen_after(gc_gen, pointer gen) is nonzero, and false
|
|
+ * is returned.  Both passes run with c->mark_lock held for read.
|
|
+ */
|
|
+static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+	const struct bch_extent_ptr *ptr;
|
|
+	bool too_stale = false;
|
|
+
|
|
+	percpu_down_read(&c->mark_lock);
|
|
+
|
|
+	/* Pass 1: bail out before modifying anything if a pointer is too stale */
|
|
+	bkey_for_each_ptr(ptrs, ptr) {
|
|
+		struct bch_dev *dev = bch_dev_bkey_exists(c, ptr->dev);
|
|
+		struct bucket *bucket = PTR_BUCKET(dev, ptr, false);
|
|
+
|
|
+		if (gen_after(bucket->mark.gen, ptr->gen) > 16) {
|
|
+			too_stale = true;
|
|
+			goto unlock;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	/* Pass 2: pull each bucket's gc_gen back to the gen still referenced */
|
|
+	bkey_for_each_ptr(ptrs, ptr) {
|
|
+		struct bch_dev *dev = bch_dev_bkey_exists(c, ptr->dev);
|
|
+		struct bucket *bucket = PTR_BUCKET(dev, ptr, false);
|
|
+
|
|
+		if (gen_after(bucket->gc_gen, ptr->gen))
|
|
+			bucket->gc_gen = ptr->gen;
|
|
+	}
|
|
+unlock:
|
|
+	percpu_up_read(&c->mark_lock);
|
|
+
|
|
+	return too_stale;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
|
|
+ * node pointers currently never have cached pointers that can become stale:
|
|
+ *
|
|
+ * Walks every key of @btree_id, recording progress in c->gc_gens_pos. Keys for
|
|
+ * which gc_btree_gens_key() returns true are passed through
|
|
+ * bch2_extent_normalize() and written back.
|
|
+ *
|
|
+ * Returns 0, or the error reported by bkey_err() for the iterator. A commit
|
|
+ * error other than -EINTR is not returned; it only stops further rewrites.
|
|
+ */
|
|
+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_buf sk;
|
|
+ int ret = 0, commit_err = 0;
|
|
+
|
|
+ bch2_bkey_buf_init(&sk);
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
|
|
+ BTREE_ITER_PREFETCH|
|
|
+ BTREE_ITER_NOT_EXTENTS|
|
|
+ BTREE_ITER_ALL_SNAPSHOTS);
|
|
+
|
|
+ while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ !(ret = bkey_err(k))) {
|
|
+ c->gc_gens_pos = iter->pos;
|
|
+
|
|
+ /* once a commit has failed, keep walking but stop rewriting keys */
|
|
+ if (gc_btree_gens_key(c, k) && !commit_err) {
|
|
+ bch2_bkey_buf_reassemble(&sk, c, k);
|
|
+ bch2_extent_normalize(c, bkey_i_to_s(sk.k));
|
|
+
|
|
+ bch2_trans_update(&trans, iter, sk.k, 0);
|
|
+
|
|
+ commit_err = bch2_trans_commit(&trans, NULL, NULL,
|
|
+ BTREE_INSERT_NOWAIT|
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+ /* -EINTR: retry the same key, i.e. do not advance the iterator */
|
|
+ if (commit_err == -EINTR) {
|
|
+ commit_err = 0;
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_btree_iter_advance(iter);
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_bkey_buf_exit(&sk, c);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Recalculate oldest_gen for every bucket on every member device.
|
|
+ *
|
|
+ * Each bucket's gc_gen is first seeded from its current mark.gen, then every
|
|
+ * btree selected by BTREE_ID_HAS_PTRS is walked with bch2_gc_btree_gens(), and
|
|
+ * finally gc_gen is copied into oldest_gen.  c->gc_gens_btree and
|
|
+ * c->gc_gens_pos track progress and are reset once the walk completes.
|
|
+ *
|
|
+ * Returns 0 on success or the first error from bch2_gc_btree_gens(); on error
|
|
+ * oldest_gen is left untouched and c->gc_count is not bumped.
|
|
+ */
|
|
+int bch2_gc_gens(struct bch_fs *c)
|
|
+{
|
|
+	struct bch_dev *ca;
|
|
+	struct bucket_array *buckets;
|
|
+	struct bucket *g;
|
|
+	unsigned i;
|
|
+	/* start at 0 so the result is defined even if no btree id is walked */
|
|
+	int ret = 0;
|
|
+
|
|
+	/*
|
|
+	 * Ideally we would be using state_lock and not gc_lock here, but that
|
|
+	 * introduces a deadlock in the RO path - we currently take the state
|
|
+	 * lock at the start of going RO, thus the gc thread may get stuck:
|
|
+	 */
|
|
+	down_read(&c->gc_lock);
|
|
+
|
|
+	for_each_member_device(ca, c, i) {
|
|
+		down_read(&ca->bucket_lock);
|
|
+		buckets = bucket_array(ca);
|
|
+
|
|
+		for_each_bucket(g, buckets)
|
|
+			g->gc_gen = g->mark.gen;
|
|
+		up_read(&ca->bucket_lock);
|
|
+	}
|
|
+
|
|
+	for (i = 0; i < BTREE_ID_NR; i++)
|
|
+		if ((1 << i) & BTREE_ID_HAS_PTRS) {
|
|
+			c->gc_gens_btree = i;
|
|
+			c->gc_gens_pos = POS_MIN;
|
|
+			ret = bch2_gc_btree_gens(c, i);
|
|
+			if (ret) {
|
|
+				bch_err(c, "error recalculating oldest_gen: %i", ret);
|
|
+				goto err;
|
|
+			}
|
|
+		}
|
|
+
|
|
+	for_each_member_device(ca, c, i) {
|
|
+		down_read(&ca->bucket_lock);
|
|
+		buckets = bucket_array(ca);
|
|
+
|
|
+		for_each_bucket(g, buckets)
|
|
+			g->oldest_gen = g->gc_gen;
|
|
+		up_read(&ca->bucket_lock);
|
|
+	}
|
|
+
|
|
+	c->gc_gens_btree = 0;
|
|
+	c->gc_gens_pos = POS_MIN;
|
|
+
|
|
+	c->gc_count++;
|
|
+err:
|
|
+	up_read(&c->gc_lock);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Main loop of the gc kthread; @arg is the struct bch_fs.
|
|
+ *
|
|
+ * Sleeps until c->kick_gc changes or, when c->btree_gc_periodic is set, until
|
|
+ * the write io clock has advanced by c->capacity / 16 since the last run; then
|
|
+ * runs bch2_gc_gens(). Returns 0 once kthread_should_stop() is true.
|
|
+ */
|
|
+static int bch2_gc_thread(void *arg)
|
|
+{
|
|
+ struct bch_fs *c = arg;
|
|
+ struct io_clock *clock = &c->io_clock[WRITE];
|
|
+ unsigned long last = atomic64_read(&clock->now);
|
|
+ unsigned last_kick = atomic_read(&c->kick_gc);
|
|
+ int ret;
|
|
+
|
|
+ set_freezable();
|
|
+
|
|
+ while (1) {
|
|
+ /* wait for a reason to run */
|
|
+ while (1) {
|
|
+ /* task state is set before the wakeup conditions are tested */
|
|
+ set_current_state(TASK_INTERRUPTIBLE);
|
|
+
|
|
+ if (kthread_should_stop()) {
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (atomic_read(&c->kick_gc) != last_kick)
|
|
+ break;
|
|
+
|
|
+ if (c->btree_gc_periodic) {
|
|
+ unsigned long next = last + c->capacity / 16;
|
|
+
|
|
+ if (atomic64_read(&clock->now) >= next)
|
|
+ break;
|
|
+
|
|
+ bch2_io_clock_schedule_timeout(clock, next);
|
|
+ } else {
|
|
+ schedule();
|
|
+ }
|
|
+
|
|
+ try_to_freeze();
|
|
+ }
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+
|
|
+ /* remember what this run was triggered against */
|
|
+ last = atomic64_read(&clock->now);
|
|
+ last_kick = atomic_read(&c->kick_gc);
|
|
+
|
|
+ /*
|
|
+ * Full gc is currently incompatible with btree key cache:
|
|
+ */
|
|
+#if 0
|
|
+ ret = bch2_gc(c, false, false);
|
|
+#else
|
|
+ ret = bch2_gc_gens(c);
|
|
+#endif
|
|
+ /* failures are only logged; the thread keeps running */
|
|
+ if (ret < 0)
|
|
+ bch_err(c, "btree gc failed: %i", ret);
|
|
+
|
|
+ debug_check_no_locks_held();
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Stop the gc kthread, if there is one: c->gc_thread is cleared first, then
|
|
+ * the thread is stopped and the task reference held in c->gc_thread is put.
|
|
+ */
|
|
+void bch2_gc_thread_stop(struct bch_fs *c)
|
|
+{
|
|
+	struct task_struct *thread = c->gc_thread;
|
|
+
|
|
+	c->gc_thread = NULL;
|
|
+
|
|
+	if (!thread)
|
|
+		return;
|
|
+
|
|
+	kthread_stop(thread);
|
|
+	put_task_struct(thread);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Create and wake the gc kthread unless c->gc_thread is already set.
|
|
+ *
|
|
+ * A task reference is taken before the thread is published in c->gc_thread.
|
|
+ * Returns 0 on success (or if a thread already exists), otherwise the error
|
|
+ * encoded in the pointer returned by kthread_create().
|
|
+ */
|
|
+int bch2_gc_thread_start(struct bch_fs *c)
|
|
+{
|
|
+	struct task_struct *thread;
|
|
+
|
|
+	if (c->gc_thread)
|
|
+		return 0;
|
|
+
|
|
+	thread = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name);
|
|
+	if (IS_ERR(thread)) {
|
|
+		long err = PTR_ERR(thread);
|
|
+
|
|
+		bch_err(c, "error creating gc thread: %li", err);
|
|
+		return err;
|
|
+	}
|
|
+
|
|
+	get_task_struct(thread);
|
|
+	c->gc_thread = thread;
|
|
+	wake_up_process(thread);
|
|
+
|
|
+	return 0;
|
|
+}
|
|
diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
|
|
new file mode 100644
|
|
index 000000000000..e9a87394370a
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_gc.h
|
|
@@ -0,0 +1,106 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_GC_H
|
|
+#define _BCACHEFS_BTREE_GC_H
|
|
+
|
|
+#include "btree_types.h"
|
|
+
|
|
+int bch2_gc(struct bch_fs *, bool, bool);
|
|
+int bch2_gc_gens(struct bch_fs *);
|
|
+void bch2_gc_thread_stop(struct bch_fs *);
|
|
+int bch2_gc_thread_start(struct bch_fs *);
|
|
+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned);
|
|
+
|
|
+/*
|
|
+ * For concurrent mark and sweep (with other index updates), we define a total
|
|
+ * ordering of _all_ references GC walks:
|
|
+ *
|
|
+ * Note that some references will have the same GC position as others - e.g.
|
|
+ * everything within the same btree node; in those cases we're relying on
|
|
+ * whatever locking exists for where those references live, i.e. the write lock
|
|
+ * on a btree node.
|
|
+ *
|
|
+ * That locking is also required to ensure GC doesn't pass the updater in
|
|
+ * between the updater adding/removing the reference and updating the GC marks;
|
|
+ * without that, we would at best double count sometimes.
|
|
+ *
|
|
+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_
|
|
+ * be held that prevents GC from passing the position the updater is at.
|
|
+ *
|
|
+ * (What about the start of gc, when we're clearing all the marks? GC clears the
|
|
+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc
|
|
+ * position inside its cmpxchg loop, so crap magically works).
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * Position of (the start of) a gc phase: @phase combined with the minimum key
|
|
+ * position and level 0.
|
|
+ */
|
|
+static inline struct gc_pos gc_phase(enum gc_phase phase)
|
|
+{
|
|
+	struct gc_pos start = {
|
|
+		.phase	= phase,
|
|
+		.pos	= POS_MIN,
|
|
+		.level	= 0,
|
|
+	};
|
|
+
|
|
+	return start;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Three-way comparison of two gc positions: phase is the most significant
|
|
+ * field, then key position, then level.
|
|
+ */
|
|
+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
|
|
+{
|
|
+	int cmp;
|
|
+
|
|
+	cmp = cmp_int(l.phase, r.phase);
|
|
+	if (cmp)
|
|
+		return cmp;
|
|
+
|
|
+	cmp = bpos_cmp(l.pos, r.pos);
|
|
+	if (cmp)
|
|
+		return cmp;
|
|
+
|
|
+	return cmp_int(l.level, r.level);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Map a btree id to its gc phase. One case per entry of BCH_BTREE_IDS() is
|
|
+ * generated by the x() macro below; any other id hits BUG().
|
|
+ */
|
|
+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
|
|
+{
|
|
+ switch (id) {
|
|
+#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
|
|
+ BCH_BTREE_IDS()
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Build the gc position for key position @pos at depth @level of btree @id;
|
|
+ * the phase comes from btree_id_to_gc_phase().
|
|
+ */
|
|
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
|
|
+					 struct bpos pos, unsigned level)
|
|
+{
|
|
+	struct gc_pos ret = {
|
|
+		.phase	= btree_id_to_gc_phase(id),
|
|
+		.pos	= pos,
|
|
+		.level	= level,
|
|
+	};
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * GC position of the pointers stored inside btree node @b.  This is _not_ the
|
|
+ * position of &b->key itself - that key lives in the parent node.
|
|
+ */
|
|
+static inline struct gc_pos gc_pos_btree_node(struct btree *b)
|
|
+{
|
|
+	struct bpos node_pos = b->key.k.p;
|
|
+	unsigned node_level = b->c.level;
|
|
+
|
|
+	return gc_pos_btree(b->c.btree_id, node_pos, node_level);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * GC position of the pointer to the root of btree @id: always POS_MAX at
|
|
+ * BTREE_MAX_DEPTH.
|
|
+ *
|
|
+ * gc_pos_pointer_to_btree_node() is deliberately not used here, to avoid a
|
|
+ * potential race with btree_split() increasing the tree depth: the new root
|
|
+ * would have level > the old root and therefore a greater gc position, which
|
|
+ * would be incorrect since once gc has marked the root it's not coming back.
|
|
+ */
|
|
+static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
|
|
+{
|
|
+	struct gc_pos root_pos = gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
|
|
+
|
|
+	return root_pos;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Returns true if @pos compares less than or equal to the current gc position
|
|
+ * c->gc_pos.  c->gc_pos is sampled under the c->gc_pos_lock seqcount and the
|
|
+ * comparison is repeated until a consistent snapshot was read.
|
|
+ */
|
|
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
|
|
+{
|
|
+	bool visited;
|
|
+	unsigned seq;
|
|
+
|
|
+	for (;;) {
|
|
+		seq = read_seqcount_begin(&c->gc_pos_lock);
|
|
+		visited = gc_pos_cmp(pos, c->gc_pos) <= 0;
|
|
+
|
|
+		if (!read_seqcount_retry(&c->gc_pos_lock, seq))
|
|
+			break;
|
|
+	}
|
|
+
|
|
+	return visited;
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_GC_H */
|
|
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
|
|
new file mode 100644
|
|
index 000000000000..e609bc49cefe
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_io.c
|
|
@@ -0,0 +1,1778 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "bkey_sort.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_locking.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "buckets.h"
|
|
+#include "checksum.h"
|
|
+#include "debug.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "io.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "journal_seq_blacklist.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/sched/mm.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/*
|
|
+ * Debug-only check (CONFIG_BCACHEFS_DEBUG) over the packed keys in
|
|
+ * [@start, @end): BUG()s unless every key's position compares strictly below
|
|
+ * the start position of the key that follows it.
|
|
+ */
|
|
+static void verify_no_dups(struct btree *b,
|
|
+			   struct bkey_packed *start,
|
|
+			   struct bkey_packed *end)
|
|
+{
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+	struct bkey_packed *prev, *cur;
|
|
+
|
|
+	if (start == end)
|
|
+		return;
|
|
+
|
|
+	prev = start;
|
|
+	while ((cur = bkey_next(prev)) != end) {
|
|
+		struct bkey l = bkey_unpack_key(b, prev);
|
|
+		struct bkey r = bkey_unpack_key(b, cur);
|
|
+
|
|
+		BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0);
|
|
+
|
|
+		prev = cur;
|
|
+	}
|
|
+#endif
|
|
+}
|
|
+
|
|
+/* Set the needs_whiteout flag of every key in bset @i to @v. */
|
|
+static void set_needs_whiteout(struct bset *i, int v)
|
|
+{
|
|
+	struct bkey_packed *k = i->start;
|
|
+
|
|
+	while (k != vstruct_last(i)) {
|
|
+		k->needs_whiteout = v;
|
|
+		k = bkey_next(k);
|
|
+	}
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Release a bounce buffer obtained from btree_bounce_alloc().  @used_mempool
|
|
+ * must be the value that the allocation reported: mempool buffers go back to
|
|
+ * c->btree_bounce_pool, everything else is freed with vpfree() using @size.
|
|
+ */
|
|
+static void btree_bounce_free(struct bch_fs *c, size_t size,
|
|
+			      bool used_mempool, void *p)
|
|
+{
|
|
+	if (!used_mempool) {
|
|
+		vpfree(p, size);
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	mempool_free(p, &c->btree_bounce_pool);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Allocate a bounce buffer of @size bytes (at most btree_bytes(c), enforced
|
|
+ * with BUG_ON).
|
|
+ *
|
|
+ * A non-waiting vpmalloc() is tried first; if that fails the buffer is taken
|
|
+ * from c->btree_bounce_pool with GFP_NOIO.  *@used_mempool reports which path
|
|
+ * was taken and must be passed to btree_bounce_free().  The allocation runs
|
|
+ * inside a memalloc_nofs_save()/memalloc_nofs_restore() section.
|
|
+ */
|
|
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
|
|
+				bool *used_mempool)
|
|
+{
|
|
+	unsigned nofs_flags = memalloc_nofs_save();
|
|
+	void *buf;
|
|
+
|
|
+	BUG_ON(size > btree_bytes(c));
|
|
+
|
|
+	buf = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
|
|
+	*used_mempool = !buf;
|
|
+	if (!buf)
|
|
+		buf = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
|
|
+
|
|
+	memalloc_nofs_restore(nofs_flags);
|
|
+	return buf;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Sort the @nr packed-key pointers in @ptrs in place, ordering them with
|
|
+ * bch2_bkey_cmp_packed() for btree @bt. Arrays of fewer than two entries are
|
|
+ * returned untouched.
|
|
+ */
|
|
+static void sort_bkey_ptrs(const struct btree *bt,
|
|
+ struct bkey_packed **ptrs, unsigned nr)
|
|
+{
|
|
+ unsigned n = nr, a = nr / 2, b, c, d;
|
|
+
|
|
+ if (!a)
|
|
+ return;
|
|
+
|
|
+ /* Heap sort: see lib/sort.c: */
|
|
+ while (1) {
|
|
+ /* first nr/2 rounds lower a; afterwards each round swaps ptrs[0] into slot n */
|
|
+ if (a)
|
|
+ a--;
|
|
+ else if (--n)
|
|
+ swap(ptrs[0], ptrs[n]);
|
|
+ else
|
|
+ break;
|
|
+
|
|
+ /* descend from a, always stepping to the child that compares >= its sibling */
|
|
+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;)
|
|
+ b = bch2_bkey_cmp_packed(bt,
|
|
+ ptrs[c],
|
|
+ ptrs[d]) >= 0 ? c : d;
|
|
+ if (d == n)
|
|
+ b = c;
|
|
+
|
|
+ /* climb back up while ptrs[a] compares >= ptrs[b] */
|
|
+ while (b != a &&
|
|
+ bch2_bkey_cmp_packed(bt,
|
|
+ ptrs[a],
|
|
+ ptrs[b]) >= 0)
|
|
+ b = (b - 1) / 2;
|
|
+ /* rotate the entries on the path from c up to a */
|
|
+ c = b;
|
|
+ while (b != a) {
|
|
+ b = (b - 1) / 2;
|
|
+ swap(ptrs[b], ptrs[c]);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Sort the unwritten whiteouts of node @b: the keys are gathered through an
|
|
+ * array of pointers, sorted with sort_bkey_ptrs(), copied in sorted order into
|
|
+ * a bounce buffer and then copied back over the unwritten whiteouts area.
|
|
+ * Does nothing when b->whiteout_u64s is zero.
|
|
+ */
|
|
+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
|
|
+ bool used_mempool = false;
|
|
+ size_t bytes = b->whiteout_u64s * sizeof(u64);
|
|
+
|
|
+ if (!b->whiteout_u64s)
|
|
+ return;
|
|
+
|
|
+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
|
|
+
|
|
+ /*
|
|
+ * The pointer array grows downwards from the end of the same bounce
|
|
+ * buffer that the sorted keys are later written into from the start.
|
|
+ * NOTE(review): presumably safe because a key is never smaller than a
|
|
+ * pointer - confirm.
|
|
+ */
|
|
+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
|
|
+
|
|
+ for (k = unwritten_whiteouts_start(c, b);
|
|
+ k != unwritten_whiteouts_end(c, b);
|
|
+ k = bkey_next(k))
|
|
+ *--ptrs = k;
|
|
+
|
|
+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs);
|
|
+
|
|
+ k = new_whiteouts;
|
|
+
|
|
+ while (ptrs != ptrs_end) {
|
|
+ bkey_copy(k, *ptrs);
|
|
+ k = bkey_next(k);
|
|
+ ptrs++;
|
|
+ }
|
|
+
|
|
+ verify_no_dups(b, new_whiteouts,
|
|
+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
|
|
+
|
|
+ memcpy_u64s(unwritten_whiteouts_start(c, b),
|
|
+ new_whiteouts, b->whiteout_u64s);
|
|
+
|
|
+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Decide whether bset @t of node @b should have its dead keys dropped.
|
|
+ *
|
|
+ * Never, if the bset has no dead u64s.  With COMPACT_ALL, always.  With
|
|
+ * COMPACT_LAZY, when should_compact_bset_lazy() says so, or when @compacting
|
|
+ * is set and the bset has not been written yet.  Any other @mode is a BUG().
|
|
+ */
|
|
+static bool should_compact_bset(struct btree *b, struct bset_tree *t,
|
|
+				bool compacting, enum compact_mode mode)
|
|
+{
|
|
+	if (!bset_dead_u64s(b, t))
|
|
+		return false;
|
|
+
|
|
+	if (mode == COMPACT_LAZY)
|
|
+		return should_compact_bset_lazy(b, t) ||
|
|
+			(compacting && !bset_written(b, bset(b, t)));
|
|
+
|
|
+	if (mode == COMPACT_ALL)
|
|
+		return true;
|
|
+
|
|
+	BUG();
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Drop deleted keys from the bsets of node @b, as selected by
|
|
+ * should_compact_bset() for @mode, and move unwritten bsets (other than the
|
|
+ * first) down so they start at max(write_block(b), end of the previous bset).
|
|
+ * Aux trees are rebuilt afterwards.
|
|
+ *
|
|
+ * Returns true if any bset was moved or compacted.
|
|
+ */
|
|
+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode)
|
|
+{
|
|
+ struct bset_tree *t;
|
|
+ bool ret = false;
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ struct bset *i = bset(b, t);
|
|
+ struct bkey_packed *k, *n, *out, *start, *end;
|
|
+ struct btree_node_entry *src = NULL, *dst = NULL;
|
|
+
|
|
+ /* src/dst stay NULL (equal) for the first bset and for written bsets */
|
|
+ if (t != b->set && !bset_written(b, i)) {
|
|
+ src = container_of(i, struct btree_node_entry, keys);
|
|
+ dst = max(write_block(b),
|
|
+ (void *) btree_bkey_last(b, t - 1));
|
|
+ }
|
|
+
|
|
+ if (src != dst)
|
|
+ ret = true;
|
|
+
|
|
+ if (!should_compact_bset(b, t, ret, mode)) {
|
|
+ /* not compacting: move the whole bset, header and keys, as is */
|
|
+ if (src != dst) {
|
|
+ memmove(dst, src, sizeof(*src) +
|
|
+ le16_to_cpu(src->keys.u64s) *
|
|
+ sizeof(u64));
|
|
+ i = &dst->keys;
|
|
+ set_btree_bset(b, t, i);
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* key range is captured before the bset is re-pointed at dst */
|
|
+ start = btree_bkey_first(b, t);
|
|
+ end = btree_bkey_last(b, t);
|
|
+
|
|
+ /* compacting: move only the header, keys are copied one by one below */
|
|
+ if (src != dst) {
|
|
+ memmove(dst, src, sizeof(*src));
|
|
+ i = &dst->keys;
|
|
+ set_btree_bset(b, t, i);
|
|
+ }
|
|
+
|
|
+ out = i->start;
|
|
+
|
|
+ for (k = start; k != end; k = n) {
|
|
+ n = bkey_next(k);
|
|
+
|
|
+ if (!bkey_deleted(k)) {
|
|
+ bkey_copy(out, k);
|
|
+ out = bkey_next(out);
|
|
+ } else {
|
|
+ BUG_ON(k->needs_whiteout);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ i->u64s = cpu_to_le16((u64 *) out - i->_data);
|
|
+ set_btree_bset_end(b, t);
|
|
+ bch2_bset_set_no_aux_tree(b, t);
|
|
+ ret = true;
|
|
+ }
|
|
+
|
|
+ bch2_verify_btree_nr_keys(b);
|
|
+
|
|
+ bch2_btree_build_aux_trees(b);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Public entry point for whiteout compaction of node @b; forwards to
|
|
+ * bch2_drop_whiteouts() and returns its result.  @c is not used.
|
|
+ */
|
|
+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
|
|
+			    enum compact_mode mode)
|
|
+{
|
|
+	bool compacted = bch2_drop_whiteouts(b, mode);
|
|
+
|
|
+	return compacted;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Merge the bsets [@start_idx, @end_idx) of node @b into a single sorted bset
|
|
+ * stored at @start_idx, shifting any later bsets down and reducing b->nsets
|
|
+ * accordingly. @filter_whiteouts is passed through to bch2_sort_keys().
|
|
+ */
|
|
+static void btree_node_sort(struct bch_fs *c, struct btree *b,
|
|
+ unsigned start_idx,
|
|
+ unsigned end_idx,
|
|
+ bool filter_whiteouts)
|
|
+{
|
|
+ struct btree_node *out;
|
|
+ struct sort_iter sort_iter;
|
|
+ struct bset_tree *t;
|
|
+ struct bset *start_bset = bset(b, &b->set[start_idx]);
|
|
+ bool used_mempool = false;
|
|
+ u64 start_time, seq = 0;
|
|
+ /* shift: how many bset slots disappear once the range is merged into one */
|
|
+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
|
|
+ bool sorting_entire_node = start_idx == 0 &&
|
|
+ end_idx == b->nsets;
|
|
+
|
|
+ sort_iter_init(&sort_iter, b);
|
|
+
|
|
+ for (t = b->set + start_idx;
|
|
+ t < b->set + end_idx;
|
|
+ t++) {
|
|
+ u64s += le16_to_cpu(bset(b, t)->u64s);
|
|
+ sort_iter_add(&sort_iter,
|
|
+ btree_bkey_first(b, t),
|
|
+ btree_bkey_last(b, t));
|
|
+ }
|
|
+
|
|
+ /* a full-node sort needs a full-size buffer so it can be swapped in below */
|
|
+ bytes = sorting_entire_node
|
|
+ ? btree_bytes(c)
|
|
+ : __vstruct_bytes(struct btree_node, u64s);
|
|
+
|
|
+ out = btree_bounce_alloc(c, bytes, &used_mempool);
|
|
+
|
|
+ start_time = local_clock();
|
|
+
|
|
+ u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts);
|
|
+
|
|
+ out->keys.u64s = cpu_to_le16(u64s);
|
|
+
|
|
+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
|
|
+
|
|
+ if (sorting_entire_node)
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
|
|
+ start_time);
|
|
+
|
|
+ /* Make sure we preserve bset journal_seq: */
|
|
+ for (t = b->set + start_idx; t < b->set + end_idx; t++)
|
|
+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq));
|
|
+ start_bset->journal_seq = cpu_to_le64(seq);
|
|
+
|
|
+ if (sorting_entire_node) {
|
|
+ /* note: shadows the outer u64s; saved because *out is overwritten below */
|
|
+ unsigned u64s = le16_to_cpu(out->keys.u64s);
|
|
+
|
|
+ BUG_ON(bytes != btree_bytes(c));
|
|
+
|
|
+ /*
|
|
+ * Our temporary buffer is the same size as the btree node's
|
|
+ * buffer, we can just swap buffers instead of doing a big
|
|
+ * memcpy()
|
|
+ */
|
|
+ *out = *b->data;
|
|
+ out->keys.u64s = cpu_to_le16(u64s);
|
|
+ swap(out, b->data);
|
|
+ set_btree_bset(b, b->set, &b->data->keys);
|
|
+ } else {
|
|
+ start_bset->u64s = out->keys.u64s;
|
|
+ memcpy_u64s(start_bset->start,
|
|
+ out->keys.start,
|
|
+ le16_to_cpu(out->keys.u64s));
|
|
+ }
|
|
+
|
|
+ /* fold the per-bset u64 counts of the merged bsets into start_idx */
|
|
+ for (i = start_idx + 1; i < end_idx; i++)
|
|
+ b->nr.bset_u64s[start_idx] +=
|
|
+ b->nr.bset_u64s[i];
|
|
+
|
|
+ b->nsets -= shift;
|
|
+
|
|
+ /* slide the bsets that followed the merged range down by shift */
|
|
+ for (i = start_idx + 1; i < b->nsets; i++) {
|
|
+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift];
|
|
+ b->set[i] = b->set[i + shift];
|
|
+ }
|
|
+
|
|
+ for (i = b->nsets; i < MAX_BSETS; i++)
|
|
+ b->nr.bset_u64s[i] = 0;
|
|
+
|
|
+ set_btree_bset_end(b, &b->set[start_idx]);
|
|
+ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
|
|
+
|
|
+ /* after the swap above, out is the node's old buffer */
|
|
+ btree_bounce_free(c, bytes, used_mempool, out);
|
|
+
|
|
+ bch2_verify_btree_nr_keys(b);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Sort all keys of @src into the first (and only) bset of @dst, repacking
|
|
+ * them into @dst's key format.
|
|
+ *
|
|
+ * @dst must have exactly one bset (BUG otherwise).  Extent nodes go through
|
|
+ * bch2_sort_repack_merge(), all others through bch2_sort_repack().  The key
|
|
+ * counts returned by the sort are added to dst->nr, and the elapsed time is
|
|
+ * accounted under BCH_TIME_btree_node_sort.
|
|
+ */
|
|
+void bch2_btree_sort_into(struct bch_fs *c,
|
|
+			  struct btree *dst,
|
|
+			  struct btree *src)
|
|
+{
|
|
+	u64 start_time = local_clock();
|
|
+	struct btree_node_iter iter;
|
|
+	struct btree_nr_keys sorted;
|
|
+
|
|
+	BUG_ON(dst->nsets != 1);
|
|
+
|
|
+	bch2_bset_set_no_aux_tree(dst, dst->set);
|
|
+
|
|
+	bch2_btree_node_iter_init_from_start(&iter, src);
|
|
+
|
|
+	if (!btree_node_is_extents(src))
|
|
+		sorted = bch2_sort_repack(btree_bset_first(dst),
|
|
+					  src, &iter,
|
|
+					  &dst->format,
|
|
+					  true);
|
|
+	else
|
|
+		sorted = bch2_sort_repack_merge(c, btree_bset_first(dst),
|
|
+						src, &iter,
|
|
+						&dst->format,
|
|
+						true);
|
|
+
|
|
+	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
|
|
+			       start_time);
|
|
+
|
|
+	set_btree_bset_end(dst, dst->set);
|
|
+
|
|
+	dst->nr.live_u64s	+= sorted.live_u64s;
|
|
+	dst->nr.bset_u64s[0]	+= sorted.bset_u64s[0];
|
|
+	dst->nr.packed_keys	+= sorted.packed_keys;
|
|
+	dst->nr.unpacked_keys	+= sorted.unpacked_keys;
|
|
+
|
|
+	bch2_verify_btree_nr_keys(dst);
|
|
+}
|
|
+
|
|
+/* 4096 bytes expressed as a number of u64s */
|
|
+#define SORT_CRIT (4096 / sizeof(u64))
|
|
+
|
|
+/*
|
|
+ * We're about to add another bset to the btree node, so if there's currently
|
|
+ * too many bsets - sort some of them together.
|
|
+ *
|
|
+ * The bsets from the first unwritten one onwards are merged into one if there
|
|
+ * is more than one of them; then the bsets before it are merged likewise.
|
|
+ * Returns true if any sort was performed.
|
|
+ */
|
|
+static bool btree_node_compact(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+	unsigned unwritten_idx = 0;
|
|
+	bool sorted = false;
|
|
+
|
|
+	/* locate the first bset that has not been written */
|
|
+	while (unwritten_idx < b->nsets &&
|
|
+	       bset_written(b, bset(b, &b->set[unwritten_idx])))
|
|
+		unwritten_idx++;
|
|
+
|
|
+	if (b->nsets - unwritten_idx > 1) {
|
|
+		btree_node_sort(c, b, unwritten_idx,
|
|
+				b->nsets, false);
|
|
+		sorted = true;
|
|
+	}
|
|
+
|
|
+	if (unwritten_idx > 1) {
|
|
+		btree_node_sort(c, b, 0, unwritten_idx, false);
|
|
+		sorted = true;
|
|
+	}
|
|
+
|
|
+	return sorted;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Build the aux tree of every bset in @b.  The third argument to
|
|
+ * bch2_bset_build_aux_tree() is true only for a bset that is both unwritten
|
|
+ * and the last bset of the node.
|
|
+ */
|
|
+void bch2_btree_build_aux_trees(struct btree *b)
|
|
+{
|
|
+	struct bset_tree *t;
|
|
+
|
|
+	for_each_bset(b, t) {
|
|
+		bool unwritten_last = !bset_written(b, bset(b, t)) &&
|
|
+			t == bset_tree_last(b);
|
|
+
|
|
+		bch2_bset_build_aux_tree(b, t, unwritten_last);
|
|
+	}
|
|
+}
|
|
+
|
|
+/*
|
|
+ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be
|
|
+ * inserted into
|
|
+ *
|
|
+ * Safe to call if there already is an unwritten bset - will only add a new bset
|
|
+ * if @b doesn't already have one.
|
|
+ *
|
|
+ * Does not return a value. If the node was written or sorted here (which
|
|
+ * invalidates iterators) and @iter is non-NULL, @iter is reinitialized for @b
|
|
+ * before returning.
|
|
+ */
|
|
+void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
|
|
+ struct btree_iter *iter)
|
|
+{
|
|
+ struct btree_node_entry *bne;
|
|
+ bool reinit_iter = false;
|
|
+
|
|
+ EBUG_ON(!(b->c.lock.state.seq & 1));
|
|
+ EBUG_ON(iter && iter->l[b->c.level].b != b);
|
|
+ BUG_ON(bset_written(b, bset(b, &b->set[1])));
|
|
+
|
|
+ /* node is full of bsets: write it out if the middle bset is large relative to its neighbours */
|
|
+ if (b->nsets == MAX_BSETS) {
|
|
+ unsigned log_u64s[] = {
|
|
+ ilog2(bset_u64s(&b->set[0])),
|
|
+ ilog2(bset_u64s(&b->set[1])),
|
|
+ ilog2(bset_u64s(&b->set[2])),
|
|
+ };
|
|
+
|
|
+ if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) {
|
|
+ bch2_btree_node_write(c, b, SIX_LOCK_write);
|
|
+ reinit_iter = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* still full: merge bsets to free up a slot */
|
|
+ if (b->nsets == MAX_BSETS &&
|
|
+ btree_node_compact(c, b))
|
|
+ reinit_iter = true;
|
|
+
|
|
+ BUG_ON(b->nsets >= MAX_BSETS);
|
|
+
|
|
+ bne = want_new_bset(c, b);
|
|
+ if (bne)
|
|
+ bch2_bset_init_next(c, b, bne);
|
|
+
|
|
+ bch2_btree_build_aux_trees(b);
|
|
+
|
|
+ if (iter && reinit_iter)
|
|
+ bch2_btree_iter_reinit_node(iter, b);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Print where node @b lives: the btree name, the node's level and the level
|
|
+ * of that btree's root, followed by the node's key and value.
|
|
+ */
|
|
+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+			      struct btree *b)
|
|
+{
|
|
+	unsigned node_level = b->c.level;
|
|
+	unsigned root_level = c->btree_roots[b->c.btree_id].level;
|
|
+
|
|
+	pr_buf(out, "%s level %u/%u\n ",
|
|
+	       bch2_btree_ids[b->c.btree_id],
|
|
+	       node_level,
|
|
+	       root_level);
|
|
+	bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Emit the common prefix of a btree node validation error into @out: whether
|
|
+ * this is a pre-write check (@write nonzero), the device name if @ca is given,
|
|
+ * the node's position, b->written as the node offset and, if @i is given, the
|
|
+ * bset's u64s count.
|
|
+ *
|
|
+ * Note that @offset is not used; the offset printed is always b->written.
|
|
+ */
|
|
+static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
|
|
+			  struct bch_dev *ca,
|
|
+			  struct btree *b, struct bset *i,
|
|
+			  unsigned offset, int write)
|
|
+{
|
|
+	pr_buf(out, "error validating btree node %s",
|
|
+	       write ? "before write " : "");
|
|
+
|
|
+	if (ca)
|
|
+		pr_buf(out, "on %s ", ca->name);
|
|
+
|
|
+	pr_buf(out, "at btree ");
|
|
+	btree_pos_to_text(out, c, b);
|
|
+
|
|
+	pr_buf(out, "\n node offset %u", b->written);
|
|
+
|
|
+	if (i)
|
|
+		pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s));
|
|
+}
|
|
+
|
|
+/* Severity passed as the first argument of btree_err(); see that macro for the handling */
|
|
+enum btree_err_type {
|
|
+ /* on read before initial gc is done: reported via mustfix_fsck_err(); otherwise not fixed */
|
|
+ BTREE_ERR_FIXABLE,
|
|
+ /* on read: retry the read if have_retry is set, else carry on */
|
|
+ BTREE_ERR_WANT_RETRY,
|
|
+ /* on read: always ask for a retry (BTREE_RETRY_READ) */
|
|
+ BTREE_ERR_MUST_RETRY,
|
|
+ /* on read: fail with BCH_FSCK_ERRORS_NOT_FIXED */
|
|
+ BTREE_ERR_FATAL,
|
|
+};
|
|
+
|
|
+/* Return code set by btree_err() when the read should be retried */
|
|
+enum btree_validate_ret {
|
|
+ BTREE_RETRY_READ = 64,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Report a btree node validation error.
|
|
+ *
|
|
+ * Expects the following in the calling scope: an int "ret", an int "write"
|
|
+ * (READ or WRITE), a bool "have_retry" (only evaluated for
|
|
+ * BTREE_ERR_WANT_RETRY) and a label "fsck_err".
|
|
+ *
|
|
+ * Evaluates to true when the caller should go on and handle/repair the
|
|
+ * error; otherwise sets ret and jumps to fsck_err:
|
|
+ *  - FIXABLE on read before initial gc is done: handed to mustfix_fsck_err()
|
|
+ *  - FIXABLE/FATAL on read otherwise:	ret = BCH_FSCK_ERRORS_NOT_FIXED
|
|
+ *  - WANT_RETRY with have_retry, MUST_RETRY: ret = BTREE_RETRY_READ
|
|
+ *  - on write: ret = BCH_FSCK_ERRORS_NOT_FIXED if bch2_fs_inconsistent()
|
|
+ *
|
|
+ * The message is formatted into a 4096 byte heap buffer, falling back to a
|
|
+ * 300 byte stack buffer if the allocation fails.  The heap buffer is freed
|
|
+ * before any jump to fsck_err made by this macro itself.
|
|
+ * NOTE(review): a jump taken inside mustfix_fsck_err() would still skip the
|
|
+ * kfree() - confirm against that macro's definition.
|
|
+ */
|
|
+#define btree_err(type, c, ca, b, i, msg, ...)				\
|
|
+({									\
|
|
+	__label__ out;							\
|
|
+	char _buf[300];							\
|
|
+	char *_buf2 = _buf;						\
|
|
+	struct printbuf out = PBUF(_buf);				\
|
|
+	char *_heap_buf = kmalloc(4096, GFP_ATOMIC);			\
|
|
+	bool _bail = false;						\
|
|
+									\
|
|
+	/* only switch buffers if the allocation succeeded */		\
|
|
+	if (_heap_buf) {						\
|
|
+		_buf2 = _heap_buf;					\
|
|
+		out = _PBUF(_buf2, 4096);				\
|
|
+	}								\
|
|
+									\
|
|
+	btree_err_msg(&out, c, ca, b, i, b->written, write);		\
|
|
+	pr_buf(&out, ": " msg, ##__VA_ARGS__);				\
|
|
+									\
|
|
+	if (type == BTREE_ERR_FIXABLE &&				\
|
|
+	    write == READ &&						\
|
|
+	    !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {		\
|
|
+		mustfix_fsck_err(c, "%s", _buf2);			\
|
|
+		goto out;						\
|
|
+	}								\
|
|
+									\
|
|
+	switch (write) {						\
|
|
+	case READ:							\
|
|
+		bch_err(c, "%s", _buf2);				\
|
|
+									\
|
|
+		switch (type) {						\
|
|
+		case BTREE_ERR_FIXABLE:					\
|
|
+			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
|
|
+			_bail = true;					\
|
|
+			break;						\
|
|
+		case BTREE_ERR_WANT_RETRY:				\
|
|
+			if (have_retry) {				\
|
|
+				ret = BTREE_RETRY_READ;			\
|
|
+				_bail = true;				\
|
|
+			}						\
|
|
+			break;						\
|
|
+		case BTREE_ERR_MUST_RETRY:				\
|
|
+			ret = BTREE_RETRY_READ;				\
|
|
+			_bail = true;					\
|
|
+			break;						\
|
|
+		case BTREE_ERR_FATAL:					\
|
|
+			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
|
|
+			_bail = true;					\
|
|
+			break;						\
|
|
+		}							\
|
|
+		break;							\
|
|
+	case WRITE:							\
|
|
+		bch_err(c, "corrupt metadata before write: %s", _buf2);	\
|
|
+									\
|
|
+		if (bch2_fs_inconsistent(c)) {				\
|
|
+			ret = BCH_FSCK_ERRORS_NOT_FIXED;		\
|
|
+			_bail = true;					\
|
|
+		}							\
|
|
+		break;							\
|
|
+	}								\
|
|
+out:									\
|
|
+	if (_buf2 != _buf)						\
|
|
+		kfree(_buf2);						\
|
|
+	/* leave only after the message buffer has been released */	\
|
|
+	if (_bail)							\
|
|
+		goto fsck_err;						\
|
|
+	true;								\
|
|
+})
|
|
+
|
|
+/* Conditional btree_err(): evaluates to false without reporting anything when @cond is false */
|
|
+#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
|
|
+
|
|
+/*
|
|
+ * When btree topology repair changes the start or end of a node, that might
|
|
+ * mean we have to drop keys that are no longer inside the node.
|
|
+ *
|
|
+ * For every bset in @b: leading keys that compare below b->data->min_key are
|
|
+ * removed by shifting the remaining keys down, and the bset is truncated at
|
|
+ * the first key that compares above b->data->max_key.  Aux trees are
|
|
+ * invalidated for every bset that changed and rebuilt at the end.
|
|
+ */
|
|
+void bch2_btree_node_drop_keys_outside_node(struct btree *b)
|
|
+{
|
|
+	struct bset_tree *t;
|
|
+
|
|
+	for_each_bset(b, t) {
|
|
+		struct bset *i = bset(b, t);
|
|
+		struct bkey_packed *k;
|
|
+
|
|
+		/*
|
|
+		 * Stop at the first key that is inside the node, i.e. that
|
|
+		 * compares >= min_key; everything before it gets dropped.
|
|
+		 */
|
|
+		for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
|
|
+			if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
|
|
+				break;
|
|
+
|
|
+		if (k != i->start) {
|
|
+			unsigned shift = (u64 *) k - (u64 *) i->start;
|
|
+
|
|
+			memmove_u64s_down(i->start, k,
|
|
+					  (u64 *) vstruct_end(i) - (u64 *) k);
|
|
+			i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
|
|
+			set_btree_bset_end(b, t);
|
|
+			bch2_bset_set_no_aux_tree(b, t);
|
|
+		}
|
|
+
|
|
+		/* Stop at the first key past max_key; it and its successors go */
|
|
+		for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
|
|
+			if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
|
|
+				break;
|
|
+
|
|
+		if (k != vstruct_last(i)) {
|
|
+			i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
|
|
+			set_btree_bset_end(b, t);
|
|
+			bch2_bset_set_no_aux_tree(b, t);
|
|
+		}
|
|
+	}
|
|
+
|
|
+	bch2_btree_build_aux_trees(b);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate the header of bset @i of node @b, on read or before write
|
|
+ * (@write): bset version, position within the node and, for the first bset of
|
|
+ * a node (b->written == 0), the btree_node header against b->key.
|
|
+ *
|
|
+ * The btree_err*() macros used here read @write and @have_retry and, when they
|
|
+ * bail out, set ret and jump to fsck_err. Returns 0 or that error code.
|
|
+ */
|
|
+static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
|
|
+ struct btree *b, struct bset *i,
|
|
+ unsigned sectors, int write, bool have_retry)
|
|
+{
|
|
+ unsigned version = le16_to_cpu(i->version);
|
|
+ const char *err;
|
|
+ char buf1[100];
|
|
+ char buf2[100];
|
|
+ int ret = 0;
|
|
+
|
|
+ btree_err_on((version != BCH_BSET_VERSION_OLD &&
|
|
+ version < bcachefs_metadata_version_min) ||
|
|
+ version >= bcachefs_metadata_version_max,
|
|
+ BTREE_ERR_FATAL, c, ca, b, i,
|
|
+ "unsupported bset version");
|
|
+
|
|
+ /* repair: widen the superblock's version range to cover this bset */
|
|
+ if (btree_err_on(version < c->sb.version_min,
|
|
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
|
|
+ "bset version %u older than superblock version_min %u",
|
|
+ version, c->sb.version_min)) {
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ c->disk_sb.sb->version_min = cpu_to_le16(version);
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+
|
|
+ if (btree_err_on(version > c->sb.version,
|
|
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
|
|
+ "bset version %u newer than superblock version %u",
|
|
+ version, c->sb.version)) {
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ c->disk_sb.sb->version = cpu_to_le16(version);
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+
|
|
+ btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
|
|
+ BTREE_ERR_FATAL, c, ca, b, i,
|
|
+ "BSET_SEPARATE_WHITEOUTS no longer supported");
|
|
+
|
|
+ /* repair: treat a bset that runs past the node as empty */
|
|
+ if (btree_err_on(b->written + sectors > c->opts.btree_node_size,
|
|
+ BTREE_ERR_FIXABLE, c, ca, b, i,
|
|
+ "bset past end of btree node")) {
|
|
+ i->u64s = 0;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ btree_err_on(b->written && !i->u64s,
|
|
+ BTREE_ERR_FIXABLE, c, ca, b, i,
|
|
+ "empty bset");
|
|
+
|
|
+ /* first bset of the node: it is embedded in the struct btree_node header */
|
|
+ if (!b->written) {
|
|
+ struct btree_node *bn =
|
|
+ container_of(i, struct btree_node, keys);
|
|
+ /* These indicate that we read the wrong btree node: */
|
|
+
|
|
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bch_btree_ptr_v2 *bp =
|
|
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
|
|
+
|
|
+ /* XXX endianness */
|
|
+ btree_err_on(bp->seq != bn->keys.seq,
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
|
+ "incorrect sequence number (wrong btree node)");
|
|
+ }
|
|
+
|
|
+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id,
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
|
|
+ "incorrect btree id");
|
|
+
|
|
+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level,
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
|
|
+ "incorrect level");
|
|
+
|
|
+ /* on read the compat conversion runs before the key range checks, on write after them */
|
|
+ if (!write)
|
|
+ compat_btree_node(b->c.level, b->c.btree_id, version,
|
|
+ BSET_BIG_ENDIAN(i), write, bn);
|
|
+
|
|
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bch_btree_ptr_v2 *bp =
|
|
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
|
|
+
|
|
+ /* range was updated in the pointer: the pointer's range overrides the node's */
|
|
+ if (BTREE_PTR_RANGE_UPDATED(bp)) {
|
|
+ b->data->min_key = bp->min_key;
|
|
+ b->data->max_key = b->key.k.p;
|
|
+ }
|
|
+
|
|
+ btree_err_on(bpos_cmp(b->data->min_key, bp->min_key),
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
|
+ "incorrect min_key: got %s should be %s",
|
|
+ (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1),
|
|
+ (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2));
|
|
+ }
|
|
+
|
|
+ btree_err_on(bpos_cmp(bn->max_key, b->key.k.p),
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, i,
|
|
+ "incorrect max key %s",
|
|
+ (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1));
|
|
+
|
|
+ if (write)
|
|
+ compat_btree_node(b->c.level, b->c.btree_id, version,
|
|
+ BSET_BIG_ENDIAN(i), write, bn);
|
|
+
|
|
+ err = bch2_bkey_format_validate(&bn->format);
|
|
+ btree_err_on(err,
|
|
+ BTREE_ERR_FATAL, c, ca, b, i,
|
|
+ "invalid bkey format: %s", err);
|
|
+
|
|
+ compat_bformat(b->c.level, b->c.btree_id, version,
|
|
+ BSET_BIG_ENDIAN(i), write,
|
|
+ &bn->format);
|
|
+ }
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate every key in bset @i of node @b, on read or before write (@write).
|
|
+ *
|
|
+ * A key that would extend past the end of the bset truncates the bset there.
|
|
+ * Keys with an unknown format, keys reported invalid, and keys that sort
|
|
+ * before their predecessor are removed in place by shifting the rest of the
|
|
+ * bset down. @whiteout_u64s is not used here.
|
|
+ *
|
|
+ * Returns 0, or the code set by btree_err*() when it jumps to fsck_err.
|
|
+ */
|
|
+static int validate_bset_keys(struct bch_fs *c, struct btree *b,
|
|
+ struct bset *i, unsigned *whiteout_u64s,
|
|
+ int write, bool have_retry)
|
|
+{
|
|
+ unsigned version = le16_to_cpu(i->version);
|
|
+ struct bkey_packed *k, *prev = NULL;
|
|
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
|
|
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
|
|
+ int ret = 0;
|
|
+
|
|
+ /* no increment clause: k only advances once the current key has been accepted */
|
|
+ for (k = i->start;
|
|
+ k != vstruct_last(i);) {
|
|
+ struct bkey_s u;
|
|
+ struct bkey tmp;
|
|
+ const char *invalid;
|
|
+
|
|
+ if (btree_err_on(bkey_next(k) > vstruct_last(i),
|
|
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
|
|
+ "key extends past end of bset")) {
|
|
+ i->u64s = cpu_to_le16((u64 *) k - i->_data);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT,
|
|
+ BTREE_ERR_FIXABLE, c, NULL, b, i,
|
|
+ "invalid bkey format %u", k->format)) {
|
|
+ /* u64s is shrunk first, so vstruct_end(i) below is already the new end */
|
|
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
|
|
+ memmove_u64s_down(k, bkey_next(k),
|
|
+ (u64 *) vstruct_end(i) - (u64 *) k);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* XXX: validate k->u64s */
|
|
+ if (!write)
|
|
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
|
|
+ BSET_BIG_ENDIAN(i), write,
|
|
+ &b->format, k);
|
|
+
|
|
+ u = __bkey_disassemble(b, k, &tmp);
|
|
+
|
|
+ /* the in-node range check is skipped when the pointer's range was updated */
|
|
+ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
|
|
+ (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?:
|
|
+ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
|
|
+ if (invalid) {
|
|
+ char buf[160];
|
|
+
|
|
+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
|
|
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
|
|
+ "invalid bkey: %s\n%s", invalid, buf);
|
|
+
|
|
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
|
|
+ memmove_u64s_down(k, bkey_next(k),
|
|
+ (u64 *) vstruct_end(i) - (u64 *) k);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (write)
|
|
+ bch2_bkey_compat(b->c.level, b->c.btree_id, version,
|
|
+ BSET_BIG_ENDIAN(i), write,
|
|
+ &b->format, k);
|
|
+
|
|
+ if (prev && bkey_iter_cmp(b, prev, k) > 0) {
|
|
+ char buf1[80];
|
|
+ char buf2[80];
|
|
+ struct bkey up = bkey_unpack_key(b, prev);
|
|
+
|
|
+ bch2_bkey_to_text(&PBUF(buf1), &up);
|
|
+ bch2_bkey_to_text(&PBUF(buf2), u.k);
|
|
+
|
|
+ bch2_dump_bset(c, b, i, 0);
|
|
+
|
|
+ if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
|
|
+ "keys out of order: %s > %s",
|
|
+ buf1, buf2)) {
|
|
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
|
|
+ memmove_u64s_down(k, bkey_next(k),
|
|
+ (u64 *) vstruct_end(i) - (u64 *) k);
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ prev = k;
|
|
+ k = bkey_next(k);
|
|
+ }
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+/* Parse a just-read btree node: validate each bset and merge them into one sorted bset. Returns 1 if the caller should retry the read, else 0. */
|
|
+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
|
|
+ struct btree *b, bool have_retry)
|
|
+{
|
|
+ struct btree_node_entry *bne;
|
|
+ struct sort_iter *iter;
|
|
+ struct btree_node *sorted;
|
|
+ struct bkey_packed *k;
|
|
+ struct bch_extent_ptr *ptr;
|
|
+ struct bset *i;
|
|
+ bool used_mempool, blacklisted;
|
|
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
|
|
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
|
|
+ unsigned u64s;
|
|
+ int ret, retry_read = 0, write = READ; /* NOTE(review): 'write' is not referenced below; presumably consumed by the btree_err*() macros - confirm */
|
|
+ /* start at the max so the min() in the loop below yields the lowest bset version seen */
|
|
+ b->version_ondisk = U16_MAX;
|
|
+
|
|
+ iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
|
|
+ sort_iter_init(iter, b);
|
|
+ iter->size = (btree_blocks(c) + 1) * 2;
|
|
+
|
|
+ if (bch2_meta_read_fault("btree"))
|
|
+ btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
|
+ "dynamic fault");
|
|
+
|
|
+ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c),
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
|
+ "bad magic");
|
|
+
|
|
+ btree_err_on(!b->data->keys.seq,
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
|
+ "bad btree header");
|
|
+
|
|
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bch_btree_ptr_v2 *bp =
|
|
+ &bkey_i_to_btree_ptr_v2(&b->key)->v;
|
|
+
|
|
+ btree_err_on(b->data->keys.seq != bp->seq,
|
|
+ BTREE_ERR_MUST_RETRY, c, ca, b, NULL,
|
|
+ "got wrong btree node (seq %llx want %llx)",
|
|
+ b->data->keys.seq, bp->seq);
|
|
+ }
|
|
+ /* walk the bsets front to back; b->written is advanced by each bset's size in sectors */
|
|
+ while (b->written < c->opts.btree_node_size) {
|
|
+ unsigned sectors, whiteout_u64s = 0;
|
|
+ struct nonce nonce;
|
|
+ struct bch_csum csum;
|
|
+ bool first = !b->written;
|
|
+ /* the first bset is embedded in the node header (b->data); later ones are btree_node_entry blocks */
|
|
+ if (!b->written) {
|
|
+ i = &b->data->keys;
|
|
+
|
|
+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
|
|
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
|
+ "unknown checksum type %llu",
|
|
+ BSET_CSUM_TYPE(i));
|
|
+
|
|
+ nonce = btree_nonce(i, b->written << 9);
|
|
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
|
|
+
|
|
+ btree_err_on(bch2_crc_cmp(csum, b->data->csum),
|
|
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
|
+ "invalid checksum");
|
|
+ /* NOTE(review): on this read path bset_encrypt() presumably decrypts in place - confirm */
|
|
+ bset_encrypt(c, i, b->written << 9);
|
|
+
|
|
+ btree_err_on(btree_node_is_extents(b) &&
|
|
+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
|
|
+ BTREE_ERR_FATAL, c, NULL, b, NULL,
|
|
+ "btree node does not have NEW_EXTENT_OVERWRITE set");
|
|
+
|
|
+ sectors = vstruct_sectors(b->data, c->block_bits);
|
|
+ } else {
|
|
+ bne = write_block(b);
|
|
+ i = &bne->keys;
|
|
+ /* seq differs from the node header's: presumably not a bset of this node, so stop - TODO confirm */
|
|
+ if (i->seq != b->data->keys.seq)
|
|
+ break;
|
|
+
|
|
+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)),
|
|
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
|
+ "unknown checksum type %llu",
|
|
+ BSET_CSUM_TYPE(i));
|
|
+
|
|
+ nonce = btree_nonce(i, b->written << 9);
|
|
+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
|
|
+
|
|
+ btree_err_on(bch2_crc_cmp(csum, bne->csum),
|
|
+ BTREE_ERR_WANT_RETRY, c, ca, b, i,
|
|
+ "invalid checksum");
|
|
+
|
|
+ bset_encrypt(c, i, b->written << 9);
|
|
+
|
|
+ sectors = vstruct_sectors(bne, c->block_bits);
|
|
+ }
|
|
+
|
|
+ b->version_ondisk = min(b->version_ondisk,
|
|
+ le16_to_cpu(i->version));
|
|
+
|
|
+ ret = validate_bset(c, ca, b, i, sectors,
|
|
+ READ, have_retry);
|
|
+ if (ret)
|
|
+ goto fsck_err;
|
|
+ /* the key format is taken from the node header, once, after the first bset passed validate_bset() */
|
|
+ if (!b->written)
|
|
+ btree_node_set_format(b, b->data->format);
|
|
+
|
|
+ ret = validate_bset_keys(c, b, i, &whiteout_u64s,
|
|
+ READ, have_retry);
|
|
+ if (ret)
|
|
+ goto fsck_err;
|
|
+
|
|
+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
|
|
+
|
|
+ b->written += sectors;
|
|
+
|
|
+ blacklisted = bch2_journal_seq_is_blacklisted(c,
|
|
+ le64_to_cpu(i->journal_seq),
|
|
+ true);
|
|
+
|
|
+ btree_err_on(blacklisted && first,
|
|
+ BTREE_ERR_FIXABLE, c, ca, b, i,
|
|
+ "first btree node bset has blacklisted journal seq");
|
|
+ if (blacklisted && !first)
|
|
+ continue;
|
|
+ /* hand the keys to the sort iterator as two ranges, split at whiteout_u64s */
|
|
+ sort_iter_add(iter, i->start,
|
|
+ vstruct_idx(i, whiteout_u64s));
|
|
+
|
|
+ sort_iter_add(iter,
|
|
+ vstruct_idx(i, whiteout_u64s),
|
|
+ vstruct_last(i));
|
|
+ }
|
|
+ /* scan the remaining blocks: a matching seq there means a bset exists past where we stopped */
|
|
+ for (bne = write_block(b);
|
|
+ bset_byte_offset(b, bne) < btree_bytes(c);
|
|
+ bne = (void *) bne + block_bytes(c))
|
|
+ btree_err_on(bne->keys.seq == b->data->keys.seq,
|
|
+ BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
|
|
+ "found bset signature after last bset");
|
|
+ /* merge everything into a bounce buffer, then swap that buffer in as b->data */
|
|
+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
|
|
+ sorted->keys.u64s = 0;
|
|
+
|
|
+ set_btree_bset(b, b->set, &b->data->keys);
|
|
+
|
|
+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
|
|
+
|
|
+ u64s = le16_to_cpu(sorted->keys.u64s);
|
|
+ *sorted = *b->data;
|
|
+ sorted->keys.u64s = cpu_to_le16(u64s);
|
|
+ swap(sorted, b->data);
|
|
+ set_btree_bset(b, b->set, &b->data->keys);
|
|
+ b->nsets = 1;
|
|
+
|
|
+ BUG_ON(b->nr.live_u64s != u64s);
|
|
+
|
|
+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
|
|
+
|
|
+ if (updated_range)
|
|
+ bch2_btree_node_drop_keys_outside_node(b);
|
|
+ /* second pass, over the merged bset: drop keys whose value fails bch2_bkey_val_invalid() */
|
|
+ i = &b->data->keys;
|
|
+ for (k = i->start; k != vstruct_last(i);) {
|
|
+ struct bkey tmp;
|
|
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
|
|
+ const char *invalid = bch2_bkey_val_invalid(c, u.s_c);
|
|
+
|
|
+ if (invalid ||
|
|
+ (bch2_inject_invalid_keys &&
|
|
+ !bversion_cmp(u.k->version, MAX_VERSION))) {
|
|
+ char buf[160];
|
|
+
|
|
+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c);
|
|
+ btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i,
|
|
+ "invalid bkey %s: %s", buf, invalid); /* NOTE(review): invalid is NULL here when only the bch2_inject_invalid_keys test fired */
|
|
+
|
|
+ btree_keys_account_key_drop(&b->nr, 0, k);
|
|
+
|
|
+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
|
|
+ memmove_u64s_down(k, bkey_next(k),
|
|
+ (u64 *) vstruct_end(i) - (u64 *) k);
|
|
+ set_btree_bset_end(b, b->set);
|
|
+ continue;
|
|
+ }
|
|
+ /* mem_ptr is zeroed on every btree_ptr_v2 key read in; presumably only meaningful in memory - confirm */
|
|
+ if (u.k->type == KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u);
|
|
+
|
|
+ bp.v->mem_ptr = 0;
|
|
+ }
|
|
+
|
|
+ k = bkey_next(k);
|
|
+ }
|
|
+
|
|
+ bch2_bset_build_aux_tree(b, b->set, false);
|
|
+
|
|
+ set_needs_whiteout(btree_bset_first(b), true);
|
|
+
|
|
+ btree_node_reset_sib_u64s(b);
|
|
+ /* request a rewrite if any device holding this node is not in the rw state */
|
|
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
|
|
+
|
|
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
|
|
+ set_btree_node_need_rewrite(b);
|
|
+ }
|
|
+out:
|
|
+ mempool_free(iter, &c->fill_iter);
|
|
+ return retry_read;
|
|
+fsck_err: /* reached via the gotos above; presumably also jumped to from inside the btree_err*() macros - confirm */
|
|
+ if (ret == BTREE_RETRY_READ) {
|
|
+ retry_read = 1;
|
|
+ } else {
|
|
+ bch2_inconsistent_error(c);
|
|
+ set_btree_node_read_error(b);
|
|
+ }
|
|
+ goto out;
|
|
+}
|
|
+/* Finish a btree node read: check the I/O status, parse the node, and re-read from another replica while one is available. */
|
|
+static void btree_node_read_work(struct work_struct *work)
|
|
+{
|
|
+ struct btree_read_bio *rb =
|
|
+ container_of(work, struct btree_read_bio, work);
|
|
+ struct bch_fs *c = rb->c;
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
|
|
+ struct btree *b = rb->bio.bi_private;
|
|
+ struct bio *bio = &rb->bio;
|
|
+ struct bch_io_failures failed = { .nr = 0 };
|
|
+ char buf[200];
|
|
+ struct printbuf out;
|
|
+ bool saw_error = false;
|
|
+ bool can_retry;
|
|
+ /* the first read was already issued by bch2_btree_node_read(); jump past the resubmit code */
|
|
+ goto start;
|
|
+ while (1) {
|
|
+ bch_info(c, "retrying read");
|
|
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
|
|
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
|
|
+ bio_reset(bio);
|
|
+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
|
|
+ bio->bi_iter.bi_sector = rb->pick.ptr.offset;
|
|
+ bio->bi_iter.bi_size = btree_bytes(c);
|
|
+
|
|
+ if (rb->have_ioref) {
|
|
+ bio_set_dev(bio, ca->disk_sb.bdev);
|
|
+ submit_bio_wait(bio);
|
|
+ } else {
|
|
+ bio->bi_status = BLK_STS_REMOVED;
|
|
+ }
|
|
+start:
|
|
+ out = PBUF(buf);
|
|
+ btree_pos_to_text(&out, c, b);
|
|
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status), buf);
|
|
+ if (rb->have_ioref)
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+ rb->have_ioref = false;
|
|
+ /* presumably: add the replica just used to 'failed', then look for another one; can_retry says whether one exists - confirm */
|
|
+ bch2_mark_io_failure(&failed, &rb->pick);
|
|
+
|
|
+ can_retry = bch2_bkey_pick_read_device(c,
|
|
+ bkey_i_to_s_c(&b->key),
|
|
+ &failed, &rb->pick) > 0;
|
|
+ /* done if the I/O succeeded and bch2_btree_node_read_done() did not request a retry */
|
|
+ if (!bio->bi_status &&
|
|
+ !bch2_btree_node_read_done(c, ca, b, can_retry))
|
|
+ break;
|
|
+
|
|
+ saw_error = true;
|
|
+
|
|
+ if (!can_retry) {
|
|
+ set_btree_node_read_error(b);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
|
|
+ rb->start_time);
|
|
+ bio_put(&rb->bio);
|
|
+ /* an error was seen but the node is not marked read_error: queue an async rewrite of it */
|
|
+ if (saw_error && !btree_node_read_error(b))
|
|
+ bch2_btree_node_rewrite_async(c, b);
|
|
+ /* let waiters on BTREE_NODE_read_in_flight proceed */
|
|
+ clear_btree_node_read_in_flight(b);
|
|
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
|
|
+}
|
|
+
|
|
+/*
+ * Completion handler for btree node read bios.
+ *
+ * Accounts device read latency when an io ref is held on the device, then
+ * queues the read bio's work item on system_unbound_wq.
+ */
+static void btree_node_read_endio(struct bio *bio)
+{
+	struct btree_read_bio *rbio =
+		container_of(bio, struct btree_read_bio, bio);
+	struct bch_fs *fs = rbio->c;
+
+	if (rbio->have_ioref)
+		bch2_latency_acct(bch_dev_bkey_exists(fs, rbio->pick.ptr.dev),
+				  rbio->start_time, READ);
+
+	queue_work(system_unbound_wq, &rbio->work);
+}
|
|
+
|
|
+/*
+ * Start reading btree node @b from one of the devices its key points to.
+ *
+ * With @sync the bio is waited for and btree_node_read_work() is run in the
+ * caller's context; otherwise completion is handled asynchronously through
+ * btree_node_read_endio() / the workqueue.
+ *
+ * If no device can be picked, a fatal filesystem error is raised, the node is
+ * marked read_error, and nothing is submitted.
+ */
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+			  bool sync)
+{
+	struct extent_ptr_decoded pick;
+	struct btree_read_bio *rbio;
+	struct bch_dev *dev;
+	struct bio *bio;
+	char pos[200];
+	int ret;
+
+	btree_pos_to_text(&PBUF(pos), c, b);
+	trace_btree_read(c, b);
+
+	ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
+					 NULL, &pick);
+	if (bch2_fs_fatal_err_on(ret <= 0, c,
+			"btree node read error: no device to read from\n"
+			" at %s", pos)) {
+		set_btree_node_read_error(b);
+		return;
+	}
+
+	dev = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	bio = bio_alloc_bioset(GFP_NOIO,
+			       buf_pages(b->data, btree_bytes(c)),
+			       &c->btree_bio);
+
+	rbio = container_of(bio, struct btree_read_bio, bio);
+	rbio->c			= c;
+	rbio->start_time	= local_clock();
+	rbio->have_ioref	= bch2_dev_get_ioref(dev, READ);
+	rbio->pick		= pick;
+	INIT_WORK(&rbio->work, btree_node_read_work);
+
+	bio->bi_opf		= REQ_OP_READ|REQ_SYNC|REQ_META;
+	bio->bi_iter.bi_sector	= pick.ptr.offset;
+	bio->bi_end_io		= btree_node_read_endio;
+	bio->bi_private		= b;
+	bch2_bio_map(bio, b->data, btree_bytes(c));
+
+	set_btree_node_read_in_flight(b);
+
+	if (!rbio->have_ioref) {
+		/* no io ref on the device: fail the bio without submitting */
+		bio->bi_status = BLK_STS_REMOVED;
+
+		if (sync)
+			btree_node_read_work(&rbio->work);
+		else
+			queue_work(system_unbound_wq, &rbio->work);
+		return;
+	}
+
+	this_cpu_add(dev->io_done->sectors[READ][BCH_DATA_btree],
+		     bio_sectors(bio));
+	bio_set_dev(bio, dev->disk_sb.bdev);
+
+	if (!sync) {
+		submit_bio(bio);
+		return;
+	}
+
+	submit_bio_wait(bio);
+
+	/*
+	 * Restore bi_private before the work function reads the node from it;
+	 * presumably submit_bio_wait() repurposes the field - confirm.
+	 */
+	bio->bi_private = b;
+	btree_node_read_work(&rbio->work);
+}
|
|
+
|
|
+/*
+ * Read the root node of btree @id, described by key @k at @level, and install
+ * it as that btree's root.
+ *
+ * Returns 0 on success.  If the node ends up marked read_error it is removed
+ * from the btree cache hash, moved to the freeable list, and -EIO is returned.
+ */
+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+			 const struct bkey_i *k, unsigned level)
+{
+	struct closure cl;
+	struct btree *b;
+	int ret;
+
+	closure_init_stack(&cl);
+
+	/* keep trying until the cannibalize lock call returns 0 */
+	for (;;) {
+		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+		closure_sync(&cl);
+		if (!ret)
+			break;
+	}
+
+	b = bch2_btree_node_mem_alloc(c);
+	bch2_btree_cache_cannibalize_unlock(c);
+
+	BUG_ON(IS_ERR(b));
+
+	bkey_copy(&b->key, k);
+	BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id));
+
+	bch2_btree_node_read(c, b, true);
+
+	if (!btree_node_read_error(b)) {
+		bch2_btree_set_root_for_read(c, b);
+	} else {
+		bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+		mutex_lock(&c->btree_cache.lock);
+		list_move(&b->list, &c->btree_cache.freeable);
+		mutex_unlock(&c->btree_cache.lock);
+
+		ret = -EIO;
+	}
+
+	/* both locks on the node are dropped on success and on failure */
+	six_unlock_write(&b->c.lock);
+	six_unlock_intent(&b->c.lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Complete write @w of node @b.
+ *
+ * Atomically clears bit 0 of b->will_make_reachable.  If the bit was set, the
+ * remaining bits are treated as a struct btree_update pointer and a ref on
+ * that update's closure is dropped.  The journal pin held by @w is dropped in
+ * either case.
+ */
+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
+			       struct btree_write *w)
+{
+	unsigned long seen = READ_ONCE(b->will_make_reachable);
+	unsigned long expected;
+
+	for (;;) {
+		expected = seen;
+
+		/* bit already clear: nothing to hand off */
+		if (!(expected & 1))
+			break;
+
+		seen = cmpxchg(&b->will_make_reachable,
+			       expected, expected & ~1UL);
+		if (seen == expected)
+			break;
+	}
+
+	if (expected & 1)
+		closure_put(&((struct btree_update *) (expected & ~1UL))->cl);
+
+	bch2_journal_pin_drop(&c->journal, &w->journal);
+}
|
|
+
|
|
+/*
+ * Finish a write to @b: complete the btree_write returned by
+ * btree_prev_write(), then release the node's write_in_flight bit.
+ */
+static void btree_node_write_done(struct bch_fs *c, struct btree *b)
+{
+	bch2_btree_complete_write(c, b, btree_prev_write(b));
+	btree_node_io_unlock(b);
+}
|
|
+/* Handle a failed btree node write: drop the pointers to the failed devices from the node's key, or raise a fatal error if that cannot be done. */
|
|
+static void bch2_btree_node_write_error(struct bch_fs *c,
|
|
+ struct btree_write_bio *wbio)
|
|
+{
|
|
+ struct btree *b = wbio->wbio.bio.bi_private;
|
|
+ struct bkey_buf k;
|
|
+ struct bch_extent_ptr *ptr;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ bch2_bkey_buf_init(&k);
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
|
|
+ BTREE_MAX_DEPTH, b->c.level, 0);
|
|
+retry:
|
|
+ ret = bch2_btree_iter_traverse(iter);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ /* did the traverse land on a different node than the one we wrote? */
|
|
+ if (iter->l[b->c.level].b != b) {
|
|
+ /* yes - the only accepted reason is that b has been freed (marked dying): */
|
|
+ BUG_ON(!btree_node_dying(b));
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!btree_node_hashed(b));
|
|
+
|
|
+ bch2_bkey_buf_copy(&k, c, &b->key);
|
|
+ /* from the copy of the key, drop pointers to devices on which this write failed */
|
|
+ bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
|
|
+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
|
|
+ /* no pointers left: the write failed on every device the key pointed to */
|
|
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
|
|
+ goto err;
|
|
+
|
|
+ ret = bch2_btree_node_update_key(c, iter, b, k.k);
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+ if (ret)
|
|
+ goto err;
|
|
+out: /* common exit, also taken after err: release the transaction, key buffer and bio, then finish the write */
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_bkey_buf_exit(&k, c);
|
|
+ bio_put(&wbio->wbio.bio);
|
|
+ btree_node_write_done(c, b);
|
|
+ return;
|
|
+err:
|
|
+ set_btree_node_noevict(b);
|
|
+ bch2_fs_fatal_error(c, "fatal error writing btree node");
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+/*
+ * Work function: drain c->btree_write_error_list, handling each failed btree
+ * write bio with bch2_btree_node_write_error().
+ *
+ * The list lock is held only while popping an entry, not while handling it.
+ */
+void bch2_btree_write_error_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs,
+					btree_write_error_work);
+
+	for (;;) {
+		struct bio *failed;
+
+		spin_lock_irq(&c->btree_write_error_lock);
+		failed = bio_list_pop(&c->btree_write_error_list);
+		spin_unlock_irq(&c->btree_write_error_lock);
+
+		if (!failed)
+			return;
+
+		bch2_btree_node_write_error(c,
+			container_of(failed, struct btree_write_bio, wbio.bio));
+	}
+}
|
|
+
|
|
+/*
+ * Work function run after a btree node write bio has completed.
+ *
+ * Frees the bounce buffer.  If any device failed, the bio is put on
+ * c->btree_write_error_list and the error work is queued on c->wq; the bio and
+ * the write are then finished by that work.  Otherwise the bio is released and
+ * the write is finished here.
+ */
+static void btree_node_write_work(struct work_struct *work)
+{
+	struct btree_write_bio *wb =
+		container_of(work, struct btree_write_bio, work);
+	struct bch_fs *c = wb->wbio.c;
+	struct btree *b = wb->wbio.bio.bi_private;
+	unsigned long irqflags;
+
+	btree_bounce_free(c, wb->bytes, wb->wbio.used_mempool, wb->data);
+
+	if (!wb->wbio.failed.nr) {
+		bio_put(&wb->wbio.bio);
+		btree_node_write_done(c, b);
+		return;
+	}
+
+	spin_lock_irqsave(&c->btree_write_error_lock, irqflags);
+	bio_list_add(&c->btree_write_error_list, &wb->wbio.bio);
+	spin_unlock_irqrestore(&c->btree_write_error_lock, irqflags);
+
+	queue_work(c->wq, &c->btree_write_error_work);
+}
|
|
+
|
|
+/*
+ * Completion handler for btree node write bios, including split (per-replica)
+ * bios that have a parent.
+ *
+ * Records the device in the original bio's failed list on an I/O error or an
+ * injected "btree" write fault, drops the device io ref if one is held, then
+ * either completes the parent (split bio) or queues btree_node_write_work()
+ * (original bio).
+ */
+static void btree_node_write_endio(struct bio *bio)
+{
+	struct bch_write_bio *wbio = to_wbio(bio);
+	struct bch_write_bio *parent = NULL;
+	struct bch_write_bio *orig = wbio;
+	struct bch_fs *c = wbio->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
+	struct btree_write_bio *wb;
+	unsigned long irqflags;
+
+	if (wbio->split)
+		parent = wbio->parent;
+	if (parent)
+		orig = parent;
+
+	if (wbio->have_ioref)
+		bch2_latency_acct(ca, wbio->submit_time, WRITE);
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s",
+			       bch2_blk_status_to_str(bio->bi_status)) ||
+	    bch2_meta_write_fault("btree")) {
+		spin_lock_irqsave(&c->btree_write_error_lock, irqflags);
+		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
+		spin_unlock_irqrestore(&c->btree_write_error_lock, irqflags);
+	}
+
+	if (wbio->have_ioref)
+		percpu_ref_put(&ca->io_ref);
+
+	if (parent) {
+		bio_put(bio);
+		bio_endio(&parent->bio);
+		return;
+	}
+
+	wb = container_of(orig, struct btree_write_bio, wbio);
+
+	INIT_WORK(&wb->work, btree_node_write_work);
+	queue_work(system_unbound_wq, &wb->work);
+}
|
|
+
|
|
+/*
+ * Validate bset @i of node @b before it is written out.
+ *
+ * Returns -1 if the node's own key is invalid, otherwise the first nonzero
+ * result of validate_bset_keys() / validate_bset() (0 if both pass).  A
+ * nonzero result from those two also raises an inconsistency error and dumps
+ * the stack.
+ */
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+				   struct bset *i, unsigned sectors)
+{
+	unsigned whiteout_u64s = 0;
+	int ret;
+
+	if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree))
+		return -1;
+
+	ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false);
+	if (!ret)
+		ret = validate_bset(c, NULL, b, i, sectors, WRITE, false);
+
+	if (!ret)
+		return 0;
+
+	bch2_inconsistent_error(c);
+	dump_stack();
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Work function: submit a prepared btree node write to the replicas described
+ * by the key saved in the btree_write_bio.
+ */
+static void btree_write_submit(struct work_struct *work)
+{
+	struct btree_write_bio *wb =
+		container_of(work, struct btree_write_bio, work);
+	struct bch_write_bio *wbio = &wb->wbio;
+
+	bch2_submit_wbio_replicas(wbio, wbio->c, BCH_DATA_btree, &wb->key);
+}
|
|
+/* Write the not-yet-written bsets and whiteouts of @b to disk as a single new bset; returns early if the node is not dirty or may not be written now. */
|
|
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct btree_write_bio *wbio;
|
|
+ struct bset_tree *t;
|
|
+ struct bset *i;
|
|
+ struct btree_node *bn = NULL;
|
|
+ struct btree_node_entry *bne = NULL;
|
|
+ struct bch_extent_ptr *ptr;
|
|
+ struct sort_iter sort_iter;
|
|
+ struct nonce nonce;
|
|
+ unsigned bytes_to_write, sectors_to_write, bytes, u64s;
|
|
+ u64 seq = 0;
|
|
+ bool used_mempool;
|
|
+ unsigned long old, new;
|
|
+ bool validate_before_checksum = false;
|
|
+ void *data;
|
|
+
|
|
+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
|
|
+ return;
|
|
+
|
|
+ /*
|
|
+ * We may only have a read lock on the btree node - the dirty bit is our
|
|
+ * "lock" against racing with other threads that may be trying to start
|
|
+ * a write, we do a write iff we clear the dirty bit. Since setting the
|
|
+ * dirty bit requires a write lock, we can't race with other threads
|
|
+ * redirtying it:
|
|
+ */
|
|
+ do {
|
|
+ old = new = READ_ONCE(b->flags);
|
|
+
|
|
+ if (!(old & (1 << BTREE_NODE_dirty)))
|
|
+ return;
|
|
+
|
|
+ if (!btree_node_may_write(b))
|
|
+ return;
|
|
+
|
|
+ if (old & (1 << BTREE_NODE_never_write))
|
|
+ return;
|
|
+
|
|
+ if (old & (1 << BTREE_NODE_write_in_flight)) {
|
|
+ btree_node_wait_on_io(b);
|
|
+ continue; /* NOTE(review): continue in a do-while jumps to the cmpxchg with old == new, so the loop only repeats if b->flags changed meanwhile - confirm intended */
|
|
+ }
|
|
+
|
|
+ new &= ~(1 << BTREE_NODE_dirty);
|
|
+ new &= ~(1 << BTREE_NODE_need_write);
|
|
+ new |= (1 << BTREE_NODE_write_in_flight);
|
|
+ new |= (1 << BTREE_NODE_just_written);
|
|
+ new ^= (1 << BTREE_NODE_write_idx);
|
|
+ } while (cmpxchg_acquire(&b->flags, old, new) != old);
|
|
+ /* the dirty bit was cleared by the cmpxchg above, so drop the node from the dirty count */
|
|
+ atomic_dec(&c->btree_cache.dirty);
|
|
+
|
|
+ BUG_ON(btree_node_fake(b));
|
|
+ BUG_ON((b->will_make_reachable != 0) != !b->written);
|
|
+
|
|
+ BUG_ON(b->written >= c->opts.btree_node_size);
|
|
+ BUG_ON(b->written & (c->opts.block_size - 1));
|
|
+ BUG_ON(bset_written(b, btree_bset_last(b)));
|
|
+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
|
|
+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
|
|
+
|
|
+ bch2_sort_whiteouts(c, b);
|
|
+
|
|
+ sort_iter_init(&sort_iter, b);
|
|
+ /* size the bounce buffer: header, unwritten whiteouts, every unwritten bset (plus slack added below) */
|
|
+ bytes = !b->written
|
|
+ ? sizeof(struct btree_node)
|
|
+ : sizeof(struct btree_node_entry);
|
|
+
|
|
+ bytes += b->whiteout_u64s * sizeof(u64);
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ i = bset(b, t);
|
|
+
|
|
+ if (bset_written(b, i))
|
|
+ continue;
|
|
+
|
|
+ bytes += le16_to_cpu(i->u64s) * sizeof(u64);
|
|
+ sort_iter_add(&sort_iter,
|
|
+ btree_bkey_first(b, t),
|
|
+ btree_bkey_last(b, t));
|
|
+ seq = max(seq, le64_to_cpu(i->journal_seq));
|
|
+ }
|
|
+
|
|
+ BUG_ON(b->written && !seq);
|
|
+
|
|
+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */
|
|
+ bytes += 8;
|
|
+
|
|
+ data = btree_bounce_alloc(c, bytes, &used_mempool);
|
|
+ /* the first write of a node carries the whole btree_node header; later writes a btree_node_entry */
|
|
+ if (!b->written) {
|
|
+ bn = data;
|
|
+ *bn = *b->data;
|
|
+ i = &bn->keys;
|
|
+ } else {
|
|
+ bne = data;
|
|
+ bne->keys = b->data->keys;
|
|
+ i = &bne->keys;
|
|
+ }
|
|
+
|
|
+ i->journal_seq = cpu_to_le64(seq);
|
|
+ i->u64s = 0;
|
|
+
|
|
+ sort_iter_add(&sort_iter,
|
|
+ unwritten_whiteouts_start(c, b),
|
|
+ unwritten_whiteouts_end(c, b));
|
|
+ SET_BSET_SEPARATE_WHITEOUTS(i, false);
|
|
+
|
|
+ b->whiteout_u64s = 0;
|
|
+
|
|
+ u64s = bch2_sort_keys(i->start, &sort_iter, false);
|
|
+ le16_add_cpu(&i->u64s, u64s);
|
|
+
|
|
+ set_needs_whiteout(i, false);
|
|
+
|
|
+ /* do we have data to write? */
|
|
+ if (b->written && !i->u64s)
|
|
+ goto nowrite;
|
|
+
|
|
+ bytes_to_write = vstruct_end(i) - data;
|
|
+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
|
|
+ /* zero the padding between the end of the bset and the end of the last block */
|
|
+ memset(data + bytes_to_write, 0,
|
|
+ (sectors_to_write << 9) - bytes_to_write);
|
|
+
|
|
+ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size);
|
|
+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
|
|
+ BUG_ON(i->seq != b->data->keys.seq);
|
|
+
|
|
+ i->version = c->sb.version < bcachefs_metadata_version_new_versioning
|
|
+ ? cpu_to_le16(BCH_BSET_VERSION_OLD)
|
|
+ : cpu_to_le16(c->sb.version);
|
|
+ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c));
|
|
+
|
|
+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)))
|
|
+ validate_before_checksum = true;
|
|
+
|
|
+ /* validate_bset will be modifying: */
|
|
+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current)
|
|
+ validate_before_checksum = true;
|
|
+
|
|
+ /* if we're going to be encrypting, check metadata validity first: */
|
|
+ if (validate_before_checksum &&
|
|
+ validate_bset_for_write(c, b, i, sectors_to_write))
|
|
+ goto err;
|
|
+
|
|
+ bset_encrypt(c, i, b->written << 9);
|
|
+
|
|
+ nonce = btree_nonce(i, b->written << 9);
|
|
+
|
|
+ if (bn)
|
|
+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn);
|
|
+ else
|
|
+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
|
|
+
|
|
+ /* if we're not encrypting, check metadata after checksumming: */
|
|
+ if (!validate_before_checksum &&
|
|
+ validate_bset_for_write(c, b, i, sectors_to_write))
|
|
+ goto err;
|
|
+
|
|
+ /*
|
|
+ * We handle btree write errors by immediately halting the journal -
|
|
+ * after we've done that, we can't issue any subsequent btree writes
|
|
+ * because they might have pointers to new nodes that failed to write.
|
|
+ *
|
|
+ * Furthermore, there's no point in doing any more btree writes because
|
|
+ * with the journal stopped, we're never going to update the journal to
|
|
+ * reflect that those writes were done and the data flushed from the
|
|
+ * journal:
|
|
+ *
|
|
+ * Also on journal error, the pending write may have updates that were
|
|
+ * never journalled (interior nodes, see btree_update_nodes_written()) -
|
|
+ * it's critical that we don't do the write in that case otherwise we
|
|
+ * will have updates visible that weren't in the journal:
|
|
+ *
|
|
+ * Make sure to update b->written so bch2_btree_init_next() doesn't
|
|
+ * break:
|
|
+ */
|
|
+ if (bch2_journal_error(&c->journal) ||
|
|
+ c->opts.nochanges)
|
|
+ goto err;
|
|
+
|
|
+ trace_btree_write(b, bytes_to_write, sectors_to_write);
|
|
+
|
|
+ wbio = container_of(bio_alloc_bioset(GFP_NOIO,
|
|
+ buf_pages(data, sectors_to_write << 9),
|
|
+ &c->btree_bio),
|
|
+ struct btree_write_bio, wbio.bio);
|
|
+ wbio_init(&wbio->wbio.bio);
|
|
+ wbio->data = data;
|
|
+ wbio->bytes = bytes;
|
|
+ wbio->wbio.c = c;
|
|
+ wbio->wbio.used_mempool = used_mempool;
|
|
+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META;
|
|
+ wbio->wbio.bio.bi_end_io = btree_node_write_endio;
|
|
+ wbio->wbio.bio.bi_private = b;
|
|
+
|
|
+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
|
|
+
|
|
+ /*
|
|
+ * If we're appending to a leaf node, we don't technically need FUA -
|
|
+ * this write just needs to be persisted before the next journal write,
|
|
+ * which will be marked FLUSH|FUA.
|
|
+ *
|
|
+ * Similarly if we're writing a new btree root - the pointer is going to
|
|
+ * be in the next journal entry.
|
|
+ *
|
|
+ * But if we're writing a new btree node (that isn't a root) or
|
|
+ * appending to a non leaf btree node, we need either FUA or a flush
|
|
+ * when we write the parent with the new pointer. FUA is cheaper than a
|
|
+ * flush, and writes appending to leaf nodes aren't blocking anything so
|
|
+ * just make all btree node writes FUA to keep things sane.
|
|
+ */
|
|
+ /* NOTE(review): bi_opf set above is REQ_OP_WRITE|REQ_META without REQ_FUA; confirm where (or whether) FUA is applied */
|
|
+ bkey_copy(&wbio->key, &b->key);
|
|
+ /* each replica is written b->written sectors past the start of the node */
|
|
+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr)
|
|
+ ptr->offset += b->written;
|
|
+
|
|
+ b->written += sectors_to_write;
|
|
+
|
|
+ atomic64_inc(&c->btree_writes_nr);
|
|
+ atomic64_add(sectors_to_write, &c->btree_writes_sectors);
|
|
+ /* submission itself is deferred to btree_write_submit() via schedule_work() */
|
|
+ INIT_WORK(&wbio->work, btree_write_submit);
|
|
+ schedule_work(&wbio->work);
|
|
+ return;
|
|
+err:
|
|
+ set_btree_node_noevict(b);
|
|
+ b->written += sectors_to_write;
|
|
+nowrite: /* nothing was submitted: free the bounce buffer and finish the write here */
|
|
+ btree_bounce_free(c, bytes, used_mempool, data);
|
|
+ btree_node_write_done(c, b);
|
|
+}
|
|
+
|
|
+/*
+ * Cleanup after a node write; must be called with the node's write lock held.
+ *
+ * Does nothing and returns false unless the node's just_written flag is set.
+ * Otherwise clears that flag, compacts the node (sorting all bsets into one,
+ * or dropping whiteouts if there is only one bset), starts a new bset if
+ * want_new_bset() asks for one, and rebuilds the aux trees.
+ *
+ * Returns true if the bsets were re-sorted; with a single bset, returns
+ * whatever bch2_drop_whiteouts() returned.
+ */
+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
+{
+	struct btree_node_entry *next_bset;
+	struct bset_tree *t;
+	bool invalidated_iter;
+
+	if (!btree_node_just_written(b))
+		return false;
+
+	BUG_ON(b->whiteout_u64s);
+
+	clear_btree_node_just_written(b);
+
+	/*
+	 * Note: immediately after write, bset_written() doesn't work - the
+	 * amount of data we had to write after compaction might have been
+	 * smaller than the offset of the last bset.
+	 *
+	 * However, we know that all bsets have been written here, as long as
+	 * we're still holding the write lock:
+	 */
+
+	/*
+	 * XXX: decide if we really want to unconditionally sort down to a
+	 * single bset:
+	 */
+	if (b->nsets <= 1) {
+		invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL);
+	} else {
+		btree_node_sort(c, b, 0, b->nsets, true);
+		invalidated_iter = true;
+	}
+
+	for_each_bset(b, t)
+		set_needs_whiteout(bset(b, t), true);
+
+	bch2_btree_verify(c, b);
+
+	/*
+	 * If later we don't unconditionally sort down to a single bset, we have
+	 * to ensure this is still true:
+	 */
+	BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b));
+
+	next_bset = want_new_bset(c, b);
+	if (next_bset)
+		bch2_bset_init_next(c, b, next_bset);
+
+	bch2_btree_build_aux_trees(b);
+
+	return invalidated_iter;
+}
|
|
+
|
|
+/*
+ * Write node @b, given the six lock type the caller currently holds.
+ *
+ * With an intent lock (held, or obtained by upgrading a read lock) the write
+ * is followed by post-write cleanup if the write lock can be taken without
+ * blocking; an upgraded read lock is downgraded again before returning.
+ *
+ * Otherwise the node is just written, and post-write cleanup is done only if
+ * the caller already holds the write lock.
+ */
+void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
+			   enum six_lock_type lock_type_held)
+{
+	bool upgraded = lock_type_held == SIX_LOCK_read &&
+		six_lock_tryupgrade(&b->c.lock);
+	bool have_intent = lock_type_held == SIX_LOCK_intent || upgraded;
+
+	__bch2_btree_node_write(c, b);
+
+	if (!have_intent) {
+		if (lock_type_held == SIX_LOCK_write &&
+		    btree_node_just_written(b))
+			bch2_btree_post_write_cleanup(c, b);
+		return;
+	}
+
+	/* don't cycle lock unnecessarily: */
+	if (btree_node_just_written(b) &&
+	    six_trylock_write(&b->c.lock)) {
+		bch2_btree_post_write_cleanup(c, b);
+		six_unlock_write(&b->c.lock);
+	}
+
+	/* give back the read lock the caller came in with */
+	if (upgraded)
+		six_lock_downgrade(&b->c.lock);
+}
|
|
+/* Wait until no node found by for_each_cached_btree() has bit @flag set in b->flags. */
|
|
+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
|
|
+{
|
|
+ struct bucket_table *tbl;
|
|
+ struct rhash_head *pos;
|
|
+ struct btree *b;
|
|
+ unsigned i;
|
|
+restart: /* the scan starts over from the beginning after every wait */
|
|
+ rcu_read_lock();
|
|
+ for_each_cached_btree(b, c, tbl, i, pos)
|
|
+ if (test_bit(flag, &b->flags)) {
|
|
+ rcu_read_unlock(); /* leave the RCU read section before sleeping on the bit */
|
|
+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
|
|
+ goto restart;
|
|
+
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+/* Wait until no cached btree node has BTREE_NODE_read_in_flight set. */
|
|
+void bch2_btree_flush_all_reads(struct bch_fs *c)
|
|
+{
|
|
+ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
|
|
+}
|
|
+/* Wait until no cached btree node has BTREE_NODE_write_in_flight set. */
|
|
+void bch2_btree_flush_all_writes(struct bch_fs *c)
|
|
+{
|
|
+ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
|
|
+}
|
|
+
|
|
+/*
+ * Print one line to @out for every cached btree node that has the dirty flag
+ * set.
+ *
+ * Fields, in order: node pointer, d(irty), n(eed_write), l(evel), w(ritten),
+ * b (write_blocked list non-empty), r (will_make_reachable non-zero : its low
+ * bit).
+ */
+void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct bucket_table *tbl;
+	struct rhash_head *pos;
+	struct btree *b;
+	unsigned i;
+
+	rcu_read_lock();
+	for_each_cached_btree(b, c, tbl, i, pos) {
+		/* one snapshot of the flags, so d and n come from the same read */
+		unsigned long node_flags = READ_ONCE(b->flags);
+		bool dirty = node_flags & (1 << BTREE_NODE_dirty);
+		bool need_write = node_flags & (1 << BTREE_NODE_need_write);
+
+		if (!dirty)
+			continue;
+
+		pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
+		       b,
+		       dirty,
+		       need_write,
+		       b->c.level,
+		       b->written,
+		       !list_empty_careful(&b->write_blocked),
+		       b->will_make_reachable != 0,
+		       b->will_make_reachable & 1);
+	}
+	rcu_read_unlock();
+}
|
|
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
|
|
new file mode 100644
|
|
index 000000000000..cadcf7f886d7
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_io.h
|
|
@@ -0,0 +1,257 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_IO_H
|
|
+#define _BCACHEFS_BTREE_IO_H
|
|
+
|
|
+#include "bkey_methods.h"
|
|
+#include "bset.h"
|
|
+#include "btree_locking.h"
|
|
+#include "checksum.h"
|
|
+#include "extents.h"
|
|
+#include "io_types.h"
|
|
+
|
|
+struct bch_fs;
|
|
+struct btree_write;
|
|
+struct btree;
|
|
+struct btree_iter;
|
|
+
|
|
+static inline bool btree_node_dirty(struct btree *b)
|
|
+{
|
|
+ return test_bit(BTREE_NODE_dirty, &b->flags);
|
|
+}
|
|
+
|
|
+static inline void set_btree_node_dirty(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags))
|
|
+ atomic_inc(&c->btree_cache.dirty);
|
|
+}
|
|
+
|
|
+static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags))
|
|
+ atomic_dec(&c->btree_cache.dirty);
|
|
+}
|
|
+
|
|
/*
 * State carried alongside the bio of a btree node read.
 * NOTE(review): field meanings below are inferred from names only - confirm
 * against btree_io.c before relying on them.
 */
struct btree_read_bio {
	struct bch_fs *c;
	u64 start_time;
	unsigned have_ioref:1;	/* presumably: a device io ref is held */
	struct extent_ptr_decoded pick;
	struct work_struct work;
	struct bio bio;		/* embedded bio; kept as the last member */
};
|
|
+
|
|
/*
 * State carried alongside the bio of a btree node write.
 * NOTE(review): field meanings are inferred from names only - confirm
 * against btree_io.c before relying on them.
 */
struct btree_write_bio {
	struct work_struct work;
	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
	void *data;
	unsigned bytes;
	struct bch_write_bio wbio;	/* embedded write bio; kept last */
};
|
|
+
|
|
+static inline void btree_node_io_unlock(struct btree *b)
|
|
+{
|
|
+ EBUG_ON(!btree_node_write_in_flight(b));
|
|
+ clear_btree_node_write_in_flight(b);
|
|
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
|
|
+}
|
|
+
|
|
+static inline void btree_node_io_lock(struct btree *b)
|
|
+{
|
|
+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight,
|
|
+ TASK_UNINTERRUPTIBLE);
|
|
+}
|
|
+
|
|
+static inline void btree_node_wait_on_io(struct btree *b)
|
|
+{
|
|
+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight,
|
|
+ TASK_UNINTERRUPTIBLE);
|
|
+}
|
|
+
|
|
+static inline bool btree_node_may_write(struct btree *b)
|
|
+{
|
|
+ return list_empty_careful(&b->write_blocked) &&
|
|
+ (!b->written || !b->will_make_reachable);
|
|
+}
|
|
+
|
|
/*
 * Mode argument for bch2_compact_whiteouts().
 * bch2_maybe_compact_whiteouts() passes COMPACT_LAZY after
 * should_compact_bset_lazy() has approved at least one bset.
 */
enum compact_mode {
	COMPACT_LAZY,
	COMPACT_ALL,
};
|
|
+
|
|
+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *,
|
|
+ enum compact_mode);
|
|
+
|
|
+static inline bool should_compact_bset_lazy(struct btree *b,
|
|
+ struct bset_tree *t)
|
|
+{
|
|
+ unsigned total_u64s = bset_u64s(t);
|
|
+ unsigned dead_u64s = bset_dead_u64s(b, t);
|
|
+
|
|
+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s;
|
|
+}
|
|
+
|
|
+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct bset_tree *t;
|
|
+
|
|
+ for_each_bset(b, t)
|
|
+ if (should_compact_bset_lazy(b, t))
|
|
+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY);
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
|
|
+{
|
|
+ return (struct nonce) {{
|
|
+ [0] = cpu_to_le32(offset),
|
|
+ [1] = ((__le32 *) &i->seq)[0],
|
|
+ [2] = ((__le32 *) &i->seq)[1],
|
|
+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
|
|
+ }};
|
|
+}
|
|
+
|
|
+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
|
|
+{
|
|
+ struct nonce nonce = btree_nonce(i, offset);
|
|
+
|
|
+ if (!offset) {
|
|
+ struct btree_node *bn = container_of(i, struct btree_node, keys);
|
|
+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
|
|
+
|
|
+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
|
|
+ bytes);
|
|
+
|
|
+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
|
|
+ }
|
|
+
|
|
+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
|
|
+ vstruct_end(i) - (void *) i->_data);
|
|
+}
|
|
+
|
|
+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
|
|
+
|
|
+void bch2_btree_node_drop_keys_outside_node(struct btree *);
|
|
+
|
|
+void bch2_btree_build_aux_trees(struct btree *);
|
|
+void bch2_btree_init_next(struct bch_fs *, struct btree *,
|
|
+ struct btree_iter *);
|
|
+
|
|
+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
|
|
+ struct btree *, bool);
|
|
+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
|
|
+int bch2_btree_root_read(struct bch_fs *, enum btree_id,
|
|
+ const struct bkey_i *, unsigned);
|
|
+
|
|
+void bch2_btree_complete_write(struct bch_fs *, struct btree *,
|
|
+ struct btree_write *);
|
|
+void bch2_btree_write_error_work(struct work_struct *);
|
|
+
|
|
+void __bch2_btree_node_write(struct bch_fs *, struct btree *);
|
|
+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
|
|
+
|
|
+void bch2_btree_node_write(struct bch_fs *, struct btree *,
|
|
+ enum six_lock_type);
|
|
+
|
|
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
|
|
+ enum six_lock_type lock_held)
|
|
+{
|
|
+ while (b->written &&
|
|
+ btree_node_need_write(b) &&
|
|
+ btree_node_may_write(b)) {
|
|
+ if (!btree_node_write_in_flight(b)) {
|
|
+ bch2_btree_node_write(c, b, lock_held);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ six_unlock_type(&b->c.lock, lock_held);
|
|
+ btree_node_wait_on_io(b);
|
|
+ btree_node_lock_type(c, b, lock_held);
|
|
+ }
|
|
+}
|
|
+
|
|
/*
 * If @_b is dirty and @cond holds, set BTREE_NODE_need_write in its flags
 * with a cmpxchg retry loop; then, in every case, call
 * btree_node_write_if_need() with SIX_LOCK_read as the held lock type.
 *
 * @cond is re-evaluated on each cmpxchg retry.  The macro declares locals
 * named old, new and v, so @cond must not refer to identifiers with those
 * names.
 */
#define bch2_btree_node_write_cond(_c, _b, cond)			\
do {									\
	unsigned long old, new, v = READ_ONCE((_b)->flags);		\
									\
	do {								\
		old = new = v;						\
									\
		if (!(old & (1 << BTREE_NODE_dirty)) || !(cond))	\
			break;						\
									\
		new |= (1 << BTREE_NODE_need_write);			\
	} while ((v = cmpxchg(&(_b)->flags, old, new)) != old);		\
									\
	btree_node_write_if_need(_c, _b, SIX_LOCK_read);		\
} while (0)
|
|
+
|
|
+void bch2_btree_flush_all_reads(struct bch_fs *);
|
|
+void bch2_btree_flush_all_writes(struct bch_fs *);
|
|
+void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
|
|
+
|
|
+static inline void compat_bformat(unsigned level, enum btree_id btree_id,
|
|
+ unsigned version, unsigned big_endian,
|
|
+ int write, struct bkey_format *f)
|
|
+{
|
|
+ if (version < bcachefs_metadata_version_inode_btree_change &&
|
|
+ btree_id == BTREE_ID_inodes) {
|
|
+ swap(f->bits_per_field[BKEY_FIELD_INODE],
|
|
+ f->bits_per_field[BKEY_FIELD_OFFSET]);
|
|
+ swap(f->field_offset[BKEY_FIELD_INODE],
|
|
+ f->field_offset[BKEY_FIELD_OFFSET]);
|
|
+ }
|
|
+
|
|
+ if (version < bcachefs_metadata_version_snapshot &&
|
|
+ (level || btree_type_has_snapshots(btree_id))) {
|
|
+ u64 max_packed =
|
|
+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]);
|
|
+
|
|
+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write
|
|
+ ? 0
|
|
+ : U32_MAX - max_packed;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void compat_bpos(unsigned level, enum btree_id btree_id,
|
|
+ unsigned version, unsigned big_endian,
|
|
+ int write, struct bpos *p)
|
|
+{
|
|
+ if (big_endian != CPU_BIG_ENDIAN)
|
|
+ bch2_bpos_swab(p);
|
|
+
|
|
+ if (version < bcachefs_metadata_version_inode_btree_change &&
|
|
+ btree_id == BTREE_ID_inodes)
|
|
+ swap(p->inode, p->offset);
|
|
+}
|
|
+
|
|
+static inline void compat_btree_node(unsigned level, enum btree_id btree_id,
|
|
+ unsigned version, unsigned big_endian,
|
|
+ int write,
|
|
+ struct btree_node *bn)
|
|
+{
|
|
+ if (version < bcachefs_metadata_version_inode_btree_change &&
|
|
+ btree_node_type_is_extents(btree_id) &&
|
|
+ bpos_cmp(bn->min_key, POS_MIN) &&
|
|
+ write)
|
|
+ bn->min_key = bpos_nosnap_predecessor(bn->min_key);
|
|
+
|
|
+ if (version < bcachefs_metadata_version_snapshot &&
|
|
+ write)
|
|
+ bn->max_key.snapshot = 0;
|
|
+
|
|
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key);
|
|
+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key);
|
|
+
|
|
+ if (version < bcachefs_metadata_version_snapshot &&
|
|
+ !write)
|
|
+ bn->max_key.snapshot = U32_MAX;
|
|
+
|
|
+ if (version < bcachefs_metadata_version_inode_btree_change &&
|
|
+ btree_node_type_is_extents(btree_id) &&
|
|
+ bpos_cmp(bn->min_key, POS_MIN) &&
|
|
+ !write)
|
|
+ bn->min_key = bpos_nosnap_successor(bn->min_key);
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_IO_H */
|
|
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
|
|
new file mode 100644
|
|
index 000000000000..cdec05c86173
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_iter.c
|
|
@@ -0,0 +1,2455 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "btree_locking.h"
|
|
+#include "btree_update.h"
|
|
+#include "debug.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "journal.h"
|
|
+#include "replicas.h"
|
|
+
|
|
+#include <linux/prefetch.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+static void btree_iter_set_search_pos(struct btree_iter *, struct bpos);
|
|
+
|
|
+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p)
|
|
+{
|
|
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
|
|
+
|
|
+ /* Are we iterating over keys in all snapshots? */
|
|
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
|
|
+ p = bpos_successor(p);
|
|
+ } else {
|
|
+ p = bpos_nosnap_successor(p);
|
|
+ p.snapshot = iter->snapshot;
|
|
+ }
|
|
+
|
|
+ return p;
|
|
+}
|
|
+
|
|
+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p)
|
|
+{
|
|
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_NODES);
|
|
+
|
|
+ /* Are we iterating over keys in all snapshots? */
|
|
+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) {
|
|
+ p = bpos_predecessor(p);
|
|
+ } else {
|
|
+ p = bpos_nosnap_predecessor(p);
|
|
+ p.snapshot = iter->snapshot;
|
|
+ }
|
|
+
|
|
+ return p;
|
|
+}
|
|
+
|
|
+static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
|
|
+{
|
|
+ return l < BTREE_MAX_DEPTH &&
|
|
+ (unsigned long) iter->l[l].b >= 128;
|
|
+}
|
|
+
|
|
+static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
|
|
+{
|
|
+ struct bpos pos = iter->pos;
|
|
+
|
|
+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
|
|
+ bkey_cmp(pos, POS_MAX))
|
|
+ pos = bkey_successor(iter, pos);
|
|
+ return pos;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ return bpos_cmp(iter->real_pos, b->data->min_key) < 0;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ return bpos_cmp(b->key.k.p, iter->real_pos) < 0;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ return iter->btree_id == b->c.btree_id &&
|
|
+ !btree_iter_pos_before_node(iter, b) &&
|
|
+ !btree_iter_pos_after_node(iter, b);
|
|
+}
|
|
+
|
|
+/* Btree node locking: */
|
|
+
|
|
/* Out-of-line entry point for bch2_btree_node_unlock_write_inlined(). */
void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
	bch2_btree_node_unlock_write_inlined(b, iter);
}
|
|
+
|
|
+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
|
+{
|
|
+ struct btree_iter *linked;
|
|
+ unsigned readers = 0;
|
|
+
|
|
+ EBUG_ON(!btree_node_intent_locked(iter, b->c.level));
|
|
+
|
|
+ trans_for_each_iter(iter->trans, linked)
|
|
+ if (linked->l[b->c.level].b == b &&
|
|
+ btree_node_read_locked(linked, b->c.level))
|
|
+ readers++;
|
|
+
|
|
+ /*
|
|
+ * Must drop our read locks before calling six_lock_write() -
|
|
+ * six_unlock() won't do wakeups until the reader count
|
|
+ * goes to 0, and it's safe because we have the node intent
|
|
+ * locked:
|
|
+ */
|
|
+ atomic64_sub(__SIX_VAL(read_lock, readers),
|
|
+ &b->c.lock.state.counter);
|
|
+ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
|
|
+ atomic64_add(__SIX_VAL(read_lock, readers),
|
|
+ &b->c.lock.state.counter);
|
|
+}
|
|
+
|
|
+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
|
|
+{
|
|
+ struct btree *b = btree_iter_node(iter, level);
|
|
+ int want = __btree_lock_want(iter, level);
|
|
+
|
|
+ if (!is_btree_node(iter, level))
|
|
+ return false;
|
|
+
|
|
+ if (race_fault())
|
|
+ return false;
|
|
+
|
|
+ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
|
|
+ (btree_node_lock_seq_matches(iter, b, level) &&
|
|
+ btree_node_lock_increment(iter->trans, b, level, want))) {
|
|
+ mark_btree_node_locked(iter, level, want);
|
|
+ return true;
|
|
+ } else {
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
|
|
+{
|
|
+ struct btree *b = iter->l[level].b;
|
|
+
|
|
+ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
|
|
+
|
|
+ if (!is_btree_node(iter, level))
|
|
+ return false;
|
|
+
|
|
+ if (btree_node_intent_locked(iter, level))
|
|
+ return true;
|
|
+
|
|
+ if (race_fault())
|
|
+ return false;
|
|
+
|
|
+ if (btree_node_locked(iter, level)
|
|
+ ? six_lock_tryupgrade(&b->c.lock)
|
|
+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
|
|
+ goto success;
|
|
+
|
|
+ if (btree_node_lock_seq_matches(iter, b, level) &&
|
|
+ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
|
|
+ btree_node_unlock(iter, level);
|
|
+ goto success;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+success:
|
|
+ mark_btree_node_intent_locked(iter, level);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_get_locks(struct btree_iter *iter,
|
|
+ bool upgrade, bool trace)
|
|
+{
|
|
+ unsigned l = iter->level;
|
|
+ int fail_idx = -1;
|
|
+
|
|
+ do {
|
|
+ if (!btree_iter_node(iter, l))
|
|
+ break;
|
|
+
|
|
+ if (!(upgrade
|
|
+ ? bch2_btree_node_upgrade(iter, l)
|
|
+ : bch2_btree_node_relock(iter, l))) {
|
|
+ if (trace)
|
|
+ (upgrade
|
|
+ ? trace_node_upgrade_fail
|
|
+ : trace_node_relock_fail)(l, iter->l[l].lock_seq,
|
|
+ is_btree_node(iter, l)
|
|
+ ? 0
|
|
+ : (unsigned long) iter->l[l].b,
|
|
+ is_btree_node(iter, l)
|
|
+ ? iter->l[l].b->c.lock.state.seq
|
|
+ : 0);
|
|
+
|
|
+ fail_idx = l;
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
|
|
+ }
|
|
+
|
|
+ l++;
|
|
+ } while (l < iter->locks_want);
|
|
+
|
|
+ /*
|
|
+ * When we fail to get a lock, we have to ensure that any child nodes
|
|
+ * can't be relocked so bch2_btree_iter_traverse has to walk back up to
|
|
+ * the node that we failed to relock:
|
|
+ */
|
|
+ while (fail_idx >= 0) {
|
|
+ btree_node_unlock(iter, fail_idx);
|
|
+ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
|
|
+ --fail_idx;
|
|
+ }
|
|
+
|
|
+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
|
|
+ iter->uptodate = BTREE_ITER_NEED_PEEK;
|
|
+
|
|
+ bch2_btree_trans_verify_locks(iter->trans);
|
|
+
|
|
+ return iter->uptodate < BTREE_ITER_NEED_RELOCK;
|
|
+}
|
|
+
|
|
+static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
|
|
+ enum btree_iter_type type)
|
|
+{
|
|
+ return type != BTREE_ITER_CACHED
|
|
+ ? container_of(_b, struct btree, c)->key.k.p
|
|
+ : container_of(_b, struct bkey_cached, c)->key.pos;
|
|
+}
|
|
+
|
|
+/* Slowpath: */
|
|
+bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
|
|
+ unsigned level, struct btree_iter *iter,
|
|
+ enum six_lock_type type,
|
|
+ six_lock_should_sleep_fn should_sleep_fn, void *p,
|
|
+ unsigned long ip)
|
|
+{
|
|
+ struct btree_trans *trans = iter->trans;
|
|
+ struct btree_iter *linked, *deadlock_iter = NULL;
|
|
+ u64 start_time = local_clock();
|
|
+ unsigned reason = 9;
|
|
+ bool ret;
|
|
+
|
|
+ /* Check if it's safe to block: */
|
|
+ trans_for_each_iter(trans, linked) {
|
|
+ if (!linked->nodes_locked)
|
|
+ continue;
|
|
+
|
|
+ /*
|
|
+ * Can't block taking an intent lock if we have _any_ nodes read
|
|
+ * locked:
|
|
+ *
|
|
+ * - Our read lock blocks another thread with an intent lock on
|
|
+ * the same node from getting a write lock, and thus from
|
|
+ * dropping its intent lock
|
|
+ *
|
|
+ * - And the other thread may have multiple nodes intent locked:
|
|
+ * both the node we want to intent lock, and the node we
|
|
+ * already have read locked - deadlock:
|
|
+ */
|
|
+ if (type == SIX_LOCK_intent &&
|
|
+ linked->nodes_locked != linked->nodes_intent_locked) {
|
|
+ deadlock_iter = linked;
|
|
+ reason = 1;
|
|
+ }
|
|
+
|
|
+ if (linked->btree_id != iter->btree_id) {
|
|
+ if (linked->btree_id > iter->btree_id) {
|
|
+ deadlock_iter = linked;
|
|
+ reason = 3;
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Within the same btree, cached iterators come before non
|
|
+ * cached iterators:
|
|
+ */
|
|
+ if (btree_iter_is_cached(linked) != btree_iter_is_cached(iter)) {
|
|
+ if (btree_iter_is_cached(iter)) {
|
|
+ deadlock_iter = linked;
|
|
+ reason = 4;
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Interior nodes must be locked before their descendants: if
|
|
+ * another iterator has possible descendants locked of the node
|
|
+ * we're about to lock, it must have the ancestors locked too:
|
|
+ */
|
|
+ if (level > __fls(linked->nodes_locked)) {
|
|
+ deadlock_iter = linked;
|
|
+ reason = 5;
|
|
+ }
|
|
+
|
|
+ /* Must lock btree nodes in key order: */
|
|
+ if (btree_node_locked(linked, level) &&
|
|
+ bpos_cmp(pos, btree_node_pos((void *) linked->l[level].b,
|
|
+ btree_iter_type(linked))) <= 0) {
|
|
+ deadlock_iter = linked;
|
|
+ reason = 7;
|
|
+ BUG_ON(trans->in_traverse_all);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (unlikely(deadlock_iter)) {
|
|
+ trace_trans_restart_would_deadlock(iter->trans->ip, ip,
|
|
+ trans->in_traverse_all, reason,
|
|
+ deadlock_iter->btree_id,
|
|
+ btree_iter_type(deadlock_iter),
|
|
+ &deadlock_iter->real_pos,
|
|
+ iter->btree_id,
|
|
+ btree_iter_type(iter),
|
|
+ &pos);
|
|
+ return false;
|
|
+ }
|
|
+
|
|
+ if (six_trylock_type(&b->c.lock, type))
|
|
+ return true;
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ trans->locking_iter_idx = iter->idx;
|
|
+ trans->locking_pos = pos;
|
|
+ trans->locking_btree_id = iter->btree_id;
|
|
+ trans->locking_level = level;
|
|
+ trans->locking = b;
|
|
+#endif
|
|
+
|
|
+ ret = six_lock_type(&b->c.lock, type, should_sleep_fn, p) == 0;
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ trans->locking = NULL;
|
|
+#endif
|
|
+ if (ret)
|
|
+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
|
|
+ start_time);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Btree iterator locking: */
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
|
|
+{
|
|
+ unsigned l;
|
|
+
|
|
+ if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
|
|
+ BUG_ON(iter->nodes_locked);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ for (l = 0; is_btree_node(iter, l); l++) {
|
|
+ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
|
|
+ !btree_node_locked(iter, l))
|
|
+ continue;
|
|
+
|
|
+ BUG_ON(btree_lock_want(iter, l) !=
|
|
+ btree_node_locked_type(iter, l));
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_btree_trans_verify_locks(struct btree_trans *trans)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+
|
|
+ trans_for_each_iter(trans, iter)
|
|
+ bch2_btree_iter_verify_locks(iter);
|
|
+}
|
|
+#else
|
|
/* No-op variant used when CONFIG_BCACHEFS_DEBUG is not set. */
static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
|
|
+#endif
|
|
+
|
|
+__flatten
|
|
+bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
|
|
+{
|
|
+ return btree_iter_get_locks(iter, false, trace);
|
|
+}
|
|
+
|
|
+bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
|
|
+ unsigned new_locks_want)
|
|
+{
|
|
+ struct btree_iter *linked;
|
|
+
|
|
+ EBUG_ON(iter->locks_want >= new_locks_want);
|
|
+
|
|
+ iter->locks_want = new_locks_want;
|
|
+
|
|
+ if (btree_iter_get_locks(iter, true, true))
|
|
+ return true;
|
|
+
|
|
+ /*
|
|
+ * XXX: this is ugly - we'd prefer to not be mucking with other
|
|
+ * iterators in the btree_trans here.
|
|
+ *
|
|
+ * On failure to upgrade the iterator, setting iter->locks_want and
|
|
+ * calling get_locks() is sufficient to make bch2_btree_iter_traverse()
|
|
+ * get the locks we want on transaction restart.
|
|
+ *
|
|
+ * But if this iterator was a clone, on transaction restart what we did
|
|
+ * to this iterator isn't going to be preserved.
|
|
+ *
|
|
+ * Possibly we could add an iterator field for the parent iterator when
|
|
+ * an iterator is a copy - for now, we'll just upgrade any other
|
|
+ * iterators with the same btree id.
|
|
+ *
|
|
+ * The code below used to be needed to ensure ancestor nodes get locked
|
|
+ * before interior nodes - now that's handled by
|
|
+ * bch2_btree_iter_traverse_all().
|
|
+ */
|
|
+ trans_for_each_iter(iter->trans, linked)
|
|
+ if (linked != iter &&
|
|
+ btree_iter_type(linked) == btree_iter_type(iter) &&
|
|
+ linked->btree_id == iter->btree_id &&
|
|
+ linked->locks_want < new_locks_want) {
|
|
+ linked->locks_want = new_locks_want;
|
|
+ btree_iter_get_locks(linked, true, false);
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
|
|
+ unsigned new_locks_want)
|
|
+{
|
|
+ unsigned l;
|
|
+
|
|
+ EBUG_ON(iter->locks_want < new_locks_want);
|
|
+
|
|
+ iter->locks_want = new_locks_want;
|
|
+
|
|
+ while (iter->nodes_locked &&
|
|
+ (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
|
|
+ if (l > iter->level) {
|
|
+ btree_node_unlock(iter, l);
|
|
+ } else {
|
|
+ if (btree_node_intent_locked(iter, l)) {
|
|
+ six_lock_downgrade(&iter->l[l].b->c.lock);
|
|
+ iter->nodes_intent_locked ^= 1 << l;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_btree_trans_verify_locks(iter->trans);
|
|
+}
|
|
+
|
|
+void bch2_trans_downgrade(struct btree_trans *trans)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+
|
|
+ trans_for_each_iter(trans, iter)
|
|
+ bch2_btree_iter_downgrade(iter);
|
|
+}
|
|
+
|
|
+/* Btree transaction locking: */
|
|
+
|
|
+bool bch2_trans_relock(struct btree_trans *trans)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+
|
|
+ trans_for_each_iter(trans, iter)
|
|
+ if (!bch2_btree_iter_relock(iter, true)) {
|
|
+ trace_trans_restart_relock(trans->ip);
|
|
+ return false;
|
|
+ }
|
|
+ return true;
|
|
+}
|
|
+
|
|
+void bch2_trans_unlock(struct btree_trans *trans)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+
|
|
+ trans_for_each_iter(trans, iter)
|
|
+ __bch2_btree_iter_unlock(iter);
|
|
+}
|
|
+
|
|
+/* Btree iterator: */
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+
|
|
+static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
|
|
+{
|
|
+ struct bkey_cached *ck;
|
|
+ bool locked = btree_node_locked(iter, 0);
|
|
+
|
|
+ if (!bch2_btree_node_relock(iter, 0))
|
|
+ return;
|
|
+
|
|
+ ck = (void *) iter->l[0].b;
|
|
+ BUG_ON(ck->key.btree_id != iter->btree_id ||
|
|
+ bkey_cmp(ck->key.pos, iter->pos));
|
|
+
|
|
+ if (!locked)
|
|
+ btree_node_unlock(iter, 0);
|
|
+}
|
|
+
|
|
+static void bch2_btree_iter_verify_level(struct btree_iter *iter,
|
|
+ unsigned level)
|
|
+{
|
|
+ struct btree_iter_level *l;
|
|
+ struct btree_node_iter tmp;
|
|
+ bool locked;
|
|
+ struct bkey_packed *p, *k;
|
|
+ char buf1[100], buf2[100], buf3[100];
|
|
+ const char *msg;
|
|
+
|
|
+ if (!bch2_debug_check_iterators)
|
|
+ return;
|
|
+
|
|
+ l = &iter->l[level];
|
|
+ tmp = l->iter;
|
|
+ locked = btree_node_locked(iter, level);
|
|
+
|
|
+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
|
|
+ if (!level)
|
|
+ bch2_btree_iter_verify_cached(iter);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ BUG_ON(iter->level < iter->min_depth);
|
|
+
|
|
+ if (!btree_iter_node(iter, level))
|
|
+ return;
|
|
+
|
|
+ if (!bch2_btree_node_relock(iter, level))
|
|
+ return;
|
|
+
|
|
+ BUG_ON(!btree_iter_pos_in_node(iter, l->b));
|
|
+
|
|
+ /*
|
|
+ * node iterators don't use leaf node iterator:
|
|
+ */
|
|
+ if (btree_iter_type(iter) == BTREE_ITER_NODES &&
|
|
+ level <= iter->min_depth)
|
|
+ goto unlock;
|
|
+
|
|
+ bch2_btree_node_iter_verify(&l->iter, l->b);
|
|
+
|
|
+ /*
|
|
+ * For interior nodes, the iterator will have skipped past
|
|
+ * deleted keys:
|
|
+ *
|
|
+ * For extents, the iterator may have skipped past deleted keys (but not
|
|
+ * whiteouts)
|
|
+ */
|
|
+ p = level || btree_node_type_is_extents(iter->btree_id)
|
|
+ ? bch2_btree_node_iter_prev(&tmp, l->b)
|
|
+ : bch2_btree_node_iter_prev_all(&tmp, l->b);
|
|
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
|
|
+
|
|
+ if (p && bkey_iter_pos_cmp(l->b, p, &iter->real_pos) >= 0) {
|
|
+ msg = "before";
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (k && bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
|
|
+ msg = "after";
|
|
+ goto err;
|
|
+ }
|
|
+unlock:
|
|
+ if (!locked)
|
|
+ btree_node_unlock(iter, level);
|
|
+ return;
|
|
+err:
|
|
+ strcpy(buf2, "(none)");
|
|
+ strcpy(buf3, "(none)");
|
|
+
|
|
+ bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
|
|
+
|
|
+ if (p) {
|
|
+ struct bkey uk = bkey_unpack_key(l->b, p);
|
|
+ bch2_bkey_to_text(&PBUF(buf2), &uk);
|
|
+ }
|
|
+
|
|
+ if (k) {
|
|
+ struct bkey uk = bkey_unpack_key(l->b, k);
|
|
+ bch2_bkey_to_text(&PBUF(buf3), &uk);
|
|
+ }
|
|
+
|
|
+ panic("iterator should be %s key at level %u:\n"
|
|
+ "iter pos %s\n"
|
|
+ "prev key %s\n"
|
|
+ "cur key %s\n",
|
|
+ msg, level, buf1, buf2, buf3);
|
|
+}
|
|
+
|
|
+static void bch2_btree_iter_verify(struct btree_iter *iter)
|
|
+{
|
|
+ enum btree_iter_type type = btree_iter_type(iter);
|
|
+ unsigned i;
|
|
+
|
|
+ EBUG_ON(iter->btree_id >= BTREE_ID_NR);
|
|
+
|
|
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
|
|
+ iter->pos.snapshot != iter->snapshot);
|
|
+
|
|
+ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
|
|
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
|
|
+
|
|
+ BUG_ON(type == BTREE_ITER_NODES &&
|
|
+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
|
|
+
|
|
+ BUG_ON(type != BTREE_ITER_NODES &&
|
|
+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
|
|
+ !btree_type_has_snapshots(iter->btree_id));
|
|
+
|
|
+ bch2_btree_iter_verify_locks(iter);
|
|
+
|
|
+ for (i = 0; i < BTREE_MAX_DEPTH; i++)
|
|
+ bch2_btree_iter_verify_level(iter, i);
|
|
+}
|
|
+
|
|
+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
|
|
+{
|
|
+ enum btree_iter_type type = btree_iter_type(iter);
|
|
+
|
|
+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
|
|
+ iter->pos.snapshot != iter->snapshot);
|
|
+
|
|
+ BUG_ON((type == BTREE_ITER_KEYS ||
|
|
+ type == BTREE_ITER_CACHED) &&
|
|
+ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
|
|
+ bkey_cmp(iter->pos, iter->k.p) > 0));
|
|
+}
|
|
+
|
|
+void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+
|
|
+ if (!bch2_debug_check_iterators)
|
|
+ return;
|
|
+
|
|
+ trans_for_each_iter_with_node(trans, b, iter)
|
|
+ bch2_btree_iter_verify_level(iter, b->c.level);
|
|
+}
|
|
+
|
|
+#else
|
|
+
|
|
/* No-op variants of the iterator checks, used when CONFIG_BCACHEFS_DEBUG is not set. */
static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}
static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {}
|
|
+
|
|
+#endif
|
|
+
|
|
+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct bset_tree *t,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ struct btree_node_iter_set *set;
|
|
+
|
|
+ btree_node_iter_for_each(iter, set)
|
|
+ if (set->end == t->end_offset) {
|
|
+ set->k = __btree_node_key_to_offset(b, k);
|
|
+ bch2_btree_node_iter_sort(iter, b);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
|
|
+}
|
|
+
|
|
+static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct bkey_packed *where)
|
|
+{
|
|
+ struct btree_iter_level *l = &iter->l[b->c.level];
|
|
+
|
|
+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
|
|
+ return;
|
|
+
|
|
+ if (bkey_iter_pos_cmp(l->b, where, &iter->real_pos) < 0)
|
|
+ bch2_btree_node_iter_advance(&l->iter, l->b);
|
|
+
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
|
|
+}
|
|
+
|
|
+void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct bkey_packed *where)
|
|
+{
|
|
+ struct btree_iter *linked;
|
|
+
|
|
+ trans_for_each_iter_with_node(iter->trans, b, linked) {
|
|
+ __bch2_btree_iter_fix_key_modified(linked, b, where);
|
|
+ bch2_btree_iter_verify_level(linked, b->c.level);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct btree_node_iter *node_iter,
|
|
+ struct bset_tree *t,
|
|
+ struct bkey_packed *where,
|
|
+ unsigned clobber_u64s,
|
|
+ unsigned new_u64s)
|
|
+{
|
|
+ const struct bkey_packed *end = btree_bkey_last(b, t);
|
|
+ struct btree_node_iter_set *set;
|
|
+ unsigned offset = __btree_node_key_to_offset(b, where);
|
|
+ int shift = new_u64s - clobber_u64s;
|
|
+ unsigned old_end = t->end_offset - shift;
|
|
+ unsigned orig_iter_pos = node_iter->data[0].k;
|
|
+ bool iter_current_key_modified =
|
|
+ orig_iter_pos >= offset &&
|
|
+ orig_iter_pos <= offset + clobber_u64s;
|
|
+
|
|
+ btree_node_iter_for_each(node_iter, set)
|
|
+ if (set->end == old_end)
|
|
+ goto found;
|
|
+
|
|
+ /* didn't find the bset in the iterator - might have to readd it: */
|
|
+ if (new_u64s &&
|
|
+ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
|
|
+ bch2_btree_node_iter_push(node_iter, b, where, end);
|
|
+ goto fixup_done;
|
|
+ } else {
|
|
+ /* Iterator is after key that changed */
|
|
+ return;
|
|
+ }
|
|
+found:
|
|
+ set->end = t->end_offset;
|
|
+
|
|
+ /* Iterator hasn't gotten to the key that changed yet: */
|
|
+ if (set->k < offset)
|
|
+ return;
|
|
+
|
|
+ if (new_u64s &&
|
|
+ bkey_iter_pos_cmp(b, where, &iter->real_pos) >= 0) {
|
|
+ set->k = offset;
|
|
+ } else if (set->k < offset + clobber_u64s) {
|
|
+ set->k = offset + new_u64s;
|
|
+ if (set->k == set->end)
|
|
+ bch2_btree_node_iter_set_drop(node_iter, set);
|
|
+ } else {
|
|
+ /* Iterator is after key that changed */
|
|
+ set->k = (int) set->k + shift;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ bch2_btree_node_iter_sort(node_iter, b);
|
|
+fixup_done:
|
|
+ if (node_iter->data[0].k != orig_iter_pos)
|
|
+ iter_current_key_modified = true;
|
|
+
|
|
+ /*
|
|
+ * When a new key is added, and the node iterator now points to that
|
|
+ * key, the iterator might have skipped past deleted keys that should
|
|
+ * come after the key the iterator now points to. We have to rewind to
|
|
+ * before those deleted keys - otherwise
|
|
+ * bch2_btree_node_iter_prev_all() breaks:
|
|
+ */
|
|
+ if (!bch2_btree_node_iter_end(node_iter) &&
|
|
+ iter_current_key_modified &&
|
|
+ (b->c.level ||
|
|
+ btree_node_type_is_extents(iter->btree_id))) {
|
|
+ struct bset_tree *t;
|
|
+ struct bkey_packed *k, *k2, *p;
|
|
+
|
|
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
|
|
+
|
|
+ for_each_bset(b, t) {
|
|
+ bool set_pos = false;
|
|
+
|
|
+ if (node_iter->data[0].end == t->end_offset)
|
|
+ continue;
|
|
+
|
|
+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);
|
|
+
|
|
+ while ((p = bch2_bkey_prev_all(b, t, k2)) &&
|
|
+ bkey_iter_cmp(b, k, p) < 0) {
|
|
+ k2 = p;
|
|
+ set_pos = true;
|
|
+ }
|
|
+
|
|
+ if (set_pos)
|
|
+ btree_node_iter_set_set_pos(node_iter,
|
|
+ b, t, k2);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (!b->c.level &&
|
|
+ node_iter == &iter->l[0].iter &&
|
|
+ iter_current_key_modified)
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_iter_fix(struct btree_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct btree_node_iter *node_iter,
|
|
+ struct bkey_packed *where,
|
|
+ unsigned clobber_u64s,
|
|
+ unsigned new_u64s)
|
|
+{
|
|
+ struct bset_tree *t = bch2_bkey_to_bset(b, where);
|
|
+ struct btree_iter *linked;
|
|
+
|
|
+ if (node_iter != &iter->l[b->c.level].iter) {
|
|
+ __bch2_btree_node_iter_fix(iter, b, node_iter, t,
|
|
+ where, clobber_u64s, new_u64s);
|
|
+
|
|
+ if (bch2_debug_check_iterators)
|
|
+ bch2_btree_node_iter_verify(node_iter, b);
|
|
+ }
|
|
+
|
|
+ trans_for_each_iter_with_node(iter->trans, b, linked) {
|
|
+ __bch2_btree_node_iter_fix(linked, b,
|
|
+ &linked->l[b->c.level].iter, t,
|
|
+ where, clobber_u64s, new_u64s);
|
|
+ bch2_btree_iter_verify_level(linked, b->c.level);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
|
|
+ struct btree_iter_level *l,
|
|
+ struct bkey *u,
|
|
+ struct bkey_packed *k)
|
|
+{
|
|
+ struct bkey_s_c ret;
|
|
+
|
|
+ if (unlikely(!k)) {
|
|
+ /*
|
|
+ * signal to bch2_btree_iter_peek_slot() that we're currently at
|
|
+ * a hole
|
|
+ */
|
|
+ u->type = KEY_TYPE_deleted;
|
|
+ return bkey_s_c_null;
|
|
+ }
|
|
+
|
|
+ ret = bkey_disassemble(l->b, k, u);
|
|
+
|
|
+ if (bch2_debug_check_bkeys)
|
|
+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* peek_all() doesn't skip deleted keys */
|
|
+static inline struct bkey_s_c btree_iter_level_peek_all(struct btree_iter *iter,
|
|
+ struct btree_iter_level *l,
|
|
+ struct bkey *u)
|
|
+{
|
|
+ return __btree_iter_unpack(iter, l, u,
|
|
+ bch2_btree_node_iter_peek_all(&l->iter, l->b));
|
|
+}
|
|
+
|
|
+static inline struct bkey_s_c btree_iter_level_peek(struct btree_iter *iter,
|
|
+ struct btree_iter_level *l)
|
|
+{
|
|
+ struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
|
|
+ bch2_btree_node_iter_peek(&l->iter, l->b));
|
|
+
|
|
+ iter->real_pos = k.k ? k.k->p : l->b->key.k.p;
|
|
+ return k;
|
|
+}
|
|
+
|
|
+static inline struct bkey_s_c btree_iter_level_prev(struct btree_iter *iter,
|
|
+ struct btree_iter_level *l)
|
|
+{
|
|
+ struct bkey_s_c k = __btree_iter_unpack(iter, l, &iter->k,
|
|
+ bch2_btree_node_iter_prev(&l->iter, l->b));
|
|
+
|
|
+ iter->real_pos = k.k ? k.k->p : l->b->data->min_key;
|
|
+ return k;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
|
|
+ struct btree_iter_level *l,
|
|
+ int max_advance)
|
|
+{
|
|
+ struct bkey_packed *k;
|
|
+ int nr_advanced = 0;
|
|
+
|
|
+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
|
|
+ bkey_iter_pos_cmp(l->b, k, &iter->real_pos) < 0) {
|
|
+ if (max_advance > 0 && nr_advanced >= max_advance)
|
|
+ return false;
|
|
+
|
|
+ bch2_btree_node_iter_advance(&l->iter, l->b);
|
|
+ nr_advanced++;
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Verify that iterator for parent node points to child node:
|
|
+ */
|
|
+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
|
|
+{
|
|
+ struct btree_iter_level *l;
|
|
+ unsigned plevel;
|
|
+ bool parent_locked;
|
|
+ struct bkey_packed *k;
|
|
+
|
|
+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
|
|
+ return;
|
|
+
|
|
+ plevel = b->c.level + 1;
|
|
+ if (!btree_iter_node(iter, plevel))
|
|
+ return;
|
|
+
|
|
+ parent_locked = btree_node_locked(iter, plevel);
|
|
+
|
|
+ if (!bch2_btree_node_relock(iter, plevel))
|
|
+ return;
|
|
+
|
|
+ l = &iter->l[plevel];
|
|
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
|
|
+ if (!k ||
|
|
+ bkey_deleted(k) ||
|
|
+ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
|
|
+ char buf1[100];
|
|
+ char buf2[100];
|
|
+ char buf3[100];
|
|
+ char buf4[100];
|
|
+ struct bkey uk = bkey_unpack_key(b, k);
|
|
+
|
|
+ bch2_dump_btree_node(iter->trans->c, l->b);
|
|
+ bch2_bpos_to_text(&PBUF(buf1), iter->real_pos);
|
|
+ bch2_bkey_to_text(&PBUF(buf2), &uk);
|
|
+ bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
|
|
+ bch2_bpos_to_text(&PBUF(buf3), b->data->max_key);
|
|
+ panic("parent iter doesn't point to new node:\n"
|
|
+ "iter pos %s %s\n"
|
|
+ "iter key %s\n"
|
|
+ "new node %s-%s\n",
|
|
+ bch2_btree_ids[iter->btree_id], buf1,
|
|
+ buf2, buf3, buf4);
|
|
+ }
|
|
+
|
|
+ if (!parent_locked)
|
|
+ btree_node_unlock(iter, b->c.level + 1);
|
|
+}
|
|
+
|
|
+static inline void __btree_iter_init(struct btree_iter *iter,
|
|
+ unsigned level)
|
|
+{
|
|
+ struct btree_iter_level *l = &iter->l[level];
|
|
+
|
|
+ bch2_btree_node_iter_init(&l->iter, l->b, &iter->real_pos);
|
|
+
|
|
+ /*
|
|
+ * Iterators to interior nodes should always be pointed at the first non
|
|
+ * whiteout:
|
|
+ */
|
|
+ if (level)
|
|
+ bch2_btree_node_iter_peek(&l->iter, l->b);
|
|
+
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
|
|
+}
|
|
+
|
|
+static inline void btree_iter_node_set(struct btree_iter *iter,
|
|
+ struct btree *b)
|
|
+{
|
|
+ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
|
|
+
|
|
+ btree_iter_verify_new_node(iter, b);
|
|
+
|
|
+ EBUG_ON(!btree_iter_pos_in_node(iter, b));
|
|
+ EBUG_ON(b->c.lock.state.seq & 1);
|
|
+
|
|
+ iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
|
|
+ iter->l[b->c.level].b = b;
|
|
+ __btree_iter_init(iter, b->c.level);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * A btree node is being replaced - update the iterator to point to the new
|
|
+ * node:
|
|
+ */
|
|
+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
|
|
+{
|
|
+ enum btree_node_locked_type t;
|
|
+ struct btree_iter *linked;
|
|
+
|
|
+ trans_for_each_iter(iter->trans, linked)
|
|
+ if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
|
|
+ btree_iter_pos_in_node(linked, b)) {
|
|
+ /*
|
|
+ * bch2_btree_iter_node_drop() has already been called -
|
|
+ * the old node we're replacing has already been
|
|
+ * unlocked and the pointer invalidated
|
|
+ */
|
|
+ BUG_ON(btree_node_locked(linked, b->c.level));
|
|
+
|
|
+ t = btree_lock_want(linked, b->c.level);
|
|
+ if (t != BTREE_NODE_UNLOCKED) {
|
|
+ six_lock_increment(&b->c.lock, t);
|
|
+ mark_btree_node_locked(linked, b->c.level, t);
|
|
+ }
|
|
+
|
|
+ btree_iter_node_set(linked, b);
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
|
|
+{
|
|
+ struct btree_iter *linked;
|
|
+ unsigned level = b->c.level;
|
|
+
|
|
+ trans_for_each_iter(iter->trans, linked)
|
|
+ if (linked->l[level].b == b) {
|
|
+ btree_node_unlock(linked, level);
|
|
+ linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * A btree node has been modified in such a way as to invalidate iterators - fix
|
|
+ * them:
|
|
+ */
|
|
+void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
|
|
+{
|
|
+ struct btree_iter *linked;
|
|
+
|
|
+ trans_for_each_iter_with_node(iter->trans, b, linked)
|
|
+ __btree_iter_init(linked, b->c.level);
|
|
+}
|
|
+
|
|
+static int lock_root_check_fn(struct six_lock *lock, void *p)
|
|
+{
|
|
+ struct btree *b = container_of(lock, struct btree, c.lock);
|
|
+ struct btree **rootp = p;
|
|
+
|
|
+ return b == *rootp ? 0 : -1;
|
|
+}
|
|
+
|
|
+static inline int btree_iter_lock_root(struct btree_iter *iter,
|
|
+ unsigned depth_want,
|
|
+ unsigned long trace_ip)
|
|
+{
|
|
+ struct bch_fs *c = iter->trans->c;
|
|
+ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
|
|
+ enum six_lock_type lock_type;
|
|
+ unsigned i;
|
|
+
|
|
+ EBUG_ON(iter->nodes_locked);
|
|
+
|
|
+ while (1) {
|
|
+ b = READ_ONCE(*rootp);
|
|
+ iter->level = READ_ONCE(b->c.level);
|
|
+
|
|
+ if (unlikely(iter->level < depth_want)) {
|
|
+ /*
|
|
+ * the root is at a lower depth than the depth we want:
|
|
+ * got to the end of the btree, or we're walking nodes
|
|
+ * greater than some depth and there are no nodes >=
|
|
+ * that depth
|
|
+ */
|
|
+ iter->level = depth_want;
|
|
+ for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
|
|
+ iter->l[i].b = NULL;
|
|
+ return 1;
|
|
+ }
|
|
+
|
|
+ lock_type = __btree_lock_want(iter, iter->level);
|
|
+ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
|
|
+ iter, lock_type,
|
|
+ lock_root_check_fn, rootp,
|
|
+ trace_ip)))
|
|
+ return -EINTR;
|
|
+
|
|
+ if (likely(b == READ_ONCE(*rootp) &&
|
|
+ b->c.level == iter->level &&
|
|
+ !race_fault())) {
|
|
+ for (i = 0; i < iter->level; i++)
|
|
+ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
|
|
+ iter->l[iter->level].b = b;
|
|
+ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
|
|
+ iter->l[i].b = NULL;
|
|
+
|
|
+ mark_btree_node_locked(iter, iter->level, lock_type);
|
|
+ btree_iter_node_set(iter, b);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ six_unlock_type(&b->c.lock, lock_type);
|
|
+ }
|
|
+}
|
|
+
|
|
+noinline
|
|
+static void btree_iter_prefetch(struct btree_iter *iter)
|
|
+{
|
|
+ struct bch_fs *c = iter->trans->c;
|
|
+ struct btree_iter_level *l = &iter->l[iter->level];
|
|
+ struct btree_node_iter node_iter = l->iter;
|
|
+ struct bkey_packed *k;
|
|
+ struct bkey_buf tmp;
|
|
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
|
|
+ ? (iter->level > 1 ? 0 : 2)
|
|
+ : (iter->level > 1 ? 1 : 16);
|
|
+ bool was_locked = btree_node_locked(iter, iter->level);
|
|
+
|
|
+ bch2_bkey_buf_init(&tmp);
|
|
+
|
|
+ while (nr) {
|
|
+ if (!bch2_btree_node_relock(iter, iter->level))
|
|
+ break;
|
|
+
|
|
+ bch2_btree_node_iter_advance(&node_iter, l->b);
|
|
+ k = bch2_btree_node_iter_peek(&node_iter, l->b);
|
|
+ if (!k)
|
|
+ break;
|
|
+
|
|
+ bch2_bkey_buf_unpack(&tmp, c, l->b, k);
|
|
+ bch2_btree_node_prefetch(c, iter, tmp.k, iter->btree_id,
|
|
+ iter->level - 1);
|
|
+ }
|
|
+
|
|
+ if (!was_locked)
|
|
+ btree_node_unlock(iter, iter->level);
|
|
+
|
|
+ bch2_bkey_buf_exit(&tmp, c);
|
|
+}
|
|
+
|
|
+static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
|
|
+ unsigned plevel, struct btree *b)
|
|
+{
|
|
+ struct btree_iter_level *l = &iter->l[plevel];
|
|
+ bool locked = btree_node_locked(iter, plevel);
|
|
+ struct bkey_packed *k;
|
|
+ struct bch_btree_ptr_v2 *bp;
|
|
+
|
|
+ if (!bch2_btree_node_relock(iter, plevel))
|
|
+ return;
|
|
+
|
|
+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
|
|
+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);
|
|
+
|
|
+ bp = (void *) bkeyp_val(&l->b->format, k);
|
|
+ bp->mem_ptr = (unsigned long)b;
|
|
+
|
|
+ if (!locked)
|
|
+ btree_node_unlock(iter, plevel);
|
|
+}
|
|
+
|
|
+static __always_inline int btree_iter_down(struct btree_iter *iter,
|
|
+ unsigned long trace_ip)
|
|
+{
|
|
+ struct bch_fs *c = iter->trans->c;
|
|
+ struct btree_iter_level *l = &iter->l[iter->level];
|
|
+ struct btree *b;
|
|
+ unsigned level = iter->level - 1;
|
|
+ enum six_lock_type lock_type = __btree_lock_want(iter, level);
|
|
+ struct bkey_buf tmp;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(!btree_node_locked(iter, iter->level));
|
|
+
|
|
+ bch2_bkey_buf_init(&tmp);
|
|
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
|
|
+ bch2_btree_node_iter_peek(&l->iter, l->b));
|
|
+
|
|
+ b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
|
|
+ ret = PTR_ERR_OR_ZERO(b);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ mark_btree_node_locked(iter, level, lock_type);
|
|
+ btree_iter_node_set(iter, b);
|
|
+
|
|
+ if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
|
|
+ unlikely(b != btree_node_mem_ptr(tmp.k)))
|
|
+ btree_node_mem_ptr_set(iter, level + 1, b);
|
|
+
|
|
+ if (iter->flags & BTREE_ITER_PREFETCH)
|
|
+ btree_iter_prefetch(iter);
|
|
+
|
|
+ iter->level = level;
|
|
+err:
|
|
+ bch2_bkey_buf_exit(&tmp, c);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int btree_iter_traverse_one(struct btree_iter *, unsigned long);
|
|
+
|
|
+static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_iter *iter;
|
|
+ u8 sorted[BTREE_ITER_MAX];
|
|
+ int i, nr_sorted = 0;
|
|
+ bool relock_fail;
|
|
+
|
|
+ if (trans->in_traverse_all)
|
|
+ return -EINTR;
|
|
+
|
|
+ trans->in_traverse_all = true;
|
|
+retry_all:
|
|
+ nr_sorted = 0;
|
|
+ relock_fail = false;
|
|
+
|
|
+ trans_for_each_iter(trans, iter) {
|
|
+ if (!bch2_btree_iter_relock(iter, true))
|
|
+ relock_fail = true;
|
|
+ sorted[nr_sorted++] = iter->idx;
|
|
+ }
|
|
+
|
|
+ if (!relock_fail) {
|
|
+ trans->in_traverse_all = false;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+#define btree_iter_cmp_by_idx(_l, _r) \
|
|
+ btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])
|
|
+
|
|
+ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
|
|
+#undef btree_iter_cmp_by_idx
|
|
+
|
|
+ for (i = nr_sorted - 2; i >= 0; --i) {
|
|
+ struct btree_iter *iter1 = trans->iters + sorted[i];
|
|
+ struct btree_iter *iter2 = trans->iters + sorted[i + 1];
|
|
+
|
|
+ if (iter1->btree_id == iter2->btree_id &&
|
|
+ iter1->locks_want < iter2->locks_want)
|
|
+ __bch2_btree_iter_upgrade(iter1, iter2->locks_want);
|
|
+ else if (!iter1->locks_want && iter2->locks_want)
|
|
+ __bch2_btree_iter_upgrade(iter1, 1);
|
|
+ }
|
|
+
|
|
+ bch2_trans_unlock(trans);
|
|
+ cond_resched();
|
|
+
|
|
+ if (unlikely(ret == -ENOMEM)) {
|
|
+ struct closure cl;
|
|
+
|
|
+ closure_init_stack(&cl);
|
|
+
|
|
+ do {
|
|
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
|
|
+ closure_sync(&cl);
|
|
+ } while (ret);
|
|
+ }
|
|
+
|
|
+ if (unlikely(ret == -EIO)) {
|
|
+ trans->error = true;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ BUG_ON(ret && ret != -EINTR);
|
|
+
|
|
+ /* Now, redo traversals in correct order: */
|
|
+ for (i = 0; i < nr_sorted; i++) {
|
|
+ unsigned idx = sorted[i];
|
|
+
|
|
+ /*
|
|
+ * sucessfully traversing one iterator can cause another to be
|
|
+ * unlinked, in btree_key_cache_fill()
|
|
+ */
|
|
+ if (!(trans->iters_linked & (1ULL << idx)))
|
|
+ continue;
|
|
+
|
|
+ ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
|
|
+ if (ret)
|
|
+ goto retry_all;
|
|
+ }
|
|
+
|
|
+ if (hweight64(trans->iters_live) > 1)
|
|
+ ret = -EINTR;
|
|
+ else
|
|
+ trans_for_each_iter(trans, iter)
|
|
+ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
|
|
+ ret = -EINTR;
|
|
+ break;
|
|
+ }
|
|
+out:
|
|
+ bch2_btree_cache_cannibalize_unlock(c);
|
|
+
|
|
+ trans->in_traverse_all = false;
|
|
+
|
|
+ trace_trans_traverse_all(trans->ip);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
/*
 * Public entry point: run __btree_iter_traverse_all() on @trans with no
 * incoming error.
 */
int bch2_btree_iter_traverse_all(struct btree_trans *trans)
{
	const int no_error = 0;

	return __btree_iter_traverse_all(trans, no_error);
}
|
|
+
|
|
+static inline bool btree_iter_good_node(struct btree_iter *iter,
|
|
+ unsigned l, int check_pos)
|
|
+{
|
|
+ if (!is_btree_node(iter, l) ||
|
|
+ !bch2_btree_node_relock(iter, l))
|
|
+ return false;
|
|
+
|
|
+ if (check_pos < 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
|
|
+ return false;
|
|
+ if (check_pos > 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
|
|
+ return false;
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
|
|
+ int check_pos)
|
|
+{
|
|
+ unsigned l = iter->level;
|
|
+
|
|
+ while (btree_iter_node(iter, l) &&
|
|
+ !btree_iter_good_node(iter, l, check_pos)) {
|
|
+ btree_node_unlock(iter, l);
|
|
+ iter->l[l].b = BTREE_ITER_NO_NODE_UP;
|
|
+ l++;
|
|
+ }
|
|
+
|
|
+ return l;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This is the main state machine for walking down the btree - walks down to a
|
|
+ * specified depth
|
|
+ *
|
|
+ * Returns 0 on success, -EIO on error (error reading in a btree node).
|
|
+ *
|
|
+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is
|
|
+ * stashed in the iterator and returned from bch2_trans_exit().
|
|
+ */
|
|
+static int btree_iter_traverse_one(struct btree_iter *iter,
|
|
+ unsigned long trace_ip)
|
|
+{
|
|
+ unsigned depth_want = iter->level;
|
|
+
|
|
+ /*
|
|
+ * if we need interior nodes locked, call btree_iter_relock() to make
|
|
+ * sure we walk back up enough that we lock them:
|
|
+ */
|
|
+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
|
|
+ iter->locks_want > 1)
|
|
+ bch2_btree_iter_relock(iter, false);
|
|
+
|
|
+ if (btree_iter_type(iter) == BTREE_ITER_CACHED)
|
|
+ return bch2_btree_iter_traverse_cached(iter);
|
|
+
|
|
+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
|
|
+ return 0;
|
|
+
|
|
+ if (unlikely(iter->level >= BTREE_MAX_DEPTH))
|
|
+ return 0;
|
|
+
|
|
+ iter->level = btree_iter_up_until_good_node(iter, 0);
|
|
+
|
|
+ /*
|
|
+ * Note: iter->nodes[iter->level] may be temporarily NULL here - that
|
|
+ * would indicate to other code that we got to the end of the btree,
|
|
+ * here it indicates that relocking the root failed - it's critical that
|
|
+ * btree_iter_lock_root() comes next and that it can't fail
|
|
+ */
|
|
+ while (iter->level > depth_want) {
|
|
+ int ret = btree_iter_node(iter, iter->level)
|
|
+ ? btree_iter_down(iter, trace_ip)
|
|
+ : btree_iter_lock_root(iter, depth_want, trace_ip);
|
|
+ if (unlikely(ret)) {
|
|
+ if (ret == 1)
|
|
+ return 0;
|
|
+
|
|
+ iter->level = depth_want;
|
|
+
|
|
+ if (ret == -EIO) {
|
|
+ iter->flags |= BTREE_ITER_ERROR;
|
|
+ iter->l[iter->level].b =
|
|
+ BTREE_ITER_NO_NODE_ERROR;
|
|
+ } else {
|
|
+ iter->l[iter->level].b =
|
|
+ BTREE_ITER_NO_NODE_DOWN;
|
|
+ }
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ iter->uptodate = BTREE_ITER_NEED_PEEK;
|
|
+
|
|
+ bch2_btree_iter_verify(iter);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
|
|
+{
|
|
+ struct btree_trans *trans = iter->trans;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_trans_cond_resched(trans) ?:
|
|
+ btree_iter_traverse_one(iter, _RET_IP_);
|
|
+ if (unlikely(ret))
|
|
+ ret = __btree_iter_traverse_all(trans, ret);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Note:
|
|
+ * bch2_btree_iter_traverse() is for external users, btree_iter_traverse() is
|
|
+ * for internal btree iterator users
|
|
+ *
|
|
+ * bch2_btree_iter_traverse sets iter->real_pos to iter->pos,
|
|
+ * btree_iter_traverse() does not:
|
|
+ */
|
|
+static inline int __must_check
|
|
+btree_iter_traverse(struct btree_iter *iter)
|
|
+{
|
|
+ return iter->uptodate >= BTREE_ITER_NEED_RELOCK
|
|
+ ? __bch2_btree_iter_traverse(iter)
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+int __must_check
|
|
+bch2_btree_iter_traverse(struct btree_iter *iter)
|
|
+{
|
|
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
|
|
+
|
|
+ return btree_iter_traverse(iter);
|
|
+}
|
|
+
|
|
+/* Iterate across nodes (leaf and interior nodes) */
|
|
+
|
|
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
|
|
+{
|
|
+ struct btree *b;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+
|
|
+ ret = btree_iter_traverse(iter);
|
|
+ if (ret)
|
|
+ return NULL;
|
|
+
|
|
+ b = btree_iter_node(iter, iter->level);
|
|
+ if (!b)
|
|
+ return NULL;
|
|
+
|
|
+ BUG_ON(bpos_cmp(b->key.k.p, iter->pos) < 0);
|
|
+
|
|
+ iter->pos = iter->real_pos = b->key.k.p;
|
|
+
|
|
+ bch2_btree_iter_verify(iter);
|
|
+
|
|
+ return b;
|
|
+}
|
|
+
|
|
+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
|
|
+{
|
|
+ struct btree *b;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+
|
|
+ /* already got to end? */
|
|
+ if (!btree_iter_node(iter, iter->level))
|
|
+ return NULL;
|
|
+
|
|
+ bch2_trans_cond_resched(iter->trans);
|
|
+
|
|
+ btree_node_unlock(iter, iter->level);
|
|
+ iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
|
|
+ iter->level++;
|
|
+
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
|
|
+ ret = btree_iter_traverse(iter);
|
|
+ if (ret)
|
|
+ return NULL;
|
|
+
|
|
+ /* got to end? */
|
|
+ b = btree_iter_node(iter, iter->level);
|
|
+ if (!b)
|
|
+ return NULL;
|
|
+
|
|
+ if (bpos_cmp(iter->pos, b->key.k.p) < 0) {
|
|
+ /*
|
|
+ * Haven't gotten to the end of the parent node: go back down to
|
|
+ * the next child node
|
|
+ */
|
|
+ btree_iter_set_search_pos(iter, bpos_successor(iter->pos));
|
|
+
|
|
+ /* Unlock to avoid screwing up our lock invariants: */
|
|
+ btree_node_unlock(iter, iter->level);
|
|
+
|
|
+ iter->level = iter->min_depth;
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+
|
|
+ ret = btree_iter_traverse(iter);
|
|
+ if (ret)
|
|
+ return NULL;
|
|
+
|
|
+ b = iter->l[iter->level].b;
|
|
+ }
|
|
+
|
|
+ iter->pos = iter->real_pos = b->key.k.p;
|
|
+
|
|
+ bch2_btree_iter_verify(iter);
|
|
+
|
|
+ return b;
|
|
+}
|
|
+
|
|
+/* Iterate across keys (in leaf nodes only) */
|
|
+
|
|
+static void btree_iter_set_search_pos(struct btree_iter *iter, struct bpos new_pos)
|
|
+{
|
|
+ int cmp = bpos_cmp(new_pos, iter->real_pos);
|
|
+ unsigned l = iter->level;
|
|
+
|
|
+ if (!cmp)
|
|
+ goto out;
|
|
+
|
|
+ iter->real_pos = new_pos;
|
|
+
|
|
+ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
|
|
+ btree_node_unlock(iter, 0);
|
|
+ iter->l[0].b = BTREE_ITER_NO_NODE_UP;
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ l = btree_iter_up_until_good_node(iter, cmp);
|
|
+
|
|
+ if (btree_iter_node(iter, l)) {
|
|
+ /*
|
|
+ * We might have to skip over many keys, or just a few: try
|
|
+ * advancing the node iterator, and if we have to skip over too
|
|
+ * many keys just reinit it (or if we're rewinding, since that
|
|
+ * is expensive).
|
|
+ */
|
|
+ if (cmp < 0 ||
|
|
+ !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
|
|
+ __btree_iter_init(iter, l);
|
|
+
|
|
+ /* Don't leave it locked if we're not supposed to: */
|
|
+ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
|
|
+ btree_node_unlock(iter, l);
|
|
+ }
|
|
+out:
|
|
+ if (l != iter->level)
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
|
|
+ else
|
|
+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
|
|
+
|
|
+ bch2_btree_iter_verify(iter);
|
|
+}
|
|
+
|
|
+inline bool bch2_btree_iter_advance(struct btree_iter *iter)
|
|
+{
|
|
+ struct bpos pos = iter->k.p;
|
|
+ bool ret = bpos_cmp(pos, POS_MAX) != 0;
|
|
+
|
|
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
|
|
+ pos = bkey_successor(iter, pos);
|
|
+ bch2_btree_iter_set_pos(iter, pos);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
|
|
+{
|
|
+ struct bpos pos = bkey_start_pos(&iter->k);
|
|
+ bool ret = bpos_cmp(pos, POS_MIN) != 0;
|
|
+
|
|
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
|
|
+ pos = bkey_predecessor(iter, pos);
|
|
+ bch2_btree_iter_set_pos(iter, pos);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
|
|
+{
|
|
+ struct bpos next_pos = iter->l[0].b->key.k.p;
|
|
+ bool ret = bpos_cmp(next_pos, POS_MAX) != 0;
|
|
+
|
|
+ /*
|
|
+ * Typically, we don't want to modify iter->pos here, since that
|
|
+ * indicates where we searched from - unless we got to the end of the
|
|
+ * btree, in that case we want iter->pos to reflect that:
|
|
+ */
|
|
+ if (ret)
|
|
+ btree_iter_set_search_pos(iter, bpos_successor(next_pos));
|
|
+ else
|
|
+ bch2_btree_iter_set_pos(iter, POS_MAX);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
|
|
+{
|
|
+ struct bpos next_pos = iter->l[0].b->data->min_key;
|
|
+ bool ret = bpos_cmp(next_pos, POS_MIN) != 0;
|
|
+
|
|
+ if (ret)
|
|
+ btree_iter_set_search_pos(iter, bpos_predecessor(next_pos));
|
|
+ else
|
|
+ bch2_btree_iter_set_pos(iter, POS_MIN);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct bkey_i *btree_trans_peek_updates(struct btree_trans *trans,
|
|
+ enum btree_id btree_id, struct bpos pos)
|
|
+{
|
|
+ struct btree_insert_entry *i;
|
|
+
|
|
+ trans_for_each_update2(trans, i)
|
|
+ if ((cmp_int(btree_id, i->iter->btree_id) ?:
|
|
+ bkey_cmp(pos, i->k->k.p)) <= 0) {
|
|
+ if (btree_id == i->iter->btree_id)
|
|
+ return i->k;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, bool with_updates)
|
|
+{
|
|
+ struct bpos search_key = btree_iter_search_key(iter);
|
|
+ struct bkey_i *next_update = with_updates
|
|
+ ? btree_trans_peek_updates(iter->trans, iter->btree_id, search_key)
|
|
+ : NULL;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify_entry_exit(iter);
|
|
+
|
|
+ btree_iter_set_search_pos(iter, search_key);
|
|
+
|
|
+ while (1) {
|
|
+ ret = btree_iter_traverse(iter);
|
|
+ if (unlikely(ret))
|
|
+ return bkey_s_c_err(ret);
|
|
+
|
|
+ k = btree_iter_level_peek(iter, &iter->l[0]);
|
|
+
|
|
+ if (next_update &&
|
|
+ bpos_cmp(next_update->k.p, iter->real_pos) <= 0)
|
|
+ k = bkey_i_to_s_c(next_update);
|
|
+
|
|
+ if (likely(k.k)) {
|
|
+ if (bkey_deleted(k.k)) {
|
|
+ btree_iter_set_search_pos(iter,
|
|
+ bkey_successor(iter, k.k->p));
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (!btree_iter_set_pos_to_next_leaf(iter))
|
|
+ return bkey_s_c_null;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * iter->pos should be mononotically increasing, and always be equal to
|
|
+ * the key we just returned - except extents can straddle iter->pos:
|
|
+ */
|
|
+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS))
|
|
+ iter->pos = k.k->p;
|
|
+ else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
|
|
+ iter->pos = bkey_start_pos(k.k);
|
|
+
|
|
+ bch2_btree_iter_verify_entry_exit(iter);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+ return k;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
|
|
+ * current position
|
|
+ */
|
|
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
|
|
+{
|
|
+ return __btree_iter_peek(iter, false);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_iter_next: returns first key greater than iterator's current
|
|
+ * position
|
|
+ */
|
|
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
|
|
+{
|
|
+ if (!bch2_btree_iter_advance(iter))
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ return bch2_btree_iter_peek(iter);
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
|
|
+{
|
|
+ return __btree_iter_peek(iter, true);
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
|
|
+{
|
|
+ if (!bch2_btree_iter_advance(iter))
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ return bch2_btree_iter_peek_with_updates(iter);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_iter_peek_prev: returns first key less than or equal to
|
|
+ * iterator's current position
|
|
+ */
|
|
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
|
|
+{
|
|
+ struct btree_iter_level *l = &iter->l[0];
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify_entry_exit(iter);
|
|
+
|
|
+ btree_iter_set_search_pos(iter, iter->pos);
|
|
+
|
|
+ while (1) {
|
|
+ ret = btree_iter_traverse(iter);
|
|
+ if (unlikely(ret)) {
|
|
+ k = bkey_s_c_err(ret);
|
|
+ goto no_key;
|
|
+ }
|
|
+
|
|
+ k = btree_iter_level_peek(iter, l);
|
|
+ if (!k.k ||
|
|
+ ((iter->flags & BTREE_ITER_IS_EXTENTS)
|
|
+ ? bkey_cmp(bkey_start_pos(k.k), iter->pos) >= 0
|
|
+ : bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0))
|
|
+ k = btree_iter_level_prev(iter, l);
|
|
+
|
|
+ if (likely(k.k))
|
|
+ break;
|
|
+
|
|
+ if (!btree_iter_set_pos_to_prev_leaf(iter)) {
|
|
+ k = bkey_s_c_null;
|
|
+ goto no_key;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0);
|
|
+
|
|
+ /* Extents can straddle iter->pos: */
|
|
+ if (bkey_cmp(k.k->p, iter->pos) < 0)
|
|
+ iter->pos = k.k->p;
|
|
+out:
|
|
+ bch2_btree_iter_verify_entry_exit(iter);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+ return k;
|
|
+no_key:
|
|
+ /*
|
|
+ * btree_iter_level_peek() may have set iter->k to a key we didn't want, and
|
|
+ * then we errored going to the previous leaf - make sure it's
|
|
+ * consistent with iter->pos:
|
|
+ */
|
|
+ bkey_init(&iter->k);
|
|
+ iter->k.p = iter->pos;
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_iter_prev: returns first key less than iterator's current
|
|
+ * position
|
|
+ */
|
|
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
|
|
+{
|
|
+ if (!bch2_btree_iter_rewind(iter))
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ return bch2_btree_iter_peek_prev(iter);
|
|
+}
|
|
+
|
|
+static inline struct bkey_s_c
|
|
+__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
|
|
+{
|
|
+ struct bkey_s_c k;
|
|
+ struct bpos pos, next_start;
|
|
+
|
|
+ /* keys & holes can't span inode numbers: */
|
|
+ if (iter->pos.offset == KEY_OFFSET_MAX) {
|
|
+ if (iter->pos.inode == KEY_INODE_MAX)
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ bch2_btree_iter_set_pos(iter, bkey_successor(iter, iter->pos));
|
|
+ }
|
|
+
|
|
+ pos = iter->pos;
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ iter->pos = pos;
|
|
+
|
|
+ if (bkey_err(k))
|
|
+ return k;
|
|
+
|
|
+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0)
|
|
+ return k;
|
|
+
|
|
+ next_start = k.k ? bkey_start_pos(k.k) : POS_MAX;
|
|
+
|
|
+ bkey_init(&iter->k);
|
|
+ iter->k.p = iter->pos;
|
|
+ bch2_key_resize(&iter->k,
|
|
+ min_t(u64, KEY_SIZE_MAX,
|
|
+ (next_start.inode == iter->pos.inode
|
|
+ ? next_start.offset
|
|
+ : KEY_OFFSET_MAX) -
|
|
+ iter->pos.offset));
|
|
+
|
|
+ EBUG_ON(!iter->k.size);
|
|
+
|
|
+ bch2_btree_iter_verify_entry_exit(iter);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+
|
|
+ return (struct bkey_s_c) { &iter->k, NULL };
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
|
|
+{
|
|
+ struct btree_iter_level *l = &iter->l[0];
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+ bch2_btree_iter_verify_entry_exit(iter);
|
|
+
|
|
+ btree_iter_set_search_pos(iter, btree_iter_search_key(iter));
|
|
+
|
|
+ if (iter->flags & BTREE_ITER_IS_EXTENTS)
|
|
+ return __bch2_btree_iter_peek_slot_extents(iter);
|
|
+
|
|
+ ret = btree_iter_traverse(iter);
|
|
+ if (unlikely(ret))
|
|
+ return bkey_s_c_err(ret);
|
|
+
|
|
+ k = btree_iter_level_peek_all(iter, l, &iter->k);
|
|
+
|
|
+ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);
|
|
+
|
|
+ if (!k.k || bkey_cmp(iter->pos, k.k->p)) {
|
|
+ /* hole */
|
|
+ bkey_init(&iter->k);
|
|
+ iter->k.p = iter->pos;
|
|
+ k = (struct bkey_s_c) { &iter->k, NULL };
|
|
+ }
|
|
+
|
|
+ bch2_btree_iter_verify_entry_exit(iter);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+ return k;
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
|
|
+{
|
|
+ if (!bch2_btree_iter_advance(iter))
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ return bch2_btree_iter_peek_slot(iter);
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter)
|
|
+{
|
|
+ if (!bch2_btree_iter_rewind(iter))
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ return bch2_btree_iter_peek_slot(iter);
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
|
|
+{
|
|
+ struct bkey_cached *ck;
|
|
+ int ret;
|
|
+
|
|
+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
|
|
+ bch2_btree_iter_verify(iter);
|
|
+
|
|
+ ret = btree_iter_traverse(iter);
|
|
+ if (unlikely(ret))
|
|
+ return bkey_s_c_err(ret);
|
|
+
|
|
+ ck = (void *) iter->l[0].b;
|
|
+
|
|
+ EBUG_ON(iter->btree_id != ck->key.btree_id ||
|
|
+ bkey_cmp(iter->pos, ck->key.pos));
|
|
+ BUG_ON(!ck->valid);
|
|
+
|
|
+ return bkey_i_to_s_c(ck->k);
|
|
+}
|
|
+
|
|
+static inline void bch2_btree_iter_init(struct btree_trans *trans,
|
|
+ struct btree_iter *iter, enum btree_id btree_id)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ unsigned i;
|
|
+
|
|
+ iter->trans = trans;
|
|
+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
|
|
+ iter->btree_id = btree_id;
|
|
+ iter->level = 0;
|
|
+ iter->min_depth = 0;
|
|
+ iter->locks_want = 0;
|
|
+ iter->nodes_locked = 0;
|
|
+ iter->nodes_intent_locked = 0;
|
|
+ for (i = 0; i < ARRAY_SIZE(iter->l); i++)
|
|
+ iter->l[i].b = BTREE_ITER_NO_NODE_INIT;
|
|
+
|
|
+ prefetch(c->btree_roots[btree_id].b);
|
|
+}
|
|
+
|
|
+/* new transactional stuff: */
|
|
+
|
|
+static inline void __bch2_trans_iter_free(struct btree_trans *trans,
|
|
+ unsigned idx)
|
|
+{
|
|
+ __bch2_btree_iter_unlock(&trans->iters[idx]);
|
|
+ trans->iters_linked &= ~(1ULL << idx);
|
|
+ trans->iters_live &= ~(1ULL << idx);
|
|
+ trans->iters_touched &= ~(1ULL << idx);
|
|
+}
|
|
+
|
|
+/*
+ * Drop the caller's reference to @iter.
+ *
+ * The iterator is always marked not-live.  Its slot is freed as well unless
+ * it has been touched or carries BTREE_ITER_KEEP_UNTIL_COMMIT.
+ *
+ * NULL and ERR_PTR iterators are accepted and ignored.
+ *
+ * Returns the iterator's error state as reported by btree_iter_err(),
+ * or 0 when @iter was NULL/ERR_PTR.
+ */
+int bch2_trans_iter_put(struct btree_trans *trans,
+			struct btree_iter *iter)
+{
+	int err;
+
+	if (IS_ERR_OR_NULL(iter))
+		return 0;
+
+	/* @iter must be a live member of @trans's iterator array */
+	BUG_ON(trans->iters + iter->idx != iter);
+	BUG_ON(!btree_iter_live(trans, iter));
+
+	/* Sample the error state before the slot is possibly released */
+	err = btree_iter_err(iter);
+
+	if (!((trans->iters_touched & (1ULL << iter->idx)) ||
+	      (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)))
+		__bch2_trans_iter_free(trans, iter->idx);
+
+	trans->iters_live &= ~(1ULL << iter->idx);
+	return err;
+}
|
|
+
|
|
+/*
+ * Like bch2_trans_iter_put(), but first clears the iterator's "touched"
+ * bit so that the put is not held back by it.
+ *
+ * NULL and ERR_PTR iterators are accepted and yield 0.
+ */
+int bch2_trans_iter_free(struct btree_trans *trans,
+			 struct btree_iter *iter)
+{
+	if (!IS_ERR_OR_NULL(iter)) {
+		set_btree_iter_dontneed(trans, iter);
+		return bch2_trans_iter_put(trans, iter);
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Called when every iterator slot of @trans is already linked.
+ *
+ * Dumps each linked iterator (btree, position, live/touched/keep state and
+ * allocation site) and each pending update to the kernel log, then panics.
+ * Does not return.
+ */
+noinline __cold
+static void btree_trans_iter_alloc_fail(struct btree_trans *trans)
+{
+	struct btree_iter *iter;
+	struct btree_insert_entry *i;
+	char pos_buf[100];
+
+	trans_for_each_iter(trans, iter)
+		printk(KERN_ERR "iter: btree %s pos %s%s%s%s %pS\n",
+		       bch2_btree_ids[iter->btree_id],
+		       (bch2_bpos_to_text(&PBUF(pos_buf), iter->pos), pos_buf),
+		       btree_iter_live(trans, iter) ? " live" : "",
+		       (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
+		       iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
+		       (void *) iter->ip_allocated);
+
+	trans_for_each_update(trans, i) {
+		/* Separate, larger buffer: this one holds a whole key + value */
+		char key_buf[300];
+
+		bch2_bkey_val_to_text(&PBUF(key_buf), trans->c, bkey_i_to_s_c(i->k));
+		printk(KERN_ERR "update: btree %s %s\n",
+		       bch2_btree_ids[i->iter->btree_id], key_buf);
+	}
+	panic("trans iter overflow\n");
+}
|
|
+
|
|
+/*
+ * Claim the lowest-numbered unlinked iterator slot of @trans.
+ *
+ * The slot is marked linked and gets its idx set and its flags cleared;
+ * all other fields are left for the caller to initialize.  If every slot
+ * is in use, btree_trans_iter_alloc_fail() is invoked.
+ */
+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
+{
+	/*
+	 * Low BTREE_ITER_MAX bits set; the shift is split in two so that a
+	 * BTREE_ITER_MAX of 64 does not shift by the full width of the type.
+	 */
+	const u64 all_linked = ~((~0ULL << 1) << (BTREE_ITER_MAX - 1));
+	struct btree_iter *iter;
+	unsigned idx;
+
+	if (unlikely(trans->iters_linked == all_linked))
+		btree_trans_iter_alloc_fail(trans);
+
+	idx = __ffs64(~trans->iters_linked);
+	iter = &trans->iters[idx];
+
+	trans->iters_linked |= 1ULL << idx;
+	iter->idx	= idx;
+	iter->flags	= 0;
+
+	return iter;
+}
|
|
+
|
|
+/*
+ * Make @dst a copy of @src.
+ *
+ * @dst keeps its own slot index.  For every level that the copied state
+ * says is locked, the node's lock count is incremented so that @dst holds
+ * its own reference.  The per-iterator commit flags
+ * (BTREE_ITER_KEEP_UNTIL_COMMIT, BTREE_ITER_SET_POS_AFTER_COMMIT) are not
+ * inherited by the copy.
+ */
+static inline void btree_iter_copy(struct btree_iter *dst,
+				   struct btree_iter *src)
+{
+	unsigned i, idx = dst->idx;
+
+	*dst = *src;
+	dst->idx = idx;
+
+	for (i = 0; i < BTREE_MAX_DEPTH; i++)
+		if (btree_node_locked(dst, i))
+			six_lock_increment(&dst->l[i].b->c.lock,
+					   __btree_lock_want(dst, i));
+
+	dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
+	dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
+}
|
|
+
|
|
+/*
+ * Get an iterator in @trans for @btree_id positioned at @pos.
+ *
+ * @flags is normalized first: BTREE_ITER_IS_EXTENTS is added for non-node
+ * iterators on extent btrees (unless NOT_EXTENTS or ALL_SNAPSHOTS was
+ * requested), and BTREE_ITER_ALL_SNAPSHOTS is dropped for non-node
+ * iterators on btrees without snapshots.
+ *
+ * Among the linked iterators of the same type and btree, the one whose
+ * real_pos is closest to the requested search position is picked.  It is
+ * reused directly if nothing else needs it, copied into a new slot if it
+ * must be kept, and a fresh iterator is initialized when there is no
+ * candidate at all.
+ *
+ * The result is marked live and touched, has at least @locks_want (capped
+ * at BTREE_MAX_DEPTH) locks wanted, and sits at level @depth.
+ */
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
+					 unsigned btree_id, struct bpos pos,
+					 unsigned locks_want,
+					 unsigned depth,
+					 unsigned flags)
+{
+	struct btree_iter *iter, *best = NULL;
+	struct bpos real_pos, pos_min = POS_MIN;
+
+	if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+	    btree_node_type_is_extents(btree_id) &&
+	    !(flags & (BTREE_ITER_NOT_EXTENTS|BTREE_ITER_ALL_SNAPSHOTS)))
+		flags |= BTREE_ITER_IS_EXTENTS;
+
+	if ((flags & BTREE_ITER_TYPE) != BTREE_ITER_NODES &&
+	    !btree_type_has_snapshots(btree_id))
+		flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
+
+	if (!(flags & BTREE_ITER_ALL_SNAPSHOTS)) {
+		if (btree_type_has_snapshots(btree_id))
+			pos.snapshot = U32_MAX;
+		else
+			pos.snapshot = 0;
+	}
+
+	/* Extent iterators search from the successor of @pos, except at POS_MAX */
+	real_pos = pos;
+	if ((flags & BTREE_ITER_IS_EXTENTS) &&
+	    bkey_cmp(pos, POS_MAX))
+		real_pos = bpos_nosnap_successor(pos);
+
+	/* Look for the existing iterator closest to where we want to be */
+	trans_for_each_iter(trans, iter) {
+		int cmp;
+
+		if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE) ||
+		    iter->btree_id != btree_id)
+			continue;
+
+		if (!best) {
+			best = iter;
+			continue;
+		}
+
+		cmp = bkey_cmp(bpos_diff(best->real_pos, real_pos),
+			       bpos_diff(iter->real_pos, real_pos));
+
+		/* On a tie, prefer a candidate we are free to take over */
+		if (cmp > 0 ||
+		    (cmp == 0 && !btree_iter_keep(trans, iter)))
+			best = iter;
+	}
+
+	trace_trans_get_iter(_RET_IP_, trans->ip,
+			     btree_id,
+			     &real_pos, locks_want,
+			     best ? &best->real_pos : &pos_min,
+			     best ? best->locks_want : 0,
+			     best ? best->uptodate : BTREE_ITER_NEED_TRAVERSE);
+
+	if (best && !btree_iter_keep(trans, best)) {
+		iter = best;
+	} else {
+		iter = btree_trans_iter_alloc(trans);
+
+		if (best)
+			btree_iter_copy(iter, best);
+		else
+			bch2_btree_iter_init(trans, iter, btree_id);
+	}
+
+	trans->iters_live	|= 1ULL << iter->idx;
+	trans->iters_touched	|= 1ULL << iter->idx;
+
+	iter->flags	= flags;
+	iter->snapshot	= pos.snapshot;
+
+	/*
+	 * If the iterator already wants more locks than were asked for, it is
+	 * deliberately not downgraded here: after a transaction restart caused
+	 * by a btree node split needing to upgrade locks, the same iterator may
+	 * be put and got again.  Downgrading only happens through an explicit
+	 * bch2_trans_downgrade().
+	 */
+	locks_want = min(locks_want, BTREE_MAX_DEPTH);
+	if (locks_want > iter->locks_want) {
+		iter->locks_want = locks_want;
+		btree_iter_get_locks(iter, true, false);
+	}
+
+	/* Walk the iterator to the requested level, dropping nodes on the way */
+	while (iter->level != depth) {
+		btree_node_unlock(iter, iter->level);
+		iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT;
+		iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
+
+		if (iter->level < depth)
+			iter->level++;
+		else
+			iter->level--;
+	}
+
+	iter->min_depth = depth;
+
+	bch2_btree_iter_set_pos(iter, pos);
+	btree_iter_set_search_pos(iter, real_pos);
+
+	return iter;
+}
|
|
+
|
|
+/*
+ * Get an iterator that walks btree nodes (BTREE_ITER_NODES) rather than
+ * keys, at level @depth of @btree_id starting from @pos.
+ *
+ * BTREE_ITER_NOT_EXTENTS and BTREE_ITER_ALL_SNAPSHOTS are always added to
+ * @flags.  The returned iterator is checked to be exactly at @pos, with
+ * locks_want equal to @locks_want capped at BTREE_MAX_DEPTH, and with
+ * level and min_depth equal to @depth.  The caller's return address is
+ * recorded as the allocation site.
+ */
+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
+					    enum btree_id btree_id,
+					    struct bpos pos,
+					    unsigned locks_want,
+					    unsigned depth,
+					    unsigned flags)
+{
+	struct btree_iter *iter;
+
+	flags |= BTREE_ITER_NODES|
+		 BTREE_ITER_NOT_EXTENTS|
+		 BTREE_ITER_ALL_SNAPSHOTS;
+
+	iter = __bch2_trans_get_iter(trans, btree_id, pos,
+				     locks_want, depth, flags);
+
+	BUG_ON(bkey_cmp(iter->pos, pos));
+	BUG_ON(iter->locks_want != min(locks_want, BTREE_MAX_DEPTH));
+	BUG_ON(iter->level	!= depth);
+	BUG_ON(iter->min_depth	!= depth);
+
+	iter->ip_allocated = _RET_IP_;
+	return iter;
+}
|
|
+
|
|
+/*
+ * Allocate a new iterator in @trans and make it a copy of @src.
+ *
+ * The copy is marked live but not touched.
+ */
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
+					  struct btree_iter *src)
+{
+	struct btree_iter *copy = btree_trans_iter_alloc(trans);
+
+	btree_iter_copy(copy, src);
+	trans->iters_live |= 1ULL << copy->idx;
+
+	/*
+	 * Copying is cheap, so there is no point preserving this iterator:
+	 * leaving it untouched makes trans_iter_put() free it right away.
+	 */
+	set_btree_iter_dontneed(trans, copy);
+
+	return copy;
+}
|
|
+
|
|
+/*
+ * Carve @size bytes out of the transaction's bump allocator.
+ *
+ * When the current buffer is too small it is grown to the next power of
+ * two with krealloc(); if that fails and the new size is within
+ * BTREE_TRANS_MEM_MAX, a buffer from the btree_trans_mem_pool mempool is
+ * used instead and the old buffer is freed.
+ *
+ * Returns a pointer into the transaction buffer, ERR_PTR(-ENOMEM) when no
+ * memory could be obtained, or ERR_PTR(-EINTR) when a buffer that already
+ * existed had to be replaced (traced as a transaction restart - presumably
+ * because earlier allocations may have moved; confirm with callers).
+ */
+void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+	size_t needed = trans->mem_top + size;
+	void *ret;
+
+	if (needed > trans->mem_bytes) {
+		size_t prev_bytes	= trans->mem_bytes;
+		size_t grown_bytes	= roundup_pow_of_two(needed);
+		void *grown;
+
+		WARN_ON_ONCE(grown_bytes > BTREE_TRANS_MEM_MAX);
+
+		grown = krealloc(trans->mem, grown_bytes, GFP_NOFS);
+		if (!grown && grown_bytes <= BTREE_TRANS_MEM_MAX) {
+			grown = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+			grown_bytes = BTREE_TRANS_MEM_MAX;
+			kfree(trans->mem);
+		}
+
+		if (!grown)
+			return ERR_PTR(-ENOMEM);
+
+		trans->mem		= grown;
+		trans->mem_bytes	= grown_bytes;
+
+		if (prev_bytes) {
+			trace_trans_restart_mem_realloced(trans->ip, _RET_IP_, grown_bytes);
+			return ERR_PTR(-EINTR);
+		}
+	}
+
+	ret = trans->mem + trans->mem_top;
+	trans->mem_top += size;
+	return ret;
+}
|
|
+
|
|
+/*
+ * Free every iterator slot of @trans that is linked but neither touched
+ * nor live, lowest index first.
+ */
+inline void bch2_trans_unlink_iters(struct btree_trans *trans)
+{
+	u64 unused;
+
+	/* "unused &= unused - 1" clears the lowest set bit on each pass */
+	for (unused = trans->iters_linked &
+		      ~trans->iters_touched &
+		      ~trans->iters_live;
+	     unused;
+	     unused &= unused - 1)
+		__bch2_trans_iter_free(trans, __ffs64(unused));
+}
|
|
+
|
|
+/*
+ * Reset @trans so that it can be (re)run.
+ *
+ * Clears the per-commit flags on every iterator, frees iterators nobody
+ * needs any more, and zeroes the pending updates, the bump allocator
+ * position, hooks, extra journal entries and fs usage deltas.
+ *
+ * @flags:
+ *   TRANS_RESET_NOUNLOCK   - skip the bch2_trans_cond_resched() call
+ *   TRANS_RESET_NOTRAVERSE - skip re-traversing the remaining iterators
+ */
+void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
+{
+	struct btree_iter *iter;
+
+	trans_for_each_iter(trans, iter)
+		iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
+				 BTREE_ITER_SET_POS_AFTER_COMMIT);
+
+	bch2_trans_unlink_iters(trans);
+
+	/* Only iterators that are still live stay marked as touched */
+	trans->iters_touched &= trans->iters_live;
+
+	trans->nr_updates			= 0;
+	trans->nr_updates2			= 0;
+	trans->mem_top				= 0;
+
+	trans->hooks				= NULL;
+	trans->extra_journal_entries		= NULL;
+	trans->extra_journal_entry_u64s		= 0;
+
+	if (trans->fs_usage_deltas) {
+		trans->fs_usage_deltas->used = 0;
+		/* Zero everything between the memset_start/memset_end markers */
+		memset(&trans->fs_usage_deltas->memset_start, 0,
+		       (void *) &trans->fs_usage_deltas->memset_end -
+		       (void *) &trans->fs_usage_deltas->memset_start);
+	}
+
+	if (!(flags & TRANS_RESET_NOUNLOCK))
+		bch2_trans_cond_resched(trans);
+
+	if (trans->iters_linked &&
+	    !(flags & TRANS_RESET_NOTRAVERSE))
+		bch2_btree_iter_traverse_all(trans);
+}
|
|
+
|
|
+/*
+ * Allocate the single buffer backing trans->iters, trans->updates and
+ * trans->updates2, each sized for BTREE_ITER_MAX entries and laid out
+ * back to back in that order.
+ *
+ * In the kernel, the buffer parked in the per-CPU btree_iters_bufs slot is
+ * taken if there is one; otherwise the buffer comes from the
+ * btree_iters_pool mempool.
+ */
+static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
+{
+	size_t iters_bytes	= sizeof(struct btree_iter) * BTREE_ITER_MAX;
+	size_t updates_bytes	= sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
+	void *buf = NULL;
+
+	BUG_ON(trans->used_mempool);
+
+#ifdef __KERNEL__
+	buf = this_cpu_xchg(c->btree_iters_bufs->iter, NULL);
+#endif
+	if (!buf)
+		buf = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
+
+	trans->iters	= buf;
+	trans->updates	= buf + iters_bytes;
+	trans->updates2	= buf + iters_bytes + updates_bytes;
+}
|
|
+
|
|
+/*
+ * Initialize @trans for use on filesystem @c.
+ *
+ * @expected_nr_iters:  not used by this function - the maximum number of
+ *                      iterators is always allocated (see below)
+ * @expected_mem_bytes: if nonzero, the transaction's bump allocator is
+ *                      preallocated with this many bytes, rounded up to a
+ *                      power of two
+ *
+ * Takes an SRCU read lock on c->btree_trans_barrier, and with
+ * CONFIG_BCACHEFS_DEBUG also adds the transaction to c->btree_trans_list.
+ */
+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
+		     unsigned expected_nr_iters,
+		     size_t expected_mem_bytes)
+{
+	memset(trans, 0, sizeof(*trans));
+	trans->c		= c;
+	trans->ip		= _RET_IP_;
+
+	/*
+	 * reallocating iterators currently completely breaks
+	 * bch2_trans_iter_put(), we always allocate the max:
+	 */
+	bch2_trans_alloc_iters(trans, c);
+
+	if (expected_mem_bytes) {
+		trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
+		trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+
+		/*
+		 * The failure branch is the unlikely one; the hint has to wrap
+		 * the whole condition to say so.
+		 *
+		 * NOTE(review): with __GFP_NOFAIL the allocation above is not
+		 * expected to return NULL, so this fallback looks unreachable -
+		 * confirm before relying on it.
+		 */
+		if (unlikely(!trans->mem)) {
+			trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
+			trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+		}
+	}
+
+	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	trans->pid = current->pid;
+	mutex_lock(&c->btree_trans_lock);
+	list_add(&trans->list, &c->btree_trans_list);
+	mutex_unlock(&c->btree_trans_lock);
+#endif
+}
|
|
+
|
|
+/*
+ * Tear down @trans: drop all locks, release the SRCU read lock and the
+ * journal pre-reservation, and give back the memory taken by
+ * bch2_trans_init() and during the transaction.
+ *
+ * With CONFIG_BCACHEFS_DEBUG, iterators that are still live are reported
+ * as leaked (followed by bch2_fatal_error()), and the transaction is
+ * removed from c->btree_trans_list.
+ *
+ * Returns -EIO if trans->error is set, 0 otherwise.
+ */
+int bch2_trans_exit(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+
+	bch2_trans_unlock(trans);
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+	if (trans->iters_live) {
+		struct btree_iter *iter;
+
+		bch_err(c, "btree iterators leaked!");
+		trans_for_each_iter(trans, iter) {
+			if (!btree_iter_live(trans, iter))
+				continue;
+
+			printk(KERN_ERR " btree %s allocated at %pS\n",
+			       bch2_btree_ids[iter->btree_id],
+			       (void *) iter->ip_allocated);
+		}
+		/* Be noisy about this: */
+		bch2_fatal_error(c);
+	}
+
+	mutex_lock(&c->btree_trans_lock);
+	list_del(&trans->list);
+	mutex_unlock(&c->btree_trans_lock);
+#endif
+
+	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+
+	bch2_journal_preres_put(&c->journal, &trans->journal_preres);
+
+	if (trans->fs_usage_deltas) {
+		/*
+		 * NOTE(review): sizeof(trans->fs_usage_deltas) is the size of
+		 * the pointer, not of the structure it points to.  Whether
+		 * this matches how the mempool-backed list is sized at
+		 * allocation time cannot be told from here - confirm against
+		 * the allocator.
+		 */
+		bool from_pool =
+			trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
+			REPLICAS_DELTA_LIST_MAX;
+
+		if (from_pool)
+			mempool_free(trans->fs_usage_deltas,
+				     &c->replicas_delta_pool);
+		else
+			kfree(trans->fs_usage_deltas);
+	}
+
+	/* A buffer of exactly BTREE_TRANS_MEM_MAX bytes is treated as mempool memory */
+	if (trans->mem_bytes != BTREE_TRANS_MEM_MAX)
+		kfree(trans->mem);
+	else
+		mempool_free(trans->mem, &c->btree_trans_mem_pool);
+
+#ifdef __KERNEL__
+	/*
+	 * Userspace doesn't have a real percpu implementation:
+	 */
+	trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
+#endif
+
+	/* In the kernel this is whatever buffer was previously parked per-CPU */
+	if (trans->iters)
+		mempool_free(trans->iters, &c->btree_iters_pool);
+
+	/* Poison the pointers so that use after exit is easy to spot */
+	trans->mem	= (void *) 0x1;
+	trans->iters	= (void *) 0x1;
+
+	return trans->error ? -EIO : 0;
+}
|
|
+
|
|
+/*
+ * Print a one-line description of a locked node to @out: its level, the
+ * name of its btree, and its position as given by btree_node_pos() for an
+ * iterator of @type.
+ */
+static void __maybe_unused
+bch2_btree_iter_node_to_text(struct printbuf *out,
+			     struct btree_bkey_cached_common *_b,
+			     enum btree_iter_type type)
+{
+	const char *btree_name = bch2_btree_ids[_b->btree_id];
+
+	pr_buf(out, " l=%u %s:", _b->level, btree_name);
+	bch2_bpos_to_text(out, btree_node_pos(_b, type));
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
+/*
+ * Does any iterator of @trans other than the BTREE_ITER_CACHED ones hold
+ * a node lock?
+ */
+static bool trans_has_btree_nodes_locked(struct btree_trans *trans)
+{
+	struct btree_iter *iter;
+
+	trans_for_each_iter(trans, iter) {
+		if (btree_iter_type(iter) == BTREE_ITER_CACHED)
+			continue;
+
+		if (iter->nodes_locked)
+			return true;
+	}
+
+	return false;
+}
+#endif
|
|
+
|
|
+/*
+ * Debug dump of the btree transactions of @c that currently hold btree
+ * node locks.
+ *
+ * For each such transaction this prints its pid and creation site, every
+ * iterator with locked nodes (one line per locked level), and - if the
+ * transaction is in the middle of taking a lock - which node it is trying
+ * to lock.
+ *
+ * Produces no output unless CONFIG_BCACHEFS_DEBUG is enabled.
+ */
+void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	struct btree_trans *trans;
+	struct btree_iter *iter;
+	struct btree *b;
+	unsigned l;
+
+	mutex_lock(&c->btree_trans_lock);
+	list_for_each_entry(trans, &c->btree_trans_list, list) {
+		if (!trans_has_btree_nodes_locked(trans))
+			continue;
+
+		pr_buf(out, "%i %ps\n", trans->pid, (void *) trans->ip);
+
+		trans_for_each_iter(trans, iter) {
+			if (!iter->nodes_locked)
+				continue;
+
+			pr_buf(out, " iter %u %c %s:",
+			       iter->idx,
+			       btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
+			       bch2_btree_ids[iter->btree_id]);
+			bch2_bpos_to_text(out, iter->pos);
+			pr_buf(out, "\n");
+
+			for (l = 0; l < BTREE_MAX_DEPTH; l++) {
+				if (!btree_node_locked(iter, l))
+					continue;
+
+				/* "i" for an intent lock, "r" otherwise */
+				pr_buf(out, " %s l=%u ",
+				       btree_node_intent_locked(iter, l) ? "i" : "r", l);
+				bch2_btree_iter_node_to_text(out,
+						(void *) iter->l[l].b,
+						btree_iter_type(iter));
+				pr_buf(out, "\n");
+			}
+		}
+
+		/* Sampled once: the owning thread may update it concurrently */
+		b = READ_ONCE(trans->locking);
+		if (!b)
+			continue;
+
+		iter = &trans->iters[trans->locking_iter_idx];
+		pr_buf(out, " locking iter %u %c l=%u %s:",
+		       trans->locking_iter_idx,
+		       btree_iter_type(iter) == BTREE_ITER_CACHED ? 'c' : 'b',
+		       trans->locking_level,
+		       bch2_btree_ids[trans->locking_btree_id]);
+		bch2_bpos_to_text(out, trans->locking_pos);
+
+		pr_buf(out, " node ");
+		bch2_btree_iter_node_to_text(out,
+				(void *) b,
+				btree_iter_type(iter));
+		pr_buf(out, "\n");
+	}
+	mutex_unlock(&c->btree_trans_lock);
+#endif
+}
|
|
+
|
|
+/*
+ * Release the per-filesystem resources set up by bch2_fs_btree_iter_init():
+ * the two mempools and the btree_trans_barrier SRCU structure.
+ */
+void bch2_fs_btree_iter_exit(struct bch_fs *c)
+{
+	mempool_exit(&c->btree_trans_mem_pool);
+	mempool_exit(&c->btree_iters_pool);
+
+	cleanup_srcu_struct(&c->btree_trans_barrier);
+}
|
|
+
|
|
+/*
+ * Set up the per-filesystem state used by btree transactions: the list of
+ * transactions and its mutex, the btree_trans_barrier SRCU structure, a
+ * mempool holding one iterator + updates buffer for BTREE_ITER_MAX
+ * entries, and a mempool holding one BTREE_TRANS_MEM_MAX sized buffer.
+ *
+ * Returns 0 on success or the error of the first step that failed; later
+ * steps are not attempted after a failure.
+ */
+int bch2_fs_btree_iter_init(struct bch_fs *c)
+{
+	unsigned nr = BTREE_ITER_MAX;
+	int ret;
+
+	INIT_LIST_HEAD(&c->btree_trans_list);
+	mutex_init(&c->btree_trans_lock);
+
+	ret = init_srcu_struct(&c->btree_trans_barrier);
+	if (ret)
+		return ret;
+
+	ret = mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
+			sizeof(struct btree_iter) * nr +
+			sizeof(struct btree_insert_entry) * nr +
+			sizeof(struct btree_insert_entry) * nr);
+	if (ret)
+		return ret;
+
+	return mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
+					 BTREE_TRANS_MEM_MAX);
+}
|
|
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
|
|
new file mode 100644
|
|
index 000000000000..2f63adb9e420
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_iter.h
|
|
@@ -0,0 +1,332 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_ITER_H
|
|
+#define _BCACHEFS_BTREE_ITER_H
|
|
+
|
|
+#include "bset.h"
|
|
+#include "btree_types.h"
|
|
+
|
|
+/*
+ * Raise @iter's uptodate state to @u.  The state only ever moves towards
+ * the numerically larger value; a smaller @u leaves it unchanged.
+ */
+static inline void btree_iter_set_dirty(struct btree_iter *iter,
+					enum btree_iter_uptodate u)
+{
+	if ((unsigned) u > (unsigned) iter->uptodate)
+		iter->uptodate = u;
+}
|
|
+
|
|
+/*
+ * Node slot of @iter at @level, or NULL when @level is not a valid level
+ * (>= BTREE_MAX_DEPTH).
+ */
+static inline struct btree *btree_iter_node(struct btree_iter *iter,
+					    unsigned level)
+{
+	if (level >= BTREE_MAX_DEPTH)
+		return NULL;
+
+	return iter->l[level].b;
+}
|
|
+
|
|
+/*
+ * Does the lock sequence number @iter recorded at @level still match the
+ * current sequence number of @b's lock?
+ */
+static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter,
+					const struct btree *b, unsigned level)
+{
+	/*
+	 * The low bit of the sequence numbers is ignored: @iter might have
+	 * taken a write lock on @b, and the linked iterator must not be
+	 * skipped if the sequence numbers were equal before that write lock
+	 * was taken.  The sequence number is incremented both when a write
+	 * lock is taken and when it is released, and is even when unlocked:
+	 */
+	return (iter->l[level].lock_seq >> 1) == (b->c.lock.state.seq >> 1);
+}
|
|
+
|
|
+/*
+ * The node @iter has at the level directly above @b, or NULL when @b is
+ * already at the top level.
+ */
+static inline struct btree *btree_node_parent(struct btree_iter *iter,
+					      struct btree *b)
+{
+	unsigned parent_level = b->c.level + 1;
+
+	return btree_iter_node(iter, parent_level);
+}
|
|
+
|
|
+/* True when more than one iterator is linked into @trans. */
+static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans)
+{
+	u64 linked = trans->iters_linked;
+
+	/* Clearing the lowest set bit leaves something iff >= 2 bits are set */
+	return (linked & (linked - 1)) != 0;
+}
|
|
+
|
|
+/* -EIO if @iter has BTREE_ITER_ERROR set in its flags, 0 otherwise. */
+static inline int btree_iter_err(const struct btree_iter *iter)
+{
+	if (iter->flags & BTREE_ITER_ERROR)
+		return -EIO;
+
+	return 0;
+}
|
|
+
|
|
+/* Iterate over iters within a transaction: */
|
|
+
|
|
+/*
+ * Find the first linked iterator of @trans whose index is >= @idx.
+ *
+ * Returns NULL when @idx is BTREE_ITER_MAX or no linked iterator remains
+ * at or above @idx.
+ */
+static inline struct btree_iter *
+__trans_next_iter(struct btree_trans *trans, unsigned idx)
+{
+	u64 remaining;
+
+	/* Checked first so that the shift below is never by BTREE_ITER_MAX */
+	if (idx == BTREE_ITER_MAX)
+		return NULL;
+
+	remaining = trans->iters_linked >> idx;
+	if (!remaining)
+		return NULL;
+
+	idx += __ffs64(remaining);
+
+	EBUG_ON(idx >= BTREE_ITER_MAX);
+	EBUG_ON(trans->iters[idx].idx != idx);
+
+	return &trans->iters[idx];
+}
|
|
+
|
|
+/*
+ * trans_for_each_iter - loop over every iterator linked into @_trans
+ *
+ * Iterators are visited in increasing index order; @_iter is NULL once the
+ * loop has run to completion.
+ */
+#define trans_for_each_iter(_trans, _iter)				\
+	for (_iter = __trans_next_iter((_trans), 0);			\
+	     (_iter);							\
+	     _iter = __trans_next_iter((_trans), (_iter)->idx + 1))
|
|
+
|
|
+/*
+ * Does @iter point at @b (at @b's own level) with a lock sequence number
+ * that still matches?
+ */
+static inline bool __iter_has_node(const struct btree_iter *iter,
+				   const struct btree *b)
+{
+	unsigned level = b->c.level;
+
+	if (iter->l[level].b != b)
+		return false;
+
+	return btree_node_lock_seq_matches(iter, b, level);
+}
|
|
+
|
|
+/*
+ * Find the first linked iterator of @trans with index >= @idx that points
+ * at node @b (see __iter_has_node()).  Returns NULL if there is none.
+ */
+static inline struct btree_iter *
+__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b,
+			    unsigned idx)
+{
+	struct btree_iter *iter;
+
+	for (iter = __trans_next_iter(trans, idx);
+	     iter && !__iter_has_node(iter, b);
+	     iter = __trans_next_iter(trans, iter->idx + 1))
+		;
+
+	return iter;
+}
|
|
+
|
|
+/*
+ * trans_for_each_iter_with_node - loop over the iterators linked into
+ * @_trans that point at node @_b
+ *
+ * Iterators are visited in increasing index order; @_iter is NULL once the
+ * loop has run to completion.
+ */
+#define trans_for_each_iter_with_node(_trans, _b, _iter)		\
+	for (_iter = __trans_next_iter_with_node((_trans), (_b), 0);	\
+	     (_iter);							\
+	     _iter = __trans_next_iter_with_node((_trans), (_b),	\
+						 (_iter)->idx + 1))
|
|
+
|
|
+/*
+ * Debug-only consistency checks.  Without CONFIG_BCACHEFS_DEBUG they
+ * compile down to empty inline functions.
+ */
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *);
+void bch2_btree_trans_verify_locks(struct btree_trans *);
+#else
+static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans,
+						 struct btree *b) {}
+static inline void bch2_btree_trans_verify_locks(struct btree_trans *trans) {}
+#endif
|
|
+
|
|
+void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *,
|
|
+ struct bkey_packed *);
|
|
+void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
|
|
+ struct btree_node_iter *, struct bkey_packed *,
|
|
+ unsigned, unsigned);
|
|
+
|
|
+bool bch2_btree_iter_relock(struct btree_iter *, bool);
|
|
+bool bch2_trans_relock(struct btree_trans *);
|
|
+void bch2_trans_unlock(struct btree_trans *);
|
|
+
|
|
+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
|
|
+
|
|
+/*
+ * Make sure @iter wants at least @new_locks_want locks (capped at
+ * BTREE_MAX_DEPTH).
+ *
+ * If more locks are needed than the iterator currently wants, the result
+ * of __bch2_btree_iter_upgrade() is returned.  Otherwise nothing is
+ * changed and the return value says whether the iterator's state is no
+ * worse than BTREE_ITER_NEED_PEEK.
+ */
+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+					   unsigned new_locks_want)
+{
+	new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+	if (iter->locks_want < new_locks_want)
+		return __bch2_btree_iter_upgrade(iter, new_locks_want);
+
+	return iter->uptodate <= BTREE_ITER_NEED_PEEK;
+}
|
|
+
|
|
+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
|
|
+
|
|
+/*
+ * Drop @iter's locks_want to its baseline: 1 for iterators created with
+ * BTREE_ITER_INTENT, 0 otherwise.  Does nothing if the iterator does not
+ * currently want more than that.
+ */
+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+{
+	unsigned baseline = !!(iter->flags & BTREE_ITER_INTENT);
+
+	if (iter->locks_want <= baseline)
+		return;
+
+	__bch2_btree_iter_downgrade(iter, baseline);
+}
|
|
+
|
|
+void bch2_trans_downgrade(struct btree_trans *);
|
|
+
|
|
+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
|
|
+void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
|
|
+
|
|
+void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
|
|
+
|
|
+int __must_check bch2_btree_iter_traverse(struct btree_iter *);
|
|
+
|
|
+int bch2_btree_iter_traverse_all(struct btree_trans *);
|
|
+
|
|
+struct btree *bch2_btree_iter_peek_node(struct btree_iter *);
|
|
+struct btree *bch2_btree_iter_next_node(struct btree_iter *);
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *);
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *);
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *);
|
|
+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *);
|
|
+
|
|
+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *);
|
|
+
|
|
+bool bch2_btree_iter_advance(struct btree_iter *);
|
|
+bool bch2_btree_iter_rewind(struct btree_iter *);
|
|
+
|
|
+/*
+ * Point @iter at @new_pos.
+ *
+ * Unless the iterator was created with BTREE_ITER_ALL_SNAPSHOTS, the
+ * snapshot field of @new_pos is overridden with the iterator's own
+ * snapshot.  The iterator's current key is reset to a zero-size deleted
+ * key at the new position.
+ */
+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
+{
+	if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
+		new_pos.snapshot = iter->snapshot;
+
+	iter->pos.inode		= new_pos.inode;
+	iter->pos.offset	= new_pos.offset;
+	iter->pos.snapshot	= new_pos.snapshot;
+
+	iter->k.type		= KEY_TYPE_deleted;
+	iter->k.size		= 0;
+	iter->k.p.inode		= new_pos.inode;
+	iter->k.p.offset	= new_pos.offset;
+	iter->k.p.snapshot	= new_pos.snapshot;
+}
|
|
+
|
|
+/*
+ * Sort order for locking btree iterators:
+ *
+ * by btree id first, then cached iterators ahead of non-cached ones, then
+ * by real_pos.  Returns <0, 0 or >0 in the manner of a comparison function.
+ */
+static inline int btree_iter_lock_cmp(const struct btree_iter *l,
+				      const struct btree_iter *r)
+{
+	int cmp;
+
+	cmp = cmp_int(l->btree_id, r->btree_id);
+	if (cmp)
+		return cmp;
+
+	/* Negated so that cached iterators sort first */
+	cmp = -cmp_int(btree_iter_is_cached(l), btree_iter_is_cached(r));
+	if (cmp)
+		return cmp;
+
+	return bkey_cmp(l->real_pos, r->real_pos);
+}
|
|
+
|
|
+/*
+ * Reschedule if needed, dropping all of @trans's locks before scheduling.
+ * Note: does not revalidate iterators.
+ *
+ * Returns 0 if nothing had to be done or the locks could be retaken
+ * afterwards, -EINTR if relocking failed.
+ */
+static inline int bch2_trans_cond_resched(struct btree_trans *trans)
+{
+	if (!(need_resched() || race_fault()))
+		return 0;
+
+	bch2_trans_unlock(trans);
+	schedule();
+
+	if (!bch2_trans_relock(trans))
+		return -EINTR;
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * __for_each_btree_node - loop over the nodes of a btree
+ *
+ * Obtains a node iterator from @_trans via bch2_trans_get_node_iter(),
+ * stores it in @_iter, and assigns each node in turn to @_b until
+ * bch2_btree_iter_peek_node()/bch2_btree_iter_next_node() return NULL.
+ *
+ * The iterator is assigned through the @_iter argument, so the caller's
+ * iterator variable may have any name.
+ */
+#define __for_each_btree_node(_trans, _iter, _btree_id, _start,	\
+			      _locks_want, _depth, _flags, _b)		\
+	for ((_iter) = bch2_trans_get_node_iter((_trans), (_btree_id),	\
+				(_start), (_locks_want), (_depth),	\
+				(_flags)),				\
+	     (_b) = bch2_btree_iter_peek_node(_iter);			\
+	     (_b);							\
+	     (_b) = bch2_btree_iter_next_node(_iter))
|
|
+
|
|
+/*
+ * for_each_btree_node - __for_each_btree_node() with 0 passed for both
+ * the locks_want and depth arguments
+ */
+#define for_each_btree_node(_trans, _iter, _btree_id, _start,		\
+			    _flags, _b)					\
+	__for_each_btree_node(_trans, _iter, _btree_id, _start,	\
+			      0, 0, _flags, _b)
|
|
+
|
|
+/*
+ * Peek @iter with the variant selected by @flags: the cached peek for
+ * BTREE_ITER_CACHED iterators, the slot peek when BTREE_ITER_SLOTS is set,
+ * and the plain peek otherwise.
+ */
+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
+						     unsigned flags)
+{
+	if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED)
+		return bch2_btree_iter_peek_cached(iter);
+
+	if (flags & BTREE_ITER_SLOTS)
+		return bch2_btree_iter_peek_slot(iter);
+
+	return bch2_btree_iter_peek(iter);
+}
|
|
+
|
|
+/*
+ * Step @iter to its next entry: the slot variant when BTREE_ITER_SLOTS is
+ * set in @flags, the plain variant otherwise.
+ */
+static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
+						     unsigned flags)
+{
+	if (flags & BTREE_ITER_SLOTS)
+		return bch2_btree_iter_next_slot(iter);
+
+	return bch2_btree_iter_next(iter);
+}
|
|
+
|
|
+/*
+ * Extract the error carried by @k: the errno encoded in its key pointer
+ * if that pointer is an ERR_PTR, 0 otherwise.
+ */
+static inline int bkey_err(struct bkey_s_c k)
+{
+	int err = PTR_ERR_OR_ZERO(k.k);
+
+	return err;
+}
|
|
+
|
|
+/*
+ * for_each_btree_key - loop over the keys of a btree starting at @_start
+ *
+ * Obtains an iterator from @_trans, stores it in @_iter, and assigns each
+ * key in turn to @_k.  The loop stops at the first error key or null key;
+ * @_ret is set on every pass and holds the error (0 if none) when the loop
+ * ends.
+ */
+#define for_each_btree_key(_trans, _iter, _btree_id,			\
+			   _start, _flags, _k, _ret)			\
+	for ((_iter) = bch2_trans_get_iter((_trans), (_btree_id),	\
+					   (_start), (_flags)),		\
+	     (_k) = __bch2_btree_iter_peek(_iter, _flags);		\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     (_k) = __bch2_btree_iter_next(_iter, _flags))
|
|
+
|
|
+/*
+ * for_each_btree_key_continue - like for_each_btree_key(), but carries on
+ * from the current position of an iterator the caller already holds
+ */
+#define for_each_btree_key_continue(_iter, _flags, _k, _ret)		\
+	for ((_k) = __bch2_btree_iter_peek(_iter, _flags);		\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     (_k) = __bch2_btree_iter_next(_iter, _flags))
|
|
+
|
|
+/* new multiple iterator interface: */
|
|
+
|
|
+int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *);
|
|
+int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *);
|
|
+
|
|
+void bch2_trans_unlink_iters(struct btree_trans *);
|
|
+
|
|
+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id,
|
|
+ struct bpos, unsigned,
|
|
+ unsigned, unsigned);
|
|
+
|
|
+/*
+ * Get a key iterator (depth 0) in @trans for @btree_id at @pos.
+ *
+ * One lock is wanted when BTREE_ITER_INTENT is set in @flags, none
+ * otherwise.  The current instruction pointer is recorded as the
+ * iterator's allocation site.
+ */
+static inline struct btree_iter *
+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id,
+		    struct bpos pos, unsigned flags)
+{
+	struct btree_iter *iter;
+
+	iter = __bch2_trans_get_iter(trans, btree_id, pos,
+				     (flags & BTREE_ITER_INTENT) != 0, 0,
+				     flags);
+	iter->ip_allocated = _THIS_IP_;
+
+	return iter;
+}
|
|
+
|
|
+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *,
|
|
+ struct btree_iter *);
|
|
+/*
+ * Duplicate @src into a new iterator of @trans and record the current
+ * instruction pointer as the copy's allocation site.
+ */
+static inline struct btree_iter *
+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src)
+{
+	struct btree_iter *copy = __bch2_trans_copy_iter(trans, src);
+
+	copy->ip_allocated = _THIS_IP_;
+
+	return copy;
+}
|
|
+
|
|
+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *,
|
|
+ enum btree_id, struct bpos,
|
|
+ unsigned, unsigned, unsigned);
|
|
+
|
|
+/* Is @iter's bit set in @trans's bitmap of live iterators? */
+static inline bool btree_iter_live(struct btree_trans *trans, struct btree_iter *iter)
+{
+	u64 bit = 1ULL << iter->idx;
+
+	return (trans->iters_live & bit) != 0;
+}
|
|
+
|
|
+/*
+ * Must @iter be left alone rather than repurposed?  True when it is live
+ * or flagged BTREE_ITER_KEEP_UNTIL_COMMIT.
+ */
+static inline bool btree_iter_keep(struct btree_trans *trans, struct btree_iter *iter)
+{
+	if (btree_iter_live(trans, iter))
+		return true;
+
+	return iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT;
+}
|
|
+
|
|
+/*
+ * Clear @iter's bit in @trans's "touched" bitmap, so that putting the
+ * iterator is not held back by it.
+ */
+static inline void set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter)
+{
+	u64 bit = 1ULL << iter->idx;
+
+	trans->iters_touched &= ~bit;
+}
|
|
+
|
|
+#define TRANS_RESET_NOTRAVERSE (1 << 0) /* bch2_trans_reset(): skip bch2_btree_iter_traverse_all() */
|
|
+#define TRANS_RESET_NOUNLOCK (1 << 1) /* bch2_trans_reset(): skip bch2_trans_cond_resched() */
|
|
+
|
|
+void bch2_trans_reset(struct btree_trans *, unsigned);
|
|
+
|
|
+/*
+ * Begin (or restart) a transaction: bch2_trans_reset() with no flags, i.e.
+ * with neither the reschedule check nor the iterator re-traverse skipped.
+ */
+static inline void bch2_trans_begin(struct btree_trans *trans)
+{
+	/* No "return": ISO C forbids returning an expression from a void function */
+	bch2_trans_reset(trans, 0);
+}
|
|
+
|
|
+void *bch2_trans_kmalloc(struct btree_trans *, size_t);
|
|
+void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t);
|
|
+int bch2_trans_exit(struct btree_trans *);
|
|
+
|
|
+void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *);
|
|
+
|
|
+void bch2_fs_btree_iter_exit(struct bch_fs *);
|
|
+int bch2_fs_btree_iter_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_ITER_H */
|
|
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
|
|
new file mode 100644
|
|
index 000000000000..dfaf5e6df917
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_key_cache.c
|
|
@@ -0,0 +1,740 @@
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "btree_locking.h"
|
|
+#include "btree_update.h"
|
|
+#include "error.h"
|
|
+#include "journal.h"
|
|
+#include "journal_reclaim.h"
|
|
+
|
|
+#include <linux/sched/mm.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+static struct kmem_cache *bch2_key_cache;
|
|
+
|
|
+/*
|
|
+ * rhashtable comparison callback: orders by btree_id first, then by pos.
|
|
+ * Returns 0 when @obj's key equals the lookup key in @arg.
|
|
+ */
|
|
+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg,
|
|
+				       const void *obj)
|
|
+{
|
|
+	const struct bkey_cached_key *want = arg->key;
|
|
+	const struct bkey_cached *have = obj;
|
|
+	int ret = cmp_int(have->key.btree_id, want->btree_id);
|
|
+
|
|
+	if (!ret)
|
|
+		ret = bpos_cmp(have->key.pos, want->pos);
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+static const struct rhashtable_params bch2_btree_key_cache_params = {
|
|
+ .head_offset = offsetof(struct bkey_cached, hash),
|
|
+ .key_offset = offsetof(struct bkey_cached, key),
|
|
+ .key_len = sizeof(struct bkey_cached_key),
|
|
+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Look up the key cache entry for (@btree_id, @pos) in @c's key cache hash
|
|
+ * table; returns whatever rhashtable_lookup_fast() finds.
|
|
+ */
|
|
+__flatten
|
|
+inline struct bkey_cached *
|
|
+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos)
|
|
+{
|
|
+	struct bkey_cached_key search = {
|
|
+		.btree_id	= btree_id,
|
|
+		.pos		= pos,
|
|
+	};
|
|
+	struct rhashtable *table = &c->btree_key_cache.table;
|
|
+
|
|
+	return rhashtable_lookup_fast(table, &search,
|
|
+				      bch2_btree_key_cache_params);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Try to take both the intent and write locks on @ck without blocking.
|
|
+ *
|
|
+ * Returns true with both locks held if they could be taken and the entry is
|
|
+ * not dirty; otherwise returns false with no locks held.
|
|
+ */
|
|
+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck)
|
|
+{
|
|
+	if (!six_trylock_intent(&ck->c.lock))
|
|
+		return false;
|
|
+
|
|
+	if (!six_trylock_write(&ck->c.lock))
|
|
+		goto unlock_intent;
|
|
+
|
|
+	if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
|
|
+		goto unlock_write;
|
|
+
|
|
+	return true;
|
|
+
|
|
+unlock_write:
|
|
+	six_unlock_write(&ck->c.lock);
|
|
+unlock_intent:
|
|
+	six_unlock_intent(&ck->c.lock);
|
|
+	return false;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Remove @ck from @c's hash table (the removal must succeed), overwrite its
|
|
+ * key with all-ones bytes, and drop the nr_keys count.
|
|
+ */
|
|
+static void bkey_cached_evict(struct btree_key_cache *c,
|
|
+			      struct bkey_cached *ck)
|
|
+{
|
|
+	int err = rhashtable_remove_fast(&c->table, &ck->hash,
|
|
+					 bch2_btree_key_cache_params);
|
|
+
|
|
+	BUG_ON(err);
|
|
+
|
|
+	memset(&ck->key, ~0, sizeof(ck->key));
|
|
+	atomic_long_dec(&c->nr_keys);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Retire @ck onto @bc's freed list.
|
|
+ *
|
|
+ * The entry must not be dirty. The cookie returned by
|
|
+ * start_poll_synchronize_srcu() is saved in ck->btree_trans_barrier_seq, the
|
|
+ * entry is moved to the tail of bc->freed, its key buffer is released, and
|
|
+ * finally the write and intent locks on @ck are dropped - so both are
|
|
+ * expected to be held on entry. Every caller in this file holds bc->lock.
|
|
+ */
|
|
+static void bkey_cached_free(struct btree_key_cache *bc,
|
|
+ struct bkey_cached *ck)
|
|
+{
|
|
+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
|
+
|
|
+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags));
|
|
+
|
|
+ ck->btree_trans_barrier_seq =
|
|
+ start_poll_synchronize_srcu(&c->btree_trans_barrier);
|
|
+
|
|
+ list_move_tail(&ck->list, &bc->freed);
|
|
+ bc->nr_freed++;
|
|
+
|
|
+ /* Only the key buffer is released here; @ck itself stays allocated: */
|
|
+ kfree(ck->k);
|
|
+ ck->k = NULL;
|
|
+ ck->u64s = 0;
|
|
+
|
|
+ six_unlock_write(&ck->c.lock);
|
|
+ six_unlock_intent(&ck->c.lock);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Allocate a zeroed bkey_cached from the bch2_key_cache slab.
|
|
+ *
|
|
+ * Returns the new entry with its intent and write locks already taken, or
|
|
+ * NULL if the allocation failed.
|
|
+ */
|
|
+static struct bkey_cached *
|
|
+bkey_cached_alloc(struct btree_key_cache *c)
|
|
+{
|
|
+	struct bkey_cached *ck =
|
|
+		kmem_cache_alloc(bch2_key_cache, GFP_NOFS|__GFP_ZERO);
|
|
+
|
|
+	if (unlikely(!ck))
|
|
+		return NULL;
|
|
+
|
|
+	INIT_LIST_HEAD(&ck->list);
|
|
+	six_lock_init(&ck->c.lock);
|
|
+
|
|
+	/* Nobody else can see a freshly initialised lock, so these must succeed: */
|
|
+	BUG_ON(!six_trylock_intent(&ck->c.lock));
|
|
+	BUG_ON(!six_trylock_write(&ck->c.lock));
|
|
+
|
|
+	return ck;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Find an existing bkey_cached to recycle instead of allocating a new one.
|
|
+ *
|
|
+ * First walks c->freed from the tail, under c->lock, and takes the first
|
|
+ * entry that bkey_cached_lock_for_evict() manages to lock. Failing that,
|
|
+ * walks the hash table under rcu_read_lock() and evicts the first clean
|
|
+ * entry that can be locked.
|
|
+ *
|
|
+ * Returns the entry with its intent and write locks held, or NULL if nothing
|
|
+ * could be reused.
|
|
+ */
|
|
+static struct bkey_cached *
|
|
+bkey_cached_reuse(struct btree_key_cache *c)
|
|
+{
|
|
+ struct bucket_table *tbl;
|
|
+ struct rhash_head *pos;
|
|
+ struct bkey_cached *ck;
|
|
+ unsigned i;
|
|
+
|
|
+ mutex_lock(&c->lock);
|
|
+ list_for_each_entry_reverse(ck, &c->freed, list)
|
|
+ if (bkey_cached_lock_for_evict(ck)) {
|
|
+ c->nr_freed--;
|
|
+ list_del(&ck->list);
|
|
+ mutex_unlock(&c->lock);
|
|
+ return ck;
|
|
+ }
|
|
+ mutex_unlock(&c->lock);
|
|
+
|
|
+ /* Nothing usable on the freed list - try to steal a clean cached key: */
|
|
+ rcu_read_lock();
|
|
+ tbl = rht_dereference_rcu(c->table.tbl, &c->table);
|
|
+ for (i = 0; i < tbl->size; i++)
|
|
+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
|
|
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
|
|
+ bkey_cached_lock_for_evict(ck)) {
|
|
+ bkey_cached_evict(c, ck);
|
|
+ rcu_read_unlock();
|
|
+ return ck;
|
|
+ }
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Create a key cache entry for (@btree_id, @pos) and insert it into the hash
|
|
+ * table.
|
|
+ *
|
|
+ * A fresh entry is allocated if possible; otherwise an existing entry is
|
|
+ * recycled via bkey_cached_reuse().
|
|
+ *
|
|
+ * Returns the new entry with its intent lock still held (the write lock is
|
|
+ * dropped before returning), NULL if the hash table insert failed because
|
|
+ * we raced with another fill, or ERR_PTR(-ENOMEM) if no entry could be
|
|
+ * obtained.
|
|
+ */
|
|
+static struct bkey_cached *
|
|
+btree_key_cache_create(struct btree_key_cache *c,
|
|
+		       enum btree_id btree_id,
|
|
+		       struct bpos pos)
|
|
+{
|
|
+	struct bkey_cached *ck;
|
|
+	bool was_new = true;
|
|
+
|
|
+	ck = bkey_cached_alloc(c);
|
|
+
|
|
+	if (unlikely(!ck)) {
|
|
+		ck = bkey_cached_reuse(c);
|
|
+		if (unlikely(!ck))
|
|
+			return ERR_PTR(-ENOMEM);
|
|
+
|
|
+		was_new = false;
|
|
+	}
|
|
+
|
|
+	ck->c.level		= 0;
|
|
+	ck->c.btree_id		= btree_id;
|
|
+	ck->key.btree_id	= btree_id;
|
|
+	ck->key.pos		= pos;
|
|
+	ck->valid		= false;
|
|
+	ck->flags		= 1U << BKEY_CACHED_ACCESSED;
|
|
+
|
|
+	if (unlikely(rhashtable_lookup_insert_fast(&c->table,
|
|
+					  &ck->hash,
|
|
+					  bch2_btree_key_cache_params))) {
|
|
+		/* We raced with another fill: */
|
|
+
|
|
+		if (likely(was_new)) {
|
|
+			six_unlock_write(&ck->c.lock);
|
|
+			six_unlock_intent(&ck->c.lock);
|
|
+			/*
|
|
+			 * Fresh entries come from the bch2_key_cache slab (see
|
|
+			 * bkey_cached_alloc()), so give them back to that
|
|
+			 * cache rather than to kfree():
|
|
+			 */
|
|
+			kmem_cache_free(bch2_key_cache, ck);
|
|
+		} else {
|
|
+			/* Recycled entries go back on the freed list: */
|
|
+			mutex_lock(&c->lock);
|
|
+			bkey_cached_free(c, ck);
|
|
+			mutex_unlock(&c->lock);
|
|
+		}
|
|
+
|
|
+		return NULL;
|
|
+	}
|
|
+
|
|
+	atomic_long_inc(&c->nr_keys);
|
|
+
|
|
+	six_unlock_write(&ck->c.lock);
|
|
+
|
|
+	return ck;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Populate @ck with the key currently found in the btree at ck->key.pos.
|
|
+ *
|
|
+ * The key is looked up with a BTREE_ITER_SLOTS iterator, @ck_iter is relocked
|
|
+ * at level 0, ck->k is reallocated if it is too small, and the key is then
|
|
+ * copied in under the write lock and the entry marked valid.
|
|
+ *
|
|
+ * Returns 0 on success, the error from the btree lookup, -EINTR if @ck_iter
|
|
+ * could not be relocked, or -ENOMEM if the larger buffer could not be
|
|
+ * allocated.
|
|
+ */
|
|
+static int btree_key_cache_fill(struct btree_trans *trans,
|
|
+ struct btree_iter *ck_iter,
|
|
+ struct bkey_cached *ck)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ unsigned new_u64s = 0;
|
|
+ struct bkey_i *new_k = NULL;
|
|
+ int ret;
|
|
+
|
|
+ iter = bch2_trans_get_iter(trans, ck->key.btree_id,
|
|
+ ck->key.pos, BTREE_ITER_SLOTS);
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (!bch2_btree_node_relock(ck_iter, 0)) {
|
|
+ trace_transaction_restart_ip(trans->ip, _THIS_IP_);
|
|
+ ret = -EINTR;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * bch2_varint_decode can read past the end of the buffer by at
|
|
+ * most 7 bytes (it won't be used):
|
|
+ */
|
|
+ new_u64s = k.k->u64s + 1;
|
|
+
|
|
+ /* Allocate any replacement buffer before taking the write lock: */
|
|
+ if (new_u64s > ck->u64s) {
|
|
+ new_u64s = roundup_pow_of_two(new_u64s);
|
|
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
|
|
+ if (!new_k) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter);
|
|
+ if (new_k) {
|
|
+ kfree(ck->k);
|
|
+ ck->u64s = new_u64s;
|
|
+ ck->k = new_k;
|
|
+ }
|
|
+
|
|
+ bkey_reassemble(ck->k, k);
|
|
+ ck->valid = true;
|
|
+ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter);
|
|
+
|
|
+ /* We're not likely to need this iterator again: */
|
|
+ set_btree_iter_dontneed(trans, iter);
|
|
+err:
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Lock callback passed to btree_node_lock(): returns 0 while the cached key
|
|
+ * owning @lock still has the btree_id and pos of the iterator @p, and -1
|
|
+ * once either differs.
|
|
+ */
|
|
+static int bkey_cached_check_fn(struct six_lock *lock, void *p)
|
|
+{
|
|
+	struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock);
|
|
+	const struct btree_iter *iter = p;
|
|
+
|
|
+	if (ck->key.btree_id != iter->btree_id)
|
|
+		return -1;
|
|
+
|
|
+	if (bpos_cmp(ck->key.pos, iter->pos))
|
|
+		return -1;
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Point @iter at the key cache entry for (iter->btree_id, iter->pos).
|
|
+ *
|
|
+ * If level 0 is already locked, the entry in iter->l[0].b is reused.
|
|
+ * Otherwise the entry is looked up and locked, or created when missing -
|
|
+ * unless BTREE_ITER_CACHED_NOCREATE is set, in which case iter->l[0].b is set
|
|
+ * to NULL and 0 is returned. An entry that is not yet valid is filled from
|
|
+ * the btree unless BTREE_ITER_CACHED_NOFILL is set.
|
|
+ *
|
|
+ * Returns 0 on success, -EINTR when a lock could not be taken or upgraded,
|
|
+ * or another error from entry creation or fill. On errors other than -EINTR
|
|
+ * the level 0 lock is dropped and the iterator is flagged BTREE_ITER_ERROR.
|
|
+ */
|
|
+__flatten
|
|
+int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
|
|
+{
|
|
+ struct btree_trans *trans = iter->trans;
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct bkey_cached *ck;
|
|
+ int ret = 0;
|
|
+
|
|
+ BUG_ON(iter->level);
|
|
+
|
|
+ if (btree_node_locked(iter, 0)) {
|
|
+ ck = (void *) iter->l[0].b;
|
|
+ goto fill;
|
|
+ }
|
|
+retry:
|
|
+ ck = bch2_btree_key_cache_find(c, iter->btree_id, iter->pos);
|
|
+ if (!ck) {
|
|
+ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) {
|
|
+ iter->l[0].b = NULL;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ ck = btree_key_cache_create(&c->btree_key_cache,
|
|
+ iter->btree_id, iter->pos);
|
|
+ ret = PTR_ERR_OR_ZERO(ck);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ /* NULL means we lost an insert race - look the entry up again: */
|
|
+ if (!ck)
|
|
+ goto retry;
|
|
+
|
|
+ /* btree_key_cache_create() returns with the intent lock held: */
|
|
+ mark_btree_node_locked(iter, 0, SIX_LOCK_intent);
|
|
+ iter->locks_want = 1;
|
|
+ } else {
|
|
+ enum six_lock_type lock_want = __btree_lock_want(iter, 0);
|
|
+
|
|
+ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want,
|
|
+ bkey_cached_check_fn, iter, _THIS_IP_)) {
|
|
+ /* Entry now describes a different key: look up again */
|
|
+ if (ck->key.btree_id != iter->btree_id ||
|
|
+ bpos_cmp(ck->key.pos, iter->pos)) {
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ trace_transaction_restart_ip(trans->ip, _THIS_IP_);
|
|
+ ret = -EINTR;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /* Recheck the key now that we hold the lock: */
|
|
+ if (ck->key.btree_id != iter->btree_id ||
|
|
+ bpos_cmp(ck->key.pos, iter->pos)) {
|
|
+ six_unlock_type(&ck->c.lock, lock_want);
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ mark_btree_node_locked(iter, 0, lock_want);
|
|
+ }
|
|
+
|
|
+ iter->l[0].lock_seq = ck->c.lock.state.seq;
|
|
+ iter->l[0].b = (void *) ck;
|
|
+fill:
|
|
+ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) {
|
|
+ /* Filling requires the intent lock: */
|
|
+ if (!btree_node_intent_locked(iter, 0))
|
|
+ bch2_btree_iter_upgrade(iter, 1);
|
|
+ if (!btree_node_intent_locked(iter, 0)) {
|
|
+ trace_transaction_restart_ip(trans->ip, _THIS_IP_);
|
|
+ ret = -EINTR;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = btree_key_cache_fill(trans, iter, ck);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
|
|
+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags);
|
|
+
|
|
+ iter->uptodate = BTREE_ITER_NEED_PEEK;
|
|
+
|
|
+ if (!(iter->flags & BTREE_ITER_INTENT))
|
|
+ bch2_btree_iter_downgrade(iter);
|
|
+ else if (!iter->locks_want) {
|
|
+ if (!__bch2_btree_iter_upgrade(iter, 1))
|
|
+ ret = -EINTR;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+err:
|
|
+ if (ret != -EINTR) {
|
|
+ btree_node_unlock(iter, 0);
|
|
+ iter->flags |= BTREE_ITER_ERROR;
|
|
+ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR;
|
|
+ }
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Write the cached key at @key back to the btree, optionally evicting it.
|
|
+ *
|
|
+ * @journal_seq: if nonzero, nothing is done unless the entry's journal pin
|
|
+ * is at this sequence number
|
|
+ * @commit_flags: extra flags ORed into the bch2_trans_commit() flags
|
|
+ * @evict: also remove the entry from the hash table and move it to the
|
|
+ * freed list
|
|
+ *
|
|
+ * The cache iterator is opened with NOFILL and NOCREATE, so a missing entry
|
|
+ * simply results in a 0 return. -EINTR from any step restarts from the
|
|
+ * traverse; -EAGAIN is handed back to the caller; any other error is
|
|
+ * reported through bch2_fs_fatal_err_on() unless the journal is already in
|
|
+ * an error state.
|
|
+ */
|
|
+static int btree_key_cache_flush_pos(struct btree_trans *trans,
|
|
+ struct bkey_cached_key key,
|
|
+ u64 journal_seq,
|
|
+ unsigned commit_flags,
|
|
+ bool evict)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct journal *j = &c->journal;
|
|
+ struct btree_iter *c_iter = NULL, *b_iter = NULL;
|
|
+ struct bkey_cached *ck = NULL;
|
|
+ int ret;
|
|
+
|
|
+ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
|
|
+ BTREE_ITER_SLOTS|
|
|
+ BTREE_ITER_INTENT);
|
|
+ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos,
|
|
+ BTREE_ITER_CACHED|
|
|
+ BTREE_ITER_CACHED_NOFILL|
|
|
+ BTREE_ITER_CACHED_NOCREATE|
|
|
+ BTREE_ITER_INTENT);
|
|
+retry:
|
|
+ ret = bch2_btree_iter_traverse(c_iter);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ck = (void *) c_iter->l[0].b;
|
|
+ if (!ck ||
|
|
+ (journal_seq && ck->journal.seq != journal_seq))
|
|
+ goto out;
|
|
+
|
|
+ /* Clean entry: nothing to write back, but it may still need evicting */
|
|
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
|
|
+ if (!evict)
|
|
+ goto out;
|
|
+ goto evict;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Since journal reclaim depends on us making progress here, and the
|
|
+ * allocator/copygc depend on journal reclaim making progress, we need
|
|
+ * to be using alloc reserves:
|
|
+ * */
|
|
+ ret = bch2_btree_iter_traverse(b_iter) ?:
|
|
+ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?:
|
|
+ bch2_trans_commit(trans, NULL, NULL,
|
|
+ BTREE_INSERT_NOUNLOCK|
|
|
+ BTREE_INSERT_NOCHECK_RW|
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_USE_RESERVE|
|
|
+ (ck->journal.seq == journal_last_seq(j)
|
|
+ ? BTREE_INSERT_JOURNAL_RESERVED
|
|
+ : 0)|
|
|
+ commit_flags);
|
|
+err:
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+
|
|
+ if (ret == -EAGAIN)
|
|
+ goto out;
|
|
+
|
|
+ if (ret) {
|
|
+ bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
|
|
+ "error flushing key cache: %i", ret);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ /* The key is in the btree now: release its journal pin and reservation */
|
|
+ bch2_journal_pin_drop(j, &ck->journal);
|
|
+ bch2_journal_preres_put(j, &ck->res);
|
|
+
|
|
+ BUG_ON(!btree_node_locked(c_iter, 0));
|
|
+
|
|
+ if (!evict) {
|
|
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
|
|
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
|
|
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
|
|
+ }
|
|
+ } else {
|
|
+evict:
|
|
+ BUG_ON(!btree_node_intent_locked(c_iter, 0));
|
|
+
|
|
+ /*
|
|
+ * Detach the entry from the iterator without unlocking it: the
|
|
+ * intent lock is released by bkey_cached_free() below.
|
|
+ */
|
|
+ mark_btree_node_unlocked(c_iter, 0);
|
|
+ c_iter->l[0].b = NULL;
|
|
+
|
|
+ six_lock_write(&ck->c.lock, NULL, NULL);
|
|
+
|
|
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
|
|
+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
|
|
+ atomic_long_dec(&c->btree_key_cache.nr_dirty);
|
|
+ }
|
|
+
|
|
+ bkey_cached_evict(&c->btree_key_cache, ck);
|
|
+
|
|
+ mutex_lock(&c->btree_key_cache.lock);
|
|
+ bkey_cached_free(&c->btree_key_cache, ck);
|
|
+ mutex_unlock(&c->btree_key_cache.lock);
|
|
+ }
|
|
+out:
|
|
+ bch2_trans_iter_put(trans, b_iter);
|
|
+ bch2_trans_iter_put(trans, c_iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Journal pin flush callback for key cache entries; it is the callback
|
|
+ * bch2_btree_insert_key_cached() hands to bch2_journal_pin_update().
|
|
+ *
|
|
+ * Copies the entry's key under a read lock and, if the pin is still at @seq
|
|
+ * and the entry is still dirty, flushes that key with
|
|
+ * BTREE_INSERT_JOURNAL_RECLAIM and without evicting it. An SRCU read lock on
|
|
+ * c->btree_trans_barrier is held for the whole call.
|
|
+ *
|
|
+ * Returns 0 if there was nothing to do, otherwise the result of the flush.
|
|
+ */
|
|
+int bch2_btree_key_cache_journal_flush(struct journal *j,
|
|
+ struct journal_entry_pin *pin, u64 seq)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bkey_cached *ck =
|
|
+ container_of(pin, struct bkey_cached, journal);
|
|
+ struct bkey_cached_key key;
|
|
+ struct btree_trans trans;
|
|
+ int ret = 0;
|
|
+
|
|
+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
|
|
+
|
|
+ six_lock_read(&ck->c.lock, NULL, NULL);
|
|
+ key = ck->key;
|
|
+
|
|
+ if (ck->journal.seq != seq ||
|
|
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
|
|
+ six_unlock_read(&ck->c.lock);
|
|
+ goto unlock;
|
|
+ }
|
|
+ six_unlock_read(&ck->c.lock);
|
|
+
|
|
+ /* From here on only the copied key is used, not @ck itself: */
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+ ret = btree_key_cache_flush_pos(&trans, key, seq,
|
|
+ BTREE_INSERT_JOURNAL_RECLAIM, false);
|
|
+ bch2_trans_exit(&trans);
|
|
+unlock:
|
|
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Flush the key at (@id, @pos) out of the key cache and evict it.
|
|
+ *
|
|
+ * Returns 0 straight away when the cache has no entry for the position,
|
|
+ * otherwise the result of btree_key_cache_flush_pos().
|
|
+ */
|
|
+int bch2_btree_key_cache_flush(struct btree_trans *trans,
|
|
+			       enum btree_id id, struct bpos pos)
|
|
+{
|
|
+	struct bkey_cached_key key = { id, pos };
|
|
+
|
|
+	/* Slowpath only when the lookup finds something to flush: */
|
|
+	if (bch2_btree_key_cache_find(trans->c, id, pos))
|
|
+		return btree_key_cache_flush_pos(trans, key, 0, 0, true);
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Store @insert in the key cache entry @iter points at (iter->l[0].b).
|
|
+ *
|
|
+ * The entry's buffer must already be large enough for @insert. Outside of
|
|
+ * journal replay, any additional journal pre-reservation the entry needs is
|
|
+ * moved over from @trans. The entry is marked valid and dirty, its journal
|
|
+ * pin is updated to trans->journal_res.seq, and journal reclaim is kicked
|
|
+ * when bch2_nr_btree_keys_need_flush() reports a nonzero count after the
|
|
+ * entry first becomes dirty.
|
|
+ *
|
|
+ * Always returns true.
|
|
+ */
|
|
+bool bch2_btree_insert_key_cached(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_i *insert)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct bkey_cached *ck = (void *) iter->l[0].b;
|
|
+ bool kick_reclaim = false;
|
|
+
|
|
+ BUG_ON(insert->u64s > ck->u64s);
|
|
+
|
|
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
|
|
+ int difference;
|
|
+
|
|
+ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s);
|
|
+
|
|
+ /* Only ever grow the entry's reservation, never shrink it: */
|
|
+ difference = jset_u64s(insert->u64s) - ck->res.u64s;
|
|
+ if (difference > 0) {
|
|
+ trans->journal_preres.u64s -= difference;
|
|
+ ck->res.u64s += difference;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bkey_copy(ck->k, insert);
|
|
+ ck->valid = true;
|
|
+
|
|
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
|
|
+ set_bit(BKEY_CACHED_DIRTY, &ck->flags);
|
|
+ atomic_long_inc(&c->btree_key_cache.nr_dirty);
|
|
+
|
|
+ if (bch2_nr_btree_keys_need_flush(c))
|
|
+ kick_reclaim = true;
|
|
+ }
|
|
+
|
|
+ bch2_journal_pin_update(&c->journal, trans->journal_res.seq,
|
|
+ &ck->journal, bch2_btree_key_cache_journal_flush);
|
|
+
|
|
+ if (kick_reclaim)
|
|
+ journal_reclaim_kick(&c->journal);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
|
|
+ enum btree_id id, struct bpos pos)
|
|
+{
|
|
+ BUG_ON(bch2_btree_key_cache_find(trans->c, id, pos));
|
|
+}
|
|
+#endif
|
|
+
|
|
+/*
|
|
+ * Shrinker scan callback for the key cache.
|
|
+ *
|
|
+ * Phase one frees entries on bc->freed, oldest first, for as long as
|
|
+ * poll_state_synchronize_srcu() reports that their saved grace period cookie
|
|
+ * has completed. Phase two walks the hash table starting at bc->shrink_iter:
|
|
+ * dirty entries are skipped, entries with BKEY_CACHED_ACCESSED set merely
|
|
+ * have the bit cleared, and the remaining entries that can be locked are
|
|
+ * evicted onto the freed list.
|
|
+ *
|
|
+ * Returns the number of entries released in phase one (evictions in phase
|
|
+ * two are not counted), or -1 when bc->lock could not be taken without
|
|
+ * blocking and the caller's gfp_mask lacks __GFP_FS.
|
|
+ */
|
|
+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
|
|
+ struct shrink_control *sc)
|
|
+{
|
|
+ struct bch_fs *c = container_of(shrink, struct bch_fs,
|
|
+ btree_key_cache.shrink);
|
|
+ struct btree_key_cache *bc = &c->btree_key_cache;
|
|
+ struct bucket_table *tbl;
|
|
+ struct bkey_cached *ck, *t;
|
|
+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan;
|
|
+ unsigned start, flags;
|
|
+ int srcu_idx;
|
|
+
|
|
+ /* Return -1 if we can't do anything right now */
|
|
+ if (sc->gfp_mask & __GFP_FS)
|
|
+ mutex_lock(&bc->lock);
|
|
+ else if (!mutex_trylock(&bc->lock))
|
|
+ return -1;
|
|
+
|
|
+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
|
|
+ flags = memalloc_nofs_save();
|
|
+
|
|
+ /*
|
|
+ * Newest freed entries are at the end of the list - once we hit one
|
|
+ * that's too new to be freed, we can bail out:
|
|
+ */
|
|
+ list_for_each_entry_safe(ck, t, &bc->freed, list) {
|
|
+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier,
|
|
+ ck->btree_trans_barrier_seq))
|
|
+ break;
|
|
+
|
|
+ list_del(&ck->list);
|
|
+ kmem_cache_free(bch2_key_cache, ck);
|
|
+ bc->nr_freed--;
|
|
+ scanned++;
|
|
+ freed++;
|
|
+ }
|
|
+
|
|
+ if (scanned >= nr)
|
|
+ goto out;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
|
|
+ if (bc->shrink_iter >= tbl->size)
|
|
+ bc->shrink_iter = 0;
|
|
+ start = bc->shrink_iter;
|
|
+
|
|
+ do {
|
|
+ struct rhash_head *pos, *next;
|
|
+
|
|
+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter));
|
|
+
|
|
+ while (!rht_is_a_nulls(pos)) {
|
|
+ /* Fetch the successor first: @ck may be evicted below */
|
|
+ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter);
|
|
+ ck = container_of(pos, struct bkey_cached, hash);
|
|
+
|
|
+ /* Dirty entries are skipped and not counted as scanned: */
|
|
+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags))
|
|
+ goto next;
|
|
+
|
|
+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags))
|
|
+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags);
|
|
+ else if (bkey_cached_lock_for_evict(ck)) {
|
|
+ bkey_cached_evict(bc, ck);
|
|
+ bkey_cached_free(bc, ck);
|
|
+ }
|
|
+
|
|
+ scanned++;
|
|
+ if (scanned >= nr)
|
|
+ break;
|
|
+next:
|
|
+ pos = next;
|
|
+ }
|
|
+
|
|
+ bc->shrink_iter++;
|
|
+ if (bc->shrink_iter >= tbl->size)
|
|
+ bc->shrink_iter = 0;
|
|
+ } while (scanned < nr && bc->shrink_iter != start);
|
|
+
|
|
+ rcu_read_unlock();
|
|
+out:
|
|
+ memalloc_nofs_restore(flags);
|
|
+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
|
|
+ mutex_unlock(&bc->lock);
|
|
+
|
|
+ return freed;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Shrinker count callback: reports nr_keys minus nr_dirty, clamped so that
|
|
+ * it never goes below zero.
|
|
+ */
|
|
+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
|
|
+						struct shrink_control *sc)
|
|
+{
|
|
+	struct bch_fs *c = container_of(shrink, struct bch_fs,
|
|
+					btree_key_cache.shrink);
|
|
+	struct btree_key_cache *bc = &c->btree_key_cache;
|
|
+	long clean = atomic_long_read(&bc->nr_keys) -
|
|
+		atomic_long_read(&bc->nr_dirty);
|
|
+
|
|
+	if (clean < 0)
|
|
+		return 0;
|
|
+
|
|
+	return clean;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Tear down @bc: unregister the shrinker if it was registered, evict every
|
|
+ * entry still in the hash table onto the freed list, release everything on
|
|
+ * the freed list (journal pin, journal pre-reservation, key buffer and the
|
|
+ * entry itself), and destroy the hash table.
|
|
+ *
|
|
+ * The hash table is only touched when table_init_done is set, so this may be
|
|
+ * called after bch2_fs_btree_key_cache_init() failed or never ran.
|
|
+ */
|
|
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
|
|
+{
|
|
+	struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
|
|
+	struct bucket_table *tbl;
|
|
+	struct bkey_cached *ck, *n;
|
|
+	struct rhash_head *pos;
|
|
+	unsigned i;
|
|
+
|
|
+	if (bc->shrink.list.next)
|
|
+		unregister_shrinker(&bc->shrink);
|
|
+
|
|
+	mutex_lock(&bc->lock);
|
|
+
|
|
+	/*
|
|
+	 * There is no bucket table to walk until rhashtable_init() has
|
|
+	 * succeeded, so don't dereference it before then:
|
|
+	 */
|
|
+	if (bc->table_init_done) {
|
|
+		rcu_read_lock();
|
|
+		tbl = rht_dereference_rcu(bc->table.tbl, &bc->table);
|
|
+		for (i = 0; i < tbl->size; i++)
|
|
+			rht_for_each_entry_rcu(ck, pos, tbl, i, hash) {
|
|
+				bkey_cached_evict(bc, ck);
|
|
+				list_add(&ck->list, &bc->freed);
|
|
+			}
|
|
+		rcu_read_unlock();
|
|
+	}
|
|
+
|
|
+	list_for_each_entry_safe(ck, n, &bc->freed, list) {
|
|
+		cond_resched();
|
|
+
|
|
+		bch2_journal_pin_drop(&c->journal, &ck->journal);
|
|
+		bch2_journal_preres_put(&c->journal, &ck->res);
|
|
+
|
|
+		list_del(&ck->list);
|
|
+		kfree(ck->k);
|
|
+		kmem_cache_free(bch2_key_cache, ck);
|
|
+	}
|
|
+
|
|
+	/*
|
|
+	 * Dirty keys left at this point are only tolerated if the journal
|
|
+	 * errored out or the filesystem was never read-write:
|
|
+	 */
|
|
+	BUG_ON(atomic_long_read(&bc->nr_dirty) &&
|
|
+	       !bch2_journal_error(&c->journal) &&
|
|
+	       test_bit(BCH_FS_WAS_RW, &c->flags));
|
|
+	BUG_ON(atomic_long_read(&bc->nr_keys));
|
|
+
|
|
+	mutex_unlock(&bc->lock);
|
|
+
|
|
+	if (bc->table_init_done)
|
|
+		rhashtable_destroy(&bc->table);
|
|
+}
|
|
+
|
|
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
|
|
+{
|
|
+ mutex_init(&c->lock);
|
|
+ INIT_LIST_HEAD(&c->freed);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Initialise @c's hash table and register its shrinker.
|
|
+ *
|
|
+ * Returns 0 on success, or the error from rhashtable_init() or
|
|
+ * register_shrinker().
|
|
+ */
|
|
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *c)
|
|
+{
|
|
+	int err = rhashtable_init(&c->table, &bch2_btree_key_cache_params);
|
|
+
|
|
+	if (!err) {
|
|
+		/* Remembered so that teardown knows the table exists: */
|
|
+		c->table_init_done = true;
|
|
+
|
|
+		c->shrink.seeks		= 1;
|
|
+		c->shrink.count_objects	= bch2_btree_key_cache_count;
|
|
+		c->shrink.scan_objects	= bch2_btree_key_cache_scan;
|
|
+		err = register_shrinker(&c->shrink);
|
|
+	}
|
|
+
|
|
+	return err;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Print @c's nr_freed, nr_keys and nr_dirty counters to @out, one per line.
|
|
+ *
|
|
+ * atomic_long_read() returns long, so those two counters are printed with
|
|
+ * %ld; %zu is kept for nr_freed (assumed to be a size_t - TODO confirm
|
|
+ * against the struct definition).
|
|
+ */
|
|
+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c)
|
|
+{
|
|
+	pr_buf(out, "nr_freed:\t%zu\n",	c->nr_freed);
|
|
+	pr_buf(out, "nr_keys:\t%ld\n",	atomic_long_read(&c->nr_keys));
|
|
+	pr_buf(out, "nr_dirty:\t%ld\n",	atomic_long_read(&c->nr_dirty));
|
|
+}
|
|
+
|
|
+/* Destroy the bch2_key_cache slab cache, if it was ever created. */
|
|
+void bch2_btree_key_cache_exit(void)
|
|
+{
|
|
+	if (!bch2_key_cache)
|
|
+		return;
|
|
+
|
|
+	kmem_cache_destroy(bch2_key_cache);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Create the slab cache that bkey_cached entries are allocated from.
|
|
+ * Returns 0 on success or -ENOMEM if the cache could not be created.
|
|
+ */
|
|
+int __init bch2_btree_key_cache_init(void)
|
|
+{
|
|
+	bch2_key_cache = KMEM_CACHE(bkey_cached, 0);
|
|
+
|
|
+	return bch2_key_cache ? 0 : -ENOMEM;
|
|
+}
|
|
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
|
|
new file mode 100644
|
|
index 000000000000..7e2b0a08f745
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_key_cache.h
|
|
@@ -0,0 +1,53 @@
|
|
+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H
|
|
+#define _BCACHEFS_BTREE_KEY_CACHE_H
|
|
+
|
|
+/*
|
|
+ * Number of dirty cached keys above the threshold of 1024 plus half of all
|
|
+ * cached keys; 0 when the dirty count is at or below that threshold.
|
|
+ */
|
|
+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c)
|
|
+{
|
|
+	size_t dirty	= atomic_long_read(&c->btree_key_cache.nr_dirty);
|
|
+	size_t keys	= atomic_long_read(&c->btree_key_cache.nr_keys);
|
|
+	size_t limit	= 1024 + keys / 2;
|
|
+	ssize_t excess	= dirty - limit;
|
|
+
|
|
+	return excess > 0 ? excess : 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * True when the dirty count exceeds 4096 plus three quarters of all cached
|
|
+ * keys and the JOURNAL_RECLAIM_STARTED journal flag is set.
|
|
+ */
|
|
+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c)
|
|
+{
|
|
+	size_t dirty	= atomic_long_read(&c->btree_key_cache.nr_dirty);
|
|
+	size_t keys	= atomic_long_read(&c->btree_key_cache.nr_keys);
|
|
+	size_t limit	= 4096 + (keys * 3) / 4;
|
|
+
|
|
+	if (dirty <= limit)
|
|
+		return false;
|
|
+
|
|
+	return test_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
|
|
+}
|
|
+
|
|
+int bch2_btree_key_cache_journal_flush(struct journal *,
|
|
+ struct journal_entry_pin *, u64);
|
|
+
|
|
+struct bkey_cached *
|
|
+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos);
|
|
+
|
|
+int bch2_btree_iter_traverse_cached(struct btree_iter *);
|
|
+
|
|
+bool bch2_btree_insert_key_cached(struct btree_trans *,
|
|
+ struct btree_iter *, struct bkey_i *);
|
|
+int bch2_btree_key_cache_flush(struct btree_trans *,
|
|
+ enum btree_id, struct bpos);
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_btree_key_cache_verify_clean(struct btree_trans *,
|
|
+ enum btree_id, struct bpos);
|
|
+#else
|
|
+static inline void
|
|
+bch2_btree_key_cache_verify_clean(struct btree_trans *trans,
|
|
+ enum btree_id id, struct bpos pos) {}
|
|
+#endif
|
|
+
|
|
+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *);
|
|
+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *);
|
|
+int bch2_fs_btree_key_cache_init(struct btree_key_cache *);
|
|
+
|
|
+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *);
|
|
+
|
|
+void bch2_btree_key_cache_exit(void);
|
|
+int __init bch2_btree_key_cache_init(void);
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */
|
|
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
|
|
new file mode 100644
|
|
index 000000000000..7532bcdef967
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_locking.h
|
|
@@ -0,0 +1,239 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_LOCKING_H
|
|
+#define _BCACHEFS_BTREE_LOCKING_H
|
|
+
|
|
+/*
|
|
+ * Only for internal btree use:
|
|
+ *
|
|
+ * The btree iterator tracks what locks it wants to take, and what locks it
|
|
+ * currently has - here we have wrappers for locking/unlocking btree nodes and
|
|
+ * updating the iterator state
|
|
+ */
|
|
+
|
|
+#include <linux/six.h>
|
|
+
|
|
+#include "btree_iter.h"
|
|
+
|
|
+/* matches six lock types */
|
|
+enum btree_node_locked_type {
|
|
+ BTREE_NODE_UNLOCKED = -1,
|
|
+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read,
|
|
+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Lock state @iter holds at @level, as an enum btree_node_locked_type value.
|
|
+ *
|
|
+ * Computed without branches: this depends on a bit in nodes_intent_locked
|
|
+ * only ever being set when the same bit in nodes_locked is set too.
|
|
+ */
|
|
+static inline int btree_node_locked_type(struct btree_iter *iter,
|
|
+					 unsigned level)
|
|
+{
|
|
+	int locked = (iter->nodes_locked >> level) & 1;
|
|
+	int intent = (iter->nodes_intent_locked >> level) & 1;
|
|
+
|
|
+	return BTREE_NODE_UNLOCKED + locked + intent;
|
|
+}
|
|
+
|
|
+static inline bool btree_node_intent_locked(struct btree_iter *iter,
|
|
+ unsigned level)
|
|
+{
|
|
+ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED;
|
|
+}
|
|
+
|
|
+static inline bool btree_node_read_locked(struct btree_iter *iter,
|
|
+ unsigned level)
|
|
+{
|
|
+ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED;
|
|
+}
|
|
+
|
|
+/* True when @iter holds any lock at @level. */
|
|
+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level)
|
|
+{
|
|
+	return (iter->nodes_locked & (1 << level)) != 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Bookkeeping only: clear @level's bit in both lock bitmaps of @iter. No
|
|
+ * lock is released here.
|
|
+ */
|
|
+static inline void mark_btree_node_unlocked(struct btree_iter *iter,
|
|
+					    unsigned level)
|
|
+{
|
|
+	int keep = ~(1 << level);
|
|
+
|
|
+	iter->nodes_locked		&= keep;
|
|
+	iter->nodes_intent_locked	&= keep;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Bookkeeping only: record that @iter holds a lock of @type at @level. No
|
|
+ * lock is taken here.
|
|
+ */
|
|
+static inline void mark_btree_node_locked(struct btree_iter *iter,
|
|
+					  unsigned level,
|
|
+					  enum six_lock_type type)
|
|
+{
|
|
+	/*
|
|
+	 * @type is shifted straight into the intent bitmap, which is only
|
|
+	 * correct (and branch free) while read == 0 and intent == 1:
|
|
+	 */
|
|
+	BUILD_BUG_ON(SIX_LOCK_read != 0);
|
|
+	BUILD_BUG_ON(SIX_LOCK_intent != 1);
|
|
+
|
|
+	iter->nodes_intent_locked	|= type << level;
|
|
+	iter->nodes_locked		|= 1 << level;
|
|
+}
|
|
+
|
|
+static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
|
|
+ unsigned level)
|
|
+{
|
|
+ mark_btree_node_locked(iter, level, SIX_LOCK_intent);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Lock type @iter wants at @level: intent below iter->locks_want, read
|
|
+ * otherwise.
|
|
+ */
|
|
+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
|
|
+{
|
|
+	if (level < iter->locks_want)
|
|
+		return SIX_LOCK_intent;
|
|
+
|
|
+	return SIX_LOCK_read;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Lock state @iter should hold at @level: nothing below iter->level, intent
|
|
+ * for levels under iter->locks_want, read at exactly iter->level, and
|
|
+ * nothing for any other level.
|
|
+ */
|
|
+static inline enum btree_node_locked_type
|
|
+btree_lock_want(struct btree_iter *iter, int level)
|
|
+{
|
|
+	if (level >= iter->level) {
|
|
+		if (level < iter->locks_want)
|
|
+			return BTREE_NODE_INTENT_LOCKED;
|
|
+		if (level == iter->level)
|
|
+			return BTREE_NODE_READ_LOCKED;
|
|
+	}
|
|
+
|
|
+	return BTREE_NODE_UNLOCKED;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Release whatever lock @iter holds at @level, if any, and clear the
|
|
+ * iterator's bookkeeping for that level.
|
|
+ */
|
|
+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
|
|
+{
|
|
+	int held = btree_node_locked_type(iter, level);
|
|
+
|
|
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
|
|
+
|
|
+	if (held != BTREE_NODE_UNLOCKED) {
|
|
+		struct six_lock *lock = &iter->l[level].b->c.lock;
|
|
+
|
|
+		six_unlock_type(lock, held);
|
|
+	}
|
|
+
|
|
+	mark_btree_node_unlocked(iter, level);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Mark @iter as BTREE_ITER_NEED_RELOCK, then drop every lock it holds,
|
|
+ * lowest level first.
|
|
+ */
|
|
+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
|
|
+{
|
|
+	btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
|
|
+
|
|
+	for (;;) {
|
|
+		unsigned long held = iter->nodes_locked;
|
|
+
|
|
+		if (!held)
|
|
+			break;
|
|
+
|
|
+		btree_node_unlock(iter, __ffs(held));
|
|
+	}
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Map a six lock type to the matching "lock contended" time stats slot.
|
|
+ * Any other value is a bug.
|
|
+ */
|
|
+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
|
|
+{
|
|
+	if (type == SIX_LOCK_read)
|
|
+		return BCH_TIME_btree_lock_contended_read;
|
|
+	if (type == SIX_LOCK_intent)
|
|
+		return BCH_TIME_btree_lock_contended_intent;
|
|
+	if (type == SIX_LOCK_write)
|
|
+		return BCH_TIME_btree_lock_contended_write;
|
|
+
|
|
+	BUG();
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Blocking six lock acquisition that also accounts the time spent waiting
|
|
+ * in the per-lock-type "contended" time stats of @c.
|
|
+ */
|
|
+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
|
+					  enum six_lock_type type)
|
|
+{
|
|
+	u64 wait_start = local_clock();
|
|
+	enum bch_time_stats stat;
|
|
+
|
|
+	six_lock_type(&b->c.lock, type, NULL, NULL);
|
|
+
|
|
+	stat = lock_to_time_stat(type);
|
|
+	bch2_time_stats_update(&c->times[stat], wait_start);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Lock @b with @type: try the non-blocking path first and only fall back to
|
|
+ * the blocking, time-accounted path when that fails.
|
|
+ */
|
|
+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b,
|
|
+					enum six_lock_type type)
|
|
+{
|
|
+	if (six_trylock_type(&b->c.lock, type))
|
|
+		return;
|
|
+
|
|
+	__btree_node_lock_type(c, b, type);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If some iterator in @trans already points at @b at @level and holds it
|
|
+ * with a lock type of at least @want, take another reference on that lock
|
|
+ * via six_lock_increment() and return true; otherwise return false.
|
|
+ */
|
|
+static inline bool btree_node_lock_increment(struct btree_trans *trans,
|
|
+					     struct btree *b, unsigned level,
|
|
+					     enum btree_node_locked_type want)
|
|
+{
|
|
+	struct btree_iter *iter;
|
|
+
|
|
+	trans_for_each_iter(trans, iter) {
|
|
+		if (iter->l[level].b == b) {
|
|
+			if (btree_node_locked_type(iter, level) >= want) {
|
|
+				six_lock_increment(&b->c.lock, want);
|
|
+				return true;
|
|
+			}
|
|
+		}
|
|
+	}
|
|
+
|
|
+	return false;
|
|
+}
|
|
+
|
|
+bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned,
|
|
+ struct btree_iter *, enum six_lock_type,
|
|
+ six_lock_should_sleep_fn, void *,
|
|
+ unsigned long);
|
|
+
|
|
+/*
|
|
+ * Lock @b with @type on behalf of @iter.
|
|
+ *
|
|
+ * Tries, in order: a plain trylock, piggybacking on a lock another iterator
|
|
+ * of the same transaction already holds, and finally the out-of-line
|
|
+ * __bch2_btree_node_lock() slowpath. Returns true once any of them succeeds.
|
|
+ */
|
|
+static inline bool btree_node_lock(struct btree *b,
|
|
+			struct bpos pos, unsigned level,
|
|
+			struct btree_iter *iter,
|
|
+			enum six_lock_type type,
|
|
+			six_lock_should_sleep_fn should_sleep_fn, void *p,
|
|
+			unsigned long ip)
|
|
+{
|
|
+	struct btree_trans *trans = iter->trans;
|
|
+
|
|
+	EBUG_ON(level >= BTREE_MAX_DEPTH);
|
|
+	EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx)));
|
|
+
|
|
+	if (likely(six_trylock_type(&b->c.lock, type)))
|
|
+		return true;
|
|
+
|
|
+	if (btree_node_lock_increment(trans, b, level, type))
|
|
+		return true;
|
|
+
|
|
+	return __bch2_btree_node_lock(b, pos, level, iter, type,
|
|
+				      should_sleep_fn, p, ip);
|
|
+}
|
|
+
|
|
+bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
|
|
+
|
|
+/*
|
|
+ * Make sure @iter holds its lock at @level: true straight away if the level
|
|
+ * is already locked, otherwise the result of __bch2_btree_node_relock().
|
|
+ */
|
|
+static inline bool bch2_btree_node_relock(struct btree_iter *iter,
|
|
+					  unsigned level)
|
|
+{
|
|
+	/* A lock that is held must be of the type the iterator wants: */
|
|
+	EBUG_ON(btree_node_locked(iter, level) &&
|
|
+		btree_node_locked_type(iter, level) !=
|
|
+		__btree_lock_want(iter, level));
|
|
+
|
|
+	if (likely(btree_node_locked(iter, level)))
|
|
+		return true;
|
|
+
|
|
+	return __bch2_btree_node_relock(iter, level);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Drop the write lock on @b. Before unlocking, the saved lock sequence
|
|
+ * number of every iterator in the transaction that points at @b is advanced
|
|
+ * by two, so that bch2_btree_node_relock() will succeed for them.
|
|
+ */
|
|
+static inline void
|
|
+bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter)
|
|
+{
|
|
+	unsigned level = b->c.level;
|
|
+	struct btree_iter *linked;
|
|
+
|
|
+	EBUG_ON(iter->l[level].b != b);
|
|
+	EBUG_ON(iter->l[level].lock_seq + 1 != b->c.lock.state.seq);
|
|
+
|
|
+	trans_for_each_iter_with_node(iter->trans, b, linked)
|
|
+		linked->l[level].lock_seq += 2;
|
|
+
|
|
+	six_unlock_write(&b->c.lock);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *);
|
|
+
|
|
+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *);
|
|
+
|
|
+/*
|
|
+ * Take the write lock on @b, which @iter must already point at with an
|
|
+ * up-to-date lock sequence number. Falls back to
|
|
+ * __bch2_btree_node_lock_write() when the trylock fails.
|
|
+ */
|
|
+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
|
|
+{
|
|
+	unsigned level = b->c.level;
|
|
+
|
|
+	EBUG_ON(iter->l[level].b != b);
|
|
+	EBUG_ON(iter->l[level].lock_seq != b->c.lock.state.seq);
|
|
+
|
|
+	if (likely(six_trylock_write(&b->c.lock)))
|
|
+		return;
|
|
+
|
|
+	__bch2_btree_node_lock_write(b, iter);
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_LOCKING_H */
|
|
+
|
|
+
|
|
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
|
|
new file mode 100644
|
|
index 000000000000..06a2c412db7a
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_types.h
|
|
@@ -0,0 +1,695 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_TYPES_H
|
|
+#define _BCACHEFS_BTREE_TYPES_H
|
|
+
|
|
+#include <linux/list.h>
|
|
+#include <linux/rhashtable.h>
|
|
+#include <linux/six.h>
|
|
+
|
|
+#include "bkey_methods.h"
|
|
+#include "buckets_types.h"
|
|
+#include "journal_types.h"
|
|
+
|
|
+struct open_bucket;
|
|
+struct btree_update;
|
|
+struct btree_trans;
|
|
+
|
|
+#define MAX_BSETS 3U
|
|
+
|
|
+struct btree_nr_keys {
|
|
+
|
|
+ /*
|
|
+ * Amount of live metadata (i.e. size of node after a compaction) in
|
|
+ * units of u64s
|
|
+ */
|
|
+ u16 live_u64s;
|
|
+ u16 bset_u64s[MAX_BSETS];
|
|
+
|
|
+ /* live keys only: */
|
|
+ u16 packed_keys;
|
|
+ u16 unpacked_keys;
|
|
+};
|
|
+
|
|
+struct bset_tree {
|
|
+ /*
|
|
+ * We construct a binary tree in an array as if the array
|
|
+ * started at 1, so that things line up on the same cachelines
|
|
+ * better: see comments in bset.c at cacheline_to_bkey() for
|
|
+ * details
|
|
+ */
|
|
+
|
|
+ /* size of the binary tree and prev array */
|
|
+ u16 size;
|
|
+
|
|
+ /* function of size - precalculated for to_inorder() */
|
|
+ u16 extra;
|
|
+
|
|
+ u16 data_offset;
|
|
+ u16 aux_data_offset;
|
|
+ u16 end_offset;
|
|
+};
|
|
+
|
|
+struct btree_write {
|
|
+ struct journal_entry_pin journal;
|
|
+};
|
|
+
|
|
+struct btree_alloc {
|
|
+ struct open_buckets ob;
|
|
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
|
+};
|
|
+
|
|
+struct btree_bkey_cached_common {
|
|
+ struct six_lock lock;
|
|
+ u8 level;
|
|
+ u8 btree_id;
|
|
+};
|
|
+
|
|
+struct btree {
|
|
+ struct btree_bkey_cached_common c;
|
|
+
|
|
+ struct rhash_head hash;
|
|
+ u64 hash_val;
|
|
+
|
|
+ unsigned long flags;
|
|
+ u16 written;
|
|
+ u8 nsets;
|
|
+ u8 nr_key_bits;
|
|
+ u16 version_ondisk;
|
|
+
|
|
+ struct bkey_format format;
|
|
+
|
|
+ struct btree_node *data;
|
|
+ void *aux_data;
|
|
+
|
|
+ /*
|
|
+ * Sets of sorted keys - the real btree node - plus a binary search tree
|
|
+ *
|
|
+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
|
|
+ * to the memory we have allocated for this btree node. Additionally,
|
|
+ * set[0]->data points to the entire btree node as it exists on disk.
|
|
+ */
|
|
+ struct bset_tree set[MAX_BSETS];
|
|
+
|
|
+ struct btree_nr_keys nr;
|
|
+ u16 sib_u64s[2];
|
|
+ u16 whiteout_u64s;
|
|
+ u8 byte_order;
|
|
+ u8 unpack_fn_len;
|
|
+
|
|
+ struct btree_write writes[2];
|
|
+
|
|
+ /* Key/pointer for this btree node */
|
|
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
|
+
|
|
+ /*
|
|
+ * XXX: add a delete sequence number, so when bch2_btree_node_relock()
|
|
+ * fails because the lock sequence number has changed - i.e. the
|
|
+ * contents were modified - we can still relock the node if it's still
|
|
+ * the one we want, without redoing the traversal
|
|
+ */
|
|
+
|
|
+ /*
|
|
+ * For asynchronous splits/interior node updates:
|
|
+ * When we do a split, we allocate new child nodes and update the parent
|
|
+ * node to point to them: we update the parent in memory immediately,
|
|
+ * but then we must wait until the children have been written out before
|
|
+ * the update to the parent can be written - this is a list of the
|
|
+ * btree_updates that are blocking this node from being
|
|
+ * written:
|
|
+ */
|
|
+ struct list_head write_blocked;
|
|
+
|
|
+ /*
|
|
+ * Also for asynchronous splits/interior node updates:
|
|
+ * If a btree node isn't reachable yet, we don't want to kick off
|
|
+ * another write - because that write also won't yet be reachable and
|
|
+ * marking it as completed before it's reachable would be incorrect:
|
|
+ */
|
|
+ unsigned long will_make_reachable;
|
|
+
|
|
+ struct open_buckets ob;
|
|
+
|
|
+ /* lru list */
|
|
+ struct list_head list;
|
|
+};
|
|
+
|
|
+struct btree_cache {
|
|
+ struct rhashtable table;
|
|
+ bool table_init_done;
|
|
+ /*
|
|
+ * We never free a struct btree, except on shutdown - we just put it on
|
|
+ * the btree_cache_freed list and reuse it later. This simplifies the
|
|
+ * code, and it doesn't cost us much memory as the memory usage is
|
|
+ * dominated by buffers that hold the actual btree node data and those
|
|
+ * can be freed - and the number of struct btrees allocated is
|
|
+ * effectively bounded.
|
|
+ *
|
|
+ * btree_cache_freeable effectively is a small cache - we use it because
|
|
+ * high order page allocations can be rather expensive, and it's quite
|
|
+ * common to delete and allocate btree nodes in quick succession. It
|
|
+ * should never grow past ~2-3 nodes in practice.
|
|
+ */
|
|
+ struct mutex lock;
|
|
+ struct list_head live;
|
|
+ struct list_head freeable;
|
|
+ struct list_head freed;
|
|
+
|
|
+ /* Number of elements in live + freeable lists */
|
|
+ unsigned used;
|
|
+ unsigned reserve;
|
|
+ atomic_t dirty;
|
|
+ struct shrinker shrink;
|
|
+
|
|
+ /*
|
|
+ * If we need to allocate memory for a new btree node and that
|
|
+ * allocation fails, we can cannibalize another node in the btree cache
|
|
+ * to satisfy the allocation - lock to guarantee only one thread does
|
|
+ * this at a time:
|
|
+ */
|
|
+ struct task_struct *alloc_lock;
|
|
+ struct closure_waitlist alloc_wait;
|
|
+};
|
|
+
|
|
+struct btree_node_iter {
|
|
+ struct btree_node_iter_set {
|
|
+ u16 k, end;
|
|
+ } data[MAX_BSETS];
|
|
+};
|
|
+
|
|
+enum btree_iter_type {
|
|
+ BTREE_ITER_KEYS,
|
|
+ BTREE_ITER_NODES,
|
|
+ BTREE_ITER_CACHED,
|
|
+};
|
|
+
|
|
+#define BTREE_ITER_TYPE ((1 << 2) - 1)
|
|
+
|
|
+/*
|
|
+ * Iterate over all possible positions, synthesizing deleted keys for holes:
|
|
+ */
|
|
+#define BTREE_ITER_SLOTS (1 << 2)
|
|
+/*
|
|
+ * Indicates that intent locks should be taken on leaf nodes, because we expect
|
|
+ * to be doing updates:
|
|
+ */
|
|
+#define BTREE_ITER_INTENT (1 << 3)
|
|
+/*
|
|
+ * Causes the btree iterator code to prefetch additional btree nodes from disk:
|
|
+ */
|
|
+#define BTREE_ITER_PREFETCH (1 << 4)
|
|
+/*
|
|
+ * Indicates that this iterator should not be reused until transaction commit,
|
|
+ * either because a pending update references it or because the update depends
|
|
+ * on that particular key being locked (e.g. by the str_hash code, for hash
|
|
+ * table consistency)
|
|
+ */
|
|
+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5)
|
|
+/*
|
|
+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
|
|
+ * @pos or the first key strictly greater than @pos
|
|
+ */
|
|
+#define BTREE_ITER_IS_EXTENTS (1 << 6)
|
|
+#define BTREE_ITER_ERROR (1 << 7)
|
|
+#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8)
|
|
+#define BTREE_ITER_CACHED_NOFILL (1 << 9)
|
|
+#define BTREE_ITER_CACHED_NOCREATE (1 << 10)
|
|
+#define BTREE_ITER_NOT_EXTENTS (1 << 11)
|
|
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
|
|
+
|
|
+enum btree_iter_uptodate {
|
|
+ BTREE_ITER_UPTODATE = 0,
|
|
+ BTREE_ITER_NEED_PEEK = 1,
|
|
+ BTREE_ITER_NEED_RELOCK = 2,
|
|
+ BTREE_ITER_NEED_TRAVERSE = 3,
|
|
+};
|
|
+
|
|
+#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1)
|
|
+#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2)
|
|
+#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3)
|
|
+#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4)
|
|
+#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5)
|
|
+#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6)
|
|
+#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7)
|
|
+
|
|
+/*
|
|
+ * @pos - iterator's current position
|
|
+ * @level - current btree depth
|
|
+ * @locks_want - btree level below which we start taking intent locks
|
|
+ * @nodes_locked - bitmask indicating which nodes in @l are locked
|
|
+ * @nodes_intent_locked - bitmask indicating which locks are intent locks
|
|
+ */
|
|
+struct btree_iter {
|
|
+ struct btree_trans *trans;
|
|
+ struct bpos pos;
|
|
+ /* what we're searching for/what the iterator actually points to: */
|
|
+ struct bpos real_pos;
|
|
+ struct bpos pos_after_commit;
|
|
+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */
|
|
+ unsigned snapshot;
|
|
+
|
|
+ u16 flags;
|
|
+ u8 idx;
|
|
+
|
|
+ enum btree_id btree_id:4;
|
|
+ enum btree_iter_uptodate uptodate:4;
|
|
+ unsigned level:4,
|
|
+ min_depth:4,
|
|
+ locks_want:4,
|
|
+ nodes_locked:4,
|
|
+ nodes_intent_locked:4;
|
|
+
|
|
+ struct btree_iter_level {
|
|
+ struct btree *b;
|
|
+ struct btree_node_iter iter;
|
|
+ u32 lock_seq;
|
|
+ } l[BTREE_MAX_DEPTH];
|
|
+
|
|
+ /*
|
|
+ * Current unpacked key - so that bch2_btree_iter_next()/
|
|
+ * bch2_btree_iter_next_slot() can correctly advance pos.
|
|
+ */
|
|
+ struct bkey k;
|
|
+ unsigned long ip_allocated;
|
|
+};
|
|
+
|
|
+static inline enum btree_iter_type
|
|
+btree_iter_type(const struct btree_iter *iter)
|
|
+{
|
|
+ return iter->flags & BTREE_ITER_TYPE;
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_is_cached(const struct btree_iter *iter)
|
|
+{
|
|
+ return btree_iter_type(iter) == BTREE_ITER_CACHED;
|
|
+}
|
|
+
|
|
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
|
|
+{
|
|
+ return &iter->l[iter->level]; /* entry of iter->l[] for the iterator's current level */
|
|
+}
|
|
+
|
|
+struct btree_key_cache {
|
|
+ struct mutex lock;
|
|
+ struct rhashtable table;
|
|
+ bool table_init_done;
|
|
+ struct list_head freed;
|
|
+ struct shrinker shrink;
|
|
+ unsigned shrink_iter;
|
|
+
|
|
+ size_t nr_freed;
|
|
+ atomic_long_t nr_keys;
|
|
+ atomic_long_t nr_dirty;
|
|
+};
|
|
+
|
|
+struct bkey_cached_key {
|
|
+ u32 btree_id;
|
|
+ struct bpos pos;
|
|
+} __attribute__((packed, aligned(4)));
|
|
+
|
|
+#define BKEY_CACHED_ACCESSED 0
|
|
+#define BKEY_CACHED_DIRTY 1
|
|
+
|
|
+struct bkey_cached {
|
|
+ struct btree_bkey_cached_common c;
|
|
+
|
|
+ unsigned long flags;
|
|
+ u8 u64s;
|
|
+ bool valid;
|
|
+ u32 btree_trans_barrier_seq;
|
|
+ struct bkey_cached_key key;
|
|
+
|
|
+ struct rhash_head hash;
|
|
+ struct list_head list;
|
|
+
|
|
+ struct journal_preres res;
|
|
+ struct journal_entry_pin journal;
|
|
+
|
|
+ struct bkey_i *k;
|
|
+};
|
|
+
|
|
+struct btree_insert_entry {
|
|
+ unsigned trigger_flags;
|
|
+ u8 bkey_type;
|
|
+ enum btree_id btree_id:8;
|
|
+ u8 level;
|
|
+ unsigned trans_triggers_run:1;
|
|
+ unsigned is_extent:1;
|
|
+ struct bkey_i *k;
|
|
+ struct btree_iter *iter;
|
|
+};
|
|
+
|
|
+#ifndef CONFIG_LOCKDEP
|
|
+#define BTREE_ITER_MAX 64
|
|
+#else
|
|
+#define BTREE_ITER_MAX 32
|
|
+#endif
|
|
+
|
|
+struct btree_trans_commit_hook;
|
|
+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
|
|
+
|
|
+struct btree_trans_commit_hook {
|
|
+ btree_trans_commit_hook_fn *fn;
|
|
+ struct btree_trans_commit_hook *next;
|
|
+};
|
|
+
|
|
+#define BTREE_TRANS_MEM_MAX 4096
|
|
+
|
|
+struct btree_trans {
|
|
+ struct bch_fs *c;
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ struct list_head list;
|
|
+ struct btree *locking;
|
|
+ unsigned locking_iter_idx;
|
|
+ struct bpos locking_pos;
|
|
+ u8 locking_btree_id;
|
|
+ u8 locking_level;
|
|
+ pid_t pid;
|
|
+#endif
|
|
+ unsigned long ip;
|
|
+ int srcu_idx;
|
|
+
|
|
+ u8 nr_updates;
|
|
+ u8 nr_updates2;
|
|
+ unsigned used_mempool:1;
|
|
+ unsigned error:1;
|
|
+ unsigned in_traverse_all:1;
|
|
+
|
|
+ u64 iters_linked;
|
|
+ u64 iters_live;
|
|
+ u64 iters_touched;
|
|
+
|
|
+ unsigned mem_top;
|
|
+ unsigned mem_bytes;
|
|
+ void *mem;
|
|
+
|
|
+ struct btree_iter *iters;
|
|
+ struct btree_insert_entry *updates;
|
|
+ struct btree_insert_entry *updates2;
|
|
+
|
|
+ /* update path: */
|
|
+ struct btree_trans_commit_hook *hooks;
|
|
+ struct jset_entry *extra_journal_entries;
|
|
+ unsigned extra_journal_entry_u64s;
|
|
+ struct journal_entry_pin *journal_pin;
|
|
+
|
|
+ struct journal_res journal_res;
|
|
+ struct journal_preres journal_preres;
|
|
+ u64 *journal_seq;
|
|
+ struct disk_reservation *disk_res;
|
|
+ unsigned flags;
|
|
+ unsigned journal_u64s;
|
|
+ unsigned journal_preres_u64s;
|
|
+ struct replicas_delta_list *fs_usage_deltas;
|
|
+};
|
|
+
|
|
+#define BTREE_FLAG(flag) \
|
|
+static inline bool btree_node_ ## flag(struct btree *b) \
|
|
+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
|
+ \
|
|
+static inline void set_btree_node_ ## flag(struct btree *b) \
|
|
+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \
|
|
+ \
|
|
+static inline void clear_btree_node_ ## flag(struct btree *b) \
|
|
+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); }
|
|
+
|
|
+enum btree_flags {
|
|
+ BTREE_NODE_read_in_flight,
|
|
+ BTREE_NODE_read_error,
|
|
+ BTREE_NODE_dirty,
|
|
+ BTREE_NODE_need_write,
|
|
+ BTREE_NODE_noevict,
|
|
+ BTREE_NODE_write_idx,
|
|
+ BTREE_NODE_accessed,
|
|
+ BTREE_NODE_write_in_flight,
|
|
+ BTREE_NODE_just_written,
|
|
+ BTREE_NODE_dying,
|
|
+ BTREE_NODE_fake,
|
|
+ BTREE_NODE_need_rewrite,
|
|
+ BTREE_NODE_never_write,
|
|
+};
|
|
+
|
|
+BTREE_FLAG(read_in_flight);
|
|
+BTREE_FLAG(read_error);
|
|
+BTREE_FLAG(need_write);
|
|
+BTREE_FLAG(noevict);
|
|
+BTREE_FLAG(write_idx);
|
|
+BTREE_FLAG(accessed);
|
|
+BTREE_FLAG(write_in_flight);
|
|
+BTREE_FLAG(just_written);
|
|
+BTREE_FLAG(dying);
|
|
+BTREE_FLAG(fake);
|
|
+BTREE_FLAG(need_rewrite);
|
|
+BTREE_FLAG(never_write);
|
|
+
|
|
+static inline struct btree_write *btree_current_write(struct btree *b)
|
|
+{
|
|
+ return &b->writes[btree_node_write_idx(b)]; /* slot picked by the write_idx flag bit */
|
|
+}
|
|
+
|
|
+static inline struct btree_write *btree_prev_write(struct btree *b)
|
|
+{
|
|
+ return &b->writes[btree_node_write_idx(b) ^ 1]; /* the other of the two slots */
|
|
+}
|
|
+
|
|
+static inline struct bset_tree *bset_tree_last(struct btree *b)
|
|
+{
|
|
+ EBUG_ON(b->nsets == 0); /* there must be at least one set */
|
|
+ return &b->set[b->nsets - 1]; /* last of the b->nsets entries */
|
|
+}
|
|
+
|
|
+static inline void *
|
|
+__btree_node_offset_to_ptr(const struct btree *b, u16 offset)
|
|
+{
|
|
+ return (void *) ((u64 *) b->data + 1 + offset);
|
|
+}
|
|
+
|
|
+static inline u16
|
|
+__btree_node_ptr_to_offset(const struct btree *b, const void *p)
|
|
+{
|
|
+ u16 ret = (u64 *) p - 1 - (u64 *) b->data;
|
|
+
|
|
+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline struct bset *bset(const struct btree *b,
|
|
+ const struct bset_tree *t)
|
|
+{
|
|
+ return __btree_node_offset_to_ptr(b, t->data_offset);
|
|
+}
|
|
+
|
|
+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t)
|
|
+{
|
|
+ t->end_offset =
|
|
+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t)));
|
|
+}
|
|
+
|
|
+static inline void set_btree_bset(struct btree *b, struct bset_tree *t,
|
|
+ const struct bset *i)
|
|
+{
|
|
+ t->data_offset = __btree_node_ptr_to_offset(b, i);
|
|
+ set_btree_bset_end(b, t);
|
|
+}
|
|
+
|
|
+static inline struct bset *btree_bset_first(struct btree *b)
|
|
+{
|
|
+ return bset(b, b->set);
|
|
+}
|
|
+
|
|
+static inline struct bset *btree_bset_last(struct btree *b)
|
|
+{
|
|
+ return bset(b, bset_tree_last(b));
|
|
+}
|
|
+
|
|
+static inline u16
|
|
+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k)
|
|
+{
|
|
+ return __btree_node_ptr_to_offset(b, k);
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *
|
|
+__btree_node_offset_to_key(const struct btree *b, u16 k)
|
|
+{
|
|
+ return __btree_node_offset_to_ptr(b, k);
|
|
+}
|
|
+
|
|
+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t)
|
|
+{
|
|
+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64);
|
|
+}
|
|
+
|
|
+#define btree_bkey_first(_b, _t) \
|
|
+({ \
|
|
+ EBUG_ON(bset(_b, _t)->start != \
|
|
+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\
|
|
+ \
|
|
+ bset(_b, _t)->start; \
|
|
+})
|
|
+
|
|
+#define btree_bkey_last(_b, _t) \
|
|
+({ \
|
|
+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \
|
|
+ vstruct_last(bset(_b, _t))); \
|
|
+ \
|
|
+ __btree_node_offset_to_key(_b, (_t)->end_offset); \
|
|
+})
|
|
+
|
|
+static inline unsigned bset_u64s(struct bset_tree *t)
|
|
+{
|
|
+ return t->end_offset - t->data_offset -
|
|
+ sizeof(struct bset) / sizeof(u64);
|
|
+}
|
|
+
|
|
+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t)
|
|
+{
|
|
+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set];
|
|
+}
|
|
+
|
|
+static inline unsigned bset_byte_offset(struct btree *b, void *i)
|
|
+{
|
|
+ return i - (void *) b->data;
|
|
+}
|
|
+
|
|
+enum btree_node_type {
|
|
+#define x(kwd, val) BKEY_TYPE_##kwd = val,
|
|
+ BCH_BTREE_IDS()
|
|
+#undef x
|
|
+ BKEY_TYPE_btree,
|
|
+};
|
|
+
|
|
+/* Type of a key in btree @id at level @level: */
|
|
+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id)
|
|
+{
|
|
+ return level ? BKEY_TYPE_btree : (enum btree_node_type) id;
|
|
+}
|
|
+
|
|
+/* Type of keys @b contains: */
|
|
+static inline enum btree_node_type btree_node_type(struct btree *b)
|
|
+{
|
|
+ return __btree_node_type(b->c.level, b->c.btree_id);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Extents-type check: true only for the extents and reflink key types,
|
|
+ * false for every other btree node type, BKEY_TYPE_btree included:
|
|
+ */
|
|
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
|
|
+{
|
|
+ return type == BKEY_TYPE_extents ||
|
|
+ type == BKEY_TYPE_reflink;
|
|
+}
|
|
+/* (end of btree_node_type_is_extents) */
|
|
+
|
|
+static inline bool btree_node_is_extents(struct btree *b)
|
|
+{
|
|
+ return btree_node_type_is_extents(btree_node_type(b));
|
|
+}
|
|
+
|
|
+static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter)
|
|
+{
|
|
+ return __btree_node_type(iter->level, iter->btree_id);
|
|
+}
|
|
+
|
|
+static inline bool btree_iter_is_extents(struct btree_iter *iter)
|
|
+{
|
|
+ return btree_node_type_is_extents(btree_iter_key_type(iter));
|
|
+}
|
|
+
|
|
+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \
|
|
+ ((1U << BKEY_TYPE_extents)| \
|
|
+ (1U << BKEY_TYPE_inodes)| \
|
|
+ (1U << BKEY_TYPE_stripes)| \
|
|
+ (1U << BKEY_TYPE_reflink)| \
|
|
+ (1U << BKEY_TYPE_btree))
|
|
+
|
|
+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
|
|
+ ((1U << BKEY_TYPE_alloc)| \
|
|
+ (1U << BKEY_TYPE_stripes))
|
|
+
|
|
+#define BTREE_NODE_TYPE_HAS_TRIGGERS \
|
|
+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
|
|
+ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
|
|
+
|
|
+#define BTREE_ID_HAS_SNAPSHOTS \
|
|
+ ((1U << BTREE_ID_extents)| \
|
|
+ (1U << BTREE_ID_inodes)| \
|
|
+ (1U << BTREE_ID_dirents)| \
|
|
+ (1U << BTREE_ID_xattrs))
|
|
+
|
|
+#define BTREE_ID_HAS_PTRS \
|
|
+ ((1U << BTREE_ID_extents)| \
|
|
+ (1U << BTREE_ID_reflink))
|
|
+
|
|
+static inline bool btree_type_has_snapshots(enum btree_id id)
|
|
+{
|
|
+ return (1U << id) & BTREE_ID_HAS_SNAPSHOTS; /* is @id's bit set in the mask? unsigned shift, as in the mask */
|
|
+}
|
|
+
|
|
+enum btree_trigger_flags {
|
|
+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
|
|
+
|
|
+ __BTREE_TRIGGER_INSERT,
|
|
+ __BTREE_TRIGGER_OVERWRITE,
|
|
+ __BTREE_TRIGGER_OVERWRITE_SPLIT,
|
|
+
|
|
+ __BTREE_TRIGGER_GC,
|
|
+ __BTREE_TRIGGER_BUCKET_INVALIDATE,
|
|
+ __BTREE_TRIGGER_NOATOMIC,
|
|
+};
|
|
+
|
|
+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
|
|
+
|
|
+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
|
|
+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
|
|
+#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT)
|
|
+
|
|
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
|
|
+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
|
|
+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
|
|
+
|
|
+static inline bool btree_node_type_needs_gc(enum btree_node_type type)
|
|
+{
|
|
+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type);
|
|
+}
|
|
+
|
|
+struct btree_root {
|
|
+ struct btree *b;
|
|
+
|
|
+ /* On disk root - see async splits: */
|
|
+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
|
|
+ u8 level;
|
|
+ u8 alive;
|
|
+ s8 error;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Optional hook that will be called just prior to a btree node update, when
|
|
+ * we're holding the write lock and we know what key is about to be overwritten:
|
|
+ */
|
|
+
|
|
+enum btree_insert_ret {
|
|
+ BTREE_INSERT_OK,
|
|
+ /* leaf node needs to be split */
|
|
+ BTREE_INSERT_BTREE_NODE_FULL,
|
|
+ BTREE_INSERT_ENOSPC,
|
|
+ BTREE_INSERT_NEED_MARK_REPLICAS,
|
|
+ BTREE_INSERT_NEED_JOURNAL_RES,
|
|
+ BTREE_INSERT_NEED_JOURNAL_RECLAIM,
|
|
+};
|
|
+
|
|
+enum btree_gc_coalesce_fail_reason {
|
|
+ BTREE_GC_COALESCE_FAIL_RESERVE_GET,
|
|
+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC,
|
|
+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
|
|
+};
|
|
+
|
|
+enum btree_node_sibling {
|
|
+ btree_prev_sib,
|
|
+ btree_next_sib,
|
|
+};
|
|
+
|
|
+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
|
|
+ struct btree *,
|
|
+ struct btree_node_iter *);
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_TYPES_H */
|
|
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
|
|
new file mode 100644
|
|
index 000000000000..56131ac516ce
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_update.h
|
|
@@ -0,0 +1,148 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_UPDATE_H
|
|
+#define _BCACHEFS_BTREE_UPDATE_H
|
|
+
|
|
+#include "btree_iter.h"
|
|
+#include "journal.h"
|
|
+
|
|
+struct bch_fs;
|
|
+struct btree;
|
|
+
|
|
+void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
|
|
+ struct btree_iter *);
|
|
+bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
|
|
+ struct btree_node_iter *, struct bkey_i *);
|
|
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
|
|
+
|
|
+enum btree_insert_flags {
|
|
+ __BTREE_INSERT_NOUNLOCK,
|
|
+ __BTREE_INSERT_NOFAIL,
|
|
+ __BTREE_INSERT_NOCHECK_RW,
|
|
+ __BTREE_INSERT_LAZY_RW,
|
|
+ __BTREE_INSERT_USE_RESERVE,
|
|
+ __BTREE_INSERT_JOURNAL_REPLAY,
|
|
+ __BTREE_INSERT_JOURNAL_RESERVED,
|
|
+ __BTREE_INSERT_JOURNAL_RECLAIM,
|
|
+ __BTREE_INSERT_NOWAIT,
|
|
+ __BTREE_INSERT_GC_LOCK_HELD,
|
|
+ __BCH_HASH_SET_MUST_CREATE,
|
|
+ __BCH_HASH_SET_MUST_REPLACE,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Don't drop locks _after_ successfully updating btree:
|
|
+ */
|
|
+#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK)
|
|
+
|
|
+/* Don't check for -ENOSPC: */
|
|
+#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL)
|
|
+
|
|
+#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW)
|
|
+#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW)
|
|
+
|
|
+/* for copygc, or when merging btree nodes */
|
|
+#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE)
|
|
+
|
|
+/* Insert is for journal replay - don't get journal reservations: */
|
|
+#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY)
|
|
+
|
|
+/* Indicates that we have pre-reserved space in the journal: */
|
|
+#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED)
|
|
+
|
|
+/* Insert is being called from journal reclaim path: */
|
|
+#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM)
|
|
+
|
|
+/* Don't block on allocation failure (for new btree nodes): */
|
|
+#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT)
|
|
+#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD)
|
|
+
|
|
+#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE)
|
|
+#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE)
|
|
+
|
|
+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
|
|
+
|
|
+int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *);
|
|
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
|
|
+ struct disk_reservation *, u64 *, int flags);
|
|
+
|
|
+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id,
|
|
+ struct bpos, struct bpos, u64 *);
|
|
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
|
|
+ struct bpos, struct bpos, u64 *);
|
|
+
|
|
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
|
|
+ __le64, unsigned);
|
|
+void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
|
|
+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
|
|
+ struct btree *, struct bkey_i *);
|
|
+
|
|
+int bch2_trans_update(struct btree_trans *, struct btree_iter *,
|
|
+ struct bkey_i *, enum btree_trigger_flags);
|
|
+void bch2_trans_commit_hook(struct btree_trans *,
|
|
+ struct btree_trans_commit_hook *);
|
|
+int __bch2_trans_commit(struct btree_trans *);
|
|
+
|
|
+/**
|
|
+ * bch2_trans_commit - insert keys at given iterator positions
|
|
+ *
|
|
+ * This is the main entry point for btree updates.
|
|
+ *
|
|
+ * Return values:
|
|
+ * -EINTR: locking changed, this function should be called again.
|
|
+ * -EROFS: filesystem read only
|
|
+ * -EIO: journal or btree node IO error
|
|
+ */
|
|
+static inline int bch2_trans_commit(struct btree_trans *trans,
|
|
+ struct disk_reservation *disk_res,
|
|
+ u64 *journal_seq,
|
|
+ unsigned flags)
|
|
+{
|
|
+ trans->disk_res = disk_res;
|
|
+ trans->journal_seq = journal_seq;
|
|
+ trans->flags = flags;
|
|
+
|
|
+ return __bch2_trans_commit(trans);
|
|
+}
|
|
+
|
|
+#define lockrestart_do(_trans, _do) \
|
|
+({ \
|
|
+ int _ret; \
|
|
+ \
|
|
+ while (1) { \
|
|
+ _ret = (_do); \
|
|
+ if (_ret != -EINTR) \
|
|
+ break; \
|
|
+ bch2_trans_reset(_trans, 0); \
|
|
+ } \
|
|
+ \
|
|
+ _ret; \
|
|
+})
|
|
+
|
|
+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \
|
|
+ lockrestart_do(_trans, (_do) ?: bch2_trans_commit(_trans, (_disk_res),\
|
|
+ (_journal_seq), (_flags))) /* commit only if (_do) evaluates to 0 */
|
|
+
|
|
+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
|
|
+({ \
|
|
+ struct btree_trans trans; \
|
|
+ int _ret, _ret2; \
|
|
+ \
|
|
+ bch2_trans_init(&trans, (_c), 0, 0); \
|
|
+ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \
|
|
+ _do); \
|
|
+ _ret2 = bch2_trans_exit(&trans); \
|
|
+ \
|
|
+ _ret ?: _ret2; \
|
|
+})
|
|
+
|
|
+#define trans_for_each_update(_trans, _i) \
|
|
+ for ((_i) = (_trans)->updates; \
|
|
+ (_i) < (_trans)->updates + (_trans)->nr_updates; \
|
|
+ (_i)++)
|
|
+
|
|
+#define trans_for_each_update2(_trans, _i) \
|
|
+ for ((_i) = (_trans)->updates2; \
|
|
+ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \
|
|
+ (_i)++)
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_UPDATE_H */
|
|
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
|
|
new file mode 100644
|
|
index 000000000000..b9e0ff97a41b
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_update_interior.c
|
|
@@ -0,0 +1,2116 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_locking.h"
|
|
+#include "buckets.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "journal.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "keylist.h"
|
|
+#include "replicas.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/random.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/* Debug code: */
|
|
+
|
|
+/*
|
|
+ * Verify that child nodes correctly span parent node's range:
|
|
+ */
|
|
+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ struct bpos next_node = b->data->min_key;
|
|
+ struct btree_node_iter iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_s_c_btree_ptr_v2 bp;
|
|
+ struct bkey unpacked;
|
|
+ char buf1[100], buf2[100];
|
|
+
|
|
+ BUG_ON(!b->c.level);
|
|
+
|
|
+ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
|
|
+ return;
|
|
+
|
|
+ bch2_btree_node_iter_init_from_start(&iter, b);
|
|
+
|
|
+ while (1) {
|
|
+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
|
|
+ if (k.k->type != KEY_TYPE_btree_ptr_v2)
|
|
+ break;
|
|
+ bp = bkey_s_c_to_btree_ptr_v2(k);
|
|
+
|
|
+ if (bpos_cmp(next_node, bp.v->min_key)) {
|
|
+ bch2_dump_btree_node(c, b);
|
|
+ panic("expected next min_key %s got %s\n",
|
|
+ (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
|
|
+ (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
|
|
+ }
|
|
+
|
|
+ bch2_btree_node_iter_advance(&iter, b);
|
|
+
|
|
+ if (bch2_btree_node_iter_end(&iter)) {
|
|
+ if (bpos_cmp(k.k->p, b->key.k.p)) {
|
|
+ bch2_dump_btree_node(c, b);
|
|
+ panic("expected end %s got %s\n",
|
|
+ (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
|
|
+ (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ next_node = bpos_successor(k.k->p);
|
|
+ }
|
|
+#endif
|
|
+}
|
|
+
|
|
+/* Calculate ideal packed bkey format for new btree nodes: */
|
|
+
|
|
+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
|
|
+{
|
|
+ struct bkey_packed *k;
|
|
+ struct bset_tree *t;
|
|
+ struct bkey uk;
|
|
+
|
|
+ for_each_bset(b, t)
|
|
+ bset_tree_for_each_key(b, t, k)
|
|
+ if (!bkey_deleted(k)) {
|
|
+ uk = bkey_unpack_key(b, k);
|
|
+ bch2_bkey_format_add_key(s, &uk);
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct bkey_format bch2_btree_calc_format(struct btree *b)
|
|
+{
|
|
+ struct bkey_format_state s;
|
|
+
|
|
+ bch2_bkey_format_init(&s);
|
|
+ bch2_bkey_format_add_pos(&s, b->data->min_key);
|
|
+ bch2_bkey_format_add_pos(&s, b->data->max_key);
|
|
+ __bch2_btree_calc_format(&s, b);
|
|
+
|
|
+ return bch2_bkey_format_done(&s);
|
|
+}
|
|
+
|
|
+static size_t btree_node_u64s_with_format(struct btree *b,
|
|
+ struct bkey_format *new_f)
|
|
+{
|
|
+ struct bkey_format *old_f = &b->format;
|
|
+
|
|
+ /* stupid integer promotion rules */
|
|
+ ssize_t delta =
|
|
+ (((int) new_f->key_u64s - old_f->key_u64s) *
|
|
+ (int) b->nr.packed_keys) +
|
|
+ (((int) new_f->key_u64s - BKEY_U64s) *
|
|
+ (int) b->nr.unpacked_keys);
|
|
+
|
|
+ BUG_ON(delta + b->nr.live_u64s < 0);
|
|
+
|
|
+ return b->nr.live_u64s + delta;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_node_format_fits - check if we could rewrite node with new format
|
|
+ *
|
|
+ * This assumes all keys can pack with the new format -- it just checks if
|
|
+ * the re-packed keys would fit inside the node itself.
|
|
+ */
|
|
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
|
|
+ struct bkey_format *new_f)
|
|
+{
|
|
+ size_t u64s = btree_node_u64s_with_format(b, new_f);
|
|
+
|
|
+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
|
|
+}
|
|
+
|
|
+/* Btree node freeing/allocation: */
|
|
+
|
|
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ trace_btree_node_free(c, b);
|
|
+
|
|
+ BUG_ON(btree_node_dirty(b));
|
|
+ BUG_ON(btree_node_need_write(b));
|
|
+ BUG_ON(b == btree_node_root(c, b));
|
|
+ BUG_ON(b->ob.nr);
|
|
+ BUG_ON(!list_empty(&b->write_blocked));
|
|
+ BUG_ON(b->will_make_reachable);
|
|
+
|
|
+ clear_btree_node_noevict(b);
|
|
+
|
|
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
|
|
+
|
|
+ mutex_lock(&c->btree_cache.lock);
|
|
+ list_move(&b->list, &c->btree_cache.freeable);
|
|
+ mutex_unlock(&c->btree_cache.lock);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct open_buckets ob = b->ob;
|
|
+
|
|
+ b->ob.nr = 0;
|
|
+
|
|
+ clear_btree_node_dirty(c, b);
|
|
+
|
|
+ btree_node_lock_type(c, b, SIX_LOCK_write);
|
|
+ __btree_node_free(c, b);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+
|
|
+ bch2_open_buckets_put(c, &ob);
|
|
+}
|
|
+
|
|
+void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
|
|
+ struct btree_iter *iter)
|
|
+{
|
|
+ struct btree_iter *linked;
|
|
+
|
|
+ trans_for_each_iter(iter->trans, linked)
|
|
+ BUG_ON(linked->l[b->c.level].b == b);
|
|
+
|
|
+ six_lock_write(&b->c.lock, NULL, NULL);
|
|
+ __btree_node_free(c, b);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+}
|
|
+
|
|
+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
|
|
+ struct disk_reservation *res,
|
|
+ struct closure *cl,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct write_point *wp;
|
|
+ struct btree *b;
|
|
+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
|
|
+ struct open_buckets ob = { .nr = 0 };
|
|
+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
|
|
+ unsigned nr_reserve;
|
|
+ enum alloc_reserve alloc_reserve;
|
|
+
|
|
+ if (flags & BTREE_INSERT_USE_RESERVE) {
|
|
+ nr_reserve = 0;
|
|
+ alloc_reserve = RESERVE_BTREE_MOVINGGC;
|
|
+ } else {
|
|
+ nr_reserve = BTREE_NODE_RESERVE;
|
|
+ alloc_reserve = RESERVE_BTREE;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&c->btree_reserve_cache_lock);
|
|
+ if (c->btree_reserve_cache_nr > nr_reserve) {
|
|
+ struct btree_alloc *a =
|
|
+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr];
|
|
+
|
|
+ ob = a->ob;
|
|
+ bkey_copy(&tmp.k, &a->k);
|
|
+ mutex_unlock(&c->btree_reserve_cache_lock);
|
|
+ goto mem_alloc;
|
|
+ }
|
|
+ mutex_unlock(&c->btree_reserve_cache_lock);
|
|
+
|
|
+retry:
|
|
+ wp = bch2_alloc_sectors_start(c,
|
|
+ c->opts.metadata_target ?:
|
|
+ c->opts.foreground_target,
|
|
+ 0,
|
|
+ writepoint_ptr(&c->btree_write_point),
|
|
+ &devs_have,
|
|
+ res->nr_replicas,
|
|
+ c->opts.metadata_replicas_required,
|
|
+ alloc_reserve, 0, cl);
|
|
+ if (IS_ERR(wp))
|
|
+ return ERR_CAST(wp);
|
|
+
|
|
+ if (wp->sectors_free < c->opts.btree_node_size) {
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i)
|
|
+ if (ob->sectors_free < c->opts.btree_node_size)
|
|
+ ob->sectors_free = 0;
|
|
+
|
|
+ bch2_alloc_sectors_done(c, wp);
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2))
|
|
+ bkey_btree_ptr_v2_init(&tmp.k);
|
|
+ else
|
|
+ bkey_btree_ptr_init(&tmp.k);
|
|
+
|
|
+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
|
|
+
|
|
+ bch2_open_bucket_get(c, wp, &ob);
|
|
+ bch2_alloc_sectors_done(c, wp);
|
|
+mem_alloc:
|
|
+ b = bch2_btree_node_mem_alloc(c);
|
|
+
|
|
+ /* we hold cannibalize_lock: */
|
|
+ BUG_ON(IS_ERR(b));
|
|
+ BUG_ON(b->ob.nr);
|
|
+
|
|
+ bkey_copy(&b->key, &tmp.k);
|
|
+ b->ob = ob;
|
|
+
|
|
+ return b;
|
|
+}
|
|
+
|
|
+static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ struct btree *b;
|
|
+ int ret;
|
|
+
|
|
+ BUG_ON(level >= BTREE_MAX_DEPTH);
|
|
+ BUG_ON(!as->nr_prealloc_nodes);
|
|
+
|
|
+ b = as->prealloc_nodes[--as->nr_prealloc_nodes];
|
|
+
|
|
+ set_btree_node_accessed(b);
|
|
+ set_btree_node_dirty(c, b);
|
|
+ set_btree_node_need_write(b);
|
|
+
|
|
+ bch2_bset_init_first(b, &b->data->keys);
|
|
+ b->c.level = level;
|
|
+ b->c.btree_id = as->btree_id;
|
|
+ b->version_ondisk = c->sb.version;
|
|
+
|
|
+ memset(&b->nr, 0, sizeof(b->nr));
|
|
+ b->data->magic = cpu_to_le64(bset_magic(c));
|
|
+ b->data->flags = 0;
|
|
+ SET_BTREE_NODE_ID(b->data, as->btree_id);
|
|
+ SET_BTREE_NODE_LEVEL(b->data, level);
|
|
+
|
|
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);
|
|
+
|
|
+ bp->v.mem_ptr = 0;
|
|
+ bp->v.seq = b->data->keys.seq;
|
|
+ bp->v.sectors_written = 0;
|
|
+ }
|
|
+
|
|
+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
|
|
+
|
|
+ bch2_btree_build_aux_trees(b);
|
|
+
|
|
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
|
|
+ BUG_ON(ret);
|
|
+
|
|
+ trace_btree_node_alloc(c, b);
|
|
+ return b;
|
|
+}
|
|
+
|
|
+static void btree_set_min(struct btree *b, struct bpos pos)
|
|
+{
|
|
+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
|
|
+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
|
|
+ b->data->min_key = pos;
|
|
+}
|
|
+
|
|
+static void btree_set_max(struct btree *b, struct bpos pos)
|
|
+{
|
|
+ b->key.k.p = pos;
|
|
+ b->data->max_key = pos;
|
|
+}
|
|
+
|
|
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
|
|
+ struct btree *b,
|
|
+ struct bkey_format format)
|
|
+{
|
|
+ struct btree *n;
|
|
+
|
|
+ n = bch2_btree_node_alloc(as, b->c.level);
|
|
+
|
|
+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
|
|
+
|
|
+ btree_set_min(n, b->data->min_key);
|
|
+ btree_set_max(n, b->data->max_key);
|
|
+
|
|
+ n->data->format = format;
|
|
+ btree_node_set_format(n, format);
|
|
+
|
|
+ bch2_btree_sort_into(as->c, n, b);
|
|
+
|
|
+ btree_node_reset_sib_u64s(n);
|
|
+
|
|
+ n->key.k.p = b->key.k.p;
|
|
+ return n;
|
|
+}
|
|
+
|
|
+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
|
|
+ struct btree *b)
|
|
+{
|
|
+ struct bkey_format new_f = bch2_btree_calc_format(b);
|
|
+
|
|
+ /*
|
|
+ * The keys might expand with the new format - if they wouldn't fit in
|
|
+ * the btree node anymore, use the old format for now:
|
|
+ */
|
|
+ if (!bch2_btree_node_format_fits(as->c, b, &new_f))
|
|
+ new_f = b->format;
|
|
+
|
|
+ return __bch2_btree_node_alloc_replacement(as, b, new_f);
|
|
+}
|
|
+
|
|
+static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
|
|
+{
|
|
+ struct btree *b = bch2_btree_node_alloc(as, level);
|
|
+
|
|
+ btree_set_min(b, POS_MIN);
|
|
+ btree_set_max(b, POS_MAX);
|
|
+ b->data->format = bch2_btree_calc_format(b);
|
|
+
|
|
+ btree_node_set_format(b, b->data->format);
|
|
+ bch2_btree_build_aux_trees(b);
|
|
+
|
|
+ bch2_btree_update_add_new_node(as, b);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+
|
|
+ return b;
|
|
+}
|
|
+
|
|
+static void bch2_btree_reserve_put(struct btree_update *as)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+
|
|
+ mutex_lock(&c->btree_reserve_cache_lock);
|
|
+
|
|
+ while (as->nr_prealloc_nodes) {
|
|
+ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
|
|
+
|
|
+ six_unlock_write(&b->c.lock);
|
|
+
|
|
+ if (c->btree_reserve_cache_nr <
|
|
+ ARRAY_SIZE(c->btree_reserve_cache)) {
|
|
+ struct btree_alloc *a =
|
|
+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++];
|
|
+
|
|
+ a->ob = b->ob;
|
|
+ b->ob.nr = 0;
|
|
+ bkey_copy(&a->k, &b->key);
|
|
+ } else {
|
|
+ bch2_open_buckets_put(c, &b->ob);
|
|
+ }
|
|
+
|
|
+ btree_node_lock_type(c, b, SIX_LOCK_write);
|
|
+ __btree_node_free(c, b);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->btree_reserve_cache_lock);
|
|
+}
|
|
+
|
|
+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
|
|
+ unsigned flags, struct closure *cl)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ struct btree *b;
|
|
+ int ret;
|
|
+
|
|
+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
|
|
+
|
|
+ /*
|
|
+ * Protects reaping from the btree node cache and using the btree node
|
|
+ * open bucket reserve:
|
|
+ */
|
|
+ ret = bch2_btree_cache_cannibalize_lock(c, cl);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ while (as->nr_prealloc_nodes < nr_nodes) {
|
|
+ b = __bch2_btree_node_alloc(c, &as->disk_res,
|
|
+ flags & BTREE_INSERT_NOWAIT
|
|
+ ? NULL : cl, flags);
|
|
+ if (IS_ERR(b)) {
|
|
+ ret = PTR_ERR(b);
|
|
+ goto err_free;
|
|
+ }
|
|
+
|
|
+ as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
|
|
+ }
|
|
+
|
|
+ bch2_btree_cache_cannibalize_unlock(c);
|
|
+ return 0;
|
|
+err_free:
|
|
+ bch2_btree_cache_cannibalize_unlock(c);
|
|
+ trace_btree_reserve_get_fail(c, nr_nodes, cl);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Asynchronous interior node update machinery */
|
|
+
|
|
+static void bch2_btree_update_free(struct btree_update *as)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+
|
|
+ if (as->took_gc_lock)
|
|
+ up_read(&c->gc_lock);
|
|
+ as->took_gc_lock = false;
|
|
+
|
|
+ bch2_journal_preres_put(&c->journal, &as->journal_preres);
|
|
+
|
|
+ bch2_journal_pin_drop(&c->journal, &as->journal);
|
|
+ bch2_journal_pin_flush(&c->journal, &as->journal);
|
|
+ bch2_disk_reservation_put(c, &as->disk_res);
|
|
+ bch2_btree_reserve_put(as);
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ list_del(&as->unwritten_list);
|
|
+ list_del(&as->list);
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ closure_debug_destroy(&as->cl);
|
|
+ mempool_free(as, &c->btree_interior_update_pool);
|
|
+
|
|
+ closure_wake_up(&c->btree_interior_update_wait);
|
|
+}
|
|
+
|
|
+static void btree_update_will_delete_key(struct btree_update *as,
|
|
+ struct bkey_i *k)
|
|
+{
|
|
+ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
|
|
+ ARRAY_SIZE(as->_old_keys));
|
|
+ bch2_keylist_add(&as->old_keys, k);
|
|
+}
|
|
+
|
|
+static void btree_update_will_add_key(struct btree_update *as,
|
|
+ struct bkey_i *k)
|
|
+{
|
|
+ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
|
|
+ ARRAY_SIZE(as->_new_keys));
|
|
+ bch2_keylist_add(&as->new_keys, k);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The transactional part of an interior btree node update, where we journal the
|
|
+ * update we did to the interior node and update alloc info:
|
|
+ */
|
|
+static int btree_update_nodes_written_trans(struct btree_trans *trans,
|
|
+ struct btree_update *as)
|
|
+{
|
|
+ struct bkey_i *k;
|
|
+ int ret;
|
|
+
|
|
+ trans->extra_journal_entries = (void *) &as->journal_entries[0];
|
|
+ trans->extra_journal_entry_u64s = as->journal_u64s;
|
|
+ trans->journal_pin = &as->journal;
|
|
+
|
|
+ for_each_keylist_key(&as->new_keys, k) {
|
|
+ ret = bch2_trans_mark_key(trans,
|
|
+ bkey_s_c_null,
|
|
+ bkey_i_to_s_c(k),
|
|
+ 0, 0, BTREE_TRIGGER_INSERT);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ for_each_keylist_key(&as->old_keys, k) {
|
|
+ ret = bch2_trans_mark_key(trans,
|
|
+ bkey_i_to_s_c(k),
|
|
+ bkey_s_c_null,
|
|
+ 0, 0, BTREE_TRIGGER_OVERWRITE);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void btree_update_nodes_written(struct btree_update *as)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ struct btree *b = as->b;
|
|
+ struct btree_trans trans;
|
|
+ u64 journal_seq = 0;
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ /*
|
|
+ * If we're already in an error state, it might be because a btree node
|
|
+ * was never written, and we might be trying to free that same btree
|
|
+ * node here, but it won't have been marked as allocated and we'll see
|
|
+ * spurious disk usage inconsistencies in the transactional part below
|
|
+ * if we don't skip it:
|
|
+ */
|
|
+ ret = bch2_journal_error(&c->journal);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ BUG_ON(!journal_pin_active(&as->journal));
|
|
+
|
|
+ /*
|
|
+ * We did an update to a parent node where the pointers we added pointed
|
|
+ * to child nodes that weren't written yet: now, the child nodes have
|
|
+ * been written so we can write out the update to the interior node.
|
|
+ */
|
|
+
|
|
+ /*
|
|
+ * We can't call into journal reclaim here: we'd block on the journal
|
|
+ * reclaim lock, but we may need to release the open buckets we have
|
|
+ * pinned in order for other btree updates to make forward progress, and
|
|
+ * journal reclaim does btree updates when flushing bkey_cached entries,
|
|
+ * which may require allocations as well.
|
|
+ */
|
|
+ bch2_trans_init(&trans, c, 0, 512);
|
|
+ ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_NOCHECK_RW|
|
|
+ BTREE_INSERT_JOURNAL_RECLAIM|
|
|
+ BTREE_INSERT_JOURNAL_RESERVED,
|
|
+ btree_update_nodes_written_trans(&trans, as));
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
|
|
+ "error %i in btree_update_nodes_written()", ret);
|
|
+err:
|
|
+ if (b) {
|
|
+ /*
|
|
+ * @b is the node we did the final insert into:
|
|
+ *
|
|
+ * On failure to get a journal reservation, we still have to
|
|
+ * unblock the write and allow most of the write path to happen
|
|
+ * so that shutdown works, but the i->journal_seq mechanism
|
|
+ * won't work to prevent the btree write from being visible (we
|
|
+ * didn't get a journal sequence number) - instead
|
|
+ * __bch2_btree_node_write() doesn't do the actual write if
|
|
+ * we're in journal error state:
|
|
+ */
|
|
+
|
|
+ btree_node_lock_type(c, b, SIX_LOCK_intent);
|
|
+ btree_node_lock_type(c, b, SIX_LOCK_write);
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+
|
|
+ list_del(&as->write_blocked_list);
|
|
+
|
|
+ /*
|
|
+ * Node might have been freed, recheck under
|
|
+ * btree_interior_update_lock:
|
|
+ */
|
|
+ if (as->b == b) {
|
|
+ struct bset *i = btree_bset_last(b);
|
|
+
|
|
+ BUG_ON(!b->c.level);
|
|
+ BUG_ON(!btree_node_dirty(b));
|
|
+
|
|
+ if (!ret) {
|
|
+ i->journal_seq = cpu_to_le64(
|
|
+ max(journal_seq,
|
|
+ le64_to_cpu(i->journal_seq)));
|
|
+
|
|
+ bch2_btree_add_journal_pin(c, b, journal_seq);
|
|
+ } else {
|
|
+ /*
|
|
+ * If we didn't get a journal sequence number we
|
|
+ * can't write this btree node, because recovery
|
|
+ * won't know to ignore this write:
|
|
+ */
|
|
+ set_btree_node_never_write(b);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+ six_unlock_write(&b->c.lock);
|
|
+
|
|
+ btree_node_write_if_need(c, b, SIX_LOCK_intent);
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+ }
|
|
+
|
|
+ bch2_journal_pin_drop(&c->journal, &as->journal);
|
|
+
|
|
+ bch2_journal_preres_put(&c->journal, &as->journal_preres);
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ for (i = 0; i < as->nr_new_nodes; i++) {
|
|
+ b = as->new_nodes[i];
|
|
+
|
|
+ BUG_ON(b->will_make_reachable != (unsigned long) as);
|
|
+ b->will_make_reachable = 0;
|
|
+ }
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ for (i = 0; i < as->nr_new_nodes; i++) {
|
|
+ b = as->new_nodes[i];
|
|
+
|
|
+ btree_node_lock_type(c, b, SIX_LOCK_read);
|
|
+ btree_node_write_if_need(c, b, SIX_LOCK_read);
|
|
+ six_unlock_read(&b->c.lock);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < as->nr_open_buckets; i++)
|
|
+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
|
|
+
|
|
+ bch2_btree_update_free(as);
|
|
+}
|
|
+
|
|
+static void btree_interior_update_work(struct work_struct *work)
|
|
+{
|
|
+ struct bch_fs *c =
|
|
+ container_of(work, struct bch_fs, btree_interior_update_work);
|
|
+ struct btree_update *as;
|
|
+
|
|
+ while (1) {
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
|
|
+ struct btree_update, unwritten_list);
|
|
+ if (as && !as->nodes_written)
|
|
+ as = NULL;
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ if (!as)
|
|
+ break;
|
|
+
|
|
+ btree_update_nodes_written(as);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void btree_update_set_nodes_written(struct closure *cl)
|
|
+{
|
|
+ struct btree_update *as = container_of(cl, struct btree_update, cl);
|
|
+ struct bch_fs *c = as->c;
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ as->nodes_written = true;
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * We're updating @b with pointers to nodes that haven't finished writing yet:
|
|
+ * block @b from being written until @as completes
|
|
+ */
|
|
+static void btree_update_updated_node(struct btree_update *as, struct btree *b)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
|
|
+
|
|
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
|
|
+ BUG_ON(!btree_node_dirty(b));
|
|
+
|
|
+ as->mode = BTREE_INTERIOR_UPDATING_NODE;
|
|
+ as->b = b;
|
|
+ list_add(&as->write_blocked_list, &b->write_blocked);
|
|
+
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+}
|
|
+
|
|
+static void btree_update_reparent(struct btree_update *as,
|
|
+ struct btree_update *child)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+
|
|
+ lockdep_assert_held(&c->btree_interior_update_lock);
|
|
+
|
|
+ child->b = NULL;
|
|
+ child->mode = BTREE_INTERIOR_UPDATING_AS;
|
|
+
|
|
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
|
|
+}
|
|
+
|
|
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
|
|
+{
|
|
+ struct bkey_i *insert = &b->key;
|
|
+ struct bch_fs *c = as->c;
|
|
+
|
|
+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
|
|
+
|
|
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
|
|
+ ARRAY_SIZE(as->journal_entries));
|
|
+
|
|
+ as->journal_u64s +=
|
|
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
|
|
+ BCH_JSET_ENTRY_btree_root,
|
|
+ b->c.btree_id, b->c.level,
|
|
+ insert, insert->k.u64s);
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
|
|
+
|
|
+ as->mode = BTREE_INTERIOR_UPDATING_ROOT;
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_btree_update_add_new_node:
|
|
+ *
|
|
+ * This causes @as to wait on @b to be written, before it gets to
|
|
+ * btree_update_nodes_written()
|
|
+ *
|
|
+ * Additionally, it sets b->will_make_reachable to prevent any additional writes
|
|
+ * to @b from happening besides the first until @b is reachable on disk
|
|
+ *
|
|
+ * And it adds @b to the list of @as's new nodes, so that we can update sector
|
|
+ * counts in btree_update_nodes_written():
|
|
+ */
|
|
+void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+
|
|
+ closure_get(&as->cl);
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
|
|
+ BUG_ON(b->will_make_reachable);
|
|
+
|
|
+ as->new_nodes[as->nr_new_nodes++] = b;
|
|
+ b->will_make_reachable = 1UL|(unsigned long) as;
|
|
+
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ btree_update_will_add_key(as, &b->key);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If @b is a not-yet-reachable new node, detach it from its btree_update
|
|
+ */
|
|
+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct btree_update *as;
|
|
+ unsigned long v;
|
|
+ unsigned i;
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ /*
|
|
+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's
|
|
+ * dropped when it gets written by bch2_btree_complete_write - the
|
|
+ * xchg() is for synchronization with bch2_btree_complete_write:
|
|
+ */
|
|
+ v = xchg(&b->will_make_reachable, 0);
|
|
+ as = (struct btree_update *) (v & ~1UL);
|
|
+
|
|
+ if (!as) {
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < as->nr_new_nodes; i++)
|
|
+ if (as->new_nodes[i] == b)
|
|
+ goto found;
|
|
+
|
|
+ BUG();
|
|
+found:
|
|
+ array_remove_item(as->new_nodes, as->nr_new_nodes, i);
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ if (v & 1)
|
|
+ closure_put(&as->cl);
|
|
+}
|
|
+
|
|
+void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
|
|
+{
|
|
+ while (b->ob.nr)
|
|
+ as->open_buckets[as->nr_open_buckets++] =
|
|
+ b->ob.v[--b->ob.nr];
|
|
+}
|
|
+
|
|
+/*
|
|
+ * @b is being split/rewritten: it may have pointers to not-yet-written btree
|
|
+ * nodes and thus outstanding btree_updates - redirect @b's
|
|
+ * btree_updates to point to this btree_update:
|
|
+ */
|
|
+void bch2_btree_interior_update_will_free_node(struct btree_update *as,
|
|
+ struct btree *b)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ struct btree_update *p, *n;
|
|
+ struct btree_write *w;
|
|
+
|
|
+ set_btree_node_dying(b);
|
|
+
|
|
+ if (btree_node_fake(b))
|
|
+ return;
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+
|
|
+ /*
|
|
+ * Does this node have any btree_update operations preventing
|
|
+ * it from being written?
|
|
+ *
|
|
+ * If so, redirect them to point to this btree_update: we can
|
|
+ * write out our new nodes, but we won't make them visible until those
|
|
+ * operations complete
|
|
+ */
|
|
+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
|
|
+ list_del_init(&p->write_blocked_list);
|
|
+ btree_update_reparent(as, p);
|
|
+
|
|
+ /*
|
|
+ * for flush_held_btree_writes() waiting on updates to flush or
|
|
+ * nodes to be writeable:
|
|
+ */
|
|
+ closure_wake_up(&c->btree_interior_update_wait);
|
|
+ }
|
|
+
|
|
+ clear_btree_node_dirty(c, b);
|
|
+ clear_btree_node_need_write(b);
|
|
+
|
|
+ /*
|
|
+ * Does this node have unwritten data that has a pin on the journal?
|
|
+ *
|
|
+ * If so, transfer that pin to the btree_update operation -
|
|
+ * note that if we're freeing multiple nodes, we only need to keep the
|
|
+ * oldest pin of any of the nodes we're freeing. We'll release the pin
|
|
+ * when the new nodes are persistent and reachable on disk:
|
|
+ */
|
|
+ w = btree_current_write(b);
|
|
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
|
|
+ bch2_journal_pin_drop(&c->journal, &w->journal);
|
|
+
|
|
+ w = btree_prev_write(b);
|
|
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
|
|
+ bch2_journal_pin_drop(&c->journal, &w->journal);
|
|
+
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ /*
|
|
+ * Is this a node that isn't reachable on disk yet?
|
|
+ *
|
|
+ * Nodes that aren't reachable yet have writes blocked until they're
|
|
+ * reachable - now that we've cancelled any pending writes and moved
|
|
+ * things waiting on that write to wait on this update, we can drop this
|
|
+ * node from the list of nodes that the other update is making
|
|
+ * reachable, prior to freeing it:
|
|
+ */
|
|
+ btree_update_drop_new_node(c, b);
|
|
+
|
|
+ btree_update_will_delete_key(as, &b->key);
|
|
+
|
|
+ /*
|
|
+ * XXX: Waiting on io with btree node locks held, we don't want to be
|
|
+ * doing this. We can't have btree writes happening after the space has
|
|
+ * been freed, but we really only need to block before
|
|
+ * btree_update_nodes_written_trans() happens.
|
|
+ */
|
|
+ btree_node_wait_on_io(b);
|
|
+}
|
|
+
|
|
+void bch2_btree_update_done(struct btree_update *as)
|
|
+{
|
|
+ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
|
|
+
|
|
+ if (as->took_gc_lock)
|
|
+ up_read(&as->c->gc_lock);
|
|
+ as->took_gc_lock = false;
|
|
+
|
|
+ bch2_btree_reserve_put(as);
|
|
+
|
|
+ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
|
|
+}
|
|
+
|
|
+struct btree_update *
|
|
+bch2_btree_update_start(struct btree_iter *iter, unsigned level,
|
|
+ unsigned nr_nodes, unsigned flags)
|
|
+{
|
|
+ struct btree_trans *trans = iter->trans;
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_update *as;
|
|
+ struct closure cl;
|
|
+ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
|
|
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
|
|
+ int journal_flags = 0;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (flags & BTREE_INSERT_JOURNAL_RESERVED)
|
|
+ journal_flags |= JOURNAL_RES_GET_RESERVED;
|
|
+
|
|
+ closure_init_stack(&cl);
|
|
+retry:
|
|
+ /*
|
|
+ * This check isn't necessary for correctness - it's just to potentially
|
|
+ * prevent us from doing a lot of work that'll end up being wasted:
|
|
+ */
|
|
+ ret = bch2_journal_error(&c->journal);
|
|
+ if (ret)
|
|
+ return ERR_PTR(ret);
|
|
+
|
|
+ /*
|
|
+ * XXX: figure out how far we might need to split,
|
|
+ * instead of locking/reserving all the way to the root:
|
|
+ */
|
|
+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
|
|
+ trace_trans_restart_iter_upgrade(trans->ip);
|
|
+ return ERR_PTR(-EINTR);
|
|
+ }
|
|
+
|
|
+ if (flags & BTREE_INSERT_GC_LOCK_HELD)
|
|
+ lockdep_assert_held(&c->gc_lock);
|
|
+ else if (!down_read_trylock(&c->gc_lock)) {
|
|
+ if (flags & BTREE_INSERT_NOUNLOCK)
|
|
+ return ERR_PTR(-EINTR);
|
|
+
|
|
+ bch2_trans_unlock(trans);
|
|
+ down_read(&c->gc_lock);
|
|
+ if (!bch2_trans_relock(trans)) {
|
|
+ up_read(&c->gc_lock);
|
|
+ return ERR_PTR(-EINTR);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
|
|
+ memset(as, 0, sizeof(*as));
|
|
+ closure_init(&as->cl, NULL);
|
|
+ as->c = c;
|
|
+ as->mode = BTREE_INTERIOR_NO_UPDATE;
|
|
+ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
|
|
+ as->btree_id = iter->btree_id;
|
|
+ INIT_LIST_HEAD(&as->list);
|
|
+ INIT_LIST_HEAD(&as->unwritten_list);
|
|
+ INIT_LIST_HEAD(&as->write_blocked_list);
|
|
+ bch2_keylist_init(&as->old_keys, as->_old_keys);
|
|
+ bch2_keylist_init(&as->new_keys, as->_new_keys);
|
|
+ bch2_keylist_init(&as->parent_keys, as->inline_keys);
|
|
+
|
|
+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
|
|
+ BTREE_UPDATE_JOURNAL_RES,
|
|
+ journal_flags|JOURNAL_RES_GET_NONBLOCK);
|
|
+ if (ret == -EAGAIN) {
|
|
+ /*
|
|
+ * this would be cleaner if bch2_journal_preres_get() took a
|
|
+ * closure argument
|
|
+ */
|
|
+ if (flags & BTREE_INSERT_NOUNLOCK) {
|
|
+ trace_trans_restart_journal_preres_get(trans->ip);
|
|
+ ret = -EINTR;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_trans_unlock(trans);
|
|
+
|
|
+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
|
|
+ bch2_btree_update_free(as);
|
|
+ return ERR_PTR(ret);
|
|
+ }
|
|
+
|
|
+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
|
|
+ BTREE_UPDATE_JOURNAL_RES,
|
|
+ journal_flags);
|
|
+ if (ret) {
|
|
+ trace_trans_restart_journal_preres_get(trans->ip);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (!bch2_trans_relock(trans)) {
|
|
+ ret = -EINTR;
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ret = bch2_disk_reservation_get(c, &as->disk_res,
|
|
+ nr_nodes * c->opts.btree_node_size,
|
|
+ c->opts.metadata_replicas,
|
|
+ disk_res_flags);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = bch2_btree_reserve_get(as, nr_nodes, flags,
|
|
+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ bch2_journal_pin_add(&c->journal,
|
|
+ atomic64_read(&c->journal.seq),
|
|
+ &as->journal, NULL);
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ list_add_tail(&as->list, &c->btree_interior_update_list);
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ return as;
|
|
+err:
|
|
+ bch2_btree_update_free(as);
|
|
+
|
|
+ if (ret == -EAGAIN) {
|
|
+ BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
|
|
+
|
|
+ bch2_trans_unlock(trans);
|
|
+ closure_sync(&cl);
|
|
+ ret = -EINTR;
|
|
+ }
|
|
+
|
|
+ if (ret == -EINTR && bch2_trans_relock(trans))
|
|
+ goto retry;
|
|
+
|
|
+ return ERR_PTR(ret);
|
|
+}
|
|
+
|
|
+/* Btree root updates: */
|
|
+
|
|
+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ /* Root nodes cannot be reaped */
|
|
+ mutex_lock(&c->btree_cache.lock);
|
|
+ list_del_init(&b->list);
|
|
+ mutex_unlock(&c->btree_cache.lock);
|
|
+
|
|
+ if (b->c.level)
|
|
+ six_lock_pcpu_alloc(&b->c.lock);
|
|
+ else
|
|
+ six_lock_pcpu_free(&b->c.lock);
|
|
+
|
|
+ mutex_lock(&c->btree_root_lock);
|
|
+ BUG_ON(btree_node_root(c, b) &&
|
|
+ (b->c.level < btree_node_root(c, b)->c.level ||
|
|
+ !btree_node_dying(btree_node_root(c, b))));
|
|
+
|
|
+ btree_node_root(c, b) = b;
|
|
+ mutex_unlock(&c->btree_root_lock);
|
|
+
|
|
+ bch2_recalc_btree_reserve(c);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_set_root - update the root in memory and on disk
|
|
+ *
|
|
+ * To ensure forward progress, the current task must not be holding any
|
|
+ * btree node write locks. However, you must hold an intent lock on the
|
|
+ * old root.
|
|
+ *
|
|
+ * Note: This allocates a journal entry but doesn't add any keys to
|
|
+ * it. All the btree roots are part of every journal write, so there
|
|
+ * is nothing new to be done. This just guarantees that there is a
|
|
+ * journal write.
|
|
+ */
|
|
+static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
|
|
+ struct btree_iter *iter)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ struct btree *old;
|
|
+
|
|
+ trace_btree_set_root(c, b);
|
|
+ BUG_ON(!b->written &&
|
|
+ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
|
|
+
|
|
+ old = btree_node_root(c, b);
|
|
+
|
|
+ /*
|
|
+ * Ensure no one is using the old root while we switch to the
|
|
+ * new root:
|
|
+ */
|
|
+ bch2_btree_node_lock_write(old, iter);
|
|
+
|
|
+ bch2_btree_set_root_inmem(c, b);
|
|
+
|
|
+ btree_update_updated_root(as, b);
|
|
+
|
|
+ /*
|
|
+ * Unlock old root after new root is visible:
|
|
+ *
|
|
+ * The new root isn't persistent, but that's ok: we still have
|
|
+ * an intent lock on the new root, and any updates that would
|
|
+ * depend on the new root would have to update the new root.
|
|
+ */
|
|
+ bch2_btree_node_unlock_write(old, iter);
|
|
+}
|
|
+
|
|
+/* Interior node updates: */
|
|
+
|
|
+static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_i *insert,
|
|
+ struct btree_node_iter *node_iter)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ struct bkey_packed *k;
|
|
+ const char *invalid;
|
|
+
|
|
+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
|
|
+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
|
|
+ if (invalid) {
|
|
+ char buf[160];
|
|
+
|
|
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
|
|
+ bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
|
|
+ dump_stack();
|
|
+ }
|
|
+
|
|
+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
|
|
+ ARRAY_SIZE(as->journal_entries));
|
|
+
|
|
+ as->journal_u64s +=
|
|
+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
|
|
+ BCH_JSET_ENTRY_btree_keys,
|
|
+ b->c.btree_id, b->c.level,
|
|
+ insert, insert->k.u64s);
|
|
+
|
|
+ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
|
|
+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
|
|
+ bch2_btree_node_iter_advance(node_iter, b);
|
|
+
|
|
+ bch2_btree_bset_insert_key(iter, b, node_iter, insert);
|
|
+ set_btree_node_dirty(c, b);
|
|
+ set_btree_node_need_write(b);
|
|
+}
|
|
+
|
|
+static void
|
|
+__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
|
|
+ struct btree_iter *iter, struct keylist *keys,
|
|
+ struct btree_node_iter node_iter)
|
|
+{
|
|
+ struct bkey_i *insert = bch2_keylist_front(keys);
|
|
+ struct bkey_packed *k;
|
|
+
|
|
+ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
|
|
+
|
|
+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
|
|
+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
|
|
+ ;
|
|
+
|
|
+ while (!bch2_keylist_empty(keys)) {
|
|
+ bch2_insert_fixup_btree_ptr(as, b, iter,
|
|
+ bch2_keylist_front(keys), &node_iter);
|
|
+ bch2_keylist_pop_front(keys);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher
|
|
+ * node)
|
|
+ */
|
|
+static struct btree *__btree_split_node(struct btree_update *as,
|
|
+ struct btree *n1,
|
|
+ struct btree_iter *iter)
|
|
+{
|
|
+ struct bkey_format_state s;
|
|
+ size_t nr_packed = 0, nr_unpacked = 0;
|
|
+ struct btree *n2;
|
|
+ struct bset *set1, *set2;
|
|
+ struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
|
|
+ struct bpos n1_pos;
|
|
+
|
|
+ n2 = bch2_btree_node_alloc(as, n1->c.level);
|
|
+ bch2_btree_update_add_new_node(as, n2);
|
|
+
|
|
+ n2->data->max_key = n1->data->max_key;
|
|
+ n2->data->format = n1->format;
|
|
+ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
|
|
+ n2->key.k.p = n1->key.k.p;
|
|
+
|
|
+ set1 = btree_bset_first(n1);
|
|
+ set2 = btree_bset_first(n2);
|
|
+
|
|
+ /*
|
|
+ * Has to be a linear search because we don't have an auxiliary
|
|
+ * search tree yet
|
|
+ */
|
|
+ k = set1->start;
|
|
+ while (1) {
|
|
+ struct bkey_packed *n = bkey_next(k);
|
|
+
|
|
+ if (n == vstruct_last(set1))
|
|
+ break;
|
|
+ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
|
|
+ break;
|
|
+
|
|
+ if (bkey_packed(k))
|
|
+ nr_packed++;
|
|
+ else
|
|
+ nr_unpacked++;
|
|
+
|
|
+ prev = k;
|
|
+ k = n;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!prev);
|
|
+ set2_start = k;
|
|
+ set2_end = vstruct_last(set1);
|
|
+
|
|
+ set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
|
|
+ set_btree_bset_end(n1, n1->set);
|
|
+
|
|
+ n1->nr.live_u64s = le16_to_cpu(set1->u64s);
|
|
+ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s);
|
|
+ n1->nr.packed_keys = nr_packed;
|
|
+ n1->nr.unpacked_keys = nr_unpacked;
|
|
+
|
|
+ n1_pos = bkey_unpack_pos(n1, prev);
|
|
+ if (as->c->sb.version < bcachefs_metadata_version_snapshot)
|
|
+ n1_pos.snapshot = U32_MAX;
|
|
+
|
|
+ btree_set_max(n1, n1_pos);
|
|
+ btree_set_min(n2, bpos_successor(n1->key.k.p));
|
|
+
|
|
+ bch2_bkey_format_init(&s);
|
|
+ bch2_bkey_format_add_pos(&s, n2->data->min_key);
|
|
+ bch2_bkey_format_add_pos(&s, n2->data->max_key);
|
|
+
|
|
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
|
|
+ struct bkey uk = bkey_unpack_key(n1, k);
|
|
+ bch2_bkey_format_add_key(&s, &uk);
|
|
+ }
|
|
+
|
|
+ n2->data->format = bch2_bkey_format_done(&s);
|
|
+ btree_node_set_format(n2, n2->data->format);
|
|
+
|
|
+ out = set2->start;
|
|
+ memset(&n2->nr, 0, sizeof(n2->nr));
|
|
+
|
|
+ for (k = set2_start; k != set2_end; k = bkey_next(k)) {
|
|
+ BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
|
|
+ ? &n1->format : &bch2_bkey_format_current, k));
|
|
+ out->format = KEY_FORMAT_LOCAL_BTREE;
|
|
+ btree_keys_account_key_add(&n2->nr, 0, out);
|
|
+ out = bkey_next(out);
|
|
+ }
|
|
+
|
|
+ set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
|
|
+ set_btree_bset_end(n2, n2->set);
|
|
+
|
|
+ BUG_ON(!set1->u64s);
|
|
+ BUG_ON(!set2->u64s);
|
|
+
|
|
+ btree_node_reset_sib_u64s(n1);
|
|
+ btree_node_reset_sib_u64s(n2);
|
|
+
|
|
+ bch2_verify_btree_nr_keys(n1);
|
|
+ bch2_verify_btree_nr_keys(n2);
|
|
+
|
|
+ if (n1->c.level) {
|
|
+ btree_node_interior_verify(as->c, n1);
|
|
+ btree_node_interior_verify(as->c, n2);
|
|
+ }
|
|
+
|
|
+ return n2;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * For updates to interior nodes, we've got to do the insert before we split
|
|
+ * because the stuff we're inserting has to be inserted atomically. Post split,
|
|
+ * the keys might have to go in different nodes and the split would no longer be
|
|
+ * atomic.
|
|
+ *
|
|
+ * Worse, if the insert is from btree node coalescing, if we do the insert after
|
|
+ * we do the split (and pick the pivot) - the pivot we pick might be between
|
|
+ * nodes that were coalesced, and thus in the middle of a child node post
|
|
+ * coalescing:
|
|
+ */
|
|
+static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
|
|
+ struct btree_iter *iter,
|
|
+ struct keylist *keys)
|
|
+{
|
|
+ struct btree_node_iter node_iter;
|
|
+ struct bkey_i *k = bch2_keylist_front(keys);
|
|
+ struct bkey_packed *src, *dst, *n;
|
|
+ struct bset *i;
|
|
+
|
|
+ bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
|
|
+
|
|
+ __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter);
|
|
+
|
|
+ /*
|
|
+ * We can't tolerate whiteouts here - with whiteouts there can be
|
|
+ * duplicate keys, and it would be rather bad if we picked a duplicate
|
|
+ * for the pivot:
|
|
+ */
|
|
+ i = btree_bset_first(b);
|
|
+ src = dst = i->start;
|
|
+ while (src != vstruct_last(i)) {
|
|
+ n = bkey_next(src);
|
|
+ if (!bkey_deleted(src)) {
|
|
+ memmove_u64s_down(dst, src, src->u64s);
|
|
+ dst = bkey_next(dst);
|
|
+ }
|
|
+ src = n;
|
|
+ }
|
|
+
|
|
+ /* Also clear out the unwritten whiteouts area: */
|
|
+ b->whiteout_u64s = 0;
|
|
+
|
|
+ i->u64s = cpu_to_le16((u64 *) dst - i->_data);
|
|
+ set_btree_bset_end(b, b->set);
|
|
+
|
|
+ BUG_ON(b->nsets != 1 ||
|
|
+ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
|
|
+
|
|
+ btree_node_interior_verify(as->c, b);
|
|
+}
|
|
+
|
|
+static void btree_split(struct btree_update *as, struct btree *b,
|
|
+ struct btree_iter *iter, struct keylist *keys,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ struct btree *parent = btree_node_parent(iter, b);
|
|
+ struct btree *n1, *n2 = NULL, *n3 = NULL;
|
|
+ u64 start_time = local_clock();
|
|
+
|
|
+ BUG_ON(!parent && (b != btree_node_root(c, b)));
|
|
+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
|
|
+
|
|
+ bch2_btree_interior_update_will_free_node(as, b);
|
|
+
|
|
+ n1 = bch2_btree_node_alloc_replacement(as, b);
|
|
+ bch2_btree_update_add_new_node(as, n1);
|
|
+
|
|
+ if (keys)
|
|
+ btree_split_insert_keys(as, n1, iter, keys);
|
|
+
|
|
+ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
|
|
+ trace_btree_split(c, b);
|
|
+
|
|
+ n2 = __btree_split_node(as, n1, iter);
|
|
+
|
|
+ bch2_btree_build_aux_trees(n2);
|
|
+ bch2_btree_build_aux_trees(n1);
|
|
+ six_unlock_write(&n2->c.lock);
|
|
+ six_unlock_write(&n1->c.lock);
|
|
+
|
|
+ bch2_btree_node_write(c, n2, SIX_LOCK_intent);
|
|
+
|
|
+ /*
|
|
+ * Note that when we are called recursively, parent_keys == keys, so we
|
|
+ * can't start adding new keys to parent_keys before emptying it
|
|
+ * out (which we did with btree_split_insert_keys() above)
|
|
+ */
|
|
+ bch2_keylist_add(&as->parent_keys, &n1->key);
|
|
+ bch2_keylist_add(&as->parent_keys, &n2->key);
|
|
+
|
|
+ if (!parent) {
|
|
+ /* Depth increases, make a new root */
|
|
+ n3 = __btree_root_alloc(as, b->c.level + 1);
|
|
+
|
|
+ n3->sib_u64s[0] = U16_MAX;
|
|
+ n3->sib_u64s[1] = U16_MAX;
|
|
+
|
|
+ btree_split_insert_keys(as, n3, iter, &as->parent_keys);
|
|
+
|
|
+ bch2_btree_node_write(c, n3, SIX_LOCK_intent);
|
|
+ }
|
|
+ } else {
|
|
+ trace_btree_compact(c, b);
|
|
+
|
|
+ bch2_btree_build_aux_trees(n1);
|
|
+ six_unlock_write(&n1->c.lock);
|
|
+
|
|
+ if (parent)
|
|
+ bch2_keylist_add(&as->parent_keys, &n1->key);
|
|
+ }
|
|
+
|
|
+ bch2_btree_node_write(c, n1, SIX_LOCK_intent);
|
|
+
|
|
+ /* New nodes all written, now make them visible: */
|
|
+
|
|
+ if (parent) {
|
|
+ /* Split a non root node */
|
|
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
|
|
+ } else if (n3) {
|
|
+ bch2_btree_set_root(as, n3, iter);
|
|
+ } else {
|
|
+ /* Root filled up but didn't need to be split */
|
|
+ bch2_btree_set_root(as, n1, iter);
|
|
+ }
|
|
+
|
|
+ bch2_btree_update_get_open_buckets(as, n1);
|
|
+ if (n2)
|
|
+ bch2_btree_update_get_open_buckets(as, n2);
|
|
+ if (n3)
|
|
+ bch2_btree_update_get_open_buckets(as, n3);
|
|
+
|
|
+ /* Successful split, update the iterator to point to the new nodes: */
|
|
+
|
|
+ six_lock_increment(&b->c.lock, SIX_LOCK_intent);
|
|
+ bch2_btree_iter_node_drop(iter, b);
|
|
+ if (n3)
|
|
+ bch2_btree_iter_node_replace(iter, n3);
|
|
+ if (n2)
|
|
+ bch2_btree_iter_node_replace(iter, n2);
|
|
+ bch2_btree_iter_node_replace(iter, n1);
|
|
+
|
|
+ /*
|
|
+ * The old node must be freed (in memory) _before_ unlocking the new
|
|
+ * nodes - else another thread could re-acquire a read lock on the old
|
|
+ * node after another thread has locked and updated the new node, thus
|
|
+ * seeing stale data:
|
|
+ */
|
|
+ bch2_btree_node_free_inmem(c, b, iter);
|
|
+
|
|
+ if (n3)
|
|
+ six_unlock_intent(&n3->c.lock);
|
|
+ if (n2)
|
|
+ six_unlock_intent(&n2->c.lock);
|
|
+ six_unlock_intent(&n1->c.lock);
|
|
+
|
|
+ bch2_btree_trans_verify_locks(iter->trans);
|
|
+
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
|
|
+ start_time);
|
|
+}
|
|
+
|
|
+static void
|
|
+bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
|
|
+ struct btree_iter *iter, struct keylist *keys)
|
|
+{
|
|
+ struct btree_iter *linked;
|
|
+
|
|
+ __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter);
|
|
+
|
|
+ btree_update_updated_node(as, b);
|
|
+
|
|
+ trans_for_each_iter_with_node(iter->trans, b, linked)
|
|
+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
|
|
+
|
|
+ bch2_btree_trans_verify_iters(iter->trans, b);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_insert_node - insert bkeys into a given btree node
|
|
+ *
|
|
+ * @as: in-progress interior update this insert is part of
|
|
+ * @b: interior node to insert into (must not be a leaf)
|
|
+ * @iter: btree iterator
|
|
+ * @keys: list of keys to insert
|
|
+ * @flags: passed through to btree_split() if @b has to be split
|
|
+ *
|
|
+ * Inserts as many keys as it can into a given btree node, splitting it if full.
|
|
+ * If a split occurred, this function will return early. This can only happen
|
|
+ * for leaf nodes -- inserts into interior nodes have to be atomic.
|
|
+ */
|
|
+void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
|
|
+ struct btree_iter *iter, struct keylist *keys,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct bch_fs *c = as->c;
|
|
+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
|
|
+ int old_live_u64s = b->nr.live_u64s;
|
|
+ int live_u64s_added, u64s_added;
|
|
+
|
|
+ lockdep_assert_held(&c->gc_lock);
|
|
+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level));
|
|
+ BUG_ON(!b->c.level);
|
|
+ BUG_ON(!as || as->b);
|
|
+ bch2_verify_keylist_sorted(keys);
|
|
+
|
|
+ bch2_btree_node_lock_for_insert(c, b, iter);
|
|
+
|
|
+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
|
|
+ bch2_btree_node_unlock_write(b, iter);
|
|
+ goto split;
|
|
+ }
|
|
+
|
|
+ btree_node_interior_verify(c, b);
|
|
+
|
|
+ bch2_btree_insert_keys_interior(as, b, iter, keys);
|
|
+
|
|
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
|
|
+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
|
|
+
|
|
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
|
|
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
|
|
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
|
|
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
|
|
+
|
|
+ if (u64s_added > live_u64s_added &&
|
|
+ bch2_maybe_compact_whiteouts(c, b))
|
|
+ bch2_btree_iter_reinit_node(iter, b);
|
|
+
|
|
+ bch2_btree_node_unlock_write(b, iter);
|
|
+
|
|
+ btree_node_interior_verify(c, b);
|
|
+ return;
|
|
+split:
|
|
+ btree_split(as, b, iter, keys, flags);
|
|
+}
|
|
+
|
|
+int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct btree *b = iter_l(iter)->b;
|
|
+ struct btree_update *as;
|
|
+ unsigned l;
|
|
+ int ret = 0;
|
|
+
|
|
+ as = bch2_btree_update_start(iter, iter->level,
|
|
+ btree_update_reserve_required(c, b), flags);
|
|
+ if (IS_ERR(as))
|
|
+ return PTR_ERR(as);
|
|
+
|
|
+ btree_split(as, b, iter, NULL, flags);
|
|
+ bch2_btree_update_done(as);
|
|
+
|
|
+ for (l = iter->level + 1; btree_iter_node(iter, l) && !ret; l++)
|
|
+ ret = bch2_foreground_maybe_merge(c, iter, l, flags);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int __bch2_foreground_maybe_merge(struct bch_fs *c,
|
|
+ struct btree_iter *iter,
|
|
+ unsigned level,
|
|
+ unsigned flags,
|
|
+ enum btree_node_sibling sib)
|
|
+{
|
|
+ struct btree_trans *trans = iter->trans;
|
|
+ struct btree_iter *sib_iter = NULL;
|
|
+ struct btree_update *as;
|
|
+ struct bkey_format_state new_s;
|
|
+ struct bkey_format new_f;
|
|
+ struct bkey_i delete;
|
|
+ struct btree *b, *m, *n, *prev, *next, *parent;
|
|
+ struct bpos sib_pos;
|
|
+ size_t sib_u64s;
|
|
+ int ret = 0, ret2 = 0;
|
|
+
|
|
+ BUG_ON(!btree_node_locked(iter, level));
|
|
+retry:
|
|
+ ret = bch2_btree_iter_traverse(iter);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ BUG_ON(!btree_node_locked(iter, level));
|
|
+
|
|
+ b = iter->l[level].b;
|
|
+
|
|
+ if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
|
|
+ (sib == btree_next_sib && !bpos_cmp(b->data->max_key, POS_MAX))) {
|
|
+ b->sib_u64s[sib] = U16_MAX;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ sib_pos = sib == btree_prev_sib
|
|
+ ? bpos_predecessor(b->data->min_key)
|
|
+ : bpos_successor(b->data->max_key);
|
|
+
|
|
+ sib_iter = bch2_trans_get_node_iter(trans, iter->btree_id,
|
|
+ sib_pos, U8_MAX, level,
|
|
+ BTREE_ITER_INTENT);
|
|
+ ret = bch2_btree_iter_traverse(sib_iter);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ m = sib_iter->l[level].b;
|
|
+
|
|
+ if (btree_node_parent(iter, b) !=
|
|
+ btree_node_parent(sib_iter, m)) {
|
|
+ b->sib_u64s[sib] = U16_MAX;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (sib == btree_prev_sib) {
|
|
+ prev = m;
|
|
+ next = b;
|
|
+ } else {
|
|
+ prev = b;
|
|
+ next = m;
|
|
+ }
|
|
+
|
|
+ if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
|
|
+ char buf1[100], buf2[100];
|
|
+
|
|
+ bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
|
|
+ bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
|
|
+ bch_err(c,
|
|
+ "btree topology error in btree merge:\n"
|
|
+ " prev ends at %s\n"
|
|
+ " next starts at %s",
|
|
+ buf1, buf2);
|
|
+ bch2_topology_error(c);
|
|
+ ret = -EIO;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_bkey_format_init(&new_s);
|
|
+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
|
|
+ __bch2_btree_calc_format(&new_s, prev);
|
|
+ __bch2_btree_calc_format(&new_s, next);
|
|
+ bch2_bkey_format_add_pos(&new_s, next->data->max_key);
|
|
+ new_f = bch2_bkey_format_done(&new_s);
|
|
+
|
|
+ sib_u64s = btree_node_u64s_with_format(b, &new_f) +
|
|
+ btree_node_u64s_with_format(m, &new_f);
|
|
+
|
|
+ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
|
|
+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
|
|
+ sib_u64s /= 2;
|
|
+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
|
|
+ }
|
|
+
|
|
+ sib_u64s = min(sib_u64s, btree_max_u64s(c));
|
|
+ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
|
|
+ b->sib_u64s[sib] = sib_u64s;
|
|
+
|
|
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
|
|
+ goto out;
|
|
+
|
|
+ parent = btree_node_parent(iter, b);
|
|
+ as = bch2_btree_update_start(iter, level,
|
|
+ btree_update_reserve_required(c, parent) + 1,
|
|
+ flags|
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_USE_RESERVE);
|
|
+ ret = PTR_ERR_OR_ZERO(as);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ trace_btree_merge(c, b);
|
|
+
|
|
+ bch2_btree_interior_update_will_free_node(as, b);
|
|
+ bch2_btree_interior_update_will_free_node(as, m);
|
|
+
|
|
+ n = bch2_btree_node_alloc(as, b->c.level);
|
|
+ bch2_btree_update_add_new_node(as, n);
|
|
+
|
|
+ btree_set_min(n, prev->data->min_key);
|
|
+ btree_set_max(n, next->data->max_key);
|
|
+ n->data->format = new_f;
|
|
+
|
|
+ btree_node_set_format(n, new_f);
|
|
+
|
|
+ bch2_btree_sort_into(c, n, prev);
|
|
+ bch2_btree_sort_into(c, n, next);
|
|
+
|
|
+ bch2_btree_build_aux_trees(n);
|
|
+ six_unlock_write(&n->c.lock);
|
|
+
|
|
+ bkey_init(&delete.k);
|
|
+ delete.k.p = prev->key.k.p;
|
|
+ bch2_keylist_add(&as->parent_keys, &delete);
|
|
+ bch2_keylist_add(&as->parent_keys, &n->key);
|
|
+
|
|
+ bch2_btree_node_write(c, n, SIX_LOCK_intent);
|
|
+
|
|
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
|
|
+
|
|
+ bch2_btree_update_get_open_buckets(as, n);
|
|
+
|
|
+ six_lock_increment(&b->c.lock, SIX_LOCK_intent);
|
|
+ six_lock_increment(&m->c.lock, SIX_LOCK_intent);
|
|
+ bch2_btree_iter_node_drop(iter, b);
|
|
+ bch2_btree_iter_node_drop(iter, m);
|
|
+
|
|
+ bch2_btree_iter_node_replace(iter, n);
|
|
+
|
|
+ bch2_btree_trans_verify_iters(trans, n);
|
|
+
|
|
+ bch2_btree_node_free_inmem(c, b, iter);
|
|
+ bch2_btree_node_free_inmem(c, m, iter);
|
|
+
|
|
+ six_unlock_intent(&n->c.lock);
|
|
+
|
|
+ bch2_btree_update_done(as);
|
|
+out:
|
|
+ bch2_btree_trans_verify_locks(trans);
|
|
+ bch2_trans_iter_free(trans, sib_iter);
|
|
+
|
|
+ /*
|
|
+ * Don't downgrade locks here: we're called after successful insert,
|
|
+ * and the caller will downgrade locks after a successful insert
|
|
+ * anyways (in case e.g. a split was required first)
|
|
+ *
|
|
+ * And we're also called when inserting into interior nodes in the
|
|
+ * split path, and downgrading to read locks in there is potentially
|
|
+ * confusing:
|
|
+ */
|
|
+ return ret ?: ret2;
|
|
+err:
|
|
+ bch2_trans_iter_put(trans, sib_iter);
|
|
+ sib_iter = NULL;
|
|
+
|
|
+ if (ret == -EINTR && bch2_trans_relock(trans))
|
|
+ goto retry;
|
|
+
|
|
+ if (ret == -EINTR && !(flags & BTREE_INSERT_NOUNLOCK)) {
|
|
+ ret2 = ret;
|
|
+ ret = bch2_btree_iter_traverse_all(trans);
|
|
+ if (!ret)
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_node_rewrite - Rewrite/move a btree node
|
|
+ */
|
|
+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
|
|
+ __le64 seq, unsigned flags)
|
|
+{
|
|
+ struct btree *b, *n, *parent;
|
|
+ struct btree_update *as;
|
|
+ int ret;
|
|
+
|
|
+ flags |= BTREE_INSERT_NOFAIL;
|
|
+retry:
|
|
+ ret = bch2_btree_iter_traverse(iter);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ b = bch2_btree_iter_peek_node(iter);
|
|
+ if (!b || b->data->keys.seq != seq)
|
|
+ goto out;
|
|
+
|
|
+ parent = btree_node_parent(iter, b);
|
|
+ as = bch2_btree_update_start(iter, b->c.level,
|
|
+ (parent
|
|
+ ? btree_update_reserve_required(c, parent)
|
|
+ : 0) + 1,
|
|
+ flags);
|
|
+ ret = PTR_ERR_OR_ZERO(as);
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+ if (ret) {
|
|
+ trace_btree_gc_rewrite_node_fail(c, b);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ bch2_btree_interior_update_will_free_node(as, b);
|
|
+
|
|
+ n = bch2_btree_node_alloc_replacement(as, b);
|
|
+ bch2_btree_update_add_new_node(as, n);
|
|
+
|
|
+ bch2_btree_build_aux_trees(n);
|
|
+ six_unlock_write(&n->c.lock);
|
|
+
|
|
+ trace_btree_gc_rewrite_node(c, b);
|
|
+
|
|
+ bch2_btree_node_write(c, n, SIX_LOCK_intent);
|
|
+
|
|
+ if (parent) {
|
|
+ bch2_keylist_add(&as->parent_keys, &n->key);
|
|
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
|
|
+ } else {
|
|
+ bch2_btree_set_root(as, n, iter);
|
|
+ }
|
|
+
|
|
+ bch2_btree_update_get_open_buckets(as, n);
|
|
+
|
|
+ six_lock_increment(&b->c.lock, SIX_LOCK_intent);
|
|
+ bch2_btree_iter_node_drop(iter, b);
|
|
+ bch2_btree_iter_node_replace(iter, n);
|
|
+ bch2_btree_node_free_inmem(c, b, iter);
|
|
+ six_unlock_intent(&n->c.lock);
|
|
+
|
|
+ bch2_btree_update_done(as);
|
|
+out:
|
|
+ bch2_btree_iter_downgrade(iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct async_btree_rewrite { /* one queued request to rewrite a btree node from a workqueue */
|
|
+ struct bch_fs *c; /* filesystem the node belongs to */
|
|
+ struct work_struct work; /* work item; handler is async_btree_node_rewrite_work() */
|
|
+ enum btree_id btree_id; /* btree the node is in (copied from b->c.btree_id) */
|
|
+ unsigned level; /* level of the node (copied from b->c.level) */
|
|
+ struct bpos pos; /* node's key position (b->key.k.p), used to look the node up again */
|
|
+ __le64 seq; /* b->data->keys.seq at queue time; the rewrite is skipped if it no longer matches */
|
|
+};
|
|
+
|
|
+void async_btree_node_rewrite_work(struct work_struct *work)
|
|
+{
|
|
+ struct async_btree_rewrite *a =
|
|
+ container_of(work, struct async_btree_rewrite, work);
|
|
+ struct bch_fs *c = a->c;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+ iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos,
|
|
+ BTREE_MAX_DEPTH, a->level, 0);
|
|
+ bch2_btree_node_rewrite(c, iter, a->seq, 0);
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ percpu_ref_put(&c->writes);
|
|
+ kfree(a);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Queue an asynchronous rewrite of btree node @b on system_long_wq.
|
|
+ *
|
|
+ * Best effort: returns without queueing anything if a ref on c->writes
|
|
+ * cannot be taken or if the request cannot be allocated.
|
|
+ *
|
|
+ * On success the c->writes ref and the request are owned by
|
|
+ * async_btree_node_rewrite_work(), which drops the ref and frees the
|
|
+ * request when it has run.
|
|
+ */
|
|
+void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+	struct async_btree_rewrite *a;
|
|
+
|
|
+	if (!percpu_ref_tryget(&c->writes))
|
|
+		return;
|
|
+
|
|
+	/*
|
|
+	 * Allocate only once, and only after the ref is held: allocating in
|
|
+	 * the declaration as well would leak that first buffer on every call.
|
|
+	 */
|
|
+	a = kmalloc(sizeof(*a), GFP_NOFS);
|
|
+	if (!a) {
|
|
+		percpu_ref_put(&c->writes);
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	/* Snapshot what the worker needs to find and revalidate the node: */
|
|
+	a->c		= c;
|
|
+	a->btree_id	= b->c.btree_id;
|
|
+	a->level	= b->c.level;
|
|
+	a->pos		= b->key.k.p;
|
|
+	a->seq		= b->data->keys.seq;
|
|
+
|
|
+	INIT_WORK(&a->work, async_btree_node_rewrite_work);
|
|
+	queue_work(system_long_wq, &a->work);
|
|
+}
|
|
+
|
|
+static void __bch2_btree_node_update_key(struct bch_fs *c,
|
|
+ struct btree_update *as,
|
|
+ struct btree_iter *iter,
|
|
+ struct btree *b, struct btree *new_hash,
|
|
+ struct bkey_i *new_key)
|
|
+{
|
|
+ struct btree *parent;
|
|
+ int ret;
|
|
+
|
|
+ btree_update_will_delete_key(as, &b->key);
|
|
+ btree_update_will_add_key(as, new_key);
|
|
+
|
|
+ parent = btree_node_parent(iter, b);
|
|
+ if (parent) {
|
|
+ if (new_hash) {
|
|
+ bkey_copy(&new_hash->key, new_key);
|
|
+ ret = bch2_btree_node_hash_insert(&c->btree_cache,
|
|
+ new_hash, b->c.level, b->c.btree_id);
|
|
+ BUG_ON(ret);
|
|
+ }
|
|
+
|
|
+ bch2_keylist_add(&as->parent_keys, new_key);
|
|
+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
|
|
+
|
|
+ if (new_hash) {
|
|
+ mutex_lock(&c->btree_cache.lock);
|
|
+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
|
|
+
|
|
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
|
|
+
|
|
+ bkey_copy(&b->key, new_key);
|
|
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
|
|
+ BUG_ON(ret);
|
|
+ mutex_unlock(&c->btree_cache.lock);
|
|
+ } else {
|
|
+ bkey_copy(&b->key, new_key);
|
|
+ }
|
|
+ } else {
|
|
+ BUG_ON(btree_node_root(c, b) != b);
|
|
+
|
|
+ bch2_btree_node_lock_write(b, iter);
|
|
+ bkey_copy(&b->key, new_key);
|
|
+
|
|
+ if (btree_ptr_hash_val(&b->key) != b->hash_val) {
|
|
+ mutex_lock(&c->btree_cache.lock);
|
|
+ bch2_btree_node_hash_remove(&c->btree_cache, b);
|
|
+
|
|
+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
|
|
+ BUG_ON(ret);
|
|
+ mutex_unlock(&c->btree_cache.lock);
|
|
+ }
|
|
+
|
|
+ btree_update_updated_root(as, b);
|
|
+ bch2_btree_node_unlock_write(b, iter);
|
|
+ }
|
|
+
|
|
+ bch2_btree_update_done(as);
|
|
+}
|
|
+
|
|
+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct bkey_i *new_key)
|
|
+{
|
|
+ struct btree *parent = btree_node_parent(iter, b);
|
|
+ struct btree_update *as = NULL;
|
|
+ struct btree *new_hash = NULL;
|
|
+ struct closure cl;
|
|
+ int ret = 0;
|
|
+
|
|
+ closure_init_stack(&cl);
|
|
+
|
|
+ /*
|
|
+ * check btree_ptr_hash_val() after @b is locked by
|
|
+ * btree_iter_traverse():
|
|
+ */
|
|
+ if (btree_ptr_hash_val(new_key) != b->hash_val) {
|
|
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
|
|
+ if (ret) {
|
|
+ bch2_trans_unlock(iter->trans);
|
|
+ closure_sync(&cl);
|
|
+ if (!bch2_trans_relock(iter->trans))
|
|
+ return -EINTR;
|
|
+ }
|
|
+
|
|
+ new_hash = bch2_btree_node_mem_alloc(c);
|
|
+ }
|
|
+
|
|
+ as = bch2_btree_update_start(iter, b->c.level,
|
|
+ parent ? btree_update_reserve_required(c, parent) : 0,
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+ if (IS_ERR(as)) {
|
|
+ ret = PTR_ERR(as);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
|
|
+
|
|
+ bch2_btree_iter_downgrade(iter);
|
|
+err:
|
|
+ if (new_hash) {
|
|
+ mutex_lock(&c->btree_cache.lock);
|
|
+ list_move(&new_hash->list, &c->btree_cache.freeable);
|
|
+ mutex_unlock(&c->btree_cache.lock);
|
|
+
|
|
+ six_unlock_write(&new_hash->c.lock);
|
|
+ six_unlock_intent(&new_hash->c.lock);
|
|
+ }
|
|
+ closure_sync(&cl);
|
|
+ bch2_btree_cache_cannibalize_unlock(c);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Init code: */
|
|
+
|
|
+/*
|
|
+ * Only for filesystem bringup, when first reading the btree roots or allocating
|
|
+ * btree roots when initializing a new filesystem:
|
|
+ */
|
|
+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ BUG_ON(btree_node_root(c, b));
|
|
+
|
|
+ bch2_btree_set_root_inmem(c, b);
|
|
+}
|
|
+
|
|
+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
|
|
+{
|
|
+ struct closure cl;
|
|
+ struct btree *b;
|
|
+ int ret;
|
|
+
|
|
+ closure_init_stack(&cl);
|
|
+
|
|
+ do {
|
|
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl);
|
|
+ closure_sync(&cl);
|
|
+ } while (ret);
|
|
+
|
|
+ b = bch2_btree_node_mem_alloc(c);
|
|
+ bch2_btree_cache_cannibalize_unlock(c);
|
|
+
|
|
+ set_btree_node_fake(b);
|
|
+ set_btree_node_need_rewrite(b);
|
|
+ b->c.level = 0;
|
|
+ b->c.btree_id = id;
|
|
+
|
|
+ bkey_btree_ptr_init(&b->key);
|
|
+ b->key.k.p = POS_MAX;
|
|
+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;
|
|
+
|
|
+ bch2_bset_init_first(b, &b->data->keys);
|
|
+ bch2_btree_build_aux_trees(b);
|
|
+
|
|
+ b->data->flags = 0;
|
|
+ btree_set_min(b, POS_MIN);
|
|
+ btree_set_max(b, POS_MAX);
|
|
+ b->data->format = bch2_btree_calc_format(b);
|
|
+ btree_node_set_format(b, b->data->format);
|
|
+
|
|
+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
|
|
+ b->c.level, b->c.btree_id);
|
|
+ BUG_ON(ret);
|
|
+
|
|
+ bch2_btree_set_root_inmem(c, b);
|
|
+
|
|
+ six_unlock_write(&b->c.lock);
|
|
+ six_unlock_intent(&b->c.lock);
|
|
+}
|
|
+
|
|
+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ struct btree_update *as;
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ list_for_each_entry(as, &c->btree_interior_update_list, list)
|
|
+ pr_buf(out, "%p m %u w %u r %u j %llu\n",
|
|
+ as,
|
|
+ as->mode,
|
|
+ as->nodes_written,
|
|
+ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
|
|
+ as->journal.seq);
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+}
|
|
+
|
|
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
|
|
+{
|
|
+ size_t ret = 0;
|
|
+ struct list_head *i;
|
|
+
|
|
+ mutex_lock(&c->btree_interior_update_lock);
|
|
+ list_for_each(i, &c->btree_interior_update_list)
|
|
+ ret++;
|
|
+ mutex_unlock(&c->btree_interior_update_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
|
|
+{
|
|
+ struct btree_root *r;
|
|
+ struct jset_entry *entry;
|
|
+
|
|
+ mutex_lock(&c->btree_root_lock);
|
|
+
|
|
+ vstruct_for_each(jset, entry)
|
|
+ if (entry->type == BCH_JSET_ENTRY_btree_root) {
|
|
+ r = &c->btree_roots[entry->btree_id];
|
|
+ r->level = entry->level;
|
|
+ r->alive = true;
|
|
+ bkey_copy(&r->key, &entry->start[0]);
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->btree_root_lock);
|
|
+}
|
|
+
|
|
+struct jset_entry *
|
|
+bch2_btree_roots_to_journal_entries(struct bch_fs *c,
|
|
+ struct jset_entry *start,
|
|
+ struct jset_entry *end)
|
|
+{
|
|
+ struct jset_entry *entry;
|
|
+ unsigned long have = 0;
|
|
+ unsigned i;
|
|
+
|
|
+ for (entry = start; entry < end; entry = vstruct_next(entry))
|
|
+ if (entry->type == BCH_JSET_ENTRY_btree_root)
|
|
+ __set_bit(entry->btree_id, &have);
|
|
+
|
|
+ mutex_lock(&c->btree_root_lock);
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++)
|
|
+ if (c->btree_roots[i].alive && !test_bit(i, &have)) {
|
|
+ journal_entry_set(end,
|
|
+ BCH_JSET_ENTRY_btree_root,
|
|
+ i, c->btree_roots[i].level,
|
|
+ &c->btree_roots[i].key,
|
|
+ c->btree_roots[i].key.u64s);
|
|
+ end = vstruct_next(end);
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->btree_root_lock);
|
|
+
|
|
+ return end;
|
|
+}
|
|
+
|
|
+void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
|
|
+{
|
|
+ if (c->btree_interior_update_worker)
|
|
+ destroy_workqueue(c->btree_interior_update_worker);
|
|
+ mempool_exit(&c->btree_interior_update_pool);
|
|
+}
|
|
+
|
|
+int bch2_fs_btree_interior_update_init(struct bch_fs *c)
|
|
+{
|
|
+ mutex_init(&c->btree_reserve_cache_lock);
|
|
+ INIT_LIST_HEAD(&c->btree_interior_update_list);
|
|
+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
|
|
+ mutex_init(&c->btree_interior_update_lock);
|
|
+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);
|
|
+
|
|
+ c->btree_interior_update_worker =
|
|
+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
|
|
+ if (!c->btree_interior_update_worker)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
|
|
+ sizeof(struct btree_update));
|
|
+}
|
|
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
|
|
new file mode 100644
|
|
index 000000000000..7eef3dbb6ef1
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_update_interior.h
|
|
@@ -0,0 +1,335 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H
|
|
+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H
|
|
+
|
|
+#include "btree_cache.h"
|
|
+#include "btree_locking.h"
|
|
+#include "btree_update.h"
|
|
+
|
|
+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
|
|
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
|
|
+ struct bkey_format *);
|
|
+
|
|
+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
|
|
+
|
|
+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
|
|
+
|
|
+/*
|
|
+ * Tracks an in progress split/rewrite of a btree node and the update to the
|
|
+ * parent node:
|
|
+ *
|
|
+ * When we split/rewrite a node, we do all the updates in memory without
|
|
+ * waiting for any writes to complete - we allocate the new node(s) and update
|
|
+ * the parent node, possibly recursively up to the root.
|
|
+ *
|
|
+ * The end result is that we have one or more new nodes being written -
|
|
+ * possibly several, if there were multiple splits - and then a write (updating
|
|
+ * an interior node) which will make all these new nodes visible.
|
|
+ *
|
|
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
|
|
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
|
|
+ * update to the interior node that makes the new node visible completes -
|
|
+ * until then, the old nodes are still reachable on disk.
|
|
+ *
|
|
+ */
|
|
+struct btree_update {
|
|
+ struct closure cl;
|
|
+ struct bch_fs *c;
|
|
+
|
|
+ struct list_head list;
|
|
+ struct list_head unwritten_list;
|
|
+
|
|
+ /* What kind of update are we doing? */
|
|
+ enum {
|
|
+ BTREE_INTERIOR_NO_UPDATE,
|
|
+ BTREE_INTERIOR_UPDATING_NODE,
|
|
+ BTREE_INTERIOR_UPDATING_ROOT,
|
|
+ BTREE_INTERIOR_UPDATING_AS,
|
|
+ } mode;
|
|
+
|
|
+ unsigned nodes_written:1;
|
|
+ unsigned took_gc_lock:1;
|
|
+
|
|
+ enum btree_id btree_id;
|
|
+
|
|
+ struct disk_reservation disk_res;
|
|
+ struct journal_preres journal_preres;
|
|
+
|
|
+ /*
|
|
+ * BTREE_INTERIOR_UPDATING_NODE:
|
|
+ * The update that made the new nodes visible was a regular update to an
|
|
+ * existing interior node - @b. We can't write out the update to @b
|
|
+ * until the new nodes we created are finished writing, so we block @b
|
|
+ * from writing by putting this btree_interior update on the
|
|
+ * @b->write_blocked list with @write_blocked_list:
|
|
+ */
|
|
+ struct btree *b;
|
|
+ struct list_head write_blocked_list;
|
|
+
|
|
+ /*
|
|
+ * We may be freeing nodes that were dirty, and thus had journal entries
|
|
+ * pinned: we need to transfer the oldest of those pins to the
|
|
+ * btree_update operation, and release it when the new node(s)
|
|
+ * are all persistent and reachable:
|
|
+ */
|
|
+ struct journal_entry_pin journal;
|
|
+
|
|
+ /* Preallocated nodes we reserve when we start the update: */
|
|
+ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX];
|
|
+ unsigned nr_prealloc_nodes;
|
|
+
|
|
+ /* Nodes being freed: */
|
|
+ struct keylist old_keys;
|
|
+ u64 _old_keys[BTREE_UPDATE_NODES_MAX *
|
|
+ BKEY_BTREE_PTR_VAL_U64s_MAX]; /* NOTE(review): presumably inline backing storage for old_keys - confirm */
|
|
+
|
|
+ /* Nodes being added: */
|
|
+ struct keylist new_keys;
|
|
+ u64 _new_keys[BTREE_UPDATE_NODES_MAX *
|
|
+ BKEY_BTREE_PTR_VAL_U64s_MAX]; /* NOTE(review): presumably inline backing storage for new_keys - confirm */
|
|
+
|
|
+ /* New nodes, that will be made reachable by this update: */
|
|
+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX];
|
|
+ unsigned nr_new_nodes;
|
|
+
|
|
+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX *
|
|
+ BCH_REPLICAS_MAX];
|
|
+ open_bucket_idx_t nr_open_buckets;
|
|
+
|
|
+ unsigned journal_u64s; /* NOTE(review): presumably the number of u64s in use in journal_entries[] - confirm */
|
|
+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
|
|
+
|
|
+ /* Only here to reduce stack usage on recursive splits: */
|
|
+ struct keylist parent_keys;
|
|
+ /*
|
|
+ * Enough room for btree_split's keys without realloc - btree node
|
|
+ * pointers never have crc/compression info, so we only need to account
|
|
+ * for the pointers for three keys
|
|
+ */
|
|
+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
|
|
+};
|
|
+
|
|
+void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
|
|
+ struct btree_iter *);
|
|
+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
|
|
+
|
|
+void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *);
|
|
+
|
|
+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
|
|
+ struct btree *,
|
|
+ struct bkey_format);
|
|
+
|
|
+void bch2_btree_update_done(struct btree_update *);
|
|
+struct btree_update *
|
|
+bch2_btree_update_start(struct btree_iter *, unsigned, unsigned, unsigned);
|
|
+
|
|
+void bch2_btree_interior_update_will_free_node(struct btree_update *,
|
|
+ struct btree *);
|
|
+void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
|
|
+
|
|
+void bch2_btree_insert_node(struct btree_update *, struct btree *,
|
|
+ struct btree_iter *, struct keylist *,
|
|
+ unsigned);
|
|
+int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
|
|
+
|
|
+int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
|
|
+ unsigned, unsigned, enum btree_node_sibling);
|
|
+
|
|
+static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
|
|
+ struct btree_iter *iter,
|
|
+ unsigned level, unsigned flags,
|
|
+ enum btree_node_sibling sib)
|
|
+{ /* Inline filter in front of __bch2_foreground_maybe_merge(): returns 0 without attempting a merge unless all three checks below pass. */
|
|
+ struct btree *b;
|
|
+
|
|
+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) /* iterator would need a traverse first: skip */
|
|
+ return 0;
|
|
+
|
|
+ if (!bch2_btree_node_relock(iter, level)) /* could not relock the node at @level: skip */
|
|
+ return 0;
|
|
+
|
|
+ b = iter->l[level].b;
|
|
+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) /* above the threshold for sibling @sib: skip */
|
|
+ return 0;
|
|
+
|
|
+ return __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
|
|
+}
|
|
+
|
|
+static inline int bch2_foreground_maybe_merge(struct bch_fs *c,
|
|
+ struct btree_iter *iter,
|
|
+ unsigned level,
|
|
+ unsigned flags)
|
|
+{ /* Tries the previous sibling, then the next one; returns the first nonzero result, or 0. */
|
|
+ return bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
|
|
+ btree_prev_sib) ?: /* GNU `?:` - the next sibling is only tried when this call returned 0 */
|
|
+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
|
|
+ btree_next_sib);
|
|
+}
|
|
+
|
|
+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
|
|
+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
|
|
+
|
|
+static inline unsigned btree_update_reserve_required(struct bch_fs *c,
|
|
+ struct btree *b)
|
|
+{ /* Returns the node count computed from the distance between @b's level and the depth of its btree (see the comment below). */
|
|
+ unsigned depth = btree_node_root(c, b)->c.level + 1; /* root level + 1 */
|
|
+
|
|
+ /*
|
|
+ * Number of nodes we might have to allocate in a worst case btree
|
|
+ * split operation - we split all the way up to the root, then allocate
|
|
+ * a new root, unless we're already at max depth:
|
|
+ */
|
|
+ if (depth < BTREE_MAX_DEPTH)
|
|
+ return (depth - b->c.level) * 2 + 1; /* + 1: the new root */
|
|
+ else
|
|
+ return (depth - b->c.level) * 2 - 1; /* already at max depth: no new root */
|
|
+}
|
|
+
|
|
+static inline void btree_node_reset_sib_u64s(struct btree *b)
|
|
+{ /* Sets both sib_u64s[] slots to this node's own live_u64s count. */
|
|
+ b->sib_u64s[0] = b->nr.live_u64s;
|
|
+ b->sib_u64s[1] = b->nr.live_u64s;
|
|
+}
|
|
+
|
|
+static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
|
|
+{ /* Returns the address btree_bytes(c) bytes past the start of b->data. */
|
|
+ return (void *) b->data + btree_bytes(c);
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
|
|
+ struct btree *b)
|
|
+{ /* Returns the address b->whiteout_u64s u64s before btree_data_end(c, b). */
|
|
+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
|
|
+}
|
|
+
|
|
+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
|
|
+ struct btree *b)
|
|
+{ /* Same address as btree_data_end(c, b), typed as a packed key pointer. */
|
|
+ return btree_data_end(c, b);
|
|
+}
|
|
+
|
|
+static inline void *write_block(struct btree *b)
|
|
+{ /* Returns b->data advanced by b->written << 9 bytes. NOTE(review): implies b->written counts 512-byte units - confirm. */
|
|
+ return (void *) b->data + (b->written << 9);
|
|
+}
|
|
+
|
|
+static inline bool __btree_addr_written(struct btree *b, void *p)
|
|
+{ /* True when @p lies below write_block(b). */
|
|
+ return p < write_block(b);
|
|
+}
|
|
+
|
|
+static inline bool bset_written(struct btree *b, struct bset *i)
|
|
+{ /* Typed wrapper: true when bset @i starts below write_block(b). */
|
|
+ return __btree_addr_written(b, i);
|
|
+}
|
|
+
|
|
+static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
|
|
+{ /* Typed wrapper: true when key @k lies below write_block(b). */
|
|
+ return __btree_addr_written(b, k);
|
|
+}
|
|
+
|
|
+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
|
|
+ struct btree *b,
|
|
+ void *end)
|
|
+{ /* Returns total u64s minus (u64s up to @end + whiteout u64s + 1). The result is signed and may be negative. */
|
|
+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
|
|
+ b->whiteout_u64s;
|
|
+ ssize_t total = c->opts.btree_node_size << 6; /* NOTE(review): << 6 yields u64s if btree_node_size is in 512-byte units (512 / 8 = 64) - confirm */
|
|
+
|
|
+ /* Always leave one extra u64 for bch2_varint_decode: */
|
|
+ used++;
|
|
+
|
|
+ return total - used;
|
|
+}
|
|
+
|
|
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
|
|
+ struct btree *b)
|
|
+{ /* Returns the u64s left after the end of the last bset, or 0 when the last bset has already been written. */
|
|
+ ssize_t remaining = __bch_btree_u64s_remaining(c, b,
|
|
+ btree_bkey_last(b, bset_tree_last(b)));
|
|
+
|
|
+ BUG_ON(remaining < 0); /* a negative value would wrap when returned as size_t */
|
|
+
|
|
+ if (bset_written(b, btree_bset_last(b))) /* last bset already written: report no room in it */
|
|
+ return 0;
|
|
+
|
|
+ return remaining;
|
|
+}
|
|
+
|
|
+#define BTREE_WRITE_SET_U64s_BITS 9
|
|
+
|
|
+static inline unsigned btree_write_set_buffer(struct btree *b)
|
|
+{ /* Returns a constant; @b is currently unused. */
|
|
+ /*
|
|
+ * Could buffer up larger amounts of keys for btrees with larger keys,
|
|
+ * pending benchmarking:
|
|
+ */
|
|
+ return 8 << BTREE_WRITE_SET_U64s_BITS; /* want_new_bset() compares this against a byte count */
|
|
+}
|
|
+
|
|
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
|
|
+ struct btree *b)
|
|
+{ /* Returns the position where a new bset would start if one should be started now, otherwise NULL. */
|
|
+ struct bset_tree *t = bset_tree_last(b);
|
|
+ struct btree_node_entry *bne = max(write_block(b), /* candidate start: whichever is later, the write position or the end of the last bset */
|
|
+ (void *) btree_bkey_last(b, bset_tree_last(b)));
|
|
+ ssize_t remaining_space =
|
|
+ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); /* u64s left if keys began at bne */
|
|
+
|
|
+ if (unlikely(bset_written(b, bset(b, t)))) { /* last bset already written */
|
|
+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) /* >> 3 converts bytes to u64s */
|
|
+ return bne;
|
|
+ } else { /* last bset not written yet: only start a new one once it exceeds the buffer size */
|
|
+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
|
|
+ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3))
|
|
+ return bne;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static inline void push_whiteout(struct bch_fs *c, struct btree *b,
|
|
+ struct bpos pos)
|
|
+{ /* Builds a key at @pos with needs_whiteout set and copies it in front of the whiteouts already stored at the end of the node. */
|
|
+ struct bkey_packed k;
|
|
+
|
|
+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
|
|
+
|
|
+ if (!bkey_pack_pos(&k, pos, b)) { /* packing failed: fill k in as an unpacked struct bkey instead */
|
|
+ struct bkey *u = (void *) &k;
|
|
+
|
|
+ bkey_init(u);
|
|
+ u->p = pos;
|
|
+ }
|
|
+
|
|
+ k.needs_whiteout = true;
|
|
+
|
|
+ b->whiteout_u64s += k.u64s; /* grow first, so unwritten_whiteouts_start() already points at the new slot */
|
|
+ bkey_copy(unwritten_whiteouts_start(c, b), &k);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * write lock must be held on @b (else the dirty bset that we were going to
|
|
+ * insert into could be written out from under us)
|
|
+ */
|
|
+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
|
|
+ struct btree *b, unsigned u64s)
|
|
+{ /* True when @u64s fits in the remaining space of @b; always false for a node flagged need_rewrite. */
|
|
+ if (unlikely(btree_node_need_rewrite(b)))
|
|
+ return false;
|
|
+
|
|
+ return u64s <= bch_btree_keys_u64s_remaining(c, b);
|
|
+}
|
|
+
|
|
+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
|
|
+
|
|
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
|
|
+
|
|
+void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *);
|
|
+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *,
|
|
+ struct jset_entry *, struct jset_entry *);
|
|
+
|
|
+void bch2_fs_btree_interior_update_exit(struct bch_fs *);
|
|
+int bch2_fs_btree_interior_update_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
|
|
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
|
|
new file mode 100644
|
|
index 000000000000..64b6e86cf90a
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/btree_update_leaf.c
|
|
@@ -0,0 +1,1259 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "btree_locking.h"
|
|
+#include "buckets.h"
|
|
+#include "debug.h"
|
|
+#include "error.h"
|
|
+#include "extent_update.h"
|
|
+#include "journal.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "keylist.h"
|
|
+#include "replicas.h"
|
|
+
|
|
+#include <linux/prefetch.h>
|
|
+#include <linux/sort.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
|
|
+ const struct btree_insert_entry *r)
|
|
+{ /* Three-way compare: btree_id ascending, then level descending, then key position. */
|
|
+ return cmp_int(l->btree_id, r->btree_id) ?:
|
|
+ -cmp_int(l->level, r->level) ?: /* negated: higher levels sort first */
|
|
+ bpos_cmp(l->k->k.p, r->k->k.p);
|
|
+}
|
|
+
|
|
+static inline bool same_leaf_as_prev(struct btree_trans *trans,
|
|
+ struct btree_insert_entry *i)
|
|
+{ /* True when @i is not the first updates2 entry and its iterator points at the same node as the entry before it. */
|
|
+ return i != trans->updates2 && /* guards the i[-1] access below */
|
|
+ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
|
|
+}
|
|
+
|
|
+inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
|
|
+ struct btree_iter *iter)
|
|
+{ /* Takes the write lock on @b and, for non-cached iterators, prepares the node for an insert. */
|
|
+ bch2_btree_node_lock_write(b, iter);
|
|
+
|
|
+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) /* the steps below are skipped for cached iterators */
|
|
+ return;
|
|
+
|
|
+ if (unlikely(btree_node_just_written(b)) &&
|
|
+ bch2_btree_post_write_cleanup(c, b))
|
|
+ bch2_btree_iter_reinit_node(iter, b); /* cleanup reported a change: reinitialize the iterator for @b */
|
|
+
|
|
+ /*
|
|
+ * If the last bset has been written, or if it's gotten too big - start
|
|
+ * a new bset to insert into:
|
|
+ */
|
|
+ if (want_new_bset(c, b))
|
|
+ bch2_btree_init_next(c, b, iter);
|
|
+}
|
|
+
|
|
+/* Inserting into a given leaf node (last stage of insert): */
|
|
+
|
|
+/* Handle overwrites and do insert, for non extents: */
|
|
+bool bch2_btree_bset_insert_key(struct btree_iter *iter,
|
|
+ struct btree *b,
|
|
+ struct btree_node_iter *node_iter,
|
|
+ struct bkey_i *insert)
|
|
+{ /* Returns false only when @insert is a deletion and no key exists at its position; true otherwise. */
|
|
+ struct bkey_packed *k;
|
|
+ unsigned clobber_u64s = 0, new_u64s = 0; /* u64s removed from / added to the last bset; compared at fix_iter */
|
|
+
|
|
+ EBUG_ON(btree_node_just_written(b));
|
|
+ EBUG_ON(bset_written(b, btree_bset_last(b)));
|
|
+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
|
|
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->min_key) < 0);
|
|
+ EBUG_ON(bpos_cmp(insert->k.p, b->data->max_key) > 0);
|
|
+ EBUG_ON(insert->k.u64s >
|
|
+ bch_btree_keys_u64s_remaining(iter->trans->c, b));
|
|
+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
|
|
+
|
|
+ k = bch2_btree_node_iter_peek_all(node_iter, b);
|
|
+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) /* peeked key is at a different position: nothing is being overwritten */
|
|
+ k = NULL;
|
|
+
|
|
+ /* @k is the key being overwritten/deleted, if any: */
|
|
+ EBUG_ON(k && bkey_deleted(k));
|
|
+
|
|
+ /* Deleting, but not found? nothing to do: */
|
|
+ if (bkey_deleted(&insert->k) && !k)
|
|
+ return false;
|
|
+
|
|
+ if (bkey_deleted(&insert->k)) {
|
|
+ /* Deleting: */
|
|
+ btree_account_key_drop(b, k);
|
|
+ k->type = KEY_TYPE_deleted;
|
|
+
|
|
+ if (k->needs_whiteout)
|
|
+ push_whiteout(iter->trans->c, b, insert->k.p);
|
|
+ k->needs_whiteout = false;
|
|
+
|
|
+ if (k >= btree_bset_last(b)->start) { /* key lives in the last bset: remove it from the bset outright */
|
|
+ clobber_u64s = k->u64s;
|
|
+ bch2_bset_delete(b, k, clobber_u64s);
|
|
+ goto fix_iter;
|
|
+ } else { /* key is in an earlier bset: it stays in place, marked deleted above */
|
|
+ bch2_btree_iter_fix_key_modified(iter, b, k);
|
|
+ }
|
|
+
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ if (k) {
|
|
+ /* Overwriting: */
|
|
+ btree_account_key_drop(b, k);
|
|
+ k->type = KEY_TYPE_deleted;
|
|
+
|
|
+ insert->k.needs_whiteout = k->needs_whiteout; /* the new key takes over the old key's whiteout flag */
|
|
+ k->needs_whiteout = false;
|
|
+
|
|
+ if (k >= btree_bset_last(b)->start) { /* old key is in the last bset: insert over it */
|
|
+ clobber_u64s = k->u64s;
|
|
+ goto overwrite;
|
|
+ } else {
|
|
+ bch2_btree_iter_fix_key_modified(iter, b, k);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); /* insertion point within the last bset */
|
|
+overwrite:
|
|
+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
|
|
+ new_u64s = k->u64s;
|
|
+fix_iter:
|
|
+ if (clobber_u64s != new_u64s) /* the iterator fix-up is only called when the size at k changed */
|
|
+ bch2_btree_node_iter_fix(iter, b, node_iter, k,
|
|
+ clobber_u64s, new_u64s);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
|
|
+ unsigned i, u64 seq)
|
|
+{ /* Recovers the btree node that embeds @pin (as writes[@i].journal) and conditionally writes it under a read lock. Always returns 0. */
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct btree_write *w = container_of(pin, struct btree_write, journal);
|
|
+ struct btree *b = container_of(w, struct btree, writes[i]); /* @i must be the index of w within b->writes[] */
|
|
+
|
|
+ btree_node_lock_type(c, b, SIX_LOCK_read);
|
|
+ bch2_btree_node_write_cond(c, b,
|
|
+ (btree_current_write(b) == w && w->journal.seq == seq)); /* condition: w is still the current write and its pin is still at @seq */
|
|
+ six_unlock_read(&b->c.lock);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
|
|
+{ /* Flush callback for pins embedded in writes[0]. */
|
|
+ return __btree_node_flush(j, pin, 0, seq);
|
|
+}
|
|
+
|
|
+static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
|
|
+{ /* Flush callback for pins embedded in writes[1]. */
|
|
+ return __btree_node_flush(j, pin, 1, seq);
|
|
+}
|
|
+
|
|
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
|
|
+ struct btree *b, u64 seq)
|
|
+{ /* Adds a journal pin at @seq for @b's current write, using the flush callback that matches the current write index. */
|
|
+ struct btree_write *w = btree_current_write(b);
|
|
+
|
|
+ bch2_journal_pin_add(&c->journal, seq, &w->journal,
|
|
+ btree_node_write_idx(b) == 0 /* the callback hard-codes the writes[] index used by container_of in __btree_node_flush() */
|
|
+ ? btree_node_flush0
|
|
+ : btree_node_flush1);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * btree_insert_key_leaf - insert one key into a leaf node
|
|
+ */
|
|
+static bool btree_insert_key_leaf(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_i *insert)
|
|
+{ /* Returns false when bch2_btree_bset_insert_key() did nothing; otherwise updates journal seq/pin, dirty state and sibling sizes and returns true. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree *b = iter_l(iter)->b;
|
|
+ struct bset_tree *t = bset_tree_last(b);
|
|
+ struct bset *i = bset(b, t);
|
|
+ int old_u64s = bset_u64s(t); /* sizes sampled before the insert, to compute deltas afterwards */
|
|
+ int old_live_u64s = b->nr.live_u64s;
|
|
+ int live_u64s_added, u64s_added;
|
|
+
|
|
+ EBUG_ON(!iter->level &&
|
|
+ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
|
|
+
|
|
+ if (unlikely(!bch2_btree_bset_insert_key(iter, b,
|
|
+ &iter_l(iter)->iter, insert)))
|
|
+ return false;
|
|
+
|
|
+ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, /* the bset's journal_seq only ever moves forward */
|
|
+ le64_to_cpu(i->journal_seq)));
|
|
+
|
|
+ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq);
|
|
+
|
|
+ if (unlikely(!btree_node_dirty(b)))
|
|
+ set_btree_node_dirty(c, b);
|
|
+
|
|
+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
|
|
+ u64s_added = (int) bset_u64s(t) - old_u64s;
|
|
+
|
|
+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) /* node shrank: lower the sibling value too, clamped at 0; U16_MAX is left untouched */
|
|
+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
|
|
+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
|
|
+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
|
|
+
|
|
+ if (u64s_added > live_u64s_added && /* the bset grew by more than the live data did */
|
|
+ bch2_maybe_compact_whiteouts(c, b))
|
|
+ bch2_btree_iter_reinit_node(iter, b);
|
|
+
|
|
+ trace_btree_insert_key(c, b, insert);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+/* Cached btree updates: */
|
|
+
|
|
+/* Normal update interface: */
|
|
+
|
|
+static inline void btree_insert_entry_checks(struct btree_trans *trans,
|
|
+ struct btree_insert_entry *i)
|
|
+{ /* Sanity checks that an update entry agrees with its iterator; BUGs on mismatch. @trans is unused here. */
|
|
+ BUG_ON(!i->is_extent && bpos_cmp(i->k->k.p, i->iter->real_pos)); /* non-extent keys must sit exactly at the iterator's real_pos */
|
|
+ BUG_ON(i->level != i->iter->level);
|
|
+ BUG_ON(i->btree_id != i->iter->btree_id);
|
|
+}
|
|
+
|
|
+static noinline int
|
|
+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s)
|
|
+{ /* Slow path: drops the transaction's locks, gets the pre-reservation with flags 0, then relocks. Returns 0, an error from the journal, or -EINTR if relocking failed. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_unlock(trans);
|
|
+
|
|
+ ret = bch2_journal_preres_get(&c->journal,
|
|
+ &trans->journal_preres, u64s, 0);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (!bch2_trans_relock(trans)) {
|
|
+ trace_trans_restart_journal_preres_get(trans->ip);
|
|
+ return -EINTR;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
|
|
+ unsigned flags)
|
|
+{ /* Gets a journal reservation of trans->journal_u64s; -EAGAIN is translated to BTREE_INSERT_NEED_JOURNAL_RES, other results pass through. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ int ret;
|
|
+
|
|
+ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
|
|
+ flags |= JOURNAL_RES_GET_RESERVED;
|
|
+
|
|
+ ret = bch2_journal_res_get(&c->journal, &trans->journal_res,
|
|
+ trans->journal_u64s, flags);
|
|
+
|
|
+ return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
|
|
+}
|
|
+
|
|
+static enum btree_insert_ret
|
|
+btree_key_can_insert(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ unsigned u64s)
|
|
+{ /* Returns BTREE_INSERT_OK when @u64s fits in the iterator's node, else BTREE_INSERT_BTREE_NODE_FULL. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree *b = iter_l(iter)->b;
|
|
+
|
|
+ if (!bch2_btree_node_insert_fits(c, b, u64s))
|
|
+ return BTREE_INSERT_BTREE_NODE_FULL;
|
|
+
|
|
+ return BTREE_INSERT_OK;
|
|
+}
|
|
+
|
|
+static enum btree_insert_ret
|
|
+btree_key_can_insert_cached(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ unsigned u64s)
|
|
+{ /* Cached-key variant: makes sure the cached key buffer can hold @u64s (+1), growing it when needed. */
|
|
+ struct bkey_cached *ck = (void *) iter->l[0].b; /* for cached iterators l[0].b is used as a struct bkey_cached pointer */
|
|
+ unsigned new_u64s;
|
|
+ struct bkey_i *new_k;
|
|
+
|
|
+ BUG_ON(iter->level);
|
|
+
|
|
+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
|
|
+ bch2_btree_key_cache_must_wait(trans->c) &&
|
|
+ !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
|
|
+ return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
|
|
+
|
|
+ /*
|
|
+ * bch2_varint_decode can read past the end of the buffer by at most 7
|
|
+ * bytes (it won't be used):
|
|
+ */
|
|
+ u64s += 1;
|
|
+
|
|
+ if (u64s <= ck->u64s) /* existing buffer is already large enough */
|
|
+ return BTREE_INSERT_OK;
|
|
+
|
|
+ new_u64s = roundup_pow_of_two(u64s);
|
|
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); /* result kept in a temporary so ck->k stays valid on failure */
|
|
+ if (!new_k)
|
|
+ return -ENOMEM; /* NOTE(review): a negative errno is returned through enum btree_insert_ret */
|
|
+
|
|
+ ck->u64s = new_u64s;
|
|
+ ck->k = new_k;
|
|
+ return BTREE_INSERT_OK;
|
|
+}
|
|
+
|
|
+static inline void do_btree_insert_one(struct btree_trans *trans,
|
|
+ struct btree_insert_entry *i)
|
|
+{ /* Performs one update (leaf or key-cache insert) and, outside journal replay, adds the key to the journal reservation. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct journal *j = &c->journal;
|
|
+ bool did_work;
|
|
+
|
|
+ EBUG_ON(trans->journal_res.ref !=
|
|
+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); /* a reservation must be held exactly when not replaying */
|
|
+
|
|
+ i->k->k.needs_whiteout = false;
|
|
+
|
|
+ did_work = (btree_iter_type(i->iter) != BTREE_ITER_CACHED)
|
|
+ ? btree_insert_key_leaf(trans, i->iter, i->k)
|
|
+ : bch2_btree_insert_key_cached(trans, i->iter, i->k);
|
|
+ if (!did_work) /* nothing was inserted: the key is not journalled */
|
|
+ return;
|
|
+
|
|
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
|
|
+ bch2_journal_add_keys(j, &trans->journal_res,
|
|
+ i->btree_id,
|
|
+ i->level,
|
|
+ i->k);
|
|
+
|
|
+ bch2_journal_set_has_inode(j, &trans->journal_res,
|
|
+ i->k->k.p.inode);
|
|
+
|
|
+ if (trans->journal_seq) /* optional out-pointer: report the sequence number used */
|
|
+ *trans->journal_seq = trans->journal_res.seq;
|
|
+ }
|
|
+}
|
|
+
|
|
+static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
|
|
+{ /* Out-of-line wrapper around __bch2_btree_iter_unlock(). */
|
|
+ __bch2_btree_iter_unlock(iter);
|
|
+}
|
|
+
|
|
+static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
|
|
+{ /* For each update whose node GC has already visited, re-runs bch2_mark_update() with BTREE_TRIGGER_GC added. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_insert_entry *i;
|
|
+
|
|
+ trans_for_each_update(trans, i) {
|
|
+ /*
|
|
+ * XXX: synchronization of cached update triggers with gc
|
|
+ */
|
|
+ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED);
|
|
+
|
|
+ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
|
|
+ bch2_mark_update(trans, i->iter, i->k, NULL,
|
|
+ i->trigger_flags|BTREE_TRIGGER_GC);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline int
|
|
+bch2_trans_commit_write_locked(struct btree_trans *trans,
|
|
+ struct btree_insert_entry **stopped_at)
|
|
+{ /* Commit core, called by do_bch2_trans_commit() between write-locking and unlocking the nodes. On a space failure, *stopped_at is set to the offending entry. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_insert_entry *i;
|
|
+ struct btree_trans_commit_hook *h;
|
|
+ unsigned u64s = 0;
|
|
+ bool marking = false;
|
|
+ int ret; /* NOTE(review): only assigned by the hook loop, the updates2 loop and the journal_res_get branch; with no hooks, no updates2 entries and JOURNAL_REPLAY set it would be returned uninitialized - confirm callers never commit an empty transaction */
|
|
+
|
|
+ if (race_fault()) {
|
|
+ trace_trans_restart_fault_inject(trans->ip);
|
|
+ return -EINTR;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Check if the insert will fit in the leaf node with the write lock
|
|
+ * held, otherwise another thread could write the node changing the
|
|
+ * amount of space available:
|
|
+ */
|
|
+
|
|
+ prefetch(&trans->c->journal.flags);
|
|
+
|
|
+ h = trans->hooks;
|
|
+ while (h) { /* run the commit hooks; the first nonzero return aborts the commit */
|
|
+ ret = h->fn(trans, h);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ h = h->next;
|
|
+ }
|
|
+
|
|
+ trans_for_each_update2(trans, i) {
|
|
+ /* Multiple inserts might go to same leaf: */
|
|
+ if (!same_leaf_as_prev(trans, i))
|
|
+ u64s = 0;
|
|
+
|
|
+ u64s += i->k->k.u64s; /* running total for the current leaf */
|
|
+ ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
|
|
+ ? btree_key_can_insert(trans, i->iter, u64s)
|
|
+ : btree_key_can_insert_cached(trans, i->iter, u64s);
|
|
+ if (ret) {
|
|
+ *stopped_at = i;
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ if (btree_node_type_needs_gc(i->bkey_type))
|
|
+ marking = true;
|
|
+ }
|
|
+
|
|
+ if (marking) {
|
|
+ percpu_down_read(&c->mark_lock); /* released at the err label */
|
|
+ }
|
|
+
|
|
+ /* Must be called under mark_lock: */
|
|
+ if (marking && trans->fs_usage_deltas &&
|
|
+ !bch2_replicas_delta_list_marked(c, trans->fs_usage_deltas)) {
|
|
+ ret = BTREE_INSERT_NEED_MARK_REPLICAS;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Don't get journal reservation until after we know insert will
|
|
+ * succeed:
|
|
+ */
|
|
+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
|
|
+ ret = bch2_trans_journal_res_get(trans,
|
|
+ JOURNAL_RES_GET_NONBLOCK);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ } else {
|
|
+ trans->journal_res.seq = c->journal.replay_journal_seq; /* replay: no reservation is taken, only the sequence number is set */
|
|
+ }
|
|
+
|
|
+ if (unlikely(trans->extra_journal_entry_u64s)) { /* copy the caller's extra entries to the front of the reservation and shrink it accordingly */
|
|
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
|
|
+ trans->extra_journal_entries,
|
|
+ trans->extra_journal_entry_u64s);
|
|
+
|
|
+ trans->journal_res.offset += trans->extra_journal_entry_u64s;
|
|
+ trans->journal_res.u64s -= trans->extra_journal_entry_u64s;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Not allowed to fail after we've gotten our journal reservation - we
|
|
+ * have to use it:
|
|
+ */
|
|
+
|
|
+ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
|
|
+ if (bch2_journal_seq_verify) /* debug option: stamp the journal seq into each key's version */
|
|
+ trans_for_each_update2(trans, i)
|
|
+ i->k->k.version.lo = trans->journal_res.seq;
|
|
+ else if (bch2_inject_invalid_keys) /* debug option: give each key version MAX_VERSION */
|
|
+ trans_for_each_update2(trans, i)
|
|
+ i->k->k.version = MAX_VERSION;
|
|
+ }
|
|
+
|
|
+ trans_for_each_update(trans, i) /* note: iterates updates, not updates2 */
|
|
+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type))
|
|
+ bch2_mark_update(trans, i->iter, i->k,
|
|
+ NULL, i->trigger_flags);
|
|
+
|
|
+ if (marking && trans->fs_usage_deltas)
|
|
+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas);
|
|
+
|
|
+ if (unlikely(c->gc_pos.phase)) /* nonzero gc phase: also run the GC marking pass */
|
|
+ bch2_trans_mark_gc(trans);
|
|
+
|
|
+ trans_for_each_update2(trans, i)
|
|
+ do_btree_insert_one(trans, i);
|
|
+err: /* also reached on success: drops mark_lock if it was taken */
|
|
+ if (marking) {
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static noinline int maybe_do_btree_merge(struct btree_trans *trans, struct btree_iter *iter)
|
|
+{ /* Sums the net u64s change of all updates that target @iter's node and attempts a foreground merge when the node will not grow. */
|
|
+ struct btree_insert_entry *i;
|
|
+ struct btree *b = iter_l(iter)->b;
|
|
+ struct bkey_s_c old;
|
|
+ int u64s_delta = 0;
|
|
+ int ret;
|
|
+
|
|
+ /*
|
|
+ * Inserting directly into interior nodes is an uncommon operation with
|
|
+ * various weird edge cases: also, a lot of things about
|
|
+ * BTREE_ITER_NODES iters need to be audited
|
|
+ */
|
|
+ if (unlikely(btree_iter_type(iter) != BTREE_ITER_KEYS))
|
|
+ return 0;
|
|
+
|
|
+ BUG_ON(iter->level);
|
|
+
|
|
+ trans_for_each_update2(trans, i) {
|
|
+ if (iter_l(i->iter)->b != b) /* update targets a different node */
|
|
+ continue;
|
|
+
|
|
+ old = bch2_btree_iter_peek_slot(i->iter); /* the key currently at the update's position */
|
|
+ ret = bkey_err(old);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; /* size of the new key, 0 for a deletion */
|
|
+ u64s_delta -= !bkey_deleted(old.k) ? old.k->u64s : 0; /* minus the size of the live key it replaces */
|
|
+ }
|
|
+
|
|
+ return u64s_delta <= 0
|
|
+ ? (bch2_foreground_maybe_merge(trans->c, iter, iter->level,
|
|
+ trans->flags & ~BTREE_INSERT_NOUNLOCK) ?: -EINTR) /* a 0 result from the merge call is turned into -EINTR */
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
|
|
+ */
|
|
+static inline int do_bch2_trans_commit(struct btree_trans *trans,
|
|
+ struct btree_insert_entry **stopped_at)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_insert_entry *i;
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ trans_for_each_update2(trans, i) { /* pass 1: merge check for nodes whose sibling sizes are below the threshold */
|
|
+ struct btree *b;
|
|
+
|
|
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
|
|
+
|
|
+ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED)
|
|
+ continue;
|
|
+
|
|
+ b = iter_l(i->iter)->b;
|
|
+ if (b->sib_u64s[0] < c->btree_foreground_merge_threshold ||
|
|
+ b->sib_u64s[1] < c->btree_foreground_merge_threshold) {
|
|
+ ret = maybe_do_btree_merge(trans, i->iter);
|
|
+ if (unlikely(ret))
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ trans_for_each_update2(trans, i)
|
|
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
|
|
+
|
|
+ ret = bch2_journal_preres_get(&c->journal, /* non-blocking attempt first */
|
|
+ &trans->journal_preres, trans->journal_preres_u64s,
|
|
+ JOURNAL_RES_GET_NONBLOCK|
|
|
+ ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
|
|
+ ? JOURNAL_RES_GET_RESERVED : 0));
|
|
+ if (unlikely(ret == -EAGAIN)) /* fall back to the slow path, which drops and retakes locks */
|
|
+ ret = bch2_trans_journal_preres_get_cold(trans,
|
|
+ trans->journal_preres_u64s);
|
|
+ if (unlikely(ret))
|
|
+ return ret;
|
|
+
|
|
+ /*
|
|
+ * Can't be holding any read locks when we go to take write locks:
|
|
+ * another thread could be holding an intent lock on the same node we
|
|
+ * have a read lock on, and it'll block trying to take a write lock
|
|
+ * (because we hold a read lock) and it could be blocking us by holding
|
|
+ * its own read lock (while we're trying to take write locks).
|
|
+ *
|
|
+ * note - this must be done after bch2_trans_journal_preres_get_cold()
|
|
+ * or anything else that might call bch2_trans_relock(), since that
|
|
+ * would just retake the read locks:
|
|
+ */
|
|
+ trans_for_each_iter(trans, iter) {
|
|
+ if (iter->nodes_locked != iter->nodes_intent_locked) { /* iterator holds at least one lock that is not an intent lock */
|
|
+ if (btree_iter_keep(trans, iter)) {
|
|
+ if (!bch2_btree_iter_upgrade(iter, 1)) {
|
|
+ trace_trans_restart_upgrade(trans->ip);
|
|
+ return -EINTR;
|
|
+ }
|
|
+ } else {
|
|
+ bch2_btree_iter_unlock_noinline(iter);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ trans_for_each_update2(trans, i) {
|
|
+ const char *invalid = bch2_bkey_invalid(c,
|
|
+ bkey_i_to_s_c(i->k), i->bkey_type);
|
|
+ if (invalid) {
|
|
+ char buf[200];
|
|
+
|
|
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
|
|
+ bch_err(c, "invalid bkey %s on insert: %s\n", buf, invalid);
|
|
+ bch2_fatal_error(c);
|
|
+ }
|
|
+ btree_insert_entry_checks(trans, i);
|
|
+ }
|
|
+ bch2_btree_trans_verify_locks(trans);
|
|
+
|
|
+ trans_for_each_update2(trans, i)
|
|
+ if (!same_leaf_as_prev(trans, i)) /* lock each distinct leaf once */
|
|
+ bch2_btree_node_lock_for_insert(c,
|
|
+ iter_l(i->iter)->b, i->iter);
|
|
+
|
|
+ ret = bch2_trans_commit_write_locked(trans, stopped_at);
|
|
+
|
|
+ trans_for_each_update2(trans, i)
|
|
+ if (!same_leaf_as_prev(trans, i)) /* mirrors the locking loop above */
|
|
+ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
|
|
+ i->iter);
|
|
+
|
|
+ if (!ret && trans->journal_pin)
|
|
+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
|
|
+ trans->journal_pin, NULL);
|
|
+
|
|
+ /*
|
|
+ * Drop journal reservation after dropping write locks, since dropping
|
|
+ * the journal reservation may kick off a journal write:
|
|
+ */
|
|
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
|
|
+
|
|
+ if (unlikely(ret))
|
|
+ return ret;
|
|
+
|
|
+ bch2_trans_downgrade(trans);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int journal_reclaim_wait_done(struct bch_fs *c)
|
|
+{ /* Wait condition: returns the journal error if there is one, else 1 when the key cache no longer has to wait, else 0 after kicking journal reclaim. */
|
|
+ int ret = bch2_journal_error(&c->journal) ?:
|
|
+ !bch2_btree_key_cache_must_wait(c);
|
|
+
|
|
+ if (!ret) /* still has to wait: kick reclaim before the caller sleeps again */
|
|
+ journal_reclaim_kick(&c->journal);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static noinline
|
|
+int bch2_trans_commit_error(struct btree_trans *trans,
|
|
+ struct btree_insert_entry *i,
|
|
+ int ret)
|
|
+{ /* Handles a nonzero result from the commit path: resolves the BTREE_INSERT_* condition if it can and returns 0, -EINTR or another negative errno. */
|
|
+ struct bch_fs *c = trans->c;
|
|
+ unsigned flags = trans->flags;
|
|
+
|
|
+ /*
|
|
+ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
|
|
+ * update; if we haven't done anything yet it doesn't apply
|
|
+ */
|
|
+ flags &= ~BTREE_INSERT_NOUNLOCK;
|
|
+
|
|
+ switch (ret) {
|
|
+ case BTREE_INSERT_BTREE_NODE_FULL:
|
|
+ ret = bch2_btree_split_leaf(c, i->iter, flags);
|
|
+
|
|
+ /*
|
|
+ * if the split succeeded without dropping locks the insert will
|
|
+ * still be atomic (what the caller peeked() and is overwriting
|
|
+ * won't have changed)
|
|
+ */
|
|
+#if 0
|
|
+ /*
|
|
+ * XXX:
|
|
+ * split -> btree node merging (of parent node) might still drop
|
|
+ * locks when we're not passing it BTREE_INSERT_NOUNLOCK
|
|
+ *
|
|
+ * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that
|
|
+ * will inhibit merging - but we don't have a reliable way yet
|
|
+ * (do we?) of checking if we dropped locks in this path
|
|
+ */
|
|
+ if (!ret)
|
|
+ goto retry;
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * don't care if we got ENOSPC because we told split it
|
|
+ * couldn't block:
|
|
+ */
|
|
+ if (!ret ||
|
|
+ ret == -EINTR ||
|
|
+ (flags & BTREE_INSERT_NOUNLOCK)) { /* NOTE(review): always false here - the bit was cleared from flags above */
|
|
+ trace_trans_restart_btree_node_split(trans->ip);
|
|
+ ret = -EINTR;
|
|
+ }
|
|
+ break;
|
|
+ case BTREE_INSERT_ENOSPC:
|
|
+ ret = -ENOSPC;
|
|
+ break;
|
|
+ case BTREE_INSERT_NEED_MARK_REPLICAS:
|
|
+ bch2_trans_unlock(trans);
|
|
+
|
|
+ ret = bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (bch2_trans_relock(trans)) /* relock succeeded: return 0 */
|
|
+ return 0;
|
|
+
|
|
+ trace_trans_restart_mark_replicas(trans->ip);
|
|
+ ret = -EINTR;
|
|
+ break;
|
|
+ case BTREE_INSERT_NEED_JOURNAL_RES:
|
|
+ bch2_trans_unlock(trans);
|
|
+
|
|
+ if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
|
|
+ !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED))
|
|
+ return -EAGAIN;
|
|
+
|
|
+ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (bch2_trans_relock(trans))
|
|
+ return 0;
|
|
+
|
|
+ trace_trans_restart_journal_res_get(trans->ip);
|
|
+ ret = -EINTR;
|
|
+ break;
|
|
+ case BTREE_INSERT_NEED_JOURNAL_RECLAIM:
|
|
+ bch2_trans_unlock(trans);
|
|
+
|
|
+ trace_trans_blocked_journal_reclaim(trans->ip);
|
|
+
|
|
+ wait_event_freezable(c->journal.reclaim_wait, /* sleeps until journal_reclaim_wait_done() is nonzero; the macro's own return value is ignored */
|
|
+ (ret = journal_reclaim_wait_done(c)));
|
|
+ if (ret < 0) /* negative: journal error */
|
|
+ return ret;
|
|
+
|
|
+ if (bch2_trans_relock(trans))
|
|
+ return 0;
|
|
+
|
|
+ trace_trans_restart_journal_reclaim(trans->ip);
|
|
+ ret = -EINTR;
|
|
+ break;
|
|
+ default:
|
|
+ BUG_ON(ret >= 0); /* anything not handled above must already be a negative errno */
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static noinline int
|
|
+bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ int ret;
|
|
+
|
|
+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
|
|
+ return -EROFS;
|
|
+
|
|
+ bch2_trans_unlock(trans);
|
|
+
|
|
+ ret = bch2_fs_read_write_early(c);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ percpu_ref_get(&c->writes);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void __bch2_trans_update2(struct btree_trans *trans,
|
|
+ struct btree_insert_entry n)
|
|
+{
|
|
+ struct btree_insert_entry *i;
|
|
+
|
|
+ btree_insert_entry_checks(trans, &n);
|
|
+
|
|
+ EBUG_ON(trans->nr_updates2 >= BTREE_ITER_MAX);
|
|
+
|
|
+ n.iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
|
|
+
|
|
+ trans_for_each_update2(trans, i)
|
|
+ if (btree_insert_entry_cmp(&n, i) <= 0)
|
|
+ break;
|
|
+
|
|
+ if (i < trans->updates2 + trans->nr_updates2 &&
|
|
+ !btree_insert_entry_cmp(&n, i))
|
|
+ *i = n;
|
|
+ else
|
|
+ array_insert_item(trans->updates2, trans->nr_updates2,
|
|
+ i - trans->updates2, n);
|
|
+}
|
|
+
|
|
+static void bch2_trans_update2(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_i *insert)
|
|
+{
|
|
+ __bch2_trans_update2(trans, (struct btree_insert_entry) {
|
|
+ .bkey_type = __btree_node_type(iter->level, iter->btree_id),
|
|
+ .btree_id = iter->btree_id,
|
|
+ .level = iter->level,
|
|
+ .iter = iter,
|
|
+ .k = insert,
|
|
+ });
|
|
+}
|
|
+
|
|
+static int extent_update_to_keys(struct btree_trans *trans,
|
|
+ struct btree_insert_entry n)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ if (bkey_deleted(&n.k->k))
|
|
+ return 0;
|
|
+
|
|
+ ret = bch2_extent_can_insert(trans, n.iter, n.k);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ n.iter = bch2_trans_get_iter(trans, n.iter->btree_id, n.k->k.p,
|
|
+ BTREE_ITER_INTENT|
|
|
+ BTREE_ITER_NOT_EXTENTS);
|
|
+ n.is_extent = false;
|
|
+
|
|
+ __bch2_trans_update2(trans, n);
|
|
+ bch2_trans_iter_put(trans, n.iter);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int extent_handle_overwrites(struct btree_trans *trans,
|
|
+ enum btree_id btree_id,
|
|
+ struct bkey_i *insert)
|
|
+{
|
|
+ struct btree_iter *iter, *update_iter;
|
|
+ struct bpos start = bkey_start_pos(&insert->k);
|
|
+ struct bkey_i *update;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+
|
|
+ iter = bch2_trans_get_iter(trans, btree_id, start,
|
|
+ BTREE_ITER_INTENT);
|
|
+ k = bch2_btree_iter_peek_with_updates(iter);
|
|
+
|
|
+ while (k.k && !(ret = bkey_err(k))) {
|
|
+ if (bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0)
|
|
+ break;
|
|
+
|
|
+ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) {
|
|
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
|
|
+ if ((ret = PTR_ERR_OR_ZERO(update)))
|
|
+ break;
|
|
+
|
|
+ bkey_reassemble(update, k);
|
|
+
|
|
+ bch2_cut_back(start, update);
|
|
+
|
|
+ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
|
|
+ BTREE_ITER_NOT_EXTENTS|
|
|
+ BTREE_ITER_INTENT);
|
|
+ bch2_trans_update2(trans, update_iter, update);
|
|
+ bch2_trans_iter_put(trans, update_iter);
|
|
+ }
|
|
+
|
|
+ if (bkey_cmp(k.k->p, insert->k.p) < 0 ||
|
|
+ (!bkey_cmp(k.k->p, insert->k.p) && bkey_deleted(&insert->k))) {
|
|
+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey));
|
|
+ if ((ret = PTR_ERR_OR_ZERO(update)))
|
|
+ break;
|
|
+
|
|
+ bkey_init(&update->k);
|
|
+ update->k.p = k.k->p;
|
|
+
|
|
+ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
|
|
+ BTREE_ITER_NOT_EXTENTS|
|
|
+ BTREE_ITER_INTENT);
|
|
+ bch2_trans_update2(trans, update_iter, update);
|
|
+ bch2_trans_iter_put(trans, update_iter);
|
|
+ }
|
|
+
|
|
+ if (bkey_cmp(k.k->p, insert->k.p) > 0) {
|
|
+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
|
|
+ if ((ret = PTR_ERR_OR_ZERO(update)))
|
|
+ break;
|
|
+
|
|
+ bkey_reassemble(update, k);
|
|
+ bch2_cut_front(insert->k.p, update);
|
|
+
|
|
+ update_iter = bch2_trans_get_iter(trans, btree_id, update->k.p,
|
|
+ BTREE_ITER_NOT_EXTENTS|
|
|
+ BTREE_ITER_INTENT);
|
|
+ bch2_trans_update2(trans, update_iter, update);
|
|
+ bch2_trans_iter_put(trans, update_iter);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ k = bch2_btree_iter_next_with_updates(iter);
|
|
+ }
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int __bch2_trans_commit(struct btree_trans *trans)
|
|
+{
|
|
+ struct btree_insert_entry *i = NULL;
|
|
+ struct btree_iter *iter;
|
|
+ bool trans_trigger_run;
|
|
+ unsigned u64s, reset_flags = 0;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (!trans->nr_updates)
|
|
+ goto out_reset;
|
|
+
|
|
+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
|
|
+ lockdep_assert_held(&trans->c->gc_lock);
|
|
+
|
|
+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));
|
|
+
|
|
+ trans->journal_u64s = trans->extra_journal_entry_u64s;
|
|
+ trans->journal_preres_u64s = 0;
|
|
+
|
|
+ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
|
|
+ unlikely(!percpu_ref_tryget(&trans->c->writes))) {
|
|
+ ret = bch2_trans_commit_get_rw_cold(trans);
|
|
+ if (ret)
|
|
+ goto out_reset;
|
|
+ }
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ trans_for_each_update(trans, i)
|
|
+ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED &&
|
|
+ !(i->trigger_flags & BTREE_TRIGGER_NORUN))
|
|
+ bch2_btree_key_cache_verify_clean(trans,
|
|
+ i->btree_id, i->k->k.p);
|
|
+#endif
|
|
+
|
|
+ /*
|
|
+ * Running triggers will append more updates to the list of updates as
|
|
+ * we're walking it:
|
|
+ */
|
|
+ do {
|
|
+ trans_trigger_run = false;
|
|
+
|
|
+ trans_for_each_update(trans, i) {
|
|
+ if ((BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) &&
|
|
+ !i->trans_triggers_run) {
|
|
+ i->trans_triggers_run = true;
|
|
+ trans_trigger_run = true;
|
|
+
|
|
+ ret = bch2_trans_mark_update(trans, i->iter, i->k,
|
|
+ i->trigger_flags);
|
|
+ if (unlikely(ret)) {
|
|
+ if (ret == -EINTR)
|
|
+ trace_trans_restart_mark(trans->ip);
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ } while (trans_trigger_run);
|
|
+
|
|
+ /* Turn extents updates into keys: */
|
|
+ trans_for_each_update(trans, i)
|
|
+ if (i->is_extent) {
|
|
+ ret = extent_handle_overwrites(trans, i->btree_id, i->k);
|
|
+ if (unlikely(ret))
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ trans_for_each_update(trans, i) {
|
|
+ ret = i->is_extent
|
|
+ ? extent_update_to_keys(trans, *i)
|
|
+ : (__bch2_trans_update2(trans, *i), 0);
|
|
+ if (unlikely(ret))
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ trans_for_each_update2(trans, i) {
|
|
+ ret = bch2_btree_iter_traverse(i->iter);
|
|
+ if (unlikely(ret)) {
|
|
+ trace_trans_restart_traverse(trans->ip);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (unlikely(!bch2_btree_iter_upgrade(i->iter, i->level + 1))) {
|
|
+ trace_trans_restart_upgrade(trans->ip);
|
|
+ ret = -EINTR;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!btree_node_intent_locked(i->iter, i->level));
|
|
+
|
|
+ u64s = jset_u64s(i->k->k.u64s);
|
|
+ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED &&
|
|
+ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)))
|
|
+ trans->journal_preres_u64s += u64s;
|
|
+ trans->journal_u64s += u64s;
|
|
+ }
|
|
+retry:
|
|
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
|
|
+
|
|
+ ret = do_bch2_trans_commit(trans, &i);
|
|
+
|
|
+ /* make sure we didn't drop or screw up locks: */
|
|
+ bch2_btree_trans_verify_locks(trans);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ trans_for_each_iter(trans, iter)
|
|
+ if (btree_iter_live(trans, iter) &&
|
|
+ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT))
|
|
+ bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
|
|
+out:
|
|
+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
|
|
+
|
|
+ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
|
|
+ percpu_ref_put(&trans->c->writes);
|
|
+out_reset:
|
|
+ if (!ret)
|
|
+ reset_flags |= TRANS_RESET_NOTRAVERSE;
|
|
+ if (!ret && (trans->flags & BTREE_INSERT_NOUNLOCK))
|
|
+ reset_flags |= TRANS_RESET_NOUNLOCK;
|
|
+ bch2_trans_reset(trans, reset_flags);
|
|
+
|
|
+ return ret;
|
|
+err:
|
|
+ ret = bch2_trans_commit_error(trans, i, ret);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ goto retry;
|
|
+}
|
|
+
|
|
+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
|
|
+ struct bkey_i *k, enum btree_trigger_flags flags)
|
|
+{
|
|
+ struct btree_insert_entry *i, n = (struct btree_insert_entry) {
|
|
+ .trigger_flags = flags,
|
|
+ .bkey_type = __btree_node_type(iter->level, iter->btree_id),
|
|
+ .btree_id = iter->btree_id,
|
|
+ .level = iter->level,
|
|
+ .is_extent = (iter->flags & BTREE_ITER_IS_EXTENTS) != 0,
|
|
+ .iter = iter,
|
|
+ .k = k
|
|
+ };
|
|
+
|
|
+ BUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+ BUG_ON(bkey_cmp(iter->pos,
|
|
+ n.is_extent ? bkey_start_pos(&k->k) : k->k.p));
|
|
+
|
|
+ trans_for_each_update(trans, i) {
|
|
+ BUG_ON(bkey_cmp(i->iter->pos,
|
|
+ i->is_extent ? bkey_start_pos(&i->k->k) : i->k->k.p));
|
|
+
|
|
+ BUG_ON(i != trans->updates &&
|
|
+ btree_insert_entry_cmp(i - 1, i) >= 0);
|
|
+ }
|
|
+#endif
|
|
+
|
|
+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
|
|
+
|
|
+ if (n.is_extent) {
|
|
+ iter->pos_after_commit = k->k.p;
|
|
+ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Pending updates are kept sorted: first, find position of new update,
|
|
+ * then delete/trim any updates the new update overwrites:
|
|
+ */
|
|
+ if (!n.is_extent) {
|
|
+ trans_for_each_update(trans, i)
|
|
+ if (btree_insert_entry_cmp(&n, i) <= 0)
|
|
+ break;
|
|
+
|
|
+ if (i < trans->updates + trans->nr_updates &&
|
|
+ !btree_insert_entry_cmp(&n, i))
|
|
+ *i = n;
|
|
+ else
|
|
+ array_insert_item(trans->updates, trans->nr_updates,
|
|
+ i - trans->updates, n);
|
|
+ } else {
|
|
+ trans_for_each_update(trans, i)
|
|
+ if (btree_insert_entry_cmp(&n, i) < 0)
|
|
+ break;
|
|
+
|
|
+ while (i > trans->updates &&
|
|
+ i[-1].btree_id == n.btree_id &&
|
|
+ bkey_cmp(bkey_start_pos(&n.k->k),
|
|
+ bkey_start_pos(&i[-1].k->k)) <= 0) {
|
|
+ --i;
|
|
+ array_remove_item(trans->updates, trans->nr_updates,
|
|
+ i - trans->updates);
|
|
+ }
|
|
+
|
|
+ if (i > trans->updates &&
|
|
+ i[-1].btree_id == n.btree_id &&
|
|
+ bkey_cmp(bkey_start_pos(&n.k->k), i[-1].k->k.p) < 0)
|
|
+ bch2_cut_back(bkey_start_pos(&n.k->k), i[-1].k);
|
|
+
|
|
+ if (i < trans->updates + trans->nr_updates &&
|
|
+ i->btree_id == n.btree_id &&
|
|
+ bkey_cmp(n.k->k.p, bkey_start_pos(&i->k->k)) > 0) {
|
|
+ /* We don't handle splitting extents here: */
|
|
+ BUG_ON(bkey_cmp(bkey_start_pos(&n.k->k),
|
|
+ bkey_start_pos(&i->k->k)) > 0);
|
|
+
|
|
+ /*
|
|
+ * When we have an extent that overwrites the start of another
|
|
+ * update, trimming that extent will mean the iterator's
|
|
+ * position has to change since the iterator position has to
|
|
+ * match the extent's start pos - but we don't want to change
|
|
+ * the iterator pos if some other code is using it, so we may
|
|
+ * need to clone it:
|
|
+ */
|
|
+ if (btree_iter_live(trans, i->iter)) {
|
|
+ i->iter = bch2_trans_copy_iter(trans, i->iter);
|
|
+
|
|
+ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
|
|
+ bch2_trans_iter_put(trans, i->iter);
|
|
+ }
|
|
+
|
|
+ bch2_cut_front(n.k->k.p, i->k);
|
|
+ bch2_btree_iter_set_pos(i->iter, n.k->k.p);
|
|
+ }
|
|
+
|
|
+ array_insert_item(trans->updates, trans->nr_updates,
|
|
+ i - trans->updates, n);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_trans_commit_hook(struct btree_trans *trans,
|
|
+ struct btree_trans_commit_hook *h)
|
|
+{
|
|
+ h->next = trans->hooks;
|
|
+ trans->hooks = h;
|
|
+}
|
|
+
|
|
+int __bch2_btree_insert(struct btree_trans *trans,
|
|
+ enum btree_id id, struct bkey_i *k)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
|
|
+ BTREE_ITER_INTENT);
|
|
+
|
|
+ ret = bch2_trans_update(trans, iter, k, 0);
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_btree_insert - insert a key into a btree
|
|
+ * @c: pointer to struct bch_fs
|
|
+ * @id: btree to insert into
|
|
+ * @k: key to insert
|
|
+ * @disk_res: passed, along with @journal_seq and @flags, to bch2_trans_do()
|
|
+ */
|
|
+int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
|
|
+ struct bkey_i *k,
|
|
+ struct disk_reservation *disk_res,
|
|
+ u64 *journal_seq, int flags)
|
|
+{
|
|
+ return bch2_trans_do(c, disk_res, journal_seq, flags,
|
|
+ __bch2_btree_insert(&trans, id, k));
|
|
+}
|
|
+
|
|
+int bch2_btree_delete_at(struct btree_trans *trans,
|
|
+ struct btree_iter *iter, unsigned flags)
|
|
+{
|
|
+ struct bkey_i k;
|
|
+
|
|
+ bkey_init(&k.k);
|
|
+ k.k.p = iter->pos;
|
|
+
|
|
+ bch2_trans_update(trans, iter, &k, 0);
|
|
+ return bch2_trans_commit(trans, NULL, NULL,
|
|
+ BTREE_INSERT_NOFAIL|flags);
|
|
+}
|
|
+
|
|
+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
|
|
+ struct bpos start, struct bpos end,
|
|
+ u64 *journal_seq)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+
|
|
+ iter = bch2_trans_get_iter(trans, id, start, BTREE_ITER_INTENT);
|
|
+retry:
|
|
+ while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ !(ret = bkey_err(k)) &&
|
|
+ bkey_cmp(iter->pos, end) < 0) {
|
|
+ struct bkey_i delete;
|
|
+
|
|
+ bch2_trans_begin(trans);
|
|
+
|
|
+ bkey_init(&delete.k);
|
|
+
|
|
+ /*
|
|
+ * This could probably be more efficient for extents:
|
|
+ */
|
|
+
|
|
+ /*
|
|
+ * For extents, iter.pos won't necessarily be the same as
|
|
+ * bkey_start_pos(k.k) (for non extents they always will be the
|
|
+ * same). It's important that we delete starting from iter.pos
|
|
+ * because the range we want to delete could start in the middle
|
|
+ * of k.
|
|
+ *
|
|
+ * (bch2_btree_iter_peek() does guarantee that iter.pos >=
|
|
+ * bkey_start_pos(k.k)).
|
|
+ */
|
|
+ delete.k.p = iter->pos;
|
|
+
|
|
+ if (btree_node_type_is_extents(iter->btree_id)) {
|
|
+ unsigned max_sectors =
|
|
+ KEY_SIZE_MAX & (~0 << trans->c->block_bits);
|
|
+
|
|
+ /* create the biggest key we can */
|
|
+ bch2_key_resize(&delete.k, max_sectors);
|
|
+ bch2_cut_back(end, &delete);
|
|
+
|
|
+ ret = bch2_extent_trim_atomic(&delete, iter);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ bch2_trans_update(trans, iter, &delete, 0);
|
|
+ ret = bch2_trans_commit(trans, NULL, journal_seq,
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ bch2_trans_cond_resched(trans);
|
|
+ }
|
|
+
|
|
+ if (ret == -EINTR) {
|
|
+ ret = 0;
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_free(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_btree_delete_range - delete everything within a given range
|
|
+ *
|
|
+ * Range is a half open interval - [start, end)
|
|
+ */
|
|
+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
|
|
+ struct bpos start, struct bpos end,
|
|
+ u64 *journal_seq)
|
|
+{
|
|
+ return bch2_trans_do(c, NULL, journal_seq, 0,
|
|
+ bch2_btree_delete_range_trans(&trans, id, start, end, journal_seq));
|
|
+}
|
|
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
|
|
new file mode 100644
|
|
index 000000000000..70008603f047
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/buckets.c
|
|
@@ -0,0 +1,2287 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Code for manipulating bucket marks for garbage collection.
|
|
+ *
|
|
+ * Copyright 2014 Datera, Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_background.h"
|
|
+#include "bset.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "movinggc.h"
|
|
+#include "replicas.h"
|
|
+
|
|
+#include <linux/preempt.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
|
|
+ enum bch_data_type data_type,
|
|
+ s64 sectors)
|
|
+{
|
|
+ switch (data_type) {
|
|
+ case BCH_DATA_btree:
|
|
+ fs_usage->btree += sectors;
|
|
+ break;
|
|
+ case BCH_DATA_user:
|
|
+ case BCH_DATA_parity:
|
|
+ fs_usage->data += sectors;
|
|
+ break;
|
|
+ case BCH_DATA_cached:
|
|
+ fs_usage->cached += sectors;
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent
|
|
+ * wraparound:
|
|
+ */
|
|
+void bch2_bucket_seq_cleanup(struct bch_fs *c)
|
|
+{
|
|
+ u64 journal_seq = atomic64_read(&c->journal.seq);
|
|
+ u16 last_seq_ondisk = c->journal.last_seq_ondisk;
|
|
+ struct bch_dev *ca;
|
|
+ struct bucket_array *buckets;
|
|
+ struct bucket *g;
|
|
+ struct bucket_mark m;
|
|
+ unsigned i;
|
|
+
|
|
+ if (journal_seq - c->last_bucket_seq_cleanup <
|
|
+ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
|
|
+ return;
|
|
+
|
|
+ c->last_bucket_seq_cleanup = journal_seq;
|
|
+
|
|
+ for_each_member_device(ca, c, i) {
|
|
+ down_read(&ca->bucket_lock);
|
|
+ buckets = bucket_array(ca);
|
|
+
|
|
+ for_each_bucket(g, buckets) {
|
|
+ bucket_cmpxchg(g, m, ({
|
|
+ if (!m.journal_seq_valid ||
|
|
+ bucket_needs_journal_commit(m, last_seq_ondisk))
|
|
+ break;
|
|
+
|
|
+ m.journal_seq_valid = 0;
|
|
+ }));
|
|
+ }
|
|
+ up_read(&ca->bucket_lock);
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_fs_usage_initialize(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_fs_usage *usage;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ percpu_down_write(&c->mark_lock);
|
|
+ usage = c->usage_base;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
|
|
+ bch2_fs_usage_acc_to_base(c, i);
|
|
+
|
|
+ for (i = 0; i < BCH_REPLICAS_MAX; i++)
|
|
+ usage->reserved += usage->persistent_reserved[i];
|
|
+
|
|
+ for (i = 0; i < c->replicas.nr; i++) {
|
|
+ struct bch_replicas_entry *e =
|
|
+ cpu_replicas_entry(&c->replicas, i);
|
|
+
|
|
+ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
|
|
+ }
|
|
+
|
|
+ for_each_member_device(ca, c, i) {
|
|
+ struct bch_dev_usage dev = bch2_dev_usage_read(ca);
|
|
+
|
|
+ usage->hidden += (dev.d[BCH_DATA_sb].buckets +
|
|
+ dev.d[BCH_DATA_journal].buckets) *
|
|
+ ca->mi.bucket_size;
|
|
+ }
|
|
+
|
|
+ percpu_up_write(&c->mark_lock);
|
|
+}
|
|
+
|
|
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
|
|
+ unsigned journal_seq,
|
|
+ bool gc)
|
|
+{
|
|
+ return this_cpu_ptr(gc
|
|
+ ? ca->usage_gc
|
|
+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
|
|
+}
|
|
+
|
|
+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
|
|
+{
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ struct bch_dev_usage ret;
|
|
+ unsigned seq, i, u64s = dev_usage_u64s();
|
|
+
|
|
+ do {
|
|
+ seq = read_seqcount_begin(&c->usage_lock);
|
|
+ memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
|
|
+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
|
|
+ acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
|
|
+ } while (read_seqcount_retry(&c->usage_lock, seq));
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
|
|
+ unsigned journal_seq,
|
|
+ bool gc)
|
|
+{
|
|
+ return this_cpu_ptr(gc
|
|
+ ? c->usage_gc
|
|
+ : c->usage[journal_seq & JOURNAL_BUF_MASK]);
|
|
+}
|
|
+
|
|
+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v)
|
|
+{
|
|
+ ssize_t offset = v - (u64 *) c->usage_base;
|
|
+ unsigned i, seq;
|
|
+ u64 ret;
|
|
+
|
|
+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c));
|
|
+ percpu_rwsem_assert_held(&c->mark_lock);
|
|
+
|
|
+ do {
|
|
+ seq = read_seqcount_begin(&c->usage_lock);
|
|
+ ret = *v;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
|
|
+ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset);
|
|
+ } while (read_seqcount_retry(&c->usage_lock, seq));
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_fs_usage_online *ret;
|
|
+ unsigned seq, i, u64s;
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+
|
|
+ ret = kmalloc(sizeof(struct bch_fs_usage_online) +
|
|
+ sizeof(u64) * c->replicas.nr, GFP_NOFS);
|
|
+ if (unlikely(!ret)) {
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ ret->online_reserved = percpu_u64_get(c->online_reserved);
|
|
+
|
|
+ u64s = fs_usage_u64s(c);
|
|
+ do {
|
|
+ seq = read_seqcount_begin(&c->usage_lock);
|
|
+ memcpy(&ret->u, c->usage_base, u64s * sizeof(u64));
|
|
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
|
|
+ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], u64s);
|
|
+ } while (read_seqcount_retry(&c->usage_lock, seq));
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i, u64s = fs_usage_u64s(c);
|
|
+
|
|
+ BUG_ON(idx >= ARRAY_SIZE(c->usage));
|
|
+
|
|
+ preempt_disable();
|
|
+ write_seqcount_begin(&c->usage_lock);
|
|
+
|
|
+ acc_u64s_percpu((u64 *) c->usage_base,
|
|
+ (u64 __percpu *) c->usage[idx], u64s);
|
|
+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
|
|
+
|
|
+ rcu_read_lock();
|
|
+ for_each_member_device_rcu(ca, c, i, NULL) {
|
|
+ u64s = dev_usage_u64s();
|
|
+
|
|
+ acc_u64s_percpu((u64 *) ca->usage_base,
|
|
+ (u64 __percpu *) ca->usage[idx], u64s);
|
|
+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ write_seqcount_end(&c->usage_lock);
|
|
+ preempt_enable();
|
|
+}
|
|
+
|
|
+void bch2_fs_usage_to_text(struct printbuf *out,
|
|
+ struct bch_fs *c,
|
|
+ struct bch_fs_usage_online *fs_usage)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity);
|
|
+
|
|
+ pr_buf(out, "hidden:\t\t\t\t%llu\n",
|
|
+ fs_usage->u.hidden);
|
|
+ pr_buf(out, "data:\t\t\t\t%llu\n",
|
|
+ fs_usage->u.data);
|
|
+ pr_buf(out, "cached:\t\t\t\t%llu\n",
|
|
+ fs_usage->u.cached);
|
|
+ pr_buf(out, "reserved:\t\t\t%llu\n",
|
|
+ fs_usage->u.reserved);
|
|
+ pr_buf(out, "nr_inodes:\t\t\t%llu\n",
|
|
+ fs_usage->u.nr_inodes);
|
|
+ pr_buf(out, "online reserved:\t\t%llu\n",
|
|
+ fs_usage->online_reserved);
|
|
+
|
|
+ for (i = 0;
|
|
+ i < ARRAY_SIZE(fs_usage->u.persistent_reserved);
|
|
+ i++) {
|
|
+ pr_buf(out, "%u replicas:\n", i + 1);
|
|
+ pr_buf(out, "\treserved:\t\t%llu\n",
|
|
+ fs_usage->u.persistent_reserved[i]);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < c->replicas.nr; i++) {
|
|
+ struct bch_replicas_entry *e =
|
|
+ cpu_replicas_entry(&c->replicas, i);
|
|
+
|
|
+ pr_buf(out, "\t");
|
|
+ bch2_replicas_entry_to_text(out, e);
|
|
+ pr_buf(out, ":\t%llu\n", fs_usage->u.replicas[i]);
|
|
+ }
|
|
+}
|
|
+
|
|
+#define RESERVE_FACTOR 6
|
|
+
|
|
+static u64 reserve_factor(u64 r)
|
|
+{
|
|
+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
|
|
+}
|
|
+
|
|
+static u64 avail_factor(u64 r)
|
|
+{
|
|
+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1);
|
|
+}
|
|
+
|
|
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
|
|
+{
|
|
+ return min(fs_usage->u.hidden +
|
|
+ fs_usage->u.btree +
|
|
+ fs_usage->u.data +
|
|
+ reserve_factor(fs_usage->u.reserved +
|
|
+ fs_usage->online_reserved),
|
|
+ c->capacity);
|
|
+}
|
|
+
|
|
+static struct bch_fs_usage_short
|
|
+__bch2_fs_usage_read_short(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_fs_usage_short ret;
|
|
+ u64 data, reserved;
|
|
+
|
|
+ ret.capacity = c->capacity -
|
|
+ bch2_fs_usage_read_one(c, &c->usage_base->hidden);
|
|
+
|
|
+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
|
|
+ bch2_fs_usage_read_one(c, &c->usage_base->btree);
|
|
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
|
|
+ percpu_u64_get(c->online_reserved);
|
|
+
|
|
+ ret.used = min(ret.capacity, data + reserve_factor(reserved));
|
|
+ ret.free = ret.capacity - ret.used;
|
|
+
|
|
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct bch_fs_usage_short
|
|
+bch2_fs_usage_read_short(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_fs_usage_short ret;
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ ret = __bch2_fs_usage_read_short(c);
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline int is_unavailable_bucket(struct bucket_mark m)
|
|
+{
|
|
+ return !is_available_bucket(m);
|
|
+}
|
|
+
|
|
+static inline int bucket_sectors_fragmented(struct bch_dev *ca,
|
|
+ struct bucket_mark m)
|
|
+{
|
|
+ return bucket_sectors_used(m)
|
|
+ ? max(0, (int) ca->mi.bucket_size - (int) bucket_sectors_used(m))
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+static inline int is_stripe_data_bucket(struct bucket_mark m)
|
|
+{
|
|
+ return m.stripe && m.data_type != BCH_DATA_parity;
|
|
+}
|
|
+
|
|
+static inline enum bch_data_type bucket_type(struct bucket_mark m)
|
|
+{
|
|
+ return m.cached_sectors && !m.dirty_sectors
|
|
+ ? BCH_DATA_cached
|
|
+ : m.data_type;
|
|
+}
|
|
+
|
|
+static bool bucket_became_unavailable(struct bucket_mark old,
|
|
+ struct bucket_mark new)
|
|
+{
|
|
+ return is_available_bucket(old) &&
|
|
+ !is_available_bucket(new);
|
|
+}
|
|
+
|
|
+static inline void account_bucket(struct bch_fs_usage *fs_usage,
|
|
+ struct bch_dev_usage *dev_usage,
|
|
+ enum bch_data_type type,
|
|
+ int nr, s64 size)
|
|
+{
|
|
+ if (type == BCH_DATA_sb || type == BCH_DATA_journal)
|
|
+ fs_usage->hidden += size;
|
|
+
|
|
+ dev_usage->d[type].buckets += nr;
|
|
+}
|
|
+
|
|
+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
|
|
+ struct bch_fs_usage *fs_usage,
|
|
+ struct bucket_mark old, struct bucket_mark new,
|
|
+ u64 journal_seq, bool gc)
|
|
+{
|
|
+ struct bch_dev_usage *u;
|
|
+
|
|
+ percpu_rwsem_assert_held(&c->mark_lock);
|
|
+
|
|
+ preempt_disable();
|
|
+ if (!fs_usage)
|
|
+ fs_usage = fs_usage_ptr(c, journal_seq, gc);
|
|
+ u = dev_usage_ptr(ca, journal_seq, gc);
|
|
+
|
|
+ if (bucket_type(old))
|
|
+ account_bucket(fs_usage, u, bucket_type(old),
|
|
+ -1, -ca->mi.bucket_size);
|
|
+
|
|
+ if (bucket_type(new))
|
|
+ account_bucket(fs_usage, u, bucket_type(new),
|
|
+ 1, ca->mi.bucket_size);
|
|
+
|
|
+ u->buckets_ec += (int) new.stripe - (int) old.stripe;
|
|
+ u->buckets_unavailable +=
|
|
+ is_unavailable_bucket(new) - is_unavailable_bucket(old);
|
|
+
|
|
+ u->d[old.data_type].sectors -= old.dirty_sectors;
|
|
+ u->d[new.data_type].sectors += new.dirty_sectors;
|
|
+ u->d[BCH_DATA_cached].sectors +=
|
|
+ (int) new.cached_sectors - (int) old.cached_sectors;
|
|
+
|
|
+ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
|
|
+ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
|
|
+
|
|
+ preempt_enable();
|
|
+
|
|
+ if (!is_available_bucket(old) && is_available_bucket(new))
|
|
+ bch2_wake_allocator(ca);
|
|
+}
|
|
+
|
|
+static inline int update_replicas(struct bch_fs *c,
|
|
+ struct bch_fs_usage *fs_usage,
|
|
+ struct bch_replicas_entry *r,
|
|
+ s64 sectors)
|
|
+{
|
|
+ int idx = bch2_replicas_entry_idx(c, r);
|
|
+
|
|
+ if (idx < 0)
|
|
+ return -1;
|
|
+
|
|
+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
|
|
+ fs_usage->replicas[idx] += sectors;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline int update_cached_sectors(struct bch_fs *c,
|
|
+ struct bch_fs_usage *fs_usage,
|
|
+ unsigned dev, s64 sectors)
|
|
+{
|
|
+ struct bch_replicas_padded r;
|
|
+
|
|
+ bch2_replicas_entry_cached(&r.e, dev);
|
|
+
|
|
+ return update_replicas(c, fs_usage, &r.e, sectors);
|
|
+}
|
|
+
|
|
+static struct replicas_delta_list *
|
|
+replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
|
|
+{
|
|
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
|
|
+ unsigned new_size = d ? (d->size + more) * 2 : 128;
|
|
+ unsigned alloc_size = sizeof(*d) + new_size;
|
|
+
|
|
+ WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
|
|
+
|
|
+ if (!d || d->used + more > d->size) {
|
|
+ d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO);
|
|
+
|
|
+ BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX);
|
|
+
|
|
+ if (!d) {
|
|
+ d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO);
|
|
+ memset(d, 0, REPLICAS_DELTA_LIST_MAX);
|
|
+
|
|
+ if (trans->fs_usage_deltas)
|
|
+ memcpy(d, trans->fs_usage_deltas,
|
|
+ trans->fs_usage_deltas->size + sizeof(*d));
|
|
+
|
|
+ new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
|
|
+ kfree(trans->fs_usage_deltas);
|
|
+ }
|
|
+
|
|
+ d->size = new_size;
|
|
+ trans->fs_usage_deltas = d;
|
|
+ }
|
|
+ return d;
|
|
+}
|
|
+
|
|
+static inline void update_replicas_list(struct btree_trans *trans,
|
|
+ struct bch_replicas_entry *r,
|
|
+ s64 sectors)
|
|
+{
|
|
+ struct replicas_delta_list *d;
|
|
+ struct replicas_delta *n;
|
|
+ unsigned b;
|
|
+
|
|
+ if (!sectors)
|
|
+ return;
|
|
+
|
|
+ b = replicas_entry_bytes(r) + 8;
|
|
+ d = replicas_deltas_realloc(trans, b);
|
|
+
|
|
+ n = (void *) d->d + d->used;
|
|
+ n->delta = sectors;
|
|
+ memcpy(&n->r, r, replicas_entry_bytes(r));
|
|
+ bch2_replicas_entry_sort(&n->r);
|
|
+ d->used += b;
|
|
+}
|
|
+
|
|
+static inline void update_cached_sectors_list(struct btree_trans *trans,
|
|
+ unsigned dev, s64 sectors)
|
|
+{
|
|
+ struct bch_replicas_padded r;
|
|
+
|
|
+ bch2_replicas_entry_cached(&r.e, dev);
|
|
+
|
|
+ update_replicas_list(trans, &r.e, sectors);
|
|
+}
|
|
+
|
|
+#define do_mark_fn(fn, c, pos, flags, ...) \
|
|
+({ \
|
|
+ int gc, ret = 0; \
|
|
+ \
|
|
+ percpu_rwsem_assert_held(&c->mark_lock); \
|
|
+ \
|
|
+ for (gc = 0; gc < 2 && !ret; gc++) \
|
|
+ if (!gc == !(flags & BTREE_TRIGGER_GC) || \
|
|
+ (gc && gc_visited(c, pos))) \
|
|
+ ret = fn(c, __VA_ARGS__, gc); \
|
|
+ ret; \
|
|
+})
|
|
+
|
|
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
|
|
+ size_t b, bool owned_by_allocator)
|
|
+{
|
|
+ struct bucket *g = bucket(ca, b);
|
|
+ struct bucket_mark old, new;
|
|
+
|
|
+ old = bucket_cmpxchg(g, new, ({
|
|
+ new.owned_by_allocator = owned_by_allocator;
|
|
+ }));
|
|
+
|
|
+ BUG_ON(owned_by_allocator == old.owned_by_allocator);
|
|
+}
|
|
+
|
|
+/*
+ * Trigger for alloc keys: copy the bucket state unpacked from @new into the
+ * in-memory bucket (the gc copy when BTREE_TRIGGER_GC is set).
+ *
+ * Nothing is done when @new is not an alloc key, when running for gc
+ * outside of the bucket invalidate path, or when the key's offset is not
+ * below the device's bucket count.
+ *
+ * On the invalidate path, the cached sectors the bucket previously held are
+ * subtracted from the cached replicas accounting.
+ *
+ * Returns 0 on success, -1 if that accounting update fails.
+ */
+static int bch2_mark_alloc(struct bch_fs *c,
+			   struct bkey_s_c old, struct bkey_s_c new,
+			   struct bch_fs_usage *fs_usage,
+			   u64 journal_seq, unsigned flags)
+{
+	bool gc = flags & BTREE_TRIGGER_GC;
+	bool invalidating = flags & BTREE_TRIGGER_BUCKET_INVALIDATE;
+	struct bucket_mark prev, next;
+	struct bkey_alloc_unpacked u;
+	struct bch_dev *ca;
+	struct bucket *g;
+
+	/* We don't do anything for deletions - do we?: */
+	switch (new.k->type) {
+	case KEY_TYPE_alloc:
+	case KEY_TYPE_alloc_v2:
+		break;
+	default:
+		return 0;
+	}
+
+	/*
+	 * alloc btree is read in by bch2_alloc_read, not gc:
+	 */
+	if (gc && !invalidating)
+		return 0;
+
+	ca = bch_dev_bkey_exists(c, new.k->p.inode);
+
+	if (new.k->p.offset >= ca->mi.nbuckets)
+		return 0;
+
+	g = __bucket(ca, new.k->p.offset, gc);
+	u = bch2_alloc_unpack(new);
+
+	prev = bucket_cmpxchg(g, next, ({
+		next.gen		= u.gen;
+		next.data_type		= u.data_type;
+		next.dirty_sectors	= u.dirty_sectors;
+		next.cached_sectors	= u.cached_sectors;
+		next.stripe		= u.stripe != 0;
+
+		if (journal_seq) {
+			next.journal_seq_valid	= 1;
+			next.journal_seq	= journal_seq;
+		}
+	}));
+
+	bch2_dev_usage_update(c, ca, fs_usage, prev, next, journal_seq, gc);
+
+	/* Fields that live outside of the bucket mark: */
+	g->io_time[READ]	= u.read_time;
+	g->io_time[WRITE]	= u.write_time;
+	g->oldest_gen		= u.oldest_gen;
+	g->gen_valid		= 1;
+	g->stripe		= u.stripe;
+	g->stripe_redundancy	= u.stripe_redundancy;
+
+	/*
+	 * need to know if we're getting called from the invalidate path or
+	 * not:
+	 */
+	if (!invalidating || !prev.cached_sectors)
+		return 0;
+
+	if (update_cached_sectors(c, fs_usage, ca->dev_idx,
+				  -prev.cached_sectors)) {
+		bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
+		return -1;
+	}
+
+	trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+			 prev.cached_sectors);
+	return 0;
+}
|
|
+
|
|
+/*
+ * Saturating add: stores (unsigned) a + b into @a, clamped to U16_MAX.
+ * Evaluates to true if the sum exceeded U16_MAX and was clamped.
+ * Note that @a is evaluated twice.
+ */
+#define checked_add(a, b)					\
+({								\
+	unsigned _sum = (unsigned) (a) + (b);			\
+	bool _clamped = _sum > U16_MAX;				\
+								\
+	(a) = _clamped ? U16_MAX : _sum;			\
+	_clamped;						\
+})
|
|
+
|
|
+/*
+ * Account @sectors of superblock or journal data to bucket @b of @ca.
+ *
+ * Sets the bucket's data type and adds @sectors to its dirty sector count,
+ * saturating at U16_MAX. A pre-existing, different data type or a saturated
+ * count is reported as a filesystem inconsistency. Device usage is only
+ * updated when @c is non-NULL.
+ *
+ * Always returns 0.
+ */
+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+				       size_t b, enum bch_data_type data_type,
+				       unsigned sectors, bool gc)
+{
+	struct bucket_mark prev, next;
+	struct bucket *g = __bucket(ca, b, gc);
+	bool overflow;
+
+	BUG_ON(!(data_type == BCH_DATA_sb ||
+		 data_type == BCH_DATA_journal));
+
+	prev = bucket_cmpxchg(g, next, ({
+		next.data_type	= data_type;
+		overflow	= checked_add(next.dirty_sectors, sectors);
+	}));
+
+	bch2_fs_inconsistent_on(prev.data_type &&
+				prev.data_type != data_type, c,
+		"different types of data in same bucket: %s, %s",
+		bch2_data_types[prev.data_type],
+		bch2_data_types[data_type]);
+
+	bch2_fs_inconsistent_on(overflow, c,
+		"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX",
+		ca->dev_idx, b, next.gen,
+		bch2_data_types[prev.data_type ?: data_type],
+		prev.dirty_sectors, sectors);
+
+	if (c)
+		bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
+				      prev, next, 0, gc);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Mark bucket @b of @ca as holding @sectors of superblock or journal data.
+ *
+ * With a filesystem (@c non-NULL) the marking goes through do_mark_fn(), so
+ * the regular and/or gc bucket copies are updated according to @flags and
+ * @pos. Without one, only the non-gc copy is updated. Runs with preemption
+ * disabled.
+ */
+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+			       size_t b, enum bch_data_type type,
+			       unsigned sectors, struct gc_pos pos,
+			       unsigned flags)
+{
+	BUG_ON(!(type == BCH_DATA_sb ||
+		 type == BCH_DATA_journal));
+
+	preempt_disable();
+
+	if (unlikely(!c)) {
+		__bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
+	} else {
+		do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
+			   ca, b, type, sectors);
+	}
+
+	preempt_enable();
+}
|
|
+
|
|
+/* Scale @sectors by the ratio @n / @d, rounding up. */
+static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors)
+{
+	unsigned scaled = sectors * n;
+
+	return DIV_ROUND_UP(scaled, d);
+}
|
|
+
|
|
+/*
+ * Compute the change in on-disk sectors for a pointer whose live size
+ * changes by @delta, with sizes scaled by @n / @d.
+ *
+ *  - no overwrite flag:	scaled(@delta)
+ *  - OVERWRITE:		scaled(@old_size + @delta) - scaled(@old_size)
+ *  - OVERWRITE_SPLIT:		the old extent is replaced by two pieces,
+ *				[0, @offset) and the remainder past the
+ *				overwritten range
+ *
+ * Each piece is rounded up separately, hence the separate scaled terms.
+ */
+static s64 __ptr_disk_sectors_delta(unsigned old_size,
+				    unsigned offset, s64 delta,
+				    unsigned flags,
+				    unsigned n, unsigned d)
+{
+	BUG_ON(!n || !d);
+
+	if (!(flags & (BTREE_TRIGGER_OVERWRITE_SPLIT|BTREE_TRIGGER_OVERWRITE)))
+		return disk_sectors_scaled(n, d, delta);
+
+	/* The overwritten range must lie within the old extent: */
+	BUG_ON(offset + -delta > old_size);
+
+	if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT)
+		return -disk_sectors_scaled(n, d, old_size) +
+			disk_sectors_scaled(n, d, offset) +
+			disk_sectors_scaled(n, d, old_size - offset + delta);
+
+	return -disk_sectors_scaled(n, d, old_size) +
+		disk_sectors_scaled(n, d, old_size + delta);
+}
|
|
+
|
|
+/*
+ * On-disk sector delta for decoded pointer @p: __ptr_disk_sectors_delta()
+ * applied to the pointer's crc sizes, scaling by
+ * compressed_size / uncompressed_size.
+ */
+static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
+				  unsigned offset, s64 delta,
+				  unsigned flags)
+{
+	unsigned live_size	= p.crc.live_size;
+	unsigned compressed	= p.crc.compressed_size;
+	unsigned uncompressed	= p.crc.uncompressed_size;
+
+	return __ptr_disk_sectors_delta(live_size, offset, delta, flags,
+					compressed, uncompressed);
+}
|
|
+
|
|
+/*
+ * Validate a pointer against the state of the bucket it points into, before
+ * @sectors are added to that bucket.
+ *
+ * Returns:
+ *   0     the reference is fine
+ *   1     the pointer is a stale cached pointer (generation mismatch)
+ *   -EIO  the pointer's gen is newer than the bucket's, the pointer is too
+ *         stale, a dirty pointer is stale, the data types conflict, or the
+ *         sector count would not fit in 16 bits; each case is reported
+ *         through bch2_fsck_err()
+ */
+static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
+			    const struct bch_extent_ptr *ptr,
+			    s64 sectors, enum bch_data_type ptr_data_type,
+			    u8 bucket_gen, u8 bucket_data_type,
+			    u16 dirty_sectors, u16 cached_sectors)
+{
+	size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr);
+	u16 cur_sectors;
+	char buf[200];
+
+	/* The counter this pointer contributes to: */
+	if (ptr->cached)
+		cur_sectors = cached_sectors;
+	else
+		cur_sectors = dirty_sectors;
+
+	if (gen_after(ptr->gen, bucket_gen)) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, bucket_gen,
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			ptr->gen,
+			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+		return -EIO;
+	}
+
+	if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, bucket_gen,
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			ptr->gen,
+			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+		return -EIO;
+	}
+
+	if (bucket_gen != ptr->gen) {
+		/* Stale cached pointers are tolerated, stale dirty ones aren't: */
+		if (ptr->cached)
+			return 1;
+
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, bucket_gen,
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			ptr->gen,
+			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+		return -EIO;
+	}
+
+	if (bucket_data_type && ptr_data_type &&
+	    bucket_data_type != ptr_data_type) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, bucket_gen,
+			bch2_data_types[bucket_data_type],
+			bch2_data_types[ptr_data_type],
+			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+		return -EIO;
+	}
+
+	if ((unsigned) (cur_sectors + sectors) > U16_MAX) {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
+			"while marking %s",
+			ptr->dev, bucket_nr, bucket_gen,
+			bch2_data_types[bucket_data_type ?: ptr_data_type],
+			cur_sectors, sectors,
+			(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+		return -EIO;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Mark the bucket referenced by block @ptr_idx of stripe key @k as belonging
+ * to that stripe.
+ *
+ * Fails with -EINVAL if the bucket already belongs to a different stripe,
+ * or with check_bucket_ref()'s result if the pointer does not match the
+ * bucket. Parity blocks additionally set the bucket's data type and dirty
+ * sector count.
+ */
+static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
+			      unsigned ptr_idx,
+			      struct bch_fs_usage *fs_usage,
+			      u64 journal_seq, unsigned flags)
+{
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+	unsigned data_blocks = s->nr_blocks - s->nr_redundant;
+	/* Blocks after the data blocks hold parity: */
+	bool parity = ptr_idx >= data_blocks;
+	bool gc = flags & BTREE_TRIGGER_GC;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	struct bucket *g = PTR_BUCKET(ca, ptr, gc);
+	struct bucket_mark prev, next;
+	char buf[200];
+	int ret;
+
+	if (g->stripe && g->stripe != k.k->p.offset) {
+		bch2_fs_inconsistent(c,
+			      "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+			      ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
+			      (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+		return -EINVAL;
+	}
+
+	prev = bucket_cmpxchg(g, next, ({
+		ret = check_bucket_ref(c, k, ptr, 0, 0, next.gen, next.data_type,
+				       next.dirty_sectors, next.cached_sectors);
+		/* Bail out of the whole function, leaving the bucket untouched: */
+		if (ret)
+			return ret;
+
+		if (parity) {
+			next.data_type		= BCH_DATA_parity;
+			next.dirty_sectors	= le16_to_cpu(s->sectors);
+		}
+
+		if (journal_seq) {
+			next.journal_seq_valid	= 1;
+			next.journal_seq	= journal_seq;
+		}
+	}));
+
+	g->stripe		= k.k->p.offset;
+	g->stripe_redundancy	= s->nr_redundant;
+
+	bch2_dev_usage_update(c, ca, fs_usage, prev, next, journal_seq, gc);
+	return 0;
+}
|
|
+
|
|
+/*
+ * Apply a pointer's sector delta to a bucket's counters.
+ *
+ * After check_bucket_ref() accepts the reference, @sectors is added to the
+ * cached or dirty counter (according to ptr->cached), and the bucket's data
+ * type becomes @ptr_data_type if any sectors remain, 0 otherwise.
+ *
+ * Returns check_bucket_ref()'s nonzero result unchanged, leaving the
+ * counters untouched; 0 on success.
+ */
+static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+			  const struct bch_extent_ptr *ptr,
+			  s64 sectors, enum bch_data_type ptr_data_type,
+			  u8 bucket_gen, u8 *bucket_data_type,
+			  u16 *dirty_sectors, u16 *cached_sectors)
+{
+	u16 *counter;
+	int ret;
+
+	if (ptr->cached)
+		counter = cached_sectors;
+	else
+		counter = dirty_sectors;
+
+	ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type,
+			       bucket_gen, *bucket_data_type,
+			       *dirty_sectors, *cached_sectors);
+	if (ret)
+		return ret;
+
+	*counter += sectors;
+
+	if (*dirty_sectors || *cached_sectors)
+		*bucket_data_type = ptr_data_type;
+	else
+		*bucket_data_type = 0;
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Account @sectors for pointer @p in its in-memory bucket.
+ *
+ * The new bucket mark is computed from a snapshot of the current one and
+ * installed with a compare-and-exchange, retrying if the mark changed in
+ * the meantime. With BTREE_TRIGGER_NOATOMIC the mark is stored directly
+ * instead.
+ *
+ * Returns __mark_pointer()'s nonzero result (negative error, or 1 for a
+ * stale cached pointer) without modifying the bucket; 0 on success.
+ */
+static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
+			     struct extent_ptr_decoded p,
+			     s64 sectors, enum bch_data_type data_type,
+			     struct bch_fs_usage *fs_usage,
+			     u64 journal_seq, unsigned flags)
+{
+	bool gc = flags & BTREE_TRIGGER_GC;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+	struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc);
+	struct bucket_mark old, new;
+	u8 bucket_data_type;
+	u64 v;
+	int ret;
+
+	v = atomic64_read(&g->_mark.v);
+
+	for (;;) {
+		new.v.counter = old.v.counter = v;
+		bucket_data_type = new.data_type;
+
+		ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen,
+				     &bucket_data_type,
+				     &new.dirty_sectors,
+				     &new.cached_sectors);
+		if (ret)
+			return ret;
+
+		new.data_type = bucket_data_type;
+
+		if (journal_seq) {
+			new.journal_seq_valid	= 1;
+			new.journal_seq		= journal_seq;
+		}
+
+		if (flags & BTREE_TRIGGER_NOATOMIC) {
+			g->_mark = new;
+			break;
+		}
+
+		/* Retry from the observed value if the mark changed under us: */
+		v = atomic64_cmpxchg(&g->_mark.v,
+				     old.v.counter,
+				     new.v.counter);
+		if (v == old.v.counter)
+			break;
+	}
+
+	bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
+
+	BUG_ON(!gc && bucket_became_unavailable(old, new));
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Account @sectors against the in-memory stripe that erasure coded pointer
+ * @p refers to.
+ *
+ * Under ec_stripes_heap_lock: adds @sectors to the stripe block's sector
+ * count, recounts the nonempty blocks and, outside of gc, updates the
+ * stripe's heap position when that count changed. The stripe's replicas
+ * entry is then updated with @data_type and @sectors.
+ *
+ * Returns -EIO if the stripe does not exist or is not alive, 0 otherwise.
+ */
+static int bch2_mark_stripe_ptr(struct bch_fs *c,
+				struct bch_extent_stripe_ptr p,
+				enum bch_data_type data_type,
+				struct bch_fs_usage *fs_usage,
+				s64 sectors, unsigned flags)
+{
+	bool gc = flags & BTREE_TRIGGER_GC;
+	struct bch_replicas_padded r;
+	unsigned i, nonempty = 0;
+	struct stripe *m = genradix_ptr(&c->stripes[gc], p.idx);
+
+	spin_lock(&c->ec_stripes_heap_lock);
+
+	if (!m || !m->alive) {
+		spin_unlock(&c->ec_stripes_heap_lock);
+		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
+				    (u64) p.idx);
+		bch2_inconsistent_error(c);
+		return -EIO;
+	}
+
+	m->block_sectors[p.block] += sectors;
+
+	/* Copy the replicas entry while still holding the lock: */
+	r = m->r;
+
+	for (i = 0; i < m->nr_blocks; i++)
+		if (m->block_sectors[i])
+			nonempty++;
+
+	if (m->blocks_nonempty != nonempty) {
+		m->blocks_nonempty = nonempty;
+		if (!gc)
+			bch2_stripes_heap_update(c, m, p.idx);
+	}
+
+	spin_unlock(&c->ec_stripes_heap_lock);
+
+	r.e.data_type = data_type;
+	/* NOTE(review): result is ignored here, unlike in bch2_mark_extent() - confirm intended */
+	update_replicas(c, fs_usage, &r.e, sectors);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Trigger for keys with pointers (extents, reflink_v, btree pointers):
+ * account @sectors for every pointer in the key.
+ *
+ * The key marked is @new when BTREE_TRIGGER_INSERT is set, otherwise @old.
+ * For each pointer the bucket is updated via bch2_mark_pointer(), then:
+ *
+ *  - cached pointers update the cached sector accounting, unless stale
+ *  - dirty pointers without erasure coding are collected into one replicas
+ *    entry, updated once at the end
+ *  - erasure coded pointers are accounted against their stripe
+ *
+ * For btree data @sectors is used as is; otherwise it is converted to disk
+ * sectors with ptr_disk_sectors_delta().
+ *
+ * @journal_seq is u64 to match the caller and the other triggers; it was
+ * previously declared unsigned, silently truncating the sequence number.
+ *
+ * Returns 0 on success, a negative error from marking a pointer or stripe,
+ * or -1 if a replicas entry could not be updated.
+ */
+static int bch2_mark_extent(struct bch_fs *c,
+			    struct bkey_s_c old, struct bkey_s_c new,
+			    unsigned offset, s64 sectors,
+			    enum bch_data_type data_type,
+			    struct bch_fs_usage *fs_usage,
+			    u64 journal_seq, unsigned flags)
+{
+	struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_replicas_padded r;
+	s64 dirty_sectors = 0;
+	bool stale;
+	int ret;
+
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
+	BUG_ON(!sectors);
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		s64 disk_sectors = data_type == BCH_DATA_btree
+			? sectors
+			: ptr_disk_sectors_delta(p, offset, sectors, flags);
+
+		ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type,
+					fs_usage, journal_seq, flags);
+		if (ret < 0)
+			return ret;
+
+		/* A positive return means the pointer was stale: */
+		stale = ret > 0;
+
+		if (p.ptr.cached) {
+			if (!stale &&
+			    update_cached_sectors(c, fs_usage, p.ptr.dev,
+						  disk_sectors)) {
+				bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
+				return -1;
+			}
+		} else if (!p.has_ec) {
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		} else {
+			ret = bch2_mark_stripe_ptr(c, p.ec, data_type,
+					fs_usage, disk_sectors, flags);
+			if (ret)
+				return ret;
+
+			/*
+			 * There may be other dirty pointers in this extent, but
+			 * if so they're not required for mounting if we have an
+			 * erasure coded pointer in this extent:
+			 */
+			r.e.nr_required = 0;
+		}
+	}
+
+	if (r.e.nr_devs &&
+	    update_replicas(c, fs_usage, &r.e, dirty_sectors)) {
+		char buf[200];
+
+		bch2_bkey_val_to_text(&PBUF(buf), c, k);
+		bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+		return -1;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Trigger for stripe keys: keep the in-memory stripe at index
+ * new.k->p.offset in sync with the key.
+ *
+ * When @new is not a stripe, the in-memory stripe is removed from the
+ * stripes heap and zeroed. Otherwise its fields, block sector counts and
+ * replicas entry are refreshed from @new and, outside of gc, its heap
+ * position is updated.
+ *
+ * For gc, the block sector counts are then reset, every bucket of the
+ * stripe is marked, and the replicas entry is charged
+ * sectors * nr_redundant.
+ *
+ * Returns 0 on success, -1 if the in-memory stripe is missing or the
+ * replicas update fails, or the error from mark_stripe_bucket().
+ */
+static int bch2_mark_stripe(struct bch_fs *c,
+			    struct bkey_s_c old, struct bkey_s_c new,
+			    struct bch_fs_usage *fs_usage,
+			    u64 journal_seq, unsigned flags)
+{
+	bool gc = flags & BTREE_TRIGGER_GC;
+	size_t idx = new.k->p.offset;
+	const struct bch_stripe *old_s = NULL;
+	const struct bch_stripe *new_s = NULL;
+	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
+	unsigned i;
+	int ret;
+
+	if (old.k->type == KEY_TYPE_stripe)
+		old_s = bkey_s_c_to_stripe(old).v;
+	if (new.k->type == KEY_TYPE_stripe)
+		new_s = bkey_s_c_to_stripe(new).v;
+
+	BUG_ON(gc && old_s);
+
+	if (!m || (old_s && !m->alive)) {
+		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
+				    idx);
+		bch2_inconsistent_error(c);
+		return -1;
+	}
+
+	if (new_s) {
+		m->alive		= true;
+		m->sectors		= le16_to_cpu(new_s->sectors);
+		m->algorithm		= new_s->algorithm;
+		m->nr_blocks		= new_s->nr_blocks;
+		m->nr_redundant		= new_s->nr_redundant;
+		m->blocks_nonempty	= 0;
+
+		for (i = 0; i < new_s->nr_blocks; i++) {
+			m->block_sectors[i] =
+				stripe_blockcount_get(new_s, i);
+			m->blocks_nonempty += !!m->block_sectors[i];
+
+			m->ptrs[i] = new_s->ptrs[i];
+		}
+
+		bch2_bkey_to_replicas(&m->r.e, new);
+
+		if (!gc) {
+			spin_lock(&c->ec_stripes_heap_lock);
+			bch2_stripes_heap_update(c, m, idx);
+			spin_unlock(&c->ec_stripes_heap_lock);
+		}
+	} else {
+		/* Stripe is going away: */
+		spin_lock(&c->ec_stripes_heap_lock);
+		bch2_stripes_heap_del(c, m, idx);
+		spin_unlock(&c->ec_stripes_heap_lock);
+
+		memset(m, 0, sizeof(*m));
+	}
+
+	if (!gc)
+		return 0;
+
+	/*
+	 * gc recalculates this field from stripe ptr
+	 * references:
+	 */
+	memset(m->block_sectors, 0, sizeof(m->block_sectors));
+	m->blocks_nonempty = 0;
+
+	for (i = 0; i < new_s->nr_blocks; i++) {
+		ret = mark_stripe_bucket(c, new, i, fs_usage,
+					 journal_seq, flags);
+		if (ret)
+			return ret;
+	}
+
+	if (update_replicas(c, fs_usage, &m->r.e,
+			    ((s64) m->sectors * m->nr_redundant))) {
+		char buf[200];
+
+		bch2_bkey_val_to_text(&PBUF(buf), c, new);
+		bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+		return -1;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Run the in-memory trigger matching the key's type.
+ *
+ * The key dispatched on is @new when BTREE_TRIGGER_INSERT is set, otherwise
+ * @old; at least one of BTREE_TRIGGER_INSERT / BTREE_TRIGGER_OVERWRITE must
+ * be set. When @fs_usage is NULL, or when running for gc, the usage struct
+ * is looked up with fs_usage_ptr(). Runs with preemption disabled.
+ *
+ * Returns the trigger's result; 0 for key types without a trigger.
+ */
+static int bch2_mark_key_locked(struct bch_fs *c,
+		   struct bkey_s_c old,
+		   struct bkey_s_c new,
+		   unsigned offset, s64 sectors,
+		   struct bch_fs_usage *fs_usage,
+		   u64 journal_seq, unsigned flags)
+{
+	struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old;
+	int ret = 0;
+
+	BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+
+	preempt_disable();
+
+	if (!fs_usage || (flags & BTREE_TRIGGER_GC))
+		fs_usage = fs_usage_ptr(c, journal_seq,
+					flags & BTREE_TRIGGER_GC);
+
+	switch (k.k->type) {
+	case KEY_TYPE_alloc:
+	case KEY_TYPE_alloc_v2:
+		ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+		/* Btree nodes are always accounted at the full node size: */
+		if (flags & BTREE_TRIGGER_OVERWRITE)
+			sectors = -c->opts.btree_node_size;
+		else
+			sectors = c->opts.btree_node_size;
+
+		ret = bch2_mark_extent(c, old, new, offset, sectors,
+				BCH_DATA_btree, fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		ret = bch2_mark_extent(c, old, new, offset, sectors,
+				BCH_DATA_user, fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_stripe:
+		ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags);
+		break;
+	case KEY_TYPE_inode:
+		fs_usage->nr_inodes += new.k->type == KEY_TYPE_inode;
+		fs_usage->nr_inodes -= old.k->type == KEY_TYPE_inode;
+		break;
+	case KEY_TYPE_reservation: {
+		unsigned nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+
+		sectors *= nr_replicas;
+		/* Clamp only the array index, not the sector count: */
+		nr_replicas = clamp_t(unsigned, nr_replicas, 1,
+				      ARRAY_SIZE(fs_usage->persistent_reserved));
+
+		fs_usage->reserved				+= sectors;
+		fs_usage->persistent_reserved[nr_replicas - 1]	+= sectors;
+		break;
+	}
+	}
+
+	preempt_enable();
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Mark the insertion of @new, using a freshly initialized (deleted) key as
+ * the old key. Takes mark_lock for read around bch2_mark_key_locked() and
+ * returns its result.
+ */
+int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new,
+		  unsigned offset, s64 sectors,
+		  struct bch_fs_usage *fs_usage,
+		  u64 journal_seq, unsigned flags)
+{
+	struct bkey deleted;
+	struct bkey_s_c old;
+	int ret;
+
+	bkey_init(&deleted);
+	old = (struct bkey_s_c) { &deleted, NULL };
+
+	percpu_down_read(&c->mark_lock);
+	ret = bch2_mark_key_locked(c, old, new, offset, sectors,
+				   fs_usage, journal_seq,
+				   BTREE_TRIGGER_INSERT|flags);
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Run the in-memory triggers for inserting @new at @iter.
+ *
+ * Nothing is done with BTREE_TRIGGER_NORUN, or for btrees for which
+ * btree_node_type_needs_gc() is false.
+ *
+ * Non-extent btrees: the existing key is looked up (from the iterator, or
+ * from the key cache entry if valid). If it has the same type as @new, one
+ * call marks insert and overwrite together; otherwise the insert and the
+ * overwrite are marked separately. The results of these calls are not
+ * propagated.
+ *
+ * Extent btrees: @new is marked as an insert, then every existing extent
+ * overlapping @new is marked as (partially) overwritten, with offset and
+ * sector count derived from the kind of overlap.
+ */
+int bch2_mark_update(struct btree_trans *trans,
+		     struct btree_iter *iter,
+		     struct bkey_i *new,
+		     struct bch_fs_usage *fs_usage,
+		     unsigned flags)
+{
+	struct bch_fs		*c = trans->c;
+	struct bkey_s_c		old, new_k;
+	struct bkey		unpacked;
+	struct btree_iter	*copy;
+	u64			seq;
+	int			ret = 0;
+
+	if (unlikely(flags & BTREE_TRIGGER_NORUN))
+		return 0;
+
+	if (!btree_node_type_needs_gc(iter->btree_id))
+		return 0;
+
+	/* Until an existing key is found, "old" is a deleted key: */
+	bkey_init(&unpacked);
+	old = (struct bkey_s_c) { &unpacked, NULL };
+
+	new_k	= bkey_i_to_s_c(new);
+	seq	= trans->journal_res.seq;
+
+	if (!btree_node_type_is_extents(iter->btree_id)) {
+		/* iterators should be uptodate, shouldn't get errors here: */
+		if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
+			old = bch2_btree_iter_peek_slot(iter);
+			BUG_ON(bkey_err(old));
+		} else {
+			struct bkey_cached *ck = (void *) iter->l[0].b;
+
+			if (ck->valid)
+				old = bkey_i_to_s_c(ck->k);
+		}
+
+		if (old.k->type == new->k.type) {
+			bch2_mark_key_locked(c, old, new_k, 0, 0,
+				fs_usage, seq,
+				BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
+		} else {
+			bch2_mark_key_locked(c, old, new_k, 0, 0,
+				fs_usage, seq,
+				BTREE_TRIGGER_INSERT|flags);
+			bch2_mark_key_locked(c, old, new_k, 0, 0,
+				fs_usage, seq,
+				BTREE_TRIGGER_OVERWRITE|flags);
+		}
+
+		return ret;
+	}
+
+	BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
+	bch2_mark_key_locked(c, old, new_k,
+		0, new->k.size,
+		fs_usage, seq,
+		BTREE_TRIGGER_INSERT|flags);
+
+	copy = bch2_trans_copy_iter(trans, iter);
+
+	for_each_btree_key_continue(copy, 0, old, ret) {
+		/* Defaults describe a complete overwrite of the old extent: */
+		unsigned offset = 0;
+		s64 sectors = -((s64) old.k->size);
+
+		flags |= BTREE_TRIGGER_OVERWRITE;
+
+		/* Stop at the first extent starting at or after the end of @new: */
+		if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0)
+			break;
+
+		switch (bch2_extent_overlap(&new->k, old.k)) {
+		case BCH_EXTENT_OVERLAP_ALL:
+			/* defaults already apply */
+			break;
+		case BCH_EXTENT_OVERLAP_BACK:
+			offset	= bkey_start_offset(&new->k) -
+				  bkey_start_offset(old.k);
+			sectors	= bkey_start_offset(&new->k) -
+				  old.k->p.offset;
+			break;
+		case BCH_EXTENT_OVERLAP_FRONT:
+			offset	= 0;
+			sectors	= bkey_start_offset(old.k) -
+				  new->k.p.offset;
+			break;
+		case BCH_EXTENT_OVERLAP_MIDDLE:
+			offset	= bkey_start_offset(&new->k) -
+				  bkey_start_offset(old.k);
+			sectors	= -((s64) new->k.size);
+			flags  |= BTREE_TRIGGER_OVERWRITE_SPLIT;
+			break;
+		}
+
+		BUG_ON(sectors >= 0);
+
+		/* Map success (0) to 1 so that only errors end the loop: */
+		ret = bch2_mark_key_locked(c, old, new_k,
+				offset, sectors, fs_usage,
+				seq, flags) ?: 1;
+		if (ret <= 0)
+			break;
+	}
+	bch2_trans_iter_put(trans, copy);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Diagnostic dump for a transaction whose disk usage grew by more than its
+ * reservation: logs every pending update, followed by the existing keys it
+ * overlaps (or the key cache entry, for cached iterators).
+ */
+static noinline __cold
+void fs_usage_apply_warn(struct btree_trans *trans,
+			 unsigned disk_res_sectors)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_insert_entry *i;
+	char buf[200];
+
+	bch_err(c, "disk usage increased more than %u sectors reserved",
+		disk_res_sectors);
+
+	trans_for_each_update(trans, i) {
+		pr_err("while inserting");
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
+		pr_err("%s", buf);
+		pr_err("overlapping with");
+
+		if (btree_iter_type(i->iter) == BTREE_ITER_CACHED) {
+			struct bkey_cached *ck = (void *) i->iter->l[0].b;
+
+			if (ck->valid) {
+				bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k));
+				pr_err("%s", buf);
+			}
+		} else {
+			struct btree_iter *scan = bch2_trans_copy_iter(trans, i->iter);
+			struct bkey_s_c k;
+			int ret;
+
+			for_each_btree_key_continue(scan, 0, k, ret) {
+				/*
+				 * Extents: stop once past the end of the update;
+				 * otherwise: stop at the first other position.
+				 */
+				if (btree_node_type_is_extents(i->iter->btree_id)
+				    ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
+				    : bkey_cmp(i->k->k.p, k.k->p))
+					break;
+
+				bch2_bkey_val_to_text(&PBUF(buf), c, k);
+				pr_err("%s", buf);
+			}
+			bch2_trans_iter_put(trans, scan);
+		}
+	}
+}
|
|
+
|
|
+/*
+ * Apply the replicas deltas accumulated by a transaction to the filesystem
+ * usage counters.
+ *
+ * Every delta is applied with update_replicas(); btree, user and parity
+ * deltas, as well as persistent reservations, count towards the sectors
+ * "added" by the transaction. Anything added beyond the transaction's disk
+ * reservation is taken out of sectors_available and triggers a one-time
+ * warning; the remainder is charged to the disk reservation.
+ *
+ * mark_lock must be held.
+ */
+void bch2_trans_fs_usage_apply(struct btree_trans *trans,
+			       struct replicas_delta_list *deltas)
+{
+	struct bch_fs *c = trans->c;
+	static int warned_disk_usage = 0;
+	unsigned disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+	struct replicas_delta *end = (void *) deltas->d + deltas->used;
+	struct replicas_delta *d;
+	struct bch_fs_usage *dst;
+	s64 added = 0, excess;
+	bool warn = false;
+	unsigned i;
+
+	percpu_rwsem_assert_held(&c->mark_lock);
+
+	preempt_disable();
+	dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+	for (d = deltas->d; d != end; d = replicas_delta_next(d)) {
+		switch (d->r.data_type) {
+		case BCH_DATA_btree:
+		case BCH_DATA_user:
+		case BCH_DATA_parity:
+			added += d->delta;
+		}
+
+		BUG_ON(update_replicas(c, dst, &d->r, d->delta));
+	}
+
+	dst->nr_inodes += deltas->nr_inodes;
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+		s64 reserved = deltas->persistent_reserved[i];
+
+		added				+= reserved;
+		dst->reserved			+= reserved;
+		dst->persistent_reserved[i]	+= reserved;
+	}
+
+	/*
+	 * Not allowed to reduce sectors_available except by getting a
+	 * reservation:
+	 */
+	excess = added - (s64) disk_res_sectors;
+	if (unlikely(excess > 0)) {
+		atomic64_sub(excess, &c->sectors_available);
+		added -= excess;
+		warn = true;
+	}
+
+	if (added > 0) {
+		trans->disk_res->sectors -= added;
+		this_cpu_sub(*c->online_reserved, added);
+	}
+
+	preempt_enable();
+
+	/* Only the first offender gets the (expensive) dump: */
+	if (unlikely(warn) && !xchg(&warned_disk_usage, 1))
+		fs_usage_apply_warn(trans, disk_res_sectors);
+}
|
|
+
|
|
+/* trans_mark: */
|
|
+
|
|
+/*
+ * Look for a pending update in @trans that covers @pos in btree @btree_id.
+ *
+ * For extent btrees an update matches when @pos lies within
+ * [start of key, end of key); otherwise when the update's iterator is
+ * positioned exactly at @pos.
+ *
+ * On a match, *@k is set to the update's key, the iterator is flagged live
+ * in trans->iters_live and returned. Returns NULL if nothing matches.
+ */
+static struct btree_iter *trans_get_update(struct btree_trans *trans,
+			    enum btree_id btree_id, struct bpos pos,
+			    struct bkey_s_c *k)
+{
+	struct btree_insert_entry *i;
+
+	trans_for_each_update(trans, i) {
+		bool covers;
+
+		if (i->iter->btree_id != btree_id)
+			continue;
+
+		if (btree_node_type_is_extents(btree_id))
+			covers = bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
+				 bkey_cmp(pos, i->k->k.p) < 0;
+		else
+			covers = !bkey_cmp(pos, i->iter->pos);
+
+		if (!covers)
+			continue;
+
+		*k = bkey_i_to_s_c(i->k);
+
+		/* ugly hack.. */
+		BUG_ON(btree_iter_live(trans, i->iter));
+		trans->iters_live |= 1ULL << i->iter->idx;
+		return i->iter;
+	}
+
+	return NULL;
+}
|
|
+
|
|
+/*
+ * Get the key at @pos in btree @btree_id as seen by this transaction.
+ *
+ * A pending update covering @pos takes precedence; in that case 1 is
+ * returned. Otherwise a new intent iterator is obtained (cached for the
+ * alloc btree, slots for all others) and peeked; the result is 0 on success
+ * or the key's error, in which case the iterator has already been put.
+ */
+static int trans_get_key(struct btree_trans *trans,
+			 enum btree_id btree_id, struct bpos pos,
+			 struct btree_iter **iter,
+			 struct bkey_s_c *k)
+{
+	unsigned flags;
+	int ret;
+
+	if (btree_id == BTREE_ID_alloc)
+		flags = BTREE_ITER_CACHED;
+	else
+		flags = BTREE_ITER_SLOTS;
+
+	*iter = trans_get_update(trans, btree_id, pos, k);
+	if (*iter)
+		return 1;
+
+	*iter = bch2_trans_get_iter(trans, btree_id, pos,
+				    flags|BTREE_ITER_INTENT);
+	*k = __bch2_btree_iter_peek(*iter, flags);
+
+	ret = bkey_err(*k);
+	if (ret)
+		bch2_trans_iter_put(trans, *iter);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Begin an update of the alloc key for the bucket @ptr points into.
+ *
+ * Allocates a key buffer from the transaction and fills *@u with the
+ * bucket's current state: taken from a pending alloc update in this
+ * transaction if there is one, otherwise built from the in-memory bucket
+ * under mark_lock. *@_iter receives the iterator to pass to
+ * bch2_trans_update().
+ *
+ * Returns the key buffer, or an ERR_PTR (in which case *@_iter is not set).
+ */
+static struct bkey_alloc_buf *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+			      const struct bch_extent_ptr *ptr,
+			      struct bkey_alloc_unpacked *u)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr));
+	struct bkey_alloc_buf *buf;
+	struct btree_iter *iter;
+	struct bkey_s_c pending;
+	struct bucket *g;
+	int ret;
+
+	buf = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+	if (IS_ERR(buf))
+		return buf;
+
+	iter = trans_get_update(trans, BTREE_ID_alloc, pos, &pending);
+	if (iter) {
+		*u = bch2_alloc_unpack(pending);
+	} else {
+		iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, pos,
+					   BTREE_ITER_CACHED|
+					   BTREE_ITER_CACHED_NOFILL|
+					   BTREE_ITER_INTENT);
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret) {
+			bch2_trans_iter_put(trans, iter);
+			return ERR_PTR(ret);
+		}
+
+		percpu_down_read(&c->mark_lock);
+		g = bucket(ca, pos.offset);
+		*u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
+		percpu_up_read(&c->mark_lock);
+	}
+
+	*_iter = iter;
+	return buf;
+}
|
|
+
|
|
+/*
+ * Transactional counterpart of bch2_mark_pointer(): apply @sectors for
+ * pointer @p to the bucket's alloc key and queue the updated key on the
+ * transaction.
+ *
+ * Returns __mark_pointer()'s nonzero result (negative error, or 1 for a
+ * stale cached pointer) without queueing an update; 0 on success.
+ */
+static int bch2_trans_mark_pointer(struct btree_trans *trans,
+			struct bkey_s_c k, struct extent_ptr_decoded p,
+			s64 sectors, enum bch_data_type data_type)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_alloc_unpacked u;
+	struct bkey_alloc_buf *buf;
+	struct btree_iter *iter;
+	int ret;
+
+	buf = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
+			     &u.dirty_sectors, &u.cached_sectors);
+	if (!ret) {
+		bch2_alloc_pack(c, buf, u);
+		bch2_trans_update(trans, iter, &buf->k, 0);
+	}
+
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Transactional counterpart of bch2_mark_stripe_ptr(): add @sectors to the
+ * block count of the stripe key that erasure coded pointer @p refers to,
+ * queue the updated stripe key, and queue a matching replicas delta.
+ *
+ * Returns -EIO if the stripe key does not exist or does not match the
+ * pointer, another negative error on lookup or allocation failure, 0 on
+ * success.
+ */
+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
+			struct extent_ptr_decoded p,
+			s64 sectors, enum bch_data_type data_type)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_replicas_padded r;
+	struct bkey_i_stripe *updated;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	ret = trans_get_key(trans, BTREE_ID_stripes, POS(0, p.ec.idx), &iter, &k);
+	if (ret < 0)
+		return ret;
+
+	if (k.k->type != KEY_TYPE_stripe) {
+		bch2_fs_inconsistent(c,
+			"pointer to nonexistent stripe %llu",
+			(u64) p.ec.idx);
+		/* NOTE(review): possibly redundant after bch2_fs_inconsistent() - confirm */
+		bch2_inconsistent_error(c);
+		ret = -EIO;
+		goto out;
+	}
+
+	if (!bch2_ptr_matches_stripe(bkey_s_c_to_stripe(k).v, p)) {
+		bch2_fs_inconsistent(c,
+			"stripe pointer doesn't match stripe %llu",
+			(u64) p.ec.idx);
+		ret = -EIO;
+		goto out;
+	}
+
+	/* Work on a private copy of the stripe key: */
+	updated = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+	ret = PTR_ERR_OR_ZERO(updated);
+	if (ret)
+		goto out;
+
+	bkey_reassemble(&updated->k_i, k);
+	stripe_blockcount_set(&updated->v, p.ec.block,
+		stripe_blockcount_get(&updated->v, p.ec.block) +
+		sectors);
+	bch2_trans_update(trans, iter, &updated->k_i, 0);
+
+	bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&updated->k_i));
+	r.e.data_type = data_type;
+	update_replicas_list(trans, &r.e, sectors);
+out:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Transactional counterpart of bch2_mark_extent(): account @sectors for
+ * every pointer in @k by queueing alloc/stripe key updates and replicas
+ * deltas on the transaction.
+ *
+ *  - cached pointers queue a cached-sectors delta, unless stale
+ *  - dirty pointers without erasure coding are collected into one replicas
+ *    entry, queued once at the end
+ *  - erasure coded pointers are accounted against their stripe key
+ *
+ * Returns 0 on success, or the first error from marking a pointer or
+ * stripe.
+ */
+static int bch2_trans_mark_extent(struct btree_trans *trans,
+			struct bkey_s_c k, unsigned offset,
+			s64 sectors, unsigned flags,
+			enum bch_data_type data_type)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct bch_replicas_padded r;
+	s64 dirty_sectors = 0;
+	int ret;
+
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
+	BUG_ON(!sectors);
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		s64 disk_sectors;
+		bool stale;
+
+		if (data_type == BCH_DATA_btree)
+			disk_sectors = sectors;
+		else
+			disk_sectors = ptr_disk_sectors_delta(p, offset, sectors, flags);
+
+		ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors,
+					      data_type);
+		if (ret < 0)
+			return ret;
+
+		/* A positive return means the pointer was stale: */
+		stale = ret > 0;
+
+		if (p.ptr.cached) {
+			if (!stale)
+				update_cached_sectors_list(trans, p.ptr.dev,
+							   disk_sectors);
+		} else if (p.has_ec) {
+			ret = bch2_trans_mark_stripe_ptr(trans, p,
+					disk_sectors, data_type);
+			if (ret)
+				return ret;
+
+			r.e.nr_required = 0;
+		} else {
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		}
+	}
+
+	if (r.e.nr_devs)
+		update_replicas_list(trans, &r.e, dirty_sectors);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Add or remove (@deleting) stripe @s's reference on the bucket that block
+ * @idx of the stripe points into, by queueing an updated alloc key.
+ *
+ * Parity blocks also adjust the bucket's dirty sector count by the stripe's
+ * sector count and set or clear its data type accordingly. When adding a
+ * reference, the bucket must not already belong to a different stripe.
+ *
+ * Returns 0 on success, -EIO if the bucket is owned by another stripe, or
+ * the error from starting the alloc update.
+ */
+static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
+					    struct bkey_s_c_stripe s,
+					    unsigned idx, bool deleting)
+{
+	struct bch_fs *c = trans->c;
+	const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+	/* Blocks after the data blocks hold parity: */
+	bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+	struct bkey_alloc_unpacked u;
+	struct bkey_alloc_buf *buf;
+	struct btree_iter *iter;
+	int ret = 0;
+
+	buf = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+	if (IS_ERR(buf))
+		return PTR_ERR(buf);
+
+	if (parity) {
+		s64 sectors = le16_to_cpu(s.v->sectors);
+
+		if (deleting)
+			sectors = -sectors;
+
+		u.dirty_sectors += sectors;
+
+		if (u.dirty_sectors)
+			u.data_type = BCH_DATA_parity;
+		else
+			u.data_type = 0;
+	}
+
+	if (deleting) {
+		u.stripe		= 0;
+		u.stripe_redundancy	= 0;
+	} else {
+		if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
+				"bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
+				iter->pos.inode, iter->pos.offset, u.gen,
+				u.stripe, s.k->p.offset)) {
+			ret = -EIO;
+			goto err;
+		}
+
+		u.stripe		= s.k->p.offset;
+		u.stripe_redundancy	= s.v->nr_redundant;
+	}
+
+	bch2_alloc_pack(c, buf, u);
+	bch2_trans_update(trans, iter, &buf->k, 0);
+err:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Transactional trigger for stripe keys.
+ *
+ * If both @old and @new are stripes with the same block count, redundancy
+ * and pointer array, nothing is done. Otherwise the new stripe (if any) is
+ * accounted and each of its buckets referenced, then the old stripe (if any)
+ * is un-accounted and each of its buckets dereferenced.
+ *
+ * Returns 0 or the first error from bch2_trans_mark_stripe_alloc_ref().
+ */
+static int bch2_trans_mark_stripe(struct btree_trans *trans,
+				  struct bkey_s_c old, struct bkey_s_c new,
+				  unsigned flags)
+{
+	struct bkey_s_c_stripe old_s = { NULL };
+	struct bkey_s_c_stripe new_s = { NULL };
+	struct bch_replicas_padded r;
+	unsigned blk;
+	s64 sectors;
+	int ret;
+
+	if (old.k->type == KEY_TYPE_stripe)
+		old_s = bkey_s_c_to_stripe(old);
+	if (new.k->type == KEY_TYPE_stripe)
+		new_s = bkey_s_c_to_stripe(new);
+
+	/* Unchanged pointers: no accounting to adjust */
+	if (new_s.k && old_s.k &&
+	    new_s.v->nr_blocks		== old_s.v->nr_blocks &&
+	    new_s.v->nr_redundant	== old_s.v->nr_redundant &&
+	    !memcmp(old_s.v->ptrs, new_s.v->ptrs,
+		    new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
+		return 0;
+
+	if (new_s.k) {
+		sectors = le16_to_cpu(new_s.v->sectors);
+
+		bch2_bkey_to_replicas(&r.e, new);
+		update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
+
+		for (blk = 0; blk < new_s.v->nr_blocks; blk++) {
+			ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
+							       blk, false);
+			if (ret)
+				return ret;
+		}
+	}
+
+	if (old_s.k) {
+		sectors = -((s64) le16_to_cpu(old_s.v->sectors));
+
+		bch2_bkey_to_replicas(&r.e, old);
+		update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
+
+		for (blk = 0; blk < old_s.v->nr_blocks; blk++) {
+			ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
+							       blk, true);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Return a pointer to the refcount field of @k if it is a reflink_v or
+ * indirect_inline_data key, NULL for every other key type.
+ */
+static __le64 *bkey_refcount(struct bkey_i *k)
+{
+	if (k->k.type == KEY_TYPE_reflink_v)
+		return &bkey_i_to_reflink_v(k)->v.refcount;
+
+	if (k->k.type == KEY_TYPE_indirect_inline_data)
+		return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+
+	return NULL;
+}
|
|
+
|
|
+/*
+ * Adjust the refcount of the indirect extent found at reflink btree
+ * position @idx on behalf of reflink pointer @p.
+ *
+ * The refcount is incremented, or decremented when @flags contains
+ * BTREE_TRIGGER_OVERWRITE; a key whose refcount drops to zero is turned
+ * into a deleted key. On overwrite, a key that is not fully contained in
+ * [@idx, @idx + @sectors) is left untouched.
+ *
+ * Returns a negative error code, otherwise a sector count the caller uses
+ * to advance (the untouched @sectors when the key was skipped, else the
+ * distance from @idx to the end of the key found).
+ */
+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+			struct bkey_s_c_reflink_p p,
+			u64 idx, unsigned sectors,
+			unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	bool overwrite = flags & BTREE_TRIGGER_OVERWRITE;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_i *n;
+	__le64 *refcount;
+	s64 ret;
+
+	ret = trans_get_key(trans, BTREE_ID_reflink,
+			    POS(0, idx), &iter, &k);
+	if (ret < 0)
+		return ret;
+
+	if (overwrite &&
+	    (bkey_start_offset(k.k) < idx ||
+	     k.k->p.offset > idx + sectors))
+		goto out;
+
+	sectors = k.k->p.offset - idx;
+
+	n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+	if (IS_ERR(n)) {
+		ret = PTR_ERR(n);
+		goto err;
+	}
+
+	bkey_reassemble(n, k);
+
+	refcount = bkey_refcount(n);
+	if (!refcount) {
+		bch2_fs_inconsistent(c,
+			"%llu:%llu len %u points to nonexistent indirect extent %llu",
+			p.k->p.inode, p.k->p.offset, p.k->size, idx);
+		bch2_inconsistent_error(c);
+		ret = -EIO;
+		goto err;
+	}
+
+	le64_add_cpu(refcount, overwrite ? -1 : 1);
+
+	/* last reference gone: replace the indirect extent with a deletion */
+	if (!*refcount) {
+		n->k.type = KEY_TYPE_deleted;
+		set_bkey_val_u64s(&n->k, 0);
+	}
+
+	bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k));
+	bch2_trans_update(trans, iter, n, 0);
+out:
+	ret = sectors;
+err:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Walk the indirect extents referenced by reflink pointer @p, starting
+ * @offset sectors into it and covering |@sectors| sectors, adjusting each
+ * refcount through __bch2_trans_mark_reflink_p().
+ *
+ * Returns 0 on success or the first negative error from the helper.
+ */
+static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+			struct bkey_s_c_reflink_p p, unsigned offset,
+			s64 sectors, unsigned flags)
+{
+	u64 idx = le64_to_cpu(p.v->idx) + offset;
+	s64 done;
+
+	sectors = abs(sectors);
+	BUG_ON(offset + sectors > p.k->size);
+
+	while (sectors) {
+		done = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags);
+		if (done < 0)
+			return done;
+
+		idx	+= done;
+		sectors	 = max_t(s64, 0LL, sectors - done);
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Dispatch the transactional trigger for a key according to its type.
+ *
+ * The key examined is @new when BTREE_TRIGGER_INSERT is set in @flags and
+ * @old otherwise; at least one of BTREE_TRIGGER_INSERT and
+ * BTREE_TRIGGER_OVERWRITE must be set. Key types without a trigger return 0.
+ */
+int bch2_trans_mark_key(struct btree_trans *trans,
+			struct bkey_s_c old,
+			struct bkey_s_c new,
+			unsigned offset, s64 sectors, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct replicas_delta_list *d;
+	struct bkey_s_c k = old;
+
+	BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)));
+
+	if (flags & BTREE_TRIGGER_INSERT)
+		k = new;
+
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+		/* btree nodes always count as a whole node, signed by direction */
+		sectors = !(flags & BTREE_TRIGGER_OVERWRITE)
+			?  c->opts.btree_node_size
+			: -c->opts.btree_node_size;
+
+		return bch2_trans_mark_extent(trans, k, offset, sectors,
+					      flags, BCH_DATA_btree);
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		return bch2_trans_mark_extent(trans, k, offset, sectors,
+					      flags, BCH_DATA_user);
+	case KEY_TYPE_stripe:
+		return bch2_trans_mark_stripe(trans, old, new, flags);
+	case KEY_TYPE_inode: {
+		int nr = (new.k->type == KEY_TYPE_inode) -
+			 (old.k->type == KEY_TYPE_inode);
+
+		if (!nr)
+			return 0;
+
+		d = replicas_deltas_realloc(trans, 0);
+		d->nr_inodes += nr;
+		return 0;
+	}
+	case KEY_TYPE_reservation: {
+		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
+
+		d = replicas_deltas_realloc(trans, 0);
+
+		sectors *= replicas;
+		/* keep the array index within persistent_reserved[] */
+		replicas = clamp_t(unsigned, replicas, 1,
+				   ARRAY_SIZE(d->persistent_reserved));
+
+		d->persistent_reserved[replicas - 1] += sectors;
+		return 0;
+	}
+	case KEY_TYPE_reflink_p:
+		return bch2_trans_mark_reflink_p(trans,
+					bkey_s_c_to_reflink_p(k),
+					offset, sectors, flags);
+	default:
+		return 0;
+	}
+}
|
|
+
|
|
+int bch2_trans_mark_update(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_i *new,
|
|
+ unsigned flags)
|
|
+{ /*
+ * Run the transactional triggers for inserting @new at @iter.
+ *
+ * Nothing is done when BTREE_TRIGGER_NORUN is set or when the btree does
+ * not need gc. For non-extent btrees the key currently in the slot (or in
+ * the key cache) is the old key; for extent btrees @new is marked first and
+ * then every existing key it overlaps is marked as overwritten.
+ *
+ * Returns 0 or the first error from a helper.
+ */
|
|
+ struct bkey_s_c old;
|
|
+ int ret;
|
|
+
|
|
+ if (unlikely(flags & BTREE_TRIGGER_NORUN))
|
|
+ return 0;
|
|
+
|
|
+ if (!btree_node_type_needs_gc(iter->btree_id))
|
|
+ return 0;
|
|
+
|
|
+ if (!btree_node_type_is_extents(iter->btree_id)) {
|
|
+ if (btree_iter_type(iter) != BTREE_ITER_CACHED) {
|
|
+ old = bch2_btree_iter_peek_slot(iter);
|
|
+ ret = bkey_err(old);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ } else {
|
|
+ struct bkey_cached *ck = (void *) iter->l[0].b;
|
|
+
|
|
+ BUG_ON(!ck->valid);
|
|
+ old = bkey_i_to_s_c(ck->k);
|
|
+ }
|
|
+
|
|
+ if (old.k->type == new->k.type) { /* same type: one call sees both old and new */
|
|
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
|
|
+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
|
|
+ } else { /* type changed: insert trigger first, overwrite trigger only if that returned 0 */
|
|
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
|
|
+ BTREE_TRIGGER_INSERT|flags) ?:
|
|
+ bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new), 0, 0,
|
|
+ BTREE_TRIGGER_OVERWRITE|flags);
|
|
+ }
|
|
+ } else {
|
|
+ struct btree_iter *copy;
|
|
+ struct bkey _old;
|
|
+
|
|
+ EBUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);
|
|
+
|
|
+ bkey_init(&_old); /* placeholder "old" key with no value for the insert call */
|
|
+ old = (struct bkey_s_c) { &_old, NULL };
|
|
+
|
|
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
|
|
+ 0, new->k.size,
|
|
+ BTREE_TRIGGER_INSERT); /* NOTE(review): caller's flags are not passed on this call - confirm intended */
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ copy = bch2_trans_copy_iter(trans, iter);
|
|
+
|
|
+ for_each_btree_key_continue(copy, 0, old, ret) {
|
|
+ unsigned offset = 0;
|
|
+ s64 sectors = -((s64) old.k->size);
|
|
+
|
|
+ flags |= BTREE_TRIGGER_OVERWRITE; /* sticky: flags is modified for all later iterations too */
|
|
+
|
|
+ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) /* old starts at or past the end of new: done */
|
|
+ break;
|
|
+
|
|
+ switch (bch2_extent_overlap(&new->k, old.k)) { /* offset into old, and negative sector delta */
|
|
+ case BCH_EXTENT_OVERLAP_ALL:
|
|
+ offset = 0;
|
|
+ sectors = -((s64) old.k->size);
|
|
+ break;
|
|
+ case BCH_EXTENT_OVERLAP_BACK:
|
|
+ offset = bkey_start_offset(&new->k) -
|
|
+ bkey_start_offset(old.k);
|
|
+ sectors = bkey_start_offset(&new->k) -
|
|
+ old.k->p.offset;
|
|
+ break;
|
|
+ case BCH_EXTENT_OVERLAP_FRONT:
|
|
+ offset = 0;
|
|
+ sectors = bkey_start_offset(old.k) -
|
|
+ new->k.p.offset;
|
|
+ break;
|
|
+ case BCH_EXTENT_OVERLAP_MIDDLE:
|
|
+ offset = bkey_start_offset(&new->k) -
|
|
+ bkey_start_offset(old.k);
|
|
+ sectors = -((s64) new->k.size);
|
|
+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; /* also sticky once set */
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ BUG_ON(sectors >= 0);
|
|
+
|
|
+ ret = bch2_trans_mark_key(trans, old, bkey_i_to_s_c(new),
|
|
+ offset, sectors, flags);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(trans, copy);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
+ * Set the alloc key of bucket @b on @ca to hold @sectors dirty sectors of
+ * metadata type @type.
+ *
+ * If the bucket already carries a different non-zero data type an fsck
+ * error is reported and -EIO returned without updating the key.
+ */
+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
+				    struct bch_dev *ca, size_t b,
+				    enum bch_data_type type,
+				    unsigned sectors)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_extent_ptr ptr = {
+		.dev = ca->dev_idx,
+		.offset = bucket_to_sector(ca, b),
+	};
+	struct bkey_alloc_unpacked u;
+	struct bkey_alloc_buf *a;
+	struct btree_iter *iter;
+	int ret = 0;
+
+	a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+	if (IS_ERR(a))
+		return PTR_ERR(a);
+
+	if (!u.data_type || u.data_type == type) {
+		u.data_type	= type;
+		u.dirty_sectors	= sectors;
+
+		bch2_alloc_pack(c, a, u);
+		bch2_trans_update(trans, iter, &a->k, 0);
+	} else {
+		bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
+			"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
+			"while marking %s",
+			iter->pos.inode, iter->pos.offset, u.gen,
+			bch2_data_types[u.data_type],
+			bch2_data_types[type],
+			bch2_data_types[type]);
+		ret = -EIO;
+	}
+
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
|
|
+ struct bch_dev *ca, size_t b,
|
|
+ enum bch_data_type type,
|
|
+ unsigned sectors)
|
|
+{ /* Runs __bch2_trans_mark_metadata_bucket() via __bch2_trans_do() and returns that macro's result. */
|
|
+ return __bch2_trans_do(trans, NULL, NULL, 0,
|
|
+ __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors));
|
|
+}
|
|
+
|
|
+/*
+ * Account the sector range [@start, @end) on @ca as metadata of @type,
+ * one bucket at a time.
+ *
+ * @bucket and @bucket_sectors carry the bucket currently being accumulated
+ * across calls: when the range moves on to a different bucket, the sectors
+ * gathered so far are flushed with bch2_trans_mark_metadata_bucket(). The
+ * final bucket is left in @bucket / @bucket_sectors for the caller to flush.
+ *
+ * Returns 0 or the first error from bch2_trans_mark_metadata_bucket().
+ */
+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans,
+				     struct bch_dev *ca,
+				     u64 start, u64 end,
+				     enum bch_data_type type,
+				     u64 *bucket, unsigned *bucket_sectors)
+{
+	int ret;
+
+	do {
+		u64 cur		= sector_to_bucket(ca, start);
+		u64 stop	= min_t(u64, bucket_to_sector(ca, cur + 1), end);
+		unsigned nr	= stop - start;
+
+		if (*bucket_sectors && cur != *bucket) {
+			ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket,
+							type, *bucket_sectors);
+			if (ret)
+				return ret;
+
+			*bucket_sectors = 0;
+		}
+
+		*bucket		 = cur;
+		*bucket_sectors	+= nr;
+		start		+= nr;
+	} while (start < end);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Mark the buckets of @ca that hold superblocks and the journal.
+ *
+ * Every superblock copy in the layout is accounted as BCH_DATA_sb; for the
+ * copy located at BCH_SB_SECTOR the sectors in front of it (0 up to
+ * BCH_SB_SECTOR) are accounted as well. Each journal bucket is marked as
+ * BCH_DATA_journal with a full bucket's worth of sectors.
+ *
+ * Returns 0 or the first error from a marking helper.
+ */
+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans,
+				    struct bch_dev *ca)
+{
+	struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+	unsigned i, bucket_sectors = 0;
+	u64 bucket = 0;
+	int ret;
+
+	for (i = 0; i < layout->nr_superblocks; i++) {
+		u64 sb_start	= le64_to_cpu(layout->sb_offset[i]);
+		u64 sb_end	= sb_start + (1 << layout->sb_max_size_bits);
+
+		if (sb_start == BCH_SB_SECTOR) {
+			ret = bch2_trans_mark_metadata_sectors(trans, ca,
+						0, BCH_SB_SECTOR,
+						BCH_DATA_sb, &bucket, &bucket_sectors);
+			if (ret)
+				return ret;
+		}
+
+		ret = bch2_trans_mark_metadata_sectors(trans, ca,
+					sb_start, sb_end,
+					BCH_DATA_sb, &bucket, &bucket_sectors);
+		if (ret)
+			return ret;
+	}
+
+	/* flush the bucket still being accumulated */
+	if (bucket_sectors) {
+		ret = bch2_trans_mark_metadata_bucket(trans, ca,
+				bucket, BCH_DATA_sb, bucket_sectors);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < ca->journal.nr; i++) {
+		ret = bch2_trans_mark_metadata_bucket(trans, ca,
+				ca->journal.buckets[i],
+				BCH_DATA_journal, ca->mi.bucket_size);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
|
|
+
|
|
+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
|
|
+{ /* Runs __bch2_trans_mark_dev_sb() via bch2_trans_do(); "trans" is not declared here, so it is presumably supplied by that macro. */
|
|
+ return bch2_trans_do(c, NULL, NULL, 0,
|
|
+ __bch2_trans_mark_dev_sb(&trans, ca));
|
|
+}
|
|
+
|
|
+/* Disk reservations: */
|
|
+
|
|
+#define SECTORS_CACHE 1024
|
|
+
|
|
+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
|
|
+ u64 sectors, int flags)
|
|
+{ /*
+ * Add @sectors to reservation @res.
+ *
+ * Fast path: take the sectors from this CPU's pcpu->sectors_available,
+ * refilling it from the global c->sectors_available with a cmpxchg loop
+ * (requesting SECTORS_CACHE extra, capped at what is available).
+ * Slow path ("recalculate"): when the global counter cannot cover the
+ * request, recompute the available space under sectors_available_lock.
+ *
+ * Returns 0 on success and -ENOSPC if there is not enough space and
+ * BCH_DISK_RESERVATION_NOFAIL is not set in @flags.
+ */
|
|
+ struct bch_fs_pcpu *pcpu;
|
|
+ u64 old, v, get;
|
|
+ s64 sectors_available;
|
|
+ int ret;
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ preempt_disable(); /* stays disabled while pcpu is used; re-enabled on both exits below */
|
|
+ pcpu = this_cpu_ptr(c->pcpu);
|
|
+
|
|
+ if (sectors <= pcpu->sectors_available)
|
|
+ goto out;
|
|
+
|
|
+ v = atomic64_read(&c->sectors_available);
|
|
+ do {
|
|
+ old = v;
|
|
+ get = min((u64) sectors + SECTORS_CACHE, old);
|
|
+
|
|
+ if (get < sectors) { /* global counter too small for this request */
|
|
+ preempt_enable();
|
|
+ goto recalculate;
|
|
+ }
|
|
+ } while ((v = atomic64_cmpxchg(&c->sectors_available,
|
|
+ old, old - get)) != old);
|
|
+
|
|
+ pcpu->sectors_available += get;
|
|
+
|
|
+out:
|
|
+ pcpu->sectors_available -= sectors;
|
|
+ this_cpu_add(*c->online_reserved, sectors);
|
|
+ res->sectors += sectors;
|
|
+
|
|
+ preempt_enable();
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+ return 0;
|
|
+
|
|
+recalculate: /* entered with mark_lock still read-held and preemption already re-enabled */
|
|
+ mutex_lock(&c->sectors_available_lock);
|
|
+
|
|
+ percpu_u64_set(&c->pcpu->sectors_available, 0); /* presumably clears the cached sectors of every CPU - confirm against percpu_u64_set() */
|
|
+ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
|
|
+
|
|
+ if (sectors <= sectors_available ||
|
|
+ (flags & BCH_DISK_RESERVATION_NOFAIL)) {
|
|
+ atomic64_set(&c->sectors_available,
|
|
+ max_t(s64, 0, sectors_available - sectors)); /* clamped at 0: NOFAIL may take more than is available */
|
|
+ this_cpu_add(*c->online_reserved, sectors);
|
|
+ res->sectors += sectors;
|
|
+ ret = 0;
|
|
+ } else {
|
|
+ atomic64_set(&c->sectors_available, sectors_available);
|
|
+ ret = -ENOSPC;
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->sectors_available_lock);
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Startup/shutdown: */
|
|
+
|
|
+/*
+ * RCU callback: free the bucket_array that embeds @rcu. The size handed to
+ * kvpfree() is derived from the array's own nbuckets field.
+ */
+static void buckets_free_rcu(struct rcu_head *rcu)
+{
+	struct bucket_array *arr = container_of(rcu, struct bucket_array, rcu);
+	size_t bytes = sizeof(struct bucket_array) +
+		arr->nbuckets * sizeof(struct bucket);
+
+	kvpfree(arr, bytes);
+}
|
|
+
|
|
+/*
+ * Allocate a bucket array, nouse bitmap, free fifos and alloc heap sized for
+ * @nbuckets and install them on @ca.
+ *
+ * When @ca already has a bucket array ("resize"), the overlapping part of the
+ * old array and bitmap is copied over while gc_lock, bucket_lock and
+ * mark_lock are held for writing, and the old array is freed after an RCU
+ * grace period. Entries of the old fifos are moved into the new ones under
+ * freelist_lock.
+ *
+ * Returns 0 on success or -ENOMEM if any allocation fails; on failure @ca
+ * is left unchanged and everything allocated here is released.
+ */
+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
+{
+	struct bucket_array *buckets = NULL, *old_buckets = NULL;
+	unsigned long *buckets_nouse = NULL;
+	alloc_fifo	free[RESERVE_NR];
+	alloc_fifo	free_inc;
+	alloc_heap	alloc_heap;
+
+	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
+			     ca->mi.bucket_size / c->opts.btree_node_size);
+	/* XXX: these should be tunable */
+	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
+	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 6);
+	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
+				      btree_reserve * 2);
+	bool resize = ca->buckets[0] != NULL;
+	int ret = -ENOMEM;
+	unsigned i;
+
+	memset(&free,		0, sizeof(free));
+	memset(&free_inc,	0, sizeof(free_inc));
+	memset(&alloc_heap,	0, sizeof(alloc_heap));
+
+	buckets = kvpmalloc(sizeof(struct bucket_array) +
+			    nbuckets * sizeof(struct bucket),
+			    GFP_KERNEL|__GFP_ZERO);
+	if (!buckets)
+		goto err;
+
+	/*
+	 * Fill in the header right away: buckets_free_rcu() computes the size
+	 * to free from ->nbuckets, and the error path below may free this
+	 * array before it was ever published.
+	 */
+	buckets->first_bucket	= ca->mi.first_bucket;
+	buckets->nbuckets	= nbuckets;
+
+	if (!(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
+					    sizeof(unsigned long),
+					    GFP_KERNEL|__GFP_ZERO)) ||
+	    !init_fifo(&free[RESERVE_MOVINGGC],
+		       copygc_reserve, GFP_KERNEL) ||
+	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
+	    !init_fifo(&free_inc,	free_inc_nr, GFP_KERNEL) ||
+	    !init_heap(&alloc_heap,	ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+		goto err;
+
+	bch2_copygc_stop(c);
+
+	if (resize) {
+		down_write(&c->gc_lock);
+		down_write(&ca->bucket_lock);
+		percpu_down_write(&c->mark_lock);
+	}
+
+	old_buckets = bucket_array(ca);
+
+	if (resize) {
+		size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+
+		memcpy(buckets->b,
+		       old_buckets->b,
+		       n * sizeof(struct bucket));
+		memcpy(buckets_nouse,
+		       ca->buckets_nouse,
+		       BITS_TO_LONGS(n) * sizeof(unsigned long));
+	}
+
+	rcu_assign_pointer(ca->buckets[0], buckets);
+	/* from here on "buckets" is the array to dispose of (NULL if none) */
+	buckets = old_buckets;
+
+	swap(ca->buckets_nouse, buckets_nouse);
+
+	if (resize) {
+		percpu_up_write(&c->mark_lock);
+		up_write(&c->gc_lock);
+	}
+
+	spin_lock(&c->freelist_lock);
+	for (i = 0; i < RESERVE_NR; i++) {
+		fifo_move(&free[i], &ca->free[i]);
+		swap(ca->free[i], free[i]);
+	}
+	fifo_move(&free_inc, &ca->free_inc);
+	swap(ca->free_inc, free_inc);
+	spin_unlock(&c->freelist_lock);
+
+	/* with gc lock held, alloc_heap can't be in use: */
+	swap(ca->alloc_heap, alloc_heap);
+
+	/* buckets_nouse now holds the old bitmap: size its free accordingly */
+	nbuckets = ca->mi.nbuckets;
+
+	if (resize)
+		up_write(&ca->bucket_lock);
+
+	ret = 0;
+err:
+	free_heap(&alloc_heap);
+	free_fifo(&free_inc);
+	for (i = 0; i < RESERVE_NR; i++)
+		free_fifo(&free[i]);
+	kvpfree(buckets_nouse,
+		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+	/*
+	 * "buckets" is the replaced array on success and the never-published
+	 * new array on failure. Free the array it actually points to: on the
+	 * failure path old_buckets is still NULL.
+	 */
+	if (buckets)
+		call_rcu(&buckets->rcu, buckets_free_rcu);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Release everything bch2_dev_buckets_alloc()/bch2_dev_buckets_resize() set
+ * up on @ca: alloc heap, free fifos, the nouse bitmap, the bucket array and
+ * the usage counters. Sizes are derived from ca->mi.nbuckets.
+ */
+void bch2_dev_buckets_free(struct bch_dev *ca)
+{
+	unsigned idx;
+
+	free_heap(&ca->alloc_heap);
+	free_fifo(&ca->free_inc);
+
+	for (idx = 0; idx < RESERVE_NR; idx++)
+		free_fifo(&ca->free[idx]);
+
+	kvpfree(ca->buckets_nouse,
+		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+	kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
+		sizeof(struct bucket_array) +
+		ca->mi.nbuckets * sizeof(struct bucket));
+
+	for (idx = 0; idx < ARRAY_SIZE(ca->usage); idx++)
+		free_percpu(ca->usage[idx]);
+
+	kfree(ca->usage_base);
+}
|
|
+
|
|
+/*
+ * Allocate the usage counters of @ca (usage_base plus one per-cpu counter
+ * set per ca->usage[] slot) and then the bucket structures themselves via
+ * bch2_dev_buckets_resize().
+ *
+ * Returns 0 on success or -ENOMEM. Nothing is freed here on failure;
+ * presumably the caller cleans up with bch2_dev_buckets_free() - confirm.
+ */
+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+	unsigned i;
+
+	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+	if (!ca->usage_base)
+		return -ENOMEM;
+
+	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+		if (!ca->usage[i])
+			return -ENOMEM;
+	}
+
+	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
+}
|
|
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
|
|
new file mode 100644
|
|
index 000000000000..7463e6420b14
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/buckets.h
|
|
@@ -0,0 +1,301 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+/*
|
|
+ * Code for manipulating bucket marks for garbage collection.
|
|
+ *
|
|
+ * Copyright 2014 Datera, Inc.
|
|
+ */
|
|
+
|
|
+#ifndef _BUCKETS_H
|
|
+#define _BUCKETS_H
|
|
+
|
|
+#include "buckets_types.h"
|
|
+#include "super.h"
|
|
+
|
|
+#define for_each_bucket(_b, _buckets) \
|
|
+ for (_b = (_buckets)->b + (_buckets)->first_bucket; \
|
|
+ _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
|
|
+
|
|
+/*
+ * Atomically update the mark of bucket @g.
+ *
+ * @new is loaded with the current mark, @expr is run to modify it, and the
+ * result is stored with atomic64_cmpxchg(); the loop retries until the
+ * exchange succeeds. Evaluates to the mark that was replaced.
+ *
+ * @g is evaluated exactly once; @new and @expr are re-evaluated on every
+ * retry.
+ */
+#define bucket_cmpxchg(g, new, expr)				\
+({								\
+	struct bucket *_g = (g);				\
+	u64 _v = atomic64_read(&_g->_mark.v);			\
+	struct bucket_mark _old;				\
+								\
+	do {							\
+		(new).v.counter = _old.v.counter = _v;		\
+		expr;						\
+	} while ((_v = atomic64_cmpxchg(&(_g)->_mark.v,		\
+			       _old.v.counter,			\
+			       (new).v.counter)) != _old.v.counter);\
+	_old;							\
+})
|
|
+
|
|
+/*
+ * Fetch ca->buckets[@gc] as an RCU-protected pointer. Besides an RCU read
+ * section, the access is accepted when @ca has no filesystem attached or
+ * when mark_lock, gc_lock or the device's bucket_lock is held.
+ */
+static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
+						  bool gc)
+{
+	struct bucket_array *arr;
+
+	arr = rcu_dereference_check(ca->buckets[gc],
+				    !ca->fs ||
+				    percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+				    lockdep_is_held(&ca->fs->gc_lock) ||
+				    lockdep_is_held(&ca->bucket_lock));
+	return arr;
+}
|
|
+
|
|
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
|
|
+{ /* Shorthand for __bucket_array() with gc == false, i.e. ca->buckets[0]. */
|
|
+ return __bucket_array(ca, false);
|
|
+}
|
|
+
|
|
+/*
+ * Look up bucket number @b of @ca in the array selected by @gc.
+ * BUGs if @b lies outside [first_bucket, nbuckets).
+ */
+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
+{
+	struct bucket_array *arr = __bucket_array(ca, gc);
+	bool in_range = b >= arr->first_bucket && b < arr->nbuckets;
+
+	BUG_ON(!in_range);
+	return &arr->b[b];
+}
|
|
+
|
|
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
|
|
+{ /* Shorthand for __bucket() with gc == false. */
|
|
+ return __bucket(ca, b, false);
|
|
+}
|
|
+
|
|
+/*
+ * bucket_gc_gen() - distance between a bucket's current generation and the
+ * oldest generation of any pointer into that bucket in the btree.
+ *
+ * The subtraction is done on 8-bit values, so the result wraps modulo 256.
+ */
+static inline u8 bucket_gc_gen(struct bucket *g)
+{
+	u8 cur = g->mark.gen;
+	u8 oldest = g->oldest_gen;
+
+	return cur - oldest;
+}
|
|
+
|
|
+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
|
|
+ const struct bch_extent_ptr *ptr)
|
|
+{ /* Bucket number on @ca containing the sector offset stored in @ptr. */
|
|
+ return sector_to_bucket(ca, ptr->offset);
|
|
+}
|
|
+
|
|
+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
|
|
+ const struct bch_extent_ptr *ptr,
|
|
+ bool gc)
|
|
+{ /* Bucket that @ptr points into, looked up in the array selected by @gc; bounds are checked by __bucket(). */
|
|
+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
|
|
+}
|
|
+
|
|
+/*
+ * Classify the data referenced by @ptr within key @k: btree for either
+ * btree pointer key type, otherwise cached or user depending on the
+ * pointer's cached bit.
+ */
+static inline enum bch_data_type ptr_data_type(const struct bkey *k,
+					       const struct bch_extent_ptr *ptr)
+{
+	switch (k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+		return BCH_DATA_btree;
+	default:
+		break;
+	}
+
+	if (ptr->cached)
+		return BCH_DATA_cached;
+
+	return BCH_DATA_user;
+}
|
|
+
|
|
+/*
+ * Snapshot the mark of the bucket @ptr points into. The non-gc bucket array
+ * is read inside an RCU read-side critical section.
+ */
+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
+						 const struct bch_extent_ptr *ptr)
+{
+	struct bucket_mark snapshot;
+
+	rcu_read_lock();
+	snapshot = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark);
+	rcu_read_unlock();
+
+	return snapshot;
+}
|
|
+
|
|
+/*
+ * Wraparound-aware comparison of two 8-bit generation numbers: the
+ * difference a - b reinterpreted as a signed 8-bit value.
+ */
+static inline int gen_cmp(u8 a, u8 b)
+{
+	s8 delta = (s8) (a - b);
+
+	return delta;
+}
|
|
+
|
|
+/*
+ * How many generations @a is ahead of @b according to gen_cmp(), or 0 if
+ * it is not ahead.
+ */
+static inline int gen_after(u8 a, u8 b)
+{
+	int ahead = gen_cmp(a, b);
+
+	if (ahead <= 0)
+		return 0;
+
+	return ahead;
+}
|
|
+
|
|
+/**
+ * ptr_stale() - check if a pointer points into a bucket that has been
+ * invalidated.
+ *
+ * Returns how far the bucket's current generation is ahead of the
+ * generation recorded in @ptr (see gen_after()); 0 means not stale.
+ */
+static inline u8 ptr_stale(struct bch_dev *ca,
+			   const struct bch_extent_ptr *ptr)
+{
+	u8 bucket_gen = ptr_bucket_mark(ca, ptr).gen;
+
+	return gen_after(bucket_gen, ptr->gen);
+}
|
|
+
|
|
+/*
+ * Number of on-disk sectors occupied by @live_size sectors of pointer @p.
+ *
+ * For uncompressed data, or a zero @live_size, this is @live_size itself.
+ * For compressed data it is @live_size scaled by compressed_size /
+ * uncompressed_size, rounded up, and never less than 1.
+ */
+static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p,
+				     unsigned live_size)
+{
+	if (!live_size || !p.crc.compression_type)
+		return live_size;
+
+	return max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size,
+				    p.crc.uncompressed_size));
+}
|
|
+
|
|
+static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p)
|
|
+{ /* __ptr_disk_sectors() applied to the pointer's own crc.live_size. */
|
|
+ return __ptr_disk_sectors(p, p.crc.live_size);
|
|
+}
|
|
+
|
|
+/* bucket gc marks */
|
|
+
|
|
+/* Total sectors accounted to a bucket mark: dirty plus cached. */
+static inline unsigned bucket_sectors_used(struct bucket_mark mark)
+{
+	unsigned used = mark.dirty_sectors;
+
+	used += mark.cached_sectors;
+	return used;
+}
|
|
+
|
|
+/* A bucket mark is "available" when it has no dirty sectors and no stripe. */
+static inline bool is_available_bucket(struct bucket_mark mark)
+{
+	if (mark.dirty_sectors || mark.stripe)
+		return false;
+
+	return true;
+}
|
|
+
|
|
+/*
+ * Does the bucket described by @m still wait for a journal commit, i.e. is
+ * its (valid) 16-bit journal sequence number newer than @last_seq_ondisk?
+ *
+ * The comparison is wraparound-aware in the same way as gen_cmp(): the
+ * difference is taken first and then reinterpreted as a signed 16-bit
+ * value. Casting each operand to s16 before subtracting would compute the
+ * difference as an int without wrapping, and misreport a sequence number
+ * just past the 0x7fff/0x8000 boundary as older than the on-disk one.
+ */
+static inline bool bucket_needs_journal_commit(struct bucket_mark m,
+					       u16 last_seq_ondisk)
+{
+	return m.journal_seq_valid &&
+		(s16) (m.journal_seq - last_seq_ondisk) > 0;
+}
|
|
+
|
|
+/* Device usage: */
|
|
+
|
|
+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
|
|
+
|
|
+/*
+ * Buckets of @ca that are not unavailable according to @stats: the usable
+ * range (nbuckets - first_bucket) minus stats.buckets_unavailable. Warns
+ * once and returns 0 if the unavailable count exceeds the range.
+ */
+static inline u64 __dev_buckets_available(struct bch_dev *ca,
+					  struct bch_dev_usage stats)
+{
+	u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+	u64 unavailable = stats.buckets_unavailable;
+
+	if (WARN_ONCE(stats.buckets_unavailable > total,
+		      "buckets_unavailable overflow (%llu > %llu)\n",
+		      stats.buckets_unavailable, total))
+		return 0;
+
+	return total - unavailable;
+}
|
|
+
|
|
+static inline u64 dev_buckets_available(struct bch_dev *ca)
|
|
+{ /* __dev_buckets_available() on a fresh bch2_dev_usage_read() snapshot. */
|
|
+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
|
|
+}
|
|
+
|
|
+/*
+ * Available buckets of @ca (per @stats) that are not already sitting on a
+ * free fifo, on free_inc, or counted in nr_open_buckets. Those counts are
+ * sampled under freelist_lock. Never returns less than 0.
+ */
+static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
+					    struct bch_dev_usage stats)
+{
+	struct bch_fs *c = ca->fs;
+	s64 available = __dev_buckets_available(ca, stats);
+	s64 claimed = 0;
+	unsigned i;
+
+	spin_lock(&c->freelist_lock);
+	for (i = 0; i < RESERVE_NR; i++)
+		claimed += fifo_used(&ca->free[i]);
+	claimed += fifo_used(&ca->free_inc);
+	claimed += ca->nr_open_buckets;
+	spin_unlock(&c->freelist_lock);
+
+	available -= claimed;
+
+	return max(available, 0LL);
+}
|
|
+
|
|
+static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
|
|
+{ /* __dev_buckets_reclaimable() on a fresh bch2_dev_usage_read() snapshot. */
|
|
+ return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
|
|
+}
|
|
+
|
|
+/* Filesystem usage: */
|
|
+
|
|
+/*
+ * Size of a filesystem usage record in u64 units: the fixed part of
+ * struct bch_fs_usage plus one u64 per entry counted in c->replicas.nr
+ * (read once).
+ */
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
+{
+	unsigned fixed = sizeof(struct bch_fs_usage) / sizeof(u64);
+
+	return fixed + READ_ONCE(c->replicas.nr);
+}
|
|
+
|
|
+static inline unsigned dev_usage_u64s(void)
|
|
+{ /* Size of struct bch_dev_usage expressed in u64 units. */
|
|
+ return sizeof(struct bch_dev_usage) / sizeof(u64);
|
|
+}
|
|
+
|
|
+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *);
|
|
+
|
|
+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *);
|
|
+
|
|
+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned);
|
|
+
|
|
+void bch2_fs_usage_to_text(struct printbuf *,
|
|
+ struct bch_fs *, struct bch_fs_usage_online *);
|
|
+
|
|
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
|
|
+
|
|
+struct bch_fs_usage_short
|
|
+bch2_fs_usage_read_short(struct bch_fs *);
|
|
+
|
|
+/* key/bucket marking: */
|
|
+
|
|
+void bch2_bucket_seq_cleanup(struct bch_fs *);
|
|
+void bch2_fs_usage_initialize(struct bch_fs *);
|
|
+
|
|
+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
|
|
+void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
|
|
+ size_t, enum bch_data_type, unsigned,
|
|
+ struct gc_pos, unsigned);
|
|
+
|
|
+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned,
|
|
+ s64, struct bch_fs_usage *, u64, unsigned);
|
|
+
|
|
+int bch2_mark_update(struct btree_trans *, struct btree_iter *,
|
|
+ struct bkey_i *, struct bch_fs_usage *, unsigned);
|
|
+
|
|
+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, struct bkey_s_c,
|
|
+ unsigned, s64, unsigned);
|
|
+int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter,
|
|
+ struct bkey_i *insert, unsigned);
|
|
+void bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
|
|
+
|
|
+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
|
|
+ size_t, enum bch_data_type, unsigned);
|
|
+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
|
|
+
|
|
+/* disk reservations: */
|
|
+
|
|
+/*
+ * Drop reservation @res: its sectors are subtracted from this CPU's
+ * online_reserved counter and the reservation is reset to 0 sectors.
+ */
+static inline void bch2_disk_reservation_put(struct bch_fs *c,
+					     struct disk_reservation *res)
+{
+	u64 released = res->sectors;
+
+	this_cpu_sub(*c->online_reserved, released);
+	res->sectors = 0;
+}
|
|
+
|
|
+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
|
|
+
|
|
+int bch2_disk_reservation_add(struct bch_fs *,
|
|
+ struct disk_reservation *,
|
|
+ u64, int);
|
|
+
|
|
+/*
+ * Build an empty reservation (0 sectors) for @nr_replicas replicas.
+ * The gen field is left zero; filling it from c->capacity_gen is not used
+ * yet.
+ */
+static inline struct disk_reservation
+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas)
+{
+	struct disk_reservation res = {
+		.sectors	= 0,
+		.nr_replicas	= nr_replicas,
+	};
+
+	return res;
+}
|
|
+
|
|
+/*
+ * Initialise @res for @nr_replicas replicas and reserve
+ * @sectors * @nr_replicas sectors for it.
+ *
+ * Returns the result of bch2_disk_reservation_add().
+ */
+static inline int bch2_disk_reservation_get(struct bch_fs *c,
+					    struct disk_reservation *res,
+					    u64 sectors, unsigned nr_replicas,
+					    int flags)
+{
+	u64 total = sectors * nr_replicas;
+
+	*res = bch2_disk_reservation_init(c, nr_replicas);
+
+	return bch2_disk_reservation_add(c, res, total, flags);
+}
|
|
+
|
|
+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64);
|
|
+void bch2_dev_buckets_free(struct bch_dev *);
|
|
+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *);
|
|
+
|
|
+#endif /* _BUCKETS_H */
|
|
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
|
|
new file mode 100644
|
|
index 000000000000..b2de2995c5e7
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/buckets_types.h
|
|
@@ -0,0 +1,124 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BUCKETS_TYPES_H
|
|
+#define _BUCKETS_TYPES_H
|
|
+
|
|
+#include "bcachefs_format.h"
|
|
+#include "util.h"
|
|
+
|
|
+#define BUCKET_JOURNAL_SEQ_BITS 16
|
|
+
|
|
+struct bucket_mark {
|
|
+ union {
|
|
+ atomic64_t v;
|
|
+
|
|
+ struct {
|
|
+ u8 gen;
|
|
+ u8 data_type:3,
|
|
+ owned_by_allocator:1,
|
|
+ journal_seq_valid:1,
|
|
+ stripe:1;
|
|
+ u16 dirty_sectors;
|
|
+ u16 cached_sectors;
|
|
+
|
|
+ /*
|
|
+ * low bits of journal sequence number when this bucket was most
|
|
+ * recently modified: if journal_seq_valid is set, this bucket can't be
|
|
+ * reused until the journal sequence number written to disk is >= the
|
|
+ * bucket's journal sequence number:
|
|
+ */
|
|
+ u16 journal_seq;
|
|
+ };
|
|
+ };
|
|
+};
|
|
+
|
|
+struct bucket { /* in-memory state of one bucket; element type of bucket_array.b[] */
|
|
+ union { /* two views of the same mark */
|
|
+ struct bucket_mark _mark; /* writable view, updated by bucket_cmpxchg() */
|
|
+ const struct bucket_mark mark; /* read-only view */
|
|
+ };
|
|
+
|
|
+ u64 io_time[2];
|
|
+ u8 oldest_gen; /* subtracted from mark.gen by bucket_gc_gen() */
|
|
+ u8 gc_gen;
|
|
+ unsigned gen_valid:1;
|
|
+ u8 stripe_redundancy;
|
|
+ u32 stripe;
|
|
+};
|
|
+
|
|
+struct bucket_array { /* per-device array of buckets, published through an RCU pointer */
|
|
+ struct rcu_head rcu; /* used to free the array via call_rcu() */
|
|
+ u16 first_bucket; /* lowest valid index into b[] */
|
|
+ size_t nbuckets; /* one past the highest valid index; also sizes the allocation */
|
|
+ struct bucket b[]; /* indexed by bucket number */
|
|
+};
|
|
+
|
|
+struct bch_dev_usage {
|
|
+ u64 buckets_ec;
|
|
+ u64 buckets_unavailable;
|
|
+
|
|
+ struct {
|
|
+ u64 buckets;
|
|
+ u64 sectors; /* _compressed_ sectors: */
|
|
+ /*
|
|
+ * XXX
|
|
+ * Why do we have this? Isn't it just buckets * bucket_size -
|
|
+ * sectors?
|
|
+ */
|
|
+ u64 fragmented;
|
|
+ } d[BCH_DATA_NR];
|
|
+};
|
|
+
|
|
+struct bch_fs_usage {
|
|
+ /* all fields are in units of 512 byte sectors: */
|
|
+ u64 hidden;
|
|
+ u64 btree;
|
|
+ u64 data;
|
|
+ u64 cached;
|
|
+ u64 reserved;
|
|
+ u64 nr_inodes;
|
|
+
|
|
+ /* XXX: add stats for compression ratio */
|
|
+#if 0
|
|
+ u64 uncompressed;
|
|
+ u64 compressed;
|
|
+#endif
|
|
+
|
|
+ /* broken out: */
|
|
+
|
|
+ u64 persistent_reserved[BCH_REPLICAS_MAX];
|
|
+ u64 replicas[];
|
|
+};
|
|
+
|
|
+struct bch_fs_usage_online {
|
|
+ u64 online_reserved;
|
|
+ struct bch_fs_usage u;
|
|
+};
|
|
+
|
|
+struct bch_fs_usage_short {
|
|
+ u64 capacity;
|
|
+ u64 used;
|
|
+ u64 free;
|
|
+ u64 nr_inodes;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * A reservation for space on disk:
|
|
+ */
|
|
+struct disk_reservation {
|
|
+ u64 sectors; /* sectors currently reserved; grown by bch2_disk_reservation_add(), zeroed by bch2_disk_reservation_put() */
|
|
+ u32 gen; /* not used yet, see bch2_disk_reservation_init() */
|
|
+ unsigned nr_replicas; /* replica count the reservation was initialised with */
|
|
+};
|
|
+
|
|
+struct copygc_heap_entry {
|
|
+ u8 dev;
|
|
+ u8 gen;
|
|
+ u8 replicas;
|
|
+ u16 fragmentation;
|
|
+ u32 sectors;
|
|
+ u64 offset;
|
|
+};
|
|
+
|
|
+typedef HEAP(struct copygc_heap_entry) copygc_heap;
|
|
+
|
|
+#endif /* _BUCKETS_TYPES_H */
|
|
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
|
|
new file mode 100644
|
|
index 000000000000..c61601476c0d
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/chardev.c
|
|
@@ -0,0 +1,728 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#ifndef NO_BCACHEFS_CHARDEV
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bcachefs_ioctl.h"
|
|
+#include "buckets.h"
|
|
+#include "chardev.h"
|
|
+#include "journal.h"
|
|
+#include "move.h"
|
|
+#include "replicas.h"
|
|
+#include "super.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/anon_inodes.h>
|
|
+#include <linux/cdev.h>
|
|
+#include <linux/device.h>
|
|
+#include <linux/file.h>
|
|
+#include <linux/fs.h>
|
|
+#include <linux/ioctl.h>
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/major.h>
|
|
+#include <linux/sched/task.h>
|
|
+#include <linux/slab.h>
|
|
+#include <linux/uaccess.h>
|
|
+
|
|
+/*
+ * Resolve the 'dev' field of an ioctl argument to a member device.
+ *
+ * With BCH_BY_INDEX set in @flags, @dev is an index into c->devs; otherwise
+ * it is a user-space pointer to a path string that is handed to
+ * bch2_dev_lookup().
+ *
+ * Returns the device with a reference held on ca->ref (the index path takes
+ * it here; the path lookup is expected to return one), or an ERR_PTR().
+ */
+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
+					  unsigned flags)
+{
+	struct bch_dev *found;
+	char *name;
+
+	if (!(flags & BCH_BY_INDEX)) {
+		name = strndup_user((const char __user *)
+				    (unsigned long) dev, PATH_MAX);
+		if (IS_ERR(name))
+			return ERR_CAST(name);
+
+		found = bch2_dev_lookup(c, name);
+		kfree(name);
+		return found;
+	}
+
+	if (dev >= c->sb.nr_devices)
+		return ERR_PTR(-EINVAL);
+
+	/* Pin the device while still inside the RCU read-side section. */
+	rcu_read_lock();
+	found = rcu_dereference(c->devs[dev]);
+	if (found)
+		percpu_ref_get(&found->ref);
+	rcu_read_unlock();
+
+	return found ? found : ERR_PTR(-EINVAL);
+}
|
|
+
|
|
+#if 0
+/*
+ * Compiled out. Open a filesystem from an array of user-supplied device
+ * paths.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
+{
+	struct bch_ioctl_assemble arg;
+	struct bch_fs *c;
+	u64 *user_devs = NULL;
+	char **devs = NULL;
+	unsigned i;
+	int ret = -EFAULT;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
+	if (!user_devs)
+		return -ENOMEM;
+
+	devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
+	if (!devs) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	if (copy_from_user(user_devs, user_arg->devs,
+			   sizeof(u64) * arg.nr_devs))
+		goto err;
+
+	for (i = 0; i < arg.nr_devs; i++) {
+		/*
+		 * strndup_user() reports failure with ERR_PTR(), never NULL.
+		 * Use a temporary so an error pointer is never stored in
+		 * devs[], which the cleanup path passes to kfree().
+		 */
+		char *path = strndup_user((const char __user *)(unsigned long)
+					  user_devs[i],
+					  PATH_MAX);
+		if (IS_ERR(path)) {
+			ret = PTR_ERR(path);
+			goto err;
+		}
+		devs[i] = path;
+	}
+
+	c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty());
+	ret = PTR_ERR_OR_ZERO(c);
+	if (!ret)
+		closure_put(&c->cl);
+err:
+	if (devs)
+		for (i = 0; i < arg.nr_devs; i++)
+			kfree(devs[i]);
+	kfree(devs);
+	kfree(user_devs);
+	return ret;
+}
+
+/*
+ * Compiled out. Register a single device, given by user-space path, for
+ * incremental assembly.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg)
+{
+	struct bch_ioctl_incremental arg;
+	const char *err;
+	char *path;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	err = bch2_fs_open_incremental(path);
+	kfree(path);
+
+	if (err) {
+		pr_err("Could not register bcachefs devices: %s", err);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+#endif
|
|
+
|
|
+/*
+ * ioctl handler used when the character device is not bound to a
+ * filesystem. Every command is currently compiled out, so all requests are
+ * answered with -ENOTTY.
+ */
+static long bch2_global_ioctl(unsigned cmd, void __user *arg)
+{
+	long ret = -ENOTTY;
+
+	switch (cmd) {
+#if 0
+	case BCH_IOCTL_ASSEMBLE:
+		ret = bch2_ioctl_assemble(arg);
+		break;
+	case BCH_IOCTL_INCREMENTAL:
+		ret = bch2_ioctl_incremental(arg);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_QUERY_UUID: copy the filesystem's user-visible UUID
+ * (c->sb.user_uuid) into user_arg->uuid.
+ *
+ * Returns 0 on success, -EFAULT if the user buffer could not be written.
+ */
+static long bch2_ioctl_query_uuid(struct bch_fs *c,
+			struct bch_ioctl_query_uuid __user *user_arg)
+{
+	/*
+	 * copy_to_user() returns the number of bytes it failed to copy, not
+	 * an errno, so its result must not be handed back to the caller.
+	 */
+	if (copy_to_user(&user_arg->uuid,
+			 &c->sb.user_uuid,
+			 sizeof(c->sb.user_uuid)))
+		return -EFAULT;
+
+	return 0;
+}
|
|
+
|
|
+#if 0
+/* Compiled out: start the filesystem; flags and pad must be zero. */
+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg)
+{
+	return (arg.flags || arg.pad)
+		? -EINVAL
+		: bch2_fs_start(c);
+}
+
+/* Compiled out: stop the filesystem; always reports success. */
+static long bch2_ioctl_stop(struct bch_fs *c)
+{
+	bch2_fs_stop(c);
+
+	return 0;
+}
+#endif
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_ADD: add the device whose path is given by the user
+ * pointer in arg.dev. No flags are accepted and arg.pad must be zero.
+ *
+ * Returns the result of bch2_dev_add(), or a negative errno if the
+ * arguments are invalid or the path cannot be copied from user space.
+ */
+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	char *path;
+	int ret;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+	/* strndup_user() fails with ERR_PTR(), never NULL. */
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	ret = bch2_dev_add(c, path);
+	kfree(path);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_REMOVE: look up the device named by arg.dev (by index or
+ * by path, see BCH_BY_INDEX) and pass it to bch2_dev_remove() together
+ * with the caller's flags.
+ *
+ * NOTE(review): unlike the offline/set_state handlers, the reference taken
+ * by the lookup is not dropped here - presumably bch2_dev_remove() consumes
+ * it; confirm.
+ */
+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	struct bch_dev *dev;
+
+	/* Reject unknown flag bits. */
+	if (arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			  BCH_FORCE_IF_METADATA_LOST|
+			  BCH_FORCE_IF_DEGRADED|
+			  BCH_BY_INDEX))
+		return -EINVAL;
+
+	if (arg.pad)
+		return -EINVAL;
+
+	dev = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	return bch2_dev_remove(c, dev, arg.flags);
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_ONLINE: bring online the device whose path is given by the
+ * user pointer in arg.dev. No flags are accepted and arg.pad must be zero.
+ *
+ * Returns the result of bch2_dev_online(), or a negative errno if the
+ * arguments are invalid or the path cannot be copied from user space.
+ */
+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	char *path;
+	int ret;
+
+	if (arg.flags || arg.pad)
+		return -EINVAL;
+
+	path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX);
+	/* strndup_user() fails with ERR_PTR(), never NULL. */
+	if (IS_ERR(path))
+		return PTR_ERR(path);
+
+	ret = bch2_dev_online(c, path);
+	kfree(path);
+	return ret;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_OFFLINE: look up the device named by arg.dev and take it
+ * offline via bch2_dev_offline(), then drop the lookup reference.
+ */
+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg)
+{
+	struct bch_dev *dev;
+	int err;
+
+	/* Reject unknown flag bits. */
+	if (arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			  BCH_FORCE_IF_METADATA_LOST|
+			  BCH_FORCE_IF_DEGRADED|
+			  BCH_BY_INDEX))
+		return -EINVAL;
+
+	if (arg.pad)
+		return -EINVAL;
+
+	dev = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	err = bch2_dev_offline(c, dev, arg.flags);
+
+	percpu_ref_put(&dev->ref);
+	return err;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_SET_STATE: look up the device named by arg.dev and move it
+ * to arg.new_state via bch2_dev_set_state(), then drop the lookup
+ * reference.
+ */
+static long bch2_ioctl_disk_set_state(struct bch_fs *c,
+			struct bch_ioctl_disk_set_state arg)
+{
+	struct bch_dev *dev;
+	int err;
+
+	/* Reject unknown flag bits. */
+	if (arg.flags & ~(BCH_FORCE_IF_DATA_LOST|
+			  BCH_FORCE_IF_METADATA_LOST|
+			  BCH_FORCE_IF_DEGRADED|
+			  BCH_BY_INDEX))
+		return -EINVAL;
+
+	/* All padding must be zero. */
+	if (arg.pad[0] || arg.pad[1] || arg.pad[2])
+		return -EINVAL;
+
+	dev = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	err = bch2_dev_set_state(c, dev, arg.new_state, arg.flags);
+
+	percpu_ref_put(&dev->ref);
+	return err;
+}
|
|
+
|
|
+struct bch_data_ctx { /* state of one background data job; stored as private_data of the job's anonymous file */
|
|
+ struct bch_fs *c; /* filesystem the job runs against */
|
|
+ struct bch_ioctl_data arg; /* copy of the user's request, passed to bch2_data_job() */
|
|
+ struct bch_move_stats stats; /* progress counters sampled by the read() handler */
|
|
+
|
|
+ int ret; /* result of bch2_data_job(), stored by the worker thread */
|
|
+
|
|
+ struct task_struct *thread; /* worker kthread; stopped and released when the file is closed */
|
|
+};
|
|
+
|
|
+/*
+ * kthread entry point for a data job: run bch2_data_job() with the context
+ * passed in @arg and record its result in the context.
+ */
+static int bch2_data_thread(void *arg)
+{
+	struct bch_data_ctx *job = arg;
+	int result = bch2_data_job(job->c, &job->stats, job->arg);
+
+	job->ret = result;
+
+	/* Set after the job returns; the read() handler reports this value. */
+	job->stats.data_type = U8_MAX;
+	return 0;
+}
|
|
+
|
|
+/*
+ * release() for the data-job file: stop the worker thread, drop the task
+ * reference taken when the job was started, and free the job context.
+ */
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+	struct bch_data_ctx *job = file->private_data;
+	struct task_struct *worker = job->thread;
+
+	kthread_stop(worker);
+	put_task_struct(worker);
+
+	kfree(job);
+	return 0;
+}
|
|
+
|
|
+/*
+ * read() for the data-job file: return one progress event describing the
+ * job's current position and the sectors processed so far.
+ *
+ * Returns sizeof(struct bch_ioctl_data_event) on success, -EINVAL if @len
+ * is too small to hold an event, -EFAULT if @buf cannot be written.
+ */
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+				  size_t len, loff_t *ppos)
+{
+	struct bch_data_ctx *ctx = file->private_data;
+	struct bch_fs *c = ctx->c;
+	struct bch_ioctl_data_event e = {
+		.type			= BCH_DATA_EVENT_PROGRESS,
+		.p.data_type		= ctx->stats.data_type,
+		.p.btree_id		= ctx->stats.btree_id,
+		.p.pos			= ctx->stats.pos,
+		.p.sectors_done		= atomic64_read(&ctx->stats.sectors_seen),
+		.p.sectors_total	= bch2_fs_usage_read_short(c).used,
+	};
+
+	if (len < sizeof(e))
+		return -EINVAL;
+
+	/*
+	 * copy_to_user() returns the number of bytes NOT copied. Returning
+	 * that count would look like a successful short read to the caller.
+	 */
+	if (copy_to_user(buf, &e, sizeof(e)))
+		return -EFAULT;
+
+	return sizeof(e);
+}
|
|
+
|
|
+static const struct file_operations bcachefs_data_ops = { /* operations of the anonymous file returned by the data ioctl */
|
|
+ .release = bch2_data_job_release, /* stops the worker thread and frees the job context */
|
|
+ .read = bch2_data_job_read, /* returns one progress event per call */
|
|
+ .llseek = no_llseek,
|
|
+};
|
|
+
|
|
+/*
+ * BCH_IOCTL_DATA: start a background data job described by @arg.
+ *
+ * Creates a kthread running bch2_data_thread() and an anonymous file whose
+ * read() reports progress and whose release() stops the thread and frees
+ * the job context.
+ *
+ * Returns the new file descriptor on success or a negative errno.
+ */
+static long bch2_ioctl_data(struct bch_fs *c,
+			    struct bch_ioctl_data arg)
+{
+	struct bch_data_ctx *ctx = NULL;
+	struct file *file = NULL;
+	unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+	int ret, fd = -1;
+
+	if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+		return -EINVAL;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->c = c;
+	ctx->arg = arg;
+
+	ctx->thread = kthread_create(bch2_data_thread, ctx,
+				     "bch-data/%s", c->name);
+	if (IS_ERR(ctx->thread)) {
+		ret = PTR_ERR(ctx->thread);
+		goto err;
+	}
+
+	ret = get_unused_fd_flags(flags);
+	if (ret < 0)
+		goto err;
+	fd = ret;
+
+	file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto err;
+	}
+
+	/*
+	 * Take the task reference that release() drops, and start the
+	 * worker, BEFORE publishing the fd. Once fd_install() has run,
+	 * another thread of the caller can close the fd, which runs
+	 * bch2_data_job_release() and frees ctx; nothing may touch ctx
+	 * after that point.
+	 */
+	get_task_struct(ctx->thread);
+	wake_up_process(ctx->thread);
+
+	fd_install(fd, file);
+
+	return fd;
+err:
+	if (fd >= 0)
+		put_unused_fd(fd);
+	if (!IS_ERR_OR_NULL(ctx->thread))
+		kthread_stop(ctx->thread);
+	kfree(ctx);
+	return ret;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_FS_USAGE: report filesystem-wide usage to user space.
+ *
+ * The caller states in user_arg->replica_entries_bytes how much room
+ * follows the fixed header. As many replicas entries as c->replicas holds
+ * are packed into that space, and replica_entries_bytes is rewritten with
+ * the number of bytes actually used.
+ *
+ * Returns 0 on success, -EINVAL if the filesystem has not been started,
+ * -ERANGE if the supplied space is too small, -ENOMEM on allocation
+ * failure, -EFAULT if user memory cannot be accessed.
+ */
+static long bch2_ioctl_fs_usage(struct bch_fs *c,
+				struct bch_ioctl_fs_usage __user *user_arg)
+{
+	struct bch_ioctl_fs_usage *arg = NULL;
+	struct bch_replicas_usage *dst_e, *dst_end;
+	struct bch_fs_usage_online *src;
+	u32 replica_entries_bytes;
+	unsigned i;
+	int ret = 0;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+
+	if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
+		return -EFAULT;
+
+	arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL);
+	if (!arg)
+		return -ENOMEM;
+
+	src = bch2_fs_usage_read(c);
+	if (!src) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	arg->capacity		= c->capacity;
+	arg->used		= bch2_fs_sectors_used(c, src);
+	arg->online_reserved	= src->online_reserved;
+
+	for (i = 0; i < BCH_REPLICAS_MAX; i++)
+		arg->persistent_reserved[i] = src->u.persistent_reserved[i];
+
+	dst_e	= arg->replicas;
+	dst_end	= (void *) arg->replicas + replica_entries_bytes;
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *src_e =
+			cpu_replicas_entry(&c->replicas, i);
+
+		/* Is there room for the fixed part of the next entry? */
+		if (replicas_usage_next(dst_e) > dst_end) {
+			ret = -ERANGE;
+			break;
+		}
+
+		dst_e->sectors		= src->u.replicas[i];
+		dst_e->r		= *src_e;
+
+		/* recheck after setting nr_devs: */
+		if (replicas_usage_next(dst_e) > dst_end) {
+			ret = -ERANGE;
+			break;
+		}
+
+		memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs);
+
+		dst_e = replicas_usage_next(dst_e);
+	}
+
+	arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas;
+
+	/*
+	 * NOTE(review): mark_lock is released here but never taken in this
+	 * function - presumably bch2_fs_usage_read() returns with it held
+	 * on success; confirm.
+	 */
+	percpu_up_read(&c->mark_lock);
+	kfree(src);
+
+	/*
+	 * copy_to_user() returns the number of uncopied bytes, not an
+	 * errno; translate a failure into -EFAULT.
+	 */
+	if (!ret &&
+	    copy_to_user(user_arg, arg,
+			 sizeof(*arg) + arg->replica_entries_bytes))
+		ret = -EFAULT;
+err:
+	kfree(arg);
+	return ret;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DEV_USAGE: report usage of one member device to user space.
+ *
+ * The device is selected by arg.dev (index or path, see BCH_BY_INDEX). The
+ * reply is written back over the same user structure.
+ *
+ * Returns 0 on success, -EINVAL for bad arguments or if the filesystem has
+ * not been started, -EFAULT if user memory cannot be accessed, or the
+ * error from the device lookup.
+ */
+static long bch2_ioctl_dev_usage(struct bch_fs *c,
+				 struct bch_ioctl_dev_usage __user *user_arg)
+{
+	struct bch_ioctl_dev_usage arg;
+	struct bch_dev_usage src;
+	struct bch_dev *ca;
+	unsigned i;
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+
+	if (copy_from_user(&arg, user_arg, sizeof(arg)))
+		return -EFAULT;
+
+	if ((arg.flags & ~BCH_BY_INDEX) ||
+	    arg.pad[0] ||
+	    arg.pad[1] ||
+	    arg.pad[2])
+		return -EINVAL;
+
+	ca = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(ca))
+		return PTR_ERR(ca);
+
+	src = bch2_dev_usage_read(ca);
+
+	arg.state		= ca->mi.state;
+	arg.bucket_size		= ca->mi.bucket_size;
+	arg.nr_buckets		= ca->mi.nbuckets - ca->mi.first_bucket;
+	arg.available_buckets	= arg.nr_buckets - src.buckets_unavailable;
+	arg.ec_buckets		= src.buckets_ec;
+	arg.ec_sectors		= 0;
+
+	for (i = 0; i < BCH_DATA_NR; i++) {
+		arg.buckets[i] = src.d[i].buckets;
+		arg.sectors[i] = src.d[i].sectors;
+	}
+
+	percpu_ref_put(&ca->ref);
+
+	/*
+	 * copy_to_user() returns the number of uncopied bytes, not an
+	 * errno; translate a failure into -EFAULT.
+	 */
+	if (copy_to_user(user_arg, &arg, sizeof(arg)))
+		return -EFAULT;
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_READ_SUPER: copy a superblock to the user buffer at arg.sb.
+ *
+ * With BCH_READ_DEV set, the superblock of the device selected by arg.dev
+ * is returned; otherwise the filesystem's own copy is. Runs under
+ * c->sb_lock.
+ *
+ * Returns 0 on success, -EINVAL for bad flags/padding, -ERANGE if arg.size
+ * is smaller than the superblock, -EFAULT if the buffer cannot be written,
+ * or the error from the device lookup.
+ */
+static long bch2_ioctl_read_super(struct bch_fs *c,
+				  struct bch_ioctl_read_super arg)
+{
+	struct bch_dev *ca = NULL;
+	struct bch_sb *sb;
+	int ret = 0;
+
+	if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) ||
+	    arg.pad)
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	if (arg.flags & BCH_READ_DEV) {
+		ca = bch2_device_lookup(c, arg.dev, arg.flags);
+
+		if (IS_ERR(ca)) {
+			ret = PTR_ERR(ca);
+			goto err;
+		}
+
+		sb = ca->disk_sb.sb;
+	} else {
+		sb = c->disk_sb.sb;
+	}
+
+	if (vstruct_bytes(sb) > arg.size) {
+		ret = -ERANGE;
+		goto err;
+	}
+
+	/*
+	 * copy_to_user() returns the number of uncopied bytes, not an
+	 * errno; translate a failure into -EFAULT.
+	 */
+	if (copy_to_user((void __user *)(unsigned long)arg.sb,
+			 sb, vstruct_bytes(sb)))
+		ret = -EFAULT;
+err:
+	/* ca is NULL or an ERR_PTR unless the lookup succeeded. */
+	if (!IS_ERR_OR_NULL(ca))
+		percpu_ref_put(&ca->ref);
+	mutex_unlock(&c->sb_lock);
+	return ret;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_GET_IDX: translate an encoded dev_t (arg.dev) into the
+ * index of the online member whose block device has that device number.
+ *
+ * Returns the member index, or -ENOENT if no online member matches.
+ */
+static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
+				    struct bch_ioctl_disk_get_idx arg)
+{
+	dev_t wanted = huge_decode_dev(arg.dev);
+	struct bch_dev *member;
+	unsigned idx;
+
+	for_each_online_member(member, c, idx) {
+		if (member->disk_sb.bdev->bd_dev != wanted)
+			continue;
+
+		/*
+		 * Leaving the loop early: drop io_ref on the current member
+		 * (presumably held by the iterator - confirm).
+		 */
+		percpu_ref_put(&member->io_ref);
+		return idx;
+	}
+
+	return -ENOENT;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize the device named by arg.dev to
+ * arg.nbuckets buckets via bch2_dev_resize(), then drop the lookup
+ * reference.
+ */
+static long bch2_ioctl_disk_resize(struct bch_fs *c,
+				   struct bch_ioctl_disk_resize arg)
+{
+	struct bch_dev *dev;
+	int err;
+
+	/* BCH_BY_INDEX is the only flag accepted. */
+	if (arg.flags & ~BCH_BY_INDEX)
+		return -EINVAL;
+
+	if (arg.pad)
+		return -EINVAL;
+
+	dev = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	err = bch2_dev_resize(c, dev, arg.nbuckets);
+
+	percpu_ref_put(&dev->ref);
+	return err;
+}
|
|
+
|
|
+/*
+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: set the number of journal buckets on the
+ * device named by arg.dev to arg.nbuckets, then drop the lookup reference.
+ */
+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
+				   struct bch_ioctl_disk_resize_journal arg)
+{
+	struct bch_dev *dev;
+	int err;
+
+	/* BCH_BY_INDEX is the only flag accepted. */
+	if (arg.flags & ~BCH_BY_INDEX)
+		return -EINVAL;
+
+	if (arg.pad)
+		return -EINVAL;
+
+	dev = bch2_device_lookup(c, arg.dev, arg.flags);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	err = bch2_set_nr_journal_buckets(c, dev, arg.nbuckets);
+
+	percpu_ref_put(&dev->ref);
+	return err;
+}
|
|
+
|
|
+/*
+ * Body of one case in bch2_fs_ioctl(): copy a _argtype from the user
+ * pointer 'arg' and return bch2_ioctl_<_name>(c, <copy>). Expects 'c' and
+ * 'arg' to be in scope and always returns from the enclosing function.
+ */
+#define BCH_IOCTL(_name, _argtype)					\
+do {									\
+	_argtype _ioctl_arg;						\
+									\
+	if (copy_from_user(&_ioctl_arg, arg, sizeof(_ioctl_arg)))	\
+		return -EFAULT;						\
+	return bch2_ioctl_##_name(c, _ioctl_arg);			\
+} while (0)
|
|
+
|
|
+/*
+ * Dispatch an ioctl issued against a specific filesystem.
+ *
+ * Commands are checked in three tiers:
+ *   1. read-only queries, available without CAP_SYS_ADMIN;
+ *   2. admin commands that work even before the filesystem is started;
+ *   3. admin commands that require BCH_FS_STARTED.
+ *
+ * Returns the handler's result, -EPERM without CAP_SYS_ADMIN for tiers 2
+ * and 3, -EINVAL for tier 3 on an unstarted filesystem, -ENOTTY for an
+ * unknown command.
+ */
+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
+{
+	/* Tier 1: no capability needed. */
+	switch (cmd) {
+	case BCH_IOCTL_QUERY_UUID:
+		return bch2_ioctl_query_uuid(c, arg);
+	case BCH_IOCTL_FS_USAGE:
+		return bch2_ioctl_fs_usage(c, arg);
+	case BCH_IOCTL_DEV_USAGE:
+		return bch2_ioctl_dev_usage(c, arg);
+	default:
+		break;
+	}
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* Tier 2: admin only, filesystem need not be started. */
+	switch (cmd) {
+#if 0
+	case BCH_IOCTL_START:
+		BCH_IOCTL(start, struct bch_ioctl_start);
+	case BCH_IOCTL_STOP:
+		return bch2_ioctl_stop(c);
+#endif
+	case BCH_IOCTL_READ_SUPER:
+		BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+	case BCH_IOCTL_DISK_GET_IDX:
+		BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+	default:
+		break;
+	}
+
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
+		return -EINVAL;
+
+	/* Tier 3: admin only, filesystem must be started. */
+	switch (cmd) {
+	case BCH_IOCTL_DISK_ADD:
+		BCH_IOCTL(disk_add, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_REMOVE:
+		BCH_IOCTL(disk_remove, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_ONLINE:
+		BCH_IOCTL(disk_online, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_OFFLINE:
+		BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
+	case BCH_IOCTL_DISK_SET_STATE:
+		BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
+	case BCH_IOCTL_DATA:
+		BCH_IOCTL(data, struct bch_ioctl_data);
+	case BCH_IOCTL_DISK_RESIZE:
+		BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
+	case BCH_IOCTL_DISK_RESIZE_JOURNAL:
+		BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
+
+	default:
+		return -ENOTTY;
+	}
+}
|
|
+
|
|
+/* Maps a character-device minor number to its struct bch_fs. */
+static DEFINE_IDR(bch_chardev_minor);
+
+/*
+ * unlocked_ioctl entry point for the bcachefs control devices. Minors below
+ * U8_MAX are looked up in bch_chardev_minor; if a filesystem is found the
+ * request goes to bch2_fs_ioctl(), otherwise to bch2_global_ioctl().
+ */
+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
+{
+	void __user *uarg = (void __user *) v;
+	unsigned minor = iminor(file_inode(filp));
+	struct bch_fs *fs = NULL;
+
+	if (minor < U8_MAX)
+		fs = idr_find(&bch_chardev_minor, minor);
+
+	if (!fs)
+		return bch2_global_ioctl(cmd, uarg);
+
+	return bch2_fs_ioctl(fs, cmd, uarg);
+}
|
|
+
|
|
+static const struct file_operations bch_chardev_fops = { /* operations for the control character devices registered in bch2_chardev_init() */
|
|
+ .owner = THIS_MODULE,
|
|
+ .unlocked_ioctl = bch2_chardev_ioctl, /* the only real operation: ioctl dispatch */
|
|
+ .open = nonseekable_open,
|
|
+};
|
|
+
|
|
+static int bch_chardev_major; /* major number returned by register_chrdev(); negative if registration failed */
|
|
+static struct class *bch_chardev_class; /* device class created in bch2_chardev_init(); may hold an ERR_PTR on failure */
|
|
+static struct device *bch_chardev; /* the global control device, created at minor U8_MAX */
|
|
+
|
|
+/*
+ * Undo bch2_fs_chardev_init(): unregister the per-filesystem control device
+ * if one was created and release its minor number if one was allocated.
+ * Safe to call after a partially failed init.
+ */
+void bch2_fs_chardev_exit(struct bch_fs *c)
+{
+	bool have_device = !IS_ERR_OR_NULL(c->chardev);
+
+	if (have_device)
+		device_unregister(c->chardev);
+
+	if (c->minor < 0)
+		return;
+
+	idr_remove(&bch_chardev_minor, c->minor);
+}
|
|
+
|
|
+/*
+ * Allocate a minor number for filesystem @c and create its control device,
+ * named "bcachefs<minor>-ctl".
+ *
+ * Returns 0 on success or a negative errno. On failure c->minor and
+ * c->chardev are left as set here, so bch2_fs_chardev_exit() can clean up.
+ */
+int bch2_fs_chardev_init(struct bch_fs *c)
+{
+	/*
+	 * Keep per-filesystem minors strictly below U8_MAX. Minor U8_MAX is
+	 * the global control device created in bch2_chardev_init(), and
+	 * bch2_chardev_ioctl() only looks up minors < U8_MAX. An unbounded
+	 * allocation (end == 0) could hand out a minor that collides with
+	 * the global device or is never routed to this filesystem.
+	 * idr_alloc()'s end bound is exclusive.
+	 */
+	c->minor = idr_alloc(&bch_chardev_minor, c, 0, U8_MAX, GFP_KERNEL);
+	if (c->minor < 0)
+		return c->minor;
+
+	c->chardev = device_create(bch_chardev_class, NULL,
+				   MKDEV(bch_chardev_major, c->minor), c,
+				   "bcachefs%u-ctl", c->minor);
+	if (IS_ERR(c->chardev))
+		return PTR_ERR(c->chardev);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Module-level teardown of the control device: destroy the global device
+ * and the class if the class was created, then unregister the character
+ * device major if one was obtained. Tolerates a partially failed
+ * bch2_chardev_init().
+ */
+void bch2_chardev_exit(void)
+{
+	if (!IS_ERR_OR_NULL(bch_chardev_class)) {
+		device_destroy(bch_chardev_class,
+			       MKDEV(bch_chardev_major, U8_MAX));
+		class_destroy(bch_chardev_class);
+	}
+
+	if (bch_chardev_major <= 0)
+		return;
+
+	unregister_chrdev(bch_chardev_major, "bcachefs");
+}
|
|
+
|
|
+/*
+ * Module-level setup of the control device: register a dynamically
+ * allocated character-device major, create the "bcachefs" class and the
+ * global "bcachefs-ctl" device at minor U8_MAX.
+ *
+ * Returns 0 on success or a negative errno. Nothing is unwound here on
+ * failure; the globals are left as set so bch2_chardev_exit() can release
+ * whatever was created.
+ */
+int __init bch2_chardev_init(void)
+{
+	int major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops);
+
+	bch_chardev_major = major;
+	if (major < 0)
+		return major;
+
+	bch_chardev_class = class_create(THIS_MODULE, "bcachefs");
+	if (IS_ERR(bch_chardev_class))
+		return PTR_ERR(bch_chardev_class);
+
+	bch_chardev = device_create(bch_chardev_class, NULL,
+				    MKDEV(major, U8_MAX),
+				    NULL, "bcachefs-ctl");
+
+	return IS_ERR(bch_chardev) ? PTR_ERR(bch_chardev) : 0;
+}
|
|
+
|
|
+#endif /* NO_BCACHEFS_CHARDEV */
|
|
diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h
|
|
new file mode 100644
|
|
index 000000000000..3a4890d39ff9
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/chardev.h
|
|
@@ -0,0 +1,31 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_CHARDEV_H
|
|
+#define _BCACHEFS_CHARDEV_H
|
|
+
|
|
+#ifndef NO_BCACHEFS_FS /* NOTE(review): chardev.c is compiled under NO_BCACHEFS_CHARDEV, not NO_BCACHEFS_FS - confirm the two are always defined together */
|
|
+
|
|
+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); /* per-filesystem ioctl dispatcher */
|
|
+
|
|
+void bch2_fs_chardev_exit(struct bch_fs *);
|
|
+int bch2_fs_chardev_init(struct bch_fs *);
|
|
+
|
|
+void bch2_chardev_exit(void);
|
|
+int __init bch2_chardev_init(void);
|
|
+
|
|
+#else /* stubs: ioctls fail with -ENOSYS, init/exit are no-ops */
|
|
+
|
|
+static inline long bch2_fs_ioctl(struct bch_fs *c,
|
|
+ unsigned cmd, void __user * arg)
|
|
+{
|
|
+ return -ENOSYS;
|
|
+}
|
|
+
|
|
+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
|
|
+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
|
|
+
|
|
+static inline void bch2_chardev_exit(void) {}
|
|
+static inline int __init bch2_chardev_init(void) { return 0; }
|
|
+
|
|
+#endif /* NO_BCACHEFS_FS */
|
|
+
|
|
+#endif /* _BCACHEFS_CHARDEV_H */
|
|
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
|
|
new file mode 100644
|
|
index 000000000000..3d88719ba86c
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/checksum.c
|
|
@@ -0,0 +1,618 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "checksum.h"
|
|
+#include "super.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/crc32c.h>
|
|
+#include <linux/crypto.h>
|
|
+#include <linux/key.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/scatterlist.h>
|
|
+#include <crypto/algapi.h>
|
|
+#include <crypto/chacha.h>
|
|
+#include <crypto/hash.h>
|
|
+#include <crypto/poly1305.h>
|
|
+#include <crypto/skcipher.h>
|
|
+#include <keys/user-type.h>
|
|
+
|
|
+/*
+ * Initial accumulator value for a CRC-style checksum of the given type:
+ * all-ones (32 or 64 bit) for the _NONZERO variants, zero for the others.
+ * Any other type is a bug.
+ */
+static u64 bch2_checksum_init(unsigned type)
+{
+	switch (type) {
+	case BCH_CSUM_CRC32C_NONZERO:
+		return U32_MAX;
+	case BCH_CSUM_CRC64_NONZERO:
+		return U64_MAX;
+	case BCH_CSUM_NONE:
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64:
+		return 0;
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Finalise a CRC-style checksum: the _NONZERO variants are XORed with
+ * all-ones of their width, the plain variants are returned unchanged, and
+ * BCH_CSUM_NONE always yields zero. Any other type is a bug.
+ */
+static u64 bch2_checksum_final(unsigned type, u64 crc)
+{
+	switch (type) {
+	case BCH_CSUM_NONE:
+		return 0;
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64:
+		return crc;
+	case BCH_CSUM_CRC32C_NONZERO:
+		return crc ^ U32_MAX;
+	case BCH_CSUM_CRC64_NONZERO:
+		return crc ^ U64_MAX;
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Fold @len bytes at @data into the running checksum @crc: crc32c() for
+ * the CRC32C types, crc64_be() for the CRC64 types, and constant zero for
+ * BCH_CSUM_NONE. Any other type is a bug.
+ */
+static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len)
+{
+	switch (type) {
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC32C_NONZERO:
+		return crc32c(crc, data, len);
+	case BCH_CSUM_CRC64:
+	case BCH_CSUM_CRC64_NONZERO:
+		return crc64_be(crc, data, len);
+	case BCH_CSUM_NONE:
+		return 0;
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Encrypt @len bytes described by scatterlist @sg in place with @tfm, using
+ * @nonce as the IV. A cipher failure is treated as fatal (BUG_ON).
+ */
+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm,
+				 struct nonce nonce,
+				 struct scatterlist *sg, size_t len)
+{
+	SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
+	int err;
+
+	skcipher_request_set_sync_tfm(req, tfm);
+	/* Source and destination are the same list: in-place operation. */
+	skcipher_request_set_crypt(req, sg, sg, len, nonce.d);
+
+	err = crypto_skcipher_encrypt(req);
+	BUG_ON(err);
+}
|
|
+
|
|
+/*
+ * Encrypt a single contiguous buffer in place: wrap @buf in a one-entry
+ * scatterlist and hand it to do_encrypt_sg().
+ */
+static inline void do_encrypt(struct crypto_sync_skcipher *tfm,
+			      struct nonce nonce,
+			      void *buf, size_t len)
+{
+	struct scatterlist one;
+
+	sg_init_one(&one, buf, len);
+
+	do_encrypt_sg(tfm, nonce, &one, len);
+}
|
|
+
|
|
+/*
+ * Encrypt @len bytes at @buf in place with ChaCha20 under @key and @nonce,
+ * using a temporary cipher handle that is freed before returning.
+ *
+ * Returns 0 on success, or a negative errno if the cipher cannot be
+ * allocated or the key cannot be set.
+ */
+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
+			    void *buf, size_t len)
+{
+	struct crypto_sync_skcipher *chacha20 =
+		crypto_alloc_sync_skcipher("chacha20", 0, 0);
+	int ret;
+
+	/*
+	 * The allocator reports failure with ERR_PTR(), never NULL; a NULL
+	 * test would let an error pointer through to be dereferenced below.
+	 */
+	if (IS_ERR(chacha20)) {
+		pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
+		return PTR_ERR(chacha20);
+	}
+
+	ret = crypto_skcipher_setkey(&chacha20->base,
+				     (void *) key, sizeof(*key));
+	if (ret) {
+		pr_err("crypto_skcipher_setkey() error: %i", ret);
+		goto err;
+	}
+
+	do_encrypt(chacha20, nonce, buf, len);
+err:
+	crypto_free_sync_skcipher(chacha20);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Prepare @desc for a Poly1305 MAC: derive a key by encrypting a block of
+ * zeroes with the filesystem's ChaCha20 cipher under @nonce (with
+ * BCH_NONCE_POLY mixed into word 3), then initialise the hash and feed the
+ * key in as its first update.
+ */
+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc,
+			 struct nonce nonce)
+{
+	u8 poly_key[POLY1305_KEY_SIZE] = { 0 };
+
+	nonce.d[3] ^= BCH_NONCE_POLY;
+
+	/* Encrypting zeroes leaves the raw keystream in poly_key. */
+	do_encrypt(c->chacha20, nonce, poly_key, sizeof(poly_key));
+
+	desc->tfm = c->poly1305;
+	crypto_shash_init(desc);
+	crypto_shash_update(desc, poly_key, sizeof(poly_key));
+}
|
|
+
|
|
+/*
+ * Checksum @len bytes at @data with the given checksum type.
+ *
+ * CRC types (and BCH_CSUM_NONE) go through the init/update/final helpers
+ * and store the result little-endian in .lo. The ChaCha20/Poly1305 types
+ * compute a Poly1305 MAC keyed from @nonce and keep bch_crc_bytes[type]
+ * bytes of the digest. Any other type is a bug.
+ */
+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type,
+			      struct nonce nonce, const void *data, size_t len)
+{
+	switch (type) {
+	case BCH_CSUM_NONE:
+	case BCH_CSUM_CRC32C_NONZERO:
+	case BCH_CSUM_CRC64_NONZERO:
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64: {
+		u64 acc;
+
+		acc = bch2_checksum_init(type);
+		acc = bch2_checksum_update(type, acc, data, len);
+		acc = bch2_checksum_final(type, acc);
+
+		return (struct bch_csum) { .lo = cpu_to_le64(acc) };
+	}
+
+	case BCH_CSUM_CHACHA20_POLY1305_80:
+	case BCH_CSUM_CHACHA20_POLY1305_128: {
+		SHASH_DESC_ON_STACK(desc, c->poly1305);
+		u8 mac[POLY1305_DIGEST_SIZE];
+		struct bch_csum out = { 0 };
+
+		gen_poly_key(c, desc, nonce);
+
+		crypto_shash_update(desc, data, len);
+		crypto_shash_final(desc, mac);
+
+		/* Possibly truncated: only bch_crc_bytes[type] bytes kept. */
+		memcpy(&out, mac, bch_crc_bytes[type]);
+		return out;
+	}
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Encrypt @len bytes at @data in place with the filesystem's ChaCha20
+ * cipher, but only if @type is an encrypting checksum type; otherwise the
+ * buffer is left untouched.
+ */
+void bch2_encrypt(struct bch_fs *c, unsigned type,
+		  struct nonce nonce, void *data, size_t len)
+{
+	if (bch2_csum_type_is_encryption(type))
+		do_encrypt(c->chacha20, nonce, data, len);
+}
|
|
+
|
|
+/*
+ * Checksum the part of @bio described by *@iter, advancing *@iter past the
+ * data consumed (the iteration macros are given *iter as both cursor and
+ * start).
+ *
+ * With CONFIG_HIGHMEM each segment is mapped with kmap_atomic(); otherwise
+ * pages are addressed directly through page_address().
+ */
+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type,
+					   struct nonce nonce, struct bio *bio,
+					   struct bvec_iter *iter)
+{
+	struct bio_vec seg;
+
+	switch (type) {
+	case BCH_CSUM_NONE:
+		return (struct bch_csum) { 0 };
+	case BCH_CSUM_CRC32C_NONZERO:
+	case BCH_CSUM_CRC64_NONZERO:
+	case BCH_CSUM_CRC32C:
+	case BCH_CSUM_CRC64: {
+		u64 acc = bch2_checksum_init(type);
+
+#ifdef CONFIG_HIGHMEM
+		__bio_for_each_segment(seg, bio, *iter, *iter) {
+			void *p = kmap_atomic(seg.bv_page) + seg.bv_offset;
+			acc = bch2_checksum_update(type,
+				acc, p, seg.bv_len);
+			kunmap_atomic(p);
+		}
+#else
+		__bio_for_each_bvec(seg, bio, *iter, *iter)
+			acc = bch2_checksum_update(type, acc,
+				page_address(seg.bv_page) + seg.bv_offset,
+				seg.bv_len);
+#endif
+		acc = bch2_checksum_final(type, acc);
+		return (struct bch_csum) { .lo = cpu_to_le64(acc) };
+	}
+
+	case BCH_CSUM_CHACHA20_POLY1305_80:
+	case BCH_CSUM_CHACHA20_POLY1305_128: {
+		SHASH_DESC_ON_STACK(desc, c->poly1305);
+		u8 mac[POLY1305_DIGEST_SIZE];
+		struct bch_csum out = { 0 };
+
+		gen_poly_key(c, desc, nonce);
+
+#ifdef CONFIG_HIGHMEM
+		__bio_for_each_segment(seg, bio, *iter, *iter) {
+			void *p = kmap_atomic(seg.bv_page) + seg.bv_offset;
+
+			crypto_shash_update(desc, p, seg.bv_len);
+			kunmap_atomic(p);
+		}
+#else
+		__bio_for_each_bvec(seg, bio, *iter, *iter)
+			crypto_shash_update(desc,
+				page_address(seg.bv_page) + seg.bv_offset,
+				seg.bv_len);
+#endif
+		crypto_shash_final(desc, mac);
+
+		/* Possibly truncated: only bch_crc_bytes[type] bytes kept. */
+		memcpy(&out, mac, bch_crc_bytes[type]);
+		return out;
+	}
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Checksum all of @bio's remaining data. Works on a private copy of
+ * bio->bi_iter so the bio's own iterator is not advanced.
+ */
+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type,
+				  struct nonce nonce, struct bio *bio)
+{
+	struct bvec_iter cursor = bio->bi_iter;
+	struct bch_csum csum = __bch2_checksum_bio(c, type, nonce, bio, &cursor);
+
+	return csum;
+}
|
|
+
|
|
+/*
+ * Encrypt the data of @bio in place if @type is an encrypting checksum
+ * type.
+ *
+ * Segments are gathered into an on-stack scatterlist of 16 entries. Each
+ * time it fills up the batch is encrypted, the nonce is advanced by the
+ * number of bytes processed, and the list is reused.
+ */
+void bch2_encrypt_bio(struct bch_fs *c, unsigned type,
+		      struct nonce nonce, struct bio *bio)
+{
+	struct bio_vec seg;
+	struct bvec_iter iter;
+	struct scatterlist sgl[16], *next = sgl;
+	size_t batched = 0;
+
+	if (!bch2_csum_type_is_encryption(type))
+		return;
+
+	sg_init_table(sgl, ARRAY_SIZE(sgl));
+
+	bio_for_each_segment(seg, bio, iter) {
+		if (next == sgl + ARRAY_SIZE(sgl)) {
+			/* List full: flush this batch and start over. */
+			sg_mark_end(next - 1);
+			do_encrypt_sg(c->chacha20, nonce, sgl, batched);
+
+			nonce = nonce_add(nonce, batched);
+			batched = 0;
+
+			sg_init_table(sgl, ARRAY_SIZE(sgl));
+			next = sgl;
+		}
+
+		sg_set_page(next++, seg.bv_page, seg.bv_len, seg.bv_offset);
+		batched += seg.bv_len;
+	}
+
+	/* Flush whatever is left in the final batch. */
+	sg_mark_end(next - 1);
+	do_encrypt_sg(c->chacha20, nonce, sgl, batched);
+}
|
|
+
|
|
+/*
+ * Combine checksum @a of some data with checksum @b of the @b_len bytes
+ * that follow it: @a is extended over @b_len zero bytes (fed from the zero
+ * page, at most PAGE_SIZE at a time) and the result is XORed with @b.
+ *
+ * Only valid for checksum types accepted by bch2_checksum_mergeable().
+ */
+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a,
+				    struct bch_csum b, size_t b_len)
+{
+	BUG_ON(!bch2_checksum_mergeable(type));
+
+	while (b_len) {
+		/* Named so it does not shadow parameter 'b'. */
+		unsigned chunk = min_t(unsigned, b_len, PAGE_SIZE);
+
+		a.lo = bch2_checksum_update(type, a.lo,
+				page_address(ZERO_PAGE(0)), chunk);
+		b_len -= chunk;
+	}
+
+	a.lo ^= b.lo;
+	a.hi ^= b.hi;
+	return a;
+}
|
|
+
|
|
+/*
+ * Verify @bio against @crc_old and produce fresh checksums for up to two
+ * leading sub-ranges of it.
+ *
+ * The bio is treated as three consecutive pieces: @len_a sectors, @len_b
+ * sectors, and the remainder. For each piece that has an output pointer
+ * (@crc_a, @crc_b) a new checksum of type @new_csum_type is computed.
+ *
+ * The old checksum is verified either by merging the three piece checksums
+ * (when old and new types match and the type is mergeable) or by
+ * checksumming the whole bio again with the old type.
+ *
+ * Returns 0 on success, -EIO if the data does not match crc_old.csum; the
+ * outputs are only written on success.
+ */
+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
+			struct bversion version,
+			struct bch_extent_crc_unpacked crc_old,
+			struct bch_extent_crc_unpacked *crc_a,
+			struct bch_extent_crc_unpacked *crc_b,
+			unsigned len_a, unsigned len_b,
+			unsigned new_csum_type)
+{
+	struct bvec_iter iter = bio->bi_iter;
+	struct nonce nonce = extent_nonce(version, crc_old);
+	struct bch_csum merged = { 0 };
+	struct crc_split {
+		struct bch_extent_crc_unpacked	*crc;
+		unsigned			len;
+		unsigned			csum_type;
+		struct bch_csum			csum;
+	} splits[3] = {
+		{ .crc = crc_a, .len = len_a, .csum_type = new_csum_type },
+		{ .crc = crc_b, .len = len_b, .csum_type = new_csum_type },
+		{ .crc = NULL,
+		  .len = bio_sectors(bio) - len_a - len_b,
+		  .csum_type = new_csum_type },
+	};
+	struct crc_split *const end = splits + ARRAY_SIZE(splits);
+	struct crc_split *piece;
+	bool mergeable = crc_old.csum_type == new_csum_type &&
+		bch2_checksum_mergeable(new_csum_type);
+	unsigned crc_nonce = crc_old.nonce;
+
+	BUG_ON(len_a + len_b > bio_sectors(bio));
+	BUG_ON(crc_old.uncompressed_size != bio_sectors(bio));
+	BUG_ON(crc_is_compressed(crc_old));
+	BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) !=
+	       bch2_csum_type_is_encryption(new_csum_type));
+
+	/* Pass 1: checksum (or skip over) each piece in order. */
+	for (piece = splits; piece < end; piece++) {
+		iter.bi_size = piece->len << 9;
+
+		if (mergeable || piece->crc) {
+			piece->csum = __bch2_checksum_bio(c, piece->csum_type,
+							  nonce, bio, &iter);
+		} else {
+			bio_advance_iter(bio, &iter, piece->len << 9);
+		}
+
+		nonce = nonce_add(nonce, piece->len << 9);
+	}
+
+	/* Pass 2: reconstruct the checksum of the whole bio. */
+	if (mergeable) {
+		for (piece = splits; piece < end; piece++)
+			merged = bch2_checksum_merge(new_csum_type, merged,
+						     piece->csum,
+						     piece->len << 9);
+	} else {
+		merged = bch2_checksum_bio(c, crc_old.csum_type,
+				extent_nonce(version, crc_old), bio);
+	}
+
+	if (bch2_crc_cmp(merged, crc_old.csum))
+		return -EIO;
+
+	/* Pass 3: publish the new checksums now that the data is verified. */
+	for (piece = splits; piece < end; piece++) {
+		if (piece->crc)
+			*piece->crc = (struct bch_extent_crc_unpacked) {
+				.csum_type		= piece->csum_type,
+				.compression_type	= crc_old.compression_type,
+				.compressed_size	= piece->len,
+				.uncompressed_size	= piece->len,
+				.offset			= 0,
+				.live_size		= piece->len,
+				.nonce			= crc_nonce,
+				.csum			= piece->csum,
+			};
+
+		/* For encrypting types the nonce advances by piece length. */
+		if (bch2_csum_type_is_encryption(new_csum_type))
+			crc_nonce += piece->len;
+	}
+
+	return 0;
+}
|
|
+
|
|
+#ifdef __KERNEL__
|
|
+/*
+ * Kernel variant: fetch the key for this filesystem from the kernel
+ * keyring. The key is of type "logon" and described as
+ * "bcachefs:<user uuid>".
+ *
+ * Returns 0 and fills *@key on success, -EINVAL if the stored payload is
+ * not exactly sizeof(*key) bytes, or the error from request_key().
+ */
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+	char key_description[60];
+	struct key *keyring_key;
+	const struct user_key_payload *payload;
+	int ret = -EINVAL;
+
+	snprintf(key_description, sizeof(key_description),
+		 "bcachefs:%pUb", &sb->user_uuid);
+
+	keyring_key = request_key(&key_type_logon, key_description, NULL);
+	if (IS_ERR(keyring_key))
+		return PTR_ERR(keyring_key);
+
+	/* The payload may only be read with the key's semaphore held. */
+	down_read(&keyring_key->sem);
+	payload = dereference_key_locked(keyring_key);
+	if (payload->datalen == sizeof(*key)) {
+		memcpy(key, payload->data, payload->datalen);
+		ret = 0;
+	}
+	up_read(&keyring_key->sem);
+
+	key_put(keyring_key);
+	return ret;
+}
|
|
+#else
|
|
+#include <keyutils.h>
|
|
+#include <uuid/uuid.h>
|
|
+
|
|
+/*
+ * Userspace variant: fetch the key for this filesystem from the caller's
+ * user keyring. The key is of type "user" and described as
+ * "bcachefs:<user uuid>".
+ *
+ * Returns 0 and fills *@key on success, -errno if the key cannot be found
+ * or read, -EINVAL if the stored payload is not exactly sizeof(*key) bytes
+ * (the same code the kernel variant uses for a length mismatch).
+ */
+int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+{
+	key_serial_t key_id;
+	char key_description[60];
+	char uuid[40];
+	long nread;
+
+	uuid_unparse_lower(sb->user_uuid.b, uuid);
+	/* Bounded formatting, as in the kernel variant. */
+	snprintf(key_description, sizeof(key_description),
+		 "bcachefs:%s", uuid);
+
+	key_id = request_key("user", key_description, NULL,
+			     KEY_SPEC_USER_KEYRING);
+	if (key_id < 0)
+		return -errno;
+
+	nread = keyctl_read(key_id, (void *) key, sizeof(*key));
+	if (nread < 0)
+		return -errno;
+	if (nread != sizeof(*key))
+		return -EINVAL;
+
+	return 0;
+}
|
|
+#endif
|
|
+
|
|
+/*
+ * Produce the filesystem key stored in the superblock's crypt field.
+ *
+ * If the stored key does not carry the plaintext magic it is treated as
+ * encrypted: the user key is requested and used to decrypt a local copy, and
+ * the magic is checked again to detect a wrong user key.
+ *
+ * On success *@key receives the key and 0 is returned; on failure *@key is
+ * left untouched. The local key copies are wiped on every path.
+ */
+int bch2_decrypt_sb_key(struct bch_fs *c,
+			struct bch_sb_field_crypt *crypt,
+			struct bch_key *key)
+{
+	struct bch_encrypted_key sb_key = crypt->key;
+	struct bch_key user_key;
+	int ret = 0;
+
+	if (bch2_key_is_encrypted(&sb_key)) {
+		ret = bch2_request_key(c->disk_sb.sb, &user_key);
+		if (ret) {
+			bch_err(c, "error requesting encryption key: %i", ret);
+			goto wipe;
+		}
+
+		/* decrypt real key: */
+		ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+					      &sb_key, sizeof(sb_key));
+		if (ret)
+			goto wipe;
+
+		/* still no valid magic: the user key did not match */
+		if (bch2_key_is_encrypted(&sb_key)) {
+			bch_err(c, "incorrect encryption key");
+			ret = -EINVAL;
+			goto wipe;
+		}
+	}
+
+	*key = sb_key.key;
+wipe:
+	memzero_explicit(&sb_key, sizeof(sb_key));
+	memzero_explicit(&user_key, sizeof(user_key));
+	return ret;
+}
|
|
+
|
|
+/*
+ * Allocate the chacha20 and poly1305 transforms if @c does not have them yet.
+ *
+ * On failure the corresponding slot is reset to NULL rather than left holding
+ * an error pointer: bch2_checksum_type_valid() only tests c->chacha20 for
+ * NULL, and a later call can then retry the allocation.
+ *
+ * Returns 0 on success or the negative errno from the crypto allocator.
+ */
+static int bch2_alloc_ciphers(struct bch_fs *c)
+{
+	long err;
+
+	if (!c->chacha20) {
+		c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0);
+		if (IS_ERR(c->chacha20)) {
+			err = PTR_ERR(c->chacha20);
+			c->chacha20 = NULL;
+			bch_err(c, "error requesting chacha20 module: %li",
+				err);
+			return err;
+		}
+	}
+
+	if (!c->poly1305) {
+		c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
+		if (IS_ERR(c->poly1305)) {
+			err = PTR_ERR(c->poly1305);
+			c->poly1305 = NULL;
+			bch_err(c, "error requesting poly1305 module: %li",
+				err);
+			return err;
+		}
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Rewrite the superblock so the filesystem key is stored with the plaintext
+ * magic and the encryption type is cleared.
+ *
+ * Returns -EINVAL if the superblock has no crypt field, and 0 without
+ * changing anything if the stored key is encrypted with a user key.
+ * NOTE(review): that second case reports success although nothing was
+ * disabled - confirm this is intended.
+ *
+ * The magic is stored little-endian, matching bch2_key_is_encrypted(), and
+ * the on-stack key copy is wiped before returning.
+ */
+int bch2_disable_encryption(struct bch_fs *c)
+{
+	struct bch_sb_field_crypt *crypt;
+	struct bch_key key;
+	int ret = -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+	if (!crypt)
+		goto out;
+
+	/* is key encrypted? */
+	ret = 0;
+	if (bch2_key_is_encrypted(&crypt->key))
+		goto out;
+
+	ret = bch2_decrypt_sb_key(c, crypt, &key);
+	if (ret)
+		goto out;
+
+	crypt->key.magic	= cpu_to_le64(BCH_KEY_MAGIC);
+	crypt->key.key		= key;
+
+	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+	memzero_explicit(&key, sizeof(key));
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Generate a new random filesystem key, install it in the chacha20 transform
+ * and store it in a newly sized superblock crypt field.
+ *
+ * @keyed: if true, the copy written to the superblock is encrypted with the
+ *	   user key obtained from bch2_request_key().
+ *
+ * The running cipher is always keyed with the plaintext key; only the
+ * superblock copy is encrypted, which is what bch2_decrypt_sb_key() undoes
+ * on the next bch2_fs_encryption_init(). The magic is stored little-endian,
+ * matching bch2_key_is_encrypted().
+ *
+ * Returns 0 on success, -EINVAL if a crypt field already exists, -ENOMEM if
+ * the field cannot be resized, or the error of the failing helper.
+ */
+int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+{
+	struct bch_encrypted_key key;		/* plaintext filesystem key */
+	struct bch_encrypted_key sb_key;	/* form stored in the superblock */
+	struct bch_key user_key;
+	struct bch_sb_field_crypt *crypt;
+	int ret = -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	/* Do we already have an encryption key? */
+	if (bch2_sb_get_crypt(c->disk_sb.sb))
+		goto err;
+
+	ret = bch2_alloc_ciphers(c);
+	if (ret)
+		goto err;
+
+	key.magic = cpu_to_le64(BCH_KEY_MAGIC);
+	get_random_bytes(&key.key, sizeof(key.key));
+	sb_key = key;
+
+	if (keyed) {
+		ret = bch2_request_key(c->disk_sb.sb, &user_key);
+		if (ret) {
+			bch_err(c, "error requesting encryption key: %i", ret);
+			goto err;
+		}
+
+		ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c),
+					      &sb_key, sizeof(sb_key));
+		if (ret)
+			goto err;
+	}
+
+	ret = crypto_skcipher_setkey(&c->chacha20->base,
+			(void *) &key.key, sizeof(key.key));
+	if (ret)
+		goto err;
+
+	crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
+	if (!crypt) {
+		ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
+		goto err;
+	}
+
+	crypt->key = sb_key;
+
+	/* write superblock */
+	SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
+	bch2_write_super(c);
+err:
+	mutex_unlock(&c->sb_lock);
+	memzero_explicit(&user_key, sizeof(user_key));
+	memzero_explicit(&sb_key, sizeof(sb_key));
+	memzero_explicit(&key, sizeof(key));
+	return ret;
+}
|
|
+
|
|
+/*
+ * Release the crypto transforms held by @c. A slot that is NULL or holds an
+ * error pointer is skipped.
+ */
+void bch2_fs_encryption_exit(struct bch_fs *c)
+{
+	if (c->poly1305 && !IS_ERR(c->poly1305))
+		crypto_free_shash(c->poly1305);
+
+	if (c->chacha20 && !IS_ERR(c->chacha20))
+		crypto_free_sync_skcipher(c->chacha20);
+
+	if (c->sha256 && !IS_ERR(c->sha256))
+		crypto_free_shash(c->sha256);
+}
|
|
+
|
|
+/*
+ * Set up the crypto state of @c: always allocate the sha256 transform and,
+ * if the superblock has a crypt field, allocate the ciphers, recover the
+ * filesystem key and install it in the chacha20 transform.
+ *
+ * Returns 0 on success (including when there is no crypt field) or the
+ * negative error of the first step that failed.
+ */
+int bch2_fs_encryption_init(struct bch_fs *c)
+{
+	struct bch_sb_field_crypt *crypt;
+	struct bch_key key;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
+
+	c->sha256 = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(c->sha256)) {
+		bch_err(c, "error requesting sha256 module");
+		ret = PTR_ERR(c->sha256);
+		goto out;
+	}
+
+	/* no crypt field: nothing more to set up */
+	crypt = bch2_sb_get_crypt(c->disk_sb.sb);
+	if (!crypt)
+		goto out;
+
+	ret = bch2_alloc_ciphers(c);
+	if (ret)
+		goto out;
+
+	ret = bch2_decrypt_sb_key(c, crypt, &key);
+	if (ret)
+		goto out;
+
+	ret = crypto_skcipher_setkey(&c->chacha20->base,
+			(void *) &key.key, sizeof(key.key));
+out:
+	/* wipe the key copy whether or not it was ever filled in */
+	memzero_explicit(&key, sizeof(key));
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
+}
|
|
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
|
|
new file mode 100644
|
|
index 000000000000..728b7ef1a149
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/checksum.h
|
|
@@ -0,0 +1,202 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_CHECKSUM_H
|
|
+#define _BCACHEFS_CHECKSUM_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "extents_types.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/crc64.h>
|
|
+#include <crypto/chacha.h>
|
|
+
|
|
+/*
+ * True for the checksum types BCH_CSUM_NONE, BCH_CSUM_CRC32C and
+ * BCH_CSUM_CRC64; false for every other type.
+ */
+static inline bool bch2_checksum_mergeable(unsigned type)
+{
+	return type == BCH_CSUM_NONE ||
+	       type == BCH_CSUM_CRC32C ||
+	       type == BCH_CSUM_CRC64;
+}
|
|
+
|
|
+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum,
|
|
+ struct bch_csum, size_t);
|
|
+
|
|
+/*
+ * Nonce type tags, stored little-endian. extent_nonce() XORs BCH_NONCE_EXTENT
+ * into the last nonce word; the other tags are not referenced in this header.
+ *
+ * The constants are unsigned so that (1U << 31) does not shift into the sign
+ * bit of an int.
+ */
+#define BCH_NONCE_EXTENT	cpu_to_le32(1U << 28)
+#define BCH_NONCE_BTREE		cpu_to_le32(2U << 28)
+#define BCH_NONCE_JOURNAL	cpu_to_le32(3U << 28)
+#define BCH_NONCE_PRIO		cpu_to_le32(4U << 28)
+#define BCH_NONCE_POLY		cpu_to_le32(1U << 31)
|
|
+
|
|
+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
|
|
+ const void *, size_t);
|
|
+
|
|
+/*
+ * This is used for various on disk data structures - bch_sb, prio_set, bset,
+ * jset: The checksum is _always_ the first field of these structs
+ *
+ * Checksums everything from just past (_i)->csum up to vstruct_end(_i).
+ * The temporaries carry a prefix so that they cannot capture a caller's
+ * variables named "start" or "end".
+ */
+#define csum_vstruct(_c, _type, _nonce, _i)				\
+({									\
+	const void *__csum_start =					\
+		((const void *) (_i)) + sizeof((_i)->csum);		\
+	const void *__csum_end = vstruct_end(_i);			\
+									\
+	bch2_checksum(_c, _type, _nonce, __csum_start,			\
+		      __csum_end - __csum_start);			\
+})
|
|
+
|
|
+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
|
|
+int bch2_request_key(struct bch_sb *, struct bch_key *);
|
|
+
|
|
+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce,
|
|
+ void *data, size_t);
|
|
+
|
|
+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned,
|
|
+ struct nonce, struct bio *);
|
|
+
|
|
+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion,
|
|
+ struct bch_extent_crc_unpacked,
|
|
+ struct bch_extent_crc_unpacked *,
|
|
+ struct bch_extent_crc_unpacked *,
|
|
+ unsigned, unsigned, unsigned);
|
|
+
|
|
+void bch2_encrypt_bio(struct bch_fs *, unsigned,
|
|
+ struct nonce, struct bio *);
|
|
+
|
|
+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *,
|
|
+ struct bch_key *);
|
|
+
|
|
+int bch2_disable_encryption(struct bch_fs *);
|
|
+int bch2_enable_encryption(struct bch_fs *, bool);
|
|
+
|
|
+void bch2_fs_encryption_exit(struct bch_fs *);
|
|
+int bch2_fs_encryption_init(struct bch_fs *);
|
|
+
|
|
+/*
+ * Map a checksum option to the checksum type to use.
+ *
+ * For the crc32c and crc64 options, @data selects the plain type and !@data
+ * selects the corresponding _NONZERO type. Any other option value is a BUG().
+ */
+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type,
+							bool data)
+{
+	if (type == BCH_CSUM_OPT_none)
+		return BCH_CSUM_NONE;
+
+	if (type == BCH_CSUM_OPT_crc32c) {
+		if (data)
+			return BCH_CSUM_CRC32C;
+		return BCH_CSUM_CRC32C_NONZERO;
+	}
+
+	if (type == BCH_CSUM_OPT_crc64) {
+		if (data)
+			return BCH_CSUM_CRC64;
+		return BCH_CSUM_CRC64_NONZERO;
+	}
+
+	BUG();
+}
|
|
+
|
|
+/*
+ * Checksum type for data: with encryption enabled this is one of the
+ * chacha20/poly1305 types (128 bit when the wide_macs option is set, 80 bit
+ * otherwise); without encryption it follows the checksum option @opt.
+ */
+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
+							  unsigned opt)
+{
+	if (!c->sb.encryption_type)
+		return bch2_csum_opt_to_type(opt, true);
+
+	if (c->opts.wide_macs)
+		return BCH_CSUM_CHACHA20_POLY1305_128;
+
+	return BCH_CSUM_CHACHA20_POLY1305_80;
+}
|
|
+
|
|
+/*
+ * Checksum type for metadata: the 128 bit chacha20/poly1305 type when
+ * encryption is enabled, otherwise the type selected by the
+ * metadata_checksum option.
+ */
+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
+{
+	return c->sb.encryption_type
+		? BCH_CSUM_CHACHA20_POLY1305_128
+		: bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
+}
|
|
+
|
|
+/*
+ * Lookup table from BCH_COMPRESSION_OPT_<t> to BCH_COMPRESSION_TYPE_<t>,
+ * generated from the BCH_COMPRESSION_OPTS() x-macro list. It is a static
+ * object defined in a header, so each file including it has its own copy.
+ */
+static const unsigned bch2_compression_opt_to_type[] = {
|
|
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
|
|
+ BCH_COMPRESSION_OPTS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+/*
+ * A checksum type is usable if it is below BCH_CSUM_NR and, when it is an
+ * encryption type, the filesystem has a chacha20 transform.
+ */
+static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
+					     unsigned type)
+{
+	return type < BCH_CSUM_NR &&
+	       (!bch2_csum_type_is_encryption(type) || c->chacha20);
+}
|
|
+
|
|
+/*
+ * Compare two checksums; returns true if they differ.
+ *
+ * Both halves are combined with XOR/OR before a single test instead of being
+ * compared one after the other.
+ */
+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
+{
+	/*
+	 * XXX: need some way of preventing the compiler from optimizing this
+	 * into a form that isn't constant time..
+	 */
+	return !!((l.lo ^ r.lo) | (l.hi ^ r.hi));
+}
|
|
+
|
|
+/*
+ * For skipping ahead and encrypting/decrypting at an offset: returns @nonce
+ * with its first word advanced by @offset / CHACHA_BLOCK_SIZE. @offset must
+ * be a multiple of CHACHA_BLOCK_SIZE (checked with EBUG_ON).
+ */
+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset)
+{
+	unsigned nr_blocks = offset / CHACHA_BLOCK_SIZE;
+
+	EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1));
+
+	le32_add_cpu(&nonce.d[0], nr_blocks);
+	return nonce;
+}
|
|
+
|
|
+/* Returns a nonce with every byte set to zero. */
+static inline struct nonce null_nonce(void)
+{
+	struct nonce zero;
+
+	memset(&zero, 0, sizeof(zero));
+
+	return zero;
+}
|
|
+
|
|
+/*
+ * Build the nonce for an extent from its version and crc entry.
+ *
+ * The compression type and uncompressed size only contribute when
+ * crc_is_compressed() says the extent is compressed; the result is then
+ * advanced by crc.nonce sectors via nonce_add().
+ */
+static inline struct nonce extent_nonce(struct bversion version,
+					struct bch_extent_crc_unpacked crc)
+{
+	unsigned compression_type = 0;
+	unsigned size = 0;
+	struct nonce nonce;
+
+	if (crc_is_compressed(crc))
+		compression_type = crc.compression_type;
+
+	if (compression_type)
+		size = crc.uncompressed_size;
+
+	nonce = (struct nonce) {{
+		[0] = cpu_to_le32(size << 22),
+		[1] = cpu_to_le32(version.lo),
+		[2] = cpu_to_le32(version.lo >> 32),
+		[3] = cpu_to_le32(version.hi|
+				  (compression_type << 24))^BCH_NONCE_EXTENT,
+	}};
+
+	return nonce_add(nonce, crc.nonce << 9);
+}
|
|
+
|
|
+/*
+ * A stored key counts as encrypted when its (little-endian) magic field does
+ * not equal BCH_KEY_MAGIC.
+ */
+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key)
+{
+	bool plaintext = le64_to_cpu(key->magic) == BCH_KEY_MAGIC;
+
+	return !plaintext;
+}
|
|
+
|
|
+/*
+ * Nonce used with the superblock key: words 0 and 1 are zero, words 2 and 3
+ * are the two 32 bit halves of the value returned by __bch2_sb_magic(@sb).
+ */
+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb)
+{
+	__le64 magic = __bch2_sb_magic(sb);
+	const __le32 *half = (__le32 *) &magic;
+
+	return (struct nonce) {{
+		[0] = 0,
+		[1] = 0,
+		[2] = half[0],
+		[3] = half[1],
+	}};
+}
|
|
+
|
|
+/*
+ * Nonce used with the superblock key: words 0 and 1 are zero, words 2 and 3
+ * are the two 32 bit halves of the value returned by bch2_sb_magic(@c).
+ */
+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c)
+{
+	__le64 magic = bch2_sb_magic(c);
+	const __le32 *half = (__le32 *) &magic;
+
+	return (struct nonce) {{
+		[0] = 0,
+		[1] = 0,
+		[2] = half[0],
+		[3] = half[1],
+	}};
+}
|
|
+
|
|
+#endif /* _BCACHEFS_CHECKSUM_H */
|
|
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
|
|
new file mode 100644
|
|
index 000000000000..4324cfe7eed0
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/clock.c
|
|
@@ -0,0 +1,191 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "clock.h"
|
|
+
|
|
+#include <linux/freezer.h>
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/preempt.h>
|
|
+
|
|
+/*
+ * Heap comparison callback: orders timers by expiry. The unsigned difference
+ * is returned as a long, so its sign gives the ordering; @h is unused.
+ */
+static inline long io_timer_cmp(io_timer_heap *h,
+				struct io_timer *l,
+				struct io_timer *r)
+{
+	unsigned long delta = l->expire - r->expire;
+
+	return delta;
+}
|
|
+
|
|
+/*
+ * Arm @timer on @clock.
+ *
+ * If the clock has already reached the timer's expiry, the callback is run
+ * immediately (after dropping the lock) and the timer is not queued. A timer
+ * that is already on the heap is left alone; otherwise it is added, and a
+ * failed heap_add() is a BUG().
+ */
+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+{
+	unsigned long now;
+	size_t i;
+
+	spin_lock(&clock->timer_lock);
+
+	now = (unsigned long) atomic64_read(&clock->now);
+	if (time_after_eq(now, timer->expire)) {
+		spin_unlock(&clock->timer_lock);
+		timer->fn(timer);
+		return;
+	}
+
+	/* linear scan for an existing entry */
+	for (i = 0; i < clock->timers.used; i++)
+		if (clock->timers.data[i] == timer)
+			break;
+
+	if (i == clock->timers.used)
+		BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL));
+
+	spin_unlock(&clock->timer_lock);
+}
|
|
+
|
|
+/*
+ * Remove @timer from @clock's heap if it is queued; a timer that is not on
+ * the heap is ignored.
+ */
+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+{
+	size_t i = 0;
+
+	spin_lock(&clock->timer_lock);
+
+	while (i < clock->timers.used &&
+	       clock->timers.data[i] != timer)
+		i++;
+
+	if (i < clock->timers.used)
+		heap_del(&clock->timers, i, io_timer_cmp, NULL);
+
+	spin_unlock(&clock->timer_lock);
+}
|
|
+
|
|
+/*
+ * On-stack state for a task sleeping on an io_clock: either of the two timers
+ * sets @expired and wakes @task (see io_clock_wait_fn() and
+ * io_clock_cpu_timeout()).
+ */
+struct io_clock_wait {
|
|
+ /* timer queued on the io_clock */
+ struct io_timer io_timer;
|
|
+ /* regular kernel timer used as a wall-clock timeout */
+ struct timer_list cpu_timer;
|
|
+ /* task to wake when either timer fires */
+ struct task_struct *task;
|
|
+ /* set to 1 by either timer callback */
+ int expired;
|
|
+};
|
|
+
|
|
+/* io_clock timer callback: mark the enclosing wait as expired, wake its task. */
+static void io_clock_wait_fn(struct io_timer *timer)
+{
+	struct io_clock_wait *w;
+
+	w = container_of(timer, struct io_clock_wait, io_timer);
+	w->expired = 1;
+	wake_up_process(w->task);
+}
|
|
+
|
|
+/* Kernel timer callback: mark the enclosing wait as expired, wake its task. */
+static void io_clock_cpu_timeout(struct timer_list *timer)
+{
+	struct io_clock_wait *w;
+
+	w = container_of(timer, struct io_clock_wait, cpu_timer);
+	w->expired = 1;
+	wake_up_process(w->task);
+}
|
|
+
|
|
+/*
+ * Queue an on-stack timer expiring at @until on @clock, call schedule() once
+ * and remove the timer again.
+ *
+ * NOTE(review): the task state is not changed here before schedule(), so
+ * whether this actually sleeps depends on the state the caller set - confirm
+ * against the callers.
+ */
+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until)
+{
+	struct io_clock_wait wait;
+
+	wait.task		= current;
+	wait.expired		= 0;
+
+	/* XXX: calculate sleep time rigorously */
+	wait.io_timer.expire	= until;
+	wait.io_timer.fn	= io_clock_wait_fn;
+
+	bch2_io_timer_add(clock, &wait.io_timer);
+	schedule();
+	bch2_io_timer_del(clock, &wait.io_timer);
+}
|
|
+
|
|
+/*
+ * Sleep interruptibly until @clock reaches @io_until, @cpu_timeout jiffies
+ * have passed (unless it is MAX_SCHEDULE_TIMEOUT, in which case no CPU timer
+ * is armed), or - when called from a kthread - the kthread is asked to stop.
+ *
+ * Both timers live on the stack and are torn down before returning.
+ */
+void bch2_kthread_io_clock_wait(struct io_clock *clock,
+				unsigned long io_until,
+				unsigned long cpu_timeout)
+{
+	bool is_kthread = !!(current->flags & PF_KTHREAD);
+	struct io_clock_wait wait;
+
+	wait.io_timer.expire	= io_until;
+	wait.io_timer.fn	= io_clock_wait_fn;
+	wait.task		= current;
+	wait.expired		= 0;
+	bch2_io_timer_add(clock, &wait.io_timer);
+
+	timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0);
+
+	if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
+		mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
+
+	for (;;) {
+		/* set the state first so a wakeup before schedule() is not lost */
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		if ((is_kthread && kthread_should_stop()) ||
+		    wait.expired)
+			break;
+
+		schedule();
+		try_to_freeze();
+	}
+
+	__set_current_state(TASK_RUNNING);
+	del_singleshot_timer_sync(&wait.cpu_timer);
+	destroy_timer_on_stack(&wait.cpu_timer);
+	bch2_io_timer_del(clock, &wait.io_timer);
+}
|
|
+
|
|
+/*
+ * Pop and return the timer at the top of @clock's heap if its expiry is at
+ * or before @now; returns NULL if the heap is empty or nothing has expired.
+ */
+static struct io_timer *get_expired_timer(struct io_clock *clock,
+					  unsigned long now)
+{
+	struct io_timer *expired = NULL;
+
+	spin_lock(&clock->timer_lock);
+
+	if (clock->timers.used &&
+	    time_after_eq(now, clock->timers.data[0]->expire))
+		heap_pop(&clock->timers, expired, io_timer_cmp, NULL);
+
+	spin_unlock(&clock->timer_lock);
+
+	return expired;
+}
|
|
+
|
|
+/*
+ * Advance @clock by @sectors and run the callback of every timer that has
+ * expired at the new clock value. Callbacks run without the timer lock held.
+ */
+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
+{
+	unsigned long now = atomic64_add_return(sectors, &clock->now);
+	struct io_timer *timer;
+
+	for (timer = get_expired_timer(clock, now);
+	     timer;
+	     timer = get_expired_timer(clock, now))
+		timer->fn(timer);
+}
|
|
+
|
|
+/*
+ * Print one line per queued timer to @out: the callback symbol and the
+ * distance between its expiry and the current clock value.
+ */
+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
+{
+	unsigned long now;
+	unsigned i;
+
+	spin_lock(&clock->timer_lock);
+	now = atomic64_read(&clock->now);
+
+	for (i = 0; i < clock->timers.used; i++) {
+		struct io_timer *t = clock->timers.data[i];
+
+		pr_buf(out, "%ps:\t%li\n", t->fn, t->expire - now);
+	}
+	spin_unlock(&clock->timer_lock);
+}
|
|
+
|
|
+/* Free the timer heap and the per-cpu sector buffer of @clock. */
+void bch2_io_clock_exit(struct io_clock *clock)
+{
+	free_heap(&clock->timers);
+
+	free_percpu(clock->pcpu_buf);
+}
|
|
+
|
|
+/*
+ * Initialise @clock: zero the clock value, set up the lock, compute max_slop
+ * and allocate the per-cpu buffer and the timer heap.
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails. Nothing is
+ * freed here on failure.
+ */
+int bch2_io_clock_init(struct io_clock *clock)
+{
+	atomic64_set(&clock->now, 0);
+	spin_lock_init(&clock->timer_lock);
+
+	/* each possible cpu may hold back up to IO_CLOCK_PCPU_SECTORS */
+	clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
+
+	clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf);
+	if (!clock->pcpu_buf)
+		return -ENOMEM;
+
+	if (init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL))
+		return 0;
+
+	return -ENOMEM;
+}
|
|
diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h
|
|
new file mode 100644
|
|
index 000000000000..70a0f7436c84
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/clock.h
|
|
@@ -0,0 +1,38 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_CLOCK_H
|
|
+#define _BCACHEFS_CLOCK_H
|
|
+
|
|
+void bch2_io_timer_add(struct io_clock *, struct io_timer *);
|
|
+void bch2_io_timer_del(struct io_clock *, struct io_timer *);
|
|
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
|
|
+ unsigned long);
|
|
+
|
|
+void __bch2_increment_clock(struct io_clock *, unsigned);
|
|
+
|
|
+/*
+ * Account @sectors of IO on the clock selected by @rw.
+ *
+ * Sectors are accumulated in a per-cpu buffer; only once that buffer reaches
+ * IO_CLOCK_PCPU_SECTORS is it emptied into the shared clock.
+ */
+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors,
+					int rw)
+{
+	struct io_clock *clock = &c->io_clock[rw];
+
+	if (likely(this_cpu_add_return(*clock->pcpu_buf, sectors) <
+		   IO_CLOCK_PCPU_SECTORS))
+		return;
+
+	__bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0));
+}
|
|
+
|
|
+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
|
|
+
|
|
+/*
+ * NOTE(review): the expansion refers to "wq", which is neither a macro
+ * parameter nor declared here, and never uses the "clock" parameter. Any use
+ * of this macro therefore depends on a "wq" being in scope at the call site.
+ * TODO: confirm the intended wait queue before using this.
+ */
+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
|
|
+({ \
|
|
+ long __ret = timeout; \
|
|
+ might_sleep(); \
|
|
+ if (!___wait_cond_timeout(condition)) \
|
|
+ __ret = __wait_event_timeout(wq, condition, timeout); \
|
|
+ __ret; \
|
|
+})
|
|
+
|
|
+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *);
|
|
+
|
|
+void bch2_io_clock_exit(struct io_clock *);
|
|
+int bch2_io_clock_init(struct io_clock *);
|
|
+
|
|
+#endif /* _BCACHEFS_CLOCK_H */
|
|
diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h
|
|
new file mode 100644
|
|
index 000000000000..5fae0012d808
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/clock_types.h
|
|
@@ -0,0 +1,37 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_CLOCK_TYPES_H
|
|
+#define _BCACHEFS_CLOCK_TYPES_H
|
|
+
|
|
+#include "util.h"
|
|
+
|
|
+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
|
|
+
|
|
+/*
|
|
+ * Clocks/timers in units of sectors of IO:
|
|
+ *
|
|
+ * Note - they use percpu batching, so they're only approximate.
|
|
+ */
|
|
+
|
|
+struct io_timer;
|
|
+typedef void (*io_timer_fn)(struct io_timer *);
|
|
+
|
|
+/* A callback that runs once an io_clock reaches a given value. */
+struct io_timer {
|
|
+ /* called with the timer itself as its argument */
+ io_timer_fn fn;
|
|
+ /* clock value at or after which @fn is run */
+ unsigned long expire;
|
|
+};
|
|
+
|
|
+/* Amount to buffer up on a percpu counter */
|
|
+#define IO_CLOCK_PCPU_SECTORS 128
|
|
+
|
|
+typedef HEAP(struct io_timer *) io_timer_heap;
|
|
+
|
|
+/* A clock that advances with sectors of IO, plus the timers queued on it. */
+struct io_clock {
|
|
+ /* current clock value */
+ atomic64_t now;
|
|
+ /* per-cpu count of sectors not yet added to @now */
+ u16 __percpu *pcpu_buf;
|
|
+ /* IO_CLOCK_PCPU_SECTORS times the number of possible cpus */
+ unsigned max_slop;
|
|
+
|
|
+ /* taken around every access to @timers */
+ spinlock_t timer_lock;
|
|
+ /* heap of pending timers ordered by expiry */
+ io_timer_heap timers;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_CLOCK_TYPES_H */
|
|
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
|
|
new file mode 100644
|
|
index 000000000000..f63651d291e5
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/compress.c
|
|
@@ -0,0 +1,640 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "checksum.h"
|
|
+#include "compress.h"
|
|
+#include "extents.h"
|
|
+#include "io.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/lz4.h>
|
|
+#include <linux/zlib.h>
|
|
+#include <linux/zstd.h>
|
|
+
|
|
+/* Bounce buffer: */
|
|
+/*
+ * A linear view of bio data and how it was obtained, so that
+ * bio_unmap_or_unbounce() knows how to release it.
+ */
+struct bbuf {
|
|
+ /* start of the linear buffer */
+ void *b;
|
|
+ enum {
|
|
+ /* points directly at the bio's pages; nothing to release */
+ BB_NONE,
|
|
+ /* pages mapped with vmap(); released with vunmap() */
+ BB_VMAP,
|
|
+ /* bounce buffer from kmalloc(); released with kfree() */
+ BB_KMALLOC,
|
|
+ /* bounce buffer from the compression_bounce[rw] mempool */
+ BB_MEMPOOL,
|
|
+ } type;
|
|
+ /* READ or WRITE; selects the mempool for BB_MEMPOOL buffers */
+ int rw;
|
|
+};
|
|
+
|
|
+/*
+ * Allocate a bounce buffer of @size bytes: kmalloc() is tried first, the
+ * compression_bounce[@rw] mempool is the fallback.
+ *
+ * @size must not exceed encoded_extent_max sectors; failing both allocations
+ * is a BUG().
+ */
+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw)
+{
+	void *buf;
+
+	BUG_ON(size > c->sb.encoded_extent_max << 9);
+
+	buf = kmalloc(size, GFP_NOIO|__GFP_NOWARN);
+	if (buf)
+		return (struct bbuf) { .b = buf, .type = BB_KMALLOC, .rw = rw };
+
+	buf = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO);
+	if (!buf)
+		BUG();
+
+	return (struct bbuf) { .b = buf, .type = BB_MEMPOOL, .rw = rw };
+}
|
|
+
|
|
+/*
+ * Returns true if every bvec of @bio, iterated from @start, begins at the
+ * kernel address where the previous one ended.
+ */
+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start)
+{
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	void *next = NULL;
+
+	__bio_for_each_bvec(bv, bio, iter, start) {
+		void *seg = page_address(bv.bv_page) + bv.bv_offset;
+
+		if (next && next != seg)
+			return false;
+
+		next = seg + bv.bv_len;
+	}
+
+	return true;
+}
|
|
+
|
|
+/*
+ * Get a linear buffer covering the data of @bio from @start.
+ *
+ * Three strategies are tried in order:
+ *  - use the pages directly if the first page is not highmem and the data
+ *    is already contiguous (BB_NONE);
+ *  - vmap() the pages if every segment lines up on page boundaries (BB_VMAP);
+ *  - otherwise allocate a bounce buffer, filled from the bio when @rw is
+ *    READ.
+ *
+ * The result must be released with bio_unmap_or_unbounce().
+ */
+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio,
+				       struct bvec_iter start, int rw)
+{
+	struct page *onstack[16];
+	struct page **pagev = NULL;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	struct bbuf ret;
+	unsigned nr = 0;
+	void *vaddr;
+
+	BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max);
+
+	if (!PageHighMem(bio_iter_page(bio, start)) &&
+	    bio_phys_contig(bio, start))
+		return (struct bbuf) {
+			.b = page_address(bio_iter_page(bio, start)) +
+				bio_iter_offset(bio, start),
+			.type = BB_NONE, .rw = rw
+		};
+
+	/* check if we can map the pages contiguously: */
+	__bio_for_each_segment(bv, bio, iter, start) {
+		/* every segment after the first must start at offset 0 */
+		if (iter.bi_size != start.bi_size &&
+		    bv.bv_offset)
+			goto bounce;
+
+		/* every segment before the last must end on a page boundary */
+		if (bv.bv_len < iter.bi_size &&
+		    bv.bv_offset + bv.bv_len < PAGE_SIZE)
+			goto bounce;
+
+		nr++;
+	}
+
+	BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr);
+
+	if (nr > ARRAY_SIZE(onstack)) {
+		pagev = kmalloc_array(nr, sizeof(struct page *), GFP_NOIO);
+		if (!pagev)
+			goto bounce;
+	} else {
+		pagev = onstack;
+	}
+
+	nr = 0;
+	__bio_for_each_segment(bv, bio, iter, start)
+		pagev[nr++] = bv.bv_page;
+
+	vaddr = vmap(pagev, nr, VM_MAP, PAGE_KERNEL);
+	if (pagev != onstack)
+		kfree(pagev);
+
+	if (vaddr)
+		return (struct bbuf) {
+			.b = vaddr + bio_iter_offset(bio, start),
+			.type = BB_VMAP, .rw = rw
+		};
+bounce:
+	ret = __bounce_alloc(c, start.bi_size, rw);
+
+	if (rw == READ)
+		memcpy_from_bio(ret.b, bio, start);
+
+	return ret;
+}
|
|
+
|
|
+/* __bio_map_or_bounce() starting at the bio's own iterator position. */
+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw)
+{
+	struct bvec_iter start = bio->bi_iter;
+
+	return __bio_map_or_bounce(c, bio, start, rw);
+}
|
|
+
|
|
+/*
+ * Release a buffer obtained from __bio_map_or_bounce() or __bounce_alloc()
+ * according to its type. BB_NONE buffers need no cleanup.
+ */
+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf)
+{
+	if (buf.type == BB_VMAP) {
+		/* the mapping starts at the page containing buf.b */
+		vunmap((void *) ((unsigned long) buf.b & PAGE_MASK));
+	} else if (buf.type == BB_KMALLOC) {
+		kfree(buf.b);
+	} else if (buf.type == BB_MEMPOOL) {
+		mempool_free(buf.b, &c->compression_bounce[buf.rw]);
+	}
+}
|
|
+
|
|
+/*
+ * Attach @workspace to @strm. Only kernel builds do anything here; outside
+ * __KERNEL__ this is a no-op.
+ */
+static inline void zlib_set_workspace(z_stream *strm, void *workspace)
+{
+#ifdef __KERNEL__
+	strm->workspace = workspace;
+#endif
+}
|
|
+
|
|
+/*
+ * Decompress the data of @src into @dst_data, which must have room for
+ * crc.uncompressed_size sectors.
+ *
+ * Returns 0 if exactly the expected amount of data was produced, -EIO
+ * otherwise. An unknown compression type is a BUG().
+ */
+static int __bio_uncompress(struct bch_fs *c, struct bio *src,
+			    void *dst_data, struct bch_extent_crc_unpacked crc)
+{
+	struct bbuf src_data = { NULL };
+	size_t src_len = src->bi_iter.bi_size;
+	size_t dst_len = crc.uncompressed_size << 9;
+	void *workspace;
+	int ret;
+
+	src_data = bio_map_or_bounce(c, src, READ);
+
+	switch (crc.compression_type) {
+	case BCH_COMPRESSION_TYPE_lz4_old:
+	case BCH_COMPRESSION_TYPE_lz4:
+		ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
+						  src_len, dst_len, dst_len);
+		if (ret != dst_len)
+			goto err;
+		break;
+	case BCH_COMPRESSION_TYPE_gzip: {
+		z_stream strm = {
+			.next_in	= src_data.b,
+			.avail_in	= src_len,
+			.next_out	= dst_data,
+			.avail_out	= dst_len,
+		};
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+
+		zlib_set_workspace(&strm, workspace);
+
+		/* only inflate if the stream was set up successfully */
+		ret = zlib_inflateInit2(&strm, -MAX_WBITS);
+		if (ret == Z_OK)
+			ret = zlib_inflate(&strm, Z_FINISH);
+
+		mempool_free(workspace, &c->decompress_workspace);
+
+		if (ret != Z_STREAM_END)
+			goto err;
+		break;
+	}
+	case BCH_COMPRESSION_TYPE_zstd: {
+		ZSTD_DCtx *ctx;
+		size_t real_src_len;
+
+		/*
+		 * The first 4 bytes hold the exact compressed length (see
+		 * attempt_compress()); make sure they exist before reading
+		 * them, and that the length fits in what follows.
+		 */
+		if (src_len < 4)
+			goto err;
+
+		real_src_len = le32_to_cpup(src_data.b);
+		if (real_src_len > src_len - 4)
+			goto err;
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+		ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+
+		ret = ZSTD_decompressDCtx(ctx,
+				dst_data,	dst_len,
+				src_data.b + 4, real_src_len);
+
+		mempool_free(workspace, &c->decompress_workspace);
+
+		if (ret != dst_len)
+			goto err;
+		break;
+	}
+	default:
+		BUG();
+	}
+	ret = 0;
+out:
+	bio_unmap_or_unbounce(c, src_data);
+	return ret;
+err:
+	ret = -EIO;
+	goto out;
+}
|
|
+
|
|
+/*
+ * Replace the compressed contents of @bio with the live part of the
+ * decompressed data and update *@crc to describe an uncompressed,
+ * unchecksummed extent of crc->live_size sectors.
+ *
+ * Returns 0 on success, -EIO if the extent is larger than
+ * encoded_extent_max or decompression fails.
+ */
+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
+				struct bch_extent_crc_unpacked *crc)
+{
+	size_t uncompressed_bytes = crc->uncompressed_size << 9;
+	struct bbuf data = { NULL };
+
+	/* bio must own its pages: */
+	BUG_ON(!bio->bi_vcnt);
+	BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs);
+
+	if (crc->uncompressed_size	> c->sb.encoded_extent_max ||
+	    crc->compressed_size	> c->sb.encoded_extent_max) {
+		bch_err(c, "error rewriting existing data: extent too big");
+		return -EIO;
+	}
+
+	data = __bounce_alloc(c, uncompressed_bytes, WRITE);
+
+	if (__bio_uncompress(c, bio, data.b, *crc)) {
+		bch_err(c, "error rewriting existing data: decompression error");
+		bio_unmap_or_unbounce(c, data);
+		return -EIO;
+	}
+
+	/*
+	 * XXX: don't have a good way to assert that the bio was allocated with
+	 * enough space, we depend on bch2_move_extent doing the right thing
+	 */
+	bio->bi_iter.bi_size = crc->live_size << 9;
+
+	/* copy only the live range, which starts crc->offset sectors in */
+	memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9));
+
+	crc->csum_type		= 0;
+	crc->compression_type	= 0;
+	crc->compressed_size	= crc->live_size;
+	crc->uncompressed_size	= crc->live_size;
+	crc->offset		= 0;
+	crc->csum		= (struct bch_csum) { 0, 0 };
+
+	bio_unmap_or_unbounce(c, data);
+	return 0;
+}
|
|
+
|
|
+/*
+ * Decompress @src into @dst at @dst_iter.
+ *
+ * When the destination range is exactly the uncompressed size it is mapped
+ * (or bounced) directly; otherwise the data is decompressed into a bounce
+ * buffer and the part starting crc.offset sectors in is copied out.
+ *
+ * Returns 0 on success, -EIO if the extent exceeds encoded_extent_max, or
+ * the error from __bio_uncompress().
+ */
+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src,
+			struct bio *dst, struct bvec_iter dst_iter,
+			struct bch_extent_crc_unpacked crc)
+{
+	size_t uncompressed_bytes = crc.uncompressed_size << 9;
+	struct bbuf dst_data = { NULL };
+	int ret;
+
+	if (crc.uncompressed_size	> c->sb.encoded_extent_max ||
+	    crc.compressed_size		> c->sb.encoded_extent_max)
+		return -EIO;
+
+	if (uncompressed_bytes == dst_iter.bi_size)
+		dst_data = __bio_map_or_bounce(c, dst, dst_iter, WRITE);
+	else
+		dst_data = __bounce_alloc(c, uncompressed_bytes, WRITE);
+
+	ret = __bio_uncompress(c, src, dst_data.b, crc);
+
+	/* bounce buffers still need to be copied into the destination bio */
+	if (!ret &&
+	    dst_data.type != BB_NONE &&
+	    dst_data.type != BB_VMAP)
+		memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9));
+
+	bio_unmap_or_unbounce(c, dst_data);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Try once to compress @src_len bytes from @src into at most @dst_len bytes
+ * at @dst, using @workspace as scratch memory.
+ *
+ * Returns the number of bytes written to @dst on success. For gzip and zstd
+ * a return of 0 means the attempt failed. For lz4, a negative return means
+ * only part of the input fit, and its magnitude is the number of input bytes
+ * that were consumed. An unknown compression type is a BUG().
+ */
+static int attempt_compress(struct bch_fs *c,
+			    void *workspace,
+			    void *dst, size_t dst_len,
+			    void *src, size_t src_len,
+			    enum bch_compression_type compression_type)
+{
+	switch (compression_type) {
+	case BCH_COMPRESSION_TYPE_lz4: {
+		int consumed = src_len;
+		int written = LZ4_compress_destSize(
+				src,		dst,
+				&consumed,	dst_len,
+				workspace);
+
+		if (consumed < src_len)
+			return -consumed;
+
+		return written;
+	}
+	case BCH_COMPRESSION_TYPE_gzip: {
+		z_stream strm = {
+			.next_in	= src,
+			.avail_in	= src_len,
+			.next_out	= dst,
+			.avail_out	= dst_len,
+		};
+
+		zlib_set_workspace(&strm, workspace);
+		zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+				  Z_DEFAULT_STRATEGY);
+
+		if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END ||
+		    zlib_deflateEnd(&strm) != Z_OK)
+			return 0;
+
+		return strm.total_out;
+	}
+	case BCH_COMPRESSION_TYPE_zstd: {
+		ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
+			ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+		size_t compressed;
+
+		/*
+		 * ZSTD requires that when we decompress we pass in the exact
+		 * compressed size - rounding it up to the nearest sector
+		 * doesn't work, so we use the first 4 bytes of the buffer for
+		 * that.
+		 *
+		 * Additionally, the ZSTD code seems to have a bug where it will
+		 * write just past the end of the buffer - so subtract a fudge
+		 * factor (7 bytes) from the dst buffer size to account for
+		 * that.
+		 */
+		compressed = ZSTD_compressCCtx(ctx,
+				dst + 4,	dst_len - 4 - 7,
+				src,		src_len,
+				c->zstd_params);
+		if (ZSTD_isError(compressed))
+			return 0;
+
+		*((__le32 *) dst) = cpu_to_le32(compressed);
+		return compressed + 4;
+	}
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Compress as much of @src as fits into @dst.
+ *
+ * On return *@src_len is the number of input bytes consumed and *@dst_len the
+ * number of output bytes produced, zero-padded up to a block boundary.
+ *
+ * Returns @compression_type on success, 0 without touching the lengths if
+ * @src is no larger than one block, or BCH_COMPRESSION_TYPE_incompressible
+ * if the data could not be made smaller.
+ */
+static unsigned __bio_compress(struct bch_fs *c,
+			       struct bio *dst, size_t *dst_len,
+			       struct bio *src, size_t *src_len,
+			       enum bch_compression_type compression_type)
+{
+	struct bbuf src_buf = { NULL }, dst_buf = { NULL };
+	void *workspace;
+	unsigned pad;
+	int ret = 0;
+
+	BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR);
+	BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type]));
+
+	/* If it's only one block, don't bother trying to compress: */
+	if (bio_sectors(src) <= c->opts.block_size)
+		return 0;
+
+	dst_buf = bio_map_or_bounce(c, dst, WRITE);
+	src_buf = bio_map_or_bounce(c, src, READ);
+
+	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
+
+	*src_len = src->bi_iter.bi_size;
+	*dst_len = dst->bi_iter.bi_size;
+
+	/*
+	 * XXX: this algorithm sucks when the compression code doesn't tell us
+	 * how much would fit, like LZ4 does:
+	 */
+	for (;;) {
+		if (*src_len <= block_bytes(c)) {
+			ret = -1;
+			break;
+		}
+
+		ret = attempt_compress(c, workspace,
+				       dst_buf.b,	*dst_len,
+				       src_buf.b,	*src_len,
+				       compression_type);
+		if (ret > 0) {
+			*dst_len = ret;
+			ret = 0;
+			break;
+		}
+
+		/* Didn't fit: should we retry with a smaller amount? */
+		if (*src_len <= *dst_len) {
+			ret = -1;
+			break;
+		}
+
+		/*
+		 * If ret is negative, it's a hint as to how much data would fit
+		 */
+		BUG_ON(-ret >= *src_len);
+
+		if (ret < 0)
+			*src_len = -ret;
+		else
+			*src_len -= (*src_len - *dst_len) / 2;
+		*src_len = round_down(*src_len, block_bytes(c));
+	}
+
+	mempool_free(workspace, &c->compress_workspace[compression_type]);
+
+	if (ret)
+		goto err;
+
+	/* Didn't get smaller: */
+	if (round_up(*dst_len, block_bytes(c)) >= *src_len)
+		goto err;
+
+	/* zero-fill the output up to the next block boundary */
+	pad = round_up(*dst_len, block_bytes(c)) - *dst_len;
+
+	memset(dst_buf.b + *dst_len, 0, pad);
+	*dst_len += pad;
+
+	if (dst_buf.type != BB_NONE &&
+	    dst_buf.type != BB_VMAP)
+		memcpy_to_bio(dst, dst->bi_iter, dst_buf.b);
+
+	BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size);
+	BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size);
+	BUG_ON(*dst_len & (block_bytes(c) - 1));
+	BUG_ON(*src_len & (block_bytes(c) - 1));
+out:
+	bio_unmap_or_unbounce(c, src_buf);
+	bio_unmap_or_unbounce(c, dst_buf);
+	return compression_type;
+err:
+	compression_type = BCH_COMPRESSION_TYPE_incompressible;
+	goto out;
+}
|
|
+
|
|
+unsigned bch2_bio_compress(struct bch_fs *c,
|
|
+ struct bio *dst, size_t *dst_len,
|
|
+ struct bio *src, size_t *src_len,
|
|
+ unsigned compression_type)
|
|
+{
|
|
+ unsigned orig_dst = dst->bi_iter.bi_size;
|
|
+ unsigned orig_src = src->bi_iter.bi_size;
|
|
+
|
|
+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */
|
|
+ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size,
|
|
+ c->sb.encoded_extent_max << 9);
|
|
+ /* Don't generate a bigger output than input: */
|
|
+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
|
|
+
|
|
+ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old)
|
|
+ compression_type = BCH_COMPRESSION_TYPE_lz4;
|
|
+
|
|
+ compression_type =
|
|
+ __bio_compress(c, dst, dst_len, src, src_len, compression_type);
|
|
+
|
|
+ dst->bi_iter.bi_size = orig_dst;
|
|
+ src->bi_iter.bi_size = orig_src;
|
|
+ return compression_type;
|
|
+}
|
|
+
|
|
+static int __bch2_fs_compress_init(struct bch_fs *, u64);
|
|
+
|
|
+#define BCH_FEATURE_none 0
|
|
+
|
|
+static const unsigned bch2_compression_opt_to_feature[] = {
|
|
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
|
|
+ BCH_COMPRESSION_OPTS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+#undef BCH_FEATURE_none
|
|
+
|
|
+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ if ((c->sb.features & f) == f)
|
|
+ return 0;
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+
|
|
+ if ((c->sb.features & f) == f) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_fs_compress_init(c, c->sb.features|f);
|
|
+ if (ret) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ c->disk_sb.sb->features[0] |= cpu_to_le64(f);
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_check_set_has_compressed_data(struct bch_fs *c,
|
|
+ unsigned compression_type)
|
|
+{
|
|
+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
|
|
+
|
|
+ return compression_type
|
|
+ ? __bch2_check_set_has_compressed_data(c,
|
|
+ 1ULL << bch2_compression_opt_to_feature[compression_type])
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+void bch2_fs_compress_exit(struct bch_fs *c)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ mempool_exit(&c->decompress_workspace);
|
|
+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
|
|
+ mempool_exit(&c->compress_workspace[i]);
|
|
+ mempool_exit(&c->compression_bounce[WRITE]);
|
|
+ mempool_exit(&c->compression_bounce[READ]);
|
|
+}
|
|
+
|
|
+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
|
|
+{
|
|
+ size_t max_extent = c->sb.encoded_extent_max << 9;
|
|
+ size_t decompress_workspace_size = 0;
|
|
+ bool decompress_workspace_needed;
|
|
+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
|
|
+ struct {
|
|
+ unsigned feature;
|
|
+ unsigned type;
|
|
+ size_t compress_workspace;
|
|
+ size_t decompress_workspace;
|
|
+ } compression_types[] = {
|
|
+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 },
|
|
+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip,
|
|
+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
|
|
+ zlib_inflate_workspacesize(), },
|
|
+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd,
|
|
+ ZSTD_CCtxWorkspaceBound(params.cParams),
|
|
+ ZSTD_DCtxWorkspaceBound() },
|
|
+ }, *i;
|
|
+ int ret = 0;
|
|
+
|
|
+ pr_verbose_init(c->opts, "");
|
|
+
|
|
+ c->zstd_params = params;
|
|
+
|
|
+ for (i = compression_types;
|
|
+ i < compression_types + ARRAY_SIZE(compression_types);
|
|
+ i++)
|
|
+ if (features & (1 << i->feature))
|
|
+ goto have_compressed;
|
|
+
|
|
+ goto out;
|
|
+have_compressed:
|
|
+
|
|
+ if (!mempool_initialized(&c->compression_bounce[READ])) {
|
|
+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
|
|
+ 1, max_extent);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (!mempool_initialized(&c->compression_bounce[WRITE])) {
|
|
+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
|
|
+ 1, max_extent);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ for (i = compression_types;
|
|
+ i < compression_types + ARRAY_SIZE(compression_types);
|
|
+ i++) {
|
|
+ decompress_workspace_size =
|
|
+ max(decompress_workspace_size, i->decompress_workspace);
|
|
+
|
|
+ if (!(features & (1 << i->feature)))
|
|
+ continue;
|
|
+
|
|
+ if (i->decompress_workspace)
|
|
+ decompress_workspace_needed = true;
|
|
+
|
|
+ if (mempool_initialized(&c->compress_workspace[i->type]))
|
|
+ continue;
|
|
+
|
|
+ ret = mempool_init_kvpmalloc_pool(
|
|
+ &c->compress_workspace[i->type],
|
|
+ 1, i->compress_workspace);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (!mempool_initialized(&c->decompress_workspace)) {
|
|
+ ret = mempool_init_kvpmalloc_pool(
|
|
+ &c->decompress_workspace,
|
|
+ 1, decompress_workspace_size);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+ }
|
|
+out:
|
|
+ pr_verbose_init(c->opts, "ret %i", ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_fs_compress_init(struct bch_fs *c)
|
|
+{
|
|
+ u64 f = c->sb.features;
|
|
+
|
|
+ if (c->opts.compression)
|
|
+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression];
|
|
+
|
|
+ if (c->opts.background_compression)
|
|
+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression];
|
|
+
|
|
+ return __bch2_fs_compress_init(c, f);
|
|
+
|
|
+}
|
|
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
|
|
new file mode 100644
|
|
index 000000000000..4bab1f61b3b5
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/compress.h
|
|
@@ -0,0 +1,18 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_COMPRESS_H
|
|
+#define _BCACHEFS_COMPRESS_H
|
|
+
|
|
+#include "extents_types.h"
|
|
+
|
|
+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
|
|
+ struct bch_extent_crc_unpacked *);
|
|
+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
|
|
+ struct bvec_iter, struct bch_extent_crc_unpacked);
|
|
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
|
|
+ struct bio *, size_t *, unsigned);
|
|
+
|
|
+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
|
|
+void bch2_fs_compress_exit(struct bch_fs *);
|
|
+int bch2_fs_compress_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_COMPRESS_H */
|
|
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
|
|
new file mode 100644
|
|
index 000000000000..4215c119e0a2
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/debug.c
|
|
@@ -0,0 +1,476 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Assorted bcachefs debug code
|
|
+ *
|
|
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "debug.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "fsck.h"
|
|
+#include "inode.h"
|
|
+#include "io.h"
|
|
+#include "super.h"
|
|
+
|
|
+#include <linux/console.h>
|
|
+#include <linux/debugfs.h>
|
|
+#include <linux/module.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/seq_file.h>
|
|
+
|
|
+static struct dentry *bch_debug;
|
|
+
|
|
+static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
|
|
+ struct extent_ptr_decoded pick)
|
|
+{
|
|
+ struct btree *v = c->verify_data;
|
|
+ struct btree_node *n_ondisk = c->verify_ondisk;
|
|
+ struct btree_node *n_sorted = c->verify_data->data;
|
|
+ struct bset *sorted, *inmemory = &b->data->keys;
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
|
|
+ struct bio *bio;
|
|
+ bool failed = false;
|
|
+
|
|
+ if (!bch2_dev_get_ioref(ca, READ))
|
|
+ return false;
|
|
+
|
|
+ bio = bio_alloc_bioset(GFP_NOIO,
|
|
+ buf_pages(n_sorted, btree_bytes(c)),
|
|
+ &c->btree_bio);
|
|
+ bio_set_dev(bio, ca->disk_sb.bdev);
|
|
+ bio->bi_opf = REQ_OP_READ|REQ_META;
|
|
+ bio->bi_iter.bi_sector = pick.ptr.offset;
|
|
+ bch2_bio_map(bio, n_sorted, btree_bytes(c));
|
|
+
|
|
+ submit_bio_wait(bio);
|
|
+
|
|
+ bio_put(bio);
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+
|
|
+ memcpy(n_ondisk, n_sorted, btree_bytes(c));
|
|
+
|
|
+ v->written = 0;
|
|
+ if (bch2_btree_node_read_done(c, ca, v, false))
|
|
+ return false;
|
|
+
|
|
+ n_sorted = c->verify_data->data;
|
|
+ sorted = &n_sorted->keys;
|
|
+
|
|
+ if (inmemory->u64s != sorted->u64s ||
|
|
+ memcmp(inmemory->start,
|
|
+ sorted->start,
|
|
+ vstruct_end(inmemory) - (void *) inmemory->start)) {
|
|
+ unsigned offset = 0, sectors;
|
|
+ struct bset *i;
|
|
+ unsigned j;
|
|
+
|
|
+ console_lock();
|
|
+
|
|
+ printk(KERN_ERR "*** in memory:\n");
|
|
+ bch2_dump_bset(c, b, inmemory, 0);
|
|
+
|
|
+ printk(KERN_ERR "*** read back in:\n");
|
|
+ bch2_dump_bset(c, v, sorted, 0);
|
|
+
|
|
+ while (offset < v->written) {
|
|
+ if (!offset) {
|
|
+ i = &n_ondisk->keys;
|
|
+ sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
|
|
+ c->block_bits;
|
|
+ } else {
|
|
+ struct btree_node_entry *bne =
|
|
+ (void *) n_ondisk + (offset << 9);
|
|
+ i = &bne->keys;
|
|
+
|
|
+ sectors = vstruct_blocks(bne, c->block_bits) <<
|
|
+ c->block_bits;
|
|
+ }
|
|
+
|
|
+ printk(KERN_ERR "*** on disk block %u:\n", offset);
|
|
+ bch2_dump_bset(c, b, i, offset);
|
|
+
|
|
+ offset += sectors;
|
|
+ }
|
|
+
|
|
+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
|
|
+ if (inmemory->_data[j] != sorted->_data[j])
|
|
+ break;
|
|
+
|
|
+ console_unlock();
|
|
+ bch_err(c, "verify failed at key %u", j);
|
|
+
|
|
+ failed = true;
|
|
+ }
|
|
+
|
|
+ if (v->written != b->written) {
|
|
+ bch_err(c, "written wrong: expected %u, got %u",
|
|
+ b->written, v->written);
|
|
+ failed = true;
|
|
+ }
|
|
+
|
|
+ return failed;
|
|
+}
|
|
+
|
|
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs;
|
|
+ struct extent_ptr_decoded p;
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct btree *v;
|
|
+ struct bset *inmemory = &b->data->keys;
|
|
+ struct bkey_packed *k;
|
|
+ bool failed = false;
|
|
+
|
|
+ if (c->opts.nochanges)
|
|
+ return;
|
|
+
|
|
+ btree_node_io_lock(b);
|
|
+ mutex_lock(&c->verify_lock);
|
|
+
|
|
+ if (!c->verify_ondisk) {
|
|
+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
|
|
+ if (!c->verify_ondisk)
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (!c->verify_data) {
|
|
+ c->verify_data = __bch2_btree_node_mem_alloc(c);
|
|
+ if (!c->verify_data)
|
|
+ goto out;
|
|
+
|
|
+ list_del_init(&c->verify_data->list);
|
|
+ }
|
|
+
|
|
+ BUG_ON(b->nsets != 1);
|
|
+
|
|
+ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k))
|
|
+ if (k->type == KEY_TYPE_btree_ptr_v2) {
|
|
+ struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
|
|
+ v->mem_ptr = 0;
|
|
+ }
|
|
+
|
|
+ v = c->verify_data;
|
|
+ bkey_copy(&v->key, &b->key);
|
|
+ v->c.level = b->c.level;
|
|
+ v->c.btree_id = b->c.btree_id;
|
|
+ bch2_btree_keys_init(v);
|
|
+
|
|
+ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
|
|
+ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
|
|
+ failed |= bch2_btree_verify_replica(c, b, p);
|
|
+
|
|
+ if (failed) {
|
|
+ char buf[200];
|
|
+
|
|
+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
|
|
+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf);
|
|
+ }
|
|
+out:
|
|
+ mutex_unlock(&c->verify_lock);
|
|
+ btree_node_io_unlock(b);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_DEBUG_FS
|
|
+
|
|
+/* XXX: bch_fs refcounting */
|
|
+
|
|
+struct dump_iter {
|
|
+ struct bpos from;
|
|
+ struct bch_fs *c;
|
|
+ enum btree_id id;
|
|
+
|
|
+ char buf[1 << 12];
|
|
+ size_t bytes; /* what's currently in buf */
|
|
+
|
|
+ char __user *ubuf; /* destination user buffer */
|
|
+ size_t size; /* size of requested read */
|
|
+ ssize_t ret; /* bytes read so far */
|
|
+};
|
|
+
|
|
+static int flush_buf(struct dump_iter *i)
|
|
+{
|
|
+ if (i->bytes) {
|
|
+ size_t bytes = min(i->bytes, i->size);
|
|
+ int err = copy_to_user(i->ubuf, i->buf, bytes);
|
|
+
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ i->ret += bytes;
|
|
+ i->ubuf += bytes;
|
|
+ i->size -= bytes;
|
|
+ i->bytes -= bytes;
|
|
+ memmove(i->buf, i->buf + bytes, i->bytes);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_dump_open(struct inode *inode, struct file *file)
|
|
+{
|
|
+ struct btree_debug *bd = inode->i_private;
|
|
+ struct dump_iter *i;
|
|
+
|
|
+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
|
|
+ if (!i)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ file->private_data = i;
|
|
+ i->from = POS_MIN;
|
|
+ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]);
|
|
+ i->id = bd->id;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_dump_release(struct inode *inode, struct file *file)
|
|
+{
|
|
+ kfree(file->private_data);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static ssize_t bch2_read_btree(struct file *file, char __user *buf,
|
|
+ size_t size, loff_t *ppos)
|
|
+{
|
|
+ struct dump_iter *i = file->private_data;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int err;
|
|
+
|
|
+ i->ubuf = buf;
|
|
+ i->size = size;
|
|
+ i->ret = 0;
|
|
+
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ if (!i->size)
|
|
+ return i->ret;
|
|
+
|
|
+ bch2_trans_init(&trans, i->c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, i->id, i->from,
|
|
+ BTREE_ITER_PREFETCH|
|
|
+ BTREE_ITER_ALL_SNAPSHOTS);
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+
|
|
+ while (k.k && !(err = bkey_err(k))) {
|
|
+ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k);
|
|
+ i->bytes = strlen(i->buf);
|
|
+ BUG_ON(i->bytes >= sizeof(i->buf));
|
|
+ i->buf[i->bytes] = '\n';
|
|
+ i->bytes++;
|
|
+
|
|
+ k = bch2_btree_iter_next(iter);
|
|
+ i->from = iter->pos;
|
|
+
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ break;
|
|
+
|
|
+ if (!i->size)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ return err < 0 ? err : i->ret;
|
|
+}
|
|
+
|
|
+static const struct file_operations btree_debug_ops = {
|
|
+ .owner = THIS_MODULE,
|
|
+ .open = bch2_dump_open,
|
|
+ .release = bch2_dump_release,
|
|
+ .read = bch2_read_btree,
|
|
+};
|
|
+
|
|
+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf,
|
|
+ size_t size, loff_t *ppos)
|
|
+{
|
|
+ struct dump_iter *i = file->private_data;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct btree *b;
|
|
+ int err;
|
|
+
|
|
+ i->ubuf = buf;
|
|
+ i->size = size;
|
|
+ i->ret = 0;
|
|
+
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ if (!i->size || !bpos_cmp(POS_MAX, i->from))
|
|
+ return i->ret;
|
|
+
|
|
+ bch2_trans_init(&trans, i->c, 0, 0);
|
|
+
|
|
+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) {
|
|
+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b);
|
|
+ i->bytes = strlen(i->buf);
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ break;
|
|
+
|
|
+ /*
|
|
+ * can't easily correctly restart a btree node traversal across
|
|
+ * all nodes, meh
|
|
+ */
|
|
+ i->from = bpos_cmp(POS_MAX, b->key.k.p)
|
|
+ ? bpos_successor(b->key.k.p)
|
|
+ : b->key.k.p;
|
|
+
|
|
+ if (!i->size)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ return err < 0 ? err : i->ret;
|
|
+}
|
|
+
|
|
+static const struct file_operations btree_format_debug_ops = {
|
|
+ .owner = THIS_MODULE,
|
|
+ .open = bch2_dump_open,
|
|
+ .release = bch2_dump_release,
|
|
+ .read = bch2_read_btree_formats,
|
|
+};
|
|
+
|
|
+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
|
|
+ size_t size, loff_t *ppos)
|
|
+{
|
|
+ struct dump_iter *i = file->private_data;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct btree *prev_node = NULL;
|
|
+ int err;
|
|
+
|
|
+ i->ubuf = buf;
|
|
+ i->size = size;
|
|
+ i->ret = 0;
|
|
+
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ if (!i->size)
|
|
+ return i->ret;
|
|
+
|
|
+ bch2_trans_init(&trans, i->c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH);
|
|
+
|
|
+ while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ !(err = bkey_err(k))) {
|
|
+ struct btree_iter_level *l = &iter->l[0];
|
|
+ struct bkey_packed *_k =
|
|
+ bch2_btree_node_iter_peek(&l->iter, l->b);
|
|
+
|
|
+ if (l->b != prev_node) {
|
|
+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b);
|
|
+ i->bytes = strlen(i->buf);
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ break;
|
|
+ }
|
|
+ prev_node = l->b;
|
|
+
|
|
+ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k);
|
|
+ i->bytes = strlen(i->buf);
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ break;
|
|
+
|
|
+ bch2_btree_iter_advance(iter);
|
|
+ i->from = iter->pos;
|
|
+
|
|
+ err = flush_buf(i);
|
|
+ if (err)
|
|
+ break;
|
|
+
|
|
+ if (!i->size)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ return err < 0 ? err : i->ret;
|
|
+}
|
|
+
|
|
+static const struct file_operations bfloat_failed_debug_ops = {
|
|
+ .owner = THIS_MODULE,
|
|
+ .open = bch2_dump_open,
|
|
+ .release = bch2_dump_release,
|
|
+ .read = bch2_read_bfloat_failed,
|
|
+};
|
|
+
|
|
+void bch2_fs_debug_exit(struct bch_fs *c)
|
|
+{
|
|
+ if (!IS_ERR_OR_NULL(c->debug))
|
|
+ debugfs_remove_recursive(c->debug);
|
|
+}
|
|
+
|
|
+void bch2_fs_debug_init(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_debug *bd;
|
|
+ char name[100];
|
|
+
|
|
+ if (IS_ERR_OR_NULL(bch_debug))
|
|
+ return;
|
|
+
|
|
+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b);
|
|
+ c->debug = debugfs_create_dir(name, bch_debug);
|
|
+ if (IS_ERR_OR_NULL(c->debug))
|
|
+ return;
|
|
+
|
|
+ for (bd = c->btree_debug;
|
|
+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
|
|
+ bd++) {
|
|
+ bd->id = bd - c->btree_debug;
|
|
+ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id],
|
|
+ 0400, c->debug, bd,
|
|
+ &btree_debug_ops);
|
|
+
|
|
+ snprintf(name, sizeof(name), "%s-formats",
|
|
+ bch2_btree_ids[bd->id]);
|
|
+
|
|
+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
|
|
+ &btree_format_debug_ops);
|
|
+
|
|
+ snprintf(name, sizeof(name), "%s-bfloat-failed",
|
|
+ bch2_btree_ids[bd->id]);
|
|
+
|
|
+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
|
|
+ &bfloat_failed_debug_ops);
|
|
+ }
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+void bch2_debug_exit(void)
|
|
+{
|
|
+ if (!IS_ERR_OR_NULL(bch_debug))
|
|
+ debugfs_remove_recursive(bch_debug);
|
|
+}
|
|
+
|
|
+int __init bch2_debug_init(void)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ bch_debug = debugfs_create_dir("bcachefs", NULL);
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h
|
|
new file mode 100644
|
|
index 000000000000..0b86736e5e1b
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/debug.h
|
|
@@ -0,0 +1,30 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_DEBUG_H
|
|
+#define _BCACHEFS_DEBUG_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+
|
|
+struct bio;
|
|
+struct btree;
|
|
+struct bch_fs;
|
|
+
|
|
+void __bch2_btree_verify(struct bch_fs *, struct btree *);
|
|
+
|
|
+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
|
|
+{
|
|
+ if (bch2_verify_btree_ondisk)
|
|
+ __bch2_btree_verify(c, b);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_DEBUG_FS
|
|
+void bch2_fs_debug_exit(struct bch_fs *);
|
|
+void bch2_fs_debug_init(struct bch_fs *);
|
|
+#else
|
|
+static inline void bch2_fs_debug_exit(struct bch_fs *c) {}
|
|
+static inline void bch2_fs_debug_init(struct bch_fs *c) {}
|
|
+#endif
|
|
+
|
|
+void bch2_debug_exit(void);
|
|
+int bch2_debug_init(void);
|
|
+
|
|
+#endif /* _BCACHEFS_DEBUG_H */
|
|
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
|
|
new file mode 100644
|
|
index 000000000000..ec4666143f23
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/dirent.c
|
|
@@ -0,0 +1,402 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_update.h"
|
|
+#include "extents.h"
|
|
+#include "dirent.h"
|
|
+#include "fs.h"
|
|
+#include "keylist.h"
|
|
+#include "str_hash.h"
|
|
+
|
|
+#include <linux/dcache.h>
|
|
+
|
|
+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
|
|
+{
|
|
+ unsigned len = bkey_val_bytes(d.k) -
|
|
+ offsetof(struct bch_dirent, d_name);
|
|
+
|
|
+ return strnlen(d.v->d_name, len);
|
|
+}
|
|
+
|
|
+static u64 bch2_dirent_hash(const struct bch_hash_info *info,
|
|
+ const struct qstr *name)
|
|
+{
|
|
+ struct bch_str_hash_ctx ctx;
|
|
+
|
|
+ bch2_str_hash_init(&ctx, info);
|
|
+ bch2_str_hash_update(&ctx, info, name->name, name->len);
|
|
+
|
|
+ /* [0,2) reserved for dots */
|
|
+ return max_t(u64, bch2_str_hash_end(&ctx, info), 2);
|
|
+}
|
|
+
|
|
+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key)
|
|
+{
|
|
+ return bch2_dirent_hash(info, key);
|
|
+}
|
|
+
|
|
+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
+ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
|
|
+
|
|
+ return bch2_dirent_hash(info, &name);
|
|
+}
|
|
+
|
|
+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
|
|
+{
|
|
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
|
|
+ int len = bch2_dirent_name_bytes(l);
|
|
+ const struct qstr *r = _r;
|
|
+
|
|
+ return len - r->len ?: memcmp(l.v->d_name, r->name, len);
|
|
+}
|
|
+
|
|
+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
|
|
+{
|
|
+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
|
|
+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
|
|
+ int l_len = bch2_dirent_name_bytes(l);
|
|
+ int r_len = bch2_dirent_name_bytes(r);
|
|
+
|
|
+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len);
|
|
+}
|
|
+
|
|
+const struct bch_hash_desc bch2_dirent_hash_desc = {
|
|
+ .btree_id = BTREE_ID_dirents,
|
|
+ .key_type = KEY_TYPE_dirent,
|
|
+ .hash_key = dirent_hash_key,
|
|
+ .hash_bkey = dirent_hash_bkey,
|
|
+ .cmp_key = dirent_cmp_key,
|
|
+ .cmp_bkey = dirent_cmp_bkey,
|
|
+};
|
|
+
|
|
+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
+ unsigned len;
|
|
+
|
|
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent))
|
|
+ return "value too small";
|
|
+
|
|
+ len = bch2_dirent_name_bytes(d);
|
|
+ if (!len)
|
|
+ return "empty name";
|
|
+
|
|
+ if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
|
|
+ return "value too big";
|
|
+
|
|
+ if (len > BCH_NAME_MAX)
|
|
+ return "dirent name too big";
|
|
+
|
|
+ if (len == 1 && !memcmp(d.v->d_name, ".", 1))
|
|
+ return "invalid name";
|
|
+
|
|
+ if (len == 2 && !memcmp(d.v->d_name, "..", 2))
|
|
+ return "invalid name";
|
|
+
|
|
+ if (memchr(d.v->d_name, '/', len))
|
|
+ return "invalid name";
|
|
+
|
|
+ if (le64_to_cpu(d.v->d_inum) == d.k->p.inode)
|
|
+ return "dirent points to own directory";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
|
|
+
|
|
+ bch_scnmemcpy(out, d.v->d_name,
|
|
+ bch2_dirent_name_bytes(d));
|
|
+ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type);
|
|
+}
|
|
+
|
|
+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
|
|
+ u8 type, const struct qstr *name, u64 dst)
|
|
+{
|
|
+ struct bkey_i_dirent *dirent;
|
|
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
|
|
+
|
|
+ if (name->len > BCH_NAME_MAX)
|
|
+ return ERR_PTR(-ENAMETOOLONG);
|
|
+
|
|
+ BUG_ON(u64s > U8_MAX);
|
|
+
|
|
+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
|
|
+ if (IS_ERR(dirent))
|
|
+ return dirent;
|
|
+
|
|
+ bkey_dirent_init(&dirent->k_i);
|
|
+ dirent->k.u64s = u64s;
|
|
+ dirent->v.d_inum = cpu_to_le64(dst);
|
|
+ dirent->v.d_type = type;
|
|
+
|
|
+ memcpy(dirent->v.d_name, name->name, name->len);
|
|
+ memset(dirent->v.d_name + name->len, 0,
|
|
+ bkey_val_bytes(&dirent->k) -
|
|
+ offsetof(struct bch_dirent, d_name) -
|
|
+ name->len);
|
|
+
|
|
+ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
|
|
+
|
|
+ return dirent;
|
|
+}
|
|
+
|
|
+int bch2_dirent_create(struct btree_trans *trans,
|
|
+ u64 dir_inum, const struct bch_hash_info *hash_info,
|
|
+ u8 type, const struct qstr *name, u64 dst_inum,
|
|
+ u64 *dir_offset, int flags)
|
|
+{
|
|
+ struct bkey_i_dirent *dirent;
|
|
+ int ret;
|
|
+
|
|
+ dirent = dirent_create_key(trans, type, name, dst_inum);
|
|
+ ret = PTR_ERR_OR_ZERO(dirent);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
|
|
+ dir_inum, &dirent->k_i, flags);
|
|
+ *dir_offset = dirent->k.p.offset;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void dirent_copy_target(struct bkey_i_dirent *dst,
|
|
+ struct bkey_s_c_dirent src)
|
|
+{
|
|
+ dst->v.d_inum = src.v->d_inum;
|
|
+ dst->v.d_type = src.v->d_type;
|
|
+}
|
|
+
|
|
+int bch2_dirent_rename(struct btree_trans *trans,
|
|
+ u64 src_dir, struct bch_hash_info *src_hash,
|
|
+ u64 dst_dir, struct bch_hash_info *dst_hash,
|
|
+ const struct qstr *src_name, u64 *src_inum, u64 *src_offset,
|
|
+ const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset,
|
|
+ enum bch_rename_mode mode)
|
|
+{
|
|
+ struct btree_iter *src_iter = NULL, *dst_iter = NULL;
|
|
+ struct bkey_s_c old_src, old_dst;
|
|
+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
|
|
+ struct bpos dst_pos =
|
|
+ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name));
|
|
+ int ret = 0;
|
|
+
|
|
+ *src_inum = *dst_inum = 0;
|
|
+
|
|
+ /*
|
|
+ * Lookup dst:
|
|
+ *
|
|
+ * Note that in BCH_RENAME mode, we're _not_ checking if
|
|
+ * the target already exists - we're relying on the VFS
|
|
+ * to do that check for us for correctness:
|
|
+ */
|
|
+ dst_iter = mode == BCH_RENAME
|
|
+ ? bch2_hash_hole(trans, bch2_dirent_hash_desc,
|
|
+ dst_hash, dst_dir, dst_name)
|
|
+ : bch2_hash_lookup(trans, bch2_dirent_hash_desc,
|
|
+ dst_hash, dst_dir, dst_name,
|
|
+ BTREE_ITER_INTENT);
|
|
+ ret = PTR_ERR_OR_ZERO(dst_iter);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ old_dst = bch2_btree_iter_peek_slot(dst_iter);
|
|
+
|
|
+ if (mode != BCH_RENAME)
|
|
+ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum);
|
|
+
|
|
+ /* Lookup src: */
|
|
+ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc,
|
|
+ src_hash, src_dir, src_name,
|
|
+ BTREE_ITER_INTENT);
|
|
+ ret = PTR_ERR_OR_ZERO(src_iter);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ old_src = bch2_btree_iter_peek_slot(src_iter);
|
|
+ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum);
|
|
+
|
|
+ /* Create new dst key: */
|
|
+ new_dst = dirent_create_key(trans, 0, dst_name, 0);
|
|
+ ret = PTR_ERR_OR_ZERO(new_dst);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src));
|
|
+ new_dst->k.p = dst_iter->pos;
|
|
+
|
|
+ /* Create new src key: */
|
|
+ if (mode == BCH_RENAME_EXCHANGE) {
|
|
+ new_src = dirent_create_key(trans, 0, src_name, 0);
|
|
+ ret = PTR_ERR_OR_ZERO(new_src);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst));
|
|
+ new_src->k.p = src_iter->pos;
|
|
+ } else {
|
|
+ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
|
|
+ ret = PTR_ERR_OR_ZERO(new_src);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+
|
|
+ bkey_init(&new_src->k);
|
|
+ new_src->k.p = src_iter->pos;
|
|
+
|
|
+ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 &&
|
|
+ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) {
|
|
+ /*
|
|
+ * We have a hash collision for the new dst key,
|
|
+ * and new_src - the key we're deleting - is between
|
|
+ * new_dst's hashed slot and the slot we're going to be
|
|
+ * inserting it into - oops. This will break the hash
|
|
+ * table if we don't deal with it:
|
|
+ */
|
|
+ if (mode == BCH_RENAME) {
|
|
+ /*
|
|
+ * If we're not overwriting, we can just insert
|
|
+ * new_dst at the src position:
|
|
+ */
|
|
+ new_dst->k.p = src_iter->pos;
|
|
+ bch2_trans_update(trans, src_iter,
|
|
+ &new_dst->k_i, 0);
|
|
+ goto out_set_offset;
|
|
+ } else {
|
|
+ /* If we're overwriting, we can't insert new_dst
|
|
+ * at a different slot because it has to
|
|
+ * overwrite old_dst - just make sure to use a
|
|
+ * whiteout when deleting src:
|
|
+ */
|
|
+ new_src->k.type = KEY_TYPE_hash_whiteout;
|
|
+ }
|
|
+ } else {
|
|
+ /* Check if we need a whiteout to delete src: */
|
|
+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc,
|
|
+ src_hash, src_iter);
|
|
+ if (ret < 0)
|
|
+ goto out;
|
|
+
|
|
+ if (ret)
|
|
+ new_src->k.type = KEY_TYPE_hash_whiteout;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_update(trans, src_iter, &new_src->k_i, 0);
|
|
+ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0);
|
|
+out_set_offset:
|
|
+ *src_offset = new_src->k.p.offset;
|
|
+ *dst_offset = new_dst->k.p.offset;
|
|
+out:
|
|
+ bch2_trans_iter_put(trans, src_iter);
|
|
+ bch2_trans_iter_put(trans, dst_iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_dirent_delete_at(struct btree_trans *trans,
|
|
+ const struct bch_hash_info *hash_info,
|
|
+ struct btree_iter *iter)
|
|
+{
|
|
+ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
|
|
+ hash_info, iter);
|
|
+}
|
|
+
|
|
+struct btree_iter *
|
|
+__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum,
|
|
+ const struct bch_hash_info *hash_info,
|
|
+ const struct qstr *name, unsigned flags)
|
|
+{
|
|
+ return bch2_hash_lookup(trans, bch2_dirent_hash_desc,
|
|
+ hash_info, dir_inum, name, flags);
|
|
+}
|
|
+
|
|
+u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum,
|
|
+ const struct bch_hash_info *hash_info,
|
|
+ const struct qstr *name)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ u64 inum = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = __bch2_dirent_lookup_trans(&trans, dir_inum,
|
|
+ hash_info, name, 0);
|
|
+ if (IS_ERR(iter)) {
|
|
+ BUG_ON(PTR_ERR(iter) == -EINTR);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+out:
|
|
+ bch2_trans_exit(&trans);
|
|
+ return inum;
|
|
+}
|
|
+
|
|
+int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ for_each_btree_key(trans, iter, BTREE_ID_dirents,
|
|
+ POS(dir_inum, 0), 0, k, ret) {
|
|
+ if (k.k->p.inode > dir_inum)
|
|
+ break;
|
|
+
|
|
+ if (k.k->type == KEY_TYPE_dirent) {
|
|
+ ret = -ENOTEMPTY;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_s_c_dirent dirent;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_dirents,
|
|
+ POS(inum, ctx->pos), 0, k, ret) {
|
|
+ if (k.k->p.inode > inum)
|
|
+ break;
|
|
+
|
|
+ if (k.k->type != KEY_TYPE_dirent)
|
|
+ continue;
|
|
+
|
|
+ dirent = bkey_s_c_to_dirent(k);
|
|
+
|
|
+ /*
|
|
+ * XXX: dir_emit() can fault and block, while we're holding
|
|
+ * locks
|
|
+ */
|
|
+ ctx->pos = dirent.k->p.offset;
|
|
+ if (!dir_emit(ctx, dirent.v->d_name,
|
|
+ bch2_dirent_name_bytes(dirent),
|
|
+ le64_to_cpu(dirent.v->d_inum),
|
|
+ dirent.v->d_type))
|
|
+ break;
|
|
+ ctx->pos = dirent.k->p.offset + 1;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
|
|
new file mode 100644
|
|
index 000000000000..e1d8ce377d43
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/dirent.h
|
|
@@ -0,0 +1,63 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_DIRENT_H
|
|
+#define _BCACHEFS_DIRENT_H
|
|
+
|
|
+#include "str_hash.h"
|
|
+
|
|
+extern const struct bch_hash_desc bch2_dirent_hash_desc;
|
|
+
|
|
+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+#define bch2_bkey_ops_dirent (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_dirent_invalid, \
|
|
+ .val_to_text = bch2_dirent_to_text, \
|
|
+}
|
|
+
|
|
+struct qstr;
|
|
+struct file;
|
|
+struct dir_context;
|
|
+struct bch_fs;
|
|
+struct bch_hash_info;
|
|
+struct bch_inode_info;
|
|
+
|
|
+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent);
|
|
+
|
|
+/* Number of u64s needed for a dirent value holding a @len byte name. */
+static inline unsigned dirent_val_u64s(unsigned len)
+{
+	size_t bytes = offsetof(struct bch_dirent, d_name) + len;
+
+	return DIV_ROUND_UP(bytes, sizeof(u64));
+}
|
|
+
|
|
+int bch2_dirent_create(struct btree_trans *, u64,
|
|
+ const struct bch_hash_info *, u8,
|
|
+ const struct qstr *, u64, u64 *, int);
|
|
+
|
|
+int bch2_dirent_delete_at(struct btree_trans *,
|
|
+ const struct bch_hash_info *,
|
|
+ struct btree_iter *);
|
|
+
|
|
+enum bch_rename_mode {
|
|
+ BCH_RENAME,
|
|
+ BCH_RENAME_OVERWRITE,
|
|
+ BCH_RENAME_EXCHANGE,
|
|
+};
|
|
+
|
|
+int bch2_dirent_rename(struct btree_trans *,
|
|
+ u64, struct bch_hash_info *,
|
|
+ u64, struct bch_hash_info *,
|
|
+ const struct qstr *, u64 *, u64 *,
|
|
+ const struct qstr *, u64 *, u64 *,
|
|
+ enum bch_rename_mode);
|
|
+
|
|
+struct btree_iter *
|
|
+__bch2_dirent_lookup_trans(struct btree_trans *, u64,
|
|
+ const struct bch_hash_info *,
|
|
+ const struct qstr *, unsigned);
|
|
+u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *,
|
|
+ const struct qstr *);
|
|
+
|
|
+int bch2_empty_dir_trans(struct btree_trans *, u64);
|
|
+int bch2_readdir(struct bch_fs *, u64, struct dir_context *);
|
|
+
|
|
+#endif /* _BCACHEFS_DIRENT_H */
|
|
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
|
|
new file mode 100644
|
|
index 000000000000..c52b6faac9b4
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/disk_groups.c
|
|
@@ -0,0 +1,486 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "disk_groups.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/sort.h>
|
|
+
|
|
+/*
+ * sort() comparator for disk groups: orders by the deleted flag, then by
+ * parent, then by label (compared over the whole fixed-size label field).
+ */
+static int group_cmp(const void *_l, const void *_r)
+{
+	const struct bch_disk_group *l = _l;
+	const struct bch_disk_group *r = _r;
+	int d;
+
+	d = (BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+	    (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r));
+	if (d)
+		return d;
+
+	d = (BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+	    (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r));
+	if (d)
+		return d;
+
+	return strncmp(l->label, r->label, sizeof(l->label));
+}
|
|
+
|
|
+/*
+ * Validate the disk groups superblock section.
+ *
+ * Returns NULL when the section is acceptable, otherwise a static string
+ * describing the first problem found:
+ *  - a member refers to a group that is out of range or deleted,
+ *  - a live group has an empty label,
+ *  - two live groups compare equal under group_cmp().
+ */
+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
+						struct bch_sb_field *f)
+{
+	struct bch_sb_field_disk_groups *groups =
+		field_to_type(f, disk_groups);
+	struct bch_sb_field_members *mi;
+	struct bch_disk_group *grp, *sorted;
+	struct bch_member *m;
+	const char *err = NULL;
+	unsigned i, nr_groups;
+
+	mi = bch2_sb_get_members(sb);
+	groups = bch2_sb_get_disk_groups(sb);
+	nr_groups = disk_groups_nr(groups);
+
+	/* member group numbers are 1-based; 0 means "no group" */
+	for (m = mi->members; m < mi->members + sb->nr_devices; m++) {
+		unsigned idx;
+
+		if (!BCH_MEMBER_GROUP(m))
+			continue;
+
+		idx = BCH_MEMBER_GROUP(m) - 1;
+
+		if (idx >= nr_groups ||
+		    BCH_GROUP_DELETED(&groups->entries[idx]))
+			return "disk has invalid group";
+	}
+
+	if (!nr_groups)
+		return NULL;
+
+	for (grp = groups->entries;
+	     grp < groups->entries + nr_groups;
+	     grp++) {
+		if (BCH_GROUP_DELETED(grp))
+			continue;
+
+		if (!strnlen(grp->label, sizeof(grp->label)))
+			return "group with empty label";
+	}
+
+	/* sort a scratch copy so duplicates end up adjacent */
+	sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+	if (!sorted)
+		return "cannot allocate memory";
+
+	memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+	sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+	for (i = 0; i + 1 < nr_groups; i++) {
+		if (BCH_GROUP_DELETED(sorted + i))
+			continue;
+
+		if (!group_cmp(sorted + i, sorted + i + 1)) {
+			err = "duplicate groups";
+			break;
+		}
+	}
+
+	kfree(sorted);
+	return err;
+}
|
|
+
|
|
+/*
+ * Print every entry of the disk groups section to @out, space separated:
+ * "[deleted]" for deleted entries, "[parent N name LABEL]" otherwise.
+ */
+static void bch2_sb_disk_groups_to_text(struct printbuf *out,
+					struct bch_sb *sb,
+					struct bch_sb_field *f)
+{
+	struct bch_sb_field_disk_groups *groups =
+		field_to_type(f, disk_groups);
+	struct bch_disk_group *g;
+	unsigned nr_groups = disk_groups_nr(groups);
+
+	for (g = groups->entries;
+	     g < groups->entries + nr_groups;
+	     g++) {
+		if (g != groups->entries)
+			pr_buf(out, " ");
+
+		if (BCH_GROUP_DELETED(g))
+			pr_buf(out, "[deleted]");
+		else
+			/*
+			 * The label is a fixed-size field that is not NUL
+			 * terminated when the name fills it completely, so
+			 * bound the read to the size of the field.
+			 */
+			pr_buf(out, "[parent %llu name %.*s]",
+			       BCH_GROUP_PARENT(g),
+			       (int) sizeof(g->label), g->label);
+	}
+}
|
|
+
|
|
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
|
|
+ .validate = bch2_sb_disk_groups_validate,
|
|
+ .to_text = bch2_sb_disk_groups_to_text
|
|
+};
|
|
+
|
|
+/*
+ * Rebuild the in-memory disk groups table (c->disk_groups) from the
+ * superblock: one entry per on-disk group, each carrying a device mask
+ * that contains every existing member assigned to that group or to one of
+ * its descendants.
+ *
+ * Must be called with c->sb_lock held. The previous table, if any, is
+ * freed after an RCU grace period.
+ *
+ * Returns 0 on success (including when there is no disk groups section),
+ * -ENOMEM if the new table cannot be allocated.
+ */
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+	struct bch_sb_field_members *mi;
+	struct bch_sb_field_disk_groups *groups;
+	struct bch_disk_groups_cpu *cpu_g, *old_g;
+	unsigned i, g, nr_groups;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	mi		= bch2_sb_get_members(c->disk_sb.sb);
+	groups		= bch2_sb_get_disk_groups(c->disk_sb.sb);
+	nr_groups	= disk_groups_nr(groups);
+
+	if (!groups)
+		return 0;
+
+	cpu_g = kzalloc(sizeof(*cpu_g) +
+			sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+	if (!cpu_g)
+		return -ENOMEM;
+
+	cpu_g->nr = nr_groups;
+
+	for (i = 0; i < nr_groups; i++) {
+		struct bch_disk_group *src	= &groups->entries[i];
+		struct bch_disk_group_cpu *dst	= &cpu_g->entries[i];
+
+		dst->deleted	= BCH_GROUP_DELETED(src);
+		dst->parent	= BCH_GROUP_PARENT(src);
+	}
+
+	for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+		struct bch_member *m = mi->members + i;
+
+		if (!bch2_member_exists(m))
+			continue;
+
+		/*
+		 * Group numbers are 1-based, 0 terminates the chain. Walk
+		 * from the member's own group up through its ancestors,
+		 * never indexing outside the table if a parent field is
+		 * out of range.
+		 */
+		g = BCH_MEMBER_GROUP(m);
+		while (g && g <= nr_groups) {
+			struct bch_disk_group_cpu *dst = &cpu_g->entries[g - 1];
+
+			__set_bit(i, dst->devs.d);
+			g = dst->parent;
+		}
+	}
+
+	old_g = rcu_dereference_protected(c->disk_groups,
+				lockdep_is_held(&c->sb_lock));
+	rcu_assign_pointer(c->disk_groups, cpu_g);
+	if (old_g)
+		kfree_rcu(old_g, rcu);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Translate an encoded @target into the device mask it selects.
+ *
+ * Returns NULL for the null target, for a device index that is out of
+ * range or has no device, and for a group that is out of range or deleted.
+ *
+ * NOTE(review): uses rcu_dereference() without taking rcu_read_lock()
+ * itself - presumably callers hold it; confirm.
+ */
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+	struct target t = target_decode(target);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		return NULL;
+	case TARGET_DEV: {
+		struct bch_dev *ca;
+
+		if (t.dev >= c->sb.nr_devices)
+			return NULL;
+
+		ca = rcu_dereference(c->devs[t.dev]);
+		if (!ca)
+			return NULL;
+
+		return &ca->self;
+	}
+	case TARGET_GROUP: {
+		struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+		if (!g || t.group >= g->nr || g->entries[t.group].deleted)
+			return NULL;
+
+		return &g->entries[t.group].devs;
+	}
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Is device index @dev selected by the encoded @target?
+ *
+ * The null target selects nothing; a device target selects exactly that
+ * device; a group target selects the devices set in the group's mask
+ * (looked up under rcu_read_lock()).
+ */
+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+	struct target t = target_decode(target);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		return false;
+	case TARGET_DEV:
+		return dev == t.dev;
+	case TARGET_GROUP: {
+		struct bch_disk_groups_cpu *g;
+		bool found = false;
+
+		rcu_read_lock();
+		g = rcu_dereference(c->disk_groups);
+		if (g && t.group < g->nr && !g->entries[t.group].deleted)
+			found = test_bit(dev, g->entries[t.group].devs.d);
+		rcu_read_unlock();
+
+		return found;
+	}
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+/*
+ * Find the live group named @name (@namelen bytes, not necessarily NUL
+ * terminated) whose parent field equals @parent.
+ *
+ * Returns the group's index, -EINVAL if @namelen is zero or larger than
+ * BCH_SB_LABEL_SIZE, or -1 if no such group exists.
+ */
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+				  unsigned parent,
+				  const char *name, unsigned namelen)
+{
+	unsigned idx, nr_groups = disk_groups_nr(groups);
+
+	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+		return -EINVAL;
+
+	for (idx = 0; idx < nr_groups; idx++) {
+		struct bch_disk_group *g = &groups->entries[idx];
+
+		if (BCH_GROUP_DELETED(g) ||
+		    BCH_GROUP_PARENT(g) != parent)
+			continue;
+
+		if (strnlen(g->label, sizeof(g->label)) != namelen)
+			continue;
+
+		if (!memcmp(name, g->label, namelen))
+			return idx;
+	}
+
+	return -1;
+}
|
|
+
|
|
+/*
+ * Create a group named @name (@namelen bytes) under @parent.
+ *
+ * The first deleted slot is reused; if there is none the superblock
+ * section is grown by one entry.
+ *
+ * Returns the index of the new group, -EINVAL for a bad name length, or
+ * -ENOSPC if the section could not be resized.
+ */
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+				 const char *name, unsigned namelen)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_get_disk_groups(sb->sb);
+	unsigned slot = 0, nr_groups = disk_groups_nr(groups);
+	struct bch_disk_group *g;
+
+	if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+		return -EINVAL;
+
+	while (slot < nr_groups &&
+	       !BCH_GROUP_DELETED(&groups->entries[slot]))
+		slot++;
+
+	if (slot == nr_groups) {
+		/* no free slot: grow the section by one entry */
+		size_t bytes = sizeof(struct bch_sb_field_disk_groups) +
+			sizeof(struct bch_disk_group) * (nr_groups + 1);
+		unsigned u64s = bytes / sizeof(u64);
+
+		groups = bch2_sb_resize_disk_groups(sb, u64s);
+		if (!groups)
+			return -ENOSPC;
+
+		nr_groups = disk_groups_nr(groups);
+	}
+
+	BUG_ON(slot >= nr_groups);
+
+	g = &groups->entries[slot];
+
+	memcpy(g->label, name, namelen);
+	/* a name that fills the label exactly is stored without a NUL */
+	if (namelen < sizeof(g->label))
+		g->label[namelen] = '\0';
+	SET_BCH_GROUP_DELETED(g, 0);
+	SET_BCH_GROUP_PARENT(g, parent);
+	SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+	return slot;
+}
|
|
+
|
|
+/*
+ * Resolve a '.'-separated group path (e.g. "a.b.c") to a group index.
+ *
+ * Each component is looked up beneath the group found for the previous
+ * component; the first component is looked up with parent 0.
+ *
+ * Returns the index of the last component's group, or a negative value
+ * as soon as a component cannot be found.
+ */
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_get_disk_groups(sb->sb);
+	int v = -1;
+
+	do {
+		const char *sep = strchrnul(name, '.');
+		unsigned len = sep - name;
+
+		/* parent numbers are index + 1; -1 + 1 == 0 is "no parent" */
+		v = __bch2_disk_group_find(groups, v + 1, name, len);
+
+		name = *sep == '.' ? sep + 1 : sep;
+	} while (*name && v >= 0);
+
+	return v;
+}
|
|
+
|
|
+/*
+ * Like bch2_disk_path_find(), but creates any component of the
+ * '.'-separated path that does not exist yet.
+ *
+ * Returns the index of the last component's group, or a negative error
+ * if a component could be neither found nor created.
+ */
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+	unsigned parent = 0;
+	int v = -1;
+
+	do {
+		const char *sep = strchrnul(name, '.');
+		unsigned len = sep - name;
+		/* re-fetched each round: adding a group may resize the section */
+		struct bch_sb_field_disk_groups *groups =
+			bch2_sb_get_disk_groups(sb->sb);
+
+		v = __bch2_disk_group_find(groups, parent, name, len);
+		if (v < 0) {
+			v = __bch2_disk_group_add(sb, parent, name, len);
+			if (v < 0)
+				return v;
+		}
+
+		parent = v + 1;
+		name = *sep == '.' ? sep + 1 : sep;
+	} while (*name);
+
+	return v;
+}
|
|
+
|
|
+/*
+ * Print the '.'-separated path of group index @v to @out.
+ *
+ * The chain of parents is collected from leaf to root and then printed
+ * root first. Prints "invalid group N" if an index in the chain is out of
+ * range, refers to a deleted group, or the chain is deeper than the local
+ * path buffer.
+ */
+void bch2_disk_path_to_text(struct printbuf *out,
+			    struct bch_sb_handle *sb,
+			    unsigned v)
+{
+	struct bch_sb_field_disk_groups *groups =
+		bch2_sb_get_disk_groups(sb->sb);
+	struct bch_disk_group *g;
+	unsigned depth = 0;
+	u16 path[32];
+
+	for (;;) {
+		if (depth == ARRAY_SIZE(path))
+			goto inval;
+
+		if (v >= disk_groups_nr(groups))
+			goto inval;
+
+		g = &groups->entries[v];
+
+		if (BCH_GROUP_DELETED(g))
+			goto inval;
+
+		path[depth++] = v;
+
+		if (!BCH_GROUP_PARENT(g))
+			break;
+
+		/* parent numbers are 1-based */
+		v = BCH_GROUP_PARENT(g) - 1;
+	}
+
+	while (depth) {
+		v = path[--depth];
+		g = &groups->entries[v];
+
+		bch_scnmemcpy(out, g->label,
+			      strnlen(g->label, sizeof(g->label)));
+
+		if (depth)
+			pr_buf(out, ".");
+	}
+	return;
+inval:
+	pr_buf(out, "invalid group %u", v);
+}
|
|
+
|
|
+/*
+ * Assign device @ca to the group path @name, creating groups as needed,
+ * and write the superblock. An empty name or "none" clears the device's
+ * group.
+ *
+ * Returns 0 on success, or a negative error from creating the path or
+ * from rebuilding the in-memory group table (in which case the superblock
+ * is not written).
+ */
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+	struct bch_member *mi;
+	int v = -1;
+	int ret = 0;
+
+	mutex_lock(&c->sb_lock);
+
+	if (strlen(name) && strcmp(name, "none")) {
+		v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+		if (v < 0) {
+			ret = v;
+			goto unlock;
+		}
+
+		ret = bch2_sb_disk_groups_to_cpu(c);
+		if (ret)
+			goto unlock;
+	}
+
+	/* member group is stored 1-based; v == -1 stores 0, i.e. no group */
+	mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+	SET_BCH_MEMBER_GROUP(mi, v + 1);
+
+	bch2_write_super(c);
+unlock:
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Parse a target option string into its encoded form in *@v.
+ *
+ * An empty string or "none" yields 0. Otherwise @buf is tried first as a
+ * device (via bch2_dev_lookup()) and then as a disk group path.
+ *
+ * Returns 0 on success, -EINVAL if @buf names neither.
+ */
+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+{
+	struct bch_dev *ca;
+	int group;
+
+	if (!strlen(buf) || !strcmp(buf, "none")) {
+		*v = 0;
+		return 0;
+	}
+
+	/* Is it a device? */
+	ca = bch2_dev_lookup(c, buf);
+	if (!IS_ERR(ca)) {
+		*v = dev_to_target(ca->dev_idx);
+		percpu_ref_put(&ca->ref);
+		return 0;
+	}
+
+	/* Otherwise it has to be a group path: */
+	mutex_lock(&c->sb_lock);
+	group = bch2_disk_path_find(&c->disk_sb, buf);
+	mutex_unlock(&c->sb_lock);
+
+	if (group < 0)
+		return -EINVAL;
+
+	*v = group_to_target(group);
+	return 0;
+}
|
|
+
|
|
+/*
+ * Print a human readable form of the encoded target @v to @out:
+ * "none", "/dev/<name>" for an online device, "offline device N",
+ * "invalid device N", or the group's '.'-separated path.
+ */
+void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v)
+{
+	struct target t = target_decode(v);
+
+	switch (t.type) {
+	case TARGET_NULL:
+		pr_buf(out, "none");
+		break;
+	case TARGET_DEV: {
+		struct bch_dev *ca = NULL;
+
+		rcu_read_lock();
+		if (t.dev < c->sb.nr_devices)
+			ca = rcu_dereference(c->devs[t.dev]);
+
+		if (!ca) {
+			pr_buf(out, "invalid device %u", t.dev);
+		} else if (!percpu_ref_tryget(&ca->io_ref)) {
+			pr_buf(out, "offline device %u", t.dev);
+		} else {
+			char b[BDEVNAME_SIZE];
+
+			pr_buf(out, "/dev/%s",
+			       bdevname(ca->disk_sb.bdev, b));
+			percpu_ref_put(&ca->io_ref);
+		}
+
+		rcu_read_unlock();
+		break;
+	}
+	case TARGET_GROUP:
+		mutex_lock(&c->sb_lock);
+		bch2_disk_path_to_text(out, &c->disk_sb, t.group);
+		mutex_unlock(&c->sb_lock);
+		break;
+	default:
+		BUG();
+	}
+}
|
|
diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h
|
|
new file mode 100644
|
|
index 000000000000..3d84f23c34ed
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/disk_groups.h
|
|
@@ -0,0 +1,91 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_DISK_GROUPS_H
|
|
+#define _BCACHEFS_DISK_GROUPS_H
|
|
+
|
|
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
|
|
+
|
|
+/*
+ * Number of entries in a disk groups section, derived from the section's
+ * size; 0 when there is no section (@groups is NULL).
+ */
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+	if (!groups)
+		return 0;
+
+	return (vstruct_end(&groups->field) -
+		(void *) &groups->entries[0]) / sizeof(struct bch_disk_group);
+}
|
|
+
|
|
+struct target {
|
|
+ enum {
|
|
+ TARGET_NULL,
|
|
+ TARGET_DEV,
|
|
+ TARGET_GROUP,
|
|
+ } type;
|
|
+ union {
|
|
+ unsigned dev;
|
|
+ unsigned group;
|
|
+ };
|
|
+};
|
|
+
|
|
+#define TARGET_DEV_START 1
|
|
+#define TARGET_GROUP_START (256 + TARGET_DEV_START)
|
|
+
|
|
+/* Encode device index @dev as a target value. */
+static inline u16 dev_to_target(unsigned dev)
+{
+	unsigned encoded = TARGET_DEV_START + dev;
+
+	return encoded;
+}
|
|
+
|
|
+/* Encode group index @group as a target value. */
+static inline u16 group_to_target(unsigned group)
+{
+	unsigned encoded = TARGET_GROUP_START + group;
+
+	return encoded;
+}
|
|
+
|
|
+/*
+ * Decode an encoded target value:
+ *  - 0 (anything below TARGET_DEV_START) is the null target,
+ *  - [TARGET_DEV_START, TARGET_GROUP_START) is a device index,
+ *  - TARGET_GROUP_START and above is a group index.
+ */
+static inline struct target target_decode(unsigned target)
+{
+	if (target >= TARGET_GROUP_START)
+		return (struct target) {
+			.type	= TARGET_GROUP,
+			.group	= target - TARGET_GROUP_START
+		};
+
+	if (target >= TARGET_DEV_START)
+		return (struct target) {
+			.type	= TARGET_DEV,
+			/* name the member readers of a TARGET_DEV use */
+			.dev	= target - TARGET_DEV_START
+		};
+
+	return (struct target) { .type = TARGET_NULL };
+}
|
|
+
|
|
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
|
|
+
|
|
+/*
+ * Devices in c->rw_devs[@data_type], restricted to those selected by
+ * @target. If the target resolves to no mask the set is returned
+ * unrestricted.
+ */
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+						  enum bch_data_type data_type,
+						  u16 target)
+{
+	struct bch_devs_mask ret = c->rw_devs[data_type];
+	const struct bch_devs_mask *mask = bch2_target_to_mask(c, target);
+
+	if (!mask)
+		return ret;
+
+	bitmap_and(ret.d, ret.d, mask->d, BCH_SB_MEMBERS_MAX);
+	return ret;
+}
|
|
+
|
|
+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
|
|
+
|
|
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
|
|
+
|
|
+/* Exported for userspace bcachefs-tools: */
|
|
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
|
|
+
|
|
+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *,
|
|
+ unsigned);
|
|
+
|
|
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
|
|
+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64);
|
|
+
|
|
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
|
|
+
|
|
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
|
|
+
|
|
+const char *bch2_sb_validate_disk_groups(struct bch_sb *,
|
|
+ struct bch_sb_field *);
|
|
+
|
|
+#endif /* _BCACHEFS_DISK_GROUPS_H */
|
|
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
|
|
new file mode 100644
|
|
index 000000000000..fa7450d2b2ad
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/ec.c
|
|
@@ -0,0 +1,1769 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+/* erasure coding */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "bset.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "disk_groups.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "io.h"
|
|
+#include "keylist.h"
|
|
+#include "recovery.h"
|
|
+#include "super-io.h"
|
|
+#include "util.h"
|
|
+
|
|
+#include <linux/sort.h>
|
|
+
|
|
+#ifdef __KERNEL__
|
|
+
|
|
+#include <linux/raid/pq.h>
|
|
+#include <linux/raid/xor.h>
|
|
+
|
|
+/*
+ * Rebuild block @failed_idx of @data as the XOR of the other @disks - 1
+ * blocks, each @size bytes.
+ *
+ * The failed block is temporarily swapped into slot 0 so it can serve as
+ * the XOR destination, then swapped back.
+ */
+static void raid5_recov(unsigned disks, unsigned failed_idx,
+			size_t size, void **data)
+{
+	unsigned pos, chunk;
+
+	BUG_ON(failed_idx >= disks);
+
+	swap(data[0], data[failed_idx]);
+	memcpy(data[0], data[1], size);
+
+	/* xor_blocks() handles at most MAX_XOR_BLOCKS sources per call */
+	for (pos = 2; pos < disks; pos += chunk) {
+		chunk = min_t(unsigned, disks - pos, MAX_XOR_BLOCKS);
+		xor_blocks(chunk, size, data[0], data + pos);
+	}
+
+	swap(data[0], data[failed_idx]);
+}
|
|
+
|
|
+/*
+ * Generate @np parity blocks for @nd data blocks of @size bytes in @v.
+ *
+ * The first parity block is computed with raid5_recov() (treating the P
+ * block as the "failed" one); with two parity blocks the raid6 syndrome
+ * is generated as well. More than two parity blocks is not supported.
+ */
+static void raid_gen(int nd, int np, size_t size, void **v)
+{
+	/* reject unsupported configurations before touching any buffer */
+	BUG_ON(np > 2);
+
+	if (np >= 1)
+		raid5_recov(nd + np, nd, size, v);
+	if (np >= 2)
+		raid6_call.gen_syndrome(nd + np, size, v);
+}
|
|
+
|
|
+/*
+ * Reconstruct @nr failed blocks of a stripe with @nd data and @np parity
+ * blocks of @size bytes each. @ir lists the failed block indices; the
+ * two-failure case tests ir[1] against @nd first, so it relies on @ir
+ * being in ascending order.
+ *
+ * At most two failures are supported.
+ */
+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+	switch (nr) {
+	case 0:
+		break;
+	case 1:
+		if (ir[0] >= nd + 1)
+			raid6_call.gen_syndrome(nd + np, size, v);
+		else
+			raid5_recov(nd + 1, ir[0], size, v);
+		break;
+	case 2:
+		if (ir[1] < nd) {
+			/* data+data failure. */
+			raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
+		} else if (ir[0] >= nd) {
+			/* both failures are parity: just regenerate it */
+			raid_gen(nd, np, size, v);
+		} else if (ir[1] == nd) {
+			/* data + p failure */
+			raid6_datap_recov(nd + np, size, ir[0], v);
+		} else {
+			/* data + q failure */
+			raid5_recov(nd + 1, ir[0], size, v);
+			raid6_call.gen_syndrome(nd + np, size, v);
+		}
+		break;
+	default:
+		BUG();
+	}
+}
|
|
+
|
|
+#else
|
|
+
|
|
+#include <raid/raid.h>
|
|
+
|
|
+#endif
|
|
+
|
|
+struct ec_bio {
|
|
+ struct bch_dev *ca;
|
|
+ struct ec_stripe_buf *buf;
|
|
+ size_t idx;
|
|
+ struct bio bio;
|
|
+};
|
|
+
|
|
+/* Stripes btree keys: */
|
|
+
|
|
+/*
+ * Validate a stripe key.
+ *
+ * Returns NULL if the key is acceptable, otherwise a static string
+ * describing the problem: a key at position 0, a non-zero inode field, a
+ * value too small for the stripe it describes, or whatever
+ * bch2_bkey_ptrs_invalid() reports about its pointers.
+ */
+const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+
+	if (!bkey_cmp(k.k->p, POS_MIN))
+		return "stripe at pos 0";
+
+	if (k.k->p.inode)
+		return "invalid stripe key";
+
+	/*
+	 * The fixed-size header must fit before stripe_val_u64s() is
+	 * evaluated on it.
+	 */
+	if (bkey_val_bytes(k.k) < sizeof(*s) ||
+	    bkey_val_u64s(k.k) < stripe_val_u64s(s))
+		return "incorrect value size";
+
+	return bch2_bkey_ptrs_invalid(c, k);
+}
|
|
+
|
|
+/*
+ * Print a stripe key's value to @out: algorithm, sectors, data:parity
+ * block counts, checksum type and granularity, followed by
+ * "dev:offset:blockcount" for every block.
+ */
+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
+			 struct bkey_s_c k)
+{
+	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+	unsigned blk;
+
+	pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+	       s->algorithm,
+	       le16_to_cpu(s->sectors),
+	       s->nr_blocks - s->nr_redundant,
+	       s->nr_redundant,
+	       s->csum_type,
+	       1U << s->csum_granularity_bits);
+
+	for (blk = 0; blk < s->nr_blocks; blk++) {
+		const struct bch_extent_ptr *ptr = &s->ptrs[blk];
+
+		pr_buf(out, " %u:%llu:%u", ptr->dev,
+		       (u64) ptr->offset,
+		       stripe_blockcount_get(s, blk));
+	}
+}
|
|
+
|
|
+/*
+ * Returns the index of the first data block of stripe @s that one of
+ * @k's pointers matches, or -1 if none of them does.
+ */
+static int bkey_matches_stripe(struct bch_stripe *s,
+			       struct bkey_s_c k)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const struct bch_extent_ptr *ptr;
+	unsigned sectors = le16_to_cpu(s->sectors);
+	unsigned nr_data = s->nr_blocks - s->nr_redundant;
+	unsigned blk;
+
+	bkey_for_each_ptr(ptrs, ptr) {
+		for (blk = 0; blk < nr_data; blk++) {
+			if (__bch2_ptr_matches_stripe(&s->ptrs[blk], ptr,
+						      sectors))
+				return blk;
+		}
+	}
+
+	return -1;
+}
|
|
+
|
|
+/*
+ * Does @k, if it is an extent, contain a stripe pointer entry referring
+ * to stripe @idx? Keys of any other type never match.
+ */
+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
+{
+	struct bkey_s_c_extent e;
+	const union bch_extent_entry *entry;
+
+	if (k.k->type != KEY_TYPE_extent)
+		return false;
+
+	e = bkey_s_c_to_extent(k);
+
+	extent_for_each_entry(e, entry) {
+		if (extent_entry_type(entry) != BCH_EXTENT_ENTRY_stripe_ptr)
+			continue;
+
+		if (entry->stripe_ptr.idx == idx)
+			return true;
+	}
+
+	return false;
+}
|
|
+
|
|
+/* Stripe bufs: */
|
|
+
|
|
+/*
+ * Free every per-block data buffer of @buf and clear the pointers, so
+ * calling this again on the same buffer passes only NULLs to kvpfree().
+ */
+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
+{
+	unsigned nr_blocks = buf->key.v.nr_blocks;
+	size_t bytes = buf->size << 9;
+	unsigned blk;
+
+	for (blk = 0; blk < nr_blocks; blk++) {
+		kvpfree(buf->data[blk], bytes);
+		buf->data[blk] = NULL;
+	}
+}
|
|
+
|
|
+/*
+ * Set up @buf to hold sectors [@offset, @offset + @size) of every block
+ * of the stripe described by buf->key.
+ *
+ * The range is widened to checksum granularity (clamped to the stripe
+ * size), all blocks are marked valid, and one data buffer per block is
+ * allocated.
+ *
+ * Returns 0 on success; on allocation failure everything allocated so far
+ * is freed and -ENOMEM is returned.
+ */
+static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
+			       unsigned offset, unsigned size)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned granularity = 1U << v->csum_granularity_bits;
+	unsigned stripe_sectors = le16_to_cpu(v->sectors);
+	unsigned end = offset + size;
+	unsigned blk;
+
+	BUG_ON(end > stripe_sectors);
+
+	offset	= round_down(offset, granularity);
+	end	= min_t(unsigned, stripe_sectors,
+			round_up(end, granularity));
+
+	buf->offset	= offset;
+	buf->size	= end - offset;
+
+	/* every block starts out valid; IO and checksum errors clear bits */
+	memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+	for (blk = 0; blk < v->nr_blocks; blk++) {
+		buf->data[blk] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+		if (!buf->data[blk]) {
+			ec_stripe_buf_exit(buf);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
|
|
+
|
|
+/* Checksumming: */
|
|
+
|
|
+/*
+ * Checksum one checksum-granularity chunk of block @block, starting at
+ * stripe sector @offset.
+ *
+ * @offset must lie within the buffered range and be aligned to the
+ * checksum granularity; the chunk is cut short at the end of the buffered
+ * range.
+ */
+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf,
+					 unsigned block, unsigned offset)
+{
+	struct bch_stripe *v = &buf->key.v;
+	/* unsigned shift, matching ec_stripe_buf_init() */
+	unsigned csum_granularity = 1U << v->csum_granularity_bits;
+	unsigned end = buf->offset + buf->size;
+	unsigned len = min(csum_granularity, end - offset);
+
+	BUG_ON(offset >= end);
+	BUG_ON(offset <  buf->offset);
+	BUG_ON(offset & (csum_granularity - 1));
+	/* only the chunk ending the stripe may be shorter than the rest */
+	BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+	       (len & (csum_granularity - 1)));
+
+	return bch2_checksum(NULL, v->csum_type,
+			     null_nonce(),
+			     buf->data[block] + ((offset - buf->offset) << 9),
+			     len << 9);
+}
|
|
+
|
|
+/*
+ * Compute and store the checksum of every chunk of every block in the
+ * stripe key. Does nothing if the stripe has no checksum type. @buf must
+ * cover the whole stripe.
+ */
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	unsigned blk, chunk, csums_per_device = stripe_csums_per_device(v);
+
+	if (!v->csum_type)
+		return;
+
+	BUG_ON(buf->offset);
+	BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+	for (blk = 0; blk < v->nr_blocks; blk++) {
+		for (chunk = 0; chunk < csums_per_device; chunk++) {
+			unsigned sector = chunk << v->csum_granularity_bits;
+
+			stripe_csum_set(v, blk, chunk,
+				ec_block_checksum(buf, blk, sector));
+		}
+	}
+}
|
|
+
|
|
+/*
+ * Verify the buffered range of every block still marked valid against
+ * the checksums stored in the stripe key. A block with a mismatching
+ * chunk is logged and marked invalid; the rest of that block is not
+ * checked. Does nothing if the stripe has no checksum type.
+ */
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	/* unsigned shift, matching ec_stripe_buf_init() */
+	unsigned csum_granularity = 1U << v->csum_granularity_bits;
+	unsigned i;
+
+	if (!v->csum_type)
+		return;
+
+	for (i = 0; i < v->nr_blocks; i++) {
+		unsigned offset = buf->offset;
+		unsigned end = buf->offset + buf->size;
+
+		if (!test_bit(i, buf->valid))
+			continue;
+
+		while (offset < end) {
+			unsigned j = offset >> v->csum_granularity_bits;
+			unsigned len = min(csum_granularity, end - offset);
+			struct bch_csum want = stripe_csum_get(v, i, j);
+			struct bch_csum got = ec_block_checksum(buf, i, offset);
+
+			if (bch2_crc_cmp(want, got)) {
+				char buf2[200];
+
+				bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+
+				bch_err_ratelimited(c,
+					"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+					(void *) _RET_IP_, i, j, v->csum_type,
+					want.lo, got.lo, buf2);
+				clear_bit(i, buf->valid);
+				break;
+			}
+
+			offset += len;
+		}
+	}
+}
|
|
+
|
|
+/* Erasure coding: */
|
|
+
|
|
+/* Generate the parity blocks of @buf from its data blocks. */
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *stripe = &buf->key.v;
+	unsigned nr_parity = stripe->nr_redundant;
+	unsigned nr_data = stripe->nr_blocks - nr_parity;
+	unsigned bytes = le16_to_cpu(stripe->sectors) << 9;
+
+	raid_gen(nr_data, nr_parity, bytes, buf->data);
+}
|
|
+
|
|
+/* Number of blocks of @buf whose bit in buf->valid is clear. */
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+	unsigned nr_blocks = buf->key.v.nr_blocks;
+
+	return nr_blocks - bitmap_weight(buf->valid, nr_blocks);
+}
|
|
+
|
|
+/*
+ * Reconstruct the data blocks of @buf that are marked invalid from the
+ * remaining blocks.
+ *
+ * Returns 0 on success, -EIO if more blocks have failed than the stripe
+ * has redundancy for.
+ */
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+	struct bch_stripe *v = &buf->key.v;
+	/* int, not unsigned: raid_rec() takes the failed list as int * */
+	int failed[BCH_BKEY_PTRS_MAX], nr_failed = 0;
+	unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+	unsigned bytes = buf->size << 9;
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		/* a real errno: callers hand this value back up unchanged */
+		return -EIO;
+	}
+
+	/* only failed data blocks are passed on for reconstruction */
+	for (i = 0; i < nr_data; i++)
+		if (!test_bit(i, buf->valid))
+			failed[nr_failed++] = i;
+
+	raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
+	return 0;
+}
|
|
+
|
|
+/* IO: */
|
|
+
|
|
+/*
+ * Completion handler for the bios submitted by ec_block_io().
+ *
+ * Marks the block invalid if the IO failed or if the pointer has gone
+ * stale in the meantime, then drops the bio, the device io_ref taken for
+ * this bio, and the closure ref.
+ */
+static void ec_block_endio(struct bio *bio)
+{
+	struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+	struct ec_stripe_buf *buf = ec_bio->buf;
+	struct bch_extent_ptr *ptr = &buf->key.v.ptrs[ec_bio->idx];
+	/* copied out now: ec_bio must not be touched after bio_put() */
+	struct bch_dev *ca = ec_bio->ca;
+	struct closure *cl = bio->bi_private;
+
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s",
+			       bio_data_dir(bio) ? "write" : "read",
+			       bch2_blk_status_to_str(bio->bi_status)))
+		clear_bit(ec_bio->idx, buf->valid);
+
+	if (ptr_stale(ca, ptr)) {
+		bch_err_ratelimited(ca->fs,
+				    "error %s stripe: stale pointer after io",
+				    bio_data_dir(bio) == READ ? "reading from" : "writing to");
+		clear_bit(ec_bio->idx, buf->valid);
+	}
+
+	bio_put(&ec_bio->bio);
+	percpu_ref_put(&ca->io_ref);
+	closure_put(cl);
+}
|
|
+
|
|
+/*
+ * Submit the IO (@rw) for block @idx of @buf, split into as many bios as
+ * needed. Each bio holds a ref on @cl and on the device's io_ref, both
+ * released in ec_block_endio().
+ *
+ * If the pointer is already stale, or no io ref can be taken on the
+ * device, the block is marked invalid and nothing is submitted.
+ */
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+			unsigned rw, unsigned idx, struct closure *cl)
+{
+	struct bch_stripe *v = &buf->key.v;
+	struct bch_extent_ptr *ptr = &v->ptrs[idx];
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+	unsigned nr_data = v->nr_blocks - v->nr_redundant;
+	enum bch_data_type data_type = idx < nr_data
+		? BCH_DATA_user
+		: BCH_DATA_parity;
+	unsigned done = 0, total = buf->size << 9;
+
+	if (ptr_stale(ca, ptr)) {
+		bch_err_ratelimited(c,
+				    "error %s stripe: stale pointer",
+				    rw == READ ? "reading from" : "writing to");
+		clear_bit(idx, buf->valid);
+		return;
+	}
+
+	if (!bch2_dev_get_ioref(ca, rw)) {
+		clear_bit(idx, buf->valid);
+		return;
+	}
+
+	this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size);
+
+	while (done < total) {
+		unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
+					   DIV_ROUND_UP(total, PAGE_SIZE));
+		unsigned chunk = min_t(size_t, total - done,
+				       nr_iovecs << PAGE_SHIFT);
+		struct bio *bio = bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
+						   &c->ec_bioset);
+		struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+
+		ec_bio->ca	= ca;
+		ec_bio->buf	= buf;
+		ec_bio->idx	= idx;
+
+		bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
+		bio_set_op_attrs(&ec_bio->bio, rw, 0);
+
+		ec_bio->bio.bi_iter.bi_sector	= ptr->offset + buf->offset + (done >> 9);
+		ec_bio->bio.bi_end_io		= ec_block_endio;
+		ec_bio->bio.bi_private		= cl;
+
+		bch2_bio_map(&ec_bio->bio, buf->data[idx] + done, chunk);
+
+		/* per-bio refs, dropped by ec_block_endio() */
+		closure_get(cl);
+		percpu_ref_get(&ca->io_ref);
+
+		submit_bio(&ec_bio->bio);
+
+		done += chunk;
+	}
+
+	/* drop the ref taken by bch2_dev_get_ioref() above */
+	percpu_ref_put(&ca->io_ref);
+}
|
|
+
|
|
+/*
+ * Look up stripe @idx in the stripes btree and copy its key into
+ * stripe->key.
+ *
+ * Returns 0 on success, -ENOENT if the slot does not hold a stripe key,
+ * or the error from the btree lookup.
+ */
+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_SLOTS);
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+	if (k.k->type != KEY_TYPE_stripe) {
+		ret = -ENOENT;
+		goto err;
+	}
+	bkey_reassemble(&stripe->key.k_i, k);
+err:
+	/* release the iterator explicitly before tearing the transaction down */
+	bch2_trans_iter_put(&trans, iter);
+	bch2_trans_exit(&trans);
+	return ret;
+}
|
|
+
|
|
+/* recovery read path: */
|
|
+/*
+ * Satisfy the read @rbio by reconstructing its data from the erasure
+ * coded stripe named in rbio->pick.
+ *
+ * Looks up the stripe, reads the needed range of every block, drops
+ * blocks that fail their checksums, reconstructs the missing data blocks
+ * and copies the requested range into the bio.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, otherwise the
+ * error of the step that failed (-EIO for the checks done here).
+ */
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
+	struct ec_stripe_buf *buf;
+	struct bch_stripe *v;
+	struct closure cl;
+	unsigned blk, offset;
+	int ret = 0;
+
+	closure_init_stack(&cl);
+
+	BUG_ON(!rbio->pick.has_ec);
+
+	buf = kzalloc(sizeof(*buf), GFP_NOIO);
+	if (!buf)
+		return -ENOMEM;
+
+	ret = get_stripe_key(c, rbio->pick.ec.idx, buf);
+	if (ret) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: error %i looking up stripe", ret);
+		/* no data buffers allocated yet, so only the struct goes */
+		kfree(buf);
+		return -EIO;
+	}
+
+	v = &buf->key.v;
+
+	if (!bch2_ptr_matches_stripe(v, rbio->pick)) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: pointer doesn't match stripe");
+		ret = -EIO;
+		goto err;
+	}
+
+	/* sector offset of the read within its block of the stripe */
+	offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset;
+	if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: read is bigger than stripe");
+		ret = -EIO;
+		goto err;
+	}
+
+	ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio));
+	if (ret)
+		goto err;
+
+	for (blk = 0; blk < v->nr_blocks; blk++)
+		ec_block_io(c, buf, REQ_OP_READ, blk, &cl);
+
+	/* wait for every block read to complete */
+	closure_sync(&cl);
+
+	if (ec_nr_failed(buf) > v->nr_redundant) {
+		bch_err_ratelimited(c,
+			"error doing reconstruct read: unable to read enough blocks");
+		ret = -EIO;
+		goto err;
+	}
+
+	ec_validate_checksums(c, buf);
+
+	ret = ec_do_recov(c, buf);
+	if (ret)
+		goto err;
+
+	/* buf->offset may be lower than offset: init rounds down */
+	memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+		      buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9));
+err:
+	ec_stripe_buf_exit(buf);
+	kfree(buf);
+	return ret;
+}
|
|
+
|
|
+/* stripe bucket accounting: */
|
|
+
|
|
+/*
+ * Make sure in-memory state exists for stripe index @idx: grow the stripes
+ * heap when @idx is at or beyond its current size, then allocate the genradix
+ * slot in c->stripes[0], and also in c->stripes[1] unless
+ * c->gc_pos.phase is GC_PHASE_NOT_RUNNING.
+ *
+ * Returns 0 on success, -ENOMEM if any allocation fails.
+ */
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+	ec_stripes_heap *heap = &c->ec_stripes_heap;
+
+	if (idx >= heap->size) {
+		ec_stripes_heap bigger;
+
+		if (!init_heap(&bigger, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+			return -ENOMEM;
+
+		spin_lock(&c->ec_stripes_heap_lock);
+		/* only install the new heap if it really is larger than the current one */
+		if (bigger.size > heap->size) {
+			memcpy(bigger.data, heap->data,
+			       heap->used * sizeof(heap->data[0]));
+			bigger.used = heap->used;
+			swap(*heap, bigger);
+		}
+		spin_unlock(&c->ec_stripes_heap_lock);
+
+		/* after the swap this is the old heap; otherwise the unused new one */
+		free_heap(&bigger);
+	}
+
+	if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp))
+		return -ENOMEM;
+
+	if (c->gc_pos.phase == GC_PHASE_NOT_RUNNING)
+		return 0;
+
+	return genradix_ptr_alloc(&c->stripes[1], idx, gfp) ? 0 : -ENOMEM;
+}
|
|
+
|
|
+/*
+ * Allocate in-memory state for the stripe at iter->pos.offset.
+ *
+ * A non-blocking attempt is made first. If it fails, the transaction is
+ * unlocked and the allocation is retried with GFP_KERNEL.
+ *
+ * Returns 0 if the first attempt succeeded, -EINTR if the allocation only
+ * succeeded after bch2_trans_unlock() (ec_stripe_bkey_insert() retries on
+ * -EINTR), or -ENOMEM if both attempts failed.
+ */
+static int ec_stripe_mem_alloc(struct bch_fs *c,
+			       struct btree_iter *iter)
+{
+	size_t idx = iter->pos.offset;
+
+	if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN))
+		return 0;
+
+	bch2_trans_unlock(iter->trans);
+
+	if (__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+		return -ENOMEM;
+
+	return -EINTR;
+}
|
|
+
|
|
+/*
+ * Returns the index of the stripe at the top of the stripes heap if that
+ * stripe has no nonempty blocks, or -1 if the heap is empty or its top entry
+ * still has nonempty blocks.
+ */
+static ssize_t stripe_idx_to_delete(struct bch_fs *c)
+{
+	ec_stripes_heap *heap = &c->ec_stripes_heap;
+
+	if (!heap->used)
+		return -1;
+
+	if (heap->data[0].blocks_nonempty != 0)
+		return -1;
+
+	return heap->data[0].idx;
+}
|
|
+
|
|
+/*
+ * Heap ordering: three-way comparison of two entries by blocks_nonempty.
+ * Returns -1, 0 or 1; @h is unused.
+ */
+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
+				      struct ec_stripe_heap_entry l,
+				      struct ec_stripe_heap_entry r)
+{
+	if (l.blocks_nonempty < r.blocks_nonempty)
+		return -1;
+	if (l.blocks_nonempty > r.blocks_nonempty)
+		return 1;
+	return 0;
+}
|
|
+
|
|
+/*
+ * Heap callback: record in the in-memory stripe referenced by heap slot @i
+ * that it now lives at heap position @i.
+ */
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
+						   size_t i)
+{
+	struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
+	struct stripe *m = genradix_ptr(&c->stripes[0], h->data[i].idx);
+
+	m->heap_idx = i;
+}
|
|
+
|
|
+/*
+ * Sanity check (BUG_ON) that stripe @idx is alive, that its heap_idx is within
+ * the used part of the heap, and that the heap slot it names points back at
+ * @idx.
+ */
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+	ec_stripes_heap *heap = &c->ec_stripes_heap;
+	struct stripe *stripe = genradix_ptr(&c->stripes[0], idx);
+
+	BUG_ON(!stripe->alive);
+	BUG_ON(stripe->heap_idx >= heap->used);
+	BUG_ON(heap->data[stripe->heap_idx].idx != idx);
+}
|
|
+
|
|
+/*
+ * Remove stripe @m (index @idx) from the stripes heap. Does nothing if the
+ * stripe is not currently on the heap.
+ */
+void bch2_stripes_heap_del(struct bch_fs *c,
+			   struct stripe *m, size_t idx)
+{
+	if (m->on_heap) {
+		m->on_heap = false;
+
+		heap_verify_backpointer(c, idx);
+
+		heap_del(&c->ec_stripes_heap, m->heap_idx,
+			 ec_stripes_heap_cmp,
+			 ec_stripes_heap_set_backpointer);
+	}
+}
|
|
+
|
|
+/*
+ * Add stripe @m (index @idx) to the stripes heap, keyed by its current
+ * blocks_nonempty count. Does nothing if the stripe is already on the heap;
+ * BUGs if the heap is full.
+ */
+void bch2_stripes_heap_insert(struct bch_fs *c,
+			      struct stripe *m, size_t idx)
+{
+	struct ec_stripe_heap_entry entry = {
+		.idx		 = idx,
+		.blocks_nonempty = m->blocks_nonempty,
+	};
+
+	if (m->on_heap)
+		return;
+
+	BUG_ON(heap_full(&c->ec_stripes_heap));
+
+	m->on_heap = true;
+
+	heap_add(&c->ec_stripes_heap, entry,
+		 ec_stripes_heap_cmp,
+		 ec_stripes_heap_set_backpointer);
+
+	heap_verify_backpointer(c, idx);
+}
|
|
+
|
|
+/*
+ * Refresh the heap entry of stripe @m (index @idx) after its blocks_nonempty
+ * count changed, restoring heap order, and schedule the stripe-delete worker
+ * if the heap now has a deletable stripe on top and c->writes is not dying.
+ * Does nothing if the stripe is not on the heap.
+ */
+void bch2_stripes_heap_update(struct bch_fs *c,
+			      struct stripe *m, size_t idx)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	size_t i;
+
+	if (!m->on_heap)
+		return;
+
+	heap_verify_backpointer(c, idx);
+
+	i = m->heap_idx;
+	h->data[i].blocks_nonempty = m->blocks_nonempty;
+
+	/* the entry may need to move in either direction */
+	heap_sift_up(h, i, ec_stripes_heap_cmp,
+		     ec_stripes_heap_set_backpointer);
+	heap_sift_down(h, i, ec_stripes_heap_cmp,
+		       ec_stripes_heap_set_backpointer);
+
+	heap_verify_backpointer(c, idx);
+
+	if (stripe_idx_to_delete(c) < 0)
+		return;
+
+	if (percpu_ref_is_dying(&c->writes))
+		return;
+
+	schedule_work(&c->ec_stripe_delete_work);
+}
|
|
+
|
|
+/* stripe deletion */
|
|
+
|
|
+/*
+ * Delete the key for stripe @idx from the stripes btree, i.e. the range
+ * [POS(0, idx), POS(0, idx + 1)). Returns bch2_btree_delete_range()'s result.
+ */
+static int ec_stripe_delete(struct bch_fs *c, size_t idx)
+{
+	struct bpos start = POS(0, idx);
+	struct bpos end = POS(0, idx + 1);
+
+	return bch2_btree_delete_range(c, BTREE_ID_stripes, start, end, NULL);
+}
|
|
+
|
|
+/*
+ * Work item: repeatedly take the deletable stripe at the top of the stripes
+ * heap off the heap and delete its btree key, until there is nothing left to
+ * delete or a deletion fails.
+ */
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+	struct bch_fs *c =
+		container_of(work, struct bch_fs, ec_stripe_delete_work);
+
+	for (;;) {
+		ssize_t idx;
+
+		spin_lock(&c->ec_stripes_heap_lock);
+		idx = stripe_idx_to_delete(c);
+		if (idx >= 0)
+			bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx);
+		spin_unlock(&c->ec_stripes_heap_lock);
+
+		/* the btree delete runs after the heap lock has been released */
+		if (idx < 0 || ec_stripe_delete(c, idx))
+			break;
+	}
+}
|
|
+
|
|
+/* stripe creation: */
|
|
+
|
|
+/*
+ * Insert @stripe into the stripes btree at the first free slot at or after
+ * c->ec_stripe_hint (never below offset 1), wrapping around to the start of
+ * the keyspace at most once per scan.
+ *
+ * On success c->ec_stripe_hint is advanced past the slot used; on failure it
+ * is left pointing at the last start position.
+ *
+ * Returns 0 on success, -ENOSPC if no slot at or below U32_MAX is free, or an
+ * error from ec_stripe_mem_alloc() / the btree code. -EINTR is handled here
+ * by restarting the transaction.
+ */
+static int ec_stripe_bkey_insert(struct bch_fs *c,
+				 struct bkey_i_stripe *stripe,
+				 struct disk_reservation *res)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bpos min_pos = POS(0, 1);
+	struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint));
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+retry:
+	bch2_trans_begin(&trans);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_stripes, start_pos,
+			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
+		if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
+			/*
+			 * Ran past the last usable index. If the scan did not
+			 * begin at the very first slot, wrap around once.
+			 * Comparing against min_pos (rather than testing
+			 * start_pos.offset, which is never 0) is what makes
+			 * the second pass end with -ENOSPC instead of
+			 * wrapping forever.
+			 */
+			if (bkey_cmp(start_pos, min_pos) > 0) {
+				start_pos = min_pos;
+				bch2_btree_iter_set_pos(iter, start_pos);
+				continue;
+			}
+
+			ret = -ENOSPC;
+			break;
+		}
+
+		if (bkey_deleted(k.k))
+			goto found_slot;
+	}
+
+	goto err;
+found_slot:
+	start_pos = iter->pos;
+
+	ret = ec_stripe_mem_alloc(c, iter);
+	if (ret)
+		goto err;
+
+	stripe->k.p = iter->pos;
+
+	bch2_trans_update(&trans, iter, &stripe->k_i, 0);
+
+	ret = bch2_trans_commit(&trans, res, NULL,
+				BTREE_INSERT_NOFAIL);
+err:
+	bch2_trans_iter_put(&trans, iter);
+
+	if (ret == -EINTR)
+		goto retry;
+
+	c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1;
+	bch2_trans_exit(&trans);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Queue an update that replaces the stripe key at new->k.p with @new, after
+ * copying the per-block sector counts of the existing key into @new.
+ *
+ * Returns 0 on success, -ENOENT if no stripe key exists at that position,
+ * -EINVAL if the existing stripe has a different nr_blocks, or the btree
+ * lookup error.
+ */
+static int ec_stripe_bkey_update(struct btree_trans *trans,
+				 struct bkey_i_stripe *new)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c old_k;
+	const struct bch_stripe *old;
+	unsigned blk;
+	int ret;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_stripes,
+				   new->k.p, BTREE_ITER_INTENT);
+	old_k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(old_k);
+	if (ret)
+		goto out;
+
+	if (!old_k.k || old_k.k->type != KEY_TYPE_stripe) {
+		bch_err(trans->c, "error updating stripe: not found");
+		ret = -ENOENT;
+		goto out;
+	}
+
+	old = bkey_s_c_to_stripe(old_k).v;
+
+	if (old->nr_blocks != new->v.nr_blocks) {
+		bch_err(trans->c, "error updating stripe: nr_blocks does not match");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* carry the existing block sector counts over into the new key */
+	for (blk = 0; blk < new->v.nr_blocks; blk++)
+		stripe_blockcount_set(&new->v, blk,
+				      stripe_blockcount_get(old, blk));
+
+	bch2_trans_update(trans, iter, &new->k_i, 0);
+out:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Insert a stripe pointer entry into extent @e at the position of @ptr:
+ * everything from @ptr to the end of the extent is shifted up by one entry,
+ * the key's u64s count is grown accordingly, and the freed slot is filled
+ * with a stripe pointer naming @block of stripe @s.
+ */
+static void extent_stripe_ptr_add(struct bkey_s_extent e,
+				  struct ec_stripe_buf *s,
+				  struct bch_extent_ptr *ptr,
+				  unsigned block)
+{
+	struct bch_extent_stripe_ptr *slot = (void *) ptr;
+	union bch_extent_entry *last = extent_entry_last(e);
+	struct bch_extent_stripe_ptr stripe_ptr = {
+		.type		= 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+		.block		= block,
+		.redundancy	= s->key.v.nr_redundant,
+		.idx		= s->key.k.p.offset,
+	};
+
+	/* open a gap of one stripe_ptr entry in front of @ptr */
+	memmove_u64s_up(slot + 1, slot, (u64 *) last - (u64 *) slot);
+	e.k->u64s += sizeof(*slot) / sizeof(u64);
+
+	*slot = stripe_ptr;
+}
|
|
+
|
|
+/*
+ * Walk the extents btree over the range described by @pos and, for every
+ * extent that matches a block of stripe @s and does not already carry a
+ * stripe pointer to it, rewrite the extent so that it keeps only the pointer
+ * on the stripe block's device and has a stripe pointer added.
+ *
+ * Returns 0 on success or the first lookup/commit error. -EINTR from the
+ * commit is not treated as an error: the loop simply looks the key up again.
+ */
+static int ec_stripe_update_ptrs(struct bch_fs *c,
+				 struct ec_stripe_buf *s,
+				 struct bkey *pos)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_extent e;
+	struct bkey_buf sk;
+	int ret = 0, dev, block;
+
+	bch2_bkey_buf_init(&sk);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	/* XXX this doesn't support the reflink btree */
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+				   bkey_start_pos(pos),
+				   BTREE_ITER_INTENT);
+
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k)) &&
+	       bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+		struct bch_extent_ptr *ptr, *ec_ptr;
+
+		/* skip extents already in this stripe, or not in it at all */
+		if (extent_has_stripe_ptr(k, s->key.k.p.offset) ||
+		    (block = bkey_matches_stripe(&s->key.v, k)) < 0) {
+			bch2_btree_iter_advance(iter);
+			continue;
+		}
+
+		dev = s->key.v.ptrs[block].dev;
+
+		bch2_bkey_buf_reassemble(&sk, c, k);
+		e = bkey_i_to_s_extent(sk.k);
+
+		/* keep only the pointer that lives on the stripe block's device */
+		bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
+		ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev);
+		BUG_ON(!ec_ptr);
+
+		extent_stripe_ptr_add(e, s, ec_ptr, block);
+
+		bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
+		bch2_trans_update(&trans, iter, sk.k, 0);
+
+		ret = bch2_trans_commit(&trans, NULL, NULL,
+					BTREE_INSERT_NOFAIL);
+		if (ret == -EINTR)
+			ret = 0;
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	bch2_trans_exit(&trans);
+	bch2_bkey_buf_exit(&sk, c);
+
+	return ret;
+}
|
|
+
|
|
+/*
|
|
+ * data buckets of new stripe all written: create the stripe
|
|
+ */
|
|
+static void ec_stripe_create(struct ec_stripe_new *s)
|
|
+{
|
|
+ struct bch_fs *c = s->c;
|
|
+ struct open_bucket *ob;
|
|
+ struct bkey_i *k;
|
|
+ struct stripe *m;
|
|
+ struct bch_stripe *v = &s->new_stripe.key.v;
|
|
+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
|
|
+ int ret;
|
|
+
|
|
+ BUG_ON(s->h->s == s);
|
|
+
|
|
+ closure_sync(&s->iodone);
|
|
+
|
|
+ if (s->err) {
|
|
+ if (s->err != -EROFS)
|
|
+ bch_err(c, "error creating stripe: error writing data buckets");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (s->have_existing_stripe) {
|
|
+ ec_validate_checksums(c, &s->existing_stripe);
|
|
+
|
|
+ if (ec_do_recov(c, &s->existing_stripe)) {
|
|
+ bch_err(c, "error creating stripe: error reading existing stripe");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < nr_data; i++)
|
|
+ if (stripe_blockcount_get(&s->existing_stripe.key.v, i))
|
|
+ swap(s->new_stripe.data[i],
|
|
+ s->existing_stripe.data[i]);
|
|
+
|
|
+ ec_stripe_buf_exit(&s->existing_stripe);
|
|
+ }
|
|
+
|
|
+ BUG_ON(!s->allocated);
|
|
+
|
|
+ if (!percpu_ref_tryget(&c->writes))
|
|
+ goto err;
|
|
+
|
|
+ ec_generate_ec(&s->new_stripe);
|
|
+
|
|
+ ec_generate_checksums(&s->new_stripe);
|
|
+
|
|
+ /* write p/q: */
|
|
+ for (i = nr_data; i < v->nr_blocks; i++)
|
|
+ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
|
|
+ closure_sync(&s->iodone);
|
|
+
|
|
+ if (ec_nr_failed(&s->new_stripe)) {
|
|
+ bch_err(c, "error creating stripe: error writing redundancy buckets");
|
|
+ goto err_put_writes;
|
|
+ }
|
|
+
|
|
+ ret = s->have_existing_stripe
|
|
+ ? bch2_trans_do(c, &s->res, NULL, BTREE_INSERT_NOFAIL,
|
|
+ ec_stripe_bkey_update(&trans, &s->new_stripe.key))
|
|
+ : ec_stripe_bkey_insert(c, &s->new_stripe.key, &s->res);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error creating stripe: error creating stripe key");
|
|
+ goto err_put_writes;
|
|
+ }
|
|
+
|
|
+ for_each_keylist_key(&s->keys, k) {
|
|
+ ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error creating stripe: error %i updating pointers", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ spin_lock(&c->ec_stripes_heap_lock);
|
|
+ m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
|
|
+
|
|
+ BUG_ON(m->on_heap);
|
|
+ bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
|
|
+ spin_unlock(&c->ec_stripes_heap_lock);
|
|
+err_put_writes:
|
|
+ percpu_ref_put(&c->writes);
|
|
+err:
|
|
+ bch2_disk_reservation_put(c, &s->res);
|
|
+
|
|
+ for (i = 0; i < v->nr_blocks; i++)
|
|
+ if (s->blocks[i]) {
|
|
+ ob = c->open_buckets + s->blocks[i];
|
|
+
|
|
+ if (i < nr_data) {
|
|
+ ob->ec = NULL;
|
|
+ __bch2_open_bucket_put(c, ob);
|
|
+ } else {
|
|
+ bch2_open_bucket_put(c, ob);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_keylist_free(&s->keys, s->inline_keys);
|
|
+
|
|
+ ec_stripe_buf_exit(&s->existing_stripe);
|
|
+ ec_stripe_buf_exit(&s->new_stripe);
|
|
+ closure_debug_destroy(&s->iodone);
|
|
+ kfree(s);
|
|
+}
|
|
+
|
|
+/*
+ * Work item: create every stripe on c->ec_stripe_new_list whose pin count has
+ * dropped to zero. Each such stripe is unlinked under ec_stripe_new_lock and
+ * then handed to ec_stripe_create() with the lock released; the list is
+ * rescanned from the start afterwards.
+ */
+static void ec_stripe_create_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work,
+		struct bch_fs, ec_stripe_create_work);
+
+	for (;;) {
+		struct ec_stripe_new *s, *ready = NULL;
+
+		mutex_lock(&c->ec_stripe_new_lock);
+		list_for_each_entry(s, &c->ec_stripe_new_list, list)
+			if (!atomic_read(&s->pin)) {
+				list_del(&s->list);
+				ready = s;
+				break;
+			}
+		mutex_unlock(&c->ec_stripe_new_lock);
+
+		if (!ready)
+			break;
+
+		ec_stripe_create(ready);
+	}
+}
|
|
+
|
|
+/*
+ * Drop one pin on new stripe @s. When the last pin goes away the stripe must
+ * already be marked pending, and the stripe-create worker is queued.
+ */
+static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s)
+{
+	BUG_ON(atomic_read(&s->pin) <= 0);
+
+	if (!atomic_dec_and_test(&s->pin))
+		return;
+
+	BUG_ON(!s->pending);
+	queue_work(system_long_wq, &c->ec_stripe_create_work);
+}
|
|
+
|
|
+/*
+ * Detach the in-progress stripe from head @h, mark it pending, put it on
+ * c->ec_stripe_new_list and drop one pin via ec_stripe_new_put(). The stripe
+ * must either have its buckets allocated or have an error set.
+ */
+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *pending = h->s;
+
+	BUG_ON(!(pending->allocated || pending->err));
+
+	h->s = NULL;
+	pending->pending = true;
+
+	mutex_lock(&c->ec_stripe_new_lock);
+	list_add(&pending->list, &c->ec_stripe_new_list);
+	mutex_unlock(&c->ec_stripe_new_lock);
+
+	ec_stripe_new_put(c, pending);
+}
|
|
+
|
|
+/*
+ * Have a full bucket - hand it off to be erasure coded.
+ *
+ * If the bucket still has free sectors the owning stripe is flagged with an
+ * error (-1) first. Either way one pin on the stripe is dropped.
+ */
+void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
+{
+	struct ec_stripe_new *stripe = ob->ec;
+
+	if (ob->sectors_free != 0)
+		stripe->err = -1;
+
+	ec_stripe_new_put(c, stripe);
+}
|
|
+
|
|
+/*
+ * Flag the new stripe that @ob belongs to as failed with -EIO. @c is unused.
+ */
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+	ob->ec->err = -EIO;
+}
|
|
+
|
|
+/*
+ * Returns the address inside the new stripe's data buffer that corresponds to
+ * the current write position of the write point's erasure coded open bucket
+ * (bucket size minus sectors still free, converted to bytes), or NULL if
+ * ec_open_bucket() finds no such bucket.
+ */
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+	struct bch_dev *ca;
+	unsigned sectors_written;
+
+	if (!ob)
+		return NULL;
+
+	ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+	sectors_written = ca->mi.bucket_size - ob->sectors_free;
+
+	/* << 9: sectors to bytes */
+	return ob->ec->new_stripe.data[ob->ec_idx] + (sectors_written << 9);
+}
|
|
+
|
|
+/*
+ * Record, in the keylist of the new stripe that the write point's erasure
+ * coded open bucket belongs to, a key ending at @pos with size @sectors.
+ * Does nothing if the write point has no such bucket; BUGs if the keylist
+ * cannot be grown.
+ */
+void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
+			     struct bpos pos, unsigned sectors)
+{
+	struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+	struct ec_stripe_new *stripe;
+	struct bkey_i *slot;
+
+	if (!ob)
+		return;
+
+	stripe = ob->ec;
+	mutex_lock(&stripe->lock);
+
+	if (bch2_keylist_realloc(&stripe->keys, stripe->inline_keys,
+				 ARRAY_SIZE(stripe->inline_keys),
+				 BKEY_U64s)) {
+		BUG();
+	}
+
+	/* fetch the top slot only after the keylist may have been reallocated */
+	slot = stripe->keys.top;
+	bkey_init(&slot->k);
+	slot->k.p = pos;
+	bch2_key_resize(&slot->k, sectors);
+	bch2_keylist_push(&stripe->keys);
+
+	mutex_unlock(&stripe->lock);
+}
|
|
+
|
|
+/* sort() comparison callback for arrays of unsigned, via cmp_int() */
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+	const unsigned *l = _l;
+	const unsigned *r = _r;
+
+	return cmp_int(*l, *r);
+}
|
|
+
|
|
+/*
+ * Pick the most common bucket size among the member devices in @devs.
+ *
+ * The sizes are collected, sorted, and scanned as runs of equal values. A run
+ * only replaces the best one so far if it is strictly longer, so on a tie the
+ * size seen first in sorted order wins. Returns 0 if no device was visited.
+ */
+static unsigned pick_blocksize(struct bch_fs *c,
+			       struct bch_devs_mask *devs)
+{
+	struct bch_dev *ca;
+	unsigned i, nr_sizes = 0, sizes[BCH_SB_MEMBERS_MAX];
+	struct {
+		unsigned nr, size;
+	} run = { 0, 0 }, winner = { 0, 0 };
+
+	for_each_member_device_rcu(ca, c, i, devs)
+		sizes[nr_sizes++] = ca->mi.bucket_size;
+
+	sort(sizes, nr_sizes, sizeof(unsigned), unsigned_cmp, NULL);
+
+	for (i = 0; i < nr_sizes; i++) {
+		if (sizes[i] != run.size) {
+			/* a new run starts: close out the previous one */
+			if (run.nr > winner.nr)
+				winner = run;
+
+			run.size = sizes[i];
+			run.nr = 0;
+		}
+
+		run.nr++;
+	}
+
+	/* the final run has not been compared yet */
+	if (run.nr > winner.nr)
+		winner = run;
+
+	return winner.size;
+}
|
|
+
|
|
+static bool may_create_new_stripe(struct bch_fs *c)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/*
+ * Initialise stripe key @s for @nr_data data blocks plus @nr_parity parity
+ * blocks of @stripe_size sectors each, with CRC32C checksums.
+ *
+ * The checksum granularity starts at ilog2(c->sb.encoded_extent_max) and is
+ * coarsened one bit at a time until the value fits in BKEY_VAL_U64s_MAX;
+ * BUGs if the granularity would reach the stripe size or overflow a u8.
+ */
+static void ec_stripe_key_init(struct bch_fs *c,
+			       struct bkey_i_stripe *s,
+			       unsigned nr_data,
+			       unsigned nr_parity,
+			       unsigned stripe_size)
+{
+	struct bch_stripe *v = &s->v;
+	unsigned u64s;
+
+	bkey_stripe_init(&s->k_i);
+	v->sectors			= cpu_to_le16(stripe_size);
+	v->algorithm			= 0;
+	v->nr_blocks			= nr_data + nr_parity;
+	v->nr_redundant			= nr_parity;
+	v->csum_granularity_bits	= ilog2(c->sb.encoded_extent_max);
+	v->csum_type			= BCH_CSUM_CRC32C;
+	v->pad				= 0;
+
+	for (;;) {
+		u64s = stripe_val_u64s(v);
+		if (u64s <= BKEY_VAL_U64s_MAX)
+			break;
+
+		BUG_ON(1 << v->csum_granularity_bits >=
+		       le16_to_cpu(v->sectors) ||
+		       v->csum_granularity_bits == U8_MAX);
+		v->csum_granularity_bits++;
+	}
+
+	set_bkey_val_u64s(&s->k, u64s);
+}
|
|
+
|
|
+/*
+ * Allocate and initialise a new in-progress stripe for head @h and attach it
+ * as h->s. The stripe starts with one pin. The data block count is the number
+ * of active devices (capped at BCH_BKEY_PTRS_MAX) minus the head's
+ * redundancy.
+ *
+ * Caller must hold h->lock. Returns 0 on success, -ENOMEM on allocation
+ * failure.
+ */
+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *stripe;
+
+	lockdep_assert_held(&h->lock);
+
+	stripe = kzalloc(sizeof(*stripe), GFP_KERNEL);
+	if (!stripe)
+		return -ENOMEM;
+
+	mutex_init(&stripe->lock);
+	closure_init(&stripe->iodone, NULL);
+	atomic_set(&stripe->pin, 1);
+
+	stripe->c		= c;
+	stripe->h		= h;
+	stripe->nr_parity	= h->redundancy;
+	stripe->nr_data		= min_t(unsigned, h->nr_active_devs,
+					BCH_BKEY_PTRS_MAX) - h->redundancy;
+
+	bch2_keylist_init(&stripe->keys, stripe->inline_keys);
+
+	ec_stripe_key_init(c, &stripe->new_stripe.key, stripe->nr_data,
+			   stripe->nr_parity, h->blocksize);
+
+	h->s = stripe;
+	return 0;
+}
|
|
+
|
|
+/*
+ * Allocate a stripe head for (@target, @algo, @redundancy, @copygc), work out
+ * which devices it may use and its block size, and add it to
+ * c->ec_stripe_head_list.
+ *
+ * Devices with zero durability are removed from the head's device mask; the
+ * block size is chosen by pick_blocksize() and nr_active_devs counts the
+ * devices whose bucket size equals it.
+ *
+ * The head is returned with h->lock held. Returns NULL on allocation failure.
+ */
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
+			 unsigned algo, unsigned redundancy,
+			 bool copygc)
+{
+	struct ec_stripe_head *head;
+	struct bch_dev *ca;
+	unsigned dev;
+
+	head = kzalloc(sizeof(*head), GFP_KERNEL);
+	if (!head)
+		return NULL;
+
+	mutex_init(&head->lock);
+	mutex_lock(&head->lock);
+
+	head->target		= target;
+	head->algo		= algo;
+	head->redundancy	= redundancy;
+	head->copygc		= copygc;
+
+	rcu_read_lock();
+	head->devs = target_rw_devs(c, BCH_DATA_user, target);
+
+	for_each_member_device_rcu(ca, c, dev, &head->devs)
+		if (!ca->mi.durability)
+			__clear_bit(dev, head->devs.d);
+
+	head->blocksize = pick_blocksize(c, &head->devs);
+
+	for_each_member_device_rcu(ca, c, dev, &head->devs)
+		if (ca->mi.bucket_size == head->blocksize)
+			head->nr_active_devs++;
+
+	rcu_read_unlock();
+	list_add(&head->list, &c->ec_stripe_head_list);
+	return head;
+}
|
|
+
|
|
+/*
+ * Release stripe head @h (unlocks h->lock). If the head's in-progress stripe
+ * has its buckets allocated and all of its data blocks are marked in
+ * blocks_allocated, the stripe is first moved to the pending list.
+ */
+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = h->s;
+
+	if (s && s->allocated &&
+	    bitmap_weight(s->blocks_allocated, s->nr_data) == s->nr_data)
+		ec_stripe_set_pending(c, h);
+
+	mutex_unlock(&h->lock);
+}
|
|
+
|
|
+/*
+ * Find the stripe head matching (@target, @algo, @redundancy, @copygc),
+ * allocating a new one if none exists.
+ *
+ * Returns the head with h->lock held, or NULL if @redundancy is 0 or a new
+ * head could not be allocated.
+ */
+struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
+						 unsigned target,
+						 unsigned algo,
+						 unsigned redundancy,
+						 bool copygc)
+{
+	struct ec_stripe_head *h, *found = NULL;
+
+	if (!redundancy)
+		return NULL;
+
+	mutex_lock(&c->ec_stripe_head_lock);
+	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+		if (h->target != target ||
+		    h->algo != algo ||
+		    h->redundancy != redundancy ||
+		    h->copygc != copygc)
+			continue;
+
+		mutex_lock(&h->lock);
+		found = h;
+		break;
+	}
+
+	if (!found)
+		found = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc);
+
+	mutex_unlock(&c->ec_stripe_head_lock);
+	return found;
+}
|
|
+
|
|
+static enum bucket_alloc_ret
|
|
+new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
|
|
+ struct closure *cl)
|
|
+{
|
|
+ struct bch_devs_mask devs = h->devs;
|
|
+ struct open_bucket *ob;
|
|
+ struct open_buckets buckets;
|
|
+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0;
|
|
+ bool have_cache = true;
|
|
+ enum bucket_alloc_ret ret = ALLOC_SUCCESS;
|
|
+
|
|
+ for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
|
|
+ if (test_bit(i, h->s->blocks_gotten)) {
|
|
+ __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
|
|
+ if (i < h->s->nr_data)
|
|
+ nr_have_data++;
|
|
+ else
|
|
+ nr_have_parity++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ BUG_ON(nr_have_data > h->s->nr_data);
|
|
+ BUG_ON(nr_have_parity > h->s->nr_parity);
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ rcu_read_lock();
|
|
+
|
|
+ buckets.nr = 0;
|
|
+ if (nr_have_parity < h->s->nr_parity) {
|
|
+ ret = bch2_bucket_alloc_set(c, &buckets,
|
|
+ &h->parity_stripe,
|
|
+ &devs,
|
|
+ h->s->nr_parity,
|
|
+ &nr_have_parity,
|
|
+ &have_cache,
|
|
+ h->copygc
|
|
+ ? RESERVE_MOVINGGC
|
|
+ : RESERVE_NONE,
|
|
+ 0,
|
|
+ cl);
|
|
+
|
|
+ open_bucket_for_each(c, &buckets, ob, i) {
|
|
+ j = find_next_zero_bit(h->s->blocks_gotten,
|
|
+ h->s->nr_data + h->s->nr_parity,
|
|
+ h->s->nr_data);
|
|
+ BUG_ON(j >= h->s->nr_data + h->s->nr_parity);
|
|
+
|
|
+ h->s->blocks[j] = buckets.v[i];
|
|
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
|
|
+ __set_bit(j, h->s->blocks_gotten);
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ buckets.nr = 0;
|
|
+ if (nr_have_data < h->s->nr_data) {
|
|
+ ret = bch2_bucket_alloc_set(c, &buckets,
|
|
+ &h->block_stripe,
|
|
+ &devs,
|
|
+ h->s->nr_data,
|
|
+ &nr_have_data,
|
|
+ &have_cache,
|
|
+ h->copygc
|
|
+ ? RESERVE_MOVINGGC
|
|
+ : RESERVE_NONE,
|
|
+ 0,
|
|
+ cl);
|
|
+
|
|
+ open_bucket_for_each(c, &buckets, ob, i) {
|
|
+ j = find_next_zero_bit(h->s->blocks_gotten,
|
|
+ h->s->nr_data, 0);
|
|
+ BUG_ON(j >= h->s->nr_data);
|
|
+
|
|
+ h->s->blocks[j] = buckets.v[i];
|
|
+ h->s->new_stripe.key.v.ptrs[j] = ob->ptr;
|
|
+ __set_bit(j, h->s->blocks_gotten);
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+err:
|
|
+ rcu_read_unlock();
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
+ * Look for an existing stripe that head @head could reuse: one with the same
+ * algorithm, redundancy and block size that has at least one nonempty block
+ * but fewer nonempty blocks than data blocks. The first match found in heap
+ * order is removed from the stripes heap.
+ *
+ * Returns the stripe index, or -1 if may_create_new_stripe() says a new
+ * stripe may be created or no suitable stripe exists.
+ *
+ * XXX: doesn't obey target:
+ */
+static s64 get_existing_stripe(struct bch_fs *c,
+			       struct ec_stripe_head *head)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	size_t pos;
+	s64 ret = -1;
+
+	if (may_create_new_stripe(c))
+		return -1;
+
+	spin_lock(&c->ec_stripes_heap_lock);
+	for (pos = 0; pos < h->used; pos++) {
+		struct stripe *m;
+		u64 stripe_idx;
+
+		/* No blocks worth reusing, stripe will just be deleted: */
+		if (!h->data[pos].blocks_nonempty)
+			continue;
+
+		stripe_idx = h->data[pos].idx;
+		m = genradix_ptr(&c->stripes[0], stripe_idx);
+
+		if (m->algorithm != head->algo ||
+		    m->nr_redundant != head->redundancy ||
+		    m->sectors != head->blocksize ||
+		    m->blocks_nonempty >= m->nr_blocks - m->nr_redundant)
+			continue;
+
+		bch2_stripes_heap_del(c, m, stripe_idx);
+		ret = stripe_idx;
+		break;
+	}
+	spin_unlock(&c->ec_stripes_heap_lock);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Set up the head's in-progress stripe to reuse an existing, partially empty
+ * stripe: pick one with get_existing_stripe(), read its key, start reads of
+ * all of its blocks, mark the blocks that still hold data as gotten and
+ * allocated, and copy its key into the new stripe.
+ *
+ * Returns 0 on success, -ENOSPC if no reusable stripe exists, or the error
+ * from get_stripe_key() (which is also reported as a fatal fs error).
+ */
+static int __bch2_ec_stripe_head_reuse(struct bch_fs *c,
+				       struct ec_stripe_head *h)
+{
+	struct ec_stripe_new *s = h->s;
+	struct ec_stripe_buf *existing = &s->existing_stripe;
+	unsigned i;
+	s64 idx;
+	int ret;
+
+	idx = get_existing_stripe(c, h);
+	if (idx < 0) {
+		bch_err(c, "failed to find an existing stripe");
+		return -ENOSPC;
+	}
+
+	s->have_existing_stripe = true;
+	ret = get_stripe_key(c, idx, existing);
+	if (ret) {
+		bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
+		return ret;
+	}
+
+	if (ec_stripe_buf_init(existing, 0, h->blocksize)) {
+		/*
+		 * this is a problem: we have deleted from the
+		 * stripes heap already
+		 */
+		BUG();
+	}
+
+	BUG_ON(existing->size != h->blocksize);
+	/* key.v.sectors is little endian on disk: convert before comparing */
+	BUG_ON(existing->size != le16_to_cpu(existing->key.v.sectors));
+
+	for (i = 0; i < existing->key.v.nr_blocks; i++) {
+		/* blocks that still contain data are kept as they are */
+		if (stripe_blockcount_get(&existing->key.v, i)) {
+			__set_bit(i, s->blocks_gotten);
+			__set_bit(i, s->blocks_allocated);
+		}
+
+		ec_block_io(c, existing, REQ_OP_READ, i, &s->iodone);
+	}
+
+	bkey_copy(&s->new_stripe.key.k_i,
+		  &existing->key.k_i);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Take a disk reservation of h->blocksize sectors times nr_parity replicas
+ * for the head's in-progress stripe. Logs an error and returns the
+ * reservation error on failure, 0 on success.
+ */
+static int __bch2_ec_stripe_head_reserve(struct bch_fs *c,
+					 struct ec_stripe_head *h)
+{
+	int ret = bch2_disk_reservation_get(c, &h->s->res,
+					    h->blocksize,
+					    h->s->nr_parity, 0);
+
+	/*
+	 * Failure means we need to wait for copygc to
+	 * empty out buckets from existing stripes:
+	 */
+	if (ret)
+		bch_err(c, "failed to reserve stripe");
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Get the stripe head for (@target, @algo, @redundancy, @copygc) with an
+ * in-progress stripe that has its buckets allocated.
+ *
+ * If the head has no in-progress stripe one is allocated. A disk reservation
+ * for a brand new stripe is tried first; only if that fails for a freshly
+ * allocated stripe is an existing stripe reused.
+ *
+ * Returns the head with h->lock held on success, NULL if no head is
+ * available, or an ERR_PTR() on failure. The encoded error is either a
+ * negative errno, or the negated enum bucket_alloc_ret from
+ * new_stripe_alloc_buckets().
+ */
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
+					       unsigned target,
+					       unsigned algo,
+					       unsigned redundancy,
+					       bool copygc,
+					       struct closure *cl)
+{
+	struct ec_stripe_head *h;
+	int ret;
+	bool needs_stripe_new;
+
+	h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc);
+	if (!h) {
+		bch_err(c, "no stripe head");
+		return NULL;
+	}
+
+	needs_stripe_new = !h->s;
+	if (needs_stripe_new) {
+		if (ec_new_stripe_alloc(c, h)) {
+			ret = -ENOMEM;
+			bch_err(c, "failed to allocate new stripe");
+			goto err;
+		}
+
+		if (ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize))
+			BUG();
+	}
+
+	/*
+	 * Try reserve a new stripe before reusing an
+	 * existing stripe. This will prevent unnecessary
+	 * read amplification during write oriented workloads.
+	 */
+	ret = 0;
+	if (!h->s->allocated && !h->s->res.sectors && !h->s->have_existing_stripe)
+		ret = __bch2_ec_stripe_head_reserve(c, h);
+	if (ret && needs_stripe_new)
+		ret = __bch2_ec_stripe_head_reuse(c, h);
+	if (ret)
+		goto err;
+
+	if (!h->s->allocated) {
+		ret = new_stripe_alloc_buckets(c, h, cl);
+		if (ret)
+			goto err;
+
+		h->s->allocated = true;
+	}
+
+	return h;
+
+err:
+	bch2_ec_stripe_head_put(c, h);
+	/*
+	 * ret is either a negative errno (allocation/reserve/reuse paths) or a
+	 * positive enum bucket_alloc_ret. Always encode a negative value:
+	 * negating a negative errno would give a small positive pointer that
+	 * is neither NULL nor IS_ERR().
+	 */
+	return ERR_PTR(ret < 0 ? ret : -ret);
+}
|
|
+
|
|
+/*
+ * For every stripe head whose in-progress stripe has an open bucket on
+ * device @ca, fail that stripe with -EROFS and move it to the pending list.
+ */
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct ec_stripe_head *h;
+	struct open_bucket *ob;
+	unsigned i;
+
+	mutex_lock(&c->ec_stripe_head_lock);
+	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+		bool uses_dev = false;
+
+		mutex_lock(&h->lock);
+
+		if (h->s) {
+			for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) {
+				if (!h->s->blocks[i])
+					continue;
+
+				ob = c->open_buckets + h->s->blocks[i];
+				if (ob->ptr.dev == ca->dev_idx) {
+					uses_dev = true;
+					break;
+				}
+			}
+
+			if (uses_dev) {
+				h->s->err = -EROFS;
+				ec_stripe_set_pending(c, h);
+			}
+		}
+
+		mutex_unlock(&h->lock);
+	}
+	mutex_unlock(&c->ec_stripe_head_lock);
+}
|
|
+
|
|
+/*
+ * Put every alive in-memory stripe in c->stripes[0] onto the stripes heap.
+ */
+void bch2_stripes_heap_start(struct bch_fs *c)
+{
+	struct genradix_iter iter;
+	struct stripe *stripe;
+
+	genradix_for_each(&c->stripes[0], iter, stripe) {
+		if (!stripe->alive)
+			continue;
+
+		bch2_stripes_heap_insert(c, stripe, iter.pos);
+	}
+}
|
|
+
|
|
+/*
+ * Bring the on-disk block sector counts of stripe @idx in line with the
+ * in-memory ones in @m. If any count differs, @new_key is filled with a copy
+ * of the existing key carrying the in-memory counts and queued as an update.
+ *
+ * Returns 0 if nothing needed writing or the update was queued, -EIO if the
+ * key at @idx is not a stripe, or the btree lookup error.
+ */
+static int __bch2_stripe_write_key(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct stripe *m,
+				   size_t idx,
+				   struct bkey_i_stripe *new_key)
+{
+	const struct bch_stripe *v;
+	struct bkey_s_c k;
+	bool differs = false;
+	unsigned i;
+	int ret;
+
+	bch2_btree_iter_set_pos(iter, POS(0, idx));
+
+	k = bch2_btree_iter_peek_slot(iter);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (k.k->type != KEY_TYPE_stripe)
+		return -EIO;
+
+	v = bkey_s_c_to_stripe(k).v;
+	for (i = 0; i < v->nr_blocks; i++) {
+		if (m->block_sectors[i] != stripe_blockcount_get(v, i)) {
+			differs = true;
+			break;
+		}
+	}
+
+	if (!differs)
+		return 0;
+
+	bkey_reassemble(&new_key->k_i, k);
+
+	for (i = 0; i < new_key->v.nr_blocks; i++)
+		stripe_blockcount_set(&new_key->v, i,
+				      m->block_sectors[i]);
+
+	bch2_trans_update(trans, iter, &new_key->k_i, 0);
+	return 0;
+}
|
|
+
|
|
+/*
+ * Write the in-memory block sector counts of every alive stripe back to the
+ * stripes btree, one transaction per stripe.
+ *
+ * @flags is OR-ed into the commit flags alongside BTREE_INSERT_NOFAIL.
+ *
+ * Returns 0 on success, -ENOMEM if the scratch key cannot be allocated, or
+ * the first error from updating a stripe (iteration stops there).
+ */
+int bch2_stripes_write(struct bch_fs *c, unsigned flags)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct genradix_iter giter;
+	struct bkey_i_stripe *new_key;
+	struct stripe *m;
+	int ret = 0;
+
+	/* scratch buffer of 255 u64s, reused for every stripe key */
+	new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL);
+	if (!new_key)
+		return -ENOMEM;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS_MIN,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+
+	genradix_for_each(&c->stripes[0], giter, m) {
+		if (!m->alive)
+			continue;
+
+		ret = __bch2_trans_do(&trans, NULL, NULL,
+				      BTREE_INSERT_NOFAIL|flags,
+			__bch2_stripe_write_key(&trans, iter, m,
+					giter.pos, new_key));
+
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	bch2_trans_exit(&trans);
+
+	kfree(new_key);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Per-key callback for bch2_stripes_read(): for stripe keys, allocate the
+ * in-memory stripe state and then mark the key. Other key types are ignored.
+ * Returns 0 or the first error encountered.
+ */
+static int bch2_stripes_read_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+	int ret;
+
+	if (k.k->type != KEY_TYPE_stripe)
+		return 0;
+
+	ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	return bch2_mark_key(c, k, 0, 0, NULL, 0,
+			     BTREE_TRIGGER_NOATOMIC);
+}
|
|
+
|
|
+/*
+ * Walk the stripes btree (and journal) calling bch2_stripes_read_fn() on each
+ * key. Logs and returns the error if the walk fails, 0 otherwise.
+ */
+int bch2_stripes_read(struct bch_fs *c)
+{
+	int ret;
+
+	ret = bch2_btree_and_journal_walk(c, BTREE_ID_stripes,
+					  bch2_stripes_read_fn);
+	if (ret)
+		bch_err(c, "error reading stripes: %i", ret);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Preallocate in-memory stripe state for every index up to the last key in
+ * the stripes btree. With @gc false the stripes heap is also initialised and
+ * c->stripes[0] is populated; with @gc true only c->stripes[1] is populated.
+ *
+ * Returns 0 on success (including when the btree has no stripes), the error
+ * from bch2_trans_exit(), or -ENOMEM.
+ */
+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c last;
+	size_t i, nr = 0;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_stripes, POS(0, U64_MAX), 0);
+
+	/* the last key in the btree gives the highest stripe index in use */
+	last = bch2_btree_iter_prev(iter);
+	if (!IS_ERR_OR_NULL(last.k))
+		nr = last.k->p.offset + 1;
+
+	bch2_trans_iter_put(&trans, iter);
+	ret = bch2_trans_exit(&trans);
+	if (ret)
+		return ret;
+
+	if (!nr)
+		return 0;
+
+	if (!gc &&
+	    !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(nr),
+		       GFP_KERNEL))
+		return -ENOMEM;
+#if 0
+	ret = genradix_prealloc(&c->stripes[gc], nr, GFP_KERNEL);
+#else
+	for (i = 0; i < nr; i++)
+		if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL))
+			return -ENOMEM;
+#endif
+	return 0;
+}
|
|
+
|
|
+/*
+ * Print up to the first 20 entries of the stripes heap to @out, one per line:
+ * stripe index, nonempty block count, then data block count + redundancy.
+ */
+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	ec_stripes_heap *h = &c->ec_stripes_heap;
+	size_t i, nr;
+
+	spin_lock(&c->ec_stripes_heap_lock);
+	nr = min_t(size_t, h->used, 20);
+
+	for (i = 0; i < nr; i++) {
+		struct stripe *m = genradix_ptr(&c->stripes[0], h->data[i].idx);
+
+		pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx,
+		       h->data[i].blocks_nonempty,
+		       m->nr_blocks - m->nr_redundant,
+		       m->nr_redundant);
+	}
+	spin_unlock(&c->ec_stripes_heap_lock);
+}
|
|
+
|
|
+/*
+ * Print to @out a summary of every stripe head (with its in-progress stripe,
+ * if any) followed by every stripe on the new-stripe list.
+ */
+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c)
+{
+	struct ec_stripe_head *h;
+	struct ec_stripe_new *s;
+
+	mutex_lock(&c->ec_stripe_head_lock);
+	list_for_each_entry(h, &c->ec_stripe_head_list, list) {
+		pr_buf(out, "target %u algo %u redundancy %u:\n",
+		       h->target, h->algo, h->redundancy);
+
+		if (!h->s)
+			continue;
+
+		pr_buf(out, "\tpending: blocks %u+%u allocated %u\n",
+		       h->s->nr_data, h->s->nr_parity,
+		       bitmap_weight(h->s->blocks_allocated,
+				     h->s->nr_data));
+	}
+	mutex_unlock(&c->ec_stripe_head_lock);
+
+	mutex_lock(&c->ec_stripe_new_lock);
+	list_for_each_entry(s, &c->ec_stripe_new_list, list)
+		pr_buf(out, "\tin flight: blocks %u+%u pin %u\n",
+		       s->nr_data, s->nr_parity,
+		       atomic_read(&s->pin));
+	mutex_unlock(&c->ec_stripe_new_lock);
+}
|
|
+
|
|
+/*
+ * Tear down erasure coding state: free every stripe head (none may still
+ * have an in-progress stripe), check that no new stripes are outstanding,
+ * then free the stripes heap, c->stripes[0] and the EC bioset.
+ */
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+	for (;;) {
+		struct ec_stripe_head *head;
+
+		/* pop one head at a time so the list lock is not held across kfree() */
+		mutex_lock(&c->ec_stripe_head_lock);
+		head = list_first_entry_or_null(&c->ec_stripe_head_list,
+						struct ec_stripe_head, list);
+		if (head)
+			list_del(&head->list);
+		mutex_unlock(&c->ec_stripe_head_lock);
+
+		if (!head)
+			break;
+
+		BUG_ON(head->s);
+		kfree(head);
+	}
+
+	BUG_ON(!list_empty(&c->ec_stripe_new_list));
+
+	free_heap(&c->ec_stripes_heap);
+	genradix_free(&c->stripes[0]);
+	bioset_exit(&c->ec_bioset);
+}
|
|
+
|
|
+/*
+ * Set up erasure coding state: the stripe create/delete work items and the
+ * bioset used for EC I/O. Returns bioset_init()'s result.
+ */
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+	int ret;
+
+	INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work);
+	INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+
+	ret = bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+			  BIOSET_NEED_BVECS);
+	return ret;
+}
|
|
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
|
|
new file mode 100644
|
|
index 000000000000..e79626b59509
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/ec.h
|
|
@@ -0,0 +1,229 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_EC_H
|
|
+#define _BCACHEFS_EC_H
|
|
+
|
|
+#include "ec_types.h"
|
|
+#include "buckets_types.h"
|
|
+#include "keylist_types.h"
|
|
+
|
|
+const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+
|
|
+#define bch2_bkey_ops_stripe (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_stripe_invalid, \
|
|
+ .val_to_text = bch2_stripe_to_text, \
|
|
+ .swab = bch2_ptr_swab, \
|
|
+}
|
|
+
|
|
+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
|
|
+{
|
|
+ return DIV_ROUND_UP(le16_to_cpu(s->sectors),
|
|
+ 1 << s->csum_granularity_bits);
|
|
+}
|
|
+
|
|
+static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
|
|
+ unsigned dev, unsigned csum_idx)
|
|
+{
|
|
+ unsigned csum_bytes = bch_crc_bytes[s->csum_type];
|
|
+
|
|
+ return sizeof(struct bch_stripe) +
|
|
+ sizeof(struct bch_extent_ptr) * s->nr_blocks +
|
|
+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
|
|
+}
|
|
+
|
|
+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s,
|
|
+ unsigned idx)
|
|
+{
|
|
+ return stripe_csum_offset(s, s->nr_blocks, 0) +
|
|
+ sizeof(u16) * idx;
|
|
+}
|
|
+
|
|
+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s,
|
|
+ unsigned idx)
|
|
+{
|
|
+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx));
|
|
+}
|
|
+
|
|
+static inline void stripe_blockcount_set(struct bch_stripe *s,
|
|
+ unsigned idx, unsigned v)
|
|
+{
|
|
+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx);
|
|
+
|
|
+ *p = cpu_to_le16(v);
|
|
+}
|
|
+
|
|
+static inline unsigned stripe_val_u64s(const struct bch_stripe *s)
|
|
+{
|
|
+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks),
|
|
+ sizeof(u64));
|
|
+}
|
|
+
|
|
+static inline void *stripe_csum(struct bch_stripe *s,
|
|
+ unsigned block, unsigned csum_idx)
|
|
+{
|
|
+ EBUG_ON(block >= s->nr_blocks);
|
|
+ EBUG_ON(csum_idx >= stripe_csums_per_device(s));
|
|
+
|
|
+ return (void *) s + stripe_csum_offset(s, block, csum_idx);
|
|
+}
|
|
+
|
|
+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s,
|
|
+ unsigned block, unsigned csum_idx)
|
|
+{
|
|
+ struct bch_csum csum = { 0 };
|
|
+
|
|
+ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]);
|
|
+ return csum;
|
|
+}
|
|
+
|
|
+static inline void stripe_csum_set(struct bch_stripe *s,
|
|
+ unsigned block, unsigned csum_idx,
|
|
+ struct bch_csum csum)
|
|
+{
|
|
+ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]);
|
|
+}
|
|
+
|
|
+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr,
|
|
+ const struct bch_extent_ptr *data_ptr,
|
|
+ unsigned sectors)
|
|
+{
|
|
+ return data_ptr->dev == stripe_ptr->dev &&
|
|
+ data_ptr->gen == stripe_ptr->gen &&
|
|
+ data_ptr->offset >= stripe_ptr->offset &&
|
|
+ data_ptr->offset < stripe_ptr->offset + sectors;
|
|
+}
|
|
+
|
|
+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s,
|
|
+ struct extent_ptr_decoded p)
|
|
+{
|
|
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
|
|
+
|
|
+ BUG_ON(!p.has_ec);
|
|
+
|
|
+ if (p.ec.block >= nr_data)
|
|
+ return false;
|
|
+
|
|
+ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr,
|
|
+ le16_to_cpu(s->sectors));
|
|
+}
|
|
+
|
|
+static inline bool bch2_ptr_matches_stripe_m(const struct stripe *m,
|
|
+ struct extent_ptr_decoded p)
|
|
+{
|
|
+ unsigned nr_data = m->nr_blocks - m->nr_redundant;
|
|
+
|
|
+ BUG_ON(!p.has_ec);
|
|
+
|
|
+ if (p.ec.block >= nr_data)
|
|
+ return false;
|
|
+
|
|
+ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr,
|
|
+ m->sectors);
|
|
+}
|
|
+
|
|
+struct bch_read_bio;
|
|
+
|
|
+struct ec_stripe_buf {
|
|
+ /* might not be buffering the entire stripe: */
|
|
+ unsigned offset;
|
|
+ unsigned size;
|
|
+ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
|
|
+
|
|
+ void *data[BCH_BKEY_PTRS_MAX];
|
|
+
|
|
+ union {
|
|
+ struct bkey_i_stripe key;
|
|
+ u64 pad[255];
|
|
+ };
|
|
+};
|
|
+
|
|
+struct ec_stripe_head;
|
|
+
|
|
+struct ec_stripe_new {
|
|
+ struct bch_fs *c;
|
|
+ struct ec_stripe_head *h;
|
|
+ struct mutex lock;
|
|
+ struct list_head list;
|
|
+ struct closure iodone;
|
|
+
|
|
+ /* counts in flight writes, stripe is created when pin == 0 */
|
|
+ atomic_t pin;
|
|
+
|
|
+ int err;
|
|
+
|
|
+ u8 nr_data;
|
|
+ u8 nr_parity;
|
|
+ bool allocated;
|
|
+ bool pending;
|
|
+ bool have_existing_stripe;
|
|
+
|
|
+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
|
|
+ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
|
|
+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
|
|
+ struct disk_reservation res;
|
|
+
|
|
+ struct keylist keys;
|
|
+ u64 inline_keys[BKEY_U64s * 8];
|
|
+
|
|
+ struct ec_stripe_buf new_stripe;
|
|
+ struct ec_stripe_buf existing_stripe;
|
|
+};
|
|
+
|
|
+struct ec_stripe_head {
|
|
+ struct list_head list;
|
|
+ struct mutex lock;
|
|
+
|
|
+ unsigned target;
|
|
+ unsigned algo;
|
|
+ unsigned redundancy;
|
|
+ bool copygc;
|
|
+
|
|
+ struct bch_devs_mask devs;
|
|
+ unsigned nr_active_devs;
|
|
+
|
|
+ unsigned blocksize;
|
|
+
|
|
+ struct dev_stripe_state block_stripe;
|
|
+ struct dev_stripe_state parity_stripe;
|
|
+
|
|
+ struct ec_stripe_new *s;
|
|
+};
|
|
+
|
|
+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
|
|
+
|
|
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
|
|
+void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
|
|
+ struct bpos, unsigned);
|
|
+
|
|
+void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
|
|
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
|
|
+
|
|
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
|
|
+
|
|
+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
|
|
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *,
|
|
+ unsigned, unsigned, unsigned, bool, struct closure *);
|
|
+
|
|
+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
|
|
+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
|
|
+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t);
|
|
+
|
|
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
|
|
+
|
|
+void bch2_ec_flush_new_stripes(struct bch_fs *);
|
|
+
|
|
+void bch2_stripes_heap_start(struct bch_fs *);
|
|
+
|
|
+int bch2_stripes_read(struct bch_fs *);
|
|
+int bch2_stripes_write(struct bch_fs *, unsigned);
|
|
+
|
|
+int bch2_ec_mem_alloc(struct bch_fs *, bool);
|
|
+
|
|
+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *);
|
|
+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *);
|
|
+
|
|
+void bch2_fs_ec_exit(struct bch_fs *);
|
|
+int bch2_fs_ec_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_EC_H */
|
|
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
|
|
new file mode 100644
|
|
index 000000000000..3fc31222459a
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/ec_types.h
|
|
@@ -0,0 +1,37 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_EC_TYPES_H
|
|
+#define _BCACHEFS_EC_TYPES_H
|
|
+
|
|
+#include <linux/llist.h>
|
|
+
|
|
+struct bch_replicas_padded {
|
|
+ struct bch_replicas_entry e;
|
|
+ u8 pad[BCH_BKEY_PTRS_MAX];
|
|
+};
|
|
+
|
|
+struct stripe {
|
|
+ size_t heap_idx;
|
|
+
|
|
+ u16 sectors;
|
|
+ u8 algorithm;
|
|
+
|
|
+ u8 nr_blocks;
|
|
+ u8 nr_redundant;
|
|
+
|
|
+ unsigned alive:1; /* does a corresponding key exist in stripes btree? */
|
|
+ unsigned on_heap:1;
|
|
+ u8 blocks_nonempty;
|
|
+ u16 block_sectors[BCH_BKEY_PTRS_MAX];
|
|
+ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
|
|
+
|
|
+ struct bch_replicas_padded r;
|
|
+};
|
|
+
|
|
+struct ec_stripe_heap_entry {
|
|
+ size_t idx;
|
|
+ unsigned blocks_nonempty;
|
|
+};
|
|
+
|
|
+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
|
|
+
|
|
+#endif /* _BCACHEFS_EC_TYPES_H */
|
|
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
|
|
new file mode 100644
|
|
index 000000000000..90c3b986c264
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/error.c
|
|
@@ -0,0 +1,184 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "error.h"
|
|
+#include "io.h"
|
|
+#include "super.h"
|
|
+
|
|
+#define FSCK_ERR_RATELIMIT_NR 10
|
|
+
|
|
+bool bch2_inconsistent_error(struct bch_fs *c)
|
|
+{
|
|
+ set_bit(BCH_FS_ERROR, &c->flags);
|
|
+
|
|
+ switch (c->opts.errors) {
|
|
+ case BCH_ON_ERROR_continue:
|
|
+ return false;
|
|
+ case BCH_ON_ERROR_ro:
|
|
+ if (bch2_fs_emergency_read_only(c))
|
|
+ bch_err(c, "emergency read only");
|
|
+ return true;
|
|
+ case BCH_ON_ERROR_panic:
|
|
+ panic(bch2_fmt(c, "panic after error"));
|
|
+ return true;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_topology_error(struct bch_fs *c)
|
|
+{
|
|
+ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
|
|
+ if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
|
|
+ bch2_inconsistent_error(c);
|
|
+}
|
|
+
|
|
+void bch2_fatal_error(struct bch_fs *c)
|
|
+{
|
|
+ if (bch2_fs_emergency_read_only(c))
|
|
+ bch_err(c, "emergency read only");
|
|
+}
|
|
+
|
|
+void bch2_io_error_work(struct work_struct *work)
|
|
+{
|
|
+ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ bool dev;
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
|
|
+ BCH_FORCE_IF_DEGRADED);
|
|
+ if (dev
|
|
+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
|
|
+ BCH_FORCE_IF_DEGRADED)
|
|
+ : bch2_fs_emergency_read_only(c))
|
|
+ bch_err(ca,
|
|
+ "too many IO errors, setting %s RO",
|
|
+ dev ? "device" : "filesystem");
|
|
+ up_write(&c->state_lock);
|
|
+}
|
|
+
|
|
+void bch2_io_error(struct bch_dev *ca)
|
|
+{
|
|
+ //queue_work(system_long_wq, &ca->io_error_work);
|
|
+}
|
|
+
|
|
+#ifdef __KERNEL__
|
|
+#define ask_yn() false
|
|
+#else
|
|
+#include "tools-util.h"
|
|
+#endif
|
|
+
|
|
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
|
|
+ const char *fmt, ...)
|
|
+{
|
|
+ struct fsck_err_state *s = NULL;
|
|
+ va_list args;
|
|
+ bool fix = false, print = true, suppressing = false;
|
|
+ char _buf[sizeof(s->buf)], *buf = _buf;
|
|
+
|
|
+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
|
|
+ va_start(args, fmt);
|
|
+ vprintk(fmt, args);
|
|
+ va_end(args);
|
|
+
|
|
+ if (c->opts.errors == BCH_ON_ERROR_continue) {
|
|
+ bch_err(c, "fixing");
|
|
+ return FSCK_ERR_FIX;
|
|
+ } else {
|
|
+ bch2_inconsistent_error(c);
|
|
+ return FSCK_ERR_EXIT;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ mutex_lock(&c->fsck_error_lock);
|
|
+
|
|
+ list_for_each_entry(s, &c->fsck_errors, list)
|
|
+ if (s->fmt == fmt)
|
|
+ goto found;
|
|
+
|
|
+ s = kzalloc(sizeof(*s), GFP_NOFS);
|
|
+ if (!s) {
|
|
+ if (!c->fsck_alloc_err)
|
|
+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs");
|
|
+ c->fsck_alloc_err = true;
|
|
+ buf = _buf;
|
|
+ goto print;
|
|
+ }
|
|
+
|
|
+ INIT_LIST_HEAD(&s->list);
|
|
+ s->fmt = fmt;
|
|
+found:
|
|
+ list_move(&s->list, &c->fsck_errors);
|
|
+ s->nr++;
|
|
+ if (c->opts.ratelimit_errors &&
|
|
+ s->nr >= FSCK_ERR_RATELIMIT_NR) {
|
|
+ if (s->nr == FSCK_ERR_RATELIMIT_NR)
|
|
+ suppressing = true;
|
|
+ else
|
|
+ print = false;
|
|
+ }
|
|
+ buf = s->buf;
|
|
+print:
|
|
+ va_start(args, fmt);
|
|
+ vscnprintf(buf, sizeof(_buf), fmt, args);
|
|
+ va_end(args);
|
|
+
|
|
+ if (c->opts.fix_errors == FSCK_OPT_EXIT) {
|
|
+ bch_err(c, "%s, exiting", buf);
|
|
+ } else if (flags & FSCK_CAN_FIX) {
|
|
+ if (c->opts.fix_errors == FSCK_OPT_ASK) {
|
|
+ printk(KERN_ERR "%s: fix?", buf);
|
|
+ fix = ask_yn();
|
|
+ } else if (c->opts.fix_errors == FSCK_OPT_YES ||
|
|
+ (c->opts.nochanges &&
|
|
+ !(flags & FSCK_CAN_IGNORE))) {
|
|
+ if (print)
|
|
+ bch_err(c, "%s, fixing", buf);
|
|
+ fix = true;
|
|
+ } else {
|
|
+ if (print)
|
|
+ bch_err(c, "%s, not fixing", buf);
|
|
+ fix = false;
|
|
+ }
|
|
+ } else if (flags & FSCK_NEED_FSCK) {
|
|
+ if (print)
|
|
+ bch_err(c, "%s (run fsck to correct)", buf);
|
|
+ } else {
|
|
+ if (print)
|
|
+ bch_err(c, "%s (repair unimplemented)", buf);
|
|
+ }
|
|
+
|
|
+ if (suppressing)
|
|
+ bch_err(c, "Ratelimiting new instances of previous error");
|
|
+
|
|
+ mutex_unlock(&c->fsck_error_lock);
|
|
+
|
|
+ if (fix) {
|
|
+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
|
|
+ return FSCK_ERR_FIX;
|
|
+ } else {
|
|
+ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
|
|
+ set_bit(BCH_FS_ERROR, &c->flags);
|
|
+ return c->opts.fix_errors == FSCK_OPT_EXIT ||
|
|
+ !(flags & FSCK_CAN_IGNORE)
|
|
+ ? FSCK_ERR_EXIT
|
|
+ : FSCK_ERR_IGNORE;
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_flush_fsck_errs(struct bch_fs *c)
|
|
+{
|
|
+ struct fsck_err_state *s, *n;
|
|
+
|
|
+ mutex_lock(&c->fsck_error_lock);
|
|
+
|
|
+ list_for_each_entry_safe(s, n, &c->fsck_errors, list) {
|
|
+ if (s->ratelimited)
|
|
+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf);
|
|
+
|
|
+ list_del(&s->list);
|
|
+ kfree(s);
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->fsck_error_lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
|
|
new file mode 100644
|
|
index 000000000000..d8cd19b3f63c
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/error.h
|
|
@@ -0,0 +1,217 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_ERROR_H
|
|
+#define _BCACHEFS_ERROR_H
|
|
+
|
|
+#include <linux/list.h>
|
|
+#include <linux/printk.h>
|
|
+
|
|
+struct bch_dev;
|
|
+struct bch_fs;
|
|
+struct work_struct;
|
|
+
|
|
+/*
|
|
+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag
|
|
+ * superblock as such
|
|
+ */
|
|
+
|
|
+/* Error messages: */
|
|
+
|
|
+/*
|
|
+ * Inconsistency errors: The on disk data is inconsistent. If these occur during
|
|
+ * initial recovery, they don't indicate a bug in the running code - we walk all
|
|
+ * the metadata before modifying anything. If they occur at runtime, they
|
|
+ * indicate either a bug in the running code or (less likely) data is being
|
|
+ * silently corrupted under us.
|
|
+ *
|
|
+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in
|
|
+ * BCH_ON_ERROR_CONTINUE mode
|
|
+ */
|
|
+
|
|
+bool bch2_inconsistent_error(struct bch_fs *);
|
|
+
|
|
+void bch2_topology_error(struct bch_fs *);
|
|
+
|
|
+#define bch2_fs_inconsistent(c, ...) \
|
|
+({ \
|
|
+ bch_err(c, __VA_ARGS__); \
|
|
+ bch2_inconsistent_error(c); \
|
|
+})
|
|
+
|
|
+#define bch2_fs_inconsistent_on(cond, c, ...) \
|
|
+({ \
|
|
+ int _ret = !!(cond); \
|
|
+ \
|
|
+ if (_ret) \
|
|
+ bch2_fs_inconsistent(c, __VA_ARGS__); \
|
|
+ _ret; \
|
|
+})
|
|
+
|
|
+/*
|
|
+ * Later we might want to mark only the particular device inconsistent, not the
|
|
+ * entire filesystem:
|
|
+ */
|
|
+
|
|
+#define bch2_dev_inconsistent(ca, ...) \
|
|
+do { \
|
|
+ bch_err(ca, __VA_ARGS__); \
|
|
+ bch2_inconsistent_error((ca)->fs); \
|
|
+} while (0)
|
|
+
|
|
+#define bch2_dev_inconsistent_on(cond, ca, ...) \
|
|
+({ \
|
|
+ int _ret = !!(cond); \
|
|
+ \
|
|
+ if (_ret) \
|
|
+ bch2_dev_inconsistent(ca, __VA_ARGS__); \
|
|
+ _ret; \
|
|
+})
|
|
+
|
|
+/*
|
|
+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally
|
|
+ * be able to repair:
|
|
+ */
|
|
+
|
|
+enum {
|
|
+ BCH_FSCK_OK = 0,
|
|
+ BCH_FSCK_ERRORS_NOT_FIXED = 1,
|
|
+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2,
|
|
+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3,
|
|
+ BCH_FSCK_UNKNOWN_VERSION = 4,
|
|
+};
|
|
+
|
|
+enum fsck_err_opts {
|
|
+ FSCK_OPT_EXIT,
|
|
+ FSCK_OPT_YES,
|
|
+ FSCK_OPT_NO,
|
|
+ FSCK_OPT_ASK,
|
|
+};
|
|
+
|
|
+enum fsck_err_ret {
|
|
+ FSCK_ERR_IGNORE = 0,
|
|
+ FSCK_ERR_FIX = 1,
|
|
+ FSCK_ERR_EXIT = 2,
|
|
+ FSCK_ERR_START_TOPOLOGY_REPAIR = 3,
|
|
+};
|
|
+
|
|
+struct fsck_err_state {
|
|
+ struct list_head list;
|
|
+ const char *fmt;
|
|
+ u64 nr;
|
|
+ bool ratelimited;
|
|
+ char buf[512];
|
|
+};
|
|
+
|
|
+#define FSCK_CAN_FIX (1 << 0)
|
|
+#define FSCK_CAN_IGNORE (1 << 1)
|
|
+#define FSCK_NEED_FSCK (1 << 2)
|
|
+
|
|
+__printf(3, 4) __cold
|
|
+enum fsck_err_ret bch2_fsck_err(struct bch_fs *,
|
|
+ unsigned, const char *, ...);
|
|
+void bch2_flush_fsck_errs(struct bch_fs *);
|
|
+
|
|
+#define __fsck_err(c, _flags, msg, ...) \
|
|
+({ \
|
|
+ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\
|
|
+ \
|
|
+ if (_fix == FSCK_ERR_EXIT) { \
|
|
+ bch_err(c, "Unable to continue, halting"); \
|
|
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
|
|
+ goto fsck_err; \
|
|
+ } \
|
|
+ \
|
|
+ _fix; \
|
|
+})
|
|
+
|
|
+/* These macros return true if error should be fixed: */
|
|
+
|
|
+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */
|
|
+
|
|
+#define __fsck_err_on(cond, c, _flags, ...) \
|
|
+ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false)
|
|
+
|
|
+#define need_fsck_err_on(cond, c, ...) \
|
|
+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
|
|
+
|
|
+#define need_fsck_err(c, ...) \
|
|
+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__)
|
|
+
|
|
+#define mustfix_fsck_err(c, ...) \
|
|
+ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__)
|
|
+
|
|
+#define mustfix_fsck_err_on(cond, c, ...) \
|
|
+ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__)
|
|
+
|
|
+#define fsck_err(c, ...) \
|
|
+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
|
|
+
|
|
+#define fsck_err_on(cond, c, ...) \
|
|
+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__)
|
|
+
|
|
+/*
|
|
+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW
|
|
+ * mode - pretty much just due to metadata IO errors:
|
|
+ */
|
|
+
|
|
+void bch2_fatal_error(struct bch_fs *);
|
|
+
|
|
+#define bch2_fs_fatal_error(c, ...) \
|
|
+do { \
|
|
+ bch_err(c, __VA_ARGS__); \
|
|
+ bch2_fatal_error(c); \
|
|
+} while (0)
|
|
+
|
|
+#define bch2_fs_fatal_err_on(cond, c, ...) \
|
|
+({ \
|
|
+ int _ret = !!(cond); \
|
|
+ \
|
|
+ if (_ret) \
|
|
+ bch2_fs_fatal_error(c, __VA_ARGS__); \
|
|
+ _ret; \
|
|
+})
|
|
+
|
|
+/*
|
|
+ * IO errors: either recoverable metadata IO (because we have replicas), or data
|
|
+ * IO - we need to log it and print out a message, but we don't (necessarily)
|
|
+ * want to shut down the fs:
|
|
+ */
|
|
+
|
|
+void bch2_io_error_work(struct work_struct *);
|
|
+
|
|
+/* Does the error handling without logging a message */
|
|
+void bch2_io_error(struct bch_dev *);
|
|
+
|
|
+/* Logs message and handles the error: */
|
|
+#define bch2_dev_io_error(ca, fmt, ...) \
|
|
+do { \
|
|
+ printk_ratelimited(KERN_ERR "bcachefs (%s): " fmt, \
|
|
+ (ca)->name, ##__VA_ARGS__); \
|
|
+ bch2_io_error(ca); \
|
|
+} while (0)
|
|
+
|
|
+#define bch2_dev_inum_io_error(ca, _inum, _offset, fmt, ...) \
|
|
+do { \
|
|
+ printk_ratelimited(KERN_ERR "bcachefs (%s inum %llu offset %llu): " fmt,\
|
|
+ (ca)->name, (_inum), (_offset), ##__VA_ARGS__); \
|
|
+ bch2_io_error(ca); \
|
|
+} while (0)
|
|
+
|
|
+#define bch2_dev_io_err_on(cond, ca, ...) \
|
|
+({ \
|
|
+ bool _ret = (cond); \
|
|
+ \
|
|
+ if (_ret) \
|
|
+ bch2_dev_io_error(ca, __VA_ARGS__); \
|
|
+ _ret; \
|
|
+})
|
|
+
|
|
+#define bch2_dev_inum_io_err_on(cond, ca, _inum, _offset, ...) \
|
|
+({ \
|
|
+ bool _ret = (cond); \
|
|
+ \
|
|
+ if (_ret) \
|
|
+ bch2_dev_inum_io_error(ca, _inum, _offset, __VA_ARGS__);\
|
|
+ _ret; \
|
|
+})
|
|
+
|
|
+#endif /* _BCACHEFS_ERROR_H */
|
|
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
|
|
new file mode 100644
|
|
index 000000000000..bb4b2b4352e0
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/extent_update.c
|
|
@@ -0,0 +1,210 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "buckets.h"
|
|
+#include "debug.h"
|
|
+#include "extents.h"
|
|
+#include "extent_update.h"
|
|
+
|
|
+/*
|
|
+ * This counts the number of iterators to the alloc & ec btrees we'll need
|
|
+ * inserting/removing this extent:
|
|
+ */
|
|
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ unsigned ret = 0;
|
|
+
|
|
+ bkey_extent_entry_for_each(ptrs, entry) {
|
|
+ switch (__extent_entry_type(entry)) {
|
|
+ case BCH_EXTENT_ENTRY_ptr:
|
|
+ case BCH_EXTENT_ENTRY_stripe_ptr:
|
|
+ ret++;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int count_iters_for_insert(struct btree_trans *trans,
|
|
+ struct bkey_s_c k,
|
|
+ unsigned offset,
|
|
+ struct bpos *end,
|
|
+ unsigned *nr_iters,
|
|
+ unsigned max_iters)
|
|
+{
|
|
+ int ret = 0, ret2 = 0;
|
|
+
|
|
+ if (*nr_iters >= max_iters) {
|
|
+ *end = bpos_min(*end, k.k->p);
|
|
+ ret = 1;
|
|
+ }
|
|
+
|
|
+ switch (k.k->type) {
|
|
+ case KEY_TYPE_extent:
|
|
+ case KEY_TYPE_reflink_v:
|
|
+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
|
|
+
|
|
+ if (*nr_iters >= max_iters) {
|
|
+ *end = bpos_min(*end, k.k->p);
|
|
+ ret = 1;
|
|
+ }
|
|
+
|
|
+ break;
|
|
+ case KEY_TYPE_reflink_p: {
|
|
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
|
|
+ u64 idx = le64_to_cpu(p.v->idx);
|
|
+ unsigned sectors = bpos_min(*end, p.k->p).offset -
|
|
+ bkey_start_offset(p.k);
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c r_k;
|
|
+
|
|
+ for_each_btree_key(trans, iter,
|
|
+ BTREE_ID_reflink, POS(0, idx + offset),
|
|
+ BTREE_ITER_SLOTS, r_k, ret2) {
|
|
+ if (bkey_cmp(bkey_start_pos(r_k.k),
|
|
+ POS(0, idx + sectors)) >= 0)
|
|
+ break;
|
|
+
|
|
+ /* extent_update_to_keys(), for the reflink_v update */
|
|
+ *nr_iters += 1;
|
|
+
|
|
+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
|
|
+
|
|
+ if (*nr_iters >= max_iters) {
|
|
+ struct bpos pos = bkey_start_pos(k.k);
|
|
+ pos.offset += min_t(u64, k.k->size,
|
|
+ r_k.k->p.offset - idx);
|
|
+
|
|
+ *end = bpos_min(*end, pos);
|
|
+ ret = 1;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return ret2 ?: ret;
|
|
+}
|
|
+
|
|
+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
|
|
+
|
|
+int bch2_extent_atomic_end(struct btree_iter *iter,
|
|
+ struct bkey_i *insert,
|
|
+ struct bpos *end)
|
|
+{
|
|
+ struct btree_trans *trans = iter->trans;
|
|
+ struct btree_iter *copy;
|
|
+ struct bkey_s_c k;
|
|
+ unsigned nr_iters = 0;
|
|
+ int ret;
|
|
+
|
|
+ *end = insert->k.p;
|
|
+
|
|
+ /* extent_update_to_keys(): */
|
|
+ nr_iters += 1;
|
|
+
|
|
+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
|
|
+ &nr_iters, EXTENT_ITERS_MAX / 2);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ copy = bch2_trans_copy_iter(trans, iter);
|
|
+
|
|
+ for_each_btree_key_continue(copy, 0, k, ret) {
|
|
+ unsigned offset = 0;
|
|
+
|
|
+ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
|
|
+ break;
|
|
+
|
|
+ if (bkey_cmp(bkey_start_pos(&insert->k),
|
|
+ bkey_start_pos(k.k)) > 0)
|
|
+ offset = bkey_start_offset(&insert->k) -
|
|
+ bkey_start_offset(k.k);
|
|
+
|
|
+ /* extent_handle_overwrites(): */
|
|
+ switch (bch2_extent_overlap(&insert->k, k.k)) {
|
|
+ case BCH_EXTENT_OVERLAP_ALL:
|
|
+ case BCH_EXTENT_OVERLAP_FRONT:
|
|
+ nr_iters += 1;
|
|
+ break;
|
|
+ case BCH_EXTENT_OVERLAP_BACK:
|
|
+ case BCH_EXTENT_OVERLAP_MIDDLE:
|
|
+ nr_iters += 2;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = count_iters_for_insert(trans, k, offset, end,
|
|
+ &nr_iters, EXTENT_ITERS_MAX);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_put(trans, copy);
|
|
+ return ret < 0 ? ret : 0;
|
|
+}
|
|
+
|
|
+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
|
|
+{
|
|
+ struct bpos end;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_extent_atomic_end(iter, k, &end);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ bch2_cut_back(end, k);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
|
|
+{
|
|
+ struct bpos end;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_extent_atomic_end(iter, k, &end);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ return !bkey_cmp(end, k->k.p);
|
|
+}
|
|
+
|
|
+enum btree_insert_ret
|
|
+bch2_extent_can_insert(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_i *insert)
|
|
+{
|
|
+ struct bkey_s_c k;
|
|
+ int ret, sectors;
|
|
+
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ /* Check if we're splitting a compressed extent: */
|
|
+
|
|
+ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 &&
|
|
+ bkey_cmp(insert->k.p, k.k->p) < 0 &&
|
|
+ (sectors = bch2_bkey_sectors_compressed(k))) {
|
|
+ int flags = trans->flags & BTREE_INSERT_NOFAIL
|
|
+ ? BCH_DISK_RESERVATION_NOFAIL : 0;
|
|
+
|
|
+ switch (bch2_disk_reservation_add(trans->c, trans->disk_res,
|
|
+ sectors, flags)) {
|
|
+ case 0:
|
|
+ break;
|
|
+ case -ENOSPC:
|
|
+ return BTREE_INSERT_ENOSPC;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return BTREE_INSERT_OK;
|
|
+}
|
|
diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h
|
|
new file mode 100644
|
|
index 000000000000..38dc084627d2
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/extent_update.h
|
|
@@ -0,0 +1,16 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_EXTENT_UPDATE_H
|
|
+#define _BCACHEFS_EXTENT_UPDATE_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+
|
|
+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
|
|
+ struct bpos *);
|
|
+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
|
|
+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
|
|
+
|
|
+enum btree_insert_ret
|
|
+bch2_extent_can_insert(struct btree_trans *, struct btree_iter *,
|
|
+ struct bkey_i *);
|
|
+
|
|
+#endif /* _BCACHEFS_EXTENT_UPDATE_H */
|
|
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
|
|
new file mode 100644
|
|
index 000000000000..b07d39555eb6
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/extents.c
|
|
@@ -0,0 +1,1226 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ *
|
|
+ * Code for managing the extent btree and dynamically updating the writeback
|
|
+ * dirty sector count.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_iter.h"
|
|
+#include "buckets.h"
|
|
+#include "checksum.h"
|
|
+#include "debug.h"
|
|
+#include "disk_groups.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "inode.h"
|
|
+#include "journal.h"
|
|
+#include "replicas.h"
|
|
+#include "super.h"
|
|
+#include "super-io.h"
|
|
+#include "util.h"
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+static unsigned bch2_crc_field_size_max[] = {
|
|
+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
|
|
+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
|
|
+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
|
|
+};
|
|
+
|
|
+static void bch2_extent_crc_pack(union bch_extent_crc *,
|
|
+ struct bch_extent_crc_unpacked,
|
|
+ enum bch_extent_entry_type);
|
|
+
|
|
+/*
|
|
+ * Look up the failure record for device index @dev among the f->nr entries
|
|
+ * of @f; returns NULL when none of them matches.
|
|
+ */
|
|
+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
|
|
+						   unsigned dev)
|
|
+{
|
|
+	unsigned slot;
|
|
+
|
|
+	for (slot = 0; slot < f->nr; slot++) {
|
|
+		struct bch_dev_io_failures *rec = &f->devs[slot];
|
|
+
|
|
+		if (rec->dev == dev)
|
|
+			return rec;
|
|
+	}
|
|
+
|
|
+	return NULL;
|
|
+}
|
|
+
|
|
+void bch2_mark_io_failure(struct bch_io_failures *failed,
|
|
+ struct extent_ptr_decoded *p)
|
|
+{
|
|
+ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
|
|
+
|
|
+ if (!f) {
|
|
+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
|
|
+
|
|
+ f = &failed->devs[failed->nr++];
|
|
+ f->dev = p->ptr.dev;
|
|
+ f->idx = p->idx;
|
|
+ f->nr_failed = 1;
|
|
+ f->nr_retries = 0;
|
|
+ } else if (p->idx != f->idx) {
|
|
+ f->idx = p->idx;
|
|
+ f->nr_failed = 1;
|
|
+ f->nr_retries = 0;
|
|
+ } else {
|
|
+ f->nr_failed++;
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * returns true if p1 is better than p2:
|
|
+ */
|
|
+static inline bool ptr_better(struct bch_fs *c,
|
|
+ const struct extent_ptr_decoded p1,
|
|
+ const struct extent_ptr_decoded p2)
|
|
+{
|
|
+ if (likely(!p1.idx && !p2.idx)) {
|
|
+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
|
|
+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
|
|
+
|
|
+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
|
|
+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
|
|
+
|
|
+ /* Pick at random, biased in favor of the faster device: */
|
|
+
|
|
+ return bch2_rand_range(l1 + l2) > l1;
|
|
+ }
|
|
+
|
|
+ if (bch2_force_reconstruct_read)
|
|
+ return p1.idx > p2.idx;
|
|
+
|
|
+ return p1.idx < p2.idx;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * This picks a non-stale pointer to read from. @failed may be NULL; otherwise
|
|
+ * it records earlier IO failures, used to retry or move past those devices.
|
|
+ * Returns 1 with @pick set; -EIO for an error key, or if the key has dirty
|
|
+ * pointers but none of them is usable; 0 otherwise.
|
|
+ */
|
|
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
|
|
+ struct bch_io_failures *failed,
|
|
+ struct extent_ptr_decoded *pick)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ struct bch_dev_io_failures *f;
|
|
+ struct bch_dev *ca;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (k.k->type == KEY_TYPE_error)
|
|
+ return -EIO;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ ca = bch_dev_bkey_exists(c, p.ptr.dev);
|
|
+
|
|
+ /*
|
|
+ * If there are any dirty pointers it's an error if we can't
|
|
+ * read:
|
|
+ */
|
|
+ if (!ret && !p.ptr.cached)
|
|
+ ret = -EIO;
|
|
+
|
|
+ if (p.ptr.cached && ptr_stale(ca, &p.ptr))
|
|
+ continue;
|
|
+
|
|
+ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
|
|
+ if (f)
|
|
+ p.idx = f->nr_failed < f->nr_retries
|
|
+ ? f->idx
|
|
+ : f->idx + 1;
|
|
+
|
|
+ if (!p.idx &&
|
|
+ !bch2_dev_is_readable(ca))
|
|
+ p.idx++;
|
|
+
|
|
+ if (bch2_force_reconstruct_read &&
|
|
+ !p.idx && p.has_ec)
|
|
+ p.idx++;
|
|
+
|
|
+ if (p.idx >= (unsigned) p.has_ec + 1)
|
|
+ continue;
|
|
+
|
|
+ if (ret > 0 && !ptr_better(c, p, *pick))
|
|
+ continue;
|
|
+
|
|
+ *pick = p;
|
|
+ ret = 1;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* KEY_TYPE_btree_ptr: */
|
|
+
|
|
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
|
|
+ return "value too big";
|
|
+
|
|
+ return bch2_bkey_ptrs_invalid(c, k);
|
|
+}
|
|
+
|
|
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ bch2_bkey_ptrs_to_text(out, c, k);
|
|
+}
|
|
+
|
|
+const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
|
|
+
|
|
+ if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
|
|
+ return "value too small";
|
|
+
|
|
+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
|
|
+ return "value too big";
|
|
+
|
|
+ if (c->sb.version < bcachefs_metadata_version_snapshot &&
|
|
+ bp.v->min_key.snapshot)
|
|
+ return "invalid min_key.snapshot";
|
|
+
|
|
+ return bch2_bkey_ptrs_invalid(c, k);
|
|
+}
|
|
+
|
|
+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
|
|
+
|
|
+ pr_buf(out, "seq %llx written %u min_key ",
|
|
+ le64_to_cpu(bp.v->seq),
|
|
+ le16_to_cpu(bp.v->sectors_written));
|
|
+
|
|
+ bch2_bpos_to_text(out, bp.v->min_key);
|
|
+ pr_buf(out, " ");
|
|
+ bch2_bkey_ptrs_to_text(out, c, k);
|
|
+}
|
|
+
|
|
+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
|
|
+ unsigned big_endian, int write,
|
|
+ struct bkey_s k)
|
|
+{
|
|
+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k);
|
|
+
|
|
+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
|
|
+
|
|
+ if (version < bcachefs_metadata_version_inode_btree_change &&
|
|
+ btree_node_type_is_extents(btree_id) &&
|
|
+ bkey_cmp(bp.v->min_key, POS_MIN))
|
|
+ bp.v->min_key = write
|
|
+ ? bpos_nosnap_predecessor(bp.v->min_key)
|
|
+ : bpos_nosnap_successor(bp.v->min_key);
|
|
+}
|
|
+
|
|
+/* KEY_TYPE_extent: */
|
|
+
|
|
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ return bch2_bkey_ptrs_invalid(c, k);
|
|
+}
|
|
+
|
|
+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ bch2_bkey_ptrs_to_text(out, c, k);
|
|
+}
|
|
+
|
|
+enum merge_result bch2_extent_merge(struct bch_fs *c,
|
|
+ struct bkey_s _l, struct bkey_s _r)
|
|
+{
|
|
+ struct bkey_s_extent l = bkey_s_to_extent(_l);
|
|
+ struct bkey_s_extent r = bkey_s_to_extent(_r);
|
|
+ union bch_extent_entry *en_l = l.v->start;
|
|
+ union bch_extent_entry *en_r = r.v->start;
|
|
+ struct bch_extent_crc_unpacked crc_l, crc_r;
|
|
+
|
|
+ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k))
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ crc_l = bch2_extent_crc_unpack(l.k, NULL);
|
|
+
|
|
+ extent_for_each_entry(l, en_l) {
|
|
+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
|
|
+
|
|
+ if (extent_entry_type(en_l) != extent_entry_type(en_r))
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ switch (extent_entry_type(en_l)) {
|
|
+ case BCH_EXTENT_ENTRY_ptr: {
|
|
+ const struct bch_extent_ptr *lp = &en_l->ptr;
|
|
+ const struct bch_extent_ptr *rp = &en_r->ptr;
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ if (lp->offset + crc_l.compressed_size != rp->offset ||
|
|
+ lp->dev != rp->dev ||
|
|
+ lp->gen != rp->gen)
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ /* We don't allow extents to straddle buckets: */
|
|
+ ca = bch_dev_bkey_exists(c, lp->dev);
|
|
+
|
|
+ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case BCH_EXTENT_ENTRY_stripe_ptr:
|
|
+ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block ||
|
|
+ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx)
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc32:
|
|
+ case BCH_EXTENT_ENTRY_crc64:
|
|
+ case BCH_EXTENT_ENTRY_crc128:
|
|
+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
|
|
+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
|
|
+
|
|
+ if (crc_l.csum_type != crc_r.csum_type ||
|
|
+ crc_l.compression_type != crc_r.compression_type ||
|
|
+ crc_l.nonce != crc_r.nonce)
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size ||
|
|
+ crc_r.offset)
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ if (!bch2_checksum_mergeable(crc_l.csum_type))
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ if (crc_is_compressed(crc_l))
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ if (crc_l.csum_type &&
|
|
+ crc_l.uncompressed_size +
|
|
+ crc_r.uncompressed_size > c->sb.encoded_extent_max)
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ if (crc_l.uncompressed_size + crc_r.uncompressed_size >
|
|
+ bch2_crc_field_size_max[extent_entry_type(en_l)])
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ break;
|
|
+ default:
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ extent_for_each_entry(l, en_l) {
|
|
+ struct bch_extent_crc_unpacked crc_l, crc_r;
|
|
+
|
|
+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
|
|
+
|
|
+ if (!extent_entry_is_crc(en_l))
|
|
+ continue;
|
|
+
|
|
+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
|
|
+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
|
|
+
|
|
+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
|
|
+ crc_l.csum,
|
|
+ crc_r.csum,
|
|
+ crc_r.uncompressed_size << 9);
|
|
+
|
|
+ crc_l.uncompressed_size += crc_r.uncompressed_size;
|
|
+ crc_l.compressed_size += crc_r.compressed_size;
|
|
+
|
|
+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
|
|
+ extent_entry_type(en_l));
|
|
+ }
|
|
+
|
|
+ bch2_key_resize(l.k, l.k->size + r.k->size);
|
|
+
|
|
+ return BCH_MERGE_MERGE;
|
|
+}
|
|
+
|
|
+/* KEY_TYPE_reservation: */
|
|
+
|
|
+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
|
|
+
|
|
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
|
|
+ return "incorrect value size";
|
|
+
|
|
+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
|
|
+ return "invalid nr_replicas";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
|
|
+
|
|
+ pr_buf(out, "generation %u replicas %u",
|
|
+ le32_to_cpu(r.v->generation),
|
|
+ r.v->nr_replicas);
|
|
+}
|
|
+
|
|
+enum merge_result bch2_reservation_merge(struct bch_fs *c,
|
|
+ struct bkey_s _l, struct bkey_s _r)
|
|
+{
|
|
+ struct bkey_s_reservation l = bkey_s_to_reservation(_l);
|
|
+ struct bkey_s_reservation r = bkey_s_to_reservation(_r);
|
|
+
|
|
+ if (l.v->generation != r.v->generation ||
|
|
+ l.v->nr_replicas != r.v->nr_replicas)
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
|
|
+ bch2_key_resize(l.k, KEY_SIZE_MAX);
|
|
+ bch2_cut_front_s(l.k->p, r.s);
|
|
+ return BCH_MERGE_PARTIAL;
|
|
+ }
|
|
+
|
|
+ bch2_key_resize(l.k, l.k->size + r.k->size);
|
|
+
|
|
+ return BCH_MERGE_MERGE;
|
|
+}
|
|
+
|
|
+/* Extent checksum entries: */
|
|
+
|
|
+/* returns true if not equal */
|
|
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
|
|
+ struct bch_extent_crc_unpacked r)
|
|
+{
|
|
+ return (l.csum_type != r.csum_type ||
|
|
+ l.compression_type != r.compression_type ||
|
|
+ l.compressed_size != r.compressed_size ||
|
|
+ l.uncompressed_size != r.uncompressed_size ||
|
|
+ l.offset != r.offset ||
|
|
+ l.live_size != r.live_size ||
|
|
+ l.nonce != r.nonce ||
|
|
+ bch2_crc_cmp(l.csum, r.csum));
|
|
+}
|
|
+
|
|
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
|
|
+ struct bch_extent_crc_unpacked n)
|
|
+{
|
|
+ return !crc_is_compressed(u) &&
|
|
+ u.csum_type &&
|
|
+ u.uncompressed_size > u.live_size &&
|
|
+ bch2_csum_type_is_encryption(u.csum_type) ==
|
|
+ bch2_csum_type_is_encryption(n.csum_type);
|
|
+}
|
|
+
|
|
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
|
|
+ struct bch_extent_crc_unpacked n)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ struct bch_extent_crc_unpacked crc;
|
|
+ const union bch_extent_entry *i;
|
|
+
|
|
+ if (!n.csum_type)
|
|
+ return false;
|
|
+
|
|
+ bkey_for_each_crc(k.k, ptrs, crc, i)
|
|
+ if (can_narrow_crc(crc, n))
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * We're writing another replica for this extent, so while we've got the data in
|
|
+ * memory we'll be computing a new checksum for the currently live data.
|
|
+ *
|
|
+ * If there are other replicas we aren't moving, and they are checksummed but
|
|
+ * not compressed, we can modify them to point to only the data that is
|
|
+ * currently live (so that readers won't have to bounce) while we've got the
|
|
+ * checksum we need:
|
|
+ */
|
|
+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
|
|
+{
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
|
|
+ struct bch_extent_crc_unpacked u;
|
|
+ struct extent_ptr_decoded p;
|
|
+ union bch_extent_entry *i;
|
|
+ bool ret = false;
|
|
+
|
|
+ /* Find a checksum entry that covers only live data: */
|
|
+ if (!n.csum_type) {
|
|
+ bkey_for_each_crc(&k->k, ptrs, u, i)
|
|
+ if (!crc_is_compressed(u) &&
|
|
+ u.csum_type &&
|
|
+ u.live_size == u.uncompressed_size) {
|
|
+ n = u;
|
|
+ goto found;
|
|
+ }
|
|
+ return false;
|
|
+ }
|
|
+found:
|
|
+ BUG_ON(crc_is_compressed(n));
|
|
+ BUG_ON(n.offset);
|
|
+ BUG_ON(n.live_size != k->k.size);
|
|
+
|
|
+restart_narrow_pointers:
|
|
+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
|
|
+
|
|
+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
|
|
+ if (can_narrow_crc(p.crc, n)) {
|
|
+ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
|
|
+ p.ptr.offset += p.crc.offset;
|
|
+ p.crc = n;
|
|
+ bch2_extent_ptr_decoded_append(k, &p);
|
|
+ ret = true;
|
|
+ goto restart_narrow_pointers;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Encode the unpacked checksum descriptor @src into @dst using the packed
|
|
+ * layout selected by @type. The compressed and uncompressed sizes are stored
|
|
+ * minus one; the crc32 layout carries no nonce. Any other @type is a BUG().
|
|
+ */
|
|
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
|
|
+				 struct bch_extent_crc_unpacked src,
|
|
+				 enum bch_extent_entry_type type)
|
|
+{
|
|
+/* Wrapped in do/while so that the expansion is a single statement. */
|
|
+#define set_common_fields(_dst, _src)					\
|
|
+do {									\
|
|
+	_dst.type = 1 << type;						\
|
|
+	_dst.csum_type = _src.csum_type;				\
|
|
+	_dst.compression_type = _src.compression_type;			\
|
|
+	_dst._compressed_size = _src.compressed_size - 1;		\
|
|
+	_dst._uncompressed_size = _src.uncompressed_size - 1;		\
|
|
+	_dst.offset = _src.offset;					\
|
|
+} while (0)
|
|
+
|
|
+	switch (type) {
|
|
+	case BCH_EXTENT_ENTRY_crc32:
|
|
+		set_common_fields(dst->crc32, src);
|
|
+		dst->crc32.csum = *((__le32 *) &src.csum.lo);
|
|
+		break;
|
|
+	case BCH_EXTENT_ENTRY_crc64:
|
|
+		set_common_fields(dst->crc64, src);
|
|
+		dst->crc64.nonce = src.nonce;
|
|
+		dst->crc64.csum_lo = src.csum.lo;
|
|
+		dst->crc64.csum_hi = *((__le16 *) &src.csum.hi);
|
|
+		break;
|
|
+	case BCH_EXTENT_ENTRY_crc128:
|
|
+		set_common_fields(dst->crc128, src);
|
|
+		dst->crc128.nonce = src.nonce;
|
|
+		dst->crc128.csum = src.csum;
|
|
+		break;
|
|
+	default:
|
|
+		BUG();
|
|
+	}
|
|
+#undef set_common_fields
|
|
+}
|
|
+
|
|
+void bch2_extent_crc_append(struct bkey_i *k,
|
|
+ struct bch_extent_crc_unpacked new)
|
|
+{
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
|
|
+ union bch_extent_crc *crc = (void *) ptrs.end;
|
|
+ enum bch_extent_entry_type type;
|
|
+
|
|
+ if (bch_crc_bytes[new.csum_type] <= 4 &&
|
|
+ new.uncompressed_size <= CRC32_SIZE_MAX &&
|
|
+ new.nonce <= CRC32_NONCE_MAX)
|
|
+ type = BCH_EXTENT_ENTRY_crc32;
|
|
+ else if (bch_crc_bytes[new.csum_type] <= 10 &&
|
|
+ new.uncompressed_size <= CRC64_SIZE_MAX &&
|
|
+ new.nonce <= CRC64_NONCE_MAX)
|
|
+ type = BCH_EXTENT_ENTRY_crc64;
|
|
+ else if (bch_crc_bytes[new.csum_type] <= 16 &&
|
|
+ new.uncompressed_size <= CRC128_SIZE_MAX &&
|
|
+ new.nonce <= CRC128_NONCE_MAX)
|
|
+ type = BCH_EXTENT_ENTRY_crc128;
|
|
+ else
|
|
+ BUG();
|
|
+
|
|
+ bch2_extent_crc_pack(crc, new, type);
|
|
+
|
|
+ k->k.u64s += extent_entry_u64s(ptrs.end);
|
|
+
|
|
+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
|
|
+}
|
|
+
|
|
+/* Generic code for keys with pointers: */
|
|
+
|
|
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
|
|
+{
|
|
+ return bch2_bkey_devs(k).nr;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Replica count @k accounts for: nr_replicas for a reservation key, otherwise
|
|
+ * the number of devices returned by bch2_bkey_dirty_devs().
|
|
+ */
|
|
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
|
|
+{
|
|
+	if (k.k->type == KEY_TYPE_reservation)
|
|
+		return bkey_s_c_to_reservation(k).v->nr_replicas;
|
|
+
|
|
+	return bch2_bkey_dirty_devs(k).nr;
|
|
+}
|
|
+
|
|
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
|
|
+{
|
|
+ unsigned ret = 0;
|
|
+
|
|
+ if (k.k->type == KEY_TYPE_reservation) {
|
|
+ ret = bkey_s_c_to_reservation(k).v->nr_replicas;
|
|
+ } else {
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ ret += !p.ptr.cached && !crc_is_compressed(p.crc);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ unsigned ret = 0;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ if (!p.ptr.cached && crc_is_compressed(p.crc))
|
|
+ ret += p.crc.compressed_size;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+bool bch2_bkey_is_incompressible(struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct bch_extent_crc_unpacked crc;
|
|
+
|
|
+ bkey_for_each_crc(k.k, ptrs, crc, entry)
|
|
+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible)
|
|
+ return true;
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Walk the extents btree from @pos up to @pos + @size and report whether
|
|
+ * every key seen has at least @nr_replicas replicas and, unless @compressed
|
|
+ * is set, no compressed sectors.
|
|
+ *
|
|
+ * Returns false if the walk itself fails: the range could not be verified,
|
|
+ * so it must not be reported as allocated.
|
|
+ */
|
|
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
|
|
+				unsigned nr_replicas, bool compressed)
|
|
+{
|
|
+	struct btree_trans trans;
|
|
+	struct btree_iter *iter;
|
|
+	struct bpos end = pos;
|
|
+	struct bkey_s_c k;
|
|
+	bool ret = true;
|
|
+	int err;
|
|
+
|
|
+	end.offset += size;
|
|
+
|
|
+	bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+	for_each_btree_key(&trans, iter, BTREE_ID_extents, pos,
|
|
+			   BTREE_ITER_SLOTS, k, err) {
|
|
+		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
|
|
+			break;
|
|
+
|
|
+		if (nr_replicas > bch2_bkey_replicas(c, k) ||
|
|
+		    (!compressed && bch2_bkey_sectors_compressed(k))) {
|
|
+			ret = false;
|
|
+			break;
|
|
+		}
|
|
+	}
|
|
+	bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+	bch2_trans_exit(&trans);
|
|
+
|
|
+	/* A failed lookup proves nothing about the range */
|
|
+	return err ? false : ret;
|
|
+}
|
|
+
|
|
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p = { 0 };
|
|
+ unsigned replicas = 0;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ if (p.ptr.cached)
|
|
+ continue;
|
|
+
|
|
+ if (p.has_ec)
|
|
+ replicas += p.ec.redundancy;
|
|
+
|
|
+ replicas++;
|
|
+
|
|
+ }
|
|
+
|
|
+ return replicas;
|
|
+}
|
|
+
|
|
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
|
|
+ struct extent_ptr_decoded p)
|
|
+{
|
|
+ unsigned durability = 0;
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ if (p.ptr.cached)
|
|
+ return 0;
|
|
+
|
|
+ ca = bch_dev_bkey_exists(c, p.ptr.dev);
|
|
+
|
|
+ if (ca->mi.state != BCH_MEMBER_STATE_failed)
|
|
+ durability = max_t(unsigned, durability, ca->mi.durability);
|
|
+
|
|
+ if (p.has_ec)
|
|
+ durability += p.ec.redundancy;
|
|
+
|
|
+ return durability;
|
|
+}
|
|
+
|
|
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ unsigned durability = 0;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ durability += bch2_extent_ptr_durability(c, p);
|
|
+
|
|
+ return durability;
|
|
+}
|
|
+
|
|
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
|
|
+ unsigned target,
|
|
+ unsigned nr_desired_replicas)
|
|
+{
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
|
|
+ union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
|
|
+
|
|
+ if (target && extra > 0)
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ int n = bch2_extent_ptr_durability(c, p);
|
|
+
|
|
+ if (n && n <= extra &&
|
|
+ !bch2_dev_in_target(c, p.ptr.dev, target)) {
|
|
+ entry->ptr.cached = true;
|
|
+ extra -= n;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (extra > 0)
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ int n = bch2_extent_ptr_durability(c, p);
|
|
+
|
|
+ if (n && n <= extra) {
|
|
+ entry->ptr.cached = true;
|
|
+ extra -= n;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Remove @entry from @k's value: the entries that follow it are moved down
|
|
+ * over it and the key is shrunk by the size of the removed entry.
|
|
+ */
|
|
+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
|
|
+{
|
|
+	union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
|
|
+	union bch_extent_entry *next = extent_entry_next(entry);
|
|
+	/*
|
|
+	 * Measure the dropped entry before the memmove: afterwards *entry
|
|
+	 * holds the following entry, which may have a different size.
|
|
+	 */
|
|
+	unsigned dropped_u64s = (u64 *) next - (u64 *) entry;
|
|
+
|
|
+	memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
|
|
+	k->k.u64s -= dropped_u64s;
|
|
+}
|
|
+
|
|
+void bch2_bkey_append_ptr(struct bkey_i *k,
|
|
+ struct bch_extent_ptr ptr)
|
|
+{
|
|
+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
|
|
+
|
|
+ switch (k->k.type) {
|
|
+ case KEY_TYPE_btree_ptr:
|
|
+ case KEY_TYPE_btree_ptr_v2:
|
|
+ case KEY_TYPE_extent:
|
|
+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
|
|
+
|
|
+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
|
|
+
|
|
+ memcpy((void *) &k->v + bkey_val_bytes(&k->k),
|
|
+ &ptr,
|
|
+ sizeof(ptr));
|
|
+ k->u64s++;
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void __extent_entry_insert(struct bkey_i *k,
|
|
+ union bch_extent_entry *dst,
|
|
+ union bch_extent_entry *new)
|
|
+{
|
|
+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
|
|
+
|
|
+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
|
|
+ dst, (u64 *) end - (u64 *) dst);
|
|
+ k->k.u64s += extent_entry_u64s(new);
|
|
+ memcpy(dst, new, extent_entry_bytes(new));
|
|
+}
|
|
+
|
|
+void bch2_extent_ptr_decoded_append(struct bkey_i *k,
|
|
+ struct extent_ptr_decoded *p)
|
|
+{
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
|
|
+ struct bch_extent_crc_unpacked crc =
|
|
+ bch2_extent_crc_unpack(&k->k, NULL);
|
|
+ union bch_extent_entry *pos;
|
|
+
|
|
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
|
|
+ pos = ptrs.start;
|
|
+ goto found;
|
|
+ }
|
|
+
|
|
+ bkey_for_each_crc(&k->k, ptrs, crc, pos)
|
|
+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
|
|
+ pos = extent_entry_next(pos);
|
|
+ goto found;
|
|
+ }
|
|
+
|
|
+ bch2_extent_crc_append(k, p->crc);
|
|
+ pos = bkey_val_end(bkey_i_to_s(k));
|
|
+found:
|
|
+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
|
|
+ __extent_entry_insert(k, pos, to_entry(&p->ptr));
|
|
+
|
|
+ if (p->has_ec) {
|
|
+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
|
|
+ __extent_entry_insert(k, pos, to_entry(&p->ec));
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return the entry immediately preceding @entry in @ptrs, or NULL if @entry
|
|
+ * is the first one. Entries are only forward-linked, so this walks from the
|
|
+ * start.
|
|
+ */
|
|
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
|
|
+					  union bch_extent_entry *entry)
|
|
+{
|
|
+	union bch_extent_entry *cur, *following;
|
|
+
|
|
+	if (ptrs.start == entry)
|
|
+		return NULL;
|
|
+
|
|
+	for (cur = ptrs.start;
|
|
+	     (following = extent_entry_next(cur)) != entry;
|
|
+	     cur = following)
|
|
+		;
|
|
+
|
|
+	return cur;
|
|
+}
|
|
+
|
|
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
|
|
+ struct bch_extent_ptr *ptr)
|
|
+{
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
|
|
+ union bch_extent_entry *dst, *src, *prev;
|
|
+ bool drop_crc = true;
|
|
+
|
|
+ EBUG_ON(ptr < &ptrs.start->ptr ||
|
|
+ ptr >= &ptrs.end->ptr);
|
|
+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
|
|
+
|
|
+ src = extent_entry_next(to_entry(ptr));
|
|
+ if (src != ptrs.end &&
|
|
+ !extent_entry_is_crc(src))
|
|
+ drop_crc = false;
|
|
+
|
|
+ dst = to_entry(ptr);
|
|
+ while ((prev = extent_entry_prev(ptrs, dst))) {
|
|
+ if (extent_entry_is_ptr(prev))
|
|
+ break;
|
|
+
|
|
+ if (extent_entry_is_crc(prev)) {
|
|
+ if (drop_crc)
|
|
+ dst = prev;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ dst = prev;
|
|
+ }
|
|
+
|
|
+ memmove_u64s_down(dst, src,
|
|
+ (u64 *) ptrs.end - (u64 *) src);
|
|
+ k.k->u64s -= (u64 *) src - (u64 *) dst;
|
|
+
|
|
+ return dst;
|
|
+}
|
|
+
|
|
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
|
|
+{
|
|
+ struct bch_extent_ptr *ptr;
|
|
+
|
|
+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
|
|
+}
|
|
+
|
|
+const struct bch_extent_ptr *
|
|
+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+
|
|
+ bkey_for_each_ptr(ptrs, ptr)
|
|
+ if (ptr->dev == dev)
|
|
+ return ptr;
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+
|
|
+ bkey_for_each_ptr(ptrs, ptr)
|
|
+ if (bch2_dev_in_target(c, ptr->dev, target) &&
|
|
+ (!ptr->cached ||
|
|
+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
|
|
+ struct bch_extent_ptr m, u64 offset)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ if (p.ptr.dev == m.dev &&
|
|
+ p.ptr.gen == m.gen &&
|
|
+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
|
|
+ (s64) m.offset - offset)
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
|
|
+ *
|
|
+ * Returns true if @k should be dropped entirely
|
|
+ *
|
|
+ * For existing keys, only called when btree nodes are being rewritten, not when
|
|
+ * they're merely being compacted/resorted in memory.
|
|
+ */
|
|
+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
|
|
+{
|
|
+ struct bch_extent_ptr *ptr;
|
|
+
|
|
+ bch2_bkey_drop_ptrs(k, ptr,
|
|
+ ptr->cached &&
|
|
+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
|
|
+
|
|
+ /* will only happen if all pointers were cached: */
|
|
+ if (!bch2_bkey_nr_ptrs(k.s_c))
|
|
+ k.k->type = KEY_TYPE_deleted;
|
|
+
|
|
+ return bkey_deleted(k.k);
|
|
+}
|
|
+
|
|
+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct bch_extent_crc_unpacked crc;
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+ const struct bch_extent_stripe_ptr *ec;
|
|
+ struct bch_dev *ca;
|
|
+ bool first = true;
|
|
+
|
|
+ bkey_extent_entry_for_each(ptrs, entry) {
|
|
+ if (!first)
|
|
+ pr_buf(out, " ");
|
|
+
|
|
+ switch (__extent_entry_type(entry)) {
|
|
+ case BCH_EXTENT_ENTRY_ptr:
|
|
+ ptr = entry_to_ptr(entry);
|
|
+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
|
|
+ ? bch_dev_bkey_exists(c, ptr->dev)
|
|
+ : NULL;
|
|
+
|
|
+ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
|
|
+ (u64) ptr->offset, ptr->gen,
|
|
+ ptr->cached ? " cached" : "",
|
|
+ ca && ptr_stale(ca, ptr)
|
|
+ ? " stale" : "");
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc32:
|
|
+ case BCH_EXTENT_ENTRY_crc64:
|
|
+ case BCH_EXTENT_ENTRY_crc128:
|
|
+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
|
|
+
|
|
+ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
|
|
+ crc.compressed_size,
|
|
+ crc.uncompressed_size,
|
|
+ crc.offset, crc.nonce,
|
|
+ crc.csum_type,
|
|
+ crc.compression_type);
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_stripe_ptr:
|
|
+ ec = &entry->stripe_ptr;
|
|
+
|
|
+ pr_buf(out, "ec: idx %llu block %u",
|
|
+ (u64) ec->idx, ec->block);
|
|
+ break;
|
|
+ default:
|
|
+ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ first = false;
|
|
+ }
|
|
+}
|
|
+
|
|
+static const char *extent_ptr_invalid(const struct bch_fs *c,
|
|
+ struct bkey_s_c k,
|
|
+ const struct bch_extent_ptr *ptr,
|
|
+ unsigned size_ondisk,
|
|
+ bool metadata)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const struct bch_extent_ptr *ptr2;
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ if (!bch2_dev_exists2(c, ptr->dev))
|
|
+ return "pointer to invalid device";
|
|
+
|
|
+ ca = bch_dev_bkey_exists(c, ptr->dev);
|
|
+ if (!ca)
|
|
+ return "pointer to invalid device";
|
|
+
|
|
+ bkey_for_each_ptr(ptrs, ptr2)
|
|
+ if (ptr != ptr2 && ptr->dev == ptr2->dev)
|
|
+ return "multiple pointers to same device";
|
|
+
|
|
+ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
|
|
+ return "offset past end of device";
|
|
+
|
|
+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
|
|
+ return "offset before first bucket";
|
|
+
|
|
+ if (bucket_remainder(ca, ptr->offset) +
|
|
+ size_ondisk > ca->mi.bucket_size)
|
|
+ return "spans multiple buckets";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ struct bch_devs_list devs;
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct bch_extent_crc_unpacked crc;
|
|
+ unsigned size_ondisk = k.k->size;
|
|
+ const char *reason;
|
|
+ unsigned nonce = UINT_MAX;
|
|
+ unsigned i;
|
|
+
|
|
+ if (k.k->type == KEY_TYPE_btree_ptr ||
|
|
+ k.k->type == KEY_TYPE_btree_ptr_v2)
|
|
+ size_ondisk = c->opts.btree_node_size;
|
|
+
|
|
+ bkey_extent_entry_for_each(ptrs, entry) {
|
|
+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
|
|
+ return "invalid extent entry type";
|
|
+
|
|
+ if (k.k->type == KEY_TYPE_btree_ptr &&
|
|
+ !extent_entry_is_ptr(entry))
|
|
+ return "has non ptr field";
|
|
+
|
|
+ switch (extent_entry_type(entry)) {
|
|
+ case BCH_EXTENT_ENTRY_ptr:
|
|
+ reason = extent_ptr_invalid(c, k, &entry->ptr,
|
|
+ size_ondisk, false);
|
|
+ if (reason)
|
|
+ return reason;
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc32:
|
|
+ case BCH_EXTENT_ENTRY_crc64:
|
|
+ case BCH_EXTENT_ENTRY_crc128:
|
|
+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
|
|
+
|
|
+ if (crc.offset + crc.live_size >
|
|
+ crc.uncompressed_size)
|
|
+ return "checksum offset + key size > uncompressed size";
|
|
+
|
|
+ size_ondisk = crc.compressed_size;
|
|
+
|
|
+ if (!bch2_checksum_type_valid(c, crc.csum_type))
|
|
+ return "invalid checksum type";
|
|
+
|
|
+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR)
|
|
+ return "invalid compression type";
|
|
+
|
|
+ if (bch2_csum_type_is_encryption(crc.csum_type)) {
|
|
+ if (nonce == UINT_MAX)
|
|
+ nonce = crc.offset + crc.nonce;
|
|
+ else if (nonce != crc.offset + crc.nonce)
|
|
+ return "incorrect nonce";
|
|
+ }
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_stripe_ptr:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ devs = bch2_bkey_devs(k);
|
|
+ bubble_sort(devs.devs, devs.nr, u8_cmp);
|
|
+ for (i = 0; i + 1 < devs.nr; i++)
|
|
+ if (devs.devs[i] == devs.devs[i + 1])
|
|
+ return "multiple ptrs to same device";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_ptr_swab(struct bkey_s k)
|
|
+{
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
|
|
+ union bch_extent_entry *entry;
|
|
+ u64 *d;
|
|
+
|
|
+ for (d = (u64 *) ptrs.start;
|
|
+ d != (u64 *) ptrs.end;
|
|
+ d++)
|
|
+ *d = swab64(*d);
|
|
+
|
|
+ for (entry = ptrs.start;
|
|
+ entry < ptrs.end;
|
|
+ entry = extent_entry_next(entry)) {
|
|
+ switch (extent_entry_type(entry)) {
|
|
+ case BCH_EXTENT_ENTRY_ptr:
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc32:
|
|
+ entry->crc32.csum = swab32(entry->crc32.csum);
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc64:
|
|
+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
|
|
+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc128:
|
|
+ entry->crc128.csum.hi = (__force __le64)
|
|
+ swab64((__force u64) entry->crc128.csum.hi);
|
|
+ entry->crc128.csum.lo = (__force __le64)
|
|
+ swab64((__force u64) entry->crc128.csum.lo);
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_stripe_ptr:
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Generic extent code: */
|
|
+
|
|
+int bch2_cut_front_s(struct bpos where, struct bkey_s k)
|
|
+{
|
|
+ unsigned new_val_u64s = bkey_val_u64s(k.k);
|
|
+ int val_u64s_delta;
|
|
+ u64 sub;
|
|
+
|
|
+ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
|
|
+ return 0;
|
|
+
|
|
+ EBUG_ON(bkey_cmp(where, k.k->p) > 0);
|
|
+
|
|
+ sub = where.offset - bkey_start_offset(k.k);
|
|
+
|
|
+ k.k->size -= sub;
|
|
+
|
|
+ if (!k.k->size) {
|
|
+ k.k->type = KEY_TYPE_deleted;
|
|
+ new_val_u64s = 0;
|
|
+ }
|
|
+
|
|
+ switch (k.k->type) {
|
|
+ case KEY_TYPE_extent:
|
|
+ case KEY_TYPE_reflink_v: {
|
|
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
|
|
+ union bch_extent_entry *entry;
|
|
+ bool seen_crc = false;
|
|
+
|
|
+ bkey_extent_entry_for_each(ptrs, entry) {
|
|
+ switch (extent_entry_type(entry)) {
|
|
+ case BCH_EXTENT_ENTRY_ptr:
|
|
+ if (!seen_crc)
|
|
+ entry->ptr.offset += sub;
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc32:
|
|
+ entry->crc32.offset += sub;
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc64:
|
|
+ entry->crc64.offset += sub;
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_crc128:
|
|
+ entry->crc128.offset += sub;
|
|
+ break;
|
|
+ case BCH_EXTENT_ENTRY_stripe_ptr:
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (extent_entry_is_crc(entry))
|
|
+ seen_crc = true;
|
|
+ }
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case KEY_TYPE_reflink_p: {
|
|
+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
|
|
+
|
|
+ le64_add_cpu(&p.v->idx, sub);
|
|
+ break;
|
|
+ }
|
|
+ case KEY_TYPE_inline_data:
|
|
+ case KEY_TYPE_indirect_inline_data: {
|
|
+ void *p = bkey_inline_data_p(k);
|
|
+ unsigned bytes = bkey_inline_data_bytes(k.k);
|
|
+
|
|
+ sub = min_t(u64, sub << 9, bytes);
|
|
+
|
|
+ memmove(p, p + sub, bytes - sub);
|
|
+
|
|
+ new_val_u64s -= sub >> 3;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
|
|
+ BUG_ON(val_u64s_delta < 0);
|
|
+
|
|
+ set_bkey_val_u64s(k.k, new_val_u64s);
|
|
+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
|
|
+ return -val_u64s_delta;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Trim @k so that it ends at @where.
|
|
+ *
|
|
+ * Returns 0 if @where is at or past the end of @k (nothing to cut);
|
|
+ * otherwise returns minus the number of u64s the value shrank by. A key cut
|
|
+ * down to zero size becomes KEY_TYPE_deleted with an empty value.
|
|
+ */
|
|
+int bch2_cut_back_s(struct bpos where, struct bkey_s k)
|
|
+{
|
|
+	unsigned new_val_u64s = bkey_val_u64s(k.k);
|
|
+	int val_u64s_delta;
|
|
+	u64 len = 0;
|
|
+
|
|
+	if (bkey_cmp(where, k.k->p) >= 0)
|
|
+		return 0;
|
|
+
|
|
+	EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0);
|
|
+
|
|
+	len = where.offset - bkey_start_offset(k.k);
|
|
+
|
|
+	k.k->p.offset = where.offset;
|
|
+	k.k->size = len;
|
|
+
|
|
+	if (!len) {
|
|
+		k.k->type = KEY_TYPE_deleted;
|
|
+		new_val_u64s = 0;
|
|
+	}
|
|
+
|
|
+	switch (k.k->type) {
|
|
+	case KEY_TYPE_inline_data:
|
|
+	case KEY_TYPE_indirect_inline_data:
|
|
+		/*
|
|
+		 * Convert sectors to bytes in 64 bits, as bch2_cut_front_s()
|
|
+		 * does, so that the shift cannot wrap.
|
|
+		 */
|
|
+		new_val_u64s = (bkey_inline_data_offset(k.k) +
|
|
+				min_t(u64, bkey_inline_data_bytes(k.k),
|
|
+				      (u64) k.k->size << 9)) >> 3;
|
|
+		break;
|
|
+	}
|
|
+
|
|
+	val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
|
|
+	BUG_ON(val_u64s_delta < 0);
|
|
+
|
|
+	set_bkey_val_u64s(k.k, new_val_u64s);
|
|
+	/* Zero the tail that is no longer part of the value */
|
|
+	memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
|
|
+	return -val_u64s_delta;
|
|
+}
|
|
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
|
|
new file mode 100644
|
|
index 000000000000..ccee43a2019d
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/extents.h
|
|
@@ -0,0 +1,646 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_EXTENTS_H
|
|
+#define _BCACHEFS_EXTENTS_H
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey.h"
|
|
+#include "extents_types.h"
|
|
+
|
|
+struct bch_fs;
|
|
+struct btree_trans;
|
|
+
|
|
+/* extent entries: */
|
|
+
|
|
+#define extent_entry_last(_e) \
|
|
+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e))
|
|
+
|
|
+#define entry_to_ptr(_entry) \
|
|
+({ \
|
|
+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \
|
|
+ \
|
|
+ __builtin_choose_expr( \
|
|
+ type_is_exact(_entry, const union bch_extent_entry *), \
|
|
+ (const struct bch_extent_ptr *) (_entry), \
|
|
+ (struct bch_extent_ptr *) (_entry)); \
|
|
+})
|
|
+
|
|
+/* downcast, preserves const */
|
|
+#define to_entry(_entry) \
|
|
+({ \
|
|
+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \
|
|
+ !type_is(_entry, struct bch_extent_ptr *) && \
|
|
+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \
|
|
+ \
|
|
+ __builtin_choose_expr( \
|
|
+ (type_is_exact(_entry, const union bch_extent_crc *) || \
|
|
+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\
|
|
+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\
|
|
+ (const union bch_extent_entry *) (_entry), \
|
|
+ (union bch_extent_entry *) (_entry)); \
|
|
+})
|
|
+
|
|
+#define extent_entry_next(_entry) \
|
|
+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
|
|
+
|
|
+static inline unsigned
|
|
+__extent_entry_type(const union bch_extent_entry *e)
|
|
+{
|
|
+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX;
|
|
+}
|
|
+
|
|
+static inline enum bch_extent_entry_type
|
|
+extent_entry_type(const union bch_extent_entry *e)
|
|
+{
|
|
+ int ret = __ffs(e->type);
|
|
+
|
|
+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry)
|
|
+{
|
|
+ switch (extent_entry_type(entry)) {
|
|
+#define x(f, n) \
|
|
+ case BCH_EXTENT_ENTRY_##f: \
|
|
+ return sizeof(struct bch_extent_##f);
|
|
+ BCH_EXTENT_ENTRY_TYPES()
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
|
|
+{
|
|
+ return extent_entry_bytes(entry) / sizeof(u64);
|
|
+}
|
|
+
|
|
+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
|
|
+{
|
|
+ switch (extent_entry_type(e)) {
|
|
+ case BCH_EXTENT_ENTRY_ptr:
|
|
+ return true;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
|
|
+{
|
|
+ switch (extent_entry_type(e)) {
|
|
+ case BCH_EXTENT_ENTRY_crc32:
|
|
+ case BCH_EXTENT_ENTRY_crc64:
|
|
+ case BCH_EXTENT_ENTRY_crc128:
|
|
+ return true;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+union bch_extent_crc {
|
|
+ u8 type;
|
|
+ struct bch_extent_crc32 crc32;
|
|
+ struct bch_extent_crc64 crc64;
|
|
+ struct bch_extent_crc128 crc128;
|
|
+};
|
|
+
|
|
+#define __entry_to_crc(_entry) \
|
|
+ __builtin_choose_expr( \
|
|
+ type_is_exact(_entry, const union bch_extent_entry *), \
|
|
+ (const union bch_extent_crc *) (_entry), \
|
|
+ (union bch_extent_crc *) (_entry))
|
|
+
|
|
+#define entry_to_crc(_entry) \
|
|
+({ \
|
|
+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \
|
|
+ \
|
|
+ __entry_to_crc(_entry); \
|
|
+})
|
|
+
|
|
+static inline struct bch_extent_crc_unpacked /* unpack @crc (may be NULL) for key @k */
|
|
+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
|
|
+{
|
|
+#define common_fields(_crc) \
|
|
+ .csum_type = _crc.csum_type, \
|
|
+ .compression_type = _crc.compression_type, \
|
|
+ .compressed_size = _crc._compressed_size + 1, \
|
|
+ .uncompressed_size = _crc._uncompressed_size + 1, \
|
|
+ .offset = _crc.offset, \
|
|
+ .live_size = k->size
|
|
+
|
|
+ if (!crc) /* no crc entry: all three sizes are k->size, every other field is zero */
|
|
+ return (struct bch_extent_crc_unpacked) {
|
|
+ .compressed_size = k->size,
|
|
+ .uncompressed_size = k->size,
|
|
+ .live_size = k->size,
|
|
+ };
|
|
+
|
|
+ switch (extent_entry_type(to_entry(crc))) {
|
|
+ case BCH_EXTENT_ENTRY_crc32: {
|
|
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
|
|
+ common_fields(crc->crc32),
|
|
+ };
|
|
+
|
|
+ /* crc32's csum is narrower than csum.lo: copy just its bytes, the
|
|
+ * remainder of ret.csum stays zero from the initializer above. */
|
|
+ memcpy(&ret.csum.lo, &crc->crc32.csum,
|
|
+ sizeof(crc->crc32.csum));
|
|
+
|
|
+ return ret;
|
|
+ }
|
|
+ case BCH_EXTENT_ENTRY_crc64: {
|
|
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
|
|
+ common_fields(crc->crc64),
|
|
+ .nonce = crc->crc64.nonce,
|
|
+ .csum.lo = (__force __le64) crc->crc64.csum_lo,
|
|
+ };
|
|
+
|
|
+ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi;
|
|
+
|
|
+ return ret;
|
|
+ }
|
|
+ case BCH_EXTENT_ENTRY_crc128: {
|
|
+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) {
|
|
+ common_fields(crc->crc128),
|
|
+ .nonce = crc->crc128.nonce,
|
|
+ .csum = crc->crc128.csum,
|
|
+ };
|
|
+
|
|
+ return ret;
|
|
+ }
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+#undef common_fields
|
|
+}
|
|
+
|
|
+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
|
|
+{
|
|
+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none &&
|
|
+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
|
|
+}
|
|
+
|
|
+/* bkey_ptrs: generically over any key type that has ptrs */
|
|
+
|
|
+struct bkey_ptrs_c {
|
|
+ const union bch_extent_entry *start;
|
|
+ const union bch_extent_entry *end;
|
|
+};
|
|
+
|
|
+struct bkey_ptrs {
|
|
+ union bch_extent_entry *start;
|
|
+ union bch_extent_entry *end;
|
|
+};
|
|
+
|
|
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
|
|
+{
|
|
+ switch (k.k->type) {
|
|
+ case KEY_TYPE_btree_ptr: {
|
|
+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
|
|
+ return (struct bkey_ptrs_c) {
|
|
+ to_entry(&e.v->start[0]),
|
|
+ to_entry(extent_entry_last(e))
|
|
+ };
|
|
+ }
|
|
+ case KEY_TYPE_extent: {
|
|
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
|
+ return (struct bkey_ptrs_c) {
|
|
+ e.v->start,
|
|
+ extent_entry_last(e)
|
|
+ };
|
|
+ }
|
|
+ case KEY_TYPE_stripe: {
|
|
+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
|
|
+ return (struct bkey_ptrs_c) {
|
|
+ to_entry(&s.v->ptrs[0]),
|
|
+ to_entry(&s.v->ptrs[s.v->nr_blocks]),
|
|
+ };
|
|
+ }
|
|
+ case KEY_TYPE_reflink_v: {
|
|
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
|
|
+
|
|
+ return (struct bkey_ptrs_c) {
|
|
+ r.v->start,
|
|
+ bkey_val_end(r),
|
|
+ };
|
|
+ }
|
|
+ case KEY_TYPE_btree_ptr_v2: {
|
|
+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k);
|
|
+ return (struct bkey_ptrs_c) {
|
|
+ to_entry(&e.v->start[0]),
|
|
+ to_entry(extent_entry_last(e))
|
|
+ };
|
|
+ }
|
|
+ default:
|
|
+ return (struct bkey_ptrs_c) { NULL, NULL };
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
|
|
+{
|
|
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
|
|
+
|
|
+ return (struct bkey_ptrs) {
|
|
+ (void *) p.start,
|
|
+ (void *) p.end
|
|
+ };
|
|
+}
|
|
+
|
|
+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \
|
|
+ for ((_entry) = (_start); \
|
|
+ (_entry) < (_end); \
|
|
+ (_entry) = extent_entry_next(_entry))
|
|
+
|
|
+#define __bkey_ptr_next(_ptr, _end) \
|
|
+({ \
|
|
+ typeof(_end) _entry; \
|
|
+ \
|
|
+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \
|
|
+ if (extent_entry_is_ptr(_entry)) \
|
|
+ break; \
|
|
+ \
|
|
+ _entry < (_end) ? entry_to_ptr(_entry) : NULL; \
|
|
+})
|
|
+
|
|
+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \
|
|
+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry)
|
|
+
|
|
+#define bkey_extent_entry_for_each(_p, _entry) \
|
|
+ bkey_extent_entry_for_each_from(_p, _entry, (_p).start) /* walk every entry, starting at (_p).start */
|
|
+
|
|
+#define __bkey_for_each_ptr(_start, _end, _ptr) \
|
|
+ for ((_ptr) = (_start); \
|
|
+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \
|
|
+ (_ptr)++)
|
|
+
|
|
+#define bkey_ptr_next(_p, _ptr) \
|
|
+ __bkey_ptr_next(_ptr, (_p).end)
|
|
+
|
|
+#define bkey_for_each_ptr(_p, _ptr) \
|
|
+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr)
|
|
+
|
|
+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \
|
|
+({ \
|
|
+ __label__ out; \
|
|
+ \
|
|
+ (_ptr).idx = 0; \
|
|
+ (_ptr).has_ec = false; \
|
|
+ \
|
|
+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \
|
|
+ switch (extent_entry_type(_entry)) { \
|
|
+ case BCH_EXTENT_ENTRY_ptr: \
|
|
+ (_ptr).ptr = _entry->ptr; \
|
|
+ goto out; \
|
|
+ case BCH_EXTENT_ENTRY_crc32: \
|
|
+ case BCH_EXTENT_ENTRY_crc64: \
|
|
+ case BCH_EXTENT_ENTRY_crc128: \
|
|
+ (_ptr).crc = bch2_extent_crc_unpack(_k, \
|
|
+ entry_to_crc(_entry)); \
|
|
+ break; \
|
|
+ case BCH_EXTENT_ENTRY_stripe_ptr: \
|
|
+ (_ptr).ec = _entry->stripe_ptr; \
|
|
+ (_ptr).has_ec = true; \
|
|
+ break; \
|
|
+ } \
|
|
+out: \
|
|
+ _entry < (_end); \
|
|
+})
|
|
+
|
|
+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \
|
|
+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \
|
|
+ (_entry) = _start; \
|
|
+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \
|
|
+ (_entry) = extent_entry_next(_entry))
|
|
+
|
|
+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \
|
|
+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \
|
|
+ _ptr, _entry)
|
|
+
|
|
+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \
|
|
+({ \
|
|
+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \
|
|
+ if (extent_entry_is_crc(_iter)) { \
|
|
+ (_crc) = bch2_extent_crc_unpack(_k, \
|
|
+ entry_to_crc(_iter)); \
|
|
+ break; \
|
|
+ } \
|
|
+ \
|
|
+ (_iter) < (_end); \
|
|
+})
|
|
+
|
|
+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \
|
|
+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \
|
|
+ (_iter) = (_start); \
|
|
+ bkey_crc_next(_k, _start, _end, _crc, _iter); \
|
|
+ (_iter) = extent_entry_next(_iter))
|
|
+
|
|
+#define bkey_for_each_crc(_k, _p, _crc, _iter) \
|
|
+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
|
|
+
|
|
+/* Iterate over pointers in KEY_TYPE_extent: */
|
|
+
|
|
+#define extent_for_each_entry_from(_e, _entry, _start) \
|
|
+ __bkey_extent_entry_for_each_from(_start, \
|
|
+ extent_entry_last(_e),_entry)
|
|
+
|
|
+#define extent_for_each_entry(_e, _entry) \
|
|
+ extent_for_each_entry_from(_e, _entry, (_e).v->start)
|
|
+
|
|
+#define extent_ptr_next(_e, _ptr) \
|
|
+ __bkey_ptr_next(_ptr, extent_entry_last(_e))
|
|
+
|
|
+#define extent_for_each_ptr(_e, _ptr) \
|
|
+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
|
|
+
|
|
+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \
|
|
+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \
|
|
+ extent_entry_last(_e), _ptr, _entry)
|
|
+
|
|
+/* utility code common to all keys with pointers: */
|
|
+
|
|
+void bch2_mark_io_failure(struct bch_io_failures *,
|
|
+ struct extent_ptr_decoded *);
|
|
+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
|
|
+ struct bch_io_failures *,
|
|
+ struct extent_ptr_decoded *);
|
|
+
|
|
+/* KEY_TYPE_btree_ptr: */
|
|
+
|
|
+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+
|
|
+const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
|
|
+ int, struct bkey_s);
|
|
+
|
|
+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_btree_ptr_invalid, \
|
|
+ .val_to_text = bch2_btree_ptr_to_text, \
|
|
+ .swab = bch2_ptr_swab, \
|
|
+}
|
|
+
|
|
+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_btree_ptr_v2_invalid, \
|
|
+ .val_to_text = bch2_btree_ptr_v2_to_text, \
|
|
+ .swab = bch2_ptr_swab, \
|
|
+ .compat = bch2_btree_ptr_v2_compat, \
|
|
+}
|
|
+
|
|
+/* KEY_TYPE_extent: */
|
|
+
|
|
+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
+enum merge_result bch2_extent_merge(struct bch_fs *,
|
|
+ struct bkey_s, struct bkey_s);
|
|
+
|
|
+#define bch2_bkey_ops_extent (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_extent_invalid, \
|
|
+ .val_to_text = bch2_extent_to_text, \
|
|
+ .swab = bch2_ptr_swab, \
|
|
+ .key_normalize = bch2_extent_normalize, \
|
|
+ .key_merge = bch2_extent_merge, \
|
|
+}
|
|
+
|
|
+/* KEY_TYPE_reservation: */
|
|
+
|
|
+const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
+enum merge_result bch2_reservation_merge(struct bch_fs *,
|
|
+ struct bkey_s, struct bkey_s);
|
|
+
|
|
+#define bch2_bkey_ops_reservation (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_reservation_invalid, \
|
|
+ .val_to_text = bch2_reservation_to_text, \
|
|
+ .key_merge = bch2_reservation_merge, \
|
|
+}
|
|
+
|
|
+/* Extent checksum entries: */
|
|
+
|
|
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
|
|
+ struct bch_extent_crc_unpacked);
|
|
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
|
|
+void bch2_extent_crc_append(struct bkey_i *,
|
|
+ struct bch_extent_crc_unpacked);
|
|
+
|
|
+/* Generic code for keys with pointers: */
|
|
+
|
|
+static inline bool bkey_extent_is_direct_data(const struct bkey *k)
|
|
+{
|
|
+ switch (k->type) {
|
|
+ case KEY_TYPE_btree_ptr:
|
|
+ case KEY_TYPE_btree_ptr_v2:
|
|
+ case KEY_TYPE_extent:
|
|
+ case KEY_TYPE_reflink_v:
|
|
+ return true;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline bool bkey_extent_is_inline_data(const struct bkey *k)
|
|
+{
|
|
+ return k->type == KEY_TYPE_inline_data ||
|
|
+ k->type == KEY_TYPE_indirect_inline_data;
|
|
+}
|
|
+
|
|
+static inline unsigned bkey_inline_data_offset(const struct bkey *k)
|
|
+{
|
|
+ switch (k->type) {
|
|
+ case KEY_TYPE_inline_data:
|
|
+ return sizeof(struct bch_inline_data);
|
|
+ case KEY_TYPE_indirect_inline_data:
|
|
+ return sizeof(struct bch_indirect_inline_data);
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline unsigned bkey_inline_data_bytes(const struct bkey *k)
|
|
+{
|
|
+ return bkey_val_bytes(k) - bkey_inline_data_offset(k);
|
|
+}
|
|
+
|
|
+#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k))
|
|
+
|
|
+static inline bool bkey_extent_is_data(const struct bkey *k)
|
|
+{
|
|
+ return bkey_extent_is_direct_data(k) ||
|
|
+ bkey_extent_is_inline_data(k) ||
|
|
+ k->type == KEY_TYPE_reflink_p;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Should extent be counted under inode->i_sectors?
|
|
+ */
|
|
+static inline bool bkey_extent_is_allocation(const struct bkey *k)
|
|
+{
|
|
+ switch (k->type) {
|
|
+ case KEY_TYPE_extent:
|
|
+ case KEY_TYPE_reservation:
|
|
+ case KEY_TYPE_reflink_p:
|
|
+ case KEY_TYPE_reflink_v:
|
|
+ case KEY_TYPE_inline_data:
|
|
+ case KEY_TYPE_indirect_inline_data:
|
|
+ return true;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
|
|
+{
|
|
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
|
|
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+
|
|
+ bkey_for_each_ptr(p, ptr)
|
|
+ ret.devs[ret.nr++] = ptr->dev;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
|
|
+{
|
|
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
|
|
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+
|
|
+ bkey_for_each_ptr(p, ptr)
|
|
+ if (!ptr->cached)
|
|
+ ret.devs[ret.nr++] = ptr->dev;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
|
|
+{
|
|
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
|
|
+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+
|
|
+ bkey_for_each_ptr(p, ptr)
|
|
+ if (ptr->cached)
|
|
+ ret.devs[ret.nr++] = ptr->dev;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
|
|
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
|
|
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
|
|
+bool bch2_bkey_is_incompressible(struct bkey_s_c);
|
|
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
|
|
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
|
|
+
|
|
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
|
|
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
|
|
+ unsigned, unsigned);
|
|
+
|
|
+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
|
|
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
|
|
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
|
|
+ struct extent_ptr_decoded *);
|
|
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
|
|
+ struct bch_extent_ptr *);
|
|
+
|
|
+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \
|
|
+do { \
|
|
+ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \
|
|
+ \
|
|
+ _ptr = &_ptrs.start->ptr; \
|
|
+ \
|
|
+ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \
|
|
+ if (_cond) { \
|
|
+ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \
|
|
+ _ptrs = bch2_bkey_ptrs(_k); \
|
|
+ continue; \
|
|
+ } \
|
|
+ \
|
|
+ (_ptr)++; \
|
|
+ } \
|
|
+} while (0)
|
|
+
|
|
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
|
|
+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
|
|
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
|
|
+
|
|
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
|
|
+ struct bch_extent_ptr, u64);
|
|
+
|
|
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
|
|
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+void bch2_ptr_swab(struct bkey_s);
|
|
+
|
|
+/* Generic extent code: */
|
|
+
|
|
+enum bch_extent_overlap {
|
|
+ BCH_EXTENT_OVERLAP_ALL = 0,
|
|
+ BCH_EXTENT_OVERLAP_BACK = 1,
|
|
+ BCH_EXTENT_OVERLAP_FRONT = 2,
|
|
+ BCH_EXTENT_OVERLAP_MIDDLE = 3,
|
|
+};
|
|
+
|
|
+/* Returns how k overlaps with m, encoded as (cmp1 << 1) + cmp2 to match enum bch_extent_overlap */
|
|
+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k,
|
|
+ const struct bkey *m)
|
|
+{
|
|
+ int cmp1 = bkey_cmp(k->p, m->p) < 0; /* 1 if k->p sorts before m->p (k ends first) */
|
|
+ int cmp2 = bkey_cmp(bkey_start_pos(k),
|
|
+ bkey_start_pos(m)) > 0; /* 1 if k starts after m starts */
|
|
+
|
|
+ return (cmp1 << 1) + cmp2; /* 0 = ALL, 1 = BACK, 2 = FRONT, 3 = MIDDLE */
|
|
+}
|
|
+
|
|
+int bch2_cut_front_s(struct bpos, struct bkey_s);
|
|
+int bch2_cut_back_s(struct bpos, struct bkey_s);
|
|
+
|
|
+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
|
|
+{
|
|
+ bch2_cut_front_s(where, bkey_i_to_s(k));
|
|
+}
|
|
+
|
|
+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
|
|
+{
|
|
+ bch2_cut_back_s(where, bkey_i_to_s(k));
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_key_resize - adjust size of @k
|
|
+ *
|
|
+ * bkey_start_offset(k) will be preserved, modifies where the extent ends
|
|
+ */
|
|
+static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
|
|
+{
|
|
+ k->p.offset -= k->size;
|
|
+ k->p.offset += new_size;
|
|
+ k->size = new_size;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * In extent_sort_fix_overlapping(), insert_fixup_extent(),
|
|
+ * extent_merge_inline() - we're modifying keys in place that are packed. To do
|
|
+ * that we have to unpack the key, modify the unpacked key - then this
|
|
+ * copies/repacks the unpacked to the original as necessary.
|
|
+ */
|
|
+static inline void extent_save(struct btree *b, struct bkey_packed *dst,
|
|
+ struct bkey *src)
|
|
+{
|
|
+ struct bkey_format *f = &b->format;
|
|
+ struct bkey_i *dst_unpacked;
|
|
+
|
|
+ if ((dst_unpacked = packed_to_bkey(dst)))
|
|
+ dst_unpacked->k = *src;
|
|
+ else
|
|
+ BUG_ON(!bch2_bkey_pack_key(dst, src, f));
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_EXTENTS_H */
|
|
diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h
|
|
new file mode 100644
|
|
index 000000000000..43d6c341ecca
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/extents_types.h
|
|
@@ -0,0 +1,40 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_EXTENTS_TYPES_H
|
|
+#define _BCACHEFS_EXTENTS_TYPES_H
|
|
+
|
|
+#include "bcachefs_format.h"
|
|
+
|
|
+struct bch_extent_crc_unpacked {
|
|
+ u32 compressed_size;
|
|
+ u32 uncompressed_size;
|
|
+ u32 live_size;
|
|
+
|
|
+ u8 csum_type;
|
|
+ u8 compression_type;
|
|
+
|
|
+ u16 offset;
|
|
+
|
|
+ u16 nonce;
|
|
+
|
|
+ struct bch_csum csum;
|
|
+};
|
|
+
|
|
+struct extent_ptr_decoded {
|
|
+ unsigned idx;
|
|
+ bool has_ec;
|
|
+ struct bch_extent_crc_unpacked crc;
|
|
+ struct bch_extent_ptr ptr;
|
|
+ struct bch_extent_stripe_ptr ec;
|
|
+};
|
|
+
|
|
+struct bch_io_failures {
|
|
+ u8 nr;
|
|
+ struct bch_dev_io_failures {
|
|
+ u8 dev;
|
|
+ u8 idx;
|
|
+ u8 nr_failed;
|
|
+ u8 nr_retries;
|
|
+ } devs[BCH_REPLICAS_MAX];
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_EXTENTS_TYPES_H */
|
|
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
|
|
new file mode 100644
|
|
index 000000000000..26d5cad7e6a5
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/eytzinger.h
|
|
@@ -0,0 +1,285 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _EYTZINGER_H
|
|
+#define _EYTZINGER_H
|
|
+
|
|
+#include <linux/bitops.h>
|
|
+#include <linux/log2.h>
|
|
+
|
|
+#include "util.h"
|
|
+
|
|
+/*
|
|
+ * Traversal for trees in eytzinger layout - a full binary tree laid out in an
|
|
+ * array
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * One based indexing version:
|
|
+ *
|
|
+ * With one based indexing each level of the tree starts at a power of two -
|
|
+ * good for cacheline alignment:
|
|
+ *
|
|
+ * Size parameter is treated as if we were using 0 based indexing, however:
|
|
+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there
|
|
+ * are actually size - 1 elements
|
|
+ */
|
|
+
|
|
+static inline unsigned eytzinger1_child(unsigned i, unsigned child)
|
|
+{
|
|
+ EBUG_ON(child > 1);
|
|
+
|
|
+ return (i << 1) + child;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger1_left_child(unsigned i)
|
|
+{
|
|
+ return eytzinger1_child(i, 0);
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger1_right_child(unsigned i)
|
|
+{
|
|
+ return eytzinger1_child(i, 1);
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger1_first(unsigned size)
|
|
+{
|
|
+ return rounddown_pow_of_two(size - 1);
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger1_last(unsigned size)
|
|
+{
|
|
+ return rounddown_pow_of_two(size) - 1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that
|
|
+ *
|
|
+ * eytzinger1_next(0) == eytzinger1_first()
|
|
+ * eytzinger1_prev(0) == eytzinger1_last()
|
|
+ *
|
|
+ * eytzinger1_prev(eytzinger1_first()) == 0
|
|
+ * eytzinger1_next(eytzinger1_last()) == 0
|
|
+ */
|
|
+
|
|
+static inline unsigned eytzinger1_next(unsigned i, unsigned size)
|
|
+{
|
|
+ EBUG_ON(i >= size);
|
|
+
|
|
+ if (eytzinger1_right_child(i) < size) {
|
|
+ i = eytzinger1_right_child(i);
|
|
+
|
|
+ i <<= __fls(size) - __fls(i);
|
|
+ i >>= i >= size;
|
|
+ } else {
|
|
+ i >>= ffz(i) + 1;
|
|
+ }
|
|
+
|
|
+ return i;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
|
|
+{
|
|
+ EBUG_ON(i >= size);
|
|
+
|
|
+ if (eytzinger1_left_child(i) < size) {
|
|
+ i = eytzinger1_left_child(i) + 1;
|
|
+
|
|
+ i <<= __fls(size) - __fls(i);
|
|
+ i -= 1;
|
|
+ i >>= i >= size;
|
|
+ } else {
|
|
+ i >>= __ffs(i) + 1;
|
|
+ }
|
|
+
|
|
+ return i;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger1_extra(unsigned size)
|
|
+{
|
|
+ return (size - rounddown_pow_of_two(size - 1)) << 1;
|
|
+}
|
|
+
|
|
+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
|
|
+ unsigned extra)
|
|
+{
|
|
+ unsigned b = __fls(i);
|
|
+ unsigned shift = __fls(size - 1) - b;
|
|
+ int s;
|
|
+
|
|
+ EBUG_ON(!i || i >= size);
|
|
+
|
|
+ i ^= 1U << b;
|
|
+ i <<= 1;
|
|
+ i |= 1;
|
|
+ i <<= shift;
|
|
+
|
|
+ /*
|
|
+ * sign bit trick:
|
|
+ *
|
|
+ * if (i > extra)
|
|
+ * i -= (i - extra) >> 1;
|
|
+ */
|
|
+ s = extra - i;
|
|
+ i += (s >> 1) & (s >> 31);
|
|
+
|
|
+ return i;
|
|
+}
|
|
+
|
|
+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
|
|
+ unsigned extra)
|
|
+{
|
|
+ unsigned shift;
|
|
+ int s;
|
|
+
|
|
+ EBUG_ON(!i || i >= size);
|
|
+
|
|
+ /*
|
|
+ * sign bit trick:
|
|
+ *
|
|
+ * if (i > extra)
|
|
+ * i += i - extra;
|
|
+ */
|
|
+ s = extra - i;
|
|
+ i -= s & (s >> 31);
|
|
+
|
|
+ shift = __ffs(i);
|
|
+
|
|
+ i >>= shift + 1;
|
|
+ i |= 1U << (__fls(size - 1) - shift);
|
|
+
|
|
+ return i;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size)
|
|
+{
|
|
+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size));
|
|
+}
|
|
+
|
|
+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
|
|
+{
|
|
+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size));
|
|
+}
|
|
+
|
|
+#define eytzinger1_for_each(_i, _size) \
|
|
+ for ((_i) = eytzinger1_first((_size)); \
|
|
+ (_i) != 0; \
|
|
+ (_i) = eytzinger1_next((_i), (_size)))
|
|
+
|
|
+/* Zero based indexing version: */
|
|
+
|
|
+static inline unsigned eytzinger0_child(unsigned i, unsigned child)
|
|
+{
|
|
+ EBUG_ON(child > 1);
|
|
+
|
|
+ return (i << 1) + 1 + child;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_left_child(unsigned i)
|
|
+{
|
|
+ return eytzinger0_child(i, 0);
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_right_child(unsigned i)
|
|
+{
|
|
+ return eytzinger0_child(i, 1);
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_first(unsigned size)
|
|
+{
|
|
+ return eytzinger1_first(size + 1) - 1;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_last(unsigned size)
|
|
+{
|
|
+ return eytzinger1_last(size + 1) - 1;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_next(unsigned i, unsigned size)
|
|
+{
|
|
+ return eytzinger1_next(i + 1, size + 1) - 1;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_prev(unsigned i, unsigned size)
|
|
+{
|
|
+ return eytzinger1_prev(i + 1, size + 1) - 1;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_extra(unsigned size)
|
|
+{
|
|
+ return eytzinger1_extra(size + 1);
|
|
+}
|
|
+
|
|
+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size,
|
|
+ unsigned extra)
|
|
+{
|
|
+ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1;
|
|
+}
|
|
+
|
|
+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size,
|
|
+ unsigned extra)
|
|
+{
|
|
+ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1;
|
|
+}
|
|
+
|
|
+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size)
|
|
+{
|
|
+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size));
|
|
+}
|
|
+
|
|
+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
|
|
+{
|
|
+ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size));
|
|
+}
|
|
+
|
|
+#define eytzinger0_for_each(_i, _size) \
|
|
+ for ((_i) = eytzinger0_first((_size)); \
|
|
+ (_i) != -1; \
|
|
+ (_i) = eytzinger0_next((_i), (_size)))
|
|
+
|
|
+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
|
|
+
|
|
+/* return greatest node <= @search, or -1 if not found */
|
|
+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
|
|
+ eytzinger_cmp_fn cmp, const void *search)
|
|
+{
|
|
+ unsigned i, n = 0;
|
|
+
|
|
+ if (!nr)
|
|
+ return -1;
|
|
+
|
|
+ do {
|
|
+ i = n;
|
|
+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
|
|
+ } while (n < nr);
|
|
+
|
|
+ if (n & 1) {
|
|
+ /* @i was greater than @search, return previous node: */
|
|
+
|
|
+ if (i == eytzinger0_first(nr))
|
|
+ return -1;
|
|
+
|
|
+ return eytzinger0_prev(i, nr);
|
|
+ } else {
|
|
+ return i;
|
|
+ }
|
|
+}
|
|
+
|
|
+#define eytzinger0_find(base, nr, size, _cmp, search) \
|
|
+({ \
|
|
+ void *_base = (base); \
|
|
+ void *_search = (search); \
|
|
+ size_t _nr = (nr); \
|
|
+ size_t _size = (size); \
|
|
+ size_t _i = 0; \
|
|
+ int _res; \
|
|
+ \
|
|
+ while (_i < _nr && \
|
|
+ (_res = _cmp(_search, _base + _i * _size, _size))) \
|
|
+ _i = eytzinger0_child(_i, _res > 0); \
|
|
+ _i; \
|
|
+})
|
|
+
|
|
+void eytzinger0_sort(void *, size_t, size_t,
|
|
+ int (*cmp_func)(const void *, const void *, size_t),
|
|
+ void (*swap_func)(void *, void *, size_t));
|
|
+
|
|
+#endif /* _EYTZINGER_H */
|
|
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
|
|
new file mode 100644
|
|
index 000000000000..cdb272708a4b
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fifo.h
|
|
@@ -0,0 +1,127 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_FIFO_H
|
|
+#define _BCACHEFS_FIFO_H
|
|
+
|
|
+#include "util.h"
|
|
+
|
|
+#define FIFO(type) \
|
|
+struct { \
|
|
+ size_t front, back, size, mask; \
|
|
+ type *data; \
|
|
+}
|
|
+
|
|
+#define DECLARE_FIFO(type, name) FIFO(type) name
|
|
+
|
|
+#define fifo_buf_size(fifo) \
|
|
+ ((fifo)->size \
|
|
+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \
|
|
+ : 0)
|
|
+
|
|
+#define init_fifo(fifo, _size, _gfp) \
|
|
+({ \
|
|
+ (fifo)->front = (fifo)->back = 0; \
|
|
+ (fifo)->size = (_size); \
|
|
+ (fifo)->mask = (fifo)->size \
|
|
+ ? roundup_pow_of_two((fifo)->size) - 1 \
|
|
+ : 0; \
|
|
+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \
|
|
+})
|
|
+
|
|
+#define free_fifo(fifo) \
|
|
+do { \
|
|
+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \
|
|
+ (fifo)->data = NULL; \
|
|
+} while (0)
|
|
+
|
|
+#define fifo_swap(l, r) \
|
|
+do { \
|
|
+ swap((l)->front, (r)->front); \
|
|
+ swap((l)->back, (r)->back); \
|
|
+ swap((l)->size, (r)->size); \
|
|
+ swap((l)->mask, (r)->mask); \
|
|
+ swap((l)->data, (r)->data); \
|
|
+} while (0)
|
|
+
|
|
+#define fifo_move(dest, src) \
|
|
+do { \
|
|
+ typeof(*((dest)->data)) _t; \
|
|
+ while (!fifo_full(dest) && \
|
|
+ fifo_pop(src, _t)) \
|
|
+ fifo_push(dest, _t); \
|
|
+} while (0)
|
|
+
|
|
+#define fifo_used(fifo) (((fifo)->back - (fifo)->front))
|
|
+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo))
|
|
+
|
|
+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back)
|
|
+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size)
|
|
+
|
|
+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask])
|
|
+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask])
|
|
+
|
|
+#define fifo_entry_idx_abs(fifo, p) \
|
|
+ ((((p) >= &fifo_peek_front(fifo) \
|
|
+ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \
|
|
+ (((p) - (fifo)->data)))
|
|
+
|
|
+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask)
|
|
+#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask]
|
|
+
|
|
+#define fifo_push_back_ref(f) \
|
|
+ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask])
|
|
+
|
|
+#define fifo_push_front_ref(f) \
|
|
+ (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask])
|
|
+
|
|
+#define fifo_push_back(fifo, new) \
|
|
+({ \
|
|
+ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \
|
|
+ if (_r) \
|
|
+ *_r = (new); \
|
|
+ _r != NULL; \
|
|
+})
|
|
+
|
|
+#define fifo_push_front(fifo, new) \
|
|
+({ \
|
|
+ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \
|
|
+ if (_r) \
|
|
+ *_r = (new); \
|
|
+ _r != NULL; \
|
|
+})
|
|
+
|
|
+#define fifo_pop_front(fifo, i) \
|
|
+({ \
|
|
+ bool _r = !fifo_empty((fifo)); \
|
|
+ if (_r) \
|
|
+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \
|
|
+ _r; \
|
|
+})
|
|
+
|
|
+#define fifo_pop_back(fifo, i) \
|
|
+({ \
|
|
+ bool _r = !fifo_empty((fifo)); \
|
|
+ if (_r) \
|
|
+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \
|
|
+ _r; \
|
|
+})
|
|
+
|
|
+#define fifo_push_ref(fifo) fifo_push_back_ref(fifo)
|
|
+#define fifo_push(fifo, i) fifo_push_back(fifo, (i))
|
|
+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i))
|
|
+#define fifo_peek(fifo) fifo_peek_front(fifo)
|
|
+
|
|
+#define fifo_for_each_entry(_entry, _fifo, _iter) \
|
|
+ for (typecheck(typeof((_fifo)->front), _iter), \
|
|
+ (_iter) = (_fifo)->front; \
|
|
+ ((_iter != (_fifo)->back) && \
|
|
+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \
|
|
+ (_iter)++)
|
|
+
|
|
+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \
|
|
+ for (typecheck(typeof((_fifo)->front), _iter), \
|
|
+ (_iter) = (_fifo)->front; \
|
|
+ ((_iter != (_fifo)->back) && \
|
|
+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \
|
|
+ (_iter)++)
|
|
+
|
|
+#endif /* _BCACHEFS_FIFO_H */
|
|
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
|
|
new file mode 100644
|
|
index 000000000000..34d69c3f6680
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs-common.c
|
|
@@ -0,0 +1,361 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "acl.h"
|
|
+#include "btree_update.h"
|
|
+#include "dirent.h"
|
|
+#include "fs-common.h"
|
|
+#include "inode.h"
|
|
+#include "xattr.h"
|
|
+
|
|
+#include <linux/posix_acl.h>
|
|
+
|
|
+/*
|
|
+ * Create a new inode inside @trans and, when @name is non-NULL, link it into
|
|
+ * directory @dir_inum.
|
|
+ *
|
|
+ * The directory inode is read into @dir_u, @new_inode is late-initialized
|
|
+ * and created, @default_acl / @acl are applied when non-NULL, and with a
|
|
+ * name the directory inode is updated and a dirent is created.  With a NULL
|
|
+ * @name the inode is flagged BCH_INODE_UNLINKED and no dirent is created.
|
|
+ *
|
|
+ * Returns 0 or the error of the first failing step.  bch2_trans_iter_put()
|
|
+ * is called on both iterators on every path.
|
|
+ */
|
|
+int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
|
|
+		      struct bch_inode_unpacked *dir_u,
|
|
+		      struct bch_inode_unpacked *new_inode,
|
|
+		      const struct qstr *name,
|
|
+		      uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
|
|
+		      struct posix_acl *default_acl,
|
|
+		      struct posix_acl *acl)
|
|
+{
|
|
+	struct bch_fs *c = trans->c;
|
|
+	struct btree_iter *dir_iter = NULL;
|
|
+	struct btree_iter *inode_iter = NULL;
|
|
+	struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
|
|
+	u64 now = bch2_current_time(c);
|
|
+	u64 dir_offset = 0;
|
|
+	int ret;
|
|
+
|
|
+	dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
|
|
+	ret = PTR_ERR_OR_ZERO(dir_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
|
|
+
|
|
+	/* No name means no dirent will reference the inode */
|
|
+	if (!name)
|
|
+		new_inode->bi_flags |= BCH_INODE_UNLINKED;
|
|
+
|
|
+	inode_iter = bch2_inode_create(trans, new_inode, U32_MAX);
|
|
+	ret = PTR_ERR_OR_ZERO(inode_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	if (default_acl) {
|
|
+		ret = bch2_set_acl_trans(trans, new_inode, &hash,
|
|
+					 default_acl, ACL_TYPE_DEFAULT);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+	}
|
|
+
|
|
+	if (acl) {
|
|
+		ret = bch2_set_acl_trans(trans, new_inode, &hash,
|
|
+					 acl, ACL_TYPE_ACCESS);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+	}
|
|
+
|
|
+	if (name) {
|
|
+		struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u);
|
|
+		dir_u->bi_mtime = dir_u->bi_ctime = now;
|
|
+
|
|
+		/* A new subdirectory adds one link to its parent */
|
|
+		if (S_ISDIR(new_inode->bi_mode))
|
|
+			dir_u->bi_nlink++;
|
|
+
|
|
+		ret = bch2_inode_write(trans, dir_iter, dir_u);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+
|
|
+		/* dir_offset receives the position of the new dirent */
|
|
+		ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
|
|
+					 mode_to_type(new_inode->bi_mode),
|
|
+					 name, new_inode->bi_inum,
|
|
+					 &dir_offset,
|
|
+					 BCH_HASH_SET_MUST_CREATE);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+	}
|
|
+
|
|
+	/* Record the backpointer (directory + dirent offset) on the inode */
|
|
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
|
|
+		new_inode->bi_dir		= dir_u->bi_inum;
|
|
+		new_inode->bi_dir_offset	= dir_offset;
|
|
+	}
|
|
+
|
|
+	/* XXX use bch2_btree_iter_set_snapshot() */
|
|
+	inode_iter->snapshot = U32_MAX;
|
|
+	bch2_btree_iter_set_pos(inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX));
|
|
+
|
|
+	ret = bch2_inode_write(trans, inode_iter, new_inode);
|
|
+err:
|
|
+	bch2_trans_iter_put(trans, inode_iter);
|
|
+	bch2_trans_iter_put(trans, dir_iter);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Add a hard link: create dirent @name in directory @dir_inum pointing at
|
|
+ * the existing inode @inum.
|
|
+ *
|
|
+ * The inode is read into @inode_u (ctime bumped, link count incremented),
|
|
+ * the directory into @dir_u (mtime/ctime bumped); both are written back
|
|
+ * after the dirent has been created.  Returns 0 or the first error hit.
|
|
+ */
|
|
+int bch2_link_trans(struct btree_trans *trans, u64 dir_inum,
|
|
+		    u64 inum, struct bch_inode_unpacked *dir_u,
|
|
+		    struct bch_inode_unpacked *inode_u, const struct qstr *name)
|
|
+{
|
|
+	struct bch_fs *c = trans->c;
|
|
+	struct btree_iter *dir_iter = NULL, *inode_iter = NULL;
|
|
+	struct bch_hash_info dir_hash;
|
|
+	u64 now = bch2_current_time(c);
|
|
+	u64 dir_offset = 0;
|
|
+	int ret;
|
|
+
|
|
+	inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
|
|
+	ret = PTR_ERR_OR_ZERO(inode_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	inode_u->bi_ctime = now;
|
|
+	bch2_inode_nlink_inc(inode_u);
|
|
+
|
|
+	dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0);
|
|
+	ret = PTR_ERR_OR_ZERO(dir_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	dir_u->bi_mtime = dir_u->bi_ctime = now;
|
|
+
|
|
+	dir_hash = bch2_hash_info_init(c, dir_u);
|
|
+
|
|
+	/* dir_offset receives the position of the new dirent */
|
|
+	ret = bch2_dirent_create(trans, dir_inum, &dir_hash,
|
|
+				 mode_to_type(inode_u->bi_mode),
|
|
+				 name, inum, &dir_offset,
|
|
+				 BCH_HASH_SET_MUST_CREATE);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	/* Point the inode's backpointer at the dirent just created */
|
|
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
|
|
+		inode_u->bi_dir		= dir_inum;
|
|
+		inode_u->bi_dir_offset	= dir_offset;
|
|
+	}
|
|
+
|
|
+	ret =   bch2_inode_write(trans, dir_iter, dir_u) ?:
|
|
+		bch2_inode_write(trans, inode_iter, inode_u);
|
|
+err:
|
|
+	bch2_trans_iter_put(trans, dir_iter);
|
|
+	bch2_trans_iter_put(trans, inode_iter);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Remove dirent @name from directory @dir_inum and drop one link from the
|
|
+ * inode it referred to.
|
|
+ *
|
|
+ * The directory is read into @dir_u and the target inode into @inode_u.  If
|
|
+ * the inode's backpointer refers to the dirent being removed it is cleared.
|
|
+ * When the target is a directory it must pass bch2_empty_dir_trans() and the
|
|
+ * parent's link count is decremented.  Returns 0 or the first error hit.
|
|
+ */
|
|
+int bch2_unlink_trans(struct btree_trans *trans,
|
|
+		      u64 dir_inum, struct bch_inode_unpacked *dir_u,
|
|
+		      struct bch_inode_unpacked *inode_u,
|
|
+		      const struct qstr *name)
|
|
+{
|
|
+	struct bch_fs *c = trans->c;
|
|
+	struct btree_iter *dir_iter = NULL, *dirent_iter = NULL,
|
|
+			  *inode_iter = NULL;
|
|
+	struct bch_hash_info dir_hash;
|
|
+	u64 inum, now = bch2_current_time(c);
|
|
+	struct bkey_s_c k;
|
|
+	int ret;
|
|
+
|
|
+	dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT);
|
|
+	ret = PTR_ERR_OR_ZERO(dir_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	dir_hash = bch2_hash_info_init(c, dir_u);
|
|
+
|
|
+	dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash,
|
|
+						 name, BTREE_ITER_INTENT);
|
|
+	ret = PTR_ERR_OR_ZERO(dirent_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	k = bch2_btree_iter_peek_slot(dirent_iter);
|
|
+	/* The peek can fail and hand back an error key: check before use */
|
|
+	ret = PTR_ERR_OR_ZERO(k.k);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum);
|
|
+
|
|
+	inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT);
|
|
+	ret = PTR_ERR_OR_ZERO(inode_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	/* Clear the backpointer only if it names the dirent being removed */
|
|
+	if (inode_u->bi_dir		== k.k->p.inode &&
|
|
+	    inode_u->bi_dir_offset	== k.k->p.offset) {
|
|
+		inode_u->bi_dir		= 0;
|
|
+		inode_u->bi_dir_offset	= 0;
|
|
+	}
|
|
+
|
|
+	dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now;
|
|
+	/* Removing a subdirectory drops one link from its parent */
|
|
+	dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode);
|
|
+	bch2_inode_nlink_dec(inode_u);
|
|
+
|
|
+	ret =   (S_ISDIR(inode_u->bi_mode)
|
|
+		 ? bch2_empty_dir_trans(trans, inum)
|
|
+		 : 0) ?:
|
|
+		bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?:
|
|
+		bch2_inode_write(trans, dir_iter, dir_u) ?:
|
|
+		bch2_inode_write(trans, inode_iter, inode_u);
|
|
+err:
|
|
+	bch2_trans_iter_put(trans, inode_iter);
|
|
+	bch2_trans_iter_put(trans, dirent_iter);
|
|
+	bch2_trans_iter_put(trans, dir_iter);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Copy every inode option that is not explicitly set on @dst_u (per its
|
|
+ * bi_fields_set mask) from @src_u.
|
|
+ *
|
|
+ * Returns true if at least one option value on @dst_u was changed.
|
|
+ */
|
|
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
|
|
+			  struct bch_inode_unpacked *src_u)
|
|
+{
|
|
+	bool changed = false;
|
|
+	unsigned opt;
|
|
+
|
|
+	for (opt = 0; opt < Inode_opt_nr; opt++) {
|
|
+		u64 inherited, cur_val;
|
|
+
|
|
+		/* Options set directly on the destination are kept */
|
|
+		if (dst_u->bi_fields_set & (1 << opt))
|
|
+			continue;
|
|
+
|
|
+		inherited	= bch2_inode_opt_get(src_u, opt);
|
|
+		cur_val		= bch2_inode_opt_get(dst_u, opt);
|
|
+
|
|
+		if (inherited != cur_val) {
|
|
+			bch2_inode_opt_set(dst_u, opt, inherited);
|
|
+			changed = true;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	return changed;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Rename @src_name in directory @src_dir to @dst_name in directory @dst_dir
|
|
+ * according to @mode.
|
|
+ *
|
|
+ * Directory inodes are read into @src_dir_u / @dst_dir_u, the renamed inode
|
|
+ * into @src_inode_u and, if the destination name already referred to an
|
|
+ * inode, that inode into @dst_inode_u.  Link counts, timestamps and inode
|
|
+ * backpointers are adjusted and all touched inodes are written back.
|
|
+ *
|
|
+ * Returns 0, -ENOTDIR / -ENOTEMPTY for an invalid overwrite, -EXDEV when a
|
|
+ * directory would have to re-inherit options, or the first helper error.
|
|
+ */
|
|
+int bch2_rename_trans(struct btree_trans *trans,
|
|
+		      u64 src_dir, struct bch_inode_unpacked *src_dir_u,
|
|
+		      u64 dst_dir, struct bch_inode_unpacked *dst_dir_u,
|
|
+		      struct bch_inode_unpacked *src_inode_u,
|
|
+		      struct bch_inode_unpacked *dst_inode_u,
|
|
+		      const struct qstr *src_name,
|
|
+		      const struct qstr *dst_name,
|
|
+		      enum bch_rename_mode mode)
|
|
+{
|
|
+	struct bch_fs *c = trans->c;
|
|
+	struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL;
|
|
+	struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL;
|
|
+	struct bch_hash_info src_hash, dst_hash;
|
|
+	u64 src_inode, src_offset, dst_inode, dst_offset;
|
|
+	u64 now = bch2_current_time(c);
|
|
+	int ret;
|
|
+
|
|
+	src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir,
|
|
+				       BTREE_ITER_INTENT);
|
|
+	ret = PTR_ERR_OR_ZERO(src_dir_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	src_hash = bch2_hash_info_init(c, src_dir_u);
|
|
+
|
|
+	if (dst_dir != src_dir) {
|
|
+		dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir,
|
|
+					       BTREE_ITER_INTENT);
|
|
+		ret = PTR_ERR_OR_ZERO(dst_dir_iter);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+
|
|
+		dst_hash = bch2_hash_info_init(c, dst_dir_u);
|
|
+	} else {
|
|
+		/* Same directory: alias dst to src, dst_dir_iter stays NULL */
|
|
+		dst_dir_u = src_dir_u;
|
|
+		dst_hash = src_hash;
|
|
+	}
|
|
+
|
|
+	/* Outputs: inode numbers and dirent offsets for both names */
|
|
+	ret = bch2_dirent_rename(trans,
|
|
+				 src_dir, &src_hash,
|
|
+				 dst_dir, &dst_hash,
|
|
+				 src_name, &src_inode, &src_offset,
|
|
+				 dst_name, &dst_inode, &dst_offset,
|
|
+				 mode);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode,
|
|
+					 BTREE_ITER_INTENT);
|
|
+	ret = PTR_ERR_OR_ZERO(src_inode_iter);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	/* dst_inode is nonzero only when the destination name had a target */
|
|
+	if (dst_inode) {
|
|
+		dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode,
|
|
+						 BTREE_ITER_INTENT);
|
|
+		ret = PTR_ERR_OR_ZERO(dst_inode_iter);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+	}
|
|
+
|
|
+	/* Re-point backpointers at the dirents' new locations */
|
|
+	if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
|
|
+		src_inode_u->bi_dir		= dst_dir_u->bi_inum;
|
|
+		src_inode_u->bi_dir_offset	= dst_offset;
|
|
+
|
|
+		if (mode == BCH_RENAME_EXCHANGE) {
|
|
+			dst_inode_u->bi_dir		= src_dir_u->bi_inum;
|
|
+			dst_inode_u->bi_dir_offset	= src_offset;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	if (mode == BCH_RENAME_OVERWRITE) {
|
|
+		/* Source and overwritten target must both be dirs or both not */
|
|
+		if (S_ISDIR(src_inode_u->bi_mode) !=
|
|
+		    S_ISDIR(dst_inode_u->bi_mode)) {
|
|
+			ret = -ENOTDIR;
|
|
+			goto err;
|
|
+		}
|
|
+
|
|
+		/* Any nonzero result is reported as -ENOTEMPTY */
|
|
+		if (S_ISDIR(dst_inode_u->bi_mode) &&
|
|
+		    bch2_empty_dir_trans(trans, dst_inode)) {
|
|
+			ret = -ENOTEMPTY;
|
|
+			goto err;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	/* A directory whose inherited options would change is refused */
|
|
+	if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) &&
|
|
+	    S_ISDIR(src_inode_u->bi_mode)) {
|
|
+		ret = -EXDEV;
|
|
+		goto err;
|
|
+	}
|
|
+
|
|
+	if (mode == BCH_RENAME_EXCHANGE &&
|
|
+	    bch2_reinherit_attrs(dst_inode_u, src_dir_u) &&
|
|
+	    S_ISDIR(dst_inode_u->bi_mode)) {
|
|
+		ret = -EXDEV;
|
|
+		goto err;
|
|
+	}
|
|
+
|
|
+	/* Moving a directory transfers one parent link from src to dst */
|
|
+	if (S_ISDIR(src_inode_u->bi_mode)) {
|
|
+		src_dir_u->bi_nlink--;
|
|
+		dst_dir_u->bi_nlink++;
|
|
+	}
|
|
+
|
|
+	/* A directory at the destination leaves dst; on exchange it joins src */
|
|
+	if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) {
|
|
+		dst_dir_u->bi_nlink--;
|
|
+		src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE;
|
|
+	}
|
|
+
|
|
+	if (mode == BCH_RENAME_OVERWRITE)
|
|
+		bch2_inode_nlink_dec(dst_inode_u);
|
|
+
|
|
+	src_dir_u->bi_mtime		= now;
|
|
+	src_dir_u->bi_ctime		= now;
|
|
+
|
|
+	if (src_dir != dst_dir) {
|
|
+		dst_dir_u->bi_mtime	= now;
|
|
+		dst_dir_u->bi_ctime	= now;
|
|
+	}
|
|
+
|
|
+	src_inode_u->bi_ctime		= now;
|
|
+
|
|
+	if (dst_inode)
|
|
+		dst_inode_u->bi_ctime	= now;
|
|
+
|
|
+	/* Write back every inode touched; the chain stops at the first error */
|
|
+	ret =   bch2_inode_write(trans, src_dir_iter, src_dir_u) ?:
|
|
+		(src_dir != dst_dir
|
|
+		 ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u)
|
|
+		 : 0 ) ?:
|
|
+		bch2_inode_write(trans, src_inode_iter, src_inode_u) ?:
|
|
+		(dst_inode
|
|
+		 ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u)
|
|
+		 : 0 );
|
|
+err:
|
|
+	bch2_trans_iter_put(trans, dst_inode_iter);
|
|
+	bch2_trans_iter_put(trans, src_inode_iter);
|
|
+	bch2_trans_iter_put(trans, dst_dir_iter);
|
|
+	bch2_trans_iter_put(trans, src_dir_iter);
|
|
+	return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h
|
|
new file mode 100644
|
|
index 000000000000..2273b7961c9b
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs-common.h
|
|
@@ -0,0 +1,37 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_FS_COMMON_H
|
|
+#define _BCACHEFS_FS_COMMON_H
|
|
+
|
|
+struct posix_acl;
|
|
+
|
|
+int bch2_create_trans(struct btree_trans *, u64,
|
|
+ struct bch_inode_unpacked *,
|
|
+ struct bch_inode_unpacked *,
|
|
+ const struct qstr *,
|
|
+ uid_t, gid_t, umode_t, dev_t,
|
|
+ struct posix_acl *,
|
|
+ struct posix_acl *);
|
|
+
|
|
+int bch2_link_trans(struct btree_trans *, u64,
|
|
+ u64, struct bch_inode_unpacked *,
|
|
+ struct bch_inode_unpacked *,
|
|
+ const struct qstr *);
|
|
+
|
|
+int bch2_unlink_trans(struct btree_trans *,
|
|
+ u64, struct bch_inode_unpacked *,
|
|
+ struct bch_inode_unpacked *,
|
|
+ const struct qstr *);
|
|
+
|
|
+int bch2_rename_trans(struct btree_trans *,
|
|
+ u64, struct bch_inode_unpacked *,
|
|
+ u64, struct bch_inode_unpacked *,
|
|
+ struct bch_inode_unpacked *,
|
|
+ struct bch_inode_unpacked *,
|
|
+ const struct qstr *,
|
|
+ const struct qstr *,
|
|
+ enum bch_rename_mode);
|
|
+
|
|
+bool bch2_reinherit_attrs(struct bch_inode_unpacked *,
|
|
+ struct bch_inode_unpacked *);
|
|
+
|
|
+#endif /* _BCACHEFS_FS_COMMON_H */
|
|
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
|
|
new file mode 100644
|
|
index 000000000000..d707cabd7536
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs-io.c
|
|
@@ -0,0 +1,3177 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "clock.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "extent_update.h"
|
|
+#include "fs.h"
|
|
+#include "fs-io.h"
|
|
+#include "fsck.h"
|
|
+#include "inode.h"
|
|
+#include "journal.h"
|
|
+#include "io.h"
|
|
+#include "keylist.h"
|
|
+#include "quota.h"
|
|
+#include "reflink.h"
|
|
+
|
|
+#include <linux/aio.h>
|
|
+#include <linux/backing-dev.h>
|
|
+#include <linux/falloc.h>
|
|
+#include <linux/migrate.h>
|
|
+#include <linux/mmu_context.h>
|
|
+#include <linux/pagevec.h>
|
|
+#include <linux/rmap.h>
|
|
+#include <linux/sched/signal.h>
|
|
+#include <linux/task_io_accounting_ops.h>
|
|
+#include <linux/uio.h>
|
|
+#include <linux/writeback.h>
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+#include <trace/events/writeback.h>
|
|
+
|
|
+/*
|
|
+ * Return current->faults_disabled_mapping with the low tag bit (used by
|
|
+ * set_fdm_dropped_locks()) masked off.
|
|
+ */
|
|
+static inline struct address_space *faults_disabled_mapping(void)
|
|
+{
|
|
+	unsigned long tagged = (unsigned long) current->faults_disabled_mapping;
|
|
+
|
|
+	return (void *) (tagged & ~1UL);
|
|
+}
|
|
+
|
|
+/* Set the low tag bit of current->faults_disabled_mapping */
|
|
+static inline void set_fdm_dropped_locks(void)
|
|
+{
|
|
+	unsigned long tagged = (unsigned long) current->faults_disabled_mapping;
|
|
+
|
|
+	current->faults_disabled_mapping = (void *) (tagged|1);
|
|
+}
|
|
+
|
|
+/* True if the low tag bit of current->faults_disabled_mapping is set */
|
|
+static inline bool fdm_dropped_locks(void)
|
|
+{
|
|
+	unsigned long tagged = (unsigned long) current->faults_disabled_mapping;
|
|
+
|
|
+	return tagged & 1;
|
|
+}
|
|
+
|
|
+/* Quota reservation handle: counts the sectors currently reserved through it */
|
|
+struct quota_res {
|
|
+	u64				sectors;
|
|
+};
|
|
+
|
|
+struct bch_writepage_io {
|
|
+ struct closure cl;
|
|
+ struct bch_inode_info *inode;
|
|
+
|
|
+ /* must be last: */
|
|
+ struct bch_write_op op;
|
|
+};
|
|
+
|
|
+struct dio_write {
|
|
+ struct completion done;
|
|
+ struct kiocb *req;
|
|
+ struct mm_struct *mm;
|
|
+ unsigned loop:1,
|
|
+ sync:1,
|
|
+ free_iov:1;
|
|
+ struct quota_res quota_res;
|
|
+ u64 written;
|
|
+
|
|
+ struct iov_iter iter;
|
|
+ struct iovec inline_vecs[2];
|
|
+
|
|
+ /* must be last: */
|
|
+ struct bch_write_op op;
|
|
+};
|
|
+
|
|
+struct dio_read {
|
|
+ struct closure cl;
|
|
+ struct kiocb *req;
|
|
+ long ret;
|
|
+ bool should_dirty;
|
|
+ struct bch_read_bio rbio;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Write back and then invalidate the pagecache of @mapping over
|
|
+ * [@start, @end], retrying for as long as invalidation reports -EBUSY.
|
|
+ *
|
|
+ * pagecache_block must be held.
|
|
+ *
|
|
+ * Returns 0 once nothing is left to invalidate, otherwise the first error
|
|
+ * other than -EBUSY.
|
|
+ */
|
|
+static int write_invalidate_inode_pages_range(struct address_space *mapping,
|
|
+					      loff_t start, loff_t end)
|
|
+{
|
|
+	int ret;
|
|
+
|
|
+	/*
|
|
+	 * XXX: the way this is currently implemented, we can spin if a process
|
|
+	 * is continually redirtying a specific page
|
|
+	 */
|
|
+	for (;;) {
|
|
+		if (!mapping->nrpages &&
|
|
+		    !mapping->nrexceptional)
|
|
+			return 0;
|
|
+
|
|
+		ret = filemap_write_and_wait_range(mapping, start, end);
|
|
+		if (ret)
|
|
+			return ret;
|
|
+
|
|
+		if (!mapping->nrpages)
|
|
+			return 0;
|
|
+
|
|
+		ret = invalidate_inode_pages2_range(mapping,
|
|
+				start >> PAGE_SHIFT,
|
|
+				end >> PAGE_SHIFT);
|
|
+		if (ret != -EBUSY)
|
|
+			return ret;
|
|
+	}
|
|
+}
|
|
+
|
|
+/* quotas */
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_QUOTA
|
|
+
|
|
+/*
|
|
+ * Release every sector held in @res: the amount is subtracted from the quota
|
|
+ * accounting and from inode->ei_quota_reserved under ei_quota_lock, and
|
|
+ * @res is reset to zero.  No-op when @res holds nothing.
|
|
+ */
|
|
+static void bch2_quota_reservation_put(struct bch_fs *c,
|
|
+				       struct bch_inode_info *inode,
|
|
+				       struct quota_res *res)
|
|
+{
|
|
+	if (!res->sectors)
|
|
+		return;
|
|
+
|
|
+	mutex_lock(&inode->ei_quota_lock);
|
|
+	/* A handle can never hold more than the inode has reserved in total */
|
|
+	BUG_ON(res->sectors > inode->ei_quota_reserved);
|
|
+
|
|
+	bch2_quota_acct(c, inode->ei_qid, Q_SPC,
|
|
+			-((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC);
|
|
+	inode->ei_quota_reserved -= res->sectors;
|
|
+	mutex_unlock(&inode->ei_quota_lock);
|
|
+
|
|
+	res->sectors = 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Reserve @sectors of quota for @inode and add them to @res.
|
|
+ *
|
|
+ * @check_enospc selects KEY_TYPE_QUOTA_PREALLOC accounting; otherwise
|
|
+ * KEY_TYPE_QUOTA_NOCHECK is used.  Returns the result of bch2_quota_acct();
|
|
+ * the counters are only updated when it returns 0.
|
|
+ */
|
|
+static int bch2_quota_reservation_add(struct bch_fs *c,
|
|
+				      struct bch_inode_info *inode,
|
|
+				      struct quota_res *res,
|
|
+				      unsigned sectors,
|
|
+				      bool check_enospc)
|
|
+{
|
|
+	int ret;
|
|
+
|
|
+	mutex_lock(&inode->ei_quota_lock);
|
|
+
|
|
+	ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
|
|
+			      check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK);
|
|
+	if (unlikely(ret))
|
|
+		goto out;
|
|
+
|
|
+	inode->ei_quota_reserved += sectors;
|
|
+	res->sectors += sectors;
|
|
+out:
|
|
+	mutex_unlock(&inode->ei_quota_lock);
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+#else
|
|
+
|
|
+/* !CONFIG_BCACHEFS_QUOTA: releasing a quota reservation does nothing */
|
|
+static void bch2_quota_reservation_put(struct bch_fs *c,
|
|
+				       struct bch_inode_info *inode,
|
|
+				       struct quota_res *res)
|
|
+{
|
|
+}
|
|
+
|
|
+/* !CONFIG_BCACHEFS_QUOTA: reserving quota always succeeds, nothing is tracked */
|
|
+static int bch2_quota_reservation_add(struct bch_fs *c,
|
|
+				      struct bch_inode_info *inode,
|
|
+				      struct quota_res *res,
|
|
+				      unsigned sectors,
|
|
+				      bool check_enospc)
|
|
+{
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+/* i_size updates: */
|
|
+
|
|
+/* Argument block handed to inode_set_size() by bch2_write_inode_size() */
|
|
+struct inode_new_size {
|
|
+	loff_t		new_size;	/* value stored into bi_size */
|
|
+	u64		now;		/* timestamp used for the selected times */
|
|
+	unsigned	fields;		/* ATTR_ATIME/ATTR_MTIME/ATTR_CTIME mask */
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Inode update callback: @p is a struct inode_new_size.  Stores the new size
|
|
+ * in @bi and stamps whichever of atime/mtime/ctime the ATTR_* bits in
|
|
+ * ->fields select.  Always returns 0.
|
|
+ */
|
|
+static int inode_set_size(struct bch_inode_info *inode,
|
|
+			  struct bch_inode_unpacked *bi,
|
|
+			  void *p)
|
|
+{
|
|
+	struct inode_new_size *args = p;
|
|
+	unsigned which = args->fields;
|
|
+
|
|
+	bi->bi_size = args->new_size;
|
|
+
|
|
+	if (which & ATTR_ATIME)
|
|
+		bi->bi_atime = args->now;
|
|
+	if (which & ATTR_MTIME)
|
|
+		bi->bi_mtime = args->now;
|
|
+	if (which & ATTR_CTIME)
|
|
+		bi->bi_ctime = args->now;
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Persist @new_size as the size of @inode through bch2_write_inode(), also
|
|
+ * updating the timestamps selected by the ATTR_* bits in @fields to the
|
|
+ * current filesystem time.  Returns the result of bch2_write_inode().
|
|
+ */
|
|
+int __must_check bch2_write_inode_size(struct bch_fs *c,
|
|
+				       struct bch_inode_info *inode,
|
|
+				       loff_t new_size, unsigned fields)
|
|
+{
|
|
+	struct inode_new_size args = {
|
|
+		.new_size	= new_size,
|
|
+		.now		= bch2_current_time(c),
|
|
+		.fields		= fields,
|
|
+	};
|
|
+
|
|
+	return bch2_write_inode(c, inode, inode_set_size, &args, fields);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Adjust inode->v.i_blocks by @sectors (may be negative) under
|
|
+ * ei_quota_lock.
|
|
+ *
|
|
+ * With quota support, a positive delta backed by @quota_res is consumed from
|
|
+ * that reservation; every other case is accounted directly with
|
|
+ * KEY_TYPE_QUOTA_WARN.  No-op when @sectors is 0.
|
|
+ */
|
|
+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
|
|
+			   struct quota_res *quota_res, s64 sectors)
|
|
+{
|
|
+	if (!sectors)
|
|
+		return;
|
|
+
|
|
+	mutex_lock(&inode->ei_quota_lock);
|
|
+#ifdef CONFIG_BCACHEFS_QUOTA
|
|
+	if (quota_res && sectors > 0) {
|
|
+		/* The caller must have reserved at least this much beforehand */
|
|
+		BUG_ON(sectors > quota_res->sectors);
|
|
+		BUG_ON(sectors > inode->ei_quota_reserved);
|
|
+
|
|
+		quota_res->sectors -= sectors;
|
|
+		inode->ei_quota_reserved -= sectors;
|
|
+	} else {
|
|
+		bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN);
|
|
+	}
|
|
+#endif
|
|
+	inode->v.i_blocks += sectors;
|
|
+	mutex_unlock(&inode->ei_quota_lock);
|
|
+}
|
|
+
|
|
+/* page state: */
|
|
+
|
|
+/* stored in page->private: */
|
|
+
|
|
+struct bch_page_sector {
|
|
+ /* Uncompressed, fully allocated replicas: */
|
|
+ unsigned nr_replicas:3;
|
|
+
|
|
+ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
|
|
+ unsigned replicas_reserved:3;
|
|
+
|
|
+ /* i_sectors: */
|
|
+ enum {
|
|
+ SECTOR_UNALLOCATED,
|
|
+ SECTOR_RESERVED,
|
|
+ SECTOR_DIRTY,
|
|
+ SECTOR_ALLOCATED,
|
|
+ } state:2;
|
|
+};
|
|
+
|
|
+struct bch_page_state {
|
|
+ spinlock_t lock;
|
|
+ atomic_t write_count;
|
|
+ struct bch_page_sector s[PAGE_SECTORS];
|
|
+};
|
|
+
|
|
+/* Return the bch_page_state stored in page->private, or NULL if none is attached */
|
|
+static inline struct bch_page_state *__bch2_page_state(struct page *page)
|
|
+{
|
|
+	if (!page_has_private(page))
|
|
+		return NULL;
|
|
+
|
|
+	return (struct bch_page_state *) page_private(page);
|
|
+}
|
|
+
|
|
+/* Same as __bch2_page_state(), for callers that hold the page lock */
|
|
+static inline struct bch_page_state *bch2_page_state(struct page *page)
|
|
+{
|
|
+	struct bch_page_state *s;
|
|
+
|
|
+	EBUG_ON(!PageLocked(page));
|
|
+
|
|
+	s = __bch2_page_state(page);
|
|
+	return s;
|
|
+}
|
|
+
|
|
+/* Detach and free the page's private state; for newly allocated pages */
|
|
+static void __bch2_page_state_release(struct page *page)
|
|
+{
|
|
+	void *state = detach_page_private(page);
|
|
+
|
|
+	kfree(state);
|
|
+}
|
|
+
|
|
+/* Detach and free the page's private state; the page must be locked */
|
|
+static void bch2_page_state_release(struct page *page)
|
|
+{
|
|
+	EBUG_ON(!PageLocked(page));
|
|
+
|
|
+	__bch2_page_state_release(page);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Allocate a zeroed bch_page_state with GFP_NOFS|@gfp and attach it to
|
|
+ * @page; for newly allocated pages.  Returns the new state, or NULL if the
|
|
+ * allocation failed.
|
|
+ */
|
|
+static struct bch_page_state *__bch2_page_state_create(struct page *page,
|
|
+						       gfp_t gfp)
|
|
+{
|
|
+	struct bch_page_state *s = kzalloc(sizeof(*s), GFP_NOFS|gfp);
|
|
+
|
|
+	if (s) {
|
|
+		spin_lock_init(&s->lock);
|
|
+		attach_page_private(page, s);
|
|
+	}
|
|
+
|
|
+	return s;
|
|
+}
|
|
+
|
|
+/* Return the page's existing state, creating one if the page has none yet */
|
|
+static struct bch_page_state *bch2_page_state_create(struct page *page,
|
|
+						     gfp_t gfp)
|
|
+{
|
|
+	struct bch_page_state *s = bch2_page_state(page);
|
|
+
|
|
+	if (s)
|
|
+		return s;
|
|
+
|
|
+	return __bch2_page_state_create(page, gfp);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Replica count for @inode's data: a nonzero bi_data_replicas yields that
|
|
+ * value minus one, otherwise the filesystem-wide data_replicas option.
|
|
+ */
|
|
+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode)
|
|
+{
|
|
+	/* XXX: this should not be open coded */
|
|
+	if (inode->ei_inode.bi_data_replicas)
|
|
+		return inode->ei_inode.bi_data_replicas - 1;
|
|
+
|
|
+	return c->opts.data_replicas;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Additional replicas that must be reserved for sector @s to reach
|
|
+ * @nr_replicas, given what is already allocated and reserved; never
|
|
+ * negative.
|
|
+ */
|
|
+static inline unsigned sectors_to_reserve(struct bch_page_sector *s,
|
|
+					  unsigned nr_replicas)
|
|
+{
|
|
+	int missing = (int) nr_replicas -
|
|
+		s->nr_replicas -
|
|
+		s->replicas_reserved;
|
|
+
|
|
+	return max(0, missing);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Make sure every sector of @page has enough replicas reserved for @inode's
|
|
+ * replication level, taking one disk reservation for the shortfall and
|
|
+ * recording it per sector in the page state.
|
|
+ *
|
|
+ * Without @check_enospc the reservation is taken with
|
|
+ * BCH_DISK_RESERVATION_NOFAIL.  Returns 0, -ENOMEM if the page state cannot
|
|
+ * be allocated, or the error from bch2_disk_reservation_get().
|
|
+ */
|
|
+static int bch2_get_page_disk_reservation(struct bch_fs *c,
|
|
+				struct bch_inode_info *inode,
|
|
+				struct page *page, bool check_enospc)
|
|
+{
|
|
+	struct bch_page_state *s = bch2_page_state_create(page, 0);
|
|
+	unsigned nr_replicas = inode_nr_replicas(c, inode);
|
|
+	struct disk_reservation disk_res = { 0 };
|
|
+	unsigned i, disk_res_sectors = 0;
|
|
+	int ret;
|
|
+
|
|
+	if (!s)
|
|
+		return -ENOMEM;
|
|
+
|
|
+	/* Total shortfall over all sectors of the page */
|
|
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
|
|
+		disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas);
|
|
+
|
|
+	if (!disk_res_sectors)
|
|
+		return 0;
|
|
+
|
|
+	ret = bch2_disk_reservation_get(c, &disk_res,
|
|
+					disk_res_sectors, 1,
|
|
+					!check_enospc
|
|
+					? BCH_DISK_RESERVATION_NOFAIL
|
|
+					: 0);
|
|
+	if (unlikely(ret))
|
|
+		return ret;
|
|
+
|
|
+	/* Ownership of the reservation is tracked per sector from here on */
|
|
+	for (i = 0; i < ARRAY_SIZE(s->s); i++)
|
|
+		s->s[i].replicas_reserved +=
|
|
+			sectors_to_reserve(&s->s[i], nr_replicas);
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+struct bch2_page_reservation {
|
|
+ struct disk_reservation disk;
|
|
+ struct quota_res quota;
|
|
+};
|
|
+
|
|
+/* Zero @res and set its disk replica count from @inode's replication level */
|
|
+static void bch2_page_reservation_init(struct bch_fs *c,
|
|
+			struct bch_inode_info *inode,
|
|
+			struct bch2_page_reservation *res)
|
|
+{
|
|
+	memset(res, 0, sizeof(*res));
|
|
+
|
|
+	res->disk.nr_replicas = inode_nr_replicas(c, inode);
|
|
+}
|
|
+
|
|
+/* Release the disk and quota reservations held in @res */
|
|
+static void bch2_page_reservation_put(struct bch_fs *c,
|
|
+			struct bch_inode_info *inode,
|
|
+			struct bch2_page_reservation *res)
|
|
+{
|
|
+	bch2_disk_reservation_put(c, &res->disk);
|
|
+	bch2_quota_reservation_put(c, inode, &res->quota);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Add to @res the disk and quota reservations needed to dirty bytes
|
|
+ * [@offset, @offset + @len) of @page, with the range rounded out to
|
|
+ * filesystem block boundaries.
|
|
+ *
|
|
+ * Disk space is reserved for missing replicas; quota is reserved for sectors
|
|
+ * still in SECTOR_UNALLOCATED state.  If the quota reservation fails, the
|
|
+ * disk sectors added by this call are given back.  Returns 0, -ENOMEM if the
|
|
+ * page state cannot be allocated, or the reservation error.
|
|
+ */
|
|
+static int bch2_page_reservation_get(struct bch_fs *c,
|
|
+			struct bch_inode_info *inode, struct page *page,
|
|
+			struct bch2_page_reservation *res,
|
|
+			unsigned offset, unsigned len, bool check_enospc)
|
|
+{
|
|
+	struct bch_page_state *s = bch2_page_state_create(page, 0);
|
|
+	unsigned i, disk_sectors = 0, quota_sectors = 0;
|
|
+	int ret;
|
|
+
|
|
+	if (!s)
|
|
+		return -ENOMEM;
|
|
+
|
|
+	/* i indexes 512-byte sectors within the page */
|
|
+	for (i = round_down(offset, block_bytes(c)) >> 9;
|
|
+	     i < round_up(offset + len, block_bytes(c)) >> 9;
|
|
+	     i++) {
|
|
+		disk_sectors += sectors_to_reserve(&s->s[i],
|
|
+						res->disk.nr_replicas);
|
|
+		quota_sectors += s->s[i].state == SECTOR_UNALLOCATED;
|
|
+	}
|
|
+
|
|
+	if (disk_sectors) {
|
|
+		ret = bch2_disk_reservation_add(c, &res->disk,
|
|
+						disk_sectors,
|
|
+						!check_enospc
|
|
+						? BCH_DISK_RESERVATION_NOFAIL
|
|
+						: 0);
|
|
+		if (unlikely(ret))
|
|
+			return ret;
|
|
+	}
|
|
+
|
|
+	if (quota_sectors) {
|
|
+		ret = bch2_quota_reservation_add(c, inode, &res->quota,
|
|
+						 quota_sectors,
|
|
+						 check_enospc);
|
|
+		if (unlikely(ret)) {
|
|
+			/* Undo only the disk sectors added above */
|
|
+			struct disk_reservation tmp = {
|
|
+				.sectors = disk_sectors
|
|
+			};
|
|
+
|
|
+			bch2_disk_reservation_put(c, &tmp);
|
|
+			res->disk.sectors -= disk_sectors;
|
|
+			return ret;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Tear down the bcachefs state of @page: return all per-sector disk
|
|
+ * reservations, un-account sectors that were still SECTOR_DIRTY from
|
|
+ * i_blocks, and free the page state.  No-op if the page has no state.
|
|
+ */
|
|
+static void bch2_clear_page_bits(struct page *page)
|
|
+{
|
|
+	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
|
|
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+	struct bch_page_state *s = bch2_page_state(page);
|
|
+	struct disk_reservation disk_res = { 0 };
|
|
+	int i, dirty_sectors = 0;
|
|
+
|
|
+	if (!s)
|
|
+		return;
|
|
+
|
|
+	EBUG_ON(!PageLocked(page));
|
|
+	EBUG_ON(PageWriteback(page));
|
|
+
|
|
+	for (i = 0; i < ARRAY_SIZE(s->s); i++) {
|
|
+		/* Collect the reservations so they can be put in one go */
|
|
+		disk_res.sectors += s->s[i].replicas_reserved;
|
|
+		s->s[i].replicas_reserved = 0;
|
|
+
|
|
+		if (s->s[i].state == SECTOR_DIRTY) {
|
|
+			dirty_sectors++;
|
|
+			s->s[i].state = SECTOR_UNALLOCATED;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	bch2_disk_reservation_put(c, &disk_res);
|
|
+
|
|
+	if (dirty_sectors)
|
|
+		i_sectors_acct(c, inode, NULL, -dirty_sectors);
|
|
+
|
|
+	bch2_page_state_release(page);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Mark bytes [@offset, @offset + @len) of @page dirty (rounded out to
|
|
+ * filesystem blocks), moving disk reservation from @res into the per-sector
|
|
+ * page state and charging newly dirtied sectors to i_blocks against
|
|
+ * @res->quota.
|
|
+ *
|
|
+ * NOTE(review): the page state is dereferenced without a NULL check, so the
|
|
+ * caller presumably must have created it (e.g. through
|
|
+ * bch2_page_reservation_get()) - confirm.
|
|
+ */
|
|
+static void bch2_set_page_dirty(struct bch_fs *c,
|
|
+			struct bch_inode_info *inode, struct page *page,
|
|
+			struct bch2_page_reservation *res,
|
|
+			unsigned offset, unsigned len)
|
|
+{
|
|
+	struct bch_page_state *s = bch2_page_state(page);
|
|
+	unsigned i, dirty_sectors = 0;
|
|
+
|
|
+	WARN_ON((u64) page_offset(page) + offset + len >
|
|
+		round_up((u64) i_size_read(&inode->v), block_bytes(c)));
|
|
+
|
|
+	spin_lock(&s->lock);
|
|
+
|
|
+	for (i = round_down(offset, block_bytes(c)) >> 9;
|
|
+	     i < round_up(offset + len, block_bytes(c)) >> 9;
|
|
+	     i++) {
|
|
+		unsigned sectors = sectors_to_reserve(&s->s[i],
|
|
+						res->disk.nr_replicas);
|
|
+
|
|
+		/*
|
|
+		 * This can happen if we race with the error path in
|
|
+		 * bch2_writepage_io_done():
|
|
+		 */
|
|
+		sectors = min_t(unsigned, sectors, res->disk.sectors);
|
|
+
|
|
+		s->s[i].replicas_reserved += sectors;
|
|
+		res->disk.sectors -= sectors;
|
|
+
|
|
+		if (s->s[i].state == SECTOR_UNALLOCATED)
|
|
+			dirty_sectors++;
|
|
+
|
|
+		/* Raise to at least DIRTY; ALLOCATED sectors stay ALLOCATED */
|
|
+		s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY);
|
|
+	}
|
|
+
|
|
+	spin_unlock(&s->lock);
|
|
+
|
|
+	if (dirty_sectors)
|
|
+		i_sectors_acct(c, inode, &res->quota, dirty_sectors);
|
|
+
|
|
+	if (!PageDirty(page))
|
|
+		__set_page_dirty_nobuffers(page);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Page fault handler: runs filemap_fault() with the inode's pagecache add
|
|
+ * lock held.
|
|
+ *
|
|
+ * If the task has a faults-disabled mapping set, a fault on that same
|
|
+ * mapping fails with VM_FAULT_SIGBUS.  When that mapping sorts above the
|
|
+ * faulting one by address, the add lock is only trylocked; on failure the
|
|
+ * faults-disabled mapping's block lock is dropped, the add lock is waited
|
|
+ * for and released, the block lock is retaken, the "dropped locks" flag is
|
|
+ * set and VM_FAULT_SIGBUS is returned.
|
|
+ */
|
|
+vm_fault_t bch2_page_fault(struct vm_fault *vmf)
|
|
+{
|
|
+	struct file *file = vmf->vma->vm_file;
|
|
+	struct address_space *mapping = file->f_mapping;
|
|
+	struct address_space *fdm = faults_disabled_mapping();
|
|
+	struct bch_inode_info *inode = file_bch_inode(file);
|
|
+	vm_fault_t ret;
|
|
+
|
|
+	if (fdm == mapping)
|
|
+		return VM_FAULT_SIGBUS;
|
|
+
|
|
+	/* Lock ordering: */
|
|
+	if (fdm > mapping) {
|
|
+		struct bch_inode_info *fdm_host = to_bch_ei(fdm->host);
|
|
+
|
|
+		if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock))
|
|
+			goto got_lock;
|
|
+
|
|
+		bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock);
|
|
+
|
|
+		/* Wait for the lock to become available, then let it go again */
|
|
+		bch2_pagecache_add_get(&inode->ei_pagecache_lock);
|
|
+		bch2_pagecache_add_put(&inode->ei_pagecache_lock);
|
|
+
|
|
+		bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock);
|
|
+
|
|
+		/* Signal that lock has been dropped: */
|
|
+		set_fdm_dropped_locks();
|
|
+		return VM_FAULT_SIGBUS;
|
|
+	}
|
|
+
|
|
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
|
|
+got_lock:
|
|
+	ret = filemap_fault(vmf);
|
|
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * ->page_mkwrite handler: reserves disk space and quota for the part of the
|
|
+ * faulting page that lies below i_size and marks it dirty.
|
|
+ *
|
|
+ * Returns VM_FAULT_LOCKED with the page locked on success, VM_FAULT_NOPAGE
|
|
+ * if the page no longer belongs to this mapping or starts at or beyond
|
|
+ * i_size, and VM_FAULT_SIGBUS if the reservation could not be obtained.
|
|
+ */
|
|
+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf)
|
|
+{
|
|
+	struct page *page = vmf->page;
|
|
+	struct file *file = vmf->vma->vm_file;
|
|
+	struct bch_inode_info *inode = file_bch_inode(file);
|
|
+	struct address_space *mapping = file->f_mapping;
|
|
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+	struct bch2_page_reservation res;
|
|
+	unsigned len;
|
|
+	loff_t isize;
|
|
+	vm_fault_t ret = VM_FAULT_LOCKED;
|
|
+
|
|
+	bch2_page_reservation_init(c, inode, &res);
|
|
+
|
|
+	sb_start_pagefault(inode->v.i_sb);
|
|
+	file_update_time(file);
|
|
+
|
|
+	/*
|
|
+	 * Not strictly necessary, but helps avoid dio writes livelocking in
|
|
+	 * write_invalidate_inode_pages_range() - can drop this if/when we get
|
|
+	 * a write_invalidate_inode_pages_range() that works without dropping
|
|
+	 * page lock before invalidating page
|
|
+	 */
|
|
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
|
|
+
|
|
+	lock_page(page);
|
|
+	isize = i_size_read(&inode->v);
|
|
+
|
|
+	/* Page was truncated or invalidated while we were not holding its lock */
|
|
+	if (page->mapping != mapping || page_offset(page) >= isize) {
|
|
+		unlock_page(page);
|
|
+		ret = VM_FAULT_NOPAGE;
|
|
+		goto out;
|
|
+	}
|
|
+
|
|
+	/* Only the part of the page below i_size is dirtied */
|
|
+	len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page));
|
|
+
|
|
+	if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) {
|
|
+		unlock_page(page);
|
|
+		ret = VM_FAULT_SIGBUS;
|
|
+		goto out;
|
|
+	}
|
|
+
|
|
+	bch2_set_page_dirty(c, inode, page, &res, 0, len);
|
|
+	bch2_page_reservation_put(c, inode, &res);
|
|
+
|
|
+	wait_for_stable_page(page);
|
|
+out:
|
|
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
|
|
+	sb_end_pagefault(inode->v.i_sb);
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Invalidate callback: the page's bcachefs state is only torn down when the
|
|
+ * whole page is being invalidated; partial invalidations are ignored.
|
|
+ */
|
|
+void bch2_invalidatepage(struct page *page, unsigned int offset,
|
|
+			 unsigned int length)
|
|
+{
|
|
+	bool whole_page = !offset && length >= PAGE_SIZE;
|
|
+
|
|
+	if (whole_page)
|
|
+		bch2_clear_page_bits(page);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Release callback: refuses (returns 0) for dirty pages, otherwise tears
|
|
+ * down the page's bcachefs state and returns 1.
|
|
+ */
|
|
+int bch2_releasepage(struct page *page, gfp_t gfp_mask)
|
|
+{
|
|
+	if (!PageDirty(page)) {
|
|
+		bch2_clear_page_bits(page);
|
|
+		return 1;
|
|
+	}
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_MIGRATION
|
|
+/*
|
|
+ * Migrate @page to @newpage: move the mapping, hand the private page state
|
|
+ * over to the new page and copy either the page contents and state or, for
|
|
+ * MIGRATE_SYNC_NO_COPY, the state only.  Both pages must be locked.
|
|
+ */
|
|
+int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
|
|
+		      struct page *page, enum migrate_mode mode)
|
|
+{
|
|
+	int ret;
|
|
+
|
|
+	EBUG_ON(!PageLocked(page));
|
|
+	EBUG_ON(!PageLocked(newpage));
|
|
+
|
|
+	ret = migrate_page_move_mapping(mapping, newpage, page, 0);
|
|
+	if (ret != MIGRATEPAGE_SUCCESS)
|
|
+		return ret;
|
|
+
|
|
+	if (PagePrivate(page)) {
|
|
+		void *state = detach_page_private(page);
|
|
+
|
|
+		attach_page_private(newpage, state);
|
|
+	}
|
|
+
|
|
+	if (mode == MIGRATE_SYNC_NO_COPY)
|
|
+		migrate_page_states(newpage, page);
|
|
+	else
|
|
+		migrate_page_copy(newpage, page);
|
|
+
|
|
+	return MIGRATEPAGE_SUCCESS;
|
|
+}
|
|
+#endif
|
|
+
|
|
+/* readpage(s): */
|
|
+
|
|
+/*
|
|
+ * Read completion: flag every page of @bio uptodate on success or errored
|
|
+ * on failure, unlock it, then drop the bio reference.
|
|
+ */
|
|
+static void bch2_readpages_end_io(struct bio *bio)
|
|
+{
|
|
+	struct bvec_iter_all iter;
|
|
+	struct bio_vec *bv;
|
|
+
|
|
+	bio_for_each_segment_all(bv, bio, iter) {
|
|
+		struct page *page = bv->bv_page;
|
|
+
|
|
+		if (bio->bi_status) {
|
|
+			ClearPageUptodate(page);
|
|
+			SetPageError(page);
|
|
+		} else {
|
|
+			SetPageUptodate(page);
|
|
+		}
|
|
+
|
|
+		unlock_page(page);
|
|
+	}
|
|
+
|
|
+	bio_put(bio);
|
|
+}
|
|
+
|
|
+struct readpages_iter {
|
|
+ struct address_space *mapping;
|
|
+ struct page **pages;
|
|
+ unsigned nr_pages;
|
|
+ unsigned idx;
|
|
+ pgoff_t offset;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Set up @iter over the pages of readahead request @ractl.
|
|
+ *
|
|
+ * The page array is sized for readahead_count() pages and filled by
|
|
+ * __readahead_batch(); each returned page gets a bcachefs page state and
|
|
+ * has the reference taken by the batch dropped again.  iter->nr_pages is
|
|
+ * the number of pages actually returned, so slots of the (unzeroed) array
|
|
+ * that the batch did not fill are never looked at.
|
|
+ *
|
|
+ * Returns 0, or -ENOMEM if the page array cannot be allocated.
|
|
+ */
|
|
+static int readpages_iter_init(struct readpages_iter *iter,
|
|
+			       struct readahead_control *ractl)
|
|
+{
|
|
+	unsigned i, nr_pages = readahead_count(ractl);
|
|
+
|
|
+	memset(iter, 0, sizeof(*iter));
|
|
+
|
|
+	iter->mapping	= ractl->mapping;
|
|
+	iter->offset	= readahead_index(ractl);
|
|
+
|
|
+	iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
|
|
+	if (!iter->pages)
|
|
+		return -ENOMEM;
|
|
+
|
|
+	/* The batch may hand back fewer pages than were asked for */
|
|
+	nr_pages = __readahead_batch(ractl, iter->pages, nr_pages);
|
|
+	iter->nr_pages	= nr_pages;
|
|
+
|
|
+	for (i = 0; i < nr_pages; i++) {
|
|
+		__bch2_page_state_create(iter->pages[i], __GFP_NOFAIL);
|
|
+		put_page(iter->pages[i]);
|
|
+	}
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Peek at the page at the iterator's current position without advancing;
|
|
+ * returns NULL once all pages have been consumed.
|
|
+ */
|
|
+static inline struct page *readpage_iter_next(struct readpages_iter *iter)
|
|
+{
|
|
+	struct page *page;
|
|
+
|
|
+	if (iter->idx >= iter->nr_pages)
|
|
+		return NULL;
|
|
+
|
|
+	page = iter->pages[iter->idx];
|
|
+
|
|
+	EBUG_ON(page->index != iter->offset + iter->idx);
|
|
+
|
|
+	return page;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Record, in the page state of every page covered by @bio, what extent @k
|
|
+ * says about the sectors being read: the number of fully allocated
|
|
+ * pointers (0 for KEY_TYPE_reflink_v keys) and whether the sectors are
|
|
+ * reserved (KEY_TYPE_reservation) or allocated.
|
|
+ */
|
|
+static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
|
|
+{
|
|
+	struct bvec_iter iter;
|
|
+	struct bio_vec bv;
|
|
+	unsigned nr_ptrs = 0;
|
|
+	unsigned state = SECTOR_ALLOCATED;
|
|
+
|
|
+	if (k.k->type != KEY_TYPE_reflink_v)
|
|
+		nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k);
|
|
+
|
|
+	if (k.k->type == KEY_TYPE_reservation)
|
|
+		state = SECTOR_RESERVED;
|
|
+
|
|
+	bio_for_each_segment(bv, bio, iter) {
|
|
+		struct bch_page_state *s = bch2_page_state(bv.bv_page);
|
|
+		unsigned sector = bv.bv_offset >> 9;
|
|
+		unsigned end = (bv.bv_offset + bv.bv_len) >> 9;
|
|
+
|
|
+		/* Sector indices within the page covered by this segment */
|
|
+		for (; sector < end; sector++) {
|
|
+			s->s[sector].nr_replicas = nr_ptrs;
|
|
+			s->s[sector].state = state;
|
|
+		}
|
|
+	}
|
|
+}
|
|
+
|
|
+/*
|
|
+ * True if any crc entry of extent @k has a checksum type or a compression
|
|
+ * type set.
|
|
+ */
|
|
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
|
|
+{
|
|
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+	struct bch_extent_crc_unpacked crc;
|
|
+	const union bch_extent_entry *i;
|
|
+
|
|
+	bkey_for_each_crc(k.k, ptrs, crc, i)
|
|
+		if (crc.csum_type || crc.compression_type)
|
|
+			return true;
|
|
+	return false;
|
|
+}
|
|
+
|
|
+/*
+ * Grow @bio, one page at a time, until it covers @sectors_this_extent sectors
+ * or has no free bio_vec slots left.
+ *
+ * Pages are taken from @iter as long as the next one is contiguous with the
+ * end of @bio. Once @iter is exhausted, and only if @get_more is set, new
+ * pages are allocated and inserted into the pagecache, provided the slot is
+ * not already occupied by a page.
+ */
+static void readpage_bio_extend(struct readpages_iter *iter,
+				struct bio *bio,
+				unsigned sectors_this_extent,
+				bool get_more)
+{
+	while (bio_sectors(bio) < sectors_this_extent &&
+	       bio->bi_vcnt < bio->bi_max_vecs) {
+		/* pagecache index immediately following the end of the bio */
+		pgoff_t next_index = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT;
+		struct page *page = readpage_iter_next(iter);
+
+		if (page) {
+			/* stop at the first page not contiguous with the bio */
+			if (iter->offset + iter->idx != next_index)
+				break;
+
+			iter->idx++;
+		} else {
+			if (!get_more)
+				break;
+
+			/* a real page (not a value entry) is already cached */
+			page = xa_load(&iter->mapping->i_pages, next_index);
+			if (page && !xa_is_value(page))
+				break;
+
+			page = __page_cache_alloc(readahead_gfp_mask(iter->mapping));
+			if (!page)
+				break;
+
+			if (!__bch2_page_state_create(page, 0)) {
+				put_page(page);
+				break;
+			}
+
+			if (add_to_page_cache_lru(page, iter->mapping,
+						  next_index, GFP_NOFS)) {
+				__bch2_page_state_release(page);
+				put_page(page);
+				break;
+			}
+
+			/* drop our allocation reference now that it is cached */
+			put_page(page);
+		}
+
+		BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
+	}
+}
|
|
+
|
|
+/*
+ * Issue the read described by @rbio for inode @inum, one extent at a time.
+ *
+ * For every extent found at the bio's current sector the bio is temporarily
+ * trimmed to the part covered by that extent, handed to bch2_read_extent(),
+ * then restored and advanced. When @readpages_iter is non-NULL the bio is
+ * first extended with further readahead pages that fall within the extent.
+ *
+ * Btree lookups failing with -EINTR are retried; any other lookup error
+ * completes the bio with BLK_STS_IOERR.
+ */
+static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
+		       struct bch_read_bio *rbio, u64 inum,
+		       struct readpages_iter *readpages_iter)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_buf sk;
+	int flags = BCH_READ_RETRY_IF_STALE|
+		BCH_READ_MAY_PROMOTE;
+	int ret = 0;
+
+	rbio->c = c;
+	rbio->start_time = local_clock();
+
+	bch2_bkey_buf_init(&sk);
+retry:
+	for (;;) {
+		struct bkey_s_c k;
+		unsigned bytes, sectors, offset_into_extent;
+		enum btree_id data_btree = BTREE_ID_extents;
+
+		bch2_btree_iter_set_pos(iter,
+				POS(inum, rbio->bio.bi_iter.bi_sector));
+
+		k = bch2_btree_iter_peek_slot(iter);
+		ret = bkey_err(k);
+		if (ret)
+			break;
+
+		offset_into_extent = iter->pos.offset -
+			bkey_start_offset(k.k);
+		sectors = k.k->size - offset_into_extent;
+
+		/* take a private copy of the key before dropping btree locks */
+		bch2_bkey_buf_reassemble(&sk, c, k);
+
+		ret = bch2_read_indirect_extent(trans, &data_btree,
+					&offset_into_extent, &sk);
+		if (ret)
+			break;
+
+		k = bkey_i_to_s_c(sk.k);
+
+		/* the key now in @sk may be shorter than the original one */
+		sectors = min(sectors, k.k->size - offset_into_extent);
+
+		bch2_trans_unlock(trans);
+
+		if (readpages_iter)
+			readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
+					    extent_partial_reads_expensive(k));
+
+		/*
+		 * Trim the bio to this extent: after the swap bi_size holds
+		 * the fragment size and @bytes holds the full remaining size.
+		 */
+		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+
+		if (rbio->bio.bi_iter.bi_size == bytes)
+			flags |= BCH_READ_LAST_FRAGMENT;
+
+		if (bkey_extent_is_allocation(k.k))
+			bch2_add_page_sectors(&rbio->bio, k);
+
+		bch2_read_extent(trans, rbio, iter->pos,
+				 data_btree, k, offset_into_extent, flags);
+
+		if (flags & BCH_READ_LAST_FRAGMENT)
+			break;
+
+		/* restore the full size, then step past the fragment just read */
+		swap(rbio->bio.bi_iter.bi_size, bytes);
+		bio_advance(&rbio->bio, bytes);
+	}
+
+	if (ret == -EINTR)
+		goto retry;
+
+	if (ret) {
+		bch_err_inum_ratelimited(c, inum,
+				"read error %i from btree lookup", ret);
+		rbio->bio.bi_status = BLK_STS_IOERR;
+		bio_endio(&rbio->bio);
+	}
+
+	bch2_bkey_buf_exit(&sk, c);
+}
|
|
+
|
|
+/*
+ * Readahead entry point: read the pages of @ractl into the pagecache.
+ *
+ * The pages are collected into a readpages_iter; for each page not yet
+ * consumed a new read bio is started with that page, and bchfs_read() extends
+ * it with following pages where possible before submitting it. Completion is
+ * handled by bch2_readpages_end_io().
+ *
+ * If the page array cannot be allocated the function returns without issuing
+ * any reads: readahead is best effort, and at that point no page has been
+ * taken from @ractl, so they are left for the caller to deal with.
+ * NOTE(review): relies on the VFS readahead caller cleaning up pages left in
+ * @ractl - confirm against the target kernel version.
+ */
+void bch2_readahead(struct readahead_control *ractl)
+{
+	struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct page *page;
+	struct readpages_iter readpages_iter;
+	int ret;
+
+	/* an allocation failure here is recoverable; don't crash the kernel */
+	ret = readpages_iter_init(&readpages_iter, ractl);
+	if (ret)
+		return;
+
+	bch2_trans_init(&trans, c, 0, 0);
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
+				   BTREE_ITER_SLOTS);
+
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+	while ((page = readpage_iter_next(&readpages_iter))) {
+		pgoff_t index = readpages_iter.offset + readpages_iter.idx;
+		/* size the bio for all remaining pages, capped at the bio max */
+		unsigned n = min_t(unsigned,
+				   readpages_iter.nr_pages -
+				   readpages_iter.idx,
+				   BIO_MAX_PAGES);
+		struct bch_read_bio *rbio =
+			rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read),
+				  opts);
+
+		/* this page is consumed by the bio set up below */
+		readpages_iter.idx++;
+
+		bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0);
+		rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT;
+		rbio->bio.bi_end_io = bch2_readpages_end_io;
+		BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0));
+
+		bchfs_read(&trans, iter, rbio, inode->v.i_ino,
+			   &readpages_iter);
+	}
+
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+	bch2_trans_iter_put(&trans, iter);
+	bch2_trans_exit(&trans);
+	kfree(readpages_iter.pages);
+}
|
|
+
|
|
+/*
+ * Read the single page @page of inode @inum using the caller-provided @rbio.
+ * The caller sets bi_end_io (and bi_private where needed) beforehand; this
+ * helper attaches page state, fills in the rest of the bio, and runs
+ * bchfs_read() inside a btree transaction of its own.
+ */
+static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio,
+			     u64 inum, struct page *page)
+{
+	struct bio *bio = &rbio->bio;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+
+	bch2_page_state_create(page, __GFP_NOFAIL);
+
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+	bio->bi_iter.bi_sector = (sector_t) page->index << PAGE_SECTOR_SHIFT;
+	BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0));
+
+	bch2_trans_init(&trans, c, 0, 0);
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN,
+				   BTREE_ITER_SLOTS);
+
+	/* no readpages_iter: the bio stays exactly one page long */
+	bchfs_read(&trans, iter, rbio, inum, NULL);
+
+	bch2_trans_iter_put(&trans, iter);
+	bch2_trans_exit(&trans);
+}
|
|
+
|
|
+/*
+ * Start a read of @page. Always returns 0; the read is completed through
+ * bch2_readpages_end_io().
+ */
+int bch2_readpage(struct file *file, struct page *page)
+{
+	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+	struct bio *bio = bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read);
+	struct bch_read_bio *rbio = rbio_init(bio, opts);
+
+	rbio->bio.bi_end_io = bch2_readpages_end_io;
+
+	__bchfs_readpage(c, rbio, inode->v.i_ino, page);
+	return 0;
+}
|
|
+
|
|
+/*
+ * bio completion for bch2_read_single_page(): signal the completion that the
+ * submitter stored in bi_private.
+ */
+static void bch2_read_single_page_end_io(struct bio *bio)
+{
+	struct completion *done = bio->bi_private;
+
+	complete(done);
+}
|
|
+
|
|
+/*
+ * Synchronously read @page of @mapping.
+ *
+ * Submits a one-page read and sleeps until it completes. Returns 0 and marks
+ * the page uptodate on success, or a negative errno derived from the bio
+ * status on failure.
+ */
+static int bch2_read_single_page(struct page *page,
+				 struct address_space *mapping)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_read_bio *rbio;
+	int err;
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read),
+			 io_opts(c, &inode->ei_inode));
+	rbio->bio.bi_private = &done;
+	rbio->bio.bi_end_io = bch2_read_single_page_end_io;
+
+	__bchfs_readpage(c, rbio, inode->v.i_ino, page);
+	wait_for_completion(&done);
+
+	/* fetch the status before dropping our reference to the bio */
+	err = blk_status_to_errno(rbio->bio.bi_status);
+	bio_put(&rbio->bio);
+
+	if (err < 0)
+		return err;
+
+	SetPageUptodate(page);
+	return 0;
+}
|
|
+
|
|
+/* writepages: */
|
|
+
|
|
+/*
+ * State carried across the pages of one writepage(s) call.
+ */
+struct bch_writepage_state {
+	/* write currently being built up; NULL when none is pending */
+	struct bch_writepage_io	*io;
+	/* IO options of the inode being written back */
+	struct bch_io_opts	opts;
+};
|
|
+
|
|
+/*
+ * Build a writepage state for @inode: no pending io, options taken from the
+ * inode.
+ */
+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c,
+								  struct bch_inode_info *inode)
+{
+	struct bch_writepage_state w = {
+		.opts = io_opts(c, &inode->ei_inode)
+	};
+
+	return w;
+}
|
|
+
|
|
+/*
+ * Closure destructor for a writepage io: drop the reference on the bio that
+ * is embedded in it.
+ */
+static void bch2_writepage_io_free(struct closure *cl)
+{
+	struct bch_writepage_io *io =
+		container_of(cl, struct bch_writepage_io, cl);
+	struct bio *bio = &io->op.wbio.bio;
+
+	bio_put(bio);
+}
|
|
+
|
|
+/*
+ * Reset the recorded replica count of every sector of @page to zero, under
+ * the page state lock.
+ */
+static void bch2_writepage_zero_nr_replicas(struct page *page)
+{
+	struct bch_page_state *s = __bch2_page_state(page);
+	unsigned i;
+
+	spin_lock(&s->lock);
+	for (i = 0; i < PAGE_SECTORS; i++)
+		s->s[i].nr_replicas = 0;
+	spin_unlock(&s->lock);
+}
+
+/*
+ * Closure callback run once the write behind a writepage io has finished.
+ *
+ * Propagates write errors to the inode, pages and mapping, fixes up per-page
+ * replica counts, applies the i_sectors delta, and ends writeback on every
+ * page whose write count drops to zero.
+ */
+static void bch2_writepage_io_done(struct closure *cl)
+{
+	struct bch_writepage_io *io =
+		container_of(cl, struct bch_writepage_io, cl);
+	struct bch_fs *c = io->op.c;
+	struct bio *bio = &io->op.wbio.bio;
+	struct bvec_iter_all iter;
+	struct bio_vec *bvec;
+
+	if (io->op.error) {
+		set_bit(EI_INODE_ERROR, &io->inode->ei_flags);
+
+		bio_for_each_segment_all(bvec, bio, iter) {
+			SetPageError(bvec->bv_page);
+			mapping_set_error(bvec->bv_page->mapping, -EIO);
+
+			bch2_writepage_zero_nr_replicas(bvec->bv_page);
+		}
+	}
+
+	if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+		bio_for_each_segment_all(bvec, bio, iter)
+			bch2_writepage_zero_nr_replicas(bvec->bv_page);
+	}
+
+	/*
+	 * racing with fallocate can cause us to add fewer sectors than
+	 * expected - but we shouldn't add more sectors than expected:
+	 */
+	BUG_ON(io->op.i_sectors_delta > 0);
+
+	/*
+	 * (error (due to going RO) halfway through a page can screw that up
+	 * slightly)
+	 * XXX wtf?
+	   BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS);
+	 */
+
+	/*
+	 * PageWriteback is effectively our ref on the inode - fixup i_blocks
+	 * before calling end_page_writeback:
+	 */
+	i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
+
+	bio_for_each_segment_all(bvec, bio, iter) {
+		struct bch_page_state *s = __bch2_page_state(bvec->bv_page);
+
+		/* the last write against a page finishes its writeback */
+		if (atomic_dec_and_test(&s->write_count))
+			end_page_writeback(bvec->bv_page);
+	}
+
+	closure_return_with_destructor(&io->cl, bch2_writepage_io_free);
+}
|
|
+
|
|
+/*
+ * Submit the write accumulated in @w and detach it from @w, so that the next
+ * page starts a fresh io. bch2_writepage_io_done() is set as the io closure's
+ * continuation.
+ */
+static void bch2_writepage_do_io(struct bch_writepage_state *w)
+{
+	struct bch_writepage_io *pending = w->io;
+
+	w->io = NULL;
+
+	closure_call(&pending->op.cl, bch2_write, NULL, &pending->cl);
+	continue_at(&pending->cl, bch2_writepage_io_done, NULL);
+}
|
|
+
|
|
+/*
+ * Allocate a new bch_writepage_io, store it in @w->io, and initialise its
+ * write op for a write to @inode starting at @sector with @nr_replicas
+ * replicas. Pages are added to it by the caller.
+ */
+static void bch2_writepage_io_alloc(struct bch_fs *c,
+				    struct writeback_control *wbc,
+				    struct bch_writepage_state *w,
+				    struct bch_inode_info *inode,
+				    u64 sector,
+				    unsigned nr_replicas)
+{
+	struct bio *bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES,
+					   &c->writepage_bioset);
+	struct bch_writepage_io *io =
+		container_of(bio, struct bch_writepage_io, op.wbio.bio);
+	struct bch_write_op *op = &io->op;
+
+	w->io = io;
+
+	closure_init(&io->cl, NULL);
+	io->inode = inode;
+
+	bch2_write_op_init(op, c, w->opts);
+	op->target		= w->opts.foreground_target;
+	op_journal_seq_set(op, &inode->ei_journal_seq);
+	op->nr_replicas		= nr_replicas;
+	op->res.nr_replicas	= nr_replicas;
+	op->write_point		= writepoint_hashed(inode->ei_last_dirtied);
+	op->pos			= POS(inode->v.i_ino, sector);
+	op->wbio.bio.bi_iter.bi_sector = sector;
+	op->wbio.bio.bi_opf	= wbc_to_write_flags(wbc);
+}
|
|
+
|
|
+/*
+ * Write back one locked, uptodate page; @data is the bch_writepage_state
+ * shared by the pages of the current writepage(s) call.
+ *
+ * Every run of dirty sectors in the page is appended to the io pending in the
+ * state. A pending io is submitted first whenever the run can't be appended
+ * to it (different replica count, bio full, or not contiguous). The page is
+ * unlocked before returning. Always returns 0; a failed disk reservation is
+ * reported through the page and mapping error state instead.
+ */
+static int __bch2_writepage(struct page *page,
+			    struct writeback_control *wbc,
+			    void *data)
+{
+	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_writepage_state *w = data;
+	struct bch_page_state *s, orig;
+	unsigned i, offset, nr_replicas_this_write = U32_MAX;
+	loff_t i_size = i_size_read(&inode->v);
+	pgoff_t end_index = i_size >> PAGE_SHIFT;
+	int ret;
+
+	EBUG_ON(!PageUptodate(page));
+
+	/* Pages fully inside i_size need no special treatment */
+	if (page->index >= end_index) {
+		/* Is the page fully outside i_size? (truncate in progress) */
+		offset = i_size & (PAGE_SIZE - 1);
+		if (page->index > end_index || !offset) {
+			unlock_page(page);
+			return 0;
+		}
+
+		/*
+		 * The page straddles i_size. It must be zeroed out on each
+		 * and every writepage invocation because it may be mmapped.
+		 * "A file is mapped in multiples of the page size. For a file
+		 * that is not a multiple of the page size, the remaining
+		 * memory is zeroed when mapped, and writes to that region are
+		 * not written out to the file."
+		 */
+		zero_user_segment(page, offset, PAGE_SIZE);
+	}
+
+	s = bch2_page_state_create(page, __GFP_NOFAIL);
+
+	ret = bch2_get_page_disk_reservation(c, inode, page, true);
+	if (ret) {
+		SetPageError(page);
+		mapping_set_error(page->mapping, ret);
+		unlock_page(page);
+		return 0;
+	}
+
+	/* Before unlocking the page, get copy of reservations: */
+	orig = *s;
+
+	/* the write uses the smallest replica count of any dirty sector */
+	for (i = 0; i < PAGE_SECTORS; i++) {
+		if (s->s[i].state < SECTOR_DIRTY)
+			continue;
+
+		nr_replicas_this_write =
+			min_t(unsigned, nr_replicas_this_write,
+			      s->s[i].nr_replicas +
+			      s->s[i].replicas_reserved);
+	}
+
+	/* mark the dirty sectors as allocated, consuming their reservation */
+	for (i = 0; i < PAGE_SECTORS; i++) {
+		if (s->s[i].state < SECTOR_DIRTY)
+			continue;
+
+		s->s[i].nr_replicas = w->opts.compression
+			? 0 : nr_replicas_this_write;
+
+		s->s[i].replicas_reserved = 0;
+		s->s[i].state = SECTOR_ALLOCATED;
+	}
+
+	/* hold one write count ourselves until every run has been queued */
+	BUG_ON(atomic_read(&s->write_count));
+	atomic_set(&s->write_count, 1);
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	unlock_page(page);
+
+	offset = 0;
+	for (;;) {
+		unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0;
+		u64 sector;
+
+		/* skip ahead to the next dirty sector */
+		while (offset < PAGE_SECTORS &&
+		       orig.s[offset].state < SECTOR_DIRTY)
+			offset++;
+
+		if (offset == PAGE_SECTORS)
+			break;
+
+		sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset;
+
+		/* extend the run across the following dirty sectors */
+		while (offset + sectors < PAGE_SECTORS &&
+		       orig.s[offset + sectors].state >= SECTOR_DIRTY)
+			sectors++;
+
+		for (i = offset; i < offset + sectors; i++) {
+			reserved_sectors += orig.s[i].replicas_reserved;
+			dirty_sectors += orig.s[i].state == SECTOR_DIRTY;
+		}
+
+		/* submit the pending io if this run can't be appended to it */
+		if (w->io) {
+			struct bio *pending = &w->io->op.wbio.bio;
+
+			if (w->io->op.res.nr_replicas != nr_replicas_this_write ||
+			    bio_full(pending, PAGE_SIZE) ||
+			    pending->bi_iter.bi_size + (sectors << 9) >=
+			    (BIO_MAX_PAGES * PAGE_SIZE) ||
+			    bio_end_sector(pending) != sector)
+				bch2_writepage_do_io(w);
+		}
+
+		if (!w->io)
+			bch2_writepage_io_alloc(c, wbc, w, inode, sector,
+						nr_replicas_this_write);
+
+		atomic_inc(&s->write_count);
+
+		BUG_ON(inode != w->io->inode);
+		BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page,
+				     sectors << 9, offset << 9));
+
+		/* Check for writing past i_size: */
+		WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) >
+			round_up(i_size, block_bytes(c)));
+
+		w->io->op.res.sectors += reserved_sectors;
+		w->io->op.i_sectors_delta -= dirty_sectors;
+		w->io->op.new_i_size = i_size;
+
+		offset += sectors;
+	}
+
+	/* drop our own count; ends writeback now if no write is in flight */
+	if (atomic_dec_and_test(&s->write_count))
+		end_page_writeback(page);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Write back dirty pages of @mapping: run __bch2_writepage() over them via
+ * write_cache_pages(), then submit whatever io is still pending. Returns the
+ * result of write_cache_pages().
+ */
+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = mapping->host->i_sb->s_fs_info;
+	struct bch_writepage_state w = bch_writepage_state_init(c, inode);
+	struct blk_plug plug;
+	int ret;
+
+	blk_start_plug(&plug);
+
+	ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w);
+
+	/* flush the partially built io left over from the last page */
+	if (w.io)
+		bch2_writepage_do_io(&w);
+
+	blk_finish_plug(&plug);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Write back a single page: queue it with __bch2_writepage() and submit the
+ * resulting io straight away.
+ */
+int bch2_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct bch_inode_info *inode = to_bch_ei(page->mapping->host);
+	struct bch_fs *c = page->mapping->host->i_sb->s_fs_info;
+	struct bch_writepage_state w = bch_writepage_state_init(c, inode);
+	int ret = __bch2_writepage(page, wbc, &w);
+
+	if (w.io)
+		bch2_writepage_do_io(&w);
+
+	return ret;
+}
|
|
+
|
|
+/* buffered writes: */
|
|
+
|
|
+/*
+ * Prepare the page at @pos for a buffered write of @len bytes.
+ *
+ * Allocates a page reservation (handed back through @fsdata), takes the
+ * inode's pagecache lock, grabs and locks the page, reads it in when the
+ * write only partially covers existing data, and reserves space for the
+ * range being written.
+ *
+ * On success returns 0 with the locked page in @pagep; the pagecache lock and
+ * the reservation are left for bch2_write_end() to release. On failure
+ * everything acquired here is released and a negative errno is returned.
+ */
+int bch2_write_begin(struct file *file, struct address_space *mapping,
+		     loff_t pos, unsigned len, unsigned flags,
+		     struct page **pagep, void **fsdata)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_page_reservation *res;
+	pgoff_t index = pos >> PAGE_SHIFT;
+	unsigned offset = pos & (PAGE_SIZE - 1);
+	struct page *page;
+	int ret = -ENOMEM;
+
+	res = kmalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return -ENOMEM;
+
+	bch2_page_reservation_init(c, inode, res);
+	*fsdata = res;
+
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+	page = grab_cache_page_write_begin(mapping, index, flags);
+	if (!page)
+		goto err_unlock;
+
+	/*
+	 * Nothing to read if the page is already uptodate, or if we're
+	 * writing the entire page:
+	 */
+	if (PageUptodate(page) || len == PAGE_SIZE)
+		goto out;
+
+	/* write starts at the page boundary and reaches i_size: zero the tail */
+	if (!offset && pos + len >= inode->v.i_size) {
+		zero_user_segment(page, len, PAGE_SIZE);
+		flush_dcache_page(page);
+		goto out;
+	}
+
+	/* page lies beyond i_size: zero everything around the written range */
+	if (index > inode->v.i_size >> PAGE_SHIFT) {
+		zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE);
+		flush_dcache_page(page);
+		goto out;
+	}
+readpage:
+	ret = bch2_read_single_page(page, mapping);
+	if (ret)
+		goto err;
+out:
+	ret = bch2_page_reservation_get(c, inode, page, res,
+					offset, len, true);
+	if (ret) {
+		/*
+		 * If the page hasn't been read in, we won't know if we
+		 * actually need a reservation - we don't actually need
+		 * to read here, we just need to check if the page is
+		 * fully backed by uncompressed data:
+		 */
+		if (!PageUptodate(page))
+			goto readpage;
+
+		goto err;
+	}
+
+	*pagep = page;
+	return 0;
+err:
+	unlock_page(page);
+	put_page(page);
+	*pagep = NULL;
+err_unlock:
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+	kfree(res);
+	*fsdata = NULL;
+	return ret;
+}
|
|
+
|
|
+/*
+ * Finish a buffered write started by bch2_write_begin().
+ *
+ * Extends i_size if the write went past it, marks the written range dirty,
+ * then releases the page, the pagecache lock, and the reservation passed in
+ * through @fsdata. Returns the number of bytes accepted, which is 0 when a
+ * short copy hit a page that was not uptodate.
+ */
+int bch2_write_end(struct file *file, struct address_space *mapping,
+		   loff_t pos, unsigned len, unsigned copied,
+		   struct page *page, void *fsdata)
+{
+	struct bch_inode_info *inode = to_bch_ei(mapping->host);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch2_page_reservation *res = fsdata;
+	unsigned offset = pos & (PAGE_SIZE - 1);
+
+	lockdep_assert_held(&inode->v.i_rwsem);
+
+	if (unlikely(copied < len && !PageUptodate(page))) {
+		/*
+		 * The page needs to be read in, but that would destroy
+		 * our partial write - simplest thing is to just force
+		 * userspace to redo the write:
+		 */
+		zero_user(page, 0, PAGE_SIZE);
+		flush_dcache_page(page);
+		copied = 0;
+	}
+
+	spin_lock(&inode->v.i_lock);
+	if (pos + copied > inode->v.i_size)
+		i_size_write(&inode->v, pos + copied);
+	spin_unlock(&inode->v.i_lock);
+
+	if (copied) {
+		if (!PageUptodate(page))
+			SetPageUptodate(page);
+
+		bch2_set_page_dirty(c, inode, page, res, offset, copied);
+
+		/* remember the dirtying task; used to pick the write point */
+		inode->ei_last_dirtied = (unsigned long) current;
+	}
+
+	unlock_page(page);
+	put_page(page);
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+	bch2_page_reservation_put(c, inode, res);
+	kfree(res);
+
+	return copied;
+}
|
|
+
|
|
+/* Maximum number of pages handled by one __bch2_buffered_write() call */
+#define WRITE_BATCH_PAGES	32
+
+/*
+ * Copy up to @len bytes from @iter into the pagecache of @inode at @pos,
+ * touching at most WRITE_BATCH_PAGES pages.
+ *
+ * Steps: grab and lock the pages, read in partially overwritten first/last
+ * pages where needed, reserve space for the whole range, copy from userspace
+ * with atomic usercopies, update i_size, and mark the copied range dirty.
+ *
+ * Returns the number of bytes copied if that is non-zero, otherwise 0 or a
+ * negative errno.
+ */
+static int __bch2_buffered_write(struct bch_inode_info *inode,
+				 struct address_space *mapping,
+				 struct iov_iter *iter,
+				 loff_t pos, unsigned len)
+{
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct page *pages[WRITE_BATCH_PAGES];
+	struct bch2_page_reservation res;
+	unsigned long index = pos >> PAGE_SHIFT;
+	unsigned offset = pos & (PAGE_SIZE - 1);
+	unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+	unsigned i, reserved = 0, set_dirty = 0;
+	unsigned copied = 0, nr_pages_copied = 0;
+	int ret = 0;
+
+	BUG_ON(!len);
+	BUG_ON(nr_pages > ARRAY_SIZE(pages));
+
+	bch2_page_reservation_init(c, inode, &res);
+
+	/* grab the pages; on failure continue with the ones we did get */
+	for (i = 0; i < nr_pages; i++) {
+		pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
+		if (pages[i])
+			continue;
+
+		nr_pages = i;
+		if (!i) {
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		len = min_t(unsigned, len,
+			    nr_pages * PAGE_SIZE - offset);
+		break;
+	}
+
+	/* write starts mid-page: the first page must hold valid data */
+	if (offset && !PageUptodate(pages[0])) {
+		ret = bch2_read_single_page(pages[0], mapping);
+		if (ret)
+			goto out;
+	}
+
+	/* write ends mid-page: same for the last page */
+	if ((pos + len) & (PAGE_SIZE - 1) &&
+	    !PageUptodate(pages[nr_pages - 1])) {
+		struct page *last = pages[nr_pages - 1];
+
+		if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
+			/* page starts at or beyond i_size: just zero it */
+			zero_user(last, 0, PAGE_SIZE);
+		} else {
+			ret = bch2_read_single_page(last, mapping);
+			if (ret)
+				goto out;
+		}
+	}
+
+	/* reserve space, page by page, for the whole range */
+	while (reserved < len) {
+		struct page *page = pages[(offset + reserved) >> PAGE_SHIFT];
+		unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1);
+		unsigned pg_len = min_t(unsigned, len - reserved,
+					PAGE_SIZE - pg_offset);
+retry_reservation:
+		ret = bch2_page_reservation_get(c, inode, page, &res,
+						pg_offset, pg_len, true);
+
+		/* reading the page in and retrying may make this succeed */
+		if (ret && !PageUptodate(page)) {
+			ret = bch2_read_single_page(page, mapping);
+			if (!ret)
+				goto retry_reservation;
+		}
+
+		if (ret)
+			goto out;
+
+		reserved += pg_len;
+	}
+
+	if (mapping_writably_mapped(mapping))
+		for (i = 0; i < nr_pages; i++)
+			flush_dcache_page(pages[i]);
+
+	/* copy the data in from userspace */
+	while (copied < len) {
+		struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
+		unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
+		unsigned pg_len = min_t(unsigned, len - copied,
+					PAGE_SIZE - pg_offset);
+		unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
+						iter, pg_offset, pg_len);
+
+		if (!pg_copied)
+			break;
+
+		/*
+		 * Partial copy into a page that isn't uptodate and lies
+		 * inside i_size: discard it and stop.
+		 */
+		if (!PageUptodate(page) &&
+		    pg_copied != PAGE_SIZE &&
+		    pos + copied + pg_copied < inode->v.i_size) {
+			zero_user(page, 0, PAGE_SIZE);
+			break;
+		}
+
+		flush_dcache_page(page);
+		iov_iter_advance(iter, pg_copied);
+		copied += pg_copied;
+
+		if (pg_copied != pg_len)
+			break;
+	}
+
+	if (!copied)
+		goto out;
+
+	spin_lock(&inode->v.i_lock);
+	if (pos + copied > inode->v.i_size)
+		i_size_write(&inode->v, pos + copied);
+	spin_unlock(&inode->v.i_lock);
+
+	/* mark what was copied dirty and release those pages */
+	while (set_dirty < copied) {
+		struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT];
+		unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1);
+		unsigned pg_len = min_t(unsigned, copied - set_dirty,
+					PAGE_SIZE - pg_offset);
+
+		if (!PageUptodate(page))
+			SetPageUptodate(page);
+
+		bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len);
+		unlock_page(page);
+		put_page(page);
+
+		set_dirty += pg_len;
+	}
+
+	nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
+	inode->ei_last_dirtied = (unsigned long) current;
+out:
+	/* release the pages that were not released by the dirtying loop */
+	for (i = nr_pages_copied; i < nr_pages; i++) {
+		unlock_page(pages[i]);
+		put_page(pages[i]);
+	}
+
+	bch2_page_reservation_put(c, inode, &res);
+
+	return copied ?: ret;
+}
|
|
+
|
|
+/*
+ * Buffered write of @iter at iocb->ki_pos, in batches of at most
+ * WRITE_BATCH_PAGES pages, holding the inode's pagecache lock throughout.
+ *
+ * Returns the number of bytes written if any were, otherwise 0 or a negative
+ * errno. iocb->ki_pos is not updated here.
+ */
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	loff_t pos = iocb->ki_pos;
+	ssize_t written = 0;
+	int ret = 0;
+
+	bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+
+	do {
+		unsigned offset = pos & (PAGE_SIZE - 1);
+		unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
+				       PAGE_SIZE * WRITE_BATCH_PAGES - offset);
+again:
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+			/* retry with what fits in the current page */
+			bytes = min_t(unsigned long, iov_iter_count(iter),
+				      PAGE_SIZE - offset);
+
+			if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
+				ret = -EFAULT;
+				break;
+			}
+		}
+
+		if (unlikely(fatal_signal_pending(current))) {
+			ret = -EINTR;
+			break;
+		}
+
+		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+		if (unlikely(ret < 0))
+			break;
+
+		cond_resched();
+
+		if (unlikely(ret == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_SIZE - offset,
+				      iov_iter_single_seg_count(iter));
+			goto again;
+		}
+
+		/* ret is the number of bytes this batch wrote */
+		pos	+= ret;
+		written	+= ret;
+		ret	= 0;
+
+		balance_dirty_pages_ratelimited(mapping);
+	} while (iov_iter_count(iter));
+
+	bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+	return written ? written : ret;
+}
|
|
+
|
|
+/* O_DIRECT reads */
|
|
+
|
|
+/*
+ * Dispose of a completed O_DIRECT read bio: either pass it to
+ * bio_check_pages_dirty(), or release its pages without dirtying them and
+ * drop the bio reference.
+ */
+static void bio_check_or_release(struct bio *bio, bool check_dirty)
+{
+	if (!check_dirty) {
+		bio_release_pages(bio, false);
+		bio_put(bio);
+		return;
+	}
+
+	bio_check_pages_dirty(bio);
+}
|
|
+
|
|
+/*
+ * Closure function installed for asynchronous O_DIRECT reads: report the
+ * result to the kiocb, then dispose of the dio's own bio.
+ */
+static void bch2_dio_read_complete(struct closure *cl)
+{
+	struct dio_read *dio = container_of(cl, struct dio_read, cl);
+	struct kiocb *req = dio->req;
+
+	req->ki_complete(req, dio->ret, 0);
+	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+}
|
|
+
|
|
+/*
+ * Completion of a bio belonging to an O_DIRECT read: record an error in the
+ * dio, if there was one, and drop this bio's reference on the dio closure.
+ */
+static void bch2_direct_IO_read_endio(struct bio *bio)
+{
+	struct dio_read *dio = bio->bi_private;
+	blk_status_t status = bio->bi_status;
+
+	if (status)
+		dio->ret = blk_status_to_errno(status);
+
+	closure_put(&dio->cl);
+}
|
|
+
|
|
+/*
+ * Completion of an additional bio allocated for a large O_DIRECT read: do
+ * the common completion work, then dispose of this bio.
+ */
+static void bch2_direct_IO_read_split_endio(struct bio *bio)
+{
+	struct dio_read *dio = bio->bi_private;
+	/*
+	 * Read the flag before the common endio drops our reference on the
+	 * dio closure.
+	 * NOTE(review): presumably the dio may be gone after that - confirm.
+	 */
+	bool dirty = dio->should_dirty;
+
+	bch2_direct_IO_read_endio(bio);
+	bio_check_or_release(bio, dirty);
+}
|
|
+
|
|
+/*
+ * O_DIRECT read of @iter at req->ki_pos.
+ *
+ * The position and length must be block aligned. The read is clamped to
+ * i_size (rounded up to the block size) and split over as many bios as
+ * needed; the first bio is embedded in the dio_read that tracks the request.
+ *
+ * Returns the number of bytes read or a negative errno for synchronous
+ * requests, and -EIOCBQUEUED for asynchronous ones, which are completed
+ * later by bch2_dio_read_complete().
+ */
+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
+{
+	struct file *file = req->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct bch_io_opts opts = io_opts(c, &inode->ei_inode);
+	struct dio_read *dio;
+	struct bio *bio;
+	loff_t offset = req->ki_pos;
+	bool sync = is_sync_kiocb(req);
+	size_t shorten;
+	ssize_t ret;
+
+	if ((offset|iter->count) & (block_bytes(c) - 1))
+		return -EINVAL;
+
+	/* number of bytes that lie inside i_size */
+	ret = min_t(loff_t, iter->count,
+		    max_t(loff_t, 0, i_size_read(&inode->v) - offset));
+
+	if (!ret)
+		return ret;
+
+	/* temporarily trim the iterator to the block-rounded readable size */
+	shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+	iter->count -= shorten;
+
+	bio = bio_alloc_bioset(GFP_KERNEL,
+			       iov_iter_npages(iter, BIO_MAX_PAGES),
+			       &c->dio_read_bioset);
+
+	bio->bi_end_io = bch2_direct_IO_read_endio;
+
+	dio = container_of(bio, struct dio_read, rbio.bio);
+	closure_init(&dio->cl, NULL);
+
+	/*
+	 * this is a _really_ horrible hack just to avoid an atomic sub at the
+	 * end:
+	 */
+	if (!sync) {
+		set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL);
+		atomic_set(&dio->cl.remaining,
+			   CLOSURE_REMAINING_INITIALIZER -
+			   CLOSURE_RUNNING +
+			   CLOSURE_DESTRUCTOR);
+	} else {
+		atomic_set(&dio->cl.remaining,
+			   CLOSURE_REMAINING_INITIALIZER + 1);
+	}
+
+	dio->req	= req;
+	dio->ret	= ret;
+	/*
+	 * This is one of the sketchier things I've encountered: we have to skip
+	 * the dirtying of requests that are internal from the kernel (i.e. from
+	 * loopback), because we'll deadlock on page_lock.
+	 */
+	dio->should_dirty = iter_is_iovec(iter);
+
+	/* the first bio is already allocated: enter the loop past the alloc */
+	goto start;
+	while (iter->count) {
+		bio = bio_alloc_bioset(GFP_KERNEL,
+				       iov_iter_npages(iter, BIO_MAX_PAGES),
+				       &c->bio_read);
+		bio->bi_end_io		= bch2_direct_IO_read_split_endio;
+start:
+		bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC);
+		bio->bi_iter.bi_sector	= offset >> 9;
+		bio->bi_private		= dio;
+
+		ret = bio_iov_iter_get_pages(bio, iter);
+		if (ret < 0) {
+			/* XXX: fault inject this path */
+			bio->bi_status = BLK_STS_RESOURCE;
+			bio_endio(bio);
+			break;
+		}
+
+		offset += bio->bi_iter.bi_size;
+
+		if (dio->should_dirty)
+			bio_set_pages_dirty(bio);
+
+		/* another bio will follow: take a closure ref for it now */
+		if (iter->count)
+			closure_get(&dio->cl);
+
+		bch2_read(c, rbio_init(bio, opts), inode->v.i_ino);
+	}
+
+	/* undo the trimming done above */
+	iter->count += shorten;
+
+	if (!sync)
+		return -EIOCBQUEUED;
+
+	closure_sync(&dio->cl);
+	closure_debug_destroy(&dio->cl);
+	ret = dio->ret;
+	bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
+	return ret;
+}
|
|
+
|
|
+/*
+ * read_iter entry point.
+ *
+ * Buffered reads go through generic_file_read_iter() under the inode's
+ * pagecache lock. O_DIRECT reads first write back and wait on the affected
+ * range of the pagecache, then go through bch2_direct_IO_read(), advancing
+ * iocb->ki_pos by the number of bytes read.
+ */
+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct address_space *mapping = file->f_mapping;
+	size_t count = iov_iter_count(iter);
+	struct blk_plug plug;
+	ssize_t ret;
+
+	if (!count)
+		return 0; /* skip atime */
+
+	if (!(iocb->ki_flags & IOCB_DIRECT)) {
+		bch2_pagecache_add_get(&inode->ei_pagecache_lock);
+		ret = generic_file_read_iter(iocb, iter);
+		bch2_pagecache_add_put(&inode->ei_pagecache_lock);
+
+		return ret;
+	}
+
+	ret = filemap_write_and_wait_range(mapping,
+					   iocb->ki_pos,
+					   iocb->ki_pos + count - 1);
+	if (ret < 0)
+		return ret;
+
+	file_accessed(file);
+
+	blk_start_plug(&plug);
+	ret = bch2_direct_IO_read(iocb, iter);
+	blk_finish_plug(&plug);
+
+	if (ret >= 0)
+		iocb->ki_pos += ret;
+
+	return ret;
+}
|
|
+
|
|
+/* O_DIRECT writes */
|
|
+
|
|
+static void bch2_dio_write_loop_async(struct bch_write_op *);
|
|
+
|
|
/*
 * Main loop of an O_DIRECT write.
 *
 * Each iteration pins as many user pages as fit in the dio's bio, trims the
 * bio to a multiple of the filesystem block size, sets up a bch_write_op for
 * it and submits it with bch2_write().  For a synchronous dio the function
 * waits for the write and continues with the next chunk; for an asynchronous
 * dio it returns -EIOCBQUEUED and is re-entered from
 * bch2_dio_write_loop_async() when the write completes, in which case
 * dio->loop is set and execution resumes at the "loop" label.
 *
 * On the final exit (success or error) this drops the pagecache block lock,
 * the quota reservation, the (optionally heap-allocated) iovec copy, the bio
 * and the inode's dio count; asynchronous requests are completed through
 * req->ki_complete().
 *
 * Returns the number of bytes written or a negative error for synchronous
 * dios, and -EIOCBQUEUED for asynchronous ones.
 */
static long bch2_dio_write_loop(struct dio_write *dio)
{
	bool kthread = (current->flags & PF_KTHREAD) != 0;
	struct kiocb *req = dio->req;
	struct address_space *mapping = req->ki_filp->f_mapping;
	struct bch_inode_info *inode = file_bch_inode(req->ki_filp);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct bio *bio = &dio->op.wbio.bio;
	struct bvec_iter_all iter;
	struct bio_vec *bv;
	unsigned unaligned, iter_count;
	bool sync = dio->sync, dropped_locks;
	long ret;

	/* Re-entered after an async write completed: resume post-IO work */
	if (dio->loop)
		goto loop;

	while (1) {
		/* Remember the remaining length before pages are consumed */
		iter_count = dio->iter.count;

		/* In a kernel thread, borrow the submitter's mm to fault in user pages */
		if (kthread)
			kthread_use_mm(dio->mm);
		BUG_ON(current->faults_disabled_mapping);
		current->faults_disabled_mapping = mapping;

		ret = bio_iov_iter_get_pages(bio, &dio->iter);

		dropped_locks = fdm_dropped_locks();

		current->faults_disabled_mapping = NULL;
		if (kthread)
			kthread_unuse_mm(dio->mm);

		/*
		 * If the fault handler returned an error but also signalled
		 * that it dropped & retook ei_pagecache_lock, we just need to
		 * re-shoot down the page cache and retry:
		 */
		if (dropped_locks && ret)
			ret = 0;

		if (unlikely(ret < 0))
			goto err;

		if (unlikely(dropped_locks)) {
			ret = write_invalidate_inode_pages_range(mapping,
					req->ki_pos,
					req->ki_pos + iter_count - 1);
			/*
			 * NOTE(review): pages already added to the bio do not
			 * appear to be released on this exit - confirm.
			 */
			if (unlikely(ret))
				goto err;

			if (!bio->bi_iter.bi_size)
				continue;
		}

		/* Trim the bio to a whole number of blocks; give the tail back */
		unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1);
		bio->bi_iter.bi_size -= unaligned;
		iov_iter_revert(&dio->iter, unaligned);

		if (!bio->bi_iter.bi_size) {
			/*
			 * bio_iov_iter_get_pages was only able to get <
			 * blocksize worth of pages:
			 */
			bio_for_each_segment_all(bv, bio, iter)
				put_page(bv->bv_page);
			ret = -EFAULT;
			goto err;
		}

		bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode));
		dio->op.end_io = bch2_dio_write_loop_async;
		dio->op.target = dio->op.opts.foreground_target;
		op_journal_seq_set(&dio->op, &inode->ei_journal_seq);
		dio->op.write_point = writepoint_hashed((unsigned long) current);
		dio->op.nr_replicas = dio->op.opts.data_replicas;
		dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9);

		if ((req->ki_flags & IOCB_DSYNC) &&
		    !c->opts.journal_flush_disabled)
			dio->op.flags |= BCH_WRITE_FLUSH;

		/*
		 * A failed reservation is tolerated when
		 * bch2_check_range_allocated() returns nonzero for the range.
		 * NOTE(review): on the goto below the pinned pages do not
		 * appear to be released - confirm.
		 */
		ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio),
						dio->op.opts.data_replicas, 0);
		if (unlikely(ret) &&
		    !bch2_check_range_allocated(c, dio->op.pos,
				bio_sectors(bio),
				dio->op.opts.data_replicas,
				dio->op.opts.compression != 0))
			goto err;

		task_io_account_write(bio->bi_iter.bi_size);

		/*
		 * First pass of an async dio with data still left to write:
		 * take a private copy of the iovec array, since we are about
		 * to return to the caller and will continue later.
		 */
		if (!dio->sync && !dio->loop && dio->iter.count) {
			struct iovec *iov = dio->inline_vecs;

			if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) {
				iov = kmalloc(dio->iter.nr_segs * sizeof(*iov),
					      GFP_KERNEL);
				if (unlikely(!iov)) {
					/* Can't copy the iovec: fall back to sync */
					dio->sync = sync = true;
					goto do_io;
				}

				dio->free_iov = true;
			}

			memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov));
			dio->iter.iov = iov;
		}
do_io:
		dio->loop = true;
		closure_call(&dio->op.cl, bch2_write, NULL, NULL);

		if (sync)
			wait_for_completion(&dio->done);
		else
			return -EIOCBQUEUED;
loop:
		/* Write finished: account sectors and advance the file position */
		i_sectors_acct(c, inode, &dio->quota_res,
			       dio->op.i_sectors_delta);
		req->ki_pos += (u64) dio->op.written << 9;
		dio->written += dio->op.written;

		spin_lock(&inode->v.i_lock);
		if (req->ki_pos > inode->v.i_size)
			i_size_write(&inode->v, req->ki_pos);
		spin_unlock(&inode->v.i_lock);

		if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
			bio_for_each_segment_all(bv, bio, iter)
				put_page(bv->bv_page);

		if (dio->op.error) {
			set_bit(EI_INODE_ERROR, &inode->ei_flags);
			break;
		}

		if (!dio->iter.count)
			break;

		/* More data to write: recycle the bio and completion */
		bio_reset(bio);
		reinit_completion(&dio->done);
	}

	/* dio->written is in 512-byte sectors; report bytes */
	ret = dio->op.error ?: ((long) dio->written << 9);
err:
	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
	bch2_quota_reservation_put(c, inode, &dio->quota_res);

	if (dio->free_iov)
		kfree(dio->iter.iov);

	bio_put(bio);

	/* inode->i_dio_count is our ref on inode and thus bch_fs */
	inode_dio_end(&inode->v);

	if (!sync) {
		req->ki_complete(req, ret, 0);
		ret = -EIOCBQUEUED;
	}
	return ret;
}
|
|
+
|
|
+static void bch2_dio_write_loop_async(struct bch_write_op *op)
|
|
+{
|
|
+ struct dio_write *dio = container_of(op, struct dio_write, op);
|
|
+
|
|
+ if (dio->sync)
|
|
+ complete(&dio->done);
|
|
+ else
|
|
+ bch2_dio_write_loop(dio);
|
|
+}
|
|
+
|
|
+static noinline
|
|
+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
|
|
+{
|
|
+ struct file *file = req->ki_filp;
|
|
+ struct address_space *mapping = file->f_mapping;
|
|
+ struct bch_inode_info *inode = file_bch_inode(file);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct dio_write *dio;
|
|
+ struct bio *bio;
|
|
+ bool locked = true, extending;
|
|
+ ssize_t ret;
|
|
+
|
|
+ prefetch(&c->opts);
|
|
+ prefetch((void *) &c->opts + 64);
|
|
+ prefetch(&inode->ei_inode);
|
|
+ prefetch((void *) &inode->ei_inode + 64);
|
|
+
|
|
+ inode_lock(&inode->v);
|
|
+
|
|
+ ret = generic_write_checks(req, iter);
|
|
+ if (unlikely(ret <= 0))
|
|
+ goto err;
|
|
+
|
|
+ ret = file_remove_privs(file);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ ret = file_update_time(file);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
|
|
+ goto err;
|
|
+
|
|
+ inode_dio_begin(&inode->v);
|
|
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
|
|
+
|
|
+ extending = req->ki_pos + iter->count > inode->v.i_size;
|
|
+ if (!extending) {
|
|
+ inode_unlock(&inode->v);
|
|
+ locked = false;
|
|
+ }
|
|
+
|
|
+ bio = bio_alloc_bioset(GFP_KERNEL,
|
|
+ iov_iter_npages(iter, BIO_MAX_PAGES),
|
|
+ &c->dio_write_bioset);
|
|
+ dio = container_of(bio, struct dio_write, op.wbio.bio);
|
|
+ init_completion(&dio->done);
|
|
+ dio->req = req;
|
|
+ dio->mm = current->mm;
|
|
+ dio->loop = false;
|
|
+ dio->sync = is_sync_kiocb(req) || extending;
|
|
+ dio->free_iov = false;
|
|
+ dio->quota_res.sectors = 0;
|
|
+ dio->written = 0;
|
|
+ dio->iter = *iter;
|
|
+
|
|
+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res,
|
|
+ iter->count >> 9, true);
|
|
+ if (unlikely(ret))
|
|
+ goto err_put_bio;
|
|
+
|
|
+ ret = write_invalidate_inode_pages_range(mapping,
|
|
+ req->ki_pos,
|
|
+ req->ki_pos + iter->count - 1);
|
|
+ if (unlikely(ret))
|
|
+ goto err_put_bio;
|
|
+
|
|
+ ret = bch2_dio_write_loop(dio);
|
|
+err:
|
|
+ if (locked)
|
|
+ inode_unlock(&inode->v);
|
|
+ return ret;
|
|
+err_put_bio:
|
|
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
|
|
+ bch2_quota_reservation_put(c, inode, &dio->quota_res);
|
|
+ bio_put(bio);
|
|
+ inode_dio_end(&inode->v);
|
|
+ goto err;
|
|
+}
|
|
+
|
|
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
|
+{
|
|
+ struct file *file = iocb->ki_filp;
|
|
+ struct bch_inode_info *inode = file_bch_inode(file);
|
|
+ ssize_t ret;
|
|
+
|
|
+ if (iocb->ki_flags & IOCB_DIRECT)
|
|
+ return bch2_direct_write(iocb, from);
|
|
+
|
|
+ /* We can write back this queue in page reclaim */
|
|
+ current->backing_dev_info = inode_to_bdi(&inode->v);
|
|
+ inode_lock(&inode->v);
|
|
+
|
|
+ ret = generic_write_checks(iocb, from);
|
|
+ if (ret <= 0)
|
|
+ goto unlock;
|
|
+
|
|
+ ret = file_remove_privs(file);
|
|
+ if (ret)
|
|
+ goto unlock;
|
|
+
|
|
+ ret = file_update_time(file);
|
|
+ if (ret)
|
|
+ goto unlock;
|
|
+
|
|
+ ret = bch2_buffered_write(iocb, from);
|
|
+ if (likely(ret > 0))
|
|
+ iocb->ki_pos += ret;
|
|
+unlock:
|
|
+ inode_unlock(&inode->v);
|
|
+ current->backing_dev_info = NULL;
|
|
+
|
|
+ if (ret > 0)
|
|
+ ret = generic_write_sync(iocb, ret);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* fsync: */
|
|
+
|
|
+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
|
|
+{
|
|
+ struct bch_inode_info *inode = file_bch_inode(file);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ int ret, ret2;
|
|
+
|
|
+ ret = file_write_and_wait_range(file, start, end);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
|
|
+ goto out;
|
|
+
|
|
+ ret = sync_inode_metadata(&inode->v, 1);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+out:
|
|
+ if (!c->opts.journal_flush_disabled)
|
|
+ ret = bch2_journal_flush_seq(&c->journal,
|
|
+ inode->ei_journal_seq);
|
|
+ ret2 = file_check_and_advance_wb_err(file);
|
|
+
|
|
+ return ret ?: ret2;
|
|
+}
|
|
+
|
|
+/* truncate: */
|
|
+
|
|
+static inline int range_has_data(struct bch_fs *c,
|
|
+ struct bpos start,
|
|
+ struct bpos end)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) {
|
|
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
|
|
+ break;
|
|
+
|
|
+ if (bkey_extent_is_data(k.k)) {
|
|
+ ret = 1;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ return bch2_trans_exit(&trans) ?: ret;
|
|
+}
|
|
+
|
|
+static int __bch2_truncate_page(struct bch_inode_info *inode,
|
|
+ pgoff_t index, loff_t start, loff_t end)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct address_space *mapping = inode->v.i_mapping;
|
|
+ struct bch_page_state *s;
|
|
+ unsigned start_offset = start & (PAGE_SIZE - 1);
|
|
+ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1;
|
|
+ unsigned i;
|
|
+ struct page *page;
|
|
+ int ret = 0;
|
|
+
|
|
+ /* Page boundary? Nothing to do */
|
|
+ if (!((index == start >> PAGE_SHIFT && start_offset) ||
|
|
+ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE)))
|
|
+ return 0;
|
|
+
|
|
+ /* Above i_size? */
|
|
+ if (index << PAGE_SHIFT >= inode->v.i_size)
|
|
+ return 0;
|
|
+
|
|
+ page = find_lock_page(mapping, index);
|
|
+ if (!page) {
|
|
+ /*
|
|
+ * XXX: we're doing two index lookups when we end up reading the
|
|
+ * page
|
|
+ */
|
|
+ ret = range_has_data(c,
|
|
+ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT),
|
|
+ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT));
|
|
+ if (ret <= 0)
|
|
+ return ret;
|
|
+
|
|
+ page = find_or_create_page(mapping, index, GFP_KERNEL);
|
|
+ if (unlikely(!page)) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ s = bch2_page_state_create(page, 0);
|
|
+ if (!s) {
|
|
+ ret = -ENOMEM;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ if (!PageUptodate(page)) {
|
|
+ ret = bch2_read_single_page(page, mapping);
|
|
+ if (ret)
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ if (index != start >> PAGE_SHIFT)
|
|
+ start_offset = 0;
|
|
+ if (index != end >> PAGE_SHIFT)
|
|
+ end_offset = PAGE_SIZE;
|
|
+
|
|
+ for (i = round_up(start_offset, block_bytes(c)) >> 9;
|
|
+ i < round_down(end_offset, block_bytes(c)) >> 9;
|
|
+ i++) {
|
|
+ s->s[i].nr_replicas = 0;
|
|
+ s->s[i].state = SECTOR_UNALLOCATED;
|
|
+ }
|
|
+
|
|
+ zero_user_segment(page, start_offset, end_offset);
|
|
+
|
|
+ /*
|
|
+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC.
|
|
+ *
|
|
+ * XXX: because we aren't currently tracking whether the page has actual
|
|
+ * data in it (vs. just 0s, or only partially written) this wrong. ick.
|
|
+ */
|
|
+ ret = bch2_get_page_disk_reservation(c, inode, page, false);
|
|
+ BUG_ON(ret);
|
|
+
|
|
+ /*
|
|
+ * This removes any writeable userspace mappings; we need to force
|
|
+ * .page_mkwrite to be called again before any mmapped writes, to
|
|
+ * redirty the full page:
|
|
+ */
|
|
+ page_mkclean(page);
|
|
+ __set_page_dirty_nobuffers(page);
|
|
+unlock:
|
|
+ unlock_page(page);
|
|
+ put_page(page);
|
|
+out:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
|
|
+{
|
|
+ return __bch2_truncate_page(inode, from >> PAGE_SHIFT,
|
|
+ from, round_up(from, PAGE_SIZE));
|
|
+}
|
|
+
|
|
+static int bch2_extend(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *inode_u,
|
|
+ struct iattr *iattr)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct address_space *mapping = inode->v.i_mapping;
|
|
+ int ret;
|
|
+
|
|
+ /*
|
|
+ * sync appends:
|
|
+ *
|
|
+ * this has to be done _before_ extending i_size:
|
|
+ */
|
|
+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ truncate_setsize(&inode->v, iattr->ia_size);
|
|
+ setattr_copy(&inode->v, iattr);
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_write_inode_size(c, inode, inode->v.i_size,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_truncate_finish_fn(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ void *p)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+
|
|
+ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
|
|
+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_truncate_start_fn(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi, void *p)
|
|
+{
|
|
+ u64 *new_i_size = p;
|
|
+
|
|
+ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY;
|
|
+ bi->bi_size = *new_i_size;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/*
 * Change the size of @inode to iattr->ia_size.
 *
 * Waits for in-flight direct IO, takes the pagecache block lock and reads
 * the current on-disk inode.  If the new size is larger than the in-memory
 * i_size the work is delegated to bch2_extend().  Otherwise the partial
 * page at the new end of file is zeroed, the relevant pagecache range is
 * written back, the inode is marked I_SIZE_DIRTY with the new size, the
 * pagecache is truncated, extents past the new size are punched out, and
 * finally the I_SIZE_DIRTY flag is cleared and the times are updated.
 *
 * Returns 0 on success or a negative error; the pagecache block lock is
 * released on every path.
 */
int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
{
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct address_space *mapping = inode->v.i_mapping;
	struct bch_inode_unpacked inode_u;
	struct btree_trans trans;
	struct btree_iter *iter;
	u64 new_i_size = iattr->ia_size;
	s64 i_sectors_delta = 0;
	int ret = 0;

	inode_dio_wait(&inode->v);
	bch2_pagecache_block_get(&inode->ei_pagecache_lock);

	/*
	 * fetch current on disk i_size: inode is locked, i_size can only
	 * increase underneath us:
	 */
	bch2_trans_init(&trans, c, 0, 0);
	iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0);
	ret = PTR_ERR_OR_ZERO(iter);
	bch2_trans_iter_put(&trans, iter);
	bch2_trans_exit(&trans);

	if (ret)
		goto err;

	/*
	 * check this before next assertion; on filesystem error our normal
	 * invariants are a bit broken (truncate has to truncate the page cache
	 * before the inode).
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) &&
		inode->v.i_size < inode_u.bi_size);

	/* Growing the file is handled entirely by bch2_extend() */
	if (iattr->ia_size > inode->v.i_size) {
		ret = bch2_extend(inode, &inode_u, iattr);
		goto err;
	}

	/* Zero the part of the last page that lies beyond the new size */
	ret = bch2_truncate_page(inode, iattr->ia_size);
	if (unlikely(ret))
		goto err;

	/*
	 * When extending, we're going to write the new i_size to disk
	 * immediately so we need to flush anything above the current on disk
	 * i_size first:
	 *
	 * Also, when extending we need to flush the page that i_size currently
	 * straddles - if it's mapped to userspace, we need to ensure that
	 * userspace has to redirty it and call .mkwrite -> set_page_dirty
	 * again to allocate the part of the page that was extended.
	 */
	if (iattr->ia_size > inode_u.bi_size)
		ret = filemap_write_and_wait_range(mapping,
				inode_u.bi_size,
				iattr->ia_size - 1);
	else if (iattr->ia_size & (PAGE_SIZE - 1))
		ret = filemap_write_and_wait_range(mapping,
				round_down(iattr->ia_size, PAGE_SIZE),
				iattr->ia_size - 1);
	if (ret)
		goto err;

	/* Record the new size on disk, flagged dirty until the punch is done */
	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
			       &new_i_size, 0);
	mutex_unlock(&inode->ei_update_lock);

	if (unlikely(ret))
		goto err;

	truncate_setsize(&inode->v, iattr->ia_size);

	/* Drop every extent from the (block-rounded) new size to the end */
	ret = bch2_fpunch(c, inode->v.i_ino,
			round_up(iattr->ia_size, block_bytes(c)) >> 9,
			U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
	/* Sector accounting is applied even if the punch reported an error */
	i_sectors_acct(c, inode, NULL, i_sectors_delta);

	if (unlikely(ret))
		goto err;

	setattr_copy(&inode->v, iattr);

	mutex_lock(&inode->ei_update_lock);
	ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL,
			       ATTR_MTIME|ATTR_CTIME);
	mutex_unlock(&inode->ei_update_lock);
err:
	bch2_pagecache_block_put(&inode->ei_pagecache_lock);
	return ret;
}
|
|
+
|
|
+/* fallocate: */
|
|
+
|
|
+static int inode_update_times_fn(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi, void *p)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+
|
|
+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
|
|
+ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
|
|
+ int ret = 0;
|
|
+
|
|
+ inode_lock(&inode->v);
|
|
+ inode_dio_wait(&inode->v);
|
|
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
|
|
+
|
|
+ ret = __bch2_truncate_page(inode,
|
|
+ offset >> PAGE_SHIFT,
|
|
+ offset, offset + len);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ if (offset >> PAGE_SHIFT !=
|
|
+ (offset + len) >> PAGE_SHIFT) {
|
|
+ ret = __bch2_truncate_page(inode,
|
|
+ (offset + len) >> PAGE_SHIFT,
|
|
+ offset, offset + len);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ truncate_pagecache_range(&inode->v, offset, offset + len - 1);
|
|
+
|
|
+ if (discard_start < discard_end) {
|
|
+ s64 i_sectors_delta = 0;
|
|
+
|
|
+ ret = bch2_fpunch(c, inode->v.i_ino,
|
|
+ discard_start, discard_end,
|
|
+ &inode->ei_journal_seq,
|
|
+ &i_sectors_delta);
|
|
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
|
|
+ }
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
|
|
+ ATTR_MTIME|ATTR_CTIME) ?: ret;
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+err:
|
|
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
|
|
+ inode_unlock(&inode->v);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
|
|
+ loff_t offset, loff_t len,
|
|
+ bool insert)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct address_space *mapping = inode->v.i_mapping;
|
|
+ struct bkey_buf copy;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *src, *dst, *del;
|
|
+ loff_t shift, new_size;
|
|
+ u64 src_start;
|
|
+ int ret = 0;
|
|
+
|
|
+ if ((offset | len) & (block_bytes(c) - 1))
|
|
+ return -EINVAL;
|
|
+
|
|
+ /*
|
|
+ * We need i_mutex to keep the page cache consistent with the extents
|
|
+ * btree, and the btree consistent with i_size - we don't need outside
|
|
+ * locking for the extents btree itself, because we're using linked
|
|
+ * iterators
|
|
+ */
|
|
+ inode_lock(&inode->v);
|
|
+ inode_dio_wait(&inode->v);
|
|
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
|
|
+
|
|
+ if (insert) {
|
|
+ ret = -EFBIG;
|
|
+ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
|
|
+ goto err;
|
|
+
|
|
+ ret = -EINVAL;
|
|
+ if (offset >= inode->v.i_size)
|
|
+ goto err;
|
|
+
|
|
+ src_start = U64_MAX;
|
|
+ shift = len;
|
|
+ } else {
|
|
+ ret = -EINVAL;
|
|
+ if (offset + len >= inode->v.i_size)
|
|
+ goto err;
|
|
+
|
|
+ src_start = offset + len;
|
|
+ shift = -len;
|
|
+ }
|
|
+
|
|
+ new_size = inode->v.i_size + shift;
|
|
+
|
|
+ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (insert) {
|
|
+ i_size_write(&inode->v, new_size);
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_write_inode_size(c, inode, new_size,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+ } else {
|
|
+ s64 i_sectors_delta = 0;
|
|
+
|
|
+ ret = bch2_fpunch(c, inode->v.i_ino,
|
|
+ offset >> 9, (offset + len) >> 9,
|
|
+ &inode->ei_journal_seq,
|
|
+ &i_sectors_delta);
|
|
+ i_sectors_acct(c, inode, NULL, i_sectors_delta);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_bkey_buf_init(©);
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
|
|
+ src = bch2_trans_get_iter(&trans, BTREE_ID_extents,
|
|
+ POS(inode->v.i_ino, src_start >> 9),
|
|
+ BTREE_ITER_INTENT);
|
|
+ dst = bch2_trans_copy_iter(&trans, src);
|
|
+ del = bch2_trans_copy_iter(&trans, src);
|
|
+
|
|
+ while (ret == 0 || ret == -EINTR) {
|
|
+ struct disk_reservation disk_res =
|
|
+ bch2_disk_reservation_init(c, 0);
|
|
+ struct bkey_i delete;
|
|
+ struct bkey_s_c k;
|
|
+ struct bpos next_pos;
|
|
+ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
|
|
+ struct bpos atomic_end;
|
|
+ unsigned trigger_flags = 0;
|
|
+
|
|
+ k = insert
|
|
+ ? bch2_btree_iter_peek_prev(src)
|
|
+ : bch2_btree_iter_peek(src);
|
|
+ if ((ret = bkey_err(k)))
|
|
+ continue;
|
|
+
|
|
+ if (!k.k || k.k->p.inode != inode->v.i_ino)
|
|
+ break;
|
|
+
|
|
+ if (insert &&
|
|
+ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
|
|
+ break;
|
|
+reassemble:
|
|
+ bch2_bkey_buf_reassemble(©, c, k);
|
|
+
|
|
+ if (insert &&
|
|
+ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
|
|
+ bch2_cut_front(move_pos, copy.k);
|
|
+
|
|
+ copy.k->k.p.offset += shift >> 9;
|
|
+ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k));
|
|
+
|
|
+ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
|
|
+ if (ret)
|
|
+ continue;
|
|
+
|
|
+ if (bkey_cmp(atomic_end, copy.k->k.p)) {
|
|
+ if (insert) {
|
|
+ move_pos = atomic_end;
|
|
+ move_pos.offset -= shift >> 9;
|
|
+ goto reassemble;
|
|
+ } else {
|
|
+ bch2_cut_back(atomic_end, copy.k);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bkey_init(&delete.k);
|
|
+ delete.k.p = copy.k->k.p;
|
|
+ delete.k.size = copy.k->k.size;
|
|
+ delete.k.p.offset -= shift >> 9;
|
|
+ bch2_btree_iter_set_pos(del, bkey_start_pos(&delete.k));
|
|
+
|
|
+ next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
|
|
+
|
|
+ if (copy.k->k.size == k.k->size) {
|
|
+ /*
|
|
+ * If we're moving the entire extent, we can skip
|
|
+ * running triggers:
|
|
+ */
|
|
+ trigger_flags |= BTREE_TRIGGER_NORUN;
|
|
+ } else {
|
|
+ /* We might end up splitting compressed extents: */
|
|
+ unsigned nr_ptrs =
|
|
+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
|
|
+
|
|
+ ret = bch2_disk_reservation_get(c, &disk_res,
|
|
+ copy.k->k.size, nr_ptrs,
|
|
+ BCH_DISK_RESERVATION_NOFAIL);
|
|
+ BUG_ON(ret);
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_update(&trans, del, &delete, trigger_flags) ?:
|
|
+ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
|
|
+ bch2_trans_commit(&trans, &disk_res,
|
|
+ &inode->ei_journal_seq,
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+ bch2_disk_reservation_put(c, &disk_res);
|
|
+
|
|
+ if (!ret)
|
|
+ bch2_btree_iter_set_pos(src, next_pos);
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, del);
|
|
+ bch2_trans_iter_put(&trans, dst);
|
|
+ bch2_trans_iter_put(&trans, src);
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_bkey_buf_exit(©, c);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (!insert) {
|
|
+ i_size_write(&inode->v, new_size);
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_write_inode_size(c, inode, new_size,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+ }
|
|
+err:
|
|
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
|
|
+ inode_unlock(&inode->v);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
|
|
+ u64 start_sector, u64 end_sector)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bpos end_pos = POS(inode->v.i_ino, end_sector);
|
|
+ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
|
|
+ POS(inode->v.i_ino, start_sector),
|
|
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
|
+
|
|
+ while (!ret && bkey_cmp(iter->pos, end_pos) < 0) {
|
|
+ s64 i_sectors_delta = 0;
|
|
+ struct disk_reservation disk_res = { 0 };
|
|
+ struct quota_res quota_res = { 0 };
|
|
+ struct bkey_i_reservation reservation;
|
|
+ struct bkey_s_c k;
|
|
+ unsigned sectors;
|
|
+
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ if ((ret = bkey_err(k)))
|
|
+ goto bkey_err;
|
|
+
|
|
+ /* already reserved */
|
|
+ if (k.k->type == KEY_TYPE_reservation &&
|
|
+ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
|
|
+ bch2_btree_iter_next_slot(iter);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (bkey_extent_is_data(k.k) &&
|
|
+ !(mode & FALLOC_FL_ZERO_RANGE)) {
|
|
+ bch2_btree_iter_next_slot(iter);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ bkey_reservation_init(&reservation.k_i);
|
|
+ reservation.k.type = KEY_TYPE_reservation;
|
|
+ reservation.k.p = k.k->p;
|
|
+ reservation.k.size = k.k->size;
|
|
+
|
|
+ bch2_cut_front(iter->pos, &reservation.k_i);
|
|
+ bch2_cut_back(end_pos, &reservation.k_i);
|
|
+
|
|
+ sectors = reservation.k.size;
|
|
+ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k);
|
|
+
|
|
+ if (!bkey_extent_is_allocation(k.k)) {
|
|
+ ret = bch2_quota_reservation_add(c, inode,
|
|
+ "a_res,
|
|
+ sectors, true);
|
|
+ if (unlikely(ret))
|
|
+ goto bkey_err;
|
|
+ }
|
|
+
|
|
+ if (reservation.v.nr_replicas < replicas ||
|
|
+ bch2_bkey_sectors_compressed(k)) {
|
|
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors,
|
|
+ replicas, 0);
|
|
+ if (unlikely(ret))
|
|
+ goto bkey_err;
|
|
+
|
|
+ reservation.v.nr_replicas = disk_res.nr_replicas;
|
|
+ }
|
|
+
|
|
+ ret = bch2_extent_update(&trans, iter, &reservation.k_i,
|
|
+ &disk_res, &inode->ei_journal_seq,
|
|
+ 0, &i_sectors_delta);
|
|
+ i_sectors_acct(c, inode, "a_res, i_sectors_delta);
|
|
+bkey_err:
|
|
+ bch2_quota_reservation_put(c, inode, "a_res);
|
|
+ bch2_disk_reservation_put(c, &disk_res);
|
|
+ if (ret == -EINTR)
|
|
+ ret = 0;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
|
|
+ loff_t offset, loff_t len)
|
|
+{
|
|
+ struct address_space *mapping = inode->v.i_mapping;
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ loff_t end = offset + len;
|
|
+ loff_t block_start = round_down(offset, block_bytes(c));
|
|
+ loff_t block_end = round_up(end, block_bytes(c));
|
|
+ int ret;
|
|
+
|
|
+ inode_lock(&inode->v);
|
|
+ inode_dio_wait(&inode->v);
|
|
+ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
|
|
+
|
|
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
|
|
+ ret = inode_newsize_ok(&inode->v, end);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (mode & FALLOC_FL_ZERO_RANGE) {
|
|
+ ret = __bch2_truncate_page(inode,
|
|
+ offset >> PAGE_SHIFT,
|
|
+ offset, end);
|
|
+
|
|
+ if (!ret &&
|
|
+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
|
|
+ ret = __bch2_truncate_page(inode,
|
|
+ end >> PAGE_SHIFT,
|
|
+ offset, end);
|
|
+
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ truncate_pagecache_range(&inode->v, offset, end - 1);
|
|
+ }
|
|
+
|
|
+ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ /*
|
|
+ * Do we need to extend the file?
|
|
+ *
|
|
+ * If we zeroed up to the end of the file, we dropped whatever writes
|
|
+ * were going to write out the current i_size, so we have to extend
|
|
+ * manually even if FL_KEEP_SIZE was set:
|
|
+ */
|
|
+ if (end >= inode->v.i_size &&
|
|
+ (!(mode & FALLOC_FL_KEEP_SIZE) ||
|
|
+ (mode & FALLOC_FL_ZERO_RANGE))) {
|
|
+
|
|
+ /*
|
|
+ * Sync existing appends before extending i_size,
|
|
+ * as in bch2_extend():
|
|
+ */
|
|
+ ret = filemap_write_and_wait_range(mapping,
|
|
+ inode->ei_inode.bi_size, S64_MAX);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (mode & FALLOC_FL_KEEP_SIZE)
|
|
+ end = inode->v.i_size;
|
|
+ else
|
|
+ i_size_write(&inode->v, end);
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_write_inode_size(c, inode, end, 0);
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+ }
|
|
+err:
|
|
+ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
|
|
+ inode_unlock(&inode->v);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+long bch2_fallocate_dispatch(struct file *file, int mode,
|
|
+ loff_t offset, loff_t len)
|
|
+{
|
|
+ struct bch_inode_info *inode = file_bch_inode(file);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ long ret;
|
|
+
|
|
+ if (!percpu_ref_tryget(&c->writes))
|
|
+ return -EROFS;
|
|
+
|
|
+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
|
|
+ ret = bchfs_fallocate(inode, mode, offset, len);
|
|
+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
|
|
+ ret = bchfs_fpunch(inode, offset, len);
|
|
+ else if (mode == FALLOC_FL_INSERT_RANGE)
|
|
+ ret = bchfs_fcollapse_finsert(inode, offset, len, true);
|
|
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
|
|
+ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
|
|
+ else
|
|
+ ret = -EOPNOTSUPP;
|
|
+
|
|
+ percpu_ref_put(&c->writes);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void mark_range_unallocated(struct bch_inode_info *inode,
|
|
+ loff_t start, loff_t end)
|
|
+{
|
|
+ pgoff_t index = start >> PAGE_SHIFT;
|
|
+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
|
|
+ struct pagevec pvec;
|
|
+
|
|
+ pagevec_init(&pvec);
|
|
+
|
|
+ do {
|
|
+ unsigned nr_pages, i, j;
|
|
+
|
|
+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
|
|
+ &index, end_index);
|
|
+ if (nr_pages == 0)
|
|
+ break;
|
|
+
|
|
+ for (i = 0; i < nr_pages; i++) {
|
|
+ struct page *page = pvec.pages[i];
|
|
+ struct bch_page_state *s;
|
|
+
|
|
+ lock_page(page);
|
|
+ s = bch2_page_state(page);
|
|
+
|
|
+ if (s) {
|
|
+ spin_lock(&s->lock);
|
|
+ for (j = 0; j < PAGE_SECTORS; j++)
|
|
+ s->s[j].nr_replicas = 0;
|
|
+ spin_unlock(&s->lock);
|
|
+ }
|
|
+
|
|
+ unlock_page(page);
|
|
+ }
|
|
+ pagevec_release(&pvec);
|
|
+ } while (index <= end_index);
|
|
+}
|
|
+
|
|
+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
|
|
+ struct file *file_dst, loff_t pos_dst,
|
|
+ loff_t len, unsigned remap_flags)
|
|
+{
|
|
+ struct bch_inode_info *src = file_bch_inode(file_src);
|
|
+ struct bch_inode_info *dst = file_bch_inode(file_dst);
|
|
+ struct bch_fs *c = src->v.i_sb->s_fs_info;
|
|
+ s64 i_sectors_delta = 0;
|
|
+ u64 aligned_len;
|
|
+ loff_t ret = 0;
|
|
+
|
|
+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY))
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (remap_flags & REMAP_FILE_DEDUP)
|
|
+ return -EOPNOTSUPP;
|
|
+
|
|
+ if ((pos_src & (block_bytes(c) - 1)) ||
|
|
+ (pos_dst & (block_bytes(c) - 1)))
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (src == dst &&
|
|
+ abs(pos_src - pos_dst) < len)
|
|
+ return -EINVAL;
|
|
+
|
|
+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
|
|
+
|
|
+ file_update_time(file_dst);
|
|
+
|
|
+ inode_dio_wait(&src->v);
|
|
+ inode_dio_wait(&dst->v);
|
|
+
|
|
+ ret = generic_remap_file_range_prep(file_src, pos_src,
|
|
+ file_dst, pos_dst,
|
|
+ &len, remap_flags);
|
|
+ if (ret < 0 || len == 0)
|
|
+ goto err;
|
|
+
|
|
+ aligned_len = round_up((u64) len, block_bytes(c));
|
|
+
|
|
+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping,
|
|
+ pos_dst, pos_dst + len - 1);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ mark_range_unallocated(src, pos_src, pos_src + aligned_len);
|
|
+
|
|
+ ret = bch2_remap_range(c,
|
|
+ POS(dst->v.i_ino, pos_dst >> 9),
|
|
+ POS(src->v.i_ino, pos_src >> 9),
|
|
+ aligned_len >> 9,
|
|
+ &dst->ei_journal_seq,
|
|
+ pos_dst + len, &i_sectors_delta);
|
|
+ if (ret < 0)
|
|
+ goto err;
|
|
+
|
|
+ /*
|
|
+ * due to alignment, we might have remapped slightly more than requsted
|
|
+ */
|
|
+ ret = min((u64) ret << 9, (u64) len);
|
|
+
|
|
+ /* XXX get a quota reservation */
|
|
+ i_sectors_acct(c, dst, NULL, i_sectors_delta);
|
|
+
|
|
+ spin_lock(&dst->v.i_lock);
|
|
+ if (pos_dst + ret > dst->v.i_size)
|
|
+ i_size_write(&dst->v, pos_dst + ret);
|
|
+ spin_unlock(&dst->v.i_lock);
|
|
+err:
|
|
+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* fseek: */
|
|
+
|
|
+/*
+ * Scan @page's per-sector state, starting at the sector containing byte
+ * @offset, for the first sector whose state is SECTOR_DIRTY or greater.
+ *
+ * Returns that sector's byte offset within the page, or -1 when the page has
+ * no bch_page_state attached or no such sector is found.
+ */
+static int page_data_offset(struct page *page, unsigned offset)
+{
+	struct bch_page_state *state = bch2_page_state(page);
+	unsigned sector = offset >> 9;
+
+	if (!state)
+		return -1;
+
+	while (sector < PAGE_SECTORS) {
+		if (state->s[sector].state >= SECTOR_DIRTY)
+			return sector << 9;
+		sector++;
+	}
+
+	return -1;
+}
|
|
+
|
|
+static loff_t bch2_seek_pagecache_data(struct inode *vinode,
|
|
+ loff_t start_offset,
|
|
+ loff_t end_offset)
|
|
+{
|
|
+ struct address_space *mapping = vinode->i_mapping;
|
|
+ struct page *page;
|
|
+ pgoff_t start_index = start_offset >> PAGE_SHIFT;
|
|
+ pgoff_t end_index = end_offset >> PAGE_SHIFT;
|
|
+ pgoff_t index = start_index;
|
|
+ loff_t ret;
|
|
+ int offset;
|
|
+
|
|
+ while (index <= end_index) {
|
|
+ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) {
|
|
+ lock_page(page);
|
|
+
|
|
+ offset = page_data_offset(page,
|
|
+ page->index == start_index
|
|
+ ? start_offset & (PAGE_SIZE - 1)
|
|
+ : 0);
|
|
+ if (offset >= 0) {
|
|
+ ret = clamp(((loff_t) page->index << PAGE_SHIFT) +
|
|
+ offset,
|
|
+ start_offset, end_offset);
|
|
+ unlock_page(page);
|
|
+ put_page(page);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ unlock_page(page);
|
|
+ put_page(page);
|
|
+ } else {
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return end_offset;
|
|
+}
|
|
+
|
|
+/*
+ * SEEK_DATA: find the first byte offset >= @offset that holds data.
+ *
+ * The extents btree is walked from @offset to find the start of the next data
+ * extent belonging to this inode; the page cache is then searched over
+ * [@offset, that position) via bch2_seek_pagecache_data(), and the earlier
+ * of the two wins.
+ *
+ * Returns the new file position (as set by vfs_setpos()), -ENXIO when @offset
+ * is at or beyond i_size or no data is found before i_size, or the error from
+ * the btree transaction.
+ */
+static loff_t bch2_seek_data(struct file *file, u64 offset)
+{
+	struct bch_inode_info *inode = file_bch_inode(file);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 isize, next_data = MAX_LFS_FILESIZE;
+	int ret;
+
+	isize = i_size_read(&inode->v);
+	if (offset >= isize)
+		return -ENXIO;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_extents,
+			   POS(inode->v.i_ino, offset >> 9), 0, k, ret) {
+		if (k.k->p.inode != inode->v.i_ino) {
+			break;
+		} else if (bkey_extent_is_data(k.k)) {
+			next_data = max(offset, bkey_start_offset(k.k) << 9);
+			break;
+		} else if (k.k->p.offset << 9 > isize) {
+			/*
+			 * Key offsets are in 512-byte sectors while isize is
+			 * in bytes, so convert sectors to bytes (<< 9) before
+			 * comparing. Once a non-data key ends past i_size
+			 * nothing further can yield a result below i_size.
+			 */
+			break;
+		}
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (ret)
+		return ret;
+
+	/* dirty page cache data may precede the next on-disk extent */
+	if (next_data > offset)
+		next_data = bch2_seek_pagecache_data(&inode->v,
+						     offset, next_data);
+
+	if (next_data >= isize)
+		return -ENXIO;
+
+	return vfs_setpos(file, next_data, MAX_LFS_FILESIZE);
+}
|
|
+
|
|
+/*
+ * Scan @page's per-sector state, starting at the sector containing byte
+ * @offset, for the first sector whose state is below SECTOR_DIRTY.
+ *
+ * Returns that sector's byte offset within the page, 0 when the page has no
+ * bch_page_state attached, or -1 when every remaining sector is at least
+ * SECTOR_DIRTY.
+ */
+static int __page_hole_offset(struct page *page, unsigned offset)
+{
+	struct bch_page_state *state = bch2_page_state(page);
+	unsigned sector = offset >> 9;
+
+	if (!state)
+		return 0;
+
+	while (sector < PAGE_SECTORS) {
+		if (state->s[sector].state < SECTOR_DIRTY)
+			return sector << 9;
+		sector++;
+	}
+
+	return -1;
+}
|
|
+
|
|
+/*
+ * Look for a hole in the page cache page covering byte @offset of @mapping.
+ *
+ * Returns @offset itself when no page is cached at that index, the absolute
+ * byte offset of the hole when __page_hole_offset() finds one in the page,
+ * or -1 when the page contains no hole at or after @offset.
+ */
+static loff_t page_hole_offset(struct address_space *mapping, loff_t offset)
+{
+	pgoff_t index = offset >> PAGE_SHIFT;
+	struct page *page;
+	int pg_offset;
+	loff_t ret = -1;
+
+	page = find_lock_page(mapping, index);
+	if (!page)
+		return offset;
+
+	pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1));
+	if (pg_offset >= 0)
+		ret = ((loff_t) index << PAGE_SHIFT) + pg_offset;
+
+	unlock_page(page);
+	/*
+	 * find_lock_page() returns the page with a reference held; drop it,
+	 * otherwise every lookup leaks a page reference.
+	 */
+	put_page(page);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Walk the page cache of @vinode one page at a time over
+ * [@start_offset, @end_offset) looking for a hole.
+ *
+ * Returns the offset of the first hole found (never less than @start_offset),
+ * or @end_offset if page_hole_offset() reports none within the range.
+ */
+static loff_t bch2_seek_pagecache_hole(struct inode *vinode,
+				       loff_t start_offset,
+				       loff_t end_offset)
+{
+	struct address_space *mapping = vinode->i_mapping;
+	loff_t pos = start_offset;
+
+	while (pos < end_offset) {
+		loff_t found = page_hole_offset(mapping, pos);
+
+		if (found >= 0 && found <= end_offset)
+			return max(start_offset, found);
+
+		/* step to the first byte of the following page */
+		pos += PAGE_SIZE;
+		pos &= PAGE_MASK;
+	}
+
+	return end_offset;
+}
|
|
+
|
|
+static loff_t bch2_seek_hole(struct file *file, u64 offset)
|
|
+{
|
|
+ struct bch_inode_info *inode = file_bch_inode(file);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ u64 isize, next_hole = MAX_LFS_FILESIZE;
|
|
+ int ret;
|
|
+
|
|
+ isize = i_size_read(&inode->v);
|
|
+ if (offset >= isize)
|
|
+ return -ENXIO;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
|
|
+ POS(inode->v.i_ino, offset >> 9),
|
|
+ BTREE_ITER_SLOTS, k, ret) {
|
|
+ if (k.k->p.inode != inode->v.i_ino) {
|
|
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
|
|
+ offset, MAX_LFS_FILESIZE);
|
|
+ break;
|
|
+ } else if (!bkey_extent_is_data(k.k)) {
|
|
+ next_hole = bch2_seek_pagecache_hole(&inode->v,
|
|
+ max(offset, bkey_start_offset(k.k) << 9),
|
|
+ k.k->p.offset << 9);
|
|
+
|
|
+ if (next_hole < k.k->p.offset << 9)
|
|
+ break;
|
|
+ } else {
|
|
+ offset = max(offset, bkey_start_offset(k.k) << 9);
|
|
+ }
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (next_hole > isize)
|
|
+ next_hole = isize;
|
|
+
|
|
+ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE);
|
|
+}
|
|
+
|
|
+/*
+ * llseek entry point: SEEK_DATA and SEEK_HOLE are handled by the bcachefs
+ * specific helpers, SEEK_SET/SEEK_CUR/SEEK_END are passed to
+ * generic_file_llseek(), and any other @whence yields -EINVAL.
+ */
+loff_t bch2_llseek(struct file *file, loff_t offset, int whence)
+{
+	if (whence == SEEK_DATA)
+		return bch2_seek_data(file, offset);
+
+	if (whence == SEEK_HOLE)
+		return bch2_seek_hole(file, offset);
+
+	if (whence == SEEK_SET ||
+	    whence == SEEK_CUR ||
+	    whence == SEEK_END)
+		return generic_file_llseek(file, offset, whence);
+
+	return -EINVAL;
+}
|
|
+
|
|
+void bch2_fs_fsio_exit(struct bch_fs *c)
|
|
+{
|
|
+ bioset_exit(&c->dio_write_bioset);
|
|
+ bioset_exit(&c->dio_read_bioset);
|
|
+ bioset_exit(&c->writepage_bioset);
|
|
+}
|
|
+
|
|
+int bch2_fs_fsio_init(struct bch_fs *c)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ pr_verbose_init(c->opts, "");
|
|
+
|
|
+ if (bioset_init(&c->writepage_bioset,
|
|
+ 4, offsetof(struct bch_writepage_io, op.wbio.bio),
|
|
+ BIOSET_NEED_BVECS) ||
|
|
+ bioset_init(&c->dio_read_bioset,
|
|
+ 4, offsetof(struct dio_read, rbio.bio),
|
|
+ BIOSET_NEED_BVECS) ||
|
|
+ bioset_init(&c->dio_write_bioset,
|
|
+ 4, offsetof(struct dio_write, op.wbio.bio),
|
|
+ BIOSET_NEED_BVECS))
|
|
+ ret = -ENOMEM;
|
|
+
|
|
+ pr_verbose_init(c->opts, "ret %i", ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#endif /* NO_BCACHEFS_FS */
|
|
diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h
|
|
new file mode 100644
|
|
index 000000000000..2537a3d25ede
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs-io.h
|
|
@@ -0,0 +1,56 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_FS_IO_H
|
|
+#define _BCACHEFS_FS_IO_H
|
|
+
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+
|
|
+#include "buckets.h"
|
|
+#include "io_types.h"
|
|
+
|
|
+#include <linux/uio.h>
|
|
+
|
|
+struct quota_res;
|
|
+
|
|
+int __must_check bch2_write_inode_size(struct bch_fs *,
|
|
+ struct bch_inode_info *,
|
|
+ loff_t, unsigned);
|
|
+
|
|
+int bch2_writepage(struct page *, struct writeback_control *);
|
|
+int bch2_readpage(struct file *, struct page *);
|
|
+
|
|
+int bch2_writepages(struct address_space *, struct writeback_control *);
|
|
+void bch2_readahead(struct readahead_control *);
|
|
+
|
|
+int bch2_write_begin(struct file *, struct address_space *, loff_t,
|
|
+ unsigned, unsigned, struct page **, void **);
|
|
+int bch2_write_end(struct file *, struct address_space *, loff_t,
|
|
+ unsigned, unsigned, struct page *, void *);
|
|
+
|
|
+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *);
|
|
+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
|
|
+
|
|
+int bch2_fsync(struct file *, loff_t, loff_t, int);
|
|
+
|
|
+int bch2_truncate(struct bch_inode_info *, struct iattr *);
|
|
+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t);
|
|
+
|
|
+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *,
|
|
+ loff_t, loff_t, unsigned);
|
|
+
|
|
+loff_t bch2_llseek(struct file *, loff_t, int);
|
|
+
|
|
+vm_fault_t bch2_page_fault(struct vm_fault *);
|
|
+vm_fault_t bch2_page_mkwrite(struct vm_fault *);
|
|
+void bch2_invalidatepage(struct page *, unsigned int, unsigned int);
|
|
+int bch2_releasepage(struct page *, gfp_t);
|
|
+int bch2_migrate_page(struct address_space *, struct page *,
|
|
+ struct page *, enum migrate_mode);
|
|
+
|
|
+void bch2_fs_fsio_exit(struct bch_fs *);
|
|
+int bch2_fs_fsio_init(struct bch_fs *);
|
|
+#else
|
|
+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {}
|
|
+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; }
|
|
+#endif
|
|
+
|
|
+#endif /* _BCACHEFS_FS_IO_H */
|
|
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
|
|
new file mode 100644
|
|
index 000000000000..eb871634eeae
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs-ioctl.c
|
|
@@ -0,0 +1,312 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "chardev.h"
|
|
+#include "dirent.h"
|
|
+#include "fs.h"
|
|
+#include "fs-common.h"
|
|
+#include "fs-ioctl.h"
|
|
+#include "quota.h"
|
|
+
|
|
+#include <linux/compat.h>
|
|
+#include <linux/mount.h>
|
|
+
|
|
+#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32)
|
|
+
|
|
+struct flags_set {
|
|
+ unsigned mask;
|
|
+ unsigned flags;
|
|
+
|
|
+ unsigned projid;
|
|
+};
|
|
+
|
|
+static int bch2_inode_flags_set(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ void *p)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ /*
|
|
+ * We're relying on btree locking here for exclusion with other ioctl
|
|
+ * calls - use the flags in the btree (@bi), not inode->i_flags:
|
|
+ */
|
|
+ struct flags_set *s = p;
|
|
+ unsigned newflags = s->flags;
|
|
+ unsigned oldflags = bi->bi_flags & s->mask;
|
|
+
|
|
+ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) &&
|
|
+ !capable(CAP_LINUX_IMMUTABLE))
|
|
+ return -EPERM;
|
|
+
|
|
+ if (!S_ISREG(bi->bi_mode) &&
|
|
+ !S_ISDIR(bi->bi_mode) &&
|
|
+ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags)
|
|
+ return -EINVAL;
|
|
+
|
|
+ bi->bi_flags &= ~s->mask;
|
|
+ bi->bi_flags |= newflags;
|
|
+
|
|
+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v));
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg)
|
|
+{
|
|
+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags);
|
|
+
|
|
+ return put_user(flags, arg);
|
|
+}
|
|
+
|
|
+static int bch2_ioc_setflags(struct bch_fs *c,
|
|
+ struct file *file,
|
|
+ struct bch_inode_info *inode,
|
|
+ void __user *arg)
|
|
+{
|
|
+ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) };
|
|
+ unsigned uflags;
|
|
+ int ret;
|
|
+
|
|
+ if (get_user(uflags, (int __user *) arg))
|
|
+ return -EFAULT;
|
|
+
|
|
+ s.flags = map_flags_rev(bch_flags_to_uflags, uflags);
|
|
+ if (uflags)
|
|
+ return -EOPNOTSUPP;
|
|
+
|
|
+ ret = mnt_want_write_file(file);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ inode_lock(&inode->v);
|
|
+ if (!inode_owner_or_capable(&inode->v)) {
|
|
+ ret = -EACCES;
|
|
+ goto setflags_out;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s,
|
|
+ ATTR_CTIME);
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+
|
|
+setflags_out:
|
|
+ inode_unlock(&inode->v);
|
|
+ mnt_drop_write_file(file);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
+ * FS_IOC_FSGETXATTR: report the inode's flags (translated through
+ * bch_flags_to_xflags) and its project quota id to userspace.
+ *
+ * Returns 0 on success or -EFAULT if @arg cannot be written.
+ */
+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode,
+			       struct fsxattr __user *arg)
+{
+	struct fsxattr fa = { 0 };
+
+	fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags);
+	fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ];
+
+	/*
+	 * copy_to_user() returns the number of bytes it could NOT copy, not
+	 * an errno, so it must not be returned to the ioctl caller directly.
+	 */
+	if (copy_to_user(arg, &fa, sizeof(fa)))
+		return -EFAULT;
+
+	return 0;
+}
|
|
+
|
|
+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ void *p)
|
|
+{
|
|
+ struct flags_set *s = p;
|
|
+
|
|
+ if (s->projid != bi->bi_project) {
|
|
+ bi->bi_fields_set |= 1U << Inode_opt_project;
|
|
+ bi->bi_project = s->projid;
|
|
+ }
|
|
+
|
|
+ return bch2_inode_flags_set(inode, bi, p);
|
|
+}
|
|
+
|
|
+static int bch2_ioc_fssetxattr(struct bch_fs *c,
|
|
+ struct file *file,
|
|
+ struct bch_inode_info *inode,
|
|
+ struct fsxattr __user *arg)
|
|
+{
|
|
+ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) };
|
|
+ struct fsxattr fa;
|
|
+ int ret;
|
|
+
|
|
+ if (copy_from_user(&fa, arg, sizeof(fa)))
|
|
+ return -EFAULT;
|
|
+
|
|
+ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags);
|
|
+ if (fa.fsx_xflags)
|
|
+ return -EOPNOTSUPP;
|
|
+
|
|
+ if (fa.fsx_projid >= U32_MAX)
|
|
+ return -EINVAL;
|
|
+
|
|
+ /*
|
|
+ * inode fields accessible via the xattr interface are stored with a +1
|
|
+ * bias, so that 0 means unset:
|
|
+ */
|
|
+ s.projid = fa.fsx_projid + 1;
|
|
+
|
|
+ ret = mnt_want_write_file(file);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ inode_lock(&inode->v);
|
|
+ if (!inode_owner_or_capable(&inode->v)) {
|
|
+ ret = -EACCES;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_set_projid(c, inode, fa.fsx_projid);
|
|
+ if (ret)
|
|
+ goto err_unlock;
|
|
+
|
|
+ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s,
|
|
+ ATTR_CTIME);
|
|
+err_unlock:
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+err:
|
|
+ inode_unlock(&inode->v);
|
|
+ mnt_drop_write_file(file);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ void *p)
|
|
+{
|
|
+ struct bch_inode_info *dir = p;
|
|
+
|
|
+ return !bch2_reinherit_attrs(bi, &dir->ei_inode);
|
|
+}
|
|
+
|
|
+static int bch2_ioc_reinherit_attrs(struct bch_fs *c,
|
|
+ struct file *file,
|
|
+ struct bch_inode_info *src,
|
|
+ const char __user *name)
|
|
+{
|
|
+ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode);
|
|
+ struct bch_inode_info *dst;
|
|
+ struct inode *vinode = NULL;
|
|
+ char *kname = NULL;
|
|
+ struct qstr qstr;
|
|
+ int ret = 0;
|
|
+ u64 inum;
|
|
+
|
|
+ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL);
|
|
+ if (!kname)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ret = strncpy_from_user(kname, name, BCH_NAME_MAX);
|
|
+ if (unlikely(ret < 0))
|
|
+ goto err1;
|
|
+
|
|
+ qstr.len = ret;
|
|
+ qstr.name = kname;
|
|
+
|
|
+ ret = -ENOENT;
|
|
+ inum = bch2_dirent_lookup(c, src->v.i_ino, &hash,
|
|
+ &qstr);
|
|
+ if (!inum)
|
|
+ goto err1;
|
|
+
|
|
+ vinode = bch2_vfs_inode_get(c, inum);
|
|
+ ret = PTR_ERR_OR_ZERO(vinode);
|
|
+ if (ret)
|
|
+ goto err1;
|
|
+
|
|
+ dst = to_bch_ei(vinode);
|
|
+
|
|
+ ret = mnt_want_write_file(file);
|
|
+ if (ret)
|
|
+ goto err2;
|
|
+
|
|
+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst);
|
|
+
|
|
+ if (inode_attr_changing(src, dst, Inode_opt_project)) {
|
|
+ ret = bch2_fs_quota_transfer(c, dst,
|
|
+ src->ei_qid,
|
|
+ 1 << QTYP_PRJ,
|
|
+ KEY_TYPE_QUOTA_PREALLOC);
|
|
+ if (ret)
|
|
+ goto err3;
|
|
+ }
|
|
+
|
|
+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0);
|
|
+err3:
|
|
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst);
|
|
+
|
|
+ /* return true if we did work */
|
|
+ if (ret >= 0)
|
|
+ ret = !ret;
|
|
+
|
|
+ mnt_drop_write_file(file);
|
|
+err2:
|
|
+ iput(vinode);
|
|
+err1:
|
|
+ kfree(kname);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
|
+{
|
|
+ struct bch_inode_info *inode = file_bch_inode(file);
|
|
+ struct super_block *sb = inode->v.i_sb;
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+
|
|
+ switch (cmd) {
|
|
+ case FS_IOC_GETFLAGS:
|
|
+ return bch2_ioc_getflags(inode, (int __user *) arg);
|
|
+
|
|
+ case FS_IOC_SETFLAGS:
|
|
+ return bch2_ioc_setflags(c, file, inode, (int __user *) arg);
|
|
+
|
|
+ case FS_IOC_FSGETXATTR:
|
|
+ return bch2_ioc_fsgetxattr(inode, (void __user *) arg);
|
|
+ case FS_IOC_FSSETXATTR:
|
|
+ return bch2_ioc_fssetxattr(c, file, inode,
|
|
+ (void __user *) arg);
|
|
+
|
|
+ case BCHFS_IOC_REINHERIT_ATTRS:
|
|
+ return bch2_ioc_reinherit_attrs(c, file, inode,
|
|
+ (void __user *) arg);
|
|
+
|
|
+ case FS_IOC_GETVERSION:
|
|
+ return -ENOTTY;
|
|
+ case FS_IOC_SETVERSION:
|
|
+ return -ENOTTY;
|
|
+
|
|
+ case FS_IOC_GOINGDOWN:
|
|
+ if (!capable(CAP_SYS_ADMIN))
|
|
+ return -EPERM;
|
|
+
|
|
+ down_write(&sb->s_umount);
|
|
+ sb->s_flags |= SB_RDONLY;
|
|
+ if (bch2_fs_emergency_read_only(c))
|
|
+ bch_err(c, "emergency read only due to ioctl");
|
|
+ up_write(&sb->s_umount);
|
|
+ return 0;
|
|
+
|
|
+ default:
|
|
+ return bch2_fs_ioctl(c, cmd, (void __user *) arg);
|
|
+ }
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_COMPAT
|
|
+/*
+ * 32-bit compat ioctl entry point: translate the FS_IOC32_* flag commands to
+ * their native equivalents and forward them to bch2_fs_file_ioctl() with the
+ * user pointer converted by compat_ptr(). Any other command returns
+ * -ENOIOCTLCMD.
+ */
+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg)
+{
+	/* These are just misnamed, they actually get/put from/to user an int */
+	switch (cmd) {
+	case FS_IOC32_GETFLAGS:
+		/* must match the 32-bit command number, not the native one */
+		cmd = FS_IOC_GETFLAGS;
+		break;
+	case FS_IOC32_SETFLAGS:
+		cmd = FS_IOC_SETFLAGS;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
+}
|
|
+#endif
|
|
+
|
|
+#endif /* NO_BCACHEFS_FS */
|
|
diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h
|
|
new file mode 100644
|
|
index 000000000000..f201980ef2c3
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs-ioctl.h
|
|
@@ -0,0 +1,81 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_FS_IOCTL_H
|
|
+#define _BCACHEFS_FS_IOCTL_H
|
|
+
|
|
+/* Inode flags: */
|
|
+
|
|
+/* bcachefs inode flags -> vfs inode flags: */
|
|
+static const unsigned bch_flags_to_vfs[] = {
|
|
+ [__BCH_INODE_SYNC] = S_SYNC,
|
|
+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE,
|
|
+ [__BCH_INODE_APPEND] = S_APPEND,
|
|
+ [__BCH_INODE_NOATIME] = S_NOATIME,
|
|
+};
|
|
+
|
|
+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */
|
|
+static const unsigned bch_flags_to_uflags[] = {
|
|
+ [__BCH_INODE_SYNC] = FS_SYNC_FL,
|
|
+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL,
|
|
+ [__BCH_INODE_APPEND] = FS_APPEND_FL,
|
|
+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL,
|
|
+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL,
|
|
+};
|
|
+
|
|
+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */
|
|
+static const unsigned bch_flags_to_xflags[] = {
|
|
+ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC,
|
|
+ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE,
|
|
+ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND,
|
|
+ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP,
|
|
+ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME,
|
|
+ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT;
|
|
+};
|
|
+
|
|
+#define set_flags(_map, _in, _out) \
|
|
+do { \
|
|
+ unsigned _i; \
|
|
+ \
|
|
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
|
|
+ if ((_in) & (1 << _i)) \
|
|
+ (_out) |= _map[_i]; \
|
|
+ else \
|
|
+ (_out) &= ~_map[_i]; \
|
|
+} while (0)
|
|
+
|
|
+#define map_flags(_map, _in) \
|
|
+({ \
|
|
+ unsigned _out = 0; \
|
|
+ \
|
|
+ set_flags(_map, _in, _out); \
|
|
+ _out; \
|
|
+})
|
|
+
|
|
+#define map_flags_rev(_map, _in) \
|
|
+({ \
|
|
+ unsigned _i, _out = 0; \
|
|
+ \
|
|
+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \
|
|
+ if ((_in) & _map[_i]) { \
|
|
+ (_out) |= 1 << _i; \
|
|
+ (_in) &= ~_map[_i]; \
|
|
+ } \
|
|
+ (_out); \
|
|
+})
|
|
+
|
|
+#define map_defined(_map) \
|
|
+({ \
|
|
+ unsigned _in = ~0; \
|
|
+ \
|
|
+ map_flags_rev(_map, _in); \
|
|
+})
|
|
+
|
|
+/* Set VFS inode flags from bcachefs inode: */
|
|
+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
|
|
+{
|
|
+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
|
|
+}
|
|
+
|
|
+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
|
|
+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
|
|
+
|
|
+#endif /* _BCACHEFS_FS_IOCTL_H */
|
|
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
|
|
new file mode 100644
|
|
index 000000000000..b1bbec3f96bd
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs.c
|
|
@@ -0,0 +1,1649 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "acl.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "chardev.h"
|
|
+#include "dirent.h"
|
|
+#include "extents.h"
|
|
+#include "fs.h"
|
|
+#include "fs-common.h"
|
|
+#include "fs-io.h"
|
|
+#include "fs-ioctl.h"
|
|
+#include "fsck.h"
|
|
+#include "inode.h"
|
|
+#include "io.h"
|
|
+#include "journal.h"
|
|
+#include "keylist.h"
|
|
+#include "quota.h"
|
|
+#include "super.h"
|
|
+#include "xattr.h"
|
|
+
|
|
+#include <linux/aio.h>
|
|
+#include <linux/backing-dev.h>
|
|
+#include <linux/exportfs.h>
|
|
+#include <linux/fiemap.h>
|
|
+#include <linux/module.h>
|
|
+#include <linux/posix_acl.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/statfs.h>
|
|
+#include <linux/xattr.h>
|
|
+
|
|
+static struct kmem_cache *bch2_inode_cache;
|
|
+
|
|
+static void bch2_vfs_inode_init(struct bch_fs *,
|
|
+ struct bch_inode_info *,
|
|
+ struct bch_inode_unpacked *);
|
|
+
|
|
+static void journal_seq_copy(struct bch_fs *c,
|
|
+ struct bch_inode_info *dst,
|
|
+ u64 journal_seq)
|
|
+{
|
|
+ /*
|
|
+ * atomic64_cmpxchg has a fallback for archs that don't support it,
|
|
+ * cmpxchg does not:
|
|
+ */
|
|
+ atomic64_t *dst_seq = (void *) &dst->ei_journal_seq;
|
|
+ u64 old, v = READ_ONCE(dst->ei_journal_seq);
|
|
+
|
|
+ do {
|
|
+ old = v;
|
|
+
|
|
+ if (old >= journal_seq)
|
|
+ break;
|
|
+ } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old);
|
|
+
|
|
+ bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq);
|
|
+}
|
|
+
|
|
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
|
|
+{
|
|
+ BUG_ON(atomic_long_read(&lock->v) == 0);
|
|
+
|
|
+ if (atomic_long_sub_return_release(i, &lock->v) == 0)
|
|
+ wake_up_all(&lock->wait);
|
|
+}
|
|
+
|
|
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
|
|
+{
|
|
+ long v = atomic_long_read(&lock->v), old;
|
|
+
|
|
+ do {
|
|
+ old = v;
|
|
+
|
|
+ if (i > 0 ? v < 0 : v > 0)
|
|
+ return false;
|
|
+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
|
|
+ old, old + i)) != old);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
|
|
+{
|
|
+ wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
|
|
+}
|
|
+
|
|
+void bch2_pagecache_add_put(struct pagecache_lock *lock)
|
|
+{
|
|
+ __pagecache_lock_put(lock, 1);
|
|
+}
|
|
+
|
|
+bool bch2_pagecache_add_tryget(struct pagecache_lock *lock)
|
|
+{
|
|
+ return __pagecache_lock_tryget(lock, 1);
|
|
+}
|
|
+
|
|
+void bch2_pagecache_add_get(struct pagecache_lock *lock)
|
|
+{
|
|
+ __pagecache_lock_get(lock, 1);
|
|
+}
|
|
+
|
|
+void bch2_pagecache_block_put(struct pagecache_lock *lock)
|
|
+{
|
|
+ __pagecache_lock_put(lock, -1);
|
|
+}
|
|
+
|
|
+void bch2_pagecache_block_get(struct pagecache_lock *lock)
|
|
+{
|
|
+ __pagecache_lock_get(lock, -1);
|
|
+}
|
|
+
|
|
+void bch2_inode_update_after_write(struct bch_fs *c,
|
|
+ struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ unsigned fields)
|
|
+{
|
|
+ set_nlink(&inode->v, bch2_inode_nlink_get(bi));
|
|
+ i_uid_write(&inode->v, bi->bi_uid);
|
|
+ i_gid_write(&inode->v, bi->bi_gid);
|
|
+ inode->v.i_mode = bi->bi_mode;
|
|
+
|
|
+ if (fields & ATTR_ATIME)
|
|
+ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime);
|
|
+ if (fields & ATTR_MTIME)
|
|
+ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime);
|
|
+ if (fields & ATTR_CTIME)
|
|
+ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime);
|
|
+
|
|
+ inode->ei_inode = *bi;
|
|
+
|
|
+ bch2_inode_flags_to_vfs(inode);
|
|
+}
|
|
+
|
|
+int __must_check bch2_write_inode(struct bch_fs *c,
|
|
+ struct bch_inode_info *inode,
|
|
+ inode_set_fn set,
|
|
+ void *p, unsigned fields)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 256);
|
|
+retry:
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
|
|
+ BTREE_ITER_INTENT);
|
|
+ ret = PTR_ERR_OR_ZERO(iter) ?:
|
|
+ (set ? set(inode, &inode_u, p) : 0) ?:
|
|
+ bch2_inode_write(&trans, iter, &inode_u) ?:
|
|
+ bch2_trans_commit(&trans, NULL,
|
|
+ &inode->ei_journal_seq,
|
|
+ BTREE_INSERT_NOUNLOCK|
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+
|
|
+ /*
|
|
+ * the btree node lock protects inode->ei_inode, not ei_update_lock;
|
|
+ * this is important for inode updates via bchfs_write_index_update
|
|
+ */
|
|
+ if (!ret)
|
|
+ bch2_inode_update_after_write(c, inode, &inode_u, fields);
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret < 0 ? ret : 0;
|
|
+}
|
|
+
|
|
+int bch2_fs_quota_transfer(struct bch_fs *c,
|
|
+ struct bch_inode_info *inode,
|
|
+ struct bch_qid new_qid,
|
|
+ unsigned qtypes,
|
|
+ enum quota_acct_mode mode)
|
|
+{
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ qtypes &= enabled_qtypes(c);
|
|
+
|
|
+ for (i = 0; i < QTYP_NR; i++)
|
|
+ if (new_qid.q[i] == inode->ei_qid.q[i])
|
|
+ qtypes &= ~(1U << i);
|
|
+
|
|
+ if (!qtypes)
|
|
+ return 0;
|
|
+
|
|
+ mutex_lock(&inode->ei_quota_lock);
|
|
+
|
|
+ ret = bch2_quota_transfer(c, qtypes, new_qid,
|
|
+ inode->ei_qid,
|
|
+ inode->v.i_blocks +
|
|
+ inode->ei_quota_reserved,
|
|
+ mode);
|
|
+ if (!ret)
|
|
+ for (i = 0; i < QTYP_NR; i++)
|
|
+ if (qtypes & (1 << i))
|
|
+ inode->ei_qid.q[i] = new_qid.q[i];
|
|
+
|
|
+ mutex_unlock(&inode->ei_quota_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum)
|
|
+{
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+ struct bch_inode_info *inode;
|
|
+ int ret;
|
|
+
|
|
+ inode = to_bch_ei(iget_locked(c->vfs_sb, inum));
|
|
+ if (unlikely(!inode))
|
|
+ return ERR_PTR(-ENOMEM);
|
|
+ if (!(inode->v.i_state & I_NEW))
|
|
+ return &inode->v;
|
|
+
|
|
+ ret = bch2_inode_find_by_inum(c, inum, &inode_u);
|
|
+ if (ret) {
|
|
+ iget_failed(&inode->v);
|
|
+ return ERR_PTR(ret);
|
|
+ }
|
|
+
|
|
+ bch2_vfs_inode_init(c, inode, &inode_u);
|
|
+
|
|
+ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum);
|
|
+
|
|
+ unlock_new_inode(&inode->v);
|
|
+
|
|
+ return &inode->v;
|
|
+}
|
|
+
|
|
+/*
+ * Inode-cache match callback (passed to inode_insert5()): @p points at an
+ * unsigned long inode number; returns nonzero when it equals @inode's i_ino.
+ */
+static int inum_test(struct inode *inode, void *p)
+{
+	const unsigned long *wanted = p;
+
+	return inode->i_ino == *wanted;
+}
|
|
+
|
|
+static struct bch_inode_info *
|
|
+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry,
|
|
+ umode_t mode, dev_t rdev, bool tmpfile)
|
|
+{
|
|
+ struct bch_fs *c = dir->v.i_sb->s_fs_info;
|
|
+ struct user_namespace *ns = dir->v.i_sb->s_user_ns;
|
|
+ struct btree_trans trans;
|
|
+ struct bch_inode_unpacked dir_u;
|
|
+ struct bch_inode_info *inode, *old;
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+ struct posix_acl *default_acl = NULL, *acl = NULL;
|
|
+ u64 journal_seq = 0;
|
|
+ int ret;
|
|
+
|
|
+ /*
|
|
+ * preallocate acls + vfs inode before btree transaction, so that
|
|
+ * nothing can fail after the transaction succeeds:
|
|
+ */
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl);
|
|
+ if (ret)
|
|
+ return ERR_PTR(ret);
|
|
+#endif
|
|
+ inode = to_bch_ei(new_inode(c->vfs_sb));
|
|
+ if (unlikely(!inode)) {
|
|
+ inode = ERR_PTR(-ENOMEM);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_inode_init_early(c, &inode_u);
|
|
+
|
|
+ if (!tmpfile)
|
|
+ mutex_lock(&dir->ei_update_lock);
|
|
+
|
|
+ bch2_trans_init(&trans, c, 8,
|
|
+ 2048 + (!tmpfile ? dentry->d_name.len : 0));
|
|
+retry:
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u,
|
|
+ !tmpfile ? &dentry->d_name : NULL,
|
|
+ from_kuid(ns, current_fsuid()),
|
|
+ from_kgid(ns, current_fsgid()),
|
|
+ mode, rdev,
|
|
+ default_acl, acl) ?:
|
|
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1,
|
|
+ KEY_TYPE_QUOTA_PREALLOC);
|
|
+ if (unlikely(ret))
|
|
+ goto err_before_quota;
|
|
+
|
|
+ ret = bch2_trans_commit(&trans, NULL, &journal_seq,
|
|
+ BTREE_INSERT_NOUNLOCK);
|
|
+ if (unlikely(ret)) {
|
|
+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1,
|
|
+ KEY_TYPE_QUOTA_WARN);
|
|
+err_before_quota:
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+ goto err_trans;
|
|
+ }
|
|
+
|
|
+ if (!tmpfile) {
|
|
+ bch2_inode_update_after_write(c, dir, &dir_u,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ journal_seq_copy(c, dir, journal_seq);
|
|
+ mutex_unlock(&dir->ei_update_lock);
|
|
+ }
|
|
+
|
|
+ bch2_vfs_inode_init(c, inode, &inode_u);
|
|
+ journal_seq_copy(c, inode, journal_seq);
|
|
+
|
|
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
|
|
+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl);
|
|
+
|
|
+ /*
|
|
+ * we must insert the new inode into the inode cache before calling
|
|
+ * bch2_trans_exit() and dropping locks, else we could race with another
|
|
+ * thread pulling the inode in and modifying it:
|
|
+ */
|
|
+
|
|
+ inode->v.i_state |= I_CREATING;
|
|
+ old = to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino,
|
|
+ inum_test, NULL, &inode->v.i_ino));
|
|
+ BUG_ON(!old);
|
|
+
|
|
+ if (unlikely(old != inode)) {
|
|
+ /*
|
|
+ * We raced, another process pulled the new inode into cache
|
|
+ * before us:
|
|
+ */
|
|
+ journal_seq_copy(c, old, journal_seq);
|
|
+ make_bad_inode(&inode->v);
|
|
+ iput(&inode->v);
|
|
+
|
|
+ inode = old;
|
|
+ } else {
|
|
+ /*
|
|
+ * we really don't want insert_inode_locked2() to be setting
|
|
+ * I_NEW...
|
|
+ */
|
|
+ unlock_new_inode(&inode->v);
|
|
+ }
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+err:
|
|
+ posix_acl_release(default_acl);
|
|
+ posix_acl_release(acl);
|
|
+ return inode;
|
|
+err_trans:
|
|
+ if (!tmpfile)
|
|
+ mutex_unlock(&dir->ei_update_lock);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ make_bad_inode(&inode->v);
|
|
+ iput(&inode->v);
|
|
+ inode = ERR_PTR(ret);
|
|
+ goto err;
|
|
+}
|
|
+
|
|
+/* methods */
|
|
+
|
|
+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
|
|
+ unsigned int flags)
|
|
+{
|
|
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *dir = to_bch_ei(vdir);
|
|
+ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
|
|
+ struct inode *vinode = NULL;
|
|
+ u64 inum;
|
|
+
|
|
+ inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash,
|
|
+ &dentry->d_name);
|
|
+
|
|
+ if (inum)
|
|
+ vinode = bch2_vfs_inode_get(c, inum);
|
|
+
|
|
+ return d_splice_alias(vinode, dentry);
|
|
+}
|
|
+
|
|
+/*
+ * Create a new (non-tmpfile) inode named by @dentry in directory @vdir via
+ * __bch2_create() and attach it to @dentry.
+ *
+ * Returns 0 on success, or the error encoded in the pointer returned by
+ * __bch2_create().
+ */
+static int bch2_mknod(struct inode *vdir, struct dentry *dentry,
+		      umode_t mode, dev_t rdev)
+{
+	struct bch_inode_info *dir = to_bch_ei(vdir);
+	struct bch_inode_info *created;
+
+	created = __bch2_create(dir, dentry, mode, rdev, false);
+	if (IS_ERR(created))
+		return PTR_ERR(created);
+
+	d_instantiate(dentry, &created->v);
+	return 0;
+}
|
|
+
|
|
+static int bch2_create(struct inode *vdir, struct dentry *dentry,
|
|
+ umode_t mode, bool excl)
|
|
+{
|
|
+ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0);
|
|
+}
|
|
+
|
|
+static int __bch2_link(struct bch_fs *c,
|
|
+ struct bch_inode_info *inode,
|
|
+ struct bch_inode_info *dir,
|
|
+ struct dentry *dentry)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct bch_inode_unpacked dir_u, inode_u;
|
|
+ int ret;
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ bch2_trans_init(&trans, c, 4, 1024);
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq,
|
|
+ BTREE_INSERT_NOUNLOCK,
|
|
+ bch2_link_trans(&trans,
|
|
+ dir->v.i_ino,
|
|
+ inode->v.i_ino, &dir_u, &inode_u,
|
|
+ &dentry->d_name));
|
|
+
|
|
+ if (likely(!ret)) {
|
|
+ BUG_ON(inode_u.bi_inum != inode->v.i_ino);
|
|
+
|
|
+ journal_seq_copy(c, inode, dir->ei_journal_seq);
|
|
+ bch2_inode_update_after_write(c, dir, &dir_u,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME);
|
|
+ }
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
|
|
+ struct dentry *dentry)
|
|
+{
|
|
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *dir = to_bch_ei(vdir);
|
|
+ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode);
|
|
+ int ret;
|
|
+
|
|
+ lockdep_assert_held(&inode->v.i_rwsem);
|
|
+
|
|
+ ret = __bch2_link(c, inode, dir, dentry);
|
|
+ if (unlikely(ret))
|
|
+ return ret;
|
|
+
|
|
+ ihold(&inode->v);
|
|
+ d_instantiate(dentry, &inode->v);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
|
|
+{
|
|
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *dir = to_bch_ei(vdir);
|
|
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
|
|
+ struct bch_inode_unpacked dir_u, inode_u;
|
|
+ struct btree_trans trans;
|
|
+ int ret;
|
|
+
|
|
+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
|
|
+ bch2_trans_init(&trans, c, 4, 1024);
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq,
|
|
+ BTREE_INSERT_NOUNLOCK|
|
|
+ BTREE_INSERT_NOFAIL,
|
|
+ bch2_unlink_trans(&trans,
|
|
+ dir->v.i_ino, &dir_u,
|
|
+ &inode_u, &dentry->d_name));
|
|
+
|
|
+ if (likely(!ret)) {
|
|
+ BUG_ON(inode_u.bi_inum != inode->v.i_ino);
|
|
+
|
|
+ journal_seq_copy(c, inode, dir->ei_journal_seq);
|
|
+ bch2_inode_update_after_write(c, dir, &dir_u,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ bch2_inode_update_after_write(c, inode, &inode_u,
|
|
+ ATTR_MTIME);
|
|
+ }
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_symlink(struct inode *vdir, struct dentry *dentry,
|
|
+ const char *symname)
|
|
+{
|
|
+ struct bch_fs *c = vdir->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *dir = to_bch_ei(vdir), *inode;
|
|
+ int ret;
|
|
+
|
|
+ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true);
|
|
+ if (unlikely(IS_ERR(inode)))
|
|
+ return PTR_ERR(inode);
|
|
+
|
|
+ inode_lock(&inode->v);
|
|
+ ret = page_symlink(&inode->v, symname, strlen(symname) + 1);
|
|
+ inode_unlock(&inode->v);
|
|
+
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ journal_seq_copy(c, dir, inode->ei_journal_seq);
|
|
+
|
|
+ ret = __bch2_link(c, inode, dir, dentry);
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ d_instantiate(dentry, &inode->v);
|
|
+ return 0;
|
|
+err:
|
|
+ iput(&inode->v);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode)
|
|
+{
|
|
+ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0);
|
|
+}
|
|
+
|
|
+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry,
|
|
+ struct inode *dst_vdir, struct dentry *dst_dentry,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct bch_fs *c = src_vdir->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir);
|
|
+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir);
|
|
+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode);
|
|
+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode);
|
|
+ struct bch_inode_unpacked dst_dir_u, src_dir_u;
|
|
+ struct bch_inode_unpacked src_inode_u, dst_inode_u;
|
|
+ struct btree_trans trans;
|
|
+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE
|
|
+ ? BCH_RENAME_EXCHANGE
|
|
+ : dst_dentry->d_inode
|
|
+ ? BCH_RENAME_OVERWRITE : BCH_RENAME;
|
|
+ u64 journal_seq = 0;
|
|
+ int ret;
|
|
+
|
|
+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE))
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (mode == BCH_RENAME_OVERWRITE) {
|
|
+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping,
|
|
+ 0, LLONG_MAX);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ bch2_trans_init(&trans, c, 8, 2048);
|
|
+
|
|
+ bch2_lock_inodes(INODE_UPDATE_LOCK,
|
|
+ src_dir,
|
|
+ dst_dir,
|
|
+ src_inode,
|
|
+ dst_inode);
|
|
+
|
|
+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
|
|
+ ret = bch2_fs_quota_transfer(c, src_inode,
|
|
+ dst_dir->ei_qid,
|
|
+ 1 << QTYP_PRJ,
|
|
+ KEY_TYPE_QUOTA_PREALLOC);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (mode == BCH_RENAME_EXCHANGE &&
|
|
+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) {
|
|
+ ret = bch2_fs_quota_transfer(c, dst_inode,
|
|
+ src_dir->ei_qid,
|
|
+ 1 << QTYP_PRJ,
|
|
+ KEY_TYPE_QUOTA_PREALLOC);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, &journal_seq,
|
|
+ BTREE_INSERT_NOUNLOCK,
|
|
+ bch2_rename_trans(&trans,
|
|
+ src_dir->v.i_ino, &src_dir_u,
|
|
+ dst_dir->v.i_ino, &dst_dir_u,
|
|
+ &src_inode_u,
|
|
+ &dst_inode_u,
|
|
+ &src_dentry->d_name,
|
|
+ &dst_dentry->d_name,
|
|
+ mode));
|
|
+ if (unlikely(ret))
|
|
+ goto err;
|
|
+
|
|
+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum);
|
|
+ BUG_ON(dst_inode &&
|
|
+ dst_inode->v.i_ino != dst_inode_u.bi_inum);
|
|
+
|
|
+ bch2_inode_update_after_write(c, src_dir, &src_dir_u,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ journal_seq_copy(c, src_dir, journal_seq);
|
|
+
|
|
+ if (src_dir != dst_dir) {
|
|
+ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u,
|
|
+ ATTR_MTIME|ATTR_CTIME);
|
|
+ journal_seq_copy(c, dst_dir, journal_seq);
|
|
+ }
|
|
+
|
|
+ bch2_inode_update_after_write(c, src_inode, &src_inode_u,
|
|
+ ATTR_CTIME);
|
|
+ journal_seq_copy(c, src_inode, journal_seq);
|
|
+
|
|
+ if (dst_inode) {
|
|
+ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u,
|
|
+ ATTR_CTIME);
|
|
+ journal_seq_copy(c, dst_inode, journal_seq);
|
|
+ }
|
|
+err:
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ bch2_fs_quota_transfer(c, src_inode,
|
|
+ bch_qid(&src_inode->ei_inode),
|
|
+ 1 << QTYP_PRJ,
|
|
+ KEY_TYPE_QUOTA_NOCHECK);
|
|
+ if (dst_inode)
|
|
+ bch2_fs_quota_transfer(c, dst_inode,
|
|
+ bch_qid(&dst_inode->ei_inode),
|
|
+ 1 << QTYP_PRJ,
|
|
+ KEY_TYPE_QUOTA_NOCHECK);
|
|
+
|
|
+ bch2_unlock_inodes(INODE_UPDATE_LOCK,
|
|
+ src_dir,
|
|
+ dst_dir,
|
|
+ src_inode,
|
|
+ dst_inode);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_setattr_copy(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ struct iattr *attr)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ unsigned int ia_valid = attr->ia_valid;
|
|
+
|
|
+ if (ia_valid & ATTR_UID)
|
|
+ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid);
|
|
+ if (ia_valid & ATTR_GID)
|
|
+ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid);
|
|
+
|
|
+ if (ia_valid & ATTR_ATIME)
|
|
+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime);
|
|
+ if (ia_valid & ATTR_MTIME)
|
|
+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime);
|
|
+ if (ia_valid & ATTR_CTIME)
|
|
+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime);
|
|
+
|
|
+ if (ia_valid & ATTR_MODE) {
|
|
+ umode_t mode = attr->ia_mode;
|
|
+ kgid_t gid = ia_valid & ATTR_GID
|
|
+ ? attr->ia_gid
|
|
+ : inode->v.i_gid;
|
|
+
|
|
+ if (!in_group_p(gid) &&
|
|
+ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID))
|
|
+ mode &= ~S_ISGID;
|
|
+ bi->bi_mode = mode;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int bch2_setattr_nonsize(struct bch_inode_info *inode,
|
|
+ struct iattr *attr)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct bch_qid qid;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *inode_iter;
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+ struct posix_acl *acl = NULL;
|
|
+ int ret;
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+
|
|
+ qid = inode->ei_qid;
|
|
+
|
|
+ if (attr->ia_valid & ATTR_UID)
|
|
+ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid);
|
|
+
|
|
+ if (attr->ia_valid & ATTR_GID)
|
|
+ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid);
|
|
+
|
|
+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0,
|
|
+ KEY_TYPE_QUOTA_PREALLOC);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+retry:
|
|
+ bch2_trans_begin(&trans);
|
|
+ kfree(acl);
|
|
+ acl = NULL;
|
|
+
|
|
+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino,
|
|
+ BTREE_ITER_INTENT);
|
|
+ ret = PTR_ERR_OR_ZERO(inode_iter);
|
|
+ if (ret)
|
|
+ goto btree_err;
|
|
+
|
|
+ bch2_setattr_copy(inode, &inode_u, attr);
|
|
+
|
|
+ if (attr->ia_valid & ATTR_MODE) {
|
|
+ ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl);
|
|
+ if (ret)
|
|
+ goto btree_err;
|
|
+ }
|
|
+
|
|
+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
|
|
+ bch2_trans_commit(&trans, NULL,
|
|
+ &inode->ei_journal_seq,
|
|
+ BTREE_INSERT_NOUNLOCK|
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+btree_err:
|
|
+ bch2_trans_iter_put(&trans, inode_iter);
|
|
+
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+ if (unlikely(ret))
|
|
+ goto err_trans;
|
|
+
|
|
+ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid);
|
|
+
|
|
+ if (acl)
|
|
+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
|
|
+err_trans:
|
|
+ bch2_trans_exit(&trans);
|
|
+err:
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_getattr(const struct path *path, struct kstat *stat,
|
|
+ u32 request_mask, unsigned query_flags)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry));
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+
|
|
+ stat->dev = inode->v.i_sb->s_dev;
|
|
+ stat->ino = inode->v.i_ino;
|
|
+ stat->mode = inode->v.i_mode;
|
|
+ stat->nlink = inode->v.i_nlink;
|
|
+ stat->uid = inode->v.i_uid;
|
|
+ stat->gid = inode->v.i_gid;
|
|
+ stat->rdev = inode->v.i_rdev;
|
|
+ stat->size = i_size_read(&inode->v);
|
|
+ stat->atime = inode->v.i_atime;
|
|
+ stat->mtime = inode->v.i_mtime;
|
|
+ stat->ctime = inode->v.i_ctime;
|
|
+ stat->blksize = block_bytes(c);
|
|
+ stat->blocks = inode->v.i_blocks;
|
|
+
|
|
+ if (request_mask & STATX_BTIME) {
|
|
+ stat->result_mask |= STATX_BTIME;
|
|
+ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime);
|
|
+ }
|
|
+
|
|
+ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE)
|
|
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
|
|
+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE;
|
|
+
|
|
+ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND)
|
|
+ stat->attributes |= STATX_ATTR_APPEND;
|
|
+ stat->attributes_mask |= STATX_ATTR_APPEND;
|
|
+
|
|
+ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP)
|
|
+ stat->attributes |= STATX_ATTR_NODUMP;
|
|
+ stat->attributes_mask |= STATX_ATTR_NODUMP;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_setattr(struct dentry *dentry, struct iattr *iattr)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
|
|
+ int ret;
|
|
+
|
|
+ lockdep_assert_held(&inode->v.i_rwsem);
|
|
+
|
|
+ ret = setattr_prepare(dentry, iattr);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ return iattr->ia_valid & ATTR_SIZE
|
|
+ ? bch2_truncate(inode, iattr)
|
|
+ : bch2_setattr_nonsize(inode, iattr);
|
|
+}
|
|
+
|
|
+static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode)
|
|
+{
|
|
+ struct bch_inode_info *inode =
|
|
+ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true);
|
|
+
|
|
+ if (IS_ERR(inode))
|
|
+ return PTR_ERR(inode);
|
|
+
|
|
+ d_mark_tmpfile(dentry, &inode->v);
|
|
+ d_instantiate(dentry, &inode->v);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_fill_extent(struct bch_fs *c,
|
|
+ struct fiemap_extent_info *info,
|
|
+ struct bkey_s_c k, unsigned flags)
|
|
+{
|
|
+ if (bkey_extent_is_direct_data(k.k)) {
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ int ret;
|
|
+
|
|
+ if (k.k->type == KEY_TYPE_reflink_v)
|
|
+ flags |= FIEMAP_EXTENT_SHARED;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ int flags2 = 0;
|
|
+ u64 offset = p.ptr.offset;
|
|
+
|
|
+ if (p.crc.compression_type)
|
|
+ flags2 |= FIEMAP_EXTENT_ENCODED;
|
|
+ else
|
|
+ offset += p.crc.offset;
|
|
+
|
|
+ if ((offset & (c->opts.block_size - 1)) ||
|
|
+ (k.k->size & (c->opts.block_size - 1)))
|
|
+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED;
|
|
+
|
|
+ ret = fiemap_fill_next_extent(info,
|
|
+ bkey_start_offset(k.k) << 9,
|
|
+ offset << 9,
|
|
+ k.k->size << 9, flags|flags2);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+ } else if (bkey_extent_is_inline_data(k.k)) {
|
|
+ return fiemap_fill_next_extent(info,
|
|
+ bkey_start_offset(k.k) << 9,
|
|
+ 0, k.k->size << 9,
|
|
+ flags|
|
|
+ FIEMAP_EXTENT_DATA_INLINE);
|
|
+ } else if (k.k->type == KEY_TYPE_reservation) {
|
|
+ return fiemap_fill_next_extent(info,
|
|
+ bkey_start_offset(k.k) << 9,
|
|
+ 0, k.k->size << 9,
|
|
+ flags|
|
|
+ FIEMAP_EXTENT_DELALLOC|
|
|
+ FIEMAP_EXTENT_UNWRITTEN);
|
|
+ } else {
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
|
|
+ u64 start, u64 len)
|
|
+{
|
|
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *ei = to_bch_ei(vinode);
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_buf cur, prev;
|
|
+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
|
|
+ unsigned offset_into_extent, sectors;
|
|
+ bool have_extent = false;
|
|
+ int ret = 0;
|
|
+
|
|
+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (start + len < start)
|
|
+ return -EINVAL;
|
|
+
|
|
+ bch2_bkey_buf_init(&cur);
|
|
+ bch2_bkey_buf_init(&prev);
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
|
|
+ POS(ei->v.i_ino, start >> 9), 0);
|
|
+retry:
|
|
+ while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ !(ret = bkey_err(k)) &&
|
|
+ bkey_cmp(iter->pos, end) < 0) {
|
|
+ enum btree_id data_btree = BTREE_ID_extents;
|
|
+
|
|
+ if (!bkey_extent_is_data(k.k) &&
|
|
+ k.k->type != KEY_TYPE_reservation) {
|
|
+ bch2_btree_iter_advance(iter);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ offset_into_extent = iter->pos.offset -
|
|
+ bkey_start_offset(k.k);
|
|
+ sectors = k.k->size - offset_into_extent;
|
|
+
|
|
+ bch2_bkey_buf_reassemble(&cur, c, k);
|
|
+
|
|
+ ret = bch2_read_indirect_extent(&trans, &data_btree,
|
|
+ &offset_into_extent, &cur);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ k = bkey_i_to_s_c(cur.k);
|
|
+ bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
|
|
+
|
|
+ sectors = min(sectors, k.k->size - offset_into_extent);
|
|
+
|
|
+ bch2_cut_front(POS(k.k->p.inode,
|
|
+ bkey_start_offset(k.k) +
|
|
+ offset_into_extent),
|
|
+ cur.k);
|
|
+ bch2_key_resize(&cur.k->k, sectors);
|
|
+ cur.k->k.p = iter->pos;
|
|
+ cur.k->k.p.offset += cur.k->k.size;
|
|
+
|
|
+ if (have_extent) {
|
|
+ ret = bch2_fill_extent(c, info,
|
|
+ bkey_i_to_s_c(prev.k), 0);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ bkey_copy(prev.k, cur.k);
|
|
+ have_extent = true;
|
|
+
|
|
+ bch2_btree_iter_set_pos(iter,
|
|
+ POS(iter->pos.inode, iter->pos.offset + sectors));
|
|
+ }
|
|
+
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+
|
|
+ if (!ret && have_extent)
|
|
+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
|
|
+ FIEMAP_EXTENT_LAST);
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+ bch2_bkey_buf_exit(&cur, c);
|
|
+ bch2_bkey_buf_exit(&prev, c);
|
|
+ return ret < 0 ? ret : 0;
|
|
+}
|
|
+
|
|
+static const struct vm_operations_struct bch_vm_ops = {
|
|
+ .fault = bch2_page_fault,
|
|
+ .map_pages = filemap_map_pages,
|
|
+ .page_mkwrite = bch2_page_mkwrite,
|
|
+};
|
|
+
|
|
+static int bch2_mmap(struct file *file, struct vm_area_struct *vma)
|
|
+{
|
|
+ file_accessed(file);
|
|
+
|
|
+ vma->vm_ops = &bch_vm_ops;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Directories: */
|
|
+
|
|
+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence)
|
|
+{
|
|
+ return generic_file_llseek_size(file, offset, whence,
|
|
+ S64_MAX, S64_MAX);
|
|
+}
|
|
+
|
|
+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
|
|
+{
|
|
+ struct bch_inode_info *inode = file_bch_inode(file);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+
|
|
+ if (!dir_emit_dots(file, ctx))
|
|
+ return 0;
|
|
+
|
|
+ return bch2_readdir(c, inode->v.i_ino, ctx);
|
|
+}
|
|
+
|
|
+static const struct file_operations bch_file_operations = {
|
|
+ .llseek = bch2_llseek,
|
|
+ .read_iter = bch2_read_iter,
|
|
+ .write_iter = bch2_write_iter,
|
|
+ .mmap = bch2_mmap,
|
|
+ .open = generic_file_open,
|
|
+ .fsync = bch2_fsync,
|
|
+ .splice_read = generic_file_splice_read,
|
|
+ .splice_write = iter_file_splice_write,
|
|
+ .fallocate = bch2_fallocate_dispatch,
|
|
+ .unlocked_ioctl = bch2_fs_file_ioctl,
|
|
+#ifdef CONFIG_COMPAT
|
|
+ .compat_ioctl = bch2_compat_fs_ioctl,
|
|
+#endif
|
|
+ .remap_file_range = bch2_remap_file_range,
|
|
+};
|
|
+
|
|
+static const struct inode_operations bch_file_inode_operations = {
|
|
+ .getattr = bch2_getattr,
|
|
+ .setattr = bch2_setattr,
|
|
+ .fiemap = bch2_fiemap,
|
|
+ .listxattr = bch2_xattr_list,
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+ .get_acl = bch2_get_acl,
|
|
+ .set_acl = bch2_set_acl,
|
|
+#endif
|
|
+};
|
|
+
|
|
+static const struct inode_operations bch_dir_inode_operations = {
|
|
+ .lookup = bch2_lookup,
|
|
+ .create = bch2_create,
|
|
+ .link = bch2_link,
|
|
+ .unlink = bch2_unlink,
|
|
+ .symlink = bch2_symlink,
|
|
+ .mkdir = bch2_mkdir,
|
|
+ .rmdir = bch2_unlink,
|
|
+ .mknod = bch2_mknod,
|
|
+ .rename = bch2_rename2,
|
|
+ .getattr = bch2_getattr,
|
|
+ .setattr = bch2_setattr,
|
|
+ .tmpfile = bch2_tmpfile,
|
|
+ .listxattr = bch2_xattr_list,
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+ .get_acl = bch2_get_acl,
|
|
+ .set_acl = bch2_set_acl,
|
|
+#endif
|
|
+};
|
|
+
|
|
+static const struct file_operations bch_dir_file_operations = {
|
|
+ .llseek = bch2_dir_llseek,
|
|
+ .read = generic_read_dir,
|
|
+ .iterate_shared = bch2_vfs_readdir,
|
|
+ .fsync = bch2_fsync,
|
|
+ .unlocked_ioctl = bch2_fs_file_ioctl,
|
|
+#ifdef CONFIG_COMPAT
|
|
+ .compat_ioctl = bch2_compat_fs_ioctl,
|
|
+#endif
|
|
+};
|
|
+
|
|
+static const struct inode_operations bch_symlink_inode_operations = {
|
|
+ .get_link = page_get_link,
|
|
+ .getattr = bch2_getattr,
|
|
+ .setattr = bch2_setattr,
|
|
+ .listxattr = bch2_xattr_list,
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+ .get_acl = bch2_get_acl,
|
|
+ .set_acl = bch2_set_acl,
|
|
+#endif
|
|
+};
|
|
+
|
|
+static const struct inode_operations bch_special_inode_operations = {
|
|
+ .getattr = bch2_getattr,
|
|
+ .setattr = bch2_setattr,
|
|
+ .listxattr = bch2_xattr_list,
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+ .get_acl = bch2_get_acl,
|
|
+ .set_acl = bch2_set_acl,
|
|
+#endif
|
|
+};
|
|
+
|
|
+static const struct address_space_operations bch_address_space_operations = {
|
|
+ .writepage = bch2_writepage,
|
|
+ .readpage = bch2_readpage,
|
|
+ .writepages = bch2_writepages,
|
|
+ .readahead = bch2_readahead,
|
|
+ .set_page_dirty = __set_page_dirty_nobuffers,
|
|
+ .write_begin = bch2_write_begin,
|
|
+ .write_end = bch2_write_end,
|
|
+ .invalidatepage = bch2_invalidatepage,
|
|
+ .releasepage = bch2_releasepage,
|
|
+ .direct_IO = noop_direct_IO,
|
|
+#ifdef CONFIG_MIGRATION
|
|
+ .migratepage = bch2_migrate_page,
|
|
+#endif
|
|
+ .error_remove_page = generic_error_remove_page,
|
|
+};
|
|
+
|
|
+static struct inode *bch2_nfs_get_inode(struct super_block *sb,
|
|
+ u64 ino, u32 generation)
|
|
+{
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+ struct inode *vinode;
|
|
+
|
|
+ if (ino < BCACHEFS_ROOT_INO)
|
|
+ return ERR_PTR(-ESTALE);
|
|
+
|
|
+ vinode = bch2_vfs_inode_get(c, ino);
|
|
+ if (IS_ERR(vinode))
|
|
+ return ERR_CAST(vinode);
|
|
+ if (generation && vinode->i_generation != generation) {
|
|
+ /* we didn't find the right inode.. */
|
|
+ iput(vinode);
|
|
+ return ERR_PTR(-ESTALE);
|
|
+ }
|
|
+ return vinode;
|
|
+}
|
|
+
|
|
+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid,
|
|
+ int fh_len, int fh_type)
|
|
+{
|
|
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
|
|
+ bch2_nfs_get_inode);
|
|
+}
|
|
+
|
|
+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid,
|
|
+ int fh_len, int fh_type)
|
|
+{
|
|
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
|
|
+ bch2_nfs_get_inode);
|
|
+}
|
|
+
|
|
+static const struct export_operations bch_export_ops = {
|
|
+ .fh_to_dentry = bch2_fh_to_dentry,
|
|
+ .fh_to_parent = bch2_fh_to_parent,
|
|
+ //.get_parent = bch2_get_parent,
|
|
+};
|
|
+
|
|
+static void bch2_vfs_inode_init(struct bch_fs *c,
|
|
+ struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi)
|
|
+{
|
|
+ bch2_inode_update_after_write(c, inode, bi, ~0);
|
|
+
|
|
+ inode->v.i_blocks = bi->bi_sectors;
|
|
+ inode->v.i_ino = bi->bi_inum;
|
|
+ inode->v.i_rdev = bi->bi_dev;
|
|
+ inode->v.i_generation = bi->bi_generation;
|
|
+ inode->v.i_size = bi->bi_size;
|
|
+
|
|
+ inode->ei_flags = 0;
|
|
+ inode->ei_journal_seq = 0;
|
|
+ inode->ei_quota_reserved = 0;
|
|
+ inode->ei_qid = bch_qid(bi);
|
|
+
|
|
+ inode->v.i_mapping->a_ops = &bch_address_space_operations;
|
|
+
|
|
+ switch (inode->v.i_mode & S_IFMT) {
|
|
+ case S_IFREG:
|
|
+ inode->v.i_op = &bch_file_inode_operations;
|
|
+ inode->v.i_fop = &bch_file_operations;
|
|
+ break;
|
|
+ case S_IFDIR:
|
|
+ inode->v.i_op = &bch_dir_inode_operations;
|
|
+ inode->v.i_fop = &bch_dir_file_operations;
|
|
+ break;
|
|
+ case S_IFLNK:
|
|
+ inode_nohighmem(&inode->v);
|
|
+ inode->v.i_op = &bch_symlink_inode_operations;
|
|
+ break;
|
|
+ default:
|
|
+ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev);
|
|
+ inode->v.i_op = &bch_special_inode_operations;
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+static struct inode *bch2_alloc_inode(struct super_block *sb)
|
|
+{
|
|
+ struct bch_inode_info *inode;
|
|
+
|
|
+ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS);
|
|
+ if (!inode)
|
|
+ return NULL;
|
|
+
|
|
+ inode_init_once(&inode->v);
|
|
+ mutex_init(&inode->ei_update_lock);
|
|
+ pagecache_lock_init(&inode->ei_pagecache_lock);
|
|
+ mutex_init(&inode->ei_quota_lock);
|
|
+ inode->ei_journal_seq = 0;
|
|
+
|
|
+ return &inode->v;
|
|
+}
|
|
+
|
|
+static void bch2_i_callback(struct rcu_head *head)
|
|
+{
|
|
+ struct inode *vinode = container_of(head, struct inode, i_rcu);
|
|
+ struct bch_inode_info *inode = to_bch_ei(vinode);
|
|
+
|
|
+ kmem_cache_free(bch2_inode_cache, inode);
|
|
+}
|
|
+
|
|
+static void bch2_destroy_inode(struct inode *vinode)
|
|
+{
|
|
+ call_rcu(&vinode->i_rcu, bch2_i_callback);
|
|
+}
|
|
+
|
|
+static int inode_update_times_fn(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ void *p)
|
|
+{
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+
|
|
+ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime);
|
|
+ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime);
|
|
+ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_vfs_write_inode(struct inode *vinode,
|
|
+ struct writeback_control *wbc)
|
|
+{
|
|
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *inode = to_bch_ei(vinode);
|
|
+ int ret;
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL,
|
|
+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME);
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void bch2_evict_inode(struct inode *vinode)
|
|
+{
|
|
+ struct bch_fs *c = vinode->i_sb->s_fs_info;
|
|
+ struct bch_inode_info *inode = to_bch_ei(vinode);
|
|
+
|
|
+ truncate_inode_pages_final(&inode->v.i_data);
|
|
+
|
|
+ clear_inode(&inode->v);
|
|
+
|
|
+ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved);
|
|
+
|
|
+ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) {
|
|
+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks),
|
|
+ KEY_TYPE_QUOTA_WARN);
|
|
+ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
|
|
+ KEY_TYPE_QUOTA_WARN);
|
|
+ bch2_inode_rm(c, inode->v.i_ino, true);
|
|
+ }
|
|
+}
|
|
+
|
|
+static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
|
|
+{
|
|
+ struct super_block *sb = dentry->d_sb;
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
|
|
+ unsigned shift = sb->s_blocksize_bits - 9;
|
|
+ /*
|
|
+ * this assumes inodes take up 64 bytes, which is a decent average
|
|
+ * number:
|
|
+ */
|
|
+ u64 avail_inodes = ((usage.capacity - usage.used) << 3);
|
|
+ u64 fsid;
|
|
+
|
|
+ buf->f_type = BCACHEFS_STATFS_MAGIC;
|
|
+ buf->f_bsize = sb->s_blocksize;
|
|
+ buf->f_blocks = usage.capacity >> shift;
|
|
+ buf->f_bfree = (usage.capacity - usage.used) >> shift;
|
|
+ buf->f_bavail = buf->f_bfree;
|
|
+
|
|
+ buf->f_files = usage.nr_inodes + avail_inodes;
|
|
+ buf->f_ffree = avail_inodes;
|
|
+
|
|
+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
|
|
+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
|
|
+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
|
|
+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
|
|
+ buf->f_namelen = BCH_NAME_MAX;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_sync_fs(struct super_block *sb, int wait)
|
|
+{
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+
|
|
+ if (c->opts.journal_flush_disabled)
|
|
+ return 0;
|
|
+
|
|
+ if (!wait) {
|
|
+ bch2_journal_flush_async(&c->journal, NULL);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ return bch2_journal_flush(&c->journal);
|
|
+}
|
|
+
|
|
+static struct bch_fs *bch2_path_to_fs(const char *path)
|
|
+{
|
|
+ struct bch_fs *c;
|
|
+ dev_t dev;
|
|
+ int ret;
|
|
+
|
|
+ ret = lookup_bdev(path, &dev);
|
|
+ if (ret)
|
|
+ return ERR_PTR(ret);
|
|
+
|
|
+ c = bch2_dev_to_fs(dev);
|
|
+ if (c)
|
|
+ closure_put(&c->cl);
|
|
+ return c ?: ERR_PTR(-ENOENT);
|
|
+}
|
|
+
|
|
+static char **split_devs(const char *_dev_name, unsigned *nr)
|
|
+{
|
|
+ char *dev_name = NULL, **devs = NULL, *s;
|
|
+ size_t i, nr_devs = 0;
|
|
+
|
|
+ dev_name = kstrdup(_dev_name, GFP_KERNEL);
|
|
+ if (!dev_name)
|
|
+ return NULL;
|
|
+
|
|
+ for (s = dev_name; s; s = strchr(s + 1, ':'))
|
|
+ nr_devs++;
|
|
+
|
|
+ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
|
|
+ if (!devs) {
|
|
+ kfree(dev_name);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ for (i = 0, s = dev_name;
|
|
+ s;
|
|
+ (s = strchr(s, ':')) && (*s++ = '\0'))
|
|
+ devs[i++] = s;
|
|
+
|
|
+ *nr = nr_devs;
|
|
+ return devs;
|
|
+}
|
|
+
|
|
+static int bch2_remount(struct super_block *sb, int *flags, char *data)
|
|
+{
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+ struct bch_opts opts = bch2_opts_empty();
|
|
+ int ret;
|
|
+
|
|
+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
|
|
+
|
|
+ ret = bch2_parse_mount_opts(c, &opts, data);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (opts.read_only != c->opts.read_only) {
|
|
+ down_write(&c->state_lock);
|
|
+
|
|
+ if (opts.read_only) {
|
|
+ bch2_fs_read_only(c);
|
|
+
|
|
+ sb->s_flags |= SB_RDONLY;
|
|
+ } else {
|
|
+ ret = bch2_fs_read_write(c);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error going rw: %i", ret);
|
|
+ up_write(&c->state_lock);
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ sb->s_flags &= ~SB_RDONLY;
|
|
+ }
|
|
+
|
|
+ c->opts.read_only = opts.read_only;
|
|
+
|
|
+ up_write(&c->state_lock);
|
|
+ }
|
|
+
|
|
+ if (opts.errors >= 0)
|
|
+ c->opts.errors = opts.errors;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
|
|
+{
|
|
+ struct bch_fs *c = root->d_sb->s_fs_info;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+ bool first = true;
|
|
+
|
|
+ for_each_online_member(ca, c, i) {
|
|
+ if (!first)
|
|
+ seq_putc(seq, ':');
|
|
+ first = false;
|
|
+ seq_puts(seq, "/dev/");
|
|
+ seq_puts(seq, ca->name);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_show_options(struct seq_file *seq, struct dentry *root)
|
|
+{
|
|
+ struct bch_fs *c = root->d_sb->s_fs_info;
|
|
+ enum bch_opt_id i;
|
|
+ char buf[512];
|
|
+
|
|
+ for (i = 0; i < bch2_opts_nr; i++) {
|
|
+ const struct bch_option *opt = &bch2_opt_table[i];
|
|
+ u64 v = bch2_opt_get_by_id(&c->opts, i);
|
|
+
|
|
+ if (!(opt->mode & OPT_MOUNT))
|
|
+ continue;
|
|
+
|
|
+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i))
|
|
+ continue;
|
|
+
|
|
+ bch2_opt_to_text(&PBUF(buf), c, opt, v,
|
|
+ OPT_SHOW_MOUNT_STYLE);
|
|
+ seq_putc(seq, ',');
|
|
+ seq_puts(seq, buf);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void bch2_put_super(struct super_block *sb)
|
|
+{
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+
|
|
+ __bch2_fs_stop(c);
|
|
+}
|
|
+
|
|
+static const struct super_operations bch_super_operations = {
|
|
+ .alloc_inode = bch2_alloc_inode,
|
|
+ .destroy_inode = bch2_destroy_inode,
|
|
+ .write_inode = bch2_vfs_write_inode,
|
|
+ .evict_inode = bch2_evict_inode,
|
|
+ .sync_fs = bch2_sync_fs,
|
|
+ .statfs = bch2_statfs,
|
|
+ .show_devname = bch2_show_devname,
|
|
+ .show_options = bch2_show_options,
|
|
+ .remount_fs = bch2_remount,
|
|
+ .put_super = bch2_put_super,
|
|
+#if 0
|
|
+ .freeze_fs = bch2_freeze,
|
|
+ .unfreeze_fs = bch2_unfreeze,
|
|
+#endif
|
|
+};
|
|
+
|
|
+static int bch2_set_super(struct super_block *s, void *data)
|
|
+{
|
|
+ s->s_fs_info = data;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
/*
 * sget() "set" callback that always refuses with -EBUSY.  Used for the first
 * sget() call in bch2_mount(), which may only match an existing superblock.
 */
static int bch2_noset_super(struct super_block *s, void *data)
{
	const int refused = -EBUSY;

	return refused;
}
|
|
+
|
|
+static int bch2_test_super(struct super_block *s, void *data)
|
|
+{
|
|
+ struct bch_fs *c = s->s_fs_info;
|
|
+ struct bch_fs **devs = data;
|
|
+ unsigned i;
|
|
+
|
|
+ if (!c)
|
|
+ return false;
|
|
+
|
|
+ for (i = 0; devs[i]; i++)
|
|
+ if (c != devs[i])
|
|
+ return false;
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static struct dentry *bch2_mount(struct file_system_type *fs_type,
|
|
+ int flags, const char *dev_name, void *data)
|
|
+{
|
|
+ struct bch_fs *c;
|
|
+ struct bch_dev *ca;
|
|
+ struct super_block *sb;
|
|
+ struct inode *vinode;
|
|
+ struct bch_opts opts = bch2_opts_empty();
|
|
+ char **devs;
|
|
+ struct bch_fs **devs_to_fs = NULL;
|
|
+ unsigned i, nr_devs;
|
|
+ int ret;
|
|
+
|
|
+ opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
|
|
+
|
|
+ ret = bch2_parse_mount_opts(NULL, &opts, data);
|
|
+ if (ret)
|
|
+ return ERR_PTR(ret);
|
|
+
|
|
+ devs = split_devs(dev_name, &nr_devs);
|
|
+ if (!devs)
|
|
+ return ERR_PTR(-ENOMEM);
|
|
+
|
|
+ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
|
|
+ if (!devs_to_fs) {
|
|
+ sb = ERR_PTR(-ENOMEM);
|
|
+ goto got_sb;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < nr_devs; i++)
|
|
+ devs_to_fs[i] = bch2_path_to_fs(devs[i]);
|
|
+
|
|
+ sb = sget(fs_type, bch2_test_super, bch2_noset_super,
|
|
+ flags|SB_NOSEC, devs_to_fs);
|
|
+ if (!IS_ERR(sb))
|
|
+ goto got_sb;
|
|
+
|
|
+ c = bch2_fs_open(devs, nr_devs, opts);
|
|
+ if (IS_ERR(c)) {
|
|
+ sb = ERR_CAST(c);
|
|
+ goto got_sb;
|
|
+ }
|
|
+
|
|
+ /* Some options can't be parsed until after the fs is started: */
|
|
+ ret = bch2_parse_mount_opts(c, &opts, data);
|
|
+ if (ret) {
|
|
+ bch2_fs_stop(c);
|
|
+ sb = ERR_PTR(ret);
|
|
+ goto got_sb;
|
|
+ }
|
|
+
|
|
+ bch2_opts_apply(&c->opts, opts);
|
|
+
|
|
+ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c);
|
|
+ if (IS_ERR(sb))
|
|
+ bch2_fs_stop(c);
|
|
+got_sb:
|
|
+ kfree(devs_to_fs);
|
|
+ kfree(devs[0]);
|
|
+ kfree(devs);
|
|
+
|
|
+ if (IS_ERR(sb))
|
|
+ return ERR_CAST(sb);
|
|
+
|
|
+ c = sb->s_fs_info;
|
|
+
|
|
+ if (sb->s_root) {
|
|
+ if ((flags ^ sb->s_flags) & SB_RDONLY) {
|
|
+ ret = -EBUSY;
|
|
+ goto err_put_super;
|
|
+ }
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ sb->s_blocksize = block_bytes(c);
|
|
+ sb->s_blocksize_bits = ilog2(block_bytes(c));
|
|
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
|
|
+ sb->s_op = &bch_super_operations;
|
|
+ sb->s_export_op = &bch_export_ops;
|
|
+#ifdef CONFIG_BCACHEFS_QUOTA
|
|
+ sb->s_qcop = &bch2_quotactl_operations;
|
|
+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ;
|
|
+#endif
|
|
+ sb->s_xattr = bch2_xattr_handlers;
|
|
+ sb->s_magic = BCACHEFS_STATFS_MAGIC;
|
|
+ sb->s_time_gran = c->sb.nsec_per_time_unit;
|
|
+ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
|
|
+ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec);
|
|
+ c->vfs_sb = sb;
|
|
+ strlcpy(sb->s_id, c->name, sizeof(sb->s_id));
|
|
+
|
|
+ ret = super_setup_bdi(sb);
|
|
+ if (ret)
|
|
+ goto err_put_super;
|
|
+
|
|
+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
|
|
+
|
|
+ for_each_online_member(ca, c, i) {
|
|
+ struct block_device *bdev = ca->disk_sb.bdev;
|
|
+
|
|
+ /* XXX: create an anonymous device for multi device filesystems */
|
|
+ sb->s_bdev = bdev;
|
|
+ sb->s_dev = bdev->bd_dev;
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
|
|
+ if (c->opts.acl)
|
|
+ sb->s_flags |= SB_POSIXACL;
|
|
+#endif
|
|
+
|
|
+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO);
|
|
+ if (IS_ERR(vinode)) {
|
|
+ bch_err(c, "error mounting: error getting root inode %i",
|
|
+ (int) PTR_ERR(vinode));
|
|
+ ret = PTR_ERR(vinode);
|
|
+ goto err_put_super;
|
|
+ }
|
|
+
|
|
+ sb->s_root = d_make_root(vinode);
|
|
+ if (!sb->s_root) {
|
|
+ bch_err(c, "error mounting: error allocating root dentry");
|
|
+ ret = -ENOMEM;
|
|
+ goto err_put_super;
|
|
+ }
|
|
+
|
|
+ sb->s_flags |= SB_ACTIVE;
|
|
+out:
|
|
+ return dget(sb->s_root);
|
|
+
|
|
+err_put_super:
|
|
+ deactivate_locked_super(sb);
|
|
+ return ERR_PTR(ret);
|
|
+}
|
|
+
|
|
+static void bch2_kill_sb(struct super_block *sb)
|
|
+{
|
|
+ struct bch_fs *c = sb->s_fs_info;
|
|
+
|
|
+ generic_shutdown_super(sb);
|
|
+ bch2_fs_free(c);
|
|
+}
|
|
+
|
|
+static struct file_system_type bcache_fs_type = {
|
|
+ .owner = THIS_MODULE,
|
|
+ .name = "bcachefs",
|
|
+ .mount = bch2_mount,
|
|
+ .kill_sb = bch2_kill_sb,
|
|
+ .fs_flags = FS_REQUIRES_DEV,
|
|
+};
|
|
+
|
|
+MODULE_ALIAS_FS("bcachefs");
|
|
+
|
|
+void bch2_vfs_exit(void)
|
|
+{
|
|
+ unregister_filesystem(&bcache_fs_type);
|
|
+ if (bch2_inode_cache)
|
|
+ kmem_cache_destroy(bch2_inode_cache);
|
|
+}
|
|
+
|
|
+int __init bch2_vfs_init(void)
|
|
+{
|
|
+ int ret = -ENOMEM;
|
|
+
|
|
+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0);
|
|
+ if (!bch2_inode_cache)
|
|
+ goto err;
|
|
+
|
|
+ ret = register_filesystem(&bcache_fs_type);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ return 0;
|
|
+err:
|
|
+ bch2_vfs_exit();
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#endif /* NO_BCACHEFS_FS */
|
|
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
|
|
new file mode 100644
|
|
index 000000000000..2d82ed7dd740
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fs.h
|
|
@@ -0,0 +1,180 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_FS_H
|
|
+#define _BCACHEFS_FS_H
|
|
+
|
|
+#include "inode.h"
|
|
+#include "opts.h"
|
|
+#include "str_hash.h"
|
|
+#include "quota_types.h"
|
|
+
|
|
+#include <linux/seqlock.h>
|
|
+#include <linux/stat.h>
|
|
+
|
|
+/*
|
|
+ * Two-state lock: it can be taken for "add" or for "block". Either state may
|
|
+ * be shared by any number of holders (like the read side of an rwsem), but
|
|
+ * the two states conflict with each other:
|
|
+ */
|
|
+struct pagecache_lock {
|
|
+ atomic_long_t v; /* set to 0 by pagecache_lock_init() */
|
|
+ wait_queue_head_t wait; /* initialised by pagecache_lock_init() */
|
|
+};
|
|
+
|
|
+/* Put @lock into its initial state: v == 0 and an empty waitqueue. */
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{
+	init_waitqueue_head(&lock->wait);
+	atomic_long_set(&lock->v, 0);
+}
|
|
+
|
|
+void bch2_pagecache_add_put(struct pagecache_lock *);
|
|
+bool bch2_pagecache_add_tryget(struct pagecache_lock *);
|
|
+void bch2_pagecache_add_get(struct pagecache_lock *);
|
|
+void bch2_pagecache_block_put(struct pagecache_lock *);
|
|
+void bch2_pagecache_block_get(struct pagecache_lock *);
|
|
+
|
|
+/*
|
|
+ * bcachefs in-memory inode. The VFS inode is embedded as member v, which is
|
|
+ * what to_bch_ei() uses to get from a struct inode pointer back to this
|
|
+ * structure.
|
|
+ */
|
|
+struct bch_inode_info {
|
|
+ struct inode v;
|
|
+ unsigned long ei_flags;
|
|
+
|
|
+ struct mutex ei_update_lock; /* taken by bch2_lock_inodes(INODE_UPDATE_LOCK) */
|
|
+ u64 ei_journal_seq;
|
|
+ u64 ei_quota_reserved;
|
|
+ unsigned long ei_last_dirtied;
|
|
+
|
|
+ struct pagecache_lock ei_pagecache_lock; /* "block" side taken by bch2_lock_inodes(INODE_PAGECACHE_BLOCK) */
|
|
+
|
|
+ struct mutex ei_quota_lock;
|
|
+ struct bch_qid ei_qid;
|
|
+
|
|
+ /* copy of inode in btree: */
|
|
+ struct bch_inode_unpacked ei_inode;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and
|
|
+ * btree inode may be inconsistent:
|
|
+ */
|
|
+#define EI_INODE_ERROR 0
|
|
+
|
|
+#define to_bch_ei(_inode) \
|
|
+ container_of_or_null(_inode, struct bch_inode_info, v)
|
|
+
|
|
+/*
|
|
+ * Compare two pointers with cmp_int(); used as the comparison callback for
|
|
+ * bubble_sort() in bch2_lock_inodes()/bch2_unlock_inodes().
|
|
+ */
|
|
+static inline int ptrcmp(void *l, void *r)
|
|
+{
|
|
+ return cmp_int(l, r);
|
|
+}
|
|
+
|
|
+/* Bitmask selecting which locks bch2_lock_inodes()/bch2_unlock_inodes() touch: */
|
|
+enum bch_inode_lock_op {
|
|
+ INODE_LOCK = (1U << 0), /* v.i_rwsem, taken for write */
|
|
+ INODE_PAGECACHE_BLOCK = (1U << 1), /* ei_pagecache_lock, "block" side */
|
|
+ INODE_UPDATE_LOCK = (1U << 2), /* ei_update_lock mutex */
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Lock a set of inodes. The variadic arguments are struct bch_inode_info
|
|
+ * pointers and _locks is a mask of enum bch_inode_lock_op values.
|
|
+ *
|
|
+ * The pointers are sorted with ptrcmp() and an entry equal to its predecessor
|
|
+ * is skipped, so an inode passed twice is only locked once. a[0] is a NULL
|
|
+ * sentinel that is never locked; presumably NULL arguments sort next to it
|
|
+ * and are skipped the same way - this relies on bubble_sort() ordering
|
|
+ * ascending, TODO confirm.
|
|
+ *
|
|
+ * The array index is passed as the subclass to the *_nested() lock calls.
|
|
+ * The expansion declares locals named a and i.
|
|
+ */
|
|
+#define bch2_lock_inodes(_locks, ...) \
|
|
+do { \
|
|
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
|
|
+ unsigned i; \
|
|
+ \
|
|
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
|
|
+ \
|
|
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
|
|
+ if (a[i] != a[i - 1]) { \
|
|
+ if ((_locks) & INODE_LOCK) \
|
|
+ down_write_nested(&a[i]->v.i_rwsem, i); \
|
|
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
|
|
+ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\
|
|
+ if ((_locks) & INODE_UPDATE_LOCK) \
|
|
+ mutex_lock_nested(&a[i]->ei_update_lock, i);\
|
|
+ } \
|
|
+} while (0)
|
|
+
|
|
+/*
|
|
+ * Release the locks selected by _locks (a mask of enum bch_inode_lock_op) on
|
|
+ * a set of struct bch_inode_info pointers. The pointers are sorted with
|
|
+ * ptrcmp() and an entry equal to its predecessor is skipped, so each distinct
|
|
+ * inode is unlocked once; a[0] is a NULL sentinel that is never touched.
|
|
+ * The expansion declares locals named a and i.
|
|
+ */
|
|
+#define bch2_unlock_inodes(_locks, ...) \
|
|
+do { \
|
|
+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \
|
|
+ unsigned i; \
|
|
+ \
|
|
+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \
|
|
+ \
|
|
+ for (i = 1; i < ARRAY_SIZE(a); i++) \
|
|
+ if (a[i] != a[i - 1]) { \
|
|
+ if ((_locks) & INODE_LOCK) \
|
|
+ up_write(&a[i]->v.i_rwsem); \
|
|
+ if ((_locks) & INODE_PAGECACHE_BLOCK) \
|
|
+ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\
|
|
+ if ((_locks) & INODE_UPDATE_LOCK) \
|
|
+ mutex_unlock(&a[i]->ei_update_lock); \
|
|
+ } \
|
|
+} while (0)
|
|
+
|
|
+/* Return the bch_inode_info that embeds the VFS inode of @file. */
+static inline struct bch_inode_info *file_bch_inode(struct file *file)
+{
+	struct inode *vfs_inode = file_inode(file);
+
+	return to_bch_ei(vfs_inode);
+}
|
|
+
|
|
+/*
+ * True when bit @id is clear in @inode's bi_fields_set and the value of
+ * option @id differs between @dir and @inode.
+ */
+static inline bool inode_attr_changing(struct bch_inode_info *dir,
+				       struct bch_inode_info *inode,
+				       enum inode_opt_id id)
+{
+	if (inode->ei_inode.bi_fields_set & (1 << id))
+		return false;
+
+	return bch2_inode_opt_get(&dir->ei_inode, id) !=
+	       bch2_inode_opt_get(&inode->ei_inode, id);
+}
|
|
+
|
|
+/* True if inode_attr_changing() holds for any option id below Inode_opt_nr. */
+static inline bool inode_attrs_changing(struct bch_inode_info *dir,
+					struct bch_inode_info *inode)
+{
+	unsigned opt = 0;
+
+	while (opt < Inode_opt_nr) {
+		if (inode_attr_changing(dir, inode, opt))
+			return true;
+		opt++;
+	}
+
+	return false;
+}
|
|
+
|
|
+struct bch_inode_unpacked;
|
|
+
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+
|
|
+int bch2_fs_quota_transfer(struct bch_fs *,
|
|
+ struct bch_inode_info *,
|
|
+ struct bch_qid,
|
|
+ unsigned,
|
|
+ enum quota_acct_mode);
|
|
+
|
|
+/*
+ * Transfer @inode's quota to project id @projid: copies the inode's current
+ * qid, replaces the QTYP_PRJ slot and hands the result to
+ * bch2_fs_quota_transfer(), whose return value is passed back.
+ */
+static inline int bch2_set_projid(struct bch_fs *c,
+				  struct bch_inode_info *inode,
+				  u32 projid)
+{
+	struct bch_qid new_qid = inode->ei_qid;
+
+	new_qid.q[QTYP_PRJ] = projid;
+
+	return bch2_fs_quota_transfer(c, inode, new_qid,
+				      1 << QTYP_PRJ,
+				      KEY_TYPE_QUOTA_PREALLOC);
+}
|
|
+
|
|
+struct inode *bch2_vfs_inode_get(struct bch_fs *, u64);
|
|
+
|
|
+/* returns 0 if we want to do the update, or error is passed up */
|
|
+typedef int (*inode_set_fn)(struct bch_inode_info *,
|
|
+ struct bch_inode_unpacked *, void *);
|
|
+
|
|
+void bch2_inode_update_after_write(struct bch_fs *,
|
|
+ struct bch_inode_info *,
|
|
+ struct bch_inode_unpacked *,
|
|
+ unsigned);
|
|
+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
|
|
+ inode_set_fn, void *, unsigned);
|
|
+
|
|
+void bch2_vfs_exit(void);
|
|
+int bch2_vfs_init(void);
|
|
+
|
|
+#else
|
|
+
|
|
+static inline void bch2_vfs_exit(void) {}
|
|
+static inline int bch2_vfs_init(void) { return 0; }
|
|
+
|
|
+#endif /* NO_BCACHEFS_FS */
|
|
+
|
|
+#endif /* _BCACHEFS_FS_H */
|
|
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
|
|
new file mode 100644
|
|
index 000000000000..338d50bec7e5
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fsck.c
|
|
@@ -0,0 +1,1398 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_update.h"
|
|
+#include "dirent.h"
|
|
+#include "error.h"
|
|
+#include "fs-common.h"
|
|
+#include "fsck.h"
|
|
+#include "inode.h"
|
|
+#include "keylist.h"
|
|
+#include "super.h"
|
|
+#include "xattr.h"
|
|
+
|
|
+#include <linux/bsearch.h>
|
|
+#include <linux/dcache.h> /* struct qstr */
|
|
+
|
|
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
|
|
+
|
|
+/*
+ * Add up the sizes of all allocation extents (bkey_extent_is_allocation())
+ * belonging to inode @inum.  Returns the total, or the error reported by the
+ * btree walk.
+ */
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 nr_sectors = 0;
+	int ret;
+
+	for_each_btree_key(trans, iter, BTREE_ID_extents,
+			   POS(inum, 0), 0, k, ret) {
+		/* first key of a different inode: done */
+		if (k.k->p.inode != inum)
+			break;
+
+		if (!bkey_extent_is_allocation(k.k))
+			continue;
+
+		nr_sectors += k.k->size;
+	}
+
+	bch2_trans_iter_free(trans, iter);
+
+	if (ret)
+		return ret;
+	return nr_sectors;
+}
|
|
+
|
|
+/*
+ * Look up inode @inode_nr in the inodes btree and unpack it into @inode.
+ * If @snapshot is non-NULL it receives the iterator's snapshot id.
+ * Returns -ENOENT when the slot does not hold a KEY_TYPE_inode key, otherwise
+ * the result of the btree lookup / bch2_inode_unpack().
+ */
+static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+			  struct bch_inode_unpacked *inode,
+			  u32 *snapshot)
+{
+	struct btree_iter *iter =
+		bch2_trans_get_iter(trans, BTREE_ID_inodes,
+				    POS(0, inode_nr), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+	int ret = bkey_err(k);
+
+	if (!ret) {
+		if (snapshot)
+			*snapshot = iter->pos.snapshot;
+
+		if (k.k->type == KEY_TYPE_inode)
+			ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+		else
+			ret = -ENOENT;
+	}
+
+	bch2_trans_iter_free(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/* __lookup_inode() run under lockrestart_do(). */
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
+			struct bch_inode_unpacked *inode,
+			u32 *snapshot)
+{
+	int ret = lockrestart_do(trans,
+			__lookup_inode(trans, inode_nr, inode, snapshot));
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Queue a write of @inode at SPOS(0, inode->bi_inum, snapshot) in the inodes
+ * btree via bch2_inode_write(); the caller is responsible for the commit.
+ */
+static int __write_inode(struct btree_trans *trans,
+			 struct bch_inode_unpacked *inode,
+			 u32 snapshot)
+{
+	struct btree_iter *iter;
+	int ret;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_inodes,
+				   SPOS(0, inode->bi_inum, snapshot),
+				   BTREE_ITER_INTENT);
+	ret = bch2_inode_write(trans, iter, inode);
+	bch2_trans_iter_put(trans, iter);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Run __write_inode() through __bch2_trans_do() with NOFAIL|LAZY_RW, logging
+ * any error before returning it.
+ */
+static int write_inode(struct btree_trans *trans,
+		       struct bch_inode_unpacked *inode,
+		       u32 snapshot)
+{
+	int ret = __bch2_trans_do(trans, NULL, NULL,
+				  BTREE_INSERT_NOFAIL|
+				  BTREE_INSERT_LAZY_RW,
+				  __write_inode(trans, inode, snapshot));
+
+	if (!ret)
+		return 0;
+
+	bch_err(trans->c, "error in fsck: error %i updating inode", ret);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Delete the dirent at @pos: looks up the directory inode (pos.inode) to
+ * build its hash info, then removes the key with bch2_hash_delete_at().
+ * The caller is responsible for the commit.
+ */
+static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+	struct bch_inode_unpacked dir;
+	struct bch_hash_info dir_hash;
+	struct btree_iter *iter;
+	int ret;
+
+	ret = lookup_inode(trans, pos.inode, &dir, NULL);
+	if (ret)
+		return ret;
+
+	dir_hash = bch2_hash_info_init(trans->c, &dir);
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_dirents, pos,
+				   BTREE_ITER_INTENT);
+	ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+				  &dir_hash, iter);
+	bch2_trans_iter_put(trans, iter);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Run __remove_dirent() through __bch2_trans_do() with NOFAIL|LAZY_RW,
+ * logging any error before returning it.
+ */
+static int remove_dirent(struct btree_trans *trans, struct bpos pos)
+{
+	int ret = __bch2_trans_do(trans, NULL, NULL,
+				  BTREE_INSERT_NOFAIL|
+				  BTREE_INSERT_LAZY_RW,
+				  __remove_dirent(trans, pos));
+
+	if (!ret)
+		return 0;
+
+	bch_err(trans->c, "remove_dirent: err %i deleting dirent", ret);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Get lost+found, create if it doesn't exist.
+ *
+ * On success 0 is returned and @lostfound holds the unpacked lost+found
+ * inode (either looked up or freshly created under the root directory).
+ * Any lookup or creation failure is returned as a negative error code.
+ */
+static int lookup_lostfound(struct btree_trans *trans,
+			    struct bch_inode_unpacked *lostfound)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked root;
+	struct bch_hash_info root_hash_info;
+	struct qstr lostfound_str = QSTR("lost+found");
+	u64 inum;
+	u32 snapshot;
+	int ret;
+
+	/*
+	 * NOTE(review): on -ENOENT @root is left unfilled but is still used
+	 * below - presumably check_root() has always run first; confirm.
+	 */
+	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot);
+	if (ret && ret != -ENOENT)
+		return ret;
+
+	root_hash_info = bch2_hash_info_init(c, &root);
+	inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
+				  &lostfound_str);
+	if (inum) {
+		ret = lookup_inode(trans, inum, lostfound, &snapshot);
+		if (!ret)
+			return 0;
+
+		if (ret != -ENOENT) {
+			/*
+			 * The check_dirents pass has already run, dangling dirents
+			 * shouldn't exist here:
+			 */
+			bch_err(c, "error looking up lost+found: %i", ret);
+			return ret;
+		}
+	} else {
+		bch_notice(c, "creating lost+found");
+	}
+
+	/* no dirent, or the dirent's inode is missing: create it */
+	bch2_inode_init_early(c, lostfound);
+
+	ret = __bch2_trans_do(trans, NULL, NULL,
+			      BTREE_INSERT_NOFAIL|
+			      BTREE_INSERT_LAZY_RW,
+		bch2_create_trans(trans,
+				  BCACHEFS_ROOT_INO, &root,
+				  lostfound,
+				  &lostfound_str,
+				  0, 0, S_IFDIR|0700, 0, NULL, NULL));
+	if (ret)
+		bch_err(c, "error creating lost+found: %i", ret);
+
+	/*
+	 * Propagate a failed create: returning 0 here would let the caller
+	 * go on to use a lost+found inode that was never written.
+	 */
+	return ret;
+}
|
|
+
|
|
+/*
+ * Link @inode into lost+found under a name equal to its decimal inode
+ * number, then point the inode's bi_dir/bi_dir_offset backpointer at the new
+ * dirent and write the inode back.  For directories, lost+found's bi_nlink
+ * is bumped first.  Returns 0 or a negative error code.
+ */
+static int reattach_inode(struct btree_trans *trans,
+			  struct bch_inode_unpacked *inode)
+{
+	struct bch_hash_info dir_hash;
+	struct bch_inode_unpacked lostfound;
+	/*
+	 * Sized for the largest u64 in decimal (20 digits) plus the NUL; a
+	 * 20 byte buffer makes snprintf() drop the last digit of inode
+	 * numbers >= 10^19.
+	 */
+	char name_buf[sizeof("18446744073709551615")];
+	struct qstr name;
+	u64 dir_offset = 0;
+	int ret;
+
+	ret = lookup_lostfound(trans, &lostfound);
+	if (ret)
+		return ret;
+
+	if (S_ISDIR(inode->bi_mode)) {
+		lostfound.bi_nlink++;
+
+		ret = write_inode(trans, &lostfound, U32_MAX);
+		if (ret)
+			return ret;
+	}
+
+	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
+
+	snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+	name = (struct qstr) QSTR(name_buf);
+
+	ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+		bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
+				   mode_to_type(inode->bi_mode),
+				   &name, inode->bi_inum, &dir_offset,
+				   BCH_HASH_SET_MUST_CREATE));
+	if (ret) {
+		bch_err(trans->c, "error %i reattaching inode %llu",
+			ret, inode->bi_inum);
+		return ret;
+	}
+
+	inode->bi_dir		= lostfound.bi_inum;
+	inode->bi_dir_offset	= dir_offset;
+
+	return write_inode(trans, inode, U32_MAX);
+}
|
|
+
|
|
+/*
+ * Remove the dirent that @inode's backpointer (bi_dir, bi_dir_offset) refers
+ * to.  Returns -ENOENT if that slot does not hold a dirent.
+ */
+static int remove_backpointer(struct btree_trans *trans,
+			      struct bch_inode_unpacked *inode)
+{
+	struct btree_iter *iter =
+		bch2_trans_get_iter(trans, BTREE_ID_dirents,
+				    POS(inode->bi_dir, inode->bi_dir_offset), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+	int ret = bkey_err(k);
+
+	if (!ret) {
+		if (k.k->type == KEY_TYPE_dirent)
+			ret = remove_dirent(trans, k.k->p);
+		else
+			ret = -ENOENT;
+	}
+
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/* State kept by walk_inode() while keys are visited in inode order: */
|
|
+struct inode_walker {
|
|
+ bool first_this_inode; /* true only on the call that switched to cur_inum */
|
|
+ bool have_inode; /* lookup of cur_inum succeeded (false on -ENOENT) */
|
|
+ u64 cur_inum; /* inode number of the last lookup; starts at -1 */
|
|
+ u32 snapshot; /* snapshot id reported by lookup_inode() */
|
|
+ struct bch_inode_unpacked inode; /* only meaningful when have_inode */
|
|
+};
|
|
+
|
|
+/* Fresh walker: no inode cached, cur_inum set to -1 so the first walk looks up. */
+static struct inode_walker inode_walker_init(void)
+{
+	struct inode_walker w = {
+		.have_inode	= false,
+		.cur_inum	= -1,
+	};
+
+	return w;
+}
|
|
+
|
|
+/*
+ * Make @w describe inode @inum.  The inode is only looked up when @inum
+ * differs from the cached w->cur_inum; a missing inode (-ENOENT) is recorded
+ * as have_inode == false rather than reported as an error.
+ */
+static int walk_inode(struct btree_trans *trans,
+		      struct inode_walker *w, u64 inum)
+{
+	int ret;
+
+	if (inum == w->cur_inum) {
+		w->first_this_inode = false;
+		return 0;
+	}
+
+	ret = lookup_inode(trans, inum, &w->inode, &w->snapshot);
+	if (ret && ret != -ENOENT)
+		return ret;
+
+	w->have_inode		= !ret;
+	w->cur_inum		= inum;
+	w->first_this_inode	= true;
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Re-insert hash table key @k: queue a deleted key at @k_iter's position and
+ * insert a copy of @k through bch2_hash_set().  Both keys are allocated with
+ * bch2_trans_kmalloc(); the caller is responsible for the commit.
+ */
+static int hash_redo_key(struct btree_trans *trans,
+			 const struct bch_hash_desc desc,
+			 struct bch_hash_info *hash_info,
+			 struct btree_iter *k_iter, struct bkey_s_c k)
+{
+	struct bkey_i *whiteout, *copy;
+
+	whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout));
+	if (IS_ERR(whiteout))
+		return PTR_ERR(whiteout);
+
+	copy = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+	if (IS_ERR(copy))
+		return PTR_ERR(copy);
+
+	bkey_reassemble(copy, k);
+
+	bkey_init(&whiteout->k);
+	whiteout->k.p = k_iter->pos;
+	bch2_trans_update(trans, k_iter, whiteout, 0);
+
+	return bch2_hash_set(trans, desc, hash_info,
+			     k_iter->pos.inode, copy, 0);
+}
|
|
+
|
|
+/*
+ * Delete the hash table key at @iter and commit.  When the delete or commit
+ * returns -EINTR the iterator is re-traversed and the operation repeated;
+ * any other result (including a failed traverse) is returned.
+ */
+static int fsck_hash_delete_at(struct btree_trans *trans,
+			       const struct bch_hash_desc desc,
+			       struct bch_hash_info *info,
+			       struct btree_iter *iter)
+{
+	int ret;
+
+	do {
+		ret = bch2_hash_delete_at(trans, desc, info, iter) ?:
+			bch2_trans_commit(trans, NULL, NULL,
+					  BTREE_INSERT_NOFAIL|
+					  BTREE_INSERT_LAZY_RW);
+		if (ret != -EINTR)
+			break;
+
+		ret = bch2_btree_iter_traverse(iter);
+	} while (!ret);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Check that hash table key @hash_k (positioned at @k_iter) lives where its
+ * hash says it should.
+ *
+ * Returns:
+ *   0       key is fine, is not of desc.key_type, or the error was ignored
+ *   1       @hash_k duplicated an earlier key and was deleted
+ *   -EINTR  the key was at the wrong offset and has been re-inserted
+ *   other   negative error code
+ */
+static int hash_check_key(struct btree_trans *trans,
+			  const struct bch_hash_desc desc,
+			  struct bch_hash_info *hash_info,
+			  struct btree_iter *k_iter, struct bkey_s_c hash_k)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter *iter = NULL;
+	char buf[200];
+	struct bkey_s_c k;
+	u64 hash;
+	int ret = 0;
+
+	if (hash_k.k->type != desc.key_type)
+		return 0;
+
+	hash = desc.hash_bkey(hash_info, hash_k);
+
+	if (likely(hash == hash_k.k->p.offset))
+		return 0;
+
+	/* a key can never sit before the slot it hashes to */
+	if (hash_k.k->p.offset < hash)
+		goto bad_hash;
+
+	/* walk the slots from the hashed position up to the key itself */
+	for_each_btree_key(trans, iter, desc.btree_id, POS(hash_k.k->p.inode, hash),
+			   BTREE_ITER_SLOTS, k, ret) {
+		if (!bkey_cmp(k.k->p, hash_k.k->p))
+			break;
+
+		if (fsck_err_on(k.k->type == desc.key_type &&
+				!desc.cmp_bkey(k, hash_k), c,
+				"duplicate hash table keys:\n%s",
+				(bch2_bkey_val_to_text(&PBUF(buf), c,
+						       hash_k), buf))) {
+			ret = fsck_hash_delete_at(trans, desc, hash_info, k_iter);
+			/*
+			 * Leave through the common exit either way so that
+			 * the iterator is freed on the error path too.
+			 */
+			if (!ret)
+				ret = 1;
+			break;
+		}
+
+		/* an empty slot between hash and key breaks the probe chain */
+		if (bkey_deleted(k.k)) {
+			bch2_trans_iter_free(trans, iter);
+			goto bad_hash;
+		}
+
+	}
+	bch2_trans_iter_free(trans, iter);
+	return ret;
+bad_hash:
+	if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
+		     "hashed to %llu\n%s",
+		     desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
+		     (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
+		return 0;
+
+	ret = __bch2_trans_do(trans, NULL, NULL,
+			      BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+		hash_redo_key(trans, desc, hash_info, k_iter, hash_k));
+	if (ret) {
+		bch_err(c, "hash_redo_key err %i", ret);
+		return ret;
+	}
+	return -EINTR;
+fsck_err:
+	return ret;
+}
|
|
+
|
|
+/*
+ * Check one inode key and clean up leftover state flags:
+ *  - BCH_INODE_UNLINKED:          delete the inode
+ *  - BCH_INODE_I_SIZE_DIRTY:      drop extents past i_size, force a recount
+ *  - BCH_INODE_I_SECTORS_DIRTY:   recount bi_sectors from the extents btree
+ *  - BCH_INODE_BACKPTR_UNTRUSTED: clear bi_dir/bi_dir_offset
+ * For the first three, a filesystem marked clean raises an fsck error first.
+ *
+ * Returns 0 or a negative error code.
+ */
+static int check_inode(struct btree_trans *trans,
+		       struct btree_iter *iter,
+		       struct bkey_s_c_inode inode)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked u;
+	bool do_update = false;
+	int ret = 0;
+
+	ret = bch2_inode_unpack(inode, &u);
+
+	if (bch2_fs_inconsistent_on(ret, c,
+			 "error unpacking inode %llu in fsck",
+			 inode.k->p.inode))
+		return ret;
+
+	if (u.bi_flags & BCH_INODE_UNLINKED &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu unlinked",
+		      u.bi_inum))) {
+		bch_verbose(c, "deleting inode %llu", u.bi_inum);
+
+		bch2_trans_unlock(trans);
+		bch2_fs_lazy_rw(c);
+
+		ret = bch2_inode_rm(c, u.bi_inum, false);
+		if (ret)
+			bch_err(c, "error in fsck: error %i while deleting inode", ret);
+		return ret;
+	}
+
+	if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty",
+		      u.bi_inum))) {
+		bch_verbose(c, "truncating inode %llu", u.bi_inum);
+
+		bch2_trans_unlock(trans);
+		bch2_fs_lazy_rw(c);
+
+		/*
+		 * XXX: need to truncate partial blocks too here - or ideally
+		 * just switch units to bytes and that issue goes away
+		 *
+		 * Extent offsets are in 512 byte sectors (check_extents()
+		 * compares them against this same expression >> 9), so the
+		 * byte size must be converted before it is used as a btree
+		 * position - otherwise the range starts far past i_size.
+		 */
+		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
+				POS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9),
+				POS(u.bi_inum, U64_MAX),
+				NULL);
+		if (ret) {
+			bch_err(c, "error in fsck: error %i truncating inode", ret);
+			return ret;
+		}
+
+		/*
+		 * We truncated without our normal sector accounting hook, just
+		 * make sure we recalculate it:
+		 */
+		u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY;
+
+		u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY;
+		do_update = true;
+	}
+
+	if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY &&
+	    (!c->sb.clean ||
+	     fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty",
+		      u.bi_inum))) {
+		s64 sectors;
+
+		bch_verbose(c, "recounting sectors for inode %llu",
+			    u.bi_inum);
+
+		sectors = bch2_count_inode_sectors(trans, u.bi_inum);
+		if (sectors < 0) {
+			bch_err(c, "error in fsck: error %i recounting inode sectors",
+				(int) sectors);
+			return sectors;
+		}
+
+		u.bi_sectors = sectors;
+		u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY;
+		do_update = true;
+	}
+
+	if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) {
+		u.bi_dir = 0;
+		u.bi_dir_offset = 0;
+		u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED;
+		do_update = true;
+	}
+
+	if (do_update) {
+		ret = __bch2_trans_do(trans, NULL, NULL,
+				      BTREE_INSERT_NOFAIL|
+				      BTREE_INSERT_LAZY_RW,
+				bch2_inode_write(trans, iter, &u));
+		if (ret)
+			bch_err(c, "error in fsck: error %i "
+				"updating inode", ret);
+	}
+fsck_err:
+	return ret;
+}
|
|
+
|
|
+/*
+ * Walk the inodes btree and run check_inode() on every inode key when @full
+ * is set, otherwise only on inodes flagged I_SIZE_DIRTY, I_SECTORS_DIRTY or
+ * UNLINKED.  Stops at the first error.
+ */
+noinline_for_stack
+static int check_inodes(struct bch_fs *c, bool full)
+{
+	const unsigned dirty_flags = BCH_INODE_I_SIZE_DIRTY|
+				     BCH_INODE_I_SECTORS_DIRTY|
+				     BCH_INODE_UNLINKED;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_s_c_inode inode;
+	int ret;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
+		if (k.k->type != KEY_TYPE_inode)
+			continue;
+
+		inode = bkey_s_c_to_inode(k);
+
+		if (!full && !(inode.v->bi_flags & dirty_flags))
+			continue;
+
+		ret = check_inode(&trans, iter, inode);
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	BUG_ON(ret == -EINTR);
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
|
|
+
|
|
+/*
+ * Repair an extent that overlaps its predecessor: write back a copy of @k
+ * with everything before @cut_at trimmed off (bch2_cut_front()), then commit.
+ */
+static int fix_overlapping_extent(struct btree_trans *trans,
+				  struct bkey_s_c k, struct bpos cut_at)
+{
+	struct bkey_i *trimmed = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+	struct btree_iter *iter;
+
+	if (IS_ERR(trimmed))
+		return PTR_ERR(trimmed);
+
+	bkey_reassemble(trimmed, k);
+	bch2_cut_front(cut_at, trimmed);
+
+	/*
+	 * We don't want to go through the extent_handle_overwrites path:
+	 *
+	 * XXX: this is going to screw up disk accounting, extent triggers
+	 * assume things about extent overwrites - we should be running the
+	 * triggers manually here
+	 */
+	iter = bch2_trans_get_iter(trans, BTREE_ID_extents, trimmed->k.p,
+				   BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS);
+
+	BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+	bch2_trans_update(trans, iter, trimmed, BTREE_TRIGGER_NORUN);
+	bch2_trans_iter_put(trans, iter);
+
+	return bch2_trans_commit(trans, NULL, NULL,
+				 BTREE_INSERT_NOFAIL|
+				 BTREE_INSERT_LAZY_RW);
+}
|
|
+
|
|
+/*
+ * Does the dirent slot named by @inode's backpointer hold a dirent that
+ * points back at @inode?  Returns 1 if so, 0 if not (including when the slot
+ * holds no dirent), or a negative error from the btree lookup.
+ */
+static int inode_backpointer_exists(struct btree_trans *trans,
+				    struct bch_inode_unpacked *inode)
+{
+	struct btree_iter *iter =
+		bch2_trans_get_iter(trans, BTREE_ID_dirents,
+				    POS(inode->bi_dir, inode->bi_dir_offset), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
+	int ret = bkey_err(k);
+
+	if (!ret && k.k->type == KEY_TYPE_dirent)
+		ret = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum) == inode->bi_inum;
+
+	bch2_trans_iter_free(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/* True if dirent @d sits exactly at @inode's (bi_dir, bi_dir_offset). */
+static bool inode_backpointer_matches(struct bkey_s_c_dirent d,
+				      struct bch_inode_unpacked *inode)
+{
+	if (d.k->p.inode != inode->bi_dir)
+		return false;
+
+	return d.k->p.offset == inode->bi_dir_offset;
+}
|
|
+
|
|
+/*
+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and
+ * that i_size and i_sectors are consistent
+ *
+ * Repairs that change the extents btree (trimming an overlapping extent,
+ * deleting a range) end the walk: the result of the repair is returned, or
+ * -EINTR if the repair itself succeeded.  Those paths leave through the
+ * common exit so that the iterator, the bkey_buf and the transaction are
+ * released.
+ */
+noinline_for_stack
+static int check_extents(struct bch_fs *c)
+{
+	struct inode_walker w = inode_walker_init();
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	struct bkey_buf prev;
+	u64 i_sectors = 0;
+	int ret = 0;
+
+	bch2_bkey_buf_init(&prev);
+	prev.k->k = KEY(0, 0, 0);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	bch_verbose(c, "checking extents");
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
+				   POS(BCACHEFS_ROOT_INO, 0),
+				   BTREE_ITER_INTENT);
+retry:
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k))) {
+		/* moved on to a new inode: settle the previous one's i_sectors */
+		if (w.have_inode &&
+		    w.cur_inum != k.k->p.inode &&
+		    !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) &&
+		    fsck_err_on(w.inode.bi_sectors != i_sectors, c,
+				"inode %llu has incorrect i_sectors: got %llu, should be %llu",
+				w.inode.bi_inum,
+				w.inode.bi_sectors, i_sectors)) {
+			w.inode.bi_sectors = i_sectors;
+
+			ret = write_inode(&trans, &w.inode, w.snapshot);
+			if (ret)
+				break;
+		}
+
+		if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) {
+			char buf1[200];
+			char buf2[200];
+
+			bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k));
+			bch2_bkey_val_to_text(&PBUF(buf2), c, k);
+
+			if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) {
+				ret = fix_overlapping_extent(&trans, k, prev.k->k.p) ?: -EINTR;
+				goto out;
+			}
+		}
+
+		ret = walk_inode(&trans, &w, k.k->p.inode);
+		if (ret)
+			break;
+
+		if (w.first_this_inode)
+			i_sectors = 0;
+
+		if (fsck_err_on(!w.have_inode, c,
+				"extent type %u for missing inode %llu",
+				k.k->type, k.k->p.inode) ||
+		    fsck_err_on(w.have_inode &&
+				!S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c,
+				"extent type %u for non regular file, inode %llu mode %o",
+				k.k->type, k.k->p.inode, w.inode.bi_mode)) {
+			bch2_fs_lazy_rw(c);
+			ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+						       POS(k.k->p.inode, 0),
+						       POS(k.k->p.inode, U64_MAX),
+						       NULL) ?: -EINTR;
+			goto out;
+		}
+
+		if (fsck_err_on(w.have_inode &&
+				!(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+				k.k->type != KEY_TYPE_reservation &&
+				k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c,
+				"extent type %u offset %llu past end of inode %llu, i_size %llu",
+				k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) {
+			bch2_fs_lazy_rw(c);
+			/*
+			 * Start the range at the same sector offset the check
+			 * above compares against; the byte size is not a
+			 * valid extent position.
+			 */
+			ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
+					POS(k.k->p.inode, round_up(w.inode.bi_size, block_bytes(c)) >> 9),
+					POS(k.k->p.inode, U64_MAX),
+					NULL) ?: -EINTR;
+			goto out;
+		}
+
+		if (bkey_extent_is_allocation(k.k))
+			i_sectors += k.k->size;
+		bch2_bkey_buf_reassemble(&prev, c, k);
+
+		bch2_btree_iter_advance(iter);
+	}
+fsck_err:
+	if (ret == -EINTR)
+		goto retry;
+out:
+	bch2_trans_iter_put(&trans, iter);
+	bch2_bkey_buf_exit(&prev, c);
+	return bch2_trans_exit(&trans) ?: ret;
+}
|
|
+
|
|
+/*
|
|
+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
|
|
+ * validate d_type
|
|
+ *
|
|
+ * Per dirent this also runs hash_check_key(), removes dirents whose target
|
|
+ * inode is missing, fills in or repairs the target's bi_dir/bi_dir_offset
|
|
+ * backpointer, and counts DT_DIR entries so that a directory's bi_nlink can
|
|
+ * be rewritten once the walk reaches a dirent of a different directory.
|
|
+ */
|
|
+noinline_for_stack
|
|
+static int check_dirents(struct bch_fs *c)
|
|
+{
|
|
+ struct inode_walker w = inode_walker_init();
|
|
+ struct bch_hash_info hash_info;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ char buf[200];
|
|
+ unsigned nr_subdirs = 0;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch_verbose(c, "checking dirents");
|
|
+
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
|
|
+ POS(BCACHEFS_ROOT_INO, 0), 0);
|
|
+retry:
|
|
+ while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ !(ret = bkey_err(k))) {
|
|
+ struct bkey_s_c_dirent d;
|
|
+ struct bch_inode_unpacked target;
|
|
+ u32 target_snapshot;
|
|
+ bool have_target;
|
|
+ bool backpointer_exists = true;
|
|
+ u64 d_inum;
|
|
+
|
|
+ /* w still describes the previous directory here */
|
|
+ if (w.have_inode &&
|
|
+ w.cur_inum != k.k->p.inode &&
|
|
+ fsck_err_on(w.inode.bi_nlink != nr_subdirs, c,
|
|
+ "directory %llu with wrong i_nlink: got %u, should be %u",
|
|
+ w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) {
|
|
+ w.inode.bi_nlink = nr_subdirs;
|
|
+ ret = write_inode(&trans, &w.inode, w.snapshot);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = walk_inode(&trans, &w, k.k->p.inode);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ if (w.first_this_inode)
|
|
+ nr_subdirs = 0;
|
|
+
|
|
+ if (fsck_err_on(!w.have_inode, c,
|
|
+ "dirent in nonexisting directory:\n%s",
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf), c,
|
|
+ k), buf)) ||
|
|
+ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
|
|
+ "dirent in non directory inode type %u:\n%s",
|
|
+ mode_to_type(w.inode.bi_mode),
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf), c,
|
|
+ k), buf))) {
|
|
+ ret = lockrestart_do(&trans,
|
|
+ bch2_btree_delete_at(&trans, iter, 0));
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ goto next;
|
|
+ }
|
|
+
|
|
+ if (!w.have_inode)
|
|
+ goto next;
|
|
+
|
|
+ if (w.first_this_inode)
|
|
+ hash_info = bch2_hash_info_init(c, &w.inode);
|
|
+
|
|
+ ret = hash_check_key(&trans, bch2_dirent_hash_desc,
|
|
+ &hash_info, iter, k);
|
|
+ if (ret > 0) { /* hash_check_key() deleted a duplicate key */
|
|
+ ret = 0;
|
|
+ goto next;
|
|
+ }
|
|
+ if (ret)
|
|
+ goto fsck_err;
|
|
+
|
|
+ if (k.k->type != KEY_TYPE_dirent)
|
|
+ goto next;
|
|
+
|
|
+ d = bkey_s_c_to_dirent(k);
|
|
+ d_inum = le64_to_cpu(d.v->d_inum);
|
|
+
|
|
+ ret = lookup_inode(&trans, d_inum, &target, &target_snapshot);
|
|
+ if (ret && ret != -ENOENT)
|
|
+ break;
|
|
+
|
|
+ have_target = !ret;
|
|
+ ret = 0;
|
|
+
|
|
+ if (fsck_err_on(!have_target, c,
|
|
+ "dirent points to missing inode:\n%s",
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf), c,
|
|
+ k), buf))) {
|
|
+ ret = remove_dirent(&trans, d.k->p);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ goto next;
|
|
+ }
|
|
+
|
|
+ if (!have_target)
|
|
+ goto next;
|
|
+
|
|
+ /* no backpointer recorded yet: point it at this dirent */
|
|
+ if (!target.bi_dir &&
|
|
+ !target.bi_dir_offset) {
|
|
+ target.bi_dir = k.k->p.inode;
|
|
+ target.bi_dir_offset = k.k->p.offset;
|
|
+
|
|
+ ret = write_inode(&trans, &target, target_snapshot);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (!inode_backpointer_matches(d, &target)) {
|
|
+ ret = inode_backpointer_exists(&trans, &target);
|
|
+ if (ret < 0)
|
|
+ goto err;
|
|
+
|
|
+ backpointer_exists = ret;
|
|
+ ret = 0;
|
|
+
|
|
+ if (fsck_err_on(S_ISDIR(target.bi_mode) &&
|
|
+ backpointer_exists, c,
|
|
+ "directory %llu with multiple links",
|
|
+ target.bi_inum)) {
|
|
+ ret = remove_dirent(&trans, d.k->p);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ continue; /* back to the peek without advancing the iterator */
|
|
+ }
|
|
+
|
|
+ if (fsck_err_on(backpointer_exists &&
|
|
+ !target.bi_nlink, c,
|
|
+ "inode %llu has multiple links but i_nlink 0",
|
|
+ d_inum)) {
|
|
+ target.bi_nlink++;
|
|
+ target.bi_flags &= ~BCH_INODE_UNLINKED;
|
|
+
|
|
+ ret = write_inode(&trans, &target, target_snapshot);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (fsck_err_on(!backpointer_exists, c,
|
|
+ "inode %llu has wrong backpointer:\n"
|
|
+ "got %llu:%llu\n"
|
|
+ "should be %llu:%llu",
|
|
+ d_inum,
|
|
+ target.bi_dir,
|
|
+ target.bi_dir_offset,
|
|
+ k.k->p.inode,
|
|
+ k.k->p.offset)) {
|
|
+ target.bi_dir = k.k->p.inode;
|
|
+ target.bi_dir_offset = k.k->p.offset;
|
|
+
|
|
+ ret = write_inode(&trans, &target, target_snapshot);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
|
|
+ "incorrect d_type: should be %u:\n%s",
|
|
+ mode_to_type(target.bi_mode),
|
|
+ (bch2_bkey_val_to_text(&PBUF(buf), c,
|
|
+ k), buf))) {
|
|
+ struct bkey_i_dirent *n;
|
|
+
|
|
+ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
|
|
+ if (!n) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bkey_reassemble(&n->k_i, d.s_c);
|
|
+ n->v.d_type = mode_to_type(target.bi_mode);
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL,
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_LAZY_RW,
|
|
+ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0));
|
|
+ kfree(n);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ }
|
|
+
|
|
+ nr_subdirs += d.v->d_type == DT_DIR;
|
|
+next:
|
|
+ bch2_btree_iter_advance(iter);
|
|
+ }
|
|
+err:
|
|
+fsck_err:
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ return bch2_trans_exit(&trans) ?: ret;
|
|
+}
|
|
+
|
|
+/*
+ * Walk xattrs: verify that they all have a corresponding inode
+ *
+ * Each xattr of an existing inode is also run through hash_check_key().
+ * Returns 0 or a negative error code.
+ */
+noinline_for_stack
+static int check_xattrs(struct bch_fs *c)
+{
+	struct inode_walker w = inode_walker_init();
+	struct bch_hash_info hash_info;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	bch_verbose(c, "checking xattrs");
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs,
+				   POS(BCACHEFS_ROOT_INO, 0), 0);
+retry:
+	while ((k = bch2_btree_iter_peek(iter)).k &&
+	       !(ret = bkey_err(k))) {
+		ret = walk_inode(&trans, &w, k.k->p.inode);
+		if (ret)
+			break;
+
+		if (fsck_err_on(!w.have_inode, c,
+				"xattr for missing inode %llu",
+				k.k->p.inode)) {
+			ret = bch2_btree_delete_at(&trans, iter, 0);
+			if (ret)
+				break;
+			continue;
+		}
+
+		/*
+		 * The error was not fixed: without an inode there is no hash
+		 * info to check against (check_dirents() skips the same way).
+		 */
+		if (!w.have_inode)
+			goto next;
+
+		if (w.first_this_inode)
+			hash_info = bch2_hash_info_init(c, &w.inode);
+
+		ret = hash_check_key(&trans, bch2_xattr_hash_desc,
+				     &hash_info, iter, k);
+		/*
+		 * A positive return means a duplicate key was deleted; that
+		 * is a completed repair, not a reason to stop or to hand a
+		 * bogus positive value back to the caller.
+		 */
+		if (ret > 0)
+			ret = 0;
+		if (ret)
+			break;
+next:
+		bch2_btree_iter_advance(iter);
+	}
+fsck_err:
+	if (ret == -EINTR)
+		goto retry;
+
+	bch2_trans_iter_put(&trans, iter);
+	return bch2_trans_exit(&trans) ?: ret;
+}
|
|
+
|
|
+/* Get root directory, create if it doesn't exist: */
|
|
+static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode)
|
|
+{
|
|
+ struct bkey_inode_buf packed;
|
|
+ u32 snapshot;
|
|
+ int ret;
|
|
+
|
|
+ bch_verbose(c, "checking root directory");
|
|
+
|
|
+ ret = bch2_trans_do(c, NULL, NULL, 0,
|
|
+ lookup_inode(&trans, BCACHEFS_ROOT_INO, root_inode, &snapshot));
|
|
+ if (ret && ret != -ENOENT)
|
|
+ return ret;
|
|
+
|
|
+ if (fsck_err_on(ret, c, "root directory missing"))
|
|
+ goto create_root;
|
|
+
|
|
+ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c,
|
|
+ "root inode not a directory"))
|
|
+ goto create_root;
|
|
+
|
|
+ return 0;
|
|
+fsck_err:
|
|
+ return ret;
|
|
+create_root:
|
|
+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755,
|
|
+ 0, NULL);
|
|
+ root_inode->bi_inum = BCACHEFS_ROOT_INO;
|
|
+
|
|
+ bch2_inode_pack(c, &packed, root_inode);
|
|
+
|
|
+ return bch2_btree_insert(c, BTREE_ID_inodes, &packed.inode.k_i,
|
|
+ NULL, NULL,
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_LAZY_RW);
|
|
+}
|
|
+
|
|
+struct pathbuf {
|
|
+ size_t nr;
|
|
+ size_t size;
|
|
+
|
|
+ struct pathbuf_entry {
|
|
+ u64 inum;
|
|
+ } *entries;
|
|
+};
|
|
+
|
|
+/*
+ * Push @inum onto the path stack @p.
+ *
+ * When the array is full it is reallocated to twice its size (at least 256
+ * entries). Returns 0 on success, -ENOMEM if the reallocation fails; in that
+ * case @p is left unchanged.
+ */
+static int path_down(struct pathbuf *p, u64 inum)
+{
+	struct pathbuf_entry *slot;
+
+	if (p->nr == p->size) {
+		size_t grown = max_t(size_t, 256UL, p->size * 2);
+		struct pathbuf_entry *resized =
+			krealloc(p->entries, grown * sizeof(*resized),
+				 GFP_KERNEL);
+
+		if (!resized)
+			return -ENOMEM;
+
+		p->entries	= resized;
+		p->size		= grown;
+	}
+
+	slot = &p->entries[p->nr++];
+	*slot = (struct pathbuf_entry) { .inum = inum };
+	return 0;
+}
|
|
+
|
|
+/*
+ * Walk from @inode up towards the root via bi_dir, checking that it is
+ * reachable and that the chain of parents contains no loop.
+ *
+ * @p is scratch space recording the directories visited so far; it is reset on
+ * entry. @inode is overwritten with each parent as the walk proceeds.
+ *
+ * - An inode without a backpointer is reported as unreachable and, if the
+ *   repair is accepted, reattached.
+ * - If a directory's parent is already on the path, that is a loop: the
+ *   backpointer is removed and the inode reattached.
+ *
+ * Returns 0 on success (including a declined loop repair), or a nonzero error
+ * code.
+ */
+static int check_path(struct btree_trans *trans,
+		      struct pathbuf *p,
+		      struct bch_inode_unpacked *inode)
+{
+	struct bch_fs *c = trans->c;
+	u32 snapshot;
+	size_t i;
+	int ret = 0;
+
+	p->nr = 0;
+
+	while (inode->bi_inum != BCACHEFS_ROOT_INO) {
+		/* < 0: error, 0: no backpointer, > 0: backpointer exists */
+		ret = lockrestart_do(trans,
+			inode_backpointer_exists(trans, inode));
+		if (ret < 0)
+			break;
+
+		if (!ret) {
+			if (fsck_err(c, "unreachable inode %llu, type %u nlink %u backptr %llu:%llu",
+				     inode->bi_inum,
+				     mode_to_type(inode->bi_mode),
+				     inode->bi_nlink,
+				     inode->bi_dir,
+				     inode->bi_dir_offset))
+				ret = reattach_inode(trans, inode);
+			break;
+		}
+		ret = 0;
+
+		/* Only directories are recorded on the path and walked up from: */
+		if (!S_ISDIR(inode->bi_mode))
+			break;
+
+		ret = path_down(p, inode->bi_inum);
+		if (ret) {
+			bch_err(c, "memory allocation failure");
+			return ret;
+		}
+
+		for (i = 0; i < p->nr; i++) {
+			if (inode->bi_dir != p->entries[i].inum)
+				continue;
+
+			/* XXX print path */
+			if (!fsck_err(c, "directory structure loop"))
+				return 0;
+
+			ret = lockrestart_do(trans,
+					 remove_backpointer(trans, inode));
+			if (ret) {
+				bch_err(c, "error removing dirent: %i", ret);
+				break;
+			}
+
+			ret = reattach_inode(trans, inode);
+			break;
+		}
+
+		/*
+		 * The breaks above only leave the for loop: stop here on
+		 * failure, otherwise the lookup below would overwrite the
+		 * error and the walk would carry on as if nothing happened.
+		 */
+		if (ret)
+			break;
+
+		ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+		if (ret) {
+			/* Should have been caught in dirents pass */
+			bch_err(c, "error looking up parent directory: %i", ret);
+			break;
+		}
+	}
+fsck_err:
+	if (ret)
+		bch_err(c, "%s: err %i", __func__, ret);
+	return ret;
+}
|
|
+
|
|
+/*
|
|
+ * Check for unreachable inodes, as well as loops in the directory structure:
|
|
+ * After check_dirents(), if an inode backpointer doesn't exist that means it's
|
|
+ * unreachable:
|
|
+ */
|
|
+static int check_directory_structure(struct bch_fs *c)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bch_inode_unpacked u;
|
|
+ struct pathbuf path = { 0, 0, NULL };
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, 0, k, ret) {
|
|
+ if (k.k->type != KEY_TYPE_inode)
|
|
+ continue;
|
|
+
|
|
+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
|
|
+ if (ret) {
|
|
+ /* Should have been caught earlier in fsck: */
|
|
+ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = check_path(&trans, &path, &u);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ BUG_ON(ret == -EINTR);
|
|
+
|
|
+ kfree(path.entries);
|
|
+
|
|
+ return bch2_trans_exit(&trans) ?: ret;
|
|
+}
|
|
+
|
|
+struct nlink_table {
|
|
+ size_t nr;
|
|
+ size_t size;
|
|
+
|
|
+ struct nlink {
|
|
+ u64 inum;
|
|
+ u32 snapshot;
|
|
+ u32 count;
|
|
+ } *d;
|
|
+};
|
|
+
|
|
+/*
+ * Append an entry for (@inum, @snapshot) to @t with a link count of zero.
+ *
+ * When the table is full it is replaced by a new allocation of twice the size
+ * (at least 128 entries) and the old contents are copied over.
+ *
+ * Returns 0 on success, -ENOMEM if the allocation fails; in that case @t is
+ * left unchanged.
+ */
+static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot)
+{
+	if (t->nr == t->size) {
+		size_t new_size = max_t(size_t, 128UL, t->size * 2);
+		void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
+
+		if (!d)
+			return -ENOMEM;
+
+		/*
+		 * On the first call t->d is NULL; passing a NULL source to
+		 * memcpy() is undefined even for a zero length.
+		 */
+		if (t->d)
+			memcpy(d, t->d, t->size * sizeof(t->d[0]));
+		kvfree(t->d);
+
+		t->d = d;
+		t->size = new_size;
+	}
+
+	/* Fields not named here (count) are zero-initialized: */
+	t->d[t->nr++] = (struct nlink) {
+		.inum		= inum,
+		.snapshot	= snapshot,
+	};
+
+	return 0;
+}
|
|
+
|
|
+/* Comparison callback for struct nlink: order by inum, then by snapshot. */
+static int nlink_cmp(const void *_l, const void *_r)
+{
+	const struct nlink *l = _l, *r = _r;
+	int by_inum = cmp_int(l->inum, r->inum);
+
+	if (by_inum)
+		return by_inum;
+
+	return cmp_int(l->snapshot, r->snapshot);
+}
|
|
+
|
|
+/*
+ * Count one link against @inum, provided it lies in [@range_start, @range_end)
+ * and has an entry in @links; anything else is silently ignored.
+ *
+ * The table is searched with nlink_cmp() using a key whose snapshot is U32_MAX.
+ */
+static void inc_link(struct bch_fs *c, struct nlink_table *links,
+		     u64 range_start, u64 range_end, u64 inum)
+{
+	struct nlink search = { .inum = inum, .snapshot = U32_MAX };
+	struct nlink *found;
+
+	if (inum >= range_start && inum < range_end) {
+		found = __inline_bsearch(&search, links->d, links->nr,
+					 sizeof(links->d[0]), nlink_cmp);
+		if (found)
+			found->count++;
+	}
+}
|
|
+
|
|
+noinline_for_stack
|
|
+static int check_nlinks_find_hardlinks(struct bch_fs *c,
|
|
+ struct nlink_table *t,
|
|
+ u64 start, u64 *end)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_s_c_inode inode;
|
|
+ struct bch_inode_unpacked u;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_inodes,
|
|
+ POS(0, start), 0, k, ret) {
|
|
+ if (k.k->type != KEY_TYPE_inode)
|
|
+ continue;
|
|
+
|
|
+ inode = bkey_s_c_to_inode(k);
|
|
+
|
|
+ /*
|
|
+ * Backpointer and directory structure checks are sufficient for
|
|
+ * directories, since they can't have hardlinks:
|
|
+ */
|
|
+ if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
|
|
+ continue;
|
|
+
|
|
+ /* Should never fail, checked by bch2_inode_invalid: */
|
|
+ BUG_ON(bch2_inode_unpack(inode, &u));
|
|
+
|
|
+ if (!u.bi_nlink)
|
|
+ continue;
|
|
+
|
|
+ ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot);
|
|
+ if (ret) {
|
|
+ *end = k.k->p.offset;
|
|
+ ret = 0;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ if (ret)
|
|
+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
+ * Walk the whole dirents btree and, for every dirent that does not refer to a
+ * directory, count a link against its target inode via inc_link().
+ *
+ * Returns 0 on success, or the btree/transaction error (which is also logged).
+ */
+noinline_for_stack
+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
+				     u64 range_start, u64 range_end)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, 0, k, ret) {
+		if (k.k->type == KEY_TYPE_dirent) {
+			struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k);
+
+			if (dirent.v->d_type != DT_DIR)
+				inc_link(c, links, range_start, range_end,
+					 le64_to_cpu(dirent.v->d_inum));
+		}
+
+		bch2_trans_cond_resched(&trans);
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	/* An error from tearing down the transaction takes precedence: */
+	ret = bch2_trans_exit(&trans) ?: ret;
+	if (!ret)
+		return 0;
+
+	bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+	return ret;
+}
|
|
+
|
|
+noinline_for_stack
|
|
+static int check_nlinks_update_hardlinks(struct bch_fs *c,
|
|
+ struct nlink_table *links,
|
|
+ u64 range_start, u64 range_end)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_s_c_inode inode;
|
|
+ struct bch_inode_unpacked u;
|
|
+ struct nlink *link = links->d;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_inodes,
|
|
+ POS(0, range_start), 0, k, ret) {
|
|
+ if (k.k->p.offset >= range_end)
|
|
+ break;
|
|
+
|
|
+ if (k.k->type != KEY_TYPE_inode)
|
|
+ continue;
|
|
+
|
|
+ inode = bkey_s_c_to_inode(k);
|
|
+ if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
|
|
+ continue;
|
|
+
|
|
+ BUG_ON(bch2_inode_unpack(inode, &u));
|
|
+
|
|
+ if (!u.bi_nlink)
|
|
+ continue;
|
|
+
|
|
+ while (link->inum < k.k->p.offset) {
|
|
+ link++;
|
|
+ BUG_ON(link >= links->d + links->nr);
|
|
+ }
|
|
+
|
|
+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
|
|
+ "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
|
|
+ u.bi_inum, mode_to_type(u.bi_mode),
|
|
+ bch2_inode_nlink_get(&u), link->count)) {
|
|
+ bch2_inode_nlink_set(&u, link->count);
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL,
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_LAZY_RW,
|
|
+ bch2_inode_write(&trans, iter, &u));
|
|
+ if (ret)
|
|
+ bch_err(c, "error in fsck: error %i updating inode", ret);
|
|
+ }
|
|
+ }
|
|
+fsck_err:
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ if (ret)
|
|
+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
+ * Verify i_nlink of inodes against the number of dirents pointing at them.
+ *
+ * The work is done in windows of the inode number space, three passes each:
+ *  1. check_nlinks_find_hardlinks() fills the table with inodes starting at
+ *     this_iter_range_start; if it stops early it stores the inode number it
+ *     stopped at in next_iter_range_start, which becomes the end of this
+ *     window and the start of the next one.
+ *  2. check_nlinks_walk_dirents() counts links to inodes in the window.
+ *  3. check_nlinks_update_hardlinks() compares and repairs the counts.
+ *
+ * next_iter_range_start staying at U64_MAX means the first pass reached the
+ * end of the inodes btree, so this was the last window.
+ *
+ * Returns 0 on success, or the first error from any pass.
+ */
+noinline_for_stack
+static int check_nlinks(struct bch_fs *c)
+{
+	struct nlink_table links = { 0 };
+	u64 this_iter_range_start, next_iter_range_start = 0;
+	int ret = 0;
+
+	bch_verbose(c, "checking inode nlinks");
+
+	do {
+		this_iter_range_start = next_iter_range_start;
+		next_iter_range_start = U64_MAX;
+
+		ret = check_nlinks_find_hardlinks(c, &links,
+						  this_iter_range_start,
+						  &next_iter_range_start);
+		/* Don't let the next pass overwrite an error from this one: */
+		if (ret)
+			break;
+
+		ret = check_nlinks_walk_dirents(c, &links,
+						this_iter_range_start,
+						next_iter_range_start);
+		if (ret)
+			break;
+
+		ret = check_nlinks_update_hardlinks(c, &links,
+						    this_iter_range_start,
+						    next_iter_range_start);
+		if (ret)
+			break;
+
+		/* Reuse the allocation for the next window: */
+		links.nr = 0;
+	} while (next_iter_range_start != U64_MAX);
+
+	kvfree(links.d);
+
+	return ret;
+}
|
|
+
|
|
+/*
|
|
+ * Checks for inconsistencies that shouldn't happen, unless we have a bug.
|
|
+ * Doesn't fix them yet, mainly because they haven't yet been observed:
|
|
+ */
|
|
+int bch2_fsck_full(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_inode_unpacked root_inode;
|
|
+
|
|
+ return check_inodes(c, true) ?:
|
|
+ check_extents(c) ?:
|
|
+ check_dirents(c) ?:
|
|
+ check_xattrs(c) ?:
|
|
+ check_root(c, &root_inode) ?:
|
|
+ check_directory_structure(c) ?:
|
|
+ check_nlinks(c);
|
|
+}
|
|
+
|
|
+int bch2_fsck_walk_inodes_only(struct bch_fs *c)
|
|
+{
|
|
+ return check_inodes(c, false);
|
|
+}
|
|
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
|
|
new file mode 100644
|
|
index 000000000000..264f2706b12d
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/fsck.h
|
|
@@ -0,0 +1,8 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_FSCK_H
|
|
+#define _BCACHEFS_FSCK_H
|
|
+
|
|
+int bch2_fsck_full(struct bch_fs *);
|
|
+int bch2_fsck_walk_inodes_only(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_FSCK_H */
|
|
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
|
|
new file mode 100644
|
|
index 000000000000..dfde5ba3f1b7
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/inode.c
|
|
@@ -0,0 +1,657 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_update.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "inode.h"
|
|
+#include "str_hash.h"
|
|
+#include "varint.h"
|
|
+
|
|
+#include <linux/random.h>
|
|
+
|
|
+#include <asm/unaligned.h>
|
|
+
|
|
+const char * const bch2_inode_opts[] = {
|
|
+#define x(name, ...) #name,
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+ NULL,
|
|
+};
|
|
+
|
|
+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
|
|
+static const u8 bits_table[8] = {
|
|
+ 1 * 8 - 1,
|
|
+ 2 * 8 - 2,
|
|
+ 3 * 8 - 3,
|
|
+ 4 * 8 - 4,
|
|
+ 6 * 8 - 5,
|
|
+ 8 * 8 - 6,
|
|
+ 10 * 8 - 7,
|
|
+ 13 * 8 - 8,
|
|
+};
|
|
+
|
|
+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo)
|
|
+{
|
|
+ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), };
|
|
+ unsigned shift, bytes, bits = likely(!hi)
|
|
+ ? fls64(lo)
|
|
+ : fls64(hi) + 64;
|
|
+
|
|
+ for (shift = 1; shift <= 8; shift++)
|
|
+ if (bits < bits_table[shift - 1])
|
|
+ goto got_shift;
|
|
+
|
|
+ BUG();
|
|
+got_shift:
|
|
+ bytes = byte_table[shift - 1];
|
|
+
|
|
+ BUG_ON(out + bytes > end);
|
|
+
|
|
+ memcpy(out, (u8 *) in + 16 - bytes, bytes);
|
|
+ *out |= (1 << 8) >> shift;
|
|
+
|
|
+ return bytes;
|
|
+}
|
|
+
|
|
+static int inode_decode_field(const u8 *in, const u8 *end,
|
|
+ u64 out[2], unsigned *out_bits)
|
|
+{
|
|
+ __be64 be[2] = { 0, 0 };
|
|
+ unsigned bytes, shift;
|
|
+ u8 *p;
|
|
+
|
|
+ if (in >= end)
|
|
+ return -1;
|
|
+
|
|
+ if (!*in)
|
|
+ return -1;
|
|
+
|
|
+ /*
|
|
+ * position of highest set bit indicates number of bytes:
|
|
+ * shift = number of bits to remove in high byte:
|
|
+ */
|
|
+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */
|
|
+ bytes = byte_table[shift - 1];
|
|
+
|
|
+ if (in + bytes > end)
|
|
+ return -1;
|
|
+
|
|
+ p = (u8 *) be + 16 - bytes;
|
|
+ memcpy(p, in, bytes);
|
|
+ *p ^= (1 << 8) >> shift;
|
|
+
|
|
+ out[0] = be64_to_cpu(be[0]);
|
|
+ out[1] = be64_to_cpu(be[1]);
|
|
+ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]);
|
|
+
|
|
+ return bytes;
|
|
+}
|
|
+
|
|
+static noinline void bch2_inode_pack_v1(struct bkey_inode_buf *packed,
|
|
+ const struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct bkey_i_inode *k = &packed->inode;
|
|
+ u8 *out = k->v.fields;
|
|
+ u8 *end = (void *) &packed[1];
|
|
+ u8 *last_nonzero_field = out;
|
|
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
|
|
+ unsigned bytes;
|
|
+
|
|
+#define x(_name, _bits) \
|
|
+ out += inode_encode_field(out, end, 0, inode->_name); \
|
|
+ nr_fields++; \
|
|
+ \
|
|
+ if (inode->_name) { \
|
|
+ last_nonzero_field = out; \
|
|
+ last_nonzero_fieldnr = nr_fields; \
|
|
+ }
|
|
+
|
|
+ BCH_INODE_FIELDS()
|
|
+#undef x
|
|
+
|
|
+ out = last_nonzero_field;
|
|
+ nr_fields = last_nonzero_fieldnr;
|
|
+
|
|
+ bytes = out - (u8 *) &packed->inode.v;
|
|
+ set_bkey_val_bytes(&packed->inode.k, bytes);
|
|
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
|
|
+
|
|
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
|
|
+}
|
|
+
|
|
+static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
|
|
+ const struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct bkey_i_inode *k = &packed->inode;
|
|
+ u8 *out = k->v.fields;
|
|
+ u8 *end = (void *) &packed[1];
|
|
+ u8 *last_nonzero_field = out;
|
|
+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
|
|
+ unsigned bytes;
|
|
+ int ret;
|
|
+
|
|
+#define x(_name, _bits) \
|
|
+ nr_fields++; \
|
|
+ \
|
|
+ if (inode->_name) { \
|
|
+ ret = bch2_varint_encode(out, inode->_name); \
|
|
+ out += ret; \
|
|
+ \
|
|
+ if (_bits > 64) \
|
|
+ *out++ = 0; \
|
|
+ \
|
|
+ last_nonzero_field = out; \
|
|
+ last_nonzero_fieldnr = nr_fields; \
|
|
+ } else { \
|
|
+ *out++ = 0; \
|
|
+ \
|
|
+ if (_bits > 64) \
|
|
+ *out++ = 0; \
|
|
+ }
|
|
+
|
|
+ BCH_INODE_FIELDS()
|
|
+#undef x
|
|
+ BUG_ON(out > end);
|
|
+
|
|
+ out = last_nonzero_field;
|
|
+ nr_fields = last_nonzero_fieldnr;
|
|
+
|
|
+ bytes = out - (u8 *) &packed->inode.v;
|
|
+ set_bkey_val_bytes(&packed->inode.k, bytes);
|
|
+ memset_u64s_tail(&packed->inode.v, 0, bytes);
|
|
+
|
|
+ SET_INODE_NR_FIELDS(&k->v, nr_fields);
|
|
+}
|
|
+
|
|
+void bch2_inode_pack(struct bch_fs *c,
|
|
+ struct bkey_inode_buf *packed,
|
|
+ const struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ bkey_inode_init(&packed->inode.k_i);
|
|
+ packed->inode.k.p.offset = inode->bi_inum;
|
|
+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed;
|
|
+ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags);
|
|
+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode);
|
|
+
|
|
+ if (c->sb.features & (1ULL << BCH_FEATURE_new_varint)) {
|
|
+ SET_INODE_NEW_VARINT(&packed->inode.v, true);
|
|
+ bch2_inode_pack_v2(packed, inode);
|
|
+ } else {
|
|
+ bch2_inode_pack_v1(packed, inode);
|
|
+ }
|
|
+
|
|
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
|
|
+ struct bch_inode_unpacked unpacked;
|
|
+
|
|
+ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode),
|
|
+ &unpacked);
|
|
+ BUG_ON(ret);
|
|
+ BUG_ON(unpacked.bi_inum != inode->bi_inum);
|
|
+ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed);
|
|
+ BUG_ON(unpacked.bi_mode != inode->bi_mode);
|
|
+
|
|
+#define x(_name, _bits) if (unpacked._name != inode->_name) \
|
|
+ panic("unpacked %llu should be %llu", \
|
|
+ (u64) unpacked._name, (u64) inode->_name);
|
|
+ BCH_INODE_FIELDS()
|
|
+#undef x
|
|
+ }
|
|
+}
|
|
+
|
|
+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
|
|
+ struct bch_inode_unpacked *unpacked)
|
|
+{
|
|
+ const u8 *in = inode.v->fields;
|
|
+ const u8 *end = bkey_val_end(inode);
|
|
+ u64 field[2];
|
|
+ unsigned fieldnr = 0, field_bits;
|
|
+ int ret;
|
|
+
|
|
+#define x(_name, _bits) \
|
|
+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
|
|
+ memset(&unpacked->_name, 0, \
|
|
+ sizeof(*unpacked) - \
|
|
+ offsetof(struct bch_inode_unpacked, _name)); \
|
|
+ return 0; \
|
|
+ } \
|
|
+ \
|
|
+ ret = inode_decode_field(in, end, field, &field_bits); \
|
|
+ if (ret < 0) \
|
|
+ return ret; \
|
|
+ \
|
|
+ if (field_bits > sizeof(unpacked->_name) * 8) \
|
|
+ return -1; \
|
|
+ \
|
|
+ unpacked->_name = field[1]; \
|
|
+ in += ret;
|
|
+
|
|
+ BCH_INODE_FIELDS()
|
|
+#undef x
|
|
+
|
|
+ /* XXX: signal if there were more fields than expected? */
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
|
|
+ struct bch_inode_unpacked *unpacked)
|
|
+{
|
|
+ const u8 *in = inode.v->fields;
|
|
+ const u8 *end = bkey_val_end(inode);
|
|
+ unsigned fieldnr = 0;
|
|
+ int ret;
|
|
+ u64 v[2];
|
|
+
|
|
+#define x(_name, _bits) \
|
|
+ if (fieldnr < INODE_NR_FIELDS(inode.v)) { \
|
|
+ ret = bch2_varint_decode(in, end, &v[0]); \
|
|
+ if (ret < 0) \
|
|
+ return ret; \
|
|
+ in += ret; \
|
|
+ \
|
|
+ if (_bits > 64) { \
|
|
+ ret = bch2_varint_decode(in, end, &v[1]); \
|
|
+ if (ret < 0) \
|
|
+ return ret; \
|
|
+ in += ret; \
|
|
+ } else { \
|
|
+ v[1] = 0; \
|
|
+ } \
|
|
+ } else { \
|
|
+ v[0] = v[1] = 0; \
|
|
+ } \
|
|
+ \
|
|
+ unpacked->_name = v[0]; \
|
|
+ if (v[1] || v[0] != unpacked->_name) \
|
|
+ return -1; \
|
|
+ fieldnr++;
|
|
+
|
|
+ BCH_INODE_FIELDS()
|
|
+#undef x
|
|
+
|
|
+ /* XXX: signal if there were more fields than expected? */
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
+ * Decode an on-disk inode key into @unpacked.
+ *
+ * The fixed fields are copied directly; the variable-length fields are decoded
+ * by the v2 routine when the key has INODE_NEW_VARINT set, and by the v1
+ * routine otherwise.
+ *
+ * Returns 0 on success, nonzero if the variable-length fields fail to decode.
+ */
+int bch2_inode_unpack(struct bkey_s_c_inode inode,
+		      struct bch_inode_unpacked *unpacked)
+{
+	unpacked->bi_mode	= le16_to_cpu(inode.v->bi_mode);
+	unpacked->bi_flags	= le32_to_cpu(inode.v->bi_flags);
+	unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+	unpacked->bi_inum	= inode.k->p.offset;
+
+	return INODE_NEW_VARINT(inode.v)
+		? bch2_inode_unpack_v2(inode, unpacked)
+		: bch2_inode_unpack_v1(inode, unpacked);
+}
|
|
+
|
|
+/*
+ * Look up inode @inum through a cached iterator (BTREE_ITER_CACHED plus
+ * @flags) and unpack it into @inode.
+ *
+ * On success the iterator is returned without having been put. On failure the
+ * iterator is put and an ERR_PTR is returned: the lookup error, -ENOENT if the
+ * key found is not KEY_TYPE_inode, or the unpack error.
+ */
+struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
+				   struct bch_inode_unpacked *inode,
+				   u64 inum, unsigned flags)
+{
+	struct btree_iter *iter =
+		bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum),
+				    BTREE_ITER_CACHED|flags);
+	struct bkey_s_c k = bch2_btree_iter_peek_cached(iter);
+	int ret = bkey_err(k);
+
+	if (!ret && k.k->type != KEY_TYPE_inode)
+		ret = -ENOENT;
+
+	if (!ret)
+		ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
+
+	if (ret) {
+		bch2_trans_iter_put(trans, iter);
+		return ERR_PTR(ret);
+	}
+
+	return iter;
+}
|
|
+
|
|
+int bch2_inode_write(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct bkey_inode_buf *inode_p;
|
|
+
|
|
+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p));
|
|
+ if (IS_ERR(inode_p))
|
|
+ return PTR_ERR(inode_p);
|
|
+
|
|
+ bch2_inode_pack(trans->c, inode_p, inode);
|
|
+ inode_p->inode.k.p.snapshot = iter->snapshot;
|
|
+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
+ * Validate an inode key.
+ *
+ * Returns NULL if the key passes every check below, otherwise a static string
+ * describing the first check that failed.
+ */
+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+	struct bch_inode_unpacked unpacked;
+
+	/* The inode number lives in p.offset; p.inode must be zero: */
+	if (k.k->p.inode)
+		return "nonzero k.p.inode";
+
+	if (bkey_val_bytes(k.k) < sizeof(struct bch_inode))
+		return "incorrect value size";
+
+	if (k.k->p.offset < BLOCKDEV_INODE_MAX)
+		return "fs inode in blockdev range";
+
+	if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR)
+		return "invalid str hash type";
+
+	if (bch2_inode_unpack(inode, &unpacked))
+		return "invalid variable length fields";
+
+	if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1)
+		return "invalid data checksum type";
+
+	/* Was reported with the checksum message by mistake: */
+	if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1)
+		return "invalid compression type";
+
+	if ((unpacked.bi_flags & BCH_INODE_UNLINKED) &&
+	    unpacked.bi_nlink != 0)
+		return "flagged as unlinked but bi_nlink != 0";
+
+	return NULL;
+}
|
|
+
|
|
+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
|
|
+ struct bch_inode_unpacked unpacked;
|
|
+
|
|
+ if (bch2_inode_unpack(inode, &unpacked)) {
|
|
+ pr_buf(out, "(unpack error)");
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ pr_buf(out, "mode: %o ", unpacked.bi_mode);
|
|
+
|
|
+#define x(_name, _bits) \
|
|
+ pr_buf(out, #_name ": %llu ", (u64) unpacked._name);
|
|
+ BCH_INODE_FIELDS()
|
|
+#undef x
|
|
+}
|
|
+
|
|
+const char *bch2_inode_generation_invalid(const struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ if (k.k->p.inode)
|
|
+ return "nonzero k.p.inode";
|
|
+
|
|
+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation))
|
|
+ return "incorrect value size";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k);
|
|
+
|
|
+ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation));
|
|
+}
|
|
+
|
|
+void bch2_inode_init_early(struct bch_fs *c,
|
|
+ struct bch_inode_unpacked *inode_u)
|
|
+{
|
|
+ enum bch_str_hash_type str_hash =
|
|
+ bch2_str_hash_opt_to_type(c, c->opts.str_hash);
|
|
+
|
|
+ memset(inode_u, 0, sizeof(*inode_u));
|
|
+
|
|
+ /* ick */
|
|
+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
|
|
+ get_random_bytes(&inode_u->bi_hash_seed,
|
|
+ sizeof(inode_u->bi_hash_seed));
|
|
+}
|
|
+
|
|
+/*
+ * Fill in mode, ownership, device and timestamps of @inode_u, then apply what
+ * is inherited from @parent, which may be NULL.
+ *
+ * From a setgid parent the group is inherited, and a new directory also gets
+ * the setgid bit. Every field listed in BCH_INODE_OPTS() is copied from the
+ * parent.
+ */
+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
+			  uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
+			  struct bch_inode_unpacked *parent)
+{
+	/* All four timestamps start out as @now: */
+	inode_u->bi_otime	= now;
+	inode_u->bi_ctime	= now;
+	inode_u->bi_mtime	= now;
+	inode_u->bi_atime	= now;
+
+	inode_u->bi_dev		= rdev;
+	inode_u->bi_gid		= gid;
+	inode_u->bi_uid		= uid;
+	inode_u->bi_mode	= mode;
+
+	if (!parent)
+		return;
+
+	if (parent->bi_mode & S_ISGID) {
+		inode_u->bi_gid = parent->bi_gid;
+		if (S_ISDIR(mode))
+			inode_u->bi_mode |= S_ISGID;
+	}
+
+#define x(_name, ...)	inode_u->bi_##_name = parent->bi_##_name;
+	BCH_INODE_OPTS()
+#undef x
+}
|
|
+
|
|
+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
|
|
+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
|
|
+ struct bch_inode_unpacked *parent)
|
|
+{
|
|
+ bch2_inode_init_early(c, inode_u);
|
|
+ bch2_inode_init_late(inode_u, bch2_current_time(c),
|
|
+ uid, gid, mode, rdev, parent);
|
|
+}
|
|
+
|
|
+/*
+ * Generation number stored in @k: the bi_generation of an inode_generation
+ * key, 0 for any other key type. Must not be called on a KEY_TYPE_inode key.
+ */
+static inline u32 bkey_generation(struct bkey_s_c k)
+{
+	if (k.k->type == KEY_TYPE_inode)
+		BUG();
+
+	if (k.k->type != KEY_TYPE_inode_generation)
+		return 0;
+
+	return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation);
+}
|
|
+
|
|
+struct btree_iter *bch2_inode_create(struct btree_trans *trans,
|
|
+ struct bch_inode_unpacked *inode_u,
|
|
+ u32 snapshot)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_iter *iter = NULL;
|
|
+ struct bkey_s_c k;
|
|
+ u64 min, max, start, pos, *hint;
|
|
+ int ret;
|
|
+
|
|
+ u64 cpu = raw_smp_processor_id();
|
|
+ unsigned bits = (c->opts.inodes_32bit
|
|
+ ? 31 : 63) - c->inode_shard_bits;
|
|
+
|
|
+ min = (cpu << bits);
|
|
+ max = (cpu << bits) | ~(ULLONG_MAX << bits);
|
|
+
|
|
+ min = max_t(u64, min, BLOCKDEV_INODE_MAX);
|
|
+ hint = c->unused_inode_hints + cpu;
|
|
+
|
|
+ start = READ_ONCE(*hint);
|
|
+
|
|
+ if (start >= max || start < min)
|
|
+ start = min;
|
|
+
|
|
+ pos = start;
|
|
+ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, pos),
|
|
+ BTREE_ITER_ALL_SNAPSHOTS|
|
|
+ BTREE_ITER_INTENT);
|
|
+again:
|
|
+ while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ !(ret = bkey_err(k)) &&
|
|
+ bkey_cmp(k.k->p, POS(0, max)) < 0) {
|
|
+ while (pos < iter->pos.offset) {
|
|
+ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
|
|
+ goto found_slot;
|
|
+
|
|
+ pos++;
|
|
+ }
|
|
+
|
|
+ if (k.k->p.snapshot == snapshot &&
|
|
+ k.k->type != KEY_TYPE_inode &&
|
|
+ !bch2_btree_key_cache_find(c, BTREE_ID_inodes, SPOS(0, pos, snapshot))) {
|
|
+ bch2_btree_iter_next(iter);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We don't need to iterate over keys in every snapshot once
|
|
+ * we've found just one:
|
|
+ */
|
|
+ pos = iter->pos.offset + 1;
|
|
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
|
|
+ }
|
|
+
|
|
+ while (!ret && pos < max) {
|
|
+ if (!bch2_btree_key_cache_find(c, BTREE_ID_inodes, POS(0, pos)))
|
|
+ goto found_slot;
|
|
+
|
|
+ pos++;
|
|
+ }
|
|
+
|
|
+ if (!ret && start == min)
|
|
+ ret = -ENOSPC;
|
|
+
|
|
+ if (ret) {
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ERR_PTR(ret);
|
|
+ }
|
|
+
|
|
+ /* Retry from start */
|
|
+ pos = start = min;
|
|
+ bch2_btree_iter_set_pos(iter, POS(0, pos));
|
|
+ goto again;
|
|
+found_slot:
|
|
+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot));
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret) {
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ERR_PTR(ret);
|
|
+ }
|
|
+
|
|
+ /* We may have raced while the iterator wasn't pointing at pos: */
|
|
+ if (k.k->type == KEY_TYPE_inode ||
|
|
+ bch2_btree_key_cache_find(c, BTREE_ID_inodes, k.k->p))
|
|
+ goto again;
|
|
+
|
|
+ *hint = k.k->p.offset;
|
|
+ inode_u->bi_inum = k.k->p.offset;
|
|
+ inode_u->bi_generation = bkey_generation(k);
|
|
+ return iter;
|
|
+}
|
|
+
|
|
+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter = NULL;
|
|
+ struct bkey_i_inode_generation delete;
|
|
+ struct bpos start = POS(inode_nr, 0);
|
|
+ struct bpos end = POS(inode_nr + 1, 0);
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ /*
|
|
+ * If this was a directory, there shouldn't be any real dirents left -
|
|
+ * but there could be whiteouts (from hash collisions) that we should
|
|
+ * delete:
|
|
+ *
|
|
+ * XXX: the dirent code ideally would delete whiteouts when they're no
|
|
+ * longer needed
|
|
+ */
|
|
+ ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents,
|
|
+ start, end, NULL) ?:
|
|
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs,
|
|
+ start, end, NULL) ?:
|
|
+ bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents,
|
|
+ start, end, NULL);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+retry:
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ if (cached) {
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
|
|
+ BTREE_ITER_CACHED|BTREE_ITER_INTENT);
|
|
+ k = bch2_btree_iter_peek_cached(iter);
|
|
+ } else {
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
|
|
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ }
|
|
+
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (k.k->type != KEY_TYPE_inode) {
|
|
+ bch2_fs_inconsistent(trans.c,
|
|
+ "inode %llu not found when deleting",
|
|
+ inode_nr);
|
|
+ ret = -EIO;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u);
|
|
+
|
|
+ bkey_inode_generation_init(&delete.k_i);
|
|
+ delete.k.p = iter->pos;
|
|
+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1);
|
|
+
|
|
+ bch2_trans_update(&trans, iter, &delete.k_i, 0);
|
|
+
|
|
+ ret = bch2_trans_commit(&trans, NULL, NULL,
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+err:
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ if (ret == -EINTR)
|
|
+ goto retry;
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ iter = bch2_inode_peek(trans, inode, inode_nr, 0);
|
|
+ ret = PTR_ERR_OR_ZERO(iter);
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ return bch2_trans_do(c, NULL, NULL, 0,
|
|
+ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode));
|
|
+}
|
|
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
|
|
new file mode 100644
|
|
index 000000000000..558d5464095d
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/inode.h
|
|
@@ -0,0 +1,177 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_INODE_H
|
|
+#define _BCACHEFS_INODE_H
|
|
+
|
|
+#include "opts.h"
|
|
+
|
|
+extern const char * const bch2_inode_opts[];
|
|
+
|
|
+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+/* Compound-literal bkey_ops for inode keys: validation and to-text hooks. */
|
|
+#define bch2_bkey_ops_inode (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_inode_invalid, \
|
|
+ .val_to_text = bch2_inode_to_text, \
|
|
+}
|
|
+
|
|
+const char *bch2_inode_generation_invalid(const struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *,
|
|
+ struct bkey_s_c);
|
|
+
|
|
+/* Compound-literal bkey_ops for inode_generation keys. */
|
|
+#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_inode_generation_invalid, \
|
|
+ .val_to_text = bch2_inode_generation_to_text, \
|
|
+}
|
|
+
|
|
+/*
|
|
+ * The packed 64+32 bit struct form of u96 is compiled out; u96 is currently
|
|
+ * a plain alias for u64.
|
|
+ */
|
|
+#if 0
|
|
+typedef struct {
|
|
+ u64 lo;
|
|
+ u32 hi;
|
|
+} __packed __aligned(4) u96;
|
|
+#endif
|
|
+typedef u64 u96;
|
|
+
|
|
+/*
|
|
+ * In-memory inode with four fixed fields followed by one member per
|
|
+ * BCH_INODE_FIELDS() entry; each entry x(_name, _bits) expands to a member
|
|
+ * `u<_bits> _name`.
|
|
+ */
|
|
+struct bch_inode_unpacked {
|
|
+ u64 bi_inum;
|
|
+ __le64 bi_hash_seed;
|
|
+ u32 bi_flags;
|
|
+ u16 bi_mode;
|
|
+
|
|
+#define x(_name, _bits) u##_bits _name;
|
|
+ BCH_INODE_FIELDS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+/*
|
|
+ * A bkey_i_inode followed by padding sized from the field list: every
|
|
+ * BCH_INODE_FIELDS() entry contributes 8 + _bits / 8 bytes to _pad.
|
|
+ * Presumably this is the worst-case packed size per field - confirm
|
|
+ * against bch2_inode_pack().
|
|
+ */
|
|
+struct bkey_inode_buf {
|
|
+ struct bkey_i_inode inode;
|
|
+
|
|
+#define x(_name, _bits) + 8 + _bits / 8
|
|
+ u8 _pad[0 + BCH_INODE_FIELDS()];
|
|
+#undef x
|
|
+} __attribute__((packed, aligned(8)));
|
|
+
|
|
+void bch2_inode_pack(struct bch_fs *, struct bkey_inode_buf *,
|
|
+ const struct bch_inode_unpacked *);
|
|
+int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *);
|
|
+
|
|
+struct btree_iter *bch2_inode_peek(struct btree_trans *,
|
|
+ struct bch_inode_unpacked *, u64, unsigned);
|
|
+int bch2_inode_write(struct btree_trans *, struct btree_iter *,
|
|
+ struct bch_inode_unpacked *);
|
|
+
|
|
+void bch2_inode_init_early(struct bch_fs *,
|
|
+ struct bch_inode_unpacked *);
|
|
+void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
|
|
+ uid_t, gid_t, umode_t, dev_t,
|
|
+ struct bch_inode_unpacked *);
|
|
+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
|
|
+ uid_t, gid_t, umode_t, dev_t,
|
|
+ struct bch_inode_unpacked *);
|
|
+
|
|
+struct btree_iter *bch2_inode_create(struct btree_trans *,
|
|
+ struct bch_inode_unpacked *, u32);
|
|
+
|
|
+int bch2_inode_rm(struct bch_fs *, u64, bool);
|
|
+
|
|
+int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *);
|
|
+
|
|
+/*
|
|
+ * Build a bch_io_opts from the per-inode option fields.  For every
|
|
+ * BCH_INODE_OPTS() entry, a non-zero inode->bi_<name> is passed to opt_set()
|
|
+ * as (value - 1); a zero field leaves that option untouched in the
|
|
+ * zero-initialised result.
|
|
+ */
|
|
+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct bch_io_opts ret = { 0 };
|
|
+
|
|
+#define x(_name, _bits) \
|
|
+ if (inode->bi_##_name) \
|
|
+ opt_set(ret, _name, inode->bi_##_name - 1);
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Store @v verbatim into the inode field selected by @id (one switch case
|
|
+ * is generated per BCH_INODE_OPTS() entry).  An @id with no matching case
|
|
+ * hits BUG().
|
|
+ */
|
|
+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode,
|
|
+ enum inode_opt_id id, u64 v)
|
|
+{
|
|
+ switch (id) {
|
|
+#define x(_name, ...) \
|
|
+ case Inode_opt_##_name: \
|
|
+ inode->bi_##_name = v; \
|
|
+ break;
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return the raw inode field selected by @id (one switch case is generated
|
|
+ * per BCH_INODE_OPTS() entry).  An @id with no matching case hits BUG().
|
|
+ */
|
|
+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode,
|
|
+ enum inode_opt_id id)
|
|
+{
|
|
+ switch (id) {
|
|
+#define x(_name, ...) \
|
|
+ case Inode_opt_##_name: \
|
|
+ return inode->bi_##_name;
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Effective IO options for @inode: start from the filesystem-wide options
|
|
+ * converted by bch2_opts_to_inode_opts(), then apply the inode's own options
|
|
+ * on top via bch2_io_opts_apply().
|
|
+ */
|
|
+static inline struct bch_io_opts
|
|
+io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode)
|
|
+{
|
|
+	struct bch_io_opts merged = bch2_opts_to_inode_opts(c->opts);
|
|
+	struct bch_io_opts per_inode = bch2_inode_opts_get(inode);
|
|
+
|
|
+	bch2_io_opts_apply(&merged, per_inode);
|
|
+	return merged;
|
|
+}
|
|
+
|
|
+/* Extract bits 12..15 of @mode (the 4-bit value above the low 12 bits). */
|
|
+static inline u8 mode_to_type(umode_t mode)
|
|
+{
|
|
+	const unsigned upper = mode >> 12;
|
|
+
|
|
+	return upper & 0xf;
|
|
+}
|
|
+
|
|
+/* i_nlink: */
|
|
+
|
|
+/* Offset between the stored bi_nlink and the reported link count: 2 for directories, else 1. */
|
|
+static inline unsigned nlink_bias(umode_t mode)
|
|
+{
|
|
+	if (S_ISDIR(mode))
|
|
+		return 2;
|
|
+
|
|
+	return 1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Add one link: an inode flagged BCH_INODE_UNLINKED just loses the flag,
|
|
+ * otherwise the stored bi_nlink is incremented.
|
|
+ */
|
|
+static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
|
|
+{
|
|
+	if (!(bi->bi_flags & BCH_INODE_UNLINKED)) {
|
|
+		bi->bi_nlink++;
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	bi->bi_flags &= ~BCH_INODE_UNLINKED;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Drop one link: a stored bi_nlink of zero becomes the BCH_INODE_UNLINKED
|
|
+ * flag, otherwise bi_nlink is decremented.  Calling this on an inode that is
|
|
+ * already flagged unlinked is a BUG.
|
|
+ */
|
|
+static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi)
|
|
+{
|
|
+	BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED);
|
|
+
|
|
+	if (!bi->bi_nlink) {
|
|
+		bi->bi_flags |= BCH_INODE_UNLINKED;
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	bi->bi_nlink--;
|
|
+}
|
|
+
|
|
+/* Reported link count: 0 when flagged unlinked, else bi_nlink plus the bias for the mode. */
|
|
+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi)
|
|
+{
|
|
+	if (bi->bi_flags & BCH_INODE_UNLINKED)
|
|
+		return 0;
|
|
+
|
|
+	return bi->bi_nlink + nlink_bias(bi->bi_mode);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Store a reported link count: zero sets BCH_INODE_UNLINKED with bi_nlink 0,
|
|
+ * anything else clears the flag and stores @nlink minus the bias for the mode.
|
|
+ */
|
|
+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi,
|
|
+					unsigned nlink)
|
|
+{
|
|
+	if (!nlink) {
|
|
+		bi->bi_nlink = 0;
|
|
+		bi->bi_flags |= BCH_INODE_UNLINKED;
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	bi->bi_nlink = nlink - nlink_bias(bi->bi_mode);
|
|
+	bi->bi_flags &= ~BCH_INODE_UNLINKED;
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_INODE_H */
|
|
diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c
|
|
new file mode 100644
|
|
index 000000000000..d1a623991bbc
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/io.c
|
|
@@ -0,0 +1,2361 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Some low level IO code, and hacks for various block layer limitations
|
|
+ *
|
|
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_background.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "bset.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "checksum.h"
|
|
+#include "compress.h"
|
|
+#include "clock.h"
|
|
+#include "debug.h"
|
|
+#include "disk_groups.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "extent_update.h"
|
|
+#include "inode.h"
|
|
+#include "io.h"
|
|
+#include "journal.h"
|
|
+#include "keylist.h"
|
|
+#include "move.h"
|
|
+#include "rebalance.h"
|
|
+#include "super.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/blkdev.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/sched/mm.h>
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/* blk_status_to_str() with an extra name for BLK_STS_REMOVED. */
|
|
+const char *bch2_blk_status_to_str(blk_status_t status)
|
|
+{
|
|
+	return status == BLK_STS_REMOVED
|
|
+		? "device removed"
|
|
+		: blk_status_to_str(status);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Randomised congestion test for @target.  A zero target is never congested.
|
|
+ * Otherwise, under rcu_read_lock(), each present device in the target's mask
|
|
+ * (or in c->rw_devs[BCH_DATA_user] when bch2_target_to_mask() yields NULL)
|
|
+ * contributes its congestion counter, aged by elapsed time, to @total.
|
|
+ * The result compares that sum with bch2_rand_range(nr * CONGESTED_MAX);
|
|
+ * presumably a uniform draw below that bound - confirm.
|
|
+ */
|
|
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
|
|
+{
|
|
+ const struct bch_devs_mask *devs;
|
|
+ unsigned d, nr = 0, total = 0;
|
|
+ u64 now = local_clock(), last;
|
|
+ s64 congested;
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ if (!target)
|
|
+ return false;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ devs = bch2_target_to_mask(c, target) ?:
|
|
+ &c->rw_devs[BCH_DATA_user];
|
|
+
|
|
+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
|
|
+ ca = rcu_dereference(c->devs[d]);
|
|
+ if (!ca)
|
|
+ continue;
|
|
+
|
|
+ congested = atomic_read(&ca->congested);
|
|
+ last = READ_ONCE(ca->congested_last);
|
|
+ /* decay: one unit per 4096 clock ticks since the last bump */
|
|
+ if (time_after64(now, last))
|
|
+ congested -= (now - last) >> 12;
|
|
+
|
|
+ /* a fully decayed (negative) value counts as zero */
|
|
+ total += max(congested, 0LL);
|
|
+ nr++;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return bch2_rand_range(nr * CONGESTED_MAX) < total;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Update ca->congested after one IO.  The threshold is the QUANTILE_IDX(1)
|
|
+ * latency entry shifted left by 2 for reads and by 3 for writes.  An IO over
|
|
+ * a non-zero threshold bumps the counter (only while it is below
|
|
+ * CONGESTED_MAX) and stamps congested_last with @now; otherwise a positive
|
|
+ * counter is decremented by one.
|
|
+ */
|
|
+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
|
|
+ u64 now, int rw)
|
|
+{
|
|
+ u64 latency_capable =
|
|
+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
|
|
+ /* ideally we'd be taking into account the device's variance here: */
|
|
+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
|
|
+ s64 latency_over = io_latency - latency_threshold;
|
|
+
|
|
+ if (latency_threshold && latency_over > 0) {
|
|
+ /*
|
|
+ * bump up congested by approximately latency_over * 4 /
|
|
+ * latency_threshold - we don't need much accuracy here so don't
|
|
+ * bother with the divide:
|
|
+ */
|
|
+ if (atomic_read(&ca->congested) < CONGESTED_MAX)
|
|
+ atomic_add(latency_over >>
|
|
+ max_t(int, ilog2(latency_threshold) - 2, 0),
|
|
+ &ca->congested);
|
|
+
|
|
+ ca->congested_last = now;
|
|
+ } else if (atomic_read(&ca->congested) > 0) {
|
|
+ atomic_dec(&ca->congested);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Account one completed IO on @ca.  The latency is local_clock() minus
|
|
+ * @submit_time (0 if the clock has not advanced); it is folded into the
|
|
+ * per-direction moving average ca->cur_latency[@rw] with a cmpxchg loop, then
|
|
+ * passed on to the congestion and time-stats accounting.
|
|
+ */
|
|
+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
|
|
+{
|
|
+	atomic64_t *latency = &ca->cur_latency[rw];
|
|
+	u64 now = local_clock();
|
|
+	u64 io_latency = time_after64(now, submit_time)
|
|
+		? now - submit_time
|
|
+		: 0;
|
|
+	u64 old, new, v = atomic64_read(latency);
|
|
+
|
|
+	do {
|
|
+		old = v;
|
|
+
|
|
+		/*
|
|
+		 * If the io latency was reasonably close to the current
|
|
+		 * latency, skip doing the update and atomic operation - most of
|
|
+		 * the time:
|
|
+		 *
|
|
+		 * The low-5-bit mask is built from ~0U because left-shifting
|
|
+		 * the signed value ~0 (-1) is undefined behaviour.
|
|
+		 */
|
|
+		if (abs((int) (old - io_latency)) < (old >> 1) &&
|
|
+		    now & ~(~0U << 5))
|
|
+			break;
|
|
+
|
|
+		new = ewma_add(old, io_latency, 5);
|
|
+	} while ((v = atomic64_cmpxchg(latency, old, new)) != old);
|
|
+
|
|
+	bch2_congested_acct(ca, io_latency, now, rw);
|
|
+
|
|
+	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
|
|
+}
|
|
+
|
|
+/* Allocate, free from mempool: */
|
|
+
|
|
+/*
|
|
+ * Hand every page of @bio except ZERO_PAGE(0) to the c->bio_bounce_pages
|
|
+ * mempool, then reset the bio's vector count to 0.
|
|
+ */
|
|
+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
|
|
+{
|
|
+	struct bvec_iter_all seg_iter;
|
|
+	struct bio_vec *seg;
|
|
+
|
|
+	bio_for_each_segment_all(seg, bio, seg_iter) {
|
|
+		if (seg->bv_page == ZERO_PAGE(0))
|
|
+			continue;
|
|
+
|
|
+		mempool_free(seg->bv_page, &c->bio_bounce_pages);
|
|
+	}
|
|
+
|
|
+	bio->bi_vcnt = 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Get one bounce page.  While *@using_mempool is false, try alloc_page()
|
|
+ * first; on failure take c->bio_bounce_pages_lock, set *@using_mempool and
|
|
+ * fall back to the mempool.  Once the flag is set every call goes straight
|
|
+ * to the mempool.  The lock is not released here.
|
|
+ */
|
|
+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
|
|
+{
|
|
+	if (likely(!*using_mempool)) {
|
|
+		struct page *fresh = alloc_page(GFP_NOIO);
|
|
+
|
|
+		if (likely(fresh))
|
|
+			return fresh;
|
|
+
|
|
+		mutex_lock(&c->bio_bounce_pages_lock);
|
|
+		*using_mempool = true;
|
|
+	}
|
|
+
|
|
+	return mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Append @size bytes worth of bounce pages to @bio, at most PAGE_SIZE per
|
|
+ * page.  A page that bio_add_page() refuses is a BUG.  If any allocation fell
|
|
+ * back to the mempool, the lock taken by __bio_alloc_page_pool() is dropped
|
|
+ * once all pages have been added.
|
|
+ */
|
|
+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
|
|
+			       size_t size)
|
|
+{
|
|
+	bool pool_locked = false;
|
|
+
|
|
+	while (size) {
|
|
+		struct page *pg = __bio_alloc_page_pool(c, &pool_locked);
|
|
+		unsigned chunk = min_t(size_t, PAGE_SIZE, size);
|
|
+		int added = bio_add_page(bio, pg, chunk, 0);
|
|
+
|
|
+		BUG_ON(!added);
|
|
+		size -= chunk;
|
|
+	}
|
|
+
|
|
+	if (pool_locked)
|
|
+		mutex_unlock(&c->bio_bounce_pages_lock);
|
|
+}
|
|
+
|
|
+/* Extent update path: */
|
|
+
|
|
+/*
|
|
+ * Walk the existing keys that @new would overwrite, starting from a copy of
|
|
+ * @extent_iter, and report through the output pointers:
|
|
+ *  - *i_sectors_delta: overlap sectors times the change in "is allocation";
|
|
+ *  - *disk_sectors_delta: overlap sectors times @new's allocated pointers,
|
|
+ *    minus the old key's fully allocated pointers when snapshots match;
|
|
+ *  - *should_check_enospc: set once @new has more replicas than an old key,
|
|
+ *    or an old key is compressed while @new is not;
|
|
+ *  - *maybe_extending: starts true, cleared when a data key for the same
|
|
+ *    inode is found at or beyond the end of @new.
|
|
+ * Returns the iteration error, 0 if none.
|
|
+ */
|
|
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
|
|
+ struct btree_iter *extent_iter,
|
|
+ struct bkey_i *new,
|
|
+ bool *maybe_extending,
|
|
+ bool *should_check_enospc,
|
|
+ s64 *i_sectors_delta,
|
|
+ s64 *disk_sectors_delta)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c old;
|
|
+ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
|
|
+ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
|
|
+ int ret = 0;
|
|
+
|
|
+ *maybe_extending = true;
|
|
+ *should_check_enospc = false;
|
|
+ *i_sectors_delta = 0;
|
|
+ *disk_sectors_delta = 0;
|
|
+
|
|
+ iter = bch2_trans_copy_iter(trans, extent_iter);
|
|
+
|
|
+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
|
|
+ /* number of sectors where @new and @old overlap */
|
|
+ s64 sectors = min(new->k.p.offset, old.k->p.offset) -
|
|
+ max(bkey_start_offset(&new->k),
|
|
+ bkey_start_offset(old.k));
|
|
+
|
|
+ *i_sectors_delta += sectors *
|
|
+ (bkey_extent_is_allocation(&new->k) -
|
|
+ bkey_extent_is_allocation(old.k));
|
|
+
|
|
+ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new));
|
|
+ *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot
|
|
+ ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old)
|
|
+ : 0;
|
|
+
|
|
+ if (!*should_check_enospc &&
|
|
+ (new_replicas > bch2_bkey_replicas(c, old) ||
|
|
+ (!new_compressed && bch2_bkey_sectors_compressed(old))))
|
|
+ *should_check_enospc = true;
|
|
+
|
|
+ if (bkey_cmp(old.k->p, new->k.p) >= 0) {
|
|
+ /*
|
|
+ * Check if there's already data above where we're
|
|
+ * going to be writing to - this means we're definitely
|
|
+ * not extending the file:
|
|
+ *
|
|
+ * Note that it's not sufficient to check if there's
|
|
+ * data up to the sector offset we're going to be
|
|
+ * writing to, because i_size could be up to one block
|
|
+ * less:
|
|
+ */
|
|
+ if (!bkey_cmp(old.k->p, new->k.p))
|
|
+ old = bch2_btree_iter_next(iter);
|
|
+
|
|
+ if (old.k && !bkey_err(old) &&
|
|
+ old.k->p.inode == extent_iter->pos.inode &&
|
|
+ bkey_extent_is_data(old.k))
|
|
+ *maybe_extending = false;
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Insert extent @k at @iter and commit, keeping the inode in step.
|
|
+ * Sequence: trim @k with bch2_extent_trim_atomic(), compute the sector
|
|
+ * deltas, grow @disk_res if the disk delta exceeds it, queue an inode update
|
|
+ * when i_sectors or i_size changes, queue @k, then commit.
|
|
+ * @new_i_size only takes effect when the write may extend the file, and is
|
|
+ * capped at the end of @k in bytes.  On success the i_sectors delta is added
|
|
+ * to *@i_sectors_delta_total if that pointer is non-NULL.
|
|
+ * Returns 0 or the first error from any step.
|
|
+ */
|
|
+int bch2_extent_update(struct btree_trans *trans,
|
|
+ struct btree_iter *iter,
|
|
+ struct bkey_i *k,
|
|
+ struct disk_reservation *disk_res,
|
|
+ u64 *journal_seq,
|
|
+ u64 new_i_size,
|
|
+ s64 *i_sectors_delta_total)
|
|
+{
|
|
+ /* this must live until after bch2_trans_commit(): */
|
|
+ struct bkey_inode_buf inode_p;
|
|
+ bool extending = false, should_check_enospc;
|
|
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_extent_trim_atomic(k, iter);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_sum_sector_overwrites(trans, iter, k,
|
|
+ &extending,
|
|
+ &should_check_enospc,
|
|
+ &i_sectors_delta,
|
|
+ &disk_sectors_delta);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ /* top up the reservation; NOFAIL unless an ENOSPC check was requested */
|
|
+ if (disk_res &&
|
|
+ disk_sectors_delta > (s64) disk_res->sectors) {
|
|
+ ret = bch2_disk_reservation_add(trans->c, disk_res,
|
|
+ disk_sectors_delta - disk_res->sectors,
|
|
+ !should_check_enospc
|
|
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ new_i_size = extending
|
|
+ ? min(k->k.p.offset << 9, new_i_size)
|
|
+ : 0;
|
|
+
|
|
+ if (i_sectors_delta || new_i_size) {
|
|
+ struct btree_iter *inode_iter;
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+
|
|
+ inode_iter = bch2_inode_peek(trans, &inode_u,
|
|
+ k->k.p.inode, BTREE_ITER_INTENT);
|
|
+ if (IS_ERR(inode_iter))
|
|
+ return PTR_ERR(inode_iter);
|
|
+
|
|
+ /*
|
|
+ * XXX:
|
|
+ * writeback can race a bit with truncate, because truncate
|
|
+ * first updates the inode then truncates the pagecache. This is
|
|
+ * ugly, but lets us preserve the invariant that the in memory
|
|
+ * i_size is always >= the on disk i_size.
|
|
+ *
|
|
+ BUG_ON(new_i_size > inode_u.bi_size &&
|
|
+ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY));
|
|
+ */
|
|
+ BUG_ON(new_i_size > inode_u.bi_size && !extending);
|
|
+
|
|
+ /* i_size only grows, and only while I_SIZE_DIRTY is clear */
|
|
+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
|
|
+ new_i_size > inode_u.bi_size)
|
|
+ inode_u.bi_size = new_i_size;
|
|
+ else
|
|
+ new_i_size = 0;
|
|
+
|
|
+ inode_u.bi_sectors += i_sectors_delta;
|
|
+
|
|
+ if (i_sectors_delta || new_i_size) {
|
|
+ bch2_inode_pack(trans->c, &inode_p, &inode_u);
|
|
+
|
|
+ inode_p.inode.k.p.snapshot = iter->snapshot;
|
|
+
|
|
+ bch2_trans_update(trans, inode_iter,
|
|
+ &inode_p.inode.k_i, 0);
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_put(trans, inode_iter);
|
|
+ }
|
|
+
|
|
+ bch2_trans_update(trans, iter, k, 0);
|
|
+
|
|
+ ret = bch2_trans_commit(trans, disk_res, journal_seq,
|
|
+ BTREE_INSERT_NOCHECK_RW|
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ if (i_sectors_delta_total)
|
|
+ *i_sectors_delta_total += i_sectors_delta;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Delete extents from iter->pos up to @end, one deletion key per commit,
|
|
+ * each as large as the key size limit and @end allow.  -EINTR from an
|
|
+ * iteration does not stop the loop; it is remembered and returned only if
|
|
+ * nothing else failed.  If the iterator ends up past @end it is moved back to
|
|
+ * @end and re-traversed.
|
|
+ */
|
|
+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
|
|
+		   struct bpos end, u64 *journal_seq,
|
|
+		   s64 *i_sectors_delta)
|
|
+{
|
|
+	struct bch_fs *c = trans->c;
|
|
+	/* ~0U rather than ~0: left-shifting a negative int is undefined */
|
|
+	unsigned max_sectors = KEY_SIZE_MAX & (~0U << c->block_bits);
|
|
+	struct bkey_s_c k;
|
|
+	int ret = 0, ret2 = 0;
|
|
+
|
|
+	while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+	       bkey_cmp(iter->pos, end) < 0) {
|
|
+		struct disk_reservation disk_res =
|
|
+			bch2_disk_reservation_init(c, 0);
|
|
+		struct bkey_i delete;
|
|
+
|
|
+		bch2_trans_begin(trans);
|
|
+
|
|
+		ret = bkey_err(k);
|
|
+		if (ret)
|
|
+			goto btree_err;
|
|
+
|
|
+		bkey_init(&delete.k);
|
|
+		delete.k.p = iter->pos;
|
|
+
|
|
+		/* create the biggest key we can */
|
|
+		bch2_key_resize(&delete.k, max_sectors);
|
|
+		bch2_cut_back(end, &delete);
|
|
+
|
|
+		ret = bch2_extent_update(trans, iter, &delete,
|
|
+				&disk_res, journal_seq,
|
|
+				0, i_sectors_delta);
|
|
+		bch2_disk_reservation_put(c, &disk_res);
|
|
+btree_err:
|
|
+		if (ret == -EINTR) {
|
|
+			ret2 = ret;
|
|
+			ret = 0;
|
|
+		}
|
|
+		if (ret)
|
|
+			break;
|
|
+	}
|
|
+
|
|
+	if (bkey_cmp(iter->pos, end) > 0) {
|
|
+		bch2_btree_iter_set_pos(iter, end);
|
|
+		ret = bch2_btree_iter_traverse(iter);
|
|
+	}
|
|
+
|
|
+	return ret ?: ret2;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Punch the range [@start, @end) out of inode @inum using a private
|
|
+ * transaction around bch2_fpunch_at().  An -EINTR result is reported as 0.
|
|
+ */
|
|
+int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
|
|
+		u64 *journal_seq, s64 *i_sectors_delta)
|
|
+{
|
|
+	struct btree_trans trans;
|
|
+	struct btree_iter *extent_iter;
|
|
+	int err;
|
|
+
|
|
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
|
|
+	extent_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
|
|
+					  POS(inum, start),
|
|
+					  BTREE_ITER_INTENT);
|
|
+
|
|
+	err = bch2_fpunch_at(&trans, extent_iter, POS(inum, end),
|
|
+			     journal_seq, i_sectors_delta);
|
|
+
|
|
+	bch2_trans_iter_put(&trans, extent_iter);
|
|
+	bch2_trans_exit(&trans);
|
|
+
|
|
+	return err == -EINTR ? 0 : err;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Insert every key on op->insert_keys into the extents btree.  Each pass
|
|
+ * copies the front key into a scratch buffer, cuts off the part before
|
|
+ * iter->pos and hands the rest to bch2_extent_update().  The front key is
|
|
+ * popped once the iterator has reached its end position.  -EINTR restarts
|
|
+ * the pass; any other error ends the loop and is returned.
|
|
+ */
|
|
+int bch2_write_index_default(struct bch_write_op *op)
|
|
+{
|
|
+ struct bch_fs *c = op->c;
|
|
+ struct bkey_buf sk;
|
|
+ struct keylist *keys = &op->insert_keys;
|
|
+ struct bkey_i *k = bch2_keylist_front(keys);
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ bch2_bkey_buf_init(&sk);
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
|
|
+ bkey_start_pos(&k->k),
|
|
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
|
+
|
|
+ do {
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ k = bch2_keylist_front(keys);
|
|
+
|
|
+ k->k.p.snapshot = iter->snapshot;
|
|
+
|
|
+ bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
|
|
+ bkey_copy(sk.k, k);
|
|
+ bch2_cut_front(iter->pos, sk.k);
|
|
+
|
|
+ ret = bch2_extent_update(&trans, iter, sk.k,
|
|
+ &op->res, op_journal_seq(op),
|
|
+ op->new_i_size, &op->i_sectors_delta);
|
|
+ if (ret == -EINTR)
|
|
+ continue;
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ if (bkey_cmp(iter->pos, k->k.p) >= 0)
|
|
+ bch2_keylist_pop_front(keys);
|
|
+ } while (!bch2_keylist_empty(keys));
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_bkey_buf_exit(&sk, c);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Writes */
|
|
+
|
|
+/*
|
|
+ * Submit @wbio once per pointer in @k.  Every pointer but the last gets a
|
|
+ * clone of the bio (marked split, with @wbio as parent); the last pointer
|
|
+ * reuses @wbio itself.  A device whose ioref cannot be taken has its bio
|
|
+ * completed immediately with BLK_STS_REMOVED instead of being submitted.
|
|
+ */
|
|
+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
|
|
+ enum bch_data_type type,
|
|
+ const struct bkey_i *k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
|
|
+ const struct bch_extent_ptr *ptr;
|
|
+ struct bch_write_bio *n;
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ BUG_ON(c->opts.nochanges);
|
|
+
|
|
+ bkey_for_each_ptr(ptrs, ptr) {
|
|
+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
|
|
+ !c->devs[ptr->dev]);
|
|
+
|
|
+ ca = bch_dev_bkey_exists(c, ptr->dev);
|
|
+
|
|
+ /* more pointers follow: write this replica through a clone */
|
|
+ if (to_entry(ptr + 1) < ptrs.end) {
|
|
+ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
|
|
+ &ca->replica_set));
|
|
+
|
|
+ n->bio.bi_end_io = wbio->bio.bi_end_io;
|
|
+ n->bio.bi_private = wbio->bio.bi_private;
|
|
+ n->parent = wbio;
|
|
+ n->split = true;
|
|
+ n->bounce = false;
|
|
+ n->put_bio = true;
|
|
+ n->bio.bi_opf = wbio->bio.bi_opf;
|
|
+ bio_inc_remaining(&wbio->bio);
|
|
+ } else {
|
|
+ n = wbio;
|
|
+ n->split = false;
|
|
+ }
|
|
+
|
|
+ n->c = c;
|
|
+ n->dev = ptr->dev;
|
|
+ /* NOTE(review): btree writes take the ioref with READ - confirm intent */
|
|
+ n->have_ioref = bch2_dev_get_ioref(ca,
|
|
+ type == BCH_DATA_btree ? READ : WRITE);
|
|
+ n->submit_time = local_clock();
|
|
+ n->bio.bi_iter.bi_sector = ptr->offset;
|
|
+
|
|
+ if (likely(n->have_ioref)) {
|
|
+ this_cpu_add(ca->io_done->sectors[WRITE][type],
|
|
+ bio_sectors(&n->bio));
|
|
+
|
|
+ bio_set_dev(&n->bio, ca->disk_sb.bdev);
|
|
+ submit_bio(&n->bio);
|
|
+ } else {
|
|
+ n->bio.bi_status = BLK_STS_REMOVED;
|
|
+ bio_endio(&n->bio);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+static void __bch2_write(struct closure *);
|
|
+
|
|
+/*
|
|
+ * Final stage of a write op.  For a flush write with no error so far, pick
|
|
+ * up any journal error.  Then release the disk reservation, the c->writes
|
|
+ * ref and the keylist, record the elapsed time, and release c->io_in_flight
|
|
+ * unless the write was internal.  Completion is signalled through
|
|
+ * op->end_io when set, otherwise by returning the closure.
|
|
+ */
|
|
+static void bch2_write_done(struct closure *cl)
|
|
+{
|
|
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
|
+ struct bch_fs *c = op->c;
|
|
+
|
|
+ if (!op->error && (op->flags & BCH_WRITE_FLUSH))
|
|
+ op->error = bch2_journal_error(&c->journal);
|
|
+
|
|
+ bch2_disk_reservation_put(c, &op->res);
|
|
+ percpu_ref_put(&c->writes);
|
|
+ bch2_keylist_free(&op->insert_keys, op->inline_keys);
|
|
+
|
|
+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
|
|
+
|
|
+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
|
|
+ up(&c->io_in_flight);
|
|
+
|
|
+ if (op->end_io) {
|
|
+ EBUG_ON(cl->parent);
|
|
+ closure_debug_destroy(cl);
|
|
+ op->end_io(op);
|
|
+ } else {
|
|
+ closure_return(cl);
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * __bch2_write_index - after a write, update index to point to new data
|
|
+ * @op: write operation whose insert_keys are to be indexed
|
|
+ *
|
|
+ * Drops pointers to devices recorded in op->failed from each direct-data
|
|
+ * key; a key left with no pointers fails the whole op with -EIO and empties
|
|
+ * the keylist.  Surviving keys are compacted in place, offered to rebalance,
|
|
+ * and passed to op->index_update_fn().  Open buckets are released on every
|
|
+ * path.  Errors are reported through op->error.
|
|
+ */
|
|
+static void __bch2_write_index(struct bch_write_op *op)
|
|
+{
|
|
+	struct bch_fs *c = op->c;
|
|
+	struct keylist *keys = &op->insert_keys;
|
|
+	struct bch_extent_ptr *ptr;
|
|
+	struct bkey_i *src, *dst = keys->keys, *n, *k;
|
|
+	unsigned dev;
|
|
+	int ret;
|
|
+
|
|
+	for (src = keys->keys; src != keys->top; src = n) {
|
|
+		/* fetch the successor first: dropping pointers shrinks src */
|
|
+		n = bkey_next(src);
|
|
+
|
|
+		if (bkey_extent_is_direct_data(&src->k)) {
|
|
+			bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
|
|
+					    test_bit(ptr->dev, op->failed.d));
|
|
+
|
|
+			if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
|
|
+				ret = -EIO;
|
|
+				goto err;
|
|
+			}
|
|
+		}
|
|
+
|
|
+		if (dst != src)
|
|
+			memmove_u64s_down(dst, src, src->u64s);
|
|
+		dst = bkey_next(dst);
|
|
+	}
|
|
+
|
|
+	keys->top = dst;
|
|
+
|
|
+	/*
|
|
+	 * probably not the ideal place to hook this in, but I don't
|
|
+	 * particularly want to plumb io_opts all the way through the btree
|
|
+	 * update stack right now
|
|
+	 */
|
|
+	for_each_keylist_key(keys, k) {
|
|
+		bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
|
|
+
|
|
+		if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
|
|
+			bch2_check_set_feature(c, BCH_FEATURE_incompressible);
|
|
+
|
|
+	}
|
|
+
|
|
+	if (!bch2_keylist_empty(keys)) {
|
|
+		u64 sectors_start = keylist_sectors(keys);
|
|
+
|
|
+		/* reuse the function-level ret instead of shadowing it */
|
|
+		ret = op->index_update_fn(op);
|
|
+
|
|
+		BUG_ON(ret == -EINTR);
|
|
+		BUG_ON(keylist_sectors(keys) && !ret);
|
|
+
|
|
+		op->written += sectors_start - keylist_sectors(keys);
|
|
+
|
|
+		if (ret) {
|
|
+			bch_err_inum_ratelimited(c, op->pos.inode,
|
|
+				"write error %i from btree update", ret);
|
|
+			op->error = ret;
|
|
+		}
|
|
+	}
|
|
+out:
|
|
+	/* If a bucket wasn't written, we can't erasure code it: */
|
|
+	for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
|
|
+		bch2_open_bucket_write_error(c, &op->open_buckets, dev);
|
|
+
|
|
+	bch2_open_buckets_put(c, &op->open_buckets);
|
|
+	return;
|
|
+err:
|
|
+	keys->top = keys->keys;
|
|
+	op->error = ret;
|
|
+	goto out;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Closure step: run the index update, then choose the next stage.  If the
|
|
+ * write is not yet marked BCH_WRITE_DONE, continue with __bch2_write().
|
|
+ * Otherwise a successful flush write starts an async journal flush before
|
|
+ * bch2_write_done(); everything else goes straight to bch2_write_done().
|
|
+ */
|
|
+static void bch2_write_index(struct closure *cl)
|
|
+{
|
|
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
|
+ struct bch_fs *c = op->c;
|
|
+
|
|
+ __bch2_write_index(op);
|
|
+
|
|
+ if (!(op->flags & BCH_WRITE_DONE)) {
|
|
+ continue_at(cl, __bch2_write, index_update_wq(op));
|
|
+ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
|
|
+ bch2_journal_flush_seq_async(&c->journal,
|
|
+ *op_journal_seq(op),
|
|
+ cl);
|
|
+ continue_at(cl, bch2_write_done, index_update_wq(op));
|
|
+ } else {
|
|
+ continue_at_nobarrier(cl, bch2_write_done, NULL);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bio completion handler for data writes.  On an IO error the device is
|
|
+ * recorded in op->failed.  Latency is accounted and the device ioref dropped
|
|
+ * when one was held; bounce pages and the bio itself are freed as flagged.
|
|
+ * A split bio then completes its parent; otherwise the closure is either put
|
|
+ * or, with BCH_WRITE_SKIP_CLOSURE_PUT, continued at bch2_write_index().
|
|
+ */
|
|
+static void bch2_write_endio(struct bio *bio)
|
|
+{
|
|
+ struct closure *cl = bio->bi_private;
|
|
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
|
+ struct bch_write_bio *wbio = to_wbio(bio);
|
|
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
|
|
+ struct bch_fs *c = wbio->c;
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
|
|
+
|
|
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
|
|
+ op->pos.inode,
|
|
+ op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */
|
|
+ "data write error: %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status)))
|
|
+ set_bit(wbio->dev, op->failed.d);
|
|
+
|
|
+ if (wbio->have_ioref) {
|
|
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+ }
|
|
+
|
|
+ if (wbio->bounce)
|
|
+ bch2_bio_free_pages_pool(c, bio);
|
|
+
|
|
+ if (wbio->put_bio)
|
|
+ bio_put(bio);
|
|
+
|
|
+ /* bio may have been freed above; only parent, op and cl are used below */
|
|
+ if (parent)
|
|
+ bio_endio(&parent->bio);
|
|
+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT))
|
|
+ closure_put(cl);
|
|
+ else
|
|
+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Append one extent key describing the data just prepared to
|
|
+ * op->insert_keys.  op->pos is advanced by the uncompressed size and becomes
|
|
+ * the key's end position.  A crc entry is added only when a checksum,
|
|
+ * compression type or nonce is set.  One pointer is added per open bucket in
|
|
+ * @wp, and the compressed size is deducted from the write point and from
|
|
+ * each bucket.
|
|
+ */
|
|
+static void init_append_extent(struct bch_write_op *op,
|
|
+ struct write_point *wp,
|
|
+ struct bversion version,
|
|
+ struct bch_extent_crc_unpacked crc)
|
|
+{
|
|
+ struct bch_fs *c = op->c;
|
|
+ struct bkey_i_extent *e;
|
|
+ struct open_bucket *ob;
|
|
+ unsigned i;
|
|
+
|
|
+ BUG_ON(crc.compressed_size > wp->sectors_free);
|
|
+ wp->sectors_free -= crc.compressed_size;
|
|
+ op->pos.offset += crc.uncompressed_size;
|
|
+
|
|
+ e = bkey_extent_init(op->insert_keys.top);
|
|
+ e->k.p = op->pos;
|
|
+ e->k.size = crc.uncompressed_size;
|
|
+ e->k.version = version;
|
|
+
|
|
+ if (crc.csum_type ||
|
|
+ crc.compression_type ||
|
|
+ crc.nonce)
|
|
+ bch2_extent_crc_append(&e->k_i, crc);
|
|
+
|
|
+ open_bucket_for_each(c, &wp->ptrs, ob, i) {
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
|
|
+ union bch_extent_entry *end =
|
|
+ bkey_val_end(bkey_i_to_s(&e->k_i));
|
|
+
|
|
+ end->ptr = ob->ptr;
|
|
+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
|
|
+ /* cached if the device has zero durability or the write asks for it */
|
|
+ end->ptr.cached = !ca->mi.durability ||
|
|
+ (op->flags & BCH_WRITE_CACHED) != 0;
|
|
+ /* point at the first unused sector of the bucket */
|
|
+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free;
|
|
+
|
|
+ e->k.u64s++;
|
|
+
|
|
+ BUG_ON(crc.compressed_size > ob->sectors_free);
|
|
+ ob->sectors_free -= crc.compressed_size;
|
|
+ }
|
|
+
|
|
+ bch2_keylist_push(&op->insert_keys);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Allocate the destination bio for a bounced write.  Its size is the smaller
|
|
+ * of the write point's free space and the remaining source data.  With @buf
|
|
+ * set the bio is mapped onto that buffer.  Otherwise bounce pages are taken
|
|
+ * from the pool for up to c->sb.encoded_extent_max sectors and the rest from
|
|
+ * bch2_bio_alloc_pages(); *@page_alloc_failed records a failure of that
|
|
+ * second allocation.
|
|
+ */
|
|
+static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
|
|
+ struct write_point *wp,
|
|
+ struct bio *src,
|
|
+ bool *page_alloc_failed,
|
|
+ void *buf)
|
|
+{
|
|
+ struct bch_write_bio *wbio;
|
|
+ struct bio *bio;
|
|
+ unsigned output_available =
|
|
+ min(wp->sectors_free << 9, src->bi_iter.bi_size);
|
|
+ /* include @buf's offset within its first page in the page count */
|
|
+ unsigned pages = DIV_ROUND_UP(output_available +
|
|
+ (buf
|
|
+ ? ((unsigned long) buf & (PAGE_SIZE - 1))
|
|
+ : 0), PAGE_SIZE);
|
|
+
|
|
+ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
|
|
+ wbio = wbio_init(bio);
|
|
+ wbio->put_bio = true;
|
|
+ /* copy WRITE_SYNC flag */
|
|
+ wbio->bio.bi_opf = src->bi_opf;
|
|
+
|
|
+ if (buf) {
|
|
+ bch2_bio_map(bio, buf, output_available);
|
|
+ return bio;
|
|
+ }
|
|
+
|
|
+ wbio->bounce = true;
|
|
+
|
|
+ /*
|
|
+ * We can't use mempool for more than c->sb.encoded_extent_max
|
|
+ * worth of pages, but we'd like to allocate more if we can:
|
|
+ */
|
|
+ bch2_bio_alloc_pages_pool(c, bio,
|
|
+ min_t(unsigned, output_available,
|
|
+ c->sb.encoded_extent_max << 9));
|
|
+
|
|
+ if (bio->bi_iter.bi_size < output_available)
|
|
+ *page_alloc_failed =
|
|
+ bch2_bio_alloc_pages(bio,
|
|
+ output_available -
|
|
+ bio->bi_iter.bi_size,
|
|
+ GFP_NOFS) != 0;
|
|
+
|
|
+ return bio;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Re-checksum the op's bio with @new_csum_type over the live range
|
|
+ * [op->crc.offset, +op->crc.live_size).  If switching types would cross the
|
|
+ * encrypted/unencrypted boundary, the existing type is kept.  On success the
|
|
+ * bio is narrowed to the live range and op->crc replaced; otherwise the
|
|
+ * error from bch2_rechecksum_bio() is returned.
|
|
+ */
|
|
+static int bch2_write_rechecksum(struct bch_fs *c,
|
|
+ struct bch_write_op *op,
|
|
+ unsigned new_csum_type)
|
|
+{
|
|
+ struct bio *bio = &op->wbio.bio;
|
|
+ struct bch_extent_crc_unpacked new_crc;
|
|
+ int ret;
|
|
+
|
|
+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */
|
|
+
|
|
+ if (bch2_csum_type_is_encryption(op->crc.csum_type) !=
|
|
+ bch2_csum_type_is_encryption(new_csum_type))
|
|
+ new_csum_type = op->crc.csum_type;
|
|
+
|
|
+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc,
|
|
+ NULL, &new_crc,
|
|
+ op->crc.offset, op->crc.live_size,
|
|
+ new_csum_type);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ bio_advance(bio, op->crc.offset << 9);
|
|
+ bio->bi_iter.bi_size = op->crc.live_size << 9;
|
|
+ op->crc = new_crc;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Decrypt the op's bio in place when its current checksum type is an
|
|
+ * encryption type; otherwise do nothing and return 0.  The stored checksum
|
|
+ * is verified first and a mismatch returns -EIO.  Afterwards the op's
|
|
+ * checksum type and value are cleared.
|
|
+ */
|
|
+static int bch2_write_decrypt(struct bch_write_op *op)
|
|
+{
|
|
+ struct bch_fs *c = op->c;
|
|
+ struct nonce nonce = extent_nonce(op->version, op->crc);
|
|
+ struct bch_csum csum;
|
|
+
|
|
+ if (!bch2_csum_type_is_encryption(op->crc.csum_type))
|
|
+ return 0;
|
|
+
|
|
+ /*
|
|
+ * If we need to decrypt data in the write path, we'll no longer be able
|
|
+ * to verify the existing checksum (poly1305 mac, in this case) after
|
|
+ * it's decrypted - this is the last point we'll be able to reverify the
|
|
+ * checksum:
|
|
+ */
|
|
+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
|
|
+ if (bch2_crc_cmp(op->crc.csum, csum))
|
|
+ return -EIO;
|
|
+
|
|
+ /* presumably the same call both encrypts and decrypts - confirm */
|
|
+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio);
|
|
+ op->crc.csum_type = 0;
|
|
+ op->crc.csum = (struct bch_csum) { 0, 0 };
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Prepare already-encoded data (BCH_WRITE_DATA_ENCODED) for writing.
|
|
+ * Returns:
|
|
+ *  PREP_ENCODED_OK           - not encoded, or decoded far enough that the
|
|
+ *                              normal write path can continue;
|
|
+ *  PREP_ENCODED_DO_WRITE     - the extent can be written exactly as is;
|
|
+ *  PREP_ENCODED_CHECKSUM_ERR - a checksum, rechecksum or decrypt step failed;
|
|
+ *  PREP_ENCODED_ERR          - in-place decompression failed.
|
|
+ */
|
|
+static enum prep_encoded_ret {
|
|
+ PREP_ENCODED_OK,
|
|
+ PREP_ENCODED_ERR,
|
|
+ PREP_ENCODED_CHECKSUM_ERR,
|
|
+ PREP_ENCODED_DO_WRITE,
|
|
+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
|
|
+{
|
|
+ struct bch_fs *c = op->c;
|
|
+ struct bio *bio = &op->wbio.bio;
|
|
+
|
|
+ if (!(op->flags & BCH_WRITE_DATA_ENCODED))
|
|
+ return PREP_ENCODED_OK;
|
|
+
|
|
+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size);
|
|
+
|
|
+ /* Can we just write the entire extent as is? */
|
|
+ if (op->crc.uncompressed_size == op->crc.live_size &&
|
|
+ op->crc.compressed_size <= wp->sectors_free &&
|
|
+ (op->crc.compression_type == op->compression_type ||
|
|
+ op->incompressible)) {
|
|
+ if (!crc_is_compressed(op->crc) &&
|
|
+ op->csum_type != op->crc.csum_type &&
|
|
+ bch2_write_rechecksum(c, op, op->csum_type))
|
|
+ return PREP_ENCODED_CHECKSUM_ERR;
|
|
+
|
|
+ return PREP_ENCODED_DO_WRITE;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If the data is compressed and we couldn't write the entire extent as
|
|
+ * is, we have to decompress it:
|
|
+ */
|
|
+ if (crc_is_compressed(op->crc)) {
|
|
+ struct bch_csum csum;
|
|
+
|
|
+ if (bch2_write_decrypt(op))
|
|
+ return PREP_ENCODED_CHECKSUM_ERR;
|
|
+
|
|
+ /* Last point we can still verify checksum: */
|
|
+ csum = bch2_checksum_bio(c, op->crc.csum_type,
|
|
+ extent_nonce(op->version, op->crc),
|
|
+ bio);
|
|
+ if (bch2_crc_cmp(op->crc.csum, csum))
|
|
+ return PREP_ENCODED_CHECKSUM_ERR;
|
|
+
|
|
+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
|
|
+ return PREP_ENCODED_ERR;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * No longer have compressed data after this point - data might be
|
|
+ * encrypted:
|
|
+ */
|
|
+
|
|
+ /*
|
|
+ * If the data is checksummed and we're only writing a subset,
|
|
+ * rechecksum and adjust bio to point to currently live data:
|
|
+ */
|
|
+ if ((op->crc.live_size != op->crc.uncompressed_size ||
|
|
+ op->crc.csum_type != op->csum_type) &&
|
|
+ bch2_write_rechecksum(c, op, op->csum_type))
|
|
+ return PREP_ENCODED_CHECKSUM_ERR;
|
|
+
|
|
+ /*
|
|
+ * If we want to compress the data, it has to be decrypted:
|
|
+ */
|
|
+ if ((op->compression_type ||
|
|
+ bch2_csum_type_is_encryption(op->crc.csum_type) !=
|
|
+ bch2_csum_type_is_encryption(op->csum_type)) &&
|
|
+ bch2_write_decrypt(op))
|
|
+ return PREP_ENCODED_CHECKSUM_ERR;
|
|
+
|
|
+ return PREP_ENCODED_OK;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_write_extent - turn the front of op->wbio.bio into extent keys plus a
|
|
+ * bio to submit.
|
|
+ *
|
|
+ * Each pass of the main loop optionally compresses, then encrypts and
|
|
+ * checksums (or rechecksums already encoded data), calls
|
|
+ * init_append_extent() for that chunk and advances @src. The loop stops when
|
|
+ * @src or @dst is exhausted, @wp has no sectors free, or the keylist cannot
|
|
+ * be grown.
|
|
+ *
|
|
+ * On success *@_dst is set to the bio to submit and the return value is
|
|
+ * nonzero iff @src still has data left to write. On failure a negative errno
|
|
+ * is returned.
|
|
+ */
|
|
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
|
|
+ struct bio **_dst)
|
|
+{
|
|
+ struct bch_fs *c = op->c;
|
|
+ struct bio *src = &op->wbio.bio, *dst = src;
|
|
+ struct bvec_iter saved_iter;
|
|
+ void *ec_buf;
|
|
+ struct bpos ec_pos = op->pos;
|
|
+ unsigned total_output = 0, total_input = 0;
|
|
+ bool bounce = false;
|
|
+ bool page_alloc_failed = false;
|
|
+ int ret, more = 0;
|
|
+
|
|
+ BUG_ON(!bio_sectors(src));
|
|
+
|
|
+ ec_buf = bch2_writepoint_ec_buf(c, wp);
|
|
+
|
|
+ switch (bch2_write_prep_encoded_data(op, wp)) {
|
|
+ case PREP_ENCODED_OK:
|
|
+ break;
|
|
+ case PREP_ENCODED_ERR:
|
|
+ ret = -EIO;
|
|
+ goto err;
|
|
+ case PREP_ENCODED_CHECKSUM_ERR:
|
|
+ /*
|
|
+ * Report the failure through csum_err (log + -EIO) rather than
|
|
+ * crashing; a BUG() here made that error path unreachable:
|
|
+ */
|
|
+ goto csum_err;
|
|
+ case PREP_ENCODED_DO_WRITE:
|
|
+ /* XXX look for bug here */
|
|
+ if (ec_buf) {
|
|
+ dst = bch2_write_bio_alloc(c, wp, src,
|
|
+ &page_alloc_failed,
|
|
+ ec_buf);
|
|
+ bio_copy_data(dst, src);
|
|
+ bounce = true;
|
|
+ }
|
|
+ init_append_extent(op, wp, op->version, op->crc);
|
|
+ goto do_write;
|
|
+ }
|
|
+
|
|
+ /* Decide whether the data must be copied into a bounce bio first: */
|
|
+ if (ec_buf ||
|
|
+ op->compression_type ||
|
|
+ (op->csum_type &&
|
|
+ !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
|
|
+ (bch2_csum_type_is_encryption(op->csum_type) &&
|
|
+ !(op->flags & BCH_WRITE_PAGES_OWNED))) {
|
|
+ dst = bch2_write_bio_alloc(c, wp, src,
|
|
+ &page_alloc_failed,
|
|
+ ec_buf);
|
|
+ bounce = true;
|
|
+ }
|
|
+
|
|
+ saved_iter = dst->bi_iter;
|
|
+
|
|
+ do {
|
|
+ struct bch_extent_crc_unpacked crc =
|
|
+ (struct bch_extent_crc_unpacked) { 0 };
|
|
+ struct bversion version = op->version;
|
|
+ size_t dst_len, src_len;
|
|
+
|
|
+ if (page_alloc_failed &&
|
|
+ bio_sectors(dst) < wp->sectors_free &&
|
|
+ bio_sectors(dst) < c->sb.encoded_extent_max)
|
|
+ break;
|
|
+
|
|
+ BUG_ON(op->compression_type &&
|
|
+ (op->flags & BCH_WRITE_DATA_ENCODED) &&
|
|
+ bch2_csum_type_is_encryption(op->crc.csum_type));
|
|
+ BUG_ON(op->compression_type && !bounce);
|
|
+
|
|
+ crc.compression_type = op->incompressible
|
|
+ ? BCH_COMPRESSION_TYPE_incompressible
|
|
+ : op->compression_type
|
|
+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
|
|
+ op->compression_type)
|
|
+ : 0;
|
|
+ if (!crc_is_compressed(crc)) {
|
|
+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
|
|
+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9);
|
|
+
|
|
+ if (op->csum_type)
|
|
+ dst_len = min_t(unsigned, dst_len,
|
|
+ c->sb.encoded_extent_max << 9);
|
|
+
|
|
+ if (bounce) {
|
|
+ swap(dst->bi_iter.bi_size, dst_len);
|
|
+ bio_copy_data(dst, src);
|
|
+ swap(dst->bi_iter.bi_size, dst_len);
|
|
+ }
|
|
+
|
|
+ src_len = dst_len;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!src_len || !dst_len);
|
|
+
|
|
+ if (bch2_csum_type_is_encryption(op->csum_type)) {
|
|
+ if (bversion_zero(version)) {
|
|
+ version.lo = atomic64_inc_return(&c->key_version);
|
|
+ } else {
|
|
+ crc.nonce = op->nonce;
|
|
+ op->nonce += src_len >> 9;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
|
|
+ !crc_is_compressed(crc) &&
|
|
+ bch2_csum_type_is_encryption(op->crc.csum_type) ==
|
|
+ bch2_csum_type_is_encryption(op->csum_type)) {
|
|
+ /*
|
|
+ * Note: when we're using rechecksum(), we need to be
|
|
+ * checksumming @src because it has all the data our
|
|
+ * existing checksum covers - if we bounced (because we
|
|
+ * were trying to compress), @dst will only have the
|
|
+ * part of the data the new checksum will cover.
|
|
+ *
|
|
+ * But normally we want to be checksumming post bounce,
|
|
+ * because part of the reason for bouncing is so the
|
|
+ * data can't be modified (by userspace) while it's in
|
|
+ * flight.
|
|
+ */
|
|
+ if (bch2_rechecksum_bio(c, src, version, op->crc,
|
|
+ &crc, &op->crc,
|
|
+ src_len >> 9,
|
|
+ bio_sectors(src) - (src_len >> 9),
|
|
+ op->csum_type))
|
|
+ goto csum_err;
|
|
+ } else {
|
|
+ if ((op->flags & BCH_WRITE_DATA_ENCODED) &&
|
|
+ bch2_rechecksum_bio(c, src, version, op->crc,
|
|
+ NULL, &op->crc,
|
|
+ src_len >> 9,
|
|
+ bio_sectors(src) - (src_len >> 9),
|
|
+ op->crc.csum_type))
|
|
+ goto csum_err;
|
|
+
|
|
+ crc.compressed_size = dst_len >> 9;
|
|
+ crc.uncompressed_size = src_len >> 9;
|
|
+ crc.live_size = src_len >> 9;
|
|
+
|
|
+ swap(dst->bi_iter.bi_size, dst_len);
|
|
+ bch2_encrypt_bio(c, op->csum_type,
|
|
+ extent_nonce(version, crc), dst);
|
|
+ crc.csum = bch2_checksum_bio(c, op->csum_type,
|
|
+ extent_nonce(version, crc), dst);
|
|
+ crc.csum_type = op->csum_type;
|
|
+ swap(dst->bi_iter.bi_size, dst_len);
|
|
+ }
|
|
+
|
|
+ init_append_extent(op, wp, version, crc);
|
|
+
|
|
+ if (dst != src)
|
|
+ bio_advance(dst, dst_len);
|
|
+ bio_advance(src, src_len);
|
|
+ total_output += dst_len;
|
|
+ total_input += src_len;
|
|
+ } while (dst->bi_iter.bi_size &&
|
|
+ src->bi_iter.bi_size &&
|
|
+ wp->sectors_free &&
|
|
+ !bch2_keylist_realloc(&op->insert_keys,
|
|
+ op->inline_keys,
|
|
+ ARRAY_SIZE(op->inline_keys),
|
|
+ BKEY_EXTENT_U64s_MAX));
|
|
+
|
|
+ more = src->bi_iter.bi_size != 0;
|
|
+
|
|
+ dst->bi_iter = saved_iter;
|
|
+
|
|
+ if (dst == src && more) {
|
|
+ BUG_ON(total_output != total_input);
|
|
+
|
|
+ dst = bio_split(src, total_input >> 9,
|
|
+ GFP_NOIO, &c->bio_write);
|
|
+ wbio_init(dst)->put_bio = true;
|
|
+ /* copy WRITE_SYNC flag */
|
|
+ dst->bi_opf = src->bi_opf;
|
|
+ }
|
|
+
|
|
+ dst->bi_iter.bi_size = total_output;
|
|
+do_write:
|
|
+ /* might have done a realloc... */
|
|
+ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9);
|
|
+
|
|
+ *_dst = dst;
|
|
+ return more;
|
|
+csum_err:
|
|
+ bch_err(c, "error verifying existing checksum while "
|
|
+ "rewriting existing data (memory corruption?)");
|
|
+ ret = -EIO;
|
|
+err:
|
|
+ if (to_wbio(dst)->bounce)
|
|
+ bch2_bio_free_pages_pool(c, dst);
|
|
+ if (to_wbio(dst)->put_bio)
|
|
+ bio_put(dst);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+/*
|
|
+ * __bch2_write - closure function performing the data portion of a write.
|
|
+ *
|
|
+ * Repeatedly allocates space with bch2_alloc_sectors_start(), lets
|
|
+ * bch2_write_extent() build the next bio and submits it with
|
|
+ * bch2_submit_wbio_replicas(), until bch2_write_extent() returns 0 (no
|
|
+ * input left).
|
|
+ *
|
|
+ * When the loop cannot proceed right away (open bucket array or keylist
|
|
+ * full, allocator returned -EAGAIN) control goes to flush_io: a workqueue
|
|
+ * worker hands off to bch2_write_index(); any other context waits with
|
|
+ * closure_sync(), flushes pending keys through __bch2_write_index() and
|
|
+ * starts over at "again".
|
|
+ *
|
|
+ * Errors are stored in op->error and BCH_WRITE_DONE is set before
|
|
+ * continuing to bch2_write_index().
|
|
+ */
|
|
+static void __bch2_write(struct closure *cl)
|
|
+{
|
|
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
|
+ struct bch_fs *c = op->c;
|
|
+ struct write_point *wp;
|
|
+ struct bio *bio;
|
|
+ bool skip_put = true; /* while true, submitted bios take no closure ref of their own */
|
|
+ unsigned nofs_flags;
|
|
+ int ret;
|
|
+
|
|
+ nofs_flags = memalloc_nofs_save();
|
|
+again:
|
|
+ memset(&op->failed, 0, sizeof(op->failed));
|
|
+
|
|
+ do {
|
|
+ struct bkey_i *key_to_write;
|
|
+ unsigned key_to_write_offset = op->insert_keys.top_p -
|
|
+ op->insert_keys.keys_p; /* offset, not pointer: see the recomputation of key_to_write below */
|
|
+
|
|
+ /* +1 for possible cache device: */
|
|
+ if (op->open_buckets.nr + op->nr_replicas + 1 >
|
|
+ ARRAY_SIZE(op->open_buckets.v))
|
|
+ goto flush_io;
|
|
+
|
|
+ if (bch2_keylist_realloc(&op->insert_keys,
|
|
+ op->inline_keys,
|
|
+ ARRAY_SIZE(op->inline_keys),
|
|
+ BKEY_EXTENT_U64s_MAX))
|
|
+ goto flush_io;
|
|
+
|
|
+ if ((op->flags & BCH_WRITE_FROM_INTERNAL) &&
|
|
+ percpu_ref_is_dying(&c->writes)) {
|
|
+ ret = -EROFS;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * The copygc thread is now global, which means it's no longer
|
|
+ * freeing up space on specific disks, which means that
|
|
+ * allocations for specific disks may hang arbitrarily long:
|
|
+ */
|
|
+ wp = bch2_alloc_sectors_start(c,
|
|
+ op->target,
|
|
+ op->opts.erasure_code,
|
|
+ op->write_point,
|
|
+ &op->devs_have,
|
|
+ op->nr_replicas,
|
|
+ op->nr_replicas_required,
|
|
+ op->alloc_reserve,
|
|
+ op->flags,
|
|
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
|
|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
|
|
+ EBUG_ON(!wp);
|
|
+
|
|
+ if (unlikely(IS_ERR(wp))) {
|
|
+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
|
|
+ ret = PTR_ERR(wp);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ goto flush_io;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * It's possible for the allocator to fail, put us on the
|
|
+ * freelist waitlist, and then succeed in one of various retry
|
|
+ * paths: if that happens, we need to disable the skip_put
|
|
+ * optimization because otherwise there won't necessarily be a
|
|
+ * barrier before we free the bch_write_op:
|
|
+ */
|
|
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
|
|
+ skip_put = false;
|
|
+
|
|
+ bch2_open_bucket_get(c, wp, &op->open_buckets);
|
|
+ ret = bch2_write_extent(op, wp, &bio);
|
|
+ bch2_alloc_sectors_done(c, wp);
|
|
+
|
|
+ if (ret < 0)
|
|
+ goto err;
|
|
+
|
|
+ if (ret) {
|
|
+ skip_put = false; /* more data remains, so more than one bio will be in flight */
|
|
+ } else {
|
|
+ /*
|
|
+ * for the skip_put optimization this has to be set
|
|
+ * before we submit the bio:
|
|
+ */
|
|
+ op->flags |= BCH_WRITE_DONE;
|
|
+ }
|
|
+
|
|
+ bio->bi_end_io = bch2_write_endio;
|
|
+ bio->bi_private = &op->cl;
|
|
+ bio->bi_opf |= REQ_OP_WRITE;
|
|
+
|
|
+ if (!skip_put)
|
|
+ closure_get(bio->bi_private);
|
|
+ else
|
|
+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT;
|
|
+
|
|
+ key_to_write = (void *) (op->insert_keys.keys_p +
|
|
+ key_to_write_offset);
|
|
+
|
|
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
|
|
+ key_to_write);
|
|
+ } while (ret); /* ret > 0: bch2_write_extent() left input unwritten */
|
|
+
|
|
+ if (!skip_put)
|
|
+ continue_at(cl, bch2_write_index, index_update_wq(op));
|
|
+out:
|
|
+ memalloc_nofs_restore(nofs_flags);
|
|
+ return;
|
|
+err:
|
|
+ op->error = ret;
|
|
+ op->flags |= BCH_WRITE_DONE;
|
|
+
|
|
+ continue_at(cl, bch2_write_index, index_update_wq(op));
|
|
+ goto out;
|
|
+flush_io:
|
|
+ /*
|
|
+ * If the write can't all be submitted at once, we generally want to
|
|
+ * block synchronously as that signals backpressure to the caller.
|
|
+ *
|
|
+ * However, if we're running out of a workqueue, we can't block here
|
|
+ * because we'll be blocking other work items from completing:
|
|
+ */
|
|
+ if (current->flags & PF_WQ_WORKER) {
|
|
+ continue_at(cl, bch2_write_index, index_update_wq(op));
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ closure_sync(cl);
|
|
+
|
|
+ if (!bch2_keylist_empty(&op->insert_keys)) {
|
|
+ __bch2_write_index(op);
|
|
+
|
|
+ if (op->error) {
|
|
+ op->flags |= BCH_WRITE_DONE;
|
|
+ continue_at_nobarrier(cl, bch2_write_done, NULL);
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ goto again;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Store the first @data_len bytes of the write's bio directly in an
|
|
+ * inline_data key (zero padded to a multiple of 8 bytes), push it on
|
|
+ * op->insert_keys and continue to bch2_write_index(). If the keylist cannot
|
|
+ * be grown, record the error in op->error and finish via bch2_write_done().
|
|
+ */
|
|
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
|
|
+{
|
|
+	struct closure *cl = &op->cl;
|
|
+	struct bio *src = &op->wbio.bio;
|
|
+	struct bkey_i_inline_data *inline_key;
|
|
+	struct bvec_iter copy_iter;
|
|
+	unsigned nr_sectors;
|
|
+	int ret;
|
|
+
|
|
+	bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
|
|
+
|
|
+	ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
|
|
+				   ARRAY_SIZE(op->inline_keys),
|
|
+				   BKEY_U64s + DIV_ROUND_UP(data_len, 8));
|
|
+	if (ret) {
|
|
+		op->error = ret;
|
|
+		bch2_write_done(&op->cl);
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	nr_sectors = bio_sectors(src);
|
|
+	op->pos.offset += nr_sectors;
|
|
+
|
|
+	inline_key = bkey_inline_data_init(op->insert_keys.top);
|
|
+	inline_key->k.p = op->pos;
|
|
+	inline_key->k.version = op->version;
|
|
+	inline_key->k.size = nr_sectors;
|
|
+
|
|
+	/* copy only the live bytes, not the whole bio: */
|
|
+	copy_iter = src->bi_iter;
|
|
+	copy_iter.bi_size = data_len;
|
|
+	memcpy_from_bio(inline_key->v.data, src, copy_iter);
|
|
+
|
|
+	/* zero pad the value up to the next multiple of 8 bytes: */
|
|
+	for (; data_len & 7; data_len++)
|
|
+		inline_key->v.data[data_len] = '\0';
|
|
+
|
|
+	set_bkey_val_bytes(&inline_key->k, data_len);
|
|
+	bch2_keylist_push(&op->insert_keys);
|
|
+
|
|
+	op->flags |= BCH_WRITE_WROTE_DATA_INLINE|BCH_WRITE_DONE;
|
|
+
|
|
+	continue_at_nobarrier(cl, bch2_write_index, NULL);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_write - entry point for submitting a data write
|
|
+ * @cl: closure embedded in the struct bch_write_op describing the write
|
|
+ *
|
|
+ * The data to write is op->wbio.bio. Misaligned writes (sector count not
|
|
+ * aligned to c->opts.block_size) fail with -EIO; writes fail with -EROFS
|
|
+ * when the nochanges option is set or a reference on c->writes cannot be
|
|
+ * taken.
|
|
+ *
|
|
+ * If the inline_data option is enabled and the data length (clamped to
|
|
+ * op->new_i_size) is at most min(block_bytes(c) / 2, 1024) bytes, the data
|
|
+ * is stored directly in a key by bch2_write_data_inline(). Otherwise the
|
|
+ * closure continues in __bch2_write(), which writes the data out and creates
|
|
+ * a list of keys to be inserted (if the data won't fit in a single open
|
|
+ * bucket, there will be multiple keys).
|
|
+ *
|
|
+ * On the early error paths the disk reservation is released and the write
|
|
+ * is completed through op->end_io() if set, or closure_return() otherwise.
|
|
+ *
|
|
+ * NOTE(review): the earlier wording of this comment (titled "bch_write")
|
|
+ * spoke of cache devices, flash only volumes, the moving garbage collector,
|
|
+ * a journal step before the keys reach the btree, and an op->discard flag
|
|
+ * that invalidates instead of inserting. None of that is visible in this
|
|
+ * function - confirm against the rest of the write path.
|
|
+ */
|
|
+void bch2_write(struct closure *cl)
|
|
+{
|
|
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
|
|
+ struct bio *bio = &op->wbio.bio;
|
|
+ struct bch_fs *c = op->c;
|
|
+ unsigned data_len;
|
|
+
|
|
+ BUG_ON(!op->nr_replicas);
|
|
+ BUG_ON(!op->write_point.v);
|
|
+ BUG_ON(!bkey_cmp(op->pos, POS_MAX));
|
|
+
|
|
+ op->start_time = local_clock();
|
|
+ bch2_keylist_init(&op->insert_keys, op->inline_keys);
|
|
+ wbio_init(bio)->put_bio = false;
|
|
+
|
|
+ if (bio_sectors(bio) & (c->opts.block_size - 1)) {
|
|
+ bch_err_inum_ratelimited(c, op->pos.inode,
|
|
+ "misaligned write");
|
|
+ op->error = -EIO;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (c->opts.nochanges ||
|
|
+ !percpu_ref_tryget(&c->writes)) {
|
|
+ op->error = -EROFS;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Can't ratelimit copygc - we'd deadlock:
|
|
+ */
|
|
+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL))
|
|
+ down(&c->io_in_flight);
|
|
+
|
|
+ bch2_increment_clock(c, bio_sectors(bio), WRITE);
|
|
+
|
|
+ data_len = min_t(u64, bio->bi_iter.bi_size,
|
|
+ op->new_i_size - (op->pos.offset << 9)); /* bytes of the bio that lie below new_i_size */
|
|
+
|
|
+ if (c->opts.inline_data &&
|
|
+ data_len <= min(block_bytes(c) / 2, 1024U)) {
|
|
+ bch2_write_data_inline(op, data_len);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ continue_at_nobarrier(cl, __bch2_write, NULL);
|
|
+ return;
|
|
+err:
|
|
+ bch2_disk_reservation_put(c, &op->res);
|
|
+
|
|
+ if (op->end_io) {
|
|
+ EBUG_ON(cl->parent);
|
|
+ closure_debug_destroy(cl);
|
|
+ op->end_io(op);
|
|
+ } else {
|
|
+ closure_return(cl);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Cache promotion on read */
|
|
+
|
|
+/*
|
|
+ * State for one in-flight promotion: tracked in c->promote_table keyed by
|
|
+ * @pos, freed with kfree_rcu() through @rcu.
|
|
+ */
|
|
+struct promote_op {
|
|
+ struct closure cl;
|
|
+ struct rcu_head rcu;
|
|
+ u64 start_time;
|
|
+
|
|
+ struct rhash_head hash;
|
|
+ struct bpos pos;
|
|
+
|
|
+ struct migrate_write write;
|
|
+ /* flexible array member instead of the deprecated zero-length array: */
|
|
+ struct bio_vec bi_inline_vecs[]; /* must be last */
|
|
+};
|
|
+
|
|
+/* c->promote_table is keyed by promote_op::pos, linked through promote_op::hash */
|
|
+static const struct rhashtable_params bch_promote_params = {
|
|
+	.key_len	= sizeof(struct bpos),
|
|
+	.key_offset	= offsetof(struct promote_op, pos),
|
|
+	.head_offset	= offsetof(struct promote_op, hash),
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Decide whether a read of @k at @pos should also start a promotion: the
|
|
+ * read must allow it, a promote target must be configured, the key must not
|
|
+ * already have a pointer on that target, the target must not be congested,
|
|
+ * and no promotion for @pos may already be in c->promote_table.
|
|
+ */
|
|
+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k,
|
|
+				  struct bpos pos,
|
|
+				  struct bch_io_opts opts,
|
|
+				  unsigned flags)
|
|
+{
|
|
+	/* XXX trace the congested case */
|
|
+	if (!(flags & BCH_READ_MAY_PROMOTE) ||
|
|
+	    !opts.promote_target ||
|
|
+	    bch2_bkey_has_target(c, k, opts.promote_target) ||
|
|
+	    bch2_target_congested(c, opts.promote_target))
|
|
+		return false;
|
|
+
|
|
+	return !rhashtable_lookup_fast(&c->promote_table, &pos,
|
|
+				       bch_promote_params);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Tear down a promote_op: remove it from c->promote_table (which must
|
|
+ * succeed), drop the c->writes reference and free it via kfree_rcu().
|
|
+ */
|
|
+static void promote_free(struct bch_fs *c, struct promote_op *op)
|
|
+{
|
|
+	int err = rhashtable_remove_fast(&c->promote_table, &op->hash,
|
|
+					 bch_promote_params);
|
|
+
|
|
+	BUG_ON(err);
|
|
+
|
|
+	percpu_ref_put(&c->writes);
|
|
+	kfree_rcu(op, rcu);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Closure destructor for a promotion: account the elapsed time, give the
|
|
+ * write bio's pages back and free the promote_op.
|
|
+ */
|
|
+static void promote_done(struct closure *cl)
|
|
+{
|
|
+	struct promote_op *promote = container_of(cl, struct promote_op, cl);
|
|
+	struct bch_fs *c = promote->write.op.c;
|
|
+
|
|
+	bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
|
|
+			       promote->start_time);
|
|
+	bch2_bio_free_pages_pool(c, &promote->write.op.wbio.bio);
|
|
+
|
|
+	promote_free(c, promote);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Kick off the write half of a promotion: hand the bounce pages of @rbio
|
|
+ * over to the promote write bio and start bch2_write() on c->wq, with
|
|
+ * promote_done() as the closure destructor.
|
|
+ */
|
|
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
|
|
+{
|
|
+	struct bch_fs *c = rbio->c;
|
|
+	struct closure *cl = &op->cl;
|
|
+	struct bio *src = &rbio->bio;
|
|
+	struct bio *dst = &op->write.op.wbio.bio;
|
|
+
|
|
+	trace_promote(src);
|
|
+
|
|
+	/* we now own pages: */
|
|
+	BUG_ON(!rbio->bounce);
|
|
+	BUG_ON(src->bi_vcnt > dst->bi_max_vecs);
|
|
+
|
|
+	memcpy(dst->bi_io_vec, src->bi_io_vec,
|
|
+	       sizeof(struct bio_vec) * src->bi_vcnt);
|
|
+	swap(dst->bi_vcnt, src->bi_vcnt);
|
|
+
|
|
+	bch2_migrate_read_done(&op->write, rbio);
|
|
+
|
|
+	closure_init(cl, NULL);
|
|
+	closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
|
|
+	closure_return_with_destructor(cl, promote_done);
|
|
+}
|
|
+/*
|
|
+ * __promote_alloc - allocate a promote_op and the bounce read bio feeding it.
|
|
+ *
|
|
+ * Takes a reference on c->writes, allocates the op with room for enough
|
|
+ * bio_vecs to cover @sectors, allocates *@rbio (kzalloc, not the mempool)
|
|
+ * with pages for @sectors, inserts the op into c->promote_table keyed by
|
|
+ * @pos, and initialises the migrate write with DATA_PROMOTE, a single
|
|
+ * replica and opts.promote_target as target.
|
|
+ *
|
|
+ * Returns the new op, or NULL on failure, in which case everything taken so
|
|
+ * far is released and *@rbio is set to NULL.
|
|
+ *
|
|
+ * Note: @pick is not referenced in this function. The error path reads
|
|
+ * *@rbio even when it was never assigned here, so the caller must pass it in
|
|
+ * initialised.
|
|
+ */
|
|
+static struct promote_op *__promote_alloc(struct bch_fs *c,
|
|
+ enum btree_id btree_id,
|
|
+ struct bkey_s_c k,
|
|
+ struct bpos pos,
|
|
+ struct extent_ptr_decoded *pick,
|
|
+ struct bch_io_opts opts,
|
|
+ unsigned sectors,
|
|
+ struct bch_read_bio **rbio)
|
|
+{
|
|
+ struct promote_op *op = NULL;
|
|
+ struct bio *bio;
|
|
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
|
|
+ int ret;
|
|
+
|
|
+ if (!percpu_ref_tryget(&c->writes))
|
|
+ return NULL;
|
|
+
|
|
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
|
|
+ if (!op)
|
|
+ goto err;
|
|
+
|
|
+ op->start_time = local_clock();
|
|
+ op->pos = pos;
|
|
+
|
|
+ /*
|
|
+ * We don't use the mempool here because extents that aren't
|
|
+ * checksummed or compressed can be too big for the mempool:
|
|
+ */
|
|
+ *rbio = kzalloc(sizeof(struct bch_read_bio) +
|
|
+ sizeof(struct bio_vec) * pages,
|
|
+ GFP_NOIO);
|
|
+ if (!*rbio)
|
|
+ goto err;
|
|
+
|
|
+ rbio_init(&(*rbio)->bio, opts);
|
|
+ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages);
|
|
+
|
|
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
|
|
+ GFP_NOIO))
|
|
+ goto err;
|
|
+
|
|
+ (*rbio)->bounce = true;
|
|
+ (*rbio)->split = true;
|
|
+ (*rbio)->kmalloc = true; /* bch2_rbio_free() uses kfree() rather than bio_put() when this is set */
|
|
+
|
|
+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
|
|
+ bch_promote_params))
|
|
+ goto err;
|
|
+
|
|
+ bio = &op->write.op.wbio.bio;
|
|
+ bio_init(bio, bio->bi_inline_vecs, pages);
|
|
+
|
|
+ ret = bch2_migrate_write_init(c, &op->write,
|
|
+ writepoint_hashed((unsigned long) current),
|
|
+ opts,
|
|
+ DATA_PROMOTE,
|
|
+ (struct data_opts) {
|
|
+ .target = opts.promote_target,
|
|
+ .nr_replicas = 1,
|
|
+ },
|
|
+ btree_id, k);
|
|
+ BUG_ON(ret);
|
|
+
|
|
+ return op;
|
|
+err:
|
|
+ if (*rbio)
|
|
+ bio_free_pages(&(*rbio)->bio);
|
|
+ kfree(*rbio);
|
|
+ *rbio = NULL;
|
|
+ kfree(op);
|
|
+ percpu_ref_put(&c->writes);
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Work out the range a promotion should cover (the whole extent when the
|
|
+ * read is already a full read or c->promote_whole_extents is set, otherwise
|
|
+ * just the range being read), check should_promote(), and allocate the
|
|
+ * promote op. On success *@bounce is set and *@read_full updated.
|
|
+ */
|
|
+noinline
|
|
+static struct promote_op *promote_alloc(struct bch_fs *c,
|
|
+					struct bvec_iter iter,
|
|
+					struct bkey_s_c k,
|
|
+					struct extent_ptr_decoded *pick,
|
|
+					struct bch_io_opts opts,
|
|
+					unsigned flags,
|
|
+					struct bch_read_bio **rbio,
|
|
+					bool *bounce,
|
|
+					bool *read_full)
|
|
+{
|
|
+	bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
|
|
+	struct promote_op *promote;
|
|
+	enum btree_id btree_id;
|
|
+	struct bpos pos;
|
|
+	unsigned sectors;
|
|
+
|
|
+	if (promote_full) {
|
|
+		/* data might have to be decompressed in the write path: */
|
|
+		sectors = max(pick->crc.compressed_size, pick->crc.live_size);
|
|
+		pos = bkey_start_pos(k.k);
|
|
+	} else {
|
|
+		sectors = bvec_iter_sectors(iter);
|
|
+		pos = POS(k.k->p.inode, iter.bi_sector);
|
|
+	}
|
|
+
|
|
+	if (!should_promote(c, k, pos, opts, flags))
|
|
+		return NULL;
|
|
+
|
|
+	if (k.k->type == KEY_TYPE_reflink_v)
|
|
+		btree_id = BTREE_ID_reflink;
|
|
+	else
|
|
+		btree_id = BTREE_ID_extents;
|
|
+
|
|
+	promote = __promote_alloc(c, btree_id, k, pos, pick, opts,
|
|
+				  sectors, rbio);
|
|
+	if (promote) {
|
|
+		*bounce = true;
|
|
+		*read_full = promote_full;
|
|
+	}
|
|
+
|
|
+	return promote;
|
|
+}
|
|
+
|
|
+/* Read */
|
|
+
|
|
+#define READ_RETRY_AVOID 1 /* retry; bch2_rbio_retry() first marks the picked pointer as failed */
|
|
+#define READ_RETRY 2 /* retry without marking a failure */
|
|
+#define READ_ERR 3 /* no retry: bch2_rbio_error() completes the read with the error */
|
|
+
|
|
+enum rbio_context { /* compared numerically by bch2_rbio_punt(), so order matters */
|
|
+ RBIO_CONTEXT_NULL,
|
|
+ RBIO_CONTEXT_HIGHPRI, /* paired with system_highpri_wq in bch2_read_endio() */
|
|
+ RBIO_CONTEXT_UNBOUND, /* paired with system_unbound_wq */
|
|
+};
|
|
+
|
|
+/* A split rbio reports through its parent; anything else is its own parent. */
|
|
+static inline struct bch_read_bio *
|
|
+bch2_rbio_parent(struct bch_read_bio *rbio)
|
|
+{
|
|
+	if (rbio->split)
|
|
+		return rbio->parent;
|
|
+
|
|
+	return rbio;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Run @fn on @rbio's work item: directly if the rbio's recorded context is
|
|
+ * already at or above @context, otherwise record @context and queue the work
|
|
+ * on @wq.
|
|
+ */
|
|
+__always_inline
|
|
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
|
|
+			   enum rbio_context context,
|
|
+			   struct workqueue_struct *wq)
|
|
+{
|
|
+	if (rbio->context >= context) {
|
|
+		fn(&rbio->work);
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	rbio->work.func	= fn;
|
|
+	rbio->context	= context;
|
|
+	queue_work(wq, &rbio->work);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Release what @rbio holds (pending promote op, bounce pages) and, if it is
|
|
+ * a split, free the split itself. Returns the rbio the caller should go on
|
|
+ * with: the parent for a split, @rbio itself otherwise.
|
|
+ */
|
|
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
|
|
+{
|
|
+	struct bch_read_bio *parent;
|
|
+
|
|
+	BUG_ON(rbio->bounce && !rbio->split);
|
|
+
|
|
+	if (rbio->promote)
|
|
+		promote_free(rbio->c, rbio->promote);
|
|
+	rbio->promote = NULL;
|
|
+
|
|
+	if (rbio->bounce)
|
|
+		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
|
|
+
|
|
+	if (!rbio->split)
|
|
+		return rbio;
|
|
+
|
|
+	/* grab the parent before the split goes away: */
|
|
+	parent = rbio->parent;
|
|
+
|
|
+	if (rbio->kmalloc)
|
|
+		kfree(rbio);
|
|
+	else
|
|
+		bio_put(&rbio->bio);
|
|
+
|
|
+	return parent;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Complete an entire read request. Must only be used on a top level
|
|
+ * bch_read_bio, never on a split. Read latency is accounted when
|
|
+ * rbio->start_time is set.
|
|
+ */
|
|
+static void bch2_rbio_done(struct bch_read_bio *rbio)
|
|
+{
|
|
+	if (rbio->start_time) {
|
|
+		struct bch_fs *c = rbio->c;
|
|
+
|
|
+		bch2_time_stats_update(&c->times[BCH_TIME_data_read],
|
|
+				       rbio->start_time);
|
|
+	}
|
|
+
|
|
+	bio_endio(&rbio->bio);
|
|
+}
|
|
+/*
|
|
+ * bch2_read_retry_nodecode - retry a BCH_READ_NODECODE read.
|
|
+ *
|
|
+ * Looks up the key at rbio->read_pos in rbio->data_btree again. If it no
|
|
+ * longer matches the pointer originally picked, the read completes with
|
|
+ * rbio->hole set; otherwise the extent is re-read via __bch2_read_extent(),
|
|
+ * looping while that returns READ_RETRY. Any other failure completes the
|
|
+ * read with BLK_STS_IOERR. The read is always completed with
|
|
+ * bch2_rbio_done() before returning.
|
|
+ *
|
|
+ * Note: @inode is not referenced in this function.
|
|
+ */
|
|
+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
+ struct bvec_iter bvec_iter, u64 inode,
|
|
+ struct bch_io_failures *failed,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_buf sk;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ flags &= ~BCH_READ_LAST_FRAGMENT;
|
|
+ flags |= BCH_READ_MUST_CLONE;
|
|
+
|
|
+ bch2_bkey_buf_init(&sk);
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, rbio->data_btree,
|
|
+ rbio->read_pos, BTREE_ITER_SLOTS);
|
|
+retry:
|
|
+ rbio->bio.bi_status = 0;
|
|
+
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ if (bkey_err(k))
|
|
+ goto err;
|
|
+
|
|
+ bch2_bkey_buf_reassemble(&sk, c, k); /* take a copy so the key stays usable after the unlock below */
|
|
+ k = bkey_i_to_s_c(sk.k);
|
|
+ bch2_trans_unlock(&trans);
|
|
+
|
|
+ if (!bch2_bkey_matches_ptr(c, k,
|
|
+ rbio->pick.ptr,
|
|
+ rbio->data_pos.offset -
|
|
+ rbio->pick.crc.offset)) {
|
|
+ /* extent we wanted to read no longer exists: */
|
|
+ rbio->hole = true;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter,
|
|
+ rbio->read_pos,
|
|
+ rbio->data_btree,
|
|
+ k, 0, failed, flags);
|
|
+ if (ret == READ_RETRY)
|
|
+ goto retry;
|
|
+ if (ret)
|
|
+ goto err;
|
|
+out:
|
|
+ bch2_rbio_done(rbio);
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_bkey_buf_exit(&sk, c);
|
|
+ return;
|
|
+err:
|
|
+ rbio->bio.bi_status = BLK_STS_IOERR;
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Work function that reissues a failed read. Everything needed from the
|
|
+ * failed rbio is captured up front because bch2_rbio_free() may free it (when
|
|
+ * it is a split) and hand back the parent instead.
|
|
+ */
|
|
+static void bch2_rbio_retry(struct work_struct *work)
|
|
+{
|
|
+	struct bch_read_bio *rbio =
|
|
+		container_of(work, struct bch_read_bio, work);
|
|
+	struct bch_io_failures failed = { .nr = 0 };
|
|
+	struct bvec_iter saved_iter = rbio->bvec_iter;
|
|
+	struct bch_fs *c = rbio->c;
|
|
+	u64 inum = rbio->read_pos.inode;
|
|
+	unsigned retry_flags = rbio->flags;
|
|
+
|
|
+	trace_read_retry(&rbio->bio);
|
|
+
|
|
+	/* steer the retry away from the pointer that just failed: */
|
|
+	if (rbio->retry == READ_RETRY_AVOID)
|
|
+		bch2_mark_io_failure(&failed, &rbio->pick);
|
|
+
|
|
+	rbio->bio.bi_status = 0;
|
|
+
|
|
+	rbio = bch2_rbio_free(rbio);
|
|
+
|
|
+	retry_flags |= BCH_READ_IN_RETRY;
|
|
+	retry_flags &= ~BCH_READ_MAY_PROMOTE;
|
|
+
|
|
+	if (retry_flags & BCH_READ_NODECODE) {
|
|
+		bch2_read_retry_nodecode(c, rbio, saved_iter, inum,
|
|
+					 &failed, retry_flags);
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	retry_flags &= ~BCH_READ_LAST_FRAGMENT;
|
|
+	retry_flags |= BCH_READ_MUST_CLONE;
|
|
+
|
|
+	__bch2_read(c, rbio, saved_iter, inum, &failed, retry_flags);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Record how a failed read should be handled (@retry is one of the READ_*
|
|
+ * codes). Inside a retry (BCH_READ_IN_RETRY) only the code is recorded.
|
|
+ * Otherwise READ_ERR completes the read with @error, and any other code
|
|
+ * punts bch2_rbio_retry() to system_unbound_wq.
|
|
+ */
|
|
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry,
|
|
+			    blk_status_t error)
|
|
+{
|
|
+	rbio->retry = retry;
|
|
+
|
|
+	if (rbio->flags & BCH_READ_IN_RETRY)
|
|
+		return;
|
|
+
|
|
+	if (retry != READ_ERR) {
|
|
+		bch2_rbio_punt(rbio, bch2_rbio_retry,
|
|
+			       RBIO_CONTEXT_UNBOUND, system_unbound_wq);
|
|
+		return;
|
|
+	}
|
|
+
|
|
+	rbio = bch2_rbio_free(rbio);
|
|
+	rbio->bio.bi_status = error;
|
|
+	bch2_rbio_done(rbio);
|
|
+}
|
|
+/*
|
|
+ * __bch2_rbio_narrow_crcs - transaction body that narrows an extent's
|
|
+ * checksum after a successful read.
|
|
+ *
|
|
+ * Does nothing for compressed extents. Otherwise re-reads the key at
|
|
+ * rbio->data_pos and gives up quietly (returning 0) if its version or
|
|
+ * pointer no longer matches what was read, or if it now extends outside the
|
|
+ * range the old checksum covered. It then computes a checksum for just the
|
|
+ * key's range with bch2_rechecksum_bio() and, if bch2_bkey_narrow_crcs()
|
|
+ * reports a change, queues the updated key with bch2_trans_update().
|
|
+ *
|
|
+ * Returns 0, or the error from the btree lookup or bch2_trans_kmalloc().
|
|
+ * A rechecksum failure is logged but still returns 0. The value returned by
|
|
+ * bch2_trans_update() is not used here.
|
|
+ */
|
|
+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
|
|
+ struct bch_read_bio *rbio)
|
|
+{
|
|
+ struct bch_fs *c = rbio->c;
|
|
+ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
|
|
+ struct bch_extent_crc_unpacked new_crc;
|
|
+ struct btree_iter *iter = NULL;
|
|
+ struct bkey_i *new;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (crc_is_compressed(rbio->pick.crc))
|
|
+ return 0;
|
|
+
|
|
+ iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
|
|
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ if ((ret = bkey_err(k)))
|
|
+ goto out;
|
|
+
|
|
+ if (bversion_cmp(k.k->version, rbio->version) ||
|
|
+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
|
|
+ goto out;
|
|
+
|
|
+ /* Extent was merged? */
|
|
+ if (bkey_start_offset(k.k) < data_offset ||
|
|
+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size)
|
|
+ goto out;
|
|
+
|
|
+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version,
|
|
+ rbio->pick.crc, NULL, &new_crc,
|
|
+ bkey_start_offset(k.k) - data_offset, k.k->size,
|
|
+ rbio->pick.crc.csum_type)) {
|
|
+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)");
|
|
+ ret = 0;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * going to be temporarily appending another checksum entry:
|
|
+ */
|
|
+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
|
|
+ sizeof(struct bch_extent_crc128));
|
|
+ if ((ret = PTR_ERR_OR_ZERO(new)))
|
|
+ goto out;
|
|
+
|
|
+ bkey_reassemble(new, k);
|
|
+
|
|
+ if (!bch2_bkey_narrow_crcs(new, new_crc))
|
|
+ goto out;
|
|
+
|
|
+ bch2_trans_update(trans, iter, new, 0);
|
|
+out:
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+/*
|
|
+ * Run __bch2_rbio_narrow_crcs() through bch2_trans_do() with
|
|
+ * BTREE_INSERT_NOFAIL. The result of bch2_trans_do() is not examined here.
|
|
+ *
|
|
+ * NOTE(review): `trans` is not declared in this function; presumably it is
|
|
+ * provided by the bch2_trans_do() macro - confirm against its definition.
|
|
+ */
|
|
+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
|
|
+{
|
|
+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
|
|
+ __bch2_rbio_narrow_crcs(&trans, rbio));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Inner part that may run in process context
|
|
+ *
|
|
+ * Work function for a completed read. In order it:
|
|
+ *  - verifies the checksum of what was read against rbio->pick.crc.csum;
|
|
+ *  - narrows the extent's checksum if rbio->narrow_crcs is set;
|
|
+ *  - stops there for BCH_READ_NODECODE reads;
|
|
+ *  - otherwise runs bch2_encrypt_bio() over the data and either
|
|
+ *    decompresses it into the parent bio or, for bounced reads, copies the
|
|
+ *    requested range there;
|
|
+ *  - starts the promotion if one is attached;
|
|
+ *  - unless this is a retry (BCH_READ_IN_RETRY), frees the rbio and
|
|
+ *    completes the parent.
|
|
+ *
|
|
+ * NOTE(review): bch2_encrypt_bio() is used here on the read side; the
|
|
+ * comments below speak of decrypting and re-encrypting, so the same call
|
|
+ * presumably does both - confirm.
|
|
+ */
|
|
+static void __bch2_read_endio(struct work_struct *work)
|
|
+{
|
|
+ struct bch_read_bio *rbio =
|
|
+ container_of(work, struct bch_read_bio, work);
|
|
+ struct bch_fs *c = rbio->c;
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
|
|
+ struct bio *src = &rbio->bio;
|
|
+ struct bio *dst = &bch2_rbio_parent(rbio)->bio;
|
|
+ struct bvec_iter dst_iter = rbio->bvec_iter;
|
|
+ struct bch_extent_crc_unpacked crc = rbio->pick.crc;
|
|
+ struct nonce nonce = extent_nonce(rbio->version, crc);
|
|
+ struct bch_csum csum;
|
|
+
|
|
+ /* Reset iterator for checksumming and copying bounced data: */
|
|
+ if (rbio->bounce) {
|
|
+ src->bi_iter.bi_size = crc.compressed_size << 9;
|
|
+ src->bi_iter.bi_idx = 0;
|
|
+ src->bi_iter.bi_bvec_done = 0;
|
|
+ } else {
|
|
+ src->bi_iter = rbio->bvec_iter;
|
|
+ }
|
|
+
|
|
+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
|
|
+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
|
|
+ goto csum_err;
|
|
+
|
|
+ if (unlikely(rbio->narrow_crcs))
|
|
+ bch2_rbio_narrow_crcs(rbio);
|
|
+
|
|
+ if (rbio->flags & BCH_READ_NODECODE)
|
|
+ goto nodecode;
|
|
+
|
|
+ /* Adjust crc to point to subset of data we want: */
|
|
+ crc.offset += rbio->offset_into_extent;
|
|
+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
|
|
+
|
|
+ if (crc_is_compressed(crc)) {
|
|
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
|
|
+ goto decompression_err;
|
|
+ } else {
|
|
+ /* don't need to decrypt the entire bio: */
|
|
+ nonce = nonce_add(nonce, crc.offset << 9);
|
|
+ bio_advance(src, crc.offset << 9);
|
|
+
|
|
+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
|
|
+ src->bi_iter.bi_size = dst_iter.bi_size;
|
|
+
|
|
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
+
|
|
+ if (rbio->bounce) {
|
|
+ struct bvec_iter src_iter = src->bi_iter;
|
|
+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (rbio->promote) {
|
|
+ /*
|
|
+ * Re encrypt data we decrypted, so it's consistent with
|
|
+ * rbio->crc:
|
|
+ */
|
|
+ bch2_encrypt_bio(c, crc.csum_type, nonce, src);
|
|
+ promote_start(rbio->promote, rbio);
|
|
+ rbio->promote = NULL; /* handed off, so bch2_rbio_free() must not free it */
|
|
+ }
|
|
+nodecode:
|
|
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) {
|
|
+ rbio = bch2_rbio_free(rbio);
|
|
+ bch2_rbio_done(rbio);
|
|
+ }
|
|
+ return;
|
|
+csum_err:
|
|
+ /*
|
|
+ * Checksum error: if the bio wasn't bounced, we may have been
|
|
+ * reading into buffers owned by userspace (that userspace can
|
|
+ * scribble over) - retry the read, bouncing it this time:
|
|
+ */
|
|
+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
|
|
+ rbio->flags |= BCH_READ_MUST_BOUNCE;
|
|
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
|
|
+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)",
|
|
+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
|
|
+ csum.hi, csum.lo, crc.csum_type);
|
|
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
+ return;
|
|
+decompression_err:
|
|
+ bch_err_inum_ratelimited(c, rbio->read_pos.inode,
|
|
+ "decompression error");
|
|
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
|
|
+ return;
|
|
+}
|
|
+/*
|
|
+ * bch2_read_endio - bi_end_io handler for a read bio.
|
|
+ *
|
|
+ * Accounts device latency and drops the device io_ref if one was held,
|
|
+ * restores the original end_io on a non-split rbio, then:
|
|
+ *  - on an I/O error, asks for a retry that avoids this pointer;
|
|
+ *  - on a stale cached pointer (or an injected race_fault() when
|
|
+ *    BCH_READ_RETRY_IF_STALE is set), asks for a retry if that flag is set
|
|
+ *    and fails the read otherwise;
|
|
+ *  - else hands off to __bch2_read_endio(), via system_unbound_wq when
|
|
+ *    checksum narrowing, decompression or an encryption checksum type is
|
|
+ *    involved, via system_highpri_wq when the extent only has a checksum,
|
|
+ *    and with RBIO_CONTEXT_NULL and no workqueue otherwise.
|
|
+ */
|
|
+static void bch2_read_endio(struct bio *bio)
|
|
+{
|
|
+ struct bch_read_bio *rbio =
|
|
+ container_of(bio, struct bch_read_bio, bio);
|
|
+ struct bch_fs *c = rbio->c;
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev);
|
|
+ struct workqueue_struct *wq = NULL;
|
|
+ enum rbio_context context = RBIO_CONTEXT_NULL;
|
|
+
|
|
+ if (rbio->have_ioref) {
|
|
+ bch2_latency_acct(ca, rbio->submit_time, READ);
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+ }
|
|
+
|
|
+ if (!rbio->split)
|
|
+ rbio->bio.bi_end_io = rbio->end_io;
|
|
+
|
|
+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
|
|
+ rbio->read_pos.inode,
|
|
+ rbio->read_pos.offset,
|
|
+ "data read error: %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status))) {
|
|
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (rbio->pick.ptr.cached &&
|
|
+ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
|
|
+ ptr_stale(ca, &rbio->pick.ptr))) {
|
|
+ atomic_long_inc(&c->read_realloc_races);
|
|
+
|
|
+ if (rbio->flags & BCH_READ_RETRY_IF_STALE)
|
|
+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
|
|
+ else
|
|
+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (rbio->narrow_crcs ||
|
|
+ crc_is_compressed(rbio->pick.crc) ||
|
|
+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
|
|
+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
|
|
+ else if (rbio->pick.crc.csum_type)
|
|
+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq;
|
|
+
|
|
+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Resolve a reflink pointer: look up the indirect extent that @orig_k (a
|
|
+ * reflink_p key) refers to at *@offset_into_extent, replace @orig_k with
|
|
+ * that extent and rewrite *@offset_into_extent relative to its start.
|
|
+ *
|
|
+ * Returns 0 on success, the btree lookup error, or -EIO (after flagging the
|
|
+ * filesystem inconsistent) when the pointer leads to no indirect extent.
|
|
+ */
|
|
+int __bch2_read_indirect_extent(struct btree_trans *trans,
|
|
+				unsigned *offset_into_extent,
|
|
+				struct bkey_buf *orig_k)
|
|
+{
|
|
+	struct btree_iter *iter;
|
|
+	struct bkey_s_c k;
|
|
+	u64 idx;
|
|
+	int ret;
|
|
+
|
|
+	idx = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
|
|
+		*offset_into_extent;
|
|
+
|
|
+	iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
|
|
+				   POS(0, idx),
|
|
+				   BTREE_ITER_SLOTS);
|
|
+	k = bch2_btree_iter_peek_slot(iter);
|
|
+	ret = bkey_err(k);
|
|
+	if (ret)
|
|
+		goto out;
|
|
+
|
|
+	if (k.k->type == KEY_TYPE_reflink_v ||
|
|
+	    k.k->type == KEY_TYPE_indirect_inline_data) {
|
|
+		*offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
|
|
+		bch2_bkey_buf_reassemble(orig_k, trans->c, k);
|
|
+	} else {
|
|
+		bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode,
|
|
+				"pointer to nonexistent indirect extent");
|
|
+		bch2_inconsistent_error(trans->c);
|
|
+		ret = -EIO;
|
|
+	}
|
|
+out:
|
|
+	bch2_trans_iter_put(trans, iter);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
|
|
+ struct bvec_iter iter, struct bpos read_pos,
|
|
+ enum btree_id data_btree, struct bkey_s_c k,
|
|
+ unsigned offset_into_extent,
|
|
+ struct bch_io_failures *failed, unsigned flags)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct extent_ptr_decoded pick;
|
|
+ struct bch_read_bio *rbio = NULL;
|
|
+ struct bch_dev *ca;
|
|
+ struct promote_op *promote = NULL;
|
|
+ bool bounce = false, read_full = false, narrow_crcs = false;
|
|
+ struct bpos data_pos = bkey_start_pos(k.k);
|
|
+ int pick_ret;
|
|
+
|
|
+ if (bkey_extent_is_inline_data(k.k)) {
|
|
+ unsigned bytes = min_t(unsigned, iter.bi_size,
|
|
+ bkey_inline_data_bytes(k.k));
|
|
+
|
|
+ swap(iter.bi_size, bytes);
|
|
+ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
|
|
+ swap(iter.bi_size, bytes);
|
|
+ bio_advance_iter(&orig->bio, &iter, bytes);
|
|
+ zero_fill_bio_iter(&orig->bio, iter);
|
|
+ goto out_read_done;
|
|
+ }
|
|
+
|
|
+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
|
|
+
|
|
+ /* hole or reservation - just zero fill: */
|
|
+ if (!pick_ret)
|
|
+ goto hole;
|
|
+
|
|
+ if (pick_ret < 0) {
|
|
+ bch_err_inum_ratelimited(c, k.k->p.inode,
|
|
+ "no device to read from");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (pick_ret > 0)
|
|
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
|
|
+
|
|
+ if (flags & BCH_READ_NODECODE) {
|
|
+ /*
|
|
+ * can happen if we retry, and the extent we were going to read
|
|
+ * has been merged in the meantime:
|
|
+ */
|
|
+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS)
|
|
+ goto hole;
|
|
+
|
|
+ iter.bi_size = pick.crc.compressed_size << 9;
|
|
+ goto get_bio;
|
|
+ }
|
|
+
|
|
+ if (!(flags & BCH_READ_LAST_FRAGMENT) ||
|
|
+ bio_flagged(&orig->bio, BIO_CHAIN))
|
|
+ flags |= BCH_READ_MUST_CLONE;
|
|
+
|
|
+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) &&
|
|
+ bch2_can_narrow_extent_crcs(k, pick.crc);
|
|
+
|
|
+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED))
|
|
+ flags |= BCH_READ_MUST_BOUNCE;
|
|
+
|
|
+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
|
|
+
|
|
+ if (crc_is_compressed(pick.crc) ||
|
|
+ (pick.crc.csum_type != BCH_CSUM_NONE &&
|
|
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
|
|
+ (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
|
|
+ (flags & BCH_READ_USER_MAPPED)) ||
|
|
+ (flags & BCH_READ_MUST_BOUNCE)))) {
|
|
+ read_full = true;
|
|
+ bounce = true;
|
|
+ }
|
|
+
|
|
+ if (orig->opts.promote_target)
|
|
+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
|
|
+ &rbio, &bounce, &read_full);
|
|
+
|
|
+ if (!read_full) {
|
|
+ EBUG_ON(crc_is_compressed(pick.crc));
|
|
+ EBUG_ON(pick.crc.csum_type &&
|
|
+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
|
|
+ bvec_iter_sectors(iter) != pick.crc.live_size ||
|
|
+ pick.crc.offset ||
|
|
+ offset_into_extent));
|
|
+
|
|
+ data_pos.offset += offset_into_extent;
|
|
+ pick.ptr.offset += pick.crc.offset +
|
|
+ offset_into_extent;
|
|
+ offset_into_extent = 0;
|
|
+ pick.crc.compressed_size = bvec_iter_sectors(iter);
|
|
+ pick.crc.uncompressed_size = bvec_iter_sectors(iter);
|
|
+ pick.crc.offset = 0;
|
|
+ pick.crc.live_size = bvec_iter_sectors(iter);
|
|
+ offset_into_extent = 0;
|
|
+ }
|
|
+get_bio:
|
|
+ if (rbio) {
|
|
+ /*
|
|
+ * promote already allocated bounce rbio:
|
|
+ * promote needs to allocate a bio big enough for uncompressing
|
|
+ * data in the write path, but we're not going to use it all
|
|
+ * here:
|
|
+ */
|
|
+ EBUG_ON(rbio->bio.bi_iter.bi_size <
|
|
+ pick.crc.compressed_size << 9);
|
|
+ rbio->bio.bi_iter.bi_size =
|
|
+ pick.crc.compressed_size << 9;
|
|
+ } else if (bounce) {
|
|
+ unsigned sectors = pick.crc.compressed_size;
|
|
+
|
|
+ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
|
|
+ DIV_ROUND_UP(sectors, PAGE_SECTORS),
|
|
+ &c->bio_read_split),
|
|
+ orig->opts);
|
|
+
|
|
+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
|
|
+ rbio->bounce = true;
|
|
+ rbio->split = true;
|
|
+ } else if (flags & BCH_READ_MUST_CLONE) {
|
|
+ /*
|
|
+ * Have to clone if there were any splits, due to error
|
|
+ * reporting issues (if a split errored, and retrying didn't
|
|
+ * work, when it reports the error to its parent (us) we don't
|
|
+ * know if the error was from our bio, and we should retry, or
|
|
+ * from the whole bio, in which case we don't want to retry and
|
|
+ * lose the error)
|
|
+ */
|
|
+ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
|
|
+ &c->bio_read_split),
|
|
+ orig->opts);
|
|
+ rbio->bio.bi_iter = iter;
|
|
+ rbio->split = true;
|
|
+ } else {
|
|
+ rbio = orig;
|
|
+ rbio->bio.bi_iter = iter;
|
|
+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
|
|
+ }
|
|
+
|
|
+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size);
|
|
+
|
|
+ rbio->c = c;
|
|
+ rbio->submit_time = local_clock();
|
|
+ if (rbio->split)
|
|
+ rbio->parent = orig;
|
|
+ else
|
|
+ rbio->end_io = orig->bio.bi_end_io;
|
|
+ rbio->bvec_iter = iter;
|
|
+ rbio->offset_into_extent= offset_into_extent;
|
|
+ rbio->flags = flags;
|
|
+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ);
|
|
+ rbio->narrow_crcs = narrow_crcs;
|
|
+ rbio->hole = 0;
|
|
+ rbio->retry = 0;
|
|
+ rbio->context = 0;
|
|
+ /* XXX: only initialize this if needed */
|
|
+ rbio->devs_have = bch2_bkey_devs(k);
|
|
+ rbio->pick = pick;
|
|
+ rbio->read_pos = read_pos;
|
|
+ rbio->data_btree = data_btree;
|
|
+ rbio->data_pos = data_pos;
|
|
+ rbio->version = k.k->version;
|
|
+ rbio->promote = promote;
|
|
+ INIT_WORK(&rbio->work, NULL);
|
|
+
|
|
+ rbio->bio.bi_opf = orig->bio.bi_opf;
|
|
+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset;
|
|
+ rbio->bio.bi_end_io = bch2_read_endio;
|
|
+
|
|
+ if (rbio->bounce)
|
|
+ trace_read_bounce(&rbio->bio);
|
|
+
|
|
+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
|
|
+
|
|
+ /*
|
|
+ * If it's being moved internally, we don't want to flag it as a cache
|
|
+ * hit:
|
|
+ */
|
|
+ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
|
|
+ bch2_bucket_io_time_reset(trans, pick.ptr.dev,
|
|
+ PTR_BUCKET_NR(ca, &pick.ptr), READ);
|
|
+
|
|
+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
|
|
+ bio_inc_remaining(&orig->bio);
|
|
+ trace_read_split(&orig->bio);
|
|
+ }
|
|
+
|
|
+ if (!rbio->pick.idx) {
|
|
+ if (!rbio->have_ioref) {
|
|
+ bch_err_inum_ratelimited(c, k.k->p.inode,
|
|
+ "no device to read from");
|
|
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
|
|
+ bio_sectors(&rbio->bio));
|
|
+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
|
|
+
|
|
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ submit_bio(&rbio->bio);
|
|
+ else
|
|
+ submit_bio_wait(&rbio->bio);
|
|
+ } else {
|
|
+ /* Attempting reconstruct read: */
|
|
+ if (bch2_ec_read_extent(c, rbio)) {
|
|
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (likely(!(flags & BCH_READ_IN_RETRY)))
|
|
+ bio_endio(&rbio->bio);
|
|
+ }
|
|
+out:
|
|
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
|
|
+ return 0;
|
|
+ } else {
|
|
+ int ret;
|
|
+
|
|
+ rbio->context = RBIO_CONTEXT_UNBOUND;
|
|
+ bch2_read_endio(&rbio->bio);
|
|
+
|
|
+ ret = rbio->retry;
|
|
+ rbio = bch2_rbio_free(rbio);
|
|
+
|
|
+ if (ret == READ_RETRY_AVOID) {
|
|
+ bch2_mark_io_failure(failed, &pick);
|
|
+ ret = READ_RETRY;
|
|
+ }
|
|
+
|
|
+ if (!ret)
|
|
+ goto out_read_done;
|
|
+
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+err:
|
|
+ if (flags & BCH_READ_IN_RETRY)
|
|
+ return READ_ERR;
|
|
+
|
|
+ orig->bio.bi_status = BLK_STS_IOERR;
|
|
+ goto out_read_done;
|
|
+
|
|
+hole:
|
|
+ /*
|
|
+ * won't normally happen in the BCH_READ_NODECODE
|
|
+ * (bch2_move_extent()) path, but if we retry and the extent we wanted
|
|
+ * to read no longer exists we have to signal that:
|
|
+ */
|
|
+ if (flags & BCH_READ_NODECODE)
|
|
+ orig->hole = true;
|
|
+
|
|
+ zero_fill_bio_iter(&orig->bio, iter);
|
|
+out_read_done:
|
|
+ if (flags & BCH_READ_LAST_FRAGMENT)
|
|
+ bch2_rbio_done(orig);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
+ struct bvec_iter bvec_iter, u64 inode,
|
|
+ struct bch_io_failures *failed, unsigned flags)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_buf sk;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ BUG_ON(flags & BCH_READ_NODECODE);
|
|
+
|
|
+ bch2_bkey_buf_init(&sk);
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+retry:
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
|
|
+ POS(inode, bvec_iter.bi_sector),
|
|
+ BTREE_ITER_SLOTS);
|
|
+ while (1) {
|
|
+ unsigned bytes, sectors, offset_into_extent;
|
|
+ enum btree_id data_btree = BTREE_ID_extents;
|
|
+
|
|
+ bch2_btree_iter_set_pos(iter,
|
|
+ POS(inode, bvec_iter.bi_sector));
|
|
+
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ offset_into_extent = iter->pos.offset -
|
|
+ bkey_start_offset(k.k);
|
|
+ sectors = k.k->size - offset_into_extent;
|
|
+
|
|
+ bch2_bkey_buf_reassemble(&sk, c, k);
|
|
+
|
|
+ ret = bch2_read_indirect_extent(&trans, &data_btree,
|
|
+ &offset_into_extent, &sk);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ k = bkey_i_to_s_c(sk.k);
|
|
+
|
|
+ /*
|
|
+ * With indirect extents, the amount of data to read is the min
|
|
+ * of the original extent and the indirect extent:
|
|
+ */
|
|
+ sectors = min(sectors, k.k->size - offset_into_extent);
|
|
+
|
|
+ /*
|
|
+ * Unlock the iterator while the btree node's lock is still in
|
|
+ * cache, before doing the IO:
|
|
+ */
|
|
+ bch2_trans_unlock(&trans);
|
|
+
|
|
+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
|
|
+ swap(bvec_iter.bi_size, bytes);
|
|
+
|
|
+ if (bvec_iter.bi_size == bytes)
|
|
+ flags |= BCH_READ_LAST_FRAGMENT;
|
|
+
|
|
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
|
|
+ data_btree, k,
|
|
+ offset_into_extent, failed, flags);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ if (flags & BCH_READ_LAST_FRAGMENT)
|
|
+ break;
|
|
+
|
|
+ swap(bvec_iter.bi_size, bytes);
|
|
+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
|
|
+ goto retry;
|
|
+
|
|
+ if (ret) {
|
|
+ bch_err_inum_ratelimited(c, inode,
|
|
+ "read error %i from btree lookup", ret);
|
|
+ rbio->bio.bi_status = BLK_STS_IOERR;
|
|
+ bch2_rbio_done(rbio);
|
|
+ }
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_bkey_buf_exit(&sk, c);
|
|
+}
|
|
+
|
|
+void bch2_fs_io_exit(struct bch_fs *c)
|
|
+{
|
|
+ if (c->promote_table.tbl)
|
|
+ rhashtable_destroy(&c->promote_table);
|
|
+ mempool_exit(&c->bio_bounce_pages);
|
|
+ bioset_exit(&c->bio_write);
|
|
+ bioset_exit(&c->bio_read_split);
|
|
+ bioset_exit(&c->bio_read);
|
|
+}
|
|
+
|
|
+int bch2_fs_io_init(struct bch_fs *c)
|
|
+{
|
|
+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
|
|
+ BIOSET_NEED_BVECS) ||
|
|
+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
|
|
+ BIOSET_NEED_BVECS) ||
|
|
+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
|
|
+ BIOSET_NEED_BVECS) ||
|
|
+ mempool_init_page_pool(&c->bio_bounce_pages,
|
|
+ max_t(unsigned,
|
|
+ c->opts.btree_node_size,
|
|
+ c->sb.encoded_extent_max) /
|
|
+ PAGE_SECTORS, 0) ||
|
|
+ rhashtable_init(&c->promote_table, &bch_promote_params))
|
|
+ return -ENOMEM;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h
|
|
new file mode 100644
|
|
index 000000000000..2ac03c049c92
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/io.h
|
|
@@ -0,0 +1,191 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_IO_H
|
|
+#define _BCACHEFS_IO_H
|
|
+
|
|
+#include "checksum.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "io_types.h"
|
|
+
|
|
+#define to_wbio(_bio) \
|
|
+ container_of((_bio), struct bch_write_bio, bio)
|
|
+
|
|
+#define to_rbio(_bio) \
|
|
+ container_of((_bio), struct bch_read_bio, bio)
|
|
+
|
|
+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
|
|
+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
|
|
+
|
|
+void bch2_latency_acct(struct bch_dev *, u64, int);
|
|
+
|
|
+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
|
|
+ enum bch_data_type, const struct bkey_i *);
|
|
+
|
|
+#define BLK_STS_REMOVED ((__force blk_status_t)128)
|
|
+
|
|
+const char *bch2_blk_status_to_str(blk_status_t);
|
|
+
|
|
+enum bch_write_flags {
|
|
+ BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
|
|
+ BCH_WRITE_CACHED = (1 << 1),
|
|
+ BCH_WRITE_FLUSH = (1 << 2),
|
|
+ BCH_WRITE_DATA_ENCODED = (1 << 3),
|
|
+ BCH_WRITE_PAGES_STABLE = (1 << 4),
|
|
+ BCH_WRITE_PAGES_OWNED = (1 << 5),
|
|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6),
|
|
+ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7),
|
|
+ BCH_WRITE_FROM_INTERNAL = (1 << 8),
|
|
+
|
|
+ /* Internal: */
|
|
+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
|
|
+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10),
|
|
+ BCH_WRITE_DONE = (1 << 11),
|
|
+};
|
|
+
|
|
+static inline u64 *op_journal_seq(struct bch_write_op *op)
|
|
+{
|
|
+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR)
|
|
+ ? op->journal_seq_p : &op->journal_seq;
|
|
+}
|
|
+
|
|
+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq)
|
|
+{
|
|
+ op->journal_seq_p = journal_seq;
|
|
+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR;
|
|
+}
|
|
+
|
|
+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
|
|
+{
|
|
+ return op->alloc_reserve == RESERVE_MOVINGGC
|
|
+ ? op->c->copygc_wq
|
|
+ : op->c->wq;
|
|
+}
|
|
+
|
|
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
|
|
+ struct bkey_i *, bool *, bool *, s64 *, s64 *);
|
|
+int bch2_extent_update(struct btree_trans *, struct btree_iter *,
|
|
+ struct bkey_i *, struct disk_reservation *,
|
|
+ u64 *, u64, s64 *);
|
|
+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *,
|
|
+ struct bpos, u64 *, s64 *);
|
|
+int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *);
|
|
+
|
|
+int bch2_write_index_default(struct bch_write_op *);
|
|
+
|
|
+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
|
|
+ struct bch_io_opts opts)
|
|
+{
|
|
+ op->c = c;
|
|
+ op->end_io = NULL;
|
|
+ op->flags = 0;
|
|
+ op->written = 0;
|
|
+ op->error = 0;
|
|
+ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum);
|
|
+ op->compression_type = bch2_compression_opt_to_type[opts.compression];
|
|
+ op->nr_replicas = 0;
|
|
+ op->nr_replicas_required = c->opts.data_replicas_required;
|
|
+ op->alloc_reserve = RESERVE_NONE;
|
|
+ op->incompressible = 0;
|
|
+ op->open_buckets.nr = 0;
|
|
+ op->devs_have.nr = 0;
|
|
+ op->target = 0;
|
|
+ op->opts = opts;
|
|
+ op->pos = POS_MAX;
|
|
+ op->version = ZERO_VERSION;
|
|
+ op->write_point = (struct write_point_specifier) { 0 };
|
|
+ op->res = (struct disk_reservation) { 0 };
|
|
+ op->journal_seq = 0;
|
|
+ op->new_i_size = U64_MAX;
|
|
+ op->i_sectors_delta = 0;
|
|
+ op->index_update_fn = bch2_write_index_default;
|
|
+}
|
|
+
|
|
+void bch2_write(struct closure *);
|
|
+
|
|
+static inline struct bch_write_bio *wbio_init(struct bio *bio)
|
|
+{
|
|
+ struct bch_write_bio *wbio = to_wbio(bio);
|
|
+
|
|
+ memset(wbio, 0, offsetof(struct bch_write_bio, bio));
|
|
+ return wbio;
|
|
+}
|
|
+
|
|
+struct bch_devs_mask;
|
|
+struct cache_promote_op;
|
|
+struct extent_ptr_decoded;
|
|
+
|
|
+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
|
|
+ struct bkey_buf *);
|
|
+
|
|
+static inline int bch2_read_indirect_extent(struct btree_trans *trans,
|
|
+ enum btree_id *data_btree,
|
|
+ unsigned *offset_into_extent,
|
|
+ struct bkey_buf *k)
|
|
+{
|
|
+ if (k->k->k.type != KEY_TYPE_reflink_p)
|
|
+ return 0;
|
|
+
|
|
+ *data_btree = BTREE_ID_reflink;
|
|
+ return __bch2_read_indirect_extent(trans, offset_into_extent, k);
|
|
+}
|
|
+
|
|
+enum bch_read_flags {
|
|
+ BCH_READ_RETRY_IF_STALE = 1 << 0,
|
|
+ BCH_READ_MAY_PROMOTE = 1 << 1,
|
|
+ BCH_READ_USER_MAPPED = 1 << 2,
|
|
+ BCH_READ_NODECODE = 1 << 3,
|
|
+ BCH_READ_LAST_FRAGMENT = 1 << 4,
|
|
+
|
|
+ /* internal: */
|
|
+ BCH_READ_MUST_BOUNCE = 1 << 5,
|
|
+ BCH_READ_MUST_CLONE = 1 << 6,
|
|
+ BCH_READ_IN_RETRY = 1 << 7,
|
|
+};
|
|
+
|
|
+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *,
|
|
+ struct bvec_iter, struct bpos, enum btree_id,
|
|
+ struct bkey_s_c, unsigned,
|
|
+ struct bch_io_failures *, unsigned);
|
|
+
|
|
+static inline void bch2_read_extent(struct btree_trans *trans,
|
|
+ struct bch_read_bio *rbio, struct bpos read_pos,
|
|
+ enum btree_id data_btree, struct bkey_s_c k,
|
|
+ unsigned offset_into_extent, unsigned flags)
|
|
+{
|
|
+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos,
|
|
+ data_btree, k, offset_into_extent, NULL, flags);
|
|
+}
|
|
+
|
|
+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
|
|
+ u64, struct bch_io_failures *, unsigned flags);
|
|
+
|
|
+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
|
|
+ u64 inode)
|
|
+{
|
|
+ struct bch_io_failures failed = { .nr = 0 };
|
|
+
|
|
+ BUG_ON(rbio->_state);
|
|
+
|
|
+ rbio->c = c;
|
|
+ rbio->start_time = local_clock();
|
|
+
|
|
+ __bch2_read(c, rbio, rbio->bio.bi_iter, inode, &failed,
|
|
+ BCH_READ_RETRY_IF_STALE|
|
|
+ BCH_READ_MAY_PROMOTE|
|
|
+ BCH_READ_USER_MAPPED);
|
|
+}
|
|
+
|
|
+static inline struct bch_read_bio *rbio_init(struct bio *bio,
|
|
+ struct bch_io_opts opts)
|
|
+{
|
|
+ struct bch_read_bio *rbio = to_rbio(bio);
|
|
+
|
|
+ rbio->_state = 0;
|
|
+ rbio->promote = NULL;
|
|
+ rbio->opts = opts;
|
|
+ return rbio;
|
|
+}
|
|
+
|
|
+void bch2_fs_io_exit(struct bch_fs *);
|
|
+int bch2_fs_io_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_IO_H */
|
|
diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h
|
|
new file mode 100644
|
|
index 000000000000..e7aca7c9823a
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/io_types.h
|
|
@@ -0,0 +1,158 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_IO_TYPES_H
|
|
+#define _BCACHEFS_IO_TYPES_H
|
|
+
|
|
+#include "alloc_types.h"
|
|
+#include "btree_types.h"
|
|
+#include "buckets_types.h"
|
|
+#include "extents_types.h"
|
|
+#include "keylist_types.h"
|
|
+#include "opts.h"
|
|
+#include "super_types.h"
|
|
+
|
|
+#include <linux/llist.h>
|
|
+#include <linux/workqueue.h>
|
|
+
|
|
+struct bch_read_bio {
|
|
+ struct bch_fs *c;
|
|
+ u64 start_time;
|
|
+ u64 submit_time;
|
|
+
|
|
+ /*
|
|
+ * Reads will often have to be split, and if the extent being read from
|
|
+ * was checksummed or compressed we'll also have to allocate bounce
|
|
+ * buffers and copy the data back into the original bio.
|
|
+ *
|
|
+ * If we didn't have to split, we have to save and restore the original
|
|
+ * bi_end_io - @split below indicates which:
|
|
+ */
|
|
+ union {
|
|
+ struct bch_read_bio *parent;
|
|
+ bio_end_io_t *end_io;
|
|
+ };
|
|
+
|
|
+ /*
|
|
+ * Saved copy of bio->bi_iter, from submission time - allows us to
|
|
+ * resubmit on IO error, and also to copy data back to the original bio
|
|
+ * when we're bouncing:
|
|
+ */
|
|
+ struct bvec_iter bvec_iter;
|
|
+
|
|
+ unsigned offset_into_extent;
|
|
+
|
|
+ u16 flags;
|
|
+ union {
|
|
+ struct {
|
|
+ u16 bounce:1,
|
|
+ split:1,
|
|
+ kmalloc:1,
|
|
+ have_ioref:1,
|
|
+ narrow_crcs:1,
|
|
+ hole:1,
|
|
+ retry:2,
|
|
+ context:2;
|
|
+ };
|
|
+ u16 _state;
|
|
+ };
|
|
+
|
|
+ struct bch_devs_list devs_have;
|
|
+
|
|
+ struct extent_ptr_decoded pick;
|
|
+
|
|
+ /*
|
|
+ * pos we read from - different from data_pos for indirect extents:
|
|
+ */
|
|
+ struct bpos read_pos;
|
|
+
|
|
+ /*
|
|
+ * start pos of data we read (may not be pos of data we want) - for
|
|
+ * promote, narrow extents paths:
|
|
+ */
|
|
+ enum btree_id data_btree;
|
|
+ struct bpos data_pos;
|
|
+ struct bversion version;
|
|
+
|
|
+ struct promote_op *promote;
|
|
+
|
|
+ struct bch_io_opts opts;
|
|
+
|
|
+ struct work_struct work;
|
|
+
|
|
+ struct bio bio;
|
|
+};
|
|
+
|
|
+struct bch_write_bio {
|
|
+ struct bch_fs *c;
|
|
+ struct bch_write_bio *parent;
|
|
+
|
|
+ u64 submit_time;
|
|
+
|
|
+ struct bch_devs_list failed;
|
|
+ u8 dev;
|
|
+
|
|
+ unsigned split:1,
|
|
+ bounce:1,
|
|
+ put_bio:1,
|
|
+ have_ioref:1,
|
|
+ used_mempool:1;
|
|
+
|
|
+ struct bio bio;
|
|
+};
|
|
+
|
|
+struct bch_write_op {
|
|
+ struct closure cl;
|
|
+ struct bch_fs *c;
|
|
+ void (*end_io)(struct bch_write_op *);
|
|
+ u64 start_time;
|
|
+
|
|
+ unsigned written; /* sectors */
|
|
+ u16 flags;
|
|
+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */
|
|
+
|
|
+ unsigned csum_type:4;
|
|
+ unsigned compression_type:4;
|
|
+ unsigned nr_replicas:4;
|
|
+ unsigned nr_replicas_required:4;
|
|
+ unsigned alloc_reserve:3;
|
|
+ unsigned incompressible:1;
|
|
+
|
|
+ struct bch_devs_list devs_have;
|
|
+ u16 target;
|
|
+ u16 nonce;
|
|
+ struct bch_io_opts opts;
|
|
+
|
|
+ struct bpos pos;
|
|
+ struct bversion version;
|
|
+
|
|
+ /* For BCH_WRITE_DATA_ENCODED: */
|
|
+ struct bch_extent_crc_unpacked crc;
|
|
+
|
|
+ struct write_point_specifier write_point;
|
|
+
|
|
+ struct disk_reservation res;
|
|
+
|
|
+ struct open_buckets open_buckets;
|
|
+
|
|
+ /*
|
|
+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we
|
|
+ * still need to stash the journal_seq somewhere:
|
|
+ */
|
|
+ union {
|
|
+ u64 *journal_seq_p;
|
|
+ u64 journal_seq;
|
|
+ };
|
|
+ u64 new_i_size;
|
|
+ s64 i_sectors_delta;
|
|
+
|
|
+ int (*index_update_fn)(struct bch_write_op *);
|
|
+
|
|
+ struct bch_devs_mask failed;
|
|
+
|
|
+ struct keylist insert_keys;
|
|
+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];
|
|
+
|
|
+ /* Must be last: */
|
|
+ struct bch_write_bio wbio;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_IO_TYPES_H */
|
|
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
|
|
new file mode 100644
|
|
index 000000000000..c2773126a8c6
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal.c
|
|
@@ -0,0 +1,1319 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * bcachefs journalling code, for btree insertions
|
|
+ *
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "error.h"
|
|
+#include "journal.h"
|
|
+#include "journal_io.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "journal_seq_blacklist.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+static u64 last_unwritten_seq(struct journal *j)
|
|
+{
|
|
+ union journal_res_state s = READ_ONCE(j->reservations);
|
|
+
|
|
+ lockdep_assert_held(&j->lock);
|
|
+
|
|
+ return journal_cur_seq(j) - ((s.idx - s.unwritten_idx) & JOURNAL_BUF_MASK);
|
|
+}
|
|
+
|
|
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
|
|
+{
|
|
+ return seq >= last_unwritten_seq(j);
|
|
+}
|
|
+
|
|
+static bool __journal_entry_is_open(union journal_res_state state)
|
|
+{
|
|
+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
|
|
+}
|
|
+
|
|
+static bool journal_entry_is_open(struct journal *j)
|
|
+{
|
|
+ return __journal_entry_is_open(j->reservations);
|
|
+}
|
|
+
|
|
+static inline struct journal_buf *
|
|
+journal_seq_to_buf(struct journal *j, u64 seq)
|
|
+{
|
|
+ struct journal_buf *buf = NULL;
|
|
+
|
|
+ EBUG_ON(seq > journal_cur_seq(j));
|
|
+ EBUG_ON(seq == journal_cur_seq(j) &&
|
|
+ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL);
|
|
+
|
|
+ if (journal_seq_unwritten(j, seq)) {
|
|
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
|
|
+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq);
|
|
+ }
|
|
+ return buf;
|
|
+}
|
|
+
|
|
+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count)
|
|
+{
|
|
+ INIT_LIST_HEAD(&p->list);
|
|
+ INIT_LIST_HEAD(&p->key_cache_list);
|
|
+ INIT_LIST_HEAD(&p->flushed);
|
|
+ atomic_set(&p->count, count);
|
|
+ p->devs.nr = 0;
|
|
+}
|
|
+
|
|
+static void journal_pin_new_entry(struct journal *j)
|
|
+{
|
|
+ /*
|
|
+ * The fifo_push() needs to happen at the same time as j->seq is
|
|
+ * incremented for journal_last_seq() to be calculated correctly
|
|
+ */
|
|
+ atomic64_inc(&j->seq);
|
|
+ journal_pin_list_init(fifo_push_ref(&j->pin), 1);
|
|
+}
|
|
+
|
|
+static void bch2_journal_buf_init(struct journal *j)
|
|
+{
|
|
+ struct journal_buf *buf = journal_cur_buf(j);
|
|
+
|
|
+ bkey_extent_init(&buf->key);
|
|
+ buf->noflush = false;
|
|
+ buf->must_flush = false;
|
|
+ buf->separate_flush = false;
|
|
+
|
|
+ memset(buf->has_inode, 0, sizeof(buf->has_inode));
|
|
+
|
|
+ memset(buf->data, 0, sizeof(*buf->data));
|
|
+ buf->data->seq = cpu_to_le64(journal_cur_seq(j));
|
|
+ buf->data->u64s = 0;
|
|
+}
|
|
+
|
|
+void bch2_journal_halt(struct journal *j)
|
|
+{
|
|
+ union journal_res_state old, new;
|
|
+ u64 v = atomic64_read(&j->reservations.counter);
|
|
+
|
|
+ do {
|
|
+ old.v = new.v = v;
|
|
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
|
|
+ return;
|
|
+
|
|
+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
|
|
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+
|
|
+ j->err_seq = journal_cur_seq(j);
|
|
+ journal_wake(j);
|
|
+ closure_wake_up(&journal_cur_buf(j)->wait);
|
|
+}
|
|
+
|
|
+/* journal entry close/open: */
|
|
+
|
|
+void __bch2_journal_buf_put(struct journal *j)
|
|
+{
|
|
+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Returns true if journal entry is now closed:
|
|
+ *
|
|
+ * We don't close a journal_buf until the next journal_buf is finished writing,
|
|
+ * and can be opened again - this also initializes the next journal_buf:
|
|
+ */
|
|
+static bool __journal_entry_close(struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct journal_buf *buf = journal_cur_buf(j);
|
|
+ union journal_res_state old, new;
|
|
+ u64 v = atomic64_read(&j->reservations.counter);
|
|
+ unsigned sectors;
|
|
+
|
|
+ lockdep_assert_held(&j->lock);
|
|
+
|
|
+ do {
|
|
+ old.v = new.v = v;
|
|
+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
|
|
+ return true;
|
|
+
|
|
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
|
|
+ /* this entry will never be written: */
|
|
+ closure_wake_up(&buf->wait);
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
|
|
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
|
|
+ j->need_write_time = local_clock();
|
|
+ }
|
|
+
|
|
+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
|
|
+ new.idx++;
|
|
+
|
|
+ if (new.idx == new.unwritten_idx)
|
|
+ return false;
|
|
+
|
|
+ BUG_ON(journal_state_count(new, new.idx));
|
|
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+
|
|
+ /* Close out old buffer: */
|
|
+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
|
|
+
|
|
+ sectors = vstruct_blocks_plus(buf->data, c->block_bits,
|
|
+ buf->u64s_reserved) << c->block_bits;
|
|
+ BUG_ON(sectors > buf->sectors);
|
|
+ buf->sectors = sectors;
|
|
+
|
|
+ /*
|
|
+ * We have to set last_seq here, _before_ opening a new journal entry:
|
|
+ *
|
|
+	 * A thread may replace an old pin with a new pin on its current
|
|
+ * journal reservation - the expectation being that the journal will
|
|
+ * contain either what the old pin protected or what the new pin
|
|
+ * protects.
|
|
+ *
|
|
+ * After the old pin is dropped journal_last_seq() won't include the old
|
|
+ * pin, so we can only write the updated last_seq on the entry that
|
|
+ * contains whatever the new pin protects.
|
|
+ *
|
|
+ * Restated, we can _not_ update last_seq for a given entry if there
|
|
+ * could be a newer entry open with reservations/pins that have been
|
|
+ * taken against it.
|
|
+ *
|
|
+	 * Hence, we want to update/set last_seq on the current journal entry right
|
|
+ * before we open a new one:
|
|
+ */
|
|
+ buf->data->last_seq = cpu_to_le64(journal_last_seq(j));
|
|
+
|
|
+ __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq));
|
|
+
|
|
+ /* Initialize new buffer: */
|
|
+ journal_pin_new_entry(j);
|
|
+
|
|
+ bch2_journal_buf_init(j);
|
|
+
|
|
+ cancel_delayed_work(&j->write_work);
|
|
+ clear_bit(JOURNAL_NEED_WRITE, &j->flags);
|
|
+
|
|
+ bch2_journal_space_available(j);
|
|
+
|
|
+ bch2_journal_buf_put(j, old.idx);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool journal_entry_want_write(struct journal *j)
|
|
+{
|
|
+ union journal_res_state s = READ_ONCE(j->reservations);
|
|
+ bool ret = false;
|
|
+
|
|
+ /*
|
|
+ * Don't close it yet if we already have a write in flight, but do set
|
|
+ * NEED_WRITE:
|
|
+ */
|
|
+ if (s.idx != s.unwritten_idx)
|
|
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
|
|
+ else
|
|
+ ret = __journal_entry_close(j);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static bool journal_entry_close(struct journal *j)
|
|
+{
|
|
+ bool ret;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ ret = journal_entry_want_write(j);
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * should _only_ be called from journal_res_get() - when we actually want a
|
|
+ * journal reservation - journal entry is open means journal is dirty:
|
|
+ *
|
|
+ * returns:
|
|
+ * 0: success
|
|
+ * cur_entry_journal_full:	journal currently full, must invoke reclaim
|
|
+ * cur_entry_blocked:	journal blocked, must wait
|
|
+ * cur_entry_insufficient_devices: insufficient rw devices or journal error
|
|
+ */
|
|
+static int journal_entry_open(struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct journal_buf *buf = journal_cur_buf(j);
|
|
+ union journal_res_state old, new;
|
|
+ int u64s;
|
|
+ u64 v;
|
|
+
|
|
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
|
|
+
|
|
+ lockdep_assert_held(&j->lock);
|
|
+ BUG_ON(journal_entry_is_open(j));
|
|
+
|
|
+ if (j->blocked)
|
|
+ return cur_entry_blocked;
|
|
+
|
|
+ if (j->cur_entry_error)
|
|
+ return j->cur_entry_error;
|
|
+
|
|
+ BUG_ON(!j->cur_entry_sectors);
|
|
+
|
|
+ buf->u64s_reserved = j->entry_u64s_reserved;
|
|
+ buf->disk_sectors = j->cur_entry_sectors;
|
|
+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9);
|
|
+
|
|
+ u64s = (int) (buf->sectors << 9) / sizeof(u64) -
|
|
+ journal_entry_overhead(j);
|
|
+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
|
|
+
|
|
+ if (u64s <= le32_to_cpu(buf->data->u64s))
|
|
+ return cur_entry_journal_full;
|
|
+
|
|
+ /*
|
|
+ * Must be set before marking the journal entry as open:
|
|
+ */
|
|
+ j->cur_entry_u64s = u64s;
|
|
+
|
|
+ v = atomic64_read(&j->reservations.counter);
|
|
+ do {
|
|
+ old.v = new.v = v;
|
|
+
|
|
+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
|
|
+ return cur_entry_insufficient_devices;
|
|
+
|
|
+ /* Handle any already added entries */
|
|
+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
|
|
+
|
|
+ EBUG_ON(journal_state_count(new, new.idx));
|
|
+ journal_state_inc(&new);
|
|
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+
|
|
+ if (j->res_get_blocked_start)
|
|
+ bch2_time_stats_update(j->blocked_time,
|
|
+ j->res_get_blocked_start);
|
|
+ j->res_get_blocked_start = 0;
|
|
+
|
|
+ mod_delayed_work(system_freezable_wq,
|
|
+ &j->write_work,
|
|
+ msecs_to_jiffies(j->write_delay_ms));
|
|
+ journal_wake(j);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static bool journal_quiesced(struct journal *j)
|
|
+{
|
|
+ union journal_res_state s = READ_ONCE(j->reservations);
|
|
+ bool ret = s.idx == s.unwritten_idx && !__journal_entry_is_open(s);
|
|
+
|
|
+ if (!ret)
|
|
+ journal_entry_close(j);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void journal_quiesce(struct journal *j)
|
|
+{
|
|
+ wait_event(j->wait, journal_quiesced(j));
|
|
+}
|
|
+
|
|
+static void journal_write_work(struct work_struct *work)
|
|
+{
|
|
+ struct journal *j = container_of(work, struct journal, write_work.work);
|
|
+
|
|
+ journal_entry_close(j);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Given an inode number, if that inode number has data in the journal that
|
|
+ * hasn't yet been flushed, return the journal sequence number that needs to be
|
|
+ * flushed:
|
|
+ */
|
|
+u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
|
|
+{
|
|
+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
|
|
+ union journal_res_state s;
|
|
+ unsigned i;
|
|
+ u64 seq;
|
|
+
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ seq = journal_cur_seq(j);
|
|
+ s = READ_ONCE(j->reservations);
|
|
+ i = s.idx;
|
|
+
|
|
+ while (1) {
|
|
+ if (test_bit(h, j->buf[i].has_inode))
|
|
+ goto out;
|
|
+
|
|
+ if (i == s.unwritten_idx)
|
|
+ break;
|
|
+
|
|
+ i = (i - 1) & JOURNAL_BUF_MASK;
|
|
+ seq--;
|
|
+ }
|
|
+
|
|
+ seq = 0;
|
|
+out:
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ return seq;
|
|
+}
|
|
+
|
|
+void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq)
|
|
+{
|
|
+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8));
|
|
+ struct journal_buf *buf;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ if ((buf = journal_seq_to_buf(j, seq)))
|
|
+ set_bit(h, buf->has_inode);
|
|
+
|
|
+ spin_unlock(&j->lock);
|
|
+}
|
|
+
|
|
+static int __journal_res_get(struct journal *j, struct journal_res *res,
|
|
+ unsigned flags)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct journal_buf *buf;
|
|
+ bool can_discard;
|
|
+ int ret;
|
|
+retry:
|
|
+ if (journal_res_get_fast(j, res, flags))
|
|
+ return 0;
|
|
+
|
|
+ if (bch2_journal_error(j))
|
|
+ return -EROFS;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ /*
|
|
+ * Recheck after taking the lock, so we don't race with another thread
|
|
+ * that just did journal_entry_open() and call journal_entry_close()
|
|
+ * unnecessarily
|
|
+ */
|
|
+ if (journal_res_get_fast(j, res, flags)) {
|
|
+ spin_unlock(&j->lock);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (!(flags & JOURNAL_RES_GET_RESERVED) &&
|
|
+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) {
|
|
+ /*
|
|
+ * Don't want to close current journal entry, just need to
|
|
+ * invoke reclaim:
|
|
+ */
|
|
+ ret = cur_entry_journal_full;
|
|
+ goto unlock;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If we couldn't get a reservation because the current buf filled up,
|
|
+ * and we had room for a bigger entry on disk, signal that we want to
|
|
+ * realloc the journal bufs:
|
|
+ */
|
|
+ buf = journal_cur_buf(j);
|
|
+ if (journal_entry_is_open(j) &&
|
|
+ buf->buf_size >> 9 < buf->disk_sectors &&
|
|
+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
|
|
+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
|
|
+
|
|
+ if (journal_entry_is_open(j) &&
|
|
+ !__journal_entry_close(j)) {
|
|
+ /*
|
|
+ * We failed to get a reservation on the current open journal
|
|
+ * entry because it's full, and we can't close it because
|
|
+ * there's still a previous one in flight:
|
|
+ */
|
|
+ trace_journal_entry_full(c);
|
|
+ ret = cur_entry_blocked;
|
|
+ } else {
|
|
+ ret = journal_entry_open(j);
|
|
+ }
|
|
+unlock:
|
|
+ if ((ret && ret != cur_entry_insufficient_devices) &&
|
|
+ !j->res_get_blocked_start) {
|
|
+ j->res_get_blocked_start = local_clock() ?: 1;
|
|
+ trace_journal_full(c);
|
|
+ }
|
|
+
|
|
+ can_discard = j->can_discard;
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ if (!ret)
|
|
+ goto retry;
|
|
+
|
|
+ if ((ret == cur_entry_journal_full ||
|
|
+ ret == cur_entry_journal_pin_full) &&
|
|
+ !can_discard &&
|
|
+ j->reservations.idx == j->reservations.unwritten_idx &&
|
|
+ (flags & JOURNAL_RES_GET_RESERVED)) {
|
|
+ char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
|
|
+
|
|
+ bch_err(c, "Journal stuck!");
|
|
+ if (journal_debug_buf) {
|
|
+ bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
|
|
+ bch_err(c, "%s", journal_debug_buf);
|
|
+
|
|
+ bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
|
|
+ bch_err(c, "Journal pins:\n%s", journal_debug_buf);
|
|
+ kfree(journal_debug_buf);
|
|
+ }
|
|
+
|
|
+ bch2_fatal_error(c);
|
|
+ dump_stack();
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Journal is full - can't rely on reclaim from work item due to
|
|
+ * freezing:
|
|
+ */
|
|
+ if ((ret == cur_entry_journal_full ||
|
|
+ ret == cur_entry_journal_pin_full) &&
|
|
+ !(flags & JOURNAL_RES_GET_NONBLOCK)) {
|
|
+ if (can_discard) {
|
|
+ bch2_journal_do_discards(j);
|
|
+ goto retry;
|
|
+ }
|
|
+
|
|
+ if (mutex_trylock(&j->reclaim_lock)) {
|
|
+ bch2_journal_reclaim(j);
|
|
+ mutex_unlock(&j->reclaim_lock);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Essentially the entry function to the journaling code. When bcachefs is doing
|
|
+ * a btree insert, it calls this function to get the current journal write.
|
|
+ * Journal write is the structure used to set up journal writes. The calling
|
|
+ * function will then add its keys to the structure, queuing them for the next
|
|
+ * write.
|
|
+ *
|
|
+ * To ensure forward progress, the current task must not be holding any
|
|
+ * btree node write locks.
|
|
+ */
|
|
+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
|
|
+ unsigned flags)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ closure_wait_event(&j->async_wait,
|
|
+ (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
|
|
+ (flags & JOURNAL_RES_GET_NONBLOCK));
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* journal_preres: */
|
|
+
|
|
+static bool journal_preres_available(struct journal *j,
|
|
+ struct journal_preres *res,
|
|
+ unsigned new_u64s,
|
|
+ unsigned flags)
|
|
+{
|
|
+ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true);
|
|
+
|
|
+ if (!ret && mutex_trylock(&j->reclaim_lock)) {
|
|
+ bch2_journal_reclaim(j);
|
|
+ mutex_unlock(&j->reclaim_lock);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int __bch2_journal_preres_get(struct journal *j,
|
|
+ struct journal_preres *res,
|
|
+ unsigned new_u64s,
|
|
+ unsigned flags)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ closure_wait_event(&j->preres_wait,
|
|
+ (ret = bch2_journal_error(j)) ||
|
|
+ journal_preres_available(j, res, new_u64s, flags));
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* journal_entry_res: */
|
|
+
|
|
+void bch2_journal_entry_res_resize(struct journal *j,
|
|
+ struct journal_entry_res *res,
|
|
+ unsigned new_u64s)
|
|
+{
|
|
+ union journal_res_state state;
|
|
+ int d = new_u64s - res->u64s;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ j->entry_u64s_reserved += d;
|
|
+ if (d <= 0)
|
|
+ goto out;
|
|
+
|
|
+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
|
|
+ smp_mb();
|
|
+ state = READ_ONCE(j->reservations);
|
|
+
|
|
+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL &&
|
|
+ state.cur_entry_offset > j->cur_entry_u64s) {
|
|
+ j->cur_entry_u64s += d;
|
|
+ /*
|
|
+ * Not enough room in current journal entry, have to flush it:
|
|
+ */
|
|
+ __journal_entry_close(j);
|
|
+ } else {
|
|
+ journal_cur_buf(j)->u64s_reserved += d;
|
|
+ }
|
|
+out:
|
|
+ spin_unlock(&j->lock);
|
|
+ res->u64s += d;
|
|
+}
|
|
+
|
|
+/* journal flushing: */
|
|
+
|
|
+/**
|
|
+ * bch2_journal_flush_seq_async - wait for a journal entry to be written
|
|
+ *
|
|
+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if
|
|
+ * necessary
|
|
+ */
|
|
+int bch2_journal_flush_seq_async(struct journal *j, u64 seq,
|
|
+ struct closure *parent)
|
|
+{
|
|
+ struct journal_buf *buf;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (seq <= j->flushed_seq_ondisk)
|
|
+ return 1;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ BUG_ON(seq > journal_cur_seq(j));
|
|
+
|
|
+ /* Recheck under lock: */
|
|
+ if (j->err_seq && seq >= j->err_seq) {
|
|
+ ret = -EIO;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (seq <= j->flushed_seq_ondisk) {
|
|
+ ret = 1;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ /* if seq was written, but not flushed - flush a newer one instead */
|
|
+ seq = max(seq, last_unwritten_seq(j));
|
|
+
|
|
+recheck_need_open:
|
|
+ if (seq == journal_cur_seq(j) && !journal_entry_is_open(j)) {
|
|
+ struct journal_res res = { 0 };
|
|
+
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ seq = res.seq;
|
|
+ buf = j->buf + (seq & JOURNAL_BUF_MASK);
|
|
+ buf->must_flush = true;
|
|
+ set_bit(JOURNAL_NEED_WRITE, &j->flags);
|
|
+
|
|
+ if (parent && !closure_wait(&buf->wait, parent))
|
|
+ BUG();
|
|
+
|
|
+ bch2_journal_res_put(j, &res);
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ goto want_write;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * if write was kicked off without a flush, flush the next sequence
|
|
+ * number instead
|
|
+ */
|
|
+ buf = journal_seq_to_buf(j, seq);
|
|
+ if (buf->noflush) {
|
|
+ seq++;
|
|
+ goto recheck_need_open;
|
|
+ }
|
|
+
|
|
+ buf->must_flush = true;
|
|
+
|
|
+ if (parent && !closure_wait(&buf->wait, parent))
|
|
+ BUG();
|
|
+want_write:
|
|
+ if (seq == journal_cur_seq(j))
|
|
+ journal_entry_want_write(j);
|
|
+out:
|
|
+ spin_unlock(&j->lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_journal_flush_seq(struct journal *j, u64 seq)
|
|
+{
|
|
+ u64 start_time = local_clock();
|
|
+ int ret, ret2;
|
|
+
|
|
+ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
|
|
+
|
|
+ if (!ret)
|
|
+ bch2_time_stats_update(j->flush_seq_time, start_time);
|
|
+
|
|
+ return ret ?: ret2 < 0 ? ret2 : 0;
|
|
+}
|
|
+
|
|
+int bch2_journal_meta(struct journal *j)
|
|
+{
|
|
+ struct journal_res res;
|
|
+ int ret;
|
|
+
|
|
+ memset(&res, 0, sizeof(res));
|
|
+
|
|
+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ bch2_journal_res_put(j, &res);
|
|
+
|
|
+ return bch2_journal_flush_seq(j, res.seq);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_journal_flush_async - if there is an open journal entry, or a journal
|
|
+ * still being written, write it and wait for the write to complete
|
|
+ */
|
|
+void bch2_journal_flush_async(struct journal *j, struct closure *parent)
|
|
+{
|
|
+ u64 seq, journal_seq;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ journal_seq = journal_cur_seq(j);
|
|
+
|
|
+ if (journal_entry_is_open(j)) {
|
|
+ seq = journal_seq;
|
|
+ } else if (journal_seq) {
|
|
+ seq = journal_seq - 1;
|
|
+ } else {
|
|
+ spin_unlock(&j->lock);
|
|
+ return;
|
|
+ }
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ bch2_journal_flush_seq_async(j, seq, parent);
|
|
+}
|
|
+
|
|
+int bch2_journal_flush(struct journal *j)
|
|
+{
|
|
+ u64 seq, journal_seq;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ journal_seq = journal_cur_seq(j);
|
|
+
|
|
+ if (journal_entry_is_open(j)) {
|
|
+ seq = journal_seq;
|
|
+ } else if (journal_seq) {
|
|
+ seq = journal_seq - 1;
|
|
+ } else {
|
|
+ spin_unlock(&j->lock);
|
|
+ return 0;
|
|
+ }
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ return bch2_journal_flush_seq(j, seq);
|
|
+}
|
|
+
|
|
+/* block/unblock the journal: */
|
|
+
|
|
+void bch2_journal_unblock(struct journal *j)
|
|
+{
|
|
+ spin_lock(&j->lock);
|
|
+ j->blocked--;
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ journal_wake(j);
|
|
+}
|
|
+
|
|
+void bch2_journal_block(struct journal *j)
|
|
+{
|
|
+ spin_lock(&j->lock);
|
|
+ j->blocked++;
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ journal_quiesce(j);
|
|
+}
|
|
+
|
|
+/* allocate journal on a device: */
|
|
+
|
|
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
|
|
+ bool new_fs, struct closure *cl)
|
|
+{
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+ struct bch_sb_field_journal *journal_buckets;
|
|
+ u64 *new_bucket_seq = NULL, *new_buckets = NULL;
|
|
+ int ret = 0;
|
|
+
|
|
+ /* don't handle reducing nr of buckets yet: */
|
|
+ if (nr <= ja->nr)
|
|
+ return 0;
|
|
+
|
|
+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
|
|
+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
|
|
+ if (!new_buckets || !new_bucket_seq) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
|
|
+ nr + sizeof(*journal_buckets) / sizeof(u64));
|
|
+ if (!journal_buckets) {
|
|
+ ret = -ENOSPC;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We may be called from the device add path, before the new device has
|
|
+ * actually been added to the running filesystem:
|
|
+ */
|
|
+ if (!new_fs)
|
|
+ spin_lock(&c->journal.lock);
|
|
+
|
|
+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
|
|
+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
|
|
+ swap(new_buckets, ja->buckets);
|
|
+ swap(new_bucket_seq, ja->bucket_seq);
|
|
+
|
|
+ if (!new_fs)
|
|
+ spin_unlock(&c->journal.lock);
|
|
+
|
|
+ while (ja->nr < nr) {
|
|
+ struct open_bucket *ob = NULL;
|
|
+ unsigned pos;
|
|
+ long b;
|
|
+
|
|
+ if (new_fs) {
|
|
+ b = bch2_bucket_alloc_new_fs(ca);
|
|
+ if (b < 0) {
|
|
+ ret = -ENOSPC;
|
|
+ goto err;
|
|
+ }
|
|
+ } else {
|
|
+ rcu_read_lock();
|
|
+ ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
|
|
+ false, cl);
|
|
+ rcu_read_unlock();
|
|
+ if (IS_ERR(ob)) {
|
|
+ ret = cl ? -EAGAIN : -ENOSPC;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ b = sector_to_bucket(ca, ob->ptr.offset);
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+ spin_lock(&c->journal.lock);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * XXX
|
|
+ * For resize at runtime, we should be writing the new
|
|
+ * superblock before inserting into the journal array
|
|
+ */
|
|
+
|
|
+ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0;
|
|
+ __array_insert_item(ja->buckets, ja->nr, pos);
|
|
+ __array_insert_item(ja->bucket_seq, ja->nr, pos);
|
|
+ __array_insert_item(journal_buckets->buckets, ja->nr, pos);
|
|
+ ja->nr++;
|
|
+
|
|
+ ja->buckets[pos] = b;
|
|
+ ja->bucket_seq[pos] = 0;
|
|
+ journal_buckets->buckets[pos] = cpu_to_le64(b);
|
|
+
|
|
+ if (pos <= ja->discard_idx)
|
|
+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
|
|
+ if (pos <= ja->dirty_idx_ondisk)
|
|
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
|
|
+ if (pos <= ja->dirty_idx)
|
|
+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
|
|
+ if (pos <= ja->cur_idx)
|
|
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
|
|
+
|
|
+ if (new_fs) {
|
|
+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
|
|
+ ca->mi.bucket_size,
|
|
+ gc_phase(GC_PHASE_SB),
|
|
+ 0);
|
|
+ } else {
|
|
+ spin_unlock(&c->journal.lock);
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
|
|
+ bch2_trans_mark_metadata_bucket(&trans, ca,
|
|
+ b, BCH_DATA_journal,
|
|
+ ca->mi.bucket_size));
|
|
+
|
|
+ bch2_open_bucket_put(c, ob);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+err:
|
|
+ bch2_sb_resize_journal(&ca->disk_sb,
|
|
+ ja->nr + sizeof(*journal_buckets) / sizeof(u64));
|
|
+ kfree(new_bucket_seq);
|
|
+ kfree(new_buckets);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Allocate more journal space at runtime - not currently making use of it, but
|
|
+ * the code works:
|
|
+ */
|
|
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
|
|
+ unsigned nr)
|
|
+{
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+ struct closure cl;
|
|
+ unsigned current_nr;
|
|
+ int ret;
|
|
+
|
|
+ closure_init_stack(&cl);
|
|
+
|
|
+ do {
|
|
+ struct disk_reservation disk_res = { 0, 0 };
|
|
+
|
|
+ closure_sync(&cl);
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ current_nr = ja->nr;
|
|
+
|
|
+ /*
|
|
+ * note: journal buckets aren't really counted as _sectors_ used yet, so
|
|
+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
|
|
+ * when space used goes up without a reservation - but we do need the
|
|
+ * reservation to ensure we'll actually be able to allocate:
|
|
+ */
|
|
+
|
|
+ if (bch2_disk_reservation_get(c, &disk_res,
|
|
+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return -ENOSPC;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
|
|
+
|
|
+ bch2_disk_reservation_put(c, &disk_res);
|
|
+
|
|
+ if (ja->nr != current_nr)
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ } while (ret == -EAGAIN);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_dev_journal_alloc(struct bch_dev *ca)
|
|
+{
|
|
+ unsigned nr;
|
|
+
|
|
+ if (dynamic_fault("bcachefs:add:journal_alloc"))
|
|
+ return -ENOMEM;
|
|
+
|
|
+ /* 1/128th of the device by default: */
|
|
+ nr = ca->mi.nbuckets >> 7;
|
|
+
|
|
+ /*
|
|
+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever
|
|
+ * is smaller:
|
|
+ */
|
|
+ nr = clamp_t(unsigned, nr,
|
|
+ BCH_JOURNAL_BUCKETS_MIN,
|
|
+ min(1 << 13,
|
|
+ (1 << 24) / ca->mi.bucket_size));
|
|
+
|
|
+ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
|
|
+}
|
|
+
|
|
+/* startup/shutdown: */
|
|
+
|
|
+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
|
|
+{
|
|
+ union journal_res_state state;
|
|
+ bool ret = false;
|
|
+ unsigned i;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ state = READ_ONCE(j->reservations);
|
|
+ i = state.idx;
|
|
+
|
|
+ while (i != state.unwritten_idx) {
|
|
+ i = (i - 1) & JOURNAL_BUF_MASK;
|
|
+ if (bch2_bkey_has_device(bkey_i_to_s_c(&j->buf[i].key), dev_idx))
|
|
+ ret = true;
|
|
+ }
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
|
|
+{
|
|
+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
|
|
+}
|
|
+
|
|
+void bch2_fs_journal_stop(struct journal *j)
|
|
+{
|
|
+ bch2_journal_flush_all_pins(j);
|
|
+
|
|
+ wait_event(j->wait, journal_entry_close(j));
|
|
+
|
|
+ /*
|
|
+ * Always write a new journal entry, to make sure the clock hands are up
|
|
+ * to date (and match the superblock)
|
|
+ */
|
|
+ bch2_journal_meta(j);
|
|
+
|
|
+ journal_quiesce(j);
|
|
+
|
|
+ BUG_ON(!bch2_journal_error(j) &&
|
|
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags) &&
|
|
+ (journal_entry_is_open(j) ||
|
|
+ j->last_empty_seq + 1 != journal_cur_seq(j)));
|
|
+
|
|
+ cancel_delayed_work_sync(&j->write_work);
|
|
+ bch2_journal_reclaim_stop(j);
|
|
+}
|
|
+
|
|
+int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
|
|
+ struct list_head *journal_entries)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct journal_entry_pin_list *p;
|
|
+ struct journal_replay *i;
|
|
+ u64 last_seq = cur_seq, nr, seq;
|
|
+
|
|
+ if (!list_empty(journal_entries))
|
|
+ last_seq = le64_to_cpu(list_last_entry(journal_entries,
|
|
+ struct journal_replay, list)->j.last_seq);
|
|
+
|
|
+ nr = cur_seq - last_seq;
|
|
+
|
|
+ if (nr + 1 > j->pin.size) {
|
|
+ free_fifo(&j->pin);
|
|
+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL);
|
|
+ if (!j->pin.data) {
|
|
+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ j->replay_journal_seq = last_seq;
|
|
+ j->replay_journal_seq_end = cur_seq;
|
|
+ j->last_seq_ondisk = last_seq;
|
|
+ j->pin.front = last_seq;
|
|
+ j->pin.back = cur_seq;
|
|
+ atomic64_set(&j->seq, cur_seq - 1);
|
|
+
|
|
+ fifo_for_each_entry_ptr(p, &j->pin, seq)
|
|
+ journal_pin_list_init(p, 1);
|
|
+
|
|
+ list_for_each_entry(i, journal_entries, list) {
|
|
+ unsigned ptr;
|
|
+
|
|
+ seq = le64_to_cpu(i->j.seq);
|
|
+ BUG_ON(seq >= cur_seq);
|
|
+
|
|
+ if (seq < last_seq)
|
|
+ continue;
|
|
+
|
|
+ p = journal_seq_pin(j, seq);
|
|
+
|
|
+ p->devs.nr = 0;
|
|
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
|
|
+ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
|
|
+ }
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ set_bit(JOURNAL_STARTED, &j->flags);
|
|
+ j->last_flush_write = jiffies;
|
|
+
|
|
+ journal_pin_new_entry(j);
|
|
+
|
|
+ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j);
|
|
+
|
|
+ bch2_journal_buf_init(j);
|
|
+
|
|
+ c->last_bucket_seq_cleanup = journal_cur_seq(j);
|
|
+
|
|
+ bch2_journal_space_available(j);
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* init/exit: */
|
|
+
|
|
+void bch2_dev_journal_exit(struct bch_dev *ca)
|
|
+{
|
|
+ kfree(ca->journal.bio);
|
|
+ kfree(ca->journal.buckets);
|
|
+ kfree(ca->journal.bucket_seq);
|
|
+
|
|
+ ca->journal.bio = NULL;
|
|
+ ca->journal.buckets = NULL;
|
|
+ ca->journal.bucket_seq = NULL;
|
|
+}
|
|
+
|
|
+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
|
|
+{
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+ struct bch_sb_field_journal *journal_buckets =
|
|
+ bch2_sb_get_journal(sb);
|
|
+ unsigned i;
|
|
+
|
|
+ ja->nr = bch2_nr_journal_buckets(journal_buckets);
|
|
+
|
|
+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
|
|
+ if (!ja->bucket_seq)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ca->journal.bio = bio_kmalloc(GFP_KERNEL,
|
|
+ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE));
|
|
+ if (!ca->journal.bio)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
|
|
+ if (!ja->buckets)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ for (i = 0; i < ja->nr; i++)
|
|
+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_fs_journal_exit(struct journal *j)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++)
|
|
+ kvpfree(j->buf[i].data, j->buf[i].buf_size);
|
|
+ free_fifo(&j->pin);
|
|
+}
|
|
+
|
|
+int bch2_fs_journal_init(struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ static struct lock_class_key res_key;
|
|
+ unsigned i;
|
|
+ int ret = 0;
|
|
+
|
|
+ pr_verbose_init(c->opts, "");
|
|
+
|
|
+ spin_lock_init(&j->lock);
|
|
+ spin_lock_init(&j->err_lock);
|
|
+ init_waitqueue_head(&j->wait);
|
|
+ INIT_DELAYED_WORK(&j->write_work, journal_write_work);
|
|
+ init_waitqueue_head(&j->reclaim_wait);
|
|
+ init_waitqueue_head(&j->pin_flush_wait);
|
|
+ mutex_init(&j->reclaim_lock);
|
|
+ mutex_init(&j->discard_lock);
|
|
+
|
|
+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
|
|
+
|
|
+ j->write_delay_ms = 1000;
|
|
+ j->reclaim_delay_ms = 100;
|
|
+
|
|
+ atomic64_set(&j->reservations.counter,
|
|
+ ((union journal_res_state)
|
|
+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
|
|
+
|
|
+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
|
|
+ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
|
|
+ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
|
|
+ if (!j->buf[i].data) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ j->pin.front = j->pin.back = 1;
|
|
+out:
|
|
+ pr_verbose_init(c->opts, "ret %i", ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* debug: */
|
|
+
|
|
+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ union journal_res_state s;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ s = READ_ONCE(j->reservations);
|
|
+
|
|
+ pr_buf(out,
|
|
+ "active journal entries:\t%llu\n"
|
|
+ "seq:\t\t\t%llu\n"
|
|
+ "last_seq:\t\t%llu\n"
|
|
+ "last_seq_ondisk:\t%llu\n"
|
|
+ "flushed_seq_ondisk:\t%llu\n"
|
|
+ "prereserved:\t\t%u/%u\n"
|
|
+ "each entry reserved:\t%u\n"
|
|
+ "nr flush writes:\t%llu\n"
|
|
+ "nr noflush writes:\t%llu\n"
|
|
+ "nr direct reclaim:\t%llu\n"
|
|
+ "nr background reclaim:\t%llu\n"
|
|
+ "reclaim kicked:\t\t%u\n"
|
|
+ "reclaim runs in:\t%u ms\n"
|
|
+ "current entry sectors:\t%u\n"
|
|
+ "current entry error:\t%u\n"
|
|
+ "current entry:\t\t",
|
|
+ fifo_used(&j->pin),
|
|
+ journal_cur_seq(j),
|
|
+ journal_last_seq(j),
|
|
+ j->last_seq_ondisk,
|
|
+ j->flushed_seq_ondisk,
|
|
+ j->prereserved.reserved,
|
|
+ j->prereserved.remaining,
|
|
+ j->entry_u64s_reserved,
|
|
+ j->nr_flush_writes,
|
|
+ j->nr_noflush_writes,
|
|
+ j->nr_direct_reclaim,
|
|
+ j->nr_background_reclaim,
|
|
+ j->reclaim_kicked,
|
|
+ jiffies_to_msecs(j->next_reclaim - jiffies),
|
|
+ j->cur_entry_sectors,
|
|
+ j->cur_entry_error);
|
|
+
|
|
+ switch (s.cur_entry_offset) {
|
|
+ case JOURNAL_ENTRY_ERROR_VAL:
|
|
+ pr_buf(out, "error\n");
|
|
+ break;
|
|
+ case JOURNAL_ENTRY_CLOSED_VAL:
|
|
+ pr_buf(out, "closed\n");
|
|
+ break;
|
|
+ default:
|
|
+ pr_buf(out, "%u/%u\n",
|
|
+ s.cur_entry_offset,
|
|
+ j->cur_entry_u64s);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ pr_buf(out,
|
|
+ "current entry:\t\tidx %u refcount %u\n",
|
|
+ s.idx, journal_state_count(s, s.idx));
|
|
+
|
|
+ i = s.idx;
|
|
+ while (i != s.unwritten_idx) {
|
|
+ i = (i - 1) & JOURNAL_BUF_MASK;
|
|
+
|
|
+ pr_buf(out, "unwritten entry:\tidx %u refcount %u sectors %u\n",
|
|
+ i, journal_state_count(s, i), j->buf[i].sectors);
|
|
+ }
|
|
+
|
|
+ pr_buf(out,
|
|
+ "need write:\t\t%i\n"
|
|
+ "replay done:\t\t%i\n",
|
|
+ test_bit(JOURNAL_NEED_WRITE, &j->flags),
|
|
+ test_bit(JOURNAL_REPLAY_DONE, &j->flags));
|
|
+
|
|
+ pr_buf(out, "space:\n");
|
|
+ pr_buf(out, "\tdiscarded\t%u:%u\n",
|
|
+ j->space[journal_space_discarded].next_entry,
|
|
+ j->space[journal_space_discarded].total);
|
|
+ pr_buf(out, "\tclean ondisk\t%u:%u\n",
|
|
+ j->space[journal_space_clean_ondisk].next_entry,
|
|
+ j->space[journal_space_clean_ondisk].total);
|
|
+ pr_buf(out, "\tclean\t\t%u:%u\n",
|
|
+ j->space[journal_space_clean].next_entry,
|
|
+ j->space[journal_space_clean].total);
|
|
+ pr_buf(out, "\ttotal\t\t%u:%u\n",
|
|
+ j->space[journal_space_total].next_entry,
|
|
+ j->space[journal_space_total].total);
|
|
+
|
|
+ for_each_member_device_rcu(ca, c, i,
|
|
+ &c->rw_devs[BCH_DATA_journal]) {
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+
|
|
+ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
|
|
+ continue;
|
|
+
|
|
+ if (!ja->nr)
|
|
+ continue;
|
|
+
|
|
+ pr_buf(out,
|
|
+ "dev %u:\n"
|
|
+ "\tnr\t\t%u\n"
|
|
+ "\tbucket size\t%u\n"
|
|
+ "\tavailable\t%u:%u\n"
|
|
+ "\tdiscard_idx\t%u\n"
|
|
+ "\tdirty_ondisk\t%u (seq %llu)\n"
|
|
+ "\tdirty_idx\t%u (seq %llu)\n"
|
|
+ "\tcur_idx\t\t%u (seq %llu)\n",
|
|
+ i, ja->nr, ca->mi.bucket_size,
|
|
+ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded),
|
|
+ ja->sectors_free,
|
|
+ ja->discard_idx,
|
|
+ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk],
|
|
+ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx],
|
|
+ ja->cur_idx, ja->bucket_seq[ja->cur_idx]);
|
|
+ }
|
|
+
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
|
|
+{
|
|
+ spin_lock(&j->lock);
|
|
+ __bch2_journal_debug_to_text(out, j);
|
|
+ spin_unlock(&j->lock);
|
|
+}
|
|
+
|
|
+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j)
|
|
+{
|
|
+ struct journal_entry_pin_list *pin_list;
|
|
+ struct journal_entry_pin *pin;
|
|
+ u64 i;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) {
|
|
+ pr_buf(out, "%llu: count %u\n",
|
|
+ i, atomic_read(&pin_list->count));
|
|
+
|
|
+ list_for_each_entry(pin, &pin_list->list, list)
|
|
+ pr_buf(out, "\t%px %ps\n",
|
|
+ pin, pin->flush);
|
|
+
|
|
+ if (!list_empty(&pin_list->flushed))
|
|
+ pr_buf(out, "flushed:\n");
|
|
+
|
|
+ list_for_each_entry(pin, &pin_list->flushed, list)
|
|
+ pr_buf(out, "\t%px %ps\n",
|
|
+ pin, pin->flush);
|
|
+ }
|
|
+ spin_unlock(&j->lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
|
|
new file mode 100644
|
|
index 000000000000..1d556790b38e
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal.h
|
|
@@ -0,0 +1,531 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_JOURNAL_H
|
|
+#define _BCACHEFS_JOURNAL_H
|
|
+
|
|
+/*
|
|
+ * THE JOURNAL:
|
|
+ *
|
|
+ * The primary purpose of the journal is to log updates (insertions) to the
|
|
+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk.
|
|
+ *
|
|
+ * Without the journal, the b-tree is always internally consistent on
|
|
+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal
|
|
+ * but did handle unclean shutdowns by doing all index updates synchronously
|
|
+ * (with coalescing).
|
|
+ *
|
|
+ * Updates to interior nodes still happen synchronously and without the journal
|
|
+ * (for simplicity) - this may change eventually but updates to interior nodes
|
|
+ * are rare enough it's not a huge priority.
|
|
+ *
|
|
+ * This means the journal is relatively separate from the b-tree; it consists of
|
|
+ * just a list of keys and journal replay consists of just redoing those
|
|
+ * insertions in the same order that they appear in the journal.
|
|
+ *
|
|
+ * PERSISTENCE:
|
|
+ *
|
|
+ * For synchronous updates (where we're waiting on the index update to hit
|
|
+ * disk), the journal entry will be written out immediately (or as soon as
|
|
+ * possible, if the write for the previous journal entry was still in flight).
|
|
+ *
|
|
+ * Synchronous updates are specified by passing a closure (@flush_cl) to
|
|
+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter
|
|
+ * down to the journalling code. That closure will wait on the journal
|
|
+ * write to complete (via closure_wait()).
|
|
+ *
|
|
+ * If the index update wasn't synchronous, the journal entry will be
|
|
+ * written out after 10 ms have elapsed, by default (the delay_ms field
|
|
+ * in struct journal).
|
|
+ *
|
|
+ * JOURNAL ENTRIES:
|
|
+ *
|
|
+ * A journal entry is variable size (struct jset), it's got a fixed length
|
|
+ * header and then a variable number of struct jset_entry entries.
|
|
+ *
|
|
+ * Journal entries are identified by monotonically increasing 64 bit sequence
|
|
+ * numbers - jset->seq; other places in the code refer to this sequence number.
|
|
+ *
|
|
+ * A jset_entry entry contains one or more bkeys (which is what gets inserted
|
|
+ * into the b-tree). We need a container to indicate which b-tree the key is
|
|
+ * for; also, the roots of the various b-trees are stored in jset_entry entries
|
|
+ * (one for each b-tree) - this lets us add new b-tree types without changing
|
|
+ * the on disk format.
|
|
+ *
|
|
+ * We also keep some things in the journal header that are logically part of the
|
|
+ * superblock - all the things that are frequently updated. This is for future
|
|
+ * bcache on raw flash support; the superblock (which will become another
|
|
+ * journal) can't be moved or wear leveled, so it contains just enough
|
|
+ * information to find the main journal, and the superblock only has to be
|
|
+ * rewritten when we want to move/wear level the main journal.
|
|
+ *
|
|
+ * JOURNAL LAYOUT ON DISK:
|
|
+ *
|
|
+ * The journal is written to a ringbuffer of buckets (which is kept in the
|
|
+ * superblock); the individual buckets are not necessarily contiguous on disk
|
|
+ * which means that journal entries are not allowed to span buckets, but also
|
|
+ * that we can resize the journal at runtime if desired (unimplemented).
|
|
+ *
|
|
+ * The journal buckets exist in the same pool as all the other buckets that are
|
|
+ * managed by the allocator and garbage collection - garbage collection marks
|
|
+ * the journal buckets as metadata buckets.
|
|
+ *
|
|
+ * OPEN/DIRTY JOURNAL ENTRIES:
|
|
+ *
|
|
+ * Open/dirty journal entries are journal entries that contain b-tree updates
|
|
+ * that have not yet been written out to the b-tree on disk. We have to track
|
|
+ * which journal entries are dirty, and we also have to avoid wrapping around
|
|
+ * the journal and overwriting old but still dirty journal entries with new
|
|
+ * journal entries.
|
|
+ *
|
|
+ * On disk, this is represented with the "last_seq" field of struct jset;
|
|
+ * last_seq is the first sequence number that journal replay has to replay.
|
|
+ *
|
|
+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in
|
|
+ * journal_device->bucket_seq) from each journal bucket to the highest sequence
|
|
+ * number of any journal entry it contains. Then, by comparing that against
|
|
+ * last_seq we can determine whether that journal bucket contains dirty journal
|
|
+ * entries or not.
|
|
+ *
|
|
+ * To track which journal entries are dirty, we maintain a fifo of refcounts
|
|
+ * (where each entry corresponds to a specific sequence number) - when a ref
|
|
+ * goes to 0, that journal entry is no longer dirty.
|
|
+ *
|
|
+ * Journalling of index updates is done at the same time as the b-tree itself is
|
|
+ * being modified (see btree_insert_key()); when we add the key to the journal
|
|
+ * the pending b-tree write takes a ref on the journal entry the key was added
|
|
+ * to. If a pending b-tree write would need to take refs on multiple dirty
|
|
+ * journal entries, it only keeps the ref on the oldest one (since a newer
|
|
+ * journal entry will still be replayed if an older entry was dirty).
|
|
+ *
|
|
+ * JOURNAL FILLING UP:
|
|
+ *
|
|
+ * There are two ways the journal could fill up; either we could run out of
|
|
+ * space to write to, or we could have too many open journal entries and run out
|
|
+ * of room in the fifo of refcounts. Since those refcounts are decremented
|
|
+ * without any locking we can't safely resize that fifo, so we handle it the
|
|
+ * same way.
|
|
+ *
|
|
+ * If the journal fills up, we start flushing dirty btree nodes until we can
|
|
+ * allocate space for a journal write again - preferentially flushing btree
|
|
+ * nodes that are pinning the oldest journal entries first.
|
|
+ */
|
|
+
|
|
+#include <linux/hash.h>
|
|
+
|
|
+#include "journal_types.h"
|
|
+
|
|
+struct bch_fs;
|
|
+
|
|
+static inline void journal_wake(struct journal *j) /* wake all three journal wait points */
|
|
+{
|
|
+	wake_up(&j->wait);			/* the j->wait waitqueue */
|
|
+	closure_wake_up(&j->async_wait);	/* closures on the async_wait list */
|
|
+	closure_wake_up(&j->preres_wait);	/* closures on the preres_wait list */
|
|
+}
|
|
+
|
|
+/* The journal buffer selected by the idx field of the reservation state. */
|
|
+static inline struct journal_buf *journal_cur_buf(struct journal *j)
|
|
+{
|
|
+	return &j->buf[j->reservations.idx];
|
|
+}
|
|
+
|
|
+/* Sequence number of oldest dirty journal entry */
|
|
+
|
|
+static inline u64 journal_last_seq(struct journal *j)
|
|
+{
|
|
+ return j->pin.front;
|
|
+}
|
|
+
|
|
+static inline u64 journal_cur_seq(struct journal *j)
|
|
+{
|
|
+ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
|
|
+
|
|
+ return j->pin.back - 1;
|
|
+}
|
|
+
|
|
+u64 bch2_inode_journal_seq(struct journal *, u64);
|
|
+void bch2_journal_set_has_inum(struct journal *, u64, u64);
|
|
+
|
|
+/*
|
|
+ * Return the reference count that the packed reservation state @s holds for
|
|
+ * journal buffer @idx. Only indices 0-3 are handled; anything else hits BUG().
|
|
+ */
|
|
+static inline int journal_state_count(union journal_res_state s, int idx)
|
|
+{
|
|
+	if (idx == 0)
|
|
+		return s.buf0_count;
|
|
+	if (idx == 1)
|
|
+		return s.buf1_count;
|
|
+	if (idx == 2)
|
|
+		return s.buf2_count;
|
|
+	if (idx == 3)
|
|
+		return s.buf3_count;
|
|
+
|
|
+	BUG();
|
|
+}
|
|
+
|
|
+/* Bump the reference count of the buffer that @s->idx currently selects. */
|
|
+static inline void journal_state_inc(union journal_res_state *s)
|
|
+{
|
|
+	switch (s->idx) {
|
|
+	case 0:
|
|
+		s->buf0_count++;
|
|
+		break;
|
|
+	case 1:
|
|
+		s->buf1_count++;
|
|
+		break;
|
|
+	case 2:
|
|
+		s->buf2_count++;
|
|
+		break;
|
|
+	case 3:
|
|
+		s->buf3_count++;
|
|
+		break;
|
|
+	}
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Record @inum in the has_inode bitmap of the journal buffer that @res points
|
|
+ * into. The inode number is hashed down to a bit index within the bitmap.
|
|
+ */
|
|
+static inline void bch2_journal_set_has_inode(struct journal *j,
|
|
+					      struct journal_res *res,
|
|
+					      u64 inum)
|
|
+{
|
|
+	struct journal_buf *jbuf = j->buf + res->idx;
|
|
+	unsigned long slot = hash_64(inum, ilog2(sizeof(jbuf->has_inode) * 8));
|
|
+
|
|
+	/* already set: skip the atomic op */
|
|
+	if (likely(test_bit(slot, jbuf->has_inode)))
|
|
+		return;
|
|
+
|
|
+	set_bit(slot, jbuf->has_inode);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Number of u64s that an entry carrying @u64s u64s of payload occupies in the
|
|
+ * journal, i.e. the payload plus the struct jset_entry header in front of it.
|
|
+ */
|
|
+static inline unsigned jset_u64s(unsigned u64s)
|
|
+{
|
|
+	unsigned header_u64s = sizeof(struct jset_entry) / sizeof(u64);
|
|
+
|
|
+	return header_u64s + u64s;
|
|
+}
|
|
+
|
|
+/* u64s taken by the struct jset header plus j->entry_u64s_reserved. */
|
|
+static inline int journal_entry_overhead(struct journal *j)
|
|
+{
|
|
+	size_t jset_header_u64s = sizeof(struct jset) / sizeof(u64);
|
|
+
|
|
+	return jset_header_u64s + j->entry_u64s_reserved;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Append a zeroed jset_entry with room for @u64s u64s of payload at the
|
|
+ * current end of @buf's jset, grow the jset accordingly, and return the new
|
|
+ * entry. Only the entry header is cleared here.
|
|
+ */
|
|
+static inline struct jset_entry *
|
|
+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
|
|
+{
|
|
+	struct jset *js = buf->data;
|
|
+	struct jset_entry *e = vstruct_idx(js, le32_to_cpu(js->u64s));
|
|
+
|
|
+	memset(e, 0, sizeof(struct jset_entry));
|
|
+	e->u64s = cpu_to_le16(u64s);
|
|
+
|
|
+	/* account for header + payload in the jset's length */
|
|
+	le32_add_cpu(&js->u64s, jset_u64s(u64s));
|
|
+
|
|
+	return e;
|
|
+}
|
|
+
|
|
+static inline struct jset_entry *
|
|
+journal_res_entry(struct journal *j, struct journal_res *res)
|
|
+{
|
|
+ return vstruct_idx(j->buf[res->idx].data, res->offset);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Fill in @entry: header fields from @type, @id and @level, zeroed padding,
|
|
+ * and @u64s u64s of payload copied from @data.
|
|
+ *
|
|
+ * Returns the total number of u64s the entry occupies, header included.
|
|
+ */
|
|
+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type,
|
|
+					 enum btree_id id, unsigned level,
|
|
+					 const void *data, unsigned u64s)
|
|
+{
|
|
+	entry->type	= type;
|
|
+	entry->btree_id	= id;
|
|
+	entry->level	= level;
|
|
+	entry->u64s	= cpu_to_le16(u64s);
|
|
+
|
|
+	entry->pad[0]	= 0;
|
|
+	entry->pad[1]	= 0;
|
|
+	entry->pad[2]	= 0;
|
|
+
|
|
+	memcpy_u64s_small(entry->_data, data, u64s);
|
|
+
|
|
+	return jset_u64s(u64s);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Write one entry at the current position of reservation @res and advance the
|
|
+ * reservation past it: res->offset grows and res->u64s shrinks by the space
|
|
+ * the entry consumed.
|
|
+ */
|
|
+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
|
|
+					  unsigned type, enum btree_id id,
|
|
+					  unsigned level,
|
|
+					  const void *data, unsigned u64s)
|
|
+{
|
|
+	struct jset_entry *dst = journal_res_entry(j, res);
|
|
+	unsigned consumed = journal_entry_set(dst, type, id, level, data, u64s);
|
|
+
|
|
+	EBUG_ON(!res->ref);
|
|
+	EBUG_ON(consumed > res->u64s);
|
|
+
|
|
+	res->offset	+= consumed;
|
|
+	res->u64s	-= consumed;
|
|
+}
|
|
+
|
|
+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
|
|
+ enum btree_id id, unsigned level,
|
|
+ const struct bkey_i *k)
|
|
+{
|
|
+ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
|
|
+ id, level, k, k->k.u64s);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * A jset counts as empty when its seq equals its last_seq and none of its
|
|
+ * btree_keys entries carries any payload.
|
|
+ */
|
|
+static inline bool journal_entry_empty(struct jset *j)
|
|
+{
|
|
+	struct jset_entry *entry;
|
|
+
|
|
+	if (j->seq != j->last_seq)
|
|
+		return false;
|
|
+
|
|
+	vstruct_for_each(j, entry) {
|
|
+		if (entry->type == BCH_JSET_ENTRY_btree_keys) {
|
|
+			if (entry->u64s)
|
|
+				return false;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	return true;
|
|
+}
|
|
+
|
|
+void __bch2_journal_buf_put(struct journal *);
|
|
+
|
|
+/*
|
|
+ * Drop one reference on journal buffer @idx. If that was the last reference
|
|
+ * and @idx is the unwritten_idx buffer, hand off to __bch2_journal_buf_put().
|
|
+ */
|
|
+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx)
|
|
+{
|
|
+	/* a state word holding a count of 1 for buffer @idx only */
|
|
+	union journal_res_state one_ref = {
|
|
+		.buf0_count = idx == 0,
|
|
+		.buf1_count = idx == 1,
|
|
+		.buf2_count = idx == 2,
|
|
+		.buf3_count = idx == 3,
|
|
+	};
|
|
+	union journal_res_state now;
|
|
+
|
|
+	now.v = atomic64_sub_return(one_ref.v, &j->reservations.counter);
|
|
+
|
|
+	EBUG_ON(((now.idx - idx) & 3) >
|
|
+		((now.idx - now.unwritten_idx) & 3));
|
|
+
|
|
+	if (journal_state_count(now, idx))
|
|
+		return;
|
|
+	if (idx != now.unwritten_idx)
|
|
+		return;
|
|
+
|
|
+	__bch2_journal_buf_put(j);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Release journal reservation @res. Space in the reservation that was never
|
|
+ * used is filled with empty btree_keys entries, then the reference on the
|
|
+ * journal buffer is dropped. Does nothing if @res holds no reference.
|
|
+ */
|
|
+static inline void bch2_journal_res_put(struct journal *j,
|
|
+					struct journal_res *res)
|
|
+{
|
|
+	if (res->ref) {
|
|
+		lock_release(&j->res_map, _THIS_IP_);
|
|
+
|
|
+		/* each empty entry consumes one header's worth of u64s */
|
|
+		while (res->u64s != 0)
|
|
+			bch2_journal_add_entry(j, res,
|
|
+					       BCH_JSET_ENTRY_btree_keys,
|
|
+					       0, 0, NULL, 0);
|
|
+
|
|
+		bch2_journal_buf_put(j, res->idx);
|
|
+
|
|
+		res->ref = 0;
|
|
+	}
|
|
+}
|
|
+
|
|
+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
|
|
+ unsigned);
|
|
+
|
|
+#define JOURNAL_RES_GET_NONBLOCK (1 << 0)
|
|
+#define JOURNAL_RES_GET_CHECK (1 << 1)
|
|
+#define JOURNAL_RES_GET_RESERVED (1 << 2)
|
|
+
|
|
+/*
|
|
+ * Lockless attempt to take a reservation of res->u64s u64s in the currently
|
|
+ * open journal entry, via a cmpxchg loop on the packed reservation state.
|
|
+ *
|
|
+ * Returns 1 on success (with @res filled in, unless JOURNAL_RES_GET_CHECK was
|
|
+ * passed, in which case nothing is modified) and 0 if the caller has to take
|
|
+ * another path.
|
|
+ */
|
|
+static inline int journal_res_get_fast(struct journal *j,
|
|
+				       struct journal_res *res,
|
|
+				       unsigned flags)
|
|
+{
|
|
+	union journal_res_state prev, next;
|
|
+	u64 v = atomic64_read(&j->reservations.counter);
|
|
+
|
|
+	do {
|
|
+		prev.v = next.v = v;
|
|
+
|
|
+		/* Does the request still fit in the current journal entry? */
|
|
+		if (next.cur_entry_offset + res->u64s > j->cur_entry_u64s)
|
|
+			return 0;
|
|
+
|
|
+		EBUG_ON(!journal_state_count(next, next.idx));
|
|
+
|
|
+		/* unreserved requests need JOURNAL_MAY_GET_UNRESERVED set */
|
|
+		if (!((flags & JOURNAL_RES_GET_RESERVED) ||
|
|
+		      test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)))
|
|
+			return 0;
|
|
+
|
|
+		next.cur_entry_offset += res->u64s;
|
|
+		journal_state_inc(&next);
|
|
+
|
|
+		/*
|
|
+		 * A count of zero after the increment means the refcount
|
|
+		 * would overflow, so we have to wait:
|
|
+		 * XXX - tracepoint this:
|
|
+		 */
|
|
+		if (!journal_state_count(next, next.idx))
|
|
+			return 0;
|
|
+
|
|
+		if (flags & JOURNAL_RES_GET_CHECK)
|
|
+			return 1;
|
|
+	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
|
+				       prev.v, next.v)) != prev.v);
|
|
+
|
|
+	/* we own [prev.cur_entry_offset, +res->u64s) in buffer prev.idx */
|
|
+	res->ref	= true;
|
|
+	res->idx	= prev.idx;
|
|
+	res->offset	= prev.cur_entry_offset;
|
|
+	res->seq	= le64_to_cpu(j->buf[prev.idx].data->seq);
|
|
+	return 1;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Get a journal reservation of @u64s u64s: try the lockless fast path first
|
|
+ * and fall back to bch2_journal_res_get_slowpath().
|
|
+ *
|
|
+ * Returns 0 on success or the slowpath's nonzero return value. Unless
|
|
+ * JOURNAL_RES_GET_CHECK is set, success is also recorded on j->res_map for
|
|
+ * lockdep.
|
|
+ */
|
|
+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
|
|
+				       unsigned u64s, unsigned flags)
|
|
+{
|
|
+	EBUG_ON(res->ref);
|
|
+	EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
|
|
+
|
|
+	res->u64s = u64s;
|
|
+
|
|
+	if (!journal_res_get_fast(j, res, flags)) {
|
|
+		int ret = bch2_journal_res_get_slowpath(j, res, flags);
|
|
+
|
|
+		if (ret)
|
|
+			return ret;
|
|
+	}
|
|
+
|
|
+	if (flags & JOURNAL_RES_GET_CHECK)
|
|
+		return 0;
|
|
+
|
|
+	lock_acquire_shared(&j->res_map, 0,
|
|
+			    !!(flags & JOURNAL_RES_GET_NONBLOCK),
|
|
+			    NULL, _THIS_IP_);
|
|
+	EBUG_ON(!res->ref);
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/* journal_preres: */
|
|
+
|
|
+/*
|
|
+ * Recompute whether unreserved journal reservations may currently be taken
|
|
+ * (prereserved space not exhausted and more than 8 free slots in the pin
|
|
+ * fifo) and bring JOURNAL_MAY_GET_UNRESERVED in line with the result, waking
|
|
+ * journal waiters when the flag becomes set. Caller must hold j->lock.
|
|
+ */
|
|
+static inline bool journal_check_may_get_unreserved(struct journal *j)
|
|
+{
|
|
+	union journal_preres_state s = READ_ONCE(j->prereserved);
|
|
+	bool ret = s.reserved < s.remaining &&
|
|
+		fifo_free(&j->pin) > 8;
|
|
+
|
|
+	lockdep_assert_held(&j->lock);
|
|
+
|
|
+	/* flag already matches: nothing to update */
|
|
+	if (ret == test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
|
|
+		return ret;
|
|
+
|
|
+	if (!ret) {
|
|
+		clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
|
|
+	} else {
|
|
+		set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags);
|
|
+		journal_wake(j);
|
|
+	}
|
|
+
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Give back the pre-reservation held in @res (no-op if it holds none). After
|
|
+ * the release, wake preres waiters if the waiting bit was set, and re-check
|
|
+ * JOURNAL_MAY_GET_UNRESERVED under j->lock if it is clear and reserved no
|
|
+ * longer exceeds remaining.
|
|
+ */
|
|
+static inline void bch2_journal_preres_put(struct journal *j,
|
|
+					   struct journal_preres *res)
|
|
+{
|
|
+	union journal_preres_state s = { .reserved = res->u64s };
|
|
+
|
|
+	if (!res->u64s)
|
|
+		return;
|
|
+
|
|
+	/* s becomes the state as it is after our subtraction */
|
|
+	s.v = atomic64_sub_return(s.v, &j->prereserved.counter);
|
|
+	res->u64s = 0;
|
|
+
|
|
+	if (unlikely(s.waiting)) {
|
|
+		/* bit position of the waiting field inside the packed word */
|
|
+		clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)),
|
|
+			  (unsigned long *) &j->prereserved.v);
|
|
+		closure_wake_up(&j->preres_wait);
|
|
+	}
|
|
+
|
|
+	if (s.reserved > s.remaining)
|
|
+		return;
|
|
+	if (test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags))
|
|
+		return;
|
|
+
|
|
+	spin_lock(&j->lock);
|
|
+	journal_check_may_get_unreserved(j);
|
|
+	spin_unlock(&j->lock);
|
|
+}
|
|
+
|
|
+int __bch2_journal_preres_get(struct journal *,
|
|
+ struct journal_preres *, unsigned, unsigned);
|
|
+
|
|
+/*
|
|
+ * Lockless attempt to grow pre-reservation @res to @new_u64s, via a cmpxchg
|
|
+ * loop on the packed prereserved state.
|
|
+ *
|
|
+ * Returns 1 if the extra space was reserved. Otherwise returns 0; in that
|
|
+ * case, if @set_waiting is true and the waiting bit was clear, the waiting
|
|
+ * bit has been set.
|
|
+ */
|
|
+static inline int bch2_journal_preres_get_fast(struct journal *j,
|
|
+					       struct journal_preres *res,
|
|
+					       unsigned new_u64s,
|
|
+					       unsigned flags,
|
|
+					       bool set_waiting)
|
|
+{
|
|
+	int delta = new_u64s - res->u64s;
|
|
+	union journal_preres_state prev, next;
|
|
+	u64 v = atomic64_read(&j->prereserved.counter);
|
|
+	int granted;
|
|
+
|
|
+	do {
|
|
+		prev.v = next.v = v;
|
|
+		granted = 0;
|
|
+
|
|
+		if ((flags & JOURNAL_RES_GET_RESERVED) ||
|
|
+		    next.reserved + delta < next.remaining) {
|
|
+			next.reserved += delta;
|
|
+			granted = 1;
|
|
+		} else if (set_waiting && !next.waiting) {
|
|
+			next.waiting = true;
|
|
+		} else {
|
|
+			/* nothing to change, so no cmpxchg needed */
|
|
+			return 0;
|
|
+		}
|
|
+	} while ((v = atomic64_cmpxchg(&j->prereserved.counter,
|
|
+				       prev.v, next.v)) != prev.v);
|
|
+
|
|
+	if (granted)
|
|
+		res->u64s += delta;
|
|
+	return granted;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Make sure @res holds a pre-reservation of at least @new_u64s.
|
|
+ *
|
|
+ * Returns 0 if it already does or the fast path succeeds, -EAGAIN if the fast
|
|
+ * path fails and JOURNAL_RES_GET_NONBLOCK is set, and otherwise whatever
|
|
+ * __bch2_journal_preres_get() returns.
|
|
+ */
|
|
+static inline int bch2_journal_preres_get(struct journal *j,
|
|
+					  struct journal_preres *res,
|
|
+					  unsigned new_u64s,
|
|
+					  unsigned flags)
|
|
+{
|
|
+	if (new_u64s <= res->u64s ||
|
|
+	    bch2_journal_preres_get_fast(j, res, new_u64s, flags, false))
|
|
+		return 0;
|
|
+
|
|
+	return (flags & JOURNAL_RES_GET_NONBLOCK)
|
|
+		? -EAGAIN
|
|
+		: __bch2_journal_preres_get(j, res, new_u64s, flags);
|
|
+}
|
|
+
|
|
+/* journal_entry_res: */
|
|
+
|
|
+void bch2_journal_entry_res_resize(struct journal *,
|
|
+ struct journal_entry_res *,
|
|
+ unsigned);
|
|
+
|
|
+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
|
|
+void bch2_journal_flush_async(struct journal *, struct closure *);
|
|
+
|
|
+int bch2_journal_flush_seq(struct journal *, u64);
|
|
+int bch2_journal_flush(struct journal *);
|
|
+int bch2_journal_meta(struct journal *);
|
|
+
|
|
+void bch2_journal_halt(struct journal *);
|
|
+
|
|
+/*
|
|
+ * Returns -EIO if the reservation state's cur_entry_offset holds
|
|
+ * JOURNAL_ENTRY_ERROR_VAL, 0 otherwise.
|
|
+ */
|
|
+static inline int bch2_journal_error(struct journal *j)
|
|
+{
|
|
+	if (j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
|
|
+		return -EIO;
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+struct bch_dev;
|
|
+
|
|
+static inline void bch2_journal_set_replay_done(struct journal *j)
|
|
+{
|
|
+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
|
|
+ set_bit(JOURNAL_REPLAY_DONE, &j->flags);
|
|
+}
|
|
+
|
|
+void bch2_journal_unblock(struct journal *);
|
|
+void bch2_journal_block(struct journal *);
|
|
+
|
|
+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
|
|
+void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
|
|
+void bch2_journal_pins_to_text(struct printbuf *, struct journal *);
|
|
+
|
|
+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
|
|
+ unsigned nr);
|
|
+int bch2_dev_journal_alloc(struct bch_dev *);
|
|
+
|
|
+void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
|
|
+
|
|
+void bch2_fs_journal_stop(struct journal *);
|
|
+int bch2_fs_journal_start(struct journal *, u64, struct list_head *);
|
|
+
|
|
+void bch2_dev_journal_exit(struct bch_dev *);
|
|
+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
|
|
+void bch2_fs_journal_exit(struct journal *);
|
|
+int bch2_fs_journal_init(struct journal *);
|
|
+
|
|
+#endif /* _BCACHEFS_JOURNAL_H */
|
|
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
|
|
new file mode 100644
|
|
index 000000000000..c7fa03cfbde6
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal_io.c
|
|
@@ -0,0 +1,1556 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "buckets.h"
|
|
+#include "checksum.h"
|
|
+#include "disk_groups.h"
|
|
+#include "error.h"
|
|
+#include "io.h"
|
|
+#include "journal.h"
|
|
+#include "journal_io.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "journal_seq_blacklist.h"
|
|
+#include "replicas.h"
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/* Unlink @i from its list and free it (header plus the embedded jset). */
|
|
+static void __journal_replay_free(struct journal_replay *i)
|
|
+{
|
|
+	size_t alloc_bytes = offsetof(struct journal_replay, j) +
|
|
+		vstruct_bytes(&i->j);
|
|
+
|
|
+	list_del(&i->list);
|
|
+	kvpfree(i, alloc_bytes);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Mark @i as ignored. It is only actually freed when the read_entire_journal
|
|
+ * option is off; otherwise it stays on the list, flagged.
|
|
+ */
|
|
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
|
|
+{
|
|
+	i->ignore = true;
|
|
+
|
|
+	if (c->opts.read_entire_journal)
|
|
+		return;
|
|
+
|
|
+	__journal_replay_free(i);
|
|
+}
|
|
+
|
|
+struct journal_list {
|
|
+ struct closure cl;
|
|
+ struct mutex lock;
|
|
+ struct list_head *head;
|
|
+ int ret;
|
|
+};
|
|
+
|
|
+#define JOURNAL_ENTRY_ADD_OK 0
|
|
+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5
|
|
+
|
|
+/*
|
|
+ * Add journal entry @j, just read from @ca at @entry_ptr, to the seq-ordered
|
|
+ * list of entries to be replayed.
|
|
+ *
|
|
+ * Unless read_entire_journal is set, an entry older than the last_seq of the
|
|
+ * newest flush entry on the list is rejected. A flush entry prunes list
|
|
+ * entries older than its own last_seq. If an entry with the same seq is
|
|
+ * already listed, a copy without a checksum error is preferred, and
|
|
+ * @entry_ptr is recorded on whichever copy ends up on the list.
|
|
+ *
|
|
+ * Returns JOURNAL_ENTRY_ADD_OK, JOURNAL_ENTRY_ADD_OUT_OF_RANGE, -ENOMEM, or
|
|
+ * whatever fsck_err_on() leaves in ret.
|
|
+ */
|
|
+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
|
|
+			     struct bch_extent_ptr entry_ptr,
|
|
+			     struct journal_list *jlist, struct jset *j,
|
|
+			     bool bad)
|
|
+{
|
|
+	struct journal_replay *i, *tmp, *dup = NULL;
|
|
+	struct bch_extent_ptr *ptr;
|
|
+	struct list_head *after = jlist->head;
|
|
+	size_t jset_bytes = vstruct_bytes(j);
|
|
+	u64 seq = le64_to_cpu(j->seq);
|
|
+	u64 last_seq = 0;
|
|
+	int ret = JOURNAL_ENTRY_ADD_OK;
|
|
+
|
|
+	/* last_seq of the newest flush entry already on the list: */
|
|
+	list_for_each_entry_reverse(i, jlist->head, list)
|
|
+		if (!JSET_NO_FLUSH(&i->j)) {
|
|
+			last_seq = le64_to_cpu(i->j.last_seq);
|
|
+			break;
|
|
+		}
|
|
+
|
|
+	/* Is this entry older than the range we need? */
|
|
+	if (!c->opts.read_entire_journal && seq < last_seq)
|
|
+		return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
|
|
+
|
|
+	/* A flush entry makes everything before its last_seq unneeded: */
|
|
+	if (!JSET_NO_FLUSH(j))
|
|
+		list_for_each_entry_safe(i, tmp, jlist->head, list) {
|
|
+			if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
|
|
+				break;
|
|
+			journal_replay_free(c, i);
|
|
+		}
|
|
+
|
|
+	/* Insert after the newest listed entry with a smaller seq: */
|
|
+	list_for_each_entry_reverse(i, jlist->head, list)
|
|
+		if (seq > le64_to_cpu(i->j.seq)) {
|
|
+			after = &i->list;
|
|
+			break;
|
|
+		}
|
|
+
|
|
+	/* The entry following the insert point may be a copy of this seq: */
|
|
+	if (after->next != jlist->head) {
|
|
+		dup = container_of(after->next, struct journal_replay, list);
|
|
+		if (seq != le64_to_cpu(dup->j.seq))
|
|
+			dup = NULL;
|
|
+	}
|
|
+
|
|
+	/*
|
|
+	 * Duplicate journal entries? Keep an existing copy that had no
|
|
+	 * checksum error; a bad existing copy is replaced below.
|
|
+	 */
|
|
+	if (dup && !dup->bad) {
|
|
+		if (!bad) {
|
|
+			fsck_err_on(jset_bytes != vstruct_bytes(&dup->j) ||
|
|
+				    memcmp(j, &dup->j, jset_bytes), c,
|
|
+				    "found duplicate but non identical journal entries (seq %llu)",
|
|
+				    seq);
|
|
+		}
|
|
+		i = dup;
|
|
+		goto found;
|
|
+	}
|
|
+
|
|
+	i = kvpmalloc(offsetof(struct journal_replay, j) + jset_bytes, GFP_KERNEL);
|
|
+	if (!i)
|
|
+		return -ENOMEM;
|
|
+
|
|
+	i->nr_ptrs	= 0;
|
|
+	i->bad		= bad;
|
|
+	i->ignore	= false;
|
|
+	memcpy(&i->j, j, jset_bytes);
|
|
+
|
|
+	if (dup) {
|
|
+		/* carry over the locations recorded on the copy we replace */
|
|
+		i->nr_ptrs = dup->nr_ptrs;
|
|
+		memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
|
|
+		__journal_replay_free(dup);
|
|
+	}
|
|
+
|
|
+	list_add(&i->list, after);
|
|
+found:
|
|
+	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++)
|
|
+		if (ptr->dev == ca->dev_idx) {
|
|
+			bch_err(c, "duplicate journal entry %llu on same device",
|
|
+				le64_to_cpu(i->j.seq));
|
|
+			return ret;
|
|
+		}
|
|
+
|
|
+	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
|
|
+		bch_err(c, "found too many copies of journal entry %llu",
|
|
+			le64_to_cpu(i->j.seq));
|
|
+		return ret;
|
|
+	}
|
|
+
|
|
+	i->ptrs[i->nr_ptrs++] = entry_ptr;
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+static struct nonce journal_nonce(const struct jset *jset)
|
|
+{
|
|
+ return (struct nonce) {{
|
|
+ [0] = 0,
|
|
+ [1] = ((__le32 *) &jset->seq)[0],
|
|
+ [2] = ((__le32 *) &jset->seq)[1],
|
|
+ [3] = BCH_NONCE_JOURNAL,
|
|
+ }};
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Overwrite [@start, @end) with empty jset_entries. Each zeroed header has a
|
|
+ * u64s of 0, so vstruct_next() advances one header at a time.
|
|
+ */
|
|
+static void journal_entry_null_range(void *start, void *end)
|
|
+{
|
|
+	struct jset_entry *e = start;
|
|
+
|
|
+	while (e != end) {
|
|
+		memset(e, 0, sizeof(*e));
|
|
+		e = vstruct_next(e);
|
|
+	}
|
|
+}
|
|
+
|
|
+#define JOURNAL_ENTRY_REREAD 5
|
|
+#define JOURNAL_ENTRY_NONE 6
|
|
+#define JOURNAL_ENTRY_BAD 7
|
|
+
|
|
+#define journal_entry_err(c, msg, ...) \
|
|
+({ \
|
|
+ switch (write) { \
|
|
+ case READ: \
|
|
+ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \
|
|
+ break; \
|
|
+ case WRITE: \
|
|
+ bch_err(c, "corrupt metadata before write:\n" \
|
|
+ msg, ##__VA_ARGS__); \
|
|
+ if (bch2_fs_inconsistent(c)) { \
|
|
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
|
|
+ goto fsck_err; \
|
|
+ } \
|
|
+ break; \
|
|
+ } \
|
|
+ true; \
|
|
+})
|
|
+
|
|
+#define journal_entry_err_on(cond, c, msg, ...) \
|
|
+ ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false)
|
|
+
|
|
+#define FSCK_DELETED_KEY 5
|
|
+
|
|
+/*
|
|
+ * Validate key @k, which lives inside journal entry @entry.
|
|
+ *
|
|
+ * A key with a zero size, or one that runs past the end of the entry, causes
|
|
+ * the entry to be truncated at @k. A key with a bad format, or one rejected by
|
|
+ * bch2_bkey_invalid(), is cut out and the keys after it are moved down. In
|
|
+ * all of those cases the space freed at the end of the entry is filled with
|
|
+ * empty entries and FSCK_DELETED_KEY is returned.
|
|
+ *
|
|
+ * Returns 0 if the key was kept; the error macros may also jump to fsck_err
|
|
+ * with ret set.
|
|
+ */
|
|
+static int journal_validate_key(struct bch_fs *c, const char *where,
|
|
+				struct jset_entry *entry,
|
|
+				unsigned level, enum btree_id btree_id,
|
|
+				struct bkey_i *k, const char *type,
|
|
+				unsigned version, int big_endian, int write)
|
|
+{
|
|
+	/* end of the entry as it was on entry to this function */
|
|
+	void *entry_end = vstruct_next(entry);
|
|
+	const char *why;
|
|
+	int ret = 0;
|
|
+
|
|
+	if (journal_entry_err_on(!k->k.u64s, c,
|
|
+			"invalid %s in %s entry offset %zi/%u: k->u64s 0",
|
|
+			type, where,
|
|
+			(u64 *) k - entry->_data,
|
|
+			le16_to_cpu(entry->u64s))) {
|
|
+		/* truncate the entry at @k */
|
|
+		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
|
|
+		journal_entry_null_range(vstruct_next(entry), entry_end);
|
|
+		return FSCK_DELETED_KEY;
|
|
+	}
|
|
+
|
|
+	if (journal_entry_err_on((void *) bkey_next(k) >
|
|
+				(void *) vstruct_next(entry), c,
|
|
+			"invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
|
|
+			type, where,
|
|
+			(u64 *) k - entry->_data,
|
|
+			le16_to_cpu(entry->u64s))) {
|
|
+		/* truncate the entry at @k */
|
|
+		entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
|
|
+		journal_entry_null_range(vstruct_next(entry), entry_end);
|
|
+		return FSCK_DELETED_KEY;
|
|
+	}
|
|
+
|
|
+	if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
|
|
+			"invalid %s in %s entry offset %zi/%u: bad format %u",
|
|
+			type, where,
|
|
+			(u64 *) k - entry->_data,
|
|
+			le16_to_cpu(entry->u64s),
|
|
+			k->k.format)) {
|
|
+		/* cut @k out and close the gap */
|
|
+		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
|
|
+		memmove(k, bkey_next(k), entry_end - (void *) bkey_next(k));
|
|
+		journal_entry_null_range(vstruct_next(entry), entry_end);
|
|
+		return FSCK_DELETED_KEY;
|
|
+	}
|
|
+
|
|
+	/* on read, the compat pass runs before the validity check */
|
|
+	if (!write)
|
|
+		bch2_bkey_compat(level, btree_id, version, big_endian,
|
|
+				 write, NULL, bkey_to_packed(k));
|
|
+
|
|
+	why = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
|
|
+				__btree_node_type(level, btree_id));
|
|
+	if (why) {
|
|
+		char buf[160];
|
|
+
|
|
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
|
|
+		mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
|
|
+				 type, where,
|
|
+				 (u64 *) k - entry->_data,
|
|
+				 le16_to_cpu(entry->u64s),
|
|
+				 why, buf);
|
|
+
|
|
+		/* cut @k out and close the gap */
|
|
+		le16_add_cpu(&entry->u64s, -((u16) k->k.u64s));
|
|
+		memmove(k, bkey_next(k), entry_end - (void *) bkey_next(k));
|
|
+		journal_entry_null_range(vstruct_next(entry), entry_end);
|
|
+		return FSCK_DELETED_KEY;
|
|
+	}
|
|
+
|
|
+	/* on write, the compat pass runs after the validity check */
|
|
+	if (write)
|
|
+		bch2_bkey_compat(level, btree_id, version, big_endian,
|
|
+				 write, NULL, bkey_to_packed(k));
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate every key in a btree_keys entry. When journal_validate_key()
|
|
+ * reports FSCK_DELETED_KEY the same position is examined again rather than
|
|
+ * advancing. Always returns 0.
|
|
+ */
|
|
+static int journal_entry_validate_btree_keys(struct bch_fs *c,
|
|
+					     const char *where,
|
|
+					     struct jset_entry *entry,
|
|
+					     unsigned version, int big_endian, int write)
|
|
+{
|
|
+	struct bkey_i *k;
|
|
+
|
|
+	for (k = entry->start; k != vstruct_last(entry); ) {
|
|
+		int ret = journal_validate_key(c, where, entry,
|
|
+					       entry->level,
|
|
+					       entry->btree_id,
|
|
+					       k, "key", version, big_endian, write);
|
|
+
|
|
+		if (ret != FSCK_DELETED_KEY)
|
|
+			k = bkey_next(k);
|
|
+	}
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate a btree_root entry, which must hold exactly one key. An entry of
|
|
+ * the wrong size keeps its header but has its contents dropped; otherwise the
|
|
+ * key is validated with journal_validate_key() at level 1.
|
|
+ */
|
|
+static int journal_entry_validate_btree_root(struct bch_fs *c,
|
|
+					     const char *where,
|
|
+					     struct jset_entry *entry,
|
|
+					     unsigned version, int big_endian, int write)
|
|
+{
|
|
+	struct bkey_i *root = entry->start;
|
|
+	void *old_end;
|
|
+	int ret = 0;
|
|
+
|
|
+	if (!journal_entry_err_on(!entry->u64s ||
|
|
+				  le16_to_cpu(entry->u64s) != root->k.u64s, c,
|
|
+				  "invalid btree root journal entry: wrong number of keys"))
|
|
+		return journal_validate_key(c, where, entry, 1, entry->btree_id, root,
|
|
+					    "btree root", version, big_endian, write);
|
|
+
|
|
+	/*
|
|
+	 * we don't want to null out this jset_entry,
|
|
+	 * just the contents, so that later we can tell
|
|
+	 * we were _supposed_ to have a btree root
|
|
+	 */
|
|
+	old_end = vstruct_next(entry);
|
|
+	entry->u64s = 0;
|
|
+	journal_entry_null_range(vstruct_next(entry), old_end);
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
|
|
+ const char *where,
|
|
+ struct jset_entry *entry,
|
|
+ unsigned version, int big_endian, int write)
|
|
+{
|
|
+ /* obsolete, don't care: */
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int journal_entry_validate_blacklist(struct bch_fs *c,
|
|
+ const char *where,
|
|
+ struct jset_entry *entry,
|
|
+ unsigned version, int big_endian, int write)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c,
|
|
+ "invalid journal seq blacklist entry: bad size")) {
|
|
+ journal_entry_null_range(entry, vstruct_next(entry));
|
|
+ }
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate a blacklist_v2 entry: its payload must be exactly 2 u64s and its
|
|
+ * start must not exceed its end. An entry failing either check is replaced
|
|
+ * with empty entries.
|
|
+ */
|
|
+static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
|
|
+					       const char *where,
|
|
+					       struct jset_entry *entry,
|
|
+					       unsigned version, int big_endian, int write)
|
|
+{
|
|
+	struct jset_entry_blacklist_v2 *bl =
|
|
+		container_of(entry, struct jset_entry_blacklist_v2, entry);
|
|
+	int ret = 0;
|
|
+
|
|
+	/* start/end are only looked at once the size check has passed */
|
|
+	if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c,
|
|
+		"invalid journal seq blacklist entry: bad size"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+	else if (journal_entry_err_on(le64_to_cpu(bl->start) >
|
|
+				      le64_to_cpu(bl->end), c,
|
|
+		"invalid journal seq blacklist entry: start > end"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate a usage entry: it must be at least as large as struct
|
|
+ * jset_entry_usage, otherwise it is replaced with empty entries.
|
|
+ */
|
|
+static int journal_entry_validate_usage(struct bch_fs *c,
|
|
+					const char *where,
|
|
+					struct jset_entry *entry,
|
|
+					unsigned version, int big_endian, int write)
|
|
+{
|
|
+	struct jset_entry_usage *usage =
|
|
+		container_of(entry, struct jset_entry_usage, entry);
|
|
+	/* size of the whole entry, header included */
|
|
+	unsigned entry_bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
|
|
+	int ret = 0;
|
|
+
|
|
+	if (journal_entry_err_on(entry_bytes < sizeof(*usage),
|
|
+				 c,
|
|
+				 "invalid journal entry usage: bad size"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate a data_usage entry: it must be large enough for struct
|
|
+ * jset_entry_data_usage plus r.nr_devs further bytes, otherwise it is
|
|
+ * replaced with empty entries.
|
|
+ */
|
|
+static int journal_entry_validate_data_usage(struct bch_fs *c,
|
|
+					const char *where,
|
|
+					struct jset_entry *entry,
|
|
+					unsigned version, int big_endian, int write)
|
|
+{
|
|
+	struct jset_entry_data_usage *usage =
|
|
+		container_of(entry, struct jset_entry_data_usage, entry);
|
|
+	/* size of the whole entry, header included */
|
|
+	unsigned entry_bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
|
|
+	int ret = 0;
|
|
+
|
|
+	/* nr_devs is only read once the fixed part is known to fit */
|
|
+	if (journal_entry_err_on(entry_bytes < sizeof(*usage) ||
|
|
+				 entry_bytes < sizeof(*usage) + usage->r.nr_devs,
|
|
+				 c,
|
|
+				 "invalid journal entry usage: bad size"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate a clock entry: it must be exactly the size of struct
|
|
+ * jset_entry_clock and its rw field must be 0 or 1. An entry failing either
|
|
+ * check is replaced with empty entries.
|
|
+ */
|
|
+static int journal_entry_validate_clock(struct bch_fs *c,
|
|
+					const char *where,
|
|
+					struct jset_entry *entry,
|
|
+					unsigned version, int big_endian, int write)
|
|
+{
|
|
+	struct jset_entry_clock *clk =
|
|
+		container_of(entry, struct jset_entry_clock, entry);
|
|
+	/* size of the whole entry, header included */
|
|
+	unsigned entry_bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
|
|
+	int ret = 0;
|
|
+
|
|
+	if (journal_entry_err_on(entry_bytes != sizeof(*clk),
|
|
+				 c, "invalid journal entry clock: bad size"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+	else if (journal_entry_err_on(clk->rw > 1,
|
|
+				 c, "invalid journal entry clock: bad rw"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Validate a dev_usage entry: it must be large enough for the fixed part plus
|
|
+ * 7 per-data-type records, name an existing device, and have zero padding.
|
|
+ * An entry failing any check is replaced with empty entries.
|
|
+ */
|
|
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
|
|
+					    const char *where,
|
|
+					    struct jset_entry *entry,
|
|
+					    unsigned version, int big_endian, int write)
|
|
+{
|
|
+	struct jset_entry_dev_usage *usage =
|
|
+		container_of(entry, struct jset_entry_dev_usage, entry);
|
|
+	/* size of the whole entry, header included */
|
|
+	unsigned entry_bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
|
|
+	unsigned min_bytes = sizeof(*usage) + sizeof(usage->d[0]) * 7; /* Current value of BCH_DATA_NR */
|
|
+	int ret = 0;
|
|
+
|
|
+	/* later checks read fields, so they only run once the size is ok */
|
|
+	if (journal_entry_err_on(entry_bytes < min_bytes,
|
|
+				 c, "invalid journal entry dev usage: bad size (%u < %u)",
|
|
+				 entry_bytes, min_bytes))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+	else if (journal_entry_err_on(!bch2_dev_exists2(c, le32_to_cpu(usage->dev)),
|
|
+				 c, "invalid journal entry dev usage: bad dev"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+	else if (journal_entry_err_on(usage->pad,
|
|
+				 c, "invalid journal entry dev usage: bad pad"))
|
|
+		journal_entry_null_range(entry, vstruct_next(entry));
|
|
+fsck_err:
|
|
+	return ret;
|
|
+}
|
|
+/* Per-entry-type operations; currently just a validate hook taking the same arguments as bch2_journal_entry_validate(). */
|
|
+struct jset_entry_ops {
|
|
+ int (*validate)(struct bch_fs *, const char *,
|
|
+ struct jset_entry *, unsigned, int, int);
|
|
+};
|
|
+/* Validator table indexed by BCH_JSET_ENTRY_<type>: the x-macro binds every type listed in BCH_JSET_ENTRY_TYPES() to journal_entry_validate_<type>(). */
|
|
+static const struct jset_entry_ops bch2_jset_entry_ops[] = {
|
|
+#define x(f, nr) \
|
|
+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \
|
|
+ .validate = journal_entry_validate_##f, \
|
|
+ },
|
|
+ BCH_JSET_ENTRY_TYPES()
|
|
+#undef x
|
|
+};
|
|
+/* Run the per-type validator for @entry. Types at or above BCH_JSET_ENTRY_NR have no table slot and are accepted as-is (returns 0). */
|
|
+int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
|
|
+ struct jset_entry *entry,
|
|
+ unsigned version, int big_endian, int write)
|
|
+{
|
|
+ if (entry->type >= BCH_JSET_ENTRY_NR)
|
|
+ return 0;
|
|
+ return bch2_jset_entry_ops[entry->type].validate(c, where, entry,
|
|
+ version, big_endian, write);
|
|
+}
|
|
+/* Validate every entry of @jset. An entry that extends past the end of the jset truncates jset->u64s at that entry and ends the walk; otherwise the walk stops at the first nonzero result of bch2_journal_entry_validate(), which is returned. */
|
|
+static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
|
|
+ int write)
|
|
+{
|
|
+ char buf[100]; /* "where" string passed to the per-entry validators */
|
|
+ struct jset_entry *entry;
|
|
+ int ret = 0;
|
|
+
|
|
+ vstruct_for_each(jset, entry) {
|
|
+ scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
|
|
+ le64_to_cpu(jset->seq),
|
|
+ (u64 *) entry - jset->_data,
|
|
+ le32_to_cpu(jset->u64s));
|
|
+
|
|
+ if (journal_entry_err_on(vstruct_next(entry) >
|
|
+ vstruct_last(jset), c,
|
|
+ "journal entry extends past end of jset")) {
|
|
+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); /* cut the jset off just before the bad entry */
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = bch2_journal_entry_validate(c, buf, entry,
|
|
+ le32_to_cpu(jset->version),
|
|
+ JSET_BIG_ENDIAN(jset), write);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+fsck_err: /* NOTE(review): presumably targeted from inside journal_entry_err_on() - confirm */
|
|
+ return ret;
|
|
+}
|
|
+/* Validate a jset header: magic, version, size (against what was read and what is left in the bucket), checksum type, the checksum itself (read path only) and last_seq. Visible return values: 0, JOURNAL_ENTRY_NONE, JOURNAL_ENTRY_REREAD, JOURNAL_ENTRY_BAD and EINVAL. */
|
|
+static int jset_validate(struct bch_fs *c,
|
|
+ struct bch_dev *ca,
|
|
+ struct jset *jset, u64 sector,
|
|
+ unsigned bucket_sectors_left,
|
|
+ unsigned sectors_read,
|
|
+ int write)
|
|
+{
|
|
+ size_t bytes = vstruct_bytes(jset);
|
|
+ struct bch_csum csum;
|
|
+ unsigned version;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (le64_to_cpu(jset->magic) != jset_magic(c))
|
|
+ return JOURNAL_ENTRY_NONE;
|
|
+
|
|
+ version = le32_to_cpu(jset->version);
|
|
+ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
|
|
+ version < bcachefs_metadata_version_min) ||
|
|
+ version >= bcachefs_metadata_version_max, c,
|
|
+ "%s sector %llu seq %llu: unknown journal entry version %u",
|
|
+ ca ? ca->name : c->name,
|
|
+ sector, le64_to_cpu(jset->seq),
|
|
+ version)) {
|
|
+ /* don't try to continue: */
|
|
+ return EINVAL; /* NOTE(review): positive errno, unlike the -ENOMEM used by journal_read_buf_realloc() - confirm this is intended */
|
|
+ }
|
|
+ /* the entry claims more bytes than were read and more of the bucket remains: ask the caller to read more */
|
|
+ if (bytes > (sectors_read << 9) &&
|
|
+ sectors_read < bucket_sectors_left)
|
|
+ return JOURNAL_ENTRY_REREAD;
|
|
+
|
|
+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c,
|
|
+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)",
|
|
+ ca ? ca->name : c->name,
|
|
+ sector, le64_to_cpu(jset->seq), bytes)) {
|
|
+ ret = JOURNAL_ENTRY_BAD;
|
|
+ le32_add_cpu(&jset->u64s, /* shrink u64s by the overhang so the entry ends at the bucket end */
|
|
+ -((bytes - (bucket_sectors_left << 9)) / 8));
|
|
+ }
|
|
+
|
|
+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c,
|
|
+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu",
|
|
+ ca ? ca->name : c->name,
|
|
+ sector, le64_to_cpu(jset->seq),
|
|
+ JSET_CSUM_TYPE(jset))) {
|
|
+ ret = JOURNAL_ENTRY_BAD;
|
|
+ goto csum_done;
|
|
+ }
|
|
+ /* on the write path the checksum comparison and the bch2_encrypt() pass below are skipped */
|
|
+ if (write)
|
|
+ goto csum_done;
|
|
+
|
|
+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset);
|
|
+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c,
|
|
+ "%s sector %llu seq %llu: journal checksum bad",
|
|
+ ca ? ca->name : c->name,
|
|
+ sector, le64_to_cpu(jset->seq)))
|
|
+ ret = JOURNAL_ENTRY_BAD;
|
|
+ /* NOTE(review): on the read path this presumably decrypts the payload in place - confirm against bch2_encrypt() */
|
|
+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
|
|
+ jset->encrypted_start,
|
|
+ vstruct_end(jset) - (void *) jset->encrypted_start);
|
|
+csum_done:
|
|
+ /* last_seq is ignored when JSET_NO_FLUSH is true */
|
|
+ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) &&
|
|
+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c,
|
|
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
|
|
+ le64_to_cpu(jset->last_seq),
|
|
+ le64_to_cpu(jset->seq))) {
|
|
+ jset->last_seq = jset->seq; /* clamp last_seq to seq */
|
|
+ return JOURNAL_ENTRY_BAD;
|
|
+ }
|
|
+fsck_err: /* NOTE(review): presumably targeted from inside journal_entry_err_on() - confirm */
|
|
+ return ret;
|
|
+}
|
|
+/* Validate a jset about to be written: header first, then (only if the header check returned 0) every entry. The jset's own sector count is passed as both sectors-read and sectors-left. */
|
|
+static int jset_validate_for_write(struct bch_fs *c, struct jset *jset)
|
|
+{
|
|
+ unsigned sectors = vstruct_sectors(jset, c->block_bits);
|
|
+ int ret = jset_validate(c, NULL, jset, 0, sectors, sectors, WRITE);
|
|
+
|
|
+ return ret ? ret : jset_validate_entries(c, jset, WRITE);
|
|
+}
|
|
+/* Buffer used for journal reads: data is allocated with kvpmalloc() and size is the byte count later handed to kvpfree(). */
|
|
+struct journal_read_buf {
|
|
+ void *data;
|
|
+ size_t size;
|
|
+};
|
|
+/* Replace b->data with a new buffer of @new_size bytes rounded up to a power of two; the old contents are not copied. Returns 0, or -ENOMEM when the request exceeds JOURNAL_ENTRY_SIZE_MAX or the allocation fails (b is then left untouched). */
|
|
+static int journal_read_buf_realloc(struct journal_read_buf *b,
|
|
+ size_t new_size)
|
|
+{
|
|
+ void *fresh;
|
|
+
|
|
+ /* bios are sized for at most this many pages: */
|
|
+ if (new_size > JOURNAL_ENTRY_SIZE_MAX)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ new_size = roundup_pow_of_two(new_size);
|
|
+ fresh = kvpmalloc(new_size, GFP_KERNEL);
|
|
+ if (fresh == NULL)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ kvpfree(b->data, b->size);
|
|
+ b->size = new_size;
|
|
+ b->data = fresh;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int journal_read_bucket(struct bch_dev *ca,
|
|
+ struct journal_read_buf *buf,
|
|
+ struct journal_list *jlist,
|
|
+ unsigned bucket)
|
|
+{
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+ struct jset *j = NULL;
|
|
+ unsigned sectors, sectors_read = 0;
|
|
+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
|
|
+ end = offset + ca->mi.bucket_size;
|
|
+ bool saw_bad = false;
|
|
+ int ret = 0;
|
|
+
|
|
+ pr_debug("reading %u", bucket);
|
|
+
|
|
+ while (offset < end) {
|
|
+ if (!sectors_read) {
|
|
+ struct bio *bio;
|
|
+reread:
|
|
+ sectors_read = min_t(unsigned,
|
|
+ end - offset, buf->size >> 9);
|
|
+
|
|
+ bio = bio_kmalloc(GFP_KERNEL,
|
|
+ buf_pages(buf->data,
|
|
+ sectors_read << 9));
|
|
+ bio_set_dev(bio, ca->disk_sb.bdev);
|
|
+ bio->bi_iter.bi_sector = offset;
|
|
+ bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
|
+ bch2_bio_map(bio, buf->data, sectors_read << 9);
|
|
+
|
|
+ ret = submit_bio_wait(bio);
|
|
+ bio_put(bio);
|
|
+
|
|
+ if (bch2_dev_io_err_on(ret, ca,
|
|
+ "journal read error: sector %llu",
|
|
+ offset) ||
|
|
+ bch2_meta_read_fault("journal")) {
|
|
+ /*
|
|
+ * We don't error out of the recovery process
|
|
+ * here, since the relevant journal entry may be
|
|
+ * found on a different device, and missing or
|
|
+ * no journal entries will be handled later
|
|
+ */
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ j = buf->data;
|
|
+ }
|
|
+
|
|
+ ret = jset_validate(c, ca, j, offset,
|
|
+ end - offset, sectors_read,
|
|
+ READ);
|
|
+ switch (ret) {
|
|
+ case BCH_FSCK_OK:
|
|
+ sectors = vstruct_sectors(j, c->block_bits);
|
|
+ break;
|
|
+ case JOURNAL_ENTRY_REREAD:
|
|
+ if (vstruct_bytes(j) > buf->size) {
|
|
+ ret = journal_read_buf_realloc(buf,
|
|
+ vstruct_bytes(j));
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ goto reread;
|
|
+ case JOURNAL_ENTRY_NONE:
|
|
+ if (!saw_bad)
|
|
+ return 0;
|
|
+ sectors = c->opts.block_size;
|
|
+ goto next_block;
|
|
+ case JOURNAL_ENTRY_BAD:
|
|
+ saw_bad = true;
|
|
+ /*
|
|
+ * On checksum error we don't really trust the size
|
|
+ * field of the journal entry we read, so try reading
|
|
+ * again at next block boundary:
|
|
+ */
|
|
+ sectors = c->opts.block_size;
|
|
+ break;
|
|
+ default:
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * This happens sometimes if we don't have discards on -
|
|
+ * when we've partially overwritten a bucket with new
|
|
+ * journal entries. We don't need the rest of the
|
|
+ * bucket:
|
|
+ */
|
|
+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
|
|
+ return 0;
|
|
+
|
|
+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
|
|
+
|
|
+ mutex_lock(&jlist->lock);
|
|
+ ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
|
|
+ .dev = ca->dev_idx,
|
|
+ .offset = offset,
|
|
+ }, jlist, j, ret != 0);
|
|
+ mutex_unlock(&jlist->lock);
|
|
+
|
|
+ switch (ret) {
|
|
+ case JOURNAL_ENTRY_ADD_OK:
|
|
+ break;
|
|
+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
|
|
+ break;
|
|
+ default:
|
|
+ return ret;
|
|
+ }
|
|
+next_block:
|
|
+ pr_debug("next");
|
|
+ offset += sectors;
|
|
+ sectors_read -= sectors;
|
|
+ j = ((void *) j) + (sectors << 9);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+/* Closure callback, one per device: reads every journal bucket of the device into the shared journal_list, then positions cur_idx and the dirty/discard indices. On failure the error is stored in jlist->ret; the buffer and the io_ref are always released. */
|
|
+static void bch2_journal_read_device(struct closure *cl)
|
|
+{
|
|
+ struct journal_device *ja =
|
|
+ container_of(cl, struct journal_device, read);
|
|
+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
|
|
+ struct journal_list *jlist =
|
|
+ container_of(cl->parent, struct journal_list, cl);
|
|
+ struct journal_read_buf buf = { NULL, 0 };
|
|
+ u64 min_seq = U64_MAX;
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ if (!ja->nr) /* no journal buckets on this device */
|
|
+ goto out;
|
|
+
|
|
+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ pr_debug("%u journal buckets", ja->nr);
|
|
+
|
|
+ for (i = 0; i < ja->nr; i++) {
|
|
+ ret = journal_read_bucket(ca, &buf, jlist, i);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /* Find the journal bucket with the highest sequence number: */
|
|
+ for (i = 0; i < ja->nr; i++) {
|
|
+ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
|
|
+ ja->cur_idx = i;
|
|
+
|
|
+ min_seq = min(ja->bucket_seq[i], min_seq);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If there's duplicate journal entries in multiple buckets (which
|
|
+ * definitely isn't supposed to happen, but...) - make sure to start
|
|
+ * cur_idx at the last of those buckets, so we don't deadlock trying to
|
|
+ * allocate
|
|
+ */
|
|
+ while (ja->bucket_seq[ja->cur_idx] > min_seq &&
|
|
+ ja->bucket_seq[ja->cur_idx] >
|
|
+ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
|
|
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
|
|
+
|
|
+ ja->sectors_free = 0;
|
|
+
|
|
+ /*
|
|
+ * Set dirty_idx to indicate the entire journal is full and needs to be
|
|
+ * reclaimed - journal reclaim will immediately reclaim whatever isn't
|
|
+ * pinned when it first runs:
|
|
+ */
|
|
+ ja->discard_idx = ja->dirty_idx_ondisk =
|
|
+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
|
|
+out:
|
|
+ kvpfree(buf.data, buf.size);
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+ closure_return(cl);
|
|
+ return;
|
|
+err:
|
|
+ mutex_lock(&jlist->lock); /* jlist is shared with the other per-device readers */
|
|
+ jlist->ret = ret;
|
|
+ mutex_unlock(&jlist->lock);
|
|
+ goto out;
|
|
+}
|
|
+/* Print each pointer of journal_replay @j to @out as "dev:offset (offset N)", space separated, where N is the offset modulo the device's bucket size. */
|
|
+static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct journal_replay *j)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < j->nr_ptrs; i++) {
|
|
+ struct bch_dev *ca = c->devs[j->ptrs[i].dev];
|
|
+ u64 offset;
|
|
+
|
|
+ div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
|
|
+
|
|
+ if (i)
|
|
+ pr_buf(out, " ");
|
|
+ pr_buf(out, "%u:%llu (offset %llu)",
|
|
+ j->ptrs[i].dev,
|
|
+ (u64) j->ptrs[i].offset, offset);
|
|
+ }
|
|
+}
|
|
+/* Read the journal from every eligible member device into @list, then prune it: drop non-flush entries newer than the most recent flush entry, entries older than that flush's last_seq, and blacklisted entries; report gaps; validate the surviving entries and mark their replicas. Sets *start_seq and *blacklist_seq. */
|
|
+int bch2_journal_read(struct bch_fs *c, struct list_head *list,
|
|
+ u64 *blacklist_seq, u64 *start_seq)
|
|
+{
|
|
+ struct journal_list jlist;
|
|
+ struct journal_replay *i, *t;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned iter;
|
|
+ size_t keys = 0, entries = 0;
|
|
+ bool degraded = false;
|
|
+ u64 seq, last_seq = 0;
|
|
+ int ret = 0;
|
|
+
|
|
+ closure_init_stack(&jlist.cl);
|
|
+ mutex_init(&jlist.lock);
|
|
+ jlist.head = list;
|
|
+ jlist.ret = 0;
|
|
+ /* start one bch2_journal_read_device() closure per readable device that has journal data */
|
|
+ for_each_member_device(ca, c, iter) {
|
|
+ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
|
|
+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
|
|
+ continue;
|
|
+
|
|
+ if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
|
|
+ ca->mi.state == BCH_MEMBER_STATE_ro) &&
|
|
+ percpu_ref_tryget(&ca->io_ref))
|
|
+ closure_call(&ca->journal.read,
|
|
+ bch2_journal_read_device,
|
|
+ system_unbound_wq,
|
|
+ &jlist.cl);
|
|
+ else
|
|
+ degraded = true;
|
|
+ }
|
|
+
|
|
+ closure_sync(&jlist.cl);
|
|
+
|
|
+ if (jlist.ret)
|
|
+ return jlist.ret;
|
|
+
|
|
+ if (list_empty(list)) {
|
|
+ bch_info(c, "journal read done, but no entries found");
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ i = list_last_entry(list, struct journal_replay, list);
|
|
+ *start_seq = le64_to_cpu(i->j.seq) + 1;
|
|
+
|
|
+ /*
|
|
+ * Find most recent flush entry, and ignore newer non flush entries -
|
|
+ * those entries will be blacklisted:
|
|
+ */
|
|
+ list_for_each_entry_safe_reverse(i, t, list, list) {
|
|
+ if (i->ignore)
|
|
+ continue;
|
|
+
|
|
+ if (!JSET_NO_FLUSH(&i->j)) {
|
|
+ last_seq = le64_to_cpu(i->j.last_seq);
|
|
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ journal_replay_free(c, i);
|
|
+ }
|
|
+
|
|
+ if (!last_seq) {
|
|
+ fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
|
|
+ return -1; /* NOTE(review): bare -1 rather than a named errno */
|
|
+ }
|
|
+
|
|
+ /* Drop blacklisted entries and entries older than last_seq: */
|
|
+ list_for_each_entry_safe(i, t, list, list) {
|
|
+ if (i->ignore)
|
|
+ continue;
|
|
+
|
|
+ seq = le64_to_cpu(i->j.seq);
|
|
+ if (seq < last_seq) {
|
|
+ journal_replay_free(c, i);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
|
|
+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
|
|
+ "found blacklisted journal entry %llu", seq);
|
|
+
|
|
+ journal_replay_free(c, i);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Check for missing entries: */
|
|
+ seq = last_seq;
|
|
+ list_for_each_entry(i, list, list) {
|
|
+ if (i->ignore)
|
|
+ continue;
|
|
+
|
|
+ BUG_ON(seq > le64_to_cpu(i->j.seq));
|
|
+
|
|
+ while (seq < le64_to_cpu(i->j.seq)) {
|
|
+ u64 missing_start, missing_end;
|
|
+ char buf1[200], buf2[200];
|
|
+ /* skip sequence numbers that are blacklisted: those are not missing */
|
|
+ while (seq < le64_to_cpu(i->j.seq) &&
|
|
+ bch2_journal_seq_is_blacklisted(c, seq, false))
|
|
+ seq++;
|
|
+
|
|
+ if (seq == le64_to_cpu(i->j.seq))
|
|
+ break;
|
|
+
|
|
+ missing_start = seq;
|
|
+ /* extend the gap up to the next blacklisted seq or the current entry */
|
|
+ while (seq < le64_to_cpu(i->j.seq) &&
|
|
+ !bch2_journal_seq_is_blacklisted(c, seq, false))
|
|
+ seq++;
|
|
+
|
|
+ if (i->list.prev != list) {
|
|
+ struct printbuf out = PBUF(buf1);
|
|
+ struct journal_replay *p = list_prev_entry(i, list);
|
|
+
|
|
+ bch2_journal_ptrs_to_text(&out, c, p);
|
|
+ pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
|
|
+ } else
|
|
+ sprintf(buf1, "(none)");
|
|
+ bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
|
|
+
|
|
+ missing_end = seq - 1;
|
|
+ fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
|
|
+ " prev at %s\n"
|
|
+ " next at %s",
|
|
+ missing_start, missing_end,
|
|
+ last_seq, *blacklist_seq - 1,
|
|
+ buf1, buf2);
|
|
+ }
|
|
+
|
|
+ seq++;
|
|
+ }
|
|
+ /* validate the entries of every surviving jset, mark its replicas, and count keys/entries for the summary message */
|
|
+ list_for_each_entry(i, list, list) {
|
|
+ struct jset_entry *entry;
|
|
+ struct bkey_i *k, *_n;
|
|
+ struct bch_replicas_padded replicas = {
|
|
+ .e.data_type = BCH_DATA_journal,
|
|
+ .e.nr_required = 1,
|
|
+ };
|
|
+ unsigned ptr;
|
|
+ char buf[80];
|
|
+
|
|
+ if (i->ignore)
|
|
+ continue;
|
|
+
|
|
+ ret = jset_validate_entries(c, &i->j, READ);
|
|
+ if (ret)
|
|
+ goto fsck_err;
|
|
+
|
|
+ for (ptr = 0; ptr < i->nr_ptrs; ptr++)
|
|
+ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
|
|
+
|
|
+ bch2_replicas_entry_sort(&replicas.e);
|
|
+
|
|
+ /*
|
|
+ * If we're mounting in degraded mode - if we didn't read all
|
|
+ * the devices - this is wrong:
|
|
+ */
|
|
+
|
|
+ if (!degraded &&
|
|
+ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
|
|
+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
|
|
+ "superblock not marked as containing replicas %s",
|
|
+ (bch2_replicas_entry_to_text(&PBUF(buf),
|
|
+ &replicas.e), buf)))) {
|
|
+ ret = bch2_mark_replicas(c, &replicas.e);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ for_each_jset_key(k, _n, entry, &i->j)
|
|
+ keys++;
|
|
+ entries++;
|
|
+ }
|
|
+
|
|
+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
|
|
+ keys, entries, *start_seq);
|
|
+
|
|
+ if (*start_seq != *blacklist_seq)
|
|
+ bch_info(c, "dropped unflushed entries %llu-%llu",
|
|
+ *blacklist_seq, *start_seq - 1);
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* journal write: */
|
|
+/* Walk @devs_sorted and, for each usable device with at least @sectors free in its current journal bucket, append a pointer to w->key and consume the space, until *replicas reaches @replicas_want. Uses rcu_dereference(), so the caller is expected to hold rcu_read_lock(). */
|
|
+static void __journal_write_alloc(struct journal *j,
|
|
+ struct journal_buf *w,
|
|
+ struct dev_alloc_list *devs_sorted,
|
|
+ unsigned sectors,
|
|
+ unsigned *replicas,
|
|
+ unsigned replicas_want)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct journal_device *ja;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ if (*replicas >= replicas_want)
|
|
+ return;
|
|
+
|
|
+ for (i = 0; i < devs_sorted->nr; i++) {
|
|
+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
|
|
+ if (!ca)
|
|
+ continue;
|
|
+
|
|
+ ja = &ca->journal;
|
|
+
|
|
+ /*
|
|
+ * Check that we can use this device, and aren't already using
|
|
+ * it:
|
|
+ */
|
|
+ if (!ca->mi.durability ||
|
|
+ ca->mi.state != BCH_MEMBER_STATE_rw ||
|
|
+ !ja->nr ||
|
|
+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
|
|
+ ca->dev_idx) ||
|
|
+ sectors > ja->sectors_free)
|
|
+ continue;
|
|
+
|
|
+ bch2_dev_stripe_increment(ca, &j->wp.stripe);
|
|
+ /* the write lands at the first free sector of the current journal bucket */
|
|
+ bch2_bkey_append_ptr(&w->key,
|
|
+ (struct bch_extent_ptr) {
|
|
+ .offset = bucket_to_sector(ca,
|
|
+ ja->buckets[ja->cur_idx]) +
|
|
+ ca->mi.bucket_size -
|
|
+ ja->sectors_free,
|
|
+ .dev = ca->dev_idx,
|
|
+ });
|
|
+
|
|
+ ja->sectors_free -= sectors;
|
|
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
|
|
+
|
|
+ *replicas += ca->mi.durability;
|
|
+
|
|
+ if (*replicas >= replicas_want)
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+/**
|
|
+ * journal_write_alloc - pick devices for a journal write, moving on to the next journal bucket if possible
|
|
+ */
|
|
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
|
|
+ unsigned sectors)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bch_devs_mask devs;
|
|
+ struct journal_device *ja;
|
|
+ struct bch_dev *ca;
|
|
+ struct dev_alloc_list devs_sorted;
|
|
+ unsigned target = c->opts.metadata_target ?:
|
|
+ c->opts.foreground_target;
|
|
+ unsigned i, replicas = 0, replicas_want =
|
|
+ READ_ONCE(c->opts.metadata_replicas);
|
|
+
|
|
+ rcu_read_lock();
|
|
+retry:
|
|
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
|
|
+
|
|
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
|
|
+ /* first pass: use space left in each device's current journal bucket */
|
|
+ __journal_write_alloc(j, w, &devs_sorted,
|
|
+ sectors, &replicas, replicas_want);
|
|
+
|
|
+ if (replicas >= replicas_want)
|
|
+ goto done;
|
|
+ /* not enough replicas: advance devices whose current bucket is too full to their next bucket, when one is available */
|
|
+ for (i = 0; i < devs_sorted.nr; i++) {
|
|
+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
|
|
+ if (!ca)
|
|
+ continue;
|
|
+
|
|
+ ja = &ca->journal;
|
|
+
|
|
+ if (sectors > ja->sectors_free &&
|
|
+ sectors <= ca->mi.bucket_size &&
|
|
+ bch2_journal_dev_buckets_available(j, ja,
|
|
+ journal_space_discarded)) {
|
|
+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
|
|
+ ja->sectors_free = ca->mi.bucket_size;
|
|
+
|
|
+ /*
|
|
+ * ja->bucket_seq[ja->cur_idx] must always have
|
|
+ * something sensible:
|
|
+ */
|
|
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
|
|
+ }
|
|
+ }
|
|
+ /* second pass, now that some devices have a fresh bucket */
|
|
+ __journal_write_alloc(j, w, &devs_sorted,
|
|
+ sectors, &replicas, replicas_want);
|
|
+
|
|
+ if (replicas < replicas_want && target) {
|
|
+ /* Retry from all devices: */
|
|
+ target = 0;
|
|
+ goto retry;
|
|
+ }
|
|
+done:
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
|
|
+ /* success needs only metadata_replicas_required, which may differ from the replicas_want target used above */
|
|
+ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
|
|
+}
|
|
+/* Compact @jset in place: drop empty entries, merge adjacent btree_keys entries that share btree_id and level, slide the remaining entries down, and set jset->u64s to the compacted length. */
|
|
+static void journal_write_compact(struct jset *jset)
|
|
+{
|
|
+ struct jset_entry *i, *next, *prev = NULL;
|
|
+
|
|
+ /*
|
|
+ * Simple compaction, dropping empty jset_entries (from journal
|
|
+ * reservations that weren't fully used) and merging jset_entries that
|
|
+ * can be.
|
|
+ *
|
|
+ * If we wanted to be really fancy here, we could sort all the keys in
|
|
+ * the jset and drop keys that were overwritten - probably not worth it:
|
|
+ */
|
|
+ vstruct_for_each_safe(jset, i, next) {
|
|
+ unsigned u64s = le16_to_cpu(i->u64s);
|
|
+
|
|
+ /* Empty entry: */
|
|
+ if (!u64s)
|
|
+ continue;
|
|
+
|
|
+ /* Can we merge with previous entry? */
|
|
+ if (prev &&
|
|
+ i->btree_id == prev->btree_id &&
|
|
+ i->level == prev->level &&
|
|
+ i->type == prev->type &&
|
|
+ i->type == BCH_JSET_ENTRY_btree_keys &&
|
|
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { /* merged length must still fit the 16-bit u64s field */
|
|
+ memmove_u64s_down(vstruct_next(prev),
|
|
+ i->_data,
|
|
+ u64s);
|
|
+ le16_add_cpu(&prev->u64s, u64s);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* Couldn't merge, move i into new position (after prev): */
|
|
+ prev = prev ? vstruct_next(prev) : jset->start;
|
|
+ if (i != prev)
|
|
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
|
|
+ }
|
|
+ /* prev now becomes the end of the compacted entries (jset->start if nothing was kept) */
|
|
+ prev = prev ? vstruct_next(prev) : jset->start;
|
|
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
|
|
+}
|
|
+/* Best-effort growth of @buf to j->buf_size_want: allocate outside j->lock, copy the old contents, swap pointer and size under j->lock, then free the old buffer. Silently does nothing if the buffer is already large enough or the allocation fails. */
|
|
+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
|
|
+{
|
|
+ /* we aren't holding j->lock: */
|
|
+ unsigned new_size = READ_ONCE(j->buf_size_want);
|
|
+ void *new_buf;
|
|
+
|
|
+ if (buf->buf_size >= new_size)
|
|
+ return;
|
|
+
|
|
+ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
|
|
+ if (!new_buf)
|
|
+ return;
|
|
+
|
|
+ memcpy(new_buf, buf->data, buf->buf_size);
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ swap(buf->data, new_buf);
|
|
+ swap(buf->buf_size, new_size);
|
|
+ spin_unlock(&j->lock);
|
|
+ /* after the swaps new_buf/new_size hold the OLD buffer and its size */
|
|
+ kvpfree(new_buf, new_size);
|
|
+}
|
|
+/* Return the journal buffer selected by j->reservations.unwritten_idx. */
|
|
+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
|
|
+{
|
|
+ return &j->buf[j->reservations.unwritten_idx];
|
|
+}
|
|
+/* Closure continuation run once a journal write has finished (or was skipped or failed): marks replicas for the devices written, records seq/last_seq as on disk, advances reservations.unwritten_idx, wakes waiters, and starts the next write if one is ready. */
|
|
+static void journal_write_done(struct closure *cl)
|
|
+{
|
|
+ struct journal *j = container_of(cl, struct journal, io);
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct journal_buf *w = journal_last_unwritten_buf(j);
|
|
+ struct bch_devs_list devs =
|
|
+ bch2_bkey_devs(bkey_i_to_s_c(&w->key));
|
|
+ struct bch_replicas_padded replicas;
|
|
+ union journal_res_state old, new;
|
|
+ u64 v, seq, last_seq;
|
|
+ int err = 0;
|
|
+
|
|
+ bch2_time_stats_update(j->write_time, j->write_start_time);
|
|
+
|
|
+ if (!devs.nr) { /* w->key has no device pointers left */
|
|
+ bch_err(c, "unable to write journal to sufficient devices");
|
|
+ err = -EIO;
|
|
+ } else {
|
|
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
|
|
+ if (bch2_mark_replicas(c, &replicas.e))
|
|
+ err = -EIO;
|
|
+ }
|
|
+
|
|
+ if (err)
|
|
+ bch2_fatal_error(c);
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ seq = le64_to_cpu(w->data->seq);
|
|
+ last_seq = le64_to_cpu(w->data->last_seq);
|
|
+
|
|
+ if (seq >= j->pin.front)
|
|
+ journal_seq_pin(j, seq)->devs = devs;
|
|
+
|
|
+ j->seq_ondisk = seq;
|
|
+ if (err && (!j->err_seq || seq < j->err_seq)) /* remember the lowest seq that failed */
|
|
+ j->err_seq = seq;
|
|
+
|
|
+ if (!JSET_NO_FLUSH(w->data)) {
|
|
+ j->flushed_seq_ondisk = seq;
|
|
+ j->last_seq_ondisk = last_seq;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
|
|
+ * more buckets:
|
|
+ *
|
|
+ * Must come before signaling write completion, for
|
|
+ * bch2_fs_journal_stop():
|
|
+ */
|
|
+ journal_reclaim_kick(&c->journal);
|
|
+
|
|
+ /* also must come before signalling write completion: */
|
|
+ closure_debug_destroy(cl);
|
|
+
|
|
+ v = atomic64_read(&j->reservations.counter);
|
|
+ do {
|
|
+ old.v = new.v = v;
|
|
+ BUG_ON(new.idx == new.unwritten_idx);
|
|
+
|
|
+ new.unwritten_idx++;
|
|
+ } while ((v = atomic64_cmpxchg(&j->reservations.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+
|
|
+ bch2_journal_space_available(j);
|
|
+
|
|
+ closure_wake_up(&w->wait);
|
|
+ journal_wake(j);
|
|
+
|
|
+ if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
|
|
+ mod_delayed_work(system_freezable_wq, &j->write_work, 0);
|
|
+ spin_unlock(&j->lock);
|
|
+ /* another closed buffer is waiting and its reservation count is zero: start writing it now */
|
|
+ if (new.unwritten_idx != new.idx &&
|
|
+ !journal_state_count(new, new.unwritten_idx))
|
|
+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
|
|
+}
|
|
+/* bio completion handler for journal writes and flushes: on an I/O error (or injected fault) the device is dropped from the pending buffer's key under j->err_lock; always drops the closure ref on j->io and the device io_ref. */
|
|
+static void journal_write_endio(struct bio *bio)
|
|
+{
|
|
+ struct bch_dev *ca = bio->bi_private;
|
|
+ struct journal *j = &ca->fs->journal;
|
|
+
|
|
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status)) ||
|
|
+ bch2_meta_write_fault("journal")) {
|
|
+ struct journal_buf *w = journal_last_unwritten_buf(j);
|
|
+ unsigned long flags;
|
|
+
|
|
+ spin_lock_irqsave(&j->err_lock, flags);
|
|
+ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
|
|
+ spin_unlock_irqrestore(&j->err_lock, flags);
|
|
+ }
|
|
+
|
|
+ closure_put(&j->io);
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+}
|
|
+/* Closure step that submits the journal buffer to every device pointer in w->key, then continues at journal_write_done(). A device whose io_ref cannot be taken is logged and skipped. */
|
|
+static void do_journal_write(struct closure *cl)
|
|
+{
|
|
+ struct journal *j = container_of(cl, struct journal, io);
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bch_dev *ca;
|
|
+ struct journal_buf *w = journal_last_unwritten_buf(j);
|
|
+ struct bch_extent_ptr *ptr;
|
|
+ struct bio *bio;
|
|
+ unsigned sectors = vstruct_sectors(w->data, c->block_bits);
|
|
+
|
|
+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
|
|
+ ca = bch_dev_bkey_exists(c, ptr->dev);
|
|
+ if (!percpu_ref_tryget(&ca->io_ref)) {
|
|
+ /* XXX: fix this */
|
|
+ bch_err(c, "missing device for journal write\n");
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
|
|
+ sectors);
|
|
+
|
|
+ bio = ca->journal.bio;
|
|
+ bio_reset(bio);
|
|
+ bio_set_dev(bio, ca->disk_sb.bdev);
|
|
+ bio->bi_iter.bi_sector = ptr->offset;
|
|
+ bio->bi_end_io = journal_write_endio;
|
|
+ bio->bi_private = ca;
|
|
+ bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
|
|
+ /* two consecutive journal writes to the same sector of a device are treated as a bug */
|
|
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
|
|
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
|
|
+ /* flush writes always get FUA; PREFLUSH is added only when no separate flush bio was issued */
|
|
+ if (!JSET_NO_FLUSH(w->data))
|
|
+ bio->bi_opf |= REQ_FUA;
|
|
+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
|
|
+ bio->bi_opf |= REQ_PREFLUSH;
|
|
+
|
|
+ bch2_bio_map(bio, w->data, sectors << 9);
|
|
+
|
|
+ trace_journal_write(bio);
|
|
+ closure_bio_submit(bio, cl);
|
|
+
|
|
+ ca->journal.bucket_seq[ca->journal.cur_idx] =
|
|
+ le64_to_cpu(w->data->seq);
|
|
+ }
|
|
+
|
|
+ continue_at(cl, journal_write_done, system_highpri_wq);
|
|
+ return;
|
|
+}
|
|
+/* Closure entry point for writing the oldest unwritten journal buffer: decides flush vs. no-flush, appends btree roots and superblock entries, compacts, fills in the header, validates, encrypts and checksums the jset, allocates space on devices, then hands off to do_journal_write() (or directly to journal_write_done() on error or in nochanges mode). */
|
|
+void bch2_journal_write(struct closure *cl)
|
|
+{
|
|
+ struct journal *j = container_of(cl, struct journal, io);
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bch_dev *ca;
|
|
+ struct journal_buf *w = journal_last_unwritten_buf(j);
|
|
+ struct jset_entry *start, *end;
|
|
+ struct jset *jset;
|
|
+ struct bio *bio;
|
|
+ char *journal_debug_buf = NULL;
|
|
+ bool validate_before_checksum = false;
|
|
+ unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
|
|
+ int ret;
|
|
+
|
|
+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
|
|
+
|
|
+ journal_buf_realloc(j, w);
|
|
+ jset = w->data;
|
|
+
|
|
+ j->write_start_time = local_clock();
|
|
+ /* a write may skip the flush only if the feature bit is set, no flush was demanded, the last flush write is recent enough, and the journal allows it */
|
|
+ spin_lock(&j->lock);
|
|
+ if (c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush) &&
|
|
+ !w->must_flush &&
|
|
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(j->write_delay_ms) &&
|
|
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
|
|
+ w->noflush = true;
|
|
+ SET_JSET_NO_FLUSH(jset, true);
|
|
+ jset->last_seq = 0;
|
|
+
|
|
+ j->nr_noflush_writes++;
|
|
+ } else {
|
|
+ j->last_flush_write = jiffies;
|
|
+ j->nr_flush_writes++;
|
|
+ }
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ /*
|
|
+ * New btree roots are set by journalling them; when the journal entry
|
|
+ * gets written we have to propagate them to c->btree_roots
|
|
+ *
|
|
+ * But, every journal entry we write has to contain all the btree roots
|
|
+ * (at least for now); so after we copy btree roots to c->btree_roots we
|
|
+ * have to get any missing btree roots and add them to this journal
|
|
+ * entry:
|
|
+ */
|
|
+
|
|
+ bch2_journal_entries_to_btree_roots(c, jset);
|
|
+
|
|
+ start = end = vstruct_last(jset);
|
|
+
|
|
+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
|
|
+
|
|
+ bch2_journal_super_entries_add_common(c, &end,
|
|
+ le64_to_cpu(jset->seq));
|
|
+ u64s = (u64 *) end - (u64 *) start;
|
|
+ BUG_ON(u64s > j->entry_u64s_reserved);
|
|
+
|
|
+ le32_add_cpu(&jset->u64s, u64s);
|
|
+ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
|
|
+
|
|
+ journal_write_compact(jset);
|
|
+
|
|
+ jset->magic = cpu_to_le64(jset_magic(c));
|
|
+ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
|
|
+ ? cpu_to_le32(BCH_JSET_VERSION_OLD)
|
|
+ : cpu_to_le32(c->sb.version);
|
|
+
|
|
+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
|
|
+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
|
|
+
|
|
+ if (journal_entry_empty(jset))
|
|
+ j->last_empty_seq = le64_to_cpu(jset->seq);
|
|
+ /* validation runs before encrypt/checksum for encrypted jsets and for versions older than current; otherwise it runs after */
|
|
+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
|
|
+ validate_before_checksum = true;
|
|
+
|
|
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current)
|
|
+ validate_before_checksum = true;
|
|
+
|
|
+ if (validate_before_checksum &&
|
|
+ jset_validate_for_write(c, jset))
|
|
+ goto err;
|
|
+
|
|
+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
|
|
+ jset->encrypted_start,
|
|
+ vstruct_end(jset) - (void *) jset->encrypted_start);
|
|
+
|
|
+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
|
|
+ journal_nonce(jset), jset);
|
|
+
|
|
+ if (!validate_before_checksum &&
|
|
+ jset_validate_for_write(c, jset))
|
|
+ goto err;
|
|
+
|
|
+ sectors = vstruct_sectors(jset, c->block_bits);
|
|
+ BUG_ON(sectors > w->sectors);
|
|
+ /* zero the padding between the end of the jset and the end of its last sector */
|
|
+ bytes = vstruct_bytes(jset);
|
|
+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
|
|
+
|
|
+retry_alloc:
|
|
+ spin_lock(&j->lock);
|
|
+ ret = journal_write_alloc(j, w, sectors);
|
|
+ /* allocation failed but discards are possible: do them and retry */
|
|
+ if (ret && j->can_discard) {
|
|
+ spin_unlock(&j->lock);
|
|
+ bch2_journal_do_discards(j);
|
|
+ goto retry_alloc;
|
|
+ }
|
|
+
|
|
+ if (ret) {
|
|
+ journal_debug_buf = kmalloc(4096, GFP_ATOMIC); /* GFP_ATOMIC: j->lock (a spinlock) is held here */
|
|
+ if (journal_debug_buf)
|
|
+ __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * write is allocated, no longer need to account for it in
|
|
+ * bch2_journal_space_available():
|
|
+ */
|
|
+ w->sectors = 0;
|
|
+
|
|
+ /*
|
|
+ * journal entry has been compacted and allocated, recalculate space
|
|
+ * available:
|
|
+ */
|
|
+ bch2_journal_space_available(j);
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ if (ret) {
|
|
+ bch_err(c, "Unable to allocate journal write:\n%s",
|
|
+ journal_debug_buf);
|
|
+ kfree(journal_debug_buf);
|
|
+ bch2_fatal_error(c);
|
|
+ continue_at(cl, journal_write_done, system_highpri_wq);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * XXX: we really should just disable the entire journal in nochanges
|
|
+ * mode
|
|
+ */
|
|
+ if (c->opts.nochanges)
|
|
+ goto no_io;
|
|
+
|
|
+ for_each_rw_member(ca, c, i)
|
|
+ nr_rw_members++;
|
|
+
|
|
+ if (nr_rw_members > 1)
|
|
+ w->separate_flush = true;
|
|
+ /* with more than one rw member, send an explicit flush bio to every rw member before the journal write itself */
|
|
+ if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
|
|
+ for_each_rw_member(ca, c, i) {
|
|
+ percpu_ref_get(&ca->io_ref);
|
|
+
|
|
+ bio = ca->journal.bio;
|
|
+ bio_reset(bio);
|
|
+ bio_set_dev(bio, ca->disk_sb.bdev);
|
|
+ bio->bi_opf = REQ_OP_FLUSH;
|
|
+ bio->bi_end_io = journal_write_endio;
|
|
+ bio->bi_private = ca;
|
|
+ closure_bio_submit(bio, cl);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_bucket_seq_cleanup(c);
|
|
+
|
|
+ continue_at(cl, do_journal_write, system_highpri_wq);
|
|
+ return;
|
|
+no_io:
|
|
+ bch2_bucket_seq_cleanup(c);
|
|
+
|
|
+ continue_at(cl, journal_write_done, system_highpri_wq);
|
|
+ return;
|
|
+err:
|
|
+ bch2_inconsistent_error(c);
|
|
+ continue_at(cl, journal_write_done, system_highpri_wq);
|
|
+}
|
|
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
|
|
new file mode 100644
|
|
index 000000000000..f34281a28f12
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal_io.h
|
|
@@ -0,0 +1,50 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_JOURNAL_IO_H
|
|
+#define _BCACHEFS_JOURNAL_IO_H
|
|
+
|
|
+/*
|
|
+ * Only used for holding the journal entries we read in btree_journal_read()
|
|
+ * during cache_registration
|
|
+ */
|
|
+struct journal_replay {
|
|
+ struct list_head list;
|
|
+ struct bch_extent_ptr ptrs[BCH_REPLICAS_MAX];
|
|
+ unsigned nr_ptrs;
|
|
+
|
|
+ /* checksum error, but we may want to try using it anyways: */
|
|
+ bool bad;
|
|
+ bool ignore;
|
|
+ /* must be last: */
|
|
+ struct jset j;
|
|
+};
|
|
+
|
|
+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
|
|
+ struct jset_entry *entry, unsigned type)
|
|
+{
|
|
+ while (entry < vstruct_last(jset)) {
|
|
+ if (entry->type == type)
|
|
+ return entry;
|
|
+
|
|
+ entry = vstruct_next(entry);
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+#define for_each_jset_entry_type(entry, jset, type) \
|
|
+ for (entry = (jset)->start; \
|
|
+ (entry = __jset_entry_type_next(jset, entry, type)); \
|
|
+ entry = vstruct_next(entry))
|
|
+
|
|
+#define for_each_jset_key(k, _n, entry, jset) \
|
|
+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
|
|
+ vstruct_for_each_safe(entry, k, _n)
|
|
+
|
|
+int bch2_journal_entry_validate(struct bch_fs *, const char *, struct jset_entry *,
|
|
+ unsigned, int, int);
|
|
+
|
|
+int bch2_journal_read(struct bch_fs *, struct list_head *, u64 *, u64 *);
|
|
+
|
|
+void bch2_journal_write(struct closure *);
|
|
+
|
|
+#endif /* _BCACHEFS_JOURNAL_IO_H */
|
|
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
|
|
new file mode 100644
|
|
index 000000000000..427be2da1dfc
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal_reclaim.c
|
|
@@ -0,0 +1,840 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "error.h"
|
|
+#include "journal.h"
|
|
+#include "journal_io.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "replicas.h"
|
|
+#include "super.h"
|
|
+
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/sched/mm.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/* Free space calculations: */
|
|
+
|
|
+static unsigned journal_space_from(struct journal_device *ja,
|
|
+ enum journal_space_from from)
|
|
+{
|
|
+ switch (from) {
|
|
+ case journal_space_discarded:
|
|
+ return ja->discard_idx;
|
|
+ case journal_space_clean_ondisk:
|
|
+ return ja->dirty_idx_ondisk;
|
|
+ case journal_space_clean:
|
|
+ return ja->dirty_idx;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
|
|
+ struct journal_device *ja,
|
|
+ enum journal_space_from from)
|
|
+{
|
|
+ unsigned available = (journal_space_from(ja, from) -
|
|
+ ja->cur_idx - 1 + ja->nr) % ja->nr;
|
|
+
|
|
+ /*
|
|
+ * Don't use the last bucket unless writing the new last_seq
|
|
+ * will make another bucket available:
|
|
+ */
|
|
+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx)
|
|
+ --available;
|
|
+
|
|
+ return available;
|
|
+}
|
|
+
|
|
+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining)
|
|
+{
|
|
+ union journal_preres_state old, new;
|
|
+ u64 v = atomic64_read(&j->prereserved.counter);
|
|
+
|
|
+ do {
|
|
+ old.v = new.v = v;
|
|
+ new.remaining = u64s_remaining;
|
|
+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+}
|
|
+
|
|
+static inline unsigned get_unwritten_sectors(struct journal *j, unsigned *idx)
|
|
+{
|
|
+ unsigned sectors = 0;
|
|
+
|
|
+ while (!sectors && *idx != j->reservations.idx) {
|
|
+ sectors = j->buf[*idx].sectors;
|
|
+
|
|
+ *idx = (*idx + 1) & JOURNAL_BUF_MASK;
|
|
+ }
|
|
+
|
|
+ return sectors;
|
|
+}
|
|
+
|
|
+static struct journal_space
|
|
+journal_dev_space_available(struct journal *j, struct bch_dev *ca,
|
|
+ enum journal_space_from from)
|
|
+{
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+ unsigned sectors, buckets, unwritten, idx = j->reservations.unwritten_idx;
|
|
+
|
|
+ if (from == journal_space_total)
|
|
+ return (struct journal_space) {
|
|
+ .next_entry = ca->mi.bucket_size,
|
|
+ .total = ca->mi.bucket_size * ja->nr,
|
|
+ };
|
|
+
|
|
+ buckets = bch2_journal_dev_buckets_available(j, ja, from);
|
|
+ sectors = ja->sectors_free;
|
|
+
|
|
+ /*
|
|
+ * Note that we don't allocate the space for a journal entry
|
|
+ * until we write it out - thus, account for it here:
|
|
+ */
|
|
+ while ((unwritten = get_unwritten_sectors(j, &idx))) {
|
|
+ if (unwritten >= sectors) {
|
|
+ if (!buckets) {
|
|
+ sectors = 0;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ buckets--;
|
|
+ sectors = ca->mi.bucket_size;
|
|
+ }
|
|
+
|
|
+ sectors -= unwritten;
|
|
+ }
|
|
+
|
|
+ if (sectors < ca->mi.bucket_size && buckets) {
|
|
+ buckets--;
|
|
+ sectors = ca->mi.bucket_size;
|
|
+ }
|
|
+
|
|
+ return (struct journal_space) {
|
|
+ .next_entry = sectors,
|
|
+ .total = sectors + buckets * ca->mi.bucket_size,
|
|
+ };
|
|
+}
|
|
+
|
|
+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want,
|
|
+ enum journal_space_from from)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i, pos, nr_devs = 0;
|
|
+ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
|
|
+
|
|
+ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
|
|
+
|
|
+ rcu_read_lock();
|
|
+ for_each_member_device_rcu(ca, c, i,
|
|
+ &c->rw_devs[BCH_DATA_journal]) {
|
|
+ if (!ca->journal.nr)
|
|
+ continue;
|
|
+
|
|
+ space = journal_dev_space_available(j, ca, from);
|
|
+ if (!space.next_entry)
|
|
+ continue;
|
|
+
|
|
+ for (pos = 0; pos < nr_devs; pos++)
|
|
+ if (space.total > dev_space[pos].total)
|
|
+ break;
|
|
+
|
|
+ array_insert_item(dev_space, nr_devs, pos, space);
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ if (nr_devs < nr_devs_want)
|
|
+ return (struct journal_space) { 0, 0 };
|
|
+
|
|
+ /*
|
|
+ * We sorted largest to smallest, and we want the smallest out of the
|
|
+ * @nr_devs_want largest devices:
|
|
+ */
|
|
+ return dev_space[nr_devs_want - 1];
|
|
+}
|
|
+
|
|
+void bch2_journal_space_available(struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bch_dev *ca;
|
|
+ unsigned clean, clean_ondisk, total;
|
|
+ s64 u64s_remaining = 0;
|
|
+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
|
|
+ j->buf[1].buf_size >> 9);
|
|
+ unsigned i, nr_online = 0, nr_devs_want;
|
|
+ bool can_discard = false;
|
|
+ int ret = 0;
|
|
+
|
|
+ lockdep_assert_held(&j->lock);
|
|
+
|
|
+ rcu_read_lock();
|
|
+ for_each_member_device_rcu(ca, c, i,
|
|
+ &c->rw_devs[BCH_DATA_journal]) {
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+
|
|
+ if (!ja->nr)
|
|
+ continue;
|
|
+
|
|
+ while (ja->dirty_idx != ja->cur_idx &&
|
|
+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
|
|
+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
|
|
+
|
|
+ while (ja->dirty_idx_ondisk != ja->dirty_idx &&
|
|
+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk)
|
|
+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr;
|
|
+
|
|
+ if (ja->discard_idx != ja->dirty_idx_ondisk)
|
|
+ can_discard = true;
|
|
+
|
|
+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size);
|
|
+ nr_online++;
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ j->can_discard = can_discard;
|
|
+
|
|
+ if (nr_online < c->opts.metadata_replicas_required) {
|
|
+ ret = cur_entry_insufficient_devices;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
|
|
+
|
|
+ for (i = 0; i < journal_space_nr; i++)
|
|
+ j->space[i] = __journal_space_available(j, nr_devs_want, i);
|
|
+
|
|
+ clean_ondisk = j->space[journal_space_clean_ondisk].total;
|
|
+ clean = j->space[journal_space_clean].total;
|
|
+ total = j->space[journal_space_total].total;
|
|
+
|
|
+ if (!clean_ondisk &&
|
|
+ j->reservations.idx ==
|
|
+ j->reservations.unwritten_idx) {
|
|
+ char *buf = kmalloc(4096, GFP_ATOMIC);
|
|
+
|
|
+ bch_err(c, "journal stuck");
|
|
+ if (buf) {
|
|
+ __bch2_journal_debug_to_text(&_PBUF(buf, 4096), j);
|
|
+ pr_err("\n%s", buf);
|
|
+ kfree(buf);
|
|
+ }
|
|
+
|
|
+ bch2_fatal_error(c);
|
|
+ ret = cur_entry_journal_stuck;
|
|
+ } else if (!j->space[journal_space_discarded].next_entry)
|
|
+ ret = cur_entry_journal_full;
|
|
+ else if (!fifo_free(&j->pin))
|
|
+ ret = cur_entry_journal_pin_full;
|
|
+
|
|
+ if ((j->space[journal_space_clean_ondisk].next_entry <
|
|
+ j->space[journal_space_clean_ondisk].total) &&
|
|
+ (clean - clean_ondisk <= total / 8) &&
|
|
+ (clean_ondisk * 2 > clean ))
|
|
+ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
|
|
+ else
|
|
+ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
|
|
+
|
|
+ u64s_remaining = (u64) clean << 6;
|
|
+ u64s_remaining -= (u64) total << 3;
|
|
+ u64s_remaining = max(0LL, u64s_remaining);
|
|
+ u64s_remaining /= 4;
|
|
+ u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
|
|
+out:
|
|
+ j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
|
|
+ j->cur_entry_error = ret;
|
|
+ journal_set_remaining(j, u64s_remaining);
|
|
+ journal_check_may_get_unreserved(j);
|
|
+
|
|
+ if (!ret)
|
|
+ journal_wake(j);
|
|
+}
|
|
+
|
|
+/* Discards - last part of journal reclaim: */
|
|
+
|
|
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
|
|
+{
|
|
+ bool ret;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ ret = ja->discard_idx != ja->dirty_idx_ondisk;
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Advance ja->discard_idx as long as it points to buckets that are no longer
|
|
+ * dirty, issuing discards if necessary:
|
|
+ */
|
|
+void bch2_journal_do_discards(struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bch_dev *ca;
|
|
+ unsigned iter;
|
|
+
|
|
+ mutex_lock(&j->discard_lock);
|
|
+
|
|
+ for_each_rw_member(ca, c, iter) {
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+
|
|
+ while (should_discard_bucket(j, ja)) {
|
|
+ if (ca->mi.discard &&
|
|
+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
|
|
+ blkdev_issue_discard(ca->disk_sb.bdev,
|
|
+ bucket_to_sector(ca,
|
|
+ ja->buckets[ja->discard_idx]),
|
|
+ ca->mi.bucket_size, GFP_NOIO, 0);
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
|
|
+
|
|
+ bch2_journal_space_available(j);
|
|
+ spin_unlock(&j->lock);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&j->discard_lock);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Journal entry pinning - machinery for holding a reference on a given journal
|
|
+ * entry, holding it open to ensure it gets replayed during recovery:
|
|
+ */
|
|
+
|
|
+static void bch2_journal_reclaim_fast(struct journal *j)
|
|
+{
|
|
+ struct journal_entry_pin_list temp;
|
|
+ bool popped = false;
|
|
+
|
|
+ lockdep_assert_held(&j->lock);
|
|
+
|
|
+ /*
|
|
+ * Unpin journal entries whose reference counts reached zero, meaning
|
|
+ * all btree nodes got written out
|
|
+ */
|
|
+ while (!fifo_empty(&j->pin) &&
|
|
+ !atomic_read(&fifo_peek_front(&j->pin).count)) {
|
|
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
|
|
+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).flushed));
|
|
+ BUG_ON(!fifo_pop(&j->pin, temp));
|
|
+ popped = true;
|
|
+ }
|
|
+
|
|
+ if (popped)
|
|
+ bch2_journal_space_available(j);
|
|
+}
|
|
+
|
|
+void __bch2_journal_pin_put(struct journal *j, u64 seq)
|
|
+{
|
|
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
|
|
+
|
|
+ if (atomic_dec_and_test(&pin_list->count))
|
|
+ bch2_journal_reclaim_fast(j);
|
|
+}
|
|
+
|
|
+void bch2_journal_pin_put(struct journal *j, u64 seq)
|
|
+{
|
|
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
|
|
+
|
|
+ if (atomic_dec_and_test(&pin_list->count)) {
|
|
+ spin_lock(&j->lock);
|
|
+ bch2_journal_reclaim_fast(j);
|
|
+ spin_unlock(&j->lock);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void __journal_pin_drop(struct journal *j,
|
|
+ struct journal_entry_pin *pin)
|
|
+{
|
|
+ struct journal_entry_pin_list *pin_list;
|
|
+
|
|
+ if (!journal_pin_active(pin))
|
|
+ return;
|
|
+
|
|
+ if (j->flush_in_progress == pin)
|
|
+ j->flush_in_progress_dropped = true;
|
|
+
|
|
+ pin_list = journal_seq_pin(j, pin->seq);
|
|
+ pin->seq = 0;
|
|
+ list_del_init(&pin->list);
|
|
+
|
|
+ /*
|
|
+ * Unpinning a journal entry may make journal_next_bucket() succeed, if
|
|
+ * writing a new last_seq will now make another bucket available:
|
|
+ */
|
|
+ if (atomic_dec_and_test(&pin_list->count) &&
|
|
+ pin_list == &fifo_peek_front(&j->pin))
|
|
+ bch2_journal_reclaim_fast(j);
|
|
+ else if (fifo_used(&j->pin) == 1 &&
|
|
+ atomic_read(&pin_list->count) == 1)
|
|
+ journal_wake(j);
|
|
+}
|
|
+
|
|
+void bch2_journal_pin_drop(struct journal *j,
|
|
+ struct journal_entry_pin *pin)
|
|
+{
|
|
+ spin_lock(&j->lock);
|
|
+ __journal_pin_drop(j, pin);
|
|
+ spin_unlock(&j->lock);
|
|
+}
|
|
+
|
|
+void bch2_journal_pin_set(struct journal *j, u64 seq,
|
|
+ struct journal_entry_pin *pin,
|
|
+ journal_pin_flush_fn flush_fn)
|
|
+{
|
|
+ struct journal_entry_pin_list *pin_list;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ if (seq < journal_last_seq(j)) {
|
|
+ /*
|
|
+ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
|
|
+ * the src pin - with the pin dropped, the entry to pin might no
|
|
+ * longer exist, but that means there's no longer anything to
|
|
+ * copy and we can bail out here:
|
|
+ */
|
|
+ spin_unlock(&j->lock);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ pin_list = journal_seq_pin(j, seq);
|
|
+
|
|
+ __journal_pin_drop(j, pin);
|
|
+
|
|
+ atomic_inc(&pin_list->count);
|
|
+ pin->seq = seq;
|
|
+ pin->flush = flush_fn;
|
|
+
|
|
+ if (flush_fn == bch2_btree_key_cache_journal_flush)
|
|
+ list_add(&pin->list, &pin_list->key_cache_list);
|
|
+ else if (flush_fn)
|
|
+ list_add(&pin->list, &pin_list->list);
|
|
+ else
|
|
+ list_add(&pin->list, &pin_list->flushed);
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ /*
|
|
+ * If the journal is currently full, we might want to call flush_fn
|
|
+ * immediately:
|
|
+ */
|
|
+ journal_wake(j);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running
|
|
+ */
|
|
+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
|
|
+{
|
|
+ BUG_ON(journal_pin_active(pin));
|
|
+
|
|
+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Journal reclaim: flush references to open journal entries to reclaim space in
|
|
+ * the journal
|
|
+ *
|
|
+ * May be done by the journal code in the background as needed to free up space
|
|
+ * for more journal entries, or as part of doing a clean shutdown, or to migrate
|
|
+ * data off of a specific device:
|
|
+ */
|
|
+
|
|
+static struct journal_entry_pin *
|
|
+journal_get_next_pin(struct journal *j,
|
|
+ bool get_any,
|
|
+ bool get_key_cache,
|
|
+ u64 max_seq, u64 *seq)
|
|
+{
|
|
+ struct journal_entry_pin_list *pin_list;
|
|
+ struct journal_entry_pin *ret = NULL;
|
|
+
|
|
+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) {
|
|
+ if (*seq > max_seq && !get_any && !get_key_cache)
|
|
+ break;
|
|
+
|
|
+ if (*seq <= max_seq || get_any) {
|
|
+ ret = list_first_entry_or_null(&pin_list->list,
|
|
+ struct journal_entry_pin, list);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ if (*seq <= max_seq || get_any || get_key_cache) {
|
|
+ ret = list_first_entry_or_null(&pin_list->key_cache_list,
|
|
+ struct journal_entry_pin, list);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/* returns the number of pins flushed (nonzero if we did work) */
|
|
+static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
|
|
+ unsigned min_any,
|
|
+ unsigned min_key_cache)
|
|
+{
|
|
+ struct journal_entry_pin *pin;
|
|
+ size_t nr_flushed = 0;
|
|
+ journal_pin_flush_fn flush_fn;
|
|
+ u64 seq;
|
|
+ int err;
|
|
+
|
|
+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
|
|
+ return 0;
|
|
+
|
|
+ lockdep_assert_held(&j->reclaim_lock);
|
|
+
|
|
+ while (1) {
|
|
+ cond_resched();
|
|
+
|
|
+ j->last_flushed = jiffies;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ pin = journal_get_next_pin(j,
|
|
+ min_any != 0,
|
|
+ min_key_cache != 0,
|
|
+ seq_to_flush, &seq);
|
|
+ if (pin) {
|
|
+ BUG_ON(j->flush_in_progress);
|
|
+ j->flush_in_progress = pin;
|
|
+ j->flush_in_progress_dropped = false;
|
|
+ flush_fn = pin->flush;
|
|
+ }
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ if (!pin)
|
|
+ break;
|
|
+
|
|
+ if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush)
|
|
+ min_key_cache--;
|
|
+
|
|
+ if (min_any)
|
|
+ min_any--;
|
|
+
|
|
+ err = flush_fn(j, pin, seq);
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ /* Pin might have been dropped or rearmed: */
|
|
+ if (likely(!err && !j->flush_in_progress_dropped))
|
|
+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
|
|
+ j->flush_in_progress = NULL;
|
|
+ j->flush_in_progress_dropped = false;
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ wake_up(&j->pin_flush_wait);
|
|
+
|
|
+ if (err)
|
|
+ break;
|
|
+
|
|
+ nr_flushed++;
|
|
+ }
|
|
+
|
|
+ return nr_flushed;
|
|
+}
|
|
+
|
|
+static u64 journal_seq_to_flush(struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct bch_dev *ca;
|
|
+ u64 seq_to_flush = 0;
|
|
+ unsigned iter;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+
|
|
+ for_each_rw_member(ca, c, iter) {
|
|
+ struct journal_device *ja = &ca->journal;
|
|
+ unsigned nr_buckets, bucket_to_flush;
|
|
+
|
|
+ if (!ja->nr)
|
|
+ continue;
|
|
+
|
|
+ /* Try to keep the journal at most half full: */
|
|
+ nr_buckets = ja->nr / 2;
|
|
+
|
|
+ /* And include pre-reservations: */
|
|
+ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved,
|
|
+ (ca->mi.bucket_size << 6) -
|
|
+ journal_entry_overhead(j));
|
|
+
|
|
+ nr_buckets = min(nr_buckets, ja->nr);
|
|
+
|
|
+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
|
|
+ seq_to_flush = max(seq_to_flush,
|
|
+ ja->bucket_seq[bucket_to_flush]);
|
|
+ }
|
|
+
|
|
+ /* Also flush if the pin fifo is more than half full */
|
|
+ seq_to_flush = max_t(s64, seq_to_flush,
|
|
+ (s64) journal_cur_seq(j) -
|
|
+ (j->pin.size >> 1));
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ return seq_to_flush;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * bch2_journal_reclaim - free up journal buckets
|
|
+ *
|
|
+ * Background journal reclaim writes out btree nodes. It should be run
|
|
+ * early enough so that we never completely run out of journal buckets.
|
|
+ *
|
|
+ * High watermarks for triggering background reclaim:
|
|
+ * - FIFO has fewer than 512 entries left
|
|
+ * - fewer than 25% journal buckets free
|
|
+ *
|
|
+ * Background reclaim runs until low watermarks are reached:
|
|
+ * - FIFO has more than 1024 entries left
|
|
+ * - more than 50% journal buckets free
|
|
+ *
|
|
+ * As long as a reclaim can complete in the time it takes to fill up
|
|
+ * 512 journal entries or 25% of all journal buckets, then
|
|
+ * journal_next_bucket() should not stall.
|
|
+ */
|
|
+static int __bch2_journal_reclaim(struct journal *j, bool direct)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
|
|
+ u64 seq_to_flush;
|
|
+ size_t min_nr, min_key_cache, nr_flushed;
|
|
+ unsigned flags;
|
|
+ int ret = 0;
|
|
+
|
|
+ /*
|
|
+ * We can't invoke memory reclaim while holding the reclaim_lock -
|
|
+ * journal reclaim is required to make progress for memory reclaim
|
|
+ * (cleaning the caches), so we can't get stuck in memory reclaim while
|
|
+ * we're holding the reclaim lock:
|
|
+ */
|
|
+ lockdep_assert_held(&j->reclaim_lock);
|
|
+ flags = memalloc_noreclaim_save();
|
|
+
|
|
+ do {
|
|
+ if (kthread && kthread_should_stop())
|
|
+ break;
|
|
+
|
|
+ if (bch2_journal_error(j)) {
|
|
+ ret = -EIO;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ bch2_journal_do_discards(j);
|
|
+
|
|
+ seq_to_flush = journal_seq_to_flush(j);
|
|
+ min_nr = 0;
|
|
+
|
|
+ /*
|
|
+ * If it's been longer than j->reclaim_delay_ms since we last flushed,
|
|
+ * make sure to flush at least one journal pin:
|
|
+ */
|
|
+ if (time_after(jiffies, j->last_flushed +
|
|
+ msecs_to_jiffies(j->reclaim_delay_ms)))
|
|
+ min_nr = 1;
|
|
+
|
|
+ if (j->prereserved.reserved * 4 > j->prereserved.remaining)
|
|
+ min_nr = 1;
|
|
+
|
|
+ if (fifo_free(&j->pin) <= 32)
|
|
+ min_nr = 1;
|
|
+
|
|
+ trace_journal_reclaim_start(c,
|
|
+ min_nr,
|
|
+ j->prereserved.reserved,
|
|
+ j->prereserved.remaining,
|
|
+ atomic_read(&c->btree_cache.dirty),
|
|
+ c->btree_cache.used,
|
|
+ atomic_long_read(&c->btree_key_cache.nr_dirty),
|
|
+ atomic_long_read(&c->btree_key_cache.nr_keys));
|
|
+
|
|
+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), 128UL);
|
|
+
|
|
+ nr_flushed = journal_flush_pins(j, seq_to_flush,
|
|
+ min_nr, min_key_cache);
|
|
+
|
|
+ if (direct)
|
|
+ j->nr_direct_reclaim += nr_flushed;
|
|
+ else
|
|
+ j->nr_background_reclaim += nr_flushed;
|
|
+ trace_journal_reclaim_finish(c, nr_flushed);
|
|
+
|
|
+ if (nr_flushed)
|
|
+ wake_up(&j->reclaim_wait);
|
|
+ } while ((min_nr || min_key_cache) && !direct);
|
|
+
|
|
+ memalloc_noreclaim_restore(flags);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_journal_reclaim(struct journal *j)
|
|
+{
|
|
+ return __bch2_journal_reclaim(j, true);
|
|
+}
|
|
+
|
|
+static int bch2_journal_reclaim_thread(void *arg)
|
|
+{
|
|
+ struct journal *j = arg;
|
|
+ unsigned long delay, now;
|
|
+ int ret = 0;
|
|
+
|
|
+ set_freezable();
|
|
+
|
|
+ kthread_wait_freezable(test_bit(JOURNAL_RECLAIM_STARTED, &j->flags));
|
|
+
|
|
+ j->last_flushed = jiffies;
|
|
+
|
|
+ while (!ret && !kthread_should_stop()) {
|
|
+ j->reclaim_kicked = false;
|
|
+
|
|
+ mutex_lock(&j->reclaim_lock);
|
|
+ ret = __bch2_journal_reclaim(j, false);
|
|
+ mutex_unlock(&j->reclaim_lock);
|
|
+
|
|
+ now = jiffies;
|
|
+ delay = msecs_to_jiffies(j->reclaim_delay_ms);
|
|
+ j->next_reclaim = j->last_flushed + delay;
|
|
+
|
|
+ if (!time_in_range(j->next_reclaim, now, now + delay))
|
|
+ j->next_reclaim = now + delay;
|
|
+
|
|
+ while (1) {
|
|
+ set_current_state(TASK_INTERRUPTIBLE);
|
|
+ if (kthread_should_stop())
|
|
+ break;
|
|
+ if (j->reclaim_kicked)
|
|
+ break;
|
|
+ if (time_after_eq(jiffies, j->next_reclaim))
|
|
+ break;
|
|
+ freezable_schedule_timeout(j->next_reclaim - jiffies);
|
|
+
|
|
+ }
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_journal_reclaim_stop(struct journal *j)
|
|
+{
|
|
+ struct task_struct *p = j->reclaim_thread;
|
|
+
|
|
+ j->reclaim_thread = NULL;
|
|
+
|
|
+ if (p) {
|
|
+ kthread_stop(p);
|
|
+ put_task_struct(p);
|
|
+ }
|
|
+}
|
|
+
|
|
+int bch2_journal_reclaim_start(struct journal *j)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct task_struct *p;
|
|
+
|
|
+ if (j->reclaim_thread)
|
|
+ return 0;
|
|
+
|
|
+ p = kthread_create(bch2_journal_reclaim_thread, j,
|
|
+ "bch-reclaim/%s", c->name);
|
|
+ if (IS_ERR(p)) {
|
|
+ bch_err(c, "error creating journal reclaim thread: %li", PTR_ERR(p));
|
|
+ return PTR_ERR(p);
|
|
+ }
|
|
+
|
|
+ get_task_struct(p);
|
|
+ j->reclaim_thread = p;
|
|
+ wake_up_process(p);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int journal_flush_done(struct journal *j, u64 seq_to_flush,
|
|
+ bool *did_work)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_journal_error(j);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ mutex_lock(&j->reclaim_lock);
|
|
+
|
|
+ *did_work = journal_flush_pins(j, seq_to_flush, 0, 0) != 0;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ /*
|
|
+ * If journal replay hasn't completed, the unreplayed journal entries
|
|
+ * hold refs on their corresponding sequence numbers
|
|
+ */
|
|
+ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
|
|
+ journal_last_seq(j) > seq_to_flush ||
|
|
+ (fifo_used(&j->pin) == 1 &&
|
|
+ atomic_read(&fifo_peek_front(&j->pin).count) == 1);
|
|
+
|
|
+ spin_unlock(&j->lock);
|
|
+ mutex_unlock(&j->reclaim_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
|
|
+{
|
|
+ bool did_work = false;
|
|
+
|
|
+ if (!test_bit(JOURNAL_STARTED, &j->flags))
|
|
+ return false;
|
|
+
|
|
+ closure_wait_event(&j->async_wait,
|
|
+ journal_flush_done(j, seq_to_flush, &did_work));
|
|
+
|
|
+ return did_work;
|
|
+}
|
|
+
|
|
+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
|
|
+{
|
|
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
|
|
+ struct journal_entry_pin_list *p;
|
|
+ u64 iter, seq = 0;
|
|
+ int ret = 0;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ fifo_for_each_entry_ptr(p, &j->pin, iter)
|
|
+ if (dev_idx >= 0
|
|
+ ? bch2_dev_list_has_dev(p->devs, dev_idx)
|
|
+ : p->devs.nr < c->opts.metadata_replicas)
|
|
+ seq = iter;
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ bch2_journal_flush_pins(j, seq);
|
|
+
|
|
+ ret = bch2_journal_error(j);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ mutex_lock(&c->replicas_gc_lock);
|
|
+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
|
|
+
|
|
+ seq = 0;
|
|
+
|
|
+ spin_lock(&j->lock);
|
|
+ while (!ret && seq < j->pin.back) {
|
|
+ struct bch_replicas_padded replicas;
|
|
+
|
|
+ seq = max(seq, journal_last_seq(j));
|
|
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
|
|
+ journal_seq_pin(j, seq)->devs);
|
|
+ seq++;
|
|
+
|
|
+ spin_unlock(&j->lock);
|
|
+ ret = bch2_mark_replicas(c, &replicas.e);
|
|
+ spin_lock(&j->lock);
|
|
+ }
|
|
+ spin_unlock(&j->lock);
|
|
+
|
|
+ ret = bch2_replicas_gc_end(c, ret);
|
|
+ mutex_unlock(&c->replicas_gc_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
|
|
new file mode 100644
|
|
index 000000000000..0fd1af120db5
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal_reclaim.h
|
|
@@ -0,0 +1,86 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
|
|
+#define _BCACHEFS_JOURNAL_RECLAIM_H
|
|
+
|
|
+#define JOURNAL_PIN (32 * 1024)
|
|
+
|
|
+static inline void journal_reclaim_kick(struct journal *j)
|
|
+{
|
|
+ struct task_struct *p = READ_ONCE(j->reclaim_thread);
|
|
+
|
|
+ j->reclaim_kicked = true;
|
|
+ if (p)
|
|
+ wake_up_process(p);
|
|
+}
|
|
+
|
|
+unsigned bch2_journal_dev_buckets_available(struct journal *,
|
|
+ struct journal_device *,
|
|
+ enum journal_space_from);
|
|
+void bch2_journal_space_available(struct journal *);
|
|
+
|
|
+static inline bool journal_pin_active(struct journal_entry_pin *pin)
|
|
+{
|
|
+ return pin->seq != 0;
|
|
+}
|
|
+
|
|
+static inline struct journal_entry_pin_list *
|
|
+journal_seq_pin(struct journal *j, u64 seq)
|
|
+{
|
|
+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back);
|
|
+
|
|
+ return &j->pin.data[seq & j->pin.mask];
|
|
+}
|
|
+
|
|
+void __bch2_journal_pin_put(struct journal *, u64);
|
|
+void bch2_journal_pin_put(struct journal *, u64);
|
|
+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
|
|
+
|
|
+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *,
|
|
+ journal_pin_flush_fn);
|
|
+
|
|
+static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
|
|
+ struct journal_entry_pin *pin,
|
|
+ journal_pin_flush_fn flush_fn)
|
|
+{
|
|
+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq))
|
|
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
|
|
+}
|
|
+
|
|
+static inline void bch2_journal_pin_copy(struct journal *j,
|
|
+ struct journal_entry_pin *dst,
|
|
+ struct journal_entry_pin *src,
|
|
+ journal_pin_flush_fn flush_fn)
|
|
+{
|
|
+ /* Guard against racing with journal_pin_drop(src): */
|
|
+ u64 seq = READ_ONCE(src->seq);
|
|
+
|
|
+ if (seq)
|
|
+ bch2_journal_pin_add(j, seq, dst, flush_fn);
|
|
+}
|
|
+
|
|
+static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
|
|
+ struct journal_entry_pin *pin,
|
|
+ journal_pin_flush_fn flush_fn)
|
|
+{
|
|
+ if (unlikely(!journal_pin_active(pin) || pin->seq < seq))
|
|
+ bch2_journal_pin_set(j, seq, pin, flush_fn);
|
|
+}
|
|
+
|
|
+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
|
|
+
|
|
+void bch2_journal_do_discards(struct journal *);
|
|
+int bch2_journal_reclaim(struct journal *);
|
|
+
|
|
+void bch2_journal_reclaim_stop(struct journal *);
|
|
+int bch2_journal_reclaim_start(struct journal *);
|
|
+
|
|
+bool bch2_journal_flush_pins(struct journal *, u64);
|
|
+
|
|
+static inline bool bch2_journal_flush_all_pins(struct journal *j)
|
|
+{
|
|
+ return bch2_journal_flush_pins(j, U64_MAX);
|
|
+}
|
|
+
|
|
+int bch2_journal_flush_device_pins(struct journal *, int);
|
|
+
|
|
+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
|
|
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
|
|
new file mode 100644
|
|
index 000000000000..e1b63f3879f4
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal_seq_blacklist.c
|
|
@@ -0,0 +1,308 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_iter.h"
|
|
+#include "eytzinger.h"
|
|
+#include "journal_seq_blacklist.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+/*
|
|
+ * journal_seq_blacklist machinery:
|
|
+ *
|
|
+ * To guarantee order of btree updates after a crash, we need to detect when a
|
|
+ * btree node entry (bset) is newer than the newest journal entry that was
|
|
+ * successfully written, and ignore it - effectively ignoring any btree updates
|
|
+ * that didn't make it into the journal.
|
|
+ *
|
|
+ * If we didn't do this, we might have two btree nodes, a and b, both with
|
|
+ * updates that weren't written to the journal yet: if b was updated after a,
|
|
+ * but b was flushed and not a - oops; on recovery we'll find that the updates
|
|
+ * to b happened, but not the updates to a that happened before it.
|
|
+ *
|
|
+ * Ignoring bsets that are newer than the newest journal entry is always safe,
|
|
+ * because everything they contain will also have been journalled - and must
|
|
+ * still be present in the journal on disk until a journal entry has been
|
|
+ * written _after_ that bset was written.
|
|
+ *
|
|
+ * To accomplish this, bsets record the newest journal sequence number they
|
|
+ * contain updates for; then, on startup, the btree code queries the journal
|
|
+ * code to ask "Is this sequence number newer than the newest journal entry? If
|
|
+ * so, ignore it."
|
|
+ *
|
|
+ * When this happens, we must blacklist that journal sequence number: the
|
|
+ * journal must not write any entries with that sequence number, and it must
|
|
+ * record that it was blacklisted so that a) on recovery we don't think we have
|
|
+ * missing journal entries and b) so that the btree code continues to ignore
|
|
+ * that bset, until that btree node is rewritten.
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * Size, in u64s, of a journal_seq_blacklist superblock field that holds @nr
|
|
+ * entries: the field header plus @nr elements of its start[] array.
|
|
+ */
|
|
+static unsigned sb_blacklist_u64s(unsigned nr)
|
|
+{
|
|
+ /* Only used as an operand of sizeof, never dereferenced: */
|
|
+ struct bch_sb_field_journal_seq_blacklist *bl;
|
|
+ size_t bytes = sizeof(*bl) + sizeof(bl->start[0]) * nr;
|
|
+
|
|
+ return bytes / sizeof(u64);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Merge superblock blacklist entry @i with its successor @i + 1 if entry @i's
|
|
+ * end is >= entry @i + 1's start. The caller must ensure that @i + 1 is a
|
|
+ * valid index.
|
|
+ *
|
|
+ * On a merge, entry @i keeps its own start and takes the larger of the two
|
|
+ * ends; entry @i + 1 is then removed by shifting the rest of the array down by
|
|
+ * one, and the superblock field is resized to the new entry count.
|
|
+ *
|
|
+ * Returns the field pointer to use from now on: @bl itself if nothing was
|
|
+ * merged, otherwise whatever bch2_sb_resize_journal_seq_blacklist() returned.
|
|
+ */
|
|
+static struct bch_sb_field_journal_seq_blacklist *
|
|
+blacklist_entry_try_merge(struct bch_fs *c,
|
|
+ struct bch_sb_field_journal_seq_blacklist *bl,
|
|
+ unsigned i)
|
|
+{
|
|
+ unsigned nr = blacklist_nr_entries(bl);
|
|
+
|
|
+ if (le64_to_cpu(bl->start[i].end) >=
|
|
+ le64_to_cpu(bl->start[i + 1].start)) {
|
|
+ /* Never shrink the merged range: only take a larger end */
|
|
+ if (le64_to_cpu(bl->start[i + 1].end) >
|
|
+ le64_to_cpu(bl->start[i].end))
|
|
+ bl->start[i].end = bl->start[i + 1].end;
|
|
+ --nr;
|
|
+ /*
|
|
+ * Remove entry i + 1, not entry i: copying onto start[i] would
|
|
+ * throw away the merged entry's start. nr is already the new
|
|
+ * count, so nr - i - 1 entries follow the one being removed.
|
|
+ */
|
|
+ memmove(&bl->start[i + 1],
|
|
+ &bl->start[i + 2],
|
|
+ sizeof(bl->start[0]) * (nr - i - 1));
|
|
+
|
|
+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
|
|
+ sb_blacklist_u64s(nr));
|
|
+ BUG_ON(!bl);
|
|
+ }
|
|
+
|
|
+ return bl;
|
|
+}
|
|
+
|
|
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
|
|
+{
|
|
+ struct bch_sb_field_journal_seq_blacklist *bl;
|
|
+ unsigned i, nr;
|
|
+ int ret = 0;
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
|
|
+ nr = blacklist_nr_entries(bl);
|
|
+
|
|
+ if (bl) {
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ struct journal_seq_blacklist_entry *e =
|
|
+ bl->start + i;
|
|
+
|
|
+ if (start == le64_to_cpu(e->start) &&
|
|
+ end == le64_to_cpu(e->end))
|
|
+ goto out;
|
|
+
|
|
+ if (start <= le64_to_cpu(e->start) &&
|
|
+ end >= le64_to_cpu(e->end)) {
|
|
+ e->start = cpu_to_le64(start);
|
|
+ e->end = cpu_to_le64(end);
|
|
+
|
|
+ if (i + 1 < nr)
|
|
+ bl = blacklist_entry_try_merge(c,
|
|
+ bl, i);
|
|
+ if (i)
|
|
+ bl = blacklist_entry_try_merge(c,
|
|
+ bl, i - 1);
|
|
+ goto out_write_sb;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
|
|
+ sb_blacklist_u64s(nr + 1));
|
|
+ if (!bl) {
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ bl->start[nr].start = cpu_to_le64(start);
|
|
+ bl->start[nr].end = cpu_to_le64(end);
|
|
+out_write_sb:
|
|
+ c->disk_sb.sb->features[0] |=
|
|
+ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3;
|
|
+
|
|
+ ret = bch2_write_super(c);
|
|
+out:
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ return ret ?: bch2_blacklist_table_initialize(c);
|
|
+}
|
|
+
|
|
+static int journal_seq_blacklist_table_cmp(const void *_l,
|
|
+ const void *_r, size_t size)
|
|
+{
|
|
+ const struct journal_seq_blacklist_table_entry *l = _l;
|
|
+ const struct journal_seq_blacklist_table_entry *r = _r;
|
|
+
|
|
+ return cmp_int(l->start, r->start);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check whether journal sequence number @seq is covered by an entry of the
|
|
+ * in-memory blacklist table (c->journal_seq_blacklist_table).
|
|
+ *
|
|
+ * Returns true if an entry with start <= @seq < end is found, false if there
|
|
+ * is no table or no such entry. If @dirty is set and a match is found, the
|
|
+ * matching table entry is flagged dirty; bch2_blacklist_entries_gc() keeps
|
|
+ * only the entries whose dirty flag is set.
|
|
+ */
|
|
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq,
|
|
+ bool dirty)
|
|
+{
|
|
+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table;
|
|
+ struct journal_seq_blacklist_table_entry search = { .start = seq };
|
|
+ int idx;
|
|
+
|
|
+ if (!t)
|
|
+ return false;
|
|
+
|
|
+ /* Look up by start only; a negative index means no candidate entry */
|
|
+ idx = eytzinger0_find_le(t->entries, t->nr,
|
|
+ sizeof(t->entries[0]),
|
|
+ journal_seq_blacklist_table_cmp,
|
|
+ &search);
|
|
+ if (idx < 0)
|
|
+ return false;
|
|
+
|
|
+ /* The lookup is expected to return an entry starting at or before seq */
|
|
+ BUG_ON(t->entries[idx].start > seq);
|
|
+
|
|
+ /* end is exclusive: seq == end is not blacklisted */
|
|
+ if (seq >= t->entries[idx].end)
|
|
+ return false;
|
|
+
|
|
+ if (dirty)
|
|
+ t->entries[idx].dirty = true;
|
|
+ return true;
|
|
+}
|
|
+
|
|
+int bch2_blacklist_table_initialize(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_sb_field_journal_seq_blacklist *bl =
|
|
+ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
|
|
+ struct journal_seq_blacklist_table *t;
|
|
+ unsigned i, nr = blacklist_nr_entries(bl);
|
|
+
|
|
+ if (!bl)
|
|
+ return 0;
|
|
+
|
|
+ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
|
|
+ GFP_KERNEL);
|
|
+ if (!t)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ t->nr = nr;
|
|
+
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ t->entries[i].start = le64_to_cpu(bl->start[i].start);
|
|
+ t->entries[i].end = le64_to_cpu(bl->start[i].end);
|
|
+ }
|
|
+
|
|
+ eytzinger0_sort(t->entries,
|
|
+ t->nr,
|
|
+ sizeof(t->entries[0]),
|
|
+ journal_seq_blacklist_table_cmp,
|
|
+ NULL);
|
|
+
|
|
+ kfree(c->journal_seq_blacklist_table);
|
|
+ c->journal_seq_blacklist_table = t;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static const char *
|
|
+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb,
|
|
+ struct bch_sb_field *f)
|
|
+{
|
|
+ struct bch_sb_field_journal_seq_blacklist *bl =
|
|
+ field_to_type(f, journal_seq_blacklist);
|
|
+ struct journal_seq_blacklist_entry *i;
|
|
+ unsigned nr = blacklist_nr_entries(bl);
|
|
+
|
|
+ for (i = bl->start; i < bl->start + nr; i++) {
|
|
+ if (le64_to_cpu(i->start) >=
|
|
+ le64_to_cpu(i->end))
|
|
+ return "entry start >= end";
|
|
+
|
|
+ if (i + 1 < bl->start + nr &&
|
|
+ le64_to_cpu(i[0].end) >
|
|
+ le64_to_cpu(i[1].start))
|
|
+ return "entries out of order";
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out,
|
|
+ struct bch_sb *sb,
|
|
+ struct bch_sb_field *f)
|
|
+{
|
|
+ struct bch_sb_field_journal_seq_blacklist *bl =
|
|
+ field_to_type(f, journal_seq_blacklist);
|
|
+ struct journal_seq_blacklist_entry *i;
|
|
+ unsigned nr = blacklist_nr_entries(bl);
|
|
+
|
|
+ for (i = bl->start; i < bl->start + nr; i++) {
|
|
+ if (i != bl->start)
|
|
+ pr_buf(out, " ");
|
|
+
|
|
+ pr_buf(out, "%llu-%llu",
|
|
+ le64_to_cpu(i->start),
|
|
+ le64_to_cpu(i->end));
|
|
+ }
|
|
+}
|
|
+
|
|
+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = {
|
|
+ .validate = bch2_sb_journal_seq_blacklist_validate,
|
|
+ .to_text = bch2_sb_journal_seq_blacklist_to_text
|
|
+};
|
|
+
|
|
+void bch2_blacklist_entries_gc(struct work_struct *work)
|
|
+{
|
|
+ struct bch_fs *c = container_of(work, struct bch_fs,
|
|
+ journal_seq_blacklist_gc_work);
|
|
+ struct journal_seq_blacklist_table *t;
|
|
+ struct bch_sb_field_journal_seq_blacklist *bl;
|
|
+ struct journal_seq_blacklist_entry *src, *dst;
|
|
+ struct btree_trans trans;
|
|
+ unsigned i, nr, new_nr;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++) {
|
|
+ struct btree_iter *iter;
|
|
+ struct btree *b;
|
|
+
|
|
+ for_each_btree_node(&trans, iter, i, POS_MIN,
|
|
+ BTREE_ITER_PREFETCH, b)
|
|
+ if (test_bit(BCH_FS_STOPPING, &c->flags)) {
|
|
+ bch2_trans_exit(&trans);
|
|
+ return;
|
|
+ }
|
|
+ bch2_trans_iter_free(&trans, iter);
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_exit(&trans);
|
|
+ if (ret)
|
|
+ return;
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb);
|
|
+ if (!bl)
|
|
+ goto out;
|
|
+
|
|
+ nr = blacklist_nr_entries(bl);
|
|
+ dst = bl->start;
|
|
+
|
|
+ t = c->journal_seq_blacklist_table;
|
|
+ BUG_ON(nr != t->nr);
|
|
+
|
|
+ for (src = bl->start, i = eytzinger0_first(t->nr);
|
|
+ src < bl->start + nr;
|
|
+ src++, i = eytzinger0_next(i, nr)) {
|
|
+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start));
|
|
+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end));
|
|
+
|
|
+ if (t->entries[i].dirty)
|
|
+ *dst++ = *src;
|
|
+ }
|
|
+
|
|
+ new_nr = dst - bl->start;
|
|
+
|
|
+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr);
|
|
+
|
|
+ if (new_nr != nr) {
|
|
+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb,
|
|
+ new_nr ? sb_blacklist_u64s(new_nr) : 0);
|
|
+ BUG_ON(new_nr && !bl);
|
|
+
|
|
+ if (!new_nr)
|
|
+ c->disk_sb.sb->features[0] &=
|
|
+ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
|
|
+
|
|
+ bch2_write_super(c);
|
|
+ }
|
|
+out:
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+}
|
|
diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h
|
|
new file mode 100644
|
|
index 000000000000..afb886ec8e25
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal_seq_blacklist.h
|
|
@@ -0,0 +1,22 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
|
|
+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
|
|
+
|
|
+static inline unsigned
|
|
+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl)
|
|
+{
|
|
+ return bl
|
|
+ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) /
|
|
+ sizeof(struct journal_seq_blacklist_entry))
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool);
|
|
+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64);
|
|
+int bch2_blacklist_table_initialize(struct bch_fs *);
|
|
+
|
|
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist;
|
|
+
|
|
+void bch2_blacklist_entries_gc(struct work_struct *);
|
|
+
|
|
+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */
|
|
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
|
|
new file mode 100644
|
|
index 000000000000..a7aa12e919e2
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/journal_types.h
|
|
@@ -0,0 +1,323 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_JOURNAL_TYPES_H
|
|
+#define _BCACHEFS_JOURNAL_TYPES_H
|
|
+
|
|
+#include <linux/cache.h>
|
|
+#include <linux/workqueue.h>
|
|
+
|
|
+#include "alloc_types.h"
|
|
+#include "super_types.h"
|
|
+#include "fifo.h"
|
|
+
|
|
+#define JOURNAL_BUF_BITS 2
|
|
+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS)
|
|
+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1)
|
|
+
|
|
+/*
|
|
+ * We put JOURNAL_BUF_NR of these in struct journal; we use them for writes to
|
|
+ * the journal that are being staged or in flight.
|
|
+ */
|
|
+struct journal_buf {
|
|
+ struct jset *data;
|
|
+
|
|
+ __BKEY_PADDED(key, BCH_REPLICAS_MAX);
|
|
+
|
|
+ struct closure_waitlist wait;
|
|
+
|
|
+ unsigned buf_size; /* size in bytes of @data */
|
|
+ unsigned sectors; /* maximum size for current entry */
|
|
+ unsigned disk_sectors; /* maximum size entry could have been, if
|
|
+ buf_size was bigger */
|
|
+ unsigned u64s_reserved;
|
|
+ bool noflush; /* write has already been kicked off, and was noflush */
|
|
+ bool must_flush; /* something wants a flush */
|
|
+ bool separate_flush;
|
|
+ /* bloom filter: */
|
|
+ unsigned long has_inode[1024 / sizeof(unsigned long)];
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Something that makes a journal entry dirty - i.e. a btree node that has to be
|
|
+ * flushed:
|
|
+ */
|
|
+
|
|
+struct journal_entry_pin_list {
|
|
+ struct list_head list;
|
|
+ struct list_head key_cache_list;
|
|
+ struct list_head flushed;
|
|
+ atomic_t count;
|
|
+ struct bch_devs_list devs;
|
|
+};
|
|
+
|
|
+struct journal;
|
|
+struct journal_entry_pin;
|
|
+typedef int (*journal_pin_flush_fn)(struct journal *j,
|
|
+ struct journal_entry_pin *, u64);
|
|
+
|
|
+struct journal_entry_pin {
|
|
+ struct list_head list;
|
|
+ journal_pin_flush_fn flush;
|
|
+ u64 seq;
|
|
+};
|
|
+
|
|
+struct journal_res {
|
|
+ bool ref;
|
|
+ u8 idx;
|
|
+ u16 u64s;
|
|
+ u32 offset;
|
|
+ u64 seq;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * For reserving space in the journal prior to getting a reservation on a
|
|
+ * particular journal entry:
|
|
+ */
|
|
+struct journal_preres {
|
|
+ unsigned u64s;
|
|
+};
|
|
+
|
|
+union journal_res_state {
|
|
+ struct {
|
|
+ atomic64_t counter;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ u64 v;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ u64 cur_entry_offset:20,
|
|
+ idx:2,
|
|
+ unwritten_idx:2,
|
|
+ buf0_count:10,
|
|
+ buf1_count:10,
|
|
+ buf2_count:10,
|
|
+ buf3_count:10;
|
|
+ };
|
|
+};
|
|
+
|
|
+union journal_preres_state {
|
|
+ struct {
|
|
+ atomic64_t counter;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ u64 v;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ u64 waiting:1,
|
|
+ reserved:31,
|
|
+ remaining:32;
|
|
+ };
|
|
+};
|
|
+
|
|
+/* bytes: */
|
|
+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
|
|
+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
|
|
+
|
|
+/*
|
|
+ * We stash some journal state as sentinel values in cur_entry_offset:
|
|
+ * note - cur_entry_offset is in units of u64s
|
|
+ */
|
|
+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1)
|
|
+
|
|
+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1)
|
|
+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX)
|
|
+
|
|
+struct journal_space {
|
|
+ /* Units of 512 bytes sectors: */
|
|
+ unsigned next_entry; /* How big the next journal entry can be */
|
|
+ unsigned total;
|
|
+};
|
|
+
|
|
+enum journal_space_from {
|
|
+ journal_space_discarded,
|
|
+ journal_space_clean_ondisk,
|
|
+ journal_space_clean,
|
|
+ journal_space_total,
|
|
+ journal_space_nr,
|
|
+};
|
|
+
|
|
+/*
|
|
+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP,
|
|
+ * either because something's waiting on the write to complete or because it's
|
|
+ * been dirty too long and the timer's expired.
|
|
+ */
|
|
+
|
|
+enum {
|
|
+ JOURNAL_REPLAY_DONE,
|
|
+ JOURNAL_STARTED,
|
|
+ JOURNAL_RECLAIM_STARTED,
|
|
+ JOURNAL_NEED_WRITE,
|
|
+ JOURNAL_MAY_GET_UNRESERVED,
|
|
+ JOURNAL_MAY_SKIP_FLUSH,
|
|
+};
|
|
+
|
|
+/* Embedded in struct bch_fs */
|
|
+struct journal {
|
|
+ /* Fastpath stuff up front: */
|
|
+
|
|
+ unsigned long flags;
|
|
+
|
|
+ union journal_res_state reservations;
|
|
+
|
|
+ /* Max size of current journal entry */
|
|
+ unsigned cur_entry_u64s;
|
|
+ unsigned cur_entry_sectors;
|
|
+
|
|
+ /*
|
|
+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
|
|
+ * insufficient devices:
|
|
+ */
|
|
+ enum {
|
|
+ cur_entry_ok,
|
|
+ cur_entry_blocked,
|
|
+ cur_entry_journal_full,
|
|
+ cur_entry_journal_pin_full,
|
|
+ cur_entry_journal_stuck,
|
|
+ cur_entry_insufficient_devices,
|
|
+ } cur_entry_error;
|
|
+
|
|
+ union journal_preres_state prereserved;
|
|
+
|
|
+ /* Reserved space in journal entry to be used just prior to write */
|
|
+ unsigned entry_u64s_reserved;
|
|
+
|
|
+ unsigned buf_size_want;
|
|
+
|
|
+ /*
|
|
+ * Two journal entries -- one is currently open for new entries, the
|
|
+ * other is possibly being written out.
|
|
+ */
|
|
+ struct journal_buf buf[JOURNAL_BUF_NR];
|
|
+
|
|
+ spinlock_t lock;
|
|
+
|
|
+ /* if nonzero, we may not open a new journal entry: */
|
|
+ unsigned blocked;
|
|
+
|
|
+ /* Used when waiting because the journal was full */
|
|
+ wait_queue_head_t wait;
|
|
+ struct closure_waitlist async_wait;
|
|
+ struct closure_waitlist preres_wait;
|
|
+
|
|
+ struct closure io;
|
|
+ struct delayed_work write_work;
|
|
+
|
|
+ /* Sequence number of most recent journal entry (last entry in @pin) */
|
|
+ atomic64_t seq;
|
|
+
|
|
+ /* seq, last_seq from the most recent journal entry successfully written */
|
|
+ u64 seq_ondisk;
|
|
+ u64 flushed_seq_ondisk;
|
|
+ u64 last_seq_ondisk;
|
|
+ u64 err_seq;
|
|
+ u64 last_empty_seq;
|
|
+
|
|
+ /*
|
|
+ * FIFO of journal entries whose btree updates have not yet been
|
|
+ * written out.
|
|
+ *
|
|
+ * Each entry is a reference count. The position in the FIFO is the
|
|
+ * entry's sequence number relative to @seq.
|
|
+ *
|
|
+ * The journal entry itself holds a reference count, put when the
|
|
+ * journal entry is written out. Each btree node modified by the journal
|
|
+ * entry also holds a reference count, put when the btree node is
|
|
+ * written.
|
|
+ *
|
|
+ * When a reference count reaches zero, the journal entry is no longer
|
|
+ * needed. When all journal entries in the oldest journal bucket are no
|
|
+ * longer needed, the bucket can be discarded and reused.
|
|
+ */
|
|
+ struct {
|
|
+ u64 front, back, size, mask;
|
|
+ struct journal_entry_pin_list *data;
|
|
+ } pin;
|
|
+
|
|
+ struct journal_space space[journal_space_nr];
|
|
+
|
|
+ u64 replay_journal_seq;
|
|
+ u64 replay_journal_seq_end;
|
|
+
|
|
+ struct write_point wp;
|
|
+ spinlock_t err_lock;
|
|
+
|
|
+ struct mutex reclaim_lock;
|
|
+ wait_queue_head_t reclaim_wait;
|
|
+ struct task_struct *reclaim_thread;
|
|
+ bool reclaim_kicked;
|
|
+ unsigned long next_reclaim;
|
|
+ u64 nr_direct_reclaim;
|
|
+ u64 nr_background_reclaim;
|
|
+
|
|
+ unsigned long last_flushed;
|
|
+ struct journal_entry_pin *flush_in_progress;
|
|
+ bool flush_in_progress_dropped;
|
|
+ wait_queue_head_t pin_flush_wait;
|
|
+
|
|
+ /* protects advancing ja->discard_idx: */
|
|
+ struct mutex discard_lock;
|
|
+ bool can_discard;
|
|
+
|
|
+ unsigned write_delay_ms;
|
|
+ unsigned reclaim_delay_ms;
|
|
+ unsigned long last_flush_write;
|
|
+
|
|
+ u64 res_get_blocked_start;
|
|
+ u64 need_write_time;
|
|
+ u64 write_start_time;
|
|
+
|
|
+ u64 nr_flush_writes;
|
|
+ u64 nr_noflush_writes;
|
|
+
|
|
+ struct time_stats *write_time;
|
|
+ struct time_stats *delay_time;
|
|
+ struct time_stats *blocked_time;
|
|
+ struct time_stats *flush_seq_time;
|
|
+
|
|
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
|
+ struct lockdep_map res_map;
|
|
+#endif
|
|
+};
|
|
+
|
|
+/*
|
|
+ * Embedded in struct bch_dev. First three fields refer to the array of journal
|
|
+ * buckets, in bch_sb.
|
|
+ */
|
|
+struct journal_device {
|
|
+ /*
|
|
+ * For each journal bucket, contains the max sequence number of the
|
|
+ * journal writes it contains - so we know when a bucket can be reused.
|
|
+ */
|
|
+ u64 *bucket_seq;
|
|
+
|
|
+ unsigned sectors_free;
|
|
+
|
|
+ /*
|
|
+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx:
|
|
+ */
|
|
+ unsigned discard_idx; /* Next bucket to discard */
|
|
+ unsigned dirty_idx_ondisk;
|
|
+ unsigned dirty_idx;
|
|
+ unsigned cur_idx; /* Journal bucket we're currently writing to */
|
|
+ unsigned nr;
|
|
+
|
|
+ u64 *buckets;
|
|
+
|
|
+ /* Bio for journal reads/writes to this device */
|
|
+ struct bio *bio;
|
|
+
|
|
+ /* for bch_journal_read_device */
|
|
+ struct closure read;
|
|
+};
|
|
+
|
|
+/*
|
|
+ * journal_entry_res - reserve space in every journal entry:
|
|
+ */
|
|
+struct journal_entry_res {
|
|
+ unsigned u64s;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_JOURNAL_TYPES_H */
|
|
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
|
|
new file mode 100644
|
|
index 000000000000..cda77835b9ea
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/keylist.c
|
|
@@ -0,0 +1,67 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "keylist.h"
|
|
+
|
|
+/*
|
|
+ * Make room in keylist @l for @new_u64s more u64s.
|
|
+ *
|
|
+ * @inline_u64s/@nr_inline_u64s describe the caller's inline buffer; the list
|
|
+ * is treated as heap-backed whenever l->keys_p does not point at it. The
|
|
+ * required size (current contents plus @new_u64s) is rounded up to a power of
|
|
+ * two.
|
|
+ *
|
|
+ * Returns 0 without allocating if that size fits in the inline buffer, or if
|
|
+ * the list is heap-backed and the current contents already round up to the
|
|
+ * same size. Otherwise the buffer is (re)allocated with GFP_NOIO; returns
|
|
+ * -ENOMEM on allocation failure, in which case @l is left unmodified.
|
|
+ */
|
|
+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s,
|
|
+ size_t nr_inline_u64s, size_t new_u64s)
|
|
+{
|
|
+ size_t oldsize = bch2_keylist_u64s(l);
|
|
+ size_t newsize = oldsize + new_u64s;
|
|
+ /* NULL when still on the inline buffer, so krealloc() allocates fresh */
|
|
+ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p;
|
|
+ u64 *new_keys;
|
|
+
|
|
+ newsize = roundup_pow_of_two(newsize);
|
|
+
|
|
+ if (newsize <= nr_inline_u64s ||
|
|
+ (old_buf && roundup_pow_of_two(oldsize) == newsize))
|
|
+ return 0;
|
|
+
|
|
+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO);
|
|
+ if (!new_keys)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ /* Moving off the inline buffer: carry the existing keys over */
|
|
+ if (!old_buf)
|
|
+ memcpy_u64s(new_keys, inline_u64s, oldsize);
|
|
+
|
|
+ l->keys_p = new_keys;
|
|
+ l->top_p = new_keys + oldsize;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert)
|
|
+{
|
|
+ struct bkey_i *where;
|
|
+
|
|
+ for_each_keylist_key(l, where)
|
|
+ if (bkey_cmp(insert->k.p, where->k.p) < 0)
|
|
+ break;
|
|
+
|
|
+ memmove_u64s_up((u64 *) where + insert->k.u64s,
|
|
+ where,
|
|
+ ((u64 *) l->top) - ((u64 *) where));
|
|
+
|
|
+ l->top_p += insert->k.u64s;
|
|
+ bkey_copy(where, insert);
|
|
+}
|
|
+
|
|
+void bch2_keylist_pop_front(struct keylist *l)
|
|
+{
|
|
+ l->top_p -= bch2_keylist_front(l)->k.u64s;
|
|
+
|
|
+ memmove_u64s_down(l->keys,
|
|
+ bkey_next(l->keys),
|
|
+ bch2_keylist_u64s(l));
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_verify_keylist_sorted(struct keylist *l)
|
|
+{
|
|
+ struct bkey_i *k;
|
|
+
|
|
+ for_each_keylist_key(l, k)
|
|
+ BUG_ON(bkey_next(k) != l->top &&
|
|
+ bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
|
|
+}
|
|
+#endif
|
|
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
|
|
new file mode 100644
|
|
index 000000000000..195799bb20bc
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/keylist.h
|
|
@@ -0,0 +1,76 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_KEYLIST_H
|
|
+#define _BCACHEFS_KEYLIST_H
|
|
+
|
|
+#include "keylist_types.h"
|
|
+
|
|
+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t);
|
|
+void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *);
|
|
+void bch2_keylist_pop_front(struct keylist *);
|
|
+
|
|
+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys)
|
|
+{
|
|
+ l->top_p = l->keys_p = inline_keys;
|
|
+}
|
|
+
|
|
+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys)
|
|
+{
|
|
+ if (l->keys_p != inline_keys)
|
|
+ kfree(l->keys_p);
|
|
+ bch2_keylist_init(l, inline_keys);
|
|
+}
|
|
+
|
|
+static inline void bch2_keylist_push(struct keylist *l)
|
|
+{
|
|
+ l->top = bkey_next(l->top);
|
|
+}
|
|
+
|
|
+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k)
|
|
+{
|
|
+ bkey_copy(l->top, k);
|
|
+ bch2_keylist_push(l);
|
|
+}
|
|
+
|
|
+static inline bool bch2_keylist_empty(struct keylist *l)
|
|
+{
|
|
+ return l->top == l->keys;
|
|
+}
|
|
+
|
|
+static inline size_t bch2_keylist_u64s(struct keylist *l)
|
|
+{
|
|
+ return l->top_p - l->keys_p;
|
|
+}
|
|
+
|
|
+static inline size_t bch2_keylist_bytes(struct keylist *l)
|
|
+{
|
|
+ return bch2_keylist_u64s(l) * sizeof(u64);
|
|
+}
|
|
+
|
|
+static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
|
|
+{
|
|
+ return l->keys;
|
|
+}
|
|
+
|
|
+#define for_each_keylist_key(_keylist, _k) \
|
|
+ for (_k = (_keylist)->keys; \
|
|
+ _k != (_keylist)->top; \
|
|
+ _k = bkey_next(_k))
|
|
+
|
|
+static inline u64 keylist_sectors(struct keylist *keys)
|
|
+{
|
|
+ struct bkey_i *k;
|
|
+ u64 ret = 0;
|
|
+
|
|
+ for_each_keylist_key(keys, k)
|
|
+ ret += k->k.size;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_DEBUG
|
|
+void bch2_verify_keylist_sorted(struct keylist *);
|
|
+#else
|
|
+static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
|
|
+#endif
|
|
+
|
|
+#endif /* _BCACHEFS_KEYLIST_H */
|
|
diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h
|
|
new file mode 100644
|
|
index 000000000000..4b3ff7d8a875
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/keylist_types.h
|
|
@@ -0,0 +1,16 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_KEYLIST_TYPES_H
|
|
+#define _BCACHEFS_KEYLIST_TYPES_H
|
|
+
|
|
+struct keylist {
|
|
+ union {
|
|
+ struct bkey_i *keys;
|
|
+ u64 *keys_p;
|
|
+ };
|
|
+ union {
|
|
+ struct bkey_i *top;
|
|
+ u64 *top_p;
|
|
+ };
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_KEYLIST_TYPES_H */
|
|
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
|
|
new file mode 100644
|
|
index 000000000000..ef69a19f494a
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/migrate.c
|
|
@@ -0,0 +1,177 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Code for moving data off a device.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "buckets.h"
|
|
+#include "extents.h"
|
|
+#include "io.h"
|
|
+#include "journal.h"
|
|
+#include "keylist.h"
|
|
+#include "migrate.h"
|
|
+#include "move.h"
|
|
+#include "replicas.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+/*
|
|
+ * Drop device @dev_idx from key @k via bch2_bkey_drop_device(), then check
|
|
+ * what bch2_bkey_durability() reports for the result.
|
|
+ *
|
|
+ * @metadata selects which replica count (metadata_replicas vs data_replicas)
|
|
+ * and which pair of BCH_FORCE_IF_* flags apply.
|
|
+ *
|
|
+ * Returns -EINVAL if the remaining durability is zero and the *_LOST flag is
|
|
+ * not set in @flags, or if it is below the configured replica count and the
|
|
+ * *_DEGRADED flag is not set; 0 otherwise. The drop is performed before the
|
|
+ * check, i.e. also on the paths that return -EINVAL.
|
|
+ */
|
|
+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
|
|
+ unsigned dev_idx, int flags, bool metadata)
|
|
+{
|
|
+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
|
|
+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
|
|
+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED;
|
|
+ unsigned nr_good;
|
|
+
|
|
+ bch2_bkey_drop_device(k, dev_idx);
|
|
+
|
|
+ nr_good = bch2_bkey_durability(c, k.s_c);
|
|
+ if ((!nr_good && !(flags & lost)) ||
|
|
+ (nr_good < replicas && !(flags & degraded)))
|
|
+ return -EINVAL;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags,
|
|
+ enum btree_id btree_id)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_buf sk;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_bkey_buf_init(&sk);
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
|
|
+ BTREE_ITER_PREFETCH);
|
|
+
|
|
+ while ((k = bch2_btree_iter_peek(iter)).k &&
|
|
+ !(ret = bkey_err(k))) {
|
|
+ if (!bch2_bkey_has_device(k, dev_idx)) {
|
|
+ bch2_btree_iter_advance(iter);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ bch2_bkey_buf_reassemble(&sk, c, k);
|
|
+
|
|
+ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
|
|
+ dev_idx, flags, false);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ /*
|
|
+ * If the new extent no longer has any pointers, bch2_extent_normalize()
|
|
+ * will do the appropriate thing with it (turning it into a
|
|
+ * KEY_TYPE_error key, or just a discard if it was a cached extent)
|
|
+ */
|
|
+ bch2_extent_normalize(c, bkey_i_to_s(sk.k));
|
|
+
|
|
+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
|
|
+
|
|
+ bch2_trans_update(&trans, iter, sk.k, 0);
|
|
+
|
|
+ ret = bch2_trans_commit(&trans, NULL, NULL,
|
|
+ BTREE_INSERT_NOFAIL);
|
|
+
|
|
+ /*
|
|
+ * don't want to leave ret == -EINTR, since if we raced and
|
|
+ * something else overwrote the key we could spuriously return
|
|
+ * -EINTR below:
|
|
+ */
|
|
+ if (ret == -EINTR)
|
|
+ ret = 0;
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+ bch2_bkey_buf_exit(&sk, c);
|
|
+
|
|
+ BUG_ON(ret == -EINTR);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
+{
|
|
+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_extents) ?:
|
|
+ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_reflink);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Walk every node of all BTREE_ID_NR btrees and, for each node whose key
|
|
+ * references device @dev_idx, update the node's key to a copy with that device
|
|
+ * dropped.
|
|
+ *
|
|
+ * Returns -EINVAL up front if BCH_FORCE_IF_METADATA_LOST is set in @flags
|
|
+ * (not handled yet). Otherwise returns 0 on success, the error from
|
|
+ * drop_dev_ptrs() or bch2_btree_node_update_key() on failure, or the error
|
|
+ * from bch2_trans_exit() if that reports one.
|
|
+ */
|
|
+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct closure cl;
|
|
+ struct btree *b;
|
|
+ struct bkey_buf k;
|
|
+ unsigned id;
|
|
+ /*
|
|
+ * Must start at 0: if no node references @dev_idx, every iteration
|
|
+ * below takes the continue and ret is read without being assigned.
|
|
+ */
|
|
+ int ret = 0;
|
|
+
|
|
+ /* don't handle this yet: */
|
|
+ if (flags & BCH_FORCE_IF_METADATA_LOST)
|
|
+ return -EINVAL;
|
|
+
|
|
+ bch2_bkey_buf_init(&k);
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+ closure_init_stack(&cl);
|
|
+
|
|
+ for (id = 0; id < BTREE_ID_NR; id++) {
|
|
+ for_each_btree_node(&trans, iter, id, POS_MIN,
|
|
+ BTREE_ITER_PREFETCH, b) {
|
|
+retry:
|
|
+ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
|
|
+ dev_idx))
|
|
+ continue;
|
|
+
|
|
+ /* Work on a copy of the node's key, not the key itself */
|
|
+ bch2_bkey_buf_copy(&k, c, &b->key);
|
|
+
|
|
+ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
|
|
+ dev_idx, flags, true);
|
|
+ if (ret) {
|
|
+ bch_err(c, "Cannot drop device without losing data");
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ ret = bch2_btree_node_update_key(c, iter, b, k.k);
|
|
+ if (ret == -EINTR) {
|
|
+ /*
|
|
+ * Re-fetch the node from the iterator and redo it.
|
|
+ * NOTE(review): b is not checked for NULL before
|
|
+ * the retry dereferences it - confirm peek_node
|
|
+ * cannot return NULL here.
|
|
+ */
|
|
+ b = bch2_btree_iter_peek_node(iter);
|
|
+ ret = 0;
|
|
+ goto retry;
|
|
+ }
|
|
+ if (ret) {
|
|
+ bch_err(c, "Error updating btree node key: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ bch2_trans_iter_free(&trans, iter);
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /* flush relevant btree updates */
|
|
+ closure_wait_event(&c->btree_interior_update_wait,
|
|
+ !bch2_btree_interior_updates_nr_pending(c));
|
|
+
|
|
+ ret = 0;
|
|
+err:
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+ bch2_bkey_buf_exit(&k, c);
|
|
+
|
|
+ BUG_ON(ret == -EINTR);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
|
|
+{
|
|
+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?:
|
|
+ bch2_dev_metadata_drop(c, dev_idx, flags);
|
|
+}
|
|
diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h
|
|
new file mode 100644
|
|
index 000000000000..027efaa0d575
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/migrate.h
|
|
@@ -0,0 +1,7 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_MIGRATE_H
|
|
+#define _BCACHEFS_MIGRATE_H
|
|
+
|
|
+int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
|
|
+
|
|
+#endif /* _BCACHEFS_MIGRATE_H */
|
|
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
|
|
new file mode 100644
|
|
index 000000000000..778ff72cf5b2
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/move.c
|
|
@@ -0,0 +1,990 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "buckets.h"
|
|
+#include "disk_groups.h"
|
|
+#include "inode.h"
|
|
+#include "io.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "move.h"
|
|
+#include "replicas.h"
|
|
+#include "super-io.h"
|
|
+#include "keylist.h"
|
|
+
|
|
+#include <linux/ioprio.h>
|
|
+#include <linux/kthread.h>
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+/* In-flight cap: bch2_move_extent() waits until the moving_context's write_sectors and read_sectors counters are each below this. */
|
|
+#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
|
|
+/* One in-flight extent move: a read (rbio) followed by a migrate write; both bios are initialised over the trailing bio_vec array. */
|
|
+struct moving_io {
|
|
+ struct list_head list; /* entry on moving_context.reads */
|
|
+ struct closure cl; /* runs move_write() via do_pending_writes() */
|
|
+ bool read_completed; /* set by move_read_endio() */
|
|
+
|
|
+ unsigned read_sectors; /* both counts are set to k.k->size in bch2_move_extent() */
|
|
+ unsigned write_sectors;
|
|
+
|
|
+ struct bch_read_bio rbio;
|
|
+
|
|
+ struct migrate_write write;
|
|
+ /* Must be last since it is variable size */
|
|
+ struct bio_vec bi_inline_vecs[];
|
|
+};
|
|
+/* Shared state for one bch2_move_data() run; each moving_io points back to it through io->write.ctxt. */
|
|
+struct moving_context {
|
|
+ /* Closure for waiting on all reads and writes to complete */
|
|
+ struct closure cl;
|
|
+
|
|
+ struct bch_move_stats *stats;
|
|
+ /* moving_ios in submission order; do_pending_writes() removes them from the head once their read has completed */
|
|
+ struct list_head reads;
|
|
+
|
|
+ /* in flight sectors: */
|
|
+ atomic_t read_sectors;
|
|
+ atomic_t write_sectors;
|
|
+ /* woken by move_read_endio() and move_free() */
|
|
+ wait_queue_head_t wait;
|
|
+};
|
|
+/* Index-update callback for migrate writes (installed as op->index_update_fn by bch2_migrate_write_init()): for each key in op->insert_keys, appends the newly written pointers to the matching existing key and commits it. Returns 0 or a negative error, never -EINTR. */
|
|
+static int bch2_migrate_index_update(struct bch_write_op *op)
|
|
+{
|
|
+ struct bch_fs *c = op->c;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct migrate_write *m =
|
|
+ container_of(op, struct migrate_write, op);
|
|
+ struct keylist *keys = &op->insert_keys;
|
|
+ struct bkey_buf _new, _insert;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_bkey_buf_init(&_new);
|
|
+ bch2_bkey_buf_init(&_insert);
|
|
+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
|
|
+
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, m->btree_id,
|
|
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
|
|
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
|
|
+
|
|
+ while (1) {
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_i *insert;
|
|
+ struct bkey_i_extent *new;
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ bool did_work = false;
|
|
+ bool extending = false, should_check_enospc;
|
|
+ s64 i_sectors_delta = 0, disk_sectors_delta = 0;
|
|
+
|
|
+ bch2_trans_reset(&trans, 0);
|
|
+
|
|
+ k = bch2_btree_iter_peek_slot(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ new = bkey_i_to_extent(bch2_keylist_front(keys));
|
|
+ /* version differs, or the key no longer matches the pointer/offset recorded by bch2_migrate_read_done(): count it as a race */
|
|
+ if (bversion_cmp(k.k->version, new->k.version) ||
|
|
+ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
|
|
+ goto nomatch;
|
|
+ /* work on private copies: insert starts as the existing key, new as the front of the keylist */
|
|
+ bkey_reassemble(_insert.k, k);
|
|
+ insert = _insert.k;
|
|
+
|
|
+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
|
|
+ new = bkey_i_to_extent(_new.k);
|
|
+ bch2_cut_front(iter->pos, &new->k_i);
|
|
+ /* presumably trims both copies to the range they share, starting at iter->pos - confirm against bch2_cut_front()/bch2_cut_back() */
|
|
+ bch2_cut_front(iter->pos, insert);
|
|
+ bch2_cut_back(new->k.p, insert);
|
|
+ bch2_cut_back(insert->k.p, &new->k_i);
|
|
+ /* DATA_REWRITE: drop the pointer on rewrite_dev; if that pointer was cached, mark every new pointer cached too */
|
|
+ if (m->data_cmd == DATA_REWRITE) {
|
|
+ struct bch_extent_ptr *new_ptr, *old_ptr = (void *)
|
|
+ bch2_bkey_has_device(bkey_i_to_s_c(insert),
|
|
+ m->data_opts.rewrite_dev);
|
|
+ if (!old_ptr)
|
|
+ goto nomatch;
|
|
+
|
|
+ if (old_ptr->cached)
|
|
+ extent_for_each_ptr(extent_i_to_s(new), new_ptr)
|
|
+ new_ptr->cached = true;
|
|
+
|
|
+ bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
|
|
+ }
|
|
+ /* append each new pointer whose device the key being built does not already reference */
|
|
+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
|
|
+ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) {
|
|
+ /*
|
|
+ * raced with another move op? extent already
|
|
+ * has a pointer to the device we just wrote
|
|
+ * data to
|
|
+ */
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ bch2_extent_ptr_decoded_append(insert, &p);
|
|
+ did_work = true;
|
|
+ }
|
|
+ /* no pointer was appended: nothing to commit, count it as a race */
|
|
+ if (!did_work)
|
|
+ goto nomatch;
|
|
+
|
|
+ bch2_bkey_narrow_crcs(insert,
|
|
+ (struct bch_extent_crc_unpacked) { 0 });
|
|
+ bch2_extent_normalize(c, bkey_i_to_s(insert));
|
|
+ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert),
|
|
+ op->opts.background_target,
|
|
+ op->opts.data_replicas);
|
|
+
|
|
+ ret = bch2_sum_sector_overwrites(&trans, iter, insert,
|
|
+ &extending,
|
|
+ &should_check_enospc,
|
|
+ &i_sectors_delta,
|
|
+ &disk_sectors_delta);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ /* grow op->res when the update needs more sectors than are reserved; a failure here leaves the loop without retrying */
|
|
+ if (disk_sectors_delta > (s64) op->res.sectors) {
|
|
+ ret = bch2_disk_reservation_add(c, &op->res,
|
|
+ disk_sectors_delta - op->res.sectors,
|
|
+ !should_check_enospc
|
|
+ ? BCH_DISK_RESERVATION_NOFAIL : 0);
|
|
+ if (ret)
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ bch2_trans_update(&trans, iter, insert, 0);
|
|
+
|
|
+ ret = bch2_trans_commit(&trans, &op->res,
|
|
+ op_journal_seq(op),
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ m->data_opts.btree_insert_flags);
|
|
+err:
|
|
+ if (!ret)
|
|
+ atomic_long_inc(&c->extent_migrate_done);
|
|
+ if (ret == -EINTR) /* cleared here; the loop then continues and peeks again */
|
|
+ ret = 0;
|
|
+ if (ret)
|
|
+ break;
|
|
+next:
|
|
+ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { /* drop keys that end at or before iter->pos */
|
|
+ bch2_keylist_pop_front(keys);
|
|
+ if (bch2_keylist_empty(keys))
|
|
+ goto out;
|
|
+ }
|
|
+ continue;
|
|
+nomatch:
|
|
+ if (m->ctxt) {
|
|
+ BUG_ON(k.k->p.offset <= iter->pos.offset);
|
|
+ atomic64_inc(&m->ctxt->stats->keys_raced);
|
|
+ atomic64_add(k.k->p.offset - iter->pos.offset,
|
|
+ &m->ctxt->stats->sectors_raced);
|
|
+ }
|
|
+ atomic_long_inc(&c->extent_migrate_raced);
|
|
+ trace_move_race(&new->k);
|
|
+ bch2_btree_iter_next_slot(iter);
|
|
+ goto next;
|
|
+ }
|
|
+out:
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ bch2_bkey_buf_exit(&_insert, c);
|
|
+ bch2_bkey_buf_exit(&_new, c);
|
|
+ BUG_ON(ret == -EINTR);
|
|
+ return ret;
|
|
+}
|
|
+/* Copies what the completed read @rbio found (picked pointer, position, version, crc, devs_have) into the write op of @m; called from move_write() before bch2_write(). */
|
|
+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
|
|
+{
|
|
+ /* write bio must own pages: */
|
|
+ BUG_ON(!m->op.wbio.bio.bi_vcnt);
|
|
+
|
|
+ m->ptr = rbio->pick.ptr;
|
|
+ m->offset = rbio->data_pos.offset - rbio->pick.crc.offset;
|
|
+ m->op.devs_have = rbio->devs_have;
|
|
+ m->op.pos = rbio->data_pos;
|
|
+ m->op.version = rbio->version;
|
|
+ m->op.crc = rbio->pick.crc;
|
|
+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; /* sectors to bytes */
|
|
+
|
|
+ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
|
|
+ m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
|
|
+ m->op.csum_type = m->op.crc.csum_type;
|
|
+ }
|
|
+ /* the copy on rewrite_dev is the one being replaced, so it is removed from devs_have */
|
|
+ if (m->data_cmd == DATA_REWRITE)
|
|
+ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
|
|
+}
|
|
+
|
|
+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
|
|
+ struct write_point_specifier wp,
|
|
+ struct bch_io_opts io_opts,
|
|
+ enum data_cmd data_cmd,
|
|
+ struct data_opts data_opts,
|
|
+ enum btree_id btree_id,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ int ret;
|
|
+
|
|
+ m->btree_id = btree_id;
|
|
+ m->data_cmd = data_cmd;
|
|
+ m->data_opts = data_opts;
|
|
+ m->nr_ptrs_reserved = 0;
|
|
+
|
|
+ bch2_write_op_init(&m->op, c, io_opts);
|
|
+
|
|
+ if (!bch2_bkey_is_incompressible(k))
|
|
+ m->op.compression_type =
|
|
+ bch2_compression_opt_to_type[io_opts.background_compression ?:
|
|
+ io_opts.compression];
|
|
+ else
|
|
+ m->op.incompressible = true;
|
|
+
|
|
+ m->op.target = data_opts.target,
|
|
+ m->op.write_point = wp;
|
|
+
|
|
+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) {
|
|
+ m->op.alloc_reserve = RESERVE_MOVINGGC;
|
|
+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
|
|
+ } else {
|
|
+ /* XXX: this should probably be passed in */
|
|
+ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS;
|
|
+ }
|
|
+
|
|
+ m->op.flags |= BCH_WRITE_PAGES_STABLE|
|
|
+ BCH_WRITE_PAGES_OWNED|
|
|
+ BCH_WRITE_DATA_ENCODED|
|
|
+ BCH_WRITE_FROM_INTERNAL;
|
|
+
|
|
+ m->op.nr_replicas = data_opts.nr_replicas;
|
|
+ m->op.nr_replicas_required = data_opts.nr_replicas;
|
|
+ m->op.index_update_fn = bch2_migrate_index_update;
|
|
+
|
|
+ switch (data_cmd) {
|
|
+ case DATA_ADD_REPLICAS: {
|
|
+ /*
|
|
+ * DATA_ADD_REPLICAS is used for moving data to a different
|
|
+ * device in the background, and due to compression the new copy
|
|
+ * might take up more space than the old copy:
|
|
+ */
|
|
+#if 0
|
|
+ int nr = (int) io_opts.data_replicas -
|
|
+ bch2_bkey_nr_ptrs_allocated(k);
|
|
+#endif
|
|
+ int nr = (int) io_opts.data_replicas;
|
|
+
|
|
+ if (nr > 0) {
|
|
+ m->op.nr_replicas = m->nr_ptrs_reserved = nr;
|
|
+
|
|
+ ret = bch2_disk_reservation_get(c, &m->op.res,
|
|
+ k.k->size, m->op.nr_replicas, 0);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case DATA_REWRITE: {
|
|
+ unsigned compressed_sectors = 0;
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ if (p.ptr.dev == data_opts.rewrite_dev &&
|
|
+ !p.ptr.cached &&
|
|
+ crc_is_compressed(p.crc))
|
|
+ compressed_sectors += p.crc.compressed_size;
|
|
+
|
|
+ if (compressed_sectors) {
|
|
+ ret = bch2_disk_reservation_add(c, &m->op.res,
|
|
+ k.k->size * m->op.nr_replicas,
|
|
+ BCH_DISK_RESERVATION_NOFAIL);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case DATA_PROMOTE:
|
|
+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
|
|
+ m->op.flags |= BCH_WRITE_CACHED;
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+/* Closure destructor for a moving_io: releases its disk reservation, frees the write bio's pages, wakes waiters on ctxt->wait and frees the io. */
|
|
+static void move_free(struct closure *cl)
|
|
+{
|
|
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
|
|
+ struct moving_context *ctxt = io->write.ctxt;
|
|
+ struct bvec_iter_all iter;
|
|
+ struct bio_vec *bv;
|
|
+
|
|
+ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
|
|
+
|
|
+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter)
|
|
+ if (bv->bv_page)
|
|
+ __free_page(bv->bv_page);
|
|
+
|
|
+ wake_up(&ctxt->wait);
|
|
+
|
|
+ kfree(io);
|
|
+}
|
|
+/* Continuation after the write (set up by move_write()): removes this io's sectors from ctxt->write_sectors and finishes the closure with move_free() as destructor. */
|
|
+static void move_write_done(struct closure *cl)
|
|
+{
|
|
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
|
|
+
|
|
+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors);
|
|
+ closure_return_with_destructor(cl, move_free);
|
|
+}
|
|
+/* Closure function started by do_pending_writes(): if the read failed or hit a hole the io is just freed; otherwise the write op is filled in from the read and bch2_write() is started. */
|
|
+static void move_write(struct closure *cl)
|
|
+{
|
|
+ struct moving_io *io = container_of(cl, struct moving_io, cl);
|
|
+
|
|
+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) {
|
|
+ closure_return_with_destructor(cl, move_free);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ bch2_migrate_read_done(&io->write, &io->rbio);
|
|
+ /* accounted here and subtracted again in move_write_done() */
|
|
+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
|
|
+ closure_call(&io->write.op.cl, bch2_write, NULL, cl);
|
|
+ continue_at(cl, move_write_done, NULL);
|
|
+}
|
|
+/* Returns the first moving_io on ctxt->reads if its read has completed, else NULL; only the head is examined, so writes are issued in list order. */
|
|
+static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
|
|
+{
|
|
+ struct moving_io *io =
|
|
+ list_first_entry_or_null(&ctxt->reads, struct moving_io, list);
|
|
+
|
|
+ return io && io->read_completed ? io : NULL;
|
|
+}
|
|
+/* bi_end_io handler for the read bio set up in bch2_move_extent(). */
|
|
+static void move_read_endio(struct bio *bio)
|
|
+{
|
|
+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
|
|
+ struct moving_context *ctxt = io->write.ctxt;
|
|
+
|
|
+ atomic_sub(io->read_sectors, &ctxt->read_sectors);
|
|
+ io->read_completed = true;
|
|
+ /* wake the mover only when the head of the reads list is now ready to be written */
|
|
+ if (next_pending_write(ctxt))
|
|
+ wake_up(&ctxt->wait);
|
|
+ /* pairs with the closure_get() in bch2_move_extent() */
|
|
+ closure_put(&ctxt->cl);
|
|
+}
|
|
+/* Starts move_write() for every moving_io at the head of ctxt->reads whose read has completed, removing each from the list first. */
|
|
+static void do_pending_writes(struct moving_context *ctxt)
|
|
+{
|
|
+ struct moving_io *io;
|
|
+
|
|
+ while ((io = next_pending_write(ctxt))) {
|
|
+ list_del(&io->list);
|
|
+ closure_call(&io->cl, move_write, NULL, &ctxt->cl);
|
|
+ }
|
|
+}
|
|
+/* Issues pending writes, then sleeps on (_ctxt)->wait until _cond holds; each wakeup caused by a newly writable io loops round to issue it. _cond is evaluated more than once. */
|
|
+#define move_ctxt_wait_event(_ctxt, _cond) \
|
|
+do { \
|
|
+ do_pending_writes(_ctxt); \
|
|
+ \
|
|
+ if (_cond) \
|
|
+ break; \
|
|
+ __wait_event((_ctxt)->wait, \
|
|
+ next_pending_write(_ctxt) || (_cond)); \
|
|
+} while (1)
|
|
+/* Waits until ctxt->write_sectors is zero or differs from the value sampled on entry. */
|
|
+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
|
|
+{
|
|
+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
|
|
+
|
|
+ move_ctxt_wait_event(ctxt,
|
|
+ !atomic_read(&ctxt->write_sectors) ||
|
|
+ atomic_read(&ctxt->write_sectors) != sectors_pending);
|
|
+}
|
|
+/* Allocates a moving_io for key @k, initialises its write op and read bio, queues it on ctxt->reads and issues the read. Returns 0 once the read is issued, -ENOMEM if an allocation fails, or the error from bch2_migrate_write_init(). */
|
|
+static int bch2_move_extent(struct btree_trans *trans,
|
|
+ struct moving_context *ctxt,
|
|
+ struct write_point_specifier wp,
|
|
+ struct bch_io_opts io_opts,
|
|
+ enum btree_id btree_id,
|
|
+ struct bkey_s_c k,
|
|
+ enum data_cmd data_cmd,
|
|
+ struct data_opts data_opts)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ struct moving_io *io;
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+ unsigned sectors = k.k->size, pages;
|
|
+ int ret = -ENOMEM;
|
|
+ /* throttle: wait until in-flight write and read sectors are each below the cap */
|
|
+ move_ctxt_wait_event(ctxt,
|
|
+ atomic_read(&ctxt->write_sectors) <
|
|
+ SECTORS_IN_FLIGHT_PER_DEVICE);
|
|
+
|
|
+ move_ctxt_wait_event(ctxt,
|
|
+ atomic_read(&ctxt->read_sectors) <
|
|
+ SECTORS_IN_FLIGHT_PER_DEVICE);
|
|
+
|
|
+ /* write path might have to decompress data: */
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size);
|
|
+
|
|
+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
|
|
+ io = kzalloc(sizeof(struct moving_io) +
|
|
+ sizeof(struct bio_vec) * pages, GFP_KERNEL);
|
|
+ if (!io)
|
|
+ goto err;
|
|
+
|
|
+ io->write.ctxt = ctxt;
|
|
+ io->read_sectors = k.k->size;
|
|
+ io->write_sectors = k.k->size;
|
|
+
|
|
+ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
|
|
+ bio_set_prio(&io->write.op.wbio.bio,
|
|
+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
|
+
|
|
+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9,
|
|
+ GFP_KERNEL))
|
|
+ goto err_free;
|
|
+ /* the read bio is initialised over the same bi_inline_vecs array as the write bio */
|
|
+ io->rbio.c = c;
|
|
+ io->rbio.opts = io_opts;
|
|
+ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
|
|
+ io->rbio.bio.bi_vcnt = pages;
|
|
+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
|
|
+ io->rbio.bio.bi_iter.bi_size = sectors << 9;
|
|
+
|
|
+ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
|
|
+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
|
|
+ io->rbio.bio.bi_end_io = move_read_endio;
|
|
+
|
|
+ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts,
|
|
+ data_cmd, data_opts, btree_id, k);
|
|
+ if (ret)
|
|
+ goto err_free_pages;
|
|
+
|
|
+ atomic64_inc(&ctxt->stats->keys_moved);
|
|
+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
|
|
+
|
|
+ trace_move_extent(k.k);
|
|
+
|
|
+ atomic_add(io->read_sectors, &ctxt->read_sectors);
|
|
+ list_add_tail(&io->list, &ctxt->reads);
|
|
+
|
|
+ /*
|
|
+ * dropped by move_read_endio() - guards against use after free of
|
|
+ * ctxt when doing wakeup
|
|
+ */
|
|
+ closure_get(&ctxt->cl);
|
|
+ bch2_read_extent(trans, &io->rbio,
|
|
+ bkey_start_pos(k.k),
|
|
+ btree_id, k, 0,
|
|
+ BCH_READ_NODECODE|
|
|
+ BCH_READ_LAST_FRAGMENT);
|
|
+ return 0;
|
|
+err_free_pages:
|
|
+ bio_free_pages(&io->write.op.wbio.bio);
|
|
+err_free:
|
|
+ kfree(io);
|
|
+err:
|
|
+ trace_move_alloc_fail(k.k);
|
|
+ return ret;
|
|
+}
|
|
+/* Peeks the inodes btree at @pos and unpacks the key into @inode. Returns 0, the iterator or unpack error, or -EIO if the key found is not KEY_TYPE_inode. */
|
|
+static int lookup_inode(struct btree_trans *trans, struct bpos pos,
|
|
+ struct bch_inode_unpacked *inode)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret;
|
|
+
|
|
+ iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, pos,
|
|
+ BTREE_ITER_ALL_SNAPSHOTS);
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+err:
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+/* Walks @btree_id from @start until a key starts at or after @end, asks @pred what to do with each direct-data extent and starts a move for those it selects. Returns nonzero if a kthread was told to stop, a btree iteration error, or the result of bch2_trans_exit(). */
|
|
+static int __bch2_move_data(struct bch_fs *c,
|
|
+ struct moving_context *ctxt,
|
|
+ struct bch_ratelimit *rate,
|
|
+ struct write_point_specifier wp,
|
|
+ struct bpos start,
|
|
+ struct bpos end,
|
|
+ move_pred_fn pred, void *arg,
|
|
+ struct bch_move_stats *stats,
|
|
+ enum btree_id btree_id)
|
|
+{
|
|
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
|
|
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
|
|
+ struct bkey_buf sk;
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct data_opts data_opts;
|
|
+ enum data_cmd data_cmd;
|
|
+ u64 delay, cur_inum = U64_MAX;
|
|
+ int ret = 0, ret2;
|
|
+
|
|
+ bch2_bkey_buf_init(&sk);
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ stats->data_type = BCH_DATA_user;
|
|
+ stats->btree_id = btree_id;
|
|
+ stats->pos = start;
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, btree_id, start,
|
|
+ BTREE_ITER_PREFETCH);
|
|
+
|
|
+ if (rate)
|
|
+ bch2_ratelimit_reset(rate);
|
|
+
|
|
+ while (1) {
|
|
+ do { /* rate limiting: repeat while the ratelimit asks for a delay, honouring kthread stop and freezer requests */
|
|
+ delay = rate ? bch2_ratelimit_delay(rate) : 0;
|
|
+
|
|
+ if (delay) {
|
|
+ bch2_trans_unlock(&trans);
|
|
+ set_current_state(TASK_INTERRUPTIBLE);
|
|
+ }
|
|
+
|
|
+ if (kthread && (ret = kthread_should_stop())) {
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ if (delay)
|
|
+ schedule_timeout(delay);
|
|
+
|
|
+ if (unlikely(freezing(current))) {
|
|
+ bch2_trans_unlock(&trans);
|
|
+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
|
|
+ try_to_freeze();
|
|
+ }
|
|
+ } while (delay);
|
|
+
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+
|
|
+ stats->pos = iter->pos;
|
|
+
|
|
+ if (!k.k)
|
|
+ break;
|
|
+ ret = bkey_err(k);
|
|
+ if (ret)
|
|
+ break;
|
|
+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
|
|
+ break;
|
|
+
|
|
+ if (!bkey_extent_is_direct_data(k.k))
|
|
+ goto next_nondata;
|
|
+ /* extents btree only: when the inode number changes, rebuild io_opts from the filesystem options plus that inode's options */
|
|
+ if (btree_id == BTREE_ID_extents &&
|
|
+ cur_inum != k.k->p.inode) {
|
|
+ struct bch_inode_unpacked inode;
|
|
+
|
|
+ io_opts = bch2_opts_to_inode_opts(c->opts);
|
|
+
|
|
+ ret = lookup_inode(&trans,
|
|
+ SPOS(0, k.k->p.inode, k.k->p.snapshot),
|
|
+ &inode);
|
|
+ if (ret == -EINTR)
|
|
+ continue;
|
|
+ /* NOTE(review): any other lookup error stays in ret; the next bkey_err() overwrites it unless the loop ends first - confirm this is intended */
|
|
+ if (!ret)
|
|
+ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
|
|
+
|
|
+ cur_inum = k.k->p.inode;
|
|
+ }
|
|
+
|
|
+ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) {
|
|
+ case DATA_SKIP:
|
|
+ goto next;
|
|
+ case DATA_SCRUB:
|
|
+ BUG();
|
|
+ case DATA_ADD_REPLICAS:
|
|
+ case DATA_REWRITE:
|
|
+ case DATA_PROMOTE:
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ /* unlock before doing IO: */
|
|
+ bch2_bkey_buf_reassemble(&sk, c, k);
|
|
+ k = bkey_i_to_s_c(sk.k);
|
|
+ bch2_trans_unlock(&trans);
|
|
+
|
|
+ ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
|
|
+ data_cmd, data_opts);
|
|
+ if (ret2) {
|
|
+ if (ret2 == -EINTR) {
|
|
+ bch2_trans_reset(&trans, 0);
|
|
+ bch2_trans_cond_resched(&trans);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (ret2 == -ENOMEM) {
|
|
+ /* memory allocation failure, wait for some IO to finish */
|
|
+ bch2_move_ctxt_wait_for_io(ctxt);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /* XXX signal failure */
|
|
+ goto next;
|
|
+ }
|
|
+
|
|
+ if (rate)
|
|
+ bch2_ratelimit_increment(rate, k.k->size);
|
|
+next:
|
|
+ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
|
|
+ &stats->sectors_seen);
|
|
+next_nondata:
|
|
+ bch2_btree_iter_advance(iter);
|
|
+ bch2_trans_cond_resched(&trans);
|
|
+ }
|
|
+out:
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+ bch2_bkey_buf_exit(&sk, c);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+/* Runs __bch2_move_data() over the extents and reflink btrees that fall in [start_btree_id, end_btree_id], then waits for all queued reads and writes to finish. Returns 0, or the first nonzero result from __bch2_move_data(). */
|
|
+int bch2_move_data(struct bch_fs *c,
|
|
+ enum btree_id start_btree_id, struct bpos start_pos,
|
|
+ enum btree_id end_btree_id, struct bpos end_pos,
|
|
+ struct bch_ratelimit *rate,
|
|
+ struct write_point_specifier wp,
|
|
+ move_pred_fn pred, void *arg,
|
|
+ struct bch_move_stats *stats)
|
|
+{
|
|
+ struct moving_context ctxt = { .stats = stats };
|
|
+ enum btree_id id;
|
|
+ int ret = 0; /* the loop may never call __bch2_move_data(): no extents/reflink id in range */
|
|
+
|
|
+ closure_init_stack(&ctxt.cl);
|
|
+ INIT_LIST_HEAD(&ctxt.reads);
|
|
+ init_waitqueue_head(&ctxt.wait);
|
|
+
|
|
+ stats->data_type = BCH_DATA_user;
|
|
+
|
|
+ for (id = start_btree_id;
|
|
+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
|
|
+ id++) {
|
|
+ stats->btree_id = id;
|
|
+ /* only these two btrees are walked for user data */
|
|
+ if (id != BTREE_ID_extents &&
|
|
+ id != BTREE_ID_reflink)
|
|
+ continue;
|
|
+
|
|
+ ret = __bch2_move_data(c, &ctxt, rate, wp,
|
|
+ id == start_btree_id ? start_pos : POS_MIN,
|
|
+ id == end_btree_id ? end_pos : POS_MAX,
|
|
+ pred, arg, stats, id);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ /* drain: issue the remaining writes, then wait for every closure reference on ctxt.cl to be dropped */
|
|
+ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads));
|
|
+ closure_sync(&ctxt.cl);
|
|
+
|
|
+ EBUG_ON(atomic_read(&ctxt.write_sectors));
|
|
+
|
|
+ trace_move_data(c,
|
|
+ atomic64_read(&stats->sectors_moved),
|
|
+ atomic64_read(&stats->keys_moved));
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+/* Per-btree-node predicate used by bch2_move_btree(): returns the data_cmd for node b and may fill in *data_opts. */
|
|
+typedef enum data_cmd (*move_btree_pred)(struct bch_fs *, void *,
|
|
+ struct btree *, struct bch_io_opts *,
|
|
+ struct data_opts *);
|
|
+/* Walks the btree nodes of every btree in [start_btree_id, end_btree_id] and rewrites each node for which @pred returns DATA_ADD_REPLICAS or DATA_REWRITE. Returns 0 or the most recent nonzero error; errors do not stop the walk. */
|
|
+static int bch2_move_btree(struct bch_fs *c,
|
|
+ enum btree_id start_btree_id, struct bpos start_pos,
|
|
+ enum btree_id end_btree_id, struct bpos end_pos,
|
|
+ move_btree_pred pred, void *arg,
|
|
+ struct bch_move_stats *stats)
|
|
+{
|
|
+ bool kthread = (current->flags & PF_KTHREAD) != 0;
|
|
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct btree *b;
|
|
+ enum btree_id id;
|
|
+ struct data_opts data_opts;
|
|
+ enum data_cmd cmd;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ stats->data_type = BCH_DATA_btree;
|
|
+
|
|
+ for (id = start_btree_id;
|
|
+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1);
|
|
+ id++) {
|
|
+ stats->btree_id = id;
|
|
+
|
|
+ for_each_btree_node(&trans, iter, id,
|
|
+ id == start_btree_id ? start_pos : POS_MIN,
|
|
+ BTREE_ITER_PREFETCH, b) {
|
|
+ if (kthread && kthread_should_stop())
|
|
+ break;
|
|
+ /* stop once past (end_btree_id, end_pos); the btree id is compared first, then the node's key position */
|
|
+ if ((cmp_int(id, end_btree_id) ?:
|
|
+ bkey_cmp(b->key.k.p, end_pos)) > 0)
|
|
+ break;
|
|
+
|
|
+ stats->pos = iter->pos;
|
|
+
|
|
+ switch ((cmd = pred(c, arg, b, &io_opts, &data_opts))) {
|
|
+ case DATA_SKIP:
|
|
+ goto next;
|
|
+ case DATA_SCRUB:
|
|
+ BUG();
|
|
+ case DATA_ADD_REPLICAS:
|
|
+ case DATA_REWRITE:
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ ret = bch2_btree_node_rewrite(c, iter,
|
|
+ b->data->keys.seq, 0) ?: ret;
|
|
+next:
|
|
+ bch2_trans_cond_resched(&trans);
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_iter_free(&trans, iter) ?: ret;
|
|
+ if (kthread && kthread_should_stop())
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ if (ret)
|
|
+ bch_err(c, "error %i in bch2_move_btree", ret);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+/* Compiled out: a predicate that selects DATA_SCRUB for every key. Both walkers in this file BUG() on DATA_SCRUB. */
|
|
+#if 0
|
|
+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
|
|
+ struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{
|
|
+ return DATA_SCRUB;
|
|
+}
|
|
+#endif
|
|
+/* Predicate for BCH_DATA_OP_REREPLICATE: selects DATA_ADD_REPLICAS (one extra replica, no target) for keys whose durability is nonzero but below the wanted replica count. */
|
|
+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
|
|
+ struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{
|
|
+ unsigned nr_good = bch2_bkey_durability(c, k);
|
|
+ unsigned replicas = 0;
|
|
+ /* key types not listed here keep replicas == 0 and are therefore always skipped below */
|
|
+ switch (k.k->type) {
|
|
+ case KEY_TYPE_btree_ptr:
|
|
+ replicas = c->opts.metadata_replicas;
|
|
+ break;
|
|
+ case KEY_TYPE_extent:
|
|
+ replicas = io_opts->data_replicas;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (!nr_good || nr_good >= replicas)
|
|
+ return DATA_SKIP;
|
|
+
|
|
+ data_opts->target = 0;
|
|
+ data_opts->nr_replicas = 1;
|
|
+ data_opts->btree_insert_flags = 0;
|
|
+ return DATA_ADD_REPLICAS;
|
|
+}
|
|
+/* Predicate for BCH_DATA_OP_MIGRATE (@arg is the struct bch_ioctl_data): selects DATA_REWRITE for keys with a pointer on op->migrate.dev and skips all others. */
|
|
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
|
|
+ struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{
|
|
+ struct bch_ioctl_data *op = arg;
|
|
+
|
|
+ if (!bch2_bkey_has_device(k, op->migrate.dev))
|
|
+ return DATA_SKIP;
|
|
+
|
|
+ data_opts->target = 0;
|
|
+ data_opts->nr_replicas = 1;
|
|
+ data_opts->btree_insert_flags = 0;
|
|
+ data_opts->rewrite_dev = op->migrate.dev;
|
|
+ return DATA_REWRITE;
|
|
+}
|
|
+/* Btree-node adapter: applies rereplicate_pred() to the node's own key (b->key). */
|
|
+static enum data_cmd rereplicate_btree_pred(struct bch_fs *c, void *arg,
|
|
+ struct btree *b,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{
|
|
+ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
|
|
+}
|
|
+/* Btree-node adapter: applies migrate_pred() to the node's own key (b->key). */
|
|
+static enum data_cmd migrate_btree_pred(struct bch_fs *c, void *arg,
|
|
+ struct btree *b,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{
|
|
+ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
|
|
+}
|
|
+/* Returns true if any field of packed format @f is wider than the current unpacked format's field, is full width with a nonzero offset, or has offset + maximum field value wrapping within the unpacked width. */
|
|
+static bool bformat_needs_redo(struct bkey_format *f)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < f->nr_fields; i++) {
|
|
+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
|
|
+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
|
|
+ u64 field_offset = le64_to_cpu(f->field_offset[i]);
|
|
+
|
|
+ if (f->bits_per_field[i] > unpacked_bits)
|
|
+ return true;
|
|
+
|
|
+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset)
|
|
+ return true;
|
|
+ /* maximum field value built with two shifts, each below 64, so a 64-bit field is not an undefined 1ULL << 64; zero bits gives 0 */
|
|
+ if (((field_offset + (f->bits_per_field[i]
|
|
+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) : 0)) &
|
|
+ unpacked_mask) < field_offset)
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ return false;
|
|
+}
|
|
+/* Btree-node predicate: selects DATA_REWRITE when the node's on-disk version differs from c->sb.version, the node is flagged need_rewrite, or its key format fails bformat_needs_redo(). */
|
|
+static enum data_cmd rewrite_old_nodes_pred(struct bch_fs *c, void *arg,
|
|
+ struct btree *b,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{
|
|
+ if (b->version_ondisk != c->sb.version ||
|
|
+ btree_node_need_rewrite(b) ||
|
|
+ bformat_needs_redo(&b->format)) {
|
|
+ data_opts->target = 0;
|
|
+ data_opts->nr_replicas = 1;
|
|
+ data_opts->btree_insert_flags = 0;
|
|
+ return DATA_REWRITE;
|
|
+ }
|
|
+
|
|
+ return DATA_SKIP;
|
|
+}
|
|
+/* Rewrites every btree node selected by rewrite_old_nodes_pred(); on success sets two compat bits and version_min in the superblock and writes it out. Returns the bch2_move_btree() result. */
|
|
+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_move_btree(c,
|
|
+ 0, POS_MIN,
|
|
+ BTREE_ID_NR, POS_MAX,
|
|
+ rewrite_old_nodes_pred, c, stats);
|
|
+ if (!ret) {
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
|
|
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
|
|
+ c->disk_sb.sb->version_min = c->disk_sb.sb->version;
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+/* Runs the data operation in @op, updating @stats as it goes. The rereplicate and migrate cases run every step even after a failure, with each later nonzero result replacing ret. Returns 0 or an error; -EINVAL for an unknown op or an out-of-range migrate device. */
|
|
+int bch2_data_job(struct bch_fs *c,
|
|
+ struct bch_move_stats *stats,
|
|
+ struct bch_ioctl_data op)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ switch (op.op) {
|
|
+ case BCH_DATA_OP_REREPLICATE:
|
|
+ stats->data_type = BCH_DATA_journal;
|
|
+ ret = bch2_journal_flush_device_pins(&c->journal, -1);
|
|
+ /* btree nodes first, then user data */
|
|
+ ret = bch2_move_btree(c,
|
|
+ op.start_btree, op.start_pos,
|
|
+ op.end_btree, op.end_pos,
|
|
+ rereplicate_btree_pred, c, stats) ?: ret;
|
|
+
|
|
+ closure_wait_event(&c->btree_interior_update_wait,
|
|
+ !bch2_btree_interior_updates_nr_pending(c));
|
|
+
|
|
+ ret = bch2_replicas_gc2(c) ?: ret;
|
|
+
|
|
+ ret = bch2_move_data(c,
|
|
+ op.start_btree, op.start_pos,
|
|
+ op.end_btree, op.end_pos,
|
|
+ NULL, writepoint_hashed((unsigned long) current),
|
|
+ rereplicate_pred, c, stats) ?: ret;
|
|
+ ret = bch2_replicas_gc2(c) ?: ret;
|
|
+ break;
|
|
+ case BCH_DATA_OP_MIGRATE:
|
|
+ if (op.migrate.dev >= c->sb.nr_devices)
|
|
+ return -EINVAL;
|
|
+
|
|
+ stats->data_type = BCH_DATA_journal;
|
|
+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
|
|
+ /* same order as above: btree nodes, then user data */
|
|
+ ret = bch2_move_btree(c,
|
|
+ op.start_btree, op.start_pos,
|
|
+ op.end_btree, op.end_pos,
|
|
+ migrate_btree_pred, &op, stats) ?: ret;
|
|
+ ret = bch2_replicas_gc2(c) ?: ret;
|
|
+
|
|
+ ret = bch2_move_data(c,
|
|
+ op.start_btree, op.start_pos,
|
|
+ op.end_btree, op.end_pos,
|
|
+ NULL, writepoint_hashed((unsigned long) current),
|
|
+ migrate_pred, &op, stats) ?: ret;
|
|
+ ret = bch2_replicas_gc2(c) ?: ret;
|
|
+ break;
|
|
+ case BCH_DATA_OP_REWRITE_OLD_NODES:
|
|
+ ret = bch2_scan_old_btree_nodes(c, stats);
|
|
+ break;
|
|
+ default:
|
|
+ ret = -EINVAL;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
|
|
new file mode 100644
|
|
index 000000000000..5076153689d1
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/move.h
|
|
@@ -0,0 +1,69 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_MOVE_H
|
|
+#define _BCACHEFS_MOVE_H
|
|
+
|
|
+#include "btree_iter.h"
|
|
+#include "buckets.h"
|
|
+#include "io_types.h"
|
|
+#include "move_types.h"
|
|
+
|
|
+struct bch_read_bio;
|
|
+struct moving_context;
|
|
+/* Verdict returned by a move predicate for one key or btree node. */
|
|
+enum data_cmd {
|
|
+ DATA_SKIP, /* leave the key or node alone */
|
|
+ DATA_SCRUB, /* not handled: the walkers in move.c BUG() on it */
|
|
+ DATA_ADD_REPLICAS, /* write additional copies */
|
|
+ DATA_REWRITE, /* write a new copy and drop the pointer on data_opts.rewrite_dev */
|
|
+ DATA_PROMOTE, /* write a cached copy (BCH_WRITE_CACHED) */
|
|
+};
|
|
+/* Parameters a move predicate fills in alongside its data_cmd. */
|
|
+struct data_opts {
|
|
+ u16 target; /* copied to the write op's target */
|
|
+ u8 rewrite_dev; /* device whose pointer DATA_REWRITE replaces */
|
|
+ u8 nr_replicas; /* copied to the write op's nr_replicas and nr_replicas_required */
|
|
+ int btree_insert_flags; /* OR-ed into the bch2_trans_commit() flags of the index update */
|
|
+};
|
|
+/* State for one migrate write; bch2_migrate_index_update() recovers it from the embedded op with container_of(). */
|
|
+struct migrate_write {
|
|
+ enum btree_id btree_id;
|
|
+ enum data_cmd data_cmd;
|
|
+ struct data_opts data_opts;
|
|
+
|
|
+ unsigned nr_ptrs_reserved;
|
|
+ /* may be NULL: bch2_migrate_index_update() only updates race stats when it is set */
|
|
+ struct moving_context *ctxt;
|
|
+
|
|
+ /* what we read: */
|
|
+ struct bch_extent_ptr ptr;
|
|
+ u64 offset;
|
|
+
|
|
+ struct bch_write_op op;
|
|
+};
|
|
+
|
|
+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
|
|
+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
|
|
+ struct write_point_specifier,
|
|
+ struct bch_io_opts,
|
|
+ enum data_cmd, struct data_opts,
|
|
+ enum btree_id, struct bkey_s_c);
|
|
+/* Per-key predicate passed to bch2_move_data(): returns the data_cmd for the key and may fill in the data_opts. */
|
|
+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
|
|
+ struct bkey_s_c,
|
|
+ struct bch_io_opts *, struct data_opts *);
|
|
+
|
|
+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
|
|
+
|
|
+int bch2_move_data(struct bch_fs *,
|
|
+ enum btree_id, struct bpos,
|
|
+ enum btree_id, struct bpos,
|
|
+ struct bch_ratelimit *,
|
|
+ struct write_point_specifier,
|
|
+ move_pred_fn, void *,
|
|
+ struct bch_move_stats *);
|
|
+
|
|
+int bch2_data_job(struct bch_fs *,
|
|
+ struct bch_move_stats *,
|
|
+ struct bch_ioctl_data);
|
|
+
|
|
+#endif /* _BCACHEFS_MOVE_H */
|
|
diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
|
|
new file mode 100644
|
|
index 000000000000..fc0de165af9f
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/move_types.h
|
|
@@ -0,0 +1,17 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_MOVE_TYPES_H
|
|
+#define _BCACHEFS_MOVE_TYPES_H
|
|
+/* Progress and counters for a move or data job; updated by the walkers in move.c. */
|
|
+struct bch_move_stats {
|
|
+ enum bch_data_type data_type;
|
|
+ enum btree_id btree_id;
|
|
+ struct bpos pos;
|
|
+
|
|
+ atomic64_t keys_moved; /* moves started by bch2_move_extent() */
|
|
+ atomic64_t keys_raced; /* keys that no longer matched at index-update time */
|
|
+ atomic64_t sectors_moved;
|
|
+ atomic64_t sectors_seen; /* size * allocated pointers, summed over the data keys walked */
|
|
+ atomic64_t sectors_raced;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_MOVE_TYPES_H */
|
|
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
|
|
new file mode 100644
|
|
index 000000000000..61c5901f0980
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/movinggc.c
|
|
@@ -0,0 +1,380 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Moving/copying garbage collector
|
|
+ *
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_update.h"
|
|
+#include "buckets.h"
|
|
+#include "clock.h"
|
|
+#include "disk_groups.h"
|
|
+#include "error.h"
|
|
+#include "extents.h"
|
|
+#include "eytzinger.h"
|
|
+#include "io.h"
|
|
+#include "keylist.h"
|
|
+#include "move.h"
|
|
+#include "movinggc.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+#include <linux/freezer.h>
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/math64.h>
|
|
+#include <linux/sched/task.h>
|
|
+#include <linux/sort.h>
|
|
+#include <linux/wait.h>
|
|
+
|
|
+/*
|
|
+ * We can't use the entire copygc reserve in one iteration of copygc: we may
|
|
+ * need the buckets we're freeing up to go back into the copygc reserve to make
|
|
+ * forward progress, but if the copygc reserve is full they'll be available for
|
|
+ * any allocation - and it's possible that in a given iteration, we free up most
|
|
+ * of the buckets we're going to free before we allocate most of the buckets
|
|
+ * we're going to allocate.
|
|
+ *
|
|
+ * If we only use half of the reserve per iteration, then in steady state we'll
|
|
+ * always have room in the reserve for the buckets we're going to need in the
|
|
+ * next iteration:
|
|
+ */
|
|
+#define COPYGC_BUCKETS_PER_ITER(ca) \
|
|
+ ((ca)->free[RESERVE_MOVINGGC].size / 2)
|
|
+
|
|
+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) /* orders copygc_heap_entry by (dev, offset); size is not used */
|
|
+{
|
|
+ const struct copygc_heap_entry *l = _l;
|
|
+ const struct copygc_heap_entry *r = _r;
|
|
+
|
|
+ return cmp_int(l->dev, r->dev) ?: /* device index decides first */
|
|
+ cmp_int(l->offset, r->offset); /* offset only breaks ties: GNU ?: yields the first nonzero result */
|
|
+}
|
|
+
|
|
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
|
|
+ struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{ /* move predicate for copygc: DATA_REWRITE (filling *data_opts) if a pointer of k lies in a bucket on c->copygc_heap, else DATA_SKIP; arg is unused */
|
|
+ copygc_heap *h = &c->copygc_heap;
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p = { 0 };
|
|
+
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
|
|
+ struct copygc_heap_entry search = {
|
|
+ .dev = p.ptr.dev,
|
|
+ .offset = p.ptr.offset,
|
|
+ };
|
|
+
|
|
+ ssize_t i = eytzinger0_find_le(h->data, h->used, /* heap was sorted with bucket_offset_cmp by bch2_copygc(); look up the last entry <= (dev, offset) */
|
|
+ sizeof(h->data[0]),
|
|
+ bucket_offset_cmp, &search);
|
|
+#if 0
|
|
+ /* eytzinger search verify code: */
|
|
+ ssize_t j = -1, k;
|
|
+
|
|
+ for (k = 0; k < h->used; k++)
|
|
+ if (h->data[k].offset <= ptr->offset &&
|
|
+ (j < 0 || h->data[k].offset > h->data[j].offset))
|
|
+ j = k;
|
|
+
|
|
+ BUG_ON(i != j);
|
|
+#endif
|
|
+ if (i >= 0 && p.ptr.dev == h->data[i].dev && /* the <= lookup can land on a lower-numbered device's bucket: require the same device */
|
|
+ p.ptr.offset < h->data[i].offset + ca->mi.bucket_size && /* pointer falls inside that bucket */
|
|
+ p.ptr.gen == h->data[i].gen) { /* and the bucket generation still matches */
|
|
+ /*
|
|
+ * We need to use the journal reserve here, because
|
|
+ * - journal reclaim depends on btree key cache
|
|
+ * flushing to make forward progress,
|
|
+ * - which has to make forward progress when the
|
|
+ * journal is pre-reservation full,
|
|
+ * - and depends on allocation - meaning allocator and
|
|
+ * copygc
|
|
+ */
|
|
+
|
|
+ data_opts->target = io_opts->background_target;
|
|
+ data_opts->nr_replicas = 1;
|
|
+ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE|
|
|
+ BTREE_INSERT_JOURNAL_RESERVED;
|
|
+ data_opts->rewrite_dev = p.ptr.dev;
|
|
+
|
|
+ if (p.has_ec)
|
|
+ data_opts->nr_replicas += p.ec.redundancy; /* erasure-coded pointer: add its redundancy to the replica count */
|
|
+
|
|
+ return DATA_REWRITE;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return DATA_SKIP;
|
|
+}
|
|
+
|
|
+static bool have_copygc_reserve(struct bch_dev *ca) /* wait condition used by bch2_copygc() before it reads the reserve size */
|
|
+{
|
|
+ bool ret;
|
|
+
|
|
+ spin_lock(&ca->fs->freelist_lock);
|
|
+ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || /* true once the movinggc reserve fifo is full... */
|
|
+ ca->allocator_state != ALLOCATOR_running; /* ...or whenever the allocator is not in the running state */
|
|
+ spin_unlock(&ca->fs->freelist_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline int fragmentation_cmp(copygc_heap *heap, /* heap is not used by the comparison */
|
|
+ struct copygc_heap_entry l,
|
|
+ struct copygc_heap_entry r)
|
|
+{
|
|
+ return cmp_int(l.fragmentation, r.fragmentation); /* compares the fragmentation field only; bch2_copygc() uses it negated */
|
|
+}
|
|
+
|
|
+static int bch2_copygc(struct bch_fs *c)
|
|
+{ /* one copygc pass: collect partly-used user buckets, trim to what the reserve can hold, rewrite them, report leftovers; returns 0, or -1 when no reserve is available */
|
|
+ copygc_heap *h = &c->copygc_heap;
|
|
+ struct copygc_heap_entry e, *i;
|
|
+ struct bucket_array *buckets;
|
|
+ struct bch_move_stats move_stats;
|
|
+ u64 sectors_to_move = 0, sectors_not_moved = 0;
|
|
+ u64 sectors_reserved = 0;
|
|
+ u64 buckets_to_move, buckets_not_moved = 0;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned dev_idx;
|
|
+ size_t b, heap_size = 0;
|
|
+ int ret;
|
|
+
|
|
+ memset(&move_stats, 0, sizeof(move_stats));
|
|
+ /*
|
|
+ * Find buckets with lowest sector counts, skipping completely
|
|
+ * empty buckets, by building a maxheap sorted by sector count,
|
|
+ * and repeatedly replacing the maximum element until all
|
|
+ * buckets have been visited.
|
|
+ */
|
|
+ h->used = 0;
|
|
+
|
|
+ for_each_rw_member(ca, c, dev_idx)
|
|
+ heap_size += ca->mi.nbuckets >> 7; /* heap capacity: 1/128th of each RW member's buckets */
|
|
+
|
|
+ if (h->size < heap_size) {
|
|
+ free_heap(&c->copygc_heap);
|
|
+ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) {
|
|
+ bch_err(c, "error allocating copygc heap");
|
|
+ return 0; /* reported as success, so the copygc thread keeps running and retries */
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for_each_rw_member(ca, c, dev_idx) {
|
|
+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
|
|
+
|
|
+ spin_lock(&ca->fs->freelist_lock);
|
|
+ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
|
|
+ spin_unlock(&ca->fs->freelist_lock);
|
|
+
|
|
+ down_read(&ca->bucket_lock);
|
|
+ buckets = bucket_array(ca);
|
|
+
|
|
+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
|
|
+ struct bucket *g = buckets->b + b;
|
|
+ struct bucket_mark m = READ_ONCE(g->mark);
|
|
+ struct copygc_heap_entry e;
|
|
+
|
|
+ if (m.owned_by_allocator ||
|
|
+ m.data_type != BCH_DATA_user ||
|
|
+ !bucket_sectors_used(m) || /* empty bucket: nothing to move */
|
|
+ bucket_sectors_used(m) >= ca->mi.bucket_size) /* full bucket: nothing to reclaim */
|
|
+ continue;
|
|
+
|
|
+ WARN_ON(m.stripe && !g->stripe_redundancy);
|
|
+
|
|
+ e = (struct copygc_heap_entry) {
|
|
+ .dev = dev_idx,
|
|
+ .gen = m.gen,
|
|
+ .replicas = 1 + g->stripe_redundancy,
|
|
+ .fragmentation = bucket_sectors_used(m) * (1U << 15) /* used fraction of the bucket, scaled to 0..2^15 */
|
|
+ / ca->mi.bucket_size,
|
|
+ .sectors = bucket_sectors_used(m),
|
|
+ .offset = bucket_to_sector(ca, b),
|
|
+ };
|
|
+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
|
|
+ }
|
|
+ up_read(&ca->bucket_lock);
|
|
+ }
|
|
+
|
|
+ if (!sectors_reserved) {
|
|
+ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!");
|
|
+ return -1;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Our btree node allocations also come out of RESERVE_MOVINGGC:
|
|
+ */
|
|
+ sectors_reserved = (sectors_reserved * 3) / 4; /* budget only 3/4 of the reserve for data; sectors_to_move is still 0 at this point */
|
|
+
|
|
+ for (i = h->data; i < h->data + h->used; i++)
|
|
+ sectors_to_move += i->sectors * i->replicas;
|
|
+
|
|
+ while (sectors_to_move > sectors_reserved) { /* drop heap entries until the remainder fits the budget */
|
|
+ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL));
|
|
+ sectors_to_move -= e.sectors * e.replicas;
|
|
+ }
|
|
+
|
|
+ buckets_to_move = h->used;
|
|
+
|
|
+ if (!buckets_to_move)
|
|
+ return 0;
|
|
+
|
|
+ eytzinger0_sort(h->data, h->used, /* re-sort by (dev, offset) so copygc_pred() can search the array */
|
|
+ sizeof(h->data[0]),
|
|
+ bucket_offset_cmp, NULL);
|
|
+
|
|
+ ret = bch2_move_data(c,
|
|
+ 0, POS_MIN,
|
|
+ BTREE_ID_NR, POS_MAX,
|
|
+ NULL,
|
|
+ writepoint_ptr(&c->copygc_write_point),
|
|
+ copygc_pred, NULL,
|
|
+ &move_stats);
|
|
+
|
|
+ for_each_rw_member(ca, c, dev_idx) { /* re-read the chosen buckets to count what is still in them */
|
|
+ down_read(&ca->bucket_lock);
|
|
+ buckets = bucket_array(ca);
|
|
+ for (i = h->data; i < h->data + h->used; i++) {
|
|
+ struct bucket_mark m;
|
|
+ size_t b;
|
|
+
|
|
+ if (i->dev != dev_idx)
|
|
+ continue;
|
|
+
|
|
+ b = sector_to_bucket(ca, i->offset);
|
|
+ m = READ_ONCE(buckets->b[b].mark);
|
|
+
|
|
+ if (i->gen == m.gen && /* same generation and still holding sectors: not fully moved */
|
|
+ bucket_sectors_used(m)) {
|
|
+ sectors_not_moved += bucket_sectors_used(m);
|
|
+ buckets_not_moved++;
|
|
+ }
|
|
+ }
|
|
+ up_read(&ca->bucket_lock);
|
|
+ }
|
|
+
|
|
+ if (sectors_not_moved && !ret)
|
|
+ bch_warn_ratelimited(c,
|
|
+ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
|
|
+ sectors_not_moved, sectors_to_move,
|
|
+ buckets_not_moved, buckets_to_move,
|
|
+ atomic64_read(&move_stats.sectors_moved),
|
|
+ atomic64_read(&move_stats.keys_raced),
|
|
+ atomic64_read(&move_stats.sectors_raced));
|
|
+
|
|
+ trace_copygc(c,
|
|
+ atomic64_read(&move_stats.sectors_moved), sectors_not_moved,
|
|
+ buckets_to_move, buckets_not_moved);
|
|
+ return 0; /* ret from bch2_move_data() only gates the warning above; it is not propagated */
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Copygc runs when the amount of fragmented data is above some arbitrary
|
|
+ * threshold:
|
|
+ *
|
|
+ * The threshold at the limit - when the device is full - is the amount of space
|
|
+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of
|
|
+ * disk space stranded due to fragmentation and store everything we have
|
|
+ * promised to store.
|
|
+ *
|
|
+ * But we don't want to be running copygc unnecessarily when the device still
|
|
+ * has plenty of free space - rather, we want copygc to smoothly run every so
|
|
+ * often and continually reduce the amount of fragmented space as the device
|
|
+ * fills up. So, we increase the threshold by half the current free space.
|
|
+ */
|
|
+unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
|
|
+{ /* returns the minimum over RW members of max(0, fragmented_allowed - fragmented); stays S64_MAX (converted to unsigned long) when there are none */
|
|
+ struct bch_dev *ca;
|
|
+ unsigned dev_idx;
|
|
+ s64 wait = S64_MAX, fragmented_allowed, fragmented;
|
|
+
|
|
+ for_each_rw_member(ca, c, dev_idx) {
|
|
+ struct bch_dev_usage usage = bch2_dev_usage_read(ca);
|
|
+
|
|
+ fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) * /* half of the reclaimable buckets, in sectors */
|
|
+ ca->mi.bucket_size) >> 1);
|
|
+ fragmented = usage.d[BCH_DATA_user].fragmented; /* only user data is counted */
|
|
+
|
|
+ wait = min(wait, max(0LL, fragmented_allowed - fragmented)); /* clamp at 0 once the allowance is exceeded */
|
|
+ }
|
|
+
|
|
+ return wait;
|
|
+}
|
|
+
|
|
+static int bch2_copygc_thread(void *arg)
|
|
+{ /* kthread body (arg is the struct bch_fs): runs bch2_copygc() whenever the computed wait is within the write io clock's max_slop */
|
|
+ struct bch_fs *c = arg;
|
|
+ struct io_clock *clock = &c->io_clock[WRITE];
|
|
+ u64 last, wait;
|
|
+
|
|
+ set_freezable();
|
|
+
|
|
+ while (!kthread_should_stop()) {
|
|
+ if (kthread_wait_freezable(c->copy_gc_enabled)) /* blocks until copy_gc_enabled is set; nonzero return ends the thread */
|
|
+ break;
|
|
+
|
|
+ last = atomic64_read(&clock->now);
|
|
+ wait = bch2_copygc_wait_amount(c);
|
|
+
|
|
+ if (wait > clock->max_slop) {
|
|
+ c->copygc_wait = last + wait; /* publish the io-clock value we are sleeping until */
|
|
+ bch2_kthread_io_clock_wait(clock, last + wait,
|
|
+ MAX_SCHEDULE_TIMEOUT);
|
|
+ continue; /* recompute the wait from scratch after waking */
|
|
+ }
|
|
+
|
|
+ c->copygc_wait = 0;
|
|
+
|
|
+ if (bch2_copygc(c)) /* nonzero (no reserve available) ends the thread */
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_copygc_stop(struct bch_fs *c)
|
|
+{ /* stops the copygc thread if one exists and clears c->copygc_thread; safe to call when none is running */
|
|
+ if (c->copygc_thread) {
|
|
+ kthread_stop(c->copygc_thread);
|
|
+ put_task_struct(c->copygc_thread); /* pairs with get_task_struct() in bch2_copygc_start() */
|
|
+ }
|
|
+ c->copygc_thread = NULL;
|
|
+}
|
|
+
|
|
+int bch2_copygc_start(struct bch_fs *c)
|
|
+{ /* creates and wakes the copygc kthread; returns 0 on success or when nothing needs starting, else a negative errno */
|
|
+ struct task_struct *t;
|
|
+
|
|
+ if (c->copygc_thread) /* already started */
|
|
+ return 0;
|
|
+
|
|
+ if (c->opts.nochanges) /* no thread is created when the nochanges option is set */
|
|
+ return 0;
|
|
+
|
|
+ if (bch2_fs_init_fault("copygc_start"))
|
|
+ return -ENOMEM;
|
|
+
|
|
+ t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
|
|
+ if (IS_ERR(t)) {
|
|
+ bch_err(c, "error creating copygc thread: %li", PTR_ERR(t));
|
|
+ return PTR_ERR(t);
|
|
+ }
|
|
+
|
|
+ get_task_struct(t); /* reference dropped by put_task_struct() in bch2_copygc_stop() */
|
|
+
|
|
+ c->copygc_thread = t;
|
|
+ wake_up_process(c->copygc_thread);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_fs_copygc_init(struct bch_fs *c)
|
|
+{ /* currently does nothing */
|
|
+}
|
|
diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
|
|
new file mode 100644
|
|
index 000000000000..922738247d03
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/movinggc.h
|
|
@@ -0,0 +1,9 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_MOVINGGC_H
|
|
+#define _BCACHEFS_MOVINGGC_H
|
|
+
|
|
+void bch2_copygc_stop(struct bch_fs *);
|
|
+int bch2_copygc_start(struct bch_fs *);
|
|
+void bch2_fs_copygc_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_MOVINGGC_H */
|
|
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
|
|
new file mode 100644
|
|
index 000000000000..0cfbb56a57c1
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/opts.c
|
|
@@ -0,0 +1,425 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include <linux/kernel.h>
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "compress.h"
|
|
+#include "disk_groups.h"
|
|
+#include "opts.h"
|
|
+#include "super-io.h"
|
|
+#include "util.h"
|
|
+
|
|
+#define x(t, n) #t, /* each x-macro entry contributes its first argument as a string */
|
|
+
|
|
+const char * const bch2_error_actions[] = { /* every table below is NULL-terminated */
|
|
+ BCH_ERROR_ACTIONS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_sb_features[] = {
|
|
+ BCH_SB_FEATURES()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_sb_compat[] = {
|
|
+ BCH_SB_COMPAT()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_btree_ids[] = {
|
|
+ BCH_BTREE_IDS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_csum_opts[] = {
|
|
+ BCH_CSUM_OPTS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_compression_opts[] = {
|
|
+ BCH_COMPRESSION_OPTS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_str_hash_types[] = {
|
|
+ BCH_STR_HASH_OPTS()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_data_types[] = {
|
|
+ BCH_DATA_TYPES()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_cache_replacement_policies[] = {
|
|
+ BCH_CACHE_REPLACEMENT_POLICIES()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+const char * const bch2_member_states[] = {
|
|
+ BCH_MEMBER_STATES()
|
|
+ NULL
|
|
+};
|
|
+
|
|
+#undef x /* x is redefined per use further down this file */
|
|
+
|
|
+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src)
|
|
+{ /* copies every option that is defined in src into *dst and marks it defined there; other options in *dst are untouched */
|
|
+#define x(_name, ...) \
|
|
+ if (opt_defined(src, _name)) \
|
|
+ opt_set(*dst, _name, src._name);
|
|
+
|
|
+ BCH_OPTS() /* expands to one conditional copy per option */
|
|
+#undef x
|
|
+}
|
|
+
|
|
+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id)
|
|
+{ /* returns the "defined" bit of the option selected by id; BUG()s on an id with no case */
|
|
+ switch (id) {
|
|
+#define x(_name, ...) \
|
|
+ case Opt_##_name: \
|
|
+ return opt_defined(*opts, _name);
|
|
+ BCH_OPTS() /* one case per option */
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id)
|
|
+{ /* returns the stored value of the option selected by id, without consulting its "defined" bit; BUG()s on an id with no case */
|
|
+ switch (id) {
|
|
+#define x(_name, ...) \
|
|
+ case Opt_##_name: \
|
|
+ return opts->_name;
|
|
+ BCH_OPTS() /* one case per option */
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v)
|
|
+{ /* stores v into the option selected by id and marks it defined; BUG()s on an id with no case */
|
|
+ switch (id) {
|
|
+#define x(_name, ...) \
|
|
+ case Opt_##_name: \
|
|
+ opt_set(*opts, _name, v); \
|
|
+ break;
|
|
+ BCH_OPTS() /* one case per option; v is converted to the field's declared type on assignment */
|
|
+#undef x
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Initial options from superblock - here we don't want any options undefined,
|
|
+ * any options the superblock doesn't specify are set to 0:
|
|
+ */
|
|
+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb)
|
|
+{ /* builds a bch_opts holding every option that has a superblock field, read from sb */
|
|
+ struct bch_opts opts = bch2_opts_empty();
|
|
+
|
|
+#define x(_name, _bits, _mode, _type, _sb_opt, ...) \
|
|
+ if (_sb_opt != NO_SB_OPT) \
|
|
+ opt_set(opts, _name, _sb_opt(sb));
|
|
+ BCH_OPTS() /* options whose sb accessor is NO_SB_OPT are left undefined */
|
|
+#undef x
|
|
+
|
|
+ return opts;
|
|
+}
|
|
+
|
|
+const struct bch_option bch2_opt_table[] = { /* one descriptor per option, indexed by Opt_<name> */
|
|
+#define OPT_BOOL() .type = BCH_OPT_BOOL /* the OPT_* helpers expand to designated initializers for the type-specific fields */
|
|
+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max
|
|
+#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max
|
|
+#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices
|
|
+#define OPT_FN(_fn) .type = BCH_OPT_FN, \
|
|
+ .parse = _fn##_parse, \
|
|
+ .to_text = _fn##_to_text
|
|
+
|
|
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \
|
|
+ [Opt_##_name] = { \
|
|
+ .attr = { \
|
|
+ .name = #_name, \
|
|
+ .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \
|
|
+ }, \
|
|
+ .mode = _mode, \
|
|
+ .hint = _hint, \
|
|
+ .help = _help, \
|
|
+ .set_sb = SET_##_sb_opt, \
|
|
+ _type \
|
|
+ },
|
|
+
|
|
+ BCH_OPTS() /* attribute mode is 0644 only for options settable at runtime, 0444 otherwise */
|
|
+#undef x
|
|
+};
|
|
+
|
|
+int bch2_opt_lookup(const char *name)
|
|
+{ /* linear search of bch2_opt_table by exact name; returns the table index (the option id) or -1 if absent */
|
|
+ const struct bch_option *i;
|
|
+
|
|
+ for (i = bch2_opt_table;
|
|
+ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table);
|
|
+ i++)
|
|
+ if (!strcmp(name, i->attr.name))
|
|
+ return i - bch2_opt_table;
|
|
+
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+struct synonym { /* alias pair: bch2_mount_opt_lookup() replaces a name equal to s1 with s2 */
|
|
+ const char *s1, *s2;
|
|
+};
|
|
+
|
|
+static const struct synonym bch_opt_synonyms[] = {
|
|
+ { "quota", "usrquota" }, /* the mount option "quota" is looked up as "usrquota" */
|
|
+};
|
|
+
|
|
+static int bch2_mount_opt_lookup(const char *name)
|
|
+{ /* like bch2_opt_lookup(), but maps synonyms to their canonical name first */
|
|
+ const struct synonym *i;
|
|
+
|
|
+ for (i = bch_opt_synonyms;
|
|
+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms);
|
|
+ i++)
|
|
+ if (!strcmp(name, i->s1))
|
|
+ name = i->s2; /* no break: later entries are compared against the replaced name */
|
|
+
|
|
+ return bch2_opt_lookup(name);
|
|
+}
|
|
+
|
|
+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt,
|
|
+ const char *val, u64 *res)
|
|
+{ /* parses val according to opt->type into *res; returns 0 on success or a negative error; c may be NULL and is only used by BCH_OPT_FN */
|
|
+ ssize_t ret;
|
|
+
|
|
+ switch (opt->type) {
|
|
+ case BCH_OPT_BOOL:
|
|
+ ret = kstrtou64(val, 10, res);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ if (*res > 1) /* only 0 and 1 are accepted */
|
|
+ return -ERANGE;
|
|
+ break;
|
|
+ case BCH_OPT_UINT:
|
|
+ ret = kstrtou64(val, 10, res);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ if (*res < opt->min || *res >= opt->max) /* valid range is [min, max): max itself is rejected */
|
|
+ return -ERANGE;
|
|
+ break;
|
|
+ case BCH_OPT_SECTORS:
|
|
+ ret = bch2_strtou64_h(val, res);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ if (*res & 511) /* must be a whole number of 512-byte sectors */
|
|
+ return -EINVAL;
|
|
+
|
|
+ *res >>= 9; /* stored in sectors */
|
|
+
|
|
+ if (*res < opt->min || *res >= opt->max) /* range is checked in sectors, max exclusive */
|
|
+ return -ERANGE;
|
|
+ break;
|
|
+ case BCH_OPT_STR:
|
|
+ ret = match_string(opt->choices, -1, val);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ *res = ret; /* result is the index into opt->choices */
|
|
+ break;
|
|
+ case BCH_OPT_FN:
|
|
+ if (c)
|
|
+ return opt->parse(c, val, res);
|
|
+ *res = 0; /* no filesystem to parse against: succeed with a defined 0 so callers never store an uninitialized value */
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ const struct bch_option *opt, u64 v,
|
|
+ unsigned flags)
|
|
+{ /* appends the text form of value v of option opt to out; flags select mount-style ("name=value", "noname") and full-list output */
|
|
+ if (flags & OPT_SHOW_MOUNT_STYLE) {
|
|
+ if (opt->type == BCH_OPT_BOOL) {
|
|
+ pr_buf(out, "%s%s", /* booleans print as "name" or "noname", with no value */
|
|
+ v ? "" : "no",
|
|
+ opt->attr.name);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ pr_buf(out, "%s=", opt->attr.name);
|
|
+ }
|
|
+
|
|
+ switch (opt->type) {
|
|
+ case BCH_OPT_BOOL:
|
|
+ case BCH_OPT_UINT:
|
|
+ pr_buf(out, "%llu", v); /* v is u64: print it unsigned */
|
|
+ break;
|
|
+ case BCH_OPT_SECTORS:
|
|
+ bch2_hprint(out, v); /* NOTE(review): bch2_opt_parse() stores this type in 512-byte sectors - confirm the unit bch2_hprint() expects */
|
|
+ break;
|
|
+ case BCH_OPT_STR:
|
|
+ if (flags & OPT_SHOW_FULL_LIST)
|
|
+ bch2_string_opt_to_text(out, opt->choices, v);
|
|
+ else
|
|
+ pr_buf(out, "%s", opt->choices[v]); /* never pass a table string as the format itself */
|
|
+ break;
|
|
+ case BCH_OPT_FN:
|
|
+ opt->to_text(out, c, v);
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
|
|
+{ /* runs the per-option hook needed before option id takes value v; returns 0, or the error from the compression check */
|
|
+ int ret = 0;
|
|
+
|
|
+ switch (id) {
|
|
+ case Opt_compression:
|
|
+ case Opt_background_compression:
|
|
+ ret = bch2_check_set_has_compressed_data(c, v);
|
|
+ break;
|
|
+ case Opt_erasure_code:
|
|
+ if (v) /* only when erasure coding is being enabled */
|
|
+ bch2_check_set_feature(c, BCH_FEATURE_ec);
|
|
+ break;
|
|
+ } /* all other options need no check */
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_opts_check_may_set(struct bch_fs *c)
|
|
+{ /* applies bch2_opt_check_may_set() to the current value of every option in c->opts; stops at the first error */
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ for (i = 0; i < bch2_opts_nr; i++) {
|
|
+ ret = bch2_opt_check_may_set(c, i,
|
|
+ bch2_opt_get_by_id(&c->opts, i));
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
|
|
+ char *options)
|
|
+{ /* parses a comma-separated "name[=value]" list into *opts; returns 0, or -1 after logging the offending option */
|
|
+ char *opt, *name, *val;
|
|
+ int ret, id;
|
|
+ u64 v;
|
|
+
|
|
+ while ((opt = strsep(&options, ",")) != NULL) { /* strsep() writes NULs into the caller's string */
|
|
+ name = strsep(&opt, "=");
|
|
+ val = opt; /* NULL when the token has no '=' */
|
|
+
|
|
+ if (val) {
|
|
+ id = bch2_mount_opt_lookup(name);
|
|
+ if (id < 0)
|
|
+ goto bad_opt;
|
|
+
|
|
+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v);
|
|
+ if (ret < 0)
|
|
+ goto bad_val;
|
|
+ } else {
|
|
+ id = bch2_mount_opt_lookup(name);
|
|
+ v = 1; /* bare "name" means true */
|
|
+
|
|
+ if (id < 0 &&
|
|
+ !strncmp("no", name, 2)) { /* unknown name starting with "no": retry without the prefix, meaning false */
|
|
+ id = bch2_mount_opt_lookup(name + 2);
|
|
+ v = 0;
|
|
+ }
|
|
+
|
|
+ if (id < 0)
|
|
+ goto bad_opt;
|
|
+
|
|
+ if (bch2_opt_table[id].type != BCH_OPT_BOOL) /* only booleans may omit the value */
|
|
+ goto no_val;
|
|
+ }
|
|
+
|
|
+ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) /* option exists but may not be set at mount time */
|
|
+ goto bad_opt;
|
|
+
|
|
+ if (id == Opt_acl &&
|
|
+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL))
|
|
+ goto bad_opt;
|
|
+
|
|
+ if ((id == Opt_usrquota ||
|
|
+ id == Opt_grpquota) &&
|
|
+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA))
|
|
+ goto bad_opt;
|
|
+
|
|
+ bch2_opt_set_by_id(opts, id, v);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+bad_opt:
|
|
+ pr_err("Bad mount option %s", name);
|
|
+ return -1;
|
|
+bad_val:
|
|
+ pr_err("Invalid value %s for mount option %s", val, name);
|
|
+ return -1;
|
|
+no_val:
|
|
+ pr_err("Mount option %s requires a value", name);
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+/* io opts: */
|
|
+
|
|
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src)
|
|
+{ /* returns a bch_io_opts holding the options listed in BCH_INODE_OPTS() that are defined in src */
|
|
+ struct bch_io_opts ret = { 0 };
|
|
+#define x(_name, _bits) \
|
|
+ if (opt_defined(src, _name)) \
|
|
+ opt_set(ret, _name, src._name);
|
|
+ BCH_INODE_OPTS() /* undefined options stay zeroed and undefined in ret */
|
|
+#undef x
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src)
|
|
+{ /* the reverse conversion: returns a bch_opts holding the BCH_INODE_OPTS() options that are defined in src */
|
|
+ struct bch_opts ret = { 0 };
|
|
+#define x(_name, _bits) \
|
|
+ if (opt_defined(src, _name)) \
|
|
+ opt_set(ret, _name, src._name);
|
|
+ BCH_INODE_OPTS() /* every other option in ret stays zeroed and undefined */
|
|
+#undef x
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src)
|
|
+{ /* copies every inode option that is defined in src into *dst; other options in *dst are untouched */
|
|
+#define x(_name, _bits) \
|
|
+ if (opt_defined(src, _name)) \
|
|
+ opt_set(*dst, _name, src._name);
|
|
+ BCH_INODE_OPTS() /* one conditional copy per inode option */
|
|
+#undef x
|
|
+}
|
|
+
|
|
+bool bch2_opt_is_inode_opt(enum bch_opt_id id)
|
|
+{ /* true iff id names one of the options listed in BCH_INODE_OPTS() */
|
|
+ static const enum bch_opt_id inode_opt_list[] = {
|
|
+#define x(_name, _bits) Opt_##_name,
|
|
+ BCH_INODE_OPTS() /* expands to the Opt_ id of each inode option */
|
|
+#undef x
|
|
+ };
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++)
|
|
+ if (inode_opt_list[i] == id)
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
|
|
new file mode 100644
|
|
index 000000000000..001e865c5555
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/opts.h
|
|
@@ -0,0 +1,446 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_OPTS_H
|
|
+#define _BCACHEFS_OPTS_H
|
|
+
|
|
+#include <linux/bug.h>
|
|
+#include <linux/log2.h>
|
|
+#include <linux/string.h>
|
|
+#include <linux/sysfs.h>
|
|
+#include "bcachefs_format.h"
|
|
+
|
|
+extern const char * const bch2_error_actions[];
|
|
+extern const char * const bch2_sb_features[];
|
|
+extern const char * const bch2_sb_compat[];
|
|
+extern const char * const bch2_btree_ids[];
|
|
+extern const char * const bch2_csum_opts[];
|
|
+extern const char * const bch2_compression_opts[];
|
|
+extern const char * const bch2_str_hash_types[];
|
|
+extern const char * const bch2_data_types[];
|
|
+extern const char * const bch2_cache_replacement_policies[];
|
|
+extern const char * const bch2_member_states[];
|
|
+
|
|
+/*
|
|
+ * Mount options; we also store defaults in the superblock.
|
|
+ *
|
|
+ * Also exposed via sysfs: if an option is writeable, and it's also stored in
|
|
+ * the superblock, changing it via sysfs (currently? might change this) also
|
|
+ * updates the superblock.
|
|
+ *
|
|
+ * Each option carries a separate "defined" bit next to its value. This means we
|
|
+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only
|
|
+ * apply the options from that struct that are defined.
|
|
+ */
|
|
+
|
|
+/* dummy option, for options that aren't stored in the superblock */
|
|
+LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0);
|
|
+
|
|
+/* When can be set: */
|
|
+enum opt_mode { /* bit flags; an option's mode is an OR of these */
|
|
+ OPT_FORMAT = (1 << 0),
|
|
+ OPT_MOUNT = (1 << 1), /* required by bch2_parse_mount_opts() for an option to be accepted */
|
|
+ OPT_RUNTIME = (1 << 2), /* gives the option's attribute mode 0644 instead of 0444 in bch2_opt_table */
|
|
+ OPT_INODE = (1 << 3),
|
|
+ OPT_DEVICE = (1 << 4),
|
|
+};
|
|
+
|
|
+enum opt_type { /* selects how bch2_opt_parse() and bch2_opt_to_text() treat a value */
|
|
+ BCH_OPT_BOOL, /* decimal 0 or 1 */
|
|
+ BCH_OPT_UINT, /* decimal integer in [min, max) */
|
|
+ BCH_OPT_SECTORS, /* size given as a multiple of 512 bytes, stored in sectors, in [min, max) */
|
|
+ BCH_OPT_STR, /* one of a NULL-terminated list of choices, stored as its index */
|
|
+ BCH_OPT_FN, /* parsed and printed by the option's own parse/to_text callbacks */
|
|
+};
|
|
+
|
|
+/**
|
|
+ * x(name, in mem type, mode, type, sb_opt, default, hint, help)
|
|
+ *
|
|
+ * @name - name of mount option, sysfs attribute, and struct bch_opts
|
|
+ * member
|
|
+ *
|
|
+ * @mode - when opt may be set
|
|
+ *
|
|
+ * @sb_opt - name of corresponding superblock option, or NO_SB_OPT
|
|
+ *
|
|
+ * @type - one of OPT_BOOL, OPT_UINT, OPT_SECTORS, OPT_STR, OPT_FN
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * XXX: add fields for
|
|
+ * - default value
|
|
+ * - helptext
|
|
+ */
|
|
+
|
|
+#ifdef __KERNEL__
|
|
+#define RATELIMIT_ERRORS true
|
|
+#else
|
|
+#define RATELIMIT_ERRORS false
|
|
+#endif
|
|
+
|
|
+#define BCH_OPTS() /* x(name, in-memory type, mode, type, sb field, default, hint, help) */ \
|
|
+ x(block_size, u16, \
|
|
+ OPT_FORMAT, \
|
|
+ OPT_SECTORS(1, 128), \
|
|
+ BCH_SB_BLOCK_SIZE, 8, \
|
|
+ "size", NULL) \
|
|
+ x(btree_node_size, u16, \
|
|
+ OPT_FORMAT, \
|
|
+ OPT_SECTORS(1, 512), \
|
|
+ BCH_SB_BTREE_NODE_SIZE, 512, \
|
|
+ "size", "Btree node size, default 256k") \
|
|
+ x(errors, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_STR(bch2_error_actions), \
|
|
+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \
|
|
+ NULL, "Action to take on filesystem error") \
|
|
+ x(metadata_replicas, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
|
|
+ BCH_SB_META_REPLICAS_WANT, 1, \
|
|
+ "#", "Number of metadata replicas") \
|
|
+ x(data_replicas, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
|
|
+ BCH_SB_DATA_REPLICAS_WANT, 1, \
|
|
+ "#", "Number of data replicas") \
|
|
+ x(metadata_replicas_required, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT, \
|
|
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
|
|
+ BCH_SB_META_REPLICAS_REQ, 1, \
|
|
+ "#", NULL) \
|
|
+ x(data_replicas_required, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT, \
|
|
+ OPT_UINT(1, BCH_REPLICAS_MAX), \
|
|
+ BCH_SB_DATA_REPLICAS_REQ, 1, \
|
|
+ "#", NULL) \
|
|
+ x(metadata_checksum, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_STR(bch2_csum_opts), \
|
|
+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
|
|
+ NULL, NULL) \
|
|
+ x(data_checksum, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_STR(bch2_csum_opts), \
|
|
+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \
|
|
+ NULL, NULL) \
|
|
+ x(compression, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_STR(bch2_compression_opts), \
|
|
+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \
|
|
+ NULL, NULL) \
|
|
+ x(background_compression, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_STR(bch2_compression_opts), \
|
|
+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \
|
|
+ NULL, NULL) \
|
|
+ x(str_hash, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_STR(bch2_str_hash_types), \
|
|
+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \
|
|
+ NULL, "Hash function for directory entries and xattrs")\
|
|
+ x(metadata_target, u16, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_FN(bch2_opt_target), \
|
|
+ BCH_SB_METADATA_TARGET, 0, \
|
|
+ "(target)", "Device or disk group for metadata writes") \
|
|
+ x(foreground_target, u16, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_FN(bch2_opt_target), \
|
|
+ BCH_SB_FOREGROUND_TARGET, 0, \
|
|
+ "(target)", "Device or disk group for foreground writes") \
|
|
+ x(background_target, u16, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_FN(bch2_opt_target), \
|
|
+ BCH_SB_BACKGROUND_TARGET, 0, \
|
|
+ "(target)", "Device or disk group to move data to in the background")\
|
|
+ x(promote_target, u16, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_FN(bch2_opt_target), \
|
|
+ BCH_SB_PROMOTE_TARGET, 0, \
|
|
+ "(target)", "Device or disk group to promote data to on read")\
|
|
+ x(erasure_code, u16, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_ERASURE_CODE, false, \
|
|
+ NULL, "Enable erasure coding (DO NOT USE YET)") \
|
|
+ x(inodes_32bit, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_INODE_32BIT, false, \
|
|
+ NULL, "Constrain inode numbers to 32 bits") \
|
|
+ x(gc_reserve_percent, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_UINT(5, 21), \
|
|
+ BCH_SB_GC_RESERVE, 8, \
|
|
+ "%", "Percentage of disk space to reserve for copygc")\
|
|
+ x(gc_reserve_bytes, u64, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_SECTORS(0, U64_MAX), \
|
|
+ BCH_SB_GC_RESERVE_BYTES, 0, \
|
|
+ "size", "Amount of disk space to reserve for copygc\n" \
|
|
+ "Takes precedence over gc_reserve_percent if set")\
|
|
+ x(root_reserve_percent, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT, \
|
|
+ OPT_UINT(0, 100), \
|
|
+ BCH_SB_ROOT_RESERVE, 0, \
|
|
+ "%", "Percentage of disk space to reserve for superuser")\
|
|
+ x(wide_macs, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_128_BIT_MACS, false, \
|
|
+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\
|
|
+ x(inline_data, u8, \
|
|
+ OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, true, \
|
|
+ NULL, "Enable inline data extents") \
|
|
+ x(acl, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_POSIX_ACL, true, \
|
|
+ NULL, "Enable POSIX acls") \
|
|
+ x(usrquota, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_USRQUOTA, false, \
|
|
+ NULL, "Enable user quotas") \
|
|
+ x(grpquota, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_GRPQUOTA, false, \
|
|
+ NULL, "Enable group quotas") \
|
|
+ x(prjquota, u8, \
|
|
+ OPT_FORMAT|OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ BCH_SB_PRJQUOTA, false, \
|
|
+ NULL, "Enable project quotas") \
|
|
+ x(degraded, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Allow mounting in degraded mode") \
|
|
+ x(very_degraded, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Allow mounting when data will be missing") \
|
|
+ x(discard, u8, \
|
|
+ OPT_MOUNT|OPT_DEVICE, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Enable discard/TRIM support") \
|
|
+ x(verbose, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Extra debugging information during mount/recovery")\
|
|
+ x(journal_flush_disabled, u8, \
|
|
+ OPT_MOUNT|OPT_RUNTIME, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Disable journal flush on sync/fsync\n" \
|
|
+ "If enabled, writes can be lost, but only since the\n"\
|
|
+ "last journal write (default 1 second)") \
|
|
+ x(fsck, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Run fsck on mount") \
|
|
+ x(fix_errors, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Fix errors during fsck without asking") \
|
|
+ x(ratelimit_errors, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, RATELIMIT_ERRORS, \
|
|
+ NULL, "Ratelimit error messages during fsck") \
|
|
+ x(nochanges, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Super read only mode - no writes at all will be issued,\n"\
|
|
+ "even if we have to replay the journal") \
|
|
+ x(norecovery, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Don't replay the journal") \
|
|
+ x(rebuild_replicas, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Rebuild the superblock replicas section") \
|
|
+ x(keep_journal, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Don't free journal entries/keys after startup")\
|
|
+ x(read_entire_journal, u8, \
|
|
+ 0, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Read all journal entries, not just dirty ones")\
|
|
+ x(noexcl, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Don't open device in exclusive mode") \
|
|
+ x(sb, u64, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_UINT(0, S64_MAX), \
|
|
+ NO_SB_OPT, BCH_SB_SECTOR, \
|
|
+ "offset", "Sector offset of superblock") \
|
|
+ x(read_only, u8, \
|
|
+ 0, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, NULL) \
|
|
+ x(nostart, u8, \
|
|
+ 0, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Don\'t start filesystem, only open devices") \
|
|
+ x(reconstruct_alloc, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Reconstruct alloc btree") \
|
|
+ x(version_upgrade, u8, \
|
|
+ OPT_MOUNT, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, "Set superblock to latest version,\n" \
|
|
+ "allowing any new features to be used") \
|
|
+ x(project, u8, \
|
|
+ OPT_INODE, \
|
|
+ OPT_BOOL(), \
|
|
+ NO_SB_OPT, false, \
|
|
+ NULL, NULL) \
|
|
+ x(fs_size, u64, \
|
|
+ OPT_DEVICE, \
|
|
+ OPT_SECTORS(0, S64_MAX), \
|
|
+ NO_SB_OPT, 0, \
|
|
+ "size", "Size of filesystem on device") \
|
|
+ x(bucket, u32, \
|
|
+ OPT_DEVICE, \
|
|
+ OPT_SECTORS(0, S64_MAX), \
|
|
+ NO_SB_OPT, 0, \
|
|
+ "size", "Bucket size") \
|
|
+ x(durability, u8, \
|
|
+ OPT_DEVICE, \
|
|
+ OPT_UINT(0, BCH_REPLICAS_MAX), \
|
|
+ NO_SB_OPT, 1, \
|
|
+ "n", "Data written to this device will be considered\n"\
|
|
+ "to have already been replicated n times")
|
|
+
|
|
+struct bch_opts {
|
|
+#define x(_name, _bits, ...) unsigned _name##_defined:1;
|
|
+ BCH_OPTS()
|
|
+#undef x
|
|
+
|
|
+#define x(_name, _bits, ...) _bits _name;
|
|
+ BCH_OPTS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+static const struct bch_opts bch2_opts_default = {
|
|
+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \
|
|
+ ._name##_defined = true, \
|
|
+ ._name = _default, \
|
|
+
|
|
+ BCH_OPTS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+#define opt_defined(_opts, _name) ((_opts)._name##_defined)
|
|
+
|
|
+#define opt_get(_opts, _name) \
|
|
+ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name)
|
|
+
|
|
+#define opt_set(_opts, _name, _v) \
|
|
+do { \
|
|
+ (_opts)._name##_defined = true; \
|
|
+ (_opts)._name = _v; \
|
|
+} while (0)
|
|
+
|
|
+/* Return an option set in which no option is marked as defined. */
+static inline struct bch_opts bch2_opts_empty(void)
+{
+	struct bch_opts empty = { 0 };
+
+	return empty;
+}
|
|
+
|
|
+void bch2_opts_apply(struct bch_opts *, struct bch_opts);
|
|
+
|
|
+enum bch_opt_id {
|
|
+#define x(_name, ...) Opt_##_name,
|
|
+ BCH_OPTS()
|
|
+#undef x
|
|
+ bch2_opts_nr
|
|
+};
|
|
+
|
|
+struct bch_fs;
|
|
+struct printbuf;
|
|
+
|
|
+struct bch_option {
|
|
+ struct attribute attr;
|
|
+ void (*set_sb)(struct bch_sb *, u64);
|
|
+ enum opt_mode mode;
|
|
+ enum opt_type type;
|
|
+
|
|
+ union {
|
|
+ struct {
|
|
+ u64 min, max;
|
|
+ };
|
|
+ struct {
|
|
+ const char * const *choices;
|
|
+ };
|
|
+ struct {
|
|
+ int (*parse)(struct bch_fs *, const char *, u64 *);
|
|
+ void (*to_text)(struct printbuf *, struct bch_fs *, u64);
|
|
+ };
|
|
+ };
|
|
+
|
|
+ const char *hint;
|
|
+ const char *help;
|
|
+
|
|
+};
|
|
+
|
|
+extern const struct bch_option bch2_opt_table[];
|
|
+
|
|
+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id);
|
|
+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id);
|
|
+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64);
|
|
+
|
|
+struct bch_opts bch2_opts_from_sb(struct bch_sb *);
|
|
+
|
|
+int bch2_opt_lookup(const char *);
|
|
+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *);
|
|
+
|
|
+#define OPT_SHOW_FULL_LIST (1 << 0)
|
|
+#define OPT_SHOW_MOUNT_STYLE (1 << 1)
|
|
+
|
|
+void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
|
|
+ const struct bch_option *, u64, unsigned);
|
|
+
|
|
+int bch2_opt_check_may_set(struct bch_fs *, int, u64);
|
|
+int bch2_opts_check_may_set(struct bch_fs *);
|
|
+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *);
|
|
+
|
|
+/* inode opts: */
|
|
+
|
|
+struct bch_io_opts {
|
|
+#define x(_name, _bits) unsigned _name##_defined:1;
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+
|
|
+#define x(_name, _bits) u##_bits _name;
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
|
|
+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts);
|
|
+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts);
|
|
+bool bch2_opt_is_inode_opt(enum bch_opt_id);
|
|
+
|
|
+#endif /* _BCACHEFS_OPTS_H */
|
|
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
|
|
new file mode 100644
|
|
index 000000000000..3f78fe7d37f4
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/quota.c
|
|
@@ -0,0 +1,781 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "btree_update.h"
|
|
+#include "inode.h"
|
|
+#include "quota.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+/*
+ * Validate the superblock quota field: its size must be exactly
+ * sizeof(struct bch_sb_field_quota).
+ *
+ * Returns NULL if the field is valid, otherwise a static error string.
+ */
+static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	struct bch_sb_field_quota *sb_q = field_to_type(f, quota);
+
+	return vstruct_bytes(&sb_q->field) == sizeof(*sb_q)
+		? NULL
+		: "invalid field quota: wrong size";
+}
|
|
+
|
|
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
|
|
+ .validate = bch2_sb_validate_quota,
|
|
+};
|
|
+
|
|
+/*
+ * Sanity-check a quota key: the inode field of the position holds the quota
+ * type and the value must be exactly one struct bch_quota.
+ *
+ * Returns NULL if the key is valid, otherwise a static error string.
+ */
+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+	const char *err = NULL;
+
+	if (k.k->p.inode >= QTYP_NR)
+		err = "invalid quota type";
+	else if (bkey_val_bytes(k.k) != sizeof(struct bch_quota))
+		err = "incorrect value size";
+
+	return err;
+}
|
|
+
|
|
+static const char * const bch2_quota_counters[] = {
|
|
+ "space",
|
|
+ "inodes",
|
|
+};
|
|
+
|
|
+/* Print the hard and soft limit of every counter stored in quota key @k. */
+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c,
+			struct bkey_s_c k)
+{
+	struct bkey_s_c_quota quota = bkey_s_c_to_quota(k);
+	unsigned counter = 0;
+
+	while (counter < Q_COUNTERS) {
+		pr_buf(out, "%s hardlimit %llu softlimit %llu",
+		       bch2_quota_counters[counter],
+		       le64_to_cpu(quota.v->c[counter].hardlimit),
+		       le64_to_cpu(quota.v->c[counter].softlimit));
+		counter++;
+	}
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_QUOTA
|
|
+
|
|
+#include <linux/cred.h>
|
|
+#include <linux/fs.h>
|
|
+#include <linux/quota.h>
|
|
+
|
|
+/*
+ * Return the index of the lowest bit set in @qtypes at position >= @i, or
+ * QTYP_NR if no such bit exists.
+ */
+static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
+{
+	unsigned remaining = qtypes >> i;
+
+	if (!remaining)
+		return QTYP_NR;
+
+	return i + __ffs(remaining);
+}
|
|
+
|
|
+#define for_each_set_qtype(_c, _i, _q, _qtypes) \
|
|
+ for (_i = 0; \
|
|
+ (_i = __next_qtype(_i, _qtypes), \
|
|
+ _q = &(_c)->quotas[_i], \
|
|
+ _i < QTYP_NR); \
|
|
+ _i++)
|
|
+
|
|
+/*
+ * Limits are not enforced for tasks holding CAP_SYS_RESOURCE.
+ *
+ * @q is currently unused.  A previously compiled-out variant of this check
+ * additionally looked at the quota format id and DQF_ROOT_SQUASH; that logic
+ * was never enabled here.
+ */
+static bool ignore_hardlimit(struct bch_memquota_type *q)
+{
+	return capable(CAP_SYS_RESOURCE);
+}
|
|
+
|
|
+enum quota_msg {
|
|
+ SOFTWARN, /* Softlimit reached */
|
|
+ SOFTLONGWARN, /* Grace time expired */
|
|
+ HARDWARN, /* Hardlimit reached */
|
|
+
|
|
+ HARDBELOW, /* Usage got below inode hardlimit */
|
|
+ SOFTBELOW, /* Usage got below inode softlimit */
|
|
+};
|
|
+
|
|
+static int quota_nl[][Q_COUNTERS] = {
|
|
+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN,
|
|
+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN,
|
|
+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN,
|
|
+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW,
|
|
+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW,
|
|
+
|
|
+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN,
|
|
+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN,
|
|
+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN,
|
|
+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW,
|
|
+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW,
|
|
+};
|
|
+
|
|
+struct quota_msgs {
|
|
+ u8 nr;
|
|
+ struct {
|
|
+ u8 qtype;
|
|
+ u8 msg;
|
|
+ } m[QTYP_NR * Q_COUNTERS];
|
|
+};
|
|
+
|
|
+/*
+ * Append one notification to @msgs, translating (@msg_type, @counter) into
+ * the QUOTA_NL_* code via quota_nl[].  Overflowing the fixed-size message
+ * array is a BUG.
+ */
+static void prepare_msg(unsigned qtype,
+			enum quota_counters counter,
+			struct quota_msgs *msgs,
+			enum quota_msg msg_type)
+{
+	unsigned slot = msgs->nr;
+
+	BUG_ON(slot >= ARRAY_SIZE(msgs->m));
+
+	msgs->m[slot].qtype	= qtype;
+	msgs->m[slot].msg	= quota_nl[msg_type][counter];
+	msgs->nr		= slot + 1;
+}
|
|
+
|
|
+/*
+ * Queue a warning of @msg_type for @qc unless that warning has already been
+ * issued.
+ *
+ * The issued bit is recorded here so that the same warning is not queued
+ * again on every subsequent charge, and so that bch2_quota_check_limit()
+ * can later see the HARDWARN/SOFTWARN bits when usage drops back under a
+ * limit.  Without setting the bit the check below could never be true.
+ */
+static void prepare_warning(struct memquota_counter *qc,
+			    unsigned qtype,
+			    enum quota_counters counter,
+			    struct quota_msgs *msgs,
+			    enum quota_msg msg_type)
+{
+	if (qc->warning_issued & (1 << msg_type))
+		return;
+
+	qc->warning_issued |= 1 << msg_type;
+	prepare_msg(qtype, counter, msgs, msg_type);
+}
|
|
+
|
|
+/*
+ * Send every queued notification in @msgs for the ids in @qid.
+ *
+ * @msgs may hold up to QTYP_NR * Q_COUNTERS entries, in no fixed relation to
+ * the quota type, so the id must be looked up by the quota type recorded in
+ * each message rather than by the message's slot number (which would pick
+ * the wrong id and can index past the end of qid.q[]).
+ */
+static void flush_warnings(struct bch_qid qid,
+			   struct super_block *sb,
+			   struct quota_msgs *msgs)
+{
+	unsigned i;
+
+	for (i = 0; i < msgs->nr; i++) {
+		unsigned qtype = msgs->m[i].qtype;
+
+		quota_send_warning(make_kqid(&init_user_ns, qtype, qid.q[qtype]),
+				   sb->s_dev, msgs->m[i].msg);
+	}
+}
|
|
+
|
|
+/*
+ * Check whether adjusting @counter of @mq by @v is permitted for quota type
+ * @qtype, queueing any resulting notifications in @msgs.
+ *
+ * Returns 0 if the change may proceed, or -EDQUOT if a limit would be
+ * exceeded and @mode is KEY_TYPE_QUOTA_PREALLOC.  With
+ * KEY_TYPE_QUOTA_NOCHECK no limits are examined at all.
+ */
+static int bch2_quota_check_limit(struct bch_fs *c,
+				  unsigned qtype,
+				  struct bch_memquota *mq,
+				  struct quota_msgs *msgs,
+				  enum quota_counters counter,
+				  s64 v,
+				  enum quota_acct_mode mode)
+{
+	struct bch_memquota_type *qt = &c->quotas[qtype];
+	struct memquota_counter *cnt = &mq->c[counter];
+	const bool fail_on_limit = mode == KEY_TYPE_QUOTA_PREALLOC;
+	u64 new_usage = cnt->v + v;
+
+	BUG_ON((s64) new_usage < 0);
+
+	if (mode == KEY_TYPE_QUOTA_NOCHECK)
+		return 0;
+
+	if (v <= 0) {
+		/* Usage is not growing: report limits we dropped back under */
+		if ((cnt->warning_issued & (1 << HARDWARN)) &&
+		    new_usage < cnt->hardlimit) {
+			cnt->warning_issued &= ~(1 << HARDWARN);
+			prepare_msg(qtype, counter, msgs, HARDBELOW);
+		}
+
+		if ((cnt->warning_issued & (1 << SOFTWARN)) &&
+		    new_usage < cnt->softlimit) {
+			cnt->warning_issued &= ~(1 << SOFTWARN);
+			prepare_msg(qtype, counter, msgs, SOFTBELOW);
+		}
+
+		cnt->warning_issued = 0;
+		return 0;
+	}
+
+	/* Hard limit exceeded */
+	if (cnt->hardlimit &&
+	    new_usage > cnt->hardlimit &&
+	    !ignore_hardlimit(qt)) {
+		if (fail_on_limit)
+			return -EDQUOT;
+
+		prepare_warning(cnt, qtype, counter, msgs, HARDWARN);
+	}
+
+	/* Soft limit exceeded and the timer has run out */
+	if (cnt->softlimit &&
+	    new_usage > cnt->softlimit &&
+	    cnt->timer &&
+	    ktime_get_real_seconds() >= cnt->timer &&
+	    !ignore_hardlimit(qt)) {
+		if (fail_on_limit)
+			return -EDQUOT;
+
+		prepare_warning(cnt, qtype, counter, msgs, SOFTLONGWARN);
+	}
+
+	/* Soft limit exceeded with no timer armed yet */
+	if (cnt->softlimit &&
+	    new_usage > cnt->softlimit &&
+	    !cnt->timer) {
+		if (fail_on_limit)
+			return -EDQUOT;
+
+		prepare_warning(cnt, qtype, counter, msgs, SOFTWARN);
+
+		/* XXX is this the right one? */
+		cnt->timer = ktime_get_real_seconds() +
+			qt->limits[counter].warnlimit;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Charge @v units of @counter to the ids in @qid, for every enabled quota
+ * type.
+ *
+ * All per-type locks are taken first, every type is checked against its
+ * limits, and only if all checks pass are the counters updated.  Queued
+ * warnings are sent after the locks have been dropped.
+ *
+ * Returns 0 on success, -ENOMEM if a table entry could not be allocated, or
+ * the error from bch2_quota_check_limit().
+ */
+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
+		    enum quota_counters counter, s64 v,
+		    enum quota_acct_mode mode)
+{
+	const unsigned enabled = enabled_qtypes(c);
+	struct bch_memquota *entries[QTYP_NR];
+	struct quota_msgs pending = { .nr = 0 };
+	struct bch_memquota_type *qt;
+	unsigned t;
+	int ret = 0;
+
+	for_each_set_qtype(c, t, qt, enabled)
+		mutex_lock_nested(&qt->lock, t);
+
+	for_each_set_qtype(c, t, qt, enabled) {
+		entries[t] = genradix_ptr_alloc(&qt->table, qid.q[t], GFP_NOFS);
+		if (!entries[t]) {
+			ret = -ENOMEM;
+			goto unlock;
+		}
+
+		ret = bch2_quota_check_limit(c, t, entries[t], &pending,
+					     counter, v, mode);
+		if (ret)
+			goto unlock;
+	}
+
+	for_each_set_qtype(c, t, qt, enabled)
+		entries[t]->c[counter].v += v;
+unlock:
+	for_each_set_qtype(c, t, qt, enabled)
+		mutex_unlock(&qt->lock);
+
+	flush_warnings(qid, c->vfs_sb, &pending);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Move @v units of @counter from @src_q to @dst_q.  It is a BUG to move more
+ * than the source holds or to overflow the destination.
+ */
+static void __bch2_quota_transfer(struct bch_memquota *src_q,
+				  struct bch_memquota *dst_q,
+				  enum quota_counters counter, s64 v)
+{
+	struct memquota_counter *from	= &src_q->c[counter];
+	struct memquota_counter *to	= &dst_q->c[counter];
+
+	BUG_ON(v > from->v);
+	BUG_ON(v + to->v < v);
+
+	from->v	-= v;
+	to->v	+= v;
+}
|
|
+
|
|
+/*
+ * Transfer one inode and @space sectors of usage from the ids in @src to the
+ * ids in @dst, for the quota types in @qtypes that are also enabled.
+ *
+ * The destination is checked against its limits before anything is moved.
+ * bch2_quota_check_limit() takes the *change* in usage and adds it to the
+ * current value itself, so the deltas (@space and 1) are passed here;
+ * passing current usage plus the delta would count existing usage twice.
+ *
+ * Returns 0 on success, -ENOMEM if a table entry could not be allocated, or
+ * the error from bch2_quota_check_limit() (nothing is transferred then).
+ */
+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
+			struct bch_qid dst,
+			struct bch_qid src, u64 space,
+			enum quota_acct_mode mode)
+{
+	struct bch_memquota_type *q;
+	struct bch_memquota *src_q[QTYP_NR], *dst_q[QTYP_NR];
+	struct quota_msgs msgs;
+	unsigned i;
+	int ret = 0;
+
+	qtypes &= enabled_qtypes(c);
+
+	memset(&msgs, 0, sizeof(msgs));
+
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_lock_nested(&q->lock, i);
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS);
+		dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS);
+
+		if (!src_q[i] || !dst_q[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC,
+					     space, mode);
+		if (ret)
+			goto err;
+
+		ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO,
+					     1, mode);
+		if (ret)
+			goto err;
+	}
+
+	for_each_set_qtype(c, i, q, qtypes) {
+		__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space);
+		__bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1);
+	}
+
+err:
+	for_each_set_qtype(c, i, q, qtypes)
+		mutex_unlock(&q->lock);
+
+	/* Warnings are sent only after all quota locks are dropped */
+	flush_warnings(dst, c->vfs_sb, &msgs);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Copy the hard and soft limits stored in quota key @k into the in-memory
+ * quota table.  Keys of any type other than KEY_TYPE_quota are ignored.
+ *
+ * Returns 0 on success or -ENOMEM if the table entry could not be allocated.
+ */
+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k)
+{
+	struct bkey_s_c_quota on_disk;
+	struct bch_memquota_type *qt;
+	struct bch_memquota *entry;
+	unsigned counter;
+	int ret = 0;
+
+	BUG_ON(k.k->p.inode >= QTYP_NR);
+
+	if (k.k->type != KEY_TYPE_quota)
+		return 0;
+
+	on_disk	= bkey_s_c_to_quota(k);
+	qt	= &c->quotas[k.k->p.inode];
+
+	mutex_lock(&qt->lock);
+
+	entry = genradix_ptr_alloc(&qt->table, k.k->p.offset, GFP_KERNEL);
+	if (entry) {
+		for (counter = 0; counter < Q_COUNTERS; counter++) {
+			entry->c[counter].hardlimit =
+				le64_to_cpu(on_disk.v->c[counter].hardlimit);
+			entry->c[counter].softlimit =
+				le64_to_cpu(on_disk.v->c[counter].softlimit);
+		}
+	} else {
+		ret = -ENOMEM;
+	}
+
+	mutex_unlock(&qt->lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Walk the quotas btree for quota type @type and load every key found into
+ * the in-memory table.  The walk stops at the first key belonging to another
+ * type or at the first error.
+ */
+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret = 0, exit_ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_quotas, POS(type, 0),
+			   BTREE_ITER_PREFETCH, k, ret) {
+		/* Keys are ordered by type: past @type means we are done */
+		if (k.k->p.inode != type)
+			break;
+
+		ret = __bch2_quota_set(c, k);
+		if (ret)
+			break;
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	/* An error from tearing down the transaction takes precedence */
+	exit_ret = bch2_trans_exit(&trans);
+
+	return exit_ret ? exit_ret : ret;
+}
|
|
+
|
|
+/* Free the in-memory quota table of every quota type. */
+void bch2_fs_quota_exit(struct bch_fs *c)
+{
+	struct bch_memquota_type *qt;
+
+	for (qt = c->quotas; qt < c->quotas + ARRAY_SIZE(c->quotas); qt++)
+		genradix_free(&qt->table);
+}
|
|
+
|
|
+/* Initialise the lock of every quota type. */
+void bch2_fs_quota_init(struct bch_fs *c)
+{
+	struct bch_memquota_type *qt;
+
+	for (qt = c->quotas; qt < c->quotas + ARRAY_SIZE(c->quotas); qt++)
+		mutex_init(&qt->lock);
+}
|
|
+
|
|
+/*
+ * Copy the per-type, per-counter time and warning limits from the superblock
+ * quota field into the in-memory quota state.  Does nothing when the
+ * superblock has no quota field.
+ */
+static void bch2_sb_quota_read(struct bch_fs *c)
+{
+	struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+	unsigned type, counter;
+
+	if (!sb_quota)
+		return;
+
+	for (type = 0; type < QTYP_NR; type++)
+		for (counter = 0; counter < Q_COUNTERS; counter++) {
+			struct quota_limit *lim =
+				&c->quotas[type].limits[counter];
+
+			lim->timelimit =
+				le32_to_cpu(sb_quota->q[type].c[counter].timelimit);
+			lim->warnlimit =
+				le32_to_cpu(sb_quota->q[type].c[counter].warnlimit);
+		}
+}
|
|
+
|
|
+/*
+ * Build the in-memory quota state:
+ *  1. read the time/warning limits from the superblock,
+ *  2. load the stored limits of every enabled quota type from the quotas
+ *     btree,
+ *  3. walk all inodes and account their sectors and inode count, without
+ *     limit checking.
+ *
+ * Errors inside the inode walk break out of the loop instead of returning
+ * directly, so that the iterator and the transaction are always released.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int bch2_fs_quota_read(struct bch_fs *c)
+{
+	unsigned i, qtypes = enabled_qtypes(c);
+	struct bch_memquota_type *q;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bch_inode_unpacked u;
+	struct bkey_s_c k;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	bch2_sb_quota_read(c);
+	mutex_unlock(&c->sb_lock);
+
+	/* No transaction is open yet, so returning directly is fine here */
+	for_each_set_qtype(c, i, q, qtypes) {
+		ret = bch2_quota_init_type(c, i);
+		if (ret)
+			return ret;
+	}
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN,
+			   BTREE_ITER_PREFETCH, k, ret) {
+		if (k.k->type == KEY_TYPE_inode) {
+			ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u);
+			if (ret)
+				break;
+
+			/* NOCHECK skips limit checks but can still fail to allocate */
+			ret = bch2_quota_acct(c, bch_qid(&u), Q_SPC,
+					      u.bi_sectors,
+					      KEY_TYPE_QUOTA_NOCHECK) ?:
+			      bch2_quota_acct(c, bch_qid(&u), Q_INO, 1,
+					      KEY_TYPE_QUOTA_NOCHECK);
+			if (ret)
+				break;
+		}
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	return bch2_trans_exit(&trans) ?: ret;
+}
|
|
+
|
|
+/* Enable/disable/delete quotas for an entire filesystem: */
|
|
+
|
|
+/*
+ * quotactl ->quota_enable: turn on enforcement for the quota types named in
+ * @uflags by setting the matching superblock flags.
+ *
+ * Returns -EROFS on a read-only mount and -EINVAL if accounting is requested
+ * (it can only be enabled at mount time) or if enforcement is requested for
+ * a type whose accounting option is off.
+ */
+static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	const bool enforce_usr = uflags & FS_QUOTA_UDQ_ENFD;
+	const bool enforce_grp = uflags & FS_QUOTA_GDQ_ENFD;
+	const bool enforce_prj = uflags & FS_QUOTA_PDQ_ENFD;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	/* Accounting must be enabled at mount time: */
+	if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT))
+		return -EINVAL;
+
+	/* Can't enable enforcement without accounting: */
+	if ((enforce_usr && !c->opts.usrquota) ||
+	    (enforce_grp && !c->opts.grpquota) ||
+	    (enforce_prj && !c->opts.prjquota))
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+
+	if (enforce_usr)
+		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
+	if (enforce_grp)
+		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
+	if (enforce_prj)
+		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * quotactl ->quota_disable: clear the superblock enforcement flag of every
+ * quota type named in @uflags.  Returns -EROFS on a read-only mount.
+ */
+static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	const bool drop_usr = uflags & FS_QUOTA_UDQ_ENFD;
+	const bool drop_grp = uflags & FS_QUOTA_GDQ_ENFD;
+	const bool drop_prj = uflags & FS_QUOTA_PDQ_ENFD;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	mutex_lock(&c->sb_lock);
+
+	if (drop_usr)
+		SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
+	if (drop_grp)
+		SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
+	if (drop_prj)
+		SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Delete all stored quota keys of @type, refusing with -EINVAL while that
+ * quota type is still enabled.
+ */
+static int bch2_quota_remove_type(struct bch_fs *c, bool enabled,
+				  enum quota_types type)
+{
+	if (enabled)
+		return -EINVAL;
+
+	return bch2_btree_delete_range(c, BTREE_ID_quotas,
+				       POS(type, 0),
+				       POS(type + 1, 0),
+				       NULL);
+}
+
+/*
+ * quotactl ->rm_xquota: remove the stored limits of the quota types named in
+ * @uflags.  Types are processed in user, group, project order and the first
+ * failure is returned.  Returns -EROFS on a read-only mount.
+ */
+static int bch2_quota_remove(struct super_block *sb, unsigned uflags)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	int ret;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	if (uflags & FS_USER_QUOTA) {
+		ret = bch2_quota_remove_type(c, c->opts.usrquota, QTYP_USR);
+		if (ret)
+			return ret;
+	}
+
+	if (uflags & FS_GROUP_QUOTA) {
+		ret = bch2_quota_remove_type(c, c->opts.grpquota, QTYP_GRP);
+		if (ret)
+			return ret;
+	}
+
+	if (uflags & FS_PROJ_QUOTA) {
+		ret = bch2_quota_remove_type(c, c->opts.prjquota, QTYP_PRJ);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * quotactl ->get_state: report quota status.  Every quota type is flagged
+ * QCI_SYSFILE; enabled types are additionally flagged QCI_ACCT_ENABLED and
+ * have their space/inode time and warning limits filled in.
+ */
+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	const unsigned enabled = enabled_qtypes(c);
+	unsigned type;
+
+	memset(state, 0, sizeof(*state));
+
+	for (type = 0; type < QTYP_NR; type++) {
+		struct qc_type_state *ts = &state->s_state[type];
+		const struct quota_limit *lim = c->quotas[type].limits;
+
+		ts->flags |= QCI_SYSFILE;
+
+		if (enabled & (1 << type)) {
+			ts->flags |= QCI_ACCT_ENABLED;
+
+			ts->spc_timelimit = lim[Q_SPC].timelimit;
+			ts->spc_warnlimit = lim[Q_SPC].warnlimit;
+
+			ts->ino_timelimit = lim[Q_INO].timelimit;
+			ts->ino_warnlimit = lim[Q_INO].warnlimit;
+		}
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * quotactl ->set_info: adjust quota timers & warnings.
+ *
+ * Stores the fields selected by info->i_fieldmask in the superblock quota
+ * section (creating it if necessary), refreshes the in-memory copy and
+ * writes the superblock.
+ *
+ * Returns 0 on success, -EROFS on a read-only mount, -EINVAL for an invalid
+ * quota type or unsupported field mask, -ESRCH if the quota type is not
+ * enabled, or -ENOSPC if the superblock quota section could not be
+ * allocated.  c->sb_lock is released on every path.
+ */
+static int bch2_quota_set_info(struct super_block *sb, int type,
+			       struct qc_info *info)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_sb_field_quota *sb_quota;
+	int ret = 0;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	/* @type is signed: reject negatives before it is used as a shift count */
+	if (type < 0 || type >= QTYP_NR)
+		return -EINVAL;
+
+	if (!((1 << type) & enabled_qtypes(c)))
+		return -ESRCH;
+
+	if (info->i_fieldmask &
+	    ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS))
+		return -EINVAL;
+
+	mutex_lock(&c->sb_lock);
+	sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
+	if (!sb_quota) {
+		sb_quota = bch2_sb_resize_quota(&c->disk_sb,
+					sizeof(*sb_quota) / sizeof(u64));
+		if (!sb_quota) {
+			ret = -ENOSPC;
+			goto unlock;
+		}
+	}
+
+	if (info->i_fieldmask & QC_SPC_TIMER)
+		sb_quota->q[type].c[Q_SPC].timelimit =
+			cpu_to_le32(info->i_spc_timelimit);
+
+	if (info->i_fieldmask & QC_SPC_WARNS)
+		sb_quota->q[type].c[Q_SPC].warnlimit =
+			cpu_to_le32(info->i_spc_warnlimit);
+
+	if (info->i_fieldmask & QC_INO_TIMER)
+		sb_quota->q[type].c[Q_INO].timelimit =
+			cpu_to_le32(info->i_ino_timelimit);
+
+	if (info->i_fieldmask & QC_INO_WARNS)
+		sb_quota->q[type].c[Q_INO].warnlimit =
+			cpu_to_le32(info->i_ino_warnlimit);
+
+	/* Refresh the in-memory limits from the updated superblock field */
+	bch2_sb_quota_read(c);
+
+	bch2_write_super(c);
+unlock:
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
|
|
+
|
|
+/* Get/set individual quotas: */
|
|
+
|
|
+/*
+ * Fill @dst from the in-memory quota entry @src.  Space usage and space
+ * limits are shifted left by 9 on the way out; inode counts, timers and
+ * warning counts are copied unchanged.
+ */
+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src)
+{
+	const struct memquota_counter *spc = &src->c[Q_SPC];
+	const struct memquota_counter *ino = &src->c[Q_INO];
+
+	dst->d_space		= spc->v << 9;
+	dst->d_spc_hardlimit	= spc->hardlimit << 9;
+	dst->d_spc_softlimit	= spc->softlimit << 9;
+	dst->d_spc_timer	= spc->timer;
+	dst->d_spc_warns	= spc->warns;
+
+	dst->d_ino_count	= ino->v;
+	dst->d_ino_hardlimit	= ino->hardlimit;
+	dst->d_ino_softlimit	= ino->softlimit;
+	dst->d_ino_timer	= ino->timer;
+	dst->d_ino_warns	= ino->warns;
+}
|
|
+
|
|
+/*
+ * quotactl ->get_dqblk: report usage and limits for @kqid.  @qdq is zeroed
+ * first and stays all-zero when the table has no entry for the id.  Always
+ * returns 0.
+ */
+static int bch2_get_quota(struct super_block *sb, struct kqid kqid,
+			  struct qc_dqblk *qdq)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_memquota_type *qt = &c->quotas[kqid.type];
+	qid_t id = from_kqid(&init_user_ns, kqid);
+	struct bch_memquota *entry;
+
+	memset(qdq, 0, sizeof(*qdq));
+
+	mutex_lock(&qt->lock);
+
+	entry = genradix_ptr(&qt->table, id);
+	if (entry)
+		__bch2_quota_get(qdq, entry);
+
+	mutex_unlock(&qt->lock);
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * quotactl ->get_nextdqblk: starting at *@kqid, find the first table entry
+ * that is not all zeroes, report it in @qdq and store its id back into
+ * *@kqid.  Returns 0 when an entry was found, -ENOENT otherwise.
+ */
+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid,
+			       struct qc_dqblk *qdq)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bch_memquota_type *qt = &c->quotas[kqid->type];
+	qid_t start = from_kqid(&init_user_ns, *kqid);
+	struct genradix_iter iter;
+	struct bch_memquota *entry;
+	int ret = -ENOENT;
+
+	mutex_lock(&qt->lock);
+
+	genradix_for_each_from(&qt->table, iter, entry, start) {
+		/* An all-zero slot is treated as unused */
+		if (!memcmp(entry, page_address(ZERO_PAGE(0)), sizeof(*entry)))
+			continue;
+
+		__bch2_quota_get(qdq, entry);
+		*kqid = make_kqid(current_user_ns(), kqid->type, iter.pos);
+		ret = 0;
+		break;
+	}
+
+	mutex_unlock(&qt->lock);
+	return ret;
+}
|
|
+
|
|
+/*
+ * Transaction body for bch2_set_quota(): start from the quota key currently
+ * stored at new_quota->k.p (if there is one), overwrite the limits selected
+ * by qdq->d_fieldmask and queue the update.  Space limits are shifted right
+ * by 9 before being stored.
+ *
+ * The iterator is released on every path, including errors, matching the
+ * other iterator users in this file.
+ *
+ * Returns 0 or a negative error code from the lookup or the update.
+ */
+static int bch2_set_quota_trans(struct btree_trans *trans,
+				struct bkey_i_quota *new_quota,
+				struct qc_dqblk *qdq)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	int ret;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_quotas, new_quota->k.p,
+				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek_slot(iter);
+
+	ret = bkey_err(k);
+	if (unlikely(ret))
+		goto out;
+
+	/* Keep whatever limits are already stored and not being changed */
+	if (k.k->type == KEY_TYPE_quota)
+		new_quota->v = *bkey_s_c_to_quota(k).v;
+
+	if (qdq->d_fieldmask & QC_SPC_SOFT)
+		new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9);
+	if (qdq->d_fieldmask & QC_SPC_HARD)
+		new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9);
+
+	if (qdq->d_fieldmask & QC_INO_SOFT)
+		new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit);
+	if (qdq->d_fieldmask & QC_INO_HARD)
+		new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit);
+
+	ret = bch2_trans_update(trans, iter, &new_quota->k_i, 0);
+out:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+/*
+ * quotactl ->set_dqblk: store new limits for @qid in the quotas btree and,
+ * if that succeeds, mirror them into the in-memory table.  Returns -EROFS on
+ * a read-only mount.
+ */
+static int bch2_set_quota(struct super_block *sb, struct kqid qid,
+			  struct qc_dqblk *qdq)
+{
+	struct bch_fs *c = sb->s_fs_info;
+	struct bkey_i_quota new_quota;
+	int ret;
+
+	if (sb->s_flags & SB_RDONLY)
+		return -EROFS;
+
+	bkey_quota_init(&new_quota.k_i);
+	new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
+
+	ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK,
+			    bch2_set_quota_trans(&trans, &new_quota, qdq));
+	if (ret)
+		return ret;
+
+	return __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i));
+}
|
|
+
|
|
+const struct quotactl_ops bch2_quotactl_operations = {
|
|
+ .quota_enable = bch2_quota_enable,
|
|
+ .quota_disable = bch2_quota_disable,
|
|
+ .rm_xquota = bch2_quota_remove,
|
|
+
|
|
+ .get_state = bch2_quota_get_state,
|
|
+ .set_info = bch2_quota_set_info,
|
|
+
|
|
+ .get_dqblk = bch2_get_quota,
|
|
+ .get_nextdqblk = bch2_get_next_quota,
|
|
+ .set_dqblk = bch2_set_quota,
|
|
+};
|
|
+
|
|
+#endif /* CONFIG_BCACHEFS_QUOTA */
|
|
diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h
|
|
new file mode 100644
|
|
index 000000000000..51e4f9713ef0
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/quota.h
|
|
@@ -0,0 +1,71 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_QUOTA_H
|
|
+#define _BCACHEFS_QUOTA_H
|
|
+
|
|
+#include "inode.h"
|
|
+#include "quota_types.h"
|
|
+
|
|
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
|
|
+
|
|
+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+#define bch2_bkey_ops_quota (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_quota_invalid, \
|
|
+ .val_to_text = bch2_quota_to_text, \
|
|
+}
|
|
+
|
|
+/*
+ * Build the quota ids of an inode: uid, gid and project id.  A non-zero
+ * bi_project maps to id bi_project - 1; zero maps to id 0.
+ */
+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
+{
+	struct bch_qid qid = {
+		.q[QTYP_USR] = u->bi_uid,
+		.q[QTYP_GRP] = u->bi_gid,
+	};
+
+	if (u->bi_project)
+		qid.q[QTYP_PRJ] = u->bi_project - 1;
+
+	return qid;
+}
|
|
+
|
|
+/*
+ * Return a bitmask with bit QTYP_* set for every quota type whose
+ * usrquota/grpquota/prjquota option is on.
+ */
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+	unsigned mask = 0;
+
+	mask |= c->opts.usrquota << QTYP_USR;
+	mask |= c->opts.grpquota << QTYP_GRP;
+	mask |= c->opts.prjquota << QTYP_PRJ;
+
+	return mask;
+}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_QUOTA
|
|
+
|
|
+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
|
|
+ s64, enum quota_acct_mode);
|
|
+
|
|
+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid,
|
|
+ struct bch_qid, u64, enum quota_acct_mode);
|
|
+
|
|
+void bch2_fs_quota_exit(struct bch_fs *);
|
|
+void bch2_fs_quota_init(struct bch_fs *);
|
|
+int bch2_fs_quota_read(struct bch_fs *);
|
|
+
|
|
+extern const struct quotactl_ops bch2_quotactl_operations;
|
|
+
|
|
+#else
|
|
+
|
|
+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid,
|
|
+ enum quota_counters counter, s64 v,
|
|
+ enum quota_acct_mode mode)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes,
|
|
+ struct bch_qid dst,
|
|
+ struct bch_qid src, u64 space,
|
|
+ enum quota_acct_mode mode)
|
|
+{
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static inline void bch2_fs_quota_exit(struct bch_fs *c) {}
|
|
+static inline void bch2_fs_quota_init(struct bch_fs *c) {}
|
|
+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; }
|
|
+
|
|
+#endif
|
|
+
|
|
+#endif /* _BCACHEFS_QUOTA_H */
|
|
diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h
|
|
new file mode 100644
|
|
index 000000000000..6a136083d389
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/quota_types.h
|
|
@@ -0,0 +1,43 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_QUOTA_TYPES_H
|
|
+#define _BCACHEFS_QUOTA_TYPES_H
|
|
+
|
|
+#include <linux/generic-radix-tree.h>
|
|
+
|
|
+struct bch_qid {
|
|
+ u32 q[QTYP_NR];
|
|
+};
|
|
+
|
|
+enum quota_acct_mode {
|
|
+ KEY_TYPE_QUOTA_PREALLOC,
|
|
+ KEY_TYPE_QUOTA_WARN,
|
|
+ KEY_TYPE_QUOTA_NOCHECK,
|
|
+};
|
|
+
|
|
+struct memquota_counter {
|
|
+ u64 v;
|
|
+ u64 hardlimit;
|
|
+ u64 softlimit;
|
|
+ s64 timer;
|
|
+ int warns;
|
|
+ int warning_issued;
|
|
+};
|
|
+
|
|
+struct bch_memquota {
|
|
+ struct memquota_counter c[Q_COUNTERS];
|
|
+};
|
|
+
|
|
+typedef GENRADIX(struct bch_memquota) bch_memquota_table;
|
|
+
|
|
+struct quota_limit {
|
|
+ u32 timelimit;
|
|
+ u32 warnlimit;
|
|
+};
|
|
+
|
|
+struct bch_memquota_type {
|
|
+ struct quota_limit limits[Q_COUNTERS];
|
|
+ bch_memquota_table table;
|
|
+ struct mutex lock;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_QUOTA_TYPES_H */
|
|
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
|
|
new file mode 100644
|
|
index 000000000000..a0dbf41d1d37
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/rebalance.c
|
|
@@ -0,0 +1,338 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "btree_iter.h"
|
|
+#include "buckets.h"
|
|
+#include "clock.h"
|
|
+#include "disk_groups.h"
|
|
+#include "extents.h"
|
|
+#include "io.h"
|
|
+#include "move.h"
|
|
+#include "rebalance.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/freezer.h>
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/sched/cputime.h>
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+/*
|
|
+ * Check if an extent should be moved:
|
|
+ * returns -1 if it should not be moved, or
|
|
+ * device of pointer that should be moved, if known, or INT_MAX if unknown
|
|
+ */
|
|
+static int __bch2_rebalance_pred(struct bch_fs *c,
|
|
+ struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts)
|
|
+{
|
|
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+
|
|
+ if (io_opts->background_compression &&
|
|
+ !bch2_bkey_is_incompressible(k))
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ if (!p.ptr.cached &&
|
|
+ p.crc.compression_type !=
|
|
+ bch2_compression_opt_to_type[io_opts->background_compression])
|
|
+ return p.ptr.dev;
|
|
+
|
|
+ if (io_opts->background_target)
|
|
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
|
|
+ if (!p.ptr.cached &&
|
|
+ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target))
|
|
+ return p.ptr.dev;
|
|
+
|
|
+ return -1;
|
|
+}
|
|
+
|
|
+void bch2_rebalance_add_key(struct bch_fs *c,
|
|
+ struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts)
|
|
+{
|
|
+ atomic64_t *counter;
|
|
+ int dev;
|
|
+
|
|
+ dev = __bch2_rebalance_pred(c, k, io_opts);
|
|
+ if (dev < 0)
|
|
+ return;
|
|
+
|
|
+ counter = dev < INT_MAX
|
|
+ ? &bch_dev_bkey_exists(c, dev)->rebalance_work
|
|
+ : &c->rebalance.work_unknown_dev;
|
|
+
|
|
+ if (atomic64_add_return(k.k->size, counter) == k.k->size)
|
|
+ rebalance_wakeup(c);
|
|
+}
|
|
+
|
|
+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg,
|
|
+ struct bkey_s_c k,
|
|
+ struct bch_io_opts *io_opts,
|
|
+ struct data_opts *data_opts)
|
|
+{
|
|
+ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) {
|
|
+ data_opts->target = io_opts->background_target;
|
|
+ data_opts->nr_replicas = 1;
|
|
+ data_opts->btree_insert_flags = 0;
|
|
+ return DATA_ADD_REPLICAS;
|
|
+ } else {
|
|
+ return DATA_SKIP;
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
|
|
+{
|
|
+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
|
|
+ sectors)
|
|
+ rebalance_wakeup(c);
|
|
+}
|
|
+
|
|
+struct rebalance_work {
|
|
+ int dev_most_full_idx;
|
|
+ unsigned dev_most_full_percent;
|
|
+ u64 dev_most_full_work;
|
|
+ u64 dev_most_full_capacity;
|
|
+ u64 total_work;
|
|
+};
|
|
+
|
|
+static void rebalance_work_accumulate(struct rebalance_work *w,
|
|
+ u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
|
|
+{
|
|
+ unsigned percent_full;
|
|
+ u64 work = dev_work + unknown_dev;
|
|
+
|
|
+ if (work < dev_work || work < unknown_dev)
|
|
+ work = U64_MAX;
|
|
+ work = min(work, capacity);
|
|
+
|
|
+ percent_full = div64_u64(work * 100, capacity);
|
|
+
|
|
+ if (percent_full >= w->dev_most_full_percent) {
|
|
+ w->dev_most_full_idx = idx;
|
|
+ w->dev_most_full_percent = percent_full;
|
|
+ w->dev_most_full_work = work;
|
|
+ w->dev_most_full_capacity = capacity;
|
|
+ }
|
|
+
|
|
+ if (w->total_work + dev_work >= w->total_work &&
|
|
+ w->total_work + dev_work >= dev_work)
|
|
+ w->total_work += dev_work;
|
|
+}
|
|
+
|
|
+static struct rebalance_work rebalance_work(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ struct rebalance_work ret = { .dev_most_full_idx = -1 };
|
|
+ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
|
|
+ unsigned i;
|
|
+
|
|
+ for_each_online_member(ca, c, i)
|
|
+ rebalance_work_accumulate(&ret,
|
|
+ atomic64_read(&ca->rebalance_work),
|
|
+ unknown_dev,
|
|
+ bucket_to_sector(ca, ca->mi.nbuckets -
|
|
+ ca->mi.first_bucket),
|
|
+ i);
|
|
+
|
|
+ rebalance_work_accumulate(&ret,
|
|
+ unknown_dev, 0, c->capacity, -1);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static void rebalance_work_reset(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ for_each_online_member(ca, c, i)
|
|
+ atomic64_set(&ca->rebalance_work, 0);
|
|
+
|
|
+ atomic64_set(&c->rebalance.work_unknown_dev, 0);
|
|
+}
|
|
+
|
|
+static unsigned long curr_cputime(void)
|
|
+{
|
|
+ u64 utime, stime;
|
|
+
|
|
+ task_cputime_adjusted(current, &utime, &stime);
|
|
+ return nsecs_to_jiffies(utime + stime);
|
|
+}
|
|
+
|
|
+static int bch2_rebalance_thread(void *arg)
|
|
+{
|
|
+ struct bch_fs *c = arg;
|
|
+ struct bch_fs_rebalance *r = &c->rebalance;
|
|
+ struct io_clock *clock = &c->io_clock[WRITE];
|
|
+ struct rebalance_work w, p;
|
|
+ unsigned long start, prev_start;
|
|
+ unsigned long prev_run_time, prev_run_cputime;
|
|
+ unsigned long cputime, prev_cputime;
|
|
+ u64 io_start;
|
|
+ long throttle;
|
|
+
|
|
+ set_freezable();
|
|
+
|
|
+ io_start = atomic64_read(&clock->now);
|
|
+ p = rebalance_work(c);
|
|
+ prev_start = jiffies;
|
|
+ prev_cputime = curr_cputime();
|
|
+
|
|
+ while (!kthread_wait_freezable(r->enabled)) {
|
|
+ cond_resched();
|
|
+
|
|
+ start = jiffies;
|
|
+ cputime = curr_cputime();
|
|
+
|
|
+ prev_run_time = start - prev_start;
|
|
+ prev_run_cputime = cputime - prev_cputime;
|
|
+
|
|
+ w = rebalance_work(c);
|
|
+ BUG_ON(!w.dev_most_full_capacity);
|
|
+
|
|
+ if (!w.total_work) {
|
|
+ r->state = REBALANCE_WAITING;
|
|
+ kthread_wait_freezable(rebalance_work(c).total_work);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If there isn't much work to do, throttle cpu usage:
|
|
+ */
|
|
+ throttle = prev_run_cputime * 100 /
|
|
+ max(1U, w.dev_most_full_percent) -
|
|
+ prev_run_time;
|
|
+
|
|
+ if (w.dev_most_full_percent < 20 && throttle > 0) {
|
|
+ r->throttled_until_iotime = io_start +
|
|
+ div_u64(w.dev_most_full_capacity *
|
|
+ (20 - w.dev_most_full_percent),
|
|
+ 50);
|
|
+
|
|
+ if (atomic64_read(&clock->now) + clock->max_slop <
|
|
+ r->throttled_until_iotime) {
|
|
+ r->throttled_until_cputime = start + throttle;
|
|
+ r->state = REBALANCE_THROTTLED;
|
|
+
|
|
+ bch2_kthread_io_clock_wait(clock,
|
|
+ r->throttled_until_iotime,
|
|
+ throttle);
|
|
+ continue;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* minimum 1 mb/sec: */
|
|
+ r->pd.rate.rate =
|
|
+ max_t(u64, 1 << 11,
|
|
+ r->pd.rate.rate *
|
|
+ max(p.dev_most_full_percent, 1U) /
|
|
+ max(w.dev_most_full_percent, 1U));
|
|
+
|
|
+ io_start = atomic64_read(&clock->now);
|
|
+ p = w;
|
|
+ prev_start = start;
|
|
+ prev_cputime = cputime;
|
|
+
|
|
+ r->state = REBALANCE_RUNNING;
|
|
+ memset(&r->move_stats, 0, sizeof(r->move_stats));
|
|
+ rebalance_work_reset(c);
|
|
+
|
|
+ bch2_move_data(c,
|
|
+ 0, POS_MIN,
|
|
+ BTREE_ID_NR, POS_MAX,
|
|
+ /* ratelimiting disabled for now */
|
|
+ NULL, /* &r->pd.rate, */
|
|
+ writepoint_ptr(&c->rebalance_write_point),
|
|
+ rebalance_pred, NULL,
|
|
+ &r->move_stats);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ struct bch_fs_rebalance *r = &c->rebalance;
|
|
+ struct rebalance_work w = rebalance_work(c);
|
|
+ char h1[21], h2[21];
|
|
+
|
|
+ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9);
|
|
+ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9);
|
|
+ pr_buf(out, "fullest_dev (%i):\t%s/%s\n",
|
|
+ w.dev_most_full_idx, h1, h2);
|
|
+
|
|
+ bch2_hprint(&PBUF(h1), w.total_work << 9);
|
|
+ bch2_hprint(&PBUF(h2), c->capacity << 9);
|
|
+ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2);
|
|
+
|
|
+ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate);
|
|
+
|
|
+ switch (r->state) {
|
|
+ case REBALANCE_WAITING:
|
|
+ pr_buf(out, "waiting\n");
|
|
+ break;
|
|
+ case REBALANCE_THROTTLED:
|
|
+ bch2_hprint(&PBUF(h1),
|
|
+ (r->throttled_until_iotime -
|
|
+ atomic64_read(&c->io_clock[WRITE].now)) << 9);
|
|
+ pr_buf(out, "throttled for %lu sec or %s io\n",
|
|
+ (r->throttled_until_cputime - jiffies) / HZ,
|
|
+ h1);
|
|
+ break;
|
|
+ case REBALANCE_RUNNING:
|
|
+ pr_buf(out, "running\n"
|
|
+ "pos ");
|
|
+ bch2_bpos_to_text(out, r->move_stats.pos);
|
|
+ pr_buf(out, "\n");
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_rebalance_stop(struct bch_fs *c)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+
|
|
+ c->rebalance.pd.rate.rate = UINT_MAX;
|
|
+ bch2_ratelimit_reset(&c->rebalance.pd.rate);
|
|
+
|
|
+ p = rcu_dereference_protected(c->rebalance.thread, 1);
|
|
+ c->rebalance.thread = NULL;
|
|
+
|
|
+ if (p) {
|
|
+ /* for sychronizing with rebalance_wakeup() */
|
|
+ synchronize_rcu();
|
|
+
|
|
+ kthread_stop(p);
|
|
+ put_task_struct(p);
|
|
+ }
|
|
+}
|
|
+
|
|
+int bch2_rebalance_start(struct bch_fs *c)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+
|
|
+ if (c->rebalance.thread)
|
|
+ return 0;
|
|
+
|
|
+ if (c->opts.nochanges)
|
|
+ return 0;
|
|
+
|
|
+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
|
|
+ if (IS_ERR(p)) {
|
|
+ bch_err(c, "error creating rebalance thread: %li", PTR_ERR(p));
|
|
+ return PTR_ERR(p);
|
|
+ }
|
|
+
|
|
+ get_task_struct(p);
|
|
+ rcu_assign_pointer(c->rebalance.thread, p);
|
|
+ wake_up_process(p);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_fs_rebalance_init(struct bch_fs *c)
|
|
+{
|
|
+ bch2_pd_controller_init(&c->rebalance.pd);
|
|
+
|
|
+ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
|
|
+}
|
|
diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h
|
|
new file mode 100644
|
|
index 000000000000..7ade0bb81cce
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/rebalance.h
|
|
@@ -0,0 +1,28 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_REBALANCE_H
|
|
+#define _BCACHEFS_REBALANCE_H
|
|
+
|
|
+#include "rebalance_types.h"
|
|
+
|
|
+static inline void rebalance_wakeup(struct bch_fs *c)
|
|
+{
|
|
+ struct task_struct *p;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ p = rcu_dereference(c->rebalance.thread);
|
|
+ if (p)
|
|
+ wake_up_process(p);
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
|
|
+ struct bch_io_opts *);
|
|
+void bch2_rebalance_add_work(struct bch_fs *, u64);
|
|
+
|
|
+void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
|
|
+
|
|
+void bch2_rebalance_stop(struct bch_fs *);
|
|
+int bch2_rebalance_start(struct bch_fs *);
|
|
+void bch2_fs_rebalance_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_REBALANCE_H */
|
|
diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h
|
|
new file mode 100644
|
|
index 000000000000..2f62a643c39f
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/rebalance_types.h
|
|
@@ -0,0 +1,27 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_REBALANCE_TYPES_H
|
|
+#define _BCACHEFS_REBALANCE_TYPES_H
|
|
+
|
|
+#include "move_types.h"
|
|
+
|
|
/* What the rebalance thread is currently doing. */
enum rebalance_state {
	REBALANCE_WAITING,	/* no outstanding work */
	REBALANCE_THROTTLED,	/* work pending, but backing off */
	REBALANCE_RUNNING,	/* a data-move pass is in progress */
};
|
|
+
|
|
+struct bch_fs_rebalance {
|
|
+ struct task_struct __rcu *thread;
|
|
+ struct bch_pd_controller pd;
|
|
+
|
|
+ atomic64_t work_unknown_dev;
|
|
+
|
|
+ enum rebalance_state state;
|
|
+ u64 throttled_until_iotime;
|
|
+ unsigned long throttled_until_cputime;
|
|
+ struct bch_move_stats move_stats;
|
|
+
|
|
+ unsigned enabled:1;
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_REBALANCE_TYPES_H */
|
|
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
|
|
new file mode 100644
|
|
index 000000000000..b35b297d4446
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/recovery.c
|
|
@@ -0,0 +1,1384 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "alloc_background.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "btree_io.h"
|
|
+#include "buckets.h"
|
|
+#include "dirent.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "fs-common.h"
|
|
+#include "fsck.h"
|
|
+#include "journal_io.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "journal_seq_blacklist.h"
|
|
+#include "move.h"
|
|
+#include "quota.h"
|
|
+#include "recovery.h"
|
|
+#include "replicas.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+#include <linux/sort.h>
|
|
+#include <linux/stat.h>
|
|
+
|
|
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
|
|
+
|
|
+/* for -o reconstruct_alloc: */
|
|
+static void drop_alloc_keys(struct journal_keys *keys)
|
|
+{
|
|
+ size_t src, dst;
|
|
+
|
|
+ for (src = 0, dst = 0; src < keys->nr; src++)
|
|
+ if (keys->d[src].btree_id != BTREE_ID_alloc)
|
|
+ keys->d[dst++] = keys->d[src];
|
|
+
|
|
+ keys->nr = dst;
|
|
+}
|
|
+
|
|
+/* iterate over keys read from the journal: */
|
|
+
|
|
+static int __journal_key_cmp(enum btree_id l_btree_id,
|
|
+ unsigned l_level,
|
|
+ struct bpos l_pos,
|
|
+ struct journal_key *r)
|
|
+{
|
|
+ return (cmp_int(l_btree_id, r->btree_id) ?:
|
|
+ cmp_int(l_level, r->level) ?:
|
|
+ bpos_cmp(l_pos, r->k->k.p));
|
|
+}
|
|
+
|
|
+static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
|
|
+{
|
|
+ return (cmp_int(l->btree_id, r->btree_id) ?:
|
|
+ cmp_int(l->level, r->level) ?:
|
|
+ bpos_cmp(l->k->k.p, r->k->k.p));
|
|
+}
|
|
+
|
|
+static size_t journal_key_search(struct journal_keys *journal_keys,
|
|
+ enum btree_id id, unsigned level,
|
|
+ struct bpos pos)
|
|
+{
|
|
+ size_t l = 0, r = journal_keys->nr, m;
|
|
+
|
|
+ while (l < r) {
|
|
+ m = l + ((r - l) >> 1);
|
|
+ if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
|
|
+ l = m + 1;
|
|
+ else
|
|
+ r = m;
|
|
+ }
|
|
+
|
|
+ BUG_ON(l < journal_keys->nr &&
|
|
+ __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
|
|
+
|
|
+ BUG_ON(l &&
|
|
+ __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
|
|
+
|
|
+ return l;
|
|
+}
|
|
+
|
|
+static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
|
|
+{
|
|
+ struct bkey_i *n = iter->keys->d[idx].k;
|
|
+ struct btree_and_journal_iter *biter =
|
|
+ container_of(iter, struct btree_and_journal_iter, journal);
|
|
+
|
|
+ if (iter->idx > idx ||
|
|
+ (iter->idx == idx &&
|
|
+ biter->last &&
|
|
+ bpos_cmp(n->k.p, biter->unpacked.p) <= 0))
|
|
+ iter->idx++;
|
|
+}
|
|
+
|
|
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
|
|
+ unsigned level, struct bkey_i *k)
|
|
+{
|
|
+ struct journal_key n = {
|
|
+ .btree_id = id,
|
|
+ .level = level,
|
|
+ .k = k,
|
|
+ .allocated = true
|
|
+ };
|
|
+ struct journal_keys *keys = &c->journal_keys;
|
|
+ struct journal_iter *iter;
|
|
+ unsigned idx = journal_key_search(keys, id, level, k->k.p);
|
|
+
|
|
+ if (idx < keys->nr &&
|
|
+ journal_key_cmp(&n, &keys->d[idx]) == 0) {
|
|
+ if (keys->d[idx].allocated)
|
|
+ kfree(keys->d[idx].k);
|
|
+ keys->d[idx] = n;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (keys->nr == keys->size) {
|
|
+ struct journal_keys new_keys = {
|
|
+ .nr = keys->nr,
|
|
+ .size = keys->size * 2,
|
|
+ .journal_seq_base = keys->journal_seq_base,
|
|
+ };
|
|
+
|
|
+ new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
|
|
+ if (!new_keys.d) {
|
|
+ bch_err(c, "%s: error allocating new key array (size %zu)",
|
|
+ __func__, new_keys.size);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
|
|
+ kvfree(keys->d);
|
|
+ *keys = new_keys;
|
|
+ }
|
|
+
|
|
+ array_insert_item(keys->d, keys->nr, idx, n);
|
|
+
|
|
+ list_for_each_entry(iter, &c->journal_iters, list)
|
|
+ journal_iter_fix(c, iter, idx);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
|
|
+ unsigned level, struct bpos pos)
|
|
+{
|
|
+ struct bkey_i *whiteout =
|
|
+ kmalloc(sizeof(struct bkey), GFP_KERNEL);
|
|
+ int ret;
|
|
+
|
|
+ if (!whiteout) {
|
|
+ bch_err(c, "%s: error allocating new key", __func__);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ bkey_init(&whiteout->k);
|
|
+ whiteout->k.p = pos;
|
|
+
|
|
+ ret = bch2_journal_key_insert(c, id, level, whiteout);
|
|
+ if (ret)
|
|
+ kfree(whiteout);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
|
|
+{
|
|
+ struct journal_key *k = iter->idx - iter->keys->nr
|
|
+ ? iter->keys->d + iter->idx : NULL;
|
|
+
|
|
+ if (k &&
|
|
+ k->btree_id == iter->btree_id &&
|
|
+ k->level == iter->level)
|
|
+ return k->k;
|
|
+
|
|
+ iter->idx = iter->keys->nr;
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void bch2_journal_iter_advance(struct journal_iter *iter)
|
|
+{
|
|
+ if (iter->idx < iter->keys->nr)
|
|
+ iter->idx++;
|
|
+}
|
|
+
|
|
+static void bch2_journal_iter_exit(struct journal_iter *iter)
|
|
+{
|
|
+ list_del(&iter->list);
|
|
+}
|
|
+
|
|
+static void bch2_journal_iter_init(struct bch_fs *c,
|
|
+ struct journal_iter *iter,
|
|
+ enum btree_id id, unsigned level,
|
|
+ struct bpos pos)
|
|
+{
|
|
+ iter->btree_id = id;
|
|
+ iter->level = level;
|
|
+ iter->keys = &c->journal_keys;
|
|
+ iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
|
|
+ list_add(&iter->list, &c->journal_iters);
|
|
+}
|
|
+
|
|
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
|
|
+{
|
|
+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
|
|
+ iter->b, &iter->unpacked);
|
|
+}
|
|
+
|
|
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
|
|
+{
|
|
+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
|
|
+}
|
|
+
|
|
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
|
|
+{
|
|
+ switch (iter->last) {
|
|
+ case none:
|
|
+ break;
|
|
+ case btree:
|
|
+ bch2_journal_iter_advance_btree(iter);
|
|
+ break;
|
|
+ case journal:
|
|
+ bch2_journal_iter_advance(&iter->journal);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ iter->last = none;
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
|
|
+{
|
|
+ struct bkey_s_c ret;
|
|
+
|
|
+ while (1) {
|
|
+ struct bkey_s_c btree_k =
|
|
+ bch2_journal_iter_peek_btree(iter);
|
|
+ struct bkey_s_c journal_k =
|
|
+ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
|
|
+
|
|
+ if (btree_k.k && journal_k.k) {
|
|
+ int cmp = bpos_cmp(btree_k.k->p, journal_k.k->p);
|
|
+
|
|
+ if (!cmp)
|
|
+ bch2_journal_iter_advance_btree(iter);
|
|
+
|
|
+ iter->last = cmp < 0 ? btree : journal;
|
|
+ } else if (btree_k.k) {
|
|
+ iter->last = btree;
|
|
+ } else if (journal_k.k) {
|
|
+ iter->last = journal;
|
|
+ } else {
|
|
+ iter->last = none;
|
|
+ return bkey_s_c_null;
|
|
+ }
|
|
+
|
|
+ ret = iter->last == journal ? journal_k : btree_k;
|
|
+
|
|
+ if (iter->b &&
|
|
+ bpos_cmp(ret.k->p, iter->b->data->max_key) > 0) {
|
|
+ iter->journal.idx = iter->journal.keys->nr;
|
|
+ iter->last = none;
|
|
+ return bkey_s_c_null;
|
|
+ }
|
|
+
|
|
+ if (!bkey_deleted(ret.k))
|
|
+ break;
|
|
+
|
|
+ bch2_btree_and_journal_iter_advance(iter);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter)
|
|
+{
|
|
+ bch2_btree_and_journal_iter_advance(iter);
|
|
+
|
|
+ return bch2_btree_and_journal_iter_peek(iter);
|
|
+}
|
|
+
|
|
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
|
|
+{
|
|
+ bch2_journal_iter_exit(&iter->journal);
|
|
+}
|
|
+
|
|
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
|
|
+ struct bch_fs *c,
|
|
+ struct btree *b)
|
|
+{
|
|
+ memset(iter, 0, sizeof(*iter));
|
|
+
|
|
+ iter->b = b;
|
|
+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
|
|
+ bch2_journal_iter_init(c, &iter->journal,
|
|
+ b->c.btree_id, b->c.level, b->data->min_key);
|
|
+}
|
|
+
|
|
+/* Walk btree, overlaying keys from the journal: */
|
|
+
|
|
+static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
|
|
+ struct btree_and_journal_iter iter)
|
|
+{
|
|
+ unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_buf tmp;
|
|
+
|
|
+ BUG_ON(!b->c.level);
|
|
+
|
|
+ bch2_bkey_buf_init(&tmp);
|
|
+
|
|
+ while (i < nr &&
|
|
+ (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
|
|
+ bch2_bkey_buf_reassemble(&tmp, c, k);
|
|
+
|
|
+ bch2_btree_node_prefetch(c, NULL, tmp.k,
|
|
+ b->c.btree_id, b->c.level - 1);
|
|
+
|
|
+ bch2_btree_and_journal_iter_advance(&iter);
|
|
+ i++;
|
|
+ }
|
|
+
|
|
+ bch2_bkey_buf_exit(&tmp, c);
|
|
+}
|
|
+
|
|
+static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b,
|
|
+ enum btree_id btree_id,
|
|
+ btree_walk_key_fn key_fn)
|
|
+{
|
|
+ struct btree_and_journal_iter iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_buf tmp;
|
|
+ struct btree *child;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_bkey_buf_init(&tmp);
|
|
+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
|
|
+
|
|
+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
|
|
+ if (b->c.level) {
|
|
+ bch2_bkey_buf_reassemble(&tmp, c, k);
|
|
+
|
|
+ child = bch2_btree_node_get_noiter(c, tmp.k,
|
|
+ b->c.btree_id, b->c.level - 1,
|
|
+ false);
|
|
+
|
|
+ ret = PTR_ERR_OR_ZERO(child);
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ btree_and_journal_iter_prefetch(c, b, iter);
|
|
+
|
|
+ ret = bch2_btree_and_journal_walk_recurse(c, child,
|
|
+ btree_id, key_fn);
|
|
+ six_unlock_read(&child->c.lock);
|
|
+ } else {
|
|
+ ret = key_fn(c, k);
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ bch2_btree_and_journal_iter_advance(&iter);
|
|
+ }
|
|
+
|
|
+ bch2_btree_and_journal_iter_exit(&iter);
|
|
+ bch2_bkey_buf_exit(&tmp, c);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_btree_and_journal_walk(struct bch_fs *c, enum btree_id btree_id,
|
|
+ btree_walk_key_fn key_fn)
|
|
+{
|
|
+ struct btree *b = c->btree_roots[btree_id].b;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (btree_node_fake(b))
|
|
+ return 0;
|
|
+
|
|
+ six_lock_read(&b->c.lock, NULL, NULL);
|
|
+ ret = bch2_btree_and_journal_walk_recurse(c, b, btree_id, key_fn);
|
|
+ six_unlock_read(&b->c.lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* sort and dedup all keys in the journal: */
|
|
+
|
|
+void bch2_journal_entries_free(struct list_head *list)
|
|
+{
|
|
+
|
|
+ while (!list_empty(list)) {
|
|
+ struct journal_replay *i =
|
|
+ list_first_entry(list, struct journal_replay, list);
|
|
+ list_del(&i->list);
|
|
+ kvpfree(i, offsetof(struct journal_replay, j) +
|
|
+ vstruct_bytes(&i->j));
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * When keys compare equal, oldest compares first:
|
|
+ */
|
|
+static int journal_sort_key_cmp(const void *_l, const void *_r)
|
|
+{
|
|
+ const struct journal_key *l = _l;
|
|
+ const struct journal_key *r = _r;
|
|
+
|
|
+ return cmp_int(l->btree_id, r->btree_id) ?:
|
|
+ cmp_int(l->level, r->level) ?:
|
|
+ bpos_cmp(l->k->k.p, r->k->k.p) ?:
|
|
+ cmp_int(l->journal_seq, r->journal_seq) ?:
|
|
+ cmp_int(l->journal_offset, r->journal_offset);
|
|
+}
|
|
+
|
|
+void bch2_journal_keys_free(struct journal_keys *keys)
|
|
+{
|
|
+ struct journal_key *i;
|
|
+
|
|
+ for (i = keys->d; i < keys->d + keys->nr; i++)
|
|
+ if (i->allocated)
|
|
+ kfree(i->k);
|
|
+
|
|
+ kvfree(keys->d);
|
|
+ keys->d = NULL;
|
|
+ keys->nr = 0;
|
|
+}
|
|
+
|
|
+static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
|
|
+{
|
|
+ struct journal_replay *i;
|
|
+ struct jset_entry *entry;
|
|
+ struct bkey_i *k, *_n;
|
|
+ struct journal_keys keys = { NULL };
|
|
+ struct journal_key *src, *dst;
|
|
+ size_t nr_keys = 0;
|
|
+
|
|
+ if (list_empty(journal_entries))
|
|
+ return keys;
|
|
+
|
|
+ list_for_each_entry(i, journal_entries, list) {
|
|
+ if (i->ignore)
|
|
+ continue;
|
|
+
|
|
+ if (!keys.journal_seq_base)
|
|
+ keys.journal_seq_base = le64_to_cpu(i->j.seq);
|
|
+
|
|
+ for_each_jset_key(k, _n, entry, &i->j)
|
|
+ nr_keys++;
|
|
+ }
|
|
+
|
|
+ keys.size = roundup_pow_of_two(nr_keys);
|
|
+
|
|
+ keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
|
|
+ if (!keys.d)
|
|
+ goto err;
|
|
+
|
|
+ list_for_each_entry(i, journal_entries, list) {
|
|
+ if (i->ignore)
|
|
+ continue;
|
|
+
|
|
+ BUG_ON(le64_to_cpu(i->j.seq) - keys.journal_seq_base > U32_MAX);
|
|
+
|
|
+ for_each_jset_key(k, _n, entry, &i->j)
|
|
+ keys.d[keys.nr++] = (struct journal_key) {
|
|
+ .btree_id = entry->btree_id,
|
|
+ .level = entry->level,
|
|
+ .k = k,
|
|
+ .journal_seq = le64_to_cpu(i->j.seq) -
|
|
+ keys.journal_seq_base,
|
|
+ .journal_offset = k->_data - i->j._data,
|
|
+ };
|
|
+ }
|
|
+
|
|
+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL);
|
|
+
|
|
+ src = dst = keys.d;
|
|
+ while (src < keys.d + keys.nr) {
|
|
+ while (src + 1 < keys.d + keys.nr &&
|
|
+ src[0].btree_id == src[1].btree_id &&
|
|
+ src[0].level == src[1].level &&
|
|
+ !bpos_cmp(src[0].k->k.p, src[1].k->k.p))
|
|
+ src++;
|
|
+
|
|
+ *dst++ = *src++;
|
|
+ }
|
|
+
|
|
+ keys.nr = dst - keys.d;
|
|
+err:
|
|
+ return keys;
|
|
+}
|
|
+
|
|
+/* journal replay: */
|
|
+
|
|
+static void replay_now_at(struct journal *j, u64 seq)
|
|
+{
|
|
+ BUG_ON(seq < j->replay_journal_seq);
|
|
+ BUG_ON(seq > j->replay_journal_seq_end);
|
|
+
|
|
+ while (j->replay_journal_seq < seq)
|
|
+ bch2_journal_pin_put(j, j->replay_journal_seq++);
|
|
+}
|
|
+
|
|
+static int __bch2_journal_replay_key(struct btree_trans *trans,
|
|
+ enum btree_id id, unsigned level,
|
|
+ struct bkey_i *k)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ iter = bch2_trans_get_node_iter(trans, id, k->k.p,
|
|
+ BTREE_MAX_DEPTH, level,
|
|
+ BTREE_ITER_INTENT);
|
|
+
|
|
+ /*
|
|
+ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run
|
|
+ * extent_handle_overwrites() and extent_update_to_keys() - but we don't
|
|
+ * want that here, journal replay is supposed to treat extents like
|
|
+ * regular keys:
|
|
+ */
|
|
+ BUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
|
|
+
|
|
+ ret = bch2_btree_iter_traverse(iter) ?:
|
|
+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
|
|
+{
|
|
+ unsigned commit_flags = BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_LAZY_RW;
|
|
+
|
|
+ if (!k->allocated)
|
|
+ commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
|
|
+
|
|
+ return bch2_trans_do(c, NULL, NULL, commit_flags,
|
|
+ __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
|
|
+}
|
|
+
|
|
+static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, k->k.p,
|
|
+ BTREE_ITER_CACHED|
|
|
+ BTREE_ITER_CACHED_NOFILL|
|
|
+ BTREE_ITER_INTENT);
|
|
+ ret = bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
|
|
+{
|
|
+ return bch2_trans_do(c, NULL, NULL,
|
|
+ BTREE_INSERT_NOFAIL|
|
|
+ BTREE_INSERT_USE_RESERVE|
|
|
+ BTREE_INSERT_LAZY_RW|
|
|
+ BTREE_INSERT_JOURNAL_REPLAY,
|
|
+ __bch2_alloc_replay_key(&trans, k));
|
|
+}
|
|
+
|
|
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
|
|
+{
|
|
+ const struct journal_key *l = _l;
|
|
+ const struct journal_key *r = _r;
|
|
+
|
|
+ return cmp_int(r->level, l->level) ?:
|
|
+ cmp_int(l->journal_seq, r->journal_seq) ?:
|
|
+ cmp_int(l->btree_id, r->btree_id) ?:
|
|
+ bpos_cmp(l->k->k.p, r->k->k.p);
|
|
+}
|
|
+
|
|
+static int bch2_journal_replay(struct bch_fs *c,
|
|
+ struct journal_keys keys)
|
|
+{
|
|
+ struct journal *j = &c->journal;
|
|
+ struct journal_key *i;
|
|
+ u64 seq;
|
|
+ int ret;
|
|
+
|
|
+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
|
|
+
|
|
+ if (keys.nr)
|
|
+ replay_now_at(j, keys.journal_seq_base);
|
|
+
|
|
+ seq = j->replay_journal_seq;
|
|
+
|
|
+ /*
|
|
+ * First replay updates to the alloc btree - these will only update the
|
|
+ * btree key cache:
|
|
+ */
|
|
+ for_each_journal_key(keys, i) {
|
|
+ cond_resched();
|
|
+
|
|
+ if (!i->level && i->btree_id == BTREE_ID_alloc) {
|
|
+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
|
|
+ ret = bch2_alloc_replay_key(c, i->k);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Next replay updates to interior btree nodes:
|
|
+ */
|
|
+ for_each_journal_key(keys, i) {
|
|
+ cond_resched();
|
|
+
|
|
+ if (i->level) {
|
|
+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
|
|
+ ret = bch2_journal_replay_key(c, i);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * Now that the btree is in a consistent state, we can start journal
|
|
+ * reclaim (which will be flushing entries from the btree key cache back
|
|
+ * to the btree:
|
|
+ */
|
|
+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
|
|
+ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
|
|
+ journal_reclaim_kick(j);
|
|
+
|
|
+ j->replay_journal_seq = seq;
|
|
+
|
|
+ /*
|
|
+ * Now replay leaf node updates:
|
|
+ */
|
|
+ for_each_journal_key(keys, i) {
|
|
+ cond_resched();
|
|
+
|
|
+ if (i->level || i->btree_id == BTREE_ID_alloc)
|
|
+ continue;
|
|
+
|
|
+ replay_now_at(j, keys.journal_seq_base + i->journal_seq);
|
|
+
|
|
+ ret = bch2_journal_replay_key(c, i);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ replay_now_at(j, j->replay_journal_seq_end);
|
|
+ j->replay_journal_seq = 0;
|
|
+
|
|
+ bch2_journal_set_replay_done(j);
|
|
+ bch2_journal_flush_all_pins(j);
|
|
+ return bch2_journal_error(j);
|
|
+err:
|
|
+ bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
|
|
+ ret, bch2_btree_ids[i->btree_id], i->level);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* journal replay early: */
|
|
+
|
|
+static int journal_replay_entry_early(struct bch_fs *c,
|
|
+ struct jset_entry *entry)
|
|
+{
|
|
+ int ret = 0;
|
|
+
|
|
+ switch (entry->type) {
|
|
+ case BCH_JSET_ENTRY_btree_root: {
|
|
+ struct btree_root *r;
|
|
+
|
|
+ if (entry->btree_id >= BTREE_ID_NR) {
|
|
+ bch_err(c, "filesystem has unknown btree type %u",
|
|
+ entry->btree_id);
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ r = &c->btree_roots[entry->btree_id];
|
|
+
|
|
+ if (entry->u64s) {
|
|
+ r->level = entry->level;
|
|
+ bkey_copy(&r->key, &entry->start[0]);
|
|
+ r->error = 0;
|
|
+ } else {
|
|
+ r->error = -EIO;
|
|
+ }
|
|
+ r->alive = true;
|
|
+ break;
|
|
+ }
|
|
+ case BCH_JSET_ENTRY_usage: {
|
|
+ struct jset_entry_usage *u =
|
|
+ container_of(entry, struct jset_entry_usage, entry);
|
|
+
|
|
+ switch (entry->btree_id) {
|
|
+ case FS_USAGE_RESERVED:
|
|
+ if (entry->level < BCH_REPLICAS_MAX)
|
|
+ c->usage_base->persistent_reserved[entry->level] =
|
|
+ le64_to_cpu(u->v);
|
|
+ break;
|
|
+ case FS_USAGE_INODES:
|
|
+ c->usage_base->nr_inodes = le64_to_cpu(u->v);
|
|
+ break;
|
|
+ case FS_USAGE_KEY_VERSION:
|
|
+ atomic64_set(&c->key_version,
|
|
+ le64_to_cpu(u->v));
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case BCH_JSET_ENTRY_data_usage: {
|
|
+ struct jset_entry_data_usage *u =
|
|
+ container_of(entry, struct jset_entry_data_usage, entry);
|
|
+
|
|
+ ret = bch2_replicas_set_usage(c, &u->r,
|
|
+ le64_to_cpu(u->v));
|
|
+ break;
|
|
+ }
|
|
+ case BCH_JSET_ENTRY_dev_usage: {
|
|
+ struct jset_entry_dev_usage *u =
|
|
+ container_of(entry, struct jset_entry_dev_usage, entry);
|
|
+ struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
|
|
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
|
|
+ unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
|
|
+ sizeof(struct jset_entry_dev_usage_type);
|
|
+ unsigned i;
|
|
+
|
|
+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
|
|
+ ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
|
|
+
|
|
+ for (i = 0; i < nr_types; i++) {
|
|
+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
|
|
+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
|
|
+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
|
|
+ }
|
|
+
|
|
+ break;
|
|
+ }
|
|
+ case BCH_JSET_ENTRY_blacklist: {
|
|
+ struct jset_entry_blacklist *bl_entry =
|
|
+ container_of(entry, struct jset_entry_blacklist, entry);
|
|
+
|
|
+ ret = bch2_journal_seq_blacklist_add(c,
|
|
+ le64_to_cpu(bl_entry->seq),
|
|
+ le64_to_cpu(bl_entry->seq) + 1);
|
|
+ break;
|
|
+ }
|
|
+ case BCH_JSET_ENTRY_blacklist_v2: {
|
|
+ struct jset_entry_blacklist_v2 *bl_entry =
|
|
+ container_of(entry, struct jset_entry_blacklist_v2, entry);
|
|
+
|
|
+ ret = bch2_journal_seq_blacklist_add(c,
|
|
+ le64_to_cpu(bl_entry->start),
|
|
+ le64_to_cpu(bl_entry->end) + 1);
|
|
+ break;
|
|
+ }
|
|
+ case BCH_JSET_ENTRY_clock: {
|
|
+ struct jset_entry_clock *clock =
|
|
+ container_of(entry, struct jset_entry_clock, entry);
|
|
+
|
|
+ atomic64_set(&c->io_clock[clock->rw].now, clock->time);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int journal_replay_early(struct bch_fs *c,
|
|
+ struct bch_sb_field_clean *clean,
|
|
+ struct list_head *journal)
|
|
+{
|
|
+ struct journal_replay *i;
|
|
+ struct jset_entry *entry;
|
|
+ int ret;
|
|
+
|
|
+ if (clean) {
|
|
+ for (entry = clean->start;
|
|
+ entry != vstruct_end(&clean->field);
|
|
+ entry = vstruct_next(entry)) {
|
|
+ ret = journal_replay_entry_early(c, entry);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ } else {
|
|
+ list_for_each_entry(i, journal, list) {
|
|
+ if (i->ignore)
|
|
+ continue;
|
|
+
|
|
+ vstruct_for_each(&i->j, entry) {
|
|
+ ret = journal_replay_entry_early(c, entry);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_fs_usage_initialize(c);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* sb clean section: */
|
|
+
|
|
+static struct bkey_i *btree_root_find(struct bch_fs *c,
|
|
+ struct bch_sb_field_clean *clean,
|
|
+ struct jset *j,
|
|
+ enum btree_id id, unsigned *level)
|
|
+{
|
|
+ struct bkey_i *k;
|
|
+ struct jset_entry *entry, *start, *end;
|
|
+
|
|
+ if (clean) {
|
|
+ start = clean->start;
|
|
+ end = vstruct_end(&clean->field);
|
|
+ } else {
|
|
+ start = j->start;
|
|
+ end = vstruct_last(j);
|
|
+ }
|
|
+
|
|
+ for (entry = start; entry < end; entry = vstruct_next(entry))
|
|
+ if (entry->type == BCH_JSET_ENTRY_btree_root &&
|
|
+ entry->btree_id == id)
|
|
+ goto found;
|
|
+
|
|
+ return NULL;
|
|
+found:
|
|
+ if (!entry->u64s)
|
|
+ return ERR_PTR(-EINVAL);
|
|
+
|
|
+ k = entry->start;
|
|
+ *level = entry->level;
|
|
+ return k;
|
|
+}
|
|
+
|
|
+static int verify_superblock_clean(struct bch_fs *c,
|
|
+ struct bch_sb_field_clean **cleanp,
|
|
+ struct jset *j)
|
|
+{
|
|
+ unsigned i;
|
|
+ struct bch_sb_field_clean *clean = *cleanp;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
|
|
+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
|
|
+ le64_to_cpu(clean->journal_seq),
|
|
+ le64_to_cpu(j->seq))) {
|
|
+ kfree(clean);
|
|
+ *cleanp = NULL;
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++) {
|
|
+ char buf1[200], buf2[200];
|
|
+ struct bkey_i *k1, *k2;
|
|
+ unsigned l1 = 0, l2 = 0;
|
|
+
|
|
+ k1 = btree_root_find(c, clean, NULL, i, &l1);
|
|
+ k2 = btree_root_find(c, NULL, j, i, &l2);
|
|
+
|
|
+ if (!k1 && !k2)
|
|
+ continue;
|
|
+
|
|
+ mustfix_fsck_err_on(!k1 || !k2 ||
|
|
+ IS_ERR(k1) ||
|
|
+ IS_ERR(k2) ||
|
|
+ k1->k.u64s != k2->k.u64s ||
|
|
+ memcmp(k1, k2, bkey_bytes(k1)) ||
|
|
+ l1 != l2, c,
|
|
+ "superblock btree root %u doesn't match journal after clean shutdown\n"
|
|
+ "sb: l=%u %s\n"
|
|
+ "journal: l=%u %s\n", i,
|
|
+ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1),
|
|
+ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2));
|
|
+ }
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_sb_field_clean *clean, *sb_clean;
|
|
+ int ret;
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
|
|
+
|
|
+ if (fsck_err_on(!sb_clean, c,
|
|
+ "superblock marked clean but clean section not present")) {
|
|
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
|
+ c->sb.clean = false;
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
|
|
+ GFP_KERNEL);
|
|
+ if (!clean) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return ERR_PTR(-ENOMEM);
|
|
+ }
|
|
+
|
|
+ ret = bch2_sb_clean_validate(c, clean, READ);
|
|
+ if (ret) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return ERR_PTR(ret);
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ return clean;
|
|
+fsck_err:
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return ERR_PTR(ret);
|
|
+}
|
|
+
|
|
+static int read_btree_roots(struct bch_fs *c)
|
|
+{
|
|
+ unsigned i;
|
|
+ int ret = 0;
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++) {
|
|
+ struct btree_root *r = &c->btree_roots[i];
|
|
+
|
|
+ if (!r->alive)
|
|
+ continue;
|
|
+
|
|
+ if (i == BTREE_ID_alloc &&
|
|
+ c->opts.reconstruct_alloc) {
|
|
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (r->error) {
|
|
+ __fsck_err(c, i == BTREE_ID_alloc
|
|
+ ? FSCK_CAN_IGNORE : 0,
|
|
+ "invalid btree root %s",
|
|
+ bch2_btree_ids[i]);
|
|
+ if (i == BTREE_ID_alloc)
|
|
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
|
|
+ }
|
|
+
|
|
+ ret = bch2_btree_root_read(c, i, &r->key, r->level);
|
|
+ if (ret) {
|
|
+ __fsck_err(c, i == BTREE_ID_alloc
|
|
+ ? FSCK_CAN_IGNORE : 0,
|
|
+ "error reading btree root %s",
|
|
+ bch2_btree_ids[i]);
|
|
+ if (i == BTREE_ID_alloc)
|
|
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++)
|
|
+ if (!c->btree_roots[i].b)
|
|
+ bch2_btree_root_alloc(c, i);
|
|
+fsck_err:
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_fs_recovery(struct bch_fs *c)
|
|
+{
|
|
+ const char *err = "cannot allocate memory";
|
|
+ struct bch_sb_field_clean *clean = NULL;
|
|
+ struct jset *last_journal_entry = NULL;
|
|
+ u64 blacklist_seq, journal_seq;
|
|
+ bool write_sb = false;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (c->sb.clean)
|
|
+ clean = read_superblock_clean(c);
|
|
+ ret = PTR_ERR_OR_ZERO(clean);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ if (c->sb.clean)
|
|
+ bch_info(c, "recovering from clean shutdown, journal seq %llu",
|
|
+ le64_to_cpu(clean->journal_seq));
|
|
+
|
|
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) {
|
|
+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported");
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (!c->sb.clean &&
|
|
+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) {
|
|
+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix");
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
|
|
+ bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix");
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+
|
|
+ }
|
|
+
|
|
+ if (!(c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))) {
|
|
+ bch_info(c, "alloc_v2 feature bit not set, fsck required");
|
|
+ c->opts.fsck = true;
|
|
+ c->opts.fix_errors = FSCK_OPT_YES;
|
|
+ }
|
|
+
|
|
+ if (!c->replicas.entries ||
|
|
+ c->opts.rebuild_replicas) {
|
|
+ bch_info(c, "building replicas info");
|
|
+ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
|
|
+ }
|
|
+
|
|
+ if (c->sb.version < bcachefs_metadata_version_inode_backpointers) {
|
|
+ bch_info(c, "version prior to inode backpointers, upgrade and fsck required");
|
|
+ c->opts.version_upgrade = true;
|
|
+ c->opts.fsck = true;
|
|
+ c->opts.fix_errors = FSCK_OPT_YES;
|
|
+ }
|
|
+
|
|
+ ret = bch2_blacklist_table_initialize(c);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error initializing blacklist table");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
|
|
+ struct journal_replay *i;
|
|
+
|
|
+ ret = bch2_journal_read(c, &c->journal_entries,
|
|
+ &blacklist_seq, &journal_seq);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ list_for_each_entry_reverse(i, &c->journal_entries, list)
|
|
+ if (!i->ignore) {
|
|
+ last_journal_entry = &i->j;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (mustfix_fsck_err_on(c->sb.clean &&
|
|
+ last_journal_entry &&
|
|
+ !journal_entry_empty(last_journal_entry), c,
|
|
+ "filesystem marked clean but journal not empty")) {
|
|
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
|
|
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
|
+ c->sb.clean = false;
|
|
+ }
|
|
+
|
|
+ if (!last_journal_entry) {
|
|
+ fsck_err_on(!c->sb.clean, c, "no journal entries found");
|
|
+ goto use_clean;
|
|
+ }
|
|
+
|
|
+ c->journal_keys = journal_keys_sort(&c->journal_entries);
|
|
+ if (!c->journal_keys.d) {
|
|
+ ret = -ENOMEM;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (c->sb.clean && last_journal_entry) {
|
|
+ ret = verify_superblock_clean(c, &clean,
|
|
+ last_journal_entry);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+ } else {
|
|
+use_clean:
|
|
+ if (!clean) {
|
|
+ bch_err(c, "no superblock clean section found");
|
|
+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE;
|
|
+ goto err;
|
|
+
|
|
+ }
|
|
+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
|
|
+ }
|
|
+
|
|
+ if (c->opts.reconstruct_alloc) {
|
|
+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
|
|
+ drop_alloc_keys(&c->journal_keys);
|
|
+ }
|
|
+
|
|
+ ret = journal_replay_early(c, clean, &c->journal_entries);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ /*
|
|
+ * After an unclean shutdown, skip then next few journal sequence
|
|
+ * numbers as they may have been referenced by btree writes that
|
|
+ * happened before their corresponding journal writes - those btree
|
|
+ * writes need to be ignored, by skipping and blacklisting the next few
|
|
+ * journal sequence numbers:
|
|
+ */
|
|
+ if (!c->sb.clean)
|
|
+ journal_seq += 8;
|
|
+
|
|
+ if (blacklist_seq != journal_seq) {
|
|
+ ret = bch2_journal_seq_blacklist_add(c,
|
|
+ blacklist_seq, journal_seq);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error creating new journal seq blacklist entry");
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ret = bch2_fs_journal_start(&c->journal, journal_seq,
|
|
+ &c->journal_entries);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = read_btree_roots(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ bch_verbose(c, "starting alloc read");
|
|
+ err = "error reading allocation information";
|
|
+ ret = bch2_alloc_read(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_verbose(c, "alloc read done");
|
|
+
|
|
+ bch_verbose(c, "starting stripes_read");
|
|
+ err = "error reading stripes";
|
|
+ ret = bch2_stripes_read(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_verbose(c, "stripes_read done");
|
|
+
|
|
+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
|
|
+
|
|
+ if (c->opts.fsck ||
|
|
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)) ||
|
|
+ !(c->sb.compat & (1ULL << BCH_COMPAT_alloc_metadata)) ||
|
|
+ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
|
|
+ bool metadata_only = c->opts.norecovery;
|
|
+
|
|
+ bch_info(c, "starting mark and sweep");
|
|
+ err = "error in mark and sweep";
|
|
+ ret = bch2_gc(c, true, metadata_only);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_verbose(c, "mark and sweep done");
|
|
+ }
|
|
+
|
|
+ bch2_stripes_heap_start(c);
|
|
+
|
|
+ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
|
|
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
|
|
+
|
|
+ /*
|
|
+ * Skip past versions that might have possibly been used (as nonces),
|
|
+ * but hadn't had their pointers written:
|
|
+ */
|
|
+ if (c->sb.encryption_type && !c->sb.clean)
|
|
+ atomic64_add(1 << 16, &c->key_version);
|
|
+
|
|
+ if (c->opts.norecovery)
|
|
+ goto out;
|
|
+
|
|
+ bch_verbose(c, "starting journal replay");
|
|
+ err = "journal replay failed";
|
|
+ ret = bch2_journal_replay(c, c->journal_keys);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_verbose(c, "journal replay done");
|
|
+
|
|
+ if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
|
|
+ !c->opts.nochanges) {
|
|
+ /*
|
|
+ * note that even when filesystem was clean there might be work
|
|
+ * to do here, if we ran gc (because of fsck) which recalculated
|
|
+ * oldest_gen:
|
|
+ */
|
|
+ bch_verbose(c, "writing allocation info");
|
|
+ err = "error writing out alloc info";
|
|
+ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?:
|
|
+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error writing alloc info");
|
|
+ goto err;
|
|
+ }
|
|
+ bch_verbose(c, "alloc write done");
|
|
+ }
|
|
+
|
|
+ if (c->opts.fsck) {
|
|
+ bch_info(c, "starting fsck");
|
|
+ err = "error in fsck";
|
|
+ ret = bch2_fsck_full(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_verbose(c, "fsck done");
|
|
+ } else if (!c->sb.clean) {
|
|
+ bch_verbose(c, "checking for deleted inodes");
|
|
+ err = "error in recovery";
|
|
+ ret = bch2_fsck_walk_inodes_only(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_verbose(c, "check inodes done");
|
|
+ }
|
|
+
|
|
+ if (enabled_qtypes(c)) {
|
|
+ bch_verbose(c, "reading quotas");
|
|
+ ret = bch2_fs_quota_read(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_verbose(c, "quotas done");
|
|
+ }
|
|
+
|
|
+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
|
|
+ !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) {
|
|
+ struct bch_move_stats stats = { 0 };
|
|
+
|
|
+ bch_info(c, "scanning for old btree nodes");
|
|
+ ret = bch2_fs_read_write(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = bch2_scan_old_btree_nodes(c, &stats);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ bch_info(c, "scanning for old btree nodes done");
|
|
+ }
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ if (c->opts.version_upgrade) {
|
|
+ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
|
|
+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
|
|
+ write_sb = true;
|
|
+ }
|
|
+
|
|
+ if (!test_bit(BCH_FS_ERROR, &c->flags)) {
|
|
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
|
|
+ write_sb = true;
|
|
+ }
|
|
+
|
|
+ if (c->opts.fsck &&
|
|
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
|
|
+ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
|
|
+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
|
|
+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
|
|
+ write_sb = true;
|
|
+ }
|
|
+
|
|
+ if (write_sb)
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ if (c->journal_seq_blacklist_table &&
|
|
+ c->journal_seq_blacklist_table->nr > 128)
|
|
+ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
|
|
+
|
|
+ ret = 0;
|
|
+out:
|
|
+ set_bit(BCH_FS_FSCK_DONE, &c->flags);
|
|
+ bch2_flush_fsck_errs(c);
|
|
+
|
|
+ if (!c->opts.keep_journal) {
|
|
+ bch2_journal_keys_free(&c->journal_keys);
|
|
+ bch2_journal_entries_free(&c->journal_entries);
|
|
+ }
|
|
+ kfree(clean);
|
|
+ if (ret)
|
|
+ bch_err(c, "Error in recovery: %s (%i)", err, ret);
|
|
+ else
|
|
+ bch_verbose(c, "ret %i", ret);
|
|
+ return ret;
|
|
+err:
|
|
+fsck_err:
|
|
+ bch2_fs_emergency_read_only(c);
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+int bch2_fs_initialize(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_inode_unpacked root_inode, lostfound_inode;
|
|
+ struct bkey_inode_buf packed_inode;
|
|
+ struct qstr lostfound = QSTR("lost+found");
|
|
+ const char *err = "cannot allocate memory";
|
|
+ struct bch_dev *ca;
|
|
+ LIST_HEAD(journal);
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ bch_notice(c, "initializing new filesystem");
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
|
|
+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
|
|
+
|
|
+ if (c->opts.version_upgrade) {
|
|
+ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
|
|
+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
|
|
+ bch2_write_super(c);
|
|
+ }
|
|
+
|
|
+ for_each_online_member(ca, c, i)
|
|
+ bch2_mark_dev_superblock(c, ca, 0);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
|
|
+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
|
|
+
|
|
+ for (i = 0; i < BTREE_ID_NR; i++)
|
|
+ bch2_btree_root_alloc(c, i);
|
|
+
|
|
+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
|
|
+ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
|
|
+
|
|
+ err = "unable to allocate journal buckets";
|
|
+ for_each_online_member(ca, c, i) {
|
|
+ ret = bch2_dev_journal_alloc(ca);
|
|
+ if (ret) {
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * journal_res_get() will crash if called before this has
|
|
+ * set up the journal.pin FIFO and journal.cur pointer:
|
|
+ */
|
|
+ bch2_fs_journal_start(&c->journal, 1, &journal);
|
|
+ bch2_journal_set_replay_done(&c->journal);
|
|
+
|
|
+ err = "error going read-write";
|
|
+ ret = bch2_fs_read_write_early(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ /*
|
|
+ * Write out the superblock and journal buckets, now that we can do
|
|
+ * btree updates
|
|
+ */
|
|
+ err = "error marking superblock and journal";
|
|
+ for_each_member_device(ca, c, i) {
|
|
+ ret = bch2_trans_mark_dev_sb(c, ca);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_inode_init(c, &root_inode, 0, 0,
|
|
+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
|
|
+ root_inode.bi_inum = BCACHEFS_ROOT_INO;
|
|
+ bch2_inode_pack(c, &packed_inode, &root_inode);
|
|
+ packed_inode.inode.k.p.snapshot = U32_MAX;
|
|
+
|
|
+ err = "error creating root directory";
|
|
+ ret = bch2_btree_insert(c, BTREE_ID_inodes,
|
|
+ &packed_inode.inode.k_i,
|
|
+ NULL, NULL, 0);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ bch2_inode_init_early(c, &lostfound_inode);
|
|
+
|
|
+ err = "error creating lost+found";
|
|
+ ret = bch2_trans_do(c, NULL, NULL, 0,
|
|
+ bch2_create_trans(&trans, BCACHEFS_ROOT_INO,
|
|
+ &root_inode, &lostfound_inode,
|
|
+ &lostfound,
|
|
+ 0, 0, S_IFDIR|0700, 0,
|
|
+ NULL, NULL));
|
|
+ if (ret) {
|
|
+ bch_err(c, "error creating lost+found");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (enabled_qtypes(c)) {
|
|
+ ret = bch2_fs_quota_read(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ err = "error writing first journal entry";
|
|
+ ret = bch2_journal_meta(&c->journal);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
|
|
+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
|
|
+
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ return 0;
|
|
+err:
|
|
+ pr_err("Error initializing new filesystem: %s (%i)", err, ret);
|
|
+ return ret;
|
|
+}
|
|
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
|
|
new file mode 100644
|
|
index 000000000000..e5565e4f335a
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/recovery.h
|
|
@@ -0,0 +1,58 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_RECOVERY_H
|
|
+#define _BCACHEFS_RECOVERY_H
|
|
+
|
|
/*
 * Iterate @i over the (keys).nr elements of the array (keys).d.
 * Every use of a macro argument is parenthesized so that expression
 * arguments expand safely.
 */
#define for_each_journal_key(keys, i)				\
	for ((i) = (keys).d; (i) < (keys).d + (keys).nr; (i)++)
|
|
+
|
|
/*
 * Cursor over a struct journal_keys array.
 * NOTE(review): field meanings below are inferred from names and types only;
 * confirm against the users in recovery.c.
 */
struct journal_iter {
	struct list_head	list;		/* list linkage */
	enum btree_id		btree_id;	/* presumably the btree being iterated */
	unsigned		level;		/* presumably the btree level being iterated */
	size_t			idx;		/* presumably the current index into keys->d */
	struct journal_keys	*keys;		/* the key array this cursor refers to */
};
|
|
+
|
|
/*
 * Iterate over keys in the btree, with keys from the journal overlaid on top:
 *
 * Combines a btree node iterator (@b / @node_iter) with a journal key cursor
 * (@journal); @last records which of the two sources produced the most
 * recently returned key.
 */

struct btree_and_journal_iter {
	struct btree		*b;
	struct btree_node_iter	node_iter;
	struct bkey		unpacked;	/* presumably scratch space for unpacking btree keys - confirm */

	struct journal_iter	journal;

	enum last_key_returned {
		none,
		btree,
		journal,
	}			last;
};
|
|
+
|
|
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
|
|
+ unsigned, struct bkey_i *);
|
|
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
|
|
+ unsigned, struct bpos);
|
|
+
|
|
+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
|
|
+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
|
|
+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
|
|
+
|
|
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
|
|
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
|
|
+ struct bch_fs *,
|
|
+ struct btree *);
|
|
+
|
|
+typedef int (*btree_walk_key_fn)(struct bch_fs *c, struct bkey_s_c k);
|
|
+
|
|
+int bch2_btree_and_journal_walk(struct bch_fs *, enum btree_id, btree_walk_key_fn);
|
|
+
|
|
+void bch2_journal_keys_free(struct journal_keys *);
|
|
+void bch2_journal_entries_free(struct list_head *);
|
|
+
|
|
+int bch2_fs_recovery(struct bch_fs *);
|
|
+int bch2_fs_initialize(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_RECOVERY_H */
|
|
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
|
|
new file mode 100644
|
|
index 000000000000..0978ad92614c
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/reflink.c
|
|
@@ -0,0 +1,335 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_buf.h"
|
|
+#include "btree_update.h"
|
|
+#include "extents.h"
|
|
+#include "inode.h"
|
|
+#include "io.h"
|
|
+#include "reflink.h"
|
|
+
|
|
+#include <linux/sched/signal.h>
|
|
+
|
|
+static inline unsigned bkey_type_to_indirect(const struct bkey *k)
|
|
+{
|
|
+ switch (k->type) {
|
|
+ case KEY_TYPE_extent:
|
|
+ return KEY_TYPE_reflink_v;
|
|
+ case KEY_TYPE_inline_data:
|
|
+ return KEY_TYPE_indirect_inline_data;
|
|
+ default:
|
|
+ return 0;
|
|
+ }
|
|
+}
|
|
+
|
|
+/* reflink pointers */
|
|
+
|
|
+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
|
|
+
|
|
+ if (bkey_val_bytes(p.k) != sizeof(*p.v))
|
|
+ return "incorrect value size";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
|
|
+
|
|
+ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx));
|
|
+}
|
|
+
|
|
+enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
|
|
+ struct bkey_s _l, struct bkey_s _r)
|
|
+{
|
|
+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l);
|
|
+ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r);
|
|
+
|
|
+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx))
|
|
+ return BCH_MERGE_NOMERGE;
|
|
+
|
|
+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
|
|
+ bch2_key_resize(l.k, KEY_SIZE_MAX);
|
|
+ bch2_cut_front_s(l.k->p, _r);
|
|
+ return BCH_MERGE_PARTIAL;
|
|
+ }
|
|
+
|
|
+ bch2_key_resize(l.k, l.k->size + r.k->size);
|
|
+
|
|
+ return BCH_MERGE_MERGE;
|
|
+}
|
|
+
|
|
+/* indirect extents */
|
|
+
|
|
+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
|
|
+
|
|
+ if (bkey_val_bytes(r.k) < sizeof(*r.v))
|
|
+ return "incorrect value size";
|
|
+
|
|
+ return bch2_bkey_ptrs_invalid(c, k);
|
|
+}
|
|
+
|
|
+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
|
|
+
|
|
+ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount));
|
|
+
|
|
+ bch2_bkey_ptrs_to_text(out, c, k);
|
|
+}
|
|
+
|
|
+/* indirect inline data */
|
|
+
|
|
+const char *bch2_indirect_inline_data_invalid(const struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_indirect_inline_data))
|
|
+ return "incorrect value size";
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_indirect_inline_data_to_text(struct printbuf *out,
|
|
+ struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
|
|
+ unsigned datalen = bkey_inline_data_bytes(k.k);
|
|
+
|
|
+ pr_buf(out, "refcount %llu datalen %u: %*phN",
|
|
+ le64_to_cpu(d.v->refcount), datalen,
|
|
+ min(datalen, 32U), d.v->data);
|
|
+}
|
|
+
|
|
+static int bch2_make_extent_indirect(struct btree_trans *trans,
|
|
+ struct btree_iter *extent_iter,
|
|
+ struct bkey_i *orig)
|
|
+{
|
|
+ struct bch_fs *c = trans->c;
|
|
+ struct btree_iter *reflink_iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_i *r_v;
|
|
+ struct bkey_i_reflink_p *r_p;
|
|
+ __le64 *refcount;
|
|
+ int ret;
|
|
+
|
|
+ if (orig->k.type == KEY_TYPE_inline_data)
|
|
+ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data);
|
|
+
|
|
+ for_each_btree_key(trans, reflink_iter, BTREE_ID_reflink,
|
|
+ POS(0, c->reflink_hint),
|
|
+ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) {
|
|
+ if (reflink_iter->pos.inode) {
|
|
+ bch2_btree_iter_set_pos(reflink_iter, POS_MIN);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (bkey_deleted(k.k) && orig->k.size <= k.k->size)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ /* rewind iter to start of hole, if necessary: */
|
|
+ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k));
|
|
+
|
|
+ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_val_bytes(&orig->k));
|
|
+ ret = PTR_ERR_OR_ZERO(r_v);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ bkey_init(&r_v->k);
|
|
+ r_v->k.type = bkey_type_to_indirect(&orig->k);
|
|
+ r_v->k.p = reflink_iter->pos;
|
|
+ bch2_key_resize(&r_v->k, orig->k.size);
|
|
+ r_v->k.version = orig->k.version;
|
|
+
|
|
+ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
|
|
+
|
|
+ refcount = (void *) &r_v->v;
|
|
+ *refcount = 0;
|
|
+ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
|
|
+
|
|
+ bch2_trans_update(trans, reflink_iter, r_v, 0);
|
|
+
|
|
+ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p));
|
|
+ if (IS_ERR(r_p)) {
|
|
+ ret = PTR_ERR(r_p);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ orig->k.type = KEY_TYPE_reflink_p;
|
|
+ r_p = bkey_i_to_reflink_p(orig);
|
|
+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v));
|
|
+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k));
|
|
+
|
|
+ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0);
|
|
+err:
|
|
+ if (!IS_ERR(reflink_iter))
|
|
+ c->reflink_hint = reflink_iter->pos.offset;
|
|
+ bch2_trans_iter_put(trans, reflink_iter);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
|
|
+{
|
|
+ struct bkey_s_c k = bch2_btree_iter_peek(iter);
|
|
+ int ret;
|
|
+
|
|
+ for_each_btree_key_continue(iter, 0, k, ret) {
|
|
+ if (bkey_cmp(iter->pos, end) >= 0)
|
|
+ return bkey_s_c_null;
|
|
+
|
|
+ if (bkey_extent_is_data(k.k))
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return k;
|
|
+}
|
|
+
|
|
+s64 bch2_remap_range(struct bch_fs *c,
|
|
+ struct bpos dst_start, struct bpos src_start,
|
|
+ u64 remap_sectors, u64 *journal_seq,
|
|
+ u64 new_i_size, s64 *i_sectors_delta)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *dst_iter, *src_iter;
|
|
+ struct bkey_s_c src_k;
|
|
+ struct bkey_buf new_dst, new_src;
|
|
+ struct bpos dst_end = dst_start, src_end = src_start;
|
|
+ struct bpos dst_want, src_want;
|
|
+ u64 src_done, dst_done;
|
|
+ int ret = 0, ret2 = 0;
|
|
+
|
|
+ if (!percpu_ref_tryget(&c->writes))
|
|
+ return -EROFS;
|
|
+
|
|
+ bch2_check_set_feature(c, BCH_FEATURE_reflink);
|
|
+
|
|
+ dst_end.offset += remap_sectors;
|
|
+ src_end.offset += remap_sectors;
|
|
+
|
|
+ bch2_bkey_buf_init(&new_dst);
|
|
+ bch2_bkey_buf_init(&new_src);
|
|
+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
|
|
+
|
|
+ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, src_start,
|
|
+ BTREE_ITER_INTENT);
|
|
+ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
|
|
+ BTREE_ITER_INTENT);
|
|
+
|
|
+ while (ret == 0 || ret == -EINTR) {
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ if (fatal_signal_pending(current)) {
|
|
+ ret = -EINTR;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ src_k = get_next_src(src_iter, src_end);
|
|
+ ret = bkey_err(src_k);
|
|
+ if (ret)
|
|
+ continue;
|
|
+
|
|
+ src_done = bpos_min(src_iter->pos, src_end).offset -
|
|
+ src_start.offset;
|
|
+ dst_want = POS(dst_start.inode, dst_start.offset + src_done);
|
|
+
|
|
+ if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
|
|
+ ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
|
|
+ journal_seq, i_sectors_delta);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
|
|
+
|
|
+ if (!bkey_cmp(dst_iter->pos, dst_end))
|
|
+ break;
|
|
+
|
|
+ if (src_k.k->type != KEY_TYPE_reflink_p) {
|
|
+ bch2_bkey_buf_reassemble(&new_src, c, src_k);
|
|
+ src_k = bkey_i_to_s_c(new_src.k);
|
|
+
|
|
+ bch2_cut_front(src_iter->pos, new_src.k);
|
|
+ bch2_cut_back(src_end, new_src.k);
|
|
+
|
|
+ ret = bch2_make_extent_indirect(&trans, src_iter,
|
|
+ new_src.k);
|
|
+ if (ret)
|
|
+ continue;
|
|
+
|
|
+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p);
|
|
+ }
|
|
+
|
|
+ if (src_k.k->type == KEY_TYPE_reflink_p) {
|
|
+ struct bkey_s_c_reflink_p src_p =
|
|
+ bkey_s_c_to_reflink_p(src_k);
|
|
+ struct bkey_i_reflink_p *dst_p =
|
|
+ bkey_reflink_p_init(new_dst.k);
|
|
+
|
|
+ u64 offset = le64_to_cpu(src_p.v->idx) +
|
|
+ (src_iter->pos.offset -
|
|
+ bkey_start_offset(src_k.k));
|
|
+
|
|
+ dst_p->v.idx = cpu_to_le64(offset);
|
|
+ } else {
|
|
+ BUG();
|
|
+ }
|
|
+
|
|
+ new_dst.k->k.p = dst_iter->pos;
|
|
+ bch2_key_resize(&new_dst.k->k,
|
|
+ min(src_k.k->p.offset - src_iter->pos.offset,
|
|
+ dst_end.offset - dst_iter->pos.offset));
|
|
+
|
|
+ ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
|
|
+ NULL, journal_seq,
|
|
+ new_i_size, i_sectors_delta);
|
|
+ if (ret)
|
|
+ continue;
|
|
+
|
|
+ dst_done = dst_iter->pos.offset - dst_start.offset;
|
|
+ src_want = POS(src_start.inode, src_start.offset + dst_done);
|
|
+ bch2_btree_iter_set_pos(src_iter, src_want);
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, dst_iter);
|
|
+ bch2_trans_iter_put(&trans, src_iter);
|
|
+
|
|
+ BUG_ON(!ret && bkey_cmp(dst_iter->pos, dst_end));
|
|
+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0);
|
|
+
|
|
+ dst_done = dst_iter->pos.offset - dst_start.offset;
|
|
+ new_i_size = min(dst_iter->pos.offset << 9, new_i_size);
|
|
+
|
|
+ bch2_trans_begin(&trans);
|
|
+
|
|
+ do {
|
|
+ struct bch_inode_unpacked inode_u;
|
|
+ struct btree_iter *inode_iter;
|
|
+
|
|
+ inode_iter = bch2_inode_peek(&trans, &inode_u,
|
|
+ dst_start.inode, BTREE_ITER_INTENT);
|
|
+ ret2 = PTR_ERR_OR_ZERO(inode_iter);
|
|
+
|
|
+ if (!ret2 &&
|
|
+ inode_u.bi_size < new_i_size) {
|
|
+ inode_u.bi_size = new_i_size;
|
|
+ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?:
|
|
+ bch2_trans_commit(&trans, NULL, journal_seq, 0);
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_put(&trans, inode_iter);
|
|
+ } while (ret2 == -EINTR);
|
|
+
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+ bch2_bkey_buf_exit(&new_src, c);
|
|
+ bch2_bkey_buf_exit(&new_dst, c);
|
|
+
|
|
+ percpu_ref_put(&c->writes);
|
|
+
|
|
+ return dst_done ?: ret ?: ret2;
|
|
+}
|
|
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
|
|
new file mode 100644
|
|
index 000000000000..9d5e7dc58f2b
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/reflink.h
|
|
@@ -0,0 +1,40 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_REFLINK_H
+#define _BCACHEFS_REFLINK_H
+
+/* Hooks for reflink_p keys; installed by bch2_bkey_ops_reflink_p below. */
+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+enum merge_result bch2_reflink_p_merge(struct bch_fs *,
+				       struct bkey_s, struct bkey_s);
+
+/* struct bkey_ops compound literal for reflink_p keys. */
+#define bch2_bkey_ops_reflink_p (struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_p_invalid,		\
+	.val_to_text	= bch2_reflink_p_to_text,		\
+	.key_merge	= bch2_reflink_p_merge,			\
+}
+
+/* Hooks for reflink_v keys; installed by bch2_bkey_ops_reflink_v below. */
+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
+			    struct bkey_s_c);
+
+/*
+ * struct bkey_ops compound literal for reflink_v keys.  There is no merge
+ * hook here; the swab hook is bch2_ptr_swab.
+ */
+#define bch2_bkey_ops_reflink_v (struct bkey_ops) {		\
+	.key_invalid	= bch2_reflink_v_invalid,		\
+	.val_to_text	= bch2_reflink_v_to_text,		\
+	.swab		= bch2_ptr_swab,			\
+}
+
+/* Hooks for indirect_inline_data keys; installed by the ops macro below. */
+const char *bch2_indirect_inline_data_invalid(const struct bch_fs *,
+					      struct bkey_s_c);
+void bch2_indirect_inline_data_to_text(struct printbuf *,
+				       struct bch_fs *, struct bkey_s_c);
+
+/* struct bkey_ops compound literal for indirect_inline_data keys. */
+#define bch2_bkey_ops_indirect_inline_data (struct bkey_ops) {	\
+	.key_invalid	= bch2_indirect_inline_data_invalid,	\
+	.val_to_text	= bch2_indirect_inline_data_to_text,	\
+}
+
+/* Parameter names are omitted here; see the definition for their meaning. */
+s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
+		     u64, u64 *, u64, s64 *);
+
+#endif /* _BCACHEFS_REFLINK_H */
|
|
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
|
|
new file mode 100644
|
|
index 000000000000..8e6cccd39383
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/replicas.c
|
|
@@ -0,0 +1,1089 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "buckets.h"
|
|
+#include "journal.h"
|
|
+#include "replicas.h"
|
|
+#include "super-io.h"
|
|
+
|
|
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
|
|
+ struct bch_replicas_cpu *);
|
|
+
|
|
+/* Replicas tracking - in memory: */
|
|
+
|
|
+/*
+ * Debug-only sanity check of a replicas entry; the body is compiled out
+ * unless CONFIG_BCACHEFS_DEBUG is defined.
+ *
+ * Asserts that the data type is in range, that at least one device is listed,
+ * that nr_required (when greater than 1) is strictly less than nr_devs, and
+ * that devs[] is strictly ascending.
+ */
+static void verify_replicas_entry(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned next;
+
+	BUG_ON(e->data_type >= BCH_DATA_NR);
+	BUG_ON(!e->nr_devs);
+	BUG_ON(e->nr_required > 1 &&
+	       e->nr_required >= e->nr_devs);
+
+	/* each device index must be strictly greater than its predecessor */
+	for (next = 1; next < e->nr_devs; next++)
+		BUG_ON(e->devs[next - 1] >= e->devs[next]);
+#endif
+}
|
|
+
|
|
+/*
+ * Sort the first e->nr_devs elements of e->devs in place, using bubble_sort()
+ * with the u8_cmp comparator (presumably ascending, which is the order
+ * verify_replicas_entry() asserts).
+ */
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+{
+	bubble_sort(e->devs, e->nr_devs, u8_cmp);
+}
|
|
+
|
|
+/*
+ * Arrange the r->nr fixed-size entries of @r with eytzinger0_sort(), comparing
+ * r->entry_size bytes of each entry with memcmp.  __replicas_entry_idx()
+ * searches the table with eytzinger0_find().
+ */
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
|
|
+
|
|
+/*
+ * Append a one-line description of @e to @out, formatted as
+ * "<data type name>: <nr_required>/<nr_devs> [<dev> <dev> ...]".
+ */
+void bch2_replicas_entry_to_text(struct printbuf *out,
+				 struct bch_replicas_entry *e)
+{
+	unsigned dev;
+
+	pr_buf(out, "%s: %u/%u [",
+	       bch2_data_types[e->data_type],
+	       e->nr_required,
+	       e->nr_devs);
+
+	for (dev = 0; dev < e->nr_devs; dev++) {
+		/* single space between device indices, none before the first */
+		if (dev)
+			pr_buf(out, " ");
+		pr_buf(out, "%u", e->devs[dev]);
+	}
+
+	pr_buf(out, "]");
+}
|
|
+
|
|
+/*
+ * Append every entry of the in-memory replicas table @r to @out, each printed
+ * by bch2_replicas_entry_to_text() and separated by a single space.
+ */
+void bch2_cpu_replicas_to_text(struct printbuf *out,
+			       struct bch_replicas_cpu *r)
+{
+	struct bch_replicas_entry *e;
+	bool need_sep = false;
+
+	for_each_cpu_replicas_entry(r, e) {
+		if (need_sep)
+			pr_buf(out, " ");
+		need_sep = true;
+
+		bch2_replicas_entry_to_text(out, e);
+	}
+}
|
|
+
|
|
+/*
+ * Append the devices of @k's non-cached pointers to @r.
+ *
+ * Cached pointers are skipped.  Pointers with has_ec set are not added to
+ * devs[]; seeing one drops nr_required to 0 instead.  Appends at
+ * r->nr_devs, which the caller must have initialised.
+ */
+static void extent_to_replicas(struct bkey_s_c k,
+			       struct bch_replicas_entry *r)
+{
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+
+	r->nr_required = 1;
+
+	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+		if (p.ptr.cached)
+			continue;
+
+		if (p.has_ec) {
+			r->nr_required = 0;
+			continue;
+		}
+
+		r->devs[r->nr_devs++] = p.ptr.dev;
+	}
+}
|
|
+
|
|
+/*
+ * Append the device of every block pointer in stripe key @k to @r, and set
+ * nr_required to the number of non-redundant blocks
+ * (nr_blocks - nr_redundant).  Appends at r->nr_devs, which the caller must
+ * have initialised.
+ */
+static void stripe_to_replicas(struct bkey_s_c k,
+			       struct bch_replicas_entry *r)
+{
+	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+	unsigned block;
+
+	r->nr_required = s.v->nr_blocks - s.v->nr_redundant;
+
+	for (block = 0; block < s.v->nr_blocks; block++)
+		r->devs[r->nr_devs++] = s.v->ptrs[block].dev;
+}
|
|
+
|
|
+/*
+ * Build the replicas entry @e describing where key @k's data lives.
+ *
+ * btree pointer keys map to BCH_DATA_btree, extent and reflink_v keys to
+ * BCH_DATA_user, stripe keys to BCH_DATA_parity.  The device list is sorted
+ * before returning.
+ *
+ * NOTE(review): there is no default case, so for any other key type
+ * e->data_type and e->nr_required are left untouched and only nr_devs (0)
+ * is defined on return.
+ */
+void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+			   struct bkey_s_c k)
+{
+	e->nr_devs = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+		e->data_type = BCH_DATA_btree;
+		extent_to_replicas(k, e);
+		break;
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		e->data_type = BCH_DATA_user;
+		extent_to_replicas(k, e);
+		break;
+	case KEY_TYPE_stripe:
+		e->data_type = BCH_DATA_parity;
+		stripe_to_replicas(k, e);
+		break;
+	}
+
+	bch2_replicas_entry_sort(e);
+}
|
|
+
|
|
+/*
+ * Fill @e from an explicit device list: the given data type, nr_required of
+ * 1, and the devices of @devs in sorted order.
+ *
+ * @data_type must be non-zero, must not be BCH_DATA_sb and must be below
+ * BCH_DATA_NR; anything else is a BUG.
+ */
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+			      enum bch_data_type data_type,
+			      struct bch_devs_list devs)
+{
+	unsigned dev;
+
+	BUG_ON(!data_type ||
+	       data_type == BCH_DATA_sb ||
+	       data_type >= BCH_DATA_NR);
+
+	e->data_type	= data_type;
+	e->nr_required	= 1;
+
+	for (dev = 0; dev < devs.nr; dev++)
+		e->devs[dev] = devs.devs[dev];
+	e->nr_devs = devs.nr;
+
+	bch2_replicas_entry_sort(e);
+}
|
|
+
|
|
+/*
+ * Return a newly allocated copy of table @old with @new_entry added.
+ *
+ * The new table has old->nr + 1 slots, each wide enough for the larger of
+ * old->entry_size and @new_entry; kcalloc zero-fills any padding.  The result
+ * is re-sorted before returning.  @old is not modified.
+ *
+ * On allocation failure the returned table has entries == NULL (nr and
+ * entry_size are still set).  The caller owns and must kfree() the returned
+ * entries.
+ */
+static struct bch_replicas_cpu
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+		       struct bch_replicas_entry *new_entry)
+{
+	unsigned i;
+	struct bch_replicas_cpu new = {
+		.nr		= old->nr + 1,
+		.entry_size	= max_t(unsigned, old->entry_size,
+					replicas_entry_bytes(new_entry)),
+	};
+
+	BUG_ON(!new_entry->data_type);
+	verify_replicas_entry(new_entry);
+
+	new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
+	if (!new.entries)
+		return new;
+
+	/* slots may have grown, so copy entry by entry at the old width */
+	for (i = 0; i < old->nr; i++)
+		memcpy(cpu_replicas_entry(&new, i),
+		       cpu_replicas_entry(old, i),
+		       old->entry_size);
+
+	memcpy(cpu_replicas_entry(&new, old->nr),
+	       new_entry,
+	       replicas_entry_bytes(new_entry));
+
+	bch2_cpu_replicas_sort(&new);
+	return new;
+}
|
|
+
|
|
+/*
+ * Look up @search in table @r.
+ *
+ * Returns the entry's index in @r, or -1 if it is not present.  An entry
+ * larger than the table's slot size cannot be present and returns -1 without
+ * searching.  @search must already be sorted.
+ */
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+				       struct bch_replicas_entry *search)
+{
+	int idx, entry_size = replicas_entry_bytes(search);
+
+	if (unlikely(entry_size > r->entry_size))
+		return -1;
+
+	verify_replicas_entry(search);
+
+	/*
+	 * The comparator ignores the size it is handed and compares only the
+	 * bytes of @search (the local entry_size), not the full slot width.
+	 */
+#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
+	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+			      entry_cmp, search);
+#undef entry_cmp
+
+	/* an index at or past r->nr is treated as "not found" */
+	return idx < r->nr ? idx : -1;
+}
|
|
+
|
|
+/*
+ * Look up @search in the filesystem's replicas table (c->replicas).
+ *
+ * Side effect: @search's device list is sorted in place first.
+ * Returns the entry's index, or -1 if it is not present.
+ */
+int bch2_replicas_entry_idx(struct bch_fs *c,
+			    struct bch_replicas_entry *search)
+{
+	bch2_replicas_entry_sort(search);
+
+	return __replicas_entry_idx(&c->replicas, search);
+}
|
|
+
|
|
+/* True iff @search is present in table @r (see __replicas_entry_idx()). */
+static bool __replicas_has_entry(struct bch_replicas_cpu *r,
+				 struct bch_replicas_entry *search)
+{
+	return __replicas_entry_idx(r, search) >= 0;
+}
|
|
+
|
|
+/*
+ * Report whether @search is already recorded in the replicas tables.
+ *
+ * An entry with no devices is always considered marked.  Otherwise the entry
+ * must be in c->replicas and, while c->replicas_gc.entries is non-NULL, in
+ * c->replicas_gc as well.  Takes c->mark_lock for read around the lookups.
+ */
+bool bch2_replicas_marked(struct bch_fs *c,
+			  struct bch_replicas_entry *search)
+{
+	bool marked = false;
+
+	if (!search->nr_devs)
+		return true;
+
+	verify_replicas_entry(search);
+
+	percpu_down_read(&c->mark_lock);
+	if (__replicas_has_entry(&c->replicas, search))
+		marked = likely((!c->replicas_gc.entries)) ||
+			 __replicas_has_entry(&c->replicas_gc, search);
+	percpu_up_read(&c->mark_lock);
+
+	return marked;
+}
|
|
+
|
|
+/*
+ * Copy usage counters from @src (laid out per table @src_r) into @dst (laid
+ * out per table @dst_r).
+ *
+ * The fixed part of the structure is copied wholesale; each non-zero
+ * replicas[] counter is then moved to the index its entry has in @dst_r.
+ * Every entry with a non-zero counter must exist in @dst_r, else BUG.
+ */
+static void __replicas_table_update(struct bch_fs_usage *dst,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage *src,
+				    struct bch_replicas_cpu *src_r)
+{
+	int from, to;
+
+	*dst = *src;
+
+	for (from = 0; from < src_r->nr; from++) {
+		if (src->replicas[from]) {
+			to = __replicas_entry_idx(dst_r,
+					cpu_replicas_entry(src_r, from));
+			BUG_ON(to < 0);
+
+			dst->replicas[to] = src->replicas[from];
+		}
+	}
+}
|
|
+
|
|
+/*
+ * Per-cpu variant of __replicas_table_update().
+ *
+ * The source counters are first collapsed with bch2_acc_percpu_u64s()
+ * (presumably summing all CPUs' copies into one; confirm against its
+ * definition) and the result is written into a single CPU's copy of @dst_p.
+ */
+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage __percpu *src_p,
+				    struct bch_replicas_cpu *src_r)
+{
+	/* u64 words to accumulate: the fixed struct plus one per replicas entry */
+	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+	struct bch_fs_usage *dst, *src = (void *)
+		bch2_acc_percpu_u64s((void *) src_p, src_nr);
+
+	/* the pointer is used after preemption is re-enabled: any one CPU's slot will do */
+	preempt_disable();
+	dst = this_cpu_ptr(dst_p);
+	preempt_enable();
+
+	__replicas_table_update(dst, dst_r, src, src_r);
+}
|
|
+
|
|
+/*
+ * Resize filesystem accounting:
+ *
+ * Allocates new usage counters sized for new_r->nr replicas entries, copies
+ * the existing counters across (remapping each replicas[] slot to its index
+ * in @new_r), then swaps the new buffers and *new_r into @c.
+ *
+ * After a successful swap the new_* locals and *new_r hold the previous
+ * buffers and table: the locals are freed at out:, while new_r->entries is
+ * left for the caller to free.
+ *
+ * Returns 0 on success, or -ENOMEM with @c and *new_r left unchanged.
+ */
+static int replicas_table_update(struct bch_fs *c,
+				 struct bch_replicas_cpu *new_r)
+{
+	struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR];
+	struct bch_fs_usage_online *new_scratch = NULL;
+	struct bch_fs_usage __percpu *new_gc = NULL;
+	struct bch_fs_usage *new_base = NULL;
+	unsigned i, bytes = sizeof(struct bch_fs_usage) +
+		sizeof(u64) * new_r->nr;
+	unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) +
+		sizeof(u64) * new_r->nr;
+	int ret = 0;
+
+	/* zeroed so the cleanup at out: can free_percpu() every slot unconditionally */
+	memset(new_usage, 0, sizeof(new_usage));
+
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		if (!(new_usage[i] = __alloc_percpu_gfp(bytes,
+					sizeof(u64), GFP_KERNEL)))
+			goto err;
+
+	/* a gc copy is only allocated if one currently exists */
+	if (!(new_base = kzalloc(bytes, GFP_KERNEL)) ||
+	    !(new_scratch  = kmalloc(scratch_bytes, GFP_KERNEL)) ||
+	    (c->usage_gc &&
+	     !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL))))
+		goto err;
+
+	/* all allocations succeeded: migrate existing counters to the new layout */
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		if (c->usage[i])
+			__replicas_table_update_pcpu(new_usage[i], new_r,
+						     c->usage[i], &c->replicas);
+	if (c->usage_base)
+		__replicas_table_update(new_base,		new_r,
+					c->usage_base,		&c->replicas);
+	if (c->usage_gc)
+		__replicas_table_update_pcpu(new_gc,		new_r,
+					     c->usage_gc,	&c->replicas);
+
+	/* install the new buffers; the locals now hold the old ones */
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		swap(c->usage[i],	new_usage[i]);
+	swap(c->usage_base,		new_base);
+	swap(c->usage_scratch,		new_scratch);
+	swap(c->usage_gc,		new_gc);
+	swap(c->replicas,		*new_r);
+out:
+	free_percpu(new_gc);
+	kfree(new_scratch);
+	for (i = 0; i < ARRAY_SIZE(new_usage); i++)
+		free_percpu(new_usage[i]);
+	kfree(new_base);
+	return ret;
+err:
+	bch_err(c, "error updating replicas table: memory allocation failure");
+	ret = -ENOMEM;
+	goto out;
+}
|
|
+
|
|
+/*
+ * Compute the journal reservation, in u64s, needed to write out all usage
+ * entries for table @r: one jset_entry_usage each for nr_inodes and
+ * key_version, BCH_REPLICAS_MAX of them for persistent_reserved, and one
+ * jset_entry_data_usage (plus its device list) per replicas entry.
+ *
+ * @c is not used by the computation.
+ */
+static unsigned reserve_journal_replicas(struct bch_fs *c,
+				     struct bch_replicas_cpu *r)
+{
+	struct bch_replicas_entry *e;
+	unsigned usage_u64s =
+		DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64));
+	unsigned total;
+
+	/* nr_inodes + key_version + persistent_reserved[BCH_REPLICAS_MAX]: */
+	total = usage_u64s * (2 + BCH_REPLICAS_MAX);
+
+	for_each_cpu_replicas_entry(r, e)
+		total += DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) +
+				      e->nr_devs, sizeof(u64));
+
+	return total;
+}
|
|
+
|
|
+/*
+ * Slow path of bch2_mark_replicas(): add @new_entry to the replicas tables.
+ *
+ * Under c->sb_lock, builds an extended copy of c->replicas_gc (only while a
+ * gc table exists) and of c->replicas, for whichever does not yet contain the
+ * entry.  If c->replicas grows, the superblock is updated and written before
+ * the in-memory table is swapped in under c->mark_lock.
+ *
+ * Returns 0 on success, -ENOMEM if a table copy could not be allocated, or
+ * the error from bch2_cpu_replicas_to_sb_replicas() / replicas_table_update().
+ */
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+				struct bch_replicas_entry *new_entry)
+{
+	struct bch_replicas_cpu new_r, new_gc;
+	int ret = 0;
+
+	verify_replicas_entry(new_entry);
+
+	memset(&new_r, 0, sizeof(new_r));
+	memset(&new_gc, 0, sizeof(new_gc));
+
+	mutex_lock(&c->sb_lock);
+
+	if (c->replicas_gc.entries &&
+	    !__replicas_has_entry(&c->replicas_gc, new_entry)) {
+		new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry);
+		if (!new_gc.entries) {
+			ret = -ENOMEM;
+			goto err;
+		}
+	}
+
+	if (!__replicas_has_entry(&c->replicas, new_entry)) {
+		new_r = cpu_replicas_add_entry(&c->replicas, new_entry);
+		if (!new_r.entries) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		/* keep this error code: it is not an allocation failure */
+		ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
+		if (ret)
+			goto err;
+
+		bch2_journal_entry_res_resize(&c->journal,
+				&c->replicas_journal_res,
+				reserve_journal_replicas(c, &new_r));
+	}
+
+	if (!new_r.entries &&
+	    !new_gc.entries)
+		goto out;
+
+	/* allocations done, now commit: */
+
+	if (new_r.entries)
+		bch2_write_super(c);
+
+	/* don't update in memory replicas until changes are persistent */
+	percpu_down_write(&c->mark_lock);
+	if (new_r.entries)
+		ret = replicas_table_update(c, &new_r);
+	if (new_gc.entries)
+		swap(new_gc, c->replicas_gc);
+	percpu_up_write(&c->mark_lock);
+out:
+	mutex_unlock(&c->sb_lock);
+
+	/* after a swap these hold the old tables; otherwise the unused new ones */
+	kfree(new_r.entries);
+	kfree(new_gc.entries);
+
+	return ret;
+err:
+	if (ret == -ENOMEM)
+		bch_err(c, "error adding replicas entry: memory allocation failure");
+	else
+		bch_err(c, "error adding replicas entry: error %i", ret);
+	goto out;
+}
|
|
+
|
|
+/*
+ * Ensure @r is recorded in the replicas tables, or just test for it.
+ *
+ * Returns 0 if the entry is already marked.  Otherwise returns -1 when @check
+ * is true (query only), or the result of adding it via the slow path.
+ */
+static int __bch2_mark_replicas(struct bch_fs *c,
+				struct bch_replicas_entry *r,
+				bool check)
+{
+	if (likely(bch2_replicas_marked(c, r)))
+		return 0;
+
+	if (check)
+		return -1;
+
+	return bch2_mark_replicas_slowpath(c, r);
+}
|
|
+
|
|
+/*
+ * Ensure @r is recorded in the replicas tables, adding it if necessary.
+ * Returns 0 on success or a negative error from the slow path.
+ */
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+{
+	return __bch2_mark_replicas(c, r, false);
+}
|
|
+
|
|
+/*
+ * Mark (or, with @check, only test) every replicas entry implied by key @k:
+ * one BCH_DATA_cached entry per cached device, the entry built by
+ * bch2_bkey_to_replicas(), and for parity entries the same device list again
+ * as BCH_DATA_cached and BCH_DATA_user.
+ *
+ * Returns 0 if everything is (now) marked, otherwise the first non-zero
+ * result from __bch2_mark_replicas().
+ */
+static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k,
+				     bool check)
+{
+	struct bch_replicas_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
+	int ret;
+
+	/*
+	 * bch2_bkey_to_replicas() leaves data_type unset for key types it does
+	 * not handle; zero the buffer so the data_type test below never reads
+	 * an indeterminate value.
+	 */
+	memset(&search, 0, sizeof(search));
+
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
+
+		ret = __bch2_mark_replicas(c, &search.e, check);
+		if (ret)
+			return ret;
+	}
+
+	bch2_bkey_to_replicas(&search.e, k);
+
+	ret = __bch2_mark_replicas(c, &search.e, check);
+	if (ret)
+		return ret;
+
+	if (search.e.data_type == BCH_DATA_parity) {
+		search.e.data_type = BCH_DATA_cached;
+		ret = __bch2_mark_replicas(c, &search.e, check);
+		if (ret)
+			return ret;
+
+		search.e.data_type = BCH_DATA_user;
+		ret = __bch2_mark_replicas(c, &search.e, check);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/* replicas delta list: */
|
|
+
|
|
+/*
+ * True iff every entry in delta list @r is present in c->replicas.
+ *
+ * Asserts that c->mark_lock is held.  Each entry's device list is sorted in
+ * place as a side effect of bch2_replicas_entry_idx().
+ */
+bool bch2_replicas_delta_list_marked(struct bch_fs *c,
+				     struct replicas_delta_list *r)
+{
+	struct replicas_delta *cur;
+	struct replicas_delta *end = (void *) r->d + r->used;
+
+	percpu_rwsem_assert_held(&c->mark_lock);
+
+	for (cur = r->d; cur != end; cur = replicas_delta_next(cur)) {
+		if (bch2_replicas_entry_idx(c, &cur->r) < 0)
+			return false;
+	}
+
+	return true;
+}
|
|
+
|
|
+/*
+ * Mark every entry in delta list @r via bch2_mark_replicas(), stopping at the
+ * first failure.  Returns 0 on success or that first error.
+ */
+int bch2_replicas_delta_list_mark(struct bch_fs *c,
+				  struct replicas_delta_list *r)
+{
+	struct replicas_delta *cur;
+	struct replicas_delta *end = (void *) r->d + r->used;
+
+	for (cur = r->d; cur != end; cur = replicas_delta_next(cur)) {
+		int ret = bch2_mark_replicas(c, &cur->r);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/* bkey replicas: */
|
|
+
|
|
+/* True iff every replicas entry implied by key @k is already marked. */
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+			       struct bkey_s_c k)
+{
+	return !__bch2_mark_bkey_replicas(c, k, true);
+}
|
|
+
|
|
+/*
+ * Mark every replicas entry implied by key @k, adding entries as needed.
+ * Returns 0 on success or the first error encountered.
+ */
+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+	return __bch2_mark_bkey_replicas(c, k, false);
+}
|
|
+
|
|
+/*
|
|
+ * Old replicas_gc mechanism: only used for journal replicas entries now, should
|
|
+ * die at some point:
|
|
+ */
|
|
+
|
|
+/*
+ * Finish a replicas GC pass started by bch2_replicas_gc_start().
+ *
+ * @ret is the result of the caller's GC walk.  If it is non-zero the pass is
+ * abandoned: the gc table is freed, nothing is committed and @ret is
+ * returned.  Otherwise every entry of c->replicas that still has non-zero
+ * usage is added back to the gc table, which then replaces c->replicas both
+ * in the superblock and in memory.
+ *
+ * The caller must hold c->replicas_gc_lock.  c->replicas_gc.entries is always
+ * freed and reset to NULL.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -ENOSPC if the
+ * superblock could not hold the table, or the caller's @ret.
+ */
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
+{
+	unsigned i;
+
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	mutex_lock(&c->sb_lock);
+	percpu_down_write(&c->mark_lock);
+
+	/* a failed walk produced an incomplete gc table: do not commit it */
+	if (ret)
+		goto err;
+
+	/*
+	 * this is kind of crappy; the replicas gc mechanism needs to be ripped
+	 * out
+	 */
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct bch_replicas_cpu n;
+
+		if (!__replicas_has_entry(&c->replicas_gc, e) &&
+		    bch2_fs_usage_read_one(c, &c->usage_base->replicas[i])) {
+			n = cpu_replicas_add_entry(&c->replicas_gc, e);
+			if (!n.entries) {
+				ret = -ENOMEM;
+				goto err;
+			}
+
+			/* install the grown table, free the one it replaces */
+			swap(n, c->replicas_gc);
+			kfree(n.entries);
+		}
+	}
+
+	if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
+		ret = -ENOSPC;
+		goto err;
+	}
+
+	/* on success c->replicas_gc holds the old table afterwards */
+	ret = replicas_table_update(c, &c->replicas_gc);
+err:
+	kfree(c->replicas_gc.entries);
+	c->replicas_gc.entries = NULL;
+
+	percpu_up_write(&c->mark_lock);
+
+	if (!ret)
+		bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Begin a replicas GC pass for the data types selected by @typemask (bit
+ * 1 << data_type).
+ *
+ * Builds c->replicas_gc as a copy of c->replicas containing only the entries
+ * whose type is NOT in @typemask; entries of the selected types must be
+ * re-marked during the pass to survive.
+ *
+ * The caller must hold c->replicas_gc_lock and no gc table may already exist.
+ * Returns 0 on success or -ENOMEM.
+ */
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+	struct bch_replicas_entry *e;
+	unsigned i = 0;
+
+	lockdep_assert_held(&c->replicas_gc_lock);
+
+	mutex_lock(&c->sb_lock);
+	BUG_ON(c->replicas_gc.entries);
+
+	c->replicas_gc.nr		= 0;
+	c->replicas_gc.entry_size	= 0;
+
+	/* first pass: count the entries to keep and find the widest one */
+	for_each_cpu_replicas_entry(&c->replicas, e)
+		if (!((1 << e->data_type) & typemask)) {
+			c->replicas_gc.nr++;
+			c->replicas_gc.entry_size =
+				max_t(unsigned, c->replicas_gc.entry_size,
+				      replicas_entry_bytes(e));
+		}
+
+	c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
+					 c->replicas_gc.entry_size,
+					 GFP_KERNEL);
+	if (!c->replicas_gc.entries) {
+		mutex_unlock(&c->sb_lock);
+		bch_err(c, "error allocating c->replicas_gc");
+		return -ENOMEM;
+	}
+
+	/* second pass: copy the kept entries at the (possibly narrower) gc width */
+	for_each_cpu_replicas_entry(&c->replicas, e)
+		if (!((1 << e->data_type) & typemask))
+			memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
+			       e, c->replicas_gc.entry_size);
+
+	bch2_cpu_replicas_sort(&c->replicas_gc);
+	mutex_unlock(&c->sb_lock);
+
+	return 0;
+}
|
|
+
|
|
+/* New much simpler mechanism for clearing out unneeded replicas entries: */
|
|
+
|
|
+/*
+ * Drop replicas entries that are no longer in use.
+ *
+ * An entry is kept if it is a journal entry or if any usage counter for it
+ * (c->usage_base or any element of c->usage[]) is non-zero.  The surviving
+ * entries are written to the superblock and swapped in as c->replicas.
+ *
+ * The replacement table is allocated before the locks are taken; if the
+ * table's size changed in the meantime the allocation is redone.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, or -ENOSPC if the
+ * superblock could not hold the table.
+ */
+int bch2_replicas_gc2(struct bch_fs *c)
+{
+	struct bch_replicas_cpu new = { 0 };
+	unsigned i, nr;
+	int ret = 0;
+
+	bch2_journal_meta(&c->journal);
+retry:
+	nr		= READ_ONCE(c->replicas.nr);
+	new.entry_size	= READ_ONCE(c->replicas.entry_size);
+	new.entries	= kcalloc(nr, new.entry_size, GFP_KERNEL);
+	if (!new.entries) {
+		bch_err(c, "error allocating c->replicas_gc");
+		return -ENOMEM;
+	}
+
+	mutex_lock(&c->sb_lock);
+	percpu_down_write(&c->mark_lock);
+
+	/* the table changed size while we were unlocked: reallocate */
+	if (nr			!= c->replicas.nr ||
+	    new.entry_size	!= c->replicas.entry_size) {
+		percpu_up_write(&c->mark_lock);
+		mutex_unlock(&c->sb_lock);
+		kfree(new.entries);
+		goto retry;
+	}
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		bool keep = e->data_type == BCH_DATA_journal ||
+			    c->usage_base->replicas[i];
+		unsigned j;
+
+		/*
+		 * Check every per-cpu usage copy rather than a fixed four, so
+		 * this stays in step with how c->usage[] is sized elsewhere.
+		 */
+		for (j = 0; !keep && j < ARRAY_SIZE(c->usage); j++)
+			keep = percpu_u64_get(&c->usage[j]->replicas[i]) != 0;
+
+		if (keep)
+			memcpy(cpu_replicas_entry(&new, new.nr++),
+			       e, new.entry_size);
+	}
+
+	bch2_cpu_replicas_sort(&new);
+
+	if (bch2_cpu_replicas_to_sb_replicas(c, &new)) {
+		ret = -ENOSPC;
+		goto err;
+	}
+
+	ret = replicas_table_update(c, &new);
+err:
+	/* after a successful update this is the old table */
+	kfree(new.entries);
+
+	percpu_up_write(&c->mark_lock);
+
+	if (!ret)
+		bch2_write_super(c);
+
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Set the base usage counter for replicas entry @r to @sectors, adding the
+ * entry to c->replicas first if it is not already there.
+ *
+ * @r's device list is sorted in place by the lookup.
+ * Returns 0 on success or -ENOMEM.
+ */
+int bch2_replicas_set_usage(struct bch_fs *c,
+			    struct bch_replicas_entry *r,
+			    u64 sectors)
+{
+	int ret, idx = bch2_replicas_entry_idx(c, r);
+
+	if (idx < 0) {
+		struct bch_replicas_cpu n;
+
+		n = cpu_replicas_add_entry(&c->replicas, r);
+		if (!n.entries)
+			return -ENOMEM;
+
+		ret = replicas_table_update(c, &n);
+
+		/*
+		 * On success n now holds the old table; on failure it still
+		 * holds the new one that was never installed.  Free it either
+		 * way.
+		 */
+		kfree(n.entries);
+
+		if (ret)
+			return ret;
+
+		/* the entry was just added, so the lookup must succeed */
+		idx = bch2_replicas_entry_idx(c, r);
+		BUG_ON(idx < 0);
+	}
+
+	c->usage_base->replicas[idx] = sectors;
+
+	return 0;
+}
|
|
+
|
|
+/* Replicas tracking - superblock: */
|
|
+
|
|
+/*
+ * Build an in-memory replicas table in @cpu_r from superblock field @sb_r.
+ *
+ * Each slot is as wide as the largest superblock entry.  Every copied
+ * entry has its device list sorted; the table itself is NOT sorted here.
+ *
+ * Returns 0, or -ENOMEM with cpu_r->entries == NULL.
+ */
+static int
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
+				   struct bch_replicas_cpu *cpu_r)
+{
+	struct bch_replicas_entry *src, *slot;
+	unsigned count = 0, widest = 0, next = 0;
+
+	/* first pass: size the table */
+	for_each_replicas_entry(sb_r, src) {
+		widest = max_t(unsigned, widest,
+			       replicas_entry_bytes(src));
+		count++;
+	}
+
+	cpu_r->entries = kcalloc(count, widest, GFP_KERNEL);
+	if (!cpu_r->entries)
+		return -ENOMEM;
+
+	cpu_r->nr		= count;
+	cpu_r->entry_size	= widest;
+
+	/* second pass: copy and normalise each entry */
+	for_each_replicas_entry(sb_r, src) {
+		slot = cpu_replicas_entry(cpu_r, next++);
+		memcpy(slot, src, replicas_entry_bytes(src));
+		bch2_replicas_entry_sort(slot);
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Build an in-memory replicas table in @cpu_r from the older v0 superblock
+ * field @sb_r.
+ *
+ * v0 entries carry no nr_required, so every converted entry gets
+ * nr_required = 1 and slots are widened by the size difference between the
+ * two entry structs.  Device lists are sorted; the table itself is NOT
+ * sorted here.
+ *
+ * Returns 0, or -ENOMEM with cpu_r->entries == NULL.
+ */
+static int
+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
+				      struct bch_replicas_cpu *cpu_r)
+{
+	struct bch_replicas_entry_v0 *e;
+	unsigned nr = 0, entry_size = 0, idx = 0;
+
+	/*
+	 * NOTE(review): this iterates with for_each_replicas_entry() while
+	 * bch2_sb_validate_replicas_v0() uses for_each_replicas_entry_v0() on
+	 * the same field type; confirm the two macros are interchangeable here.
+	 */
+	for_each_replicas_entry(sb_r, e) {
+		entry_size = max_t(unsigned, entry_size,
+				   replicas_entry_bytes(e));
+		nr++;
+	}
+
+	/* account for the fields the current entry format has and v0 lacks */
+	entry_size += sizeof(struct bch_replicas_entry) -
+		sizeof(struct bch_replicas_entry_v0);
+
+	cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
+	if (!cpu_r->entries)
+		return -ENOMEM;
+
+	cpu_r->nr		= nr;
+	cpu_r->entry_size	= entry_size;
+
+	for_each_replicas_entry(sb_r, e) {
+		struct bch_replicas_entry *dst =
+			cpu_replicas_entry(cpu_r, idx++);
+
+		dst->data_type	= e->data_type;
+		dst->nr_devs	= e->nr_devs;
+		dst->nr_required = 1;
+		memcpy(dst->devs, e->devs, e->nr_devs);
+		bch2_replicas_entry_sort(dst);
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Load the replicas table from the superblock into @c.
+ *
+ * Uses the current-format field if present, otherwise the v0 field; with
+ * neither present the table ends up empty.  The table is sorted and
+ * installed under c->mark_lock.
+ *
+ * Returns 0 on success or -ENOMEM, including when resizing the usage
+ * counters for the new table fails.
+ */
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+	struct bch_sb_field_replicas *sb_v1;
+	struct bch_sb_field_replicas_v0 *sb_v0;
+	struct bch_replicas_cpu new_r = { 0, 0, NULL };
+	int ret = 0;
+
+	if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb)))
+		ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r);
+	else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb)))
+		ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r);
+
+	if (ret)
+		return -ENOMEM;
+
+	bch2_cpu_replicas_sort(&new_r);
+
+	percpu_down_write(&c->mark_lock);
+
+	ret = replicas_table_update(c, &new_r);
+	percpu_up_write(&c->mark_lock);
+
+	/* the previous table on success, the uninstalled new one on failure */
+	kfree(new_r.entries);
+
+	/* report a failed table update instead of claiming success */
+	return ret;
+}
|
|
+
|
|
+/*
+ * Write in-memory table @r to the superblock in the v0 format, replacing any
+ * current-format replicas field.  nr_required is not stored in v0.
+ *
+ * Returns 0, or -ENOSPC if the superblock field could not be resized.
+ */
+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
+					       struct bch_replicas_cpu *r)
+{
+	struct bch_sb_field_replicas_v0 *sb_r;
+	struct bch_replicas_entry_v0 *dst;
+	struct bch_replicas_entry *src;
+	size_t bytes;
+
+	/*
+	 * NOTE(review): this takes the size of the non-v0 field struct although
+	 * a v0 field is being sized; confirm both headers have the same size.
+	 */
+	bytes = sizeof(struct bch_sb_field_replicas);
+
+	/* one byte less per entry than the in-memory entry size */
+	for_each_cpu_replicas_entry(r, src)
+		bytes += replicas_entry_bytes(src) - 1;
+
+	sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb,
+			DIV_ROUND_UP(bytes, sizeof(u64)));
+	if (!sb_r)
+		return -ENOSPC;
+
+	/* fetch the field again after deleting the other one */
+	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas);
+	sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb);
+
+	memset(&sb_r->entries, 0,
+	       vstruct_end(&sb_r->field) -
+	       (void *) &sb_r->entries);
+
+	dst = sb_r->entries;
+	for_each_cpu_replicas_entry(r, src) {
+		dst->data_type	= src->data_type;
+		dst->nr_devs	= src->nr_devs;
+		memcpy(dst->devs, src->devs, src->nr_devs);
+
+		dst = replicas_entry_next(dst);
+
+		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Write in-memory table @r to the superblock.
+ *
+ * If every entry has nr_required == 1 the v0 format is used; otherwise the
+ * current format is written and any v0 field is deleted.
+ *
+ * Returns 0, or -ENOSPC if the superblock field could not be resized.
+ */
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+					    struct bch_replicas_cpu *r)
+{
+	struct bch_sb_field_replicas *sb_r;
+	struct bch_replicas_entry *dst, *src;
+	bool need_v1 = false;
+	size_t bytes;
+
+	bytes = sizeof(struct bch_sb_field_replicas);
+
+	/* total the entry sizes and detect anything v0 cannot represent */
+	for_each_cpu_replicas_entry(r, src) {
+		bytes += replicas_entry_bytes(src);
+		if (src->nr_required != 1)
+			need_v1 = true;
+	}
+
+	if (!need_v1)
+		return bch2_cpu_replicas_to_sb_replicas_v0(c, r);
+
+	sb_r = bch2_sb_resize_replicas(&c->disk_sb,
+			DIV_ROUND_UP(bytes, sizeof(u64)));
+	if (!sb_r)
+		return -ENOSPC;
+
+	/* fetch the field again after deleting the other one */
+	bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0);
+	sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
+
+	memset(&sb_r->entries, 0,
+	       vstruct_end(&sb_r->field) -
+	       (void *) &sb_r->entries);
+
+	dst = sb_r->entries;
+	for_each_cpu_replicas_entry(r, src) {
+		memcpy(dst, src, replicas_entry_bytes(src));
+
+		dst = replicas_entry_next(dst);
+
+		BUG_ON((void *) dst > vstruct_end(&sb_r->field));
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Check table @cpu_r for duplicate entries.
+ *
+ * Sorts the table in place with memcmp ordering, then compares neighbours.
+ * Returns an error string if two entries are byte-identical, NULL otherwise.
+ */
+static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r)
+{
+	unsigned next;
+
+	sort_cmp_size(cpu_r->entries,
+		      cpu_r->nr,
+		      cpu_r->entry_size,
+		      memcmp, NULL);
+
+	for (next = 1; next < cpu_r->nr; next++) {
+		struct bch_replicas_entry *prev_e =
+			cpu_replicas_entry(cpu_r, next - 1);
+		struct bch_replicas_entry *next_e =
+			cpu_replicas_entry(cpu_r, next);
+		int cmp = memcmp(prev_e, next_e, cpu_r->entry_size);
+
+		/* the sort above guarantees non-descending order */
+		BUG_ON(cmp > 0);
+
+		if (!cmp)
+			return "duplicate replicas entry";
+	}
+
+	return NULL;
+}
|
|
+
|
|
+/*
+ * Validate the current-format replicas superblock field.
+ *
+ * Checks every entry's data type, device count, nr_required and device
+ * indices, then converts to an in-memory table to look for duplicates.
+ * Returns NULL if valid, otherwise a static string describing the problem.
+ */
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+	struct bch_replicas_cpu cpu_r = { .entries = NULL };
+	struct bch_replicas_entry *e;
+	const char *err;
+	unsigned i;
+
+	/* err is set before each check so a failing goto reports that check */
+	for_each_replicas_entry(sb_r, e) {
+		err = "invalid replicas entry: invalid data type";
+		if (e->data_type >= BCH_DATA_NR)
+			goto err;
+
+		err = "invalid replicas entry: no devices";
+		if (!e->nr_devs)
+			goto err;
+
+		err = "invalid replicas entry: bad nr_required";
+		if (e->nr_required > 1 &&
+		    e->nr_required >= e->nr_devs)
+			goto err;
+
+		err = "invalid replicas entry: invalid device";
+		for (i = 0; i < e->nr_devs; i++)
+			if (!bch2_dev_exists(sb, mi, e->devs[i]))
+				goto err;
+	}
+
+	err = "cannot allocate memory";
+	if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r))
+		goto err;
+
+	/* NULL here means the field passed every check */
+	err = check_dup_replicas_entries(&cpu_r);
+err:
+	kfree(cpu_r.entries);
+	return err;
+}
|
|
+
|
|
+/*
+ * Append every entry of the superblock replicas field @f to @out, each
+ * printed by bch2_replicas_entry_to_text() and separated by a single space.
+ * @sb is not used.
+ */
+static void bch2_sb_replicas_to_text(struct printbuf *out,
+				     struct bch_sb *sb,
+				     struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas *r = field_to_type(f, replicas);
+	struct bch_replicas_entry *e;
+	bool need_sep = false;
+
+	for_each_replicas_entry(r, e) {
+		if (need_sep)
+			pr_buf(out, " ");
+		need_sep = true;
+
+		bch2_replicas_entry_to_text(out, e);
+	}
+}
|
|
+
|
|
+/* Superblock field operations for the current-format replicas field. */
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+	.validate	= bch2_sb_validate_replicas,
+	.to_text	= bch2_sb_replicas_to_text,
+};
|
|
+
|
|
+/*
+ * Validate the v0 replicas superblock field.
+ *
+ * Same checks as bch2_sb_validate_replicas() except nr_required, which v0
+ * entries do not have.  Returns NULL if valid, otherwise a static string
+ * describing the problem.
+ */
+static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f)
+{
+	struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
+	struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+	struct bch_replicas_cpu cpu_r = { .entries = NULL };
+	struct bch_replicas_entry_v0 *e;
+	const char *err;
+	unsigned i;
+
+	/* err is set before each check so a failing goto reports that check */
+	for_each_replicas_entry_v0(sb_r, e) {
+		err = "invalid replicas entry: invalid data type";
+		if (e->data_type >= BCH_DATA_NR)
+			goto err;
+
+		err = "invalid replicas entry: no devices";
+		if (!e->nr_devs)
+			goto err;
+
+		err = "invalid replicas entry: invalid device";
+		for (i = 0; i < e->nr_devs; i++)
+			if (!bch2_dev_exists(sb, mi, e->devs[i]))
+				goto err;
+	}
+
+	err = "cannot allocate memory";
+	if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r))
+		goto err;
+
+	/* NULL here means the field passed every check */
+	err = check_dup_replicas_entries(&cpu_r);
+err:
+	kfree(cpu_r.entries);
+	return err;
+}
|
|
+
|
|
+/* Superblock field operations for the v0 replicas field (no to_text hook). */
+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
+	.validate	= bch2_sb_validate_replicas_v0,
+};
|
|
+
|
|
+/* Query replicas: */
|
|
+
|
|
+/*
+ * Decide whether the devices in @devs suffice for every replicas entry,
+ * given the BCH_FORCE_IF_* conditions the caller accepts in @flags.
+ *
+ * For each entry (skipping those whose devices have all failed), "lost" is
+ * raised when fewer than nr_required of its devices are in @devs and
+ * "degraded" when fewer than all of them are, using the metadata or data
+ * variant according to the entry's data type.  The first entry raising a
+ * condition not allowed by @flags makes the function return false, logging
+ * the entry when @print is set.
+ *
+ * Takes c->mark_lock for read.
+ */
+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
+			   unsigned flags, bool print)
+{
+	struct bch_replicas_entry *e;
+	bool ret = true;
+
+	percpu_down_read(&c->mark_lock);
+	for_each_cpu_replicas_entry(&c->replicas, e) {
+		unsigned i, nr_online = 0, nr_failed = 0, dflags = 0;
+		/* data types ordered before BCH_DATA_user are treated as metadata */
+		bool metadata = e->data_type < BCH_DATA_user;
+
+		for (i = 0; i < e->nr_devs; i++) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]);
+
+			nr_online += test_bit(e->devs[i], devs.d);
+			nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed;
+		}
+
+		/* every device of this entry has failed: nothing to require */
+		if (nr_failed == e->nr_devs)
+			continue;
+
+		if (nr_online < e->nr_required)
+			dflags |= metadata
+				? BCH_FORCE_IF_METADATA_LOST
+				: BCH_FORCE_IF_DATA_LOST;
+
+		if (nr_online < e->nr_devs)
+			dflags |= metadata
+				? BCH_FORCE_IF_METADATA_DEGRADED
+				: BCH_FORCE_IF_DATA_DEGRADED;
+
+		/* any condition raised that the caller did not allow? */
+		if (dflags & ~flags) {
+			if (print) {
+				char buf[100];
+
+				bch2_replicas_entry_to_text(&PBUF(buf), e);
+				bch_err(c, "insufficient devices online (%u) for replicas entry %s",
+					nr_online, buf);
+			}
+			ret = false;
+			break;
+		}
+
+	}
+	percpu_up_read(&c->mark_lock);
+
+	return ret;
+}
|
|
+
|
|
+/*
+ * Return a bitmask (bit 1 << data_type) of the data types for which some
+ * replicas entry lists device @ca.  Takes c->mark_lock for read.
+ */
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_replicas_entry *e;
+	unsigned dev, types = 0;
+
+	percpu_down_read(&c->mark_lock);
+
+	for_each_cpu_replicas_entry(&c->replicas, e) {
+		for (dev = 0; dev < e->nr_devs; dev++) {
+			if (e->devs[dev] != ca->dev_idx)
+				continue;
+
+			types |= 1 << e->data_type;
+		}
+	}
+
+	percpu_up_read(&c->mark_lock);
+
+	return types;
+}
|
|
+
|
|
+/*
+ * Free the replicas state of @c: the usage scratch buffer, every per-cpu
+ * usage copy, the base usage, both replicas tables and the delta mempool.
+ */
+void bch2_fs_replicas_exit(struct bch_fs *c)
+{
+	unsigned i;
+
+	kfree(c->usage_scratch);
+	for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+		free_percpu(c->usage[i]);
+	kfree(c->usage_base);
+	kfree(c->replicas.entries);
+	kfree(c->replicas_gc.entries);
+
+	mempool_exit(&c->replicas_delta_pool);
+}
|
|
+
|
|
+/*
+ * Set up the replicas state of @c: size the journal reservation for the
+ * current table, create the delta-list mempool, and allocate usage counters
+ * sized for c->replicas.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int bch2_fs_replicas_init(struct bch_fs *c)
+{
+	int ret;
+
+	bch2_journal_entry_res_resize(&c->journal,
+			&c->replicas_journal_res,
+			reserve_journal_replicas(c, &c->replicas));
+
+	ret = mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
+					REPLICAS_DELTA_LIST_MAX);
+	if (ret)
+		return ret;
+
+	return replicas_table_update(c, &c->replicas);
+}
|
|
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
|
|
new file mode 100644
|
|
index 000000000000..72ac544f16d8
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/replicas.h
|
|
@@ -0,0 +1,108 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_REPLICAS_H
|
|
+#define _BCACHEFS_REPLICAS_H
|
|
+
|
|
+#include "eytzinger.h"
|
|
+#include "replicas_types.h"
|
|
+
|
|
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
|
|
+void bch2_replicas_entry_to_text(struct printbuf *,
|
|
+ struct bch_replicas_entry *);
|
|
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
|
|
+
|
|
+static inline struct bch_replicas_entry *
|
|
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
|
|
+{
|
|
+ return (void *) r->entries + r->entry_size * i;
|
|
+}
|
|
+
|
|
+int bch2_replicas_entry_idx(struct bch_fs *,
|
|
+ struct bch_replicas_entry *);
|
|
+
|
|
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
|
|
+ enum bch_data_type,
|
|
+ struct bch_devs_list);
|
|
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
|
|
+int bch2_mark_replicas(struct bch_fs *,
|
|
+ struct bch_replicas_entry *);
|
|
+
|
|
/*
 * A replicas entry paired with a signed 64-bit delta. The struct is packed,
 * so @r begins directly after @delta; records are stepped over with
 * replicas_delta_next().
 */
struct replicas_delta {
	s64			delta;
	struct bch_replicas_entry r;
} __packed;
|
|
+
|
|
/*
 * Header plus trailing array of replicas_delta records.
 *
 * NOTE(review): memset_start/memset_end are empty marker members; presumably
 * they bracket the region that callers clear with memset() - confirm against
 * the users of this struct.
 */
struct replicas_delta_list {
	unsigned		size;
	unsigned		used;

	struct			{} memset_start;
	u64			nr_inodes;
	u64			persistent_reserved[BCH_REPLICAS_MAX];
	struct			{} memset_end;
	struct replicas_delta	d[0];
};
|
|
+
|
|
+static inline struct replicas_delta *
|
|
+replicas_delta_next(struct replicas_delta *d)
|
|
+{
|
|
+ return (void *) d + replicas_entry_bytes(&d->r) + 8;
|
|
+}
|
|
+
|
|
+bool bch2_replicas_delta_list_marked(struct bch_fs *, struct replicas_delta_list *);
|
|
+int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
|
|
+
|
|
+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
|
|
+bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c);
|
|
+int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
|
|
+ unsigned dev)
|
|
+{
|
|
+ e->data_type = BCH_DATA_cached;
|
|
+ e->nr_devs = 1;
|
|
+ e->nr_required = 1;
|
|
+ e->devs[0] = dev;
|
|
+}
|
|
+
|
|
+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
|
|
+ unsigned, bool);
|
|
+
|
|
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
|
|
+
|
|
+int bch2_replicas_gc_end(struct bch_fs *, int);
|
|
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
|
|
+int bch2_replicas_gc2(struct bch_fs *);
|
|
+
|
|
+int bch2_replicas_set_usage(struct bch_fs *,
|
|
+ struct bch_replicas_entry *,
|
|
+ u64);
|
|
+
|
|
/*
 * Iterate @_i over every entry of the in-memory replicas table @_r, advancing
 * by the table's fixed entry_size and stopping after nr entries.
 */
#define for_each_cpu_replicas_entry(_r, _i)				\
	for (_i = (_r)->entries;					\
	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
	     _i = (void *) (_i) + (_r)->entry_size)
|
|
+
|
|
+/* iterate over superblock replicas - used by userspace tools: */
|
|
+
|
|
/* Advance past a variable-length entry, using replicas_entry_bytes(). */
#define replicas_entry_next(_i)						\
	((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))

/*
 * Walk the variable-length entries of superblock replicas field @_r; the
 * walk stops at the end of the field or at an entry whose data_type is 0.
 */
#define for_each_replicas_entry(_r, _i)					\
	for (_i = (_r)->entries;					\
	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
	     (_i) = replicas_entry_next(_i))

/* Same walk as for_each_replicas_entry(), named for the v0 field type. */
#define for_each_replicas_entry_v0(_r, _i)				\
	for (_i = (_r)->entries;					\
	     (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
	     (_i) = replicas_entry_next(_i))
|
|
+
|
|
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
|
|
+
|
|
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
|
|
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
|
|
+
|
|
+void bch2_fs_replicas_exit(struct bch_fs *);
|
|
+int bch2_fs_replicas_init(struct bch_fs *);
|
|
+
|
|
+#endif /* _BCACHEFS_REPLICAS_H */
|
|
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
|
|
new file mode 100644
|
|
index 000000000000..0535b1d3760e
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/replicas_types.h
|
|
@@ -0,0 +1,10 @@
|
|
+#ifndef _BCACHEFS_REPLICAS_TYPES_H
|
|
+#define _BCACHEFS_REPLICAS_TYPES_H
|
|
+
|
|
/*
 * In-memory replicas table: @nr entries of @entry_size bytes each, stored
 * back to back starting at @entries.
 */
struct bch_replicas_cpu {
	unsigned		nr;
	unsigned		entry_size;
	struct bch_replicas_entry *entries;
};
|
|
+
|
|
+#endif /* _BCACHEFS_REPLICAS_TYPES_H */
|
|
diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c
|
|
new file mode 100644
|
|
index 000000000000..c062edb3fbc2
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/siphash.c
|
|
@@ -0,0 +1,173 @@
|
|
+// SPDX-License-Identifier: BSD-3-Clause
|
|
+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */
|
|
+
|
|
+/*-
|
|
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
|
|
+ * All rights reserved.
|
|
+ *
|
|
+ * Redistribution and use in source and binary forms, with or without
|
|
+ * modification, are permitted provided that the following conditions
|
|
+ * are met:
|
|
+ * 1. Redistributions of source code must retain the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer.
|
|
+ * 2. Redistributions in binary form must reproduce the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer in the
|
|
+ * documentation and/or other materials provided with the distribution.
|
|
+ * 3. The name of the author may not be used to endorse or promote
|
|
+ * products derived from this software without specific prior written
|
|
+ * permission.
|
|
+ *
|
|
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
+ * SUCH DAMAGE.
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d
|
|
+ * are the number of compression rounds and the number of finalization rounds.
|
|
+ * A compression round is identical to a finalization round and this round
|
|
+ * function is called SipRound. Given a 128-bit key k and a (possibly empty)
|
|
+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m).
|
|
+ *
|
|
+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18,
|
|
+ * by Jean-Philippe Aumasson and Daniel J. Bernstein,
|
|
+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa
|
|
+ * https://131002.net/siphash/siphash.pdf
|
|
+ * https://131002.net/siphash/
|
|
+ */
|
|
+
|
|
+#include <asm/byteorder.h>
|
|
+#include <asm/unaligned.h>
|
|
+#include <linux/bitops.h>
|
|
+#include <linux/string.h>
|
|
+
|
|
+#include "siphash.h"
|
|
+
|
|
+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds)
|
|
+{
|
|
+ while (rounds--) {
|
|
+ ctx->v[0] += ctx->v[1];
|
|
+ ctx->v[2] += ctx->v[3];
|
|
+ ctx->v[1] = rol64(ctx->v[1], 13);
|
|
+ ctx->v[3] = rol64(ctx->v[3], 16);
|
|
+
|
|
+ ctx->v[1] ^= ctx->v[0];
|
|
+ ctx->v[3] ^= ctx->v[2];
|
|
+ ctx->v[0] = rol64(ctx->v[0], 32);
|
|
+
|
|
+ ctx->v[2] += ctx->v[1];
|
|
+ ctx->v[0] += ctx->v[3];
|
|
+ ctx->v[1] = rol64(ctx->v[1], 17);
|
|
+ ctx->v[3] = rol64(ctx->v[3], 21);
|
|
+
|
|
+ ctx->v[1] ^= ctx->v[2];
|
|
+ ctx->v[3] ^= ctx->v[0];
|
|
+ ctx->v[2] = rol64(ctx->v[2], 32);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds)
|
|
+{
|
|
+ u64 m = get_unaligned_le64(ptr);
|
|
+
|
|
+ ctx->v[3] ^= m;
|
|
+ SipHash_Rounds(ctx, rounds);
|
|
+ ctx->v[0] ^= m;
|
|
+}
|
|
+
|
|
+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key)
|
|
+{
|
|
+ u64 k0, k1;
|
|
+
|
|
+ k0 = le64_to_cpu(key->k0);
|
|
+ k1 = le64_to_cpu(key->k1);
|
|
+
|
|
+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0;
|
|
+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1;
|
|
+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0;
|
|
+ ctx->v[3] = 0x7465646279746573ULL ^ k1;
|
|
+
|
|
+ memset(ctx->buf, 0, sizeof(ctx->buf));
|
|
+ ctx->bytes = 0;
|
|
+}
|
|
+
|
|
+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf,
|
|
+ const void *src, size_t len)
|
|
+{
|
|
+ const u8 *ptr = src;
|
|
+ size_t left, used;
|
|
+
|
|
+ if (len == 0)
|
|
+ return;
|
|
+
|
|
+ used = ctx->bytes % sizeof(ctx->buf);
|
|
+ ctx->bytes += len;
|
|
+
|
|
+ if (used > 0) {
|
|
+ left = sizeof(ctx->buf) - used;
|
|
+
|
|
+ if (len >= left) {
|
|
+ memcpy(&ctx->buf[used], ptr, left);
|
|
+ SipHash_CRounds(ctx, ctx->buf, rc);
|
|
+ len -= left;
|
|
+ ptr += left;
|
|
+ } else {
|
|
+ memcpy(&ctx->buf[used], ptr, len);
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ while (len >= sizeof(ctx->buf)) {
|
|
+ SipHash_CRounds(ctx, ptr, rc);
|
|
+ len -= sizeof(ctx->buf);
|
|
+ ptr += sizeof(ctx->buf);
|
|
+ }
|
|
+
|
|
+ if (len > 0)
|
|
+ memcpy(&ctx->buf[used], ptr, len);
|
|
+}
|
|
+
|
|
+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf)
|
|
+{
|
|
+ u64 r;
|
|
+
|
|
+ r = SipHash_End(ctx, rc, rf);
|
|
+
|
|
+ *((__le64 *) dst) = cpu_to_le64(r);
|
|
+}
|
|
+
|
|
+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf)
|
|
+{
|
|
+ u64 r;
|
|
+ size_t left, used;
|
|
+
|
|
+ used = ctx->bytes % sizeof(ctx->buf);
|
|
+ left = sizeof(ctx->buf) - used;
|
|
+ memset(&ctx->buf[used], 0, left - 1);
|
|
+ ctx->buf[7] = ctx->bytes;
|
|
+
|
|
+ SipHash_CRounds(ctx, ctx->buf, rc);
|
|
+ ctx->v[2] ^= 0xff;
|
|
+ SipHash_Rounds(ctx, rf);
|
|
+
|
|
+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]);
|
|
+ memset(ctx, 0, sizeof(*ctx));
|
|
+ return (r);
|
|
+}
|
|
+
|
|
+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len)
|
|
+{
|
|
+ SIPHASH_CTX ctx;
|
|
+
|
|
+ SipHash_Init(&ctx, key);
|
|
+ SipHash_Update(&ctx, rc, rf, src, len);
|
|
+ return SipHash_End(&ctx, rc, rf);
|
|
+}
|
|
diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h
|
|
new file mode 100644
|
|
index 000000000000..3dfaf34a43b2
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/siphash.h
|
|
@@ -0,0 +1,87 @@
|
|
+/* SPDX-License-Identifier: BSD-3-Clause */
|
|
+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */
|
|
+/*-
|
|
+ * Copyright (c) 2013 Andre Oppermann <andre@FreeBSD.org>
|
|
+ * All rights reserved.
|
|
+ *
|
|
+ * Redistribution and use in source and binary forms, with or without
|
|
+ * modification, are permitted provided that the following conditions
|
|
+ * are met:
|
|
+ * 1. Redistributions of source code must retain the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer.
|
|
+ * 2. Redistributions in binary form must reproduce the above copyright
|
|
+ * notice, this list of conditions and the following disclaimer in the
|
|
+ * documentation and/or other materials provided with the distribution.
|
|
+ * 3. The name of the author may not be used to endorse or promote
|
|
+ * products derived from this software without specific prior written
|
|
+ * permission.
|
|
+ *
|
|
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
+ * SUCH DAMAGE.
|
|
+ *
|
|
+ * $FreeBSD$
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions)
|
|
+ * optimized for speed on short messages returning a 64bit hash/digest value.
|
|
+ *
|
|
+ * The number of rounds is chosen by the macro family used for each call:
|
|
+ * SipHash24_*() for the fast and reasonably strong version
|
|
+ * SipHash48_*() for the strong version (half as fast)
|
|
+ *
|
|
+ * SIPHASH_CTX ctx;
|
|
+ * SIPHASH_KEY key = { .k0 = ..., .k1 = ... };
|
|
+ * SipHash24_Init(&ctx, &key);
|
|
+ * SipHash24_Update(&ctx, pointer_to_string, length_of_string);
|
|
+ * SipHash24_Final(output, &ctx);
|
|
+ */
|
|
+
|
|
+#ifndef _SIPHASH_H_
|
|
+#define _SIPHASH_H_
|
|
+
|
|
+#include <linux/types.h>
|
|
+
|
|
+#define SIPHASH_BLOCK_LENGTH 8
|
|
+#define SIPHASH_KEY_LENGTH 16
|
|
+#define SIPHASH_DIGEST_LENGTH 8
|
|
+
|
|
/* Streaming hash state. */
typedef struct _SIPHASH_CTX {
	u64		v[4];				/* internal state words */
	u8		buf[SIPHASH_BLOCK_LENGTH];	/* pending partial block */
	u32		bytes;				/* total bytes absorbed so far */
} SIPHASH_CTX;
|
|
+
|
|
/* 128-bit key, held as two little-endian 64-bit halves. */
typedef struct {
	__le64		k0;
	__le64		k1;
} SIPHASH_KEY;
|
|
+
|
|
+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *);
|
|
+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t);
|
|
+u64 SipHash_End(SIPHASH_CTX *, int, int);
|
|
+void SipHash_Final(void *, SIPHASH_CTX *, int, int);
|
|
+u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t);
|
|
+
|
|
/*
 * SipHash-2-4 wrappers: 2 compression rounds, 4 finalization rounds.
 * Initialization does not depend on the round counts, so the Init macro is
 * the same for both families.
 */
#define SipHash24_Init(_c, _k)		SipHash_Init((_c), (_k))
#define SipHash24_Update(_c, _p, _l)	SipHash_Update((_c), 2, 4, (_p), (_l))
#define SipHash24_End(_d)		SipHash_End((_d), 2, 4)
#define SipHash24_Final(_d, _c)		SipHash_Final((_d), (_c), 2, 4)
#define SipHash24(_k, _p, _l)		SipHash((_k), 2, 4, (_p), (_l))

/* SipHash-4-8 wrappers: 4 compression rounds, 8 finalization rounds. */
#define SipHash48_Init(_c, _k)		SipHash_Init((_c), (_k))
#define SipHash48_Update(_c, _p, _l)	SipHash_Update((_c), 4, 8, (_p), (_l))
#define SipHash48_End(_d)		SipHash_End((_d), 4, 8)
#define SipHash48_Final(_d, _c)		SipHash_Final((_d), (_c), 4, 8)
#define SipHash48(_k, _p, _l)		SipHash((_k), 4, 8, (_p), (_l))
|
|
+
|
|
+#endif /* _SIPHASH_H_ */
|
|
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
|
|
new file mode 100644
|
|
index 000000000000..b85f895de346
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/str_hash.h
|
|
@@ -0,0 +1,331 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_STR_HASH_H
|
|
+#define _BCACHEFS_STR_HASH_H
|
|
+
|
|
+#include "btree_iter.h"
|
|
+#include "btree_update.h"
|
|
+#include "checksum.h"
|
|
+#include "error.h"
|
|
+#include "inode.h"
|
|
+#include "siphash.h"
|
|
+#include "super.h"
|
|
+
|
|
+#include <linux/crc32c.h>
|
|
+#include <crypto/hash.h>
|
|
+#include <crypto/sha2.h>
|
|
+
|
|
+static inline enum bch_str_hash_type
|
|
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
|
|
+{
|
|
+ switch (opt) {
|
|
+ case BCH_STR_HASH_OPT_crc32c:
|
|
+ return BCH_STR_HASH_CRC32C;
|
|
+ case BCH_STR_HASH_OPT_crc64:
|
|
+ return BCH_STR_HASH_CRC64;
|
|
+ case BCH_STR_HASH_OPT_siphash:
|
|
+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash)
|
|
+ ? BCH_STR_HASH_SIPHASH
|
|
+ : BCH_STR_HASH_SIPHASH_OLD;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
/*
 * Hashing parameters: the hash type plus the key/seed for it. The key is
 * viewed as @crc_key by the CRC variants and as @siphash_key by the siphash
 * variants (see bch2_str_hash_init()).
 */
struct bch_hash_info {
	u8			type;
	union {
		__le64		crc_key;
		SIPHASH_KEY	siphash_key;
	};
};
|
|
+
|
|
/*
 * Build the hashing parameters for an inode: the hash type is the
 * INODE_STR_HASH bitfield of bi_flags and the key starts out as bi_hash_seed.
 */
static inline struct bch_hash_info
bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
	/* XXX ick */
	struct bch_hash_info info = {
		.type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
			~(~0U << INODE_STR_HASH_BITS),
		.crc_key = bi->bi_hash_seed,
	};

	/*
	 * The old siphash variant keys the hash with the leading bytes of the
	 * SHA-256 digest of the seed rather than with the seed itself.
	 */
	if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) {
		SHASH_DESC_ON_STACK(desc, c->sha256);
		u8 digest[SHA256_DIGEST_SIZE];

		desc->tfm = c->sha256;

		/*
		 * NOTE(review): the return value of crypto_shash_digest() is
		 * ignored; on failure @digest would be used uninitialized.
		 */
		crypto_shash_digest(desc, (void *) &bi->bi_hash_seed,
				    sizeof(bi->bi_hash_seed), digest);
		memcpy(&info.siphash_key, digest, sizeof(info.siphash_key));
	}

	return info;
}
|
|
+
|
|
/* Running hash state; the live member is selected by bch_hash_info.type. */
struct bch_str_hash_ctx {
	union {
		u32		crc32c;
		u64		crc64;
		SIPHASH_CTX	siphash;
	};
};
|
|
+
|
|
+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx,
|
|
+ const struct bch_hash_info *info)
|
|
+{
|
|
+ switch (info->type) {
|
|
+ case BCH_STR_HASH_CRC32C:
|
|
+ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key));
|
|
+ break;
|
|
+ case BCH_STR_HASH_CRC64:
|
|
+ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key));
|
|
+ break;
|
|
+ case BCH_STR_HASH_SIPHASH_OLD:
|
|
+ case BCH_STR_HASH_SIPHASH:
|
|
+ SipHash24_Init(&ctx->siphash, &info->siphash_key);
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx,
|
|
+ const struct bch_hash_info *info,
|
|
+ const void *data, size_t len)
|
|
+{
|
|
+ switch (info->type) {
|
|
+ case BCH_STR_HASH_CRC32C:
|
|
+ ctx->crc32c = crc32c(ctx->crc32c, data, len);
|
|
+ break;
|
|
+ case BCH_STR_HASH_CRC64:
|
|
+ ctx->crc64 = crc64_be(ctx->crc64, data, len);
|
|
+ break;
|
|
+ case BCH_STR_HASH_SIPHASH_OLD:
|
|
+ case BCH_STR_HASH_SIPHASH:
|
|
+ SipHash24_Update(&ctx->siphash, data, len);
|
|
+ break;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx,
|
|
+ const struct bch_hash_info *info)
|
|
+{
|
|
+ switch (info->type) {
|
|
+ case BCH_STR_HASH_CRC32C:
|
|
+ return ctx->crc32c;
|
|
+ case BCH_STR_HASH_CRC64:
|
|
+ return ctx->crc64 >> 1;
|
|
+ case BCH_STR_HASH_SIPHASH_OLD:
|
|
+ case BCH_STR_HASH_SIPHASH:
|
|
+ return SipHash24_End(&ctx->siphash) >> 1;
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
/*
 * Description of one hashed btree: which btree and key type it uses, and the
 * callbacks for hashing and comparing its keys. The cmp_* callbacks are
 * treated as "returns false on match" by the users in this header.
 */
struct bch_hash_desc {
	enum btree_id	btree_id;
	u8		key_type;

	/* hash a search key / an existing btree key */
	u64		(*hash_key)(const struct bch_hash_info *, const void *);
	u64		(*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c);
	/* compare a btree key against a search key / another btree key */
	bool		(*cmp_key)(struct bkey_s_c, const void *);
	bool		(*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
};
|
|
+
|
|
/*
 * Look up @key in the hash table of @inode.
 *
 * Slots are scanned starting at POS(inode, hash(key)). The iterator is
 * returned when a key of desc.key_type is found for which desc.cmp_key()
 * returns false. Non-matching keys of that type and hash whiteouts are
 * skipped; any other key type, or leaving @inode, ends the search.
 *
 * Returns the iterator on success. On failure the iterator is put and
 * ERR_PTR() of the iteration error, or -ENOENT, is returned.
 */
static __always_inline struct btree_iter *
bch2_hash_lookup(struct btree_trans *trans,
		 const struct bch_hash_desc desc,
		 const struct bch_hash_info *info,
		 u64 inode, const void *key,
		 unsigned flags)
{
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key(trans, iter, desc.btree_id,
			   POS(inode, desc.hash_key(info, key)),
			   BTREE_ITER_SLOTS|flags, k, ret) {
		if (iter->pos.inode != inode)
			break;

		if (k.k->type == desc.key_type) {
			/* cmp_key() returning false means the keys match */
			if (!desc.cmp_key(k, key))
				return iter;
		} else if (k.k->type == KEY_TYPE_hash_whiteout) {
			/* a whiteout keeps the probe sequence going */
			;
		} else {
			/* hole, not found */
			break;
		}
	}
	bch2_trans_iter_put(trans, iter);

	return ERR_PTR(ret ?: -ENOENT);
}
|
|
+
|
|
/*
 * Find a free slot for @key in the hash table of @inode.
 *
 * Slots are scanned from POS(inode, hash(key)) with an intent iterator; the
 * iterator is returned at the first slot whose key is not of desc.key_type.
 * If the scan leaves @inode first, the iterator is put and ERR_PTR() of the
 * iteration error, or -ENOSPC, is returned.
 */
static __always_inline struct btree_iter *
bch2_hash_hole(struct btree_trans *trans,
	       const struct bch_hash_desc desc,
	       const struct bch_hash_info *info,
	       u64 inode, const void *key)
{
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	for_each_btree_key(trans, iter, desc.btree_id,
			   POS(inode, desc.hash_key(info, key)),
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (iter->pos.inode != inode)
			break;

		if (k.k->type != desc.key_type)
			return iter;
	}

	/*
	 * NOTE(review): the flag is set on the failure path only, right before
	 * the iterator is put - confirm this is intended.
	 */
	iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
	bch2_trans_iter_put(trans, iter);

	return ERR_PTR(ret ?: -ENOSPC);
}
|
|
+
|
|
/*
 * Decide whether deleting the key at @start must leave a whiteout behind.
 *
 * The slots after @start are scanned for as long as they hold keys of
 * desc.key_type or hash whiteouts. If one of those keys hashes to a position
 * at or before @start, 1 is returned: a lookup for it probes through @start,
 * so the slot cannot simply be emptied. Otherwise the value left in @ret by
 * the iteration macro is returned.
 *
 * NOTE(review): @ret is only assigned inside the loop in this function; this
 * presumably relies on for_each_btree_key_continue() initializing it.
 */
static __always_inline
int bch2_hash_needs_whiteout(struct btree_trans *trans,
			     const struct bch_hash_desc desc,
			     const struct bch_hash_info *info,
			     struct btree_iter *start)
{
	struct btree_iter *iter;
	struct bkey_s_c k;
	int ret;

	/* work on a copy so the caller's iterator is left at @start */
	iter = bch2_trans_copy_iter(trans, start);

	bch2_btree_iter_next_slot(iter);

	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) {
		if (k.k->type != desc.key_type &&
		    k.k->type != KEY_TYPE_hash_whiteout)
			break;

		if (k.k->type == desc.key_type &&
		    desc.hash_bkey(info, k) <= start->pos.offset) {
			iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
			ret = 1;
			break;
		}
	}

	bch2_trans_iter_put(trans, iter);
	return ret;
}
|
|
+
|
|
/*
 * Insert or replace @insert in the hash table of @inode.
 *
 * Slots are scanned from the hash of @insert. A key of desc.key_type for
 * which desc.cmp_bkey() returns false is an existing entry ("found"); other
 * keys of that type are collisions and are skipped. The first slot that is
 * not of desc.key_type is remembered in @slot (unless MUST_REPLACE is set),
 * and the scan ends at the first such slot that is not a whiteout.
 *
 * Returns 0 after queueing the update, -ENOENT if MUST_REPLACE is set and no
 * existing entry was found, -EEXIST if MUST_CREATE is set and one was found,
 * or the iteration error / -ENOSPC if the scan ended without a usable slot.
 */
static __always_inline
int bch2_hash_set(struct btree_trans *trans,
		  const struct bch_hash_desc desc,
		  const struct bch_hash_info *info,
		  u64 inode, struct bkey_i *insert, int flags)
{
	struct btree_iter *iter, *slot = NULL;
	struct bkey_s_c k;
	bool found = false;
	int ret;

	for_each_btree_key(trans, iter, desc.btree_id,
			   POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))),
			   BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
		if (iter->pos.inode != inode)
			break;

		if (k.k->type == desc.key_type) {
			if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert)))
				goto found;

			/* hash collision: */
			continue;
		}

		/* remember the first reusable slot (whiteout or empty) */
		if (!slot &&
		    !(flags & BCH_HASH_SET_MUST_REPLACE))
			slot = bch2_trans_copy_iter(trans, iter);

		if (k.k->type != KEY_TYPE_hash_whiteout)
			goto not_found;
	}

	if (!ret)
		ret = -ENOSPC;
out:
	bch2_trans_iter_put(trans, slot);
	bch2_trans_iter_put(trans, iter);

	return ret;
found:
	found = true;
not_found:

	if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
		ret = -ENOENT;
	} else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
		ret = -EEXIST;
	} else {
		/* new entry: insert at the remembered slot, not where the scan ended */
		if (!found && slot)
			swap(iter, slot);

		insert->k.p = iter->pos;
		bch2_trans_update(trans, iter, insert, 0);
	}

	goto out;
}
|
|
+
|
|
+static __always_inline
|
|
+int bch2_hash_delete_at(struct btree_trans *trans,
|
|
+ const struct bch_hash_desc desc,
|
|
+ const struct bch_hash_info *info,
|
|
+ struct btree_iter *iter)
|
|
+{
|
|
+ struct bkey_i *delete;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ delete = bch2_trans_kmalloc(trans, sizeof(*delete));
|
|
+ if (IS_ERR(delete))
|
|
+ return PTR_ERR(delete);
|
|
+
|
|
+ bkey_init(&delete->k);
|
|
+ delete->k.p = iter->pos;
|
|
+ delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted;
|
|
+
|
|
+ bch2_trans_update(trans, iter, delete, 0);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static __always_inline
|
|
+int bch2_hash_delete(struct btree_trans *trans,
|
|
+ const struct bch_hash_desc desc,
|
|
+ const struct bch_hash_info *info,
|
|
+ u64 inode, const void *key)
|
|
+{
|
|
+ struct btree_iter *iter;
|
|
+ int ret;
|
|
+
|
|
+ iter = bch2_hash_lookup(trans, desc, info, inode, key,
|
|
+ BTREE_ITER_INTENT);
|
|
+ if (IS_ERR(iter))
|
|
+ return PTR_ERR(iter);
|
|
+
|
|
+ ret = bch2_hash_delete_at(trans, desc, info, iter);
|
|
+ bch2_trans_iter_put(trans, iter);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_STR_HASH_H */
|
|
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
|
|
new file mode 100644
|
|
index 000000000000..74a75ced031e
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/super-io.c
|
|
@@ -0,0 +1,1202 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "buckets.h"
|
|
+#include "checksum.h"
|
|
+#include "disk_groups.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "io.h"
|
|
+#include "journal.h"
|
|
+#include "journal_io.h"
|
|
+#include "journal_seq_blacklist.h"
|
|
+#include "replicas.h"
|
|
+#include "quota.h"
|
|
+#include "super-io.h"
|
|
+#include "super.h"
|
|
+#include "vstructs.h"
|
|
+
|
|
+#include <linux/backing-dev.h>
|
|
+#include <linux/sort.h>
|
|
+
|
|
/*
 * NULL-terminated table of superblock field names, generated from the
 * BCH_SB_FIELDS() x-macro (one stringified name per field).
 */
const char * const bch2_sb_fields[] = {
#define x(name, nr)	#name,
	BCH_SB_FIELDS()
#undef x
	NULL
};
|
|
+
|
|
+static const char *bch2_sb_field_validate(struct bch_sb *,
|
|
+ struct bch_sb_field *);
|
|
+
|
|
+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
|
|
+ enum bch_sb_field_type type)
|
|
+{
|
|
+ struct bch_sb_field *f;
|
|
+
|
|
+ /* XXX: need locking around superblock to access optional fields */
|
|
+
|
|
+ vstruct_for_each(sb, f)
|
|
+ if (le32_to_cpu(f->type) == type)
|
|
+ return f;
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
/*
 * Resize field @f of the superblock in @sb to @u64s u64s, in place.
 *
 * @f == NULL appends a new zeroed field (type 0) at the end of the
 * superblock; @u64s == 0 removes @f. Fields after @f are moved with
 * memmove() and any space gained by growing is zeroed. The buffer must
 * already be large enough (this is asserted with BUG_ON()).
 *
 * Returns the resized field, or NULL when @u64s is 0.
 */
static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
						   struct bch_sb_field *f,
						   unsigned u64s)
{
	unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
	unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;

	BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);

	if (!f && !u64s) {
		/* nothing to do: */
	} else if (!f) {
		f = vstruct_last(sb->sb);
		memset(f, 0, sizeof(u64) * u64s);
		f->u64s = cpu_to_le32(u64s);
		f->type = 0;
	} else {
		void *src, *dst;

		/* src: old end of the field, taken before u64s is updated */
		src = vstruct_end(f);

		if (u64s) {
			f->u64s = cpu_to_le32(u64s);
			/* dst: new end of the field */
			dst = vstruct_end(f);
		} else {
			/* deleting: the following fields slide down onto f */
			dst = f;
		}

		/* sb->sb->u64s still holds the old size here */
		memmove(dst, src, vstruct_end(sb->sb) - src);

		if (dst > src)
			memset(src, 0, dst - src);
	}

	sb->sb->u64s = cpu_to_le32(sb_u64s);

	return u64s ? f : NULL;
}
|
|
+
|
|
+void bch2_sb_field_delete(struct bch_sb_handle *sb,
|
|
+ enum bch_sb_field_type type)
|
|
+{
|
|
+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
|
|
+
|
|
+ if (f)
|
|
+ __bch2_sb_field_resize(sb, f, 0);
|
|
+}
|
|
+
|
|
+/* Superblock realloc/free: */
|
|
+
|
|
+void bch2_free_super(struct bch_sb_handle *sb)
|
|
+{
|
|
+ if (sb->bio)
|
|
+ bio_put(sb->bio);
|
|
+ if (!IS_ERR_OR_NULL(sb->bdev))
|
|
+ blkdev_put(sb->bdev, sb->mode);
|
|
+
|
|
+ kfree(sb->sb);
|
|
+ memset(sb, 0, sizeof(*sb));
|
|
+}
|
|
+
|
|
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
|
|
+{
|
|
+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
|
|
+ size_t new_buffer_size;
|
|
+ struct bch_sb *new_sb;
|
|
+ struct bio *bio;
|
|
+
|
|
+ if (sb->bdev)
|
|
+ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
|
|
+
|
|
+ new_buffer_size = roundup_pow_of_two(new_bytes);
|
|
+
|
|
+ if (sb->sb && sb->buffer_size >= new_buffer_size)
|
|
+ return 0;
|
|
+
|
|
+ if (sb->have_layout) {
|
|
+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
|
|
+
|
|
+ if (new_bytes > max_bytes) {
|
|
+ char buf[BDEVNAME_SIZE];
|
|
+
|
|
+ pr_err("%s: superblock too big: want %zu but have %llu",
|
|
+ bdevname(sb->bdev, buf), new_bytes, max_bytes);
|
|
+ return -ENOSPC;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (sb->buffer_size >= new_buffer_size && sb->sb)
|
|
+ return 0;
|
|
+
|
|
+ if (dynamic_fault("bcachefs:add:super_realloc"))
|
|
+ return -ENOMEM;
|
|
+
|
|
+ if (sb->have_bio) {
|
|
+ bio = bio_kmalloc(GFP_KERNEL,
|
|
+ DIV_ROUND_UP(new_buffer_size, PAGE_SIZE));
|
|
+ if (!bio)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ if (sb->bio)
|
|
+ bio_put(sb->bio);
|
|
+ sb->bio = bio;
|
|
+ }
|
|
+
|
|
+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
|
|
+ if (!new_sb)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ sb->sb = new_sb;
|
|
+ sb->buffer_size = new_buffer_size;
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
|
|
+ enum bch_sb_field_type type,
|
|
+ unsigned u64s)
|
|
+{
|
|
+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
|
|
+ ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
|
|
+ ssize_t d = -old_u64s + u64s;
|
|
+
|
|
+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
|
|
+ return NULL;
|
|
+
|
|
+ if (sb->fs_sb) {
|
|
+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ lockdep_assert_held(&c->sb_lock);
|
|
+
|
|
+ /* XXX: we're not checking that offline device have enough space */
|
|
+
|
|
+ for_each_online_member(ca, c, i) {
|
|
+ struct bch_sb_handle *sb = &ca->disk_sb;
|
|
+
|
|
+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
|
|
+ percpu_ref_put(&ca->ref);
|
|
+ return NULL;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ f = bch2_sb_field_get(sb->sb, type);
|
|
+ f = __bch2_sb_field_resize(sb, f, u64s);
|
|
+ if (f)
|
|
+ f->type = cpu_to_le32(type);
|
|
+ return f;
|
|
+}
|
|
+
|
|
+/* Superblock validate: */
|
|
+
|
|
/*
 * Compile-time check only: the build fails unless struct bch_sb_layout is
 * exactly 512 bytes.
 */
static inline void __bch2_sb_layout_size_assert(void)
{
	BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
}
|
|
+
|
|
+static const char *validate_sb_layout(struct bch_sb_layout *layout)
|
|
+{
|
|
+ u64 offset, prev_offset, max_sectors;
|
|
+ unsigned i;
|
|
+
|
|
+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC))
|
|
+ return "Not a bcachefs superblock layout";
|
|
+
|
|
+ if (layout->layout_type != 0)
|
|
+ return "Invalid superblock layout type";
|
|
+
|
|
+ if (!layout->nr_superblocks)
|
|
+ return "Invalid superblock layout: no superblocks";
|
|
+
|
|
+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset))
|
|
+ return "Invalid superblock layout: too many superblocks";
|
|
+
|
|
+ max_sectors = 1 << layout->sb_max_size_bits;
|
|
+
|
|
+ prev_offset = le64_to_cpu(layout->sb_offset[0]);
|
|
+
|
|
+ for (i = 1; i < layout->nr_superblocks; i++) {
|
|
+ offset = le64_to_cpu(layout->sb_offset[i]);
|
|
+
|
|
+ if (offset < prev_offset + max_sectors)
|
|
+ return "Invalid superblock layout: superblocks overlap";
|
|
+ prev_offset = offset;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
|
|
+{
|
|
+ struct bch_sb *sb = disk_sb->sb;
|
|
+ struct bch_sb_field *f;
|
|
+ struct bch_sb_field_members *mi;
|
|
+ const char *err;
|
|
+ u32 version, version_min;
|
|
+ u16 block_size;
|
|
+
|
|
+ version = le16_to_cpu(sb->version);
|
|
+ version_min = version >= bcachefs_metadata_version_new_versioning
|
|
+ ? le16_to_cpu(sb->version_min)
|
|
+ : version;
|
|
+
|
|
+ if (version >= bcachefs_metadata_version_max ||
|
|
+ version_min < bcachefs_metadata_version_min)
|
|
+ return "Unsupported superblock version";
|
|
+
|
|
+ if (version_min > version)
|
|
+ return "Bad minimum version";
|
|
+
|
|
+ if (sb->features[1] ||
|
|
+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR)))
|
|
+ return "Filesystem has incompatible features";
|
|
+
|
|
+ block_size = le16_to_cpu(sb->block_size);
|
|
+
|
|
+ if (!is_power_of_2(block_size) ||
|
|
+ block_size > PAGE_SECTORS)
|
|
+ return "Bad block size";
|
|
+
|
|
+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le)))
|
|
+ return "Bad user UUID";
|
|
+
|
|
+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le)))
|
|
+ return "Bad internal UUID";
|
|
+
|
|
+ if (!sb->nr_devices ||
|
|
+ sb->nr_devices <= sb->dev_idx ||
|
|
+ sb->nr_devices > BCH_SB_MEMBERS_MAX)
|
|
+ return "Bad number of member devices";
|
|
+
|
|
+ if (!BCH_SB_META_REPLICAS_WANT(sb) ||
|
|
+ BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
|
|
+ return "Invalid number of metadata replicas";
|
|
+
|
|
+ if (!BCH_SB_META_REPLICAS_REQ(sb) ||
|
|
+ BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
|
|
+ return "Invalid number of metadata replicas";
|
|
+
|
|
+ if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
|
|
+ BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
|
|
+ return "Invalid number of data replicas";
|
|
+
|
|
+ if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
|
|
+ BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
|
|
+ return "Invalid number of data replicas";
|
|
+
|
|
+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
|
|
+ return "Invalid metadata checksum type";
|
|
+
|
|
+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
|
|
+ return "Invalid metadata checksum type";
|
|
+
|
|
+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR)
|
|
+ return "Invalid compression type";
|
|
+
|
|
+ if (!BCH_SB_BTREE_NODE_SIZE(sb))
|
|
+ return "Btree node size not set";
|
|
+
|
|
+ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
|
|
+ return "Btree node size not a power of two";
|
|
+
|
|
+ if (BCH_SB_GC_RESERVE(sb) < 5)
|
|
+ return "gc reserve percentage too small";
|
|
+
|
|
+ if (!sb->time_precision ||
|
|
+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC)
|
|
+ return "invalid time precision";
|
|
+
|
|
+ /* validate layout */
|
|
+ err = validate_sb_layout(&sb->layout);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ vstruct_for_each(sb, f) {
|
|
+ if (!f->u64s)
|
|
+ return "Invalid superblock: invalid optional field";
|
|
+
|
|
+ if (vstruct_next(f) > vstruct_last(sb))
|
|
+ return "Invalid superblock: invalid optional field";
|
|
+ }
|
|
+
|
|
+ /* members must be validated first: */
|
|
+ mi = bch2_sb_get_members(sb);
|
|
+ if (!mi)
|
|
+ return "Invalid superblock: member info area missing";
|
|
+
|
|
+ err = bch2_sb_field_validate(sb, &mi->field);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ vstruct_for_each(sb, f) {
|
|
+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
|
|
+ continue;
|
|
+
|
|
+ err = bch2_sb_field_validate(sb, f);
|
|
+ if (err)
|
|
+ return err;
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+/* device open: */
|
|
+
|
|
+static void bch2_sb_update(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_sb *src = c->disk_sb.sb;
|
|
+ struct bch_sb_field_members *mi = bch2_sb_get_members(src);
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ lockdep_assert_held(&c->sb_lock);
|
|
+
|
|
+ c->sb.uuid = src->uuid;
|
|
+ c->sb.user_uuid = src->user_uuid;
|
|
+ c->sb.version = le16_to_cpu(src->version);
|
|
+ c->sb.version_min = le16_to_cpu(src->version_min);
|
|
+ c->sb.nr_devices = src->nr_devices;
|
|
+ c->sb.clean = BCH_SB_CLEAN(src);
|
|
+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src);
|
|
+ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
|
|
+
|
|
+ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
|
|
+ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
|
|
+
|
|
+ /* XXX this is wrong, we need a 96 or 128 bit integer type */
|
|
+ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo),
|
|
+ c->sb.nsec_per_time_unit);
|
|
+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi);
|
|
+
|
|
+ c->sb.features = le64_to_cpu(src->features[0]);
|
|
+ c->sb.compat = le64_to_cpu(src->compat[0]);
|
|
+
|
|
+ for_each_member_device(ca, c, i)
|
|
+ ca->mi = bch2_mi_to_cpu(mi->members + i);
|
|
+}
|
|
+
|
|
+static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
|
|
+{
|
|
+ struct bch_sb_field *src_f, *dst_f;
|
|
+ struct bch_sb *dst = dst_handle->sb;
|
|
+ unsigned i;
|
|
+
|
|
+ dst->version = src->version;
|
|
+ dst->version_min = src->version_min;
|
|
+ dst->seq = src->seq;
|
|
+ dst->uuid = src->uuid;
|
|
+ dst->user_uuid = src->user_uuid;
|
|
+ memcpy(dst->label, src->label, sizeof(dst->label));
|
|
+
|
|
+ dst->block_size = src->block_size;
|
|
+ dst->nr_devices = src->nr_devices;
|
|
+
|
|
+ dst->time_base_lo = src->time_base_lo;
|
|
+ dst->time_base_hi = src->time_base_hi;
|
|
+ dst->time_precision = src->time_precision;
|
|
+
|
|
+ memcpy(dst->flags, src->flags, sizeof(dst->flags));
|
|
+ memcpy(dst->features, src->features, sizeof(dst->features));
|
|
+ memcpy(dst->compat, src->compat, sizeof(dst->compat));
|
|
+
|
|
+ for (i = 0; i < BCH_SB_FIELD_NR; i++) {
|
|
+ if (i == BCH_SB_FIELD_journal)
|
|
+ continue;
|
|
+
|
|
+ src_f = bch2_sb_field_get(src, i);
|
|
+ dst_f = bch2_sb_field_get(dst, i);
|
|
+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
|
|
+ src_f ? le32_to_cpu(src_f->u64s) : 0);
|
|
+
|
|
+ if (src_f)
|
|
+ memcpy(dst_f, src_f, vstruct_bytes(src_f));
|
|
+ }
|
|
+}
|
|
+
|
|
+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
|
|
+{
|
|
+ struct bch_sb_field_journal *journal_buckets =
|
|
+ bch2_sb_get_journal(src);
|
|
+ unsigned journal_u64s = journal_buckets
|
|
+ ? le32_to_cpu(journal_buckets->field.u64s)
|
|
+ : 0;
|
|
+ int ret;
|
|
+
|
|
+ lockdep_assert_held(&c->sb_lock);
|
|
+
|
|
+ ret = bch2_sb_realloc(&c->disk_sb,
|
|
+ le32_to_cpu(src->u64s) - journal_u64s);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ __copy_super(&c->disk_sb, src);
|
|
+
|
|
+ if (BCH_SB_HAS_ERRORS(c->disk_sb.sb))
|
|
+ set_bit(BCH_FS_ERROR, &c->flags);
|
|
+ if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
|
|
+ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
|
|
+
|
|
+ ret = bch2_sb_replicas_to_cpu_replicas(c);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_sb_disk_groups_to_cpu(c);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ bch2_sb_update(c);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
|
|
+ struct bch_sb_field_journal *journal_buckets =
|
|
+ bch2_sb_get_journal(dst);
|
|
+ unsigned journal_u64s = journal_buckets
|
|
+ ? le32_to_cpu(journal_buckets->field.u64s)
|
|
+ : 0;
|
|
+ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_sb_realloc(&ca->disk_sb, u64s);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ __copy_super(&ca->disk_sb, src);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* read superblock: */
|
|
+
|
|
+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
|
|
+{
|
|
+ struct bch_csum csum;
|
|
+ size_t bytes;
|
|
+reread:
|
|
+ bio_reset(sb->bio);
|
|
+ bio_set_dev(sb->bio, sb->bdev);
|
|
+ sb->bio->bi_iter.bi_sector = offset;
|
|
+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
|
|
+ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
|
|
+
|
|
+ if (submit_bio_wait(sb->bio))
|
|
+ return "IO error";
|
|
+
|
|
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
|
|
+ return "Not a bcachefs superblock";
|
|
+
|
|
+ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min ||
|
|
+ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max)
|
|
+ return "Unsupported superblock version";
|
|
+
|
|
+ bytes = vstruct_bytes(sb->sb);
|
|
+
|
|
+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
|
|
+ return "Bad superblock: too big";
|
|
+
|
|
+ if (bytes > sb->buffer_size) {
|
|
+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
|
|
+ return "cannot allocate memory";
|
|
+ goto reread;
|
|
+ }
|
|
+
|
|
+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR)
|
|
+ return "unknown csum type";
|
|
+
|
|
+ /* XXX: verify MACs */
|
|
+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
|
|
+ null_nonce(), sb->sb);
|
|
+
|
|
+ if (bch2_crc_cmp(csum, sb->sb->csum))
|
|
+ return "bad checksum reading superblock";
|
|
+
|
|
+ sb->seq = le64_to_cpu(sb->sb->seq);
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+int bch2_read_super(const char *path, struct bch_opts *opts,
|
|
+ struct bch_sb_handle *sb)
|
|
+{
|
|
+ u64 offset = opt_get(*opts, sb);
|
|
+ struct bch_sb_layout layout;
|
|
+ const char *err;
|
|
+ __le64 *i;
|
|
+ int ret;
|
|
+
|
|
+ pr_verbose_init(*opts, "");
|
|
+
|
|
+ memset(sb, 0, sizeof(*sb));
|
|
+ sb->mode = FMODE_READ;
|
|
+ sb->have_bio = true;
|
|
+
|
|
+ if (!opt_get(*opts, noexcl))
|
|
+ sb->mode |= FMODE_EXCL;
|
|
+
|
|
+ if (!opt_get(*opts, nochanges))
|
|
+ sb->mode |= FMODE_WRITE;
|
|
+
|
|
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
|
|
+ if (IS_ERR(sb->bdev) &&
|
|
+ PTR_ERR(sb->bdev) == -EACCES &&
|
|
+ opt_get(*opts, read_only)) {
|
|
+ sb->mode &= ~FMODE_WRITE;
|
|
+
|
|
+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
|
|
+ if (!IS_ERR(sb->bdev))
|
|
+ opt_set(*opts, nochanges, true);
|
|
+ }
|
|
+
|
|
+ if (IS_ERR(sb->bdev)) {
|
|
+ ret = PTR_ERR(sb->bdev);
|
|
+ goto out;
|
|
+ }
|
|
+
|
|
+ err = "cannot allocate memory";
|
|
+ ret = bch2_sb_realloc(sb, 0);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ ret = -EFAULT;
|
|
+ err = "dynamic fault";
|
|
+ if (bch2_fs_init_fault("read_super"))
|
|
+ goto err;
|
|
+
|
|
+ ret = -EINVAL;
|
|
+ err = read_one_super(sb, offset);
|
|
+ if (!err)
|
|
+ goto got_super;
|
|
+
|
|
+ if (opt_defined(*opts, sb))
|
|
+ goto err;
|
|
+
|
|
+ pr_err("error reading default superblock: %s", err);
|
|
+
|
|
+ /*
|
|
+ * Error reading primary superblock - read location of backup
|
|
+ * superblocks:
|
|
+ */
|
|
+ bio_reset(sb->bio);
|
|
+ bio_set_dev(sb->bio, sb->bdev);
|
|
+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
|
|
+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
|
|
+ /*
|
|
+ * use sb buffer to read layout, since sb buffer is page aligned but
|
|
+ * layout won't be:
|
|
+ */
|
|
+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
|
|
+
|
|
+ err = "IO error";
|
|
+ if (submit_bio_wait(sb->bio))
|
|
+ goto err;
|
|
+
|
|
+ memcpy(&layout, sb->sb, sizeof(layout));
|
|
+ err = validate_sb_layout(&layout);
|
|
+ if (err)
|
|
+ goto err;
|
|
+
|
|
+ for (i = layout.sb_offset;
|
|
+ i < layout.sb_offset + layout.nr_superblocks; i++) {
|
|
+ offset = le64_to_cpu(*i);
|
|
+
|
|
+ if (offset == opt_get(*opts, sb))
|
|
+ continue;
|
|
+
|
|
+ err = read_one_super(sb, offset);
|
|
+ if (!err)
|
|
+ goto got_super;
|
|
+ }
|
|
+
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+
|
|
+got_super:
|
|
+ err = "Superblock block size smaller than device block size";
|
|
+ ret = -EINVAL;
|
|
+ if (le16_to_cpu(sb->sb->block_size) << 9 <
|
|
+ bdev_logical_block_size(sb->bdev))
|
|
+ goto err;
|
|
+
|
|
+ ret = 0;
|
|
+ sb->have_layout = true;
|
|
+out:
|
|
+ pr_verbose_init(*opts, "ret %i", ret);
|
|
+ return ret;
|
|
+err:
|
|
+ bch2_free_super(sb);
|
|
+ pr_err("error reading superblock: %s", err);
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+/* write superblock: */
|
|
+
|
|
+static void write_super_endio(struct bio *bio)
|
|
+{
|
|
+ struct bch_dev *ca = bio->bi_private;
|
|
+
|
|
+ /* XXX: return errors directly */
|
|
+
|
|
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
|
|
+ bch2_blk_status_to_str(bio->bi_status)))
|
|
+ ca->sb_write_error = 1;
|
|
+
|
|
+ closure_put(&ca->fs->sb_write);
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+}
|
|
+
|
|
+static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ struct bch_sb *sb = ca->disk_sb.sb;
|
|
+ struct bio *bio = ca->disk_sb.bio;
|
|
+
|
|
+ bio_reset(bio);
|
|
+ bio_set_dev(bio, ca->disk_sb.bdev);
|
|
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]);
|
|
+ bio->bi_end_io = write_super_endio;
|
|
+ bio->bi_private = ca;
|
|
+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
|
|
+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
|
|
+
|
|
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
|
|
+ bio_sectors(bio));
|
|
+
|
|
+ percpu_ref_get(&ca->io_ref);
|
|
+ closure_bio_submit(bio, &c->sb_write);
|
|
+}
|
|
+
|
|
+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
|
|
+{
|
|
+ struct bch_sb *sb = ca->disk_sb.sb;
|
|
+ struct bio *bio = ca->disk_sb.bio;
|
|
+
|
|
+ sb->offset = sb->layout.sb_offset[idx];
|
|
+
|
|
+ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
|
|
+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
|
|
+ null_nonce(), sb);
|
|
+
|
|
+ bio_reset(bio);
|
|
+ bio_set_dev(bio, ca->disk_sb.bdev);
|
|
+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset);
|
|
+ bio->bi_end_io = write_super_endio;
|
|
+ bio->bi_private = ca;
|
|
+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
|
|
+ bch2_bio_map(bio, sb,
|
|
+ roundup((size_t) vstruct_bytes(sb),
|
|
+ bdev_logical_block_size(ca->disk_sb.bdev)));
|
|
+
|
|
+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
|
|
+ bio_sectors(bio));
|
|
+
|
|
+ percpu_ref_get(&ca->io_ref);
|
|
+ closure_bio_submit(bio, &c->sb_write);
|
|
+}
|
|
+
|
|
+/*
+ * Write the in-memory filesystem superblock out to every online member.
+ *
+ * Sequence, all under c->sb_lock:
+ *  - bump the superblock sequence number and sync the error flags from
+ *    c->flags into the superblock;
+ *  - copy the filesystem superblock into each device's superblock buffer
+ *    and validate the result;
+ *  - unless the nochanges option is set: read back the first superblock of
+ *    each device and check its sequence number still matches the one we
+ *    last saw, then write every superblock copy listed in each device's
+ *    layout, one layout slot at a time;
+ *  - check that the set of devices written to is sufficient to mount.
+ *
+ * bch2_sb_update() is always called before returning, including on the
+ * error paths.
+ *
+ * Returns 0 on success, -EROFS if a superblock was modified behind our
+ * back, -1 if validation failed or too few devices could be written.
+ */
+int bch2_write_super(struct bch_fs *c)
+{
+	struct closure *cl = &c->sb_write;
+	struct bch_dev *ca;
+	unsigned i, sb = 0, nr_wrote;
+	const char *err;
+	struct bch_devs_mask sb_written;
+	bool wrote, can_mount_without_written, can_mount_with_written;
+	unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+	int ret = 0;
+
+	if (c->opts.very_degraded)
+		degraded_flags |= BCH_FORCE_IF_LOST;
+
+	lockdep_assert_held(&c->sb_lock);
+
+	closure_init_stack(cl);
+	memset(&sb_written, 0, sizeof(sb_written));
+
+	le64_add_cpu(&c->disk_sb.sb->seq, 1);
+
+	if (test_bit(BCH_FS_ERROR, &c->flags))
+		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+	if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
+
+	SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
+
+	for_each_online_member(ca, c, i)
+		bch2_sb_from_fs(c, ca);
+
+	for_each_online_member(ca, c, i) {
+		err = bch2_sb_validate(&ca->disk_sb);
+		if (err) {
+			bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
+			/*
+			 * Leaving the iterator early: drop the io_ref on the
+			 * current device, exactly as the seq-mismatch exit
+			 * below does (NOTE(review): assumes
+			 * for_each_online_member() holds ca->io_ref for the
+			 * device being visited - confirm against the macro).
+			 */
+			percpu_ref_put(&ca->io_ref);
+			ret = -1;
+			goto out;
+		}
+	}
+
+	if (c->opts.nochanges)
+		goto out;
+
+	/* Assume every online device gets written; failures are cleared below */
+	for_each_online_member(ca, c, i) {
+		__set_bit(ca->dev_idx, sb_written.d);
+		ca->sb_write_error = 0;
+	}
+
+	for_each_online_member(ca, c, i)
+		read_back_super(c, ca);
+	closure_sync(cl);
+
+	for_each_online_member(ca, c, i) {
+		if (!ca->sb_write_error &&
+		    ca->disk_sb.seq !=
+		    le64_to_cpu(ca->sb_read_scratch->seq)) {
+			bch2_fs_fatal_error(c,
+				"Superblock modified by another process");
+			percpu_ref_put(&ca->io_ref);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
+	/*
+	 * Write layout slot 0 on every device, wait, then slot 1, and so on,
+	 * until no device has a slot left at the current index:
+	 */
+	do {
+		wrote = false;
+		for_each_online_member(ca, c, i)
+			if (!ca->sb_write_error &&
+			    sb < ca->disk_sb.sb->layout.nr_superblocks) {
+				write_one_super(c, ca, sb);
+				wrote = true;
+			}
+		closure_sync(cl);
+		sb++;
+	} while (wrote);
+
+	for_each_online_member(ca, c, i) {
+		if (ca->sb_write_error)
+			__clear_bit(ca->dev_idx, sb_written.d);
+		else
+			ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
+	}
+
+	nr_wrote = dev_mask_nr(&sb_written);
+
+	can_mount_with_written =
+		bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+	/* Invert the mask: it now describes the devices we did NOT write */
+	for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+		sb_written.d[i] = ~sb_written.d[i];
+
+	can_mount_without_written =
+		bch2_have_enough_devs(c, sb_written, degraded_flags, false);
+
+	/*
+	 * If we would be able to mount _without_ the devices we successfully
+	 * wrote superblocks to, we weren't able to write to enough devices:
+	 *
+	 * Exception: if we can mount without the successes because we haven't
+	 * written anything (new filesystem), we continue if we'd be able to
+	 * mount with the devices we did successfully write to:
+	 */
+	if (bch2_fs_fatal_err_on(!nr_wrote ||
+				 !can_mount_with_written ||
+				 (can_mount_without_written &&
+				  !can_mount_with_written), c,
+		"Unable to write superblock to sufficient devices"))
+		ret = -1;
+out:
+	/* Make new options visible after they're persistent: */
+	bch2_sb_update(c);
+	return ret;
+}
|
|
+
|
|
+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
|
|
+{
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ if (!(c->sb.features & (1ULL << feat))) {
|
|
+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
|
|
+
|
|
+ bch2_write_super(c);
|
|
+ }
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+}
|
|
+
|
|
+/* BCH_SB_FIELD_journal: */
|
|
+
|
|
+static int u64_cmp(const void *_l, const void *_r)
|
|
+{
|
|
+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
|
|
+
|
|
+ return l < r ? -1 : l > r ? 1 : 0;
|
|
+}
|
|
+
|
|
+/*
+ * Validate a BCH_SB_FIELD_journal superblock field.
+ *
+ * The bucket list is copied into a temporary array and sorted so that range
+ * and duplicate checks only need to look at the ends and at neighbours.
+ * The range checks use the member entry for this superblock's own device
+ * (sb->dev_idx); the caller validates the members field before any other
+ * field, so that entry is present by the time we get here.
+ *
+ * Returns NULL if the field is valid (or lists no buckets), otherwise a
+ * static string describing the problem.
+ */
+static const char *bch2_sb_validate_journal(struct bch_sb *sb,
+					    struct bch_sb_field *f)
+{
+	struct bch_sb_field_journal *journal = field_to_type(f, journal);
+	struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+	const char *err;
+	unsigned nr;
+	unsigned i;
+	u64 *b;
+
+	/* bch2_nr_journal_buckets() returns 0 for a NULL field as well */
+	nr = bch2_nr_journal_buckets(journal);
+	if (!nr)
+		return NULL;
+
+	b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL);
+	if (!b)
+		return "cannot allocate memory";
+
+	for (i = 0; i < nr; i++)
+		b[i] = le64_to_cpu(journal->buckets[i]);
+
+	sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+	/* After sorting, b[0] is the lowest bucket and b[nr - 1] the highest */
+	err = "journal bucket at sector 0";
+	if (!b[0])
+		goto err;
+
+	err = "journal bucket before first bucket";
+	if (b[0] < le16_to_cpu(m->first_bucket))
+		goto err;
+
+	err = "journal bucket past end of device";
+	if (b[nr - 1] >= le64_to_cpu(m->nbuckets))
+		goto err;
+
+	/* Duplicates are adjacent once the array is sorted */
+	err = "duplicate journal buckets";
+	for (i = 0; i + 1 < nr; i++)
+		if (b[i] == b[i + 1])
+			goto err;
+
+	err = NULL;
+err:
+	kfree(b);
+	return err;
+}
|
|
+
|
|
+static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
|
|
+ .validate = bch2_sb_validate_journal,
|
|
+};
|
|
+
|
|
+/* BCH_SB_FIELD_members: */
|
|
+
|
|
+static const char *bch2_sb_validate_members(struct bch_sb *sb,
|
|
+ struct bch_sb_field *f)
|
|
+{
|
|
+ struct bch_sb_field_members *mi = field_to_type(f, members);
|
|
+ struct bch_member *m;
|
|
+
|
|
+ if ((void *) (mi->members + sb->nr_devices) >
|
|
+ vstruct_end(&mi->field))
|
|
+ return "Invalid superblock: bad member info";
|
|
+
|
|
+ for (m = mi->members;
|
|
+ m < mi->members + sb->nr_devices;
|
|
+ m++) {
|
|
+ if (!bch2_member_exists(m))
|
|
+ continue;
|
|
+
|
|
+ if (le64_to_cpu(m->nbuckets) > LONG_MAX)
|
|
+ return "Too many buckets";
|
|
+
|
|
+ if (le64_to_cpu(m->nbuckets) -
|
|
+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS)
|
|
+ return "Not enough buckets";
|
|
+
|
|
+ if (le16_to_cpu(m->bucket_size) <
|
|
+ le16_to_cpu(sb->block_size))
|
|
+ return "bucket size smaller than block size";
|
|
+
|
|
+ if (le16_to_cpu(m->bucket_size) <
|
|
+ BCH_SB_BTREE_NODE_SIZE(sb))
|
|
+ return "bucket size smaller than btree node size";
|
|
+ }
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static const struct bch_sb_field_ops bch_sb_field_ops_members = {
|
|
+ .validate = bch2_sb_validate_members,
|
|
+};
|
|
+
|
|
+/* BCH_SB_FIELD_crypt: */
|
|
+
|
|
+static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
|
|
+ struct bch_sb_field *f)
|
|
+{
|
|
+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
|
|
+
|
|
+ if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
|
|
+ return "invalid field crypt: wrong size";
|
|
+
|
|
+ if (BCH_CRYPT_KDF_TYPE(crypt))
|
|
+ return "invalid field crypt: bad kdf type";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
|
|
+ .validate = bch2_sb_validate_crypt,
|
|
+};
|
|
+
|
|
+/* BCH_SB_FIELD_clean: */
|
|
+
|
|
+int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
|
|
+{
|
|
+ struct jset_entry *entry;
|
|
+ int ret;
|
|
+
|
|
+ for (entry = clean->start;
|
|
+ entry < (struct jset_entry *) vstruct_end(&clean->field);
|
|
+ entry = vstruct_next(entry)) {
|
|
+ ret = bch2_journal_entry_validate(c, "superblock", entry,
|
|
+ le16_to_cpu(c->disk_sb.sb->version),
|
|
+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
|
|
+ write);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
+ * Mark the filesystem dirty: clear the superblock's clean flag, turn on the
+ * BCH_SB_FEATURES_ALWAYS feature bits and write the superblock.
+ *
+ * Takes and releases c->sb_lock. Returns the result of bch2_write_super().
+ */
+int bch2_fs_mark_dirty(struct bch_fs *c)
+{
+	int ret;
+
+	/*
+	 * Unconditionally write superblock, to verify it hasn't changed before
+	 * we go rw:
+	 */
+
+	mutex_lock(&c->sb_lock);
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+	/* features[] is little endian in the superblock: convert the mask */
+	c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+	ret = bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	return ret;
+}
|
|
+
|
|
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
|
|
+{
|
|
+ struct jset_entry *entry = *end;
|
|
+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
|
|
+
|
|
+ memset(entry, 0, u64s * sizeof(u64));
|
|
+ /*
|
|
+ * The u64s field counts from the start of data, ignoring the shared
|
|
+ * fields.
|
|
+ */
|
|
+ entry->u64s = u64s - 1;
|
|
+
|
|
+ *end = vstruct_next(*end);
|
|
+ return entry;
|
|
+}
|
|
+
|
|
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
|
|
+ struct jset_entry **end,
|
|
+ u64 journal_seq)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i, dev;
|
|
+
|
|
+ percpu_down_read(&c->mark_lock);
|
|
+
|
|
+ if (!journal_seq) {
|
|
+ for (i = 0; i < ARRAY_SIZE(c->usage); i++)
|
|
+ bch2_fs_usage_acc_to_base(c, i);
|
|
+ } else {
|
|
+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
|
|
+ }
|
|
+
|
|
+ {
|
|
+ struct jset_entry_usage *u =
|
|
+ container_of(jset_entry_init(end, sizeof(*u)),
|
|
+ struct jset_entry_usage, entry);
|
|
+
|
|
+ u->entry.type = BCH_JSET_ENTRY_usage;
|
|
+ u->entry.btree_id = FS_USAGE_INODES;
|
|
+ u->v = cpu_to_le64(c->usage_base->nr_inodes);
|
|
+ }
|
|
+
|
|
+ {
|
|
+ struct jset_entry_usage *u =
|
|
+ container_of(jset_entry_init(end, sizeof(*u)),
|
|
+ struct jset_entry_usage, entry);
|
|
+
|
|
+ u->entry.type = BCH_JSET_ENTRY_usage;
|
|
+ u->entry.btree_id = FS_USAGE_KEY_VERSION;
|
|
+ u->v = cpu_to_le64(atomic64_read(&c->key_version));
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
|
|
+ struct jset_entry_usage *u =
|
|
+ container_of(jset_entry_init(end, sizeof(*u)),
|
|
+ struct jset_entry_usage, entry);
|
|
+
|
|
+ u->entry.type = BCH_JSET_ENTRY_usage;
|
|
+ u->entry.btree_id = FS_USAGE_RESERVED;
|
|
+ u->entry.level = i;
|
|
+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
|
|
+ }
|
|
+
|
|
+ for (i = 0; i < c->replicas.nr; i++) {
|
|
+ struct bch_replicas_entry *e =
|
|
+ cpu_replicas_entry(&c->replicas, i);
|
|
+ struct jset_entry_data_usage *u =
|
|
+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
|
|
+ struct jset_entry_data_usage, entry);
|
|
+
|
|
+ u->entry.type = BCH_JSET_ENTRY_data_usage;
|
|
+ u->v = cpu_to_le64(c->usage_base->replicas[i]);
|
|
+ memcpy(&u->r, e, replicas_entry_bytes(e));
|
|
+ }
|
|
+
|
|
+ for_each_member_device(ca, c, dev) {
|
|
+ unsigned b = sizeof(struct jset_entry_dev_usage) +
|
|
+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
|
|
+ struct jset_entry_dev_usage *u =
|
|
+ container_of(jset_entry_init(end, b),
|
|
+ struct jset_entry_dev_usage, entry);
|
|
+
|
|
+ u->entry.type = BCH_JSET_ENTRY_dev_usage;
|
|
+ u->dev = cpu_to_le32(dev);
|
|
+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
|
|
+ u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
|
|
+
|
|
+ for (i = 0; i < BCH_DATA_NR; i++) {
|
|
+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
|
|
+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
|
|
+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ for (i = 0; i < 2; i++) {
|
|
+ struct jset_entry_clock *clock =
|
|
+ container_of(jset_entry_init(end, sizeof(*clock)),
|
|
+ struct jset_entry_clock, entry);
|
|
+
|
|
+ clock->entry.type = BCH_JSET_ENTRY_clock;
|
|
+ clock->rw = i;
|
|
+ clock->time = atomic64_read(&c->io_clock[i].now);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
+ * Mark the filesystem clean and write the superblock.
+ *
+ * Does nothing if the superblock is already marked clean. Otherwise sets the
+ * clean flag, updates the compat/feature bits, sizes the clean section to
+ * hold the fixed header plus c->journal.entry_u64s_reserved u64s of journal
+ * entries, fills it with the common usage/clock entries and the btree roots,
+ * zeroes the unused tail, validates the section and writes the superblock.
+ *
+ * Takes and releases c->sb_lock. Failures to resize or validate the section
+ * are logged and the superblock is not written.
+ */
+void bch2_fs_mark_clean(struct bch_fs *c)
+{
+	struct bch_sb_field_clean *sb_clean;
+	struct jset_entry *entry;
+	unsigned u64s;
+	int ret;
+
+	mutex_lock(&c->sb_lock);
+	if (BCH_SB_CLEAN(c->disk_sb.sb))
+		goto out;
+
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
+
+	/*
+	 * compat[] and features[] are little endian in the superblock, so the
+	 * CPU-order masks must be converted before being applied:
+	 */
+	c->disk_sb.sb->compat[0] |=
+		cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+	c->disk_sb.sb->compat[0] |=
+		cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+	c->disk_sb.sb->features[0] &=
+		cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+	c->disk_sb.sb->features[0] &=
+		cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
+
+	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
+
+	sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+	if (!sb_clean) {
+		bch_err(c, "error resizing superblock while setting filesystem clean");
+		goto out;
+	}
+
+	sb_clean->flags		= 0;
+	sb_clean->journal_seq	= cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+
+	/* Trying to catch outstanding bug: */
+	BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
+
+	/* Append entries; each helper advances/returns the new end pointer */
+	entry = sb_clean->start;
+	bch2_journal_super_entries_add_common(c, &entry, 0);
+	entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
+	BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+
+	/* Zero whatever is left of the section after the last entry */
+	memset(entry, 0,
+	       vstruct_end(&sb_clean->field) - (void *) entry);
+
+	/*
+	 * this should be in the write path, and we should be validating every
+	 * superblock section:
+	 */
+	ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+	if (ret) {
+		bch_err(c, "error writing marking filesystem clean: validate error");
+		goto out;
+	}
+
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
+}
|
|
+
|
|
+/*
+ * Validate a BCH_SB_FIELD_clean superblock field: it must be at least as
+ * large as the fixed-size struct bch_sb_field_clean header.
+ *
+ * Returns NULL if the field is valid, otherwise a static error string.
+ */
+static const char *bch2_sb_validate_clean(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+	if (vstruct_bytes(&clean->field) < sizeof(*clean))
+		return "invalid field clean: wrong size";
+
+	return NULL;
+}
|
|
+
|
|
+static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
|
|
+ .validate = bch2_sb_validate_clean,
|
|
+};
|
|
+
|
|
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
|
|
+#define x(f, nr) \
|
|
+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
|
|
+ BCH_SB_FIELDS()
|
|
+#undef x
|
|
+};
|
|
+
|
|
+static const char *bch2_sb_field_validate(struct bch_sb *sb,
|
|
+ struct bch_sb_field *f)
|
|
+{
|
|
+ unsigned type = le32_to_cpu(f->type);
|
|
+
|
|
+ return type < BCH_SB_FIELD_NR
|
|
+ ? bch2_sb_field_ops[type]->validate(sb, f)
|
|
+ : NULL;
|
|
+}
|
|
+
|
|
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
|
|
+ struct bch_sb_field *f)
|
|
+{
|
|
+ unsigned type = le32_to_cpu(f->type);
|
|
+ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR
|
|
+ ? bch2_sb_field_ops[type] : NULL;
|
|
+
|
|
+ if (ops)
|
|
+ pr_buf(out, "%s", bch2_sb_fields[type]);
|
|
+ else
|
|
+ pr_buf(out, "(unknown field %u)", type);
|
|
+
|
|
+ pr_buf(out, " (size %llu):", vstruct_bytes(f));
|
|
+
|
|
+ if (ops && ops->to_text)
|
|
+ bch2_sb_field_ops[type]->to_text(out, sb, f);
|
|
+}
|
|
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
|
|
new file mode 100644
|
|
index 000000000000..b64ac2fbbf8b
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/super-io.h
|
|
@@ -0,0 +1,136 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_SUPER_IO_H
|
|
+#define _BCACHEFS_SUPER_IO_H
|
|
+
|
|
+#include "extents.h"
|
|
+#include "eytzinger.h"
|
|
+#include "super_types.h"
|
|
+#include "super.h"
|
|
+
|
|
+#include <asm/byteorder.h>
|
|
+
|
|
+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
|
|
+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
|
|
+ enum bch_sb_field_type, unsigned);
|
|
+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type);
|
|
+
|
|
+#define field_to_type(_f, _name) \
|
|
+ container_of_or_null(_f, struct bch_sb_field_##_name, field)
|
|
+
|
|
+#define x(_name, _nr) \
|
|
+static inline struct bch_sb_field_##_name * \
|
|
+bch2_sb_get_##_name(struct bch_sb *sb) \
|
|
+{ \
|
|
+ return field_to_type(bch2_sb_field_get(sb, \
|
|
+ BCH_SB_FIELD_##_name), _name); \
|
|
+} \
|
|
+ \
|
|
+static inline struct bch_sb_field_##_name * \
|
|
+bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \
|
|
+{ \
|
|
+ return field_to_type(bch2_sb_field_resize(sb, \
|
|
+ BCH_SB_FIELD_##_name, u64s), _name); \
|
|
+}
|
|
+
|
|
+BCH_SB_FIELDS()
|
|
+#undef x
|
|
+
|
|
+extern const char * const bch2_sb_fields[];
|
|
+
|
|
+struct bch_sb_field_ops {
|
|
+ const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
|
|
+ void (*to_text)(struct printbuf *, struct bch_sb *,
|
|
+ struct bch_sb_field *);
|
|
+};
|
|
+
|
|
+static inline __le64 bch2_sb_magic(struct bch_fs *c)
|
|
+{
|
|
+ __le64 ret;
|
|
+ memcpy(&ret, &c->sb.uuid, sizeof(ret));
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline __u64 jset_magic(struct bch_fs *c)
|
|
+{
|
|
+ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC);
|
|
+}
|
|
+
|
|
+static inline __u64 bset_magic(struct bch_fs *c)
|
|
+{
|
|
+ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC);
|
|
+}
|
|
+
|
|
+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
|
|
+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
|
|
+
|
|
+void bch2_free_super(struct bch_sb_handle *);
|
|
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
|
|
+
|
|
+const char *bch2_sb_validate(struct bch_sb_handle *);
|
|
+
|
|
+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
|
|
+int bch2_write_super(struct bch_fs *);
|
|
+void __bch2_check_set_feature(struct bch_fs *, unsigned);
|
|
+
|
|
+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
|
|
+{
|
|
+ if (!(c->sb.features & (1ULL << feat)))
|
|
+ __bch2_check_set_feature(c, feat);
|
|
+}
|
|
+
|
|
+/* BCH_SB_FIELD_journal: */
|
|
+
|
|
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
|
|
+{
|
|
+ return j
|
|
+ ? (__le64 *) vstruct_end(&j->field) - j->buckets
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+/* BCH_SB_FIELD_members: */
|
|
+
|
|
+static inline bool bch2_member_exists(struct bch_member *m)
|
|
+{
|
|
+ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le));
|
|
+}
|
|
+
|
|
+static inline bool bch2_dev_exists(struct bch_sb *sb,
|
|
+ struct bch_sb_field_members *mi,
|
|
+ unsigned dev)
|
|
+{
|
|
+ return dev < sb->nr_devices &&
|
|
+ bch2_member_exists(&mi->members[dev]);
|
|
+}
|
|
+
|
|
+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
|
|
+{
|
|
+ return (struct bch_member_cpu) {
|
|
+ .nbuckets = le64_to_cpu(mi->nbuckets),
|
|
+ .first_bucket = le16_to_cpu(mi->first_bucket),
|
|
+ .bucket_size = le16_to_cpu(mi->bucket_size),
|
|
+ .group = BCH_MEMBER_GROUP(mi),
|
|
+ .state = BCH_MEMBER_STATE(mi),
|
|
+ .replacement = BCH_MEMBER_REPLACEMENT(mi),
|
|
+ .discard = BCH_MEMBER_DISCARD(mi),
|
|
+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi),
|
|
+ .durability = BCH_MEMBER_DURABILITY(mi)
|
|
+ ? BCH_MEMBER_DURABILITY(mi) - 1
|
|
+ : 1,
|
|
+ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
|
|
+ };
|
|
+}
|
|
+
|
|
+/* BCH_SB_FIELD_clean: */
|
|
+
|
|
+void bch2_journal_super_entries_add_common(struct bch_fs *,
|
|
+ struct jset_entry **, u64);
|
|
+
|
|
+int bch2_sb_clean_validate(struct bch_fs *, struct bch_sb_field_clean *, int);
|
|
+
|
|
+int bch2_fs_mark_dirty(struct bch_fs *);
|
|
+void bch2_fs_mark_clean(struct bch_fs *);
|
|
+
|
|
+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
|
|
+ struct bch_sb_field *);
|
|
+
|
|
+#endif /* _BCACHEFS_SUPER_IO_H */
|
|
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
|
|
new file mode 100644
|
|
index 000000000000..bbd313a4287e
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/super.c
|
|
@@ -0,0 +1,2070 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * bcachefs setup/teardown code, and some metadata io - read a superblock and
|
|
+ * figure out what to do with it.
|
|
+ *
|
|
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_background.h"
|
|
+#include "alloc_foreground.h"
|
|
+#include "bkey_sort.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_gc.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "btree_io.h"
|
|
+#include "chardev.h"
|
|
+#include "checksum.h"
|
|
+#include "clock.h"
|
|
+#include "compress.h"
|
|
+#include "debug.h"
|
|
+#include "disk_groups.h"
|
|
+#include "ec.h"
|
|
+#include "error.h"
|
|
+#include "fs.h"
|
|
+#include "fs-io.h"
|
|
+#include "fsck.h"
|
|
+#include "inode.h"
|
|
+#include "io.h"
|
|
+#include "journal.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "journal_seq_blacklist.h"
|
|
+#include "move.h"
|
|
+#include "migrate.h"
|
|
+#include "movinggc.h"
|
|
+#include "quota.h"
|
|
+#include "rebalance.h"
|
|
+#include "recovery.h"
|
|
+#include "replicas.h"
|
|
+#include "super.h"
|
|
+#include "super-io.h"
|
|
+#include "sysfs.h"
|
|
+
|
|
+#include <linux/backing-dev.h>
|
|
+#include <linux/blkdev.h>
|
|
+#include <linux/debugfs.h>
|
|
+#include <linux/device.h>
|
|
+#include <linux/genhd.h>
|
|
+#include <linux/idr.h>
|
|
+#include <linux/module.h>
|
|
+#include <linux/percpu.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/sysfs.h>
|
|
+#include <crypto/hash.h>
|
|
+
|
|
+#include <trace/events/bcachefs.h>
|
|
+
|
|
+MODULE_LICENSE("GPL");
|
|
+MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
|
|
+
|
|
+/*
|
|
+ * KTYPE(type) - define a struct kobj_type named <type>_ktype whose release
|
|
+ * hook is <type>_release, whose sysfs_ops is <type>_sysfs_ops and whose
|
|
+ * default attributes are <type>_files. The storage class is supplied at the
|
|
+ * point of use, e.g. "static KTYPE(bch2_fs);".
|
|
+ */
|
|
+#define KTYPE(type) \
|
|
+struct kobj_type type ## _ktype = { \
|
|
+ .release = type ## _release, \
|
|
+ .sysfs_ops = &type ## _sysfs_ops, \
|
|
+ .default_attrs = type ## _files \
|
|
+}
|
|
+
|
|
+static void bch2_fs_release(struct kobject *);
|
|
+static void bch2_dev_release(struct kobject *);
|
|
+
|
|
+/*
|
|
+ * The "internal", "options" and "time_stats" kobjects are embedded in
|
|
+ * struct bch_fs (see the kobject_init() calls in bch2_fs_alloc()), so their
|
|
+ * release hooks have nothing of their own to free and are deliberately empty.
|
|
+ */
|
|
+static void bch2_fs_internal_release(struct kobject *k)
|
|
+{
|
|
+}
|
|
+
|
|
+static void bch2_fs_opts_dir_release(struct kobject *k)
|
|
+{
|
|
+}
|
|
+
|
|
+static void bch2_fs_time_stats_release(struct kobject *k)
|
|
+{
|
|
+}
|
|
+
|
|
+/* kobj_type instances for the filesystem, its sysfs subdirectories, and devices: */
|
|
+static KTYPE(bch2_fs);
|
|
+static KTYPE(bch2_fs_internal);
|
|
+static KTYPE(bch2_fs_opts_dir);
|
|
+static KTYPE(bch2_fs_time_stats);
|
|
+static KTYPE(bch2_dev);
|
|
+
|
|
+/* kset assigned to each filesystem's kobject in bch2_fs_alloc(): */
|
|
+static struct kset *bcachefs_kset;
|
|
+/* List of filesystems linked through bch_fs.list; protected by bch_fs_list_lock: */
|
|
+static LIST_HEAD(bch_fs_list);
|
|
+static DEFINE_MUTEX(bch_fs_list_lock);
|
|
+
|
|
+/* Woken when writes are fully disabled or an emergency RO is requested: */
|
|
+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
|
|
+
|
|
+static void bch2_dev_free(struct bch_dev *);
|
|
+static int bch2_dev_alloc(struct bch_fs *, unsigned);
|
|
+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
|
|
+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
|
|
+
|
|
+/*
|
|
+ * bch2_dev_to_fs - find the registered filesystem that has @dev as a member
|
|
+ *
|
|
+ * Walks bch_fs_list under bch_fs_list_lock and, for each filesystem, its
|
|
+ * member devices under rcu_read_lock().
|
|
+ *
|
|
+ * Returns the matching filesystem with a reference taken on c->cl via
|
|
+ * closure_get(), or NULL if no member device matches.
|
|
+ */
|
|
+struct bch_fs *bch2_dev_to_fs(dev_t dev)
|
|
+{
|
|
+	struct bch_fs *c;
|
|
+	struct bch_dev *ca;
|
|
+	unsigned i;
|
|
+
|
|
+	mutex_lock(&bch_fs_list_lock);
|
|
+	rcu_read_lock();
|
|
+
|
|
+	list_for_each_entry(c, &bch_fs_list, list)
|
|
+		for_each_member_device_rcu(ca, c, i, NULL)
|
|
+			/*
|
|
+			 * A member without an attached block device has a
|
|
+			 * NULL disk_sb.bdev (other users in this file check
|
|
+			 * for that too), so it must be skipped:
|
|
+			 */
|
|
+			if (ca->disk_sb.bdev &&
|
|
+			    ca->disk_sb.bdev->bd_dev == dev) {
|
|
+				closure_get(&c->cl);
|
|
+				goto found;
|
|
+			}
|
|
+	c = NULL;
|
|
+found:
|
|
+	rcu_read_unlock();
|
|
+	mutex_unlock(&bch_fs_list_lock);
|
|
+
|
|
+	return c;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Return the filesystem on bch_fs_list whose superblock UUID equals @uuid,
|
|
+ * or NULL if there is none. No reference is taken; the caller must hold
|
|
+ * bch_fs_list_lock.
|
|
+ */
|
|
+static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
|
|
+{
|
|
+	struct bch_fs *fs;
|
|
+
|
|
+	lockdep_assert_held(&bch_fs_list_lock);
|
|
+
|
|
+	list_for_each_entry(fs, &bch_fs_list, list) {
|
|
+		if (memcmp(&fs->disk_sb.sb->uuid, &uuid, sizeof(uuid_le)) == 0)
|
|
+			return fs;
|
|
+	}
|
|
+
|
|
+	return NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Locked wrapper around __bch2_uuid_to_fs(): on a match, a reference is
|
|
+ * taken on the filesystem's closure before the list lock is dropped.
|
|
+ * Returns NULL when no filesystem has UUID @uuid.
|
|
+ */
|
|
+struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
|
|
+{
|
|
+	struct bch_fs *fs;
|
|
+
|
|
+	mutex_lock(&bch_fs_list_lock);
|
|
+
|
|
+	fs = __bch2_uuid_to_fs(uuid);
|
|
+	if (fs != NULL)
|
|
+		closure_get(&fs->cl);
|
|
+
|
|
+	mutex_unlock(&bch_fs_list_lock);
|
|
+	return fs;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Resize the journal reservation for device usage entries: one
|
|
+ * jset_entry_dev_usage plus BCH_DATA_NR jset_entry_dev_usage_type records
|
|
+ * (measured in u64s) for every member device.
|
|
+ */
|
|
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
|
|
+{
|
|
+	struct bch_dev *ca;
|
|
+	unsigned i, nr_devs = 0;
|
|
+	unsigned u64s_per_dev =
|
|
+		(sizeof(struct jset_entry_dev_usage) +
|
|
+		 sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR) /
|
|
+		sizeof(u64);
|
|
+
|
|
+	rcu_read_lock();
|
|
+	for_each_member_device_rcu(ca, c, i, NULL)
|
|
+		nr_devs++;
|
|
+	rcu_read_unlock();
|
|
+
|
|
+	bch2_journal_entry_res_resize(&c->journal,
|
|
+				      &c->dev_usage_journal_res,
|
|
+				      u64s_per_dev * nr_devs);
|
|
+}
|
|
+
|
|
+/* Filesystem RO/RW: */
|
|
+
|
|
+/*
|
|
+ * For startup/shutdown of RW stuff, the dependencies are:
|
|
+ *
|
|
+ * - foreground writes depend on copygc and rebalance (to free up space)
|
|
+ *
|
|
+ * - copygc and rebalance depend on mark and sweep gc (they actually probably
|
|
+ * don't because they either reserve ahead of time or don't block if
|
|
+ * allocations fail, but allocations can require mark and sweep gc to run
|
|
+ * because of generation number wraparound)
|
|
+ *
|
|
+ * - all of the above depends on the allocator threads
|
|
+ *
|
|
+ * - allocator depends on the journal (when it rewrites prios and gens)
|
|
+ */
|
|
+
|
|
+/*
|
|
+ * Stop everything that runs while the filesystem is read-write: the
|
|
+ * rebalance, copygc and gc threads, then the journal pins and pending
|
|
+ * interior btree updates are flushed, then the per-device allocators and
|
|
+ * finally the journal are stopped.
|
|
+ *
|
|
+ * BCH_FS_ALLOC_CLEAN is set only when the allocators were running and the
|
|
+ * flush loop below ran to completion. Also used as the error path of
|
|
+ * __bch2_fs_read_write().
|
|
+ */
|
|
+static void __bch2_fs_read_only(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i, clean_passes = 0;
|
|
+
|
|
+ bch2_rebalance_stop(c);
|
|
+ bch2_copygc_stop(c);
|
|
+ bch2_gc_thread_stop(c);
|
|
+
|
|
+ /*
|
|
+ * Flush journal before stopping allocators, because flushing journal
|
|
+ * blacklist entries involves allocating new btree nodes:
|
|
+ */
|
|
+ bch2_journal_flush_all_pins(&c->journal);
|
|
+
|
|
+ /*
|
|
+ * If the allocator threads didn't all start up, the btree updates to
|
|
+ * write out alloc info aren't going to work:
|
|
+ */
|
|
+ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
|
|
+ goto nowrote_alloc;
|
|
+
|
|
+ bch_verbose(c, "flushing journal and stopping allocators");
|
|
+
|
|
+ bch2_journal_flush_all_pins(&c->journal);
|
|
+ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
|
|
+
|
|
+ /*
|
|
+ * Repeat until two consecutive passes are "clean": a pass is reset to
|
|
+ * zero whenever a flush returns nonzero or interior updates were
|
|
+ * pending (presumably nonzero means the flush did work - confirm).
|
|
+ */
|
|
+ do {
|
|
+ clean_passes++;
|
|
+
|
|
+ if (bch2_journal_flush_all_pins(&c->journal))
|
|
+ clean_passes = 0;
|
|
+
|
|
+ /*
|
|
+ * In flight interior btree updates will generate more journal
|
|
+ * updates and btree updates (alloc btree):
|
|
+ */
|
|
+ if (bch2_btree_interior_updates_nr_pending(c)) {
|
|
+ closure_wait_event(&c->btree_interior_update_wait,
|
|
+ !bch2_btree_interior_updates_nr_pending(c));
|
|
+ clean_passes = 0;
|
|
+ }
|
|
+ flush_work(&c->btree_interior_update_work);
|
|
+
|
|
+ if (bch2_journal_flush_all_pins(&c->journal))
|
|
+ clean_passes = 0;
|
|
+ } while (clean_passes < 2);
|
|
+ bch_verbose(c, "flushing journal and stopping allocators complete");
|
|
+
|
|
+ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
|
|
+nowrote_alloc:
|
|
+ closure_wait_event(&c->btree_interior_update_wait,
|
|
+ !bch2_btree_interior_updates_nr_pending(c));
|
|
+ flush_work(&c->btree_interior_update_work);
|
|
+
|
|
+ for_each_member_device(ca, c, i)
|
|
+ bch2_dev_allocator_stop(ca);
|
|
+
|
|
+ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
|
|
+ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
|
|
+
|
|
+ bch2_fs_journal_stop(&c->journal);
|
|
+
|
|
+ /*
|
|
+ * the journal kicks off btree writes via reclaim - wait for in flight
|
|
+ * writes after stopping journal:
|
|
+ */
|
|
+ bch2_btree_flush_all_writes(c);
|
|
+
|
|
+ /*
|
|
+ * After stopping journal:
|
|
+ */
|
|
+ for_each_member_device(ca, c, i)
|
|
+ bch2_dev_allocator_remove(c, ca);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Release callback of the c->writes percpu_ref (installed in
|
|
+ * bch2_fs_alloc()): records that write disabling has completed and wakes
|
|
+ * anyone sleeping on bch_read_only_wait.
|
|
+ */
|
|
+static void bch2_writes_disabled(struct percpu_ref *writes)
|
|
+{
|
|
+	struct bch_fs *fs = container_of(writes, struct bch_fs, writes);
|
|
+
|
|
+	set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &fs->flags);
|
|
+	wake_up(&bch_read_only_wait);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_fs_read_only - transition the filesystem from read-write to read-only
|
|
+ *
|
|
+ * Does nothing if BCH_FS_RW is not set. Otherwise kills the c->writes ref,
|
|
+ * tears down the RW machinery via __bch2_fs_read_only(), waits until all
|
|
+ * outstanding writes have drained, and marks the filesystem clean when no
|
|
+ * error condition was seen. Callers in this file hold c->state_lock for
|
|
+ * write.
|
|
+ */
|
|
+void bch2_fs_read_only(struct bch_fs *c)
|
|
+{
|
|
+ if (!test_bit(BCH_FS_RW, &c->flags)) {
|
|
+ BUG_ON(c->journal.reclaim_thread);
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
|
|
+
|
|
+ /*
|
|
+ * Block new foreground-end write operations from starting - any new
|
|
+ * writes will return -EROFS:
|
|
+ *
|
|
+ * (This is really blocking new _allocations_, writes to previously
|
|
+ * allocated space can still happen until stopping the allocator in
|
|
+ * bch2_dev_allocator_stop()).
|
|
+ */
|
|
+ percpu_ref_kill(&c->writes);
|
|
+
|
|
+ cancel_work_sync(&c->ec_stripe_delete_work);
|
|
+
|
|
+ /*
|
|
+ * If we're not doing an emergency shutdown, we want to wait on
|
|
+ * outstanding writes to complete so they don't see spurious errors due
|
|
+ * to shutting down the allocator:
|
|
+ *
|
|
+ * If we are doing an emergency shutdown outstanding writes may
|
|
+ * hang until we shutdown the allocator so we don't want to wait
|
|
+ * on outstanding writes before shutting everything down - but
|
|
+ * we do need to wait on them before returning and signalling
|
|
+ * that going RO is complete:
|
|
+ */
|
|
+ wait_event(bch_read_only_wait,
|
|
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
|
|
+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
|
|
+
|
|
+ __bch2_fs_read_only(c);
|
|
+
|
|
+ wait_event(bch_read_only_wait,
|
|
+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
|
|
+
|
|
+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
|
|
+
|
|
+ /* Only a started, error-free, fully flushed filesystem may be marked clean: */
|
|
+ if (!bch2_journal_error(&c->journal) &&
|
|
+ !test_bit(BCH_FS_ERROR, &c->flags) &&
|
|
+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
|
|
+ test_bit(BCH_FS_STARTED, &c->flags) &&
|
|
+ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) &&
|
|
+ !c->opts.norecovery) {
|
|
+ bch_verbose(c, "marking filesystem clean");
|
|
+ bch2_fs_mark_clean(c);
|
|
+ }
|
|
+
|
|
+ clear_bit(BCH_FS_RW, &c->flags);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Work item behind c->read_only_work: performs the read-only transition
|
|
+ * with c->state_lock held for write.
|
|
+ */
|
|
+static void bch2_fs_read_only_work(struct work_struct *work)
|
|
+{
|
|
+	struct bch_fs *fs = container_of(work, struct bch_fs, read_only_work);
|
|
+
|
|
+	down_write(&fs->state_lock);
|
|
+	bch2_fs_read_only(fs);
|
|
+	up_write(&fs->state_lock);
|
|
+}
|
|
+
|
|
+/* Queue the read-only transition (c->read_only_work) on system_long_wq. */
|
|
+static void bch2_fs_read_only_async(struct bch_fs *c)
|
|
+{
|
|
+ queue_work(system_long_wq, &c->read_only_work);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_fs_emergency_read_only - flag an emergency and go read-only async
|
|
+ *
|
|
+ * Sets BCH_FS_EMERGENCY_RO, halts the journal and queues the read-only
|
|
+ * work. The journal halt and the queueing happen on every call.
|
|
+ *
|
|
+ * Returns true if this call set the flag, false if it was already set.
|
|
+ */
|
|
+bool bch2_fs_emergency_read_only(struct bch_fs *c)
|
|
+{
|
|
+ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
|
|
+
|
|
+ bch2_journal_halt(&c->journal);
|
|
+ bch2_fs_read_only_async(c);
|
|
+
|
|
+ /* Let bch2_fs_read_only() waiters re-check BCH_FS_EMERGENCY_RO: */
|
|
+ wake_up(&bch_read_only_wait);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Second stage of going read-write: start the gc, copygc and rebalance
|
|
+ * threads in that order, then schedule the erasure-coding stripe delete work.
|
|
+ *
|
|
+ * Returns 0 on success or the error of the first thread that failed to
|
|
+ * start; threads started before the failure are left running.
|
|
+ */
|
|
+static int bch2_fs_read_write_late(struct bch_fs *c)
|
|
+{
|
|
+	int err = bch2_gc_thread_start(c);
|
|
+
|
|
+	if (err) {
|
|
+		bch_err(c, "error starting gc thread");
|
|
+		goto out;
|
|
+	}
|
|
+
|
|
+	err = bch2_copygc_start(c);
|
|
+	if (err) {
|
|
+		bch_err(c, "error starting copygc thread");
|
|
+		goto out;
|
|
+	}
|
|
+
|
|
+	err = bch2_rebalance_start(c);
|
|
+	if (err) {
|
|
+		bch_err(c, "error starting rebalance thread");
|
|
+		goto out;
|
|
+	}
|
|
+
|
|
+	schedule_work(&c->ec_stripe_delete_work);
|
|
+out:
|
|
+	return err;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * __bch2_fs_read_write - bring the filesystem from read-only to read-write
|
|
+ * @c:		filesystem
|
|
+ * @early:	set by bch2_fs_read_write_early(); when true the late stage
|
|
+ *		(bch2_fs_read_write_late()) is skipped
|
|
+ *
|
|
+ * Returns 0 on success or if the filesystem is already read-write, -EROFS
|
|
+ * if going read-write is not permitted, or the error of the step that
|
|
+ * failed. Once startup has begun, every failure unwinds through
|
|
+ * __bch2_fs_read_only() so that nothing already started is left running.
|
|
+ */
|
|
+static int __bch2_fs_read_write(struct bch_fs *c, bool early)
|
|
+{
|
|
+	struct bch_dev *ca;
|
|
+	unsigned i;
|
|
+	int ret;
|
|
+
|
|
+	if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
|
|
+		bch_err(c, "cannot go rw, unfixed btree errors");
|
|
+		return -EROFS;
|
|
+	}
|
|
+
|
|
+	if (test_bit(BCH_FS_RW, &c->flags))
|
|
+		return 0;
|
|
+
|
|
+	/*
|
|
+	 * nochanges is used for fsck -n mode - we have to allow going rw
|
|
+	 * during recovery for that to work:
|
|
+	 */
|
|
+	if (c->opts.norecovery ||
|
|
+	    (c->opts.nochanges &&
|
|
+	     (!early || c->opts.read_only)))
|
|
+		return -EROFS;
|
|
+
|
|
+	bch_info(c, "going read-write");
|
|
+
|
|
+	ret = bch2_fs_mark_dirty(c);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	/*
|
|
+	 * We need to write out a journal entry before we start doing btree
|
|
+	 * updates, to ensure that on unclean shutdown new journal blacklist
|
|
+	 * entries are created:
|
|
+	 */
|
|
+	bch2_journal_meta(&c->journal);
|
|
+
|
|
+	clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
|
|
+
|
|
+	for_each_rw_member(ca, c, i)
|
|
+		bch2_dev_allocator_add(c, ca);
|
|
+	bch2_recalc_capacity(c);
|
|
+
|
|
+	for_each_rw_member(ca, c, i) {
|
|
+		ret = bch2_dev_allocator_start(ca);
|
|
+		if (ret) {
|
|
+			bch_err(c, "error starting allocator threads");
|
|
+			/* Leaving the iterator early: drop its io_ref */
|
|
+			percpu_ref_put(&ca->io_ref);
|
|
+			goto err;
|
|
+		}
|
|
+	}
|
|
+
|
|
+	set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
|
|
+
|
|
+	for_each_rw_member(ca, c, i)
|
|
+		bch2_wake_allocator(ca);
|
|
+
|
|
+	ret = bch2_journal_reclaim_start(&c->journal);
|
|
+	if (ret) {
|
|
+		bch_err(c, "error starting journal reclaim: %i", ret);
|
|
+		/*
|
|
+		 * The allocator threads are already running at this point;
|
|
+		 * unwind like every other failure instead of returning
|
|
+		 * directly and leaving them behind:
|
|
+		 */
|
|
+		goto err;
|
|
+	}
|
|
+
|
|
+	if (!early) {
|
|
+		ret = bch2_fs_read_write_late(c);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+	}
|
|
+
|
|
+	percpu_ref_reinit(&c->writes);
|
|
+	set_bit(BCH_FS_RW, &c->flags);
|
|
+	set_bit(BCH_FS_WAS_RW, &c->flags);
|
|
+	return 0;
|
|
+err:
|
|
+	__bch2_fs_read_only(c);
|
|
+	return ret;
|
|
+}
|
|
+
|
|
+/* Go read-write, including the late stage (early == false). */
|
|
+int bch2_fs_read_write(struct bch_fs *c)
|
|
+{
|
|
+ return __bch2_fs_read_write(c, false);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Go read-write without the late stage (early == true); the late stage can
|
|
+ * be run afterwards, as bch2_fs_start() does. Caller must hold c->state_lock.
|
|
+ */
|
|
+int bch2_fs_read_write_early(struct bch_fs *c)
|
|
+{
|
|
+ lockdep_assert_held(&c->state_lock);
|
|
+
|
|
+ return __bch2_fs_read_write(c, true);
|
|
+}
|
|
+
|
|
+/* Filesystem startup/shutdown: */
|
|
+
|
|
+/*
|
|
+ * Final teardown of a struct bch_fs: runs the exit hook of each subsystem,
|
|
+ * frees the per-cpu data, pools and workqueues set up in bch2_fs_alloc(),
|
|
+ * frees @c itself and drops the module reference taken in bch2_fs_alloc().
|
|
+ * Reached through bch2_fs_release(), the release hook of c->kobj.
|
|
+ */
|
|
+static void __bch2_fs_free(struct bch_fs *c)
|
|
+{
|
|
+ unsigned i;
|
|
+ int cpu;
|
|
+
|
|
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
|
|
+ bch2_time_stats_exit(&c->times[i]);
|
|
+
|
|
+ bch2_fs_quota_exit(c);
|
|
+ bch2_fs_fsio_exit(c);
|
|
+ bch2_fs_ec_exit(c);
|
|
+ bch2_fs_encryption_exit(c);
|
|
+ bch2_fs_io_exit(c);
|
|
+ bch2_fs_btree_interior_update_exit(c);
|
|
+ bch2_fs_btree_iter_exit(c);
|
|
+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
|
|
+ bch2_fs_btree_cache_exit(c);
|
|
+ bch2_fs_replicas_exit(c);
|
|
+ bch2_fs_journal_exit(&c->journal);
|
|
+ bch2_io_clock_exit(&c->io_clock[WRITE]);
|
|
+ bch2_io_clock_exit(&c->io_clock[READ]);
|
|
+ bch2_fs_compress_exit(c);
|
|
+ bch2_journal_keys_free(&c->journal_keys);
|
|
+ bch2_journal_entries_free(&c->journal_entries);
|
|
+ percpu_free_rwsem(&c->mark_lock);
|
|
+
|
|
+ /* Free each CPU's ->iter buffer before the per-cpu area itself: */
|
|
+ if (c->btree_iters_bufs)
|
|
+ for_each_possible_cpu(cpu)
|
|
+ kfree(per_cpu_ptr(c->btree_iters_bufs, cpu)->iter);
|
|
+
|
|
+ free_percpu(c->online_reserved);
|
|
+ free_percpu(c->btree_iters_bufs);
|
|
+ free_percpu(c->pcpu);
|
|
+ mempool_exit(&c->large_bkey_pool);
|
|
+ mempool_exit(&c->btree_bounce_pool);
|
|
+ bioset_exit(&c->btree_bio);
|
|
+ mempool_exit(&c->fill_iter);
|
|
+ percpu_ref_exit(&c->writes);
|
|
+ kfree(rcu_dereference_protected(c->disk_groups, 1));
|
|
+ kfree(c->journal_seq_blacklist_table);
|
|
+ kfree(c->unused_inode_hints);
|
|
+ free_heap(&c->copygc_heap);
|
|
+
|
|
+ /* Workqueues may be NULL if bch2_fs_alloc() failed part way through: */
|
|
+ if (c->copygc_wq)
|
|
+ destroy_workqueue(c->copygc_wq);
|
|
+ if (c->wq)
|
|
+ destroy_workqueue(c->wq);
|
|
+
|
|
+ bch2_free_super(&c->disk_sb);
|
|
+ kvpfree(c, sizeof(*c));
|
|
+ module_put(THIS_MODULE);
|
|
+}
|
|
+
|
|
+/* kobject release hook for c->kobj: frees the filesystem that embeds @kobj. */
|
|
+static void bch2_fs_release(struct kobject *kobj)
|
|
+{
|
|
+	__bch2_fs_free(container_of(kobj, struct bch_fs, kobj));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * __bch2_fs_stop - first half of shutdown
|
|
+ *
|
|
+ * Marks the filesystem as stopping, goes read-only, removes the sysfs,
|
|
+ * debug and chardev presence, waits for or cancels outstanding background
|
|
+ * work, and frees each member device's superblock. Memory is released
|
|
+ * separately by bch2_fs_free().
|
|
+ */
|
|
+void __bch2_fs_stop(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ bch_verbose(c, "shutting down");
|
|
+
|
|
+ set_bit(BCH_FS_STOPPING, &c->flags);
|
|
+
|
|
+ cancel_work_sync(&c->journal_seq_blacklist_gc_work);
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+ bch2_fs_read_only(c);
|
|
+ up_write(&c->state_lock);
|
|
+
|
|
+ for_each_member_device(ca, c, i)
|
|
+ if (ca->kobj.state_in_sysfs &&
|
|
+ ca->disk_sb.bdev)
|
|
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
|
|
+
|
|
+ if (c->kobj.state_in_sysfs)
|
|
+ kobject_del(&c->kobj);
|
|
+
|
|
+ bch2_fs_debug_exit(c);
|
|
+ bch2_fs_chardev_exit(c);
|
|
+
|
|
+ kobject_put(&c->time_stats);
|
|
+ kobject_put(&c->opts_dir);
|
|
+ kobject_put(&c->internal);
|
|
+
|
|
+ /* btree prefetch might have kicked off reads in the background: */
|
|
+ bch2_btree_flush_all_reads(c);
|
|
+
|
|
+ for_each_member_device(ca, c, i)
|
|
+ cancel_work_sync(&ca->io_error_work);
|
|
+
|
|
+ cancel_work_sync(&c->btree_write_error_work);
|
|
+ cancel_work_sync(&c->read_only_work);
|
|
+
|
|
+ for (i = 0; i < c->sb.nr_devices; i++)
|
|
+ if (c->devs[i])
|
|
+ bch2_free_super(&c->devs[i]->disk_sb);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_fs_free - second half of shutdown
|
|
+ *
|
|
+ * Unlinks @c from bch_fs_list, waits for outstanding references on c->cl,
|
|
+ * frees every member device and drops the reference on c->kobj, whose
|
|
+ * release hook is bch2_fs_release(). Also serves as the error path of
|
|
+ * bch2_fs_alloc().
|
|
+ */
|
|
+void bch2_fs_free(struct bch_fs *c)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ mutex_lock(&bch_fs_list_lock);
|
|
+ list_del(&c->list);
|
|
+ mutex_unlock(&bch_fs_list_lock);
|
|
+
|
|
+ closure_sync(&c->cl);
|
|
+ closure_debug_destroy(&c->cl);
|
|
+
|
|
+ for (i = 0; i < c->sb.nr_devices; i++)
|
|
+ if (c->devs[i])
|
|
+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1));
|
|
+
|
|
+ bch_verbose(c, "shutdown complete");
|
|
+
|
|
+ kobject_put(&c->kobj);
|
|
+}
|
|
+
|
|
+/* Full shutdown: quiesce with __bch2_fs_stop(), then release with bch2_fs_free(). */
|
|
+void bch2_fs_stop(struct bch_fs *c)
|
|
+{
|
|
+ __bch2_fs_stop(c);
|
|
+ bch2_fs_free(c);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_fs_online - make a filesystem visible and register it
|
|
+ *
|
|
+ * Creates the character device, debug state and sysfs kobjects, brings
|
|
+ * every member device online in sysfs and adds @c to bch_fs_list.
|
|
+ *
|
|
+ * Returns NULL on success, or immediately if @c is already on a list;
|
|
+ * otherwise a static error string. Caller must hold bch_fs_list_lock.
|
|
+ */
|
|
+static const char *bch2_fs_online(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ const char *err = NULL;
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ lockdep_assert_held(&bch_fs_list_lock);
|
|
+
|
|
+ if (!list_empty(&c->list))
|
|
+ return NULL;
|
|
+
|
|
+ if (__bch2_uuid_to_fs(c->sb.uuid))
|
|
+ return "filesystem UUID already open";
|
|
+
|
|
+ ret = bch2_fs_chardev_init(c);
|
|
+ if (ret)
|
|
+ return "error creating character device";
|
|
+
|
|
+ bch2_fs_debug_init(c);
|
|
+
|
|
+ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
|
|
+ kobject_add(&c->internal, &c->kobj, "internal") ||
|
|
+ kobject_add(&c->opts_dir, &c->kobj, "options") ||
|
|
+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
|
|
+ bch2_opts_create_sysfs_files(&c->opts_dir))
|
|
+ return "error creating sysfs objects";
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+
|
|
+ /* Preset the failure message; cleared again once everything succeeded: */
|
|
+ err = "error creating sysfs objects";
|
|
+ __for_each_member_device(ca, c, i, NULL)
|
|
+ if (bch2_dev_sysfs_online(c, ca))
|
|
+ goto err;
|
|
+
|
|
+ list_add(&c->list, &bch_fs_list);
|
|
+ err = NULL;
|
|
+err:
|
|
+ up_write(&c->state_lock);
|
|
+ return err;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_fs_alloc - allocate and initialize a struct bch_fs from a superblock
|
|
+ * @sb:	superblock to copy into the new filesystem
|
|
+ * @opts:	options applied on top of the defaults and the superblock options
|
|
+ *
|
|
+ * Initializes locks, lists and work items, sets up every subsystem,
|
|
+ * allocates a struct bch_dev for each member that exists in the superblock
|
|
+ * and brings the filesystem online via bch2_fs_online().
|
|
+ *
|
|
+ * Returns the new filesystem, or NULL on any failure; once @c has been
|
|
+ * allocated, failures are cleaned up with bch2_fs_free().
|
|
+ */
|
|
+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
|
|
+{
|
|
+ struct bch_sb_field_members *mi;
|
|
+ struct bch_fs *c;
|
|
+ unsigned i, iter_size;
|
|
+ const char *err;
|
|
+
|
|
+ pr_verbose_init(opts, "");
|
|
+
|
|
+ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
|
|
+ if (!c)
|
|
+ goto out;
|
|
+
|
|
+ /* Dropped again by module_put() in __bch2_fs_free(): */
|
|
+ __module_get(THIS_MODULE);
|
|
+
|
|
+ closure_init(&c->cl, NULL);
|
|
+
|
|
+ c->kobj.kset = bcachefs_kset;
|
|
+ kobject_init(&c->kobj, &bch2_fs_ktype);
|
|
+ kobject_init(&c->internal, &bch2_fs_internal_ktype);
|
|
+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
|
|
+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
|
|
+
|
|
+ c->minor = -1;
|
|
+ c->disk_sb.fs_sb = true;
|
|
+
|
|
+ init_rwsem(&c->state_lock);
|
|
+ mutex_init(&c->sb_lock);
|
|
+ mutex_init(&c->replicas_gc_lock);
|
|
+ mutex_init(&c->btree_root_lock);
|
|
+ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
|
|
+
|
|
+ init_rwsem(&c->gc_lock);
|
|
+
|
|
+ for (i = 0; i < BCH_TIME_STAT_NR; i++)
|
|
+ bch2_time_stats_init(&c->times[i]);
|
|
+
|
|
+ bch2_fs_copygc_init(c);
|
|
+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache);
|
|
+ bch2_fs_allocator_background_init(c);
|
|
+ bch2_fs_allocator_foreground_init(c);
|
|
+ bch2_fs_rebalance_init(c);
|
|
+ bch2_fs_quota_init(c);
|
|
+
|
|
+ /* Must precede the first "goto err": bch2_fs_free() does list_del(&c->list) */
|
|
+ INIT_LIST_HEAD(&c->list);
|
|
+
|
|
+ mutex_init(&c->usage_scratch_lock);
|
|
+
|
|
+ mutex_init(&c->bio_bounce_pages_lock);
|
|
+
|
|
+ bio_list_init(&c->btree_write_error_list);
|
|
+ spin_lock_init(&c->btree_write_error_lock);
|
|
+ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
|
|
+
|
|
+ INIT_WORK(&c->journal_seq_blacklist_gc_work,
|
|
+ bch2_blacklist_entries_gc);
|
|
+
|
|
+ INIT_LIST_HEAD(&c->journal_entries);
|
|
+ INIT_LIST_HEAD(&c->journal_iters);
|
|
+
|
|
+ INIT_LIST_HEAD(&c->fsck_errors);
|
|
+ mutex_init(&c->fsck_error_lock);
|
|
+
|
|
+ INIT_LIST_HEAD(&c->ec_stripe_head_list);
|
|
+ mutex_init(&c->ec_stripe_head_lock);
|
|
+
|
|
+ INIT_LIST_HEAD(&c->ec_stripe_new_list);
|
|
+ mutex_init(&c->ec_stripe_new_lock);
|
|
+
|
|
+ spin_lock_init(&c->ec_stripes_heap_lock);
|
|
+
|
|
+ seqcount_init(&c->gc_pos_lock);
|
|
+
|
|
+ seqcount_init(&c->usage_lock);
|
|
+
|
|
+ sema_init(&c->io_in_flight, 64);
|
|
+
|
|
+ c->copy_gc_enabled = 1;
|
|
+ c->rebalance.enabled = 1;
|
|
+ c->promote_whole_extents = true;
|
|
+
|
|
+ c->journal.write_time = &c->times[BCH_TIME_journal_write];
|
|
+ c->journal.delay_time = &c->times[BCH_TIME_journal_delay];
|
|
+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
|
|
+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
|
|
+
|
|
+ bch2_fs_btree_cache_init_early(&c->btree_cache);
|
|
+
|
|
+ mutex_init(&c->sectors_available_lock);
|
|
+
|
|
+ if (percpu_init_rwsem(&c->mark_lock))
|
|
+ goto err;
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+
|
|
+ if (bch2_sb_to_fs(c, sb)) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid);
|
|
+
|
|
+ /* Option precedence, lowest to highest: defaults, superblock, @opts */
|
|
+ c->opts = bch2_opts_default;
|
|
+ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb));
|
|
+ bch2_opts_apply(&c->opts, opts);
|
|
+
|
|
+ c->block_bits = ilog2(c->opts.block_size);
|
|
+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);
|
|
+
|
|
+ if (bch2_fs_init_fault("fs_alloc"))
|
|
+ goto err;
|
|
+
|
|
+ iter_size = sizeof(struct sort_iter) +
|
|
+ (btree_blocks(c) + 1) * 2 *
|
|
+ sizeof(struct sort_iter_set);
|
|
+
|
|
+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
|
|
+
|
|
+ /* Short-circuit chain: the first step that fails jumps to err */
|
|
+ if (!(c->wq = alloc_workqueue("bcachefs",
|
|
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
|
|
+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
|
|
+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
|
|
+ percpu_ref_init(&c->writes, bch2_writes_disabled,
|
|
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
|
|
+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
|
|
+ bioset_init(&c->btree_bio, 1,
|
|
+ max(offsetof(struct btree_read_bio, bio),
|
|
+ offsetof(struct btree_write_bio, wbio.bio)),
|
|
+ BIOSET_NEED_BVECS) ||
|
|
+ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
|
|
+ !(c->btree_iters_bufs = alloc_percpu(struct btree_iter_buf)) ||
|
|
+ !(c->online_reserved = alloc_percpu(u64)) ||
|
|
+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
|
|
+ btree_bytes(c)) ||
|
|
+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
|
|
+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
|
|
+ sizeof(u64), GFP_KERNEL)) ||
|
|
+ bch2_io_clock_init(&c->io_clock[READ]) ||
|
|
+ bch2_io_clock_init(&c->io_clock[WRITE]) ||
|
|
+ bch2_fs_journal_init(&c->journal) ||
|
|
+ bch2_fs_replicas_init(c) ||
|
|
+ bch2_fs_btree_cache_init(c) ||
|
|
+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ||
|
|
+ bch2_fs_btree_iter_init(c) ||
|
|
+ bch2_fs_btree_interior_update_init(c) ||
|
|
+ bch2_fs_io_init(c) ||
|
|
+ bch2_fs_encryption_init(c) ||
|
|
+ bch2_fs_compress_init(c) ||
|
|
+ bch2_fs_ec_init(c) ||
|
|
+ bch2_fs_fsio_init(c))
|
|
+ goto err;
|
|
+
|
|
+ mi = bch2_sb_get_members(c->disk_sb.sb);
|
|
+ for (i = 0; i < c->sb.nr_devices; i++)
|
|
+ if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
|
|
+ bch2_dev_alloc(c, i))
|
|
+ goto err;
|
|
+
|
|
+ bch2_journal_entry_res_resize(&c->journal,
|
|
+ &c->btree_root_journal_res,
|
|
+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX));
|
|
+ bch2_dev_usage_journal_reserve(c);
|
|
+ bch2_journal_entry_res_resize(&c->journal,
|
|
+ &c->clock_journal_res,
|
|
+ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2);
|
|
+
|
|
+ mutex_lock(&bch_fs_list_lock);
|
|
+ err = bch2_fs_online(c);
|
|
+ mutex_unlock(&bch_fs_list_lock);
|
|
+ if (err) {
|
|
+ bch_err(c, "bch2_fs_online() error: %s", err);
|
|
+ goto err;
|
|
+ }
|
|
+out:
|
|
+ pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
|
|
+ return c;
|
|
+err:
|
|
+ bch2_fs_free(c);
|
|
+ c = NULL;
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Log the mount options that differ from their defaults as a comma
|
|
+ * separated list, preceded by "ro" for a read-only mount. The buffer is
|
|
+ * preloaded with "(null)", which is what gets logged when nothing is
|
|
+ * appended.
|
|
+ */
|
|
+noinline_for_stack
|
|
+static void print_mount_opts(struct bch_fs *c)
|
|
+{
|
|
+	enum bch_opt_id id;
|
|
+	char buf[512];
|
|
+	struct printbuf out = PBUF(buf);
|
|
+	bool need_comma = false;
|
|
+
|
|
+	strcpy(buf, "(null)");
|
|
+
|
|
+	if (c->opts.read_only) {
|
|
+		pr_buf(&out, "ro");
|
|
+		need_comma = true;
|
|
+	}
|
|
+
|
|
+	for (id = 0; id < bch2_opts_nr; id++) {
|
|
+		const struct bch_option *opt = &bch2_opt_table[id];
|
|
+		u64 v = bch2_opt_get_by_id(&c->opts, id);
|
|
+
|
|
+		/* Only mount options that differ from the default are shown: */
|
|
+		if (!(opt->mode & OPT_MOUNT) ||
|
|
+		    v == bch2_opt_get_by_id(&bch2_opts_default, id))
|
|
+			continue;
|
|
+
|
|
+		if (need_comma)
|
|
+			pr_buf(&out, ",");
|
|
+		need_comma = true;
|
|
+		bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_MOUNT_STYLE);
|
|
+	}
|
|
+
|
|
+	bch_info(c, "mounted with opts: %s", buf);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * bch2_fs_start - start a filesystem allocated by bch2_fs_alloc()
|
|
+ *
|
|
+ * Writes the filesystem superblock to the online members, stamps their
|
|
+ * last_mount time, runs recovery (or initialization when the superblock is
|
|
+ * not yet marked initialized) and then goes read-only or read-write
|
|
+ * according to the mount options.
|
|
+ *
|
|
+ * Must only be called once per filesystem (BUG_ON if already started).
|
|
+ * Takes c->state_lock for write for the duration.
|
|
+ *
|
|
+ * Returns 0 on success or a negative error code; a non-negative failure
|
|
+ * code from a startup step is reported as -EIO.
|
|
+ *
|
|
+ * NOTE(review): @err is assigned on the error paths but never printed or
|
|
+ * returned in this function.
|
|
+ */
|
|
+int bch2_fs_start(struct bch_fs *c)
|
|
+{
|
|
+	const char *err = "cannot allocate memory";
|
|
+	struct bch_sb_field_members *mi;
|
|
+	struct bch_dev *ca;
|
|
+	time64_t now = ktime_get_real_seconds();
|
|
+	unsigned i;
|
|
+	int ret = -EINVAL;
|
|
+
|
|
+	down_write(&c->state_lock);
|
|
+
|
|
+	BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
|
|
+
|
|
+	mutex_lock(&c->sb_lock);
|
|
+
|
|
+	for_each_online_member(ca, c, i)
|
|
+		bch2_sb_from_fs(c, ca);
|
|
+
|
|
+	mi = bch2_sb_get_members(c->disk_sb.sb);
|
|
+	for_each_online_member(ca, c, i)
|
|
+		mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
|
|
+
|
|
+	mutex_unlock(&c->sb_lock);
|
|
+
|
|
+	for_each_rw_member(ca, c, i)
|
|
+		bch2_dev_allocator_add(c, ca);
|
|
+	bch2_recalc_capacity(c);
|
|
+
|
|
+	ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
|
|
+		? bch2_fs_recovery(c)
|
|
+		: bch2_fs_initialize(c);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	ret = bch2_opts_check_may_set(c);
|
|
+	if (ret)
|
|
+		goto err;
|
|
+
|
|
+	err = "dynamic fault";
|
|
+	ret = -EINVAL;
|
|
+	if (bch2_fs_init_fault("fs_start"))
|
|
+		goto err;
|
|
+
|
|
+	set_bit(BCH_FS_STARTED, &c->flags);
|
|
+
|
|
+	/*
|
|
+	 * Allocator threads don't start filling copygc reserve until after we
|
|
+	 * set BCH_FS_STARTED - wake them now:
|
|
+	 *
|
|
+	 * XXX ugly hack:
|
|
+	 * Need to set ca->allocator_state here instead of relying on the
|
|
+	 * allocator threads to do it to avoid racing with the copygc threads
|
|
+	 * checking it and thinking they have no alloc reserve:
|
|
+	 */
|
|
+	for_each_online_member(ca, c, i) {
|
|
+		ca->allocator_state = ALLOCATOR_running;
|
|
+		bch2_wake_allocator(ca);
|
|
+	}
|
|
+
|
|
+	if (c->opts.read_only || c->opts.nochanges) {
|
|
+		bch2_fs_read_only(c);
|
|
+	} else {
|
|
+		err = "error going read write";
|
|
+		/* If BCH_FS_RW is already set, only the late stage remains: */
|
|
+		ret = !test_bit(BCH_FS_RW, &c->flags)
|
|
+			? bch2_fs_read_write(c)
|
|
+			: bch2_fs_read_write_late(c);
|
|
+		if (ret)
|
|
+			goto err;
|
|
+	}
|
|
+
|
|
+	print_mount_opts(c);
|
|
+	ret = 0;
|
|
+out:
|
|
+	up_write(&c->state_lock);
|
|
+	return ret;
|
|
+err:
|
|
+	switch (ret) {
|
|
+	case BCH_FSCK_ERRORS_NOT_FIXED:
|
|
+		bch_err(c, "filesystem contains errors: please report this to the developers");
|
|
+		pr_cont("mount with -o fix_errors to repair\n");
|
|
+		err = "fsck error";
|
|
+		break;
|
|
+	case BCH_FSCK_REPAIR_UNIMPLEMENTED:
|
|
+		bch_err(c, "filesystem contains errors: please report this to the developers");
|
|
+		pr_cont("repair unimplemented: inform the developers so that it can be added\n");
|
|
+		err = "fsck error";
|
|
+		break;
|
|
+	case BCH_FSCK_REPAIR_IMPOSSIBLE:
|
|
+		bch_err(c, "filesystem contains errors, but repair impossible");
|
|
+		err = "fsck error";
|
|
+		break;
|
|
+	case BCH_FSCK_UNKNOWN_VERSION:
|
|
+		err = "unknown metadata version";
|
|
+		break;
|
|
+	case -ENOMEM:
|
|
+		err = "cannot allocate memory";
|
|
+		break;
|
|
+	case -EIO:
|
|
+		err = "IO error";
|
|
+		break;
|
|
+	}
|
|
+
|
|
+	/* Callers get a negative errno: map any other failure code to -EIO */
|
|
+	if (ret >= 0)
|
|
+		ret = -EIO;
|
|
+	goto out;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check whether the device described by superblock @sb is compatible with
|
|
+ * filesystem @c: it needs a members field, the same block size, and a
|
|
+ * bucket size no smaller than the filesystem's btree node size.
|
|
+ *
|
|
+ * Returns NULL if the device may be added, otherwise a static error string.
|
|
+ */
|
|
+static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
|
|
+{
|
|
+	struct bch_sb_field_members *sb_mi = bch2_sb_get_members(sb);
|
|
+	u16 bucket_size;
|
|
+
|
|
+	if (!sb_mi)
|
|
+		return "Invalid superblock: member info area missing";
|
|
+
|
|
+	if (le16_to_cpu(sb->block_size) != c->opts.block_size)
|
|
+		return "mismatched block size";
|
|
+
|
|
+	bucket_size = le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size);
|
|
+	if (bucket_size < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
|
|
+		return "new cache bucket size is too small";
|
|
+
|
|
+	return NULL;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Check that the device with superblock @sb belongs to the filesystem with
|
|
+ * superblock @fs. Membership is looked up in whichever of the two
|
|
+ * superblocks has the higher sequence number (@sb on a tie).
|
|
+ *
|
|
+ * Returns NULL if the device is a member, otherwise a static error string.
|
|
+ */
|
|
+static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
|
|
+{
|
|
+	struct bch_sb_field_members *mi;
|
|
+	struct bch_sb *newest;
|
|
+
|
|
+	if (le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq))
|
|
+		newest = fs;
|
|
+	else
|
|
+		newest = sb;
|
|
+
|
|
+	mi = bch2_sb_get_members(newest);
|
|
+
|
|
+	if (uuid_le_cmp(fs->uuid, sb->uuid) != 0)
|
|
+		return "device not a member of filesystem";
|
|
+
|
|
+	if (!bch2_dev_exists(newest, mi, sb->dev_idx))
|
|
+		return "device has been removed";
|
|
+
|
|
+	if (fs->block_size != sb->block_size)
|
|
+		return "mismatched block size";
|
|
+
|
|
+	return NULL;
|
|
+}
|
|
+
|
|
+/* Device startup/shutdown: */
|
|
+
|
|
+/* kobject release hook for a device: frees the struct bch_dev embedding @kobj. */
|
|
+static void bch2_dev_release(struct kobject *kobj)
|
|
+{
|
|
+	kfree(container_of(kobj, struct bch_dev, kobj));
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Tear down a struct bch_dev: stops its allocator, cancels its error work,
|
|
+ * removes its sysfs presence, and frees its superblock, journal state,
|
|
+ * buckets and other resources. The memory itself is freed by
|
|
+ * bch2_dev_release() through the final kobject_put().
|
|
+ *
|
|
+ * Also used on the error path of __bch2_dev_alloc(), i.e. on a device that
|
|
+ * is only partially set up.
|
|
+ */
|
|
+static void bch2_dev_free(struct bch_dev *ca)
|
|
+{
|
|
+ bch2_dev_allocator_stop(ca);
|
|
+
|
|
+ cancel_work_sync(&ca->io_error_work);
|
|
+
|
|
+ if (ca->kobj.state_in_sysfs &&
|
|
+ ca->disk_sb.bdev)
|
|
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
|
|
+
|
|
+ if (ca->kobj.state_in_sysfs)
|
|
+ kobject_del(&ca->kobj);
|
|
+
|
|
+ bch2_free_super(&ca->disk_sb);
|
|
+ bch2_dev_journal_exit(ca);
|
|
+
|
|
+ free_percpu(ca->io_done);
|
|
+ bioset_exit(&ca->replica_set);
|
|
+ bch2_dev_buckets_free(ca);
|
|
+ free_page((unsigned long) ca->sb_read_scratch);
|
|
+
|
|
+ bch2_time_stats_exit(&ca->io_latency[WRITE]);
|
|
+ bch2_time_stats_exit(&ca->io_latency[READ]);
|
|
+
|
|
+ percpu_ref_exit(&ca->io_ref);
|
|
+ percpu_ref_exit(&ca->ref);
|
|
+ kobject_put(&ca->kobj);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Take a member device offline. Does nothing if its io_ref is already
|
|
+ * zero. Otherwise the device is made read-only, its io_ref is killed and
|
|
+ * waited on until it drains, its sysfs links are removed and its superblock
|
|
+ * and journal state are freed. Caller must hold c->state_lock.
|
|
+ */
|
|
+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+
|
|
+ lockdep_assert_held(&c->state_lock);
|
|
+
|
|
+ if (percpu_ref_is_zero(&ca->io_ref))
|
|
+ return;
|
|
+
|
|
+ __bch2_dev_read_only(c, ca);
|
|
+
|
|
+ /* Completed by bch2_dev_io_ref_complete(), the io_ref release callback: */
|
|
+ reinit_completion(&ca->io_ref_completion);
|
|
+ percpu_ref_kill(&ca->io_ref);
|
|
+ wait_for_completion(&ca->io_ref_completion);
|
|
+
|
|
+ if (ca->kobj.state_in_sysfs) {
|
|
+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
|
|
+ sysfs_remove_link(&ca->kobj, "block");
|
|
+ }
|
|
+
|
|
+ bch2_free_super(&ca->disk_sb);
|
|
+ bch2_dev_journal_exit(ca);
|
|
+}
|
|
+
|
|
+/* Release callback of ca->ref: signals ca->ref_completion. */
|
|
+static void bch2_dev_ref_complete(struct percpu_ref *ref)
|
|
+{
|
|
+	struct bch_dev *dev = container_of(ref, struct bch_dev, ref);
|
|
+
|
|
+	complete(&dev->ref_completion);
|
|
+}
|
|
+
|
|
+/* Release callback of ca->io_ref: signals ca->io_ref_completion. */
|
|
+static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
|
|
+{
|
|
+	struct bch_dev *dev = container_of(ref, struct bch_dev, io_ref);
|
|
+
|
|
+	complete(&dev->io_ref_completion);
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Create the sysfs presence of member device @ca under filesystem @c: the
|
|
+ * "dev-<idx>" kobject, if not yet added, and, when a block device is
|
|
+ * attached, the "bcachefs" and "block" cross links.
|
|
+ *
|
|
+ * Does nothing while the filesystem itself is not in sysfs. Returns 0 on
|
|
+ * success or the error of the step that failed.
|
|
+ */
|
|
+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+	struct kobject *block;
|
|
+	int ret;
|
|
+
|
|
+	if (!c->kobj.state_in_sysfs)
|
|
+		return 0;
|
|
+
|
|
+	if (!ca->kobj.state_in_sysfs) {
|
|
+		ret = kobject_add(&ca->kobj, &c->kobj,
|
|
+				  "dev-%u", ca->dev_idx);
|
|
+		if (ret)
|
|
+			return ret;
|
|
+	}
|
|
+
|
|
+	if (!ca->disk_sb.bdev)
|
|
+		return 0;
|
|
+
|
|
+	block = bdev_kobj(ca->disk_sb.bdev);
|
|
+
|
|
+	ret = sysfs_create_link(block, &ca->kobj, "bcachefs");
|
|
+	if (ret)
|
|
+		return ret;
|
|
+
|
|
+	return sysfs_create_link(&ca->kobj, block, "block");
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Allocate and initialize a struct bch_dev from on-disk member entry
|
|
+ * @member. The device is not attached to @c here; @c supplies the options
|
|
+ * (a "discard" option overrides the member's own setting) and is passed on
|
|
+ * to the bucket allocation.
|
|
+ *
|
|
+ * Returns the new device, or NULL on failure; a partially initialized
|
|
+ * device is torn down with bch2_dev_free().
|
|
+ */
|
|
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
|
|
+ struct bch_member *member)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
|
|
+ if (!ca)
|
|
+ return NULL;
|
|
+
|
|
+ kobject_init(&ca->kobj, &bch2_dev_ktype);
|
|
+ init_completion(&ca->ref_completion);
|
|
+ init_completion(&ca->io_ref_completion);
|
|
+
|
|
+ init_rwsem(&ca->bucket_lock);
|
|
+
|
|
+ INIT_WORK(&ca->io_error_work, bch2_io_error_work);
|
|
+
|
|
+ bch2_time_stats_init(&ca->io_latency[READ]);
|
|
+ bch2_time_stats_init(&ca->io_latency[WRITE]);
|
|
+
|
|
+ ca->mi = bch2_mi_to_cpu(member);
|
|
+ ca->uuid = member->uuid;
|
|
+
|
|
+ if (opt_defined(c->opts, discard))
|
|
+ ca->mi.discard = opt_get(c->opts, discard);
|
|
+
|
|
+ /* io_ref is created in the dead state (PERCPU_REF_INIT_DEAD): */
|
|
+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
|
|
+ 0, GFP_KERNEL) ||
|
|
+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
|
|
+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
|
|
+ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) ||
|
|
+ bch2_dev_buckets_alloc(c, ca) ||
|
|
+ bioset_init(&ca->replica_set, 4,
|
|
+ offsetof(struct bch_write_bio, bio), 0) ||
|
|
+ !(ca->io_done = alloc_percpu(*ca->io_done)))
|
|
+ goto err;
|
|
+
|
|
+ return ca;
|
|
+err:
|
|
+ bch2_dev_free(ca);
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
|
|
+ unsigned dev_idx)
|
|
+{
|
|
+ ca->dev_idx = dev_idx;
|
|
+ __set_bit(ca->dev_idx, ca->self.d);
|
|
+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
|
|
+
|
|
+ ca->fs = c;
|
|
+ rcu_assign_pointer(c->devs[ca->dev_idx], ca);
|
|
+
|
|
+ if (bch2_dev_sysfs_online(c, ca))
|
|
+ pr_warn("error creating sysfs objects");
|
|
+}
|
|
+
|
|
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
|
|
+{
|
|
+ struct bch_member *member =
|
|
+ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
|
|
+ struct bch_dev *ca = NULL;
|
|
+ int ret = 0;
|
|
+
|
|
+ pr_verbose_init(c->opts, "");
|
|
+
|
|
+ if (bch2_fs_init_fault("dev_alloc"))
|
|
+ goto err;
|
|
+
|
|
+ ca = __bch2_dev_alloc(c, member);
|
|
+ if (!ca)
|
|
+ goto err;
|
|
+
|
|
+ ca->fs = c;
|
|
+
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw &&
|
|
+ bch2_dev_allocator_start(ca)) {
|
|
+ bch2_dev_free(ca);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_dev_attach(c, ca, dev_idx);
|
|
+out:
|
|
+ pr_verbose_init(c->opts, "ret %i", ret);
|
|
+ return ret;
|
|
+err:
|
|
+ if (ca)
|
|
+ bch2_dev_free(ca);
|
|
+ ret = -ENOMEM;
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
|
|
+{
|
|
+ unsigned ret;
|
|
+
|
|
+ if (bch2_dev_is_online(ca)) {
|
|
+ bch_err(ca, "already have device online in slot %u",
|
|
+ sb->sb->dev_idx);
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ if (get_capacity(sb->bdev->bd_disk) <
|
|
+ ca->mi.bucket_size * ca->mi.nbuckets) {
|
|
+ bch_err(ca, "cannot online: device too small");
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
|
|
+
|
|
+ if (get_capacity(sb->bdev->bd_disk) <
|
|
+ ca->mi.bucket_size * ca->mi.nbuckets) {
|
|
+ bch_err(ca, "device too small");
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ ret = bch2_dev_journal_init(ca, sb->sb);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ /* Commit: */
|
|
+ ca->disk_sb = *sb;
|
|
+ if (sb->mode & FMODE_EXCL)
|
|
+ ca->disk_sb.bdev->bd_holder = ca;
|
|
+ memset(sb, 0, sizeof(*sb));
|
|
+
|
|
+ percpu_ref_reinit(&ca->io_ref);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ int ret;
|
|
+
|
|
+ lockdep_assert_held(&c->state_lock);
|
|
+
|
|
+ if (le64_to_cpu(sb->sb->seq) >
|
|
+ le64_to_cpu(c->disk_sb.sb->seq))
|
|
+ bch2_sb_to_fs(c, sb->sb);
|
|
+
|
|
+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
|
|
+ !c->devs[sb->sb->dev_idx]);
|
|
+
|
|
+ ca = bch_dev_locked(c, sb->sb->dev_idx);
|
|
+
|
|
+ ret = __bch2_dev_attach_bdev(ca, sb);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ bch2_dev_sysfs_online(c, ca);
|
|
+
|
|
+ if (c->sb.nr_devices == 1)
|
|
+ bdevname(ca->disk_sb.bdev, c->name);
|
|
+ bdevname(ca->disk_sb.bdev, ca->name);
|
|
+
|
|
+ rebalance_wakeup(c);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* Device management: */
|
|
+
|
|
+/*
|
|
+ * Note: this function is also used by the error paths - when a particular
|
|
+ * device sees an error, we call it to determine whether we can just set the
|
|
+ * device RO, or - if this function returns false - we'll set the whole
|
|
+ * filesystem RO:
|
|
+ *
|
|
+ * XXX: maybe we should be more explicit about whether we're changing state
|
|
+ * because we got an error or what have you?
|
|
+ */
|
|
+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
|
|
+ enum bch_member_state new_state, int flags)
|
|
+{
|
|
+ struct bch_devs_mask new_online_devs;
|
|
+ struct bch_dev *ca2;
|
|
+ int i, nr_rw = 0, required;
|
|
+
|
|
+ lockdep_assert_held(&c->state_lock);
|
|
+
|
|
+ switch (new_state) {
|
|
+ case BCH_MEMBER_STATE_rw:
|
|
+ return true;
|
|
+ case BCH_MEMBER_STATE_ro:
|
|
+ if (ca->mi.state != BCH_MEMBER_STATE_rw)
|
|
+ return true;
|
|
+
|
|
+ /* do we have enough devices to write to? */
|
|
+ for_each_member_device(ca2, c, i)
|
|
+ if (ca2 != ca)
|
|
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
|
|
+
|
|
+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
|
|
+ ? c->opts.metadata_replicas
|
|
+ : c->opts.metadata_replicas_required,
|
|
+ !(flags & BCH_FORCE_IF_DATA_DEGRADED)
|
|
+ ? c->opts.data_replicas
|
|
+ : c->opts.data_replicas_required);
|
|
+
|
|
+ return nr_rw >= required;
|
|
+ case BCH_MEMBER_STATE_failed:
|
|
+ case BCH_MEMBER_STATE_spare:
|
|
+ if (ca->mi.state != BCH_MEMBER_STATE_rw &&
|
|
+ ca->mi.state != BCH_MEMBER_STATE_ro)
|
|
+ return true;
|
|
+
|
|
+ /* do we have enough devices to read from? */
|
|
+ new_online_devs = bch2_online_devs(c);
|
|
+ __clear_bit(ca->dev_idx, new_online_devs.d);
|
|
+
|
|
+ return bch2_have_enough_devs(c, new_online_devs, flags, false);
|
|
+ default:
|
|
+ BUG();
|
|
+ }
|
|
+}
|
|
+
|
|
+static bool bch2_fs_may_start(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_sb_field_members *mi;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i, flags = 0;
|
|
+
|
|
+ if (c->opts.very_degraded)
|
|
+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
|
|
+
|
|
+ if (c->opts.degraded)
|
|
+ flags |= BCH_FORCE_IF_DEGRADED;
|
|
+
|
|
+ if (!c->opts.degraded &&
|
|
+ !c->opts.very_degraded) {
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ mi = bch2_sb_get_members(c->disk_sb.sb);
|
|
+
|
|
+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
|
|
+ if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
|
|
+ continue;
|
|
+
|
|
+ ca = bch_dev_locked(c, i);
|
|
+
|
|
+ if (!bch2_dev_is_online(ca) &&
|
|
+ (ca->mi.state == BCH_MEMBER_STATE_rw ||
|
|
+ ca->mi.state == BCH_MEMBER_STATE_ro)) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ return false;
|
|
+ }
|
|
+ }
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+
|
|
+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true);
|
|
+}
|
|
+
|
|
/*
 * Stop everything that writes to @ca: its allocator thread, its membership
 * in the allocator's device set and its use by the journal.  copygc is
 * stopped for the duration and restarted afterwards.
 */
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
	/*
	 * Device going read only means the copygc reserve get smaller, so we
	 * don't want that happening while copygc is in progress:
	 */
	bch2_copygc_stop(c);

	/*
	 * The allocator thread itself allocates btree nodes, so stop it first:
	 */
	bch2_dev_allocator_stop(ca);
	bch2_dev_allocator_remove(c, ca);
	bch2_dev_journal_stop(&c->journal, ca);

	bch2_copygc_start(c);
}
|
|
+
|
|
+static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ lockdep_assert_held(&c->state_lock);
|
|
+
|
|
+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw);
|
|
+
|
|
+ bch2_dev_allocator_add(c, ca);
|
|
+ bch2_recalc_capacity(c);
|
|
+
|
|
+ if (bch2_dev_allocator_start(ca))
|
|
+ return "error starting allocator thread";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
|
|
+ enum bch_member_state new_state, int flags)
|
|
+{
|
|
+ struct bch_sb_field_members *mi;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (ca->mi.state == new_state)
|
|
+ return 0;
|
|
+
|
|
+ if (!bch2_dev_state_allowed(c, ca, new_state, flags))
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (new_state != BCH_MEMBER_STATE_rw)
|
|
+ __bch2_dev_read_only(c, ca);
|
|
+
|
|
+ bch_notice(ca, "%s", bch2_member_states[new_state]);
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ mi = bch2_sb_get_members(c->disk_sb.sb);
|
|
+ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ if (new_state == BCH_MEMBER_STATE_rw &&
|
|
+ __bch2_dev_read_write(c, ca))
|
|
+ ret = -ENOMEM;
|
|
+
|
|
+ rebalance_wakeup(c);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
|
|
+ enum bch_member_state new_state, int flags)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+ ret = __bch2_dev_set_state(c, ca, new_state, flags);
|
|
+ up_write(&c->state_lock);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Device add/removal: */
|
|
+
|
|
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ size_t i;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for (i = 0; i < ca->mi.nbuckets; i++) {
|
|
+ ret = bch2_btree_key_cache_flush(&trans,
|
|
+ BTREE_ID_alloc, POS(ca->dev_idx, i));
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ return bch2_btree_delete_range(c, BTREE_ID_alloc,
|
|
+ POS(ca->dev_idx, 0),
|
|
+ POS(ca->dev_idx + 1, 0),
|
|
+ NULL);
|
|
+}
|
|
+
|
|
+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
+{
|
|
+ struct bch_sb_field_members *mi;
|
|
+ unsigned dev_idx = ca->dev_idx, data;
|
|
+ int ret = -EINVAL;
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+
|
|
+ /*
|
|
+ * We consume a reference to ca->ref, regardless of whether we succeed
|
|
+ * or fail:
|
|
+ */
|
|
+ percpu_ref_put(&ca->ref);
|
|
+
|
|
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
|
|
+ bch_err(ca, "Cannot remove without losing data");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ __bch2_dev_read_only(c, ca);
|
|
+
|
|
+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
|
|
+ if (ret) {
|
|
+ bch_err(ca, "Remove failed: error %i dropping data", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
|
|
+ if (ret) {
|
|
+ bch_err(ca, "Remove failed: error %i flushing journal", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = bch2_dev_remove_alloc(c, ca);
|
|
+ if (ret) {
|
|
+ bch_err(ca, "Remove failed, error deleting alloc info");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * must flush all existing journal entries, they might have
|
|
+ * (overwritten) keys that point to the device we're removing:
|
|
+ */
|
|
+ bch2_journal_flush_all_pins(&c->journal);
|
|
+ /*
|
|
+ * hack to ensure bch2_replicas_gc2() clears out entries to this device
|
|
+ */
|
|
+ bch2_journal_meta(&c->journal);
|
|
+ ret = bch2_journal_error(&c->journal);
|
|
+ if (ret) {
|
|
+ bch_err(ca, "Remove failed, journal error");
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = bch2_replicas_gc2(c);
|
|
+ if (ret) {
|
|
+ bch_err(ca, "Remove failed: error %i from replicas gc", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ data = bch2_dev_has_data(c, ca);
|
|
+ if (data) {
|
|
+ char data_has_str[100];
|
|
+
|
|
+ bch2_flags_to_text(&PBUF(data_has_str),
|
|
+ bch2_data_types, data);
|
|
+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str);
|
|
+ ret = -EBUSY;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ __bch2_dev_offline(c, ca);
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ percpu_ref_kill(&ca->ref);
|
|
+ wait_for_completion(&ca->ref_completion);
|
|
+
|
|
+ bch2_dev_free(ca);
|
|
+
|
|
+ /*
|
|
+ * Free this device's slot in the bch_member array - all pointers to
|
|
+ * this device must be gone:
|
|
+ */
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ mi = bch2_sb_get_members(c->disk_sb.sb);
|
|
+ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
|
|
+
|
|
+ bch2_write_super(c);
|
|
+
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ up_write(&c->state_lock);
|
|
+
|
|
+ bch2_dev_usage_journal_reserve(c);
|
|
+ return 0;
|
|
+err:
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw &&
|
|
+ !percpu_ref_is_zero(&ca->io_ref))
|
|
+ __bch2_dev_read_write(c, ca);
|
|
+ up_write(&c->state_lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* Add new device to running filesystem: */
|
|
+int bch2_dev_add(struct bch_fs *c, const char *path)
|
|
+{
|
|
+ struct bch_opts opts = bch2_opts_empty();
|
|
+ struct bch_sb_handle sb;
|
|
+ const char *err;
|
|
+ struct bch_dev *ca = NULL;
|
|
+ struct bch_sb_field_members *mi;
|
|
+ struct bch_member dev_mi;
|
|
+ unsigned dev_idx, nr_devices, u64s;
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_read_super(path, &opts, &sb);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ err = bch2_sb_validate(&sb);
|
|
+ if (err)
|
|
+ return -EINVAL;
|
|
+
|
|
+ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
|
|
+
|
|
+ err = bch2_dev_may_add(sb.sb, c);
|
|
+ if (err)
|
|
+ return -EINVAL;
|
|
+
|
|
+ ca = __bch2_dev_alloc(c, &dev_mi);
|
|
+ if (!ca) {
|
|
+ bch2_free_super(&sb);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_dev_attach_bdev(ca, &sb);
|
|
+ if (ret) {
|
|
+ bch2_dev_free(ca);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * We want to allocate journal on the new device before adding the new
|
|
+ * device to the filesystem because allocating after we attach requires
|
|
+ * spinning up the allocator thread, and the allocator thread requires
|
|
+ * doing btree writes, which if the existing devices are RO isn't going
|
|
+ * to work
|
|
+ *
|
|
+ * So we have to mark where the superblocks are, but marking allocated
|
|
+ * data normally updates the filesystem usage too, so we have to mark,
|
|
+ * allocate the journal, reset all the marks, then remark after we
|
|
+ * attach...
|
|
+ */
|
|
+ bch2_mark_dev_superblock(NULL, ca, 0);
|
|
+
|
|
+ err = "journal alloc failed";
|
|
+ ret = bch2_dev_journal_alloc(ca);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+ mutex_lock(&c->sb_lock);
|
|
+
|
|
+ err = "insufficient space in new superblock";
|
|
+ ret = bch2_sb_from_fs(c, ca);
|
|
+ if (ret)
|
|
+ goto err_unlock;
|
|
+
|
|
+ mi = bch2_sb_get_members(ca->disk_sb.sb);
|
|
+
|
|
+ if (!bch2_sb_resize_members(&ca->disk_sb,
|
|
+ le32_to_cpu(mi->field.u64s) +
|
|
+ sizeof(dev_mi) / sizeof(u64))) {
|
|
+ ret = -ENOSPC;
|
|
+ goto err_unlock;
|
|
+ }
|
|
+
|
|
+ if (dynamic_fault("bcachefs:add:no_slot"))
|
|
+ goto no_slot;
|
|
+
|
|
+ mi = bch2_sb_get_members(c->disk_sb.sb);
|
|
+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
|
|
+ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
|
|
+ goto have_slot;
|
|
+no_slot:
|
|
+ err = "no slots available in superblock";
|
|
+ ret = -ENOSPC;
|
|
+ goto err_unlock;
|
|
+
|
|
+have_slot:
|
|
+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
|
|
+ u64s = (sizeof(struct bch_sb_field_members) +
|
|
+ sizeof(struct bch_member) * nr_devices) / sizeof(u64);
|
|
+
|
|
+ err = "no space in superblock for member info";
|
|
+ ret = -ENOSPC;
|
|
+
|
|
+ mi = bch2_sb_resize_members(&c->disk_sb, u64s);
|
|
+ if (!mi)
|
|
+ goto err_unlock;
|
|
+
|
|
+ /* success: */
|
|
+
|
|
+ mi->members[dev_idx] = dev_mi;
|
|
+ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds());
|
|
+ c->disk_sb.sb->nr_devices = nr_devices;
|
|
+
|
|
+ ca->disk_sb.sb->dev_idx = dev_idx;
|
|
+ bch2_dev_attach(c, ca, dev_idx);
|
|
+
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ bch2_dev_usage_journal_reserve(c);
|
|
+
|
|
+ err = "error marking superblock";
|
|
+ ret = bch2_trans_mark_dev_sb(c, ca);
|
|
+ if (ret)
|
|
+ goto err_late;
|
|
+
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
|
|
+ err = __bch2_dev_read_write(c, ca);
|
|
+ if (err)
|
|
+ goto err_late;
|
|
+ }
|
|
+
|
|
+ up_write(&c->state_lock);
|
|
+ return 0;
|
|
+
|
|
+err_unlock:
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ up_write(&c->state_lock);
|
|
+err:
|
|
+ if (ca)
|
|
+ bch2_dev_free(ca);
|
|
+ bch2_free_super(&sb);
|
|
+ bch_err(c, "Unable to add device: %s", err);
|
|
+ return ret;
|
|
+err_late:
|
|
+ up_write(&c->state_lock);
|
|
+ bch_err(c, "Error going rw after adding device: %s", err);
|
|
+ return -EINVAL;
|
|
+}
|
|
+
|
|
+/* Hot add existing device to running filesystem: */
|
|
+int bch2_dev_online(struct bch_fs *c, const char *path)
|
|
+{
|
|
+ struct bch_opts opts = bch2_opts_empty();
|
|
+ struct bch_sb_handle sb = { NULL };
|
|
+ struct bch_sb_field_members *mi;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned dev_idx;
|
|
+ const char *err;
|
|
+ int ret;
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+
|
|
+ ret = bch2_read_super(path, &opts, &sb);
|
|
+ if (ret) {
|
|
+ up_write(&c->state_lock);
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ dev_idx = sb.sb->dev_idx;
|
|
+
|
|
+ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
|
|
+ if (err)
|
|
+ goto err;
|
|
+
|
|
+ if (bch2_dev_attach_bdev(c, &sb)) {
|
|
+ err = "bch2_dev_attach_bdev() error";
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ca = bch_dev_locked(c, dev_idx);
|
|
+
|
|
+ if (bch2_trans_mark_dev_sb(c, ca)) {
|
|
+ err = "bch2_trans_mark_dev_sb() error";
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) {
|
|
+ err = __bch2_dev_read_write(c, ca);
|
|
+ if (err)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ mi = bch2_sb_get_members(c->disk_sb.sb);
|
|
+
|
|
+ mi->members[ca->dev_idx].last_mount =
|
|
+ cpu_to_le64(ktime_get_real_seconds());
|
|
+
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ up_write(&c->state_lock);
|
|
+ return 0;
|
|
+err:
|
|
+ up_write(&c->state_lock);
|
|
+ bch2_free_super(&sb);
|
|
+ bch_err(c, "error bringing %s online: %s", path, err);
|
|
+ return -EINVAL;
|
|
+}
|
|
+
|
|
+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
|
|
+{
|
|
+ down_write(&c->state_lock);
|
|
+
|
|
+ if (!bch2_dev_is_online(ca)) {
|
|
+ bch_err(ca, "Already offline");
|
|
+ up_write(&c->state_lock);
|
|
+ return 0;
|
|
+ }
|
|
+
|
|
+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) {
|
|
+ bch_err(ca, "Cannot offline required disk");
|
|
+ up_write(&c->state_lock);
|
|
+ return -EINVAL;
|
|
+ }
|
|
+
|
|
+ __bch2_dev_offline(c, ca);
|
|
+
|
|
+ up_write(&c->state_lock);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
|
|
+{
|
|
+ struct bch_member *mi;
|
|
+ int ret = 0;
|
|
+
|
|
+ down_write(&c->state_lock);
|
|
+
|
|
+ if (nbuckets < ca->mi.nbuckets) {
|
|
+ bch_err(ca, "Cannot shrink yet");
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ if (bch2_dev_is_online(ca) &&
|
|
+ get_capacity(ca->disk_sb.bdev->bd_disk) <
|
|
+ ca->mi.bucket_size * nbuckets) {
|
|
+ bch_err(ca, "New size larger than device");
|
|
+ ret = -EINVAL;
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = bch2_dev_buckets_resize(c, ca, nbuckets);
|
|
+ if (ret) {
|
|
+ bch_err(ca, "Resize error: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
|
|
+ mi->nbuckets = cpu_to_le64(nbuckets);
|
|
+
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ bch2_recalc_capacity(c);
|
|
+err:
|
|
+ up_write(&c->state_lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/* return with ref on ca->ref: */
|
|
+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+ dev_t dev;
|
|
+ unsigned i;
|
|
+ int ret;
|
|
+
|
|
+ ret = lookup_bdev(path, &dev);
|
|
+ if (ret)
|
|
+ return ERR_PTR(ret);
|
|
+
|
|
+ for_each_member_device(ca, c, i)
|
|
+ if (ca->disk_sb.bdev->bd_dev == dev)
|
|
+ goto found;
|
|
+
|
|
+ ca = ERR_PTR(-ENOENT);
|
|
+found:
|
|
+ return ca;
|
|
+}
|
|
+
|
|
+/* Filesystem open: */
|
|
+
|
|
+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
|
|
+ struct bch_opts opts)
|
|
+{
|
|
+ struct bch_sb_handle *sb = NULL;
|
|
+ struct bch_fs *c = NULL;
|
|
+ struct bch_sb_field_members *mi;
|
|
+ unsigned i, best_sb = 0;
|
|
+ const char *err;
|
|
+ int ret = -ENOMEM;
|
|
+
|
|
+ pr_verbose_init(opts, "");
|
|
+
|
|
+ if (!nr_devices) {
|
|
+ c = ERR_PTR(-EINVAL);
|
|
+ goto out2;
|
|
+ }
|
|
+
|
|
+ if (!try_module_get(THIS_MODULE)) {
|
|
+ c = ERR_PTR(-ENODEV);
|
|
+ goto out2;
|
|
+ }
|
|
+
|
|
+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
|
|
+ if (!sb)
|
|
+ goto err;
|
|
+
|
|
+ for (i = 0; i < nr_devices; i++) {
|
|
+ ret = bch2_read_super(devices[i], &opts, &sb[i]);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ err = bch2_sb_validate(&sb[i]);
|
|
+ if (err)
|
|
+ goto err_print;
|
|
+ }
|
|
+
|
|
+ for (i = 1; i < nr_devices; i++)
|
|
+ if (le64_to_cpu(sb[i].sb->seq) >
|
|
+ le64_to_cpu(sb[best_sb].sb->seq))
|
|
+ best_sb = i;
|
|
+
|
|
+ mi = bch2_sb_get_members(sb[best_sb].sb);
|
|
+
|
|
+ i = 0;
|
|
+ while (i < nr_devices) {
|
|
+ if (i != best_sb &&
|
|
+ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) {
|
|
+ char buf[BDEVNAME_SIZE];
|
|
+ pr_info("%s has been removed, skipping",
|
|
+ bdevname(sb[i].bdev, buf));
|
|
+ bch2_free_super(&sb[i]);
|
|
+ array_remove_item(sb, nr_devices, i);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb);
|
|
+ if (err)
|
|
+ goto err_print;
|
|
+ i++;
|
|
+ }
|
|
+
|
|
+ ret = -ENOMEM;
|
|
+ c = bch2_fs_alloc(sb[best_sb].sb, opts);
|
|
+ if (!c)
|
|
+ goto err;
|
|
+
|
|
+ err = "bch2_dev_online() error";
|
|
+ down_write(&c->state_lock);
|
|
+ for (i = 0; i < nr_devices; i++)
|
|
+ if (bch2_dev_attach_bdev(c, &sb[i])) {
|
|
+ up_write(&c->state_lock);
|
|
+ goto err_print;
|
|
+ }
|
|
+ up_write(&c->state_lock);
|
|
+
|
|
+ err = "insufficient devices";
|
|
+ if (!bch2_fs_may_start(c))
|
|
+ goto err_print;
|
|
+
|
|
+ if (!c->opts.nostart) {
|
|
+ ret = bch2_fs_start(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+out:
|
|
+ kfree(sb);
|
|
+ module_put(THIS_MODULE);
|
|
+out2:
|
|
+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
|
|
+ return c;
|
|
+err_print:
|
|
+ pr_err("bch_fs_open err opening %s: %s",
|
|
+ devices[0], err);
|
|
+ ret = -EINVAL;
|
|
+err:
|
|
+ if (c)
|
|
+ bch2_fs_stop(c);
|
|
+ for (i = 0; i < nr_devices; i++)
|
|
+ bch2_free_super(&sb[i]);
|
|
+ c = ERR_PTR(ret);
|
|
+ goto out;
|
|
+}
|
|
+
|
|
+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
|
|
+ struct bch_opts opts)
|
|
+{
|
|
+ const char *err;
|
|
+ struct bch_fs *c;
|
|
+ bool allocated_fs = false;
|
|
+ int ret;
|
|
+
|
|
+ err = bch2_sb_validate(sb);
|
|
+ if (err)
|
|
+ return err;
|
|
+
|
|
+ mutex_lock(&bch_fs_list_lock);
|
|
+ c = __bch2_uuid_to_fs(sb->sb->uuid);
|
|
+ if (c) {
|
|
+ closure_get(&c->cl);
|
|
+
|
|
+ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
|
|
+ if (err)
|
|
+ goto err;
|
|
+ } else {
|
|
+ c = bch2_fs_alloc(sb->sb, opts);
|
|
+ err = "cannot allocate memory";
|
|
+ if (!c)
|
|
+ goto err;
|
|
+
|
|
+ allocated_fs = true;
|
|
+ }
|
|
+
|
|
+ err = "bch2_dev_online() error";
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ if (bch2_dev_attach_bdev(c, sb)) {
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ goto err;
|
|
+ }
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+
|
|
+ if (!c->opts.nostart && bch2_fs_may_start(c)) {
|
|
+ err = "error starting filesystem";
|
|
+ ret = bch2_fs_start(c);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ closure_put(&c->cl);
|
|
+ mutex_unlock(&bch_fs_list_lock);
|
|
+
|
|
+ return NULL;
|
|
+err:
|
|
+ mutex_unlock(&bch_fs_list_lock);
|
|
+
|
|
+ if (allocated_fs)
|
|
+ bch2_fs_stop(c);
|
|
+ else if (c)
|
|
+ closure_put(&c->cl);
|
|
+
|
|
+ return err;
|
|
+}
|
|
+
|
|
+const char *bch2_fs_open_incremental(const char *path)
|
|
+{
|
|
+ struct bch_sb_handle sb;
|
|
+ struct bch_opts opts = bch2_opts_empty();
|
|
+ const char *err;
|
|
+
|
|
+ if (bch2_read_super(path, &opts, &sb))
|
|
+ return "error reading superblock";
|
|
+
|
|
+ err = __bch2_fs_open_incremental(&sb, opts);
|
|
+ bch2_free_super(&sb);
|
|
+
|
|
+ return err;
|
|
+}
|
|
+
|
|
+/* Global interfaces/init */
|
|
+
|
|
/*
 * Module teardown.  Also called from bcachefs_init() when initialisation
 * fails part-way, so the kset may not have been created.
 */
static void bcachefs_exit(void)
{
	bch2_debug_exit();
	bch2_vfs_exit();
	bch2_chardev_exit();
	bch2_btree_key_cache_exit();
	/* NULL if kset_create_and_add() failed in bcachefs_init(). */
	if (bcachefs_kset)
		kset_unregister(bcachefs_kset);
}
|
|
+
|
|
+static int __init bcachefs_init(void)
|
|
+{
|
|
+ bch2_bkey_pack_test();
|
|
+
|
|
+ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) ||
|
|
+ bch2_btree_key_cache_init() ||
|
|
+ bch2_chardev_init() ||
|
|
+ bch2_vfs_init() ||
|
|
+ bch2_debug_init())
|
|
+ goto err;
|
|
+
|
|
+ return 0;
|
|
+err:
|
|
+ bcachefs_exit();
|
|
+ return -ENOMEM;
|
|
+}
|
|
+
|
|
/*
 * X-macro expansion: for every entry listed by BCH_DEBUG_PARAMS(), define a
 * bool bch2_<name> and expose it as the module parameter <name> (mode 0644)
 * with the given description.
 */
#define BCH_DEBUG_PARAM(name, description)			\
	bool bch2_##name;					\
	module_param_named(name, bch2_##name, bool, 0644);	\
	MODULE_PARM_DESC(name, description);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM

module_exit(bcachefs_exit);
module_init(bcachefs_init);
|
|
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
|
|
new file mode 100644
|
|
index 000000000000..6cab506150a8
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/super.h
|
|
@@ -0,0 +1,241 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_SUPER_H
|
|
+#define _BCACHEFS_SUPER_H
|
|
+
|
|
+#include "extents.h"
|
|
+
|
|
+#include "bcachefs_ioctl.h"
|
|
+
|
|
+#include <linux/math64.h>
|
|
+
|
|
+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s)
|
|
+{
|
|
+ return div_u64(s, ca->mi.bucket_size);
|
|
+}
|
|
+
|
|
+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b)
|
|
+{
|
|
+ return ((sector_t) b) * ca->mi.bucket_size;
|
|
+}
|
|
+
|
|
+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
|
|
+{
|
|
+ u32 remainder;
|
|
+
|
|
+ div_u64_rem(s, ca->mi.bucket_size, &remainder);
|
|
+ return remainder;
|
|
+}
|
|
+
|
|
+static inline bool bch2_dev_is_online(struct bch_dev *ca)
|
|
+{
|
|
+ return !percpu_ref_is_zero(&ca->io_ref);
|
|
+}
|
|
+
|
|
+static inline bool bch2_dev_is_readable(struct bch_dev *ca)
|
|
+{
|
|
+ return bch2_dev_is_online(ca) &&
|
|
+ ca->mi.state != BCH_MEMBER_STATE_failed;
|
|
+}
|
|
+
|
|
+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
|
|
+{
|
|
+ if (!percpu_ref_tryget(&ca->io_ref))
|
|
+ return false;
|
|
+
|
|
+ if (ca->mi.state == BCH_MEMBER_STATE_rw ||
|
|
+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))
|
|
+ return true;
|
|
+
|
|
+ percpu_ref_put(&ca->io_ref);
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
|
|
+{
|
|
+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
|
|
+}
|
|
+
|
|
+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
|
|
+ unsigned dev)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < devs.nr; i++)
|
|
+ if (devs.devs[i] == dev)
|
|
+ return true;
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
|
|
+ unsigned dev)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < devs->nr; i++)
|
|
+ if (devs->devs[i] == dev) {
|
|
+ array_remove_item(devs->devs, devs->nr, i);
|
|
+ return;
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
|
|
+ unsigned dev)
|
|
+{
|
|
+ BUG_ON(bch2_dev_list_has_dev(*devs, dev));
|
|
+ BUG_ON(devs->nr >= BCH_REPLICAS_MAX);
|
|
+ devs->devs[devs->nr++] = dev;
|
|
+}
|
|
+
|
|
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
|
|
+{
|
|
+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
|
|
+}
|
|
+
|
|
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
|
|
+ const struct bch_devs_mask *mask)
|
|
+{
|
|
+ struct bch_dev *ca = NULL;
|
|
+
|
|
+ while ((*iter = mask
|
|
+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
|
|
+ : *iter) < c->sb.nr_devices &&
|
|
+ !(ca = rcu_dereference_check(c->devs[*iter],
|
|
+ lockdep_is_held(&c->state_lock))))
|
|
+ (*iter)++;
|
|
+
|
|
+ return ca;
|
|
+}
|
|
+
|
|
+#define __for_each_member_device(ca, c, iter, mask) \
|
|
+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
|
|
+
|
|
+#define for_each_member_device_rcu(ca, c, iter, mask) \
|
|
+ __for_each_member_device(ca, c, iter, mask)
|
|
+
|
|
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ if ((ca = __bch2_next_dev(c, iter, NULL)))
|
|
+ percpu_ref_get(&ca->ref);
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return ca;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * If you break early, you must drop your ref on the current device
|
|
+ */
|
|
+#define for_each_member_device(ca, c, iter) \
|
|
+ for ((iter) = 0; \
|
|
+ (ca = bch2_get_next_dev(c, &(iter))); \
|
|
+ percpu_ref_put(&ca->ref), (iter)++)
|
|
+
|
|
+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
|
|
+ unsigned *iter,
|
|
+ int state_mask)
|
|
+{
|
|
+ struct bch_dev *ca;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ while ((ca = __bch2_next_dev(c, iter, NULL)) &&
|
|
+ (!((1 << ca->mi.state) & state_mask) ||
|
|
+ !percpu_ref_tryget(&ca->io_ref)))
|
|
+ (*iter)++;
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return ca;
|
|
+}
|
|
+
|
|
+#define __for_each_online_member(ca, c, iter, state_mask) \
|
|
+ for ((iter) = 0; \
|
|
+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
|
|
+ percpu_ref_put(&ca->io_ref), (iter)++)
|
|
+
|
|
+#define for_each_online_member(ca, c, iter) \
|
|
+ __for_each_online_member(ca, c, iter, ~0)
|
|
+
|
|
+#define for_each_rw_member(ca, c, iter) \
|
|
+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
|
|
+
|
|
+#define for_each_readable_member(ca, c, iter) \
|
|
+ __for_each_online_member(ca, c, iter, \
|
|
+ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
|
|
+
|
|
+/*
|
|
+ * If a key exists that references a device, the device won't be going away and
|
|
+ * we can omit rcu_read_lock():
|
|
+ */
|
|
+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx)
|
|
+{
|
|
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
|
+
|
|
+ return rcu_dereference_check(c->devs[idx], 1);
|
|
+}
|
|
+
|
|
+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
|
|
+{
|
|
+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]);
|
|
+
|
|
+ return rcu_dereference_protected(c->devs[idx],
|
|
+ lockdep_is_held(&c->sb_lock) ||
|
|
+ lockdep_is_held(&c->state_lock));
|
|
+}
|
|
+
|
|
+/* XXX kill, move to struct bch_fs */
|
|
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
|
|
+{
|
|
+ struct bch_devs_mask devs;
|
|
+ struct bch_dev *ca;
|
|
+ unsigned i;
|
|
+
|
|
+ memset(&devs, 0, sizeof(devs));
|
|
+ for_each_online_member(ca, c, i)
|
|
+ __set_bit(ca->dev_idx, devs.d);
|
|
+ return devs;
|
|
+}
|
|
+
|
|
+struct bch_fs *bch2_dev_to_fs(dev_t);
|
|
+struct bch_fs *bch2_uuid_to_fs(uuid_le);
|
|
+
|
|
+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *,
|
|
+ enum bch_member_state, int);
|
|
+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
|
|
+ enum bch_member_state, int);
|
|
+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *,
|
|
+ enum bch_member_state, int);
|
|
+
|
|
+int bch2_dev_fail(struct bch_dev *, int);
|
|
+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
|
|
+int bch2_dev_add(struct bch_fs *, const char *);
|
|
+int bch2_dev_online(struct bch_fs *, const char *);
|
|
+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
|
|
+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
|
|
+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
|
|
+
|
|
+bool bch2_fs_emergency_read_only(struct bch_fs *);
|
|
+void bch2_fs_read_only(struct bch_fs *);
|
|
+
|
|
+int bch2_fs_read_write(struct bch_fs *);
|
|
+int bch2_fs_read_write_early(struct bch_fs *);
|
|
+
|
|
+/*
|
|
+ * Only for use in the recovery/fsck path:
|
|
+ */
|
|
+static inline void bch2_fs_lazy_rw(struct bch_fs *c)
|
|
+{
|
|
+ if (percpu_ref_is_zero(&c->writes))
|
|
+ bch2_fs_read_write_early(c);
|
|
+}
|
|
+
|
|
+void __bch2_fs_stop(struct bch_fs *);
|
|
+void bch2_fs_free(struct bch_fs *);
|
|
+void bch2_fs_stop(struct bch_fs *);
|
|
+
|
|
+int bch2_fs_start(struct bch_fs *);
|
|
+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts);
|
|
+const char *bch2_fs_open_incremental(const char *path);
|
|
+
|
|
+#endif /* _BCACHEFS_SUPER_H */
|
|
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
|
|
new file mode 100644
|
|
index 000000000000..96023f37afea
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/super_types.h
|
|
@@ -0,0 +1,51 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_SUPER_TYPES_H
|
|
+#define _BCACHEFS_SUPER_TYPES_H
|
|
+
|
|
+struct bch_sb_handle {
|
|
+ struct bch_sb *sb;
|
|
+ struct block_device *bdev;
|
|
+ struct bio *bio;
|
|
+ size_t buffer_size;
|
|
+ fmode_t mode;
|
|
+ unsigned have_layout:1;
|
|
+ unsigned have_bio:1;
|
|
+ unsigned fs_sb:1;
|
|
+ u64 seq;
|
|
+};
|
|
+
|
|
+struct bch_devs_mask {
|
|
+ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
|
|
+};
|
|
+
|
|
+struct bch_devs_list {
|
|
+ u8 nr;
|
|
+ u8 devs[BCH_BKEY_PTRS_MAX];
|
|
+};
|
|
+
|
|
+struct bch_member_cpu {
|
|
+ u64 nbuckets; /* device size */
|
|
+ u16 first_bucket; /* index of first bucket used */
|
|
+ u16 bucket_size; /* sectors */
|
|
+ u16 group;
|
|
+ u8 state;
|
|
+ u8 replacement;
|
|
+ u8 discard;
|
|
+ u8 data_allowed;
|
|
+ u8 durability;
|
|
+ u8 valid;
|
|
+};
|
|
+
|
|
+struct bch_disk_group_cpu {
|
|
+ bool deleted;
|
|
+ u16 parent;
|
|
+ struct bch_devs_mask devs;
|
|
+};
|
|
+
|
|
+struct bch_disk_groups_cpu {
|
|
+ struct rcu_head rcu;
|
|
+ unsigned nr;
|
|
+ struct bch_disk_group_cpu entries[];
|
|
+};
|
|
+
|
|
+#endif /* _BCACHEFS_SUPER_TYPES_H */
|
|
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
|
|
new file mode 100644
|
|
index 000000000000..21ef7719cf55
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/sysfs.c
|
|
@@ -0,0 +1,1051 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * bcachefs sysfs interfaces
|
|
+ *
|
|
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#ifndef NO_BCACHEFS_SYSFS
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_background.h"
|
|
+#include "sysfs.h"
|
|
+#include "btree_cache.h"
|
|
+#include "btree_io.h"
|
|
+#include "btree_iter.h"
|
|
+#include "btree_key_cache.h"
|
|
+#include "btree_update.h"
|
|
+#include "btree_update_interior.h"
|
|
+#include "btree_gc.h"
|
|
+#include "buckets.h"
|
|
+#include "clock.h"
|
|
+#include "disk_groups.h"
|
|
+#include "ec.h"
|
|
+#include "inode.h"
|
|
+#include "journal.h"
|
|
+#include "keylist.h"
|
|
+#include "move.h"
|
|
+#include "opts.h"
|
|
+#include "rebalance.h"
|
|
+#include "replicas.h"
|
|
+#include "super-io.h"
|
|
+#include "tests.h"
|
|
+
|
|
+#include <linux/blkdev.h>
|
|
+#include <linux/sort.h>
|
|
+#include <linux/sched/clock.h>
|
|
+
|
|
+#include "util.h"
|
|
+
|
|
+#define SYSFS_OPS(type) \
|
|
+struct sysfs_ops type ## _sysfs_ops = { \
|
|
+ .show = type ## _show, \
|
|
+ .store = type ## _store \
|
|
+}
|
|
+
|
|
+#define SHOW(fn) \
|
|
+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\
|
|
+ char *buf) \
|
|
+
|
|
+#define STORE(fn) \
|
|
+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\
|
|
+ const char *buf, size_t size) \
|
|
+
|
|
+#define __sysfs_attribute(_name, _mode) \
|
|
+ static struct attribute sysfs_##_name = \
|
|
+ { .name = #_name, .mode = _mode }
|
|
+
|
|
+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR)
|
|
+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO)
|
|
+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR)
|
|
+
|
|
+#define sysfs_printf(file, fmt, ...) \
|
|
+do { \
|
|
+ if (attr == &sysfs_ ## file) \
|
|
+ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\
|
|
+} while (0)
|
|
+
|
|
+#define sysfs_print(file, var) \
|
|
+do { \
|
|
+ if (attr == &sysfs_ ## file) \
|
|
+ return snprint(buf, PAGE_SIZE, var); \
|
|
+} while (0)
|
|
+
|
|
+#define sysfs_hprint(file, val) \
|
|
+do { \
|
|
+ if (attr == &sysfs_ ## file) { \
|
|
+ bch2_hprint(&out, val); \
|
|
+ pr_buf(&out, "\n"); \
|
|
+ return out.pos - buf; \
|
|
+ } \
|
|
+} while (0)
|
|
+
|
|
+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var))
|
|
+#define var_print(_var) sysfs_print(_var, var(_var))
|
|
+#define var_hprint(_var) sysfs_hprint(_var, var(_var))
|
|
+
|
|
+#define sysfs_strtoul(file, var) \
|
|
+do { \
|
|
+ if (attr == &sysfs_ ## file) \
|
|
+ return strtoul_safe(buf, var) ?: (ssize_t) size; \
|
|
+} while (0)
|
|
+
|
|
+#define sysfs_strtoul_clamp(file, var, min, max) \
|
|
+do { \
|
|
+ if (attr == &sysfs_ ## file) \
|
|
+ return strtoul_safe_clamp(buf, var, min, max) \
|
|
+ ?: (ssize_t) size; \
|
|
+} while (0)
|
|
+
|
|
+#define strtoul_or_return(cp) \
|
|
+({ \
|
|
+ unsigned long _v; \
|
|
+ int _r = kstrtoul(cp, 10, &_v); \
|
|
+ if (_r) \
|
|
+ return _r; \
|
|
+ _v; \
|
|
+})
|
|
+
|
|
+#define strtoul_restrict_or_return(cp, min, max) \
|
|
+({ \
|
|
+ unsigned long __v = 0; \
|
|
+ int _r = strtoul_safe_restrict(cp, __v, min, max); \
|
|
+ if (_r) \
|
|
+ return _r; \
|
|
+ __v; \
|
|
+})
|
|
+
|
|
+#define strtoi_h_or_return(cp) \
|
|
+({ \
|
|
+ u64 _v; \
|
|
+ int _r = strtoi_h(cp, &_v); \
|
|
+ if (_r) \
|
|
+ return _r; \
|
|
+ _v; \
|
|
+})
|
|
+
|
|
+#define sysfs_hatoi(file, var) \
|
|
+do { \
|
|
+ if (attr == &sysfs_ ## file) \
|
|
+ return strtoi_h(buf, &var) ?: (ssize_t) size; \
|
|
+} while (0)
|
|
+
|
|
+write_attribute(trigger_journal_flush);
|
|
+write_attribute(trigger_gc);
|
|
+write_attribute(prune_cache);
|
|
+rw_attribute(btree_gc_periodic);
|
|
+rw_attribute(gc_gens_pos);
|
|
+
|
|
+read_attribute(uuid);
|
|
+read_attribute(minor);
|
|
+read_attribute(bucket_size);
|
|
+read_attribute(block_size);
|
|
+read_attribute(btree_node_size);
|
|
+read_attribute(first_bucket);
|
|
+read_attribute(nbuckets);
|
|
+read_attribute(durability);
|
|
+read_attribute(iodone);
|
|
+
|
|
+read_attribute(io_latency_read);
|
|
+read_attribute(io_latency_write);
|
|
+read_attribute(io_latency_stats_read);
|
|
+read_attribute(io_latency_stats_write);
|
|
+read_attribute(congested);
|
|
+
|
|
+read_attribute(btree_avg_write_size);
|
|
+
|
|
+read_attribute(bucket_quantiles_last_read);
|
|
+read_attribute(bucket_quantiles_last_write);
|
|
+read_attribute(bucket_quantiles_fragmentation);
|
|
+read_attribute(bucket_quantiles_oldest_gen);
|
|
+
|
|
+read_attribute(reserve_stats);
|
|
+read_attribute(btree_cache_size);
|
|
+read_attribute(compression_stats);
|
|
+read_attribute(journal_debug);
|
|
+read_attribute(journal_pins);
|
|
+read_attribute(btree_updates);
|
|
+read_attribute(dirty_btree_nodes);
|
|
+read_attribute(btree_cache);
|
|
+read_attribute(btree_key_cache);
|
|
+read_attribute(btree_transactions);
|
|
+read_attribute(stripes_heap);
|
|
+
|
|
+read_attribute(internal_uuid);
|
|
+
|
|
+read_attribute(has_data);
|
|
+read_attribute(alloc_debug);
|
|
+write_attribute(wake_allocator);
|
|
+
|
|
+read_attribute(read_realloc_races);
|
|
+read_attribute(extent_migrate_done);
|
|
+read_attribute(extent_migrate_raced);
|
|
+
|
|
+rw_attribute(journal_write_delay_ms);
|
|
+rw_attribute(journal_reclaim_delay_ms);
|
|
+
|
|
+rw_attribute(discard);
|
|
+rw_attribute(cache_replacement_policy);
|
|
+rw_attribute(label);
|
|
+
|
|
+rw_attribute(copy_gc_enabled);
|
|
+read_attribute(copy_gc_wait);
|
|
+
|
|
+rw_attribute(rebalance_enabled);
|
|
+sysfs_pd_controller_attribute(rebalance);
|
|
+read_attribute(rebalance_work);
|
|
+rw_attribute(promote_whole_extents);
|
|
+
|
|
+read_attribute(new_stripes);
|
|
+
|
|
+read_attribute(io_timers_read);
|
|
+read_attribute(io_timers_write);
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_TESTS
|
|
+write_attribute(perf_test);
|
|
+#endif /* CONFIG_BCACHEFS_TESTS */
|
|
+
|
|
+#define x(_name) \
|
|
+ static struct attribute sysfs_time_stat_##_name = \
|
|
+ { .name = #_name, .mode = S_IRUGO };
|
|
+ BCH_TIME_STATS()
|
|
+#undef x
|
|
+
|
|
+static struct attribute sysfs_state_rw = {
|
|
+ .name = "state",
|
|
+ .mode = S_IRUGO
|
|
+};
|
|
+
|
|
+static size_t bch2_btree_cache_size(struct bch_fs *c)
|
|
+{
|
|
+ size_t ret = 0;
|
|
+ struct btree *b;
|
|
+
|
|
+ mutex_lock(&c->btree_cache.lock);
|
|
+ list_for_each_entry(b, &c->btree_cache.live, list)
|
|
+ ret += btree_bytes(c);
|
|
+
|
|
+ mutex_unlock(&c->btree_cache.lock);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * Average btree write size: the btree_writes_sectors counter divided by the
|
|
+ * btree_writes_nr counter, or 0 while no writes have been counted.
|
|
+ */
|
|
+static size_t bch2_btree_avg_write_size(struct bch_fs *c)
|
|
+{
|
|
+	u64 nr_writes		= atomic64_read(&c->btree_writes_nr);
|
|
+	u64 total_sectors	= atomic64_read(&c->btree_writes_sectors);
|
|
+
|
|
+	if (!nr_writes)
|
|
+		return 0;
|
|
+
|
|
+	return div64_u64(total_sectors, nr_writes);
|
|
+}
|
|
+
|
|
+static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ struct bch_fs_usage_online *fs_usage = bch2_fs_usage_read(c);
|
|
+
|
|
+ if (!fs_usage)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ bch2_fs_usage_to_text(out, c, fs_usage);
|
|
+
|
|
+ percpu_up_read(&c->mark_lock);
|
|
+
|
|
+ kfree(fs_usage);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0,
|
|
+ nr_compressed_extents = 0,
|
|
+ compressed_sectors_compressed = 0,
|
|
+ compressed_sectors_uncompressed = 0;
|
|
+ int ret;
|
|
+
|
|
+ if (!test_bit(BCH_FS_STARTED, &c->flags))
|
|
+ return -EPERM;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN, 0, k, ret)
|
|
+ if (k.k->type == KEY_TYPE_extent) {
|
|
+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
|
|
+ const union bch_extent_entry *entry;
|
|
+ struct extent_ptr_decoded p;
|
|
+
|
|
+ extent_for_each_ptr_decode(e, p, entry) {
|
|
+ if (!crc_is_compressed(p.crc)) {
|
|
+ nr_uncompressed_extents++;
|
|
+ uncompressed_sectors += e.k->size;
|
|
+ } else {
|
|
+ nr_compressed_extents++;
|
|
+ compressed_sectors_compressed +=
|
|
+ p.crc.compressed_size;
|
|
+ compressed_sectors_uncompressed +=
|
|
+ p.crc.uncompressed_size;
|
|
+ }
|
|
+
|
|
+ /* only looking at the first ptr */
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ pr_buf(out,
|
|
+ "uncompressed data:\n"
|
|
+ " nr extents: %llu\n"
|
|
+ " size (bytes): %llu\n"
|
|
+ "compressed data:\n"
|
|
+ " nr extents: %llu\n"
|
|
+ " compressed size (bytes): %llu\n"
|
|
+ " uncompressed size (bytes): %llu\n",
|
|
+ nr_uncompressed_extents,
|
|
+ uncompressed_sectors << 9,
|
|
+ nr_compressed_extents,
|
|
+ compressed_sectors_compressed << 9,
|
|
+ compressed_sectors_uncompressed << 9);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
|
|
+{
|
|
+ pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
|
|
+ bch2_bpos_to_text(out, c->gc_gens_pos);
|
|
+ pr_buf(out, "\n");
|
|
+}
|
|
+
|
|
+/*
|
|
+ * sysfs show handler for the filesystem kobject (also reached through the
|
|
+ * "internal" wrapper).  Each sysfs_print*() / "if (attr == ...)" clause
|
|
+ * handles one attribute and returns from the function when @attr matches;
|
|
+ * the printbuf-based clauses return the length of the text placed in @buf.
|
|
+ * Helpers that can fail have their error code returned to the reader.
|
|
+ * Returns 0 when @attr is not handled here.
|
|
+ */
|
|
+SHOW(bch2_fs)
|
|
+{
|
|
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
|
|
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
|
|
+
|
|
+	sysfs_print(minor, c->minor);
|
|
+	sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
|
|
+
|
|
+	sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
|
|
+	sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
|
|
+
|
|
+	sysfs_print(block_size, block_bytes(c));
|
|
+	sysfs_print(btree_node_size, btree_bytes(c));
|
|
+	sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
|
|
+	sysfs_hprint(btree_avg_write_size, bch2_btree_avg_write_size(c));
|
|
+
|
|
+	sysfs_print(read_realloc_races,
|
|
+		    atomic_long_read(&c->read_realloc_races));
|
|
+	sysfs_print(extent_migrate_done,
|
|
+		    atomic_long_read(&c->extent_migrate_done));
|
|
+	sysfs_print(extent_migrate_raced,
|
|
+		    atomic_long_read(&c->extent_migrate_raced));
|
|
+
|
|
+	sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic);
|
|
+
|
|
+	if (attr == &sysfs_gc_gens_pos) {
|
|
+		bch2_gc_gens_pos_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled);
|
|
+
|
|
+	sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled);
|
|
+	sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */
|
|
+	sysfs_hprint(copy_gc_wait,
|
|
+		     max(0LL, c->copygc_wait -
|
|
+			 atomic64_read(&c->io_clock[WRITE].now)) << 9);
|
|
+
|
|
+	if (attr == &sysfs_rebalance_work) {
|
|
+		bch2_rebalance_work_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	sysfs_print(promote_whole_extents, c->promote_whole_extents);
|
|
+
|
|
+	/* Debugging: */
|
|
+
|
|
+	if (attr == &sysfs_alloc_debug)
|
|
+		return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf;
|
|
+
|
|
+	if (attr == &sysfs_journal_debug) {
|
|
+		bch2_journal_debug_to_text(&out, &c->journal);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_journal_pins) {
|
|
+		bch2_journal_pins_to_text(&out, &c->journal);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_btree_updates) {
|
|
+		bch2_btree_updates_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_dirty_btree_nodes) {
|
|
+		bch2_dirty_btree_nodes_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_btree_cache) {
|
|
+		bch2_btree_cache_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_btree_key_cache) {
|
|
+		bch2_btree_key_cache_to_text(&out, &c->btree_key_cache);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_btree_transactions) {
|
|
+		bch2_btree_trans_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_stripes_heap) {
|
|
+		bch2_stripes_heap_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	/*
|
|
+	 * Unlike the void *_to_text() helpers, this one can fail (-EPERM
|
|
+	 * before the filesystem is started, or a btree iteration error);
|
|
+	 * hand that to the reader rather than returning an empty read.
|
|
+	 */
|
|
+	if (attr == &sysfs_compression_stats)
|
|
+		return bch2_compression_stats_to_text(&out, c) ?: out.pos - buf;
|
|
+
|
|
+	if (attr == &sysfs_new_stripes) {
|
|
+		bch2_new_stripes_to_text(&out, c);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_io_timers_read) {
|
|
+		bch2_io_timers_to_text(&out, &c->io_clock[READ]);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+	if (attr == &sysfs_io_timers_write) {
|
|
+		bch2_io_timers_to_text(&out, &c->io_clock[WRITE]);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+/*
|
|
+ * sysfs store handler for the filesystem kobject (also reached through the
|
|
+ * "internal" wrapper).  Tunables are parsed from @buf and applied first;
|
|
+ * the debugging triggers further down are refused with -EPERM until
|
|
+ * BCH_FS_STARTED is set.  Returns @size on success or a negative errno.
|
|
+ */
|
|
+STORE(bch2_fs)
|
|
+{
|
|
+	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
|
|
+
|
|
+	sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
|
|
+	sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
|
|
+
|
|
+	if (attr == &sysfs_btree_gc_periodic) {
|
|
+		ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
|
|
+			?: (ssize_t) size;
|
|
+
|
|
+		/*
|
|
+		 * Same guard as for copygc_thread below: don't hand a NULL
|
|
+		 * task to wake_up_process() if the gc thread isn't running.
|
|
+		 */
|
|
+		if (c->gc_thread)
|
|
+			wake_up_process(c->gc_thread);
|
|
+		return ret;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_copy_gc_enabled) {
|
|
+		ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
|
|
+			?: (ssize_t) size;
|
|
+
|
|
+		if (c->copygc_thread)
|
|
+			wake_up_process(c->copygc_thread);
|
|
+		return ret;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_rebalance_enabled) {
|
|
+		ssize_t ret = strtoul_safe(buf, c->rebalance.enabled)
|
|
+			?: (ssize_t) size;
|
|
+
|
|
+		rebalance_wakeup(c);
|
|
+		return ret;
|
|
+	}
|
|
+
|
|
+	sysfs_pd_controller_store(rebalance, &c->rebalance.pd);
|
|
+
|
|
+	sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
|
|
+
|
|
+	/* Debugging - only once the filesystem has been started: */
|
|
+
|
|
+	if (!test_bit(BCH_FS_STARTED, &c->flags))
|
|
+		return -EPERM;
|
|
+
|
|
+	if (attr == &sysfs_trigger_journal_flush)
|
|
+		bch2_journal_meta(&c->journal);
|
|
+
|
|
+	if (attr == &sysfs_trigger_gc) {
|
|
+		/*
|
|
+		 * Full gc is currently incompatible with btree key cache:
|
|
+		 */
|
|
+#if 0
|
|
+		down_read(&c->state_lock);
|
|
+		bch2_gc(c, false, false);
|
|
+		up_read(&c->state_lock);
|
|
+#else
|
|
+		bch2_gc_gens(c);
|
|
+#endif
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_prune_cache) {
|
|
+		/* Zero the fields we don't set so the shrinker sees no stack garbage: */
|
|
+		struct shrink_control sc = { .gfp_mask = GFP_KERNEL };
|
|
+
|
|
+		sc.nr_to_scan = strtoul_or_return(buf);
|
|
+		c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
|
|
+	}
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_TESTS
|
|
+	if (attr == &sysfs_perf_test) {
|
|
+		/* Input format: "<test name> <nr> <threads>" */
|
|
+		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
|
|
+		char *test, *nr_str, *threads_str;
|
|
+		unsigned threads;
|
|
+		u64 nr;
|
|
+		int ret = -EINVAL;
|
|
+
|
|
+		/* Report allocation failure as such, not as bad input: */
|
|
+		if (!tmp)
|
|
+			return -ENOMEM;
|
|
+
|
|
+		test		= strsep(&p, " \t\n");
|
|
+		nr_str		= strsep(&p, " \t\n");
|
|
+		threads_str	= strsep(&p, " \t\n");
|
|
+
|
|
+		if (threads_str &&
|
|
+		    !(ret = kstrtouint(threads_str, 10, &threads)) &&
|
|
+		    !(ret = bch2_strtoull_h(nr_str, &nr)))
|
|
+			ret = bch2_btree_perf_test(c, test, nr, threads);
|
|
+		kfree(tmp);
|
|
+
|
|
+		if (ret)
|
|
+			size = ret;
|
|
+	}
|
|
+#endif
|
|
+	return size;
|
|
+}
|
|
+SYSFS_OPS(bch2_fs);
|
|
+
|
|
+struct attribute *bch2_fs_files[] = {
|
|
+ &sysfs_minor,
|
|
+ &sysfs_block_size,
|
|
+ &sysfs_btree_node_size,
|
|
+ &sysfs_btree_cache_size,
|
|
+ &sysfs_btree_avg_write_size,
|
|
+
|
|
+ &sysfs_journal_write_delay_ms,
|
|
+ &sysfs_journal_reclaim_delay_ms,
|
|
+
|
|
+ &sysfs_promote_whole_extents,
|
|
+
|
|
+ &sysfs_compression_stats,
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_TESTS
|
|
+ &sysfs_perf_test,
|
|
+#endif
|
|
+ NULL
|
|
+};
|
|
+
|
|
+/* internal dir - just a wrapper */
|
|
+
|
|
+/* "internal" directory: map its kobject back to the fs and reuse bch2_fs_show() */
|
|
+SHOW(bch2_fs_internal)
|
|
+{
|
|
+	struct bch_fs *fs = container_of(kobj, struct bch_fs, internal);
|
|
+	struct kobject *fs_kobj = &fs->kobj;
|
|
+
|
|
+	return bch2_fs_show(fs_kobj, attr, buf);
|
|
+}
|
|
+
|
|
+/* "internal" directory: map its kobject back to the fs and reuse bch2_fs_store() */
|
|
+STORE(bch2_fs_internal)
|
|
+{
|
|
+	struct bch_fs *fs = container_of(kobj, struct bch_fs, internal);
|
|
+	struct kobject *fs_kobj = &fs->kobj;
|
|
+
|
|
+	return bch2_fs_store(fs_kobj, attr, buf, size);
|
|
+}
|
|
+SYSFS_OPS(bch2_fs_internal);
|
|
+
|
|
+struct attribute *bch2_fs_internal_files[] = {
|
|
+ &sysfs_alloc_debug,
|
|
+ &sysfs_journal_debug,
|
|
+ &sysfs_journal_pins,
|
|
+ &sysfs_btree_updates,
|
|
+ &sysfs_dirty_btree_nodes,
|
|
+ &sysfs_btree_cache,
|
|
+ &sysfs_btree_key_cache,
|
|
+ &sysfs_btree_transactions,
|
|
+ &sysfs_stripes_heap,
|
|
+
|
|
+ &sysfs_read_realloc_races,
|
|
+ &sysfs_extent_migrate_done,
|
|
+ &sysfs_extent_migrate_raced,
|
|
+
|
|
+ &sysfs_trigger_journal_flush,
|
|
+ &sysfs_trigger_gc,
|
|
+ &sysfs_gc_gens_pos,
|
|
+ &sysfs_prune_cache,
|
|
+
|
|
+ &sysfs_copy_gc_enabled,
|
|
+ &sysfs_copy_gc_wait,
|
|
+
|
|
+ &sysfs_rebalance_enabled,
|
|
+ &sysfs_rebalance_work,
|
|
+ sysfs_pd_controller_files(rebalance),
|
|
+
|
|
+ &sysfs_new_stripes,
|
|
+
|
|
+ &sysfs_io_timers_read,
|
|
+ &sysfs_io_timers_write,
|
|
+
|
|
+ &sysfs_internal_uuid,
|
|
+ NULL
|
|
+};
|
|
+
|
|
+/* options */
|
|
+
|
|
+SHOW(bch2_fs_opts_dir)
|
|
+{
|
|
+ struct printbuf out = _PBUF(buf, PAGE_SIZE);
|
|
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
|
|
+ const struct bch_option *opt = container_of(attr, struct bch_option, attr);
|
|
+ int id = opt - bch2_opt_table;
|
|
+ u64 v = bch2_opt_get_by_id(&c->opts, id);
|
|
+
|
|
+ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST);
|
|
+ pr_buf(&out, "\n");
|
|
+
|
|
+ return out.pos - buf;
|
|
+}
|
|
+
|
|
+STORE(bch2_fs_opts_dir)
|
|
+{
|
|
+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
|
|
+ const struct bch_option *opt = container_of(attr, struct bch_option, attr);
|
|
+ int ret, id = opt - bch2_opt_table;
|
|
+ char *tmp;
|
|
+ u64 v;
|
|
+
|
|
+ tmp = kstrdup(buf, GFP_KERNEL);
|
|
+ if (!tmp)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ret = bch2_opt_parse(c, opt, strim(tmp), &v);
|
|
+ kfree(tmp);
|
|
+
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_opt_check_may_set(c, id, v);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ if (opt->set_sb != SET_NO_SB_OPT) {
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ opt->set_sb(c->disk_sb.sb, v);
|
|
+ bch2_write_super(c);
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+
|
|
+ bch2_opt_set_by_id(&c->opts, id, v);
|
|
+
|
|
+ if ((id == Opt_background_target ||
|
|
+ id == Opt_background_compression) && v) {
|
|
+ bch2_rebalance_add_work(c, S64_MAX);
|
|
+ rebalance_wakeup(c);
|
|
+ }
|
|
+
|
|
+ return size;
|
|
+}
|
|
+SYSFS_OPS(bch2_fs_opts_dir);
|
|
+
|
|
+struct attribute *bch2_fs_opts_dir_files[] = { NULL };
|
|
+
|
|
+int bch2_opts_create_sysfs_files(struct kobject *kobj)
|
|
+{
|
|
+ const struct bch_option *i;
|
|
+ int ret;
|
|
+
|
|
+ for (i = bch2_opt_table;
|
|
+ i < bch2_opt_table + bch2_opts_nr;
|
|
+ i++) {
|
|
+ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME)))
|
|
+ continue;
|
|
+
|
|
+ ret = sysfs_create_file(kobj, &i->attr);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* time stats */
|
|
+
|
|
+SHOW(bch2_fs_time_stats)
|
|
+{
|
|
+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
|
|
+ struct printbuf out = _PBUF(buf, PAGE_SIZE);
|
|
+
|
|
+#define x(name) \
|
|
+ if (attr == &sysfs_time_stat_##name) { \
|
|
+ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\
|
|
+ return out.pos - buf; \
|
|
+ }
|
|
+ BCH_TIME_STATS()
|
|
+#undef x
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+STORE(bch2_fs_time_stats)
|
|
+{
|
|
+ return size;
|
|
+}
|
|
+SYSFS_OPS(bch2_fs_time_stats);
|
|
+
|
|
+struct attribute *bch2_fs_time_stats_files[] = {
|
|
+#define x(name) \
|
|
+ &sysfs_time_stat_##name,
|
|
+ BCH_TIME_STATS()
|
|
+#undef x
|
|
+ NULL
|
|
+};
|
|
+
|
|
+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
|
|
+ size_t, void *);
|
|
+
|
|
+/*
|
|
+ * bucket_map_fn: difference between the current io_clock value and bucket
|
|
+ * @b's recorded io_time.  A NULL @private selects index 0 of io_clock[] /
|
|
+ * io_time[], any other value selects index 1.
|
|
+ */
|
|
+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
|
|
+				  size_t b, void *private)
|
|
+{
|
|
+	int rw = private != NULL;
|
|
+
|
|
+	return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
|
|
+}
|
|
+
|
|
+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
|
|
+ size_t b, void *private)
|
|
+{
|
|
+ struct bucket *g = bucket(ca, b);
|
|
+ return bucket_sectors_used(g->mark);
|
|
+}
|
|
+
|
|
+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
|
|
+ size_t b, void *private)
|
|
+{
|
|
+ return bucket_gc_gen(bucket(ca, b));
|
|
+}
|
|
+
|
|
+/* sort() comparison callback: orders two unsigned values via cmp_int() */
|
|
+static int unsigned_cmp(const void *_l, const void *_r)
|
|
+{
|
|
+	unsigned lhs = *(const unsigned *) _l;
|
|
+	unsigned rhs = *(const unsigned *) _r;
|
|
+
|
|
+	return cmp_int(lhs, rhs);
|
|
+}
|
|
+
|
|
+static int quantiles_to_text(struct printbuf *out,
|
|
+ struct bch_fs *c, struct bch_dev *ca,
|
|
+ bucket_map_fn *fn, void *private)
|
|
+{
|
|
+ size_t i, n;
|
|
+ /* Compute 31 quantiles */
|
|
+ unsigned q[31], *p;
|
|
+
|
|
+ down_read(&ca->bucket_lock);
|
|
+ n = ca->mi.nbuckets;
|
|
+
|
|
+ p = vzalloc(n * sizeof(unsigned));
|
|
+ if (!p) {
|
|
+ up_read(&ca->bucket_lock);
|
|
+ return -ENOMEM;
|
|
+ }
|
|
+
|
|
+ for (i = ca->mi.first_bucket; i < n; i++)
|
|
+ p[i] = fn(c, ca, i, private);
|
|
+
|
|
+ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
|
|
+ up_read(&ca->bucket_lock);
|
|
+
|
|
+ while (n &&
|
|
+ !p[n - 1])
|
|
+ --n;
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(q); i++)
|
|
+ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)];
|
|
+
|
|
+ vfree(p);
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(q); i++)
|
|
+ pr_buf(out, "%u ", q[i]);
|
|
+ pr_buf(out, "\n");
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
|
|
+{
|
|
+ enum alloc_reserve i;
|
|
+
|
|
+ spin_lock(&ca->fs->freelist_lock);
|
|
+
|
|
+ pr_buf(out, "free_inc:\t%zu\t%zu\n",
|
|
+ fifo_used(&ca->free_inc),
|
|
+ ca->free_inc.size);
|
|
+
|
|
+ for (i = 0; i < RESERVE_NR; i++)
|
|
+ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
|
|
+ fifo_used(&ca->free[i]),
|
|
+ ca->free[i].size);
|
|
+
|
|
+ spin_unlock(&ca->fs->freelist_lock);
|
|
+}
|
|
+
|
|
+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
|
|
+{
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ struct bch_dev_usage stats = bch2_dev_usage_read(ca);
|
|
+ unsigned i, nr[BCH_DATA_NR];
|
|
+
|
|
+ memset(nr, 0, sizeof(nr));
|
|
+
|
|
+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
|
|
+ nr[c->open_buckets[i].type]++;
|
|
+
|
|
+ pr_buf(out,
|
|
+ "\t\t buckets\t sectors fragmented\n"
|
|
+ "capacity%16llu\n",
|
|
+ ca->mi.nbuckets - ca->mi.first_bucket);
|
|
+
|
|
+ for (i = 1; i < BCH_DATA_NR; i++)
|
|
+ pr_buf(out, "%-8s%16llu%16llu%16llu\n",
|
|
+ bch2_data_types[i], stats.d[i].buckets,
|
|
+ stats.d[i].sectors, stats.d[i].fragmented);
|
|
+
|
|
+ pr_buf(out,
|
|
+ "ec\t%16llu\n"
|
|
+ "available%15llu\n"
|
|
+ "\n"
|
|
+ "free_inc\t\t%zu/%zu\n"
|
|
+ "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
|
|
+ "free[RESERVE_NONE]\t%zu/%zu\n"
|
|
+ "freelist_wait\t\t%s\n"
|
|
+ "open buckets allocated\t%u\n"
|
|
+ "open buckets this dev\t%u\n"
|
|
+ "open buckets total\t%u\n"
|
|
+ "open_buckets_wait\t%s\n"
|
|
+ "open_buckets_btree\t%u\n"
|
|
+ "open_buckets_user\t%u\n"
|
|
+ "btree reserve cache\t%u\n"
|
|
+ "thread state:\t\t%s\n",
|
|
+ stats.buckets_ec,
|
|
+ __dev_buckets_available(ca, stats),
|
|
+ fifo_used(&ca->free_inc), ca->free_inc.size,
|
|
+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
|
|
+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
|
|
+ c->freelist_wait.list.first ? "waiting" : "empty",
|
|
+ OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
|
|
+ ca->nr_open_buckets,
|
|
+ OPEN_BUCKETS_COUNT,
|
|
+ c->open_buckets_wait.list.first ? "waiting" : "empty",
|
|
+ nr[BCH_DATA_btree],
|
|
+ nr[BCH_DATA_user],
|
|
+ c->btree_reserve_cache_nr,
|
|
+ bch2_allocator_states[ca->allocator_state]);
|
|
+}
|
|
+
|
|
+static const char * const bch2_rw[] = {
|
|
+ "read",
|
|
+ "write",
|
|
+ NULL
|
|
+};
|
|
+
|
|
+static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca)
|
|
+{
|
|
+ int rw, i;
|
|
+
|
|
+ for (rw = 0; rw < 2; rw++) {
|
|
+ pr_buf(out, "%s:\n", bch2_rw[rw]);
|
|
+
|
|
+ for (i = 1; i < BCH_DATA_NR; i++)
|
|
+ pr_buf(out, "%-12s:%12llu\n",
|
|
+ bch2_data_types[i],
|
|
+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
|
|
+ }
|
|
+}
|
|
+
|
|
+/*
|
|
+ * sysfs show handler for a member device kobject.  Each sysfs_print*() /
|
|
+ * "if (attr == ...)" clause handles one attribute and returns when @attr
|
|
+ * matches; the printbuf-based clauses return the length of the text placed
|
|
+ * in @buf, and the bucket quantile attributes may return the error from
|
|
+ * quantiles_to_text().  Returns 0 when @attr is not handled here.
|
|
+ */
|
|
+SHOW(bch2_dev)
|
|
+{
|
|
+	struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
|
|
+	struct bch_fs *c = ca->fs;
|
|
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
|
|
+
|
|
+	/* sysfs_printf() appends the trailing newline itself */
|
|
+	sysfs_printf(uuid, "%pU", ca->uuid.b);
|
|
+
|
|
+	sysfs_print(bucket_size, bucket_bytes(ca));
|
|
+	sysfs_print(block_size, block_bytes(c));
|
|
+	sysfs_print(first_bucket, ca->mi.first_bucket);
|
|
+	sysfs_print(nbuckets, ca->mi.nbuckets);
|
|
+	sysfs_print(durability, ca->mi.durability);
|
|
+	sysfs_print(discard, ca->mi.discard);
|
|
+
|
|
+	if (attr == &sysfs_label) {
|
|
+		/* group is stored off by one; 0 means no label is set */
|
|
+		if (ca->mi.group) {
|
|
+			mutex_lock(&c->sb_lock);
|
|
+			bch2_disk_path_to_text(&out, &c->disk_sb,
|
|
+					       ca->mi.group - 1);
|
|
+			mutex_unlock(&c->sb_lock);
|
|
+		}
|
|
+
|
|
+		pr_buf(&out, "\n");
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_has_data) {
|
|
+		bch2_flags_to_text(&out, bch2_data_types,
|
|
+				   bch2_dev_has_data(c, ca));
|
|
+		pr_buf(&out, "\n");
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_cache_replacement_policy) {
|
|
+		bch2_string_opt_to_text(&out,
|
|
+					bch2_cache_replacement_policies,
|
|
+					ca->mi.replacement);
|
|
+		pr_buf(&out, "\n");
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_state_rw) {
|
|
+		bch2_string_opt_to_text(&out, bch2_member_states,
|
|
+					ca->mi.state);
|
|
+		pr_buf(&out, "\n");
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	if (attr == &sysfs_iodone) {
|
|
+		dev_iodone_to_text(&out, ca);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
|
|
+	sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
|
|
+
|
|
+	if (attr == &sysfs_io_latency_stats_read) {
|
|
+		bch2_time_stats_to_text(&out, &ca->io_latency[READ]);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+	if (attr == &sysfs_io_latency_stats_write) {
|
|
+		bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	sysfs_printf(congested, "%u%%",
|
|
+		     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
|
|
+		     * 100 / CONGESTED_MAX);
|
|
+
|
|
+	if (attr == &sysfs_bucket_quantiles_last_read)
|
|
+		return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf;
|
|
+	if (attr == &sysfs_bucket_quantiles_last_write)
|
|
+		return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf;
|
|
+	if (attr == &sysfs_bucket_quantiles_fragmentation)
|
|
+		return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf;
|
|
+	if (attr == &sysfs_bucket_quantiles_oldest_gen)
|
|
+		return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf;
|
|
+
|
|
+	if (attr == &sysfs_reserve_stats) {
|
|
+		reserve_stats_to_text(&out, ca);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+	if (attr == &sysfs_alloc_debug) {
|
|
+		dev_alloc_debug_to_text(&out, ca);
|
|
+		return out.pos - buf;
|
|
+	}
|
|
+
|
|
+	return 0;
|
|
+}
|
|
+
|
|
+STORE(bch2_dev)
|
|
+{
|
|
+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj);
|
|
+ struct bch_fs *c = ca->fs;
|
|
+ struct bch_member *mi;
|
|
+
|
|
+ if (attr == &sysfs_discard) {
|
|
+ bool v = strtoul_or_return(buf);
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
|
|
+
|
|
+ if (v != BCH_MEMBER_DISCARD(mi)) {
|
|
+ SET_BCH_MEMBER_DISCARD(mi, v);
|
|
+ bch2_write_super(c);
|
|
+ }
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+
|
|
+ if (attr == &sysfs_cache_replacement_policy) {
|
|
+ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf);
|
|
+
|
|
+ if (v < 0)
|
|
+ return v;
|
|
+
|
|
+ mutex_lock(&c->sb_lock);
|
|
+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
|
|
+
|
|
+ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
|
|
+ SET_BCH_MEMBER_REPLACEMENT(mi, v);
|
|
+ bch2_write_super(c);
|
|
+ }
|
|
+ mutex_unlock(&c->sb_lock);
|
|
+ }
|
|
+
|
|
+ if (attr == &sysfs_label) {
|
|
+ char *tmp;
|
|
+ int ret;
|
|
+
|
|
+ tmp = kstrdup(buf, GFP_KERNEL);
|
|
+ if (!tmp)
|
|
+ return -ENOMEM;
|
|
+
|
|
+ ret = bch2_dev_group_set(c, ca, strim(tmp));
|
|
+ kfree(tmp);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ if (attr == &sysfs_wake_allocator)
|
|
+ bch2_wake_allocator(ca);
|
|
+
|
|
+ return size;
|
|
+}
|
|
+SYSFS_OPS(bch2_dev);
|
|
+
|
|
+struct attribute *bch2_dev_files[] = {
|
|
+ &sysfs_uuid,
|
|
+ &sysfs_bucket_size,
|
|
+ &sysfs_block_size,
|
|
+ &sysfs_first_bucket,
|
|
+ &sysfs_nbuckets,
|
|
+ &sysfs_durability,
|
|
+
|
|
+ /* settings: */
|
|
+ &sysfs_discard,
|
|
+ &sysfs_cache_replacement_policy,
|
|
+ &sysfs_state_rw,
|
|
+ &sysfs_label,
|
|
+
|
|
+ &sysfs_has_data,
|
|
+ &sysfs_iodone,
|
|
+
|
|
+ &sysfs_io_latency_read,
|
|
+ &sysfs_io_latency_write,
|
|
+ &sysfs_io_latency_stats_read,
|
|
+ &sysfs_io_latency_stats_write,
|
|
+ &sysfs_congested,
|
|
+
|
|
+ /* alloc info - other stats: */
|
|
+ &sysfs_bucket_quantiles_last_read,
|
|
+ &sysfs_bucket_quantiles_last_write,
|
|
+ &sysfs_bucket_quantiles_fragmentation,
|
|
+ &sysfs_bucket_quantiles_oldest_gen,
|
|
+
|
|
+ &sysfs_reserve_stats,
|
|
+
|
|
+ /* debug: */
|
|
+ &sysfs_alloc_debug,
|
|
+ &sysfs_wake_allocator,
|
|
+ NULL
|
|
+};
|
|
+
|
|
+#endif /* NO_BCACHEFS_SYSFS */
|
|
diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h
|
|
new file mode 100644
|
|
index 000000000000..525fd05d91f7
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/sysfs.h
|
|
@@ -0,0 +1,44 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_SYSFS_H_
|
|
+#define _BCACHEFS_SYSFS_H_
|
|
+
|
|
+#include <linux/sysfs.h>
|
|
+
|
|
+#ifndef NO_BCACHEFS_SYSFS
|
|
+
|
|
+struct attribute;
|
|
+struct sysfs_ops;
|
|
+
|
|
+extern struct attribute *bch2_fs_files[];
|
|
+extern struct attribute *bch2_fs_internal_files[];
|
|
+extern struct attribute *bch2_fs_opts_dir_files[];
|
|
+extern struct attribute *bch2_fs_time_stats_files[];
|
|
+extern struct attribute *bch2_dev_files[];
|
|
+
|
|
+extern struct sysfs_ops bch2_fs_sysfs_ops;
|
|
+extern struct sysfs_ops bch2_fs_internal_sysfs_ops;
|
|
+extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
|
|
+extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
|
|
+extern struct sysfs_ops bch2_dev_sysfs_ops;
|
|
+
|
|
+int bch2_opts_create_sysfs_files(struct kobject *);
|
|
+
|
|
+#else
|
|
+
|
|
+static struct attribute *bch2_fs_files[] = {};
|
|
+static struct attribute *bch2_fs_internal_files[] = {};
|
|
+static struct attribute *bch2_fs_opts_dir_files[] = {};
|
|
+static struct attribute *bch2_fs_time_stats_files[] = {};
|
|
+static struct attribute *bch2_dev_files[] = {};
|
|
+
|
|
+static const struct sysfs_ops bch2_fs_sysfs_ops;
|
|
+static const struct sysfs_ops bch2_fs_internal_sysfs_ops;
|
|
+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops;
|
|
+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops;
|
|
+static const struct sysfs_ops bch2_dev_sysfs_ops;
|
|
+
|
|
+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; }
|
|
+
|
|
+#endif /* NO_BCACHEFS_SYSFS */
|
|
+
|
|
+#endif /* _BCACHEFS_SYSFS_H_ */
|
|
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
|
|
new file mode 100644
|
|
index 000000000000..254e3b314204
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/tests.c
|
|
@@ -0,0 +1,859 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#ifdef CONFIG_BCACHEFS_TESTS
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "btree_update.h"
|
|
+#include "journal_reclaim.h"
|
|
+#include "tests.h"
|
|
+
|
|
+#include "linux/kthread.h"
|
|
+#include "linux/random.h"
|
|
+
|
|
+static void delete_test_keys(struct bch_fs *c)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_btree_delete_range(c, BTREE_ID_extents,
|
|
+ POS(0, 0), POS(0, U64_MAX),
|
|
+ NULL);
|
|
+ BUG_ON(ret);
|
|
+
|
|
+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
|
|
+ POS(0, 0), POS(0, U64_MAX),
|
|
+ NULL);
|
|
+ BUG_ON(ret);
|
|
+}
|
|
+
|
|
+/* unit tests */
|
|
+
|
|
+static int test_delete(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_i_cookie k;
|
|
+ int ret;
|
|
+
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
|
|
+ BTREE_ITER_INTENT);
|
|
+
|
|
+ ret = bch2_btree_iter_traverse(iter);
|
|
+ if (ret) {
|
|
+ bch_err(c, "lookup error in test_delete: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ bch2_trans_update(&trans, iter, &k.k_i, 0));
|
|
+ if (ret) {
|
|
+ bch_err(c, "update error in test_delete: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ pr_info("deleting once");
|
|
+ ret = bch2_btree_delete_at(&trans, iter, 0);
|
|
+ if (ret) {
|
|
+ bch_err(c, "delete error (first) in test_delete: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ pr_info("deleting twice");
|
|
+ ret = bch2_btree_delete_at(&trans, iter, 0);
|
|
+ if (ret) {
|
|
+ bch_err(c, "delete error (second) in test_delete: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+err:
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int test_delete_written(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_i_cookie k;
|
|
+ int ret;
|
|
+
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, k.k.p,
|
|
+ BTREE_ITER_INTENT);
|
|
+
|
|
+ ret = bch2_btree_iter_traverse(iter);
|
|
+ if (ret) {
|
|
+ bch_err(c, "lookup error in test_delete_written: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ bch2_trans_update(&trans, iter, &k.k_i, 0));
|
|
+ if (ret) {
|
|
+ bch_err(c, "update error in test_delete_written: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ bch2_journal_flush_all_pins(&c->journal);
|
|
+
|
|
+ ret = bch2_btree_delete_at(&trans, iter, 0);
|
|
+ if (ret) {
|
|
+ bch_err(c, "delete error in test_delete_written: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+err:
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int test_iterate(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter = NULL;
|
|
+ struct bkey_s_c k;
|
|
+ u64 i;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ delete_test_keys(c);
|
|
+
|
|
+ pr_info("inserting test keys");
|
|
+
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ struct bkey_i_cookie k;
|
|
+
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+ k.k.p.offset = i;
|
|
+
|
|
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
|
|
+ NULL, NULL, 0);
|
|
+ if (ret) {
|
|
+ bch_err(c, "insert error in test_iterate: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ pr_info("iterating forwards");
|
|
+
|
|
+ i = 0;
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
|
|
+ POS_MIN, 0, k, ret) {
|
|
+ if (k.k->p.inode)
|
|
+ break;
|
|
+
|
|
+ BUG_ON(k.k->p.offset != i++);
|
|
+ }
|
|
+
|
|
+ BUG_ON(i != nr);
|
|
+
|
|
+ pr_info("iterating backwards");
|
|
+
|
|
+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k))
|
|
+ BUG_ON(k.k->p.offset != --i);
|
|
+
|
|
+ BUG_ON(i);
|
|
+err:
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int test_iterate_extents(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter = NULL;
|
|
+ struct bkey_s_c k;
|
|
+ u64 i;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ delete_test_keys(c);
|
|
+
|
|
+ pr_info("inserting test extents");
|
|
+
|
|
+ for (i = 0; i < nr; i += 8) {
|
|
+ struct bkey_i_cookie k;
|
|
+
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+ k.k.p.offset = i + 8;
|
|
+ k.k.size = 8;
|
|
+
|
|
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
|
|
+ NULL, NULL, 0);
|
|
+ if (ret) {
|
|
+ bch_err(c, "insert error in test_iterate_extents: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ pr_info("iterating forwards");
|
|
+
|
|
+ i = 0;
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_extents,
|
|
+ POS_MIN, 0, k, ret) {
|
|
+ BUG_ON(bkey_start_offset(k.k) != i);
|
|
+ i = k.k->p.offset;
|
|
+ }
|
|
+
|
|
+ BUG_ON(i != nr);
|
|
+
|
|
+ pr_info("iterating backwards");
|
|
+
|
|
+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) {
|
|
+ BUG_ON(k.k->p.offset != i);
|
|
+ i = bkey_start_offset(k.k);
|
|
+ }
|
|
+
|
|
+ BUG_ON(i);
|
|
+err:
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int test_iterate_slots(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ u64 i;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ delete_test_keys(c);
|
|
+
|
|
+ pr_info("inserting test keys");
|
|
+
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ struct bkey_i_cookie k;
|
|
+
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+ k.k.p.offset = i * 2;
|
|
+
|
|
+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i,
|
|
+ NULL, NULL, 0);
|
|
+ if (ret) {
|
|
+ bch_err(c, "insert error in test_iterate_slots: %i", ret);
|
|
+ goto err;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ pr_info("iterating forwards");
|
|
+
|
|
+ i = 0;
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
|
|
+ 0, k, ret) {
|
|
+ if (k.k->p.inode)
|
|
+ break;
|
|
+
|
|
+ BUG_ON(k.k->p.offset != i);
|
|
+ i += 2;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ BUG_ON(i != nr * 2);
|
|
+
|
|
+ pr_info("iterating forwards by slots");
|
|
+
|
|
+ i = 0;
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
|
|
+ BTREE_ITER_SLOTS, k, ret) {
|
|
+ BUG_ON(k.k->p.offset != i);
|
|
+ BUG_ON(bkey_deleted(k.k) != (i & 1));
|
|
+
|
|
+ i++;
|
|
+ if (i == nr * 2)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+err:
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
+ * Extents-btree iteration test, with and without BTREE_ITER_SLOTS.
+ *
+ * Inserts one size-8 cookie extent ending at every multiple of 16 up to @nr
+ * (each covers [i + 8, i + 16)), then walks BTREE_ID_extents twice: once over
+ * keys only, once by slots, where every range starting at a multiple of 16 is
+ * expected to come back as a deleted key.
+ *
+ * Returns 0 on success, otherwise the error left in ret (a failed insert, or
+ * whatever for_each_btree_key() stored there). Layout mismatches trip
+ * BUG_ON().
+ */
+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
+{
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c k;
+	u64 i;
+	int ret = 0;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	delete_test_keys(c);
+
+	pr_info("inserting test keys");
+
+	for (i = 0; i < nr; i += 16) {
+		struct bkey_i_cookie k;
+
+		bkey_cookie_init(&k.k_i);
+		k.k.p.offset = i + 16;
+		k.k.size = 8;
+
+		ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
+					NULL, NULL, 0);
+		if (ret) {
+			bch_err(c, "insert error in test_iterate_slots_extents: %i", ret);
+			goto err;
+		}
+	}
+
+	pr_info("iterating forwards");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+			   0, k, ret) {
+		BUG_ON(bkey_start_offset(k.k) != i + 8);
+		BUG_ON(k.k->size != 8);
+		i += 16;
+	}
+	bch2_trans_iter_put(&trans, iter);
+
+	BUG_ON(i != nr);
+
+	pr_info("iterating forwards by slots");
+
+	i = 0;
+
+	for_each_btree_key(&trans, iter, BTREE_ID_extents, POS_MIN,
+			   BTREE_ITER_SLOTS, k, ret) {
+		/* Holes start on multiples of 16, real extents 8 later */
+		BUG_ON(bkey_deleted(k.k) != !(i % 16));
+
+		BUG_ON(bkey_start_offset(k.k) != i);
+		BUG_ON(k.k->size != 8);
+		i = k.k->p.offset;
+
+		if (i == nr)
+			break;
+	}
+	bch2_trans_iter_put(&trans, iter);
+err:
+	bch2_trans_exit(&trans);
+	/* Report failures to the caller rather than unconditionally succeeding */
+	return ret;
+}
|
|
+
|
|
+/*
|
|
+ * XXX: we really want to make sure we've got a btree with depth > 0 for these
|
|
+ * tests
|
|
+ */
|
|
+static int test_peek_end(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
|
|
+
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ BUG_ON(k.k);
|
|
+
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ BUG_ON(k.k);
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int test_peek_end_extents(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, POS_MIN, 0);
|
|
+
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ BUG_ON(k.k);
|
|
+
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ BUG_ON(k.k);
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/* extent unit tests */
|
|
+
|
|
+/*
+ * Version number stamped on each extent by insert_test_extent(). Not declared
+ * in tests.h, so keep it out of the global namespace.
+ */
+static u64 test_version;
|
|
+
|
|
+static int insert_test_extent(struct bch_fs *c,
|
|
+ u64 start, u64 end)
|
|
+{
|
|
+ struct bkey_i_cookie k;
|
|
+ int ret;
|
|
+
|
|
+ //pr_info("inserting %llu-%llu v %llu", start, end, test_version);
|
|
+
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+ k.k_i.k.p.offset = end;
|
|
+ k.k_i.k.size = end - start;
|
|
+ k.k_i.k.version.lo = test_version++;
|
|
+
|
|
+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i,
|
|
+ NULL, NULL, 0);
|
|
+ if (ret)
|
|
+ bch_err(c, "insert error in insert_test_extent: %i", ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int __test_extent_overwrite(struct bch_fs *c,
|
|
+ u64 e1_start, u64 e1_end,
|
|
+ u64 e2_start, u64 e2_end)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ ret = insert_test_extent(c, e1_start, e1_end) ?:
|
|
+ insert_test_extent(c, e2_start, e2_end);
|
|
+
|
|
+ delete_test_keys(c);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ return __test_extent_overwrite(c, 0, 64, 0, 32) ?:
|
|
+ __test_extent_overwrite(c, 8, 64, 0, 32);
|
|
+}
|
|
+
|
|
+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ return __test_extent_overwrite(c, 0, 64, 32, 64) ?:
|
|
+ __test_extent_overwrite(c, 0, 64, 32, 72);
|
|
+}
|
|
+
|
|
+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ return __test_extent_overwrite(c, 0, 64, 32, 40);
|
|
+}
|
|
+
|
|
+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ return __test_extent_overwrite(c, 32, 64, 0, 64) ?:
|
|
+ __test_extent_overwrite(c, 32, 64, 0, 128) ?:
|
|
+ __test_extent_overwrite(c, 32, 64, 32, 64) ?:
|
|
+ __test_extent_overwrite(c, 32, 64, 32, 128);
|
|
+}
|
|
+
|
|
+/* perf tests */
|
|
+
|
|
+static u64 test_rand(void)
|
|
+{
|
|
+ u64 v;
|
|
+#if 0
|
|
+ v = prandom_u32();
|
|
+#else
|
|
+ prandom_bytes(&v, sizeof(v));
|
|
+#endif
|
|
+ return v;
|
|
+}
|
|
+
|
|
+static int rand_insert(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct bkey_i_cookie k;
|
|
+ int ret = 0;
|
|
+ u64 i;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+ k.k.p.offset = test_rand();
|
|
+ k.k.p.snapshot = U32_MAX;
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i));
|
|
+ if (ret) {
|
|
+ bch_err(c, "error in rand_insert: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int rand_insert_multi(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct bkey_i_cookie k[8];
|
|
+ int ret = 0;
|
|
+ unsigned j;
|
|
+ u64 i;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for (i = 0; i < nr; i += ARRAY_SIZE(k)) {
|
|
+ for (j = 0; j < ARRAY_SIZE(k); j++) {
|
|
+ bkey_cookie_init(&k[j].k_i);
|
|
+ k[j].k.p.offset = test_rand();
|
|
+ k[j].k.p.snapshot = U32_MAX;
|
|
+ }
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i) ?:
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i) ?:
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i) ?:
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i) ?:
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i) ?:
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i) ?:
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i) ?:
|
|
+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i));
|
|
+ if (ret) {
|
|
+ bch_err(c, "error in rand_insert_multi: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int rand_lookup(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+ u64 i;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
|
|
+
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
|
|
+
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret) {
|
|
+ bch_err(c, "error in rand_lookup: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int rand_mixed(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+ u64 i;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+ iter = bch2_trans_get_iter(&trans, BTREE_ID_xattrs, POS_MIN, 0);
|
|
+
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ bch2_btree_iter_set_pos(iter, POS(0, test_rand()));
|
|
+
|
|
+ k = bch2_btree_iter_peek(iter);
|
|
+ ret = bkey_err(k);
|
|
+ if (ret) {
|
|
+ bch_err(c, "lookup error in rand_mixed: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (!(i & 3) && k.k) {
|
|
+ struct bkey_i_cookie k;
|
|
+
|
|
+ bkey_cookie_init(&k.k_i);
|
|
+ k.k.p = iter->pos;
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ bch2_trans_update(&trans, iter, &k.k_i, 0));
|
|
+ if (ret) {
|
|
+ bch_err(c, "update error in rand_mixed: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+/*
+ * Queue the deletion of the key that bch2_btree_iter_peek() finds from @pos
+ * in the xattrs btree.
+ *
+ * The deletion is an empty key (bkey_init()) at the found key's position,
+ * handed to bch2_trans_update() on @trans; the caller (rand_delete(), via
+ * __bch2_trans_do()) is presumably what commits it.
+ *
+ * Returns 0 if the update was queued or there was no key to delete, otherwise
+ * the lookup or update error.
+ */
+static int __do_delete(struct btree_trans *trans, struct bpos pos)
+{
+	struct btree_iter *iter;
+	struct bkey_i delete;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_xattrs, pos,
+				   BTREE_ITER_INTENT);
+	k = bch2_btree_iter_peek(iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	/* Nothing found: not an error, ret is still 0 */
+	if (!k.k)
+		goto err;
+
+	bkey_init(&delete.k);
+	delete.k.p = k.k->p;
+
+	/* Propagate update failures instead of reporting success */
+	ret = bch2_trans_update(trans, iter, &delete, 0);
+err:
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
|
|
+
|
|
+static int rand_delete(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ int ret = 0;
|
|
+ u64 i;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for (i = 0; i < nr; i++) {
|
|
+ struct bpos pos = POS(0, test_rand());
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ __do_delete(&trans, pos));
|
|
+ if (ret) {
|
|
+ bch_err(c, "error in rand_delete: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int seq_insert(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct bkey_i_cookie insert;
|
|
+ int ret = 0;
|
|
+ u64 i = 0;
|
|
+
|
|
+ bkey_cookie_init(&insert.k_i);
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
|
|
+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) {
|
|
+ insert.k.p = iter->pos;
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ bch2_trans_update(&trans, iter, &insert.k_i, 0));
|
|
+ if (ret) {
|
|
+ bch_err(c, "error in seq_insert: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (++i == nr)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int seq_lookup(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN, 0, k, ret)
|
|
+ ;
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int seq_overwrite(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ int ret = 0;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs, POS_MIN,
|
|
+ BTREE_ITER_INTENT, k, ret) {
|
|
+ struct bkey_i_cookie u;
|
|
+
|
|
+ bkey_reassemble(&u.k_i, k);
|
|
+
|
|
+ ret = __bch2_trans_do(&trans, NULL, NULL, 0,
|
|
+ bch2_trans_update(&trans, iter, &u.k_i, 0));
|
|
+ if (ret) {
|
|
+ bch_err(c, "error in seq_overwrite: %i", ret);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ bch2_trans_exit(&trans);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static int seq_delete(struct bch_fs *c, u64 nr)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs,
|
|
+ POS(0, 0), POS(0, U64_MAX),
|
|
+ NULL);
|
|
+ if (ret)
|
|
+ bch_err(c, "error in seq_delete: %i", ret);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+typedef int (*perf_test_fn)(struct bch_fs *, u64);
|
|
+
|
|
+struct test_job {
|
|
+ struct bch_fs *c;
|
|
+ u64 nr;
|
|
+ unsigned nr_threads;
|
|
+ perf_test_fn fn;
|
|
+
|
|
+ atomic_t ready;
|
|
+ wait_queue_head_t ready_wait;
|
|
+
|
|
+ atomic_t done;
|
|
+ struct completion done_completion;
|
|
+
|
|
+ u64 start;
|
|
+ u64 finish;
|
|
+ int ret;
|
|
+};
|
|
+
|
|
+static int btree_perf_test_thread(void *data)
|
|
+{
|
|
+ struct test_job *j = data;
|
|
+ int ret;
|
|
+
|
|
+ if (atomic_dec_and_test(&j->ready)) {
|
|
+ wake_up(&j->ready_wait);
|
|
+ j->start = sched_clock();
|
|
+ } else {
|
|
+ wait_event(j->ready_wait, !atomic_read(&j->ready));
|
|
+ }
|
|
+
|
|
+ ret = j->fn(j->c, j->nr / j->nr_threads);
|
|
+ if (ret)
|
|
+ j->ret = ret;
|
|
+
|
|
+ if (atomic_dec_and_test(&j->done)) {
|
|
+ j->finish = sched_clock();
|
|
+ complete(&j->done_completion);
|
|
+ }
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
+ * Look up @testname among the tests in this file, run it with @nr iterations
+ * split across @nr_threads threads, and log the elapsed time.
+ *
+ * With a single thread the test runs in the calling context; otherwise one
+ * kthread is started per thread and the caller waits for the last one to
+ * finish.
+ *
+ * Returns -EINVAL for an unknown test name or if @nr or @nr_threads is 0,
+ * otherwise j.ret: 0, or an error reported by one of the test threads.
+ */
+int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+			 u64 nr, unsigned nr_threads)
+{
+	struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
+	char name_buf[20], nr_buf[20], per_sec_buf[20];
+	unsigned i;
+	u64 time;
+
+	/*
+	 * nr is a divisor in the report below, and with no threads nothing
+	 * would ever signal done_completion:
+	 */
+	if (!nr || !nr_threads) {
+		pr_err("nr of iterations or threads is not allowed to be 0");
+		return -EINVAL;
+	}
+
+	atomic_set(&j.ready, nr_threads);
+	init_waitqueue_head(&j.ready_wait);
+
+	atomic_set(&j.done, nr_threads);
+	init_completion(&j.done_completion);
+
+#define perf_test(_test)				\
+	if (!strcmp(testname, #_test)) j.fn = _test
+
+	perf_test(rand_insert);
+	perf_test(rand_insert_multi);
+	perf_test(rand_lookup);
+	perf_test(rand_mixed);
+	perf_test(rand_delete);
+
+	perf_test(seq_insert);
+	perf_test(seq_lookup);
+	perf_test(seq_overwrite);
+	perf_test(seq_delete);
+
+	/* a unit test, not a perf test: */
+	perf_test(test_delete);
+	perf_test(test_delete_written);
+	perf_test(test_iterate);
+	perf_test(test_iterate_extents);
+	perf_test(test_iterate_slots);
+	perf_test(test_iterate_slots_extents);
+	perf_test(test_peek_end);
+	perf_test(test_peek_end_extents);
+
+	perf_test(test_extent_overwrite_front);
+	perf_test(test_extent_overwrite_back);
+	perf_test(test_extent_overwrite_middle);
+	perf_test(test_extent_overwrite_all);
+
+	if (!j.fn) {
+		pr_err("unknown test %s", testname);
+		return -EINVAL;
+	}
+
+	//pr_info("running test %s:", testname);
+
+	/*
+	 * NOTE(review): kthread_run() failures are not handled; a thread that
+	 * fails to start never decrements j.done, so the wait below would not
+	 * finish.
+	 */
+	if (nr_threads == 1)
+		btree_perf_test_thread(&j);
+	else
+		for (i = 0; i < nr_threads; i++)
+			kthread_run(btree_perf_test_thread, &j,
+				    "bcachefs perf test[%u]", i);
+
+	/* j lives on this stack frame: keep waiting even if interrupted */
+	while (wait_for_completion_interruptible(&j.done_completion))
+		;
+
+	time = j.finish - j.start;
+
+	scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+	bch2_hprint(&PBUF(nr_buf), nr);
+	/* Don't divide by an elapsed time of 0 */
+	bch2_hprint(&PBUF(per_sec_buf), time ? nr * NSEC_PER_SEC / time : 0);
+	printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+		name_buf, nr_buf, nr_threads,
+		time / NSEC_PER_SEC,
+		time * nr_threads / nr,
+		per_sec_buf);
+	return j.ret;
+}
|
|
+
|
|
+#endif /* CONFIG_BCACHEFS_TESTS */
|
|
diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
|
|
new file mode 100644
|
|
index 000000000000..c73b18aea7e0
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/tests.h
|
|
@@ -0,0 +1,15 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_TEST_H
|
|
+#define _BCACHEFS_TEST_H
|
|
+
|
|
+struct bch_fs;
|
|
+
|
|
+#ifdef CONFIG_BCACHEFS_TESTS
|
|
+
|
|
+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
|
|
+
|
|
+#else
|
|
+
|
|
+#endif /* CONFIG_BCACHEFS_TESTS */
|
|
+
|
|
+#endif /* _BCACHEFS_TEST_H */
|
|
diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
|
|
new file mode 100644
|
|
index 000000000000..59e8dfa3d245
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/trace.c
|
|
@@ -0,0 +1,12 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+#include "bcachefs.h"
|
|
+#include "alloc_types.h"
|
|
+#include "buckets.h"
|
|
+#include "btree_types.h"
|
|
+#include "keylist.h"
|
|
+
|
|
+#include <linux/blktrace_api.h>
|
|
+#include "keylist.h"
|
|
+
|
|
+#define CREATE_TRACE_POINTS
|
|
+#include <trace/events/bcachefs.h>
|
|
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
|
|
new file mode 100644
|
|
index 000000000000..e3ad26e244ab
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/util.c
|
|
@@ -0,0 +1,907 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * random utility code, for bcache but in theory not specific to bcache
|
|
+ *
|
|
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include <linux/bio.h>
|
|
+#include <linux/blkdev.h>
|
|
+#include <linux/ctype.h>
|
|
+#include <linux/debugfs.h>
|
|
+#include <linux/freezer.h>
|
|
+#include <linux/kthread.h>
|
|
+#include <linux/log2.h>
|
|
+#include <linux/math64.h>
|
|
+#include <linux/percpu.h>
|
|
+#include <linux/preempt.h>
|
|
+#include <linux/random.h>
|
|
+#include <linux/seq_file.h>
|
|
+#include <linux/string.h>
|
|
+#include <linux/types.h>
|
|
+#include <linux/sched/clock.h>
|
|
+
|
|
+#include "eytzinger.h"
|
|
+#include "util.h"
|
|
+
|
|
+static const char si_units[] = "?kMGTPEZY";
|
|
+
|
|
+static int __bch2_strtoh(const char *cp, u64 *res,
|
|
+ u64 t_max, bool t_signed)
|
|
+{
|
|
+ bool positive = *cp != '-';
|
|
+ unsigned u;
|
|
+ u64 v = 0;
|
|
+
|
|
+ if (*cp == '+' || *cp == '-')
|
|
+ cp++;
|
|
+
|
|
+ if (!isdigit(*cp))
|
|
+ return -EINVAL;
|
|
+
|
|
+ do {
|
|
+ if (v > U64_MAX / 10)
|
|
+ return -ERANGE;
|
|
+ v *= 10;
|
|
+ if (v > U64_MAX - (*cp - '0'))
|
|
+ return -ERANGE;
|
|
+ v += *cp - '0';
|
|
+ cp++;
|
|
+ } while (isdigit(*cp));
|
|
+
|
|
+ for (u = 1; u < strlen(si_units); u++)
|
|
+ if (*cp == si_units[u]) {
|
|
+ cp++;
|
|
+ goto got_unit;
|
|
+ }
|
|
+ u = 0;
|
|
+got_unit:
|
|
+ if (*cp == '\n')
|
|
+ cp++;
|
|
+ if (*cp)
|
|
+ return -EINVAL;
|
|
+
|
|
+ if (fls64(v) + u * 10 > 64)
|
|
+ return -ERANGE;
|
|
+
|
|
+ v <<= u * 10;
|
|
+
|
|
+ if (positive) {
|
|
+ if (v > t_max)
|
|
+ return -ERANGE;
|
|
+ } else {
|
|
+ if (v && !t_signed)
|
|
+ return -ERANGE;
|
|
+
|
|
+ if (v > t_max + 1)
|
|
+ return -ERANGE;
|
|
+ v = -v;
|
|
+ }
|
|
+
|
|
+ *res = v;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+/*
+ * Generate bch2_<name>_h(): parse @cp with __bch2_strtoh() and store the
+ * result in *@res as a @type.
+ *
+ * The range limit passed down is ANYSINT_MAX(type); whether the type counts
+ * as signed is derived by comparing that limit with (type) ~0ULL.
+ *
+ * *@res is written even when parsing fails. __bch2_strtoh() only stores a
+ * value on success, so v starts at 0 to make a failed parse store 0 rather
+ * than uninitialized stack contents. Returns __bch2_strtoh()'s result.
+ */
+#define STRTO_H(name, type)					\
+int bch2_ ## name ## _h(const char *cp, type *res)		\
+{								\
+	u64 v = 0;						\
+	int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type),	\
+			ANYSINT_MAX(type) != ((type) ~0ULL));	\
+	*res = v;						\
+	return ret;						\
+}
|
|
+
|
|
+STRTO_H(strtoint, int)
|
|
+STRTO_H(strtouint, unsigned int)
|
|
+STRTO_H(strtoll, long long)
|
|
+STRTO_H(strtoull, unsigned long long)
|
|
+STRTO_H(strtou64, u64)
|
|
+
|
|
+/*
+ * Print @v to @buf in human readable form: scaled down by powers of 1024,
+ * with one fractional digit while the scaled value is below 100, followed by
+ * the matching si_units[] suffix. Values below 1024 in magnitude are printed
+ * as-is, without a suffix.
+ */
+void bch2_hprint(struct printbuf *buf, s64 v)
+{
+	/*
+	 * Scale the magnitude and print the sign separately: masking and
+	 * shifting a negative value directly rounds away from zero and yields
+	 * the wrong fraction (e.g. -1025 would come out as "-2.9k").
+	 */
+	u64 mag = v < 0 ? -(u64) v : (u64) v;
+	int u, t = 0;
+
+	for (u = 0; mag >= 1024; u++) {
+		/* Remember the 10 bits about to be shifted out */
+		t = mag & ((1U << 10) - 1);
+		mag >>= 10;
+	}
+
+	pr_buf(buf, "%s%llu", v < 0 ? "-" : "", mag);
+
+	/*
+	 * 103 is magic: t is in the range [0, 1023] and we want
+	 * to turn it into [0, 9]
+	 */
+	if (u && mag < 100)
+		pr_buf(buf, ".%i", t / 103);
+	if (u)
+		pr_buf(buf, "%c", si_units[u]);
+}
|
|
+
|
|
+void bch2_string_opt_to_text(struct printbuf *out,
|
|
+ const char * const list[],
|
|
+ size_t selected)
|
|
+{
|
|
+ size_t i;
|
|
+
|
|
+ for (i = 0; list[i]; i++)
|
|
+ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]);
|
|
+}
|
|
+
|
|
+/*
+ * Print the names of the bits set in @flags to @out as a comma-separated
+ * list, lowest bit first.
+ *
+ * @list is a NULL-terminated array of names indexed by bit number. Printing
+ * stops at the first set bit that has no entry in @list.
+ */
+void bch2_flags_to_text(struct printbuf *out,
+			const char * const list[], u64 flags)
+{
+	unsigned bit, nr = 0;
+	bool first = true;
+
+	/*
+	 * Terminate at the current position if there is room, presumably so
+	 * the buffer reads as an empty string when no flag gets printed:
+	 */
+	if (out->pos != out->end)
+		*out->pos = '\0';
+
+	while (list[nr])
+		nr++;
+
+	while (flags && (bit = __ffs(flags)) < nr) {
+		if (!first)
+			pr_buf(out, ",");
+		first = false;
+		pr_buf(out, "%s", list[bit]);
+		/*
+		 * flags is 64 bits wide, so the mask must be too: shifting a
+		 * plain int 1 by 31 or more is undefined and could leave the
+		 * bit set, looping forever.
+		 */
+		flags ^= 1ULL << bit;
+	}
+}
|
|
+
|
|
+/*
+ * Parse a comma-separated list of flag names into a bitmask.
+ *
+ * Each name in @opt is looked up in the NULL-terminated @list; its index is
+ * the bit that gets set. Parsing works on a trimmed copy, so @opt itself is
+ * not modified.
+ *
+ * Returns the bitmask, (u64) -1 if a name is not found in @list, or
+ * (u64) -ENOMEM if the copy cannot be allocated.
+ */
+u64 bch2_read_flag_list(char *opt, const char * const list[])
+{
+	u64 ret = 0;
+	char *p, *s, *d = kstrdup(opt, GFP_KERNEL);
+
+	if (!d)
+		return -ENOMEM;
+
+	s = strim(d);
+
+	while ((p = strsep(&s, ","))) {
+		int flag = match_string(list, -1, p);
+		if (flag < 0) {
+			ret = -1;
+			break;
+		}
+
+		/* ret is 64 bits wide, so the shifted constant must be too */
+		ret |= 1ULL << flag;
+	}
+
+	kfree(d);
+
+	return ret;
+}
|
|
+
|
|
+bool bch2_is_zero(const void *_p, size_t n)
|
|
+{
|
|
+ const char *p = _p;
|
|
+ size_t i;
|
|
+
|
|
+ for (i = 0; i < n; i++)
|
|
+ if (p[i])
|
|
+ return false;
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static void bch2_quantiles_update(struct quantiles *q, u64 v)
|
|
+{
|
|
+ unsigned i = 0;
|
|
+
|
|
+ while (i < ARRAY_SIZE(q->entries)) {
|
|
+ struct quantile_entry *e = q->entries + i;
|
|
+
|
|
+ if (unlikely(!e->step)) {
|
|
+ e->m = v;
|
|
+ e->step = max_t(unsigned, v / 2, 1024);
|
|
+ } else if (e->m > v) {
|
|
+ e->m = e->m >= e->step
|
|
+ ? e->m - e->step
|
|
+ : 0;
|
|
+ } else if (e->m < v) {
|
|
+ e->m = e->m + e->step > e->m
|
|
+ ? e->m + e->step
|
|
+ : U32_MAX;
|
|
+ }
|
|
+
|
|
+ if ((e->m > v ? e->m - v : v - e->m) < e->step)
|
|
+ e->step = max_t(unsigned, e->step / 2, 1);
|
|
+
|
|
+ if (v >= e->m)
|
|
+ break;
|
|
+
|
|
+ i = eytzinger0_child(i, v > e->m);
|
|
+ }
|
|
+}
|
|
+
|
|
+/* time stats: */
|
|
+
|
|
+static void bch2_time_stats_update_one(struct time_stats *stats,
|
|
+ u64 start, u64 end)
|
|
+{
|
|
+ u64 duration, freq;
|
|
+
|
|
+ duration = time_after64(end, start)
|
|
+ ? end - start : 0;
|
|
+ freq = time_after64(end, stats->last_event)
|
|
+ ? end - stats->last_event : 0;
|
|
+
|
|
+ stats->count++;
|
|
+
|
|
+ stats->average_duration = stats->average_duration
|
|
+ ? ewma_add(stats->average_duration, duration, 6)
|
|
+ : duration;
|
|
+
|
|
+ stats->average_frequency = stats->average_frequency
|
|
+ ? ewma_add(stats->average_frequency, freq, 6)
|
|
+ : freq;
|
|
+
|
|
+ stats->max_duration = max(stats->max_duration, duration);
|
|
+
|
|
+ stats->last_event = end;
|
|
+
|
|
+ bch2_quantiles_update(&stats->quantiles, duration);
|
|
+}
|
|
+
|
|
+void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
|
|
+{
|
|
+ unsigned long flags;
|
|
+
|
|
+ if (!stats->buffer) {
|
|
+ spin_lock_irqsave(&stats->lock, flags);
|
|
+ bch2_time_stats_update_one(stats, start, end);
|
|
+
|
|
+ if (stats->average_frequency < 32 &&
|
|
+ stats->count > 1024)
|
|
+ stats->buffer =
|
|
+ alloc_percpu_gfp(struct time_stat_buffer,
|
|
+ GFP_ATOMIC);
|
|
+ spin_unlock_irqrestore(&stats->lock, flags);
|
|
+ } else {
|
|
+ struct time_stat_buffer_entry *i;
|
|
+ struct time_stat_buffer *b;
|
|
+
|
|
+ preempt_disable();
|
|
+ b = this_cpu_ptr(stats->buffer);
|
|
+
|
|
+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
|
|
+ b->entries[b->nr++] = (struct time_stat_buffer_entry) {
|
|
+ .start = start,
|
|
+ .end = end
|
|
+ };
|
|
+
|
|
+ if (b->nr == ARRAY_SIZE(b->entries)) {
|
|
+ spin_lock_irqsave(&stats->lock, flags);
|
|
+ for (i = b->entries;
|
|
+ i < b->entries + ARRAY_SIZE(b->entries);
|
|
+ i++)
|
|
+ bch2_time_stats_update_one(stats, i->start, i->end);
|
|
+ spin_unlock_irqrestore(&stats->lock, flags);
|
|
+
|
|
+ b->nr = 0;
|
|
+ }
|
|
+
|
|
+ preempt_enable();
|
|
+ }
|
|
+}
|
|
+
|
|
+static const struct time_unit {
|
|
+ const char *name;
|
|
+ u32 nsecs;
|
|
+} time_units[] = {
|
|
+ { "ns", 1 },
|
|
+ { "us", NSEC_PER_USEC },
|
|
+ { "ms", NSEC_PER_MSEC },
|
|
+ { "sec", NSEC_PER_SEC },
|
|
+};
|
|
+
|
|
+static const struct time_unit *pick_time_units(u64 ns)
|
|
+{
|
|
+ const struct time_unit *u;
|
|
+
|
|
+ for (u = time_units;
|
|
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
|
|
+ ns >= u[1].nsecs << 1;
|
|
+ u++)
|
|
+ ;
|
|
+
|
|
+ return u;
|
|
+}
|
|
+
|
|
+static void pr_time_units(struct printbuf *out, u64 ns)
|
|
+{
|
|
+ const struct time_unit *u = pick_time_units(ns);
|
|
+
|
|
+ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
|
|
+}
|
|
+
|
|
+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
|
|
+{
|
|
+ const struct time_unit *u;
|
|
+ u64 freq = READ_ONCE(stats->average_frequency);
|
|
+ u64 q, last_q = 0;
|
|
+ int i;
|
|
+
|
|
+ pr_buf(out, "count:\t\t%llu\n",
|
|
+ stats->count);
|
|
+ pr_buf(out, "rate:\t\t%llu/sec\n",
|
|
+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
|
|
+
|
|
+ pr_buf(out, "frequency:\t");
|
|
+ pr_time_units(out, freq);
|
|
+
|
|
+ pr_buf(out, "\navg duration:\t");
|
|
+ pr_time_units(out, stats->average_duration);
|
|
+
|
|
+ pr_buf(out, "\nmax duration:\t");
|
|
+ pr_time_units(out, stats->max_duration);
|
|
+
|
|
+ i = eytzinger0_first(NR_QUANTILES);
|
|
+ u = pick_time_units(stats->quantiles.entries[i].m);
|
|
+
|
|
+ pr_buf(out, "\nquantiles (%s):\t", u->name);
|
|
+ eytzinger0_for_each(i, NR_QUANTILES) {
|
|
+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
|
|
+
|
|
+ q = max(stats->quantiles.entries[i].m, last_q);
|
|
+ pr_buf(out, "%llu%s",
|
|
+ div_u64(q, u->nsecs),
|
|
+ is_last ? "\n" : " ");
|
|
+ last_q = q;
|
|
+ }
|
|
+}
|
|
+
|
|
+void bch2_time_stats_exit(struct time_stats *stats)
|
|
+{
|
|
+ free_percpu(stats->buffer);
|
|
+}
|
|
+
|
|
+void bch2_time_stats_init(struct time_stats *stats)
|
|
+{
|
|
+ memset(stats, 0, sizeof(*stats));
|
|
+ spin_lock_init(&stats->lock);
|
|
+}
|
|
+
|
|
+/* ratelimit: */
|
|
+
|
|
+/**
|
|
+ * bch2_ratelimit_delay() - return how long to delay until the next time to do
|
|
+ * some work
|
|
+ *
|
|
+ * @d: the struct bch_ratelimit to query
|
|
+ *
|
|
+ * Returns the amount of time to delay by, in jiffies
|
|
+ */
|
|
+u64 bch2_ratelimit_delay(struct bch_ratelimit *d)
|
|
+{
|
|
+ u64 now = local_clock();
|
|
+
|
|
+ return time_after64(d->next, now)
|
|
+ ? nsecs_to_jiffies(d->next - now)
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+/**
+ * bch2_ratelimit_increment() - account for work done against @d
+ * @d: the struct bch_ratelimit to update
+ * @done: the amount of work done, in arbitrary units
+ *
+ * Advances @d->next by the time @done units take at @d->rate, then clamps
+ * it to the window [now - 2 seconds, now + 1 second].
+ */
+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done)
+{
+	u64 now = local_clock();
+	u64 latest = now + NSEC_PER_SEC;
+	u64 earliest = now - NSEC_PER_SEC * 2;
+
+	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+
+	/* don't schedule more than a second into the future ... */
+	if (time_before64(latest, d->next))
+		d->next = latest;
+
+	/* ... nor fall more than two seconds behind */
+	if (time_after64(earliest, d->next))
+		d->next = earliest;
+}
|
|
+
|
|
+/* pd controller: */
|
|
+
|
|
+/*
+ * Updates pd_controller. Attempts to scale input values to units per second.
+ * @target: desired value
+ * @actual: current value
+ *
+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing
+ * it makes actual go down.
+ *
+ * Does nothing until at least one whole second has passed since the last
+ * update.
+ *
+ * NOTE(review): pd->smoothed_derivative is only read here; the value
+ * ewma_add() returns is not stored back into it. Confirm whether another
+ * writer exists or whether the smoothing is meant to accumulate.
+ */
+void bch2_pd_controller_update(struct bch_pd_controller *pd,
+			      s64 target, s64 actual, int sign)
+{
+	unsigned long elapsed = (jiffies - pd->last_update) / HZ;
+	s64 p_part, d_part, delta;
+
+	if (!elapsed)
+		return;
+
+	pd->last_update = jiffies;
+
+	/* proportional term: error accumulated over the elapsed seconds */
+	p_part = div_s64((actual - target) * elapsed, pd->p_term_inverse);
+
+	/* derivative term: per-second change of actual, smoothed and scaled */
+	d_part = div_s64(actual - pd->last_actual, elapsed);
+	d_part = ewma_add(pd->smoothed_derivative, d_part,
+			  (pd->d_term / elapsed) ?: 1);
+	d_part = div_s64(d_part * pd->d_term, pd->p_term_inverse);
+
+	delta = p_part + d_part;
+
+	/* Don't increase rate if not keeping up */
+	if (delta > 0 && pd->backpressure &&
+	    time_after64(local_clock(), pd->rate.next + NSEC_PER_MSEC))
+		delta = 0;
+
+	delta *= (sign * -1);
+
+	pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + delta,
+				1, UINT_MAX);
+
+	/* remember this round's inputs and terms */
+	pd->last_target		= target;
+	pd->last_actual		= actual;
+	pd->last_proportional	= p_part;
+	pd->last_derivative	= d_part;
+	pd->last_change		= delta;
+}
|
|
+
|
|
+/*
+ * Set the initial rate, tuning terms and timestamp of @pd. Fields not
+ * listed here are left as the caller provided them.
+ */
+void bch2_pd_controller_init(struct bch_pd_controller *pd)
+{
+	pd->last_update		= jiffies;
+	pd->rate.rate		= 1024;
+
+	pd->p_term_inverse	= 6000;
+	pd->d_term		= 30;
+	pd->d_smooth		= pd->d_term;
+
+	pd->backpressure	= 1;
+}
|
|
+
|
|
+/*
+ * Format the controller's current rate, last inputs/terms and the time to
+ * the next scheduled io into @buf with sprintf(); returns sprintf()'s
+ * result. @buf is not bounds-checked here.
+ */
+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
+{
+	enum {
+		PD_DBG_RATE,
+		PD_DBG_ACTUAL,
+		PD_DBG_TARGET,
+		PD_DBG_PROPORTIONAL,
+		PD_DBG_DERIVATIVE,
+		PD_DBG_CHANGE,
+		PD_DBG_NR,
+	};
+	/* 2^64 - 1 is 20 digits, plus null byte */
+	char text[PD_DBG_NR][21];
+	const s64 vals[PD_DBG_NR] = {
+		[PD_DBG_RATE]		= pd->rate.rate,
+		[PD_DBG_ACTUAL]		= pd->last_actual,
+		[PD_DBG_TARGET]		= pd->last_target,
+		[PD_DBG_PROPORTIONAL]	= pd->last_proportional,
+		[PD_DBG_DERIVATIVE]	= pd->last_derivative,
+		[PD_DBG_CHANGE]		= pd->last_change,
+	};
+	s64 next_io;
+	unsigned i;
+
+	for (i = 0; i < PD_DBG_NR; i++)
+		bch2_hprint(&PBUF(text[i]), vals[i]);
+
+	next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC);
+
+	return sprintf(buf,
+		       "rate:\t\t%s/sec\n"
+		       "target:\t\t%s\n"
+		       "actual:\t\t%s\n"
+		       "proportional:\t%s\n"
+		       "derivative:\t%s\n"
+		       "change:\t\t%s/sec\n"
+		       "next io:\t%llims\n",
+		       text[PD_DBG_RATE], text[PD_DBG_TARGET],
+		       text[PD_DBG_ACTUAL], text[PD_DBG_PROPORTIONAL],
+		       text[PD_DBG_DERIVATIVE], text[PD_DBG_CHANGE], next_io);
+}
|
|
+
|
|
+/* misc: */
|
|
+
|
|
+/*
+ * Add the @size bytes starting at @base to @bio, one page-bounded piece at
+ * a time. @base may be a vmalloc or a directly mapped address. BUG()s if
+ * bio_add_page() refuses a piece.
+ */
+void bch2_bio_map(struct bio *bio, void *base, size_t size)
+{
+	while (size) {
+		unsigned offset = offset_in_page(base);
+		unsigned len = min_t(size_t, PAGE_SIZE - offset, size);
+		struct page *page;
+
+		if (is_vmalloc_addr(base))
+			page = vmalloc_to_page(base);
+		else
+			page = virt_to_page(base);
+
+		BUG_ON(!bio_add_page(bio, page, len, offset));
+
+		base += len;
+		size -= len;
+	}
+}
|
|
+
|
|
+/*
+ * Allocate pages with @gfp_mask and add them to @bio until @size bytes are
+ * covered.
+ *
+ * Returns 0 on success or -ENOMEM if a page allocation fails; pages added
+ * before the failure are left attached to @bio.
+ */
+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
+{
+	while (size) {
+		unsigned len = min_t(size_t, PAGE_SIZE, size);
+		struct page *page = alloc_page(gfp_mask);
+
+		if (!page)
+			return -ENOMEM;
+
+		BUG_ON(!bio_add_page(bio, page, len, 0));
+		size -= len;
+	}
+
+	return 0;
+}
|
|
+
|
|
+/*
+ * Return a random value in [0, @max), or 0 when @max is 0. Candidates are
+ * masked to the next power of two and redrawn until one is below @max.
+ */
+size_t bch2_rand_range(size_t max)
+{
+	size_t mask, val;
+
+	if (max == 0)
+		return 0;
+
+	mask = roundup_pow_of_two(max) - 1;
+
+	do {
+		val = get_random_long();
+		val &= mask;
+	} while (val >= max);
+
+	return val;
+}
|
|
+
|
|
+/*
+ * Copy from the flat buffer @src into the segments of @dst, walking them
+ * from @dst_iter. Each page is mapped with kmap_atomic() for its copy.
+ */
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
+{
+	struct bvec_iter iter;
+	struct bio_vec seg;
+
+	__bio_for_each_segment(seg, dst, iter, dst_iter) {
+		void *mapped = kmap_atomic(seg.bv_page);
+
+		memcpy(mapped + seg.bv_offset, src, seg.bv_len);
+		kunmap_atomic(mapped);
+
+		src += seg.bv_len;
+	}
+}
|
|
+
|
|
+/*
+ * Copy the segments of @src, walked from @src_iter, into the flat buffer
+ * @dst. Each page is mapped with kmap_atomic() for its copy.
+ */
+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
+{
+	struct bvec_iter iter;
+	struct bio_vec seg;
+
+	__bio_for_each_segment(seg, src, iter, src_iter) {
+		void *mapped = kmap_atomic(seg.bv_page);
+
+		memcpy(dst, mapped + seg.bv_offset, seg.bv_len);
+		kunmap_atomic(mapped);
+
+		dst += seg.bv_len;
+	}
+}
|
|
+
|
|
+/*
+ * Append up to @len bytes of @src to @out, truncating so that one byte is
+ * always left for the terminating NUL. Does nothing if @out is full.
+ */
+void bch_scnmemcpy(struct printbuf *out,
+		   const char *src, size_t len)
+{
+	size_t room = printbuf_remaining(out);
+
+	if (!room)
+		return;
+
+	room = min(room - 1, len);
+	memcpy(out->pos, src, room);
+	out->pos += room;
+	*out->pos = '\0';
+}
|
|
+
|
|
+#include "eytzinger.h"
|
|
+
|
|
+/*
+ * Nonzero if @base may be accessed as an @align-byte object: either the
+ * architecture has efficient unaligned access, or the low bits of @base
+ * selected by (@align - 1) are all zero.
+ */
+static int alignment_ok(const void *base, size_t align)
+{
+	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
+		return 1;
+
+	return ((unsigned long)base & (align - 1)) == 0;
+}
|
|
+
|
|
+/* Exchange the u32 at @a with the u32 at @b; @size is ignored. */
+static void u32_swap(void *a, void *b, size_t size)
+{
+	u32 *x = a;
+	u32 *y = b;
+	u32 tmp = *x;
+
+	*x = *y;
+	*y = tmp;
+}
|
|
+
|
|
+/* Exchange the u64 at @a with the u64 at @b; @size is ignored. */
+static void u64_swap(void *a, void *b, size_t size)
+{
+	u64 *x = a;
+	u64 *y = b;
+	u64 tmp = *x;
+
+	*x = *y;
+	*y = tmp;
+}
|
|
+
|
|
+/*
+ * Exchange @size bytes between @a and @b one byte at a time. The loop body
+ * runs before @size is tested, so @size must be at least 1.
+ */
+static void generic_swap(void *a, void *b, size_t size)
+{
+	char *x = a;
+	char *y = b;
+
+	do {
+		char tmp = *x;
+
+		*x++ = *y;
+		*y++ = tmp;
+	} while (--size > 0);
+}
|
|
+
|
|
+/*
+ * Compare the elements at in-order positions @l and @r of the @n-element
+ * eytzinger0 array at @base, using @cmp_func.
+ */
+static inline int do_cmp(void *base, size_t n, size_t size,
+			 int (*cmp_func)(const void *, const void *, size_t),
+			 size_t l, size_t r)
+{
+	const void *lhs = base + inorder_to_eytzinger0(l, n) * size;
+	const void *rhs = base + inorder_to_eytzinger0(r, n) * size;
+
+	return cmp_func(lhs, rhs, size);
+}
|
|
+
|
|
+/*
+ * Swap the elements at in-order positions @l and @r of the @n-element
+ * eytzinger0 array at @base, using @swap_func.
+ */
+static inline void do_swap(void *base, size_t n, size_t size,
+			   void (*swap_func)(void *, void *, size_t),
+			   size_t l, size_t r)
+{
+	void *lhs = base + inorder_to_eytzinger0(l, n) * size;
+	void *rhs = base + inorder_to_eytzinger0(r, n) * size;
+
+	swap_func(lhs, rhs, size);
+}
|
|
+
|
|
+/*
+ * Heapsort the @n elements of @size bytes at @base so that they end up
+ * ordered in eytzinger0 layout; all element access goes through do_cmp()
+ * and do_swap(), which translate in-order positions.
+ *
+ * A NULL @swap_func selects u32_swap/u64_swap for suitably aligned 4/8
+ * byte elements and generic_swap otherwise.
+ */
+void eytzinger0_sort(void *base, size_t n, size_t size,
+		     int (*cmp_func)(const void *, const void *, size_t),
+		     void (*swap_func)(void *, void *, size_t))
+{
+	int i, child, node;
+
+	if (!swap_func) {
+		swap_func = generic_swap;
+
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+	}
+
+	/* heapify */
+	for (i = n / 2 - 1; i >= 0; --i) {
+		node = i;
+
+		while (node * 2 + 1 < n) {
+			child = node * 2 + 1;
+
+			/* pick the larger of the two children */
+			if (child + 1 < n &&
+			    do_cmp(base, n, size, cmp_func, child, child + 1) < 0)
+				child++;
+
+			if (do_cmp(base, n, size, cmp_func, node, child) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, node, child);
+			node = child;
+		}
+	}
+
+	/* sort */
+	for (i = n - 1; i > 0; --i) {
+		/* move the current maximum behind the shrinking heap */
+		do_swap(base, n, size, swap_func, 0, i);
+
+		node = 0;
+
+		while (node * 2 + 1 < i) {
+			child = node * 2 + 1;
+
+			if (child + 1 < i &&
+			    do_cmp(base, n, size, cmp_func, child, child + 1) < 0)
+				child++;
+
+			if (do_cmp(base, n, size, cmp_func, node, child) >= 0)
+				break;
+
+			do_swap(base, n, size, swap_func, node, child);
+			node = child;
+		}
+	}
+}
|
|
+
|
|
+/*
+ * Heapsort @num elements of @size bytes at @base in place. @cmp_func
+ * receives the element size as its third argument.
+ *
+ * A NULL @swap_func selects u32_swap/u64_swap for suitably aligned 4/8
+ * byte elements and generic_swap otherwise. All offsets are byte offsets
+ * held in int, so @num * @size has to fit in an int.
+ */
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t size))
+{
+	/* pre-scale counters for performance */
+	int n = num * size;
+	int i = (num/2 - 1) * size;
+	int child, node;
+
+	if (!swap_func) {
+		swap_func = generic_swap;
+
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+	}
+
+	/* heapify */
+	while (i >= 0) {
+		node = i;
+
+		while (node * 2 + size < n) {
+			child = node * 2 + size;
+
+			/* pick the larger of the two children */
+			if (child < n - size &&
+			    cmp_func(base + child, base + child + size, size) < 0)
+				child += size;
+
+			if (cmp_func(base + node, base + child, size) >= 0)
+				break;
+
+			swap_func(base + node, base + child, size);
+			node = child;
+		}
+
+		i -= size;
+	}
+
+	/* sort */
+	for (i = n - size; i > 0; i -= size) {
+		/* move the current maximum behind the shrinking heap */
+		swap_func(base, base + i, size);
+
+		node = 0;
+
+		while (node * 2 + size < i) {
+			child = node * 2 + size;
+
+			if (child < i - size &&
+			    cmp_func(base + child, base + child + size, size) < 0)
+				child += size;
+
+			if (cmp_func(base + node, base + child, size) >= 0)
+				break;
+
+			swap_func(base + node, base + child, size);
+			node = child;
+		}
+	}
+}
|
|
+
|
|
+/* mempool free callback: @pool_data carries the element size for vpfree(). */
+static void mempool_free_vp(void *element, void *pool_data)
+{
+	vpfree(element, (size_t) pool_data);
+}
|
|
+
|
|
+/* mempool alloc callback: @pool_data carries the element size for vpmalloc(). */
+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
+{
+	return vpmalloc((size_t) pool_data, gfp_mask);
+}
|
|
+
|
|
+/*
+ * Initialise @pool with @min_nr elements of @size bytes: a kmalloc pool
+ * for sizes below PAGE_SIZE, otherwise a pool backed by
+ * vpmalloc()/vpfree() with @size passed as the pool data.
+ */
+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+	if (size < PAGE_SIZE)
+		return mempool_init_kmalloc_pool(pool, min_nr, size);
+
+	return mempool_init(pool, min_nr, mempool_alloc_vp,
+			    mempool_free_vp, (void *) size);
+}
|
|
+
|
|
+#if 0
|
|
+/*
+ * Self test for the 1-based eytzinger helpers, for every tree size in
+ * [2, 65536): checks first/last/prev/next at the ends and that the
+ * in-order <-> eytzinger conversions agree with the iteration order.
+ */
+void eytzinger1_test(void)
+{
+	unsigned inorder, eytz, size;
+
+	pr_info("1 based eytzinger test:");
+
+	for (size = 2; size < 65536; size++) {
+		unsigned extra = eytzinger1_extra(size);
+
+		if (size % 4096 == 0)
+			pr_info("tree size %u", size);
+
+		/* index 0 acts as the "before first / after last" position */
+		BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size));
+		BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size));
+
+		BUG_ON(eytzinger1_prev(eytzinger1_first(size), size)	!= 0);
+		BUG_ON(eytzinger1_next(eytzinger1_last(size), size)	!= 0);
+
+		inorder = 1;
+		eytzinger1_for_each(eytz, size) {
+			BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz);
+			BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder);
+			BUG_ON(eytz != eytzinger1_last(size) &&
+			       eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz);
+
+			inorder++;
+		}
+	}
+}
|
|
+
|
|
+/*
+ * Self test for the 0-based eytzinger helpers, for every tree size in
+ * [1, 65536): checks first/last/prev/next at the ends and that the
+ * in-order <-> eytzinger conversions agree with the iteration order.
+ */
+void eytzinger0_test(void)
+{
+	unsigned inorder, eytz, size;
+
+	pr_info("0 based eytzinger test:");
+
+	for (size = 1; size < 65536; size++) {
+		unsigned extra = eytzinger0_extra(size);
+
+		if (size % 4096 == 0)
+			pr_info("tree size %u", size);
+
+		/* index -1 acts as the "before first / after last" position */
+		BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size));
+		BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size));
+
+		BUG_ON(eytzinger0_prev(eytzinger0_first(size), size)	!= -1);
+		BUG_ON(eytzinger0_next(eytzinger0_last(size), size)	!= -1);
+
+		inorder = 0;
+		eytzinger0_for_each(eytz, size) {
+			BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz);
+			BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder);
+			BUG_ON(eytz != eytzinger0_last(size) &&
+			       eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz);
+
+			inorder++;
+		}
+	}
+}
|
|
+
|
|
+/*
+ * Three-way comparison of the two u16 values at @_l and @_r; @size is
+ * ignored.
+ *
+ * Returns 1 if *@_l > *@_r, -1 if *@_l < *@_r and 0 if they are equal.
+ */
+static inline int cmp_u16(const void *_l, const void *_r, size_t size)
+{
+	const u16 *l = _l, *r = _r;
+
+	/* difference of two 0/1 comparisons is always -1, 0 or 1 */
+	return (*l > *r) - (*l < *r);
+}
|
|
+
|
|
+/*
+ * Check eytzinger0_find_le() for one @search value: its result is compared
+ * against a linear scan of the @nr entries of @test_array for the largest
+ * value <= @search (-1 if there is none). On a mismatch the whole array
+ * and both answers are logged.
+ */
+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search)
+{
+	int i, c1 = -1, c2 = -1;
+	ssize_t r;
+
+	r = eytzinger0_find_le(test_array, nr,
+			       sizeof(test_array[0]),
+			       cmp_u16, &search);
+	if (r >= 0)
+		c1 = test_array[r];
+
+	/* reference answer by brute force */
+	for (i = 0; i < nr; i++)
+		if (test_array[i] <= search && test_array[i] > c2)
+			c2 = test_array[i];
+
+	if (c1 != c2) {
+		eytzinger0_for_each(i, nr)
+			pr_info("[%3u] = %12u", i, test_array[i]);
+		/* report the value searched for, not the finished loop index */
+		pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i",
+			search, r, c1, c2);
+	}
+}
|
|
+
|
|
+/*
+ * Self test for eytzinger0_sort() and eytzinger0_find_le(): for every
+ * array length below 4096, fill the array with random u16s, sort it,
+ * verify the order, then probe a spread of search values plus each
+ * element and its two neighbours.
+ */
+void eytzinger0_find_test(void)
+{
+	unsigned i, nr, allocated = 1 << 12;
+	u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL);
+
+	/* kmalloc_array() can fail; there is nothing to test without memory */
+	if (!test_array)
+		return;
+
+	for (nr = 1; nr < allocated; nr++) {
+		pr_info("testing %u elems", nr);
+
+		get_random_bytes(test_array, nr * sizeof(test_array[0]));
+		eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL);
+
+		/* verify array is sorted correctly: */
+		eytzinger0_for_each(i, nr)
+			BUG_ON(i != eytzinger0_last(nr) &&
+			       test_array[i] > test_array[eytzinger0_next(i, nr)]);
+
+		for (i = 0; i < U16_MAX; i += 1 << 12)
+			eytzinger0_find_test_val(test_array, nr, i);
+
+		for (i = 0; i < nr; i++) {
+			eytzinger0_find_test_val(test_array, nr, test_array[i] - 1);
+			eytzinger0_find_test_val(test_array, nr, test_array[i]);
+			eytzinger0_find_test_val(test_array, nr, test_array[i] + 1);
+		}
+	}
+
+	kfree(test_array);
+}
|
|
+#endif
|
|
+
|
|
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when the
+ * caller guards against concurrent access to any of the percpu copies.
+ *
+ * Folds the @nr u64s of every other possible cpu into the current cpu's
+ * copy, zeroes the copies that were folded, and returns the accumulated
+ * copy.
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+	u64 *acc;
+	int cpu;
+
+	preempt_disable();
+	acc = this_cpu_ptr(p);
+	preempt_enable();
+
+	for_each_possible_cpu(cpu) {
+		u64 *src = per_cpu_ptr(p, cpu);
+
+		if (src == acc)
+			continue;
+
+		acc_u64s(acc, src, nr);
+		memset(src, 0, nr * sizeof(u64));
+	}
+
+	return acc;
+}
|
|
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
|
|
new file mode 100644
|
|
index 000000000000..c69b05deec41
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/util.h
|
|
@@ -0,0 +1,755 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_UTIL_H
|
|
+#define _BCACHEFS_UTIL_H
|
|
+
|
|
+#include <linux/bio.h>
|
|
+#include <linux/blkdev.h>
|
|
+#include <linux/closure.h>
|
|
+#include <linux/errno.h>
|
|
+#include <linux/freezer.h>
|
|
+#include <linux/kernel.h>
|
|
+#include <linux/sched/clock.h>
|
|
+#include <linux/llist.h>
|
|
+#include <linux/log2.h>
|
|
+#include <linux/percpu.h>
|
|
+#include <linux/preempt.h>
|
|
+#include <linux/ratelimit.h>
|
|
+#include <linux/slab.h>
|
|
+#include <linux/vmalloc.h>
|
|
+#include <linux/workqueue.h>
|
|
+
|
|
+/* log2 of, and number of, 512-byte (1 << 9) sectors in one page */
+#define PAGE_SECTOR_SHIFT		(PAGE_SHIFT - 9)
+#define PAGE_SECTORS			(1UL << PAGE_SECTOR_SHIFT)
|
|
+
|
|
+struct closure;
|
|
+
|
|
+/*
+ * Debug-only assertions. With CONFIG_BCACHEFS_DEBUG, EBUG_ON() is BUG_ON()
+ * and the *_bug() helpers BUG() when the atomic op returns an unexpected
+ * value. Without it, EBUG_ON() expands to nothing (its condition is not
+ * evaluated) and the helpers are the plain atomic op.
+ */
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+#define EBUG_ON(cond)			BUG_ON(cond)
+#define atomic_dec_bug(v)	BUG_ON(atomic_dec_return(v) < 0)
+#define atomic_inc_bug(v, i)	BUG_ON(atomic_inc_return(v) <= (i))
+#define atomic_sub_bug(i, v)	BUG_ON(atomic_sub_return(i, v) < 0)
+#define atomic_add_bug(i, v)	BUG_ON(atomic_add_return(i, v) < 0)
+#define atomic_long_dec_bug(v)		BUG_ON(atomic_long_dec_return(v) < 0)
+#define atomic_long_sub_bug(i, v)	BUG_ON(atomic_long_sub_return(i, v) < 0)
+#define atomic64_dec_bug(v)	BUG_ON(atomic64_dec_return(v) < 0)
+#define atomic64_inc_bug(v, i)	BUG_ON(atomic64_inc_return(v) <= (i))
+#define atomic64_sub_bug(i, v)	BUG_ON(atomic64_sub_return(i, v) < 0)
+#define atomic64_add_bug(i, v)	BUG_ON(atomic64_add_return(i, v) < 0)
+
+#else /* DEBUG */
+
+#define EBUG_ON(cond)
+#define atomic_dec_bug(v)	atomic_dec(v)
+#define atomic_inc_bug(v, i)	atomic_inc(v)
+#define atomic_sub_bug(i, v)	atomic_sub(i, v)
+#define atomic_add_bug(i, v)	atomic_add(i, v)
+#define atomic_long_dec_bug(v)		atomic_long_dec(v)
+#define atomic_long_sub_bug(i, v)	atomic_long_sub(i, v)
+#define atomic64_dec_bug(v)	atomic64_dec(v)
+#define atomic64_inc_bug(v, i)	atomic64_inc(v)
+#define atomic64_sub_bug(i, v)	atomic64_sub(i, v)
+#define atomic64_add_bug(i, v)	atomic64_add(i, v)
+
+#endif
|
|
+
|
|
+/*
+ * CPU_BIG_ENDIAN: 1 on big endian targets, 0 on little endian ones. Left
+ * undefined if the compiler reports neither byte order.
+ */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+# define CPU_BIG_ENDIAN		0
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+# define CPU_BIG_ENDIAN		1
+#endif
|
|
+
|
|
+/* type hackery */
|
|
+
|
|
+/* True iff the type of @_val is compatible with @_type. */
+#define type_is_exact(_val, _type)					\
+	__builtin_types_compatible_p(typeof(_val), _type)
+
+/* As type_is_exact(), but also accepts the const-qualified @_type. */
+#define type_is(_val, _type)						\
+	(__builtin_types_compatible_p(typeof(_val), _type) ||		\
+	 __builtin_types_compatible_p(typeof(_val), const _type))
|
|
+
|
|
+/*
+ * Number of pages touched by the @len bytes starting at @p, counting the
+ * offset of @p within its first page.
+ *
+ * Userspace doesn't align allocations as nicely as the kernel allocators:
+ */
+static inline size_t buf_pages(void *p, size_t len)
+{
+	unsigned long offset = (unsigned long) p & (PAGE_SIZE - 1);
+
+	return DIV_ROUND_UP(len + offset, PAGE_SIZE);
+}
|
|
+
|
|
+/*
+ * Free memory obtained from vpmalloc(): vfree() for vmalloc addresses,
+ * otherwise free_pages() with the order computed from @size.
+ */
+static inline void vpfree(void *p, size_t size)
+{
+	if (!is_vmalloc_addr(p)) {
+		free_pages((unsigned long) p, get_order(size));
+		return;
+	}
+
+	vfree(p);
+}
|
|
+
|
|
+/*
+ * Allocate @size bytes: first try whole pages from the page allocator
+ * (with allocation-failure warnings suppressed), then fall back to
+ * __vmalloc(). Free with vpfree().
+ */
+static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
+{
+	void *p = (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
+					    get_order(size));
+
+	if (p)
+		return p;
+
+	return __vmalloc(size, gfp_mask);
+}
|
|
+
|
|
+/*
+ * Free memory obtained from kvpmalloc(); @size must be the size that was
+ * allocated, since it selects between kfree() and vpfree().
+ */
+static inline void kvpfree(void *p, size_t size)
+{
+	if (size >= PAGE_SIZE)
+		vpfree(p, size);
+	else
+		kfree(p);
+}
|
|
+
|
|
+/*
+ * Allocate @size bytes with kmalloc() when smaller than a page, otherwise
+ * with vpmalloc(). Free with kvpfree() and the same @size.
+ */
+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
+{
+	if (size < PAGE_SIZE)
+		return kmalloc(size, gfp_mask);
+
+	return vpmalloc(size, gfp_mask);
+}
|
|
+
|
|
+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
|
|
+
|
|
+/*
+ * Anonymous struct type for a binary heap of @type: @size is the capacity
+ * and @used the number of live elements in @data.
+ */
+#define HEAP(type)							\
+struct {								\
+	size_t		size;						\
+	size_t		used;						\
+	type		*data;						\
+}
+
+/* Declare a heap of @type called @name. */
+#define DECLARE_HEAP(type, name) HEAP(type) name
|
|
+
|
|
+/*
+ * Empty @heap, set its capacity to @_size and allocate its element array
+ * with kvpmalloc(). Evaluates to the new data pointer (NULL on failure).
+ */
+#define init_heap(heap, _size, gfp)					\
+({									\
+	(heap)->size = (_size);						\
+	(heap)->used = 0;						\
+	(heap)->data = kvpmalloc(sizeof((heap)->data[0]) * (heap)->size,\
+				 (gfp));				\
+})
|
|
+
|
|
+/* Free the element array of @heap and clear its data pointer. */
+#define free_heap(heap)							\
+do {									\
+	kvpfree((heap)->data, sizeof((heap)->data[0]) * (heap)->size);	\
+	(heap)->data = NULL;						\
+} while (0)
|
|
+
|
|
+/*
+ * Call @_fn(h, i) if @_fn is not NULL. Used so a heap element can record
+ * the index it currently occupies.
+ *
+ * The local has a reserved-looking name so that an argument spelled `fn`
+ * cannot be captured and initialise the local from itself.
+ */
+#define heap_set_backpointer(h, i, _fn)					\
+do {									\
+	void (*_heap_bp_fn)(typeof(h), size_t) = _fn;			\
+	if (_heap_bp_fn)						\
+		_heap_bp_fn(h, i);					\
+} while (0)
|
|
+
|
|
+/*
+ * Swap heap elements @i and @j, then let @set_backpointer see both slots
+ * in their new state.
+ */
+#define heap_swap(h, i, j, set_backpointer)				\
+do {									\
+	swap((h)->data[i], (h)->data[j]);				\
+									\
+	heap_set_backpointer(h, i, set_backpointer);			\
+	heap_set_backpointer(h, j, set_backpointer);			\
+} while (0)
|
|
+
|
|
+/* Evaluates to the root element; the heap must not be empty. */
+#define heap_peek(h)							\
+({									\
+	EBUG_ON((h)->used == 0);					\
+	(h)->data[0];							\
+})
+
+/* True when no further element can be added. */
+#define heap_full(h)	((h)->size == (h)->used)
|
|
+
|
|
+/*
+ * Restore heap order below index @i: while a child orders before the
+ * element (cmp() < 0), swap the element with its first-ordered child.
+ */
+#define heap_sift_down(h, i, cmp, set_backpointer)			\
+do {									\
+	size_t _child, _node = i;					\
+									\
+	while (_node * 2 + 1 < (h)->used) {				\
+		_child = _node * 2 + 1;					\
+		if (_child + 1 < (h)->used &&				\
+		    cmp(h, (h)->data[_child],				\
+			(h)->data[_child + 1]) >= 0)			\
+			_child++;					\
+									\
+		if (cmp(h, (h)->data[_child], (h)->data[_node]) >= 0)	\
+			break;						\
+		heap_swap(h, _child, _node, set_backpointer);		\
+		_node = _child;						\
+	}								\
+} while (0)
|
|
+
|
|
+/*
+ * Restore heap order above index @i: while the element orders before its
+ * parent (cmp() < 0), swap the two and continue from the parent.
+ *
+ * @i must be a modifiable lvalue; on exit it holds the element's final
+ * index. The parent index uses a reserved-looking name so that caller
+ * expressions mentioning `p` are not captured.
+ */
+#define heap_sift_up(h, i, cmp, set_backpointer)			\
+do {									\
+	while (i) {							\
+		size_t _heap_parent = (i - 1) / 2;			\
+		if (cmp(h, (h)->data[i], (h)->data[_heap_parent]) >= 0)	\
+			break;						\
+		heap_swap(h, i, _heap_parent, set_backpointer);		\
+		i = _heap_parent;					\
+	}								\
+} while (0)
|
|
+
|
|
+/*
+ * Append @d and sift it up into place; there is no capacity check.
+ * Evaluates to the index where @d ended up.
+ */
+#define __heap_add(h, d, cmp, set_backpointer)				\
+({									\
+	size_t _i = (h)->used++;					\
+									\
+	(h)->data[_i] = d;						\
+	heap_set_backpointer(h, _i, set_backpointer);			\
+	heap_sift_up(h, _i, cmp, set_backpointer);			\
+									\
+	_i;								\
+})
|
|
+
|
|
+/* Add @d unless the heap is full. Evaluates to true if @d was added. */
+#define heap_add(h, d, cmp, set_backpointer)				\
+({									\
+	bool _r = !heap_full(h);					\
+									\
+	if (_r)								\
+		__heap_add(h, d, cmp, set_backpointer);			\
+									\
+	_r;								\
+})
|
|
+
|
|
+/*
+ * Add @new; if the heap is full, replace the root with @new instead, but
+ * only when @new does not order before the current root (cmp() >= 0).
+ */
+#define heap_add_or_replace(h, new, cmp, set_backpointer)		\
+do {									\
+	if (!heap_add(h, new, cmp, set_backpointer) &&			\
+	    cmp(h, new, heap_peek(h)) >= 0) {				\
+		(h)->data[0] = new;					\
+									\
+		heap_set_backpointer(h, 0, set_backpointer);		\
+		heap_sift_down(h, 0, cmp, set_backpointer);		\
+	}								\
+} while (0)
|
|
+
|
|
+/*
+ * Remove the element at index @i: swap it with the last element, shrink
+ * the heap by one, then sift the moved element up and down.
+ * BUG()s if @i is not a live index.
+ */
+#define heap_del(h, i, cmp, set_backpointer)				\
+do {									\
+	size_t _i = (i);						\
+									\
+	BUG_ON(_i >= (h)->used);					\
+									\
+	(h)->used--;							\
+	heap_swap(h, _i, (h)->used, set_backpointer);			\
+									\
+	heap_sift_up(h, _i, cmp, set_backpointer);			\
+	heap_sift_down(h, _i, cmp, set_backpointer);			\
+} while (0)
|
|
+
|
|
+/*
+ * If the heap is not empty, copy the root into @d and delete it.
+ * Evaluates to true if an element was popped.
+ */
+#define heap_pop(h, d, cmp, set_backpointer)				\
+({									\
+	bool _r = (h)->used;						\
+									\
+	if (_r) {							\
+		(d) = (h)->data[0];					\
+		heap_del(h, 0, cmp, set_backpointer);			\
+	}								\
+									\
+	_r;								\
+})
|
|
+
|
|
+/*
+ * Rebuild heap order from scratch by sifting down every node that has a
+ * child, from the last such node back to the root.
+ */
+#define heap_resort(heap, cmp, set_backpointer)				\
+do {									\
+	ssize_t _i = (ssize_t) (heap)->used / 2 - 1;			\
+									\
+	for (; _i >= 0; --_i)						\
+		heap_sift_down(heap, _i, cmp, set_backpointer);		\
+} while (0)
|
|
+
|
|
+/*
+ * Largest value of the signed integer type @t. Built as
+ * (2^(bits-2) - 1) * 2 + 1 so that no intermediate step overflows @t.
+ */
+#define ANYSINT_MAX(t)							\
+	((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1)
|
|
+
|
|
+/*
+ * Cursor into a caller-provided character buffer: @pos is where the next
+ * output goes, @end is one past the last byte of the buffer.
+ */
+struct printbuf {
+	char		*pos;
+	char		*end;
+};
|
|
+
|
|
+/* Number of bytes left between @buf->pos and @buf->end. */
+static inline size_t printbuf_remaining(struct printbuf *buf)
+{
+	size_t left = buf->end - buf->pos;
+
+	return left;
+}
|
|
+
|
|
+/* A printbuf covering the @_len bytes starting at @_buf. */
+#define _PBUF(_buf, _len)						\
+	((struct printbuf) {						\
+		.pos	= _buf,						\
+		.end	= _buf + _len,					\
+	})
+
+/* A printbuf covering all of @_buf; sizeof() is used, so pass an array. */
+#define PBUF(_buf) _PBUF(_buf, sizeof(_buf))
|
|
+
|
|
+/*
+ * printf-style append to the printbuf @_out: formats with scnprintf()
+ * into the remaining space and advances ->pos by what was written.
+ */
+#define pr_buf(_out, ...)						\
+do {									\
+	(_out)->pos += scnprintf((_out)->pos,				\
+				 printbuf_remaining(_out),		\
+				 __VA_ARGS__);				\
+} while (0)
|
|
+
|
|
+void bch_scnmemcpy(struct printbuf *, const char *, size_t);
|
|
+
|
|
+int bch2_strtoint_h(const char *, int *);
|
|
+int bch2_strtouint_h(const char *, unsigned int *);
|
|
+int bch2_strtoll_h(const char *, long long *);
|
|
+int bch2_strtoull_h(const char *, unsigned long long *);
|
|
+int bch2_strtou64_h(const char *, u64 *);
|
|
+
|
|
+/*
+ * Parse @cp into the long at @res by forwarding to the int or long long
+ * parser, whichever has the width of long on this build.
+ */
+static inline int bch2_strtol_h(const char *cp, long *res)
+{
+	void *out = res;
+
+#if BITS_PER_LONG == 32
+	return bch2_strtoint_h(cp, out);
+#else
+	return bch2_strtoll_h(cp, out);
+#endif
+}
|
|
+
|
|
+/*
+ * Parse @cp as an unsigned value into the object at @res by forwarding to
+ * the unsigned int or unsigned long long parser, whichever has the width
+ * of long on this build.
+ *
+ * NOTE(review): @res is declared long * although an unsigned value is
+ * stored through it; strtoi_h() passes a void *, so this is not diagnosed.
+ * Confirm with callers before changing the prototype.
+ */
+static inline int bch2_strtoul_h(const char *cp, long *res)
+{
+	void *out = res;
+
+#if BITS_PER_LONG == 32
+	return bch2_strtouint_h(cp, out);
+#else
+	return bch2_strtoull_h(cp, out);
+#endif
+}
|
|
+
|
|
+/*
+ * Type-generic front end: picks the bch2_strto*_h() parser matching the
+ * type @res points to. Evaluates to -EINVAL for unsupported types.
+ */
+#define strtoi_h(cp, res)						\
+	( type_is(*res, int)					\
+		? bch2_strtoint_h(cp, (void *) res)			\
+	: type_is(*res, long)					\
+		? bch2_strtol_h(cp, (void *) res)			\
+	: type_is(*res, long long)				\
+		? bch2_strtoll_h(cp, (void *) res)			\
+	: type_is(*res, unsigned)				\
+		? bch2_strtouint_h(cp, (void *) res)			\
+	: type_is(*res, unsigned long)				\
+		? bch2_strtoul_h(cp, (void *) res)			\
+	: type_is(*res, unsigned long long)			\
+		? bch2_strtoull_h(cp, (void *) res)			\
+	: -EINVAL)
|
|
+
|
|
+/*
+ * Parse @cp as a base-10 unsigned long; assign it to @var only on success.
+ * Evaluates to kstrtoul()'s return value.
+ */
+#define strtoul_safe(cp, var)						\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+									\
+	if (_r == 0)							\
+		var = _v;						\
+									\
+	_r;								\
+})
|
|
+
|
|
+/*
+ * Like strtoul_safe(), but the parsed value is clamped to [@min, @max]
+ * (in @var's type) before being assigned.
+ */
+#define strtoul_safe_clamp(cp, var, min, max)				\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+									\
+	if (_r == 0)							\
+		var = clamp_t(typeof(var), _v, min, max);		\
+									\
+	_r;								\
+})
|
|
+
|
|
+/*
+ * Like strtoul_safe(), but values outside [@min, @max] are rejected.
+ * Evaluates to 0 on success and -EINVAL for both parse errors and
+ * out-of-range values; @var is only written on success.
+ */
+#define strtoul_safe_restrict(cp, var, min, max)			\
+({									\
+	unsigned long _v;						\
+	int _r = kstrtoul(cp, 10, &_v);					\
+									\
+	if (_r || _v < min || _v > max)					\
+		_r = -EINVAL;						\
+	else								\
+		var = _v;						\
+									\
+	_r;								\
+})
|
|
+
|
|
+/*
+ * snprintf() @var followed by a newline into @buf, choosing the
+ * conversion from @var's type; unrecognised types use "%i".
+ */
+#define snprint(buf, size, var)						\
+	snprintf(buf, size,						\
+		   type_is(var, int)			? "%i\n"	\
+		 : type_is(var, unsigned)		? "%u\n"	\
+		 : type_is(var, long)			? "%li\n"	\
+		 : type_is(var, unsigned long)		? "%lu\n"	\
+		 : type_is(var, s64)			? "%lli\n"	\
+		 : type_is(var, u64)			? "%llu\n"	\
+		 : type_is(var, char *)			? "%s\n"	\
+		 : "%i\n", var)
|
|
+
|
|
+void bch2_hprint(struct printbuf *, s64);
|
|
+
|
|
+bool bch2_is_zero(const void *, size_t);
|
|
+
|
|
+void bch2_string_opt_to_text(struct printbuf *,
|
|
+ const char * const [], size_t);
|
|
+
|
|
+void bch2_flags_to_text(struct printbuf *, const char * const[], u64);
|
|
+u64 bch2_read_flag_list(char *, const char * const[]);
|
|
+
|
|
+/* Quantiles are kept in an eytzinger0-ordered array of this many entries */
+#define NR_QUANTILES		15
+/* array index of in-order quantile @i, and of the first/last quantile */
+#define QUANTILE_IDX(i)		inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST		eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST		eytzinger0_last(NR_QUANTILES)
|
|
+
|
|
+/* One tracked quantile: @m is the value bch2_time_stats_to_text() prints. */
+struct quantile_entry {
+	u64	m;
+	u64	step;
+};
+
+/* The NR_QUANTILES quantile entries of one time_stats. */
+struct quantiles {
+	struct quantile_entry	entries[NR_QUANTILES];
+};
|
|
+
|
|
+/* One buffered sample: a start and an end timestamp. */
+struct time_stat_buffer_entry {
+	u64	start;
+	u64	end;
+};
+
+/* Buffer of up to 32 samples; @nr is presumably the fill count — verify. */
+struct time_stat_buffer {
+	unsigned			nr;
+	struct time_stat_buffer_entry	entries[32];
+};
|
|
+
|
|
+/*
+ * Statistics about a recurring event: how often it happens and how long
+ * it takes. Initialise with bch2_time_stats_init(), release with
+ * bch2_time_stats_exit().
+ */
+struct time_stats {
+	spinlock_t	lock;
+	u64		count;
+
+	/* all fields are in nanoseconds */
+	u64		average_duration;
+	u64		average_frequency;
+	u64		max_duration;
+	u64		last_event;
+	struct quantiles quantiles;
+
+	/* freed with free_percpu() in bch2_time_stats_exit() */
+	struct time_stat_buffer __percpu *buffer;
+};
|
|
+
|
|
+void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
|
|
+
|
|
+/* Record an event that began at @start and ends now (local_clock()). */
+static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
+{
+	u64 end = local_clock();
+
+	__bch2_time_stats_update(stats, start, end);
+}
|
|
+
|
|
+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
|
|
+
|
|
+void bch2_time_stats_exit(struct time_stats *);
|
|
+void bch2_time_stats_init(struct time_stats *);
|
|
+
|
|
+/*
+ * Exponentially weighted moving average step. Evaluates to
+ * (ewma * (2^weight - 1) + val) >> weight; @weight is a shift count.
+ * @ewma itself is not modified; the caller stores the result.
+ */
+#define ewma_add(ewma, val, weight)					\
+({									\
+	typeof(ewma) _ewma	= (ewma);				\
+	typeof(weight) _weight	= (weight);				\
+									\
+	(((_ewma << _weight) - _ewma) + (val)) >> _weight;		\
+})
|
|
+
|
|
+/* State of a simple rate limiter; see bch2_ratelimit_increment(). */
+struct bch_ratelimit {
+	/* Next time we want to do some work, in nanoseconds */
+	u64		next;
+
+	/*
+	 * Rate at which we want to do work, in units per second:
+	 * bch2_ratelimit_increment() advances @next by
+	 * done * NSEC_PER_SEC / rate nanoseconds.
+	 * The units here correspond to the units passed to
+	 * bch2_ratelimit_increment()
+	 */
+	unsigned	rate;
+};
|
|
+
|
|
+/* Restart @d: the next piece of work is due right now. */
+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d)
+{
+	u64 now = local_clock();
+
+	d->next = now;
+}
|
|
+
|
|
+u64 bch2_ratelimit_delay(struct bch_ratelimit *);
|
|
+void bch2_ratelimit_increment(struct bch_ratelimit *, u64);
|
|
+
|
|
+/*
+ * Proportional/derivative controller that adjusts @rate.rate; driven by
+ * bch2_pd_controller_update().
+ */
+struct bch_pd_controller {
+	struct bch_ratelimit	rate;
+	/* jiffies value at the last update that took effect */
+	unsigned long		last_update;
+
+	s64			last_actual;
+	s64			smoothed_derivative;
+
+	/* tuning knobs, set by bch2_pd_controller_init() */
+	unsigned		p_term_inverse;
+	unsigned		d_smooth;
+	unsigned		d_term;
+
+	/* for exporting to sysfs (no effect on behavior) */
+	s64			last_derivative;
+	s64			last_proportional;
+	s64			last_change;
+	s64			last_target;
+
+	/* If true, the rate will not increase if bch2_ratelimit_delay()
+	 * is not being called often enough. */
+	bool			backpressure;
+};
|
|
+
|
|
+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int);
|
|
+void bch2_pd_controller_init(struct bch_pd_controller *);
|
|
+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *);
|
|
+
|
|
+/* Declare the five sysfs attributes of the pd controller called @name. */
+#define sysfs_pd_controller_attribute(name)				\
+	rw_attribute(name##_rate);					\
+	rw_attribute(name##_rate_bytes);				\
+	rw_attribute(name##_rate_d_term);				\
+	rw_attribute(name##_rate_p_term_inverse);			\
+	read_attribute(name##_rate_debug)
+
+/* The same five attributes as a comma-separated list of pointers. */
+#define sysfs_pd_controller_files(name)					\
+	&sysfs_##name##_rate,						\
+	&sysfs_##name##_rate_bytes,					\
+	&sysfs_##name##_rate_d_term,					\
+	&sysfs_##name##_rate_p_term_inverse,				\
+	&sysfs_##name##_rate_debug
|
|
+
|
|
+/*
+ * Show-side handling for the attributes of pd controller @name backed by
+ * @var. Uses `attr` and `buf` from the enclosing function and returns
+ * from it when the debug attribute is the one being read.
+ */
+#define sysfs_pd_controller_show(name, var)				\
+do {									\
+	sysfs_hprint(name##_rate,		(var)->rate.rate);	\
+	sysfs_print(name##_rate_bytes,		(var)->rate.rate);	\
+	sysfs_print(name##_rate_d_term,		(var)->d_term);		\
+	sysfs_print(name##_rate_p_term_inverse,	(var)->p_term_inverse);	\
+									\
+	if (attr == &sysfs_##name##_rate_debug)				\
+		return bch2_pd_controller_print_debug(var, buf);	\
+} while (0)
|
|
+
|
|
+/*
+ * Store-side handling for the attributes of pd controller @name backed by
+ * @var: the rate is clamped to [1, UINT_MAX] and p_term_inverse to
+ * [1, INT_MAX]; d_term is stored as parsed.
+ */
+#define sysfs_pd_controller_store(name, var)				\
+do {									\
+	sysfs_strtoul_clamp(name##_rate,				\
+			    (var)->rate.rate, 1, UINT_MAX);		\
+	sysfs_strtoul_clamp(name##_rate_bytes,				\
+			    (var)->rate.rate, 1, UINT_MAX);		\
+									\
+	sysfs_strtoul(name##_rate_d_term, (var)->d_term);		\
+									\
+	sysfs_strtoul_clamp(name##_rate_p_term_inverse,			\
+			    (var)->p_term_inverse, 1, INT_MAX);		\
+} while (0)
|
|
+
|
|
+/* container_of() that maps a NULL @ptr to NULL; @ptr is evaluated once. */
+#define container_of_or_null(ptr, type, member)				\
+({									\
+	typeof(ptr) _ptr = ptr;						\
+									\
+	_ptr ? container_of(_ptr, type, member) : NULL;			\
+})
|
|
+
|
|
+/*
+ * Does linear interpolation between powers of two.
+ *
+ * @x is a fixed point exponent: the low @fract_bits bits are the
+ * fraction, the remaining high bits the integer part e. The result is
+ * 2^e plus (2^e * fraction) >> @fract_bits.
+ *
+ * The shifts use unsigned constants: left-shifting ~0 (a negative int)
+ * or shifting 1 into the sign bit is undefined for signed int.
+ */
+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
+{
+	unsigned fract = x & ~(~0U << fract_bits);
+
+	x >>= fract_bits;
+	x   = 1U << x;
+	x  += (x * fract) >> fract_bits;
+
+	return x;
+}
|
|
+
|
|
+void bch2_bio_map(struct bio *bio, void *base, size_t);
|
|
+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
|
|
+
|
|
+/* Size of @bdev in 512-byte sectors, from the i_size of its inode. */
+static inline sector_t bdev_sectors(struct block_device *bdev)
+{
+	struct inode *inode = bdev->bd_inode;
+
+	return inode->i_size >> 9;
+}
|
|
+
|
|
+/* Take a reference on closure @cl, then submit @bio. */
+#define closure_bio_submit(bio, cl)					\
+do {									\
+	closure_get(cl);						\
+									\
+	submit_bio(bio);						\
+} while (0)
|
|
+
|
|
+/*
+ * Sleep interruptibly until @cond is true or the kthread is asked to
+ * stop, calling try_to_freeze() after each wakeup.
+ *
+ * Evaluates to 0 once @cond holds, or -1 if kthread_should_stop() was
+ * seen first. The task state is TASK_RUNNING on exit.
+ */
+#define kthread_wait_freezable(cond)					\
+({									\
+	int _ret = 0;							\
+									\
+	for (;;) {							\
+		/* set the state before testing, so no wakeup is lost */\
+		set_current_state(TASK_INTERRUPTIBLE);			\
+		if (kthread_should_stop()) {				\
+			_ret = -1;					\
+			break;						\
+		}							\
+									\
+		if (cond)						\
+			break;						\
+									\
+		schedule();						\
+		try_to_freeze();					\
+	}								\
+	set_current_state(TASK_RUNNING);				\
+	_ret;								\
+})
|
|
+
|
|
+size_t bch2_rand_range(size_t);
|
|
+
|
|
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
|
|
+void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
|
|
+
|
|
+/* Copy @u64s 64-bit words from @src to @dst, front to back, in a C loop. */
+static inline void memcpy_u64s_small(void *dst, const void *src,
+				     unsigned u64s)
+{
+	const u64 *from = src;
+	u64 *to = dst;
+
+	for (; u64s; u64s--)
+		*to++ = *from++;
+}
|
|
+
|
|
+/*
+ * Copy @u64s 64-bit words from @src to @dst, front to back: "rep movsq"
+ * on x86-64, a C loop elsewhere. No overlap check.
+ */
+static inline void __memcpy_u64s(void *dst, const void *src,
+				 unsigned u64s)
+{
+#ifdef CONFIG_X86_64
+	/* dummy outputs: the instruction modifies rcx, rdi and rsi */
+	long cnt, di, si;
+
+	asm volatile("rep ; movsq"
+		     : "=&c" (cnt), "=&D" (di), "=&S" (si)
+		     : "0" (u64s), "1" (dst), "2" (src)
+		     : "memory");
+#else
+	const u64 *from = src;
+	u64 *to = dst;
+
+	for (; u64s; u64s--)
+		*to++ = *from++;
+#endif
+}
|
|
+
|
|
+/*
+ * Copy @u64s 64-bit words from @src to @dst. The two ranges must not
+ * overlap; this is asserted in debug builds.
+ */
+static inline void memcpy_u64s(void *dst, const void *src,
+			       unsigned u64s)
+{
+	EBUG_ON(!(dst >= src + u64s * sizeof(u64) ||
+		  dst + u64s * sizeof(u64) <= src));
+
+	__memcpy_u64s(dst, src, u64s);
+}
|
|
+
|
|
+/*
+ * Move @u64s 64-bit words to a lower (or equal) address. A front-to-back
+ * copy is safe for that direction, so this is __memcpy_u64s().
+ */
+static inline void __memmove_u64s_down(void *dst, const void *src,
+				       unsigned u64s)
+{
+	__memcpy_u64s(dst, src, u64s);
+}
|
|
+
|
|
+/*
+ * Move @u64s 64-bit words from @src down to @dst; asserts in debug builds
+ * that @dst is not above @src.
+ */
+static inline void memmove_u64s_down(void *dst, const void *src,
+				     unsigned u64s)
+{
+	EBUG_ON(dst > src);
+
+	__memmove_u64s_down(dst, src, u64s);
+}
|
|
+
|
|
+/*
+ * Move @u64s 64-bit words to a higher (or equal) address with a C loop,
+ * copying back to front so an overlapping source is read before it is
+ * overwritten.
+ */
+static inline void __memmove_u64s_up_small(void *_dst, const void *_src,
+					   unsigned u64s)
+{
+	const u64 *from = (const u64 *) _src + u64s;
+	u64 *to = (u64 *) _dst + u64s;
+
+	for (; u64s; u64s--)
+		*--to = *--from;
+}
|
|
+
|
|
+static inline void memmove_u64s_up_small(void *dst, const void *src,
|
|
+ unsigned u64s)
|
|
+{
|
|
+ EBUG_ON(dst < src);
|
|
+
|
|
+ __memmove_u64s_up_small(dst, src, u64s);
|
|
+}
|
|
+
|
|
+static inline void __memmove_u64s_up(void *_dst, const void *_src,
|
|
+ unsigned u64s)
|
|
+{
|
|
+ u64 *dst = (u64 *) _dst + u64s - 1; /* last u64 of the destination */
|
|
+ u64 *src = (u64 *) _src + u64s - 1; /* last u64 of the source */
|
|
+
|
|
+#ifdef CONFIG_X86_64
|
|
+ long d0, d1, d2; /* dummy outputs: the asm modifies rcx, rdi and rsi */
|
|
+ asm volatile("std ;\n" /* set the direction flag so movsq walks downwards */
|
|
+ "rep ; movsq\n" /* copy u64s quadwords, highest address first */
|
|
+ "cld ;\n" /* clear the direction flag again before returning to C */
|
|
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
|
|
+ : "0" (u64s), "1" (dst), "2" (src)
|
|
+ : "memory");
|
|
+#else
|
|
+ while (u64s--) /* portable fallback: same back-to-front copy */
|
|
+ *dst-- = *src--;
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline void memmove_u64s_up(void *dst, const void *src,
|
|
+ unsigned u64s)
|
|
+{
|
|
+ EBUG_ON(dst < src);
|
|
+
|
|
+ __memmove_u64s_up(dst, src, u64s);
|
|
+}
|
|
+
|
|
+static inline void memmove_u64s(void *dst, const void *src,
|
|
+ unsigned u64s)
|
|
+{
|
|
+ if (dst < src)
|
|
+ __memmove_u64s_down(dst, src, u64s);
|
|
+ else
|
|
+ __memmove_u64s_up(dst, src, u64s);
|
|
+}
|
|
+
|
|
+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */
|
|
+static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
|
|
+{
|
|
+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes;
|
|
+
|
|
+ memset(s + bytes, c, rem);
|
|
+}
|
|
+
|
|
+void sort_cmp_size(void *base, size_t num, size_t size,
|
|
+ int (*cmp_func)(const void *, const void *, size_t),
|
|
+ void (*swap_func)(void *, void *, size_t));
|
|
+
|
|
+/* just the memmove, doesn't update @_nr */
|
|
+#define __array_insert_item(_array, _nr, _pos) \
|
|
+ memmove(&(_array)[(_pos) + 1], \
|
|
+ &(_array)[(_pos)], \
|
|
+ sizeof((_array)[0]) * ((_nr) - (_pos)))
|
|
+
|
|
+#define array_insert_item(_array, _nr, _pos, _new_item) \
|
|
+do { \
|
|
+ __array_insert_item(_array, _nr, _pos); \
|
|
+ (_nr)++; \
|
|
+ (_array)[(_pos)] = (_new_item); \
|
|
+} while (0)
|
|
+
|
|
+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \
|
|
+do { \
|
|
+ (_nr) -= (_nr_to_remove); \
|
|
+ memmove(&(_array)[(_pos)], \
|
|
+ &(_array)[(_pos) + (_nr_to_remove)], \
|
|
+ sizeof((_array)[0]) * ((_nr) - (_pos))); \
|
|
+} while (0)
|
|
+
|
|
+#define array_remove_item(_array, _nr, _pos) \
|
|
+ array_remove_items(_array, _nr, _pos, 1)
|
|
+
|
|
+#define bubble_sort(_base, _nr, _cmp) \
|
|
+do { \
|
|
+ ssize_t _i, _end; \
|
|
+ bool _swapped = true; \
|
|
+ \
|
|
+ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
|
|
+ _swapped = false; \
|
|
+ for (_i = 0; _i < _end; _i++) \
|
|
+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
|
|
+ swap((_base)[_i], (_base)[_i + 1]); \
|
|
+ _swapped = true; \
|
|
+ } \
|
|
+ } \
|
|
+} while (0)
|
|
+
|
|
+static inline u64 percpu_u64_get(u64 __percpu *src)
|
|
+{
|
|
+ u64 ret = 0;
|
|
+ int cpu;
|
|
+
|
|
+ for_each_possible_cpu(cpu)
|
|
+ ret += *per_cpu_ptr(src, cpu);
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline void percpu_u64_set(u64 __percpu *dst, u64 src)
|
|
+{
|
|
+ int cpu;
|
|
+
|
|
+ for_each_possible_cpu(cpu)
|
|
+ *per_cpu_ptr(dst, cpu) = 0; /* clear every CPU's slot first */
|
|
+
|
|
+ preempt_disable(); /* stay on one CPU while writing through this_cpu_ptr() */
|
|
+ *this_cpu_ptr(dst) = src; /* one slot holds src, so the sum taken by percpu_u64_get() is src */
|
|
+ preempt_enable();
|
|
+}
|
|
+
|
|
+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr)
|
|
+{
|
|
+ unsigned i;
|
|
+
|
|
+ for (i = 0; i < nr; i++)
|
|
+ acc[i] += src[i];
|
|
+}
|
|
+
|
|
+static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
|
|
+ unsigned nr)
|
|
+{
|
|
+ int cpu;
|
|
+
|
|
+ for_each_possible_cpu(cpu)
|
|
+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
|
|
+}
|
|
+
|
|
+static inline void percpu_memset(void __percpu *p, int c, size_t bytes)
|
|
+{
|
|
+ int cpu;
|
|
+
|
|
+ for_each_possible_cpu(cpu)
|
|
+ memset(per_cpu_ptr(p, cpu), c, bytes);
|
|
+}
|
|
+
|
|
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
|
|
+
|
|
+#define cmp_int(l, r) (((l) > (r)) - ((l) < (r))) /* three-way compare: -1, 0 or 1; evaluates l and r twice */
|
|
+
|
|
+static inline int u8_cmp(u8 l, u8 r)
|
|
+{
|
|
+ return cmp_int(l, r);
|
|
+}
|
|
+
|
|
+#endif /* _BCACHEFS_UTIL_H */
|
|
diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c
|
|
new file mode 100644
|
|
index 000000000000..a3d252c741c8
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/varint.c
|
|
@@ -0,0 +1,42 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include <linux/bitops.h>
|
|
+#include <asm/unaligned.h>
|
|
+
|
|
+#include "varint.h"
|
|
+
|
|
+int bch2_varint_encode(u8 *out, u64 v)
|
|
+{
|
|
+ unsigned bits = fls64(v|1); /* |1 so that v == 0 still counts as one bit */
|
|
+ unsigned bytes = DIV_ROUND_UP(bits, 7); /* 7 payload bits per encoded byte */
|
|
+
|
|
+ if (likely(bytes < 9)) {
|
|
+ v <<= bytes; /* make room for the length tag in the low bits */
|
|
+ v |= ~(~0ULL << (bytes - 1)); /* tag: (bytes - 1) one bits, then a zero bit; unsigned shift avoids UB */
|
|
+ } else {
|
|
+ *out++ = 255; /* all-ones first byte selects the 9-byte form */
|
|
+ bytes = 9;
|
|
+ }
|
|
+
|
|
+ put_unaligned_le64(v, out); /* NOTE(review): stores a full 8 bytes at out even when bytes < 8 */
|
|
+ return bytes; /* encoded length, 1..9 */
|
|
+}
|
|
+
|
|
+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
|
|
+{
|
|
+ u64 v = get_unaligned_le64(in); /* NOTE(review): loads 8 bytes from in before the bounds check below */
|
|
+ unsigned bytes = ffz(v & 255) + 1; /* index of the first zero bit in the first byte, plus one */
|
|
+
|
|
+ if (unlikely(in + bytes > end))
|
|
+ return -1; /* the encoding would run past end */
|
|
+
|
|
+ if (likely(bytes < 9)) {
|
|
+ v >>= bytes; /* drop the length tag */
|
|
+ v &= ~(~0ULL << (7 * bytes)); /* keep 7 payload bits per encoded byte */
|
|
+ } else {
|
|
+ v = get_unaligned_le64(++in); /* 9-byte form: the value follows the first byte as a little-endian u64 */
|
|
+ }
|
|
+
|
|
+ *out = v;
|
|
+ return bytes; /* number of input bytes consumed */
|
|
+}
|
|
diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h
|
|
new file mode 100644
|
|
index 000000000000..8daf813576b7
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/varint.h
|
|
@@ -0,0 +1,8 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_VARINT_H
|
|
+#define _BCACHEFS_VARINT_H
|
|
+
|
|
+int bch2_varint_encode(u8 *, u64);
|
|
+int bch2_varint_decode(const u8 *, const u8 *, u64 *);
|
|
+
|
|
+#endif /* _BCACHEFS_VARINT_H */
|
|
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
|
|
new file mode 100644
|
|
index 000000000000..c099cdc0605f
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/vstructs.h
|
|
@@ -0,0 +1,63 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _VSTRUCTS_H
|
|
+#define _VSTRUCTS_H
|
|
+
|
|
+#include "util.h"
|
|
+
|
|
+/*
|
|
+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this
|
|
+ * assumes u64 is little endian:
|
|
+ */
|
|
+#define __vstruct_u64s(_s) \
|
|
+({ \
|
|
+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \
|
|
+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \
|
|
+ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \
|
|
+ : ((__force u8) ((_s)->u64s))); \
|
|
+})
|
|
+
|
|
+#define __vstruct_bytes(_type, _u64s) \
|
|
+({ \
|
|
+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \
|
|
+ \
|
|
+ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \
|
|
+})
|
|
+
|
|
+#define vstruct_bytes(_s) \
|
|
+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s))
|
|
+
|
|
+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \
|
|
+ (round_up(__vstruct_bytes(_type, _u64s), \
|
|
+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits)))
|
|
+
|
|
+#define vstruct_blocks(_s, _sector_block_bits) \
|
|
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s))
|
|
+
|
|
+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \
|
|
+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \
|
|
+ __vstruct_u64s(_s) + (_u64s))
|
|
+
|
|
+#define vstruct_sectors(_s, _sector_block_bits) \
|
|
+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9)
|
|
+
|
|
+#define vstruct_next(_s) \
|
|
+ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s)))
|
|
+#define vstruct_last(_s) \
|
|
+ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s)))
|
|
+#define vstruct_end(_s) \
|
|
+ ((void *) ((_s)->_data + __vstruct_u64s(_s)))
|
|
+
|
|
+#define vstruct_for_each(_s, _i) \
|
|
+ for (_i = (_s)->start; \
|
|
+ _i < vstruct_last(_s); \
|
|
+ _i = vstruct_next(_i))
|
|
+
|
|
+#define vstruct_for_each_safe(_s, _i, _t) \
|
|
+ for (_i = (_s)->start; \
|
|
+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
|
|
+ _i = _t)
|
|
+
|
|
+#define vstruct_idx(_s, _idx) \
|
|
+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
|
|
+
|
|
+#endif /* _VSTRUCTS_H */
|
|
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
|
|
new file mode 100644
|
|
index 000000000000..858aa8766053
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/xattr.c
|
|
@@ -0,0 +1,589 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include "bcachefs.h"
|
|
+#include "bkey_methods.h"
|
|
+#include "btree_update.h"
|
|
+#include "extents.h"
|
|
+#include "fs.h"
|
|
+#include "rebalance.h"
|
|
+#include "str_hash.h"
|
|
+#include "xattr.h"
|
|
+
|
|
+#include <linux/dcache.h>
|
|
+#include <linux/posix_acl_xattr.h>
|
|
+#include <linux/xattr.h>
|
|
+
|
|
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned);
|
|
+
|
|
+static u64 bch2_xattr_hash(const struct bch_hash_info *info,
|
|
+ const struct xattr_search_key *key)
|
|
+{
|
|
+ struct bch_str_hash_ctx ctx;
|
|
+
|
|
+ bch2_str_hash_init(&ctx, info);
|
|
+ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type));
|
|
+ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len);
|
|
+
|
|
+ return bch2_str_hash_end(&ctx, info);
|
|
+}
|
|
+
|
|
+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key)
|
|
+{
|
|
+ return bch2_xattr_hash(info, key);
|
|
+}
|
|
+
|
|
+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
|
|
+{
|
|
+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
|
|
+
|
|
+ return bch2_xattr_hash(info,
|
|
+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
|
|
+}
|
|
+
|
|
+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
|
|
+{
|
|
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
|
|
+ const struct xattr_search_key *r = _r;
|
|
+
|
|
+ return l.v->x_type != r->type || /* true means "differs": type, name length or name bytes mismatch */
|
|
+ l.v->x_name_len != r->name.len ||
|
|
+ memcmp(l.v->x_name, r->name.name, r->name.len); /* lengths are equal here, so comparing r->name.len bytes covers the whole name */
|
|
+}
|
|
+
|
|
+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
|
|
+{
|
|
+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l);
|
|
+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r);
|
|
+
|
|
+ return l.v->x_type != r.v->x_type || /* true means "differs": type, name length or name bytes mismatch */
|
|
+ l.v->x_name_len != r.v->x_name_len ||
|
|
+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); /* lengths are equal here, so this compares the whole name */
|
|
+}
|
|
+
|
|
+const struct bch_hash_desc bch2_xattr_hash_desc = {
|
|
+ .btree_id = BTREE_ID_xattrs,
|
|
+ .key_type = KEY_TYPE_xattr,
|
|
+ .hash_key = xattr_hash_key,
|
|
+ .hash_bkey = xattr_hash_bkey,
|
|
+ .cmp_key = xattr_cmp_key,
|
|
+ .cmp_bkey = xattr_cmp_bkey,
|
|
+};
|
|
+
|
|
+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
|
|
+{
|
|
+ const struct xattr_handler *handler;
|
|
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
|
|
+
|
|
+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr))
|
|
+ return "value too small";
|
|
+
|
|
+ if (bkey_val_u64s(k.k) <
|
|
+ xattr_val_u64s(xattr.v->x_name_len,
|
|
+ le16_to_cpu(xattr.v->x_val_len)))
|
|
+ return "value too small";
|
|
+
|
|
+ if (bkey_val_u64s(k.k) >
|
|
+ xattr_val_u64s(xattr.v->x_name_len,
|
|
+ le16_to_cpu(xattr.v->x_val_len) + 4))
|
|
+ return "value too big";
|
|
+
|
|
+ handler = bch2_xattr_type_to_handler(xattr.v->x_type);
|
|
+ if (!handler)
|
|
+ return "invalid type";
|
|
+
|
|
+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len))
|
|
+ return "xattr name has invalid characters";
|
|
+
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
|
|
+ struct bkey_s_c k)
|
|
+{
|
|
+ const struct xattr_handler *handler;
|
|
+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
|
|
+
|
|
+ handler = bch2_xattr_type_to_handler(xattr.v->x_type);
|
|
+ if (handler && handler->prefix)
|
|
+ pr_buf(out, "%s", handler->prefix);
|
|
+ else if (handler)
|
|
+ pr_buf(out, "(type %u)", xattr.v->x_type);
|
|
+ else
|
|
+ pr_buf(out, "(unknown type %u)", xattr.v->x_type);
|
|
+
|
|
+ bch_scnmemcpy(out, xattr.v->x_name,
|
|
+ xattr.v->x_name_len);
|
|
+ pr_buf(out, ":");
|
|
+ bch_scnmemcpy(out, xattr_val(xattr.v),
|
|
+ le16_to_cpu(xattr.v->x_val_len));
|
|
+}
|
|
+
|
|
+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
|
|
+ const char *name, void *buffer, size_t size, int type)
|
|
+{
|
|
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c_xattr xattr;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash,
|
|
+ inode->v.i_ino,
|
|
+ &X_SEARCH(type, name, strlen(name)),
|
|
+ 0);
|
|
+ ret = PTR_ERR_OR_ZERO(iter);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+
|
|
+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
|
|
+ ret = le16_to_cpu(xattr.v->x_val_len);
|
|
+ if (buffer) {
|
|
+ if (ret > size)
|
|
+ ret = -ERANGE;
|
|
+ else
|
|
+ memcpy(buffer, xattr_val(xattr.v), ret);
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+err:
|
|
+ bch2_trans_exit(&trans);
|
|
+
|
|
+ BUG_ON(ret == -EINTR);
|
|
+ return ret == -ENOENT ? -ENODATA : ret;
|
|
+}
|
|
+
|
|
+int bch2_xattr_set(struct btree_trans *trans, u64 inum,
|
|
+ const struct bch_hash_info *hash_info,
|
|
+ const char *name, const void *value, size_t size,
|
|
+ int type, int flags)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ if (value) {
|
|
+ struct bkey_i_xattr *xattr;
|
|
+ unsigned namelen = strlen(name);
|
|
+ unsigned u64s = BKEY_U64s +
|
|
+ xattr_val_u64s(namelen, size);
|
|
+
|
|
+ if (u64s > U8_MAX)
|
|
+ return -ERANGE;
|
|
+
|
|
+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
|
|
+ if (IS_ERR(xattr))
|
|
+ return PTR_ERR(xattr);
|
|
+
|
|
+ bkey_xattr_init(&xattr->k_i);
|
|
+ xattr->k.u64s = u64s;
|
|
+ xattr->v.x_type = type;
|
|
+ xattr->v.x_name_len = namelen;
|
|
+ xattr->v.x_val_len = cpu_to_le16(size);
|
|
+ memcpy(xattr->v.x_name, name, namelen);
|
|
+ memcpy(xattr_val(&xattr->v), value, size);
|
|
+
|
|
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
|
|
+ inum, &xattr->k_i,
|
|
+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)|
|
|
+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0));
|
|
+ } else {
|
|
+ struct xattr_search_key search =
|
|
+ X_SEARCH(type, name, strlen(name));
|
|
+
|
|
+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc,
|
|
+ hash_info, inum, &search);
|
|
+ }
|
|
+
|
|
+ if (ret == -ENOENT)
|
|
+ ret = flags & XATTR_REPLACE ? -ENODATA : 0;
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+struct xattr_buf {
|
|
+ char *buf;
|
|
+ size_t len;
|
|
+ size_t used;
|
|
+};
|
|
+
|
|
+static int __bch2_xattr_emit(const char *prefix,
|
|
+ const char *name, size_t name_len,
|
|
+ struct xattr_buf *buf)
|
|
+{
|
|
+ const size_t prefix_len = strlen(prefix);
|
|
+ const size_t total_len = prefix_len + name_len + 1;
|
|
+
|
|
+ if (buf->buf) {
|
|
+ if (buf->used + total_len > buf->len)
|
|
+ return -ERANGE;
|
|
+
|
|
+ memcpy(buf->buf + buf->used, prefix, prefix_len);
|
|
+ memcpy(buf->buf + buf->used + prefix_len,
|
|
+ name, name_len);
|
|
+ buf->buf[buf->used + prefix_len + name_len] = '\0';
|
|
+ }
|
|
+
|
|
+ buf->used += total_len;
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_xattr_emit(struct dentry *dentry,
|
|
+ const struct bch_xattr *xattr,
|
|
+ struct xattr_buf *buf)
|
|
+{
|
|
+ const struct xattr_handler *handler =
|
|
+ bch2_xattr_type_to_handler(xattr->x_type);
|
|
+
|
|
+ return handler && (!handler->list || handler->list(dentry))
|
|
+ ? __bch2_xattr_emit(handler->prefix ?: handler->name,
|
|
+ xattr->x_name, xattr->x_name_len, buf)
|
|
+ : 0;
|
|
+}
|
|
+
|
|
+static int bch2_xattr_list_bcachefs(struct bch_fs *c,
|
|
+ struct bch_inode_unpacked *inode,
|
|
+ struct xattr_buf *buf,
|
|
+ bool all)
|
|
+{
|
|
+ const char *prefix = all ? "bcachefs_effective." : "bcachefs.";
|
|
+ unsigned id;
|
|
+ int ret = 0;
|
|
+ u64 v;
|
|
+
|
|
+ for (id = 0; id < Inode_opt_nr; id++) {
|
|
+ v = bch2_inode_opt_get(inode, id);
|
|
+ if (!v)
|
|
+ continue;
|
|
+
|
|
+ if (!all &&
|
|
+ !(inode->bi_fields_set & (1 << id)))
|
|
+ continue;
|
|
+
|
|
+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id],
|
|
+ strlen(bch2_inode_opts[id]), buf);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
|
|
+{
|
|
+ struct bch_fs *c = dentry->d_sb->s_fs_info;
|
|
+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
|
|
+ struct btree_trans trans;
|
|
+ struct btree_iter *iter;
|
|
+ struct bkey_s_c k;
|
|
+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
|
|
+ u64 inum = dentry->d_inode->i_ino;
|
|
+ int ret;
|
|
+
|
|
+ bch2_trans_init(&trans, c, 0, 0);
|
|
+
|
|
+ for_each_btree_key(&trans, iter, BTREE_ID_xattrs,
|
|
+ POS(inum, 0), 0, k, ret) {
|
|
+ BUG_ON(k.k->p.inode < inum);
|
|
+
|
|
+ if (k.k->p.inode > inum)
|
|
+ break;
|
|
+
|
|
+ if (k.k->type != KEY_TYPE_xattr)
|
|
+ continue;
|
|
+
|
|
+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
|
|
+ if (ret)
|
|
+ break;
|
|
+ }
|
|
+ bch2_trans_iter_put(&trans, iter);
|
|
+
|
|
+ ret = bch2_trans_exit(&trans) ?: ret;
|
|
+
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
|
|
+ if (ret)
|
|
+ return ret;
|
|
+
|
|
+ return buf.used;
|
|
+}
|
|
+
|
|
+static int bch2_xattr_get_handler(const struct xattr_handler *handler,
|
|
+ struct dentry *dentry, struct inode *vinode,
|
|
+ const char *name, void *buffer, size_t size)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(vinode);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+
|
|
+ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags);
|
|
+}
|
|
+
|
|
+static int bch2_xattr_set_handler(const struct xattr_handler *handler,
|
|
+ struct dentry *dentry, struct inode *vinode,
|
|
+ const char *name, const void *value,
|
|
+ size_t size, int flags)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(vinode);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);
|
|
+
|
|
+ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0,
|
|
+ bch2_xattr_set(&trans, inode->v.i_ino, &hash,
|
|
+ name, value, size,
|
|
+ handler->flags, flags));
|
|
+}
|
|
+
|
|
+static const struct xattr_handler bch_xattr_user_handler = {
|
|
+ .prefix = XATTR_USER_PREFIX,
|
|
+ .get = bch2_xattr_get_handler,
|
|
+ .set = bch2_xattr_set_handler,
|
|
+ .flags = KEY_TYPE_XATTR_INDEX_USER,
|
|
+};
|
|
+
|
|
+static bool bch2_xattr_trusted_list(struct dentry *dentry)
|
|
+{
|
|
+ return capable(CAP_SYS_ADMIN);
|
|
+}
|
|
+
|
|
+static const struct xattr_handler bch_xattr_trusted_handler = {
|
|
+ .prefix = XATTR_TRUSTED_PREFIX,
|
|
+ .list = bch2_xattr_trusted_list,
|
|
+ .get = bch2_xattr_get_handler,
|
|
+ .set = bch2_xattr_set_handler,
|
|
+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED,
|
|
+};
|
|
+
|
|
+static const struct xattr_handler bch_xattr_security_handler = {
|
|
+ .prefix = XATTR_SECURITY_PREFIX,
|
|
+ .get = bch2_xattr_get_handler,
|
|
+ .set = bch2_xattr_set_handler,
|
|
+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY,
|
|
+};
|
|
+
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+
|
|
+static int opt_to_inode_opt(int id)
|
|
+{
|
|
+ switch (id) {
|
|
+#define x(name, ...) \
|
|
+ case Opt_##name: return Inode_opt_##name;
|
|
+ BCH_INODE_OPTS()
|
|
+#undef x
|
|
+ default:
|
|
+ return -1;
|
|
+ }
|
|
+}
|
|
+
|
|
+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
|
|
+ struct dentry *dentry, struct inode *vinode,
|
|
+ const char *name, void *buffer, size_t size,
|
|
+ bool all)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(vinode);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ struct bch_opts opts =
|
|
+ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode));
|
|
+ const struct bch_option *opt;
|
|
+ int id, inode_opt_id;
|
|
+ char buf[512];
|
|
+ struct printbuf out = PBUF(buf);
|
|
+ unsigned val_len;
|
|
+ u64 v;
|
|
+
|
|
+ id = bch2_opt_lookup(name);
|
|
+ if (id < 0 || !bch2_opt_is_inode_opt(id))
|
|
+ return -EINVAL;
|
|
+
|
|
+ inode_opt_id = opt_to_inode_opt(id);
|
|
+ if (inode_opt_id < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ opt = bch2_opt_table + id;
|
|
+
|
|
+ if (!bch2_opt_defined_by_id(&opts, id))
|
|
+ return -ENODATA;
|
|
+
|
|
+ if (!all &&
|
|
+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
|
|
+ return -ENODATA;
|
|
+
|
|
+ v = bch2_opt_get_by_id(&opts, id);
|
|
+ bch2_opt_to_text(&out, c, opt, v, 0);
|
|
+
|
|
+ val_len = out.pos - buf;
|
|
+
|
|
+ if (buffer && val_len > size)
|
|
+ return -ERANGE;
|
|
+
|
|
+ if (buffer)
|
|
+ memcpy(buffer, buf, val_len);
|
|
+ return val_len;
|
|
+}
|
|
+
|
|
+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
|
|
+ struct dentry *dentry, struct inode *vinode,
|
|
+ const char *name, void *buffer, size_t size)
|
|
+{
|
|
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
|
|
+ name, buffer, size, false);
|
|
+}
|
|
+
|
|
+struct inode_opt_set {
|
|
+ int id;
|
|
+ u64 v;
|
|
+ bool defined;
|
|
+};
|
|
+
|
|
+static int inode_opt_set_fn(struct bch_inode_info *inode,
|
|
+ struct bch_inode_unpacked *bi,
|
|
+ void *p)
|
|
+{
|
|
+ struct inode_opt_set *s = p;
|
|
+
|
|
+ if (s->defined)
|
|
+ bi->bi_fields_set |= 1U << s->id;
|
|
+ else
|
|
+ bi->bi_fields_set &= ~(1U << s->id);
|
|
+
|
|
+ bch2_inode_opt_set(bi, s->id, s->v);
|
|
+
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
|
|
+ struct dentry *dentry, struct inode *vinode,
|
|
+ const char *name, const void *value,
|
|
+ size_t size, int flags)
|
|
+{
|
|
+ struct bch_inode_info *inode = to_bch_ei(vinode);
|
|
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
|
|
+ const struct bch_option *opt;
|
|
+ char *buf;
|
|
+ struct inode_opt_set s;
|
|
+ int opt_id, inode_opt_id, ret;
|
|
+
|
|
+ opt_id = bch2_opt_lookup(name);
|
|
+ if (opt_id < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ opt = bch2_opt_table + opt_id;
|
|
+
|
|
+ inode_opt_id = opt_to_inode_opt(opt_id);
|
|
+ if (inode_opt_id < 0)
|
|
+ return -EINVAL;
|
|
+
|
|
+ s.id = inode_opt_id;
|
|
+
|
|
+ if (value) {
|
|
+ u64 v = 0;
|
|
+
|
|
+ buf = kmalloc(size + 1, GFP_KERNEL);
|
|
+ if (!buf)
|
|
+ return -ENOMEM;
|
|
+ memcpy(buf, value, size);
|
|
+ buf[size] = '\0';
|
|
+
|
|
+ ret = bch2_opt_parse(c, opt, buf, &v);
|
|
+ kfree(buf);
|
|
+
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ ret = bch2_opt_check_may_set(c, opt_id, v);
|
|
+ if (ret < 0)
|
|
+ return ret;
|
|
+
|
|
+ s.v = v + 1;
|
|
+ s.defined = true;
|
|
+ } else {
|
|
+ if (!IS_ROOT(dentry)) {
|
|
+ struct bch_inode_info *dir =
|
|
+ to_bch_ei(d_inode(dentry->d_parent));
|
|
+
|
|
+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id);
|
|
+ } else {
|
|
+ s.v = 0;
|
|
+ }
|
|
+
|
|
+ s.defined = false;
|
|
+ }
|
|
+
|
|
+ mutex_lock(&inode->ei_update_lock);
|
|
+ if (inode_opt_id == Inode_opt_project) {
|
|
+ /*
|
|
+ * inode fields accessible via the xattr interface are stored
|
|
+ * with a +1 bias, so that 0 means unset:
|
|
+ */
|
|
+ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
|
|
+ if (ret)
|
|
+ goto err;
|
|
+ }
|
|
+
|
|
+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
|
|
+err:
|
|
+ mutex_unlock(&inode->ei_update_lock);
|
|
+
|
|
+ if (value &&
|
|
+ (opt_id == Opt_background_compression ||
|
|
+ opt_id == Opt_background_target))
|
|
+ bch2_rebalance_add_work(c, inode->v.i_blocks);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static const struct xattr_handler bch_xattr_bcachefs_handler = {
|
|
+ .prefix = "bcachefs.",
|
|
+ .get = bch2_xattr_bcachefs_get,
|
|
+ .set = bch2_xattr_bcachefs_set,
|
|
+};
|
|
+
|
|
+static int bch2_xattr_bcachefs_get_effective(
|
|
+ const struct xattr_handler *handler,
|
|
+ struct dentry *dentry, struct inode *vinode,
|
|
+ const char *name, void *buffer, size_t size)
|
|
+{
|
|
+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode,
|
|
+ name, buffer, size, true);
|
|
+}
|
|
+
|
|
+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = {
|
|
+ .prefix = "bcachefs_effective.",
|
|
+ .get = bch2_xattr_bcachefs_get_effective,
|
|
+ .set = bch2_xattr_bcachefs_set,
|
|
+};
|
|
+
|
|
+#endif /* NO_BCACHEFS_FS */
|
|
+
|
|
+const struct xattr_handler *bch2_xattr_handlers[] = {
|
|
+ &bch_xattr_user_handler,
|
|
+ &posix_acl_access_xattr_handler,
|
|
+ &posix_acl_default_xattr_handler,
|
|
+ &bch_xattr_trusted_handler,
|
|
+ &bch_xattr_security_handler,
|
|
+#ifndef NO_BCACHEFS_FS
|
|
+ &bch_xattr_bcachefs_handler,
|
|
+ &bch_xattr_bcachefs_effective_handler,
|
|
+#endif
|
|
+ NULL
|
|
+};
|
|
+
|
|
+static const struct xattr_handler *bch_xattr_handler_map[] = {
|
|
+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler,
|
|
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] =
|
|
+ &posix_acl_access_xattr_handler,
|
|
+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] =
|
|
+ &posix_acl_default_xattr_handler,
|
|
+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler,
|
|
+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler,
|
|
+};
|
|
+
|
|
+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type)
|
|
+{
|
|
+ return type < ARRAY_SIZE(bch_xattr_handler_map)
|
|
+ ? bch_xattr_handler_map[type]
|
|
+ : NULL;
|
|
+}
|
|
diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
|
|
new file mode 100644
|
|
index 000000000000..4151065ab853
|
|
--- /dev/null
|
|
+++ b/fs/bcachefs/xattr.h
|
|
@@ -0,0 +1,49 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _BCACHEFS_XATTR_H
|
|
+#define _BCACHEFS_XATTR_H
|
|
+
|
|
+#include "str_hash.h"
|
|
+
|
|
+extern const struct bch_hash_desc bch2_xattr_hash_desc;
|
|
+
|
|
+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
|
|
+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
|
|
+
|
|
+#define bch2_bkey_ops_xattr (struct bkey_ops) { \
|
|
+ .key_invalid = bch2_xattr_invalid, \
|
|
+ .val_to_text = bch2_xattr_to_text, \
|
|
+}
|
|
+
|
|
+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
|
|
+{
|
|
+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
|
|
+ name_len + val_len, sizeof(u64));
|
|
+}
|
|
+
|
|
+#define xattr_val(_xattr) \
|
|
+ ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
|
|
+
|
|
+struct xattr_search_key {
|
|
+ u8 type;
|
|
+ struct qstr name;
|
|
+};
|
|
+
|
|
+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \
|
|
+ { .type = _type, .name = QSTR_INIT(_name, _len) })
|
|
+
|
|
+struct dentry;
|
|
+struct xattr_handler;
|
|
+struct bch_hash_info;
|
|
+struct bch_inode_info;
|
|
+
|
|
+int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *,
|
|
+ const char *, void *, size_t, int);
|
|
+
|
|
+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *,
|
|
+ const char *, const void *, size_t, int, int);
|
|
+
|
|
+ssize_t bch2_xattr_list(struct dentry *, char *, size_t);
|
|
+
|
|
+extern const struct xattr_handler *bch2_xattr_handlers[];
|
|
+
|
|
+#endif /* _BCACHEFS_XATTR_H */
|
|
diff --git a/fs/dcache.c b/fs/dcache.c
|
|
index 97e81a844a96..d8d7d591cddb 100644
|
|
--- a/fs/dcache.c
|
|
+++ b/fs/dcache.c
|
|
@@ -3139,9 +3139,8 @@ void d_genocide(struct dentry *parent)
|
|
|
|
EXPORT_SYMBOL(d_genocide);
|
|
|
|
-void d_tmpfile(struct dentry *dentry, struct inode *inode)
|
|
+void d_mark_tmpfile(struct dentry *dentry, struct inode *inode)
|
|
{
|
|
- inode_dec_link_count(inode);
|
|
BUG_ON(dentry->d_name.name != dentry->d_iname ||
|
|
!hlist_unhashed(&dentry->d_u.d_alias) ||
|
|
!d_unlinked(dentry));
|
|
@@ -3151,6 +3150,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode)
|
|
(unsigned long long)inode->i_ino);
|
|
spin_unlock(&dentry->d_lock);
|
|
spin_unlock(&dentry->d_parent->d_lock);
|
|
+}
|
|
+EXPORT_SYMBOL(d_mark_tmpfile);
|
|
+
|
|
+void d_tmpfile(struct dentry *dentry, struct inode *inode)
|
|
+{
|
|
+ inode_dec_link_count(inode);
|
|
+ d_mark_tmpfile(dentry, inode);
|
|
d_instantiate(dentry, inode);
|
|
}
|
|
EXPORT_SYMBOL(d_tmpfile);
|
|
diff --git a/fs/inode.c b/fs/inode.c
|
|
index 6442d97d9a4a..22b4eb3971cb 100644
|
|
--- a/fs/inode.c
|
|
+++ b/fs/inode.c
|
|
@@ -57,8 +57,23 @@
|
|
|
|
static unsigned int i_hash_mask __read_mostly;
|
|
static unsigned int i_hash_shift __read_mostly;
|
|
-static struct hlist_head *inode_hashtable __read_mostly;
|
|
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
|
|
+static struct hlist_bl_head *inode_hashtable __read_mostly;
|
|
+
|
|
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
|
|
+{
|
|
+ unsigned long tmp;
|
|
+
|
|
+ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
|
|
+ L1_CACHE_BYTES;
|
|
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
|
|
+ return tmp & i_hash_mask;
|
|
+}
|
|
+
|
|
+static inline struct hlist_bl_head *i_hash_head(struct super_block *sb,
|
|
+ unsigned int hashval)
|
|
+{
|
|
+ return inode_hashtable + hash(sb, hashval);
|
|
+}
|
|
|
|
/*
|
|
* Empty aops. Can be used for the cases where the user does not
|
|
@@ -390,7 +405,7 @@ EXPORT_SYMBOL(address_space_init_once);
|
|
void inode_init_once(struct inode *inode)
|
|
{
|
|
memset(inode, 0, sizeof(*inode));
|
|
- INIT_HLIST_NODE(&inode->i_hash);
|
|
+ INIT_HLIST_BL_NODE(&inode->i_hash);
|
|
INIT_LIST_HEAD(&inode->i_devices);
|
|
INIT_LIST_HEAD(&inode->i_io_list);
|
|
INIT_LIST_HEAD(&inode->i_wb_list);
|
|
@@ -474,14 +489,15 @@ static inline void inode_sb_list_del(struct inode *inode)
|
|
}
|
|
}
|
|
|
|
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
|
|
+/*
|
|
+ * Ensure that we store the hash head in the inode when we insert the inode into
|
|
+ * the hlist_bl_head...
|
|
+ */
|
|
+static inline void
|
|
+__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b)
|
|
{
|
|
- unsigned long tmp;
|
|
-
|
|
- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
|
|
- L1_CACHE_BYTES;
|
|
- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
|
|
- return tmp & i_hash_mask;
|
|
+ hlist_bl_add_head_rcu(&inode->i_hash, b);
|
|
+ inode->i_hash_head = b;
|
|
}
|
|
|
|
/**
|
|
@@ -494,13 +510,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval)
|
|
*/
|
|
void __insert_inode_hash(struct inode *inode, unsigned long hashval)
|
|
{
|
|
- struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval);
|
|
+ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
|
|
|
|
- spin_lock(&inode_hash_lock);
|
|
+ hlist_bl_lock(b);
|
|
spin_lock(&inode->i_lock);
|
|
- hlist_add_head_rcu(&inode->i_hash, b);
|
|
+ __insert_inode_hash_head(inode, b);
|
|
spin_unlock(&inode->i_lock);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
}
|
|
EXPORT_SYMBOL(__insert_inode_hash);
|
|
|
|
@@ -512,11 +528,44 @@ EXPORT_SYMBOL(__insert_inode_hash);
|
|
*/
|
|
void __remove_inode_hash(struct inode *inode)
|
|
{
|
|
- spin_lock(&inode_hash_lock);
|
|
- spin_lock(&inode->i_lock);
|
|
- hlist_del_init_rcu(&inode->i_hash);
|
|
- spin_unlock(&inode->i_lock);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ struct hlist_bl_head *b = inode->i_hash_head;
|
|
+
|
|
+ /*
|
|
+ * There are some callers that come through here without synchronisation
|
|
+ * and potentially with multiple references to the inode. Hence we have
|
|
+ * to handle the case that we might race with a remove and insert to a
|
|
+ * different list. Coda, in particular, seems to have a userspace API
|
|
+ * that can directly trigger "unhash/rehash to different list" behaviour
|
|
+ * without any serialisation at all.
|
|
+ *
|
|
+ * Hence we have to handle the situation where the inode->i_hash_head
|
|
+ * might point to a different list than what we expect, indicating that
|
|
+ * we raced with another unhash and potentially a new insertion. This
|
|
+ * means we have to retest the head once we have everything locked up
|
|
+ * and loop again if it doesn't match.
|
|
+ */
|
|
+ while (b) {
|
|
+ hlist_bl_lock(b);
|
|
+ spin_lock(&inode->i_lock);
|
|
+ if (b != inode->i_hash_head) {
|
|
+ hlist_bl_unlock(b);
|
|
+ b = inode->i_hash_head;
|
|
+ spin_unlock(&inode->i_lock);
|
|
+ continue;
|
|
+ }
|
|
+ /*
|
|
+ * Need to set the pprev pointer to NULL after list removal so
|
|
+ * that both RCU traversals and hlist_bl_unhashed() work
|
|
+ * correctly at this point.
|
|
+ */
|
|
+ hlist_bl_del_rcu(&inode->i_hash);
|
|
+ inode->i_hash.pprev = NULL;
|
|
+ inode->i_hash_head = NULL;
|
|
+ spin_unlock(&inode->i_lock);
|
|
+ hlist_bl_unlock(b);
|
|
+ break;
|
|
+ }
|
|
+
|
|
}
|
|
EXPORT_SYMBOL(__remove_inode_hash);
|
|
|
|
@@ -806,26 +855,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
|
|
return freed;
|
|
}
|
|
|
|
-static void __wait_on_freeing_inode(struct inode *inode);
|
|
+static void __wait_on_freeing_inode(struct hlist_bl_head *b,
|
|
+ struct inode *inode);
|
|
/*
|
|
* Called with the inode lock held.
|
|
*/
|
|
static struct inode *find_inode(struct super_block *sb,
|
|
- struct hlist_head *head,
|
|
+ struct hlist_bl_head *b,
|
|
int (*test)(struct inode *, void *),
|
|
void *data)
|
|
{
|
|
+ struct hlist_bl_node *node;
|
|
struct inode *inode = NULL;
|
|
|
|
repeat:
|
|
- hlist_for_each_entry(inode, head, i_hash) {
|
|
+ hlist_bl_for_each_entry(inode, node, b, i_hash) {
|
|
if (inode->i_sb != sb)
|
|
continue;
|
|
if (!test(inode, data))
|
|
continue;
|
|
spin_lock(&inode->i_lock);
|
|
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
|
|
- __wait_on_freeing_inode(inode);
|
|
+ __wait_on_freeing_inode(b, inode);
|
|
goto repeat;
|
|
}
|
|
if (unlikely(inode->i_state & I_CREATING)) {
|
|
@@ -844,19 +895,20 @@ static struct inode *find_inode(struct super_block *sb,
|
|
* iget_locked for details.
|
|
*/
|
|
static struct inode *find_inode_fast(struct super_block *sb,
|
|
- struct hlist_head *head, unsigned long ino)
|
|
+ struct hlist_bl_head *b, unsigned long ino)
|
|
{
|
|
+ struct hlist_bl_node *node;
|
|
struct inode *inode = NULL;
|
|
|
|
repeat:
|
|
- hlist_for_each_entry(inode, head, i_hash) {
|
|
+ hlist_bl_for_each_entry(inode, node, b, i_hash) {
|
|
if (inode->i_ino != ino)
|
|
continue;
|
|
if (inode->i_sb != sb)
|
|
continue;
|
|
spin_lock(&inode->i_lock);
|
|
if (inode->i_state & (I_FREEING|I_WILL_FREE)) {
|
|
- __wait_on_freeing_inode(inode);
|
|
+ __wait_on_freeing_inode(b, inode);
|
|
goto repeat;
|
|
}
|
|
if (unlikely(inode->i_state & I_CREATING)) {
|
|
@@ -1065,26 +1117,26 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
|
|
* return it locked, hashed, and with the I_NEW flag set. The file system gets
|
|
* to fill it in before unlocking it via unlock_new_inode().
|
|
*
|
|
- * Note both @test and @set are called with the inode_hash_lock held, so can't
|
|
- * sleep.
|
|
+ * Note both @test and @set are called with the inode hash chain lock held,
|
|
+ * so can't sleep.
|
|
*/
|
|
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
|
|
int (*test)(struct inode *, void *),
|
|
int (*set)(struct inode *, void *), void *data)
|
|
{
|
|
- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
|
|
+ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval);
|
|
struct inode *old;
|
|
bool creating = inode->i_state & I_CREATING;
|
|
|
|
again:
|
|
- spin_lock(&inode_hash_lock);
|
|
- old = find_inode(inode->i_sb, head, test, data);
|
|
+ hlist_bl_lock(b);
|
|
+ old = find_inode(inode->i_sb, b, test, data);
|
|
if (unlikely(old)) {
|
|
/*
|
|
* Uhhuh, somebody else created the same inode under us.
|
|
* Use the old inode instead of the preallocated one.
|
|
*/
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
if (IS_ERR(old))
|
|
return NULL;
|
|
wait_on_inode(old);
|
|
@@ -1106,12 +1158,12 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
|
|
*/
|
|
spin_lock(&inode->i_lock);
|
|
inode->i_state |= I_NEW;
|
|
- hlist_add_head_rcu(&inode->i_hash, head);
|
|
+ __insert_inode_hash_head(inode, b);
|
|
spin_unlock(&inode->i_lock);
|
|
if (!creating)
|
|
inode_sb_list_add(inode);
|
|
unlock:
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
|
|
return inode;
|
|
}
|
|
@@ -1172,12 +1224,12 @@ EXPORT_SYMBOL(iget5_locked);
|
|
*/
|
|
struct inode *iget_locked(struct super_block *sb, unsigned long ino)
|
|
{
|
|
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
|
|
struct inode *inode;
|
|
again:
|
|
- spin_lock(&inode_hash_lock);
|
|
- inode = find_inode_fast(sb, head, ino);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_lock(b);
|
|
+ inode = find_inode_fast(sb, b, ino);
|
|
+ hlist_bl_unlock(b);
|
|
if (inode) {
|
|
if (IS_ERR(inode))
|
|
return NULL;
|
|
@@ -1193,17 +1245,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
|
|
if (inode) {
|
|
struct inode *old;
|
|
|
|
- spin_lock(&inode_hash_lock);
|
|
+ hlist_bl_lock(b);
|
|
/* We released the lock, so.. */
|
|
- old = find_inode_fast(sb, head, ino);
|
|
+ old = find_inode_fast(sb, b, ino);
|
|
if (!old) {
|
|
inode->i_ino = ino;
|
|
spin_lock(&inode->i_lock);
|
|
inode->i_state = I_NEW;
|
|
- hlist_add_head_rcu(&inode->i_hash, head);
|
|
+ __insert_inode_hash_head(inode, b);
|
|
spin_unlock(&inode->i_lock);
|
|
inode_sb_list_add(inode);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
|
|
/* Return the locked inode with I_NEW set, the
|
|
* caller is responsible for filling in the contents
|
|
@@ -1216,7 +1268,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
|
|
* us. Use the old inode instead of the one we just
|
|
* allocated.
|
|
*/
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
destroy_inode(inode);
|
|
if (IS_ERR(old))
|
|
return NULL;
|
|
@@ -1240,10 +1292,11 @@ EXPORT_SYMBOL(iget_locked);
|
|
*/
|
|
static int test_inode_iunique(struct super_block *sb, unsigned long ino)
|
|
{
|
|
- struct hlist_head *b = inode_hashtable + hash(sb, ino);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
|
|
+ struct hlist_bl_node *node;
|
|
struct inode *inode;
|
|
|
|
- hlist_for_each_entry_rcu(inode, b, i_hash) {
|
|
+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
|
|
if (inode->i_ino == ino && inode->i_sb == sb)
|
|
return 0;
|
|
}
|
|
@@ -1327,12 +1380,12 @@ EXPORT_SYMBOL(igrab);
|
|
struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
|
|
int (*test)(struct inode *, void *), void *data)
|
|
{
|
|
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, hashval);
|
|
struct inode *inode;
|
|
|
|
- spin_lock(&inode_hash_lock);
|
|
- inode = find_inode(sb, head, test, data);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_lock(b);
|
|
+ inode = find_inode(sb, b, test, data);
|
|
+ hlist_bl_unlock(b);
|
|
|
|
return IS_ERR(inode) ? NULL : inode;
|
|
}
|
|
@@ -1382,12 +1435,12 @@ EXPORT_SYMBOL(ilookup5);
|
|
*/
|
|
struct inode *ilookup(struct super_block *sb, unsigned long ino)
|
|
{
|
|
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
|
|
struct inode *inode;
|
|
again:
|
|
- spin_lock(&inode_hash_lock);
|
|
- inode = find_inode_fast(sb, head, ino);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_lock(b);
|
|
+ inode = find_inode_fast(sb, b, ino);
|
|
+ hlist_bl_unlock(b);
|
|
|
|
if (inode) {
|
|
if (IS_ERR(inode))
|
|
@@ -1431,12 +1484,13 @@ struct inode *find_inode_nowait(struct super_block *sb,
|
|
void *),
|
|
void *data)
|
|
{
|
|
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, hashval);
|
|
+ struct hlist_bl_node *node;
|
|
struct inode *inode, *ret_inode = NULL;
|
|
int mval;
|
|
|
|
- spin_lock(&inode_hash_lock);
|
|
- hlist_for_each_entry(inode, head, i_hash) {
|
|
+ hlist_bl_lock(b);
|
|
+ hlist_bl_for_each_entry(inode, node, b, i_hash) {
|
|
if (inode->i_sb != sb)
|
|
continue;
|
|
mval = match(inode, hashval, data);
|
|
@@ -1447,7 +1501,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
|
|
goto out;
|
|
}
|
|
out:
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
return ret_inode;
|
|
}
|
|
EXPORT_SYMBOL(find_inode_nowait);
|
|
@@ -1476,13 +1530,14 @@ EXPORT_SYMBOL(find_inode_nowait);
|
|
struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
|
|
int (*test)(struct inode *, void *), void *data)
|
|
{
|
|
- struct hlist_head *head = inode_hashtable + hash(sb, hashval);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, hashval);
|
|
+ struct hlist_bl_node *node;
|
|
struct inode *inode;
|
|
|
|
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
|
|
"suspicious find_inode_rcu() usage");
|
|
|
|
- hlist_for_each_entry_rcu(inode, head, i_hash) {
|
|
+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
|
|
if (inode->i_sb == sb &&
|
|
!(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) &&
|
|
test(inode, data))
|
|
@@ -1514,13 +1569,14 @@ EXPORT_SYMBOL(find_inode_rcu);
|
|
struct inode *find_inode_by_ino_rcu(struct super_block *sb,
|
|
unsigned long ino)
|
|
{
|
|
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
|
|
+ struct hlist_bl_node *node;
|
|
struct inode *inode;
|
|
|
|
RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
|
|
"suspicious find_inode_by_ino_rcu() usage");
|
|
|
|
- hlist_for_each_entry_rcu(inode, head, i_hash) {
|
|
+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) {
|
|
if (inode->i_ino == ino &&
|
|
inode->i_sb == sb &&
|
|
!(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)))
|
|
@@ -1534,39 +1590,42 @@ int insert_inode_locked(struct inode *inode)
|
|
{
|
|
struct super_block *sb = inode->i_sb;
|
|
ino_t ino = inode->i_ino;
|
|
- struct hlist_head *head = inode_hashtable + hash(sb, ino);
|
|
+ struct hlist_bl_head *b = i_hash_head(sb, ino);
|
|
|
|
while (1) {
|
|
- struct inode *old = NULL;
|
|
- spin_lock(&inode_hash_lock);
|
|
- hlist_for_each_entry(old, head, i_hash) {
|
|
- if (old->i_ino != ino)
|
|
+ struct hlist_bl_node *node;
|
|
+ struct inode *old = NULL, *t;
|
|
+
|
|
+ hlist_bl_lock(b);
|
|
+ hlist_bl_for_each_entry(t, node, b, i_hash) {
|
|
+ if (t->i_ino != ino)
|
|
continue;
|
|
- if (old->i_sb != sb)
|
|
+ if (t->i_sb != sb)
|
|
continue;
|
|
- spin_lock(&old->i_lock);
|
|
- if (old->i_state & (I_FREEING|I_WILL_FREE)) {
|
|
- spin_unlock(&old->i_lock);
|
|
+ spin_lock(&t->i_lock);
|
|
+ if (t->i_state & (I_FREEING|I_WILL_FREE)) {
|
|
+ spin_unlock(&t->i_lock);
|
|
continue;
|
|
}
|
|
+ old = t;
|
|
break;
|
|
}
|
|
if (likely(!old)) {
|
|
spin_lock(&inode->i_lock);
|
|
inode->i_state |= I_NEW | I_CREATING;
|
|
- hlist_add_head_rcu(&inode->i_hash, head);
|
|
+ __insert_inode_hash_head(inode, b);
|
|
spin_unlock(&inode->i_lock);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
return 0;
|
|
}
|
|
if (unlikely(old->i_state & I_CREATING)) {
|
|
spin_unlock(&old->i_lock);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
return -EBUSY;
|
|
}
|
|
__iget(old);
|
|
spin_unlock(&old->i_lock);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
wait_on_inode(old);
|
|
if (unlikely(!inode_unhashed(old))) {
|
|
iput(old);
|
|
@@ -2036,17 +2095,18 @@ EXPORT_SYMBOL(inode_needs_sync);
|
|
* wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list
|
|
* will DTRT.
|
|
*/
|
|
-static void __wait_on_freeing_inode(struct inode *inode)
|
|
+static void __wait_on_freeing_inode(struct hlist_bl_head *b,
|
|
+ struct inode *inode)
|
|
{
|
|
wait_queue_head_t *wq;
|
|
DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
|
|
wq = bit_waitqueue(&inode->i_state, __I_NEW);
|
|
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
|
|
spin_unlock(&inode->i_lock);
|
|
- spin_unlock(&inode_hash_lock);
|
|
+ hlist_bl_unlock(b);
|
|
schedule();
|
|
finish_wait(wq, &wait.wq_entry);
|
|
- spin_lock(&inode_hash_lock);
|
|
+ hlist_bl_lock(b);
|
|
}
|
|
|
|
static __initdata unsigned long ihash_entries;
|
|
@@ -2072,7 +2132,7 @@ void __init inode_init_early(void)
|
|
|
|
inode_hashtable =
|
|
alloc_large_system_hash("Inode-cache",
|
|
- sizeof(struct hlist_head),
|
|
+ sizeof(struct hlist_bl_head),
|
|
ihash_entries,
|
|
14,
|
|
HASH_EARLY | HASH_ZERO,
|
|
@@ -2098,7 +2158,7 @@ void __init inode_init(void)
|
|
|
|
inode_hashtable =
|
|
alloc_large_system_hash("Inode-cache",
|
|
- sizeof(struct hlist_head),
|
|
+ sizeof(struct hlist_bl_head),
|
|
ihash_entries,
|
|
14,
|
|
HASH_ZERO,
|
|
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
|
|
index f94ee3089e01..c89beffb3198 100644
|
|
--- a/include/linux/blkdev.h
|
|
+++ b/include/linux/blkdev.h
|
|
@@ -958,6 +958,7 @@ extern const char *blk_op_str(unsigned int op);
|
|
|
|
int blk_status_to_errno(blk_status_t status);
|
|
blk_status_t errno_to_blk_status(int errno);
|
|
+const char *blk_status_to_str(blk_status_t status);
|
|
|
|
int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
|
|
|
|
diff --git a/include/linux/closure.h b/include/linux/closure.h
|
|
new file mode 100644
|
|
index 000000000000..36b4a83f9b77
|
|
--- /dev/null
|
|
+++ b/include/linux/closure.h
|
|
@@ -0,0 +1,399 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#ifndef _LINUX_CLOSURE_H
|
|
+#define _LINUX_CLOSURE_H
|
|
+
|
|
+#include <linux/llist.h>
|
|
+#include <linux/sched.h>
|
|
+#include <linux/sched/task_stack.h>
|
|
+#include <linux/workqueue.h>
|
|
+
|
|
+/*
|
|
+ * Closure is perhaps the most overused and abused term in computer science, but
|
|
+ * since I've been unable to come up with anything better you're stuck with it
|
|
+ * again.
|
|
+ *
|
|
+ * What are closures?
|
|
+ *
|
|
+ * They embed a refcount. The basic idea is they count "things that are in
|
|
+ * progress" - in flight bios, some other thread that's doing something else -
|
|
+ * anything you might want to wait on.
|
|
+ *
|
|
+ * The refcount may be manipulated with closure_get() and closure_put().
|
|
+ * closure_put() is where many of the interesting things happen, when it causes
|
|
+ * the refcount to go to 0.
|
|
+ *
|
|
+ * Closures can be used to wait on things both synchronously and asynchronously,
|
|
+ * and synchronous and asynchronous use can be mixed without restriction. To
|
|
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
|
|
+ * refcount hits 1.
|
|
+ *
|
|
+ * To wait asynchronously, use
|
|
+ * continue_at(cl, next_function, workqueue);
|
|
+ *
|
|
+ * passing it, as you might expect, the function to run when nothing is pending
|
|
+ * and the workqueue to run that function out of.
|
|
+ *
|
|
+ * continue_at() also, critically, requires a 'return' immediately following the
|
|
+ * location where this macro is referenced, to return to the calling function.
|
|
+ * There's good reason for this.
|
|
+ *
|
|
+ * To safely use closures asynchronously, they must always have a refcount while
|
|
+ * they are running owned by the thread that is running them. Otherwise, suppose
|
|
+ * you submit some bios and wish to have a function run when they all complete:
|
|
+ *
|
|
+ * foo_endio(struct bio *bio)
|
|
+ * {
|
|
+ * closure_put(cl);
|
|
+ * }
|
|
+ *
|
|
+ * closure_init(cl);
|
|
+ *
|
|
+ * do_stuff();
|
|
+ * closure_get(cl);
|
|
+ * bio1->bi_endio = foo_endio;
|
|
+ * bio_submit(bio1);
|
|
+ *
|
|
+ * do_more_stuff();
|
|
+ * closure_get(cl);
|
|
+ * bio2->bi_endio = foo_endio;
|
|
+ * bio_submit(bio2);
|
|
+ *
|
|
+ * continue_at(cl, complete_some_read, system_wq);
|
|
+ *
|
|
+ * If closure's refcount started at 0, complete_some_read() could run before the
|
|
+ * second bio was submitted - which is almost always not what you want! More
|
|
+ * importantly, it wouldn't be possible to say whether the original thread or
|
|
+ * complete_some_read()'s thread owned the closure - and whatever state it was
|
|
+ * associated with!
|
|
+ *
|
|
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
|
|
+ * closure_fn is run, the refcount will be reset to 1 first.
|
|
+ *
|
|
+ * Then, the rule is - if you got the refcount with closure_get(), release it
|
|
+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount
|
|
+ * on a closure because you called closure_init() or you were run out of a
|
|
+ * closure - _always_ use continue_at(). Doing so consistently will help
|
|
+ * eliminate an entire class of particularly pernicious races.
|
|
+ *
|
|
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
|
|
+ * need for specifying the condition - you just want to wait until someone runs
|
|
+ * closure_wake_up() on the appropriate wait list. In that case, just use
|
|
+ * closure_wait(). It will return either true or false, depending on whether the
|
|
+ * closure was already on a wait list or not - a closure can only be on one wait
|
|
+ * list at a time.
|
|
+ *
|
|
+ * Parents:
|
|
+ *
|
|
+ * closure_init() takes two arguments - it takes the closure to initialize, and
|
|
+ * a (possibly null) parent.
|
|
+ *
|
|
+ * If parent is non null, the new closure will have a refcount for its lifetime;
|
|
+ * a closure is considered to be "finished" when its refcount hits 0 and the
|
|
+ * function to run is null. Hence
|
|
+ *
|
|
+ * continue_at(cl, NULL, NULL);
|
|
+ *
|
|
+ * returns up the (spaghetti) stack of closures, precisely like normal return
|
|
+ * returns up the C stack. continue_at() with non null fn is better thought of
|
|
+ * as doing a tail call.
|
|
+ *
|
|
+ * All this implies that a closure should typically be embedded in a particular
|
|
+ * struct (which its refcount will normally control the lifetime of), and that
|
|
+ * struct can very much be thought of as a stack frame.
|
|
+ */
|
|
+
|
|
+struct closure;
|
|
+struct closure_syncer;
|
|
+typedef void (closure_fn) (struct closure *);
|
|
+extern struct dentry *bcache_debug;
|
|
+
|
|
+struct closure_waitlist {
|
|
+ struct llist_head list;
|
|
+};
|
|
+
|
|
+enum closure_state {
|
|
+ /*
|
|
+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
|
|
+ * the thread that owns the closure, and cleared by the thread that's
|
|
+ * waking up the closure.
|
|
+ *
|
|
+ * The rest are for debugging and don't affect behaviour:
|
|
+ *
|
|
+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by
|
|
+ * closure_init() and when closure_put() runs the next function), and
|
|
+ * must be cleared before remaining hits 0. Primarily to help guard
|
|
+ * against incorrect usage and accidentally transferring references.
|
|
+ * continue_at() and closure_return() clear it for you, if you're doing
|
|
+ * something unusual you can use closure_set_dead() which also helps
|
|
+ * annotate where references are being transferred.
|
|
+ */
|
|
+
|
|
+ CLOSURE_BITS_START = (1U << 26),
|
|
+ CLOSURE_DESTRUCTOR = (1U << 26),
|
|
+ CLOSURE_WAITING = (1U << 28),
|
|
+ CLOSURE_RUNNING = (1U << 30),
|
|
+};
|
|
+
|
|
+#define CLOSURE_GUARD_MASK \
|
|
+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
|
|
+
|
|
+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
|
|
+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
|
|
+
|
|
+struct closure {
|
|
+ union {
|
|
+ struct {
|
|
+ struct workqueue_struct *wq;
|
|
+ struct closure_syncer *s;
|
|
+ struct llist_node list;
|
|
+ closure_fn *fn;
|
|
+ };
|
|
+ struct work_struct work;
|
|
+ };
|
|
+
|
|
+ struct closure *parent;
|
|
+
|
|
+ atomic_t remaining;
|
|
+
|
|
+#ifdef CONFIG_DEBUG_CLOSURES
|
|
+#define CLOSURE_MAGIC_DEAD 0xc054dead
|
|
+#define CLOSURE_MAGIC_ALIVE 0xc054a11e
|
|
+
|
|
+ unsigned int magic;
|
|
+ struct list_head all;
|
|
+ unsigned long ip;
|
|
+ unsigned long waiting_on;
|
|
+#endif
|
|
+};
|
|
+
|
|
+void closure_sub(struct closure *cl, int v);
|
|
+void closure_put(struct closure *cl);
|
|
+void __closure_wake_up(struct closure_waitlist *list);
|
|
+bool closure_wait(struct closure_waitlist *list, struct closure *cl);
|
|
+void __closure_sync(struct closure *cl);
|
|
+
|
|
+/**
|
|
+ * closure_sync - sleep until a closure has nothing left to wait on
|
|
+ *
|
|
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
|
|
+ * the last refcount.
|
|
+ */
|
|
+static inline void closure_sync(struct closure *cl)
|
|
+{
|
|
+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
|
|
+ __closure_sync(cl);
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_DEBUG_CLOSURES
|
|
+
|
|
+void closure_debug_create(struct closure *cl);
|
|
+void closure_debug_destroy(struct closure *cl);
|
|
+
|
|
+#else
|
|
+
|
|
+static inline void closure_debug_create(struct closure *cl) {}
|
|
+static inline void closure_debug_destroy(struct closure *cl) {}
|
|
+
|
|
+#endif
|
|
+
|
|
+static inline void closure_set_ip(struct closure *cl)
|
|
+{
|
|
+#ifdef CONFIG_DEBUG_CLOSURES
|
|
+ cl->ip = _THIS_IP_;
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline void closure_set_ret_ip(struct closure *cl)
|
|
+{
|
|
+#ifdef CONFIG_DEBUG_CLOSURES
|
|
+ cl->ip = _RET_IP_;
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline void closure_set_waiting(struct closure *cl, unsigned long f)
|
|
+{
|
|
+#ifdef CONFIG_DEBUG_CLOSURES
|
|
+ cl->waiting_on = f;
|
|
+#endif
|
|
+}
|
|
+
|
|
+static inline void closure_set_stopped(struct closure *cl)
|
|
+{
|
|
+ atomic_sub(CLOSURE_RUNNING, &cl->remaining);
|
|
+}
|
|
+
|
|
+static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
|
|
+ struct workqueue_struct *wq)
|
|
+{
|
|
+ closure_set_ip(cl);
|
|
+ cl->fn = fn;
|
|
+ cl->wq = wq;
|
|
+ /* between atomic_dec() in closure_put() */
|
|
+ smp_mb__before_atomic();
|
|
+}
|
|
+
|
|
+static inline void closure_queue(struct closure *cl)
|
|
+{
|
|
+ struct workqueue_struct *wq = cl->wq;
|
|
+ /**
|
|
+ * Changes made to closure, work_struct, or a couple of other structs
|
|
+ * may cause work.func not pointing to the right location.
|
|
+ */
|
|
+ BUILD_BUG_ON(offsetof(struct closure, fn)
|
|
+ != offsetof(struct work_struct, func));
|
|
+
|
|
+ if (wq) {
|
|
+ INIT_WORK(&cl->work, cl->work.func);
|
|
+ BUG_ON(!queue_work(wq, &cl->work));
|
|
+ } else
|
|
+ cl->fn(cl);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * closure_get - increment a closure's refcount
|
|
+ */
|
|
+static inline void closure_get(struct closure *cl)
|
|
+{
|
|
+#ifdef CONFIG_DEBUG_CLOSURES
|
|
+ BUG_ON((atomic_inc_return(&cl->remaining) &
|
|
+ CLOSURE_REMAINING_MASK) <= 1);
|
|
+#else
|
|
+ atomic_inc(&cl->remaining);
|
|
+#endif
|
|
+}
|
|
+
|
|
+/**
|
|
+ * closure_init - Initialize a closure, setting the refcount to 1
|
|
+ * @cl: closure to initialize
|
|
+ * @parent: parent of the new closure. cl will take a refcount on it for its
|
|
+ * lifetime; may be NULL.
|
|
+ */
|
|
+static inline void closure_init(struct closure *cl, struct closure *parent)
|
|
+{
|
|
+ cl->fn = NULL;
|
|
+ cl->parent = parent;
|
|
+ if (parent)
|
|
+ closure_get(parent);
|
|
+
|
|
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
|
|
+
|
|
+ closure_debug_create(cl);
|
|
+ closure_set_ip(cl);
|
|
+}
|
|
+
|
|
+static inline void closure_init_stack(struct closure *cl)
|
|
+{
|
|
+ memset(cl, 0, sizeof(struct closure));
|
|
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * closure_wake_up - wake up all closures on a wait list,
|
|
+ * with memory barrier
|
|
+ */
|
|
+static inline void closure_wake_up(struct closure_waitlist *list)
|
|
+{
|
|
+ /* Memory barrier for the wait list */
|
|
+ smp_mb();
|
|
+ __closure_wake_up(list);
|
|
+}
|
|
+
|
|
+/**
|
|
+ * continue_at - jump to another function with barrier
|
|
+ *
|
|
+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have
|
|
+ * been dropped with closure_put()), it will resume execution at @fn running out
|
|
+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
|
|
+ *
|
|
+ * This is because after calling continue_at() you no longer have a ref on @cl,
|
|
+ * and whatever @cl owns may be freed out from under you - a running closure fn
|
|
+ * has a ref on its own closure which continue_at() drops.
|
|
+ *
|
|
+ * Note you are expected to immediately return after using this macro.
|
|
+ */
|
|
+#define continue_at(_cl, _fn, _wq) \
|
|
+do { \
|
|
+ set_closure_fn(_cl, _fn, _wq); \
|
|
+ closure_sub(_cl, CLOSURE_RUNNING + 1); \
|
|
+} while (0)
|
|
+
|
|
+/**
|
|
+ * closure_return - finish execution of a closure
|
|
+ *
|
|
+ * This is used to indicate that @cl is finished: when all outstanding refs on
|
|
+ * @cl have been dropped @cl's ref on its parent closure (as passed to
|
|
+ * closure_init()) will be dropped, if one was specified - thus this can be
|
|
+ * thought of as returning to the parent closure.
|
|
+ */
|
|
+#define closure_return(_cl) continue_at((_cl), NULL, NULL)
|
|
+
|
|
+/**
|
|
+ * continue_at_nobarrier - jump to another function without barrier
|
|
+ *
|
|
+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if
|
|
+ * @wq is NULL).
|
|
+ *
|
|
+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
|
|
+ * thus it's not safe to touch anything protected by @cl after a
|
|
+ * continue_at_nobarrier().
|
|
+ */
|
|
+#define continue_at_nobarrier(_cl, _fn, _wq) \
|
|
+do { \
|
|
+ set_closure_fn(_cl, _fn, _wq); \
|
|
+ closure_queue(_cl); \
|
|
+} while (0)
|
|
+
|
|
+/**
|
|
+ * closure_return_with_destructor - finish execution of a closure,
|
|
+ * with destructor
|
|
+ *
|
|
+ * Works like closure_return(), except @destructor will be called when all
|
|
+ * outstanding refs on @cl have been dropped; @destructor may be used to safely
|
|
+ * free the memory occupied by @cl, and it is called with the ref on the parent
|
|
+ * closure still held - so @destructor could safely return an item to a
|
|
+ * freelist protected by @cl's parent.
|
|
+ */
|
|
+#define closure_return_with_destructor(_cl, _destructor) \
|
|
+do { \
|
|
+ set_closure_fn(_cl, _destructor, NULL); \
|
|
+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \
|
|
+} while (0)
|
|
+
|
|
+/**
|
|
+ * closure_call - execute @fn out of a new, uninitialized closure
|
|
+ *
|
|
+ * Typically used when running out of one closure, and we want to run @fn
|
|
+ * asynchronously out of a new closure - @parent will then wait for @cl to
|
|
+ * finish.
|
|
+ */
|
|
+static inline void closure_call(struct closure *cl, closure_fn fn,
|
|
+ struct workqueue_struct *wq,
|
|
+ struct closure *parent)
|
|
+{
|
|
+ closure_init(cl, parent);
|
|
+ continue_at_nobarrier(cl, fn, wq);
|
|
+}
|
|
+
|
|
+#define __closure_wait_event(waitlist, _cond) \
|
|
+do { \
|
|
+ struct closure cl; \
|
|
+ \
|
|
+ closure_init_stack(&cl); \
|
|
+ \
|
|
+ while (1) { \
|
|
+ closure_wait(waitlist, &cl); \
|
|
+ if (_cond) \
|
|
+ break; \
|
|
+ closure_sync(&cl); \
|
|
+ } \
|
|
+ closure_wake_up(waitlist); \
|
|
+ closure_sync(&cl); \
|
|
+} while (0)
|
|
+
|
|
+#define closure_wait_event(waitlist, _cond) \
|
|
+do { \
|
|
+ if (!(_cond)) \
|
|
+ __closure_wait_event(waitlist, _cond); \
|
|
+} while (0)
|
|
+
|
|
+#endif /* _LINUX_CLOSURE_H */
|
|
diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h
|
|
index ea5e04e75845..911647a0431b 100644
|
|
--- a/include/linux/compiler_attributes.h
|
|
+++ b/include/linux/compiler_attributes.h
|
|
@@ -284,4 +284,9 @@
|
|
*/
|
|
#define __weak __attribute__((__weak__))
|
|
|
|
+/*
|
|
+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute
|
|
+ */
|
|
+#define __flatten __attribute__((flatten))
|
|
+
|
|
#endif /* __LINUX_COMPILER_ATTRIBUTES_H */
|
|
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
|
|
index d7b369fc15d3..ad99f07e634f 100644
|
|
--- a/include/linux/dcache.h
|
|
+++ b/include/linux/dcache.h
|
|
@@ -257,6 +257,7 @@ extern struct dentry * d_make_root(struct inode *);
|
|
/* <clickety>-<click> the ramfs-type tree */
|
|
extern void d_genocide(struct dentry *);
|
|
|
|
+extern void d_mark_tmpfile(struct dentry *, struct inode *);
|
|
extern void d_tmpfile(struct dentry *, struct inode *);
|
|
|
|
extern struct dentry *d_find_alias(struct inode *);
|
|
diff --git a/include/linux/fs.h b/include/linux/fs.h
|
|
index fd47deea7c17..37c330b9768e 100644
|
|
--- a/include/linux/fs.h
|
|
+++ b/include/linux/fs.h
|
|
@@ -662,7 +662,8 @@ struct inode {
|
|
unsigned long dirtied_when; /* jiffies of first dirtying */
|
|
unsigned long dirtied_time_when;
|
|
|
|
- struct hlist_node i_hash;
|
|
+ struct hlist_bl_node i_hash;
|
|
+ struct hlist_bl_head *i_hash_head;
|
|
struct list_head i_io_list; /* backing dev IO list */
|
|
#ifdef CONFIG_CGROUP_WRITEBACK
|
|
struct bdi_writeback *i_wb; /* the associated cgroup wb */
|
|
@@ -728,7 +729,7 @@ static inline unsigned int i_blocksize(const struct inode *node)
|
|
|
|
static inline int inode_unhashed(struct inode *inode)
|
|
{
|
|
- return hlist_unhashed(&inode->i_hash);
|
|
+ return hlist_bl_unhashed(&inode->i_hash);
|
|
}
|
|
|
|
/*
|
|
@@ -739,7 +740,7 @@ static inline int inode_unhashed(struct inode *inode)
|
|
*/
|
|
static inline void inode_fake_hash(struct inode *inode)
|
|
{
|
|
- hlist_add_fake(&inode->i_hash);
|
|
+ hlist_bl_add_fake(&inode->i_hash);
|
|
}
|
|
|
|
/*
|
|
@@ -2947,7 +2948,7 @@ static inline void insert_inode_hash(struct inode *inode)
|
|
extern void __remove_inode_hash(struct inode *);
|
|
static inline void remove_inode_hash(struct inode *inode)
|
|
{
|
|
- if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash))
|
|
+ if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash))
|
|
__remove_inode_hash(inode);
|
|
}
|
|
|
|
diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h
|
|
index bfd00320c7f3..0af6ca0e3b2e 100644
|
|
--- a/include/linux/generic-radix-tree.h
|
|
+++ b/include/linux/generic-radix-tree.h
|
|
@@ -183,6 +183,12 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t);
|
|
static inline void __genradix_iter_advance(struct genradix_iter *iter,
|
|
size_t obj_size)
|
|
{
|
|
+ if (iter->offset + obj_size < iter->offset) {
|
|
+ iter->offset = SIZE_MAX;
|
|
+ iter->pos = SIZE_MAX;
|
|
+ return;
|
|
+ }
|
|
+
|
|
iter->offset += obj_size;
|
|
|
|
if (!is_power_of_2(obj_size) &&
|
|
diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
|
|
index ae1b541446c9..8ee2bf5af131 100644
|
|
--- a/include/linux/list_bl.h
|
|
+++ b/include/linux/list_bl.h
|
|
@@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n)
|
|
}
|
|
}
|
|
|
|
+/**
|
|
+ * hlist_bl_add_fake - create a fake list consisting of a single headless node
|
|
+ * @n: Node to make a fake list out of
|
|
+ *
|
|
+ * This makes @n appear to be its own predecessor on a headless hlist.
|
|
+ * The point of this is to allow things like hlist_bl_del() to work correctly
|
|
+ * in cases where there is no list.
|
|
+ */
|
|
+static inline void hlist_bl_add_fake(struct hlist_bl_node *n)
|
|
+{
|
|
+ n->pprev = &n->next;
|
|
+}
|
|
+
|
|
+/**
|
|
+ * hlist_bl_fake - Is this node a fake hlist_bl?
|
|
+ * @n: Node to check for being a self-referential fake hlist_bl.
|
|
+ */
|
|
+static inline bool hlist_bl_fake(struct hlist_bl_node *n)
|
|
+{
|
|
+ return n->pprev == &n->next;
|
|
+}
|
|
+
|
|
static inline void hlist_bl_lock(struct hlist_bl_head *b)
|
|
{
|
|
bit_spin_lock(0, (unsigned long *)b);
|
|
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
|
|
index fd02c5fa60cb..6409e0701b35 100644
|
|
--- a/include/linux/rcupdate.h
|
|
+++ b/include/linux/rcupdate.h
|
|
@@ -33,6 +33,8 @@
|
|
#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
|
|
#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
|
|
#define ulong2long(a) (*(long *)(&(a)))
|
|
+#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b)))
|
|
+#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b)))
|
|
|
|
/* Exported common interfaces */
|
|
void call_rcu(struct rcu_head *head, rcu_callback_t func);
|
|
diff --git a/include/linux/sched.h b/include/linux/sched.h
|
|
index 6e3a5eeec509..3053d660dbb3 100644
|
|
--- a/include/linux/sched.h
|
|
+++ b/include/linux/sched.h
|
|
@@ -770,6 +770,7 @@ struct task_struct {
|
|
|
|
struct mm_struct *mm;
|
|
struct mm_struct *active_mm;
|
|
+ struct address_space *faults_disabled_mapping;
|
|
|
|
/* Per-thread vma caching: */
|
|
struct vmacache vmacache;
|
|
diff --git a/include/linux/six.h b/include/linux/six.h
|
|
new file mode 100644
|
|
index 000000000000..477c33eb00d7
|
|
--- /dev/null
|
|
+++ b/include/linux/six.h
|
|
@@ -0,0 +1,203 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+
|
|
+#ifndef _LINUX_SIX_H
|
|
+#define _LINUX_SIX_H
|
|
+
|
|
+/*
|
|
+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
|
|
+ * semaphores, except with a third intermediate state, intent. Basic operations
|
|
+ * are:
|
|
+ *
|
|
+ * six_lock_read(&foo->lock);
|
|
+ * six_unlock_read(&foo->lock);
|
|
+ *
|
|
+ * six_lock_intent(&foo->lock);
|
|
+ * six_unlock_intent(&foo->lock);
|
|
+ *
|
|
+ * six_lock_write(&foo->lock);
|
|
+ * six_unlock_write(&foo->lock);
|
|
+ *
|
|
+ * Intent locks block other intent locks, but do not block read locks, and you
|
|
+ * must have an intent lock held before taking a write lock, like so:
|
|
+ *
|
|
+ * six_lock_intent(&foo->lock);
|
|
+ * six_lock_write(&foo->lock);
|
|
+ * six_unlock_write(&foo->lock);
|
|
+ * six_unlock_intent(&foo->lock);
|
|
+ *
|
|
+ * Other operations:
|
|
+ *
|
|
+ * six_trylock_read()
|
|
+ * six_trylock_intent()
|
|
+ * six_trylock_write()
|
|
+ *
|
|
+ * six_lock_downgrade(): convert from intent to read
|
|
+ * six_lock_tryupgrade(): attempt to convert from read to intent
|
|
+ *
|
|
+ * Locks also embed a sequence number, which is incremented when the lock is
|
|
+ * locked or unlocked for write. The current sequence number can be grabbed
|
|
+ * while a lock is held from lock->state.seq; then, if you drop the lock you can
|
|
+ * use six_relock_(read|intent|write)(lock, seq) to attempt to retake the lock
|
|
+ * iff it hasn't been locked for write in the meantime.
|
|
+ *
|
|
+ * There are also operations that take the lock type as a parameter, where the
|
|
+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
|
|
+ *
|
|
+ * six_lock_type(lock, type)
|
|
+ * six_unlock_type(lock, type)
|
|
+ * six_relock(lock, type, seq)
|
|
+ * six_trylock_type(lock, type)
|
|
+ * six_trylock_convert(lock, from, to)
|
|
+ *
|
|
+ * A lock may be held multiple times by the same thread (for read or intent,
|
|
+ * not write). However, the six locks code does _not_ implement the actual
|
|
+ * recursive checks itself - rather, if your code (e.g. btree iterator
|
|
+ * code) knows that the current thread already has a lock held, and for the
|
|
+ * correct type, six_lock_increment() may be used to bump up the counter for
|
|
+ * that type - the only effect is that one more call to unlock will be required
|
|
+ * before the lock is unlocked.
|
|
+ */
|
|
+
|
|
+#include <linux/lockdep.h>
|
|
+#include <linux/osq_lock.h>
|
|
+#include <linux/sched.h>
|
|
+#include <linux/types.h>
|
|
+
|
|
+#define SIX_LOCK_SEPARATE_LOCKFNS
|
|
+
|
|
+union six_lock_state {
|
|
+ struct {
|
|
+ atomic64_t counter;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ u64 v;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ /* for waitlist_bitnr() */
|
|
+ unsigned long l;
|
|
+ };
|
|
+
|
|
+ struct {
|
|
+ unsigned read_lock:27;
|
|
+ unsigned write_locking:1;
|
|
+ unsigned intent_lock:1;
|
|
+ unsigned waiters:3;
|
|
+ /*
|
|
+ * seq works much like in seqlocks: it's incremented every time
|
|
+ * we lock and unlock for write.
|
|
+ *
|
|
+ * If it's odd write lock is held, even unlocked.
|
|
+ *
|
|
+ * Thus readers can unlock, and then lock again later iff it
|
|
+ * hasn't been modified in the meantime.
|
|
+ */
|
|
+ u32 seq;
|
|
+ };
|
|
+};
|
|
+
|
|
+enum six_lock_type {
|
|
+ SIX_LOCK_read,
|
|
+ SIX_LOCK_intent,
|
|
+ SIX_LOCK_write,
|
|
+};
|
|
+
|
|
+struct six_lock {
|
|
+ union six_lock_state state;
|
|
+ unsigned intent_lock_recurse;
|
|
+ struct task_struct *owner;
|
|
+ struct optimistic_spin_queue osq;
|
|
+ unsigned __percpu *readers;
|
|
+
|
|
+ raw_spinlock_t wait_lock;
|
|
+ struct list_head wait_list[2];
|
|
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
|
+ struct lockdep_map dep_map;
|
|
+#endif
|
|
+};
|
|
+
|
|
+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *);
|
|
+
|
|
+static __always_inline void __six_lock_init(struct six_lock *lock,
|
|
+ const char *name,
|
|
+ struct lock_class_key *key)
|
|
+{
|
|
+ atomic64_set(&lock->state.counter, 0);
|
|
+ raw_spin_lock_init(&lock->wait_lock);
|
|
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
|
|
+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
|
|
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
|
+ debug_check_no_locks_freed((void *) lock, sizeof(*lock));
|
|
+ lockdep_init_map(&lock->dep_map, name, key, 0);
|
|
+#endif
|
|
+}
|
|
+
|
|
+#define six_lock_init(lock) \
|
|
+do { \
|
|
+ static struct lock_class_key __key; \
|
|
+ \
|
|
+ __six_lock_init((lock), #lock, &__key); \
|
|
+} while (0)
|
|
+
|
|
+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v)
|
|
+
|
|
+#define __SIX_LOCK(type) \
|
|
+bool six_trylock_##type(struct six_lock *); \
|
|
+bool six_relock_##type(struct six_lock *, u32); \
|
|
+int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\
|
|
+void six_unlock_##type(struct six_lock *);
|
|
+
|
|
+__SIX_LOCK(read)
|
|
+__SIX_LOCK(intent)
|
|
+__SIX_LOCK(write)
|
|
+#undef __SIX_LOCK
|
|
+
|
|
+#define SIX_LOCK_DISPATCH(type, fn, ...) \
|
|
+ switch (type) { \
|
|
+ case SIX_LOCK_read: \
|
|
+ return fn##_read(__VA_ARGS__); \
|
|
+ case SIX_LOCK_intent: \
|
|
+ return fn##_intent(__VA_ARGS__); \
|
|
+ case SIX_LOCK_write: \
|
|
+ return fn##_write(__VA_ARGS__); \
|
|
+ default: \
|
|
+ BUG(); \
|
|
+ }
|
|
+
|
|
+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type)
|
|
+{
|
|
+ SIX_LOCK_DISPATCH(type, six_trylock, lock);
|
|
+}
|
|
+
|
|
+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type,
|
|
+ unsigned seq)
|
|
+{
|
|
+ SIX_LOCK_DISPATCH(type, six_relock, lock, seq);
|
|
+}
|
|
+
|
|
+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type,
|
|
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
|
|
+{
|
|
+ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p);
|
|
+}
|
|
+
|
|
+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type)
|
|
+{
|
|
+ SIX_LOCK_DISPATCH(type, six_unlock, lock);
|
|
+}
|
|
+
|
|
+void six_lock_downgrade(struct six_lock *);
|
|
+bool six_lock_tryupgrade(struct six_lock *);
|
|
+bool six_trylock_convert(struct six_lock *, enum six_lock_type,
|
|
+ enum six_lock_type);
|
|
+
|
|
+void six_lock_increment(struct six_lock *, enum six_lock_type);
|
|
+
|
|
+void six_lock_wakeup_all(struct six_lock *);
|
|
+
|
|
+void six_lock_pcpu_free_rcu(struct six_lock *);
|
|
+void six_lock_pcpu_free(struct six_lock *);
|
|
+void six_lock_pcpu_alloc(struct six_lock *);
|
|
+
|
|
+#endif /* _LINUX_SIX_H */
|
|
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
|
|
index e432cc92c73d..a0895bbf71ce 100644
|
|
--- a/include/linux/srcu.h
|
|
+++ b/include/linux/srcu.h
|
|
@@ -60,6 +60,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp);
|
|
int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp);
|
|
void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp);
|
|
void synchronize_srcu(struct srcu_struct *ssp);
|
|
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp);
|
|
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp);
|
|
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie);
|
|
|
|
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
|
|
|
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
|
|
index 5a5a1941ca15..0e0cf4d6a72a 100644
|
|
--- a/include/linux/srcutiny.h
|
|
+++ b/include/linux/srcutiny.h
|
|
@@ -15,7 +15,8 @@
|
|
|
|
struct srcu_struct {
|
|
short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */
|
|
- short srcu_idx; /* Current reader array element. */
|
|
+ unsigned short srcu_idx; /* Current reader array element in bit 0x2. */
|
|
+ unsigned short srcu_idx_max; /* Furthest future srcu_idx request. */
|
|
u8 srcu_gp_running; /* GP workqueue running? */
|
|
u8 srcu_gp_waiting; /* GP waiting for readers? */
|
|
struct swait_queue_head srcu_wq;
|
|
@@ -59,7 +60,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
|
|
{
|
|
int idx;
|
|
|
|
- idx = READ_ONCE(ssp->srcu_idx);
|
|
+ idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
|
|
WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1);
|
|
return idx;
|
|
}
|
|
@@ -80,7 +81,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp,
|
|
{
|
|
int idx;
|
|
|
|
- idx = READ_ONCE(ssp->srcu_idx) & 0x1;
|
|
+ idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
|
|
pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
|
|
tt, tf, idx,
|
|
READ_ONCE(ssp->srcu_lock_nesting[!idx]),
|
|
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
|
|
index cedcda6593f6..3d6d7f2ca679 100644
|
|
--- a/include/linux/vmalloc.h
|
|
+++ b/include/linux/vmalloc.h
|
|
@@ -100,6 +100,7 @@ extern void *vzalloc(unsigned long size);
|
|
extern void *vmalloc_user(unsigned long size);
|
|
extern void *vmalloc_node(unsigned long size, int node);
|
|
extern void *vzalloc_node(unsigned long size, int node);
|
|
+extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask);
|
|
extern void *vmalloc_32(unsigned long size);
|
|
extern void *vmalloc_32_user(unsigned long size);
|
|
extern void *__vmalloc(unsigned long size, gfp_t gfp_mask);
|
|
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
|
|
new file mode 100644
|
|
index 000000000000..c79338c8ebf7
|
|
--- /dev/null
|
|
+++ b/include/trace/events/bcachefs.h
|
|
@@ -0,0 +1,817 @@
|
|
+/* SPDX-License-Identifier: GPL-2.0 */
|
|
+#undef TRACE_SYSTEM
|
|
+#define TRACE_SYSTEM bcachefs
|
|
+
|
|
+#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ)
|
|
+#define _TRACE_BCACHE_H
|
|
+
|
|
+#include <linux/tracepoint.h>
|
|
+
|
|
+DECLARE_EVENT_CLASS(bpos,
|
|
+ TP_PROTO(struct bpos *p),
|
|
+ TP_ARGS(p),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u64, inode )
|
|
+ __field(u64, offset )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->inode = p->inode;
|
|
+ __entry->offset = p->offset;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%llu:%llu", __entry->inode, __entry->offset)
|
|
+);
|
|
+
|
|
+DECLARE_EVENT_CLASS(bkey,
|
|
+ TP_PROTO(const struct bkey *k),
|
|
+ TP_ARGS(k),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u64, inode )
|
|
+ __field(u64, offset )
|
|
+ __field(u32, size )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->inode = k->p.inode;
|
|
+ __entry->offset = k->p.offset;
|
|
+ __entry->size = k->size;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%llu:%llu len %u", __entry->inode,
|
|
+ __entry->offset, __entry->size)
|
|
+);
|
|
+
|
|
+DECLARE_EVENT_CLASS(bch_fs,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU", __entry->uuid)
|
|
+);
|
|
+
|
|
+DECLARE_EVENT_CLASS(bio,
|
|
+ TP_PROTO(struct bio *bio),
|
|
+ TP_ARGS(bio),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(dev_t, dev )
|
|
+ __field(sector_t, sector )
|
|
+ __field(unsigned int, nr_sector )
|
|
+ __array(char, rwbs, 6 )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = bio->bi_disk ? bio_dev(bio) : 0;
|
|
+ __entry->sector = bio->bi_iter.bi_sector;
|
|
+ __entry->nr_sector = bio->bi_iter.bi_size >> 9;
|
|
+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
|
|
+ ),
|
|
+
|
|
+ TP_printk("%d,%d %s %llu + %u",
|
|
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
|
|
+ (unsigned long long)__entry->sector, __entry->nr_sector)
|
|
+);
|
|
+
|
|
+/* io.c: */
|
|
+
|
|
+DEFINE_EVENT(bio, read_split,
|
|
+ TP_PROTO(struct bio *bio),
|
|
+ TP_ARGS(bio)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bio, read_bounce,
|
|
+ TP_PROTO(struct bio *bio),
|
|
+ TP_ARGS(bio)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bio, read_retry,
|
|
+ TP_PROTO(struct bio *bio),
|
|
+ TP_ARGS(bio)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bio, promote,
|
|
+ TP_PROTO(struct bio *bio),
|
|
+ TP_ARGS(bio)
|
|
+);
|
|
+
|
|
+/* Journal */
|
|
+
|
|
+DEFINE_EVENT(bch_fs, journal_full,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bch_fs, journal_entry_full,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bio, journal_write,
|
|
+ TP_PROTO(struct bio *bio),
|
|
+ TP_ARGS(bio)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(journal_reclaim_start,
|
|
+ TP_PROTO(struct bch_fs *c, u64 min_nr,
|
|
+ u64 prereserved, u64 prereserved_total,
|
|
+ u64 btree_cache_dirty, u64 btree_cache_total,
|
|
+ u64 btree_key_cache_dirty, u64 btree_key_cache_total),
|
|
+ TP_ARGS(c, min_nr, prereserved, prereserved_total,
|
|
+ btree_cache_dirty, btree_cache_total,
|
|
+ btree_key_cache_dirty, btree_key_cache_total),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ __field(u64, min_nr )
|
|
+ __field(u64, prereserved )
|
|
+ __field(u64, prereserved_total )
|
|
+ __field(u64, btree_cache_dirty )
|
|
+ __field(u64, btree_cache_total )
|
|
+ __field(u64, btree_key_cache_dirty )
|
|
+ __field(u64, btree_key_cache_total )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ __entry->min_nr = min_nr;
|
|
+ __entry->prereserved = prereserved;
|
|
+ __entry->prereserved_total = prereserved_total;
|
|
+ __entry->btree_cache_dirty = btree_cache_dirty;
|
|
+ __entry->btree_cache_total = btree_cache_total;
|
|
+ __entry->btree_key_cache_dirty = btree_key_cache_dirty;
|
|
+ __entry->btree_key_cache_total = btree_key_cache_total;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
|
|
+ __entry->uuid,
|
|
+ __entry->min_nr,
|
|
+ __entry->prereserved,
|
|
+ __entry->prereserved_total,
|
|
+ __entry->btree_cache_dirty,
|
|
+ __entry->btree_cache_total,
|
|
+ __entry->btree_key_cache_dirty,
|
|
+ __entry->btree_key_cache_total)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(journal_reclaim_finish,
|
|
+ TP_PROTO(struct bch_fs *c, u64 nr_flushed),
|
|
+ TP_ARGS(c, nr_flushed),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ __field(u64, nr_flushed )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ __entry->nr_flushed = nr_flushed;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
|
|
+);
|
|
+
|
|
+/* bset.c: */
|
|
+
|
|
+DEFINE_EVENT(bpos, bkey_pack_pos_fail,
|
|
+ TP_PROTO(struct bpos *p),
|
|
+ TP_ARGS(p)
|
|
+);
|
|
+
|
|
+/* Btree */
|
|
+
|
|
+DECLARE_EVENT_CLASS(btree_node,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ __field(u8, level )
|
|
+ __field(u8, id )
|
|
+ __field(u64, inode )
|
|
+ __field(u64, offset )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ __entry->level = b->c.level;
|
|
+ __entry->id = b->c.btree_id;
|
|
+ __entry->inode = b->key.k.p.inode;
|
|
+ __entry->offset = b->key.k.p.offset;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU %u id %u %llu:%llu",
|
|
+ __entry->uuid, __entry->level, __entry->id,
|
|
+ __entry->inode, __entry->offset)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_read,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(btree_write,
|
|
+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors),
|
|
+ TP_ARGS(b, bytes, sectors),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(enum btree_node_type, type)
|
|
+ __field(unsigned, bytes )
|
|
+ __field(unsigned, sectors )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->type = btree_node_type(b);
|
|
+ __entry->bytes = bytes;
|
|
+ __entry->sectors = sectors;
|
|
+ ),
|
|
+
|
|
+ TP_printk("bkey type %u bytes %u sectors %u",
|
|
+ __entry->type , __entry->bytes, __entry->sectors)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_node_alloc,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_node_free,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_node_reap,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU", __entry->uuid)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(btree_reserve_get_fail,
|
|
+ TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl),
|
|
+ TP_ARGS(c, required, cl),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ __field(size_t, required )
|
|
+ __field(struct closure *, cl )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ __entry->required = required;
|
|
+ __entry->cl = cl;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU required %zu by %p", __entry->uuid,
|
|
+ __entry->required, __entry->cl)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(btree_insert_key,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k),
|
|
+ TP_ARGS(c, b, k),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u8, id )
|
|
+ __field(u64, inode )
|
|
+ __field(u64, offset )
|
|
+ __field(u32, size )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->id = b->c.btree_id;
|
|
+ __entry->inode = k->k.p.inode;
|
|
+ __entry->offset = k->k.p.offset;
|
|
+ __entry->size = k->k.size;
|
|
+ ),
|
|
+
|
|
+ TP_printk("btree %u: %llu:%llu len %u", __entry->id,
|
|
+ __entry->inode, __entry->offset, __entry->size)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_split,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_compact,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_merge,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_set_root,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+/* Garbage collection */
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_gc_rewrite_node,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail,
|
|
+ TP_PROTO(struct bch_fs *c, struct btree *b),
|
|
+ TP_ARGS(c, b)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bch_fs, gc_start,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bch_fs, gc_end,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bch_fs, gc_cannot_inc_gens,
|
|
+ TP_PROTO(struct bch_fs *c),
|
|
+ TP_ARGS(c)
|
|
+);
|
|
+
|
|
+/* Allocator */
|
|
+
|
|
+TRACE_EVENT(alloc_scan,
|
|
+ TP_PROTO(struct bch_dev *ca, u64 found, u64 inc_gen, u64 inc_gen_skipped),
|
|
+ TP_ARGS(ca, found, inc_gen, inc_gen_skipped),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(dev_t, dev )
|
|
+ __field(u64, found )
|
|
+ __field(u64, inc_gen )
|
|
+ __field(u64, inc_gen_skipped )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
|
|
+ __entry->found = found;
|
|
+ __entry->inc_gen = inc_gen;
|
|
+ __entry->inc_gen_skipped = inc_gen_skipped;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%d,%d found %llu inc_gen %llu inc_gen_skipped %llu",
|
|
+ MAJOR(__entry->dev), MINOR(__entry->dev),
|
|
+ __entry->found, __entry->inc_gen, __entry->inc_gen_skipped)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(invalidate,
|
|
+ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors),
|
|
+ TP_ARGS(ca, offset, sectors),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(unsigned, sectors )
|
|
+ __field(dev_t, dev )
|
|
+ __field(__u64, offset )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
|
|
+ __entry->offset = offset;
|
|
+ __entry->sectors = sectors;
|
|
+ ),
|
|
+
|
|
+ TP_printk("invalidated %u sectors at %d,%d sector=%llu",
|
|
+ __entry->sectors,
|
|
+ MAJOR(__entry->dev),
|
|
+ MINOR(__entry->dev),
|
|
+ __entry->offset)
|
|
+);
|
|
+
|
|
+DECLARE_EVENT_CLASS(bucket_alloc,
|
|
+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
|
|
+ TP_ARGS(ca, reserve),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(dev_t, dev )
|
|
+ __field(enum alloc_reserve, reserve )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->dev = ca->disk_sb.bdev->bd_dev;
|
|
+ __entry->reserve = reserve;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%d,%d reserve %d",
|
|
+ MAJOR(__entry->dev), MINOR(__entry->dev),
|
|
+ __entry->reserve)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bucket_alloc, bucket_alloc,
|
|
+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
|
|
+ TP_ARGS(ca, reserve)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
|
|
+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
|
|
+ TP_ARGS(ca, reserve)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
|
|
+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
|
|
+ TP_ARGS(ca, reserve)
|
|
+);
|
|
+
|
|
+/* Moving IO */
|
|
+
|
|
+DEFINE_EVENT(bkey, move_extent,
|
|
+ TP_PROTO(const struct bkey *k),
|
|
+ TP_ARGS(k)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bkey, move_alloc_fail,
|
|
+ TP_PROTO(const struct bkey *k),
|
|
+ TP_ARGS(k)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(bkey, move_race,
|
|
+ TP_PROTO(const struct bkey *k),
|
|
+ TP_ARGS(k)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(move_data,
|
|
+ TP_PROTO(struct bch_fs *c, u64 sectors_moved,
|
|
+ u64 keys_moved),
|
|
+ TP_ARGS(c, sectors_moved, keys_moved),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ __field(u64, sectors_moved )
|
|
+ __field(u64, keys_moved )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ __entry->sectors_moved = sectors_moved;
|
|
+ __entry->keys_moved = keys_moved;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU sectors_moved %llu keys_moved %llu",
|
|
+ __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(copygc,
|
|
+ TP_PROTO(struct bch_fs *c,
|
|
+ u64 sectors_moved, u64 sectors_not_moved,
|
|
+ u64 buckets_moved, u64 buckets_not_moved),
|
|
+ TP_ARGS(c,
|
|
+ sectors_moved, sectors_not_moved,
|
|
+ buckets_moved, buckets_not_moved),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __array(char, uuid, 16 )
|
|
+ __field(u64, sectors_moved )
|
|
+ __field(u64, sectors_not_moved )
|
|
+ __field(u64, buckets_moved )
|
|
+ __field(u64, buckets_not_moved )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
|
|
+ __entry->sectors_moved = sectors_moved;
|
|
+ __entry->sectors_not_moved = sectors_not_moved;
|
|
+ __entry->buckets_moved = buckets_moved;
|
|
+ __entry->buckets_not_moved = buckets_not_moved;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
|
|
+ __entry->uuid,
|
|
+ __entry->sectors_moved, __entry->sectors_not_moved,
|
|
+ __entry->buckets_moved, __entry->buckets_not_moved)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(trans_get_iter,
|
|
+ TP_PROTO(unsigned long caller, unsigned long ip,
|
|
+ enum btree_id btree_id,
|
|
+ struct bpos *pos_want, unsigned locks_want,
|
|
+ struct bpos *pos_found, unsigned locks_found,
|
|
+ unsigned uptodate),
|
|
+ TP_ARGS(caller, ip, btree_id,
|
|
+ pos_want, locks_want,
|
|
+ pos_found, locks_found,
|
|
+ uptodate),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(unsigned long, caller )
|
|
+ __field(unsigned long, ip )
|
|
+ __field(u8, btree_id )
|
|
+ __field(u8, uptodate )
|
|
+ __field(u8, locks_want )
|
|
+ __field(u8, locks_found )
|
|
+ __field(u64, pos_want_inode )
|
|
+ __field(u64, pos_want_offset )
|
|
+ __field(u32, pos_want_snapshot )
|
|
+ __field(u64, pos_found_inode )
|
|
+ __field(u64, pos_found_offset )
|
|
+ __field(u32, pos_found_snapshot )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->caller = caller;
|
|
+ __entry->ip = ip;
|
|
+ __entry->btree_id = btree_id;
|
|
+ __entry->uptodate = uptodate;
|
|
+ __entry->locks_want = locks_want;
|
|
+ __entry->locks_found = locks_found;
|
|
+ __entry->pos_want_inode = pos_want->inode;
|
|
+ __entry->pos_want_offset = pos_want->offset;
|
|
+ __entry->pos_want_snapshot = pos_want->snapshot;
|
|
+ __entry->pos_found_inode = pos_found->inode;
|
|
+ __entry->pos_found_offset = pos_found->offset;
|
|
+ __entry->pos_found_snapshot = pos_found->snapshot;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%ps %pS btree %u uptodate %u want %llu:%llu:%u locks %u found %llu:%llu:%u locks %u",
|
|
+ (void *) __entry->caller,
|
|
+ (void *) __entry->ip,
|
|
+ __entry->btree_id,
|
|
+ __entry->uptodate,
|
|
+ __entry->pos_want_inode,
|
|
+ __entry->pos_want_offset,
|
|
+ __entry->pos_want_snapshot,
|
|
+ __entry->locks_want,
|
|
+ __entry->pos_found_inode,
|
|
+ __entry->pos_found_offset,
|
|
+ __entry->pos_found_snapshot,
|
|
+ __entry->locks_found)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(transaction_restart_ip,
|
|
+ TP_PROTO(unsigned long caller, unsigned long ip),
|
|
+ TP_ARGS(caller, ip),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(unsigned long, caller )
|
|
+ __field(unsigned long, ip )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->caller = caller;
|
|
+ __entry->ip = ip;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%ps %pS", (void *) __entry->caller, (void *) __entry->ip)
|
|
+);
|
|
+
|
|
+DECLARE_EVENT_CLASS(transaction_restart,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(unsigned long, ip )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->ip = ip;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%ps", (void *) __entry->ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_blocked_journal_reclaim,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(trans_restart_would_deadlock,
|
|
+ TP_PROTO(unsigned long trans_ip,
|
|
+ unsigned long caller_ip,
|
|
+ bool in_traverse_all,
|
|
+ unsigned reason,
|
|
+ enum btree_id have_btree_id,
|
|
+ unsigned have_iter_type,
|
|
+ struct bpos *have_pos,
|
|
+ enum btree_id want_btree_id,
|
|
+ unsigned want_iter_type,
|
|
+ struct bpos *want_pos),
|
|
+ TP_ARGS(trans_ip, caller_ip, in_traverse_all, reason,
|
|
+ have_btree_id, have_iter_type, have_pos,
|
|
+ want_btree_id, want_iter_type, want_pos),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(unsigned long, trans_ip )
|
|
+ __field(unsigned long, caller_ip )
|
|
+ __field(u8, in_traverse_all )
|
|
+ __field(u8, reason )
|
|
+ __field(u8, have_btree_id )
|
|
+ __field(u8, have_iter_type )
|
|
+ __field(u8, want_btree_id )
|
|
+ __field(u8, want_iter_type )
|
|
+
|
|
+ __field(u64, have_pos_inode )
|
|
+ __field(u64, have_pos_offset )
|
|
+ __field(u32, have_pos_snapshot)
|
|
+ __field(u32, want_pos_snapshot)
|
|
+ __field(u64, want_pos_inode )
|
|
+ __field(u64, want_pos_offset )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->trans_ip = trans_ip;
|
|
+ __entry->caller_ip = caller_ip;
|
|
+ __entry->in_traverse_all = in_traverse_all;
|
|
+ __entry->reason = reason;
|
|
+ __entry->have_btree_id = have_btree_id;
|
|
+ __entry->have_iter_type = have_iter_type;
|
|
+ __entry->want_btree_id = want_btree_id;
|
|
+ __entry->want_iter_type = want_iter_type;
|
|
+
|
|
+ __entry->have_pos_inode = have_pos->inode;
|
|
+ __entry->have_pos_offset = have_pos->offset;
|
|
+ __entry->have_pos_snapshot = have_pos->snapshot;
|
|
+
|
|
+ __entry->want_pos_inode = want_pos->inode;
|
|
+ __entry->want_pos_offset = want_pos->offset;
|
|
+ __entry->want_pos_snapshot = want_pos->snapshot;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%ps %pS traverse_all %u because %u have %u:%u %llu:%llu:%u want %u:%u %llu:%llu:%u",
|
|
+ (void *) __entry->trans_ip,
|
|
+ (void *) __entry->caller_ip,
|
|
+ __entry->in_traverse_all,
|
|
+ __entry->reason,
|
|
+ __entry->have_btree_id,
|
|
+ __entry->have_iter_type,
|
|
+ __entry->have_pos_inode,
|
|
+ __entry->have_pos_offset,
|
|
+ __entry->have_pos_snapshot,
|
|
+ __entry->want_btree_id,
|
|
+ __entry->want_iter_type,
|
|
+ __entry->want_pos_inode,
|
|
+ __entry->want_pos_offset,
|
|
+ __entry->want_pos_snapshot)
|
|
+);
|
|
+
|
|
+TRACE_EVENT(trans_restart_mem_realloced,
|
|
+ TP_PROTO(unsigned long trans_ip, unsigned long caller_ip,
|
|
+ unsigned long bytes),
|
|
+ TP_ARGS(trans_ip, caller_ip, bytes),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(unsigned long, trans_ip )
|
|
+ __field(unsigned long, caller_ip )
|
|
+ __field(unsigned long, bytes )
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->trans_ip = trans_ip;
|
|
+ __entry->caller_ip = caller_ip;
|
|
+ __entry->bytes = bytes;
|
|
+ ),
|
|
+
|
|
+ TP_printk("%ps %pS bytes %lu",
|
|
+ (void *) __entry->trans_ip,
|
|
+ (void *) __entry->caller_ip,
|
|
+ __entry->bytes)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_journal_reclaim,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_fault_inject,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_mark,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_upgrade,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_relock,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_restart_traverse,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(transaction_restart, trans_traverse_all,
|
|
+ TP_PROTO(unsigned long ip),
|
|
+ TP_ARGS(ip)
|
|
+);
|
|
+
|
|
+DECLARE_EVENT_CLASS(node_lock_fail,
|
|
+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
|
|
+ TP_ARGS(level, iter_seq, node, node_seq),
|
|
+
|
|
+ TP_STRUCT__entry(
|
|
+ __field(u32, level)
|
|
+ __field(u32, iter_seq)
|
|
+ __field(u32, node)
|
|
+ __field(u32, node_seq)
|
|
+ ),
|
|
+
|
|
+ TP_fast_assign(
|
|
+ __entry->level = level;
|
|
+ __entry->iter_seq = iter_seq;
|
|
+ __entry->node = node;
|
|
+ __entry->node_seq = node_seq;
|
|
+ ),
|
|
+
|
|
+ TP_printk("level %u iter seq %u node %u node seq %u",
|
|
+ __entry->level, __entry->iter_seq,
|
|
+ __entry->node, __entry->node_seq)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(node_lock_fail, node_upgrade_fail,
|
|
+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
|
|
+ TP_ARGS(level, iter_seq, node, node_seq)
|
|
+);
|
|
+
|
|
+DEFINE_EVENT(node_lock_fail, node_relock_fail,
|
|
+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq),
|
|
+ TP_ARGS(level, iter_seq, node, node_seq)
|
|
+);
|
|
+
|
|
+#endif /* _TRACE_BCACHE_H */
|
|
+
|
|
+/* This part must be outside protection */
|
|
+#include <trace/define_trace.h>
|
|
diff --git a/init/init_task.c b/init/init_task.c
|
|
index 3711cdaafed2..338164f9980d 100644
|
|
--- a/init/init_task.c
|
|
+++ b/init/init_task.c
|
|
@@ -84,6 +84,7 @@ struct task_struct init_task
|
|
.nr_cpus_allowed= NR_CPUS,
|
|
.mm = NULL,
|
|
.active_mm = &init_mm,
|
|
+ .faults_disabled_mapping = NULL,
|
|
.restart_block = {
|
|
.fn = do_no_restart_syscall,
|
|
},
|
|
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
|
|
index 3de8fd11873b..ab8aa082ce56 100644
|
|
--- a/kernel/Kconfig.locks
|
|
+++ b/kernel/Kconfig.locks
|
|
@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB
|
|
config MMIOWB
|
|
def_bool y if ARCH_HAS_MMIOWB
|
|
depends on SMP
|
|
+
|
|
+config SIXLOCKS
|
|
+ bool
|
|
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
|
|
index 6d11cfb9b41f..4c13937e8f37 100644
|
|
--- a/kernel/locking/Makefile
|
|
+++ b/kernel/locking/Makefile
|
|
@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
|
|
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
|
|
obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o
|
|
obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o
|
|
+obj-$(CONFIG_SIXLOCKS) += six.o
|
|
diff --git a/kernel/locking/six.c b/kernel/locking/six.c
|
|
new file mode 100644
|
|
index 000000000000..fca1208720b6
|
|
--- /dev/null
|
|
+++ b/kernel/locking/six.c
|
|
@@ -0,0 +1,759 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+
|
|
+#include <linux/export.h>
|
|
+#include <linux/log2.h>
|
|
+#include <linux/percpu.h>
|
|
+#include <linux/preempt.h>
|
|
+#include <linux/rcupdate.h>
|
|
+#include <linux/sched.h>
|
|
+#include <linux/sched/rt.h>
|
|
+#include <linux/six.h>
|
|
+#include <linux/slab.h>
|
|
+
|
|
+#ifdef DEBUG
|
|
+#define EBUG_ON(cond) BUG_ON(cond)
|
|
+#else
|
|
+#define EBUG_ON(cond) do {} while (0)
|
|
+#endif
|
|
+
|
|
+#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_)
|
|
+#define six_release(l) lock_release(l, _RET_IP_)
|
|
+
|
|
+struct six_lock_vals {
|
|
+ /* Value we add to the lock in order to take the lock: */
|
|
+ u64 lock_val;
|
|
+
|
|
+ /* If the lock has this value (used as a mask), taking the lock fails: */
|
|
+ u64 lock_fail;
|
|
+
|
|
+ /* Value we add to the lock in order to release the lock: */
|
|
+ u64 unlock_val;
|
|
+
|
|
+ /* Mask that indicates lock is held for this type: */
|
|
+ u64 held_mask;
|
|
+
|
|
+ /* Waitlist we wakeup when releasing the lock: */
|
|
+ enum six_lock_type unlock_wakeup;
|
|
+};
|
|
+
|
|
+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0)
|
|
+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0)
|
|
+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1)
|
|
+
|
|
+#define LOCK_VALS { \
|
|
+ [SIX_LOCK_read] = { \
|
|
+ .lock_val = __SIX_VAL(read_lock, 1), \
|
|
+ .lock_fail = __SIX_LOCK_HELD_write + __SIX_VAL(write_locking, 1),\
|
|
+ .unlock_val = -__SIX_VAL(read_lock, 1), \
|
|
+ .held_mask = __SIX_LOCK_HELD_read, \
|
|
+ .unlock_wakeup = SIX_LOCK_write, \
|
|
+ }, \
|
|
+ [SIX_LOCK_intent] = { \
|
|
+ .lock_val = __SIX_VAL(intent_lock, 1), \
|
|
+ .lock_fail = __SIX_LOCK_HELD_intent, \
|
|
+ .unlock_val = -__SIX_VAL(intent_lock, 1), \
|
|
+ .held_mask = __SIX_LOCK_HELD_intent, \
|
|
+ .unlock_wakeup = SIX_LOCK_intent, \
|
|
+ }, \
|
|
+ [SIX_LOCK_write] = { \
|
|
+ .lock_val = __SIX_VAL(seq, 1), \
|
|
+ .lock_fail = __SIX_LOCK_HELD_read, \
|
|
+ .unlock_val = __SIX_VAL(seq, 1), \
|
|
+ .held_mask = __SIX_LOCK_HELD_write, \
|
|
+ .unlock_wakeup = SIX_LOCK_read, \
|
|
+ }, \
|
|
+}
|
|
+
|
|
+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type,
|
|
+ union six_lock_state old)
|
|
+{
|
|
+ if (type != SIX_LOCK_intent)
|
|
+ return;
|
|
+
|
|
+ if (!old.intent_lock) {
|
|
+ EBUG_ON(lock->owner);
|
|
+ lock->owner = current;
|
|
+ } else {
|
|
+ EBUG_ON(lock->owner != current);
|
|
+ }
|
|
+}
|
|
+
|
|
+static inline unsigned pcpu_read_count(struct six_lock *lock)
|
|
+{
|
|
+ unsigned read_count = 0;
|
|
+ int cpu;
|
|
+
|
|
+ for_each_possible_cpu(cpu)
|
|
+ read_count += *per_cpu_ptr(lock->readers, cpu);
|
|
+ return read_count;
|
|
+}
|
|
+
|
|
+struct six_lock_waiter {
|
|
+ struct list_head list;
|
|
+ struct task_struct *task;
|
|
+};
|
|
+
|
|
+/* This is probably up there with the more evil things I've done */
|
|
+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l))
|
|
+
|
|
+static inline void six_lock_wakeup(struct six_lock *lock,
|
|
+ union six_lock_state state,
|
|
+ unsigned waitlist_id)
|
|
+{
|
|
+ if (waitlist_id == SIX_LOCK_write) {
|
|
+ if (state.write_locking && !state.read_lock) {
|
|
+ struct task_struct *p = READ_ONCE(lock->owner);
|
|
+ if (p)
|
|
+ wake_up_process(p);
|
|
+ }
|
|
+ } else {
|
|
+ struct list_head *wait_list = &lock->wait_list[waitlist_id];
|
|
+ struct six_lock_waiter *w, *next;
|
|
+
|
|
+ if (!(state.waiters & (1 << waitlist_id)))
|
|
+ return;
|
|
+
|
|
+ clear_bit(waitlist_bitnr(waitlist_id),
|
|
+ (unsigned long *) &lock->state.v);
|
|
+
|
|
+ raw_spin_lock(&lock->wait_lock);
|
|
+
|
|
+ list_for_each_entry_safe(w, next, wait_list, list) {
|
|
+ list_del_init(&w->list);
|
|
+
|
|
+ if (wake_up_process(w->task) &&
|
|
+ waitlist_id != SIX_LOCK_read) {
|
|
+ if (!list_empty(wait_list))
|
|
+ set_bit(waitlist_bitnr(waitlist_id),
|
|
+ (unsigned long *) &lock->state.v);
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ raw_spin_unlock(&lock->wait_lock);
|
|
+ }
|
|
+}
|
|
+
|
|
+static __always_inline bool do_six_trylock_type(struct six_lock *lock,
|
|
+ enum six_lock_type type,
|
|
+ bool try)
|
|
+{
|
|
+ const struct six_lock_vals l[] = LOCK_VALS;
|
|
+ union six_lock_state old, new;
|
|
+ bool ret;
|
|
+ u64 v;
|
|
+
|
|
+ EBUG_ON(type == SIX_LOCK_write && lock->owner != current);
|
|
+ EBUG_ON(type == SIX_LOCK_write && (lock->state.seq & 1));
|
|
+
|
|
+ EBUG_ON(type == SIX_LOCK_write && (try != !(lock->state.write_locking)));
|
|
+
|
|
+ /*
|
|
+ * Percpu reader mode:
|
|
+ *
|
|
+ * The basic idea behind this algorithm is that you can implement a lock
|
|
+ * between two threads without any atomics, just memory barriers:
|
|
+ *
|
|
+ * For two threads you'll need two variables, one variable for "thread a
|
|
+ * has the lock" and another for "thread b has the lock".
|
|
+ *
|
|
+ * To take the lock, a thread sets its variable indicating that it holds
|
|
+ * the lock, then issues a full memory barrier, then reads from the
|
|
+ * other thread's variable to check if the other thread thinks it has
|
|
+ * the lock. If we raced, we backoff and retry/sleep.
|
|
+ */
|
|
+
|
|
+ if (type == SIX_LOCK_read && lock->readers) {
|
|
+retry:
|
|
+ preempt_disable();
|
|
+ this_cpu_inc(*lock->readers); /* signal that we own lock */
|
|
+
|
|
+ smp_mb();
|
|
+
|
|
+ old.v = READ_ONCE(lock->state.v);
|
|
+ ret = !(old.v & l[type].lock_fail);
|
|
+
|
|
+ this_cpu_sub(*lock->readers, !ret);
|
|
+ preempt_enable();
|
|
+
|
|
+ /*
|
|
+ * If we failed because a writer was trying to take the
|
|
+ * lock, issue a wakeup because we might have caused a
|
|
+ * spurious trylock failure:
|
|
+ */
|
|
+ if (old.write_locking) {
|
|
+ struct task_struct *p = READ_ONCE(lock->owner);
|
|
+
|
|
+ if (p)
|
|
+ wake_up_process(p);
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * If we failed from the lock path and the waiting bit wasn't
|
|
+ * set, set it:
|
|
+ */
|
|
+ if (!try && !ret) {
|
|
+ v = old.v;
|
|
+
|
|
+ do {
|
|
+ new.v = old.v = v;
|
|
+
|
|
+ if (!(old.v & l[type].lock_fail))
|
|
+ goto retry;
|
|
+
|
|
+ if (new.waiters & (1 << type))
|
|
+ break;
|
|
+
|
|
+ new.waiters |= 1 << type;
|
|
+ } while ((v = atomic64_cmpxchg(&lock->state.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+ }
|
|
+ } else if (type == SIX_LOCK_write && lock->readers) {
|
|
+ if (try) {
|
|
+ atomic64_add(__SIX_VAL(write_locking, 1),
|
|
+ &lock->state.counter);
|
|
+ smp_mb__after_atomic();
|
|
+ }
|
|
+
|
|
+ ret = !pcpu_read_count(lock);
|
|
+
|
|
+ /*
|
|
+ * On success, we increment lock->seq; also we clear
|
|
+ * write_locking unless we failed from the lock path:
|
|
+ */
|
|
+ v = 0;
|
|
+ if (ret)
|
|
+ v += __SIX_VAL(seq, 1);
|
|
+ if (ret || try)
|
|
+ v -= __SIX_VAL(write_locking, 1);
|
|
+
|
|
+ if (try && !ret) {
|
|
+ old.v = atomic64_add_return(v, &lock->state.counter);
|
|
+ six_lock_wakeup(lock, old, SIX_LOCK_read);
|
|
+ } else {
|
|
+ atomic64_add(v, &lock->state.counter);
|
|
+ }
|
|
+ } else {
|
|
+ v = READ_ONCE(lock->state.v);
|
|
+ do {
|
|
+ new.v = old.v = v;
|
|
+
|
|
+ if (!(old.v & l[type].lock_fail)) {
|
|
+ new.v += l[type].lock_val;
|
|
+
|
|
+ if (type == SIX_LOCK_write)
|
|
+ new.write_locking = 0;
|
|
+ } else if (!try && type != SIX_LOCK_write &&
|
|
+ !(new.waiters & (1 << type)))
|
|
+ new.waiters |= 1 << type;
|
|
+ else
|
|
+ break; /* waiting bit already set */
|
|
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+
|
|
+ ret = !(old.v & l[type].lock_fail);
|
|
+
|
|
+ EBUG_ON(ret && !(lock->state.v & l[type].held_mask));
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ six_set_owner(lock, type, old);
|
|
+
|
|
+ EBUG_ON(type == SIX_LOCK_write && (try || ret) && (lock->state.write_locking));
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+__always_inline __flatten
|
|
+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type)
|
|
+{
|
|
+ if (!do_six_trylock_type(lock, type, true))
|
|
+ return false;
|
|
+
|
|
+ if (type != SIX_LOCK_write)
|
|
+ six_acquire(&lock->dep_map, 1);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+__always_inline __flatten
|
|
+static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type,
|
|
+ unsigned seq)
|
|
+{
|
|
+ const struct six_lock_vals l[] = LOCK_VALS;
|
|
+ union six_lock_state old;
|
|
+ u64 v;
|
|
+
|
|
+ EBUG_ON(type == SIX_LOCK_write);
|
|
+
|
|
+ if (type == SIX_LOCK_read &&
|
|
+ lock->readers) {
|
|
+ bool ret;
|
|
+
|
|
+ preempt_disable();
|
|
+ this_cpu_inc(*lock->readers);
|
|
+
|
|
+ smp_mb();
|
|
+
|
|
+ old.v = READ_ONCE(lock->state.v);
|
|
+ ret = !(old.v & l[type].lock_fail) && old.seq == seq;
|
|
+
|
|
+ this_cpu_sub(*lock->readers, !ret);
|
|
+ preempt_enable();
|
|
+
|
|
+ /*
|
|
+ * Similar to the lock path, we may have caused a spurious write
|
|
+ * lock fail and need to issue a wakeup:
|
|
+ */
|
|
+ if (old.write_locking) {
|
|
+ struct task_struct *p = READ_ONCE(lock->owner);
|
|
+
|
|
+ if (p)
|
|
+ wake_up_process(p);
|
|
+ }
|
|
+
|
|
+ if (ret)
|
|
+ six_acquire(&lock->dep_map, 1);
|
|
+
|
|
+ return ret;
|
|
+ }
|
|
+
|
|
+ v = READ_ONCE(lock->state.v);
|
|
+ do {
|
|
+ old.v = v;
|
|
+
|
|
+ if (old.seq != seq || old.v & l[type].lock_fail)
|
|
+ return false;
|
|
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
|
|
+ old.v,
|
|
+ old.v + l[type].lock_val)) != old.v);
|
|
+
|
|
+ six_set_owner(lock, type, old);
|
|
+ if (type != SIX_LOCK_write)
|
|
+ six_acquire(&lock->dep_map, 1);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+#ifdef CONFIG_LOCK_SPIN_ON_OWNER
|
|
+
|
|
+static inline int six_can_spin_on_owner(struct six_lock *lock)
|
|
+{
|
|
+ struct task_struct *owner;
|
|
+ int retval = 1;
|
|
+
|
|
+ if (need_resched())
|
|
+ return 0;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ owner = READ_ONCE(lock->owner);
|
|
+ if (owner)
|
|
+ retval = owner->on_cpu;
|
|
+ rcu_read_unlock();
|
|
+ /*
|
|
+ * if lock->owner is not set, the mutex owner may have just acquired
|
|
+ * it and not set the owner yet or the mutex has been released.
|
|
+ */
|
|
+ return retval;
|
|
+}
|
|
+
|
|
+static inline bool six_spin_on_owner(struct six_lock *lock,
|
|
+ struct task_struct *owner)
|
|
+{
|
|
+ bool ret = true;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ while (lock->owner == owner) {
|
|
+ /*
|
|
+ * Ensure we emit the owner->on_cpu, dereference _after_
|
|
+ * checking lock->owner still matches owner. If that fails,
|
|
+ * owner might point to freed memory. If it still matches,
|
|
+ * the rcu_read_lock() ensures the memory stays valid.
|
|
+ */
|
|
+ barrier();
|
|
+
|
|
+ if (!owner->on_cpu || need_resched()) {
|
|
+ ret = false;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ cpu_relax();
|
|
+ }
|
|
+ rcu_read_unlock();
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
|
|
+{
|
|
+ struct task_struct *task = current;
|
|
+
|
|
+ if (type == SIX_LOCK_write)
|
|
+ return false;
|
|
+
|
|
+ preempt_disable();
|
|
+ if (!six_can_spin_on_owner(lock))
|
|
+ goto fail;
|
|
+
|
|
+ if (!osq_lock(&lock->osq))
|
|
+ goto fail;
|
|
+
|
|
+ while (1) {
|
|
+ struct task_struct *owner;
|
|
+
|
|
+ /*
|
|
+ * If there's an owner, wait for it to either
|
|
+ * release the lock or go to sleep.
|
|
+ */
|
|
+ owner = READ_ONCE(lock->owner);
|
|
+ if (owner && !six_spin_on_owner(lock, owner))
|
|
+ break;
|
|
+
|
|
+ if (do_six_trylock_type(lock, type, false)) {
|
|
+ osq_unlock(&lock->osq);
|
|
+ preempt_enable();
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ /*
|
|
+ * When there's no owner, we might have preempted between the
|
|
+ * owner acquiring the lock and setting the owner field. If
|
|
+ * we're an RT task that will live-lock because we won't let
|
|
+ * the owner complete.
|
|
+ */
|
|
+ if (!owner && (need_resched() || rt_task(task)))
|
|
+ break;
|
|
+
|
|
+ /*
|
|
+ * The cpu_relax() call is a compiler barrier which forces
|
|
+ * everything in this loop to be re-loaded. We don't need
|
|
+ * memory barriers as we'll eventually observe the right
|
|
+ * values at the cost of a few extra spins.
|
|
+ */
|
|
+ cpu_relax();
|
|
+ }
|
|
+
|
|
+ osq_unlock(&lock->osq);
|
|
+fail:
|
|
+ preempt_enable();
|
|
+
|
|
+ /*
|
|
+ * If we fell out of the spin path because of need_resched(),
|
|
+ * reschedule now, before we try-lock again. This avoids getting
|
|
+ * scheduled out right after we obtained the lock.
|
|
+ */
|
|
+ if (need_resched())
|
|
+ schedule();
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
|
|
+
|
|
+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
|
|
+{
|
|
+ return false;
|
|
+}
|
|
+
|
|
+#endif
|
|
+
|
|
+noinline
|
|
+static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type,
|
|
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
|
|
+{
|
|
+ union six_lock_state old;
|
|
+ struct six_lock_waiter wait;
|
|
+ int ret = 0;
|
|
+
|
|
+ if (type == SIX_LOCK_write) {
|
|
+ EBUG_ON(lock->state.write_locking);
|
|
+ atomic64_add(__SIX_VAL(write_locking, 1), &lock->state.counter);
|
|
+ smp_mb__after_atomic();
|
|
+ }
|
|
+
|
|
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
|
|
+ if (ret)
|
|
+ goto out_before_sleep;
|
|
+
|
|
+ if (six_optimistic_spin(lock, type))
|
|
+ goto out_before_sleep;
|
|
+
|
|
+ lock_contended(&lock->dep_map, _RET_IP_);
|
|
+
|
|
+ INIT_LIST_HEAD(&wait.list);
|
|
+ wait.task = current;
|
|
+
|
|
+ while (1) {
|
|
+ set_current_state(TASK_UNINTERRUPTIBLE);
|
|
+ if (type == SIX_LOCK_write)
|
|
+ EBUG_ON(lock->owner != current);
|
|
+ else if (list_empty_careful(&wait.list)) {
|
|
+ raw_spin_lock(&lock->wait_lock);
|
|
+ list_add_tail(&wait.list, &lock->wait_list[type]);
|
|
+ raw_spin_unlock(&lock->wait_lock);
|
|
+ }
|
|
+
|
|
+ if (do_six_trylock_type(lock, type, false))
|
|
+ break;
|
|
+
|
|
+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0;
|
|
+ if (ret)
|
|
+ break;
|
|
+
|
|
+ schedule();
|
|
+ }
|
|
+
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+
|
|
+ if (!list_empty_careful(&wait.list)) {
|
|
+ raw_spin_lock(&lock->wait_lock);
|
|
+ list_del_init(&wait.list);
|
|
+ raw_spin_unlock(&lock->wait_lock);
|
|
+ }
|
|
+out_before_sleep:
|
|
+ if (ret && type == SIX_LOCK_write) {
|
|
+ old.v = atomic64_sub_return(__SIX_VAL(write_locking, 1),
|
|
+ &lock->state.counter);
|
|
+ six_lock_wakeup(lock, old, SIX_LOCK_read);
|
|
+ }
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+__always_inline
|
|
+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type,
|
|
+ six_lock_should_sleep_fn should_sleep_fn, void *p)
|
|
+{
|
|
+ int ret;
|
|
+
|
|
+ if (type != SIX_LOCK_write)
|
|
+ six_acquire(&lock->dep_map, 0);
|
|
+
|
|
+ ret = do_six_trylock_type(lock, type, true) ? 0
|
|
+ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p);
|
|
+
|
|
+ if (ret && type != SIX_LOCK_write)
|
|
+ six_release(&lock->dep_map);
|
|
+ if (!ret)
|
|
+ lock_acquired(&lock->dep_map, _RET_IP_);
|
|
+
|
|
+ return ret;
|
|
+}
|
|
+
|
|
+__always_inline __flatten
|
|
+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type)
|
|
+{
|
|
+ const struct six_lock_vals l[] = LOCK_VALS;
|
|
+ union six_lock_state state;
|
|
+
|
|
+ EBUG_ON(type == SIX_LOCK_write &&
|
|
+ !(lock->state.v & __SIX_LOCK_HELD_intent));
|
|
+
|
|
+ if (type != SIX_LOCK_write)
|
|
+ six_release(&lock->dep_map);
|
|
+
|
|
+ if (type == SIX_LOCK_intent) {
|
|
+ EBUG_ON(lock->owner != current);
|
|
+
|
|
+ if (lock->intent_lock_recurse) {
|
|
+ --lock->intent_lock_recurse;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ lock->owner = NULL;
|
|
+ }
|
|
+
|
|
+ if (type == SIX_LOCK_read &&
|
|
+ lock->readers) {
|
|
+ smp_mb(); /* unlock barrier */
|
|
+ this_cpu_dec(*lock->readers);
|
|
+ smp_mb(); /* between unlocking and checking for waiters */
|
|
+ state.v = READ_ONCE(lock->state.v);
|
|
+ } else {
|
|
+ EBUG_ON(!(lock->state.v & l[type].held_mask));
|
|
+ state.v = atomic64_add_return_release(l[type].unlock_val,
|
|
+ &lock->state.counter);
|
|
+ }
|
|
+
|
|
+ six_lock_wakeup(lock, state, l[type].unlock_wakeup);
|
|
+}
|
|
+
|
|
+#define __SIX_LOCK(type) \
|
|
+bool six_trylock_##type(struct six_lock *lock) \
|
|
+{ \
|
|
+ return __six_trylock_type(lock, SIX_LOCK_##type); \
|
|
+} \
|
|
+EXPORT_SYMBOL_GPL(six_trylock_##type); \
|
|
+ \
|
|
+bool six_relock_##type(struct six_lock *lock, u32 seq) \
|
|
+{ \
|
|
+ return __six_relock_type(lock, SIX_LOCK_##type, seq); \
|
|
+} \
|
|
+EXPORT_SYMBOL_GPL(six_relock_##type); \
|
|
+ \
|
|
+int six_lock_##type(struct six_lock *lock, \
|
|
+ six_lock_should_sleep_fn should_sleep_fn, void *p) \
|
|
+{ \
|
|
+ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\
|
|
+} \
|
|
+EXPORT_SYMBOL_GPL(six_lock_##type); \
|
|
+ \
|
|
+void six_unlock_##type(struct six_lock *lock) \
|
|
+{ \
|
|
+ __six_unlock_type(lock, SIX_LOCK_##type); \
|
|
+} \
|
|
+EXPORT_SYMBOL_GPL(six_unlock_##type);
|
|
+
|
|
+__SIX_LOCK(read)
|
|
+__SIX_LOCK(intent)
|
|
+__SIX_LOCK(write)
|
|
+
|
|
+#undef __SIX_LOCK
|
|
+
|
|
+/* Convert from intent to read: */
|
|
+void six_lock_downgrade(struct six_lock *lock)
|
|
+{
|
|
+ six_lock_increment(lock, SIX_LOCK_read);
|
|
+ six_unlock_intent(lock);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_lock_downgrade);
|
|
+
|
|
+bool six_lock_tryupgrade(struct six_lock *lock)
|
|
+{
|
|
+ union six_lock_state old, new;
|
|
+ u64 v = READ_ONCE(lock->state.v);
|
|
+
|
|
+ do {
|
|
+ new.v = old.v = v;
|
|
+
|
|
+ if (new.intent_lock)
|
|
+ return false;
|
|
+
|
|
+ if (!lock->readers) {
|
|
+ EBUG_ON(!new.read_lock);
|
|
+ new.read_lock--;
|
|
+ }
|
|
+
|
|
+ new.intent_lock = 1;
|
|
+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter,
|
|
+ old.v, new.v)) != old.v);
|
|
+
|
|
+ if (lock->readers)
|
|
+ this_cpu_dec(*lock->readers);
|
|
+
|
|
+ six_set_owner(lock, SIX_LOCK_intent, old);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_lock_tryupgrade);
|
|
+
|
|
+bool six_trylock_convert(struct six_lock *lock,
|
|
+ enum six_lock_type from,
|
|
+ enum six_lock_type to)
|
|
+{
|
|
+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write);
|
|
+
|
|
+ if (to == from)
|
|
+ return true;
|
|
+
|
|
+ if (to == SIX_LOCK_read) {
|
|
+ six_lock_downgrade(lock);
|
|
+ return true;
|
|
+ } else {
|
|
+ return six_lock_tryupgrade(lock);
|
|
+ }
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_trylock_convert);
|
|
+
|
|
+/*
|
|
+ * Increment read/intent lock count, assuming we already have it read or intent
|
|
+ * locked:
|
|
+ */
|
|
+void six_lock_increment(struct six_lock *lock, enum six_lock_type type)
|
|
+{
|
|
+ const struct six_lock_vals l[] = LOCK_VALS;
|
|
+
|
|
+ six_acquire(&lock->dep_map, 0);
|
|
+
|
|
+ /* XXX: assert already locked, and that we don't overflow: */
|
|
+
|
|
+ switch (type) {
|
|
+ case SIX_LOCK_read:
|
|
+ if (lock->readers) {
|
|
+ this_cpu_inc(*lock->readers);
|
|
+ } else {
|
|
+ EBUG_ON(!lock->state.read_lock &&
|
|
+ !lock->state.intent_lock);
|
|
+ atomic64_add(l[type].lock_val, &lock->state.counter);
|
|
+ }
|
|
+ break;
|
|
+ case SIX_LOCK_intent:
|
|
+ EBUG_ON(!lock->state.intent_lock);
|
|
+ lock->intent_lock_recurse++;
|
|
+ break;
|
|
+ case SIX_LOCK_write:
|
|
+ BUG();
|
|
+ break;
|
|
+ }
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_lock_increment);
|
|
+
|
|
+void six_lock_wakeup_all(struct six_lock *lock)
|
|
+{
|
|
+ struct six_lock_waiter *w;
|
|
+
|
|
+ raw_spin_lock(&lock->wait_lock);
|
|
+
|
|
+ list_for_each_entry(w, &lock->wait_list[0], list)
|
|
+ wake_up_process(w->task);
|
|
+ list_for_each_entry(w, &lock->wait_list[1], list)
|
|
+ wake_up_process(w->task);
|
|
+
|
|
+ raw_spin_unlock(&lock->wait_lock);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_lock_wakeup_all);
|
|
+
|
|
+struct free_pcpu_rcu {
|
|
+ struct rcu_head rcu;
|
|
+ void __percpu *p;
|
|
+};
|
|
+
|
|
+static void free_pcpu_rcu_fn(struct rcu_head *_rcu)
|
|
+{
|
|
+ struct free_pcpu_rcu *rcu =
|
|
+ container_of(_rcu, struct free_pcpu_rcu, rcu);
|
|
+
|
|
+ free_percpu(rcu->p);
|
|
+ kfree(rcu);
|
|
+}
|
|
+
|
|
+void six_lock_pcpu_free_rcu(struct six_lock *lock)
|
|
+{
|
|
+ struct free_pcpu_rcu *rcu = kzalloc(sizeof(*rcu), GFP_KERNEL);
|
|
+
|
|
+ if (!rcu)
|
|
+ return;
|
|
+
|
|
+ rcu->p = lock->readers;
|
|
+ lock->readers = NULL;
|
|
+
|
|
+ call_rcu(&rcu->rcu, free_pcpu_rcu_fn);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free_rcu);
|
|
+
|
|
+void six_lock_pcpu_free(struct six_lock *lock)
|
|
+{
|
|
+ BUG_ON(lock->readers && pcpu_read_count(lock));
|
|
+ BUG_ON(lock->state.read_lock);
|
|
+
|
|
+ free_percpu(lock->readers);
|
|
+ lock->readers = NULL;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_lock_pcpu_free);
|
|
+
|
|
+void six_lock_pcpu_alloc(struct six_lock *lock)
|
|
+{
|
|
+#ifdef __KERNEL__
|
|
+ if (!lock->readers)
|
|
+ lock->readers = alloc_percpu(unsigned);
|
|
+#endif
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(six_lock_pcpu_alloc);
|
|
diff --git a/kernel/module.c b/kernel/module.c
|
|
index 4bf30e4b3eaa..676a31aacd9d 100644
|
|
--- a/kernel/module.c
|
|
+++ b/kernel/module.c
|
|
@@ -2872,9 +2872,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug)
|
|
|
|
void * __weak module_alloc(unsigned long size)
|
|
{
|
|
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
|
|
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
|
|
- NUMA_NO_NODE, __builtin_return_address(0));
|
|
+ return vmalloc_exec(size, GFP_KERNEL);
|
|
}
|
|
|
|
bool __weak module_init_section(const char *name)
|
|
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
|
|
index 6208c1dae5c9..26344dc6483b 100644
|
|
--- a/kernel/rcu/srcutiny.c
|
|
+++ b/kernel/rcu/srcutiny.c
|
|
@@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp)
|
|
ssp->srcu_gp_running = false;
|
|
ssp->srcu_gp_waiting = false;
|
|
ssp->srcu_idx = 0;
|
|
+ ssp->srcu_idx_max = 0;
|
|
INIT_WORK(&ssp->srcu_work, srcu_drive_gp);
|
|
INIT_LIST_HEAD(&ssp->srcu_work.entry);
|
|
return 0;
|
|
@@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
|
|
WARN_ON(ssp->srcu_gp_waiting);
|
|
WARN_ON(ssp->srcu_cb_head);
|
|
WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail);
|
|
+ WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max);
|
|
+ WARN_ON(ssp->srcu_idx & 0x1);
|
|
}
|
|
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
|
|
|
|
@@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp)
|
|
struct srcu_struct *ssp;
|
|
|
|
ssp = container_of(wp, struct srcu_struct, srcu_work);
|
|
- if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head))
|
|
+ if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
|
|
return; /* Already running or nothing to do. */
|
|
|
|
/* Remove recently arrived callbacks and wait for readers. */
|
|
@@ -124,11 +127,12 @@ void srcu_drive_gp(struct work_struct *wp)
|
|
ssp->srcu_cb_head = NULL;
|
|
ssp->srcu_cb_tail = &ssp->srcu_cb_head;
|
|
local_irq_enable();
|
|
- idx = ssp->srcu_idx;
|
|
- WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx);
|
|
+ idx = (ssp->srcu_idx & 0x2) / 2;
|
|
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
|
|
WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
|
|
swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
|
|
WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
|
|
+ WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
|
|
|
|
/* Invoke the callbacks we removed above. */
|
|
while (lh) {
|
|
@@ -146,11 +150,27 @@ void srcu_drive_gp(struct work_struct *wp)
|
|
* straighten that out.
|
|
*/
|
|
WRITE_ONCE(ssp->srcu_gp_running, false);
|
|
- if (READ_ONCE(ssp->srcu_cb_head))
|
|
+ if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
|
|
schedule_work(&ssp->srcu_work);
|
|
}
|
|
EXPORT_SYMBOL_GPL(srcu_drive_gp);
|
|
|
|
+static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
|
|
+{
|
|
+ unsigned short cookie;
|
|
+
|
|
+ cookie = get_state_synchronize_srcu(ssp);
|
|
+ if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
|
|
+ return;
|
|
+ WRITE_ONCE(ssp->srcu_idx_max, cookie);
|
|
+ if (!READ_ONCE(ssp->srcu_gp_running)) {
|
|
+ if (likely(srcu_init_done))
|
|
+ schedule_work(&ssp->srcu_work);
|
|
+ else if (list_empty(&ssp->srcu_work.entry))
|
|
+ list_add(&ssp->srcu_work.entry, &srcu_boot_list);
|
|
+ }
|
|
+}
|
|
+
|
|
/*
|
|
* Enqueue an SRCU callback on the specified srcu_struct structure,
|
|
* initiating grace-period processing if it is not already running.
|
|
@@ -166,12 +186,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
|
|
*ssp->srcu_cb_tail = rhp;
|
|
ssp->srcu_cb_tail = &rhp->next;
|
|
local_irq_restore(flags);
|
|
- if (!READ_ONCE(ssp->srcu_gp_running)) {
|
|
- if (likely(srcu_init_done))
|
|
- schedule_work(&ssp->srcu_work);
|
|
- else if (list_empty(&ssp->srcu_work.entry))
|
|
- list_add(&ssp->srcu_work.entry, &srcu_boot_list);
|
|
- }
|
|
+ srcu_gp_start_if_needed(ssp);
|
|
}
|
|
EXPORT_SYMBOL_GPL(call_srcu);
|
|
|
|
@@ -190,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp)
|
|
}
|
|
EXPORT_SYMBOL_GPL(synchronize_srcu);
|
|
|
|
+/*
|
|
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
|
|
+ */
|
|
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
|
|
+{
|
|
+ unsigned long ret;
|
|
+
|
|
+ barrier();
|
|
+ ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1;
|
|
+ barrier();
|
|
+ return ret & USHRT_MAX;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
|
|
+
|
|
+/*
|
|
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
|
|
+ *
|
|
+ * The difference between this and get_state_synchronize_srcu() is that
|
|
+ * this function ensures that the poll_state_synchronize_srcu() will
|
|
+ * eventually return the value true.
|
|
+ */
|
|
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
|
|
+{
|
|
+ unsigned long ret = get_state_synchronize_srcu(ssp);
|
|
+
|
|
+ srcu_gp_start_if_needed(ssp);
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
|
|
+
|
|
+/*
|
|
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
|
|
+ */
|
|
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
|
|
+{
|
|
+ bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie);
|
|
+
|
|
+ barrier();
|
|
+ return ret;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
|
|
+
|
|
/* Lockdep diagnostics. */
|
|
void __init rcu_scheduler_starting(void)
|
|
{
|
|
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
|
|
index 0f23d20d485a..c5d0c036fab5 100644
|
|
--- a/kernel/rcu/srcutree.c
|
|
+++ b/kernel/rcu/srcutree.c
|
|
@@ -807,6 +807,46 @@ static void srcu_leak_callback(struct rcu_head *rhp)
|
|
{
|
|
}
|
|
|
|
+/*
|
|
+ * Start an SRCU grace period, and also queue the callback if non-NULL.
|
|
+ */
|
|
+static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
|
|
+ struct rcu_head *rhp, bool do_norm)
|
|
+{
|
|
+ unsigned long flags;
|
|
+ int idx;
|
|
+ bool needexp = false;
|
|
+ bool needgp = false;
|
|
+ unsigned long s;
|
|
+ struct srcu_data *sdp;
|
|
+
|
|
+ check_init_srcu_struct(ssp);
|
|
+ idx = srcu_read_lock(ssp);
|
|
+ sdp = raw_cpu_ptr(ssp->sda);
|
|
+ spin_lock_irqsave_rcu_node(sdp, flags);
|
|
+ if (rhp)
|
|
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
|
|
+ rcu_segcblist_advance(&sdp->srcu_cblist,
|
|
+ rcu_seq_current(&ssp->srcu_gp_seq));
|
|
+ s = rcu_seq_snap(&ssp->srcu_gp_seq);
|
|
+ (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
|
|
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
|
|
+ sdp->srcu_gp_seq_needed = s;
|
|
+ needgp = true;
|
|
+ }
|
|
+ if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
|
|
+ sdp->srcu_gp_seq_needed_exp = s;
|
|
+ needexp = true;
|
|
+ }
|
|
+ spin_unlock_irqrestore_rcu_node(sdp, flags);
|
|
+ if (needgp)
|
|
+ srcu_funnel_gp_start(ssp, sdp, s, do_norm);
|
|
+ else if (needexp)
|
|
+ srcu_funnel_exp_start(ssp, sdp->mynode, s);
|
|
+ srcu_read_unlock(ssp, idx);
|
|
+ return s;
|
|
+}
|
|
+
|
|
/*
|
|
* Enqueue an SRCU callback on the srcu_data structure associated with
|
|
* the current CPU and the specified srcu_struct structure, initiating
|
|
@@ -838,14 +878,6 @@ static void srcu_leak_callback(struct rcu_head *rhp)
|
|
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
|
|
rcu_callback_t func, bool do_norm)
|
|
{
|
|
- unsigned long flags;
|
|
- int idx;
|
|
- bool needexp = false;
|
|
- bool needgp = false;
|
|
- unsigned long s;
|
|
- struct srcu_data *sdp;
|
|
-
|
|
- check_init_srcu_struct(ssp);
|
|
if (debug_rcu_head_queue(rhp)) {
|
|
/* Probable double call_srcu(), so leak the callback. */
|
|
WRITE_ONCE(rhp->func, srcu_leak_callback);
|
|
@@ -853,28 +885,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
|
|
return;
|
|
}
|
|
rhp->func = func;
|
|
- idx = srcu_read_lock(ssp);
|
|
- sdp = raw_cpu_ptr(ssp->sda);
|
|
- spin_lock_irqsave_rcu_node(sdp, flags);
|
|
- rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
|
|
- rcu_segcblist_advance(&sdp->srcu_cblist,
|
|
- rcu_seq_current(&ssp->srcu_gp_seq));
|
|
- s = rcu_seq_snap(&ssp->srcu_gp_seq);
|
|
- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
|
|
- if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
|
|
- sdp->srcu_gp_seq_needed = s;
|
|
- needgp = true;
|
|
- }
|
|
- if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
|
|
- sdp->srcu_gp_seq_needed_exp = s;
|
|
- needexp = true;
|
|
- }
|
|
- spin_unlock_irqrestore_rcu_node(sdp, flags);
|
|
- if (needgp)
|
|
- srcu_funnel_gp_start(ssp, sdp, s, do_norm);
|
|
- else if (needexp)
|
|
- srcu_funnel_exp_start(ssp, sdp->mynode, s);
|
|
- srcu_read_unlock(ssp, idx);
|
|
+ (void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
|
|
}
|
|
|
|
/**
|
|
@@ -1003,6 +1014,62 @@ void synchronize_srcu(struct srcu_struct *ssp)
|
|
}
|
|
EXPORT_SYMBOL_GPL(synchronize_srcu);
|
|
|
|
+/**
|
|
+ * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
|
|
+ * @ssp: srcu_struct to provide cookie for.
|
|
+ *
|
|
+ * This function returns a cookie that can be passed to
|
|
+ * poll_state_synchronize_srcu(), which will return true if a full grace
|
|
+ * period has elapsed in the meantime. It is the caller's responsibility
|
|
+ * to make sure that grace period happens, for example, by invoking
|
|
+ * call_srcu() after return from get_state_synchronize_srcu().
|
|
+ */
|
|
+unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
|
|
+{
|
|
+ // Any prior manipulation of SRCU-protected data must happen
|
|
+ // before the load from ->srcu_gp_seq.
|
|
+ smp_mb();
|
|
+ return rcu_seq_snap(&ssp->srcu_gp_seq);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
|
|
+
|
|
+/**
|
|
+ * start_poll_synchronize_srcu - Provide cookie and start grace period
|
|
+ * @ssp: srcu_struct to provide cookie for.
|
|
+ *
|
|
+ * This function returns a cookie that can be passed to
|
|
+ * poll_state_synchronize_srcu(), which will return true if a full grace
|
|
+ * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(),
|
|
+ * this function also ensures that any needed SRCU grace period will be
|
|
+ * started. This convenience does come at a cost in terms of CPU overhead.
|
|
+ */
|
|
+unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
|
|
+{
|
|
+ return srcu_gp_start_if_needed(ssp, NULL, true);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
|
|
+
|
|
+/**
|
|
+ * poll_state_synchronize_srcu - Has cookie's grace period ended?
|
|
+ * @ssp: srcu_struct to check the cookie against.
|
|
+ * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
|
|
+ *
|
|
+ * This function takes the cookie that was returned from either
|
|
+ * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
|
|
+ * returns %true if an SRCU grace period elapsed since the time that the
|
|
+ * cookie was created.
|
|
+ */
|
|
+bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
|
|
+{
|
|
+ if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie))
|
|
+ return false;
|
|
+ // Ensure that the end of the SRCU grace period happens before
|
|
+ // any subsequent code that the caller might execute.
|
|
+ smp_mb(); // ^^^
|
|
+ return true;
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
|
|
+
|
|
/*
|
|
* Callback function for srcu_barrier() use.
|
|
*/
|
|
diff --git a/lib/Kconfig b/lib/Kconfig
|
|
index 46806332a8cc..52124db873a5 100644
|
|
--- a/lib/Kconfig
|
|
+++ b/lib/Kconfig
|
|
@@ -461,6 +461,9 @@ config ASSOCIATIVE_ARRAY
|
|
|
|
for more information.
|
|
|
|
+config CLOSURES
|
|
+ bool
|
|
+
|
|
config HAS_IOMEM
|
|
bool
|
|
depends on !NO_IOMEM
|
|
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
|
|
index 7937265ef879..d4582113b0d0 100644
|
|
--- a/lib/Kconfig.debug
|
|
+++ b/lib/Kconfig.debug
|
|
@@ -1559,6 +1559,15 @@ config DEBUG_CREDENTIALS
|
|
|
|
source "kernel/rcu/Kconfig.debug"
|
|
|
|
+config DEBUG_CLOSURES
|
|
+ bool "Debug closures (bcache async widgits)"
|
|
+ depends on CLOSURES
|
|
+ select DEBUG_FS
|
|
+ help
|
|
+ Keeps all active closures in a linked list and provides a debugfs
|
|
+ interface to list them, which makes it possible to see asynchronous
|
|
+ operations that get stuck.
|
|
+
|
|
config DEBUG_WQ_FORCE_RR_CPU
|
|
bool "Force round-robin CPU selection for unbound work items"
|
|
depends on DEBUG_KERNEL
|
|
diff --git a/lib/Makefile b/lib/Makefile
|
|
index afeff05fa8c5..eeb931232661 100644
|
|
--- a/lib/Makefile
|
|
+++ b/lib/Makefile
|
|
@@ -238,6 +238,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
|
|
|
|
obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
|
|
|
|
+obj-$(CONFIG_CLOSURES) += closure.o
|
|
+
|
|
obj-$(CONFIG_DQL) += dynamic_queue_limits.o
|
|
|
|
obj-$(CONFIG_GLOB) += glob.o
|
|
diff --git a/lib/closure.c b/lib/closure.c
|
|
new file mode 100644
|
|
index 000000000000..b38ded00b9b0
|
|
--- /dev/null
|
|
+++ b/lib/closure.c
|
|
@@ -0,0 +1,204 @@
|
|
+// SPDX-License-Identifier: GPL-2.0
|
|
+/*
|
|
+ * Asynchronous refcounty things
|
|
+ *
|
|
+ * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
|
|
+ * Copyright 2012 Google, Inc.
|
|
+ */
|
|
+
|
|
+#include <linux/closure.h>
|
|
+#include <linux/debugfs.h>
|
|
+#include <linux/export.h>
|
|
+#include <linux/seq_file.h>
|
|
+#include <linux/sched/debug.h>
|
|
+
|
|
+static inline void closure_put_after_sub(struct closure *cl, int flags)
|
|
+{
|
|
+ int r = flags & CLOSURE_REMAINING_MASK;
|
|
+
|
|
+ BUG_ON(flags & CLOSURE_GUARD_MASK);
|
|
+ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
|
|
+
|
|
+ if (!r) {
|
|
+ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
|
|
+ atomic_set(&cl->remaining,
|
|
+ CLOSURE_REMAINING_INITIALIZER);
|
|
+ closure_queue(cl);
|
|
+ } else {
|
|
+ struct closure *parent = cl->parent;
|
|
+ closure_fn *destructor = cl->fn;
|
|
+
|
|
+ closure_debug_destroy(cl);
|
|
+
|
|
+ if (destructor)
|
|
+ destructor(cl);
|
|
+
|
|
+ if (parent)
|
|
+ closure_put(parent);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* For clearing flags with the same atomic op as a put */
|
|
+void closure_sub(struct closure *cl, int v)
|
|
+{
|
|
+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
|
|
+}
|
|
+EXPORT_SYMBOL(closure_sub);
|
|
+
|
|
+/*
|
|
+ * closure_put - decrement a closure's refcount
|
|
+ */
|
|
+void closure_put(struct closure *cl)
|
|
+{
|
|
+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
|
|
+}
|
|
+EXPORT_SYMBOL(closure_put);
|
|
+
|
|
+/*
|
|
+ * __closure_wake_up - wake up all closures on a wait list, without memory barrier
|
|
+ */
|
|
+void __closure_wake_up(struct closure_waitlist *wait_list)
|
|
+{
|
|
+ struct llist_node *list;
|
|
+ struct closure *cl, *t;
|
|
+ struct llist_node *reverse = NULL;
|
|
+
|
|
+ list = llist_del_all(&wait_list->list);
|
|
+
|
|
+ /* We first reverse the list to preserve FIFO ordering and fairness */
|
|
+ reverse = llist_reverse_order(list);
|
|
+
|
|
+ /* Then do the wakeups */
|
|
+ llist_for_each_entry_safe(cl, t, reverse, list) {
|
|
+ closure_set_waiting(cl, 0);
|
|
+ closure_sub(cl, CLOSURE_WAITING + 1);
|
|
+ }
|
|
+}
|
|
+EXPORT_SYMBOL(__closure_wake_up);
|
|
+
|
|
+/**
|
|
+ * closure_wait - add a closure to a waitlist
|
|
+ * @waitlist: will own a ref on @cl, which will be released when
|
|
+ * closure_wake_up() is called on @waitlist.
|
|
+ * @cl: closure pointer.
|
|
+ *
|
|
+ */
|
|
+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
|
|
+{
|
|
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
|
|
+ return false;
|
|
+
|
|
+ closure_set_waiting(cl, _RET_IP_);
|
|
+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
|
|
+ llist_add(&cl->list, &waitlist->list);
|
|
+
|
|
+ return true;
|
|
+}
|
|
+EXPORT_SYMBOL(closure_wait);
|
|
+
|
|
+struct closure_syncer {
|
|
+ struct task_struct *task;
|
|
+ int done;
|
|
+};
|
|
+
|
|
+static void closure_sync_fn(struct closure *cl)
|
|
+{
|
|
+ struct closure_syncer *s = cl->s;
|
|
+ struct task_struct *p;
|
|
+
|
|
+ rcu_read_lock();
|
|
+ p = READ_ONCE(s->task);
|
|
+ s->done = 1;
|
|
+ wake_up_process(p);
|
|
+ rcu_read_unlock();
|
|
+}
|
|
+
|
|
+void __sched __closure_sync(struct closure *cl)
|
|
+{
|
|
+ struct closure_syncer s = { .task = current };
|
|
+
|
|
+ cl->s = &s;
|
|
+ continue_at(cl, closure_sync_fn, NULL);
|
|
+
|
|
+ while (1) {
|
|
+ set_current_state(TASK_UNINTERRUPTIBLE);
|
|
+ if (s.done)
|
|
+ break;
|
|
+ schedule();
|
|
+ }
|
|
+
|
|
+ __set_current_state(TASK_RUNNING);
|
|
+}
|
|
+EXPORT_SYMBOL(__closure_sync);
|
|
+
|
|
+#ifdef CONFIG_DEBUG_CLOSURES
|
|
+
|
|
+static LIST_HEAD(closure_list);
|
|
+static DEFINE_SPINLOCK(closure_list_lock);
|
|
+
|
|
+void closure_debug_create(struct closure *cl)
|
|
+{
|
|
+ unsigned long flags;
|
|
+
|
|
+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
|
|
+ cl->magic = CLOSURE_MAGIC_ALIVE;
|
|
+
|
|
+ spin_lock_irqsave(&closure_list_lock, flags);
|
|
+ list_add(&cl->all, &closure_list);
|
|
+ spin_unlock_irqrestore(&closure_list_lock, flags);
|
|
+}
|
|
+EXPORT_SYMBOL(closure_debug_create);
|
|
+
|
|
+void closure_debug_destroy(struct closure *cl)
|
|
+{
|
|
+ unsigned long flags;
|
|
+
|
|
+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
|
|
+ cl->magic = CLOSURE_MAGIC_DEAD;
|
|
+
|
|
+ spin_lock_irqsave(&closure_list_lock, flags);
|
|
+ list_del(&cl->all);
|
|
+ spin_unlock_irqrestore(&closure_list_lock, flags);
|
|
+}
|
|
+EXPORT_SYMBOL(closure_debug_destroy);
|
|
+
|
|
+static int debug_show(struct seq_file *f, void *data)
|
|
+{
|
|
+ struct closure *cl;
|
|
+
|
|
+ spin_lock_irq(&closure_list_lock);
|
|
+
|
|
+ list_for_each_entry(cl, &closure_list, all) {
|
|
+ int r = atomic_read(&cl->remaining);
|
|
+
|
|
+ seq_printf(f, "%p: %pS -> %pS p %p r %i ",
|
|
+ cl, (void *) cl->ip, cl->fn, cl->parent,
|
|
+ r & CLOSURE_REMAINING_MASK);
|
|
+
|
|
+ seq_printf(f, "%s%s\n",
|
|
+ test_bit(WORK_STRUCT_PENDING_BIT,
|
|
+ work_data_bits(&cl->work)) ? "Q" : "",
|
|
+ r & CLOSURE_RUNNING ? "R" : "");
|
|
+
|
|
+ if (r & CLOSURE_WAITING)
|
|
+ seq_printf(f, " W %pS\n",
|
|
+ (void *) cl->waiting_on);
|
|
+
|
|
+ seq_puts(f, "\n");
|
|
+ }
|
|
+
|
|
+ spin_unlock_irq(&closure_list_lock);
|
|
+ return 0;
|
|
+}
|
|
+
|
|
+DEFINE_SHOW_ATTRIBUTE(debug);
|
|
+
|
|
+static int __init closure_debug_init(void)
|
|
+{
|
|
+ debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops);
|
|
+ return 0;
|
|
+}
|
|
+late_initcall(closure_debug_init)
|
|
+
|
|
+#endif
|
|
diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c
|
|
index f25eb111c051..7dfa88282b00 100644
|
|
--- a/lib/generic-radix-tree.c
|
|
+++ b/lib/generic-radix-tree.c
|
|
@@ -166,6 +166,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
|
|
struct genradix_root *r;
|
|
struct genradix_node *n;
|
|
unsigned level, i;
|
|
+
|
|
+ if (iter->offset == SIZE_MAX)
|
|
+ return NULL;
|
|
+
|
|
restart:
|
|
r = READ_ONCE(radix->root);
|
|
if (!r)
|
|
@@ -184,10 +188,17 @@ void *__genradix_iter_peek(struct genradix_iter *iter,
|
|
(GENRADIX_ARY - 1);
|
|
|
|
while (!n->children[i]) {
|
|
+ size_t objs_per_ptr = genradix_depth_size(level);
|
|
+
|
|
+ if (iter->offset + objs_per_ptr < iter->offset) {
|
|
+ iter->offset = SIZE_MAX;
|
|
+ iter->pos = SIZE_MAX;
|
|
+ return NULL;
|
|
+ }
|
|
+
|
|
i++;
|
|
- iter->offset = round_down(iter->offset +
|
|
- genradix_depth_size(level),
|
|
- genradix_depth_size(level));
|
|
+ iter->offset = round_down(iter->offset + objs_per_ptr,
|
|
+ objs_per_ptr);
|
|
iter->pos = (iter->offset >> PAGE_SHIFT) *
|
|
objs_per_page;
|
|
if (i == GENRADIX_ARY)
|
|
diff --git a/mm/filemap.c b/mm/filemap.c
|
|
index aa0e0fb04670..669a00196730 100644
|
|
--- a/mm/filemap.c
|
|
+++ b/mm/filemap.c
|
|
@@ -2023,6 +2023,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
|
|
|
|
return ret;
|
|
}
|
|
+EXPORT_SYMBOL(find_get_pages_range);
|
|
|
|
/**
|
|
* find_get_pages_contig - gang contiguous pagecache lookup
|
|
diff --git a/mm/nommu.c b/mm/nommu.c
|
|
index 870fea12823e..3b3394cb8076 100644
|
|
--- a/mm/nommu.c
|
|
+++ b/mm/nommu.c
|
|
@@ -290,6 +290,24 @@ void *vzalloc_node(unsigned long size, int node)
|
|
}
|
|
EXPORT_SYMBOL(vzalloc_node);
|
|
|
|
+/**
|
|
+ * vmalloc_exec - allocate virtually contiguous, executable memory
|
|
+ * @size: allocation size
|
|
+ *
|
|
+ * Kernel-internal function to allocate enough pages to cover @size
|
|
+ * from the page level allocator and map them into contiguous and
|
|
+ * executable kernel virtual space.
|
|
+ *
|
|
+ * For tight control over page level allocator and protection flags
|
|
+ * use __vmalloc() instead.
|
|
+ */
|
|
+
|
|
+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
|
|
+{
|
|
+ return __vmalloc(size, gfp_mask);
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(vmalloc_exec);
|
|
+
|
|
/**
|
|
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
|
|
* @size: allocation size
|
|
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
|
|
index e6f352bf0498..f3577549d56c 100644
|
|
--- a/mm/vmalloc.c
|
|
+++ b/mm/vmalloc.c
|
|
@@ -2730,6 +2730,27 @@ void *vzalloc_node(unsigned long size, int node)
|
|
}
|
|
EXPORT_SYMBOL(vzalloc_node);
|
|
|
|
+/**
|
|
+ * vmalloc_exec - allocate virtually contiguous, executable memory
|
|
+ * @size: allocation size
|
|
+ *
|
|
+ * Kernel-internal function to allocate enough pages to cover @size
|
|
+ * from the page level allocator and map them into contiguous and
|
|
+ * executable kernel virtual space.
|
|
+ *
|
|
+ * For tight control over page level allocator and protection flags
|
|
+ * use __vmalloc() instead.
|
|
+ *
|
|
+ * Return: pointer to the allocated memory or %NULL on error
|
|
+ */
|
|
+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
|
|
+{
|
|
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
|
|
+ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
|
|
+ NUMA_NO_NODE, __builtin_return_address(0));
|
|
+}
|
|
+EXPORT_SYMBOL_GPL(vmalloc_exec);
|
|
+
|
|
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
|
|
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
|
|
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
|