On large files, fuse_invalidate_files() can take a very long time to complete.
This is caused by two slow operations that cannot be optimized:
 - filemap_write_and_wait() when the file is under heavy write load, and
 - invalidate_inode_pages2() when the page cache is heavily populated.

These long delays block the userspace event loop (evloop), which must never
block, and in the worst case can trigger a shaman reboot.

To fix this, the following changes are made:

1. Move the execution of filemap_write_and_wait() and invalidate_inode_pages2()
   into a dedicated kernel workqueue item.

2. In fuse_invalidate_files(), only set the FUSE_I_INVAL_FILES bit in fi->state
   and schedule the invalidation work for the fuse_inode.

3. Block new opens of the file while the FUSE_I_INVAL_FILES bit is set.
   The bit is cleared only after the file has been fully invalidated.
   This is necessary because userspace views the file as fully invalidated
   as soon as fuse_invalidate_files() returns.

Additionally, make the fuse trace function available in the fuse module so
that fuse_invalidate_files events can be traced and logged.

Related to
https://virtuozzo.atlassian.net/browse/VSTOR-124254

Signed-off-by: Liu Kui <[email protected]>
---
 fs/fuse/dev.c                      |   2 +-
 fs/fuse/file.c                     |  30 ++++++--
 fs/fuse/fuse_i.h                   |  22 +++++-
 fs/fuse/inode.c                    | 114 ++++++++++++++++++++++++-----
 fs/fuse/kio/pcs/pcs_fuse_kdirect.c |  32 ++++++--
 5 files changed, 165 insertions(+), 35 deletions(-)

diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index c1102069d032..4fcfd644dcf6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -110,7 +110,7 @@ static bool fuse_block_alloc(struct fuse_conn *fc, bool 
for_background)
        return !fc->initialized || (for_background && fc->blocked);
 }
 
-static void fuse_drop_waiting(struct fuse_conn *fc)
+void fuse_drop_waiting(struct fuse_conn *fc)
 {
        /*
         * lockess check of fc->connected is okay, because atomic_dec_and_test()
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 0860996c19ad..7bebe03dda5b 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -252,10 +252,11 @@ static void fuse_link_rw_file(struct file *file)
        struct fuse_file *ff = file->private_data;
 
        spin_lock(&fi->lock);
-       if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
+       if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
                spin_lock(&ff->lock);
                set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
                spin_unlock(&ff->lock);
+               fuse_ktrace(ff->fm->fc, "fuse_file[%llu] --> invalidate_file on 
[%llu] pending", ff->fh, ff->nodeid);
        }
        if (list_empty(&ff->rw_entry))
                list_add(&ff->rw_entry, &fi->rw_files);
@@ -319,6 +320,13 @@ static int fuse_open(struct inode *inode, struct file 
*file)
        if ((file->f_flags & O_DIRECT) && !fc->direct_enable)
                return -EINVAL;
 
+       if (unlikely(test_bit(FUSE_I_INVAL_FILES, &fi->state))) {
+               fuse_ktrace(fc, "waiting for invalidate_file on [%llu] to 
complete", fi->nodeid);
+               err = wait_on_bit(&fi->state, FUSE_I_INVAL_FILES, 
TASK_KILLABLE);
+               if (err)
+                       return err;
+       }
+
        err = generic_file_open(inode, file);
        if (err)
                return err;
@@ -361,8 +369,6 @@ static int fuse_open(struct inode *inode, struct file *file)
                inode_unlock(inode);
 
        if (!err && fc->close_wait) {
-               struct fuse_inode *fi = get_fuse_inode(inode);
-
                inode_lock(inode);
                spin_lock(&fi->lock);
 
@@ -1409,6 +1415,12 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, 
struct iov_iter *to)
                        return err;
        }
 
+       /*
+        * Block read if the file had been invalidated.
+        */
+       if (fuse_file_fail_immediately(iocb->ki_filp->private_data))
+               return -EIO;
+
        return generic_file_read_iter(iocb, to);
 }
 
@@ -1794,6 +1806,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
                        goto writethrough;
                }
 
+               /*
+                * Block write if the file had been invalidated.
+                */
+               if (fuse_file_fail_immediately(file->private_data))
+                       return -EIO;
+
                return generic_file_write_iter(iocb, from);
        }
 
@@ -2704,13 +2722,9 @@ static int fuse_writepages_fill(struct folio *folio,
 
        BUG_ON(wpa && !data->ff);
 
-       /* More than optimization: writeback pages to /dev/null; fused would
-        * drop our FUSE_WRITE requests anyway, but it will be blocked while
-        * sending NOTIFY_INVAL_FILES until we return!
-        */
        if (!wpa && test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
                unlock_page(&folio->page);
-               return 0;
+               return -EIO;
        }
 
        if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 853bf12e282d..35222f48cb5b 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -215,6 +215,9 @@ struct fuse_inode {
                atomic_t read_count;
                atomic_t write_count;
        } dio;
+
+       /** Entry on fc->inval_files_list list */
+       struct list_head inval_files_entry;
 };
 
 /** FUSE inode state bits */
@@ -1110,7 +1113,13 @@ struct fuse_conn {
        } kio;
 
        int ktrace_level;
-       struct fuse_ktrace * ktrace;
+       struct fuse_ktrace *ktrace;
+       void (*fuse_ktrace_fn)(struct fuse_conn *fc, const char *fmt, ...);
+
+       /* List of fuse_inodes to be invalidated by userspace */
+       struct list_head inval_files_list;
+       struct delayed_work inval_files_work;
+
        struct dentry *conn_ctl;
 
        /* New writepages go into this bucket */
@@ -1122,6 +1131,14 @@ struct fuse_conn {
 #endif
 };
 
+/*
+ * Emit a trace message, prefixed with the calling function's name, through
+ * fc->fuse_ktrace_fn; does nothing when no trace function is installed.
+ * Expands to a single do { } while (0) statement (no trailing semicolon),
+ * so "if (x) fuse_ktrace(...); else ..." parses as intended.
+ */
+#define fuse_ktrace(fc, fmt, args...) \
+       do { \
+               struct fuse_conn *__fc = (fc); \
+               if (__fc->fuse_ktrace_fn) \
+                       __fc->fuse_ktrace_fn(__fc, "%s: " fmt, __func__, ## args); \
+       } while (0)
+
 /*
  * Represents a mounted filesystem, potentially a submount.
  *
@@ -1552,7 +1569,7 @@ static inline void fuse_dio_wait(struct fuse_inode *fi)
 
 static inline bool fuse_file_fail_immediately(struct fuse_file *ff)
 {
-       return ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+       return unlikely(ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
 }
 
 /**
@@ -1717,6 +1734,7 @@ void fuse_file_release(struct inode *inode, struct 
fuse_file *ff,
 
 struct fuse_kio_ops *fuse_kio_get(struct fuse_conn *fc, char *name);
 void fuse_kio_put(struct fuse_kio_ops *ops);
+void fuse_drop_waiting(struct fuse_conn *fc);
 
 /* passthrough.c */
 static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f167d275885b..2e6cf9edb04e 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -35,6 +35,8 @@ struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 EXPORT_SYMBOL_GPL(fuse_mutex);
 
+struct workqueue_struct *fuse_inval_files_wq;
+
 static int fuse_ve_odirect;
 
 static int set_global_limit(const char *val, const struct kernel_param *kp);
@@ -117,6 +119,7 @@ static struct inode *fuse_alloc_inode(struct super_block 
*sb)
        fi->i_size_unstable = 0;
        fi->private = NULL;
        INIT_LIST_HEAD(&fi->rw_files);
+       INIT_LIST_HEAD(&fi->inval_files_entry);
        mutex_init(&fi->mutex);
        spin_lock_init(&fi->lock);
        init_waitqueue_head(&fi->dio.waitq);
@@ -603,12 +606,81 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
                mutex_unlock(&get_fuse_inode(inode)->mutex);
 }
 
+/*
+ * Deferred part of fuse_invalidate_files(), queued on fuse_inval_files_wq.
+ *
+ * Takes over every fuse_inode queued on fc->inval_files_list and, for each
+ * one, writes back and drops its page cache, invalidates its attributes,
+ * clears FUSE_I_INVAL_FILES, wakes the waiters on that bit and calls iput().
+ * If the connection was still connected when this run started, inodes whose
+ * writeback (any error except -EIO) or page invalidation failed are put back
+ * on fc->inval_files_list and the work is re-queued with a one-jiffy delay.
+ * fuse_drop_waiting() is called unless this run managed to re-queue itself.
+ */
+static void fuse_inval_files_work(struct work_struct *w)
+{
+       struct fuse_conn *fc = container_of(w, struct fuse_conn, inval_files_work.work);
+       struct list_head inval_files_list;
+       struct list_head failed_list;
+       struct fuse_file *ff;
+       struct fuse_inode *fi;
+       bool to_retry;
+       int err;
+
+       INIT_LIST_HEAD(&inval_files_list);
+       INIT_LIST_HEAD(&failed_list);
+
+       /* Detach the pending inodes so they can be processed without fc->lock */
+       spin_lock(&fc->lock);
+       list_splice_init(&fc->inval_files_list, &inval_files_list);
+       to_retry = fc->connected;
+       spin_unlock(&fc->lock);
+
+       while (!list_empty(&inval_files_list)) {
+               u64 nodeid;
+
+               fi = list_first_entry(&inval_files_list, struct fuse_inode, inval_files_entry);
+               list_del(&fi->inval_files_entry);
+               nodeid = get_node_id(&fi->inode) - FUSE_ROOT_ID;
+               fuse_ktrace(fc, "invalidate_file on [%llu] starts", nodeid);
+
+               /* -EIO is tolerated here; any other error is retried while connected */
+               err = filemap_write_and_wait(fi->inode.i_mapping);
+               if (err && err != -EIO && to_retry) {
+                       fuse_ktrace(fc, "filemap_write_and_wait() on [%llu] returns err=%d", nodeid, err);
+                       list_add_tail(&fi->inval_files_entry, &failed_list);
+                       continue;
+               }
+
+               spin_lock(&fi->lock);
+               list_for_each_entry(ff, &fi->rw_files, rw_entry)
+                       fuse_revoke_readpages(ff);
+               spin_unlock(&fi->lock);
+
+               wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
+
+               err = invalidate_inode_pages2(fi->inode.i_mapping);
+               if (err && to_retry) {
+                       fuse_ktrace(fc, "invalidate_inode_pages2() on [%llu] returns err=%d", nodeid, err);
+                       list_add_tail(&fi->inval_files_entry, &failed_list);
+                       continue;
+               }
+
+               fuse_invalidate_attr(&fi->inode);
+
+               spin_lock(&fi->lock);
+               clear_bit(FUSE_I_INVAL_FILES, &fi->state);
+               /*
+                * clear_bit() is unordered and wake_up_bit() only wakes if it
+                * sees an active waitqueue, so a full barrier is required in
+                * between; without it a task sleeping in wait_on_bit() can
+                * miss the wakeup.
+                */
+               smp_mb__after_atomic();
+               wake_up_bit(&fi->state, FUSE_I_INVAL_FILES);
+               spin_unlock(&fi->lock);
+
+               fuse_ktrace(fc, "invalidate_file on [%llu] ends", nodeid);
+               iput(&fi->inode);
+       }
+
+       if (!list_empty(&failed_list)) {
+               spin_lock(&fc->lock);
+               list_splice_init(&failed_list, &fc->inval_files_list);
+               spin_unlock(&fc->lock);
+               /* A successful re-queue keeps the num_waiting count for the next run */
+               if (queue_delayed_work(fuse_inval_files_wq, &fc->inval_files_work, 1))
+                       return;
+       }
+
+       fuse_drop_waiting(fc);
+}
+
 int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 {
        struct inode *inode;
        struct fuse_inode *fi;
        struct fuse_file *ff;
-       int err, i;
+       int i;
 
        if (!fc->async_read) {
                printk(KERN_ERR "Turn async_read ON to use "
@@ -624,6 +696,11 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
 
        /* Mark that invalidate files is in progress */
        spin_lock(&fi->lock);
+       if (test_bit(FUSE_I_INVAL_FILES, &fi->state)) {
+               spin_unlock(&fi->lock);
+               iput(inode);
+               return 0;
+       }
        set_bit(FUSE_I_INVAL_FILES, &fi->state);
        list_for_each_entry(ff, &fi->rw_files, rw_entry) {
                spin_lock(&ff->lock);
@@ -638,23 +715,14 @@ int fuse_invalidate_files(struct fuse_conn *fc, u64 
nodeid)
        for (i = 0; i < FUSE_QHASH_SIZE; i++)
                wake_up_all(&fc->qhash[i].waitq);
 
-       err = filemap_write_and_wait(inode->i_mapping);
-       if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
-               spin_lock(&fi->lock);
-               list_for_each_entry(ff, &fi->rw_files, rw_entry)
-                       fuse_revoke_readpages(ff);
-               spin_unlock(&fi->lock);
-
-               wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
-               err = invalidate_inode_pages2(inode->i_mapping);
-       }
-
-       if (!err)
-               fuse_invalidate_attr(inode);
+       atomic_inc(&fc->num_waiting);
+       spin_lock(&fc->lock);
+       list_add_tail(&fi->inval_files_entry, &fc->inval_files_list);
+       spin_unlock(&fc->lock);
+       if (!queue_delayed_work(fuse_inval_files_wq, &fc->inval_files_work, 0))
+               fuse_drop_waiting(fc);
 
-       clear_bit(FUSE_I_INVAL_FILES, &fi->state);
-       iput(inode);
-       return err;
+       return 0;
 }
 
 static void fuse_umount_begin(struct super_block *sb)
@@ -1308,6 +1376,9 @@ int fuse_conn_init(struct fuse_conn *fc, struct 
fuse_mount *fm,
        if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
                fuse_backing_files_init(fc);
 
+       INIT_LIST_HEAD(&fc->inval_files_list);
+       INIT_DELAYED_WORK(&fc->inval_files_work, fuse_inval_files_work);
+
        INIT_LIST_HEAD(&fc->mounts);
        list_add(&fm->fc_entry, &fc->mounts);
        fm->fc = fc;
@@ -2456,13 +2527,17 @@ static int __init fuse_fs_init(void)
 {
        int err;
 
+       /* err must be set before the first goto: the "out" label returns it */
+       err = -ENOMEM;
+       fuse_inval_files_wq = alloc_workqueue("fuse_inval_files_wq", WQ_MEM_RECLAIM, 1);
+       if (!fuse_inval_files_wq)
+               goto out;
+
        fuse_inode_cachep = kmem_cache_create("fuse_inode",
                        sizeof(struct fuse_inode), 0,
                        SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT,
                        fuse_inode_init_once);
        err = -ENOMEM;
        if (!fuse_inode_cachep)
-               goto out;
+               goto out1;
 
        err = register_fuseblk();
        if (err)
@@ -2478,6 +2553,8 @@ static int __init fuse_fs_init(void)
        unregister_fuseblk();
  out2:
        kmem_cache_destroy(fuse_inode_cachep);
+ out1:
+       destroy_workqueue(fuse_inval_files_wq);
  out:
        return err;
 }
@@ -2493,6 +2570,7 @@ static void fuse_fs_cleanup(void)
         */
        rcu_barrier();
        kmem_cache_destroy(fuse_inode_cachep);
+       destroy_workqueue(fuse_inval_files_wq);
 }
 
 static struct kobject *fuse_kobj;
diff --git a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c 
b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
index eafe2ee2313b..42cdca250cd9 100644
--- a/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
+++ b/fs/fuse/kio/pcs/pcs_fuse_kdirect.c
@@ -158,6 +158,7 @@ MODULE_PARM_DESC(rdmaio_io_failing, "Enable/Disbale RDMA io 
failing");
 
 static int fuse_ktrace_setup(struct fuse_conn * fc);
 static int fuse_ktrace_remove(struct fuse_conn *fc);
+static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...);
 
 static struct kmem_cache *pcs_fuse_req_cachep;
 static struct kmem_cache *pcs_ireq_cachep;
@@ -1672,6 +1673,8 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
                goto err;
        }
 
+       fc->fuse_ktrace_fn = kfuse_trace;
+
        return 0;
 
 err:
@@ -1680,22 +1683,19 @@ static int fuse_ktrace_setup(struct fuse_conn * fc)
        return ret;
 }
 
-void __kfuse_trace(struct fuse_conn * fc, unsigned long ip, const char * fmt, 
...)
+static void kfuse_tracer(struct fuse_conn *fc, unsigned long ip, const char 
*fmt, va_list va)
 {
-       struct fuse_ktrace * tr;
-        va_list va;
+       struct fuse_ktrace *tr;
        int cpu;
 
        cpu = get_cpu();
        tr = fc->ktrace;
        if (tr) {
                u8 * buf = per_cpu_ptr(tr->buf, cpu);
-               struct fuse_trace_hdr * t;
+               struct fuse_trace_hdr *t;
                int len;
 
-               va_start(va, fmt);
                len = vsnprintf(buf, KTRACE_LOG_BUF_SIZE, fmt, va);
-               va_end(va);
                t = fuse_trace_prepare(tr, FUSE_KTRACE_STRING, len + 1);
                if (t)
                        memcpy(t + 1, buf, len + 1);
@@ -1710,6 +1710,26 @@ void __kfuse_trace(struct fuse_conn * fc, unsigned long 
ip, const char * fmt, ..
        put_cpu();
 }
 
+/* Variadic front end: collects the arguments and hands them to kfuse_tracer(). */
+void __kfuse_trace(struct fuse_conn *fc, unsigned long ip, const char *fmt, ...)
+{
+       va_list ap;
+
+       va_start(ap, fmt);
+       kfuse_tracer(fc, ip, fmt, ap);
+       va_end(ap);
+}
+
+/*
+ * Variadic tracer without a caller ip (passes 0 to kfuse_tracer()).
+ * Messages are dropped when fc->ktrace_level is below LOG_TRACE.
+ */
+static void kfuse_trace(struct fuse_conn *fc, const char *fmt, ...)
+{
+       va_list ap;
+
+       if (fc->ktrace_level < LOG_TRACE)
+               return;
+
+       va_start(ap, fmt);
+       kfuse_tracer(fc, 0, fmt, ap);
+       va_end(ap);
+}
+
 void pcs_kio_file_list(struct fuse_conn *fc, kio_file_itr kfile_cb, void *ctx)
 {
        struct fuse_file *ff;
-- 
2.39.5 (Apple Git-154)

_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to