summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMike Pagano <mpagano@gentoo.org>2024-06-21 10:08:28 -0400
committerMike Pagano <mpagano@gentoo.org>2024-06-21 10:08:28 -0400
commit2513a1c96d3b8ccb88724b6b60f950810bea91b1 (patch)
tree9257bc8b907935b132c9c3a2f227571359ff8b0c
parentLinux 5.10.219 (diff)
downloadlinux-patches-2513a1c96d3b8ccb88724b6b60f950810bea91b1.tar.gz
linux-patches-2513a1c96d3b8ccb88724b6b60f950810bea91b1.tar.bz2
linux-patches-2513a1c96d3b8ccb88724b6b60f950810bea91b1.zip
Linux patch 5.10.220 (tag: 5.10-231)
Signed-off-by: Mike Pagano <mpagano@gentoo.org>
-rw-r--r--0000_README4
-rw-r--r--1219_linux-5.10.220.patch38118
2 files changed, 38122 insertions, 0 deletions
diff --git a/0000_README b/0000_README
index 893a81a9..7461b294 100644
--- a/0000_README
+++ b/0000_README
@@ -919,6 +919,10 @@ Patch: 1218_linux-5.10.219.patch
From: https://www.kernel.org
Desc: Linux 5.10.219
+Patch: 1219_linux-5.10.220.patch
+From: https://www.kernel.org
+Desc: Linux 5.10.220
+
Patch: 1500_XATTR_USER_PREFIX.patch
From: https://bugs.gentoo.org/show_bug.cgi?id=470644
Desc: Support for namespace user.pax.* on tmpfs.
diff --git a/1219_linux-5.10.220.patch b/1219_linux-5.10.220.patch
new file mode 100644
index 00000000..9b2312a5
--- /dev/null
+++ b/1219_linux-5.10.220.patch
@@ -0,0 +1,38118 @@
+diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst
+index cbf8e57376bf6..bcf84459917f5 100644
+--- a/Documentation/filesystems/files.rst
++++ b/Documentation/filesystems/files.rst
+@@ -62,7 +62,7 @@ the fdtable structure -
+ be held.
+
+ 4. To look up the file structure given an fd, a reader
+- must use either fcheck() or fcheck_files() APIs. These
++ must use either lookup_fd_rcu() or files_lookup_fd_rcu() APIs. These
+ take care of barrier requirements due to lock-free lookup.
+
+ An example::
+@@ -70,7 +70,7 @@ the fdtable structure -
+ struct file *file;
+
+ rcu_read_lock();
+- file = fcheck(fd);
++ file = lookup_fd_rcu(fd);
+ if (file) {
+ ...
+ }
+@@ -84,7 +84,7 @@ the fdtable structure -
+ on ->f_count::
+
+ rcu_read_lock();
+- file = fcheck_files(files, fd);
++ file = files_lookup_fd_rcu(files, fd);
+ if (file) {
+ if (atomic_long_inc_not_zero(&file->f_count))
+ *fput_needed = 1;
+@@ -104,7 +104,7 @@ the fdtable structure -
+ lock-free, they must be installed using rcu_assign_pointer()
+ API. If they are looked up lock-free, rcu_dereference()
+ must be used. However it is advisable to use files_fdtable()
+- and fcheck()/fcheck_files() which take care of these issues.
++ and lookup_fd_rcu()/files_lookup_fd_rcu() which take care of these issues.
+
+ 7. While updating, the fdtable pointer must be looked up while
+ holding files->file_lock. If ->file_lock is dropped, then
+diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
+index fbd695d66905f..23a0d24168bc5 100644
+--- a/Documentation/filesystems/locking.rst
++++ b/Documentation/filesystems/locking.rst
+@@ -433,17 +433,21 @@ prototypes::
+ void (*lm_break)(struct file_lock *); /* break_lease callback */
+ int (*lm_change)(struct file_lock **, int);
+ bool (*lm_breaker_owns_lease)(struct file_lock *);
++ bool (*lm_lock_expirable)(struct file_lock *);
++ void (*lm_expire_lock)(void);
+
+ locking rules:
+
+ ====================== ============= ================= =========
+-ops inode->i_lock blocked_lock_lock may block
++ops flc_lock blocked_lock_lock may block
+ ====================== ============= ================= =========
+-lm_notify: yes yes no
++lm_notify: no yes no
+ lm_grant: no no no
+ lm_break: yes no no
+ lm_change yes no no
+-lm_breaker_owns_lease: no no no
++lm_breaker_owns_lease: yes no no
++lm_lock_expirable yes no no
++lm_expire_lock no no yes
+ ====================== ============= ================= =========
+
+ buffer_head
+diff --git a/Documentation/filesystems/nfs/exporting.rst b/Documentation/filesystems/nfs/exporting.rst
+index 33d588a01ace1..6f59a364f84cd 100644
+--- a/Documentation/filesystems/nfs/exporting.rst
++++ b/Documentation/filesystems/nfs/exporting.rst
+@@ -154,6 +154,11 @@ struct which has the following members:
+ to find potential names, and matches inode numbers to find the correct
+ match.
+
++ flags
++ Some filesystems may need to be handled differently than others. The
++ export_operations struct also includes a flags field that allows the
++ filesystem to communicate such information to nfsd. See the Export
++ Operations Flags section below for more explanation.
+
+ A filehandle fragment consists of an array of 1 or more 4byte words,
+ together with a one byte "type".
+@@ -163,3 +168,76 @@ generated by encode_fh, in which case it will have been padded with
+ nuls. Rather, the encode_fh routine should choose a "type" which
+ indicates the decode_fh how much of the filehandle is valid, and how
+ it should be interpreted.
++
++Export Operations Flags
++-----------------------
++In addition to the operation vector pointers, struct export_operations also
++contains a "flags" field that allows the filesystem to communicate to nfsd
++that it may want to do things differently when dealing with it. The
++following flags are defined:
++
++ EXPORT_OP_NOWCC - disable NFSv3 WCC attributes on this filesystem
++ RFC 1813 recommends that servers always send weak cache consistency
++ (WCC) data to the client after each operation. The server should
++ atomically collect attributes about the inode, do an operation on it,
++ and then collect the attributes afterward. This allows the client to
++ skip issuing GETATTRs in some situations but means that the server
++ is calling vfs_getattr for almost all RPCs. On some filesystems
++ (particularly those that are clustered or networked) this is expensive
++ and atomicity is difficult to guarantee. This flag indicates to nfsd
++ that it should skip providing WCC attributes to the client in NFSv3
++ replies when doing operations on this filesystem. Consider enabling
++ this on filesystems that have an expensive ->getattr inode operation,
++ or when atomicity between pre and post operation attribute collection
++ is impossible to guarantee.
++
++ EXPORT_OP_NOSUBTREECHK - disallow subtree checking on this fs
++ Many NFS operations deal with filehandles, which the server must then
++ vet to ensure that they live inside of an exported tree. When the
++ export consists of an entire filesystem, this is trivial. nfsd can just
++ ensure that the filehandle live on the filesystem. When only part of a
++ filesystem is exported however, then nfsd must walk the ancestors of the
++ inode to ensure that it's within an exported subtree. This is an
++ expensive operation and not all filesystems can support it properly.
++ This flag exempts the filesystem from subtree checking and causes
++ exportfs to get back an error if it tries to enable subtree checking
++ on it.
++
++ EXPORT_OP_CLOSE_BEFORE_UNLINK - always close cached files before unlinking
++ On some exportable filesystems (such as NFS) unlinking a file that
++ is still open can cause a fair bit of extra work. For instance,
++ the NFS client will do a "sillyrename" to ensure that the file
++ sticks around while it's still open. When reexporting, that open
++ file is held by nfsd so we usually end up doing a sillyrename, and
++ then immediately deleting the sillyrenamed file just afterward when
++ the link count actually goes to zero. Sometimes this delete can race
++ with other operations (for instance an rmdir of the parent directory).
++ This flag causes nfsd to close any open files for this inode _before_
++ calling into the vfs to do an unlink or a rename that would replace
++ an existing file.
++
++ EXPORT_OP_REMOTE_FS - Backing storage for this filesystem is remote
++ PF_LOCAL_THROTTLE exists for loopback NFSD, where a thread needs to
++ write to one bdi (the final bdi) in order to free up writes queued
++ to another bdi (the client bdi). Such threads get a private balance
++ of dirty pages so that dirty pages for the client bdi do not imact
++ the daemon writing to the final bdi. For filesystems whose durable
++ storage is not local (such as exported NFS filesystems), this
++ constraint has negative consequences. EXPORT_OP_REMOTE_FS enables
++ an export to disable writeback throttling.
++
++ EXPORT_OP_NOATOMIC_ATTR - Filesystem does not update attributes atomically
++ EXPORT_OP_NOATOMIC_ATTR indicates that the exported filesystem
++ cannot provide the semantics required by the "atomic" boolean in
++ NFSv4's change_info4. This boolean indicates to a client whether the
++ returned before and after change attributes were obtained atomically
++ with the respect to the requested metadata operation (UNLINK,
++ OPEN/CREATE, MKDIR, etc).
++
++ EXPORT_OP_FLUSH_ON_CLOSE - Filesystem flushes file data on close(2)
++ On most filesystems, inodes can remain under writeback after the
++ file is closed. NFSD relies on client activity or local flusher
++ threads to handle writeback. Certain filesystems, such as NFS, flush
++ all of an inode's dirty data on last close. Exports that behave this
++ way should set EXPORT_OP_FLUSH_ON_CLOSE so that NFSD knows to skip
++ waiting for writeback when closing such files.
+diff --git a/Makefile b/Makefile
+index 3b36b77589f2b..9304408d8ace2 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 5
+ PATCHLEVEL = 10
+-SUBLEVEL = 219
++SUBLEVEL = 220
+ EXTRAVERSION =
+ NAME = Dare mighty things
+
+diff --git a/arch/powerpc/platforms/cell/spufs/coredump.c b/arch/powerpc/platforms/cell/spufs/coredump.c
+index 026c181a98c5d..60b5583e9eafc 100644
+--- a/arch/powerpc/platforms/cell/spufs/coredump.c
++++ b/arch/powerpc/platforms/cell/spufs/coredump.c
+@@ -74,7 +74,7 @@ static struct spu_context *coredump_next_context(int *fd)
+ *fd = n - 1;
+
+ rcu_read_lock();
+- file = fcheck(*fd);
++ file = lookup_fd_rcu(*fd);
+ ctx = SPUFS_I(file_inode(file))->i_ctx;
+ get_spu_context(ctx);
+ rcu_read_unlock();
+diff --git a/crypto/algboss.c b/crypto/algboss.c
+index 5ebccbd6b74ed..b87f907bb1428 100644
+--- a/crypto/algboss.c
++++ b/crypto/algboss.c
+@@ -74,7 +74,7 @@ static int cryptomgr_probe(void *data)
+ complete_all(&param->larval->completion);
+ crypto_alg_put(&param->larval->alg);
+ kfree(param);
+- module_put_and_exit(0);
++ module_put_and_kthread_exit(0);
+ }
+
+ static int cryptomgr_schedule_probe(struct crypto_larval *larval)
+@@ -209,7 +209,7 @@ static int cryptomgr_test(void *data)
+ crypto_alg_tested(param->driver, err);
+
+ kfree(param);
+- module_put_and_exit(0);
++ module_put_and_kthread_exit(0);
+ }
+
+ static int cryptomgr_schedule_test(struct crypto_alg *alg)
+diff --git a/fs/Kconfig b/fs/Kconfig
+index da524c4d7b7e0..11b60d160f88f 100644
+--- a/fs/Kconfig
++++ b/fs/Kconfig
+@@ -320,7 +320,7 @@ config LOCKD
+
+ config LOCKD_V4
+ bool
+- depends on NFSD_V3 || NFS_V3
++ depends on NFSD || NFS_V3
+ depends on FILE_LOCKING
+ default y
+
+@@ -333,6 +333,10 @@ config NFS_COMMON
+ depends on NFSD || NFS_FS || LOCKD
+ default y
+
++config NFS_V4_2_SSC_HELPER
++ bool
++ default y if NFS_V4_2
++
+ source "net/sunrpc/Kconfig"
+ source "fs/ceph/Kconfig"
+ source "fs/cifs/Kconfig"
+diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
+index 322b7dfb4ea01..5bf781ea6d676 100644
+--- a/fs/autofs/dev-ioctl.c
++++ b/fs/autofs/dev-ioctl.c
+@@ -4,9 +4,10 @@
+ * Copyright 2008 Ian Kent <raven@themaw.net>
+ */
+
++#include <linux/module.h>
+ #include <linux/miscdevice.h>
+ #include <linux/compat.h>
+-#include <linux/syscalls.h>
++#include <linux/fdtable.h>
+ #include <linux/magic.h>
+ #include <linux/nospec.h>
+
+@@ -289,7 +290,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp,
+ struct autofs_sb_info *sbi,
+ struct autofs_dev_ioctl *param)
+ {
+- return ksys_close(param->ioctlfd);
++ return close_fd(param->ioctlfd);
+ }
+
+ /*
+diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
+index ecc8ecbbfa5ac..7b987de0babe8 100644
+--- a/fs/cachefiles/namei.c
++++ b/fs/cachefiles/namei.c
+@@ -412,9 +412,14 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
+ if (ret < 0) {
+ cachefiles_io_error(cache, "Rename security error %d", ret);
+ } else {
++ struct renamedata rd = {
++ .old_dir = d_inode(dir),
++ .old_dentry = rep,
++ .new_dir = d_inode(cache->graveyard),
++ .new_dentry = grave,
++ };
+ trace_cachefiles_rename(object, rep, grave, why);
+- ret = vfs_rename(d_inode(dir), rep,
+- d_inode(cache->graveyard), grave, NULL, 0);
++ ret = vfs_rename(&rd);
+ if (ret != 0 && ret != -ENOMEM)
+ cachefiles_io_error(cache,
+ "Rename failed with error %d", ret);
+diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
+index 164b985407160..a3c0e6a4e4847 100644
+--- a/fs/cifs/connect.c
++++ b/fs/cifs/connect.c
+@@ -1242,7 +1242,7 @@ cifs_demultiplex_thread(void *p)
+ }
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+- module_put_and_exit(0);
++ module_put_and_kthread_exit(0);
+ }
+
+ /* extract the host portion of the UNC string */
+diff --git a/fs/coredump.c b/fs/coredump.c
+index 9d91e831ed0b2..7b085975ea163 100644
+--- a/fs/coredump.c
++++ b/fs/coredump.c
+@@ -590,7 +590,6 @@ void do_coredump(const kernel_siginfo_t *siginfo)
+ int ispipe;
+ size_t *argv = NULL;
+ int argc = 0;
+- struct files_struct *displaced;
+ /* require nonrelative corefile path and be extra careful */
+ bool need_suid_safe = false;
+ bool core_dumped = false;
+@@ -797,11 +796,9 @@ void do_coredump(const kernel_siginfo_t *siginfo)
+ }
+
+ /* get us an unshared descriptor table; almost always a no-op */
+- retval = unshare_files(&displaced);
++ retval = unshare_files();
+ if (retval)
+ goto close_fail;
+- if (displaced)
+- put_files_struct(displaced);
+ if (!dump_interrupted()) {
+ /*
+ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
+index c867a0d62f360..1dbe0c3ff38ea 100644
+--- a/fs/ecryptfs/inode.c
++++ b/fs/ecryptfs/inode.c
+@@ -598,6 +598,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct dentry *lower_new_dir_dentry;
+ struct dentry *trap;
+ struct inode *target_inode;
++ struct renamedata rd = {};
+
+ if (flags)
+ return -EINVAL;
+@@ -627,9 +628,12 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+ rc = -ENOTEMPTY;
+ goto out_lock;
+ }
+- rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry,
+- d_inode(lower_new_dir_dentry), lower_new_dentry,
+- NULL, 0);
++
++ rd.old_dir = d_inode(lower_old_dir_dentry);
++ rd.old_dentry = lower_old_dentry;
++ rd.new_dir = d_inode(lower_new_dir_dentry);
++ rd.new_dentry = lower_new_dentry;
++ rc = vfs_rename(&rd);
+ if (rc)
+ goto out_lock;
+ if (target_inode)
+diff --git a/fs/exec.c b/fs/exec.c
+index ebe9011955b9b..d5c8f085235bc 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -1264,6 +1264,11 @@ int begin_new_exec(struct linux_binprm * bprm)
+ if (retval)
+ goto out;
+
++ /* Ensure the files table is not shared. */
++ retval = unshare_files();
++ if (retval)
++ goto out;
++
+ /*
+ * Must be called _before_ exec_mmap() as bprm->mm is
+ * not visibile until then. This also enables the update
+@@ -1789,7 +1794,6 @@ static int bprm_execve(struct linux_binprm *bprm,
+ int fd, struct filename *filename, int flags)
+ {
+ struct file *file;
+- struct files_struct *displaced;
+ int retval;
+
+ /*
+@@ -1797,13 +1801,9 @@ static int bprm_execve(struct linux_binprm *bprm,
+ */
+ io_uring_task_cancel();
+
+- retval = unshare_files(&displaced);
+- if (retval)
+- return retval;
+-
+ retval = prepare_bprm_creds(bprm);
+ if (retval)
+- goto out_files;
++ return retval;
+
+ check_unsafe_exec(bprm);
+ current->in_execve = 1;
+@@ -1818,11 +1818,14 @@ static int bprm_execve(struct linux_binprm *bprm,
+ bprm->file = file;
+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+- * inaccessible after exec. Relies on having exclusive access to
+- * current->files (due to unshare_files above).
++ * inaccessible after exec. This allows the code in exec to
++ * choose to fail when the executable is not mmaped into the
++ * interpreter and an open file descriptor is not passed to
++ * the interpreter. This makes for a better user experience
++ * than having the interpreter start and then immediately fail
++ * when it finds the executable is inaccessible.
+ */
+- if (bprm->fdpath &&
+- close_on_exec(fd, rcu_dereference_raw(current->files->fdt)))
++ if (bprm->fdpath && get_close_on_exec(fd))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+
+ /* Set the unchanging part of bprm->cred */
+@@ -1840,8 +1843,6 @@ static int bprm_execve(struct linux_binprm *bprm,
+ rseq_execve(current);
+ acct_update_integrals(current);
+ task_numa_free(current, false);
+- if (displaced)
+- put_files_struct(displaced);
+ return retval;
+
+ out:
+@@ -1858,10 +1859,6 @@ static int bprm_execve(struct linux_binprm *bprm,
+ current->fs->in_exec = 0;
+ current->in_execve = 0;
+
+-out_files:
+- if (displaced)
+- reset_files_struct(displaced);
+-
+ return retval;
+ }
+
+diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
+index 2dd55b172d57f..8c28bd1c9ed94 100644
+--- a/fs/exportfs/expfs.c
++++ b/fs/exportfs/expfs.c
+@@ -18,7 +18,7 @@
+ #include <linux/sched.h>
+ #include <linux/cred.h>
+
+-#define dprintk(fmt, args...) do{}while(0)
++#define dprintk(fmt, args...) pr_debug(fmt, ##args)
+
+
+ static int get_name(const struct path *path, char *name, struct dentry *child);
+@@ -132,8 +132,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
+ inode_unlock(dentry->d_inode);
+
+ if (IS_ERR(parent)) {
+- dprintk("%s: get_parent of %ld failed, err %d\n",
+- __func__, dentry->d_inode->i_ino, PTR_ERR(parent));
++ dprintk("get_parent of %lu failed, err %ld\n",
++ dentry->d_inode->i_ino, PTR_ERR(parent));
+ return parent;
+ }
+
+@@ -147,7 +147,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
+ dprintk("%s: found name: %s\n", __func__, nbuf);
+ tmp = lookup_one_len_unlocked(nbuf, parent, strlen(nbuf));
+ if (IS_ERR(tmp)) {
+- dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
++ dprintk("lookup failed: %ld\n", PTR_ERR(tmp));
+ err = PTR_ERR(tmp);
+ goto out_err;
+ }
+@@ -417,9 +417,11 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
+ }
+ EXPORT_SYMBOL_GPL(exportfs_encode_fh);
+
+-struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
+- int fh_len, int fileid_type,
+- int (*acceptable)(void *, struct dentry *), void *context)
++struct dentry *
++exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len,
++ int fileid_type,
++ int (*acceptable)(void *, struct dentry *),
++ void *context)
+ {
+ const struct export_operations *nop = mnt->mnt_sb->s_export_op;
+ struct dentry *result, *alias;
+@@ -432,10 +434,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
+ if (!nop || !nop->fh_to_dentry)
+ return ERR_PTR(-ESTALE);
+ result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
+- if (PTR_ERR(result) == -ENOMEM)
+- return ERR_CAST(result);
+ if (IS_ERR_OR_NULL(result))
+- return ERR_PTR(-ESTALE);
++ return result;
+
+ /*
+ * If no acceptance criteria was specified by caller, a disconnected
+@@ -561,10 +561,26 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
+
+ err_result:
+ dput(result);
+- if (err != -ENOMEM)
+- err = -ESTALE;
+ return ERR_PTR(err);
+ }
++EXPORT_SYMBOL_GPL(exportfs_decode_fh_raw);
++
++struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
++ int fh_len, int fileid_type,
++ int (*acceptable)(void *, struct dentry *),
++ void *context)
++{
++ struct dentry *ret;
++
++ ret = exportfs_decode_fh_raw(mnt, fid, fh_len, fileid_type,
++ acceptable, context);
++ if (IS_ERR_OR_NULL(ret)) {
++ if (ret == ERR_PTR(-ENOMEM))
++ return ret;
++ return ERR_PTR(-ESTALE);
++ }
++ return ret;
++}
+ EXPORT_SYMBOL_GPL(exportfs_decode_fh);
+
+ MODULE_LICENSE("GPL");
+diff --git a/fs/file.c b/fs/file.c
+index d6bc73960e4ac..fdb84a64724b7 100644
+--- a/fs/file.c
++++ b/fs/file.c
+@@ -175,7 +175,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
+ spin_unlock(&files->file_lock);
+ new_fdt = alloc_fdtable(nr);
+
+- /* make sure all __fd_install() have seen resize_in_progress
++ /* make sure all fd_install() have seen resize_in_progress
+ * or have finished their rcu_read_lock_sched() section.
+ */
+ if (atomic_read(&files->count) > 1)
+@@ -198,7 +198,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
+ rcu_assign_pointer(files->fdt, new_fdt);
+ if (cur_fdt != &files->fdtab)
+ call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
+- /* coupled with smp_rmb() in __fd_install() */
++ /* coupled with smp_rmb() in fd_install() */
+ smp_wmb();
+ return 1;
+ }
+@@ -466,18 +466,6 @@ void put_files_struct(struct files_struct *files)
+ }
+ }
+
+-void reset_files_struct(struct files_struct *files)
+-{
+- struct task_struct *tsk = current;
+- struct files_struct *old;
+-
+- old = tsk->files;
+- task_lock(tsk);
+- tsk->files = files;
+- task_unlock(tsk);
+- put_files_struct(old);
+-}
+-
+ void exit_files(struct task_struct *tsk)
+ {
+ struct files_struct * files = tsk->files;
+@@ -521,9 +509,9 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
+ /*
+ * allocate a file descriptor, mark it busy.
+ */
+-int __alloc_fd(struct files_struct *files,
+- unsigned start, unsigned end, unsigned flags)
++static int alloc_fd(unsigned start, unsigned end, unsigned flags)
+ {
++ struct files_struct *files = current->files;
+ unsigned int fd;
+ int error;
+ struct fdtable *fdt;
+@@ -579,14 +567,9 @@ int __alloc_fd(struct files_struct *files,
+ return error;
+ }
+
+-static int alloc_fd(unsigned start, unsigned flags)
+-{
+- return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
+-}
+-
+ int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
+ {
+- return __alloc_fd(current->files, 0, nofile, flags);
++ return alloc_fd(0, nofile, flags);
+ }
+
+ int get_unused_fd_flags(unsigned flags)
+@@ -625,17 +608,13 @@ EXPORT_SYMBOL(put_unused_fd);
+ * It should never happen - if we allow dup2() do it, _really_ bad things
+ * will follow.
+ *
+- * NOTE: __fd_install() variant is really, really low-level; don't
+- * use it unless you are forced to by truly lousy API shoved down
+- * your throat. 'files' *MUST* be either current->files or obtained
+- * by get_files_struct(current) done by whoever had given it to you,
+- * or really bad things will happen. Normally you want to use
+- * fd_install() instead.
++ * This consumes the "file" refcount, so callers should treat it
++ * as if they had called fput(file).
+ */
+
+-void __fd_install(struct files_struct *files, unsigned int fd,
+- struct file *file)
++void fd_install(unsigned int fd, struct file *file)
+ {
++ struct files_struct *files = current->files;
+ struct fdtable *fdt;
+
+ rcu_read_lock_sched();
+@@ -657,15 +636,6 @@ void __fd_install(struct files_struct *files, unsigned int fd,
+ rcu_read_unlock_sched();
+ }
+
+-/*
+- * This consumes the "file" refcount, so callers should treat it
+- * as if they had called fput(file).
+- */
+-void fd_install(unsigned int fd, struct file *file)
+-{
+- __fd_install(current->files, fd, file);
+-}
+-
+ EXPORT_SYMBOL(fd_install);
+
+ static struct file *pick_file(struct files_struct *files, unsigned fd)
+@@ -689,11 +659,9 @@ static struct file *pick_file(struct files_struct *files, unsigned fd)
+ return file;
+ }
+
+-/*
+- * The same warnings as for __alloc_fd()/__fd_install() apply here...
+- */
+-int __close_fd(struct files_struct *files, unsigned fd)
++int close_fd(unsigned fd)
+ {
++ struct files_struct *files = current->files;
+ struct file *file;
+
+ file = pick_file(files, fd);
+@@ -702,7 +670,7 @@ int __close_fd(struct files_struct *files, unsigned fd)
+
+ return filp_close(file, files);
+ }
+-EXPORT_SYMBOL(__close_fd); /* for ksys_close() */
++EXPORT_SYMBOL(close_fd); /* for ksys_close() */
+
+ /**
+ * __close_range() - Close all file descriptors in a given range.
+@@ -861,68 +829,28 @@ void do_close_on_exec(struct files_struct *files)
+ spin_unlock(&files->file_lock);
+ }
+
+-static inline struct file *__fget_files_rcu(struct files_struct *files,
+- unsigned int fd, fmode_t mask, unsigned int refs)
+-{
+- for (;;) {
+- struct file *file;
+- struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+- struct file __rcu **fdentry;
+-
+- if (unlikely(fd >= fdt->max_fds))
+- return NULL;
+-
+- fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
+- file = rcu_dereference_raw(*fdentry);
+- if (unlikely(!file))
+- return NULL;
+-
+- if (unlikely(file->f_mode & mask))
+- return NULL;
+-
+- /*
+- * Ok, we have a file pointer. However, because we do
+- * this all locklessly under RCU, we may be racing with
+- * that file being closed.
+- *
+- * Such a race can take two forms:
+- *
+- * (a) the file ref already went down to zero,
+- * and get_file_rcu_many() fails. Just try
+- * again:
+- */
+- if (unlikely(!get_file_rcu_many(file, refs)))
+- continue;
+-
+- /*
+- * (b) the file table entry has changed under us.
+- * Note that we don't need to re-check the 'fdt->fd'
+- * pointer having changed, because it always goes
+- * hand-in-hand with 'fdt'.
+- *
+- * If so, we need to put our refs and try again.
+- */
+- if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
+- unlikely(rcu_dereference_raw(*fdentry) != file)) {
+- fput_many(file, refs);
+- continue;
+- }
+-
+- /*
+- * Ok, we have a ref to the file, and checked that it
+- * still exists.
+- */
+- return file;
+- }
+-}
+-
+ static struct file *__fget_files(struct files_struct *files, unsigned int fd,
+ fmode_t mask, unsigned int refs)
+ {
+ struct file *file;
+
+ rcu_read_lock();
+- file = __fget_files_rcu(files, fd, mask, refs);
++loop:
++ file = files_lookup_fd_rcu(files, fd);
++ if (file) {
++ /* File object ref couldn't be taken.
++ * dup2() atomicity guarantee is the reason
++ * we loop to catch the new file (or NULL pointer)
++ */
++ if (file->f_mode & mask)
++ file = NULL;
++ else if (!get_file_rcu_many(file, refs))
++ goto loop;
++ else if (files_lookup_fd_raw(files, fd) != file) {
++ fput_many(file, refs);
++ goto loop;
++ }
++ }
+ rcu_read_unlock();
+
+ return file;
+@@ -963,6 +891,42 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
+ return file;
+ }
+
++struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
++{
++ /* Must be called with rcu_read_lock held */
++ struct files_struct *files;
++ struct file *file = NULL;
++
++ task_lock(task);
++ files = task->files;
++ if (files)
++ file = files_lookup_fd_rcu(files, fd);
++ task_unlock(task);
++
++ return file;
++}
++
++struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
++{
++ /* Must be called with rcu_read_lock held */
++ struct files_struct *files;
++ unsigned int fd = *ret_fd;
++ struct file *file = NULL;
++
++ task_lock(task);
++ files = task->files;
++ if (files) {
++ for (; fd < files_fdtable(files)->max_fds; fd++) {
++ file = files_lookup_fd_rcu(files, fd);
++ if (file)
++ break;
++ }
++ }
++ task_unlock(task);
++ *ret_fd = fd;
++ return file;
++}
++
+ /*
+ * Lightweight file lookup - no refcnt increment if fd table isn't shared.
+ *
+@@ -985,7 +949,7 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
+ struct file *file;
+
+ if (atomic_read(&files->count) == 1) {
+- file = __fcheck_files(files, fd);
++ file = files_lookup_fd_raw(files, fd);
+ if (!file || unlikely(file->f_mode & mask))
+ return 0;
+ return (unsigned long)file;
+@@ -1121,7 +1085,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
+ struct files_struct *files = current->files;
+
+ if (!file)
+- return __close_fd(files, fd);
++ return close_fd(fd);
+
+ if (fd >= rlimit(RLIMIT_NOFILE))
+ return -EBADF;
+@@ -1210,7 +1174,7 @@ static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
+
+ spin_lock(&files->file_lock);
+ err = expand_files(files, newfd);
+- file = fcheck(oldfd);
++ file = files_lookup_fd_locked(files, oldfd);
+ if (unlikely(!file))
+ goto Ebadf;
+ if (unlikely(err < 0)) {
+@@ -1239,7 +1203,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
+ int retval = oldfd;
+
+ rcu_read_lock();
+- if (!fcheck_files(files, oldfd))
++ if (!files_lookup_fd_rcu(files, oldfd))
+ retval = -EBADF;
+ rcu_read_unlock();
+ return retval;
+@@ -1264,10 +1228,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes)
+
+ int f_dupfd(unsigned int from, struct file *file, unsigned flags)
+ {
++ unsigned long nofile = rlimit(RLIMIT_NOFILE);
+ int err;
+- if (from >= rlimit(RLIMIT_NOFILE))
++ if (from >= nofile)
+ return -EINVAL;
+- err = alloc_fd(from, flags);
++ err = alloc_fd(from, nofile, flags);
+ if (err >= 0) {
+ get_file(file);
+ fd_install(err, file);
+diff --git a/fs/init.c b/fs/init.c
+index e9c320a48cf15..02723bea84990 100644
+--- a/fs/init.c
++++ b/fs/init.c
+@@ -49,7 +49,7 @@ int __init init_chdir(const char *filename)
+ error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+ if (error)
+ return error;
+- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
++ error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
+ if (!error)
+ set_fs_pwd(current->fs, &path);
+ path_put(&path);
+@@ -64,7 +64,7 @@ int __init init_chroot(const char *filename)
+ error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+ if (error)
+ return error;
+- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
++ error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
+ if (error)
+ goto dput_and_out;
+ error = -EPERM;
+@@ -118,7 +118,7 @@ int __init init_eaccess(const char *filename)
+ error = kern_path(filename, LOOKUP_FOLLOW, &path);
+ if (error)
+ return error;
+- error = inode_permission(d_inode(path.dentry), MAY_ACCESS);
++ error = path_permission(&path, MAY_ACCESS);
+ path_put(&path);
+ return error;
+ }
+diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
+index 7df6324ccb8ab..8161667c976f8 100644
+--- a/fs/lockd/clnt4xdr.c
++++ b/fs/lockd/clnt4xdr.c
+@@ -261,7 +261,6 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
+ u32 exclusive;
+ int error;
+ __be32 *p;
+- s32 end;
+
+ memset(lock, 0, sizeof(*lock));
+ locks_init_lock(fl);
+@@ -285,13 +284,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
+ fl->fl_type = exclusive != 0 ? F_WRLCK : F_RDLCK;
+ p = xdr_decode_hyper(p, &l_offset);
+ xdr_decode_hyper(p, &l_len);
+- end = l_offset + l_len - 1;
+-
+- fl->fl_start = (loff_t)l_offset;
+- if (l_len == 0 || end < 0)
+- fl->fl_end = OFFSET_MAX;
+- else
+- fl->fl_end = (loff_t)end;
++ nlm4svc_set_file_lock_range(fl, l_offset, l_len);
+ error = 0;
+ out:
+ return error;
+diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
+index b11f2afa84f1f..99fffc9cb9585 100644
+--- a/fs/lockd/clntproc.c
++++ b/fs/lockd/clntproc.c
+@@ -794,9 +794,6 @@ static void nlmclnt_cancel_callback(struct rpc_task *task, void *data)
+ goto retry_cancel;
+ }
+
+- dprintk("lockd: cancel status %u (task %u)\n",
+- status, task->tk_pid);
+-
+ switch (status) {
+ case NLM_LCK_GRANTED:
+ case NLM_LCK_DENIED_GRACE_PERIOD:
+diff --git a/fs/lockd/host.c b/fs/lockd/host.c
+index 771c289f6df7f..cdc8e12cdac44 100644
+--- a/fs/lockd/host.c
++++ b/fs/lockd/host.c
+@@ -163,8 +163,8 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
+ host->h_nsmhandle = nsm;
+ host->h_addrbuf = nsm->sm_addrbuf;
+ host->net = ni->net;
+- host->h_cred = get_cred(ni->cred),
+- strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
++ host->h_cred = get_cred(ni->cred);
++ strscpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
+
+ out:
+ return host;
+diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
+index 1a639e34847dd..5579e67da17db 100644
+--- a/fs/lockd/svc.c
++++ b/fs/lockd/svc.c
+@@ -54,13 +54,9 @@ EXPORT_SYMBOL_GPL(nlmsvc_ops);
+
+ static DEFINE_MUTEX(nlmsvc_mutex);
+ static unsigned int nlmsvc_users;
+-static struct task_struct *nlmsvc_task;
+-static struct svc_rqst *nlmsvc_rqst;
++static struct svc_serv *nlmsvc_serv;
+ unsigned long nlmsvc_timeout;
+
+-static atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0);
+-static DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq);
+-
+ unsigned int lockd_net_id;
+
+ /*
+@@ -184,6 +180,10 @@ lockd(void *vrqstp)
+ nlm_shutdown_hosts();
+ cancel_delayed_work_sync(&ln->grace_period_end);
+ locks_end_grace(&ln->lockd_manager);
++
++ dprintk("lockd_down: service stopped\n");
++
++ svc_exit_thread(rqstp);
+ return 0;
+ }
+
+@@ -196,8 +196,8 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name,
+
+ xprt = svc_find_xprt(serv, name, net, family, 0);
+ if (xprt == NULL)
+- return svc_create_xprt(serv, name, net, family, port,
+- SVC_SOCK_DEFAULTS, cred);
++ return svc_xprt_create(serv, name, net, family, port,
++ SVC_SOCK_DEFAULTS, cred);
+ svc_xprt_put(xprt);
+ return 0;
+ }
+@@ -247,7 +247,8 @@ static int make_socks(struct svc_serv *serv, struct net *net,
+ if (warned++ == 0)
+ printk(KERN_WARNING
+ "lockd_up: makesock failed, error=%d\n", err);
+- svc_shutdown_net(serv, net);
++ svc_xprt_destroy_all(serv, net);
++ svc_rpcb_cleanup(serv, net);
+ return err;
+ }
+
+@@ -285,13 +286,12 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
+ nlm_shutdown_hosts_net(net);
+ cancel_delayed_work_sync(&ln->grace_period_end);
+ locks_end_grace(&ln->lockd_manager);
+- svc_shutdown_net(serv, net);
+- dprintk("%s: per-net data destroyed; net=%x\n",
+- __func__, net->ns.inum);
++ svc_xprt_destroy_all(serv, net);
++ svc_rpcb_cleanup(serv, net);
+ }
+ } else {
+- pr_err("%s: no users! task=%p, net=%x\n",
+- __func__, nlmsvc_task, net->ns.inum);
++ pr_err("%s: no users! net=%x\n",
++ __func__, net->ns.inum);
+ BUG();
+ }
+ }
+@@ -302,20 +302,16 @@ static int lockd_inetaddr_event(struct notifier_block *this,
+ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+ struct sockaddr_in sin;
+
+- if ((event != NETDEV_DOWN) ||
+- !atomic_inc_not_zero(&nlm_ntf_refcnt))
++ if (event != NETDEV_DOWN)
+ goto out;
+
+- if (nlmsvc_rqst) {
++ if (nlmsvc_serv) {
+ dprintk("lockd_inetaddr_event: removed %pI4\n",
+ &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+- svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+- (struct sockaddr *)&sin);
++ svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin);
+ }
+- atomic_dec(&nlm_ntf_refcnt);
+- wake_up(&nlm_ntf_wq);
+
+ out:
+ return NOTIFY_DONE;
+@@ -332,21 +328,17 @@ static int lockd_inet6addr_event(struct notifier_block *this,
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ struct sockaddr_in6 sin6;
+
+- if ((event != NETDEV_DOWN) ||
+- !atomic_inc_not_zero(&nlm_ntf_refcnt))
++ if (event != NETDEV_DOWN)
+ goto out;
+
+- if (nlmsvc_rqst) {
++ if (nlmsvc_serv) {
+ dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ifa->addr;
+ if (ipv6_addr_type(&sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)
+ sin6.sin6_scope_id = ifa->idev->dev->ifindex;
+- svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+- (struct sockaddr *)&sin6);
++ svc_age_temp_xprts_now(nlmsvc_serv, (struct sockaddr *)&sin6);
+ }
+- atomic_dec(&nlm_ntf_refcnt);
+- wake_up(&nlm_ntf_wq);
+
+ out:
+ return NOTIFY_DONE;
+@@ -357,86 +349,14 @@ static struct notifier_block lockd_inet6addr_notifier = {
+ };
+ #endif
+
+-static void lockd_unregister_notifiers(void)
+-{
+- unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
+-#if IS_ENABLED(CONFIG_IPV6)
+- unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
+-#endif
+- wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0);
+-}
+-
+-static void lockd_svc_exit_thread(void)
+-{
+- atomic_dec(&nlm_ntf_refcnt);
+- lockd_unregister_notifiers();
+- svc_exit_thread(nlmsvc_rqst);
+-}
+-
+-static int lockd_start_svc(struct svc_serv *serv)
++static int lockd_get(void)
+ {
++ struct svc_serv *serv;
+ int error;
+
+- if (nlmsvc_rqst)
++ if (nlmsvc_serv) {
++ nlmsvc_users++;
+ return 0;
+-
+- /*
+- * Create the kernel thread and wait for it to start.
+- */
+- nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+- if (IS_ERR(nlmsvc_rqst)) {
+- error = PTR_ERR(nlmsvc_rqst);
+- printk(KERN_WARNING
+- "lockd_up: svc_rqst allocation failed, error=%d\n",
+- error);
+- lockd_unregister_notifiers();
+- goto out_rqst;
+- }
+-
+- atomic_inc(&nlm_ntf_refcnt);
+- svc_sock_update_bufs(serv);
+- serv->sv_maxconn = nlm_max_connections;
+-
+- nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name);
+- if (IS_ERR(nlmsvc_task)) {
+- error = PTR_ERR(nlmsvc_task);
+- printk(KERN_WARNING
+- "lockd_up: kthread_run failed, error=%d\n", error);
+- goto out_task;
+- }
+- nlmsvc_rqst->rq_task = nlmsvc_task;
+- wake_up_process(nlmsvc_task);
+-
+- dprintk("lockd_up: service started\n");
+- return 0;
+-
+-out_task:
+- lockd_svc_exit_thread();
+- nlmsvc_task = NULL;
+-out_rqst:
+- nlmsvc_rqst = NULL;
+- return error;
+-}
+-
+-static const struct svc_serv_ops lockd_sv_ops = {
+- .svo_shutdown = svc_rpcb_cleanup,
+- .svo_enqueue_xprt = svc_xprt_do_enqueue,
+-};
+-
+-static struct svc_serv *lockd_create_svc(void)
+-{
+- struct svc_serv *serv;
+-
+- /*
+- * Check whether we're already up and running.
+- */
+- if (nlmsvc_rqst) {
+- /*
+- * Note: increase service usage, because later in case of error
+- * svc_destroy() will be called.
+- */
+- svc_get(nlmsvc_rqst->rq_server);
+- return nlmsvc_rqst->rq_server;
+ }
+
+ /*
+@@ -451,17 +371,44 @@ static struct svc_serv *lockd_create_svc(void)
+ nlm_timeout = LOCKD_DFLT_TIMEO;
+ nlmsvc_timeout = nlm_timeout * HZ;
+
+- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, &lockd_sv_ops);
++ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, lockd);
+ if (!serv) {
+ printk(KERN_WARNING "lockd_up: create service failed\n");
+- return ERR_PTR(-ENOMEM);
++ return -ENOMEM;
+ }
++
++ serv->sv_maxconn = nlm_max_connections;
++ error = svc_set_num_threads(serv, NULL, 1);
++ /* The thread now holds the only reference */
++ svc_put(serv);
++ if (error < 0)
++ return error;
++
++ nlmsvc_serv = serv;
+ register_inetaddr_notifier(&lockd_inetaddr_notifier);
+ #if IS_ENABLED(CONFIG_IPV6)
+ register_inet6addr_notifier(&lockd_inet6addr_notifier);
+ #endif
+ dprintk("lockd_up: service created\n");
+- return serv;
++ nlmsvc_users++;
++ return 0;
++}
++
++static void lockd_put(void)
++{
++ if (WARN(nlmsvc_users <= 0, "lockd_down: no users!\n"))
++ return;
++ if (--nlmsvc_users)
++ return;
++
++ unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
++#if IS_ENABLED(CONFIG_IPV6)
++ unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
++#endif
++
++ svc_set_num_threads(nlmsvc_serv, NULL, 0);
++ nlmsvc_serv = NULL;
++ dprintk("lockd_down: service destroyed\n");
+ }
+
+ /*
+@@ -469,36 +416,21 @@ static struct svc_serv *lockd_create_svc(void)
+ */
+ int lockd_up(struct net *net, const struct cred *cred)
+ {
+- struct svc_serv *serv;
+ int error;
+
+ mutex_lock(&nlmsvc_mutex);
+
+- serv = lockd_create_svc();
+- if (IS_ERR(serv)) {
+- error = PTR_ERR(serv);
+- goto err_create;
+- }
++ error = lockd_get();
++ if (error)
++ goto err;
+
+- error = lockd_up_net(serv, net, cred);
++ error = lockd_up_net(nlmsvc_serv, net, cred);
+ if (error < 0) {
+- lockd_unregister_notifiers();
+- goto err_put;
++ lockd_put();
++ goto err;
+ }
+
+- error = lockd_start_svc(serv);
+- if (error < 0) {
+- lockd_down_net(serv, net);
+- goto err_put;
+- }
+- nlmsvc_users++;
+- /*
+- * Note: svc_serv structures have an initial use count of 1,
+- * so we exit through here on both success and failure.
+- */
+-err_put:
+- svc_destroy(serv);
+-err_create:
++err:
+ mutex_unlock(&nlmsvc_mutex);
+ return error;
+ }
+@@ -511,27 +443,8 @@ void
+ lockd_down(struct net *net)
+ {
+ mutex_lock(&nlmsvc_mutex);
+- lockd_down_net(nlmsvc_rqst->rq_server, net);
+- if (nlmsvc_users) {
+- if (--nlmsvc_users)
+- goto out;
+- } else {
+- printk(KERN_ERR "lockd_down: no users! task=%p\n",
+- nlmsvc_task);
+- BUG();
+- }
+-
+- if (!nlmsvc_task) {
+- printk(KERN_ERR "lockd_down: no lockd running.\n");
+- BUG();
+- }
+- kthread_stop(nlmsvc_task);
+- dprintk("lockd_down: service stopped\n");
+- lockd_svc_exit_thread();
+- dprintk("lockd_down: service destroyed\n");
+- nlmsvc_task = NULL;
+- nlmsvc_rqst = NULL;
+-out:
++ lockd_down_net(nlmsvc_serv, net);
++ lockd_put();
+ mutex_unlock(&nlmsvc_mutex);
+ }
+ EXPORT_SYMBOL_GPL(lockd_down);
+@@ -584,7 +497,7 @@ static struct ctl_table nlm_sysctls[] = {
+ .data = &nsm_use_hostnames,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+- .proc_handler = proc_dointvec,
++ .proc_handler = proc_dobool,
+ },
+ {
+ .procname = "nsm_local_state",
+@@ -649,6 +562,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp)
+ switch (rqstp->rq_authop->flavour) {
+ case RPC_AUTH_NULL:
+ case RPC_AUTH_UNIX:
++ rqstp->rq_auth_stat = rpc_auth_ok;
+ if (rqstp->rq_proc == 0)
+ return SVC_OK;
+ if (is_callback(rqstp->rq_proc)) {
+@@ -659,6 +573,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp)
+ }
+ return svc_set_client(rqstp);
+ }
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+
+@@ -766,6 +681,44 @@ static void __exit exit_nlm(void)
+ module_init(init_nlm);
+ module_exit(exit_nlm);
+
++/**
++ * nlmsvc_dispatch - Process an NLM Request
++ * @rqstp: incoming request
++ * @statp: pointer to location of accept_stat field in RPC Reply buffer
++ *
++ * Return values:
++ * %0: Processing complete; do not send a Reply
++ * %1: Processing complete; send Reply in rqstp->rq_res
++ */
++static int nlmsvc_dispatch(struct svc_rqst *rqstp, __be32 *statp)
++{
++ const struct svc_procedure *procp = rqstp->rq_procinfo;
++
++ svcxdr_init_decode(rqstp);
++ if (!procp->pc_decode(rqstp, &rqstp->rq_arg_stream))
++ goto out_decode_err;
++
++ *statp = procp->pc_func(rqstp);
++ if (*statp == rpc_drop_reply)
++ return 0;
++ if (*statp != rpc_success)
++ return 1;
++
++ svcxdr_init_encode(rqstp);
++ if (!procp->pc_encode(rqstp, &rqstp->rq_res_stream))
++ goto out_encode_err;
++
++ return 1;
++
++out_decode_err:
++ *statp = rpc_garbage_args;
++ return 1;
++
++out_encode_err:
++ *statp = rpc_system_err;
++ return 1;
++}
++
+ /*
+ * Define NLM program and procedures
+ */
+@@ -775,6 +728,7 @@ static const struct svc_version nlmsvc_version1 = {
+ .vs_nproc = 17,
+ .vs_proc = nlmsvc_procedures,
+ .vs_count = nlmsvc_version1_count,
++ .vs_dispatch = nlmsvc_dispatch,
+ .vs_xdrsize = NLMSVC_XDRSIZE,
+ };
+ static unsigned int nlmsvc_version3_count[24];
+@@ -783,6 +737,7 @@ static const struct svc_version nlmsvc_version3 = {
+ .vs_nproc = 24,
+ .vs_proc = nlmsvc_procedures,
+ .vs_count = nlmsvc_version3_count,
++ .vs_dispatch = nlmsvc_dispatch,
+ .vs_xdrsize = NLMSVC_XDRSIZE,
+ };
+ #ifdef CONFIG_LOCKD_V4
+@@ -792,6 +747,7 @@ static const struct svc_version nlmsvc_version4 = {
+ .vs_nproc = 24,
+ .vs_proc = nlmsvc_procedures4,
+ .vs_count = nlmsvc_version4_count,
++ .vs_dispatch = nlmsvc_dispatch,
+ .vs_xdrsize = NLMSVC_XDRSIZE,
+ };
+ #endif
+diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
+index fa41dda399259..b72023a6b4c16 100644
+--- a/fs/lockd/svc4proc.c
++++ b/fs/lockd/svc4proc.c
+@@ -32,6 +32,10 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
+ if (!nlmsvc_ops)
+ return nlm_lck_denied_nolocks;
+
++ if (lock->lock_start > OFFSET_MAX ||
++ (lock->lock_len && ((lock->lock_len - 1) > (OFFSET_MAX - lock->lock_start))))
++ return nlm4_fbig;
++
+ /* Obtain host handle */
+ if (!(host = nlmsvc_lookup_host(rqstp, lock->caller, lock->len))
+ || (argp->monitor && nsm_monitor(host) < 0))
+@@ -40,13 +44,21 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
+
+ /* Obtain file pointer. Not used by FREE_ALL call. */
+ if (filp != NULL) {
+- if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0)
++ int mode = lock_to_openmode(&lock->fl);
++
++ error = nlm_lookup_file(rqstp, &file, lock);
++ if (error)
+ goto no_locks;
+ *filp = file;
+
+ /* Set up the missing parts of the file_lock structure */
+- lock->fl.fl_file = file->f_file;
++ lock->fl.fl_flags = FL_POSIX;
++ lock->fl.fl_file = file->f_file[mode];
+ lock->fl.fl_pid = current->tgid;
++ lock->fl.fl_start = (loff_t)lock->lock_start;
++ lock->fl.fl_end = lock->lock_len ?
++ (loff_t)(lock->lock_start + lock->lock_len - 1) :
++ OFFSET_MAX;
+ lock->fl.fl_lmops = &nlmsvc_lock_operations;
+ nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
+ if (!lock->fl.fl_owner) {
+@@ -84,6 +96,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
+ struct nlm_args *argp = rqstp->rq_argp;
+ struct nlm_host *host;
+ struct nlm_file *file;
++ struct nlm_lockowner *test_owner;
+ __be32 rc = rpc_success;
+
+ dprintk("lockd: TEST4 called\n");
+@@ -93,6 +106,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
+ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
++ test_owner = argp->lock.fl.fl_owner;
+ /* Now check for conflicting locks */
+ resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
+ if (resp->status == nlm_drop_reply)
+@@ -100,7 +114,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
+ else
+ dprintk("lockd: TEST4 status %d\n", ntohl(resp->status));
+
+- nlmsvc_release_lockowner(&argp->lock);
++ nlmsvc_put_lockowner(test_owner);
+ nlmsvc_release_host(host);
+ nlm_release_file(file);
+ return rc;
+@@ -266,8 +280,6 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp)
+ */
+ static void nlm4svc_callback_exit(struct rpc_task *task, void *data)
+ {
+- dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
+- -task->tk_status);
+ }
+
+ static void nlm4svc_callback_release(void *data)
+@@ -510,191 +522,239 @@ const struct svc_procedure nlmsvc_procedures4[24] = {
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "NULL",
+ },
+ [NLMPROC_TEST] = {
+ .pc_func = nlm4svc_proc_test,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_testres,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+2+No+Rg,
++ .pc_name = "TEST",
+ },
+ [NLMPROC_LOCK] = {
+ .pc_func = nlm4svc_proc_lock,
+ .pc_decode = nlm4svc_decode_lockargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "LOCK",
+ },
+ [NLMPROC_CANCEL] = {
+ .pc_func = nlm4svc_proc_cancel,
+ .pc_decode = nlm4svc_decode_cancargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "CANCEL",
+ },
+ [NLMPROC_UNLOCK] = {
+ .pc_func = nlm4svc_proc_unlock,
+ .pc_decode = nlm4svc_decode_unlockargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "UNLOCK",
+ },
+ [NLMPROC_GRANTED] = {
+ .pc_func = nlm4svc_proc_granted,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "GRANTED",
+ },
+ [NLMPROC_TEST_MSG] = {
+ .pc_func = nlm4svc_proc_test_msg,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "TEST_MSG",
+ },
+ [NLMPROC_LOCK_MSG] = {
+ .pc_func = nlm4svc_proc_lock_msg,
+ .pc_decode = nlm4svc_decode_lockargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "LOCK_MSG",
+ },
+ [NLMPROC_CANCEL_MSG] = {
+ .pc_func = nlm4svc_proc_cancel_msg,
+ .pc_decode = nlm4svc_decode_cancargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "CANCEL_MSG",
+ },
+ [NLMPROC_UNLOCK_MSG] = {
+ .pc_func = nlm4svc_proc_unlock_msg,
+ .pc_decode = nlm4svc_decode_unlockargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "UNLOCK_MSG",
+ },
+ [NLMPROC_GRANTED_MSG] = {
+ .pc_func = nlm4svc_proc_granted_msg,
+ .pc_decode = nlm4svc_decode_testargs,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "GRANTED_MSG",
+ },
+ [NLMPROC_TEST_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "TEST_RES",
+ },
+ [NLMPROC_LOCK_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "LOCK_RES",
+ },
+ [NLMPROC_CANCEL_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "CANCEL_RES",
+ },
+ [NLMPROC_UNLOCK_RES] = {
+ .pc_func = nlm4svc_proc_null,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "UNLOCK_RES",
+ },
+ [NLMPROC_GRANTED_RES] = {
+ .pc_func = nlm4svc_proc_granted_res,
+ .pc_decode = nlm4svc_decode_res,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "GRANTED_RES",
+ },
+ [NLMPROC_NSM_NOTIFY] = {
+ .pc_func = nlm4svc_proc_sm_notify,
+ .pc_decode = nlm4svc_decode_reboot,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_reboot),
++ .pc_argzero = sizeof(struct nlm_reboot),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "SM_NOTIFY",
+ },
+ [17] = {
+ .pc_func = nlm4svc_proc_unused,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
++ .pc_name = "UNUSED",
+ },
+ [18] = {
+ .pc_func = nlm4svc_proc_unused,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
++ .pc_name = "UNUSED",
+ },
+ [19] = {
+ .pc_func = nlm4svc_proc_unused,
+ .pc_decode = nlm4svc_decode_void,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
++ .pc_name = "UNUSED",
+ },
+ [NLMPROC_SHARE] = {
+ .pc_func = nlm4svc_proc_share,
+ .pc_decode = nlm4svc_decode_shareargs,
+ .pc_encode = nlm4svc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
++ .pc_name = "SHARE",
+ },
+ [NLMPROC_UNSHARE] = {
+ .pc_func = nlm4svc_proc_unshare,
+ .pc_decode = nlm4svc_decode_shareargs,
+ .pc_encode = nlm4svc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
++ .pc_name = "UNSHARE",
+ },
+ [NLMPROC_NM_LOCK] = {
+ .pc_func = nlm4svc_proc_nm_lock,
+ .pc_decode = nlm4svc_decode_lockargs,
+ .pc_encode = nlm4svc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "NM_LOCK",
+ },
+ [NLMPROC_FREE_ALL] = {
+ .pc_func = nlm4svc_proc_free_all,
+ .pc_decode = nlm4svc_decode_notify,
+ .pc_encode = nlm4svc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "FREE_ALL",
+ },
+ };
+diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
+index 273a81971ed57..4e30f3c509701 100644
+--- a/fs/lockd/svclock.c
++++ b/fs/lockd/svclock.c
+@@ -31,6 +31,7 @@
+ #include <linux/lockd/nlm.h>
+ #include <linux/lockd/lockd.h>
+ #include <linux/kthread.h>
++#include <linux/exportfs.h>
+
+ #define NLMDBG_FACILITY NLMDBG_SVCLOCK
+
+@@ -339,7 +340,7 @@ nlmsvc_get_lockowner(struct nlm_lockowner *lockowner)
+ return lockowner;
+ }
+
+-static void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
++void nlmsvc_put_lockowner(struct nlm_lockowner *lockowner)
+ {
+ if (!refcount_dec_and_lock(&lockowner->count, &lockowner->host->h_lock))
+ return;
+@@ -469,18 +470,27 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
+ struct nlm_host *host, struct nlm_lock *lock, int wait,
+ struct nlm_cookie *cookie, int reclaim)
+ {
++#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
++ struct inode *inode = nlmsvc_file_inode(file);
++#endif
+ struct nlm_block *block = NULL;
+ int error;
++ int mode;
++ int async_block = 0;
+ __be32 ret;
+
+ dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
+- locks_inode(file->f_file)->i_sb->s_id,
+- locks_inode(file->f_file)->i_ino,
++ inode->i_sb->s_id, inode->i_ino,
+ lock->fl.fl_type, lock->fl.fl_pid,
+ (long long)lock->fl.fl_start,
+ (long long)lock->fl.fl_end,
+ wait);
+
++ if (nlmsvc_file_file(file)->f_op->lock) {
++ async_block = wait;
++ wait = 0;
++ }
++
+ /* Lock file against concurrent access */
+ mutex_lock(&file->f_mutex);
+ /* Get existing block (in case client is busy-waiting)
+@@ -524,7 +534,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
+
+ if (!wait)
+ lock->fl.fl_flags &= ~FL_SLEEP;
+- error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
++ mode = lock_to_openmode(&lock->fl);
++ error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
+ lock->fl.fl_flags &= ~FL_SLEEP;
+
+ dprintk("lockd: vfs_lock_file returned %d\n", error);
+@@ -540,7 +551,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
+ */
+ if (wait)
+ break;
+- ret = nlm_lck_denied;
++ ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
+ goto out;
+ case FILE_LOCK_DEFERRED:
+ if (wait)
+@@ -577,12 +588,12 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+ struct nlm_lock *conflock, struct nlm_cookie *cookie)
+ {
+ int error;
++ int mode;
+ __be32 ret;
+- struct nlm_lockowner *test_owner;
+
+ dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
+- locks_inode(file->f_file)->i_sb->s_id,
+- locks_inode(file->f_file)->i_ino,
++ nlmsvc_file_inode(file)->i_sb->s_id,
++ nlmsvc_file_inode(file)->i_ino,
+ lock->fl.fl_type,
+ (long long)lock->fl.fl_start,
+ (long long)lock->fl.fl_end);
+@@ -592,10 +603,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+ goto out;
+ }
+
+- /* If there's a conflicting lock, remember to clean up the test lock */
+- test_owner = (struct nlm_lockowner *)lock->fl.fl_owner;
+-
+- error = vfs_test_lock(file->f_file, &lock->fl);
++ mode = lock_to_openmode(&lock->fl);
++ error = vfs_test_lock(file->f_file[mode], &lock->fl);
+ if (error) {
+ /* We can't currently deal with deferred test requests */
+ if (error == FILE_LOCK_DEFERRED)
+@@ -622,10 +631,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+ conflock->fl.fl_end = lock->fl.fl_end;
+ locks_release_private(&lock->fl);
+
+- /* Clean up the test lock */
+- lock->fl.fl_owner = NULL;
+- nlmsvc_put_lockowner(test_owner);
+-
+ ret = nlm_lck_denied;
+ out:
+ return ret;
+@@ -641,11 +646,11 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
+ __be32
+ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
+ {
+- int error;
++ int error = 0;
+
+ dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
+- locks_inode(file->f_file)->i_sb->s_id,
+- locks_inode(file->f_file)->i_ino,
++ nlmsvc_file_inode(file)->i_sb->s_id,
++ nlmsvc_file_inode(file)->i_ino,
+ lock->fl.fl_pid,
+ (long long)lock->fl.fl_start,
+ (long long)lock->fl.fl_end);
+@@ -654,7 +659,14 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
+ nlmsvc_cancel_blocked(net, file, lock);
+
+ lock->fl.fl_type = F_UNLCK;
+- error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
++ lock->fl.fl_file = file->f_file[O_RDONLY];
++ if (lock->fl.fl_file)
++ error = vfs_lock_file(lock->fl.fl_file, F_SETLK,
++ &lock->fl, NULL);
++ lock->fl.fl_file = file->f_file[O_WRONLY];
++ if (lock->fl.fl_file)
++ error |= vfs_lock_file(lock->fl.fl_file, F_SETLK,
++ &lock->fl, NULL);
+
+ return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
+ }
+@@ -671,10 +683,11 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
+ {
+ struct nlm_block *block;
+ int status = 0;
++ int mode;
+
+ dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
+- locks_inode(file->f_file)->i_sb->s_id,
+- locks_inode(file->f_file)->i_ino,
++ nlmsvc_file_inode(file)->i_sb->s_id,
++ nlmsvc_file_inode(file)->i_ino,
+ lock->fl.fl_pid,
+ (long long)lock->fl.fl_start,
+ (long long)lock->fl.fl_end);
+@@ -686,8 +699,10 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
+ block = nlmsvc_lookup_block(file, lock);
+ mutex_unlock(&file->f_mutex);
+ if (block != NULL) {
+- vfs_cancel_lock(block->b_file->f_file,
+- &block->b_call->a_args.lock.fl);
++ struct file_lock *fl = &block->b_call->a_args.lock.fl;
++
++ mode = lock_to_openmode(fl);
++ vfs_cancel_lock(block->b_file->f_file[mode], fl);
+ status = nlmsvc_unlink_block(block);
+ nlmsvc_release_block(block);
+ }
+@@ -803,6 +818,7 @@ nlmsvc_grant_blocked(struct nlm_block *block)
+ {
+ struct nlm_file *file = block->b_file;
+ struct nlm_lock *lock = &block->b_call->a_args.lock;
++ int mode;
+ int error;
+ loff_t fl_start, fl_end;
+
+@@ -828,7 +844,8 @@ nlmsvc_grant_blocked(struct nlm_block *block)
+ lock->fl.fl_flags |= FL_SLEEP;
+ fl_start = lock->fl.fl_start;
+ fl_end = lock->fl.fl_end;
+- error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
++ mode = lock_to_openmode(&lock->fl);
++ error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
+ lock->fl.fl_flags &= ~FL_SLEEP;
+ lock->fl.fl_start = fl_start;
+ lock->fl.fl_end = fl_end;
+diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
+index 50855f2c1f4b8..32784f508c810 100644
+--- a/fs/lockd/svcproc.c
++++ b/fs/lockd/svcproc.c
+@@ -55,6 +55,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
+ struct nlm_host *host = NULL;
+ struct nlm_file *file = NULL;
+ struct nlm_lock *lock = &argp->lock;
++ int mode;
+ __be32 error = 0;
+
+ /* nfsd callbacks must have been installed for this procedure */
+@@ -69,13 +70,15 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
+
+ /* Obtain file pointer. Not used by FREE_ALL call. */
+ if (filp != NULL) {
+- error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh));
++ error = cast_status(nlm_lookup_file(rqstp, &file, lock));
+ if (error != 0)
+ goto no_locks;
+ *filp = file;
+
+ /* Set up the missing parts of the file_lock structure */
+- lock->fl.fl_file = file->f_file;
++ mode = lock_to_openmode(&lock->fl);
++ lock->fl.fl_flags = FL_POSIX;
++ lock->fl.fl_file = file->f_file[mode];
+ lock->fl.fl_pid = current->tgid;
+ lock->fl.fl_lmops = &nlmsvc_lock_operations;
+ nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
+@@ -114,6 +117,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
+ struct nlm_args *argp = rqstp->rq_argp;
+ struct nlm_host *host;
+ struct nlm_file *file;
++ struct nlm_lockowner *test_owner;
+ __be32 rc = rpc_success;
+
+ dprintk("lockd: TEST called\n");
+@@ -123,6 +127,8 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
+ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
+ return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
+
++ test_owner = argp->lock.fl.fl_owner;
++
+ /* Now check for conflicting locks */
+ resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
+ if (resp->status == nlm_drop_reply)
+@@ -131,7 +137,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
+ dprintk("lockd: TEST status %d vers %d\n",
+ ntohl(resp->status), rqstp->rq_vers);
+
+- nlmsvc_release_lockowner(&argp->lock);
++ nlmsvc_put_lockowner(test_owner);
+ nlmsvc_release_host(host);
+ nlm_release_file(file);
+ return rc;
+@@ -299,8 +305,6 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp)
+ */
+ static void nlmsvc_callback_exit(struct rpc_task *task, void *data)
+ {
+- dprintk("lockd: %5u callback returned %d\n", task->tk_pid,
+- -task->tk_status);
+ }
+
+ void nlmsvc_release_call(struct nlm_rqst *call)
+@@ -552,191 +556,239 @@ const struct svc_procedure nlmsvc_procedures[24] = {
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "NULL",
+ },
+ [NLMPROC_TEST] = {
+ .pc_func = nlmsvc_proc_test,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_testres,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+2+No+Rg,
++ .pc_name = "TEST",
+ },
+ [NLMPROC_LOCK] = {
+ .pc_func = nlmsvc_proc_lock,
+ .pc_decode = nlmsvc_decode_lockargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "LOCK",
+ },
+ [NLMPROC_CANCEL] = {
+ .pc_func = nlmsvc_proc_cancel,
+ .pc_decode = nlmsvc_decode_cancargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "CANCEL",
+ },
+ [NLMPROC_UNLOCK] = {
+ .pc_func = nlmsvc_proc_unlock,
+ .pc_decode = nlmsvc_decode_unlockargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "UNLOCK",
+ },
+ [NLMPROC_GRANTED] = {
+ .pc_func = nlmsvc_proc_granted,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "GRANTED",
+ },
+ [NLMPROC_TEST_MSG] = {
+ .pc_func = nlmsvc_proc_test_msg,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "TEST_MSG",
+ },
+ [NLMPROC_LOCK_MSG] = {
+ .pc_func = nlmsvc_proc_lock_msg,
+ .pc_decode = nlmsvc_decode_lockargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "LOCK_MSG",
+ },
+ [NLMPROC_CANCEL_MSG] = {
+ .pc_func = nlmsvc_proc_cancel_msg,
+ .pc_decode = nlmsvc_decode_cancargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "CANCEL_MSG",
+ },
+ [NLMPROC_UNLOCK_MSG] = {
+ .pc_func = nlmsvc_proc_unlock_msg,
+ .pc_decode = nlmsvc_decode_unlockargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "UNLOCK_MSG",
+ },
+ [NLMPROC_GRANTED_MSG] = {
+ .pc_func = nlmsvc_proc_granted_msg,
+ .pc_decode = nlmsvc_decode_testargs,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "GRANTED_MSG",
+ },
+ [NLMPROC_TEST_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "TEST_RES",
+ },
+ [NLMPROC_LOCK_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "LOCK_RES",
+ },
+ [NLMPROC_CANCEL_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "CANCEL_RES",
+ },
+ [NLMPROC_UNLOCK_RES] = {
+ .pc_func = nlmsvc_proc_null,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "UNLOCK_RES",
+ },
+ [NLMPROC_GRANTED_RES] = {
+ .pc_func = nlmsvc_proc_granted_res,
+ .pc_decode = nlmsvc_decode_res,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_res),
++ .pc_argzero = sizeof(struct nlm_res),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "GRANTED_RES",
+ },
+ [NLMPROC_NSM_NOTIFY] = {
+ .pc_func = nlmsvc_proc_sm_notify,
+ .pc_decode = nlmsvc_decode_reboot,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_reboot),
++ .pc_argzero = sizeof(struct nlm_reboot),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "SM_NOTIFY",
+ },
+ [17] = {
+ .pc_func = nlmsvc_proc_unused,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "UNUSED",
+ },
+ [18] = {
+ .pc_func = nlmsvc_proc_unused,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "UNUSED",
+ },
+ [19] = {
+ .pc_func = nlmsvc_proc_unused,
+ .pc_decode = nlmsvc_decode_void,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_void),
++ .pc_argzero = sizeof(struct nlm_void),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = St,
++ .pc_name = "UNUSED",
+ },
+ [NLMPROC_SHARE] = {
+ .pc_func = nlmsvc_proc_share,
+ .pc_decode = nlmsvc_decode_shareargs,
+ .pc_encode = nlmsvc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
++ .pc_name = "SHARE",
+ },
+ [NLMPROC_UNSHARE] = {
+ .pc_func = nlmsvc_proc_unshare,
+ .pc_decode = nlmsvc_decode_shareargs,
+ .pc_encode = nlmsvc_encode_shareres,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St+1,
++ .pc_name = "UNSHARE",
+ },
+ [NLMPROC_NM_LOCK] = {
+ .pc_func = nlmsvc_proc_nm_lock,
+ .pc_decode = nlmsvc_decode_lockargs,
+ .pc_encode = nlmsvc_encode_res,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_res),
+ .pc_xdrressize = Ck+St,
++ .pc_name = "NM_LOCK",
+ },
+ [NLMPROC_FREE_ALL] = {
+ .pc_func = nlmsvc_proc_free_all,
+ .pc_decode = nlmsvc_decode_notify,
+ .pc_encode = nlmsvc_encode_void,
+ .pc_argsize = sizeof(struct nlm_args),
++ .pc_argzero = sizeof(struct nlm_args),
+ .pc_ressize = sizeof(struct nlm_void),
+ .pc_xdrressize = 0,
++ .pc_name = "FREE_ALL",
+ },
+ };
+diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
+index 028fc152da22f..e3b6229e7ae5c 100644
+--- a/fs/lockd/svcsubs.c
++++ b/fs/lockd/svcsubs.c
+@@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f)
+
+ static inline void nlm_debug_print_file(char *msg, struct nlm_file *file)
+ {
+- struct inode *inode = locks_inode(file->f_file);
++ struct inode *inode = nlmsvc_file_inode(file);
+
+ dprintk("lockd: %s %s/%ld\n",
+ msg, inode->i_sb->s_id, inode->i_ino);
+@@ -71,56 +71,75 @@ static inline unsigned int file_hash(struct nfs_fh *f)
+ return tmp & (FILE_NRHASH - 1);
+ }
+
++int lock_to_openmode(struct file_lock *lock)
++{
++ return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY;
++}
++
++/*
++ * Open the file. Note that if we're reexporting, for example,
++ * this could block the lockd thread for a while.
++ *
++ * We have to make sure we have the right credential to open
++ * the file.
++ */
++static __be32 nlm_do_fopen(struct svc_rqst *rqstp,
++ struct nlm_file *file, int mode)
++{
++ struct file **fp = &file->f_file[mode];
++ __be32 nfserr;
++
++ if (*fp)
++ return 0;
++ nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode);
++ if (nfserr)
++ dprintk("lockd: open failed (error %d)\n", nfserr);
++ return nfserr;
++}
++
+ /*
+ * Lookup file info. If it doesn't exist, create a file info struct
+ * and open a (VFS) file for the given inode.
+- *
+- * FIXME:
+- * Note that we open the file O_RDONLY even when creating write locks.
+- * This is not quite right, but for now, we assume the client performs
+- * the proper R/W checking.
+ */
+ __be32
+ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
+- struct nfs_fh *f)
++ struct nlm_lock *lock)
+ {
+ struct nlm_file *file;
+ unsigned int hash;
+ __be32 nfserr;
++ int mode;
+
+- nlm_debug_print_fh("nlm_lookup_file", f);
++ nlm_debug_print_fh("nlm_lookup_file", &lock->fh);
+
+- hash = file_hash(f);
++ hash = file_hash(&lock->fh);
++ mode = lock_to_openmode(&lock->fl);
+
+ /* Lock file table */
+ mutex_lock(&nlm_file_mutex);
+
+ hlist_for_each_entry(file, &nlm_files[hash], f_list)
+- if (!nfs_compare_fh(&file->f_handle, f))
++ if (!nfs_compare_fh(&file->f_handle, &lock->fh)) {
++ mutex_lock(&file->f_mutex);
++ nfserr = nlm_do_fopen(rqstp, file, mode);
++ mutex_unlock(&file->f_mutex);
+ goto found;
+-
+- nlm_debug_print_fh("creating file for", f);
++ }
++ nlm_debug_print_fh("creating file for", &lock->fh);
+
+ nfserr = nlm_lck_denied_nolocks;
+ file = kzalloc(sizeof(*file), GFP_KERNEL);
+ if (!file)
+- goto out_unlock;
++ goto out_free;
+
+- memcpy(&file->f_handle, f, sizeof(struct nfs_fh));
++ memcpy(&file->f_handle, &lock->fh, sizeof(struct nfs_fh));
+ mutex_init(&file->f_mutex);
+ INIT_HLIST_NODE(&file->f_list);
+ INIT_LIST_HEAD(&file->f_blocks);
+
+- /* Open the file. Note that this must not sleep for too long, else
+- * we would lock up lockd:-) So no NFS re-exports, folks.
+- *
+- * We have to make sure we have the right credential to open
+- * the file.
+- */
+- if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) {
+- dprintk("lockd: open failed (error %d)\n", nfserr);
+- goto out_free;
+- }
++ nfserr = nlm_do_fopen(rqstp, file, mode);
++ if (nfserr)
++ goto out_unlock;
+
+ hlist_add_head(&file->f_list, &nlm_files[hash]);
+
+@@ -128,7 +147,6 @@ nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result,
+ dprintk("lockd: found file %p (count %d)\n", file, file->f_count);
+ *result = file;
+ file->f_count++;
+- nfserr = 0;
+
+ out_unlock:
+ mutex_unlock(&nlm_file_mutex);
+@@ -148,13 +166,40 @@ nlm_delete_file(struct nlm_file *file)
+ nlm_debug_print_file("closing file", file);
+ if (!hlist_unhashed(&file->f_list)) {
+ hlist_del(&file->f_list);
+- nlmsvc_ops->fclose(file->f_file);
++ if (file->f_file[O_RDONLY])
++ nlmsvc_ops->fclose(file->f_file[O_RDONLY]);
++ if (file->f_file[O_WRONLY])
++ nlmsvc_ops->fclose(file->f_file[O_WRONLY]);
+ kfree(file);
+ } else {
+ printk(KERN_WARNING "lockd: attempt to release unknown file!\n");
+ }
+ }
+
++static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl)
++{
++ struct file_lock lock;
++
++ locks_init_lock(&lock);
++ lock.fl_type = F_UNLCK;
++ lock.fl_start = 0;
++ lock.fl_end = OFFSET_MAX;
++ lock.fl_owner = fl->fl_owner;
++ lock.fl_pid = fl->fl_pid;
++ lock.fl_flags = FL_POSIX;
++
++ lock.fl_file = file->f_file[O_RDONLY];
++ if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
++ goto out_err;
++ lock.fl_file = file->f_file[O_WRONLY];
++ if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
++ goto out_err;
++ return 0;
++out_err:
++ pr_warn("lockd: unlock failure in %s:%d\n", __FILE__, __LINE__);
++ return 1;
++}
++
+ /*
+ * Loop over all locks on the given file and perform the specified
+ * action.
+@@ -165,7 +210,7 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
+ {
+ struct inode *inode = nlmsvc_file_inode(file);
+ struct file_lock *fl;
+- struct file_lock_context *flctx = inode->i_flctx;
++ struct file_lock_context *flctx = locks_inode_context(inode);
+ struct nlm_host *lockhost;
+
+ if (!flctx || list_empty_careful(&flctx->flc_posix))
+@@ -182,17 +227,10 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
+
+ lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host;
+ if (match(lockhost, host)) {
+- struct file_lock lock = *fl;
+
+ spin_unlock(&flctx->flc_lock);
+- lock.fl_type = F_UNLCK;
+- lock.fl_start = 0;
+- lock.fl_end = OFFSET_MAX;
+- if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) {
+- printk("lockd: unlock failure in %s:%d\n",
+- __FILE__, __LINE__);
++ if (nlm_unlock_files(file, fl))
+ return 1;
+- }
+ goto again;
+ }
+ }
+@@ -227,7 +265,7 @@ nlm_file_inuse(struct nlm_file *file)
+ {
+ struct inode *inode = nlmsvc_file_inode(file);
+ struct file_lock *fl;
+- struct file_lock_context *flctx = inode->i_flctx;
++ struct file_lock_context *flctx = locks_inode_context(inode);
+
+ if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
+ return 1;
+@@ -246,6 +284,14 @@ nlm_file_inuse(struct nlm_file *file)
+ return 0;
+ }
+
++static void nlm_close_files(struct nlm_file *file)
++{
++ if (file->f_file[O_RDONLY])
++ nlmsvc_ops->fclose(file->f_file[O_RDONLY]);
++ if (file->f_file[O_WRONLY])
++ nlmsvc_ops->fclose(file->f_file[O_WRONLY]);
++}
++
+ /*
+ * Loop over all files in the file table.
+ */
+@@ -276,7 +322,7 @@ nlm_traverse_files(void *data, nlm_host_match_fn_t match,
+ if (list_empty(&file->f_blocks) && !file->f_locks
+ && !file->f_shares && !file->f_count) {
+ hlist_del(&file->f_list);
+- nlmsvc_ops->fclose(file->f_file);
++ nlm_close_files(file);
+ kfree(file);
+ }
+ }
+@@ -410,12 +456,13 @@ nlmsvc_invalidate_all(void)
+ nlm_traverse_files(NULL, nlmsvc_is_client, NULL);
+ }
+
++
+ static int
+ nlmsvc_match_sb(void *datap, struct nlm_file *file)
+ {
+ struct super_block *sb = datap;
+
+- return sb == locks_inode(file->f_file)->i_sb;
++ return sb == nlmsvc_file_inode(file)->i_sb;
+ }
+
+ /**
+diff --git a/fs/lockd/svcxdr.h b/fs/lockd/svcxdr.h
+new file mode 100644
+index 0000000000000..4f1a451da5ba2
+--- /dev/null
++++ b/fs/lockd/svcxdr.h
+@@ -0,0 +1,142 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/*
++ * Encode/decode NLM basic data types
++ *
++ * Basic NLMv3 XDR data types are not defined in an IETF standards
++ * document. X/Open has a description of these data types that
++ * is useful. See Chapter 10 of "Protocols for Interworking:
++ * XNFS, Version 3W".
++ *
++ * Basic NLMv4 XDR data types are defined in Appendix II.1.4 of
++ * RFC 1813: "NFS Version 3 Protocol Specification".
++ *
++ * Author: Chuck Lever <chuck.lever@oracle.com>
++ *
++ * Copyright (c) 2020, Oracle and/or its affiliates.
++ */
++
++#ifndef _LOCKD_SVCXDR_H_
++#define _LOCKD_SVCXDR_H_
++
++static inline bool
++svcxdr_decode_stats(struct xdr_stream *xdr, __be32 *status)
++{
++ __be32 *p;
++
++ p = xdr_inline_decode(xdr, XDR_UNIT);
++ if (!p)
++ return false;
++ *status = *p;
++
++ return true;
++}
++
++static inline bool
++svcxdr_encode_stats(struct xdr_stream *xdr, __be32 status)
++{
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, XDR_UNIT);
++ if (!p)
++ return false;
++ *p = status;
++
++ return true;
++}
++
++static inline bool
++svcxdr_decode_string(struct xdr_stream *xdr, char **data, unsigned int *data_len)
++{
++ __be32 *p;
++ u32 len;
++
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
++ if (len > NLM_MAXSTRLEN)
++ return false;
++ p = xdr_inline_decode(xdr, len);
++ if (!p)
++ return false;
++ *data_len = len;
++ *data = (char *)p;
++
++ return true;
++}
++
++/*
++ * NLM cookies are defined by specification to be a variable-length
++ * XDR opaque no longer than 1024 bytes. However, this implementation
++ * limits their length to 32 bytes, and treats zero-length cookies
++ * specially.
++ */
++static inline bool
++svcxdr_decode_cookie(struct xdr_stream *xdr, struct nlm_cookie *cookie)
++{
++ __be32 *p;
++ u32 len;
++
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
++ if (len > NLM_MAXCOOKIELEN)
++ return false;
++ if (!len)
++ goto out_hpux;
++
++ p = xdr_inline_decode(xdr, len);
++ if (!p)
++ return false;
++ cookie->len = len;
++ memcpy(cookie->data, p, len);
++
++ return true;
++
++ /* apparently HPUX can return empty cookies */
++out_hpux:
++ cookie->len = 4;
++ memset(cookie->data, 0, 4);
++ return true;
++}
++
++static inline bool
++svcxdr_encode_cookie(struct xdr_stream *xdr, const struct nlm_cookie *cookie)
++{
++ __be32 *p;
++
++ if (xdr_stream_encode_u32(xdr, cookie->len) < 0)
++ return false;
++ p = xdr_reserve_space(xdr, cookie->len);
++ if (!p)
++ return false;
++ memcpy(p, cookie->data, cookie->len);
++
++ return true;
++}
++
++static inline bool
++svcxdr_decode_owner(struct xdr_stream *xdr, struct xdr_netobj *obj)
++{
++ __be32 *p;
++ u32 len;
++
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
++ if (len > XDR_MAX_NETOBJ)
++ return false;
++ p = xdr_inline_decode(xdr, len);
++ if (!p)
++ return false;
++ obj->len = len;
++ obj->data = (u8 *)p;
++
++ return true;
++}
++
++static inline bool
++svcxdr_encode_owner(struct xdr_stream *xdr, const struct xdr_netobj *obj)
++{
++ if (obj->len > XDR_MAX_NETOBJ)
++ return false;
++ return xdr_stream_encode_opaque(xdr, obj->data, obj->len) > 0;
++}
++
++#endif /* _LOCKD_SVCXDR_H_ */
+diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
+index 982629f7b120a..2fb5748dae0c8 100644
+--- a/fs/lockd/xdr.c
++++ b/fs/lockd/xdr.c
+@@ -19,7 +19,7 @@
+
+ #include <uapi/linux/nfs2.h>
+
+-#define NLMDBG_FACILITY NLMDBG_XDR
++#include "svcxdr.h"
+
+
+ static inline loff_t
+@@ -42,311 +42,313 @@ loff_t_to_s32(loff_t offset)
+ }
+
+ /*
+- * XDR functions for basic NLM types
++ * NLM file handles are defined by specification to be a variable-length
++ * XDR opaque no longer than 1024 bytes. However, this implementation
++ * constrains their length to exactly the length of an NFSv2 file
++ * handle.
+ */
+-static __be32 *nlm_decode_cookie(__be32 *p, struct nlm_cookie *c)
++static bool
++svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+ {
+- unsigned int len;
+-
+- len = ntohl(*p++);
+-
+- if(len==0)
+- {
+- c->len=4;
+- memset(c->data, 0, 4); /* hockeypux brain damage */
+- }
+- else if(len<=NLM_MAXCOOKIELEN)
+- {
+- c->len=len;
+- memcpy(c->data, p, len);
+- p+=XDR_QUADLEN(len);
+- }
+- else
+- {
+- dprintk("lockd: bad cookie size %d (only cookies under "
+- "%d bytes are supported.)\n",
+- len, NLM_MAXCOOKIELEN);
+- return NULL;
+- }
+- return p;
+-}
+-
+-static inline __be32 *
+-nlm_encode_cookie(__be32 *p, struct nlm_cookie *c)
+-{
+- *p++ = htonl(c->len);
+- memcpy(p, c->data, c->len);
+- p+=XDR_QUADLEN(c->len);
+- return p;
+-}
+-
+-static __be32 *
+-nlm_decode_fh(__be32 *p, struct nfs_fh *f)
+-{
+- unsigned int len;
+-
+- if ((len = ntohl(*p++)) != NFS2_FHSIZE) {
+- dprintk("lockd: bad fhandle size %d (should be %d)\n",
+- len, NFS2_FHSIZE);
+- return NULL;
+- }
+- f->size = NFS2_FHSIZE;
+- memset(f->data, 0, sizeof(f->data));
+- memcpy(f->data, p, NFS2_FHSIZE);
+- return p + XDR_QUADLEN(NFS2_FHSIZE);
+-}
+-
+-/*
+- * Encode and decode owner handle
+- */
+-static inline __be32 *
+-nlm_decode_oh(__be32 *p, struct xdr_netobj *oh)
+-{
+- return xdr_decode_netobj(p, oh);
+-}
+-
+-static inline __be32 *
+-nlm_encode_oh(__be32 *p, struct xdr_netobj *oh)
+-{
+- return xdr_encode_netobj(p, oh);
++ __be32 *p;
++ u32 len;
++
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
++ if (len != NFS2_FHSIZE)
++ return false;
++
++ p = xdr_inline_decode(xdr, len);
++ if (!p)
++ return false;
++ fh->size = NFS2_FHSIZE;
++ memcpy(fh->data, p, len);
++ memset(fh->data + NFS2_FHSIZE, 0, sizeof(fh->data) - NFS2_FHSIZE);
++
++ return true;
+ }
+
+-static __be32 *
+-nlm_decode_lock(__be32 *p, struct nlm_lock *lock)
++static bool
++svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
+ {
+- struct file_lock *fl = &lock->fl;
+- s32 start, len, end;
+-
+- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+- &lock->len,
+- NLM_MAXSTRLEN))
+- || !(p = nlm_decode_fh(p, &lock->fh))
+- || !(p = nlm_decode_oh(p, &lock->oh)))
+- return NULL;
+- lock->svid = ntohl(*p++);
++ struct file_lock *fl = &lock->fl;
++ s32 start, len, end;
++
++ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
++ return false;
++ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
++ return false;
++ if (!svcxdr_decode_owner(xdr, &lock->oh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &start) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
+
+ locks_init_lock(fl);
+ fl->fl_flags = FL_POSIX;
+- fl->fl_type = F_RDLCK; /* as good as anything else */
+- start = ntohl(*p++);
+- len = ntohl(*p++);
++ fl->fl_type = F_RDLCK;
+ end = start + len - 1;
+-
+ fl->fl_start = s32_to_loff_t(start);
+-
+ if (len == 0 || end < 0)
+ fl->fl_end = OFFSET_MAX;
+ else
+ fl->fl_end = s32_to_loff_t(end);
+- return p;
++
++ return true;
+ }
+
+-/*
+- * Encode result of a TEST/TEST_MSG call
+- */
+-static __be32 *
+-nlm_encode_testres(__be32 *p, struct nlm_res *resp)
++static bool
++svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
+ {
+- s32 start, len;
+-
+- if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+- return NULL;
+- *p++ = resp->status;
+-
+- if (resp->status == nlm_lck_denied) {
+- struct file_lock *fl = &resp->lock.fl;
+-
+- *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
+- *p++ = htonl(resp->lock.svid);
+-
+- /* Encode owner handle. */
+- if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
+- return NULL;
++ const struct file_lock *fl = &lock->fl;
++ s32 start, len;
++
++ /* exclusive */
++ if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
++ return false;
++ if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
++ return false;
++ if (!svcxdr_encode_owner(xdr, &lock->oh))
++ return false;
++ start = loff_t_to_s32(fl->fl_start);
++ if (fl->fl_end == OFFSET_MAX)
++ len = 0;
++ else
++ len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
++ if (xdr_stream_encode_u32(xdr, start) < 0)
++ return false;
++ if (xdr_stream_encode_u32(xdr, len) < 0)
++ return false;
+
+- start = loff_t_to_s32(fl->fl_start);
+- if (fl->fl_end == OFFSET_MAX)
+- len = 0;
+- else
+- len = loff_t_to_s32(fl->fl_end - fl->fl_start + 1);
++ return true;
++}
+
+- *p++ = htonl(start);
+- *p++ = htonl(len);
++static bool
++svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
++{
++ if (!svcxdr_encode_stats(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nlm_lck_denied:
++ if (!svcxdr_encode_holder(xdr, &resp->lock))
++ return false;
+ }
+
+- return p;
++ return true;
+ }
+
+
+ /*
+- * First, the server side XDR functions
++ * Decode Call arguments
+ */
+-int
+-nlmsvc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nlm_args *argp = rqstp->rq_argp;
+- u32 exclusive;
+-
+- if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+- return 0;
+-
+- exclusive = ntohl(*p++);
+- if (!(p = nlm_decode_lock(p, &argp->lock)))
+- return 0;
+- if (exclusive)
+- argp->lock.fl.fl_type = F_WRLCK;
+
+- return xdr_argsize_check(rqstp, p);
++bool
++nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
++{
++ return true;
+ }
+
+-int
+-nlmsvc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_resp;
++ struct nlm_args *argp = rqstp->rq_argp;
++ u32 exclusive;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
++ if (exclusive)
++ argp->lock.fl.fl_type = F_WRLCK;
+
+- if (!(p = nlm_encode_testres(p, resp)))
+- return 0;
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nlmsvc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+- u32 exclusive;
+-
+- if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+- return 0;
+- argp->block = ntohl(*p++);
+- exclusive = ntohl(*p++);
+- if (!(p = nlm_decode_lock(p, &argp->lock)))
+- return 0;
++ u32 exclusive;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
++ return false;
++ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
+ if (exclusive)
+ argp->lock.fl.fl_type = F_WRLCK;
+- argp->reclaim = ntohl(*p++);
+- argp->state = ntohl(*p++);
++ if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
++ return false;
+ argp->monitor = 1; /* monitor client by default */
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nlmsvc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+- u32 exclusive;
+-
+- if (!(p = nlm_decode_cookie(p, &argp->cookie)))
+- return 0;
+- argp->block = ntohl(*p++);
+- exclusive = ntohl(*p++);
+- if (!(p = nlm_decode_lock(p, &argp->lock)))
+- return 0;
++ u32 exclusive;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
++ return false;
++ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
+ if (exclusive)
+ argp->lock.fl.fl_type = F_WRLCK;
+- return xdr_argsize_check(rqstp, p);
++
++ return true;
+ }
+
+-int
+-nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+
+- if (!(p = nlm_decode_cookie(p, &argp->cookie))
+- || !(p = nlm_decode_lock(p, &argp->lock)))
+- return 0;
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
+ argp->lock.fl.fl_type = F_UNLCK;
+- return xdr_argsize_check(rqstp, p);
++
++ return true;
+ }
+
+-int
+-nlmsvc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_args *argp = rqstp->rq_argp;
+- struct nlm_lock *lock = &argp->lock;
++ struct nlm_res *resp = rqstp->rq_argp;
+
+- memset(lock, 0, sizeof(*lock));
+- locks_init_lock(&lock->fl);
+- lock->svid = ~(u32) 0;
+-
+- if (!(p = nlm_decode_cookie(p, &argp->cookie))
+- || !(p = xdr_decode_string_inplace(p, &lock->caller,
+- &lock->len, NLM_MAXSTRLEN))
+- || !(p = nlm_decode_fh(p, &lock->fh))
+- || !(p = nlm_decode_oh(p, &lock->oh)))
+- return 0;
+- argp->fsm_mode = ntohl(*p++);
+- argp->fsm_access = ntohl(*p++);
+- return xdr_argsize_check(rqstp, p);
++ if (!svcxdr_decode_cookie(xdr, &resp->cookie))
++ return false;
++ if (!svcxdr_decode_stats(xdr, &resp->status))
++ return false;
++
++ return true;
+ }
+
+-int
+-nlmsvc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_resp;
++ struct nlm_reboot *argp = rqstp->rq_argp;
++ __be32 *p;
++ u32 len;
++
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
++ if (len > SM_MAXSTRLEN)
++ return false;
++ p = xdr_inline_decode(xdr, len);
++ if (!p)
++ return false;
++ argp->len = len;
++ argp->mon = (char *)p;
++ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
++ return false;
++ p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
++ if (!p)
++ return false;
++ memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+
+- if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+- return 0;
+- *p++ = resp->status;
+- *p++ = xdr_zero; /* sequence argument */
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nlmsvc_encode_res(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_resp;
++ struct nlm_args *argp = rqstp->rq_argp;
++ struct nlm_lock *lock = &argp->lock;
+
+- if (!(p = nlm_encode_cookie(p, &resp->cookie)))
+- return 0;
+- *p++ = resp->status;
+- return xdr_ressize_check(rqstp, p);
++ memset(lock, 0, sizeof(*lock));
++ locks_init_lock(&lock->fl);
++ lock->svid = ~(u32)0;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
++ return false;
++ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
++ return false;
++ if (!svcxdr_decode_owner(xdr, &lock->oh))
++ return false;
++ /* XXX: Range checks are missing in the original code */
++ if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
++ return false;
++
++ return true;
+ }
+
+-int
+-nlmsvc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+ struct nlm_lock *lock = &argp->lock;
+
+- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+- &lock->len, NLM_MAXSTRLEN)))
+- return 0;
+- argp->state = ntohl(*p++);
+- return xdr_argsize_check(rqstp, p);
++ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
++ return false;
++
++ return true;
+ }
+
+-int
+-nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nlm_reboot *argp = rqstp->rq_argp;
+
+- if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
+- return 0;
+- argp->state = ntohl(*p++);
+- memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+- p += XDR_QUADLEN(SM_PRIV_SIZE);
+- return xdr_argsize_check(rqstp, p);
++/*
++ * Encode Reply results
++ */
++
++bool
++nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
++{
++ return true;
+ }
+
+-int
+-nlmsvc_decode_res(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_argp;
++ struct nlm_res *resp = rqstp->rq_resp;
+
+- if (!(p = nlm_decode_cookie(p, &resp->cookie)))
+- return 0;
+- resp->status = *p++;
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
++ svcxdr_encode_testrply(xdr, resp);
+ }
+
+-int
+-nlmsvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- return xdr_argsize_check(rqstp, p);
++ struct nlm_res *resp = rqstp->rq_resp;
++
++ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
++ svcxdr_encode_stats(xdr, resp->status);
+ }
+
+-int
+-nlmsvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- return xdr_ressize_check(rqstp, p);
++ struct nlm_res *resp = rqstp->rq_resp;
++
++ if (!svcxdr_encode_cookie(xdr, &resp->cookie))
++ return false;
++ if (!svcxdr_encode_stats(xdr, resp->status))
++ return false;
++ /* sequence */
++ if (xdr_stream_encode_u32(xdr, 0) < 0)
++ return false;
++
++ return true;
+ }
+diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
+index 5fa9f48a9dba7..5fcbf30cd2759 100644
+--- a/fs/lockd/xdr4.c
++++ b/fs/lockd/xdr4.c
+@@ -18,14 +18,7 @@
+ #include <linux/sunrpc/stats.h>
+ #include <linux/lockd/lockd.h>
+
+-#define NLMDBG_FACILITY NLMDBG_XDR
+-
+-static inline loff_t
+-s64_to_loff_t(__s64 offset)
+-{
+- return (loff_t)offset;
+-}
+-
++#include "svcxdr.h"
+
+ static inline s64
+ loff_t_to_s64(loff_t offset)
+@@ -40,310 +33,317 @@ loff_t_to_s64(loff_t offset)
+ return res;
+ }
+
+-/*
+- * XDR functions for basic NLM types
+- */
+-static __be32 *
+-nlm4_decode_cookie(__be32 *p, struct nlm_cookie *c)
+-{
+- unsigned int len;
+-
+- len = ntohl(*p++);
+-
+- if(len==0)
+- {
+- c->len=4;
+- memset(c->data, 0, 4); /* hockeypux brain damage */
+- }
+- else if(len<=NLM_MAXCOOKIELEN)
+- {
+- c->len=len;
+- memcpy(c->data, p, len);
+- p+=XDR_QUADLEN(len);
+- }
+- else
+- {
+- dprintk("lockd: bad cookie size %d (only cookies under "
+- "%d bytes are supported.)\n",
+- len, NLM_MAXCOOKIELEN);
+- return NULL;
+- }
+- return p;
+-}
+-
+-static __be32 *
+-nlm4_encode_cookie(__be32 *p, struct nlm_cookie *c)
++void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len)
+ {
+- *p++ = htonl(c->len);
+- memcpy(p, c->data, c->len);
+- p+=XDR_QUADLEN(c->len);
+- return p;
+-}
++ s64 end = off + len - 1;
+
+-static __be32 *
+-nlm4_decode_fh(__be32 *p, struct nfs_fh *f)
+-{
+- memset(f->data, 0, sizeof(f->data));
+- f->size = ntohl(*p++);
+- if (f->size > NFS_MAXFHSIZE) {
+- dprintk("lockd: bad fhandle size %d (should be <=%d)\n",
+- f->size, NFS_MAXFHSIZE);
+- return NULL;
+- }
+- memcpy(f->data, p, f->size);
+- return p + XDR_QUADLEN(f->size);
++ fl->fl_start = off;
++ if (len == 0 || end < 0)
++ fl->fl_end = OFFSET_MAX;
++ else
++ fl->fl_end = end;
+ }
+
+ /*
+- * Encode and decode owner handle
++ * NLM file handles are defined by specification to be a variable-length
++ * XDR opaque no longer than 1024 bytes. However, this implementation
++ * limits their length to the size of an NFSv3 file handle.
+ */
+-static __be32 *
+-nlm4_decode_oh(__be32 *p, struct xdr_netobj *oh)
++static bool
++svcxdr_decode_fhandle(struct xdr_stream *xdr, struct nfs_fh *fh)
+ {
+- return xdr_decode_netobj(p, oh);
++ __be32 *p;
++ u32 len;
++
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
++ if (len > NFS_MAXFHSIZE)
++ return false;
++
++ p = xdr_inline_decode(xdr, len);
++ if (!p)
++ return false;
++ fh->size = len;
++ memcpy(fh->data, p, len);
++ memset(fh->data + len, 0, sizeof(fh->data) - len);
++
++ return true;
+ }
+
+-static __be32 *
+-nlm4_decode_lock(__be32 *p, struct nlm_lock *lock)
++static bool
++svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
+ {
+- struct file_lock *fl = &lock->fl;
+- __u64 len, start;
+- __s64 end;
+-
+- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+- &lock->len, NLM_MAXSTRLEN))
+- || !(p = nlm4_decode_fh(p, &lock->fh))
+- || !(p = nlm4_decode_oh(p, &lock->oh)))
+- return NULL;
+- lock->svid = ntohl(*p++);
++ struct file_lock *fl = &lock->fl;
++
++ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
++ return false;
++ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
++ return false;
++ if (!svcxdr_decode_owner(xdr, &lock->oh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &lock->svid) < 0)
++ return false;
++ if (xdr_stream_decode_u64(xdr, &lock->lock_start) < 0)
++ return false;
++ if (xdr_stream_decode_u64(xdr, &lock->lock_len) < 0)
++ return false;
+
+ locks_init_lock(fl);
+ fl->fl_flags = FL_POSIX;
+- fl->fl_type = F_RDLCK; /* as good as anything else */
+- p = xdr_decode_hyper(p, &start);
+- p = xdr_decode_hyper(p, &len);
+- end = start + len - 1;
+-
+- fl->fl_start = s64_to_loff_t(start);
++ fl->fl_type = F_RDLCK;
++ nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len);
++ return true;
++}
+
+- if (len == 0 || end < 0)
+- fl->fl_end = OFFSET_MAX;
++static bool
++svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
++{
++ const struct file_lock *fl = &lock->fl;
++ s64 start, len;
++
++ /* exclusive */
++ if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
++ return false;
++ if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
++ return false;
++ if (!svcxdr_encode_owner(xdr, &lock->oh))
++ return false;
++ start = loff_t_to_s64(fl->fl_start);
++ if (fl->fl_end == OFFSET_MAX)
++ len = 0;
+ else
+- fl->fl_end = s64_to_loff_t(end);
+- return p;
++ len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
++ if (xdr_stream_encode_u64(xdr, start) < 0)
++ return false;
++ if (xdr_stream_encode_u64(xdr, len) < 0)
++ return false;
++
++ return true;
+ }
+
+-/*
+- * Encode result of a TEST/TEST_MSG call
+- */
+-static __be32 *
+-nlm4_encode_testres(__be32 *p, struct nlm_res *resp)
++static bool
++svcxdr_encode_testrply(struct xdr_stream *xdr, const struct nlm_res *resp)
+ {
+- s64 start, len;
+-
+- dprintk("xdr: before encode_testres (p %p resp %p)\n", p, resp);
+- if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+- return NULL;
+- *p++ = resp->status;
+-
+- if (resp->status == nlm_lck_denied) {
+- struct file_lock *fl = &resp->lock.fl;
+-
+- *p++ = (fl->fl_type == F_RDLCK)? xdr_zero : xdr_one;
+- *p++ = htonl(resp->lock.svid);
+-
+- /* Encode owner handle. */
+- if (!(p = xdr_encode_netobj(p, &resp->lock.oh)))
+- return NULL;
+-
+- start = loff_t_to_s64(fl->fl_start);
+- if (fl->fl_end == OFFSET_MAX)
+- len = 0;
+- else
+- len = loff_t_to_s64(fl->fl_end - fl->fl_start + 1);
+-
+- p = xdr_encode_hyper(p, start);
+- p = xdr_encode_hyper(p, len);
+- dprintk("xdr: encode_testres (status %u pid %d type %d start %Ld end %Ld)\n",
+- resp->status, (int)resp->lock.svid, fl->fl_type,
+- (long long)fl->fl_start, (long long)fl->fl_end);
++ if (!svcxdr_encode_stats(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nlm_lck_denied:
++ if (!svcxdr_encode_holder(xdr, &resp->lock))
++ return false;
+ }
+
+- dprintk("xdr: after encode_testres (p %p resp %p)\n", p, resp);
+- return p;
++ return true;
+ }
+
+
+ /*
+- * First, the server side XDR functions
++ * Decode Call arguments
+ */
+-int
+-nlm4svc_decode_testargs(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nlm_args *argp = rqstp->rq_argp;
+- u32 exclusive;
+-
+- if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+- return 0;
+-
+- exclusive = ntohl(*p++);
+- if (!(p = nlm4_decode_lock(p, &argp->lock)))
+- return 0;
+- if (exclusive)
+- argp->lock.fl.fl_type = F_WRLCK;
+
+- return xdr_argsize_check(rqstp, p);
++bool
++nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
++{
++ return true;
+ }
+
+-int
+-nlm4svc_encode_testres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_resp;
++ struct nlm_args *argp = rqstp->rq_argp;
++ u32 exclusive;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
++ if (exclusive)
++ argp->lock.fl.fl_type = F_WRLCK;
+
+- if (!(p = nlm4_encode_testres(p, resp)))
+- return 0;
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nlm4svc_decode_lockargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+- u32 exclusive;
+-
+- if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+- return 0;
+- argp->block = ntohl(*p++);
+- exclusive = ntohl(*p++);
+- if (!(p = nlm4_decode_lock(p, &argp->lock)))
+- return 0;
++ u32 exclusive;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
++ return false;
++ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
+ if (exclusive)
+ argp->lock.fl.fl_type = F_WRLCK;
+- argp->reclaim = ntohl(*p++);
+- argp->state = ntohl(*p++);
++ if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
++ return false;
+ argp->monitor = 1; /* monitor client by default */
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nlm4svc_decode_cancargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+- u32 exclusive;
+-
+- if (!(p = nlm4_decode_cookie(p, &argp->cookie)))
+- return 0;
+- argp->block = ntohl(*p++);
+- exclusive = ntohl(*p++);
+- if (!(p = nlm4_decode_lock(p, &argp->lock)))
+- return 0;
++ u32 exclusive;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (xdr_stream_decode_bool(xdr, &argp->block) < 0)
++ return false;
++ if (xdr_stream_decode_bool(xdr, &exclusive) < 0)
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
+ if (exclusive)
+ argp->lock.fl.fl_type = F_WRLCK;
+- return xdr_argsize_check(rqstp, p);
++
++ return true;
+ }
+
+-int
+-nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+
+- if (!(p = nlm4_decode_cookie(p, &argp->cookie))
+- || !(p = nlm4_decode_lock(p, &argp->lock)))
+- return 0;
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (!svcxdr_decode_lock(xdr, &argp->lock))
++ return false;
+ argp->lock.fl.fl_type = F_UNLCK;
+- return xdr_argsize_check(rqstp, p);
++
++ return true;
+ }
+
+-int
+-nlm4svc_decode_shareargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_args *argp = rqstp->rq_argp;
+- struct nlm_lock *lock = &argp->lock;
++ struct nlm_res *resp = rqstp->rq_argp;
+
+- memset(lock, 0, sizeof(*lock));
+- locks_init_lock(&lock->fl);
+- lock->svid = ~(u32) 0;
+-
+- if (!(p = nlm4_decode_cookie(p, &argp->cookie))
+- || !(p = xdr_decode_string_inplace(p, &lock->caller,
+- &lock->len, NLM_MAXSTRLEN))
+- || !(p = nlm4_decode_fh(p, &lock->fh))
+- || !(p = nlm4_decode_oh(p, &lock->oh)))
+- return 0;
+- argp->fsm_mode = ntohl(*p++);
+- argp->fsm_access = ntohl(*p++);
+- return xdr_argsize_check(rqstp, p);
++ if (!svcxdr_decode_cookie(xdr, &resp->cookie))
++ return false;
++ if (!svcxdr_decode_stats(xdr, &resp->status))
++ return false;
++
++ return true;
+ }
+
+-int
+-nlm4svc_encode_shareres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_resp;
++ struct nlm_reboot *argp = rqstp->rq_argp;
++ __be32 *p;
++ u32 len;
++
++ if (xdr_stream_decode_u32(xdr, &len) < 0)
++ return false;
++ if (len > SM_MAXSTRLEN)
++ return false;
++ p = xdr_inline_decode(xdr, len);
++ if (!p)
++ return false;
++ argp->len = len;
++ argp->mon = (char *)p;
++ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
++ return false;
++ p = xdr_inline_decode(xdr, SM_PRIV_SIZE);
++ if (!p)
++ return false;
++ memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+
+- if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+- return 0;
+- *p++ = resp->status;
+- *p++ = xdr_zero; /* sequence argument */
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nlm4svc_encode_res(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_resp;
++ struct nlm_args *argp = rqstp->rq_argp;
++ struct nlm_lock *lock = &argp->lock;
+
+- if (!(p = nlm4_encode_cookie(p, &resp->cookie)))
+- return 0;
+- *p++ = resp->status;
+- return xdr_ressize_check(rqstp, p);
++ memset(lock, 0, sizeof(*lock));
++ locks_init_lock(&lock->fl);
++ lock->svid = ~(u32)0;
++
++ if (!svcxdr_decode_cookie(xdr, &argp->cookie))
++ return false;
++ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
++ return false;
++ if (!svcxdr_decode_fhandle(xdr, &lock->fh))
++ return false;
++ if (!svcxdr_decode_owner(xdr, &lock->oh))
++ return false;
++ /* XXX: Range checks are missing in the original code */
++ if (xdr_stream_decode_u32(xdr, &argp->fsm_mode) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->fsm_access) < 0)
++ return false;
++
++ return true;
+ }
+
+-int
+-nlm4svc_decode_notify(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nlm_args *argp = rqstp->rq_argp;
+ struct nlm_lock *lock = &argp->lock;
+
+- if (!(p = xdr_decode_string_inplace(p, &lock->caller,
+- &lock->len, NLM_MAXSTRLEN)))
+- return 0;
+- argp->state = ntohl(*p++);
+- return xdr_argsize_check(rqstp, p);
++ if (!svcxdr_decode_string(xdr, &lock->caller, &lock->len))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
++ return false;
++
++ return true;
+ }
+
+-int
+-nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nlm_reboot *argp = rqstp->rq_argp;
+
+- if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN)))
+- return 0;
+- argp->state = ntohl(*p++);
+- memcpy(&argp->priv.data, p, sizeof(argp->priv.data));
+- p += XDR_QUADLEN(SM_PRIV_SIZE);
+- return xdr_argsize_check(rqstp, p);
++/*
++ * Encode Reply results
++ */
++
++bool
++nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
++{
++ return true;
+ }
+
+-int
+-nlm4svc_decode_res(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nlm_res *resp = rqstp->rq_argp;
++ struct nlm_res *resp = rqstp->rq_resp;
+
+- if (!(p = nlm4_decode_cookie(p, &resp->cookie)))
+- return 0;
+- resp->status = *p++;
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
++ svcxdr_encode_testrply(xdr, resp);
+ }
+
+-int
+-nlm4svc_decode_void(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- return xdr_argsize_check(rqstp, p);
++ struct nlm_res *resp = rqstp->rq_resp;
++
++ return svcxdr_encode_cookie(xdr, &resp->cookie) &&
++ svcxdr_encode_stats(xdr, resp->status);
+ }
+
+-int
+-nlm4svc_encode_void(struct svc_rqst *rqstp, __be32 *p)
++bool
++nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- return xdr_ressize_check(rqstp, p);
++ struct nlm_res *resp = rqstp->rq_resp;
++
++ if (!svcxdr_encode_cookie(xdr, &resp->cookie))
++ return false;
++ if (!svcxdr_encode_stats(xdr, resp->status))
++ return false;
++ /* sequence */
++ if (xdr_stream_encode_u32(xdr, 0) < 0)
++ return false;
++
++ return true;
+ }
+diff --git a/fs/locks.c b/fs/locks.c
+index cbb5701ce9f37..b0753c8871fb2 100644
+--- a/fs/locks.c
++++ b/fs/locks.c
+@@ -251,7 +251,7 @@ locks_get_lock_context(struct inode *inode, int type)
+ struct file_lock_context *ctx;
+
+ /* paired with cmpxchg() below */
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (likely(ctx) || type == F_UNLCK)
+ goto out;
+
+@@ -270,7 +270,7 @@ locks_get_lock_context(struct inode *inode, int type)
+ */
+ if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+ kmem_cache_free(flctx_cache, ctx);
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ }
+ out:
+ trace_locks_get_lock_context(inode, type, ctx);
+@@ -323,7 +323,7 @@ locks_check_ctx_file_list(struct file *filp, struct list_head *list,
+ void
+ locks_free_lock_context(struct inode *inode)
+ {
+- struct file_lock_context *ctx = inode->i_flctx;
++ struct file_lock_context *ctx = locks_inode_context(inode);
+
+ if (unlikely(ctx)) {
+ locks_check_ctx_lists(inode);
+@@ -376,6 +376,34 @@ void locks_release_private(struct file_lock *fl)
+ }
+ EXPORT_SYMBOL_GPL(locks_release_private);
+
++/**
++ * locks_owner_has_blockers - Check for blocking lock requests
++ * @flctx: file lock context
++ * @owner: lock owner
++ *
++ * Return values:
++ * %true: @owner has at least one blocker
++ * %false: @owner has no blockers
++ */
++bool locks_owner_has_blockers(struct file_lock_context *flctx,
++ fl_owner_t owner)
++{
++ struct file_lock *fl;
++
++ spin_lock(&flctx->flc_lock);
++ list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
++ if (fl->fl_owner != owner)
++ continue;
++ if (!list_empty(&fl->fl_blocked_requests)) {
++ spin_unlock(&flctx->flc_lock);
++ return true;
++ }
++ }
++ spin_unlock(&flctx->flc_lock);
++ return false;
++}
++EXPORT_SYMBOL_GPL(locks_owner_has_blockers);
++
+ /* Free a lock which is not in use. */
+ void locks_free_lock(struct file_lock *fl)
+ {
+@@ -954,19 +982,32 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
+ struct file_lock *cfl;
+ struct file_lock_context *ctx;
+ struct inode *inode = locks_inode(filp);
++ void *owner;
++ void (*func)(void);
+
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (!ctx || list_empty_careful(&ctx->flc_posix)) {
+ fl->fl_type = F_UNLCK;
+ return;
+ }
+
++retry:
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
+- if (posix_locks_conflict(fl, cfl)) {
+- locks_copy_conflock(fl, cfl);
+- goto out;
++ if (!posix_locks_conflict(fl, cfl))
++ continue;
++ if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
++ && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) {
++ owner = cfl->fl_lmops->lm_mod_owner;
++ func = cfl->fl_lmops->lm_expire_lock;
++ __module_get(owner);
++ spin_unlock(&ctx->flc_lock);
++ (*func)();
++ module_put(owner);
++ goto retry;
+ }
++ locks_copy_conflock(fl, cfl);
++ goto out;
+ }
+ fl->fl_type = F_UNLCK;
+ out:
+@@ -1140,6 +1181,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+ int error;
+ bool added = false;
+ LIST_HEAD(dispose);
++ void *owner;
++ void (*func)(void);
+
+ ctx = locks_get_lock_context(inode, request->fl_type);
+ if (!ctx)
+@@ -1158,6 +1201,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+ new_fl2 = locks_alloc_lock();
+ }
+
++retry:
+ percpu_down_read(&file_rwsem);
+ spin_lock(&ctx->flc_lock);
+ /*
+@@ -1169,6 +1213,17 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+ list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
+ if (!posix_locks_conflict(request, fl))
+ continue;
++ if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
++ && (*fl->fl_lmops->lm_lock_expirable)(fl)) {
++ owner = fl->fl_lmops->lm_mod_owner;
++ func = fl->fl_lmops->lm_expire_lock;
++ __module_get(owner);
++ spin_unlock(&ctx->flc_lock);
++ percpu_up_read(&file_rwsem);
++ (*func)();
++ module_put(owner);
++ goto retry;
++ }
+ if (conflock)
+ locks_copy_conflock(conflock, fl);
+ error = -EAGAIN;
+@@ -1619,7 +1674,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
+ new_fl->fl_flags = type;
+
+ /* typically we will check that ctx is non-NULL before calling */
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (!ctx) {
+ WARN_ON_ONCE(1);
+ goto free_lock;
+@@ -1724,7 +1779,7 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
+ struct file_lock_context *ctx;
+ struct file_lock *fl;
+
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (ctx && !list_empty_careful(&ctx->flc_lease)) {
+ spin_lock(&ctx->flc_lock);
+ fl = list_first_entry_or_null(&ctx->flc_lease,
+@@ -1770,7 +1825,7 @@ int fcntl_getlease(struct file *filp)
+ int type = F_UNLCK;
+ LIST_HEAD(dispose);
+
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (ctx && !list_empty_careful(&ctx->flc_lease)) {
+ percpu_down_read(&file_rwsem);
+ spin_lock(&ctx->flc_lock);
+@@ -1808,6 +1863,9 @@ check_conflicting_open(struct file *filp, const long arg, int flags)
+
+ if (flags & FL_LAYOUT)
+ return 0;
++ if (flags & FL_DELEG)
++ /* We leave these checks to the caller */
++ return 0;
+
+ if (arg == F_RDLCK)
+ return inode_is_open_for_write(inode) ? -EAGAIN : 0;
+@@ -1956,7 +2014,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
+ struct file_lock_context *ctx;
+ LIST_HEAD(dispose);
+
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (!ctx) {
+ trace_generic_delete_lease(inode, NULL);
+ return error;
+@@ -2536,14 +2594,15 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
+ */
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
++ struct files_struct *files = current->files;
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+- spin_lock(&current->files->file_lock);
+- f = fcheck(fd);
+- spin_unlock(&current->files->file_lock);
++ spin_lock(&files->file_lock);
++ f = files_lookup_fd_locked(files, fd);
++ spin_unlock(&files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+@@ -2667,14 +2726,15 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
+ */
+ if (!error && file_lock->fl_type != F_UNLCK &&
+ !(file_lock->fl_flags & FL_OFDLCK)) {
++ struct files_struct *files = current->files;
+ /*
+ * We need that spin_lock here - it prevents reordering between
+ * update of i_flctx->flc_posix and check for it done in
+ * close(). rcu_read_lock() wouldn't do.
+ */
+- spin_lock(&current->files->file_lock);
+- f = fcheck(fd);
+- spin_unlock(&current->files->file_lock);
++ spin_lock(&files->file_lock);
++ f = files_lookup_fd_locked(files, fd);
++ spin_unlock(&files->file_lock);
+ if (f != filp) {
+ file_lock->fl_type = F_UNLCK;
+ error = do_lock_file_wait(filp, cmd, file_lock);
+@@ -2705,7 +2765,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
+ * posix_lock_file(). Another process could be setting a lock on this
+ * file at the same time, but we wouldn't remove that lock anyway.
+ */
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (!ctx || list_empty(&ctx->flc_posix))
+ return;
+
+@@ -2778,7 +2838,7 @@ void locks_remove_file(struct file *filp)
+ {
+ struct file_lock_context *ctx;
+
+- ctx = smp_load_acquire(&locks_inode(filp)->i_flctx);
++ ctx = locks_inode_context(locks_inode(filp));
+ if (!ctx)
+ return;
+
+@@ -2825,7 +2885,7 @@ bool vfs_inode_has_locks(struct inode *inode)
+ struct file_lock_context *ctx;
+ bool ret;
+
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (!ctx)
+ return false;
+
+@@ -2970,7 +3030,7 @@ void show_fd_locks(struct seq_file *f,
+ struct file_lock_context *ctx;
+ int id = 0;
+
+- ctx = smp_load_acquire(&inode->i_flctx);
++ ctx = locks_inode_context(inode);
+ if (!ctx)
+ return;
+
+diff --git a/fs/namei.c b/fs/namei.c
+index cb37d7c477e0b..72521a614514b 100644
+--- a/fs/namei.c
++++ b/fs/namei.c
+@@ -4277,11 +4277,14 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
+ * ->i_mutex on parents, which works but leads to some truly excessive
+ * locking].
+ */
+-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+- struct inode *new_dir, struct dentry *new_dentry,
+- struct inode **delegated_inode, unsigned int flags)
++int vfs_rename(struct renamedata *rd)
+ {
+ int error;
++ struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir;
++ struct dentry *old_dentry = rd->old_dentry;
++ struct dentry *new_dentry = rd->new_dentry;
++ struct inode **delegated_inode = rd->delegated_inode;
++ unsigned int flags = rd->flags;
+ bool is_dir = d_is_dir(old_dentry);
+ struct inode *source = old_dentry->d_inode;
+ struct inode *target = new_dentry->d_inode;
+@@ -4429,6 +4432,7 @@ EXPORT_SYMBOL(vfs_rename);
+ int do_renameat2(int olddfd, struct filename *from, int newdfd,
+ struct filename *to, unsigned int flags)
+ {
++ struct renamedata rd;
+ struct dentry *old_dentry, *new_dentry;
+ struct dentry *trap;
+ struct path old_path, new_path;
+@@ -4532,9 +4536,14 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd,
+ &new_path, new_dentry, flags);
+ if (error)
+ goto exit5;
+- error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+- new_path.dentry->d_inode, new_dentry,
+- &delegated_inode, flags);
++
++ rd.old_dir = old_path.dentry->d_inode;
++ rd.old_dentry = old_dentry;
++ rd.new_dir = new_path.dentry->d_inode;
++ rd.new_dentry = new_dentry;
++ rd.delegated_inode = &delegated_inode;
++ rd.flags = flags;
++ error = vfs_rename(&rd);
+ exit5:
+ dput(new_dentry);
+ exit4:
+diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
+index 73000aa2d220b..a9e563145e0c2 100644
+--- a/fs/nfs/blocklayout/blocklayout.c
++++ b/fs/nfs/blocklayout/blocklayout.c
+@@ -699,7 +699,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
+
+ xdr_init_decode_pages(&xdr, &buf,
+ lgr->layoutp->pages, lgr->layoutp->len);
+- xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(&xdr, scratch);
+
+ status = -EIO;
+ p = xdr_inline_decode(&xdr, 4);
+diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
+index 6e3a14fdff9c8..16412d6636e86 100644
+--- a/fs/nfs/blocklayout/dev.c
++++ b/fs/nfs/blocklayout/dev.c
+@@ -510,7 +510,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ goto out;
+
+ xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
+- xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(&xdr, scratch);
+
+ p = xdr_inline_decode(&xdr, sizeof(__be32));
+ if (!p)
+diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
+index 7817ad94a6bae..8fe143cad4a2b 100644
+--- a/fs/nfs/callback.c
++++ b/fs/nfs/callback.c
+@@ -17,7 +17,6 @@
+ #include <linux/errno.h>
+ #include <linux/mutex.h>
+ #include <linux/freezer.h>
+-#include <linux/kthread.h>
+ #include <linux/sunrpc/svcauth_gss.h>
+ #include <linux/sunrpc/bc_xprt.h>
+
+@@ -45,18 +44,18 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net)
+ int ret;
+ struct nfs_net *nn = net_generic(net, nfs_net_id);
+
+- ret = svc_create_xprt(serv, "tcp", net, PF_INET,
+- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+- cred);
++ ret = svc_xprt_create(serv, "tcp", net, PF_INET,
++ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
++ cred);
+ if (ret <= 0)
+ goto out_err;
+ nn->nfs_callback_tcpport = ret;
+ dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
+ nn->nfs_callback_tcpport, PF_INET, net->ns.inum);
+
+- ret = svc_create_xprt(serv, "tcp", net, PF_INET6,
+- nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
+- cred);
++ ret = svc_xprt_create(serv, "tcp", net, PF_INET6,
++ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS,
++ cred);
+ if (ret > 0) {
+ nn->nfs_callback_tcpport6 = ret;
+ dprintk("NFS: Callback listener port = %u (af %u, net %x)\n",
+@@ -81,9 +80,6 @@ nfs4_callback_svc(void *vrqstp)
+ set_freezable();
+
+ while (!kthread_freezable_should_stop(NULL)) {
+-
+- if (signal_pending(current))
+- flush_signals(current);
+ /*
+ * Listen for a request on the socket
+ */
+@@ -92,8 +88,8 @@ nfs4_callback_svc(void *vrqstp)
+ continue;
+ svc_process(rqstp);
+ }
++
+ svc_exit_thread(rqstp);
+- module_put_and_exit(0);
+ return 0;
+ }
+
+@@ -113,11 +109,7 @@ nfs41_callback_svc(void *vrqstp)
+ set_freezable();
+
+ while (!kthread_freezable_should_stop(NULL)) {
+-
+- if (signal_pending(current))
+- flush_signals(current);
+-
+- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
++ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE);
+ spin_lock_bh(&serv->sv_cb_lock);
+ if (!list_empty(&serv->sv_cb_list)) {
+ req = list_first_entry(&serv->sv_cb_list,
+@@ -132,12 +124,12 @@ nfs41_callback_svc(void *vrqstp)
+ } else {
+ spin_unlock_bh(&serv->sv_cb_lock);
+ if (!kthread_should_stop())
+- schedule();
++ freezable_schedule();
+ finish_wait(&serv->sv_cb_waitq, &wq);
+ }
+ }
++
+ svc_exit_thread(rqstp);
+- module_put_and_exit(0);
+ return 0;
+ }
+
+@@ -169,12 +161,12 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
+ if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS)
+ nrservs = NFS4_MIN_NR_CALLBACK_THREADS;
+
+- if (serv->sv_nrthreads-1 == nrservs)
++ if (serv->sv_nrthreads == nrservs)
+ return 0;
+
+- ret = serv->sv_ops->svo_setup(serv, NULL, nrservs);
++ ret = svc_set_num_threads(serv, NULL, nrservs);
+ if (ret) {
+- serv->sv_ops->svo_setup(serv, NULL, 0);
++ svc_set_num_threads(serv, NULL, 0);
+ return ret;
+ }
+ dprintk("nfs_callback_up: service started\n");
+@@ -189,7 +181,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
+ return;
+
+ dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum);
+- svc_shutdown_net(serv, net);
++ svc_xprt_destroy_all(serv, net);
+ }
+
+ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+@@ -232,59 +224,17 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+ return ret;
+ }
+
+-static const struct svc_serv_ops nfs40_cb_sv_ops = {
+- .svo_function = nfs4_callback_svc,
+- .svo_enqueue_xprt = svc_xprt_do_enqueue,
+- .svo_setup = svc_set_num_threads_sync,
+- .svo_module = THIS_MODULE,
+-};
+-#if defined(CONFIG_NFS_V4_1)
+-static const struct svc_serv_ops nfs41_cb_sv_ops = {
+- .svo_function = nfs41_callback_svc,
+- .svo_enqueue_xprt = svc_xprt_do_enqueue,
+- .svo_setup = svc_set_num_threads_sync,
+- .svo_module = THIS_MODULE,
+-};
+-
+-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+- [0] = &nfs40_cb_sv_ops,
+- [1] = &nfs41_cb_sv_ops,
+-};
+-#else
+-static const struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+- [0] = &nfs40_cb_sv_ops,
+- [1] = NULL,
+-};
+-#endif
+-
+ static struct svc_serv *nfs_callback_create_svc(int minorversion)
+ {
+ struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+- const struct svc_serv_ops *sv_ops;
++ int (*threadfn)(void *data);
+ struct svc_serv *serv;
+
+ /*
+ * Check whether we're already up and running.
+ */
+- if (cb_info->serv) {
+- /*
+- * Note: increase service usage, because later in case of error
+- * svc_destroy() will be called.
+- */
+- svc_get(cb_info->serv);
+- return cb_info->serv;
+- }
+-
+- switch (minorversion) {
+- case 0:
+- sv_ops = nfs4_cb_sv_ops[0];
+- break;
+- default:
+- sv_ops = nfs4_cb_sv_ops[1];
+- }
+-
+- if (sv_ops == NULL)
+- return ERR_PTR(-ENOTSUPP);
++ if (cb_info->serv)
++ return svc_get(cb_info->serv);
+
+ /*
+ * Sanity check: if there's no task,
+@@ -294,7 +244,16 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
+ printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
+ cb_info->users);
+
+- serv = svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
++ threadfn = nfs4_callback_svc;
++#if defined(CONFIG_NFS_V4_1)
++ if (minorversion)
++ threadfn = nfs41_callback_svc;
++#else
++ if (minorversion)
++ return ERR_PTR(-ENOTSUPP);
++#endif
++ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
++ threadfn);
+ if (!serv) {
+ printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
+ return ERR_PTR(-ENOMEM);
+@@ -335,16 +294,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
+ goto err_start;
+
+ cb_info->users++;
+- /*
+- * svc_create creates the svc_serv with sv_nrthreads == 1, and then
+- * svc_prepare_thread increments that. So we need to call svc_destroy
+- * on both success and failure so that the refcount is 1 when the
+- * thread exits.
+- */
+ err_net:
+ if (!cb_info->users)
+ cb_info->serv = NULL;
+- svc_destroy(serv);
++ svc_put(serv);
+ err_create:
+ mutex_unlock(&nfs_callback_mutex);
+ return ret;
+@@ -369,8 +322,8 @@ void nfs_callback_down(int minorversion, struct net *net)
+ cb_info->users--;
+ if (cb_info->users == 0) {
+ svc_get(serv);
+- serv->sv_ops->svo_setup(serv, NULL, 0);
+- svc_destroy(serv);
++ svc_set_num_threads(serv, NULL, 0);
++ svc_put(serv);
+ dprintk("nfs_callback_down: service destroyed\n");
+ cb_info->serv = NULL;
+ }
+@@ -429,6 +382,8 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
+ */
+ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
+ {
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
++
+ switch (rqstp->rq_authop->flavour) {
+ case RPC_AUTH_NULL:
+ if (rqstp->rq_proc != CB_NULL)
+@@ -439,6 +394,8 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
+ if (svc_is_backchannel(rqstp))
+ return SVC_DENIED;
+ }
++
++ rqstp->rq_auth_stat = rpc_auth_ok;
+ return SVC_OK;
+ }
+
+diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
+index ca8a4aa351dc9..db69fc267c9a0 100644
+--- a/fs/nfs/callback_xdr.c
++++ b/fs/nfs/callback_xdr.c
+@@ -63,14 +63,13 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp)
+ return htonl(NFS4_OK);
+ }
+
+-static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return xdr_argsize_check(rqstp, p);
+-}
+-
+-static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p)
++/*
++ * svc_process_common() looks for an XDR encoder to know when
++ * not to drop a Reply.
++ */
++static bool nfs4_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+ static __be32 decode_string(struct xdr_stream *xdr, unsigned int *len,
+@@ -984,7 +983,17 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
+
+ out_invalidcred:
+ pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
+- return svc_return_autherr(rqstp, rpc_autherr_badcred);
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
++ return rpc_success;
++}
++
++static int
++nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp)
++{
++ const struct svc_procedure *procp = rqstp->rq_procinfo;
++
++ *statp = procp->pc_func(rqstp);
++ return 1;
+ }
+
+ /*
+@@ -1053,16 +1062,18 @@ static struct callback_op callback_ops[] = {
+ static const struct svc_procedure nfs4_callback_procedures1[] = {
+ [CB_NULL] = {
+ .pc_func = nfs4_callback_null,
+- .pc_decode = nfs4_decode_void,
+ .pc_encode = nfs4_encode_void,
+ .pc_xdrressize = 1,
++ .pc_name = "NULL",
+ },
+ [CB_COMPOUND] = {
+ .pc_func = nfs4_callback_compound,
+ .pc_encode = nfs4_encode_void,
+ .pc_argsize = 256,
++ .pc_argzero = 256,
+ .pc_ressize = 256,
+ .pc_xdrressize = NFS4_CALLBACK_BUFSIZE,
++ .pc_name = "COMPOUND",
+ }
+ };
+
+@@ -1073,7 +1084,7 @@ const struct svc_version nfs4_callback_version1 = {
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_count = nfs4_callback_count1,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+- .vs_dispatch = NULL,
++ .vs_dispatch = nfs_callback_dispatch,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
+ };
+@@ -1085,7 +1096,7 @@ const struct svc_version nfs4_callback_version4 = {
+ .vs_proc = nfs4_callback_procedures1,
+ .vs_count = nfs4_callback_count4,
+ .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
+- .vs_dispatch = NULL,
++ .vs_dispatch = nfs_callback_dispatch,
+ .vs_hidden = true,
+ .vs_need_cong_ctrl = true,
+ };
+diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
+index 9f88ca7b20015..935029632d5f6 100644
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -576,7 +576,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en
+ goto out_nopages;
+
+ xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen);
+- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(&stream, scratch);
+
+ do {
+ if (entry->label)
+diff --git a/fs/nfs/export.c b/fs/nfs/export.c
+index 3430d6891e89f..993be63ab3015 100644
+--- a/fs/nfs/export.c
++++ b/fs/nfs/export.c
+@@ -167,8 +167,25 @@ nfs_get_parent(struct dentry *dentry)
+ return parent;
+ }
+
++static u64 nfs_fetch_iversion(struct inode *inode)
++{
++ struct nfs_server *server = NFS_SERVER(inode);
++
++ if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_CHANGE |
++ NFS_INO_REVAL_PAGECACHE))
++ __nfs_revalidate_inode(server, inode);
++ return inode_peek_iversion_raw(inode);
++}
++
+ const struct export_operations nfs_export_ops = {
+ .encode_fh = nfs_encode_fh,
+ .fh_to_dentry = nfs_fh_to_dentry,
+ .get_parent = nfs_get_parent,
++ .fetch_iversion = nfs_fetch_iversion,
++ .flags = EXPORT_OP_NOWCC |
++ EXPORT_OP_NOSUBTREECHK |
++ EXPORT_OP_CLOSE_BEFORE_UNLINK |
++ EXPORT_OP_REMOTE_FS |
++ EXPORT_OP_NOATOMIC_ATTR |
++ EXPORT_OP_FLUSH_ON_CLOSE,
+ };
+diff --git a/fs/nfs/file.c b/fs/nfs/file.c
+index 7be1a7f7fcb2a..d35aae47b062b 100644
+--- a/fs/nfs/file.c
++++ b/fs/nfs/file.c
+@@ -798,6 +798,9 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
+
+ nfs_inc_stats(inode, NFSIOS_VFSLOCK);
+
++ if (fl->fl_flags & FL_RECLAIM)
++ return -ENOGRACE;
++
+ /* No mandatory locks over NFS */
+ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
+ goto out_err;
+diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
+index deecfb50dd7e3..2ed8b6885b091 100644
+--- a/fs/nfs/filelayout/filelayout.c
++++ b/fs/nfs/filelayout/filelayout.c
+@@ -293,8 +293,6 @@ static void filelayout_read_call_done(struct rpc_task *task, void *data)
+ {
+ struct nfs_pgio_header *hdr = data;
+
+- dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+-
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs41_sequence_done(task, &hdr->res.seq_res);
+@@ -666,7 +664,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
+ return -ENOMEM;
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len);
+- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(&stream, scratch);
+
+ /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8),
+ * num_fh (4) */
+diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
+index d913e818858f3..86c3f7e69ec42 100644
+--- a/fs/nfs/filelayout/filelayoutdev.c
++++ b/fs/nfs/filelayout/filelayoutdev.c
+@@ -82,7 +82,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ goto out_err;
+
+ xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(&stream, scratch);
+
+ /* Get the stripe count (number of stripe index) */
+ p = xdr_inline_decode(&stream, 4);
+diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
+index e4f2820ba5a59..a263bfec4244d 100644
+--- a/fs/nfs/flexfilelayout/flexfilelayout.c
++++ b/fs/nfs/flexfilelayout/flexfilelayout.c
+@@ -378,7 +378,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
+ lgr->layoutp->len);
+- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(&stream, scratch);
+
+ /* stripe unit and mirror_array_cnt */
+ rc = -EIO;
+@@ -1419,8 +1419,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
+ {
+ struct nfs_pgio_header *hdr = data;
+
+- dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+-
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs4_sequence_done(task, &hdr->res.seq_res);
+diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+index 1f12297109b41..bfa7202ca7be1 100644
+--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
++++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+@@ -69,7 +69,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ INIT_LIST_HEAD(&dsaddrs);
+
+ xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+- xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(&stream, scratch);
+
+ /* multipath count */
+ p = xdr_inline_decode(&stream, 4);
+diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
+index f2248d9d4db51..df5bee2f505c4 100644
+--- a/fs/nfs/nfs42xdr.c
++++ b/fs/nfs/nfs42xdr.c
+@@ -1536,7 +1536,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp,
+ struct compound_hdr hdr;
+ int status;
+
+- xdr_set_scratch_buffer(xdr, page_address(res->scratch), PAGE_SIZE);
++ xdr_set_scratch_page(xdr, res->scratch);
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
+index afb617a4a7e42..d8fc5d72a161c 100644
+--- a/fs/nfs/nfs4state.c
++++ b/fs/nfs/nfs4state.c
+@@ -2757,7 +2757,7 @@ static int nfs4_run_state_manager(void *ptr)
+ goto again;
+
+ nfs_put_client(clp);
+- module_put_and_exit(0);
++ module_put_and_kthread_exit(0);
+ return 0;
+ }
+
+diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
+index f1e599553f2be..4e5c6cb770ad5 100644
+--- a/fs/nfs/nfs4xdr.c
++++ b/fs/nfs/nfs4xdr.c
+@@ -6404,10 +6404,8 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+ struct compound_hdr hdr;
+ int status;
+
+- if (res->acl_scratch != NULL) {
+- void *p = page_address(res->acl_scratch);
+- xdr_set_scratch_buffer(xdr, p, PAGE_SIZE);
+- }
++ if (res->acl_scratch != NULL)
++ xdr_set_scratch_page(xdr, res->acl_scratch);
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
+index 17fef6eb490c5..d79a3b6cb0701 100644
+--- a/fs/nfs/pagelist.c
++++ b/fs/nfs/pagelist.c
+@@ -870,9 +870,6 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
+ struct nfs_pgio_header *hdr = calldata;
+ struct inode *inode = hdr->inode;
+
+- dprintk("NFS: %s: %5u, (status %d)\n", __func__,
+- task->tk_pid, task->tk_status);
+-
+ if (hdr->rw_ops->rw_done(task, hdr, inode) != 0)
+ return;
+ if (task->tk_status < 0)
+diff --git a/fs/nfs/super.c b/fs/nfs/super.c
+index b3fcc27b95648..1ffce90760606 100644
+--- a/fs/nfs/super.c
++++ b/fs/nfs/super.c
+@@ -86,9 +86,11 @@ const struct super_operations nfs_sops = {
+ };
+ EXPORT_SYMBOL_GPL(nfs_sops);
+
++#ifdef CONFIG_NFS_V4_2
+ static const struct nfs_ssc_client_ops nfs_ssc_clnt_ops_tbl = {
+ .sco_sb_deactive = nfs_sb_deactive,
+ };
++#endif
+
+ #if IS_ENABLED(CONFIG_NFS_V4)
+ static int __init register_nfs4_fs(void)
+@@ -111,6 +113,7 @@ static void unregister_nfs4_fs(void)
+ }
+ #endif
+
++#ifdef CONFIG_NFS_V4_2
+ static void nfs_ssc_register_ops(void)
+ {
+ nfs_ssc_register(&nfs_ssc_clnt_ops_tbl);
+@@ -120,6 +123,7 @@ static void nfs_ssc_unregister_ops(void)
+ {
+ nfs_ssc_unregister(&nfs_ssc_clnt_ops_tbl);
+ }
++#endif /* CONFIG_NFS_V4_2 */
+
+ static struct shrinker acl_shrinker = {
+ .count_objects = nfs_access_cache_count,
+@@ -148,7 +152,9 @@ int __init register_nfs_fs(void)
+ ret = register_shrinker(&acl_shrinker);
+ if (ret < 0)
+ goto error_3;
++#ifdef CONFIG_NFS_V4_2
+ nfs_ssc_register_ops();
++#endif
+ return 0;
+ error_3:
+ nfs_unregister_sysctl();
+@@ -168,7 +174,9 @@ void __exit unregister_nfs_fs(void)
+ unregister_shrinker(&acl_shrinker);
+ nfs_unregister_sysctl();
+ unregister_nfs4_fs();
++#ifdef CONFIG_NFS_V4_2
+ nfs_ssc_unregister_ops();
++#endif
+ unregister_filesystem(&nfs_fs_type);
+ }
+
+diff --git a/fs/nfs/write.c b/fs/nfs/write.c
+index 4cf0606919794..2bde35921f2b2 100644
+--- a/fs/nfs/write.c
++++ b/fs/nfs/write.c
+@@ -1809,9 +1809,6 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
+ {
+ struct nfs_commit_data *data = calldata;
+
+- dprintk("NFS: %5u nfs_commit_done (status %d)\n",
+- task->tk_pid, task->tk_status);
+-
+ /* Call the NFS version-specific code */
+ NFS_PROTO(data->inode)->commit_done(task, data);
+ trace_nfs_commit_done(task, data);
+diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile
+index fa82f5aaa6d95..119c75ab9fd08 100644
+--- a/fs/nfs_common/Makefile
++++ b/fs/nfs_common/Makefile
+@@ -7,4 +7,4 @@ obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o
+ nfs_acl-objs := nfsacl.o
+
+ obj-$(CONFIG_GRACE_PERIOD) += grace.o
+-obj-$(CONFIG_GRACE_PERIOD) += nfs_ssc.o
++obj-$(CONFIG_NFS_V4_2_SSC_HELPER) += nfs_ssc.o
+diff --git a/fs/nfs_common/nfs_ssc.c b/fs/nfs_common/nfs_ssc.c
+index f43bbb3739134..7c1509e968c81 100644
+--- a/fs/nfs_common/nfs_ssc.c
++++ b/fs/nfs_common/nfs_ssc.c
+@@ -1,7 +1,5 @@
+ // SPDX-License-Identifier: GPL-2.0-only
+ /*
+- * fs/nfs_common/nfs_ssc_comm.c
+- *
+ * Helper for knfsd's SSC to access ops in NFS client modules
+ *
+ * Author: Dai Ngo <dai.ngo@oracle.com>
+diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
+index d056ad2fdefd6..5a5bd85d08f8c 100644
+--- a/fs/nfs_common/nfsacl.c
++++ b/fs/nfs_common/nfsacl.c
+@@ -136,6 +136,77 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+ }
+ EXPORT_SYMBOL_GPL(nfsacl_encode);
+
++/**
++ * nfs_stream_encode_acl - Encode an NFSv3 ACL
++ *
++ * @xdr: an xdr_stream positioned to receive an encoded ACL
++ * @inode: inode of file whose ACL this is
++ * @acl: posix_acl to encode
++ * @encode_entries: whether to encode ACEs as well
++ * @typeflag: ACL type: NFS_ACL_DEFAULT or zero
++ *
++ * Return values:
++ * %false: The ACL could not be encoded
++ * %true: @xdr is advanced to the next available position
++ */
++bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode,
++ struct posix_acl *acl, int encode_entries,
++ int typeflag)
++{
++ const size_t elem_size = XDR_UNIT * 3;
++ u32 entries = (acl && acl->a_count) ? max_t(int, acl->a_count, 4) : 0;
++ struct nfsacl_encode_desc nfsacl_desc = {
++ .desc = {
++ .elem_size = elem_size,
++ .array_len = encode_entries ? entries : 0,
++ .xcode = xdr_nfsace_encode,
++ },
++ .acl = acl,
++ .typeflag = typeflag,
++ .uid = inode->i_uid,
++ .gid = inode->i_gid,
++ };
++ struct nfsacl_simple_acl aclbuf;
++ unsigned int base;
++ int err;
++
++ if (entries > NFS_ACL_MAX_ENTRIES)
++ return false;
++ if (xdr_stream_encode_u32(xdr, entries) < 0)
++ return false;
++
++ if (encode_entries && acl && acl->a_count == 3) {
++ struct posix_acl *acl2 = &aclbuf.acl;
++
++ /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is
++ * invoked in contexts where a memory allocation failure is
++ * fatal. Fortunately this fake ACL is small enough to
++ * construct on the stack. */
++ posix_acl_init(acl2, 4);
++
++ /* Insert entries in canonical order: other orders seem
++ to confuse Solaris VxFS. */
++ acl2->a_entries[0] = acl->a_entries[0]; /* ACL_USER_OBJ */
++ acl2->a_entries[1] = acl->a_entries[1]; /* ACL_GROUP_OBJ */
++ acl2->a_entries[2] = acl->a_entries[1]; /* ACL_MASK */
++ acl2->a_entries[2].e_tag = ACL_MASK;
++ acl2->a_entries[3] = acl->a_entries[2]; /* ACL_OTHER */
++ nfsacl_desc.acl = acl2;
++ }
++
++ base = xdr_stream_pos(xdr);
++ if (!xdr_reserve_space(xdr, XDR_UNIT +
++ elem_size * nfsacl_desc.desc.array_len))
++ return false;
++ err = xdr_encode_array2(xdr->buf, base, &nfsacl_desc.desc);
++ if (err)
++ return false;
++
++ return true;
++}
++EXPORT_SYMBOL_GPL(nfs_stream_encode_acl);
++
++
+ struct nfsacl_decode_desc {
+ struct xdr_array2_desc desc;
+ unsigned int count;
+@@ -295,3 +366,55 @@ int nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+ nfsacl_desc.desc.array_len;
+ }
+ EXPORT_SYMBOL_GPL(nfsacl_decode);
++
++/**
++ * nfs_stream_decode_acl - Decode an NFSv3 ACL
++ *
++ * @xdr: an xdr_stream positioned at an encoded ACL
++ * @aclcnt: OUT: count of ACEs in decoded posix_acl
++ * @pacl: OUT: a dynamically-allocated buffer containing the decoded posix_acl
++ *
++ * Return values:
++ * %false: The encoded ACL is not valid
++ * %true: @pacl contains a decoded ACL, and @xdr is advanced
++ *
++ * On a successful return, caller must release *pacl using posix_acl_release().
++ */
++bool nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt,
++ struct posix_acl **pacl)
++{
++ const size_t elem_size = XDR_UNIT * 3;
++ struct nfsacl_decode_desc nfsacl_desc = {
++ .desc = {
++ .elem_size = elem_size,
++ .xcode = pacl ? xdr_nfsace_decode : NULL,
++ },
++ };
++ unsigned int base;
++ u32 entries;
++
++ if (xdr_stream_decode_u32(xdr, &entries) < 0)
++ return false;
++ if (entries > NFS_ACL_MAX_ENTRIES)
++ return false;
++
++ base = xdr_stream_pos(xdr);
++ if (!xdr_inline_decode(xdr, XDR_UNIT + elem_size * entries))
++ return false;
++ nfsacl_desc.desc.array_maxlen = entries;
++ if (xdr_decode_array2(xdr->buf, base, &nfsacl_desc.desc))
++ return false;
++
++ if (pacl) {
++ if (entries != nfsacl_desc.desc.array_len ||
++ posix_acl_from_nfsacl(nfsacl_desc.acl) != 0) {
++ posix_acl_release(nfsacl_desc.acl);
++ return false;
++ }
++ *pacl = nfsacl_desc.acl;
++ }
++ if (aclcnt)
++ *aclcnt = entries;
++ return true;
++}
++EXPORT_SYMBOL_GPL(nfs_stream_decode_acl);
+diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
+index 248f1459c0399..6d2d498a59573 100644
+--- a/fs/nfsd/Kconfig
++++ b/fs/nfsd/Kconfig
+@@ -8,6 +8,7 @@ config NFSD
+ select SUNRPC
+ select EXPORTFS
+ select NFS_ACL_SUPPORT if NFSD_V2_ACL
++ select NFS_ACL_SUPPORT if NFSD_V3_ACL
+ depends on MULTIUSER
+ help
+ Choose Y here if you want to allow other computers to access
+@@ -26,28 +27,29 @@ config NFSD
+
+ Below you can choose which versions of the NFS protocol are
+ available to clients mounting the NFS server on this system.
+- Support for NFS version 2 (RFC 1094) is always available when
++ Support for NFS version 3 (RFC 1813) is always available when
+ CONFIG_NFSD is selected.
+
+ If unsure, say N.
+
+-config NFSD_V2_ACL
+- bool
+- depends on NFSD
+-
+-config NFSD_V3
+- bool "NFS server support for NFS version 3"
++config NFSD_V2
++ bool "NFS server support for NFS version 2 (DEPRECATED)"
+ depends on NFSD
++ default n
+ help
+- This option enables support in your system's NFS server for
+- version 3 of the NFS protocol (RFC 1813).
++ NFSv2 (RFC 1094) was the first publicly-released version of NFS.
++ Unless you are hosting ancient (1990's era) NFS clients, you don't
++ need this.
+
+- If unsure, say Y.
++ If unsure, say N.
++
++config NFSD_V2_ACL
++ bool "NFS server support for the NFSv2 ACL protocol extension"
++ depends on NFSD_V2
+
+ config NFSD_V3_ACL
+ bool "NFS server support for the NFSv3 ACL protocol extension"
+- depends on NFSD_V3
+- select NFSD_V2_ACL
++ depends on NFSD
+ help
+ Solaris NFS servers support an auxiliary NFSv3 ACL protocol that
+ never became an official part of the NFS version 3 protocol.
+@@ -70,13 +72,13 @@ config NFSD_V3_ACL
+ config NFSD_V4
+ bool "NFS server support for NFS version 4"
+ depends on NFSD && PROC_FS
+- select NFSD_V3
+ select FS_POSIX_ACL
+ select SUNRPC_GSS
+ select CRYPTO
+ select CRYPTO_MD5
+ select CRYPTO_SHA256
+ select GRACE_PERIOD
++ select NFS_V4_2_SSC_HELPER if NFS_V4_2
+ help
+ This option enables support in your system's NFS server for
+ version 4 of the NFS protocol (RFC 3530).
+@@ -98,7 +100,7 @@ config NFSD_BLOCKLAYOUT
+ help
+ This option enables support for the exporting pNFS block layouts
+ in the kernel's NFS server. The pNFS block layout enables NFS
+- clients to directly perform I/O to block devices accesible to both
++ clients to directly perform I/O to block devices accessible to both
+ the server and the clients. See RFC 5663 for more details.
+
+ If unsure, say N.
+@@ -112,7 +114,7 @@ config NFSD_SCSILAYOUT
+ help
+ This option enables support for the exporting pNFS SCSI layouts
+ in the kernel's NFS server. The pNFS SCSI layout enables NFS
+- clients to directly perform I/O to SCSI devices accesible to both
++ clients to directly perform I/O to SCSI devices accessible to both
+ the server and the clients. See draft-ietf-nfsv4-scsi-layout for
+ more details.
+
+@@ -126,7 +128,7 @@ config NFSD_FLEXFILELAYOUT
+ This option enables support for the exporting pNFS Flex File
+ layouts in the kernel's NFS server. The pNFS Flex File layout
+ enables NFS clients to directly perform I/O to NFSv3 devices
+- accesible to both the server and the clients. See
++ accessible to both the server and the clients. See
+ draft-ietf-nfsv4-flex-files for more details.
+
+ Warning, this server implements the bare minimum functionality
+@@ -137,7 +139,7 @@ config NFSD_FLEXFILELAYOUT
+
+ config NFSD_V4_2_INTER_SSC
+ bool "NFSv4.2 inter server to server COPY"
+- depends on NFSD_V4 && NFS_V4_1 && NFS_V4_2
++ depends on NFSD_V4 && NFS_V4_2
+ help
+ This option enables support for NFSv4.2 inter server to
+ server copy where the destination server calls the NFSv4.2
+diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
+index 3f0983e93a998..6fffc8f03f740 100644
+--- a/fs/nfsd/Makefile
++++ b/fs/nfsd/Makefile
+@@ -10,11 +10,11 @@ obj-$(CONFIG_NFSD) += nfsd.o
+ # this one should be compiled first, as the tracing macros can easily blow up
+ nfsd-y += trace.o
+
+-nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+- export.o auth.o lockd.o nfscache.o nfsxdr.o \
+- stats.o filecache.o
++nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \
++ export.o auth.o lockd.o nfscache.o \
++ stats.o filecache.o nfs3proc.o nfs3xdr.o
++nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o
+ nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
+-nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
+ nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
+ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
+ nfs4acl.o nfs4callback.o nfs4recover.o
+diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h
+index ba14d2f4b64f4..4b7324458a94e 100644
+--- a/fs/nfsd/acl.h
++++ b/fs/nfsd/acl.h
+@@ -38,6 +38,8 @@
+ struct nfs4_acl;
+ struct svc_fh;
+ struct svc_rqst;
++struct nfsd_attrs;
++enum nfs_ftype4;
+
+ int nfs4_acl_bytes(int entries);
+ int nfs4_acl_get_whotype(char *, u32);
+@@ -45,7 +47,7 @@ __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who);
+
+ int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry,
+ struct nfs4_acl **acl);
+-__be32 nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- struct nfs4_acl *acl);
++__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl,
++ struct nfsd_attrs *attr);
+
+ #endif /* LINUX_NFS4_ACL_H */
+diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
+index a07c39c94bbd0..d91a686d2f313 100644
+--- a/fs/nfsd/blocklayout.c
++++ b/fs/nfsd/blocklayout.c
+@@ -16,6 +16,7 @@
+ #include "blocklayoutxdr.h"
+ #include "pnfs.h"
+ #include "filecache.h"
++#include "vfs.h"
+
+ #define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
+index 2455dc8be18a8..1ed2f691ebb90 100644
+--- a/fs/nfsd/blocklayoutxdr.c
++++ b/fs/nfsd/blocklayoutxdr.c
+@@ -9,6 +9,7 @@
+
+ #include "nfsd.h"
+ #include "blocklayoutxdr.h"
++#include "vfs.h"
+
+ #define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
+index 65c331f75e9c7..f21259ead64bb 100644
+--- a/fs/nfsd/cache.h
++++ b/fs/nfsd/cache.h
+@@ -84,6 +84,6 @@ int nfsd_reply_cache_init(struct nfsd_net *);
+ void nfsd_reply_cache_shutdown(struct nfsd_net *);
+ int nfsd_cache_lookup(struct svc_rqst *);
+ void nfsd_cache_update(struct svc_rqst *, int, __be32 *);
+-int nfsd_reply_cache_stats_open(struct inode *, struct file *);
++int nfsd_reply_cache_stats_show(struct seq_file *m, void *v);
+
+ #endif /* NFSCACHE_H */
+diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
+index 21e404e7cb68c..7c863f2c21e0c 100644
+--- a/fs/nfsd/export.c
++++ b/fs/nfsd/export.c
+@@ -331,12 +331,29 @@ static void nfsd4_fslocs_free(struct nfsd4_fs_locations *fsloc)
+ fsloc->locations = NULL;
+ }
+
++static int export_stats_init(struct export_stats *stats)
++{
++ stats->start_time = ktime_get_seconds();
++ return nfsd_percpu_counters_init(stats->counter, EXP_STATS_COUNTERS_NUM);
++}
++
++static void export_stats_reset(struct export_stats *stats)
++{
++ nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM);
++}
++
++static void export_stats_destroy(struct export_stats *stats)
++{
++ nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM);
++}
++
+ static void svc_export_put(struct kref *ref)
+ {
+ struct svc_export *exp = container_of(ref, struct svc_export, h.ref);
+ path_put(&exp->ex_path);
+ auth_domain_put(exp->ex_client);
+ nfsd4_fslocs_free(&exp->ex_fslocs);
++ export_stats_destroy(&exp->ex_stats);
+ kfree(exp->ex_uuid);
+ kfree_rcu(exp, ex_rcu);
+ }
+@@ -408,6 +425,12 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid)
+ return -EINVAL;
+ }
+
++ if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK &&
++ !(*flags & NFSEXP_NOSUBTREECHECK)) {
++ dprintk("%s: %s does not support subtree checking!\n",
++ __func__, inode->i_sb->s_type->name);
++ return -EINVAL;
++ }
+ return 0;
+
+ }
+@@ -686,22 +709,47 @@ static void exp_flags(struct seq_file *m, int flag, int fsid,
+ kuid_t anonu, kgid_t anong, struct nfsd4_fs_locations *fslocs);
+ static void show_secinfo(struct seq_file *m, struct svc_export *exp);
+
++static int is_export_stats_file(struct seq_file *m)
++{
++ /*
++ * The export_stats file uses the same ops as the exports file.
++ * We use the file's name to determine the reported info per export.
++ * There is no rename in nsfdfs, so d_name.name is stable.
++ */
++ return !strcmp(m->file->f_path.dentry->d_name.name, "export_stats");
++}
++
+ static int svc_export_show(struct seq_file *m,
+ struct cache_detail *cd,
+ struct cache_head *h)
+ {
+- struct svc_export *exp ;
++ struct svc_export *exp;
++ bool export_stats = is_export_stats_file(m);
+
+- if (h ==NULL) {
+- seq_puts(m, "#path domain(flags)\n");
++ if (h == NULL) {
++ if (export_stats)
++ seq_puts(m, "#path domain start-time\n#\tstats\n");
++ else
++ seq_puts(m, "#path domain(flags)\n");
+ return 0;
+ }
+ exp = container_of(h, struct svc_export, h);
+ seq_path(m, &exp->ex_path, " \t\n\\");
+ seq_putc(m, '\t');
+ seq_escape(m, exp->ex_client->name, " \t\n\\");
++ if (export_stats) {
++ seq_printf(m, "\t%lld\n", exp->ex_stats.start_time);
++ seq_printf(m, "\tfh_stale: %lld\n",
++ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE]));
++ seq_printf(m, "\tio_read: %lld\n",
++ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ]));
++ seq_printf(m, "\tio_write: %lld\n",
++ percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE]));
++ seq_putc(m, '\n');
++ return 0;
++ }
+ seq_putc(m, '(');
+- if (test_bit(CACHE_VALID, &h->flags) &&
++ if (test_bit(CACHE_VALID, &h->flags) &&
+ !test_bit(CACHE_NEGATIVE, &h->flags)) {
+ exp_flags(m, exp->ex_flags, exp->ex_fsid,
+ exp->ex_anon_uid, exp->ex_anon_gid, &exp->ex_fslocs);
+@@ -742,6 +790,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
+ new->ex_layout_types = 0;
+ new->ex_uuid = NULL;
+ new->cd = item->cd;
++ export_stats_reset(&new->ex_stats);
+ }
+
+ static void export_update(struct cache_head *cnew, struct cache_head *citem)
+@@ -774,10 +823,15 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
+ static struct cache_head *svc_export_alloc(void)
+ {
+ struct svc_export *i = kmalloc(sizeof(*i), GFP_KERNEL);
+- if (i)
+- return &i->h;
+- else
++ if (!i)
++ return NULL;
++
++ if (export_stats_init(&i->ex_stats)) {
++ kfree(i);
+ return NULL;
++ }
++
++ return &i->h;
+ }
+
+ static const struct cache_detail svc_export_cache_template = {
+@@ -1239,10 +1293,14 @@ static int e_show(struct seq_file *m, void *p)
+ struct cache_head *cp = p;
+ struct svc_export *exp = container_of(cp, struct svc_export, h);
+ struct cache_detail *cd = m->private;
++ bool export_stats = is_export_stats_file(m);
+
+ if (p == SEQ_START_TOKEN) {
+ seq_puts(m, "# Version 1.1\n");
+- seq_puts(m, "# Path Client(Flags) # IPs\n");
++ if (export_stats)
++ seq_puts(m, "# Path Client Start-time\n#\tStats\n");
++ else
++ seq_puts(m, "# Path Client(Flags) # IPs\n");
+ return 0;
+ }
+
+diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
+index e7daa1f246f08..d03f7f6a8642d 100644
+--- a/fs/nfsd/export.h
++++ b/fs/nfsd/export.h
+@@ -6,6 +6,7 @@
+ #define NFSD_EXPORT_H
+
+ #include <linux/sunrpc/cache.h>
++#include <linux/percpu_counter.h>
+ #include <uapi/linux/nfsd/export.h>
+ #include <linux/nfs4.h>
+
+@@ -46,6 +47,19 @@ struct exp_flavor_info {
+ u32 flags;
+ };
+
++/* Per-export stats */
++enum {
++ EXP_STATS_FH_STALE,
++ EXP_STATS_IO_READ,
++ EXP_STATS_IO_WRITE,
++ EXP_STATS_COUNTERS_NUM
++};
++
++struct export_stats {
++ time64_t start_time;
++ struct percpu_counter counter[EXP_STATS_COUNTERS_NUM];
++};
++
+ struct svc_export {
+ struct cache_head h;
+ struct auth_domain * ex_client;
+@@ -62,6 +76,7 @@ struct svc_export {
+ struct nfsd4_deviceid_map *ex_devid_map;
+ struct cache_detail *cd;
+ struct rcu_head ex_rcu;
++ struct export_stats ex_stats;
+ };
+
+ /* an "export key" (expkey) maps a filehandlefragement to an
+@@ -100,7 +115,6 @@ struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *);
+ int exp_rootfh(struct net *, struct auth_domain *,
+ char *path, struct knfsd_fh *, int maxsize);
+ __be32 exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
+-__be32 nfserrno(int errno);
+
+ static inline void exp_put(struct svc_export *exp)
+ {
+diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
+index e30e1ddc1aceb..615ea8324911e 100644
+--- a/fs/nfsd/filecache.c
++++ b/fs/nfsd/filecache.c
+@@ -1,7 +1,32 @@
++// SPDX-License-Identifier: GPL-2.0
+ /*
+- * Open file cache.
++ * The NFSD open file cache.
+ *
+ * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com>
++ *
++ * An nfsd_file object is a per-file collection of open state that binds
++ * together:
++ * - a struct file *
++ * - a user credential
++ * - a network namespace
++ * - a read-ahead context
++ * - monitoring for writeback errors
++ *
++ * nfsd_file objects are reference-counted. Consumers acquire a new
++ * object via the nfsd_file_acquire API. They manage their interest in
++ * the acquired object, and hence the object's reference count, via
++ * nfsd_file_get and nfsd_file_put. There are two varieties of nfsd_file
++ * object:
++ *
++ * * non-garbage-collected: When a consumer wants to precisely control
++ * the lifetime of a file's open state, it acquires a non-garbage-
++ * collected nfsd_file. The final nfsd_file_put releases the open
++ * state immediately.
++ *
++ * * garbage-collected: When a consumer does not control the lifetime
++ * of open state, it acquires a garbage-collected nfsd_file. The
++ * final nfsd_file_put allows the open state to linger for a period
++ * during which it may be re-used.
+ */
+
+ #include <linux/hash.h>
+@@ -12,6 +37,7 @@
+ #include <linux/fsnotify_backend.h>
+ #include <linux/fsnotify.h>
+ #include <linux/seq_file.h>
++#include <linux/rhashtable.h>
+
+ #include "vfs.h"
+ #include "nfsd.h"
+@@ -20,63 +46,75 @@
+ #include "filecache.h"
+ #include "trace.h"
+
+-#define NFSDDBG_FACILITY NFSDDBG_FH
+-
+-/* FIXME: dynamically size this for the machine somehow? */
+-#define NFSD_FILE_HASH_BITS 12
+-#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS)
+ #define NFSD_LAUNDRETTE_DELAY (2 * HZ)
+
+-#define NFSD_FILE_SHUTDOWN (1)
+-#define NFSD_FILE_LRU_THRESHOLD (4096UL)
+-#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2)
++#define NFSD_FILE_CACHE_UP (0)
+
+ /* We only care about NFSD_MAY_READ/WRITE for this cache */
+ #define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE)
+
+-struct nfsd_fcache_bucket {
+- struct hlist_head nfb_head;
+- spinlock_t nfb_lock;
+- unsigned int nfb_count;
+- unsigned int nfb_maxcount;
+-};
+-
+ static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits);
++static DEFINE_PER_CPU(unsigned long, nfsd_file_acquisitions);
++static DEFINE_PER_CPU(unsigned long, nfsd_file_releases);
++static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
++static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
+
+ struct nfsd_fcache_disposal {
+- struct list_head list;
+ struct work_struct work;
+- struct net *net;
+ spinlock_t lock;
+ struct list_head freeme;
+- struct rcu_head rcu;
+ };
+
+ static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
+
+ static struct kmem_cache *nfsd_file_slab;
+ static struct kmem_cache *nfsd_file_mark_slab;
+-static struct nfsd_fcache_bucket *nfsd_file_hashtbl;
+ static struct list_lru nfsd_file_lru;
+-static long nfsd_file_lru_flags;
++static unsigned long nfsd_file_flags;
+ static struct fsnotify_group *nfsd_file_fsnotify_group;
+-static atomic_long_t nfsd_filecache_count;
+ static struct delayed_work nfsd_filecache_laundrette;
+-static DEFINE_SPINLOCK(laundrette_lock);
+-static LIST_HEAD(laundrettes);
++static struct rhltable nfsd_file_rhltable
++ ____cacheline_aligned_in_smp;
++
++static bool
++nfsd_match_cred(const struct cred *c1, const struct cred *c2)
++{
++ int i;
++
++ if (!uid_eq(c1->fsuid, c2->fsuid))
++ return false;
++ if (!gid_eq(c1->fsgid, c2->fsgid))
++ return false;
++ if (c1->group_info == NULL || c2->group_info == NULL)
++ return c1->group_info == c2->group_info;
++ if (c1->group_info->ngroups != c2->group_info->ngroups)
++ return false;
++ for (i = 0; i < c1->group_info->ngroups; i++) {
++ if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
++ return false;
++ }
++ return true;
++}
+
+-static void nfsd_file_gc(void);
++static const struct rhashtable_params nfsd_file_rhash_params = {
++ .key_len = sizeof_field(struct nfsd_file, nf_inode),
++ .key_offset = offsetof(struct nfsd_file, nf_inode),
++ .head_offset = offsetof(struct nfsd_file, nf_rlist),
++
++ /*
++ * Start with a single page hash table to reduce resizing churn
++ * on light workloads.
++ */
++ .min_size = 256,
++ .automatic_shrinking = true,
++};
+
+ static void
+ nfsd_file_schedule_laundrette(void)
+ {
+- long count = atomic_long_read(&nfsd_filecache_count);
+-
+- if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags))
+- return;
+-
+- queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
+- NFSD_LAUNDRETTE_DELAY);
++ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags))
++ queue_delayed_work(system_wq, &nfsd_filecache_laundrette,
++ NFSD_LAUNDRETTE_DELAY);
+ }
+
+ static void
+@@ -115,22 +153,21 @@ nfsd_file_mark_put(struct nfsd_file_mark *nfm)
+ }
+
+ static struct nfsd_file_mark *
+-nfsd_file_mark_find_or_create(struct nfsd_file *nf)
++nfsd_file_mark_find_or_create(struct nfsd_file *nf, struct inode *inode)
+ {
+ int err;
+ struct fsnotify_mark *mark;
+ struct nfsd_file_mark *nfm = NULL, *new;
+- struct inode *inode = nf->nf_inode;
+
+ do {
+- mutex_lock(&nfsd_file_fsnotify_group->mark_mutex);
++ fsnotify_group_lock(nfsd_file_fsnotify_group);
+ mark = fsnotify_find_mark(&inode->i_fsnotify_marks,
+- nfsd_file_fsnotify_group);
++ nfsd_file_fsnotify_group);
+ if (mark) {
+ nfm = nfsd_file_mark_get(container_of(mark,
+ struct nfsd_file_mark,
+ nfm_mark));
+- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
++ fsnotify_group_unlock(nfsd_file_fsnotify_group);
+ if (nfm) {
+ fsnotify_put_mark(mark);
+ break;
+@@ -138,8 +175,9 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+ /* Avoid soft lockup race with nfsd_file_mark_put() */
+ fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group);
+ fsnotify_put_mark(mark);
+- } else
+- mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex);
++ } else {
++ fsnotify_group_unlock(nfsd_file_fsnotify_group);
++ }
+
+ /* allocate a new nfm */
+ new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
+@@ -170,244 +208,233 @@ nfsd_file_mark_find_or_create(struct nfsd_file *nf)
+ }
+
+ static struct nfsd_file *
+-nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
+- struct net *net)
++nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need,
++ bool want_gc)
+ {
+ struct nfsd_file *nf;
+
+ nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL);
+- if (nf) {
+- INIT_HLIST_NODE(&nf->nf_node);
+- INIT_LIST_HEAD(&nf->nf_lru);
+- nf->nf_file = NULL;
+- nf->nf_cred = get_current_cred();
+- nf->nf_net = net;
+- nf->nf_flags = 0;
+- nf->nf_inode = inode;
+- nf->nf_hashval = hashval;
+- refcount_set(&nf->nf_ref, 1);
+- nf->nf_may = may & NFSD_FILE_MAY_MASK;
+- if (may & NFSD_MAY_NOT_BREAK_LEASE) {
+- if (may & NFSD_MAY_WRITE)
+- __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags);
+- if (may & NFSD_MAY_READ)
+- __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+- }
+- nf->nf_mark = NULL;
+- trace_nfsd_file_alloc(nf);
+- }
+- return nf;
+-}
+-
+-static bool
+-nfsd_file_free(struct nfsd_file *nf)
+-{
+- bool flush = false;
+-
+- trace_nfsd_file_put_final(nf);
+- if (nf->nf_mark)
+- nfsd_file_mark_put(nf->nf_mark);
+- if (nf->nf_file) {
+- get_file(nf->nf_file);
+- filp_close(nf->nf_file, NULL);
+- fput(nf->nf_file);
+- flush = true;
+- }
+- call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
+- return flush;
+-}
+-
+-static bool
+-nfsd_file_check_writeback(struct nfsd_file *nf)
+-{
+- struct file *file = nf->nf_file;
+- struct address_space *mapping;
++ if (unlikely(!nf))
++ return NULL;
+
+- if (!file || !(file->f_mode & FMODE_WRITE))
+- return false;
+- mapping = file->f_mapping;
+- return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
+- mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
++ INIT_LIST_HEAD(&nf->nf_lru);
++ nf->nf_birthtime = ktime_get();
++ nf->nf_file = NULL;
++ nf->nf_cred = get_current_cred();
++ nf->nf_net = net;
++ nf->nf_flags = want_gc ?
++ BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING) | BIT(NFSD_FILE_GC) :
++ BIT(NFSD_FILE_HASHED) | BIT(NFSD_FILE_PENDING);
++ nf->nf_inode = inode;
++ refcount_set(&nf->nf_ref, 1);
++ nf->nf_may = need;
++ nf->nf_mark = NULL;
++ return nf;
+ }
+
+-static int
++/**
++ * nfsd_file_check_write_error - check for writeback errors on a file
++ * @nf: nfsd_file to check for writeback errors
++ *
++ * Check whether a nfsd_file has an unseen error. Reset the write
++ * verifier if so.
++ */
++static void
+ nfsd_file_check_write_error(struct nfsd_file *nf)
+ {
+ struct file *file = nf->nf_file;
+
+- if (!file || !(file->f_mode & FMODE_WRITE))
+- return 0;
+- return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err));
++ if ((file->f_mode & FMODE_WRITE) &&
++ filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)))
++ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id));
+ }
+
+ static void
+-nfsd_file_do_unhash(struct nfsd_file *nf)
++nfsd_file_hash_remove(struct nfsd_file *nf)
+ {
+- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+-
+ trace_nfsd_file_unhash(nf);
+-
+- if (nfsd_file_check_write_error(nf))
+- nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id));
+- --nfsd_file_hashtbl[nf->nf_hashval].nfb_count;
+- hlist_del_rcu(&nf->nf_node);
+- atomic_long_dec(&nfsd_filecache_count);
++ rhltable_remove(&nfsd_file_rhltable, &nf->nf_rlist,
++ nfsd_file_rhash_params);
+ }
+
+ static bool
+ nfsd_file_unhash(struct nfsd_file *nf)
+ {
+ if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+- nfsd_file_do_unhash(nf);
+- if (!list_empty(&nf->nf_lru))
+- list_lru_del(&nfsd_file_lru, &nf->nf_lru);
++ nfsd_file_hash_remove(nf);
+ return true;
+ }
+ return false;
+ }
+
+-/*
+- * Return true if the file was unhashed.
+- */
+-static bool
+-nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose)
++static void
++nfsd_file_free(struct nfsd_file *nf)
+ {
+- lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+-
+- trace_nfsd_file_unhash_and_release_locked(nf);
+- if (!nfsd_file_unhash(nf))
+- return false;
+- /* keep final reference for nfsd_file_lru_dispose */
+- if (refcount_dec_not_one(&nf->nf_ref))
+- return true;
++ s64 age = ktime_to_ms(ktime_sub(ktime_get(), nf->nf_birthtime));
+
+- list_add(&nf->nf_lru, dispose);
+- return true;
+-}
++ trace_nfsd_file_free(nf);
+
+-static void
+-nfsd_file_put_noref(struct nfsd_file *nf)
+-{
+- trace_nfsd_file_put(nf);
++ this_cpu_inc(nfsd_file_releases);
++ this_cpu_add(nfsd_file_total_age, age);
+
+- if (refcount_dec_and_test(&nf->nf_ref)) {
+- WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags));
+- nfsd_file_free(nf);
++ nfsd_file_unhash(nf);
++ if (nf->nf_mark)
++ nfsd_file_mark_put(nf->nf_mark);
++ if (nf->nf_file) {
++ nfsd_file_check_write_error(nf);
++ filp_close(nf->nf_file, NULL);
+ }
+-}
+
+-void
+-nfsd_file_put(struct nfsd_file *nf)
+-{
+- bool is_hashed;
+-
+- set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
+- if (refcount_read(&nf->nf_ref) > 2 || !nf->nf_file) {
+- nfsd_file_put_noref(nf);
++ /*
++ * If this item is still linked via nf_lru, that's a bug.
++ * WARN and leak it to preserve system stability.
++ */
++ if (WARN_ON_ONCE(!list_empty(&nf->nf_lru)))
+ return;
+- }
+
+- filemap_flush(nf->nf_file->f_mapping);
+- is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0;
+- nfsd_file_put_noref(nf);
+- if (is_hashed)
+- nfsd_file_schedule_laundrette();
+- if (atomic_long_read(&nfsd_filecache_count) >= NFSD_FILE_LRU_LIMIT)
+- nfsd_file_gc();
++ call_rcu(&nf->nf_rcu, nfsd_file_slab_free);
+ }
+
+-struct nfsd_file *
+-nfsd_file_get(struct nfsd_file *nf)
++static bool
++nfsd_file_check_writeback(struct nfsd_file *nf)
+ {
+- if (likely(refcount_inc_not_zero(&nf->nf_ref)))
+- return nf;
+- return NULL;
++ struct file *file = nf->nf_file;
++ struct address_space *mapping;
++
++ /* File not open for write? */
++ if (!(file->f_mode & FMODE_WRITE))
++ return false;
++
++ /*
++ * Some filesystems (e.g. NFS) flush all dirty data on close.
++ * On others, there is no need to wait for writeback.
++ */
++ if (!(file_inode(file)->i_sb->s_export_op->flags & EXPORT_OP_FLUSH_ON_CLOSE))
++ return false;
++
++ mapping = file->f_mapping;
++ return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) ||
++ mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK);
+ }
+
+-static void
+-nfsd_file_dispose_list(struct list_head *dispose)
+-{
+- struct nfsd_file *nf;
+
+- while(!list_empty(dispose)) {
+- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+- list_del(&nf->nf_lru);
+- nfsd_file_put_noref(nf);
++static bool nfsd_file_lru_add(struct nfsd_file *nf)
++{
++ set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
++ if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
++ trace_nfsd_file_lru_add(nf);
++ return true;
+ }
++ return false;
+ }
+
+-static void
+-nfsd_file_dispose_list_sync(struct list_head *dispose)
++static bool nfsd_file_lru_remove(struct nfsd_file *nf)
+ {
+- bool flush = false;
+- struct nfsd_file *nf;
+-
+- while(!list_empty(dispose)) {
+- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+- list_del(&nf->nf_lru);
+- if (!refcount_dec_and_test(&nf->nf_ref))
+- continue;
+- if (nfsd_file_free(nf))
+- flush = true;
++ if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
++ trace_nfsd_file_lru_del(nf);
++ return true;
+ }
+- if (flush)
+- flush_delayed_fput();
++ return false;
+ }
+
+-static void
+-nfsd_file_list_remove_disposal(struct list_head *dst,
+- struct nfsd_fcache_disposal *l)
++struct nfsd_file *
++nfsd_file_get(struct nfsd_file *nf)
+ {
+- spin_lock(&l->lock);
+- list_splice_init(&l->freeme, dst);
+- spin_unlock(&l->lock);
++ if (nf && refcount_inc_not_zero(&nf->nf_ref))
++ return nf;
++ return NULL;
+ }
+
+-static void
+-nfsd_file_list_add_disposal(struct list_head *files, struct net *net)
++/**
++ * nfsd_file_put - put the reference to a nfsd_file
++ * @nf: nfsd_file of which to put the reference
++ *
++ * Put a reference to a nfsd_file. In the non-GC case, we just put the
++ * reference immediately. In the GC case, if the reference would be
++ * the last one, the put it on the LRU instead to be cleaned up later.
++ */
++void
++nfsd_file_put(struct nfsd_file *nf)
+ {
+- struct nfsd_fcache_disposal *l;
++ might_sleep();
++ trace_nfsd_file_put(nf);
+
+- rcu_read_lock();
+- list_for_each_entry_rcu(l, &laundrettes, list) {
+- if (l->net == net) {
+- spin_lock(&l->lock);
+- list_splice_tail_init(files, &l->freeme);
+- spin_unlock(&l->lock);
+- queue_work(nfsd_filecache_wq, &l->work);
+- break;
++ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) &&
++ test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
++ /*
++ * If this is the last reference (nf_ref == 1), then try to
++ * transfer it to the LRU.
++ */
++ if (refcount_dec_not_one(&nf->nf_ref))
++ return;
++
++ /* Try to add it to the LRU. If that fails, decrement. */
++ if (nfsd_file_lru_add(nf)) {
++ /* If it's still hashed, we're done */
++ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
++ nfsd_file_schedule_laundrette();
++ return;
++ }
++
++ /*
++ * We're racing with unhashing, so try to remove it from
++ * the LRU. If removal fails, then someone else already
++ * has our reference.
++ */
++ if (!nfsd_file_lru_remove(nf))
++ return;
+ }
+ }
+- rcu_read_unlock();
++ if (refcount_dec_and_test(&nf->nf_ref))
++ nfsd_file_free(nf);
+ }
+
+ static void
+-nfsd_file_list_add_pernet(struct list_head *dst, struct list_head *src,
+- struct net *net)
++nfsd_file_dispose_list(struct list_head *dispose)
+ {
+- struct nfsd_file *nf, *tmp;
++ struct nfsd_file *nf;
+
+- list_for_each_entry_safe(nf, tmp, src, nf_lru) {
+- if (nf->nf_net == net)
+- list_move_tail(&nf->nf_lru, dst);
++ while (!list_empty(dispose)) {
++ nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
++ list_del_init(&nf->nf_lru);
++ nfsd_file_free(nf);
+ }
+ }
+
++/**
++ * nfsd_file_dispose_list_delayed - move list of dead files to net's freeme list
++ * @dispose: list of nfsd_files to be disposed
++ *
++ * Transfers each file to the "freeme" list for its nfsd_net, to eventually
++ * be disposed of by the per-net garbage collector.
++ */
+ static void
+ nfsd_file_dispose_list_delayed(struct list_head *dispose)
+ {
+- LIST_HEAD(list);
+- struct nfsd_file *nf;
+-
+ while(!list_empty(dispose)) {
+- nf = list_first_entry(dispose, struct nfsd_file, nf_lru);
+- nfsd_file_list_add_pernet(&list, dispose, nf->nf_net);
+- nfsd_file_list_add_disposal(&list, nf->nf_net);
++ struct nfsd_file *nf = list_first_entry(dispose,
++ struct nfsd_file, nf_lru);
++ struct nfsd_net *nn = net_generic(nf->nf_net, nfsd_net_id);
++ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
++
++ spin_lock(&l->lock);
++ list_move_tail(&nf->nf_lru, &l->freeme);
++ spin_unlock(&l->lock);
++ queue_work(nfsd_filecache_wq, &l->work);
+ }
+ }
+
+-/*
+- * Note this can deadlock with nfsd_file_cache_purge.
++/**
++ * nfsd_file_lru_cb - Examine an entry on the LRU list
++ * @item: LRU entry to examine
++ * @lru: controlling LRU
++ * @lock: LRU list lock (unused)
++ * @arg: dispose list
++ *
++ * Return values:
++ * %LRU_REMOVED: @item was removed from the LRU
++ * %LRU_ROTATE: @item is to be moved to the LRU tail
++ * %LRU_SKIP: @item cannot be evicted
+ */
+ static enum lru_status
+ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
+@@ -418,72 +445,60 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru,
+ struct list_head *head = arg;
+ struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru);
+
+- /*
+- * Do a lockless refcount check. The hashtable holds one reference, so
+- * we look to see if anything else has a reference, or if any have
+- * been put since the shrinker last ran. Those don't get unhashed and
+- * released.
+- *
+- * Note that in the put path, we set the flag and then decrement the
+- * counter. Here we check the counter and then test and clear the flag.
+- * That order is deliberate to ensure that we can do this locklessly.
+- */
+- if (refcount_read(&nf->nf_ref) > 1)
+- goto out_skip;
++ /* We should only be dealing with GC entries here */
++ WARN_ON_ONCE(!test_bit(NFSD_FILE_GC, &nf->nf_flags));
+
+ /*
+ * Don't throw out files that are still undergoing I/O or
+ * that have uncleared errors pending.
+ */
+- if (nfsd_file_check_writeback(nf))
+- goto out_skip;
++ if (nfsd_file_check_writeback(nf)) {
++ trace_nfsd_file_gc_writeback(nf);
++ return LRU_SKIP;
++ }
+
+- if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags))
+- goto out_skip;
++ /* If it was recently added to the list, skip it */
++ if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) {
++ trace_nfsd_file_gc_referenced(nf);
++ return LRU_ROTATE;
++ }
+
+- if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags))
+- goto out_skip;
++ /*
++ * Put the reference held on behalf of the LRU. If it wasn't the last
++ * one, then just remove it from the LRU and ignore it.
++ */
++ if (!refcount_dec_and_test(&nf->nf_ref)) {
++ trace_nfsd_file_gc_in_use(nf);
++ list_lru_isolate(lru, &nf->nf_lru);
++ return LRU_REMOVED;
++ }
+
++ /* Refcount went to zero. Unhash it and queue it to the dispose list */
++ nfsd_file_unhash(nf);
+ list_lru_isolate_move(lru, &nf->nf_lru, head);
++ this_cpu_inc(nfsd_file_evictions);
++ trace_nfsd_file_gc_disposed(nf);
+ return LRU_REMOVED;
+-out_skip:
+- return LRU_SKIP;
+-}
+-
+-static unsigned long
+-nfsd_file_lru_walk_list(struct shrink_control *sc)
+-{
+- LIST_HEAD(head);
+- struct nfsd_file *nf;
+- unsigned long ret;
+-
+- if (sc)
+- ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
+- nfsd_file_lru_cb, &head);
+- else
+- ret = list_lru_walk(&nfsd_file_lru,
+- nfsd_file_lru_cb,
+- &head, LONG_MAX);
+- list_for_each_entry(nf, &head, nf_lru) {
+- spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+- nfsd_file_do_unhash(nf);
+- spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock);
+- }
+- nfsd_file_dispose_list_delayed(&head);
+- return ret;
+ }
+
+ static void
+ nfsd_file_gc(void)
+ {
+- nfsd_file_lru_walk_list(NULL);
++ LIST_HEAD(dispose);
++ unsigned long ret;
++
++ ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb,
++ &dispose, list_lru_count(&nfsd_file_lru));
++ trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru));
++ nfsd_file_dispose_list_delayed(&dispose);
+ }
+
+ static void
+ nfsd_file_gc_worker(struct work_struct *work)
+ {
+ nfsd_file_gc();
+- nfsd_file_schedule_laundrette();
++ if (list_lru_count(&nfsd_file_lru))
++ nfsd_file_schedule_laundrette();
+ }
+
+ static unsigned long
+@@ -495,7 +510,14 @@ nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc)
+ static unsigned long
+ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc)
+ {
+- return nfsd_file_lru_walk_list(sc);
++ LIST_HEAD(dispose);
++ unsigned long ret;
++
++ ret = list_lru_shrink_walk(&nfsd_file_lru, sc,
++ nfsd_file_lru_cb, &dispose);
++ trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru));
++ nfsd_file_dispose_list_delayed(&dispose);
++ return ret;
+ }
+
+ static struct shrinker nfsd_file_shrinker = {
+@@ -504,70 +526,123 @@ static struct shrinker nfsd_file_shrinker = {
+ .seeks = 1,
+ };
+
++/**
++ * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file
++ * @nf: nfsd_file to attempt to queue
++ * @dispose: private list to queue successfully-put objects
++ *
++ * Unhash an nfsd_file, try to get a reference to it, and then put that
++ * reference. If it's the last reference, queue it to the dispose list.
++ */
++static void
++nfsd_file_cond_queue(struct nfsd_file *nf, struct list_head *dispose)
++ __must_hold(RCU)
++{
++ int decrement = 1;
++
++ /* If we raced with someone else unhashing, ignore it */
++ if (!nfsd_file_unhash(nf))
++ return;
++
++ /* If we can't get a reference, ignore it */
++ if (!nfsd_file_get(nf))
++ return;
++
++ /* Extra decrement if we remove from the LRU */
++ if (nfsd_file_lru_remove(nf))
++ ++decrement;
++
++ /* If refcount goes to 0, then put on the dispose list */
++ if (refcount_sub_and_test(decrement, &nf->nf_ref)) {
++ list_add(&nf->nf_lru, dispose);
++ trace_nfsd_file_closing(nf);
++ }
++}
++
++/**
++ * nfsd_file_queue_for_close: try to close out any open nfsd_files for an inode
++ * @inode: inode on which to close out nfsd_files
++ * @dispose: list on which to gather nfsd_files to close out
++ *
++ * An nfsd_file represents a struct file being held open on behalf of nfsd.
++ * An open file however can block other activity (such as leases), or cause
++ * undesirable behavior (e.g. spurious silly-renames when reexporting NFS).
++ *
++ * This function is intended to find open nfsd_files when this sort of
++ * conflicting access occurs and then attempt to close those files out.
++ *
++ * Populates the dispose list with entries that have already had their
++ * refcounts go to zero. The actual free of an nfsd_file can be expensive,
++ * so we leave it up to the caller whether it wants to wait or not.
++ */
+ static void
+-__nfsd_file_close_inode(struct inode *inode, unsigned int hashval,
+- struct list_head *dispose)
++nfsd_file_queue_for_close(struct inode *inode, struct list_head *dispose)
+ {
+- struct nfsd_file *nf;
+- struct hlist_node *tmp;
++ struct rhlist_head *tmp, *list;
++ struct nfsd_file *nf;
+
+- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+- hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) {
+- if (inode == nf->nf_inode)
+- nfsd_file_unhash_and_release_locked(nf, dispose);
++ rcu_read_lock();
++ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
++ nfsd_file_rhash_params);
++ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
++ if (!test_bit(NFSD_FILE_GC, &nf->nf_flags))
++ continue;
++ nfsd_file_cond_queue(nf, dispose);
+ }
+- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
++ rcu_read_unlock();
+ }
+
+ /**
+- * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
++ * nfsd_file_close_inode - attempt a delayed close of a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+- * Walk the whole hash bucket, looking for any files that correspond to "inode".
+- * If any do, then unhash them and put the hashtable reference to them and
+- * destroy any that had their last reference put. Also ensure that any of the
+- * fputs also have their final __fput done as well.
++ * Close out any open nfsd_files that can be reaped for @inode. The
++ * actual freeing is deferred to the dispose_list_delayed infrastructure.
++ *
++ * This is used by the fsnotify callbacks and setlease notifier.
+ */
+-void
+-nfsd_file_close_inode_sync(struct inode *inode)
++static void
++nfsd_file_close_inode(struct inode *inode)
+ {
+- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
+- NFSD_FILE_HASH_BITS);
+ LIST_HEAD(dispose);
+
+- __nfsd_file_close_inode(inode, hashval, &dispose);
+- trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose));
+- nfsd_file_dispose_list_sync(&dispose);
++ nfsd_file_queue_for_close(inode, &dispose);
++ nfsd_file_dispose_list_delayed(&dispose);
+ }
+
+ /**
+ * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file
+ * @inode: inode of the file to attempt to remove
+ *
+- * Walk the whole hash bucket, looking for any files that correspond to "inode".
+- * If any do, then unhash them and put the hashtable reference to them and
+- * destroy any that had their last reference put.
++ * Close out any open nfsd_files that can be reaped for @inode. The
++ * nfsd_files are closed out synchronously.
++ *
++ * This is called from nfsd_rename and nfsd_unlink to avoid silly-renames
++ * when reexporting NFS.
+ */
+-static void
+-nfsd_file_close_inode(struct inode *inode)
++void
++nfsd_file_close_inode_sync(struct inode *inode)
+ {
+- unsigned int hashval = (unsigned int)hash_long(inode->i_ino,
+- NFSD_FILE_HASH_BITS);
++ struct nfsd_file *nf;
+ LIST_HEAD(dispose);
+
+- __nfsd_file_close_inode(inode, hashval, &dispose);
+- trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose));
+- nfsd_file_dispose_list_delayed(&dispose);
++ trace_nfsd_file_close(inode);
++
++ nfsd_file_queue_for_close(inode, &dispose);
++ while (!list_empty(&dispose)) {
++ nf = list_first_entry(&dispose, struct nfsd_file, nf_lru);
++ list_del_init(&nf->nf_lru);
++ nfsd_file_free(nf);
++ }
++ flush_delayed_fput();
+ }
+
+ /**
+ * nfsd_file_delayed_close - close unused nfsd_files
+ * @work: dummy
+ *
+- * Walk the LRU list and close any entries that have not been used since
+- * the last scan.
+- *
+- * Note this can deadlock with nfsd_file_cache_purge.
++ * Scrape the freeme list for this nfsd_net, and then dispose of them
++ * all.
+ */
+ static void
+ nfsd_file_delayed_close(struct work_struct *work)
+@@ -576,7 +651,10 @@ nfsd_file_delayed_close(struct work_struct *work)
+ struct nfsd_fcache_disposal *l = container_of(work,
+ struct nfsd_fcache_disposal, work);
+
+- nfsd_file_list_remove_disposal(&head, l);
++ spin_lock(&l->lock);
++ list_splice_init(&l->freeme, &head);
++ spin_unlock(&l->lock);
++
+ nfsd_file_dispose_list(&head);
+ }
+
+@@ -588,7 +666,7 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
+
+ /* Only close files for F_SETLEASE leases */
+ if (fl->fl_flags & FL_LEASE)
+- nfsd_file_close_inode_sync(file_inode(fl->fl_file));
++ nfsd_file_close_inode(file_inode(fl->fl_file));
+ return 0;
+ }
+
+@@ -601,6 +679,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
+ struct inode *inode, struct inode *dir,
+ const struct qstr *name, u32 cookie)
+ {
++ if (WARN_ON_ONCE(!inode))
++ return 0;
++
+ trace_nfsd_file_fsnotify_handle_event(inode, mask);
+
+ /* Should be no marks on non-regular files */
+@@ -628,25 +709,21 @@ static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
+ int
+ nfsd_file_cache_init(void)
+ {
+- int ret = -ENOMEM;
+- unsigned int i;
+-
+- clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
++ int ret;
+
+- if (nfsd_file_hashtbl)
++ lockdep_assert_held(&nfsd_mutex);
++ if (test_and_set_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
+ return 0;
+
++ ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params);
++ if (ret)
++ return ret;
++
++ ret = -ENOMEM;
+ nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
+ if (!nfsd_filecache_wq)
+ goto out;
+
+- nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
+- sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
+- if (!nfsd_file_hashtbl) {
+- pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
+- goto out_err;
+- }
+-
+ nfsd_file_slab = kmem_cache_create("nfsd_file",
+ sizeof(struct nfsd_file), 0, 0, NULL);
+ if (!nfsd_file_slab) {
+@@ -680,19 +757,16 @@ nfsd_file_cache_init(void)
+ goto out_shrinker;
+ }
+
+- nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops);
++ nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops,
++ FSNOTIFY_GROUP_NOFS);
+ if (IS_ERR(nfsd_file_fsnotify_group)) {
+ pr_err("nfsd: unable to create fsnotify group: %ld\n",
+ PTR_ERR(nfsd_file_fsnotify_group));
++ ret = PTR_ERR(nfsd_file_fsnotify_group);
+ nfsd_file_fsnotify_group = NULL;
+ goto out_notifier;
+ }
+
+- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+- INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head);
+- spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock);
+- }
+-
+ INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker);
+ out:
+ return ret;
+@@ -707,50 +781,47 @@ nfsd_file_cache_init(void)
+ nfsd_file_slab = NULL;
+ kmem_cache_destroy(nfsd_file_mark_slab);
+ nfsd_file_mark_slab = NULL;
+- kvfree(nfsd_file_hashtbl);
+- nfsd_file_hashtbl = NULL;
+ destroy_workqueue(nfsd_filecache_wq);
+ nfsd_filecache_wq = NULL;
++ rhltable_destroy(&nfsd_file_rhltable);
+ goto out;
+ }
+
+-/*
+- * Note this can deadlock with nfsd_file_lru_cb.
++/**
++ * __nfsd_file_cache_purge: clean out the cache for shutdown
++ * @net: net-namespace to shut down the cache (may be NULL)
++ *
++ * Walk the nfsd_file cache and close out any that match @net. If @net is NULL,
++ * then close out everything. Called when an nfsd instance is being shut down,
++ * and when the exports table is flushed.
+ */
+-void
+-nfsd_file_cache_purge(struct net *net)
++static void
++__nfsd_file_cache_purge(struct net *net)
+ {
+- unsigned int i;
+- struct nfsd_file *nf;
+- struct hlist_node *next;
++ struct rhashtable_iter iter;
++ struct nfsd_file *nf;
+ LIST_HEAD(dispose);
+- bool del;
+
+- if (!nfsd_file_hashtbl)
+- return;
++ rhltable_walk_enter(&nfsd_file_rhltable, &iter);
++ do {
++ rhashtable_walk_start(&iter);
+
+- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+- struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i];
++ nf = rhashtable_walk_next(&iter);
++ while (!IS_ERR_OR_NULL(nf)) {
++ if (!net || nf->nf_net == net)
++ nfsd_file_cond_queue(nf, &dispose);
++ nf = rhashtable_walk_next(&iter);
++ }
+
+- spin_lock(&nfb->nfb_lock);
+- hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) {
+- if (net && nf->nf_net != net)
+- continue;
+- del = nfsd_file_unhash_and_release_locked(nf, &dispose);
++ rhashtable_walk_stop(&iter);
++ } while (nf == ERR_PTR(-EAGAIN));
++ rhashtable_walk_exit(&iter);
+
+- /*
+- * Deadlock detected! Something marked this entry as
+- * unhased, but hasn't removed it from the hash list.
+- */
+- WARN_ON_ONCE(!del);
+- }
+- spin_unlock(&nfb->nfb_lock);
+- nfsd_file_dispose_list(&dispose);
+- }
++ nfsd_file_dispose_list(&dispose);
+ }
+
+ static struct nfsd_fcache_disposal *
+-nfsd_alloc_fcache_disposal(struct net *net)
++nfsd_alloc_fcache_disposal(void)
+ {
+ struct nfsd_fcache_disposal *l;
+
+@@ -758,7 +829,6 @@ nfsd_alloc_fcache_disposal(struct net *net)
+ if (!l)
+ return NULL;
+ INIT_WORK(&l->work, nfsd_file_delayed_close);
+- l->net = net;
+ spin_lock_init(&l->lock);
+ INIT_LIST_HEAD(&l->freeme);
+ return l;
+@@ -767,61 +837,40 @@ nfsd_alloc_fcache_disposal(struct net *net)
+ static void
+ nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
+ {
+- rcu_assign_pointer(l->net, NULL);
+ cancel_work_sync(&l->work);
+ nfsd_file_dispose_list(&l->freeme);
+- kfree_rcu(l, rcu);
++ kfree(l);
+ }
+
+ static void
+-nfsd_add_fcache_disposal(struct nfsd_fcache_disposal *l)
+-{
+- spin_lock(&laundrette_lock);
+- list_add_tail_rcu(&l->list, &laundrettes);
+- spin_unlock(&laundrette_lock);
+-}
+-
+-static void
+-nfsd_del_fcache_disposal(struct nfsd_fcache_disposal *l)
+-{
+- spin_lock(&laundrette_lock);
+- list_del_rcu(&l->list);
+- spin_unlock(&laundrette_lock);
+-}
+-
+-static int
+-nfsd_alloc_fcache_disposal_net(struct net *net)
++nfsd_free_fcache_disposal_net(struct net *net)
+ {
+- struct nfsd_fcache_disposal *l;
++ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
++ struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+- l = nfsd_alloc_fcache_disposal(net);
+- if (!l)
+- return -ENOMEM;
+- nfsd_add_fcache_disposal(l);
+- return 0;
++ nfsd_free_fcache_disposal(l);
+ }
+
+-static void
+-nfsd_free_fcache_disposal_net(struct net *net)
++int
++nfsd_file_cache_start_net(struct net *net)
+ {
+- struct nfsd_fcache_disposal *l;
++ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+- rcu_read_lock();
+- list_for_each_entry_rcu(l, &laundrettes, list) {
+- if (l->net != net)
+- continue;
+- nfsd_del_fcache_disposal(l);
+- rcu_read_unlock();
+- nfsd_free_fcache_disposal(l);
+- return;
+- }
+- rcu_read_unlock();
++ nn->fcache_disposal = nfsd_alloc_fcache_disposal();
++ return nn->fcache_disposal ? 0 : -ENOMEM;
+ }
+
+-int
+-nfsd_file_cache_start_net(struct net *net)
++/**
++ * nfsd_file_cache_purge - Remove all cache items associated with @net
++ * @net: target net namespace
++ *
++ */
++void
++nfsd_file_cache_purge(struct net *net)
+ {
+- return nfsd_alloc_fcache_disposal_net(net);
++ lockdep_assert_held(&nfsd_mutex);
++ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1)
++ __nfsd_file_cache_purge(net);
+ }
+
+ void
+@@ -834,7 +883,11 @@ nfsd_file_cache_shutdown_net(struct net *net)
+ void
+ nfsd_file_cache_shutdown(void)
+ {
+- set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags);
++ int i;
++
++ lockdep_assert_held(&nfsd_mutex);
++ if (test_and_clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 0)
++ return;
+
+ lease_unregister_notifier(&nfsd_file_lease_notifier);
+ unregister_shrinker(&nfsd_file_shrinker);
+@@ -843,7 +896,7 @@ nfsd_file_cache_shutdown(void)
+ * calling nfsd_file_cache_purge
+ */
+ cancel_delayed_work_sync(&nfsd_filecache_laundrette);
+- nfsd_file_cache_purge(NULL);
++ __nfsd_file_cache_purge(NULL);
+ list_lru_destroy(&nfsd_file_lru);
+ rcu_barrier();
+ fsnotify_put_group(nfsd_file_fsnotify_group);
+@@ -853,240 +906,332 @@ nfsd_file_cache_shutdown(void)
+ fsnotify_wait_marks_destroyed();
+ kmem_cache_destroy(nfsd_file_mark_slab);
+ nfsd_file_mark_slab = NULL;
+- kvfree(nfsd_file_hashtbl);
+- nfsd_file_hashtbl = NULL;
+ destroy_workqueue(nfsd_filecache_wq);
+ nfsd_filecache_wq = NULL;
+-}
+-
+-static bool
+-nfsd_match_cred(const struct cred *c1, const struct cred *c2)
+-{
+- int i;
+-
+- if (!uid_eq(c1->fsuid, c2->fsuid))
+- return false;
+- if (!gid_eq(c1->fsgid, c2->fsgid))
+- return false;
+- if (c1->group_info == NULL || c2->group_info == NULL)
+- return c1->group_info == c2->group_info;
+- if (c1->group_info->ngroups != c2->group_info->ngroups)
+- return false;
+- for (i = 0; i < c1->group_info->ngroups; i++) {
+- if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i]))
+- return false;
++ rhltable_destroy(&nfsd_file_rhltable);
++
++ for_each_possible_cpu(i) {
++ per_cpu(nfsd_file_cache_hits, i) = 0;
++ per_cpu(nfsd_file_acquisitions, i) = 0;
++ per_cpu(nfsd_file_releases, i) = 0;
++ per_cpu(nfsd_file_total_age, i) = 0;
++ per_cpu(nfsd_file_evictions, i) = 0;
+ }
+- return true;
+ }
+
+ static struct nfsd_file *
+-nfsd_file_find_locked(struct inode *inode, unsigned int may_flags,
+- unsigned int hashval, struct net *net)
++nfsd_file_lookup_locked(const struct net *net, const struct cred *cred,
++ struct inode *inode, unsigned char need,
++ bool want_gc)
+ {
++ struct rhlist_head *tmp, *list;
+ struct nfsd_file *nf;
+- unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+
+- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+- nf_node, lockdep_is_held(&nfsd_file_hashtbl[hashval].nfb_lock)) {
++ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
++ nfsd_file_rhash_params);
++ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist) {
+ if (nf->nf_may != need)
+ continue;
+- if (nf->nf_inode != inode)
+- continue;
+ if (nf->nf_net != net)
+ continue;
+- if (!nfsd_match_cred(nf->nf_cred, current_cred()))
++ if (!nfsd_match_cred(nf->nf_cred, cred))
+ continue;
+- if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags))
++ if (test_bit(NFSD_FILE_GC, &nf->nf_flags) != want_gc)
+ continue;
+- if (nfsd_file_get(nf) != NULL)
+- return nf;
++ if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags) == 0)
++ continue;
++
++ if (!nfsd_file_get(nf))
++ continue;
++ return nf;
+ }
+ return NULL;
+ }
+
+ /**
+- * nfsd_file_is_cached - are there any cached open files for this fh?
+- * @inode: inode of the file to check
++ * nfsd_file_is_cached - are there any cached open files for this inode?
++ * @inode: inode to check
++ *
++ * The lookup matches inodes in all net namespaces and is atomic wrt
++ * nfsd_file_acquire().
+ *
+- * Scan the hashtable for open files that match this fh. Returns true if there
+- * are any, and false if not.
++ * Return values:
++ * %true: filecache contains at least one file matching this inode
++ * %false: filecache contains no files matching this inode
+ */
+ bool
+ nfsd_file_is_cached(struct inode *inode)
+ {
+- bool ret = false;
+- struct nfsd_file *nf;
+- unsigned int hashval;
+-
+- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
++ struct rhlist_head *tmp, *list;
++ struct nfsd_file *nf;
++ bool ret = false;
+
+ rcu_read_lock();
+- hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head,
+- nf_node) {
+- if (inode == nf->nf_inode) {
++ list = rhltable_lookup(&nfsd_file_rhltable, &inode,
++ nfsd_file_rhash_params);
++ rhl_for_each_entry_rcu(nf, tmp, list, nf_rlist)
++ if (test_bit(NFSD_FILE_GC, &nf->nf_flags)) {
+ ret = true;
+ break;
+ }
+- }
+ rcu_read_unlock();
+- trace_nfsd_file_is_cached(inode, hashval, (int)ret);
++
++ trace_nfsd_file_is_cached(inode, (int)ret);
+ return ret;
+ }
+
+-__be32
+-nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- unsigned int may_flags, struct nfsd_file **pnf)
++static __be32
++nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ unsigned int may_flags, struct file *file,
++ struct nfsd_file **pnf, bool want_gc)
+ {
+- __be32 status;
++ unsigned char need = may_flags & NFSD_FILE_MAY_MASK;
+ struct net *net = SVC_NET(rqstp);
+- struct nfsd_file *nf, *new;
++ struct nfsd_file *new, *nf;
++ const struct cred *cred;
++ bool open_retry = true;
+ struct inode *inode;
+- unsigned int hashval;
+- bool retry = true;
++ __be32 status;
++ int ret;
+
+- /* FIXME: skip this if fh_dentry is already set? */
+ status = fh_verify(rqstp, fhp, S_IFREG,
+ may_flags|NFSD_MAY_OWNER_OVERRIDE);
+ if (status != nfs_ok)
+ return status;
+-
+ inode = d_inode(fhp->fh_dentry);
+- hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS);
++ cred = get_current_cred();
++
+ retry:
+ rcu_read_lock();
+- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
++ nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
+ rcu_read_unlock();
+- if (nf)
++
++ if (nf) {
++ /*
++ * If the nf is on the LRU then it holds an extra reference
++ * that must be put if it's removed. It had better not be
++ * the last one however, since we should hold another.
++ */
++ if (nfsd_file_lru_remove(nf))
++ WARN_ON_ONCE(refcount_dec_and_test(&nf->nf_ref));
+ goto wait_for_construction;
++ }
+
+- new = nfsd_file_alloc(inode, may_flags, hashval, net);
++ new = nfsd_file_alloc(net, inode, need, want_gc);
+ if (!new) {
+- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags,
+- NULL, nfserr_jukebox);
+- return nfserr_jukebox;
++ status = nfserr_jukebox;
++ goto out;
+ }
+
+- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+- nf = nfsd_file_find_locked(inode, may_flags, hashval, net);
+- if (nf == NULL)
++ rcu_read_lock();
++ spin_lock(&inode->i_lock);
++ nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc);
++ if (unlikely(nf)) {
++ spin_unlock(&inode->i_lock);
++ rcu_read_unlock();
++ nfsd_file_slab_free(&new->nf_rcu);
++ goto wait_for_construction;
++ }
++ nf = new;
++ ret = rhltable_insert(&nfsd_file_rhltable, &nf->nf_rlist,
++ nfsd_file_rhash_params);
++ spin_unlock(&inode->i_lock);
++ rcu_read_unlock();
++ if (likely(ret == 0))
+ goto open_file;
+- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+- nfsd_file_slab_free(&new->nf_rcu);
++
++ if (ret == -EEXIST)
++ goto retry;
++ trace_nfsd_file_insert_err(rqstp, inode, may_flags, ret);
++ status = nfserr_jukebox;
++ goto construction_err;
+
+ wait_for_construction:
+ wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE);
+
+ /* Did construction of this file fail? */
+ if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) {
+- if (!retry) {
++ trace_nfsd_file_cons_err(rqstp, inode, may_flags, nf);
++ if (!open_retry) {
+ status = nfserr_jukebox;
+- goto out;
++ goto construction_err;
+ }
+- retry = false;
+- nfsd_file_put_noref(nf);
++ open_retry = false;
+ goto retry;
+ }
+-
+ this_cpu_inc(nfsd_file_cache_hits);
+
+- if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) {
+- bool write = (may_flags & NFSD_MAY_WRITE);
+-
+- if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) ||
+- (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) {
+- status = nfserrno(nfsd_open_break_lease(
+- file_inode(nf->nf_file), may_flags));
+- if (status == nfs_ok) {
+- clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
+- if (write)
+- clear_bit(NFSD_FILE_BREAK_WRITE,
+- &nf->nf_flags);
+- }
+- }
++ status = nfserrno(nfsd_open_break_lease(file_inode(nf->nf_file), may_flags));
++ if (status != nfs_ok) {
++ nfsd_file_put(nf);
++ nf = NULL;
+ }
++
+ out:
+ if (status == nfs_ok) {
++ this_cpu_inc(nfsd_file_acquisitions);
++ nfsd_file_check_write_error(nf);
+ *pnf = nf;
+- } else {
+- nfsd_file_put(nf);
+- nf = NULL;
+ }
+-
+- trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status);
++ put_cred(cred);
++ trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status);
+ return status;
++
+ open_file:
+- nf = new;
+- /* Take reference for the hashtable */
+- refcount_inc(&nf->nf_ref);
+- __set_bit(NFSD_FILE_HASHED, &nf->nf_flags);
+- __set_bit(NFSD_FILE_PENDING, &nf->nf_flags);
+- list_lru_add(&nfsd_file_lru, &nf->nf_lru);
+- hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head);
+- ++nfsd_file_hashtbl[hashval].nfb_count;
+- nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount,
+- nfsd_file_hashtbl[hashval].nfb_count);
+- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+- if (atomic_long_inc_return(&nfsd_filecache_count) >= NFSD_FILE_LRU_THRESHOLD)
+- nfsd_file_gc();
+-
+- nf->nf_mark = nfsd_file_mark_find_or_create(nf);
+- if (nf->nf_mark)
+- status = nfsd_open_verified(rqstp, fhp, S_IFREG,
+- may_flags, &nf->nf_file);
+- else
++ trace_nfsd_file_alloc(nf);
++ nf->nf_mark = nfsd_file_mark_find_or_create(nf, inode);
++ if (nf->nf_mark) {
++ if (file) {
++ get_file(file);
++ nf->nf_file = file;
++ status = nfs_ok;
++ trace_nfsd_file_opened(nf, status);
++ } else {
++ status = nfsd_open_verified(rqstp, fhp, may_flags,
++ &nf->nf_file);
++ trace_nfsd_file_open(nf, status);
++ }
++ } else
+ status = nfserr_jukebox;
+ /*
+ * If construction failed, or we raced with a call to unlink()
+ * then unhash.
+ */
+- if (status != nfs_ok || inode->i_nlink == 0) {
+- bool do_free;
+- spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock);
+- do_free = nfsd_file_unhash(nf);
+- spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock);
+- if (do_free)
+- nfsd_file_put_noref(nf);
+- }
+- clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags);
+- smp_mb__after_atomic();
+- wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING);
++ if (status != nfs_ok || inode->i_nlink == 0)
++ nfsd_file_unhash(nf);
++ clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags);
++ if (status == nfs_ok)
++ goto out;
++
++construction_err:
++ if (refcount_dec_and_test(&nf->nf_ref))
++ nfsd_file_free(nf);
++ nf = NULL;
+ goto out;
+ }
+
++/**
++ * nfsd_file_acquire_gc - Get a struct nfsd_file with an open file
++ * @rqstp: the RPC transaction being executed
++ * @fhp: the NFS filehandle of the file to be opened
++ * @may_flags: NFSD_MAY_ settings for the file
++ * @pnf: OUT: new or found "struct nfsd_file" object
++ *
++ * The nfsd_file object returned by this API is reference-counted
++ * and garbage-collected. The object is retained for a few
++ * seconds after the final nfsd_file_put() in case the caller
++ * wants to re-use it.
++ *
++ * Return values:
++ * %nfs_ok - @pnf points to an nfsd_file with its reference
++ * count boosted.
++ *
++ * On error, an nfsstat value in network byte order is returned.
++ */
++__be32
++nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ unsigned int may_flags, struct nfsd_file **pnf)
++{
++ return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, true);
++}
++
++/**
++ * nfsd_file_acquire - Get a struct nfsd_file with an open file
++ * @rqstp: the RPC transaction being executed
++ * @fhp: the NFS filehandle of the file to be opened
++ * @may_flags: NFSD_MAY_ settings for the file
++ * @pnf: OUT: new or found "struct nfsd_file" object
++ *
++ * The nfsd_file_object returned by this API is reference-counted
++ * but not garbage-collected. The object is unhashed after the
++ * final nfsd_file_put().
++ *
++ * Return values:
++ * %nfs_ok - @pnf points to an nfsd_file with its reference
++ * count boosted.
++ *
++ * On error, an nfsstat value in network byte order is returned.
++ */
++__be32
++nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ unsigned int may_flags, struct nfsd_file **pnf)
++{
++ return nfsd_file_do_acquire(rqstp, fhp, may_flags, NULL, pnf, false);
++}
++
++/**
++ * nfsd_file_acquire_opened - Get a struct nfsd_file using existing open file
++ * @rqstp: the RPC transaction being executed
++ * @fhp: the NFS filehandle of the file just created
++ * @may_flags: NFSD_MAY_ settings for the file
++ * @file: cached, already-open file (may be NULL)
++ * @pnf: OUT: new or found "struct nfsd_file" object
++ *
++ * Acquire a nfsd_file object that is not GC'ed. If one doesn't already exist,
++ * and @file is non-NULL, use it to instantiate a new nfsd_file instead of
++ * opening a new one.
++ *
++ * Return values:
++ * %nfs_ok - @pnf points to an nfsd_file with its reference
++ * count boosted.
++ *
++ * On error, an nfsstat value in network byte order is returned.
++ */
++__be32
++nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ unsigned int may_flags, struct file *file,
++ struct nfsd_file **pnf)
++{
++ return nfsd_file_do_acquire(rqstp, fhp, may_flags, file, pnf, false);
++}
++
+ /*
+ * Note that fields may be added, removed or reordered in the future. Programs
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+-static int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
++int nfsd_file_cache_stats_show(struct seq_file *m, void *v)
+ {
+- unsigned int i, count = 0, longest = 0;
+- unsigned long hits = 0;
++ unsigned long releases = 0, evictions = 0;
++ unsigned long hits = 0, acquisitions = 0;
++ unsigned int i, count = 0, buckets = 0;
++ unsigned long lru = 0, total_age = 0;
+
+- /*
+- * No need for spinlocks here since we're not terribly interested in
+- * accuracy. We do take the nfsd_mutex simply to ensure that we
+- * don't end up racing with server shutdown
+- */
++ /* Serialize with server shutdown */
+ mutex_lock(&nfsd_mutex);
+- if (nfsd_file_hashtbl) {
+- for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) {
+- count += nfsd_file_hashtbl[i].nfb_count;
+- longest = max(longest, nfsd_file_hashtbl[i].nfb_count);
+- }
++ if (test_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags) == 1) {
++ struct bucket_table *tbl;
++ struct rhashtable *ht;
++
++ lru = list_lru_count(&nfsd_file_lru);
++
++ rcu_read_lock();
++ ht = &nfsd_file_rhltable.ht;
++ count = atomic_read(&ht->nelems);
++ tbl = rht_dereference_rcu(ht->tbl, ht);
++ buckets = tbl->size;
++ rcu_read_unlock();
+ }
+ mutex_unlock(&nfsd_mutex);
+
+- for_each_possible_cpu(i)
++ for_each_possible_cpu(i) {
+ hits += per_cpu(nfsd_file_cache_hits, i);
++ acquisitions += per_cpu(nfsd_file_acquisitions, i);
++ releases += per_cpu(nfsd_file_releases, i);
++ total_age += per_cpu(nfsd_file_total_age, i);
++ evictions += per_cpu(nfsd_file_evictions, i);
++ }
+
+- seq_printf(m, "total entries: %u\n", count);
+- seq_printf(m, "longest chain: %u\n", longest);
++ seq_printf(m, "total inodes: %u\n", count);
++ seq_printf(m, "hash buckets: %u\n", buckets);
++ seq_printf(m, "lru entries: %lu\n", lru);
+ seq_printf(m, "cache hits: %lu\n", hits);
++ seq_printf(m, "acquisitions: %lu\n", acquisitions);
++ seq_printf(m, "releases: %lu\n", releases);
++ seq_printf(m, "evictions: %lu\n", evictions);
++ if (releases)
++ seq_printf(m, "mean age (ms): %ld\n", total_age / releases);
++ else
++ seq_printf(m, "mean age (ms): -\n");
+ return 0;
+ }
+-
+-int nfsd_file_cache_stats_open(struct inode *inode, struct file *file)
+-{
+- return single_open(file, nfsd_file_cache_stats_show, NULL);
+-}
+diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
+index 435ceab27897a..e54165a3224f0 100644
+--- a/fs/nfsd/filecache.h
++++ b/fs/nfsd/filecache.h
+@@ -29,23 +29,23 @@ struct nfsd_file_mark {
+ * never be dereferenced, only used for comparison.
+ */
+ struct nfsd_file {
+- struct hlist_node nf_node;
+- struct list_head nf_lru;
+- struct rcu_head nf_rcu;
++ struct rhlist_head nf_rlist;
++ void *nf_inode;
+ struct file *nf_file;
+ const struct cred *nf_cred;
+ struct net *nf_net;
+ #define NFSD_FILE_HASHED (0)
+ #define NFSD_FILE_PENDING (1)
+-#define NFSD_FILE_BREAK_READ (2)
+-#define NFSD_FILE_BREAK_WRITE (3)
+-#define NFSD_FILE_REFERENCED (4)
++#define NFSD_FILE_REFERENCED (2)
++#define NFSD_FILE_GC (3)
+ unsigned long nf_flags;
+- struct inode *nf_inode;
+- unsigned int nf_hashval;
+ refcount_t nf_ref;
+ unsigned char nf_may;
++
+ struct nfsd_file_mark *nf_mark;
++ struct list_head nf_lru;
++ struct rcu_head nf_rcu;
++ ktime_t nf_birthtime;
+ };
+
+ int nfsd_file_cache_init(void);
+@@ -57,7 +57,12 @@ void nfsd_file_put(struct nfsd_file *nf);
+ struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
+ void nfsd_file_close_inode_sync(struct inode *inode);
+ bool nfsd_file_is_cached(struct inode *inode);
++__be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ unsigned int may_flags, struct nfsd_file **nfp);
+ __be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned int may_flags, struct nfsd_file **nfp);
+-int nfsd_file_cache_stats_open(struct inode *, struct file *);
++__be32 nfsd_file_acquire_opened(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ unsigned int may_flags, struct file *file,
++ struct nfsd_file **nfp);
++int nfsd_file_cache_stats_show(struct seq_file *m, void *v);
+ #endif /* _FS_NFSD_FILECACHE_H */
+diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
+index db7ef07ae50c9..fabc21ed68cea 100644
+--- a/fs/nfsd/flexfilelayout.c
++++ b/fs/nfsd/flexfilelayout.c
+@@ -15,6 +15,7 @@
+
+ #include "flexfilelayoutxdr.h"
+ #include "pnfs.h"
++#include "vfs.h"
+
+ #define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+@@ -61,7 +62,7 @@ nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+ goto out_error;
+
+ fl->fh.size = fhp->fh_handle.fh_size;
+- memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
++ memcpy(fl->fh.data, &fhp->fh_handle.fh_raw, fl->fh.size);
+
+ /* Give whole file layout segments */
+ seg->offset = 0;
+diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
+index 3f5b3d7b62b71..46a7f9b813e52 100644
+--- a/fs/nfsd/lockd.c
++++ b/fs/nfsd/lockd.c
+@@ -25,18 +25,22 @@
+ * Note: we hold the dentry use count while the file is open.
+ */
+ static __be32
+-nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp)
++nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp,
++ int mode)
+ {
+ __be32 nfserr;
++ int access;
+ struct svc_fh fh;
+
+ /* must initialize before using! but maxsize doesn't matter */
+ fh_init(&fh,0);
+ fh.fh_handle.fh_size = f->size;
+- memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size);
++ memcpy(&fh.fh_handle.fh_raw, f->data, f->size);
+ fh.fh_export = NULL;
+
+- nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp);
++ access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ;
++ access |= NFSD_MAY_LOCK;
++ nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp);
+ fh_put(&fh);
+ /* We return nlm error codes as nlm doesn't know
+ * about nfsd, but nfsd does know about nlm..
+diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
+index 02d3d2f0e6168..51a4b7885cae2 100644
+--- a/fs/nfsd/netns.h
++++ b/fs/nfsd/netns.h
+@@ -10,6 +10,8 @@
+
+ #include <net/net_namespace.h>
+ #include <net/netns/generic.h>
++#include <linux/percpu_counter.h>
++#include <linux/siphash.h>
+
+ /* Hash tables for nfs4_clientid state */
+ #define CLIENT_HASH_BITS 4
+@@ -21,6 +23,14 @@
+ struct cld_net;
+ struct nfsd4_client_tracking_ops;
+
++enum {
++ /* cache misses due only to checksum comparison failures */
++ NFSD_NET_PAYLOAD_MISSES,
++ /* amount of memory (in bytes) currently consumed by the DRC */
++ NFSD_NET_DRC_MEM_USAGE,
++ NFSD_NET_COUNTERS_NUM
++};
++
+ /*
+ * Represents a nfsd "container". With respect to nfsv4 state tracking, the
+ * fields of interest are the *_id_hashtbls and the *_name_tree. These track
+@@ -99,9 +109,8 @@ struct nfsd_net {
+ bool nfsd_net_up;
+ bool lockd_up;
+
+- /* Time of server startup */
+- struct timespec64 nfssvc_boot;
+- seqlock_t boot_lock;
++ seqlock_t writeverf_lock;
++ unsigned char writeverf[8];
+
+ /*
+ * Max number of connections this nfsd container will allow. Defaults
+@@ -114,12 +123,13 @@ struct nfsd_net {
+ u32 clverifier_counter;
+
+ struct svc_serv *nfsd_serv;
+-
+- wait_queue_head_t ntf_wq;
+- atomic_t ntf_refcnt;
+-
+- /* Allow umount to wait for nfsd state cleanup */
+- struct completion nfsd_shutdown_complete;
++ /* When a listening socket is added to nfsd, keep_active is set
++ * and this justifies a reference on nfsd_serv. This stops
++ * nfsd_serv from being freed. When the number of threads is
++ * set, keep_active is cleared and the reference is dropped. So
++ * when the last thread exits, the service will be destroyed.
++ */
++ int keep_active;
+
+ /*
+ * clientid and stateid data for construction of net unique COPY
+@@ -149,20 +159,16 @@ struct nfsd_net {
+
+ /*
+ * Stats and other tracking of on the duplicate reply cache.
+- * These fields and the "rc" fields in nfsdstats are modified
+- * with only the per-bucket cache lock, which isn't really safe
+- * and should be fixed if we want the statistics to be
+- * completely accurate.
++ * The longest_chain* fields are modified with only the per-bucket
++ * cache lock, which isn't really safe and should be fixed if we want
++ * these statistics to be completely accurate.
+ */
+
+ /* total number of entries */
+ atomic_t num_drc_entries;
+
+- /* cache misses due only to checksum comparison failures */
+- unsigned int payload_misses;
+-
+- /* amount of memory (in bytes) currently consumed by the DRC */
+- unsigned int drc_mem_usage;
++ /* Per-netns stats counters */
++ struct percpu_counter counter[NFSD_NET_COUNTERS_NUM];
+
+ /* longest hash chain seen */
+ unsigned int longest_chain;
+@@ -171,8 +177,25 @@ struct nfsd_net {
+ unsigned int longest_chain_cachesize;
+
+ struct shrinker nfsd_reply_cache_shrinker;
++
++ /* tracking server-to-server copy mounts */
++ spinlock_t nfsd_ssc_lock;
++ struct list_head nfsd_ssc_mount_list;
++ wait_queue_head_t nfsd_ssc_waitq;
++
+ /* utsname taken from the process that starts the server */
+ char nfsd_name[UNX_MAXNODENAME+1];
++
++ struct nfsd_fcache_disposal *fcache_disposal;
++
++ siphash_key_t siphash_key;
++
++ atomic_t nfs4_client_count;
++ int nfs4_max_clients;
++
++ atomic_t nfsd_courtesy_clients;
++ struct shrinker nfsd_client_shrinker;
++ struct work_struct nfsd_shrinker_work;
+ };
+
+ /* Simple check to find out if a given net was properly initialized */
+@@ -182,6 +205,6 @@ extern void nfsd_netns_free_versions(struct nfsd_net *nn);
+
+ extern unsigned int nfsd_net_id;
+
+-void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn);
+-void nfsd_reset_boot_verifier(struct nfsd_net *nn);
++void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn);
++void nfsd_reset_write_verifier(struct nfsd_net *nn);
+ #endif /* __NFSD_NETNS_H__ */
+diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c
+index 6a900f770dd23..9adf672dedbdd 100644
+--- a/fs/nfsd/nfs2acl.c
++++ b/fs/nfsd/nfs2acl.c
+@@ -111,7 +111,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
+ if (error)
+ goto out_errno;
+
+- fh_lock(fh);
++ inode_lock(inode);
+
+ error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+ if (error)
+@@ -120,7 +120,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
+ if (error)
+ goto out_drop_lock;
+
+- fh_unlock(fh);
++ inode_unlock(inode);
+
+ fh_drop_write(fh);
+
+@@ -134,7 +134,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp)
+ return rpc_success;
+
+ out_drop_lock:
+- fh_unlock(fh);
++ inode_unlock(inode);
+ fh_drop_write(fh);
+ out_errno:
+ resp->status = nfserrno(error);
+@@ -185,161 +185,106 @@ static __be32 nfsacld_proc_access(struct svc_rqst *rqstp)
+ /*
+ * XDR decode functions
+ */
+-static int nfsaclsvc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return 1;
+-}
+
+-static int nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfsaclsvc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_getaclargs *argp = rqstp->rq_argp;
+
+- p = nfs2svc_decode_fh(p, &argp->fh);
+- if (!p)
+- return 0;
+- argp->mask = ntohl(*p); p++;
++ if (!svcxdr_decode_fhandle(xdr, &argp->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
++ return false;
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-
+-static int nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfsaclsvc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_setaclargs *argp = rqstp->rq_argp;
+- struct kvec *head = rqstp->rq_arg.head;
+- unsigned int base;
+- int n;
+-
+- p = nfs2svc_decode_fh(p, &argp->fh);
+- if (!p)
+- return 0;
+- argp->mask = ntohl(*p++);
+- if (argp->mask & ~NFS_ACL_MASK ||
+- !xdr_argsize_check(rqstp, p))
+- return 0;
+-
+- base = (char *)p - (char *)head->iov_base;
+- n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
+- (argp->mask & NFS_ACL) ?
+- &argp->acl_access : NULL);
+- if (n > 0)
+- n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
+- (argp->mask & NFS_DFACL) ?
+- &argp->acl_default : NULL);
+- return (n > 0);
+-}
+
+-static int nfsaclsvc_decode_fhandleargs(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nfsd_fhandle *argp = rqstp->rq_argp;
+-
+- p = nfs2svc_decode_fh(p, &argp->fh);
+- if (!p)
+- return 0;
+- return xdr_argsize_check(rqstp, p);
++ if (!svcxdr_decode_fhandle(xdr, &argp->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
++ return false;
++ if (argp->mask & ~NFS_ACL_MASK)
++ return false;
++ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
++ &argp->acl_access : NULL))
++ return false;
++ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
++ &argp->acl_default : NULL))
++ return false;
++
++ return true;
+ }
+
+-static int nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfsaclsvc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nfsd3_accessargs *argp = rqstp->rq_argp;
++ struct nfsd3_accessargs *args = rqstp->rq_argp;
+
+- p = nfs2svc_decode_fh(p, &argp->fh);
+- if (!p)
+- return 0;
+- argp->access = ntohl(*p++);
++ if (!svcxdr_decode_fhandle(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->access) < 0)
++ return false;
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+ /*
+ * XDR encode functions
+ */
+
+-/*
+- * There must be an encoding function for void results so svc_process
+- * will work properly.
+- */
+-static int nfsaclsvc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return xdr_ressize_check(rqstp, p);
+-}
+-
+ /* GETACL */
+-static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_getaclres *resp = rqstp->rq_resp;
+ struct dentry *dentry = resp->fh.fh_dentry;
+ struct inode *inode;
+- struct kvec *head = rqstp->rq_res.head;
+- unsigned int base;
+- int n;
+- int w;
+
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- return xdr_ressize_check(rqstp, p);
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
+
+- /*
+- * Since this is version 2, the check for nfserr in
+- * nfsd_dispatch actually ensures the following cannot happen.
+- * However, it seems fragile to depend on that.
+- */
+ if (dentry == NULL || d_really_is_negative(dentry))
+- return 0;
++ return true;
+ inode = d_inode(dentry);
+
+- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+- *p++ = htonl(resp->mask);
+- if (!xdr_ressize_check(rqstp, p))
+- return 0;
+- base = (char *)p - (char *)head->iov_base;
+-
+- rqstp->rq_res.page_len = w = nfsacl_size(
+- (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
+- (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+- while (w > 0) {
+- if (!*(rqstp->rq_next_page++))
+- return 0;
+- w -= PAGE_SIZE;
+- }
++ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
++ return false;
+
+- n = nfsacl_encode(&rqstp->rq_res, base, inode,
+- resp->acl_access,
+- resp->mask & NFS_ACL, 0);
+- if (n > 0)
+- n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+- resp->acl_default,
+- resp->mask & NFS_DFACL,
+- NFS_ACL_DEFAULT);
+- return (n > 0);
+-}
++ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
++ resp->mask & NFS_ACL, 0))
++ return false;
++ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
++ resp->mask & NFS_DFACL, NFS_ACL_DEFAULT))
++ return false;
+
+-static int nfsaclsvc_encode_attrstatres(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nfsd_attrstat *resp = rqstp->rq_resp;
+-
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- goto out;
+-
+- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+-out:
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+ /* ACCESS */
+-static int nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfsaclsvc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_accessres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- goto out;
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->access) < 0)
++ return false;
++ break;
++ }
+
+- p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+- *p++ = htonl(resp->access);
+-out:
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+ /*
+@@ -354,13 +299,6 @@ static void nfsaclsvc_release_getacl(struct svc_rqst *rqstp)
+ posix_acl_release(resp->acl_default);
+ }
+
+-static void nfsaclsvc_release_attrstat(struct svc_rqst *rqstp)
+-{
+- struct nfsd_attrstat *resp = rqstp->rq_resp;
+-
+- fh_put(&resp->fh);
+-}
+-
+ static void nfsaclsvc_release_access(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_accessres *resp = rqstp->rq_resp;
+@@ -378,12 +316,14 @@ struct nfsd3_voidargs { int dummy; };
+ static const struct svc_procedure nfsd_acl_procedures2[5] = {
+ [ACLPROC2_NULL] = {
+ .pc_func = nfsacld_proc_null,
+- .pc_decode = nfsaclsvc_decode_voidarg,
+- .pc_encode = nfsaclsvc_encode_voidres,
+- .pc_argsize = sizeof(struct nfsd3_voidargs),
+- .pc_ressize = sizeof(struct nfsd3_voidargs),
++ .pc_decode = nfssvc_decode_voidarg,
++ .pc_encode = nfssvc_encode_voidres,
++ .pc_argsize = sizeof(struct nfsd_voidargs),
++ .pc_argzero = sizeof(struct nfsd_voidargs),
++ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
++ .pc_name = "NULL",
+ },
+ [ACLPROC2_GETACL] = {
+ .pc_func = nfsacld_proc_getacl,
+@@ -391,29 +331,35 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
+ .pc_encode = nfsaclsvc_encode_getaclres,
+ .pc_release = nfsaclsvc_release_getacl,
+ .pc_argsize = sizeof(struct nfsd3_getaclargs),
++ .pc_argzero = sizeof(struct nfsd3_getaclargs),
+ .pc_ressize = sizeof(struct nfsd3_getaclres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+2*(1+ACL),
++ .pc_name = "GETACL",
+ },
+ [ACLPROC2_SETACL] = {
+ .pc_func = nfsacld_proc_setacl,
+ .pc_decode = nfsaclsvc_decode_setaclargs,
+- .pc_encode = nfsaclsvc_encode_attrstatres,
+- .pc_release = nfsaclsvc_release_attrstat,
++ .pc_encode = nfssvc_encode_attrstatres,
++ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd3_setaclargs),
++ .pc_argzero = sizeof(struct nfsd3_setaclargs),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
++ .pc_name = "SETACL",
+ },
+ [ACLPROC2_GETATTR] = {
+ .pc_func = nfsacld_proc_getattr,
+- .pc_decode = nfsaclsvc_decode_fhandleargs,
+- .pc_encode = nfsaclsvc_encode_attrstatres,
+- .pc_release = nfsaclsvc_release_attrstat,
++ .pc_decode = nfssvc_decode_fhandleargs,
++ .pc_encode = nfssvc_encode_attrstatres,
++ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_fhandle),
++ .pc_argzero = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
++ .pc_name = "GETATTR",
+ },
+ [ACLPROC2_ACCESS] = {
+ .pc_func = nfsacld_proc_access,
+@@ -421,9 +367,11 @@ static const struct svc_procedure nfsd_acl_procedures2[5] = {
+ .pc_encode = nfsaclsvc_encode_accessres,
+ .pc_release = nfsaclsvc_release_access,
+ .pc_argsize = sizeof(struct nfsd3_accessargs),
++ .pc_argzero = sizeof(struct nfsd3_accessargs),
+ .pc_ressize = sizeof(struct nfsd3_accessres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT+1,
++ .pc_name = "SETATTR",
+ },
+ };
+
+diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c
+index 34a394e50e1d1..161f831b3a1b7 100644
+--- a/fs/nfsd/nfs3acl.c
++++ b/fs/nfsd/nfs3acl.c
+@@ -101,7 +101,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
+ if (error)
+ goto out_errno;
+
+- fh_lock(fh);
++ inode_lock(inode);
+
+ error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access);
+ if (error)
+@@ -109,7 +109,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
+ error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default);
+
+ out_drop_lock:
+- fh_unlock(fh);
++ inode_unlock(inode);
+ fh_drop_write(fh);
+ out_errno:
+ resp->status = nfserrno(error);
+@@ -124,43 +124,39 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp)
+ /*
+ * XDR decode functions
+ */
+-static int nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, __be32 *p)
++
++static bool
++nfs3svc_decode_getaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_getaclargs *args = rqstp->rq_argp;
+
+- p = nfs3svc_decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- args->mask = ntohl(*p); p++;
++ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->mask) < 0)
++ return false;
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-
+-static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+- struct nfsd3_setaclargs *args = rqstp->rq_argp;
+- struct kvec *head = rqstp->rq_arg.head;
+- unsigned int base;
+- int n;
+-
+- p = nfs3svc_decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- args->mask = ntohl(*p++);
+- if (args->mask & ~NFS_ACL_MASK ||
+- !xdr_argsize_check(rqstp, p))
+- return 0;
+-
+- base = (char *)p - (char *)head->iov_base;
+- n = nfsacl_decode(&rqstp->rq_arg, base, NULL,
+- (args->mask & NFS_ACL) ?
+- &args->acl_access : NULL);
+- if (n > 0)
+- n = nfsacl_decode(&rqstp->rq_arg, base + n, NULL,
+- (args->mask & NFS_DFACL) ?
+- &args->acl_default : NULL);
+- return (n > 0);
++ struct nfsd3_setaclargs *argp = rqstp->rq_argp;
++
++ if (!svcxdr_decode_nfs_fh3(xdr, &argp->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &argp->mask) < 0)
++ return false;
++ if (argp->mask & ~NFS_ACL_MASK)
++ return false;
++ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_ACL) ?
++ &argp->acl_access : NULL))
++ return false;
++ if (!nfs_stream_decode_acl(xdr, NULL, (argp->mask & NFS_DFACL) ?
++ &argp->acl_default : NULL))
++ return false;
++
++ return true;
+ }
+
+ /*
+@@ -168,59 +164,47 @@ static int nfs3svc_decode_setaclargs(struct svc_rqst *rqstp, __be32 *p)
+ */
+
+ /* GETACL */
+-static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfs3svc_encode_getaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_getaclres *resp = rqstp->rq_resp;
+ struct dentry *dentry = resp->fh.fh_dentry;
++ struct inode *inode;
+
+- *p++ = resp->status;
+- p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
+- if (resp->status == 0 && dentry && d_really_is_positive(dentry)) {
+- struct inode *inode = d_inode(dentry);
+- struct kvec *head = rqstp->rq_res.head;
+- unsigned int base;
+- int n;
+- int w;
+-
+- *p++ = htonl(resp->mask);
+- if (!xdr_ressize_check(rqstp, p))
+- return 0;
+- base = (char *)p - (char *)head->iov_base;
+-
+- rqstp->rq_res.page_len = w = nfsacl_size(
+- (resp->mask & NFS_ACL) ? resp->acl_access : NULL,
+- (resp->mask & NFS_DFACL) ? resp->acl_default : NULL);
+- while (w > 0) {
+- if (!*(rqstp->rq_next_page++))
+- return 0;
+- w -= PAGE_SIZE;
+- }
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ inode = d_inode(dentry);
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->mask) < 0)
++ return false;
++
++ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_access,
++ resp->mask & NFS_ACL, 0))
++ return false;
++ if (!nfs_stream_encode_acl(xdr, inode, resp->acl_default,
++ resp->mask & NFS_DFACL,
++ NFS_ACL_DEFAULT))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ }
+
+- n = nfsacl_encode(&rqstp->rq_res, base, inode,
+- resp->acl_access,
+- resp->mask & NFS_ACL, 0);
+- if (n > 0)
+- n = nfsacl_encode(&rqstp->rq_res, base + n, inode,
+- resp->acl_default,
+- resp->mask & NFS_DFACL,
+- NFS_ACL_DEFAULT);
+- if (n <= 0)
+- return 0;
+- } else
+- if (!xdr_ressize_check(rqstp, p))
+- return 0;
+-
+- return 1;
++ return true;
+ }
+
+ /* SETACL */
+-static int nfs3svc_encode_setaclres(struct svc_rqst *rqstp, __be32 *p)
++static bool
++nfs3svc_encode_setaclres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh);
+- return xdr_ressize_check(rqstp, p);
++ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
++ svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh);
+ }
+
+ /*
+@@ -245,12 +229,14 @@ struct nfsd3_voidargs { int dummy; };
+ static const struct svc_procedure nfsd_acl_procedures3[3] = {
+ [ACLPROC3_NULL] = {
+ .pc_func = nfsd3_proc_null,
+- .pc_decode = nfs3svc_decode_voidarg,
+- .pc_encode = nfs3svc_encode_voidres,
+- .pc_argsize = sizeof(struct nfsd3_voidargs),
+- .pc_ressize = sizeof(struct nfsd3_voidargs),
++ .pc_decode = nfssvc_decode_voidarg,
++ .pc_encode = nfssvc_encode_voidres,
++ .pc_argsize = sizeof(struct nfsd_voidargs),
++ .pc_argzero = sizeof(struct nfsd_voidargs),
++ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
++ .pc_name = "NULL",
+ },
+ [ACLPROC3_GETACL] = {
+ .pc_func = nfsd3_proc_getacl,
+@@ -258,9 +244,11 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
+ .pc_encode = nfs3svc_encode_getaclres,
+ .pc_release = nfs3svc_release_getacl,
+ .pc_argsize = sizeof(struct nfsd3_getaclargs),
++ .pc_argzero = sizeof(struct nfsd3_getaclargs),
+ .pc_ressize = sizeof(struct nfsd3_getaclres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+2*(1+ACL),
++ .pc_name = "GETACL",
+ },
+ [ACLPROC3_SETACL] = {
+ .pc_func = nfsd3_proc_setacl,
+@@ -268,9 +256,11 @@ static const struct svc_procedure nfsd_acl_procedures3[3] = {
+ .pc_encode = nfs3svc_encode_setaclres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_setaclargs),
++ .pc_argzero = sizeof(struct nfsd3_setaclargs),
+ .pc_ressize = sizeof(struct nfsd3_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT,
++ .pc_name = "SETACL",
+ },
+ };
+
+diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
+index 981a4e4c9a3cf..19cf583096d9c 100644
+--- a/fs/nfsd/nfs3proc.c
++++ b/fs/nfsd/nfs3proc.c
+@@ -8,10 +8,12 @@
+ #include <linux/fs.h>
+ #include <linux/ext2_fs.h>
+ #include <linux/magic.h>
++#include <linux/namei.h>
+
+ #include "cache.h"
+ #include "xdr3.h"
+ #include "vfs.h"
++#include "filecache.h"
+
+ #define NFSDDBG_FACILITY NFSDDBG_PROC
+
+@@ -66,12 +68,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_sattrargs *argp = rqstp->rq_argp;
+ struct nfsd3_attrstat *resp = rqstp->rq_resp;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &argp->attrs,
++ };
+
+ dprintk("nfsd: SETATTR(3) %s\n",
+ SVCFH_fmt(&argp->fh));
+
+ fh_copy(&resp->fh, &argp->fh);
+- resp->status = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,
++ resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs,
+ argp->check_guard, argp->guardtime);
+ return rpc_success;
+ }
+@@ -124,7 +129,7 @@ nfsd3_proc_access(struct svc_rqst *rqstp)
+ static __be32
+ nfsd3_proc_readlink(struct svc_rqst *rqstp)
+ {
+- struct nfsd3_readlinkargs *argp = rqstp->rq_argp;
++ struct nfsd_fhandle *argp = rqstp->rq_argp;
+ struct nfsd3_readlinkres *resp = rqstp->rq_resp;
+
+ dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh));
+@@ -132,7 +137,9 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp)
+ /* Read the symlink. */
+ fh_copy(&resp->fh, &argp->fh);
+ resp->len = NFS3_MAXPATHLEN;
+- resp->status = nfsd_readlink(rqstp, &resp->fh, argp->buffer, &resp->len);
++ resp->pages = rqstp->rq_next_page++;
++ resp->status = nfsd_readlink(rqstp, &resp->fh,
++ page_address(*resp->pages), &resp->len);
+ return rpc_success;
+ }
+
+@@ -144,25 +151,43 @@ nfsd3_proc_read(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_readargs *argp = rqstp->rq_argp;
+ struct nfsd3_readres *resp = rqstp->rq_resp;
+- u32 max_blocksize = svc_max_payload(rqstp);
+- unsigned long cnt = min(argp->count, max_blocksize);
++ unsigned int len;
++ int v;
+
+ dprintk("nfsd: READ(3) %s %lu bytes at %Lu\n",
+ SVCFH_fmt(&argp->fh),
+ (unsigned long) argp->count,
+ (unsigned long long) argp->offset);
+
++ argp->count = min_t(u32, argp->count, svc_max_payload(rqstp));
++ argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
++ if (argp->offset > (u64)OFFSET_MAX)
++ argp->offset = (u64)OFFSET_MAX;
++ if (argp->offset + argp->count > (u64)OFFSET_MAX)
++ argp->count = (u64)OFFSET_MAX - argp->offset;
++
++ v = 0;
++ len = argp->count;
++ resp->pages = rqstp->rq_next_page;
++ while (len > 0) {
++ struct page *page = *(rqstp->rq_next_page++);
++
++ rqstp->rq_vec[v].iov_base = page_address(page);
++ rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
++ len -= rqstp->rq_vec[v].iov_len;
++ v++;
++ }
++
+ /* Obtain buffer pointer for payload.
+ * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof)
+ * + 1 (xdr opaque byte count) = 26
+ */
+- resp->count = cnt;
++ resp->count = argp->count;
+ svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4);
+
+ fh_copy(&resp->fh, &argp->fh);
+ resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
+- rqstp->rq_vec, argp->vlen, &resp->count,
+- &resp->eof);
++ rqstp->rq_vec, v, &resp->count, &resp->eof);
+ return rpc_success;
+ }
+
+@@ -190,32 +215,147 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
+
+ fh_copy(&resp->fh, &argp->fh);
+ resp->committed = argp->stable;
+- nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
+- &argp->first, cnt);
+- if (!nvecs) {
+- resp->status = nfserr_io;
+- goto out;
+- }
++ nvecs = svc_fill_write_vector(rqstp, &argp->payload);
++
+ resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
+ rqstp->rq_vec, nvecs, &cnt,
+ resp->committed, resp->verf);
+ resp->count = cnt;
+-out:
+ return rpc_success;
+ }
+
+ /*
+- * With NFSv3, CREATE processing is a lot easier than with NFSv2.
+- * At least in theory; we'll see how it fares in practice when the
+- * first reports about SunOS compatibility problems start to pour in...
++ * Implement NFSv3's unchecked, guarded, and exclusive CREATE
++ * semantics for regular files. Except for the created file,
++ * this operation is stateless on the server.
++ *
++ * Upon return, caller must release @fhp and @resfhp.
+ */
++static __be32
++nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ struct svc_fh *resfhp, struct nfsd3_createargs *argp)
++{
++ struct iattr *iap = &argp->attrs;
++ struct dentry *parent, *child;
++ struct nfsd_attrs attrs = {
++ .na_iattr = iap,
++ };
++ __u32 v_mtime, v_atime;
++ struct inode *inode;
++ __be32 status;
++ int host_err;
++
++ if (isdotent(argp->name, argp->len))
++ return nfserr_exist;
++ if (!(iap->ia_valid & ATTR_MODE))
++ iap->ia_mode = 0;
++
++ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
++ if (status != nfs_ok)
++ return status;
++
++ parent = fhp->fh_dentry;
++ inode = d_inode(parent);
++
++ host_err = fh_want_write(fhp);
++ if (host_err)
++ return nfserrno(host_err);
++
++ inode_lock_nested(inode, I_MUTEX_PARENT);
++
++ child = lookup_one_len(argp->name, parent, argp->len);
++ if (IS_ERR(child)) {
++ status = nfserrno(PTR_ERR(child));
++ goto out;
++ }
++
++ if (d_really_is_negative(child)) {
++ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
++ if (status != nfs_ok)
++ goto out;
++ }
++
++ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
++ if (status != nfs_ok)
++ goto out;
++
++ v_mtime = 0;
++ v_atime = 0;
++ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
++ u32 *verifier = (u32 *)argp->verf;
++
++ /*
++ * Solaris 7 gets confused (bugid 4218508) if these have
++ * the high bit set, as do xfs filesystems without the
++ * "bigtime" feature. So just clear the high bits.
++ */
++ v_mtime = verifier[0] & 0x7fffffff;
++ v_atime = verifier[1] & 0x7fffffff;
++ }
++
++ if (d_really_is_positive(child)) {
++ status = nfs_ok;
++
++ switch (argp->createmode) {
++ case NFS3_CREATE_UNCHECKED:
++ if (!d_is_reg(child))
++ break;
++ iap->ia_valid &= ATTR_SIZE;
++ goto set_attr;
++ case NFS3_CREATE_GUARDED:
++ status = nfserr_exist;
++ break;
++ case NFS3_CREATE_EXCLUSIVE:
++ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
++ d_inode(child)->i_atime.tv_sec == v_atime &&
++ d_inode(child)->i_size == 0) {
++ break;
++ }
++ status = nfserr_exist;
++ }
++ goto out;
++ }
++
++ if (!IS_POSIXACL(inode))
++ iap->ia_mode &= ~current_umask();
++
++ fh_fill_pre_attrs(fhp);
++ host_err = vfs_create(inode, child, iap->ia_mode, true);
++ if (host_err < 0) {
++ status = nfserrno(host_err);
++ goto out;
++ }
++ fh_fill_post_attrs(fhp);
++
++ /* A newly created file already has a file size of zero. */
++ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
++ iap->ia_valid &= ~ATTR_SIZE;
++ if (argp->createmode == NFS3_CREATE_EXCLUSIVE) {
++ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
++ ATTR_MTIME_SET | ATTR_ATIME_SET;
++ iap->ia_mtime.tv_sec = v_mtime;
++ iap->ia_atime.tv_sec = v_atime;
++ iap->ia_mtime.tv_nsec = 0;
++ iap->ia_atime.tv_nsec = 0;
++ }
++
++set_attr:
++ status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
++
++out:
++ inode_unlock(inode);
++ if (child && !IS_ERR(child))
++ dput(child);
++ fh_drop_write(fhp);
++ return status;
++}
++
+ static __be32
+ nfsd3_proc_create(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_createargs *argp = rqstp->rq_argp;
+ struct nfsd3_diropres *resp = rqstp->rq_resp;
+- svc_fh *dirfhp, *newfhp = NULL;
+- struct iattr *attr;
++ svc_fh *dirfhp, *newfhp;
+
+ dprintk("nfsd: CREATE(3) %s %.*s\n",
+ SVCFH_fmt(&argp->fh),
+@@ -224,21 +364,8 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
+
+ dirfhp = fh_copy(&resp->dirfh, &argp->fh);
+ newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
+- attr = &argp->attrs;
+-
+- /* Unfudge the mode bits */
+- attr->ia_mode &= ~S_IFMT;
+- if (!(attr->ia_valid & ATTR_MODE)) {
+- attr->ia_valid |= ATTR_MODE;
+- attr->ia_mode = S_IFREG;
+- } else {
+- attr->ia_mode = (attr->ia_mode & ~S_IFMT) | S_IFREG;
+- }
+
+- /* Now create the file and set attributes */
+- resp->status = do_nfsd_create(rqstp, dirfhp, argp->name, argp->len,
+- attr, newfhp, argp->createmode,
+- (u32 *)argp->verf, NULL, NULL);
++ resp->status = nfsd3_create_file(rqstp, dirfhp, newfhp, argp);
+ return rpc_success;
+ }
+
+@@ -250,6 +377,9 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_createargs *argp = rqstp->rq_argp;
+ struct nfsd3_diropres *resp = rqstp->rq_resp;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &argp->attrs,
++ };
+
+ dprintk("nfsd: MKDIR(3) %s %.*s\n",
+ SVCFH_fmt(&argp->fh),
+@@ -260,8 +390,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
+ fh_copy(&resp->dirfh, &argp->fh);
+ fh_init(&resp->fh, NFS3_FHSIZE);
+ resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
+- &argp->attrs, S_IFDIR, 0, &resp->fh);
+- fh_unlock(&resp->dirfh);
++ &attrs, S_IFDIR, 0, &resp->fh);
+ return rpc_success;
+ }
+
+@@ -270,6 +399,9 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_symlinkargs *argp = rqstp->rq_argp;
+ struct nfsd3_diropres *resp = rqstp->rq_resp;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &argp->attrs,
++ };
+
+ if (argp->tlen == 0) {
+ resp->status = nfserr_inval;
+@@ -296,7 +428,7 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
+ fh_copy(&resp->dirfh, &argp->ffh);
+ fh_init(&resp->fh, NFS3_FHSIZE);
+ resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname,
+- argp->flen, argp->tname, &resp->fh);
++ argp->flen, argp->tname, &attrs, &resp->fh);
+ kfree(argp->tname);
+ out:
+ return rpc_success;
+@@ -310,6 +442,9 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_mknodargs *argp = rqstp->rq_argp;
+ struct nfsd3_diropres *resp = rqstp->rq_resp;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &argp->attrs,
++ };
+ int type;
+ dev_t rdev = 0;
+
+@@ -335,8 +470,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
+
+ type = nfs3_ftypes[argp->ftype];
+ resp->status = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len,
+- &argp->attrs, type, rdev, &resp->fh);
+- fh_unlock(&resp->dirfh);
++ &attrs, type, rdev, &resp->fh);
+ out:
+ return rpc_success;
+ }
+@@ -359,7 +493,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
+ fh_copy(&resp->fh, &argp->fh);
+ resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
+ argp->name, argp->len);
+- fh_unlock(&resp->fh);
+ return rpc_success;
+ }
+
+@@ -380,7 +513,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
+ fh_copy(&resp->fh, &argp->fh);
+ resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
+ argp->name, argp->len);
+- fh_unlock(&resp->fh);
+ return rpc_success;
+ }
+
+@@ -426,6 +558,26 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
+ return rpc_success;
+ }
+
++static void nfsd3_init_dirlist_pages(struct svc_rqst *rqstp,
++ struct nfsd3_readdirres *resp,
++ u32 count)
++{
++ struct xdr_buf *buf = &resp->dirlist;
++ struct xdr_stream *xdr = &resp->xdr;
++ unsigned int sendbuf = min_t(unsigned int, rqstp->rq_res.buflen,
++ svc_max_payload(rqstp));
++
++ memset(buf, 0, sizeof(*buf));
++
++ /* Reserve room for the NULL ptr & eof flag (-2 words) */
++ buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), sendbuf);
++ buf->buflen -= XDR_UNIT * 2;
++ buf->pages = rqstp->rq_next_page;
++ rqstp->rq_next_page += (buf->buflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
++
++ xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
++}
++
+ /*
+ * Read a portion of a directory.
+ */
+@@ -434,53 +586,26 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_readdirargs *argp = rqstp->rq_argp;
+ struct nfsd3_readdirres *resp = rqstp->rq_resp;
+- int count = 0;
+- struct page **p;
+- caddr_t page_addr = NULL;
++ loff_t offset;
+
+ dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
+ SVCFH_fmt(&argp->fh),
+ argp->count, (u32) argp->cookie);
+
+- /* Make sure we've room for the NULL ptr & eof flag, and shrink to
+- * client read size */
+- count = (argp->count >> 2) - 2;
++ nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
+
+- /* Read directory and encode entries on the fly */
+ fh_copy(&resp->fh, &argp->fh);
+-
+- resp->buflen = count;
+ resp->common.err = nfs_ok;
+- resp->buffer = argp->buffer;
++ resp->cookie_offset = 0;
+ resp->rqstp = rqstp;
+- resp->status = nfsd_readdir(rqstp, &resp->fh, (loff_t *)&argp->cookie,
+- &resp->common, nfs3svc_encode_entry);
++ offset = argp->cookie;
++ resp->status = nfsd_readdir(rqstp, &resp->fh, &offset,
++ &resp->common, nfs3svc_encode_entry3);
+ memcpy(resp->verf, argp->verf, 8);
+- count = 0;
+- for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+- page_addr = page_address(*p);
++ nfs3svc_encode_cookie3(resp, offset);
+
+- if (((caddr_t)resp->buffer >= page_addr) &&
+- ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+- count += (caddr_t)resp->buffer - page_addr;
+- break;
+- }
+- count += PAGE_SIZE;
+- }
+- resp->count = count >> 2;
+- if (resp->offset) {
+- loff_t offset = argp->cookie;
+-
+- if (unlikely(resp->offset1)) {
+- /* we ended up with offset on a page boundary */
+- *resp->offset = htonl(offset >> 32);
+- *resp->offset1 = htonl(offset & 0xffffffff);
+- resp->offset1 = NULL;
+- } else {
+- xdr_encode_hyper(resp->offset, offset);
+- }
+- resp->offset = NULL;
+- }
++ /* Recycle only pages that were part of the reply */
++ rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+
+ return rpc_success;
+ }
+@@ -494,25 +619,17 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_readdirargs *argp = rqstp->rq_argp;
+ struct nfsd3_readdirres *resp = rqstp->rq_resp;
+- int count = 0;
+ loff_t offset;
+- struct page **p;
+- caddr_t page_addr = NULL;
+
+ dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
+ SVCFH_fmt(&argp->fh),
+ argp->count, (u32) argp->cookie);
+
+- /* Convert byte count to number of words (i.e. >> 2),
+- * and reserve room for the NULL ptr & eof flag (-2 words) */
+- resp->count = (argp->count >> 2) - 2;
++ nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
+
+- /* Read directory and encode entries on the fly */
+ fh_copy(&resp->fh, &argp->fh);
+-
+ resp->common.err = nfs_ok;
+- resp->buffer = argp->buffer;
+- resp->buflen = resp->count;
++ resp->cookie_offset = 0;
+ resp->rqstp = rqstp;
+ offset = argp->cookie;
+
+@@ -526,30 +643,12 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
+ }
+
+ resp->status = nfsd_readdir(rqstp, &resp->fh, &offset,
+- &resp->common, nfs3svc_encode_entry_plus);
++ &resp->common, nfs3svc_encode_entryplus3);
+ memcpy(resp->verf, argp->verf, 8);
+- for (p = rqstp->rq_respages + 1; p < rqstp->rq_next_page; p++) {
+- page_addr = page_address(*p);
++ nfs3svc_encode_cookie3(resp, offset);
+
+- if (((caddr_t)resp->buffer >= page_addr) &&
+- ((caddr_t)resp->buffer < page_addr + PAGE_SIZE)) {
+- count += (caddr_t)resp->buffer - page_addr;
+- break;
+- }
+- count += PAGE_SIZE;
+- }
+- resp->count = count >> 2;
+- if (resp->offset) {
+- if (unlikely(resp->offset1)) {
+- /* we ended up with offset on a page boundary */
+- *resp->offset = htonl(offset >> 32);
+- *resp->offset1 = htonl(offset & 0xffffffff);
+- resp->offset1 = NULL;
+- } else {
+- xdr_encode_hyper(resp->offset, offset);
+- }
+- resp->offset = NULL;
+- }
++ /* Recycle only pages that were part of the reply */
++ rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+
+ out:
+ return rpc_success;
+@@ -665,20 +764,21 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
+ {
+ struct nfsd3_commitargs *argp = rqstp->rq_argp;
+ struct nfsd3_commitres *resp = rqstp->rq_resp;
++ struct nfsd_file *nf;
+
+ dprintk("nfsd: COMMIT(3) %s %u@%Lu\n",
+ SVCFH_fmt(&argp->fh),
+ argp->count,
+ (unsigned long long) argp->offset);
+
+- if (argp->offset > NFS_OFFSET_MAX) {
+- resp->status = nfserr_inval;
+- goto out;
+- }
+-
+ fh_copy(&resp->fh, &argp->fh);
+- resp->status = nfsd_commit(rqstp, &resp->fh, argp->offset,
++ resp->status = nfsd_file_acquire_gc(rqstp, &resp->fh, NFSD_MAY_WRITE |
++ NFSD_MAY_NOT_BREAK_LEASE, &nf);
++ if (resp->status)
++ goto out;
++ resp->status = nfsd_commit(rqstp, &resp->fh, nf, argp->offset,
+ argp->count, resp->verf);
++ nfsd_file_put(nf);
+ out:
+ return rpc_success;
+ }
+@@ -688,18 +788,14 @@ nfsd3_proc_commit(struct svc_rqst *rqstp)
+ * NFSv3 Server procedures.
+ * Only the results of non-idempotent operations are cached.
+ */
+-#define nfs3svc_decode_fhandleargs nfs3svc_decode_fhandle
+ #define nfs3svc_encode_attrstatres nfs3svc_encode_attrstat
+ #define nfs3svc_encode_wccstatres nfs3svc_encode_wccstat
+ #define nfsd3_mkdirargs nfsd3_createargs
+ #define nfsd3_readdirplusargs nfsd3_readdirargs
+ #define nfsd3_fhandleargs nfsd_fhandle
+-#define nfsd3_fhandleres nfsd3_attrstat
+ #define nfsd3_attrstatres nfsd3_attrstat
+ #define nfsd3_wccstatres nfsd3_attrstat
+ #define nfsd3_createres nfsd3_diropres
+-#define nfsd3_voidres nfsd3_voidargs
+-struct nfsd3_voidargs { int dummy; };
+
+ #define ST 1 /* status*/
+ #define FH 17 /* filehandle with length */
+@@ -710,22 +806,26 @@ struct nfsd3_voidargs { int dummy; };
+ static const struct svc_procedure nfsd_procedures3[22] = {
+ [NFS3PROC_NULL] = {
+ .pc_func = nfsd3_proc_null,
+- .pc_decode = nfs3svc_decode_voidarg,
+- .pc_encode = nfs3svc_encode_voidres,
+- .pc_argsize = sizeof(struct nfsd3_voidargs),
+- .pc_ressize = sizeof(struct nfsd3_voidres),
++ .pc_decode = nfssvc_decode_voidarg,
++ .pc_encode = nfssvc_encode_voidres,
++ .pc_argsize = sizeof(struct nfsd_voidargs),
++ .pc_argzero = sizeof(struct nfsd_voidargs),
++ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST,
++ .pc_name = "NULL",
+ },
+ [NFS3PROC_GETATTR] = {
+ .pc_func = nfsd3_proc_getattr,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+- .pc_encode = nfs3svc_encode_attrstatres,
++ .pc_encode = nfs3svc_encode_getattrres,
+ .pc_release = nfs3svc_release_fhandle,
+- .pc_argsize = sizeof(struct nfsd3_fhandleargs),
++ .pc_argsize = sizeof(struct nfsd_fhandle),
++ .pc_argzero = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd3_attrstatres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
++ .pc_name = "GETATTR",
+ },
+ [NFS3PROC_SETATTR] = {
+ .pc_func = nfsd3_proc_setattr,
+@@ -733,19 +833,23 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_wccstatres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_sattrargs),
++ .pc_argzero = sizeof(struct nfsd3_sattrargs),
+ .pc_ressize = sizeof(struct nfsd3_wccstatres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC,
++ .pc_name = "SETATTR",
+ },
+ [NFS3PROC_LOOKUP] = {
+ .pc_func = nfsd3_proc_lookup,
+ .pc_decode = nfs3svc_decode_diropargs,
+- .pc_encode = nfs3svc_encode_diropres,
++ .pc_encode = nfs3svc_encode_lookupres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_diropargs),
++ .pc_argzero = sizeof(struct nfsd3_diropargs),
+ .pc_ressize = sizeof(struct nfsd3_diropres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+FH+pAT+pAT,
++ .pc_name = "LOOKUP",
+ },
+ [NFS3PROC_ACCESS] = {
+ .pc_func = nfsd3_proc_access,
+@@ -753,19 +857,23 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_accessres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_accessargs),
++ .pc_argzero = sizeof(struct nfsd3_accessargs),
+ .pc_ressize = sizeof(struct nfsd3_accessres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+1,
++ .pc_name = "ACCESS",
+ },
+ [NFS3PROC_READLINK] = {
+ .pc_func = nfsd3_proc_readlink,
+- .pc_decode = nfs3svc_decode_readlinkargs,
++ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_readlinkres,
+ .pc_release = nfs3svc_release_fhandle,
+- .pc_argsize = sizeof(struct nfsd3_readlinkargs),
++ .pc_argsize = sizeof(struct nfsd_fhandle),
++ .pc_argzero = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd3_readlinkres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+1+NFS3_MAXPATHLEN/4,
++ .pc_name = "READLINK",
+ },
+ [NFS3PROC_READ] = {
+ .pc_func = nfsd3_proc_read,
+@@ -773,9 +881,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_readres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_readargs),
++ .pc_argzero = sizeof(struct nfsd3_readargs),
+ .pc_ressize = sizeof(struct nfsd3_readres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+4+NFSSVC_MAXBLKSIZE/4,
++ .pc_name = "READ",
+ },
+ [NFS3PROC_WRITE] = {
+ .pc_func = nfsd3_proc_write,
+@@ -783,9 +893,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_writeres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_writeargs),
++ .pc_argzero = sizeof(struct nfsd3_writeargs),
+ .pc_ressize = sizeof(struct nfsd3_writeres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC+4,
++ .pc_name = "WRITE",
+ },
+ [NFS3PROC_CREATE] = {
+ .pc_func = nfsd3_proc_create,
+@@ -793,9 +905,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_createargs),
++ .pc_argzero = sizeof(struct nfsd3_createargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
++ .pc_name = "CREATE",
+ },
+ [NFS3PROC_MKDIR] = {
+ .pc_func = nfsd3_proc_mkdir,
+@@ -803,9 +917,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_mkdirargs),
++ .pc_argzero = sizeof(struct nfsd3_mkdirargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
++ .pc_name = "MKDIR",
+ },
+ [NFS3PROC_SYMLINK] = {
+ .pc_func = nfsd3_proc_symlink,
+@@ -813,9 +929,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_symlinkargs),
++ .pc_argzero = sizeof(struct nfsd3_symlinkargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
++ .pc_name = "SYMLINK",
+ },
+ [NFS3PROC_MKNOD] = {
+ .pc_func = nfsd3_proc_mknod,
+@@ -823,9 +941,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_createres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_mknodargs),
++ .pc_argzero = sizeof(struct nfsd3_mknodargs),
+ .pc_ressize = sizeof(struct nfsd3_createres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+(1+FH+pAT)+WC,
++ .pc_name = "MKNOD",
+ },
+ [NFS3PROC_REMOVE] = {
+ .pc_func = nfsd3_proc_remove,
+@@ -833,9 +953,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_wccstatres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_diropargs),
++ .pc_argzero = sizeof(struct nfsd3_diropargs),
+ .pc_ressize = sizeof(struct nfsd3_wccstatres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC,
++ .pc_name = "REMOVE",
+ },
+ [NFS3PROC_RMDIR] = {
+ .pc_func = nfsd3_proc_rmdir,
+@@ -843,9 +965,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_wccstatres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_diropargs),
++ .pc_argzero = sizeof(struct nfsd3_diropargs),
+ .pc_ressize = sizeof(struct nfsd3_wccstatres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC,
++ .pc_name = "RMDIR",
+ },
+ [NFS3PROC_RENAME] = {
+ .pc_func = nfsd3_proc_rename,
+@@ -853,9 +977,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_renameres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_renameargs),
++ .pc_argzero = sizeof(struct nfsd3_renameargs),
+ .pc_ressize = sizeof(struct nfsd3_renameres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+WC+WC,
++ .pc_name = "RENAME",
+ },
+ [NFS3PROC_LINK] = {
+ .pc_func = nfsd3_proc_link,
+@@ -863,9 +989,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_linkres,
+ .pc_release = nfs3svc_release_fhandle2,
+ .pc_argsize = sizeof(struct nfsd3_linkargs),
++ .pc_argzero = sizeof(struct nfsd3_linkargs),
+ .pc_ressize = sizeof(struct nfsd3_linkres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+pAT+WC,
++ .pc_name = "LINK",
+ },
+ [NFS3PROC_READDIR] = {
+ .pc_func = nfsd3_proc_readdir,
+@@ -873,8 +1001,10 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_readdirres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_readdirargs),
++ .pc_argzero = sizeof(struct nfsd3_readdirargs),
+ .pc_ressize = sizeof(struct nfsd3_readdirres),
+ .pc_cachetype = RC_NOCACHE,
++ .pc_name = "READDIR",
+ },
+ [NFS3PROC_READDIRPLUS] = {
+ .pc_func = nfsd3_proc_readdirplus,
+@@ -882,35 +1012,43 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_readdirres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_readdirplusargs),
++ .pc_argzero = sizeof(struct nfsd3_readdirplusargs),
+ .pc_ressize = sizeof(struct nfsd3_readdirres),
+ .pc_cachetype = RC_NOCACHE,
++ .pc_name = "READDIRPLUS",
+ },
+ [NFS3PROC_FSSTAT] = {
+ .pc_func = nfsd3_proc_fsstat,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_fsstatres,
+ .pc_argsize = sizeof(struct nfsd3_fhandleargs),
++ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
+ .pc_ressize = sizeof(struct nfsd3_fsstatres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+2*6+1,
++ .pc_name = "FSSTAT",
+ },
+ [NFS3PROC_FSINFO] = {
+ .pc_func = nfsd3_proc_fsinfo,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_fsinfores,
+ .pc_argsize = sizeof(struct nfsd3_fhandleargs),
++ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
+ .pc_ressize = sizeof(struct nfsd3_fsinfores),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+12,
++ .pc_name = "FSINFO",
+ },
+ [NFS3PROC_PATHCONF] = {
+ .pc_func = nfsd3_proc_pathconf,
+ .pc_decode = nfs3svc_decode_fhandleargs,
+ .pc_encode = nfs3svc_encode_pathconfres,
+ .pc_argsize = sizeof(struct nfsd3_fhandleargs),
++ .pc_argzero = sizeof(struct nfsd3_fhandleargs),
+ .pc_ressize = sizeof(struct nfsd3_pathconfres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+pAT+6,
++ .pc_name = "PATHCONF",
+ },
+ [NFS3PROC_COMMIT] = {
+ .pc_func = nfsd3_proc_commit,
+@@ -918,9 +1056,11 @@ static const struct svc_procedure nfsd_procedures3[22] = {
+ .pc_encode = nfs3svc_encode_commitres,
+ .pc_release = nfs3svc_release_fhandle,
+ .pc_argsize = sizeof(struct nfsd3_commitargs),
++ .pc_argzero = sizeof(struct nfsd3_commitargs),
+ .pc_ressize = sizeof(struct nfsd3_commitres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+WC+2,
++ .pc_name = "COMMIT",
+ },
+ };
+
+diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
+index 716566da400e1..3308dd671ef0b 100644
+--- a/fs/nfsd/nfs3xdr.c
++++ b/fs/nfsd/nfs3xdr.c
+@@ -14,13 +14,26 @@
+ #include "netns.h"
+ #include "vfs.h"
+
+-#define NFSDDBG_FACILITY NFSDDBG_XDR
++/*
++ * Force construction of an empty post-op attr
++ */
++static const struct svc_fh nfs3svc_null_fh = {
++ .fh_no_wcc = true,
++};
+
++/*
++ * time_delta. {1, 0} means the server is accurate only
++ * to the nearest second.
++ */
++static const struct timespec64 nfs3svc_time_delta = {
++ .tv_sec = 1,
++ .tv_nsec = 0,
++};
+
+ /*
+ * Mapping of S_IF* types to NFS file types
+ */
+-static u32 nfs3_ftypes[] = {
++static const u32 nfs3_ftypes[] = {
+ NF3NON, NF3FIFO, NF3CHR, NF3BAD,
+ NF3DIR, NF3BAD, NF3BLK, NF3BAD,
+ NF3REG, NF3BAD, NF3LNK, NF3BAD,
+@@ -29,824 +42,938 @@ static u32 nfs3_ftypes[] = {
+
+
+ /*
+- * XDR functions for basic NFS types
++ * Basic NFSv3 data types (RFC 1813 Sections 2.5 and 2.6)
+ */
++
+ static __be32 *
+-encode_time3(__be32 *p, struct timespec64 *time)
++encode_nfstime3(__be32 *p, const struct timespec64 *time)
+ {
+- *p++ = htonl((u32) time->tv_sec); *p++ = htonl(time->tv_nsec);
++ *p++ = cpu_to_be32((u32)time->tv_sec);
++ *p++ = cpu_to_be32(time->tv_nsec);
++
+ return p;
+ }
+
+-static __be32 *
+-decode_time3(__be32 *p, struct timespec64 *time)
++static bool
++svcxdr_decode_nfstime3(struct xdr_stream *xdr, struct timespec64 *timep)
+ {
+- time->tv_sec = ntohl(*p++);
+- time->tv_nsec = ntohl(*p++);
+- return p;
++ __be32 *p;
++
++ p = xdr_inline_decode(xdr, XDR_UNIT * 2);
++ if (!p)
++ return false;
++ timep->tv_sec = be32_to_cpup(p++);
++ timep->tv_nsec = be32_to_cpup(p);
++
++ return true;
+ }
+
+-static __be32 *
+-decode_fh(__be32 *p, struct svc_fh *fhp)
++/**
++ * svcxdr_decode_nfs_fh3 - Decode an NFSv3 file handle
++ * @xdr: XDR stream positioned at an undecoded NFSv3 FH
++ * @fhp: OUT: filled-in server file handle
++ *
++ * Return values:
++ * %false: The encoded file handle was not valid
++ * %true: @fhp has been initialized
++ */
++bool
++svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp)
+ {
+- unsigned int size;
++ __be32 *p;
++ u32 size;
++
++ if (xdr_stream_decode_u32(xdr, &size) < 0)
++ return false;
++ if (size == 0 || size > NFS3_FHSIZE)
++ return false;
++ p = xdr_inline_decode(xdr, size);
++ if (!p)
++ return false;
+ fh_init(fhp, NFS3_FHSIZE);
+- size = ntohl(*p++);
+- if (size > NFS3_FHSIZE)
+- return NULL;
+-
+- memcpy(&fhp->fh_handle.fh_base, p, size);
+ fhp->fh_handle.fh_size = size;
+- return p + XDR_QUADLEN(size);
++ memcpy(&fhp->fh_handle.fh_raw, p, size);
++
++ return true;
+ }
+
+-/* Helper function for NFSv3 ACL code */
+-__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp)
++/**
++ * svcxdr_encode_nfsstat3 - Encode an NFSv3 status code
++ * @xdr: XDR stream
++ * @status: status value to encode
++ *
++ * Return values:
++ * %false: Send buffer space was exhausted
++ * %true: Success
++ */
++bool
++svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status)
+ {
+- return decode_fh(p, fhp);
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, sizeof(status));
++ if (!p)
++ return false;
++ *p = status;
++
++ return true;
+ }
+
+-static __be32 *
+-encode_fh(__be32 *p, struct svc_fh *fhp)
++static bool
++svcxdr_encode_nfs_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
++{
++ u32 size = fhp->fh_handle.fh_size;
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, XDR_UNIT + size);
++ if (!p)
++ return false;
++ *p++ = cpu_to_be32(size);
++ if (size)
++ p[XDR_QUADLEN(size) - 1] = 0;
++ memcpy(p, &fhp->fh_handle.fh_raw, size);
++
++ return true;
++}
++
++static bool
++svcxdr_encode_post_op_fh3(struct xdr_stream *xdr, const struct svc_fh *fhp)
+ {
+- unsigned int size = fhp->fh_handle.fh_size;
+- *p++ = htonl(size);
+- if (size) p[XDR_QUADLEN(size)-1]=0;
+- memcpy(p, &fhp->fh_handle.fh_base, size);
+- return p + XDR_QUADLEN(size);
++ if (xdr_stream_encode_item_present(xdr) < 0)
++ return false;
++ if (!svcxdr_encode_nfs_fh3(xdr, fhp))
++ return false;
++
++ return true;
+ }
+
+-/*
+- * Decode a file name and make sure that the path contains
+- * no slashes or null bytes.
+- */
+-static __be32 *
+-decode_filename(__be32 *p, char **namp, unsigned int *lenp)
++static bool
++svcxdr_encode_cookieverf3(struct xdr_stream *xdr, const __be32 *verf)
++{
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, NFS3_COOKIEVERFSIZE);
++ if (!p)
++ return false;
++ memcpy(p, verf, NFS3_COOKIEVERFSIZE);
++
++ return true;
++}
++
++static bool
++svcxdr_encode_writeverf3(struct xdr_stream *xdr, const __be32 *verf)
++{
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, NFS3_WRITEVERFSIZE);
++ if (!p)
++ return false;
++ memcpy(p, verf, NFS3_WRITEVERFSIZE);
++
++ return true;
++}
++
++static bool
++svcxdr_decode_filename3(struct xdr_stream *xdr, char **name, unsigned int *len)
+ {
+- char *name;
+- unsigned int i;
+-
+- if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS3_MAXNAMLEN)) != NULL) {
+- for (i = 0, name = *namp; i < *lenp; i++, name++) {
+- if (*name == '\0' || *name == '/')
+- return NULL;
+- }
++ u32 size, i;
++ __be32 *p;
++ char *c;
++
++ if (xdr_stream_decode_u32(xdr, &size) < 0)
++ return false;
++ if (size == 0 || size > NFS3_MAXNAMLEN)
++ return false;
++ p = xdr_inline_decode(xdr, size);
++ if (!p)
++ return false;
++
++ *len = size;
++ *name = (char *)p;
++ for (i = 0, c = *name; i < size; i++, c++) {
++ if (*c == '\0' || *c == '/')
++ return false;
+ }
+
+- return p;
++ return true;
+ }
+
+-static __be32 *
+-decode_sattr3(__be32 *p, struct iattr *iap, struct user_namespace *userns)
++static bool
++svcxdr_decode_diropargs3(struct xdr_stream *xdr, struct svc_fh *fhp,
++ char **name, unsigned int *len)
++{
++ return svcxdr_decode_nfs_fh3(xdr, fhp) &&
++ svcxdr_decode_filename3(xdr, name, len);
++}
++
++static bool
++svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ struct iattr *iap)
+ {
+- u32 tmp;
++ u32 set_it;
+
+ iap->ia_valid = 0;
+
+- if (*p++) {
++ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
++ return false;
++ if (set_it) {
++ u32 mode;
++
++ if (xdr_stream_decode_u32(xdr, &mode) < 0)
++ return false;
+ iap->ia_valid |= ATTR_MODE;
+- iap->ia_mode = ntohl(*p++);
++ iap->ia_mode = mode;
+ }
+- if (*p++) {
+- iap->ia_uid = make_kuid(userns, ntohl(*p++));
++ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
++ return false;
++ if (set_it) {
++ u32 uid;
++
++ if (xdr_stream_decode_u32(xdr, &uid) < 0)
++ return false;
++ iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), uid);
+ if (uid_valid(iap->ia_uid))
+ iap->ia_valid |= ATTR_UID;
+ }
+- if (*p++) {
+- iap->ia_gid = make_kgid(userns, ntohl(*p++));
++ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
++ return false;
++ if (set_it) {
++ u32 gid;
++
++ if (xdr_stream_decode_u32(xdr, &gid) < 0)
++ return false;
++ iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), gid);
+ if (gid_valid(iap->ia_gid))
+ iap->ia_valid |= ATTR_GID;
+ }
+- if (*p++) {
+- u64 newsize;
++ if (xdr_stream_decode_bool(xdr, &set_it) < 0)
++ return false;
++ if (set_it) {
++ u64 newsize;
+
++ if (xdr_stream_decode_u64(xdr, &newsize) < 0)
++ return false;
+ iap->ia_valid |= ATTR_SIZE;
+- p = xdr_decode_hyper(p, &newsize);
+- iap->ia_size = min_t(u64, newsize, NFS_OFFSET_MAX);
++ iap->ia_size = newsize;
+ }
+- if ((tmp = ntohl(*p++)) == 1) { /* set to server time */
++ if (xdr_stream_decode_u32(xdr, &set_it) < 0)
++ return false;
++ switch (set_it) {
++ case DONT_CHANGE:
++ break;
++ case SET_TO_SERVER_TIME:
+ iap->ia_valid |= ATTR_ATIME;
+- } else if (tmp == 2) { /* set to client time */
++ break;
++ case SET_TO_CLIENT_TIME:
++ if (!svcxdr_decode_nfstime3(xdr, &iap->ia_atime))
++ return false;
+ iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
+- iap->ia_atime.tv_sec = ntohl(*p++);
+- iap->ia_atime.tv_nsec = ntohl(*p++);
++ break;
++ default:
++ return false;
+ }
+- if ((tmp = ntohl(*p++)) == 1) { /* set to server time */
++ if (xdr_stream_decode_u32(xdr, &set_it) < 0)
++ return false;
++ switch (set_it) {
++ case DONT_CHANGE:
++ break;
++ case SET_TO_SERVER_TIME:
+ iap->ia_valid |= ATTR_MTIME;
+- } else if (tmp == 2) { /* set to client time */
++ break;
++ case SET_TO_CLIENT_TIME:
++ if (!svcxdr_decode_nfstime3(xdr, &iap->ia_mtime))
++ return false;
+ iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
+- iap->ia_mtime.tv_sec = ntohl(*p++);
+- iap->ia_mtime.tv_nsec = ntohl(*p++);
++ break;
++ default:
++ return false;
+ }
+- return p;
++
++ return true;
+ }
+
+-static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp)
++static bool
++svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args)
+ {
+- u64 f;
+- switch(fsid_source(fhp)) {
+- default:
+- case FSIDSOURCE_DEV:
+- p = xdr_encode_hyper(p, (u64)huge_encode_dev
+- (fhp->fh_dentry->d_sb->s_dev));
+- break;
+- case FSIDSOURCE_FSID:
+- p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid);
+- break;
+- case FSIDSOURCE_UUID:
+- f = ((u64*)fhp->fh_export->ex_uuid)[0];
+- f ^= ((u64*)fhp->fh_export->ex_uuid)[1];
+- p = xdr_encode_hyper(p, f);
+- break;
+- }
+- return p;
++ __be32 *p;
++ u32 check;
++
++ if (xdr_stream_decode_bool(xdr, &check) < 0)
++ return false;
++ if (check) {
++ p = xdr_inline_decode(xdr, XDR_UNIT * 2);
++ if (!p)
++ return false;
++ args->check_guard = 1;
++ args->guardtime = be32_to_cpup(p);
++ } else
++ args->check_guard = 0;
++
++ return true;
+ }
+
+-static __be32 *
+-encode_fattr3(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
+- struct kstat *stat)
++static bool
++svcxdr_decode_specdata3(struct xdr_stream *xdr, struct nfsd3_mknodargs *args)
+ {
+- struct user_namespace *userns = nfsd_user_namespace(rqstp);
+- *p++ = htonl(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
+- *p++ = htonl((u32) (stat->mode & S_IALLUGO));
+- *p++ = htonl((u32) stat->nlink);
+- *p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
+- *p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
+- if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN) {
+- p = xdr_encode_hyper(p, (u64) NFS3_MAXPATHLEN);
+- } else {
+- p = xdr_encode_hyper(p, (u64) stat->size);
+- }
+- p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
+- *p++ = htonl((u32) MAJOR(stat->rdev));
+- *p++ = htonl((u32) MINOR(stat->rdev));
+- p = encode_fsid(p, fhp);
+- p = xdr_encode_hyper(p, stat->ino);
+- p = encode_time3(p, &stat->atime);
+- p = encode_time3(p, &stat->mtime);
+- p = encode_time3(p, &stat->ctime);
++ __be32 *p;
+
+- return p;
++ p = xdr_inline_decode(xdr, XDR_UNIT * 2);
++ if (!p)
++ return false;
++ args->major = be32_to_cpup(p++);
++ args->minor = be32_to_cpup(p);
++
++ return true;
+ }
+
+-static __be32 *
+-encode_saved_post_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
++static bool
++svcxdr_decode_devicedata3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ struct nfsd3_mknodargs *args)
+ {
+- /* Attributes to follow */
+- *p++ = xdr_one;
+- return encode_fattr3(rqstp, p, fhp, &fhp->fh_post_attr);
++ return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) &&
++ svcxdr_decode_specdata3(xdr, args);
+ }
+
+-/*
+- * Encode post-operation attributes.
+- * The inode may be NULL if the call failed because of a stale file
+- * handle. In this case, no attributes are returned.
+- */
+-static __be32 *
+-encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
++static bool
++svcxdr_encode_fattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ const struct svc_fh *fhp, const struct kstat *stat)
+ {
+- struct dentry *dentry = fhp->fh_dentry;
+- if (dentry && d_really_is_positive(dentry)) {
+- __be32 err;
+- struct kstat stat;
+-
+- err = fh_getattr(fhp, &stat);
+- if (!err) {
+- *p++ = xdr_one; /* attributes follow */
+- lease_get_mtime(d_inode(dentry), &stat.mtime);
+- return encode_fattr3(rqstp, p, fhp, &stat);
+- }
++ struct user_namespace *userns = nfsd_user_namespace(rqstp);
++ __be32 *p;
++ u64 fsid;
++
++ p = xdr_reserve_space(xdr, XDR_UNIT * 21);
++ if (!p)
++ return false;
++
++ *p++ = cpu_to_be32(nfs3_ftypes[(stat->mode & S_IFMT) >> 12]);
++ *p++ = cpu_to_be32((u32)(stat->mode & S_IALLUGO));
++ *p++ = cpu_to_be32((u32)stat->nlink);
++ *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid));
++ *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid));
++ if (S_ISLNK(stat->mode) && stat->size > NFS3_MAXPATHLEN)
++ p = xdr_encode_hyper(p, (u64)NFS3_MAXPATHLEN);
++ else
++ p = xdr_encode_hyper(p, (u64)stat->size);
++
++ /* used */
++ p = xdr_encode_hyper(p, ((u64)stat->blocks) << 9);
++
++ /* rdev */
++ *p++ = cpu_to_be32((u32)MAJOR(stat->rdev));
++ *p++ = cpu_to_be32((u32)MINOR(stat->rdev));
++
++ switch(fsid_source(fhp)) {
++ case FSIDSOURCE_FSID:
++ fsid = (u64)fhp->fh_export->ex_fsid;
++ break;
++ case FSIDSOURCE_UUID:
++ fsid = ((u64 *)fhp->fh_export->ex_uuid)[0];
++ fsid ^= ((u64 *)fhp->fh_export->ex_uuid)[1];
++ break;
++ default:
++ fsid = (u64)huge_encode_dev(fhp->fh_dentry->d_sb->s_dev);
+ }
+- *p++ = xdr_zero;
+- return p;
++ p = xdr_encode_hyper(p, fsid);
++
++ /* fileid */
++ p = xdr_encode_hyper(p, stat->ino);
++
++ p = encode_nfstime3(p, &stat->atime);
++ p = encode_nfstime3(p, &stat->mtime);
++ encode_nfstime3(p, &stat->ctime);
++
++ return true;
+ }
+
+-/* Helper for NFSv3 ACLs */
+-__be32 *
+-nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
++static bool
++svcxdr_encode_wcc_attr(struct xdr_stream *xdr, const struct svc_fh *fhp)
+ {
+- return encode_post_op_attr(rqstp, p, fhp);
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, XDR_UNIT * 6);
++ if (!p)
++ return false;
++ p = xdr_encode_hyper(p, (u64)fhp->fh_pre_size);
++ p = encode_nfstime3(p, &fhp->fh_pre_mtime);
++ encode_nfstime3(p, &fhp->fh_pre_ctime);
++
++ return true;
+ }
+
+-/*
+- * Enocde weak cache consistency data
+- */
+-static __be32 *
+-encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp)
++static bool
++svcxdr_encode_pre_op_attr(struct xdr_stream *xdr, const struct svc_fh *fhp)
+ {
+- struct dentry *dentry = fhp->fh_dentry;
+-
+- if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) {
+- if (fhp->fh_pre_saved) {
+- *p++ = xdr_one;
+- p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size);
+- p = encode_time3(p, &fhp->fh_pre_mtime);
+- p = encode_time3(p, &fhp->fh_pre_ctime);
+- } else {
+- *p++ = xdr_zero;
+- }
+- return encode_saved_post_attr(rqstp, p, fhp);
++ if (!fhp->fh_pre_saved) {
++ if (xdr_stream_encode_item_absent(xdr) < 0)
++ return false;
++ return true;
+ }
+- /* no pre- or post-attrs */
+- *p++ = xdr_zero;
+- return encode_post_op_attr(rqstp, p, fhp);
++
++ if (xdr_stream_encode_item_present(xdr) < 0)
++ return false;
++ return svcxdr_encode_wcc_attr(xdr, fhp);
+ }
+
+-/*
+- * Fill in the pre_op attr for the wcc data
++/**
++ * svcxdr_encode_post_op_attr - Encode NFSv3 post-op attributes
++ * @rqstp: Context of a completed RPC transaction
++ * @xdr: XDR stream
++ * @fhp: File handle to encode
++ *
++ * Return values:
++ * %false: Send buffer space was exhausted
++ * %true: Success
+ */
+-void fill_pre_wcc(struct svc_fh *fhp)
++bool
++svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ const struct svc_fh *fhp)
+ {
+- struct inode *inode;
+- struct kstat stat;
+- __be32 err;
++ struct dentry *dentry = fhp->fh_dentry;
++ struct kstat stat;
+
+- if (fhp->fh_pre_saved)
+- return;
++ /*
++ * The inode may be NULL if the call failed because of a
++ * stale file handle. In this case, no attributes are
++ * returned.
++ */
++ if (fhp->fh_no_wcc || !dentry || !d_really_is_positive(dentry))
++ goto no_post_op_attrs;
++ if (fh_getattr(fhp, &stat) != nfs_ok)
++ goto no_post_op_attrs;
+
+- inode = d_inode(fhp->fh_dentry);
+- err = fh_getattr(fhp, &stat);
+- if (err) {
+- /* Grab the times from inode anyway */
+- stat.mtime = inode->i_mtime;
+- stat.ctime = inode->i_ctime;
+- stat.size = inode->i_size;
+- }
++ if (xdr_stream_encode_item_present(xdr) < 0)
++ return false;
++ lease_get_mtime(d_inode(dentry), &stat.mtime);
++ if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &stat))
++ return false;
+
+- fhp->fh_pre_mtime = stat.mtime;
+- fhp->fh_pre_ctime = stat.ctime;
+- fhp->fh_pre_size = stat.size;
+- fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
+- fhp->fh_pre_saved = true;
++ return true;
++
++no_post_op_attrs:
++ return xdr_stream_encode_item_absent(xdr) > 0;
+ }
+
+ /*
+- * Fill in the post_op attr for the wcc data
++ * Encode weak cache consistency data
+ */
+-void fill_post_wcc(struct svc_fh *fhp)
++static bool
++svcxdr_encode_wcc_data(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ const struct svc_fh *fhp)
+ {
+- __be32 err;
+-
+- if (fhp->fh_post_saved)
+- printk("nfsd: inode locked twice during operation.\n");
+-
+- err = fh_getattr(fhp, &fhp->fh_post_attr);
+- fhp->fh_post_change = nfsd4_change_attribute(&fhp->fh_post_attr,
+- d_inode(fhp->fh_dentry));
+- if (err) {
+- fhp->fh_post_saved = false;
+- /* Grab the ctime anyway - set_change_info might use it */
+- fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
+- } else
+- fhp->fh_post_saved = true;
++ struct dentry *dentry = fhp->fh_dentry;
++
++ if (!dentry || !d_really_is_positive(dentry) || !fhp->fh_post_saved)
++ goto neither;
++
++ /* before */
++ if (!svcxdr_encode_pre_op_attr(xdr, fhp))
++ return false;
++
++ /* after */
++ if (xdr_stream_encode_item_present(xdr) < 0)
++ return false;
++ if (!svcxdr_encode_fattr3(rqstp, xdr, fhp, &fhp->fh_post_attr))
++ return false;
++
++ return true;
++
++neither:
++ if (xdr_stream_encode_item_absent(xdr) < 0)
++ return false;
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, fhp))
++ return false;
++
++ return true;
+ }
+
+ /*
+ * XDR decode functions
+ */
+-int
+-nfs3svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return 1;
+-}
+
+-int
+-nfs3svc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_fhandle *args = rqstp->rq_argp;
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_nfs_fh3(xdr, &args->fh);
+ }
+
+-int
+-nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_sattrargs *args = rqstp->rq_argp;
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+-
+- if ((args->check_guard = ntohl(*p++)) != 0) {
+- struct timespec64 time;
+- p = decode_time3(p, &time);
+- args->guardtime = time.tv_sec;
+- }
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_nfs_fh3(xdr, &args->fh) &&
++ svcxdr_decode_sattr3(rqstp, xdr, &args->attrs) &&
++ svcxdr_decode_sattrguard3(xdr, args);
+ }
+
+-int
+-nfs3svc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_diropargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->fh))
+- || !(p = decode_filename(p, &args->name, &args->len)))
+- return 0;
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len);
+ }
+
+-int
+-nfs3svc_decode_accessargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_accessargs *args = rqstp->rq_argp;
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- args->access = ntohl(*p++);
++ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->access) < 0)
++ return false;
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_readargs *args = rqstp->rq_argp;
+- unsigned int len;
+- int v;
+- u32 max_blocksize = svc_max_payload(rqstp);
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- p = xdr_decode_hyper(p, &args->offset);
++ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
++ return false;
+
+- args->count = ntohl(*p++);
+- len = min(args->count, max_blocksize);
+-
+- /* set up the kvec */
+- v=0;
+- while (len > 0) {
+- struct page *p = *(rqstp->rq_next_page++);
+-
+- rqstp->rq_vec[v].iov_base = page_address(p);
+- rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+- len -= rqstp->rq_vec[v].iov_len;
+- v++;
+- }
+- args->vlen = v;
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_writeargs *args = rqstp->rq_argp;
+- unsigned int len, hdr, dlen;
+ u32 max_blocksize = svc_max_payload(rqstp);
+- struct kvec *head = rqstp->rq_arg.head;
+- struct kvec *tail = rqstp->rq_arg.tail;
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- p = xdr_decode_hyper(p, &args->offset);
+-
+- args->count = ntohl(*p++);
+- args->stable = ntohl(*p++);
+- len = args->len = ntohl(*p++);
+- if ((void *)p > head->iov_base + head->iov_len)
+- return 0;
+- /*
+- * The count must equal the amount of data passed.
+- */
+- if (args->count != args->len)
+- return 0;
++ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->stable) < 0)
++ return false;
+
+- /*
+- * Check to make sure that we got the right number of
+- * bytes.
+- */
+- hdr = (void*)p - head->iov_base;
+- dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr;
+- /*
+- * Round the length of the data which was specified up to
+- * the next multiple of XDR units and then compare that
+- * against the length which was actually received.
+- * Note that when RPCSEC/GSS (for example) is used, the
+- * data buffer can be padded so dlen might be larger
+- * than required. It must never be smaller.
+- */
+- if (dlen < XDR_QUADLEN(len)*4)
+- return 0;
++ /* opaque data */
++ if (xdr_stream_decode_u32(xdr, &args->len) < 0)
++ return false;
+
++ /* request sanity */
++ if (args->count != args->len)
++ return false;
+ if (args->count > max_blocksize) {
+ args->count = max_blocksize;
+- len = args->len = max_blocksize;
++ args->len = max_blocksize;
+ }
+
+- args->first.iov_base = (void *)p;
+- args->first.iov_len = head->iov_len - hdr;
+- return 1;
++ return xdr_stream_subsegment(xdr, &args->payload, args->count);
+ }
+
+-int
+-nfs3svc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_createargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->fh))
+- || !(p = decode_filename(p, &args->name, &args->len)))
+- return 0;
+-
+- switch (args->createmode = ntohl(*p++)) {
++ if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->createmode) < 0)
++ return false;
++ switch (args->createmode) {
+ case NFS3_CREATE_UNCHECKED:
+ case NFS3_CREATE_GUARDED:
+- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+- break;
++ return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
+ case NFS3_CREATE_EXCLUSIVE:
+- args->verf = p;
+- p += 2;
++ args->verf = xdr_inline_decode(xdr, NFS3_CREATEVERFSIZE);
++ if (!args->verf)
++ return false;
+ break;
+ default:
+- return 0;
++ return false;
+ }
+-
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_createargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->fh)) ||
+- !(p = decode_filename(p, &args->name, &args->len)))
+- return 0;
+- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_diropargs3(xdr, &args->fh,
++ &args->name, &args->len) &&
++ svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
+ }
+
+-int
+-nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_symlinkargs *args = rqstp->rq_argp;
+- char *base = (char *)p;
+- size_t dlen;
+-
+- if (!(p = decode_fh(p, &args->ffh)) ||
+- !(p = decode_filename(p, &args->fname, &args->flen)))
+- return 0;
+- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+-
+- args->tlen = ntohl(*p++);
+-
+- args->first.iov_base = p;
+- args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+- args->first.iov_len -= (char *)p - base;
++ struct kvec *head = rqstp->rq_arg.head;
+
+- dlen = args->first.iov_len + rqstp->rq_arg.page_len +
+- rqstp->rq_arg.tail[0].iov_len;
+- if (dlen < XDR_QUADLEN(args->tlen) << 2)
+- return 0;
+- return 1;
++ if (!svcxdr_decode_diropargs3(xdr, &args->ffh, &args->fname, &args->flen))
++ return false;
++ if (!svcxdr_decode_sattr3(rqstp, xdr, &args->attrs))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
++ return false;
++
++ /* symlink_data */
++ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
++ args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
++ return args->first.iov_base != NULL;
+ }
+
+-int
+-nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_mknodargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->fh))
+- || !(p = decode_filename(p, &args->name, &args->len)))
+- return 0;
+-
+- args->ftype = ntohl(*p++);
+-
+- if (args->ftype == NF3BLK || args->ftype == NF3CHR
+- || args->ftype == NF3SOCK || args->ftype == NF3FIFO)
+- p = decode_sattr3(p, &args->attrs, nfsd_user_namespace(rqstp));
+-
+- if (args->ftype == NF3BLK || args->ftype == NF3CHR) {
+- args->major = ntohl(*p++);
+- args->minor = ntohl(*p++);
++ if (!svcxdr_decode_diropargs3(xdr, &args->fh, &args->name, &args->len))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->ftype) < 0)
++ return false;
++ switch (args->ftype) {
++ case NF3CHR:
++ case NF3BLK:
++ return svcxdr_decode_devicedata3(rqstp, xdr, args);
++ case NF3SOCK:
++ case NF3FIFO:
++ return svcxdr_decode_sattr3(rqstp, xdr, &args->attrs);
++ case NF3REG:
++ case NF3DIR:
++ case NF3LNK:
++ /* Valid XDR but illegal file types */
++ break;
++ default:
++ return false;
+ }
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+-int
+-nfs3svc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_renameargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->ffh))
+- || !(p = decode_filename(p, &args->fname, &args->flen))
+- || !(p = decode_fh(p, &args->tfh))
+- || !(p = decode_filename(p, &args->tname, &args->tlen)))
+- return 0;
+-
+- return xdr_argsize_check(rqstp, p);
+-}
+-
+-int
+-nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nfsd3_readlinkargs *args = rqstp->rq_argp;
+-
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- args->buffer = page_address(*(rqstp->rq_next_page++));
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_diropargs3(xdr, &args->ffh,
++ &args->fname, &args->flen) &&
++ svcxdr_decode_diropargs3(xdr, &args->tfh,
++ &args->tname, &args->tlen);
+ }
+
+-int
+-nfs3svc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_linkargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->ffh))
+- || !(p = decode_fh(p, &args->tfh))
+- || !(p = decode_filename(p, &args->tname, &args->tlen)))
+- return 0;
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_nfs_fh3(xdr, &args->ffh) &&
++ svcxdr_decode_diropargs3(xdr, &args->tfh,
++ &args->tname, &args->tlen);
+ }
+
+-int
+-nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_readdirargs *args = rqstp->rq_argp;
+- int len;
+- u32 max_blocksize = svc_max_payload(rqstp);
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- p = xdr_decode_hyper(p, &args->cookie);
+- args->verf = p; p += 2;
+- args->dircount = ~0;
+- args->count = ntohl(*p++);
+- len = args->count = min_t(u32, args->count, max_blocksize);
+-
+- while (len > 0) {
+- struct page *p = *(rqstp->rq_next_page++);
+- if (!args->buffer)
+- args->buffer = page_address(p);
+- len -= PAGE_SIZE;
+- }
+-
+- return xdr_argsize_check(rqstp, p);
++ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
++ return false;
++ args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
++ if (!args->verf)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
++ return false;
++
++ return true;
+ }
+
+-int
+-nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_readdirargs *args = rqstp->rq_argp;
+- int len;
+- u32 max_blocksize = svc_max_payload(rqstp);
+-
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- p = xdr_decode_hyper(p, &args->cookie);
+- args->verf = p; p += 2;
+- args->dircount = ntohl(*p++);
+- args->count = ntohl(*p++);
+-
+- len = args->count = min(args->count, max_blocksize);
+- while (len > 0) {
+- struct page *p = *(rqstp->rq_next_page++);
+- if (!args->buffer)
+- args->buffer = page_address(p);
+- len -= PAGE_SIZE;
+- }
+-
+- return xdr_argsize_check(rqstp, p);
++ u32 dircount;
++
++ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u64(xdr, &args->cookie) < 0)
++ return false;
++ args->verf = xdr_inline_decode(xdr, NFS3_COOKIEVERFSIZE);
++ if (!args->verf)
++ return false;
++ /* dircount is ignored */
++ if (xdr_stream_decode_u32(xdr, &dircount) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
++ return false;
++
++ return true;
+ }
+
+-int
+-nfs3svc_decode_commitargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_commitargs *args = rqstp->rq_argp;
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- p = xdr_decode_hyper(p, &args->offset);
+- args->count = ntohl(*p++);
+
+- return xdr_argsize_check(rqstp, p);
++ if (!svcxdr_decode_nfs_fh3(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u64(xdr, &args->offset) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
++ return false;
++
++ return true;
+ }
+
+ /*
+ * XDR encode functions
+ */
+
+-int
+-nfs3svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return xdr_ressize_check(rqstp, p);
+-}
+-
+ /* GETATTR */
+-int
+-nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- if (resp->status == 0) {
+- lease_get_mtime(d_inode(resp->fh.fh_dentry),
+- &resp->stat.mtime);
+- p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat);
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime);
++ if (!svcxdr_encode_fattr3(rqstp, xdr, &resp->fh, &resp->stat))
++ return false;
++ break;
+ }
+- return xdr_ressize_check(rqstp, p);
++
++ return true;
+ }
+
+ /* SETATTR, REMOVE, RMDIR */
+-int
+-nfs3svc_encode_wccstat(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_attrstat *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- p = encode_wcc_data(rqstp, p, &resp->fh);
+- return xdr_ressize_check(rqstp, p);
++ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
++ svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh);
+ }
+
+ /* LOOKUP */
+-int
+-nfs3svc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_diropres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- if (resp->status == 0) {
+- p = encode_fh(p, &resp->fh);
+- p = encode_post_op_attr(rqstp, p, &resp->fh);
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_nfs_fh3(xdr, &resp->fh))
++ return false;
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->dirfh))
++ return false;
+ }
+- p = encode_post_op_attr(rqstp, p, &resp->dirfh);
+- return xdr_ressize_check(rqstp, p);
++
++ return true;
+ }
+
+ /* ACCESS */
+-int
+-nfs3svc_encode_accessres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_accessres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- p = encode_post_op_attr(rqstp, p, &resp->fh);
+- if (resp->status == 0)
+- *p++ = htonl(resp->access);
+- return xdr_ressize_check(rqstp, p);
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->access) < 0)
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ }
++
++ return true;
+ }
+
+ /* READLINK */
+-int
+-nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_readlinkres *resp = rqstp->rq_resp;
++ struct kvec *head = rqstp->rq_res.head;
++
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->len) < 0)
++ return false;
++ xdr_write_pages(xdr, resp->pages, 0, resp->len);
++ if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ }
+
+- *p++ = resp->status;
+- p = encode_post_op_attr(rqstp, p, &resp->fh);
+- if (resp->status == 0) {
+- *p++ = htonl(resp->len);
+- xdr_ressize_check(rqstp, p);
+- rqstp->rq_res.page_len = resp->len;
+- if (resp->len & 3) {
+- /* need to pad the tail */
+- rqstp->rq_res.tail[0].iov_base = p;
+- *p = 0;
+- rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
+- }
+- return 1;
+- } else
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+ /* READ */
+-int
+-nfs3svc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_readres *resp = rqstp->rq_resp;
++ struct kvec *head = rqstp->rq_res.head;
++
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
++ return false;
++ if (xdr_stream_encode_bool(xdr, resp->eof) < 0)
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
++ return false;
++ xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
++ resp->count);
++ if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ }
+
+- *p++ = resp->status;
+- p = encode_post_op_attr(rqstp, p, &resp->fh);
+- if (resp->status == 0) {
+- *p++ = htonl(resp->count);
+- *p++ = htonl(resp->eof);
+- *p++ = htonl(resp->count); /* xdr opaque count */
+- xdr_ressize_check(rqstp, p);
+- /* now update rqstp->rq_res to reflect data as well */
+- rqstp->rq_res.page_len = resp->count;
+- if (resp->count & 3) {
+- /* need to pad the tail */
+- rqstp->rq_res.tail[0].iov_base = p;
+- *p = 0;
+- rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3);
+- }
+- return 1;
+- } else
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+ /* WRITE */
+-int
+-nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_writeres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- p = encode_wcc_data(rqstp, p, &resp->fh);
+- if (resp->status == 0) {
+- *p++ = htonl(resp->count);
+- *p++ = htonl(resp->committed);
+- *p++ = resp->verf[0];
+- *p++ = resp->verf[1];
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->committed) < 0)
++ return false;
++ if (!svcxdr_encode_writeverf3(xdr, resp->verf))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
++ return false;
+ }
+- return xdr_ressize_check(rqstp, p);
++
++ return true;
+ }
+
+ /* CREATE, MKDIR, SYMLINK, MKNOD */
+-int
+-nfs3svc_encode_createres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_diropres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- if (resp->status == 0) {
+- *p++ = xdr_one;
+- p = encode_fh(p, &resp->fh);
+- p = encode_post_op_attr(rqstp, p, &resp->fh);
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_fh3(xdr, &resp->fh))
++ return false;
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->dirfh))
++ return false;
+ }
+- p = encode_wcc_data(rqstp, p, &resp->dirfh);
+- return xdr_ressize_check(rqstp, p);
++
++ return true;
+ }
+
+ /* RENAME */
+-int
+-nfs3svc_encode_renameres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_renameres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- p = encode_wcc_data(rqstp, p, &resp->ffh);
+- p = encode_wcc_data(rqstp, p, &resp->tfh);
+- return xdr_ressize_check(rqstp, p);
++ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
++ svcxdr_encode_wcc_data(rqstp, xdr, &resp->ffh) &&
++ svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh);
+ }
+
+ /* LINK */
+-int
+-nfs3svc_encode_linkres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_linkres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- p = encode_post_op_attr(rqstp, p, &resp->fh);
+- p = encode_wcc_data(rqstp, p, &resp->tfh);
+- return xdr_ressize_check(rqstp, p);
++ return svcxdr_encode_nfsstat3(xdr, resp->status) &&
++ svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh) &&
++ svcxdr_encode_wcc_data(rqstp, xdr, &resp->tfh);
+ }
+
+ /* READDIR */
+-int
+-nfs3svc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_readdirres *resp = rqstp->rq_resp;
++ struct xdr_buf *dirlist = &resp->dirlist;
++
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ if (!svcxdr_encode_cookieverf3(xdr, resp->verf))
++ return false;
++ xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
++ /* no more entries */
++ if (xdr_stream_encode_item_absent(xdr) < 0)
++ return false;
++ if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &resp->fh))
++ return false;
++ }
+
+- *p++ = resp->status;
+- p = encode_post_op_attr(rqstp, p, &resp->fh);
+-
+- if (resp->status == 0) {
+- /* stupid readdir cookie */
+- memcpy(p, resp->verf, 8); p += 2;
+- xdr_ressize_check(rqstp, p);
+- if (rqstp->rq_res.head[0].iov_len + (2<<2) > PAGE_SIZE)
+- return 1; /*No room for trailer */
+- rqstp->rq_res.page_len = (resp->count) << 2;
+-
+- /* add the 'tail' to the end of the 'head' page - page 0. */
+- rqstp->rq_res.tail[0].iov_base = p;
+- *p++ = 0; /* no more entries */
+- *p++ = htonl(resp->common.err == nfserr_eof);
+- rqstp->rq_res.tail[0].iov_len = 2<<2;
+- return 1;
+- } else
+- return xdr_ressize_check(rqstp, p);
+-}
+-
+-static __be32 *
+-encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
+- int namlen, u64 ino)
+-{
+- *p++ = xdr_one; /* mark entry present */
+- p = xdr_encode_hyper(p, ino); /* file id */
+- p = xdr_encode_array(p, name, namlen);/* name length & name */
+-
+- cd->offset = p; /* remember pointer */
+- p = xdr_encode_hyper(p, NFS_OFFSET_MAX);/* offset of next entry */
+-
+- return p;
++ return true;
+ }
+
+ static __be32
+@@ -887,267 +1014,323 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
+ return rv;
+ }
+
+-static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino)
+-{
+- struct svc_fh *fh = &cd->scratch;
+- __be32 err;
+-
+- fh_init(fh, NFS3_FHSIZE);
+- err = compose_entry_fh(cd, fh, name, namlen, ino);
+- if (err) {
+- *p++ = 0;
+- *p++ = 0;
+- goto out;
+- }
+- p = encode_post_op_attr(cd->rqstp, p, fh);
+- *p++ = xdr_one; /* yes, a file handle follows */
+- p = encode_fh(p, fh);
+-out:
+- fh_put(fh);
+- return p;
+-}
+-
+-/*
+- * Encode a directory entry. This one works for both normal readdir
+- * and readdirplus.
+- * The normal readdir reply requires 2 (fileid) + 1 (stringlen)
+- * + string + 2 (cookie) + 1 (next) words, i.e. 6 + strlen.
+- *
+- * The readdirplus baggage is 1+21 words for post_op_attr, plus the
+- * file handle.
++/**
++ * nfs3svc_encode_cookie3 - Encode a directory offset cookie
++ * @resp: readdir result context
++ * @offset: offset cookie to encode
++ *
++ * The buffer space for the offset cookie has already been reserved
++ * by svcxdr_encode_entry3_common().
+ */
+-
+-#define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1)
+-#define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2))
+-static int
+-encode_entry(struct readdir_cd *ccd, const char *name, int namlen,
+- loff_t offset, u64 ino, unsigned int d_type, int plus)
++void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset)
+ {
+- struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres,
+- common);
+- __be32 *p = cd->buffer;
+- caddr_t curr_page_addr = NULL;
+- struct page ** page;
+- int slen; /* string (name) length */
+- int elen; /* estimated entry length in words */
+- int num_entry_words = 0; /* actual number of words */
+-
+- if (cd->offset) {
+- u64 offset64 = offset;
+-
+- if (unlikely(cd->offset1)) {
+- /* we ended up with offset on a page boundary */
+- *cd->offset = htonl(offset64 >> 32);
+- *cd->offset1 = htonl(offset64 & 0xffffffff);
+- cd->offset1 = NULL;
+- } else {
+- xdr_encode_hyper(cd->offset, offset64);
+- }
+- cd->offset = NULL;
+- }
+-
+- /*
+- dprintk("encode_entry(%.*s @%ld%s)\n",
+- namlen, name, (long) offset, plus? " plus" : "");
+- */
+-
+- /* truncate filename if too long */
+- namlen = min(namlen, NFS3_MAXNAMLEN);
++ __be64 cookie = cpu_to_be64(offset);
+
+- slen = XDR_QUADLEN(namlen);
+- elen = slen + NFS3_ENTRY_BAGGAGE
+- + (plus? NFS3_ENTRYPLUS_BAGGAGE : 0);
++ if (!resp->cookie_offset)
++ return;
++ write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie,
++ sizeof(cookie));
++ resp->cookie_offset = 0;
++}
+
+- if (cd->buflen < elen) {
+- cd->common.err = nfserr_toosmall;
+- return -EINVAL;
+- }
++static bool
++svcxdr_encode_entry3_common(struct nfsd3_readdirres *resp, const char *name,
++ int namlen, loff_t offset, u64 ino)
++{
++ struct xdr_buf *dirlist = &resp->dirlist;
++ struct xdr_stream *xdr = &resp->xdr;
++
++ if (xdr_stream_encode_item_present(xdr) < 0)
++ return false;
++ /* fileid */
++ if (xdr_stream_encode_u64(xdr, ino) < 0)
++ return false;
++ /* name */
++ if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS3_MAXNAMLEN)) < 0)
++ return false;
++ /* cookie */
++ resp->cookie_offset = dirlist->len;
++ if (xdr_stream_encode_u64(xdr, OFFSET_MAX) < 0)
++ return false;
++
++ return true;
++}
+
+- /* determine which page in rq_respages[] we are currently filling */
+- for (page = cd->rqstp->rq_respages + 1;
+- page < cd->rqstp->rq_next_page; page++) {
+- curr_page_addr = page_address(*page);
++/**
++ * nfs3svc_encode_entry3 - encode one NFSv3 READDIR entry
++ * @data: directory context
++ * @name: name of the object to be encoded
++ * @namlen: length of that name, in bytes
++ * @offset: the offset of the previous entry
++ * @ino: the fileid of this entry
++ * @d_type: unused
++ *
++ * Return values:
++ * %0: Entry was successfully encoded.
++ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err
++ *
++ * On exit, the following fields are updated:
++ * - resp->xdr
++ * - resp->common.err
++ * - resp->cookie_offset
++ */
++int nfs3svc_encode_entry3(void *data, const char *name, int namlen,
++ loff_t offset, u64 ino, unsigned int d_type)
++{
++ struct readdir_cd *ccd = data;
++ struct nfsd3_readdirres *resp = container_of(ccd,
++ struct nfsd3_readdirres,
++ common);
++ unsigned int starting_length = resp->dirlist.len;
+
+- if (((caddr_t)cd->buffer >= curr_page_addr) &&
+- ((caddr_t)cd->buffer < curr_page_addr + PAGE_SIZE))
+- break;
+- }
++ /* The offset cookie for the previous entry */
++ nfs3svc_encode_cookie3(resp, offset);
+
+- if ((caddr_t)(cd->buffer + elen) < (curr_page_addr + PAGE_SIZE)) {
+- /* encode entry in current page */
++ if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino))
++ goto out_toosmall;
+
+- p = encode_entry_baggage(cd, p, name, namlen, ino);
++ xdr_commit_encode(&resp->xdr);
++ resp->common.err = nfs_ok;
++ return 0;
+
+- if (plus)
+- p = encode_entryplus_baggage(cd, p, name, namlen, ino);
+- num_entry_words = p - cd->buffer;
+- } else if (*(page+1) != NULL) {
+- /* temporarily encode entry into next page, then move back to
+- * current and next page in rq_respages[] */
+- __be32 *p1, *tmp;
+- int len1, len2;
++out_toosmall:
++ resp->cookie_offset = 0;
++ resp->common.err = nfserr_toosmall;
++ resp->dirlist.len = starting_length;
++ return -EINVAL;
++}
+
+- /* grab next page for temporary storage of entry */
+- p1 = tmp = page_address(*(page+1));
++static bool
++svcxdr_encode_entry3_plus(struct nfsd3_readdirres *resp, const char *name,
++ int namlen, u64 ino)
++{
++ struct xdr_stream *xdr = &resp->xdr;
++ struct svc_fh *fhp = &resp->scratch;
++ bool result;
+
+- p1 = encode_entry_baggage(cd, p1, name, namlen, ino);
++ result = false;
++ fh_init(fhp, NFS3_FHSIZE);
++ if (compose_entry_fh(resp, fhp, name, namlen, ino) != nfs_ok)
++ goto out_noattrs;
+
+- if (plus)
+- p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino);
++ if (!svcxdr_encode_post_op_attr(resp->rqstp, xdr, fhp))
++ goto out;
++ if (!svcxdr_encode_post_op_fh3(xdr, fhp))
++ goto out;
++ result = true;
+
+- /* determine entry word length and lengths to go in pages */
+- num_entry_words = p1 - tmp;
+- len1 = curr_page_addr + PAGE_SIZE - (caddr_t)cd->buffer;
+- if ((num_entry_words << 2) < len1) {
+- /* the actual number of words in the entry is less
+- * than elen and can still fit in the current page
+- */
+- memmove(p, tmp, num_entry_words << 2);
+- p += num_entry_words;
+-
+- /* update offset */
+- cd->offset = cd->buffer + (cd->offset - tmp);
+- } else {
+- unsigned int offset_r = (cd->offset - tmp) << 2;
+-
+- /* update pointer to offset location.
+- * This is a 64bit quantity, so we need to
+- * deal with 3 cases:
+- * - entirely in first page
+- * - entirely in second page
+- * - 4 bytes in each page
+- */
+- if (offset_r + 8 <= len1) {
+- cd->offset = p + (cd->offset - tmp);
+- } else if (offset_r >= len1) {
+- cd->offset -= len1 >> 2;
+- } else {
+- /* sitting on the fence */
+- BUG_ON(offset_r != len1 - 4);
+- cd->offset = p + (cd->offset - tmp);
+- cd->offset1 = tmp;
+- }
+-
+- len2 = (num_entry_words << 2) - len1;
+-
+- /* move from temp page to current and next pages */
+- memmove(p, tmp, len1);
+- memmove(tmp, (caddr_t)tmp+len1, len2);
+-
+- p = tmp + (len2 >> 2);
+- }
+- }
+- else {
+- cd->common.err = nfserr_toosmall;
+- return -EINVAL;
+- }
++out:
++ fh_put(fhp);
++ return result;
++
++out_noattrs:
++ if (xdr_stream_encode_item_absent(xdr) < 0)
++ return false;
++ if (xdr_stream_encode_item_absent(xdr) < 0)
++ return false;
++ return true;
++}
+
+- cd->buflen -= num_entry_words;
+- cd->buffer = p;
+- cd->common.err = nfs_ok;
++/**
++ * nfs3svc_encode_entryplus3 - encode one NFSv3 READDIRPLUS entry
++ * @data: directory context
++ * @name: name of the object to be encoded
++ * @namlen: length of that name, in bytes
++ * @offset: the offset of the previous entry
++ * @ino: the fileid of this entry
++ * @d_type: unused
++ *
++ * Return values:
++ * %0: Entry was successfully encoded.
++ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err
++ *
++ * On exit, the following fields are updated:
++ * - resp->xdr
++ * - resp->common.err
++ * - resp->cookie_offset
++ */
++int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen,
++ loff_t offset, u64 ino, unsigned int d_type)
++{
++ struct readdir_cd *ccd = data;
++ struct nfsd3_readdirres *resp = container_of(ccd,
++ struct nfsd3_readdirres,
++ common);
++ unsigned int starting_length = resp->dirlist.len;
++
++ /* The offset cookie for the previous entry */
++ nfs3svc_encode_cookie3(resp, offset);
++
++ if (!svcxdr_encode_entry3_common(resp, name, namlen, offset, ino))
++ goto out_toosmall;
++ if (!svcxdr_encode_entry3_plus(resp, name, namlen, ino))
++ goto out_toosmall;
++
++ xdr_commit_encode(&resp->xdr);
++ resp->common.err = nfs_ok;
+ return 0;
+
++out_toosmall:
++ resp->cookie_offset = 0;
++ resp->common.err = nfserr_toosmall;
++ resp->dirlist.len = starting_length;
++ return -EINVAL;
+ }
+
+-int
+-nfs3svc_encode_entry(void *cd, const char *name,
+- int namlen, loff_t offset, u64 ino, unsigned int d_type)
++static bool
++svcxdr_encode_fsstat3resok(struct xdr_stream *xdr,
++ const struct nfsd3_fsstatres *resp)
+ {
+- return encode_entry(cd, name, namlen, offset, ino, d_type, 0);
+-}
++ const struct kstatfs *s = &resp->stats;
++ u64 bs = s->f_bsize;
++ __be32 *p;
+
+-int
+-nfs3svc_encode_entry_plus(void *cd, const char *name,
+- int namlen, loff_t offset, u64 ino,
+- unsigned int d_type)
+-{
+- return encode_entry(cd, name, namlen, offset, ino, d_type, 1);
++ p = xdr_reserve_space(xdr, XDR_UNIT * 13);
++ if (!p)
++ return false;
++ p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */
++ p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */
++ p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */
++ p = xdr_encode_hyper(p, s->f_files); /* total inodes */
++ p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */
++ p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */
++ *p = cpu_to_be32(resp->invarsec); /* mean unchanged time */
++
++ return true;
+ }
+
+ /* FSSTAT */
+-int
+-nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_fsstatres *resp = rqstp->rq_resp;
+- struct kstatfs *s = &resp->stats;
+- u64 bs = s->f_bsize;
+-
+- *p++ = resp->status;
+- *p++ = xdr_zero; /* no post_op_attr */
+-
+- if (resp->status == 0) {
+- p = xdr_encode_hyper(p, bs * s->f_blocks); /* total bytes */
+- p = xdr_encode_hyper(p, bs * s->f_bfree); /* free bytes */
+- p = xdr_encode_hyper(p, bs * s->f_bavail); /* user available bytes */
+- p = xdr_encode_hyper(p, s->f_files); /* total inodes */
+- p = xdr_encode_hyper(p, s->f_ffree); /* free inodes */
+- p = xdr_encode_hyper(p, s->f_ffree); /* user available inodes */
+- *p++ = htonl(resp->invarsec); /* mean unchanged time */
++
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
++ return false;
++ if (!svcxdr_encode_fsstat3resok(xdr, resp))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
++ return false;
+ }
+- return xdr_ressize_check(rqstp, p);
++
++ return true;
++}
++
++static bool
++svcxdr_encode_fsinfo3resok(struct xdr_stream *xdr,
++ const struct nfsd3_fsinfores *resp)
++{
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, XDR_UNIT * 12);
++ if (!p)
++ return false;
++ *p++ = cpu_to_be32(resp->f_rtmax);
++ *p++ = cpu_to_be32(resp->f_rtpref);
++ *p++ = cpu_to_be32(resp->f_rtmult);
++ *p++ = cpu_to_be32(resp->f_wtmax);
++ *p++ = cpu_to_be32(resp->f_wtpref);
++ *p++ = cpu_to_be32(resp->f_wtmult);
++ *p++ = cpu_to_be32(resp->f_dtpref);
++ p = xdr_encode_hyper(p, resp->f_maxfilesize);
++ p = encode_nfstime3(p, &nfs3svc_time_delta);
++ *p = cpu_to_be32(resp->f_properties);
++
++ return true;
+ }
+
+ /* FSINFO */
+-int
+-nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_fsinfores *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- *p++ = xdr_zero; /* no post_op_attr */
+-
+- if (resp->status == 0) {
+- *p++ = htonl(resp->f_rtmax);
+- *p++ = htonl(resp->f_rtpref);
+- *p++ = htonl(resp->f_rtmult);
+- *p++ = htonl(resp->f_wtmax);
+- *p++ = htonl(resp->f_wtpref);
+- *p++ = htonl(resp->f_wtmult);
+- *p++ = htonl(resp->f_dtpref);
+- p = xdr_encode_hyper(p, resp->f_maxfilesize);
+- *p++ = xdr_one;
+- *p++ = xdr_zero;
+- *p++ = htonl(resp->f_properties);
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
++ return false;
++ if (!svcxdr_encode_fsinfo3resok(xdr, resp))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
++ return false;
+ }
+
+- return xdr_ressize_check(rqstp, p);
++ return true;
++}
++
++static bool
++svcxdr_encode_pathconf3resok(struct xdr_stream *xdr,
++ const struct nfsd3_pathconfres *resp)
++{
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, XDR_UNIT * 6);
++ if (!p)
++ return false;
++ *p++ = cpu_to_be32(resp->p_link_max);
++ *p++ = cpu_to_be32(resp->p_name_max);
++ p = xdr_encode_bool(p, resp->p_no_trunc);
++ p = xdr_encode_bool(p, resp->p_chown_restricted);
++ p = xdr_encode_bool(p, resp->p_case_insensitive);
++ xdr_encode_bool(p, resp->p_case_preserving);
++
++ return true;
+ }
+
+ /* PATHCONF */
+-int
+-nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_pathconfres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- *p++ = xdr_zero; /* no post_op_attr */
+-
+- if (resp->status == 0) {
+- *p++ = htonl(resp->p_link_max);
+- *p++ = htonl(resp->p_name_max);
+- *p++ = htonl(resp->p_no_trunc);
+- *p++ = htonl(resp->p_chown_restricted);
+- *p++ = htonl(resp->p_case_insensitive);
+- *p++ = htonl(resp->p_case_preserving);
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
++ return false;
++ if (!svcxdr_encode_pathconf3resok(xdr, resp))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_post_op_attr(rqstp, xdr, &nfs3svc_null_fh))
++ return false;
+ }
+
+- return xdr_ressize_check(rqstp, p);
++ return true;
+ }
+
+ /* COMMIT */
+-int
+-nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd3_commitres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- p = encode_wcc_data(rqstp, p, &resp->fh);
+- /* Write verifier */
+- if (resp->status == 0) {
+- *p++ = resp->verf[0];
+- *p++ = resp->verf[1];
++ if (!svcxdr_encode_nfsstat3(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
++ return false;
++ if (!svcxdr_encode_writeverf3(xdr, resp->verf))
++ return false;
++ break;
++ default:
++ if (!svcxdr_encode_wcc_data(rqstp, xdr, &resp->fh))
++ return false;
+ }
+- return xdr_ressize_check(rqstp, p);
++
++ return true;
+ }
+
+ /*
+diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
+index 71292a0d6f092..bb8e2f6d7d03c 100644
+--- a/fs/nfsd/nfs4acl.c
++++ b/fs/nfsd/nfs4acl.c
+@@ -751,57 +751,26 @@ static int nfs4_acl_nfsv4_to_posix(struct nfs4_acl *acl,
+ return ret;
+ }
+
+-__be32
+-nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- struct nfs4_acl *acl)
++__be32 nfsd4_acl_to_attr(enum nfs_ftype4 type, struct nfs4_acl *acl,
++ struct nfsd_attrs *attr)
+ {
+- __be32 error;
+ int host_error;
+- struct dentry *dentry;
+- struct inode *inode;
+- struct posix_acl *pacl = NULL, *dpacl = NULL;
+ unsigned int flags = 0;
+
+- /* Get inode */
+- error = fh_verify(rqstp, fhp, 0, NFSD_MAY_SATTR);
+- if (error)
+- return error;
+-
+- dentry = fhp->fh_dentry;
+- inode = d_inode(dentry);
++ if (!acl)
++ return nfs_ok;
+
+- if (S_ISDIR(inode->i_mode))
++ if (type == NF4DIR)
+ flags = NFS4_ACL_DIR;
+
+- host_error = nfs4_acl_nfsv4_to_posix(acl, &pacl, &dpacl, flags);
++ host_error = nfs4_acl_nfsv4_to_posix(acl, &attr->na_pacl,
++ &attr->na_dpacl, flags);
+ if (host_error == -EINVAL)
+ return nfserr_attrnotsupp;
+- if (host_error < 0)
+- goto out_nfserr;
+-
+- fh_lock(fhp);
+-
+- host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl);
+- if (host_error < 0)
+- goto out_drop_lock;
+-
+- if (S_ISDIR(inode->i_mode)) {
+- host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl);
+- }
+-
+-out_drop_lock:
+- fh_unlock(fhp);
+-
+- posix_acl_release(pacl);
+- posix_acl_release(dpacl);
+-out_nfserr:
+- if (host_error == -EOPNOTSUPP)
+- return nfserr_attrnotsupp;
+ else
+ return nfserrno(host_error);
+ }
+
+-
+ static short
+ ace2type(struct nfs4_ace *ace)
+ {
+diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
+index f5b7ad0847f20..4eae2c5af2edf 100644
+--- a/fs/nfsd/nfs4callback.c
++++ b/fs/nfsd/nfs4callback.c
+@@ -76,6 +76,17 @@ static __be32 *xdr_encode_empty_array(__be32 *p)
+ * 1 Protocol"
+ */
+
++static void encode_uint32(struct xdr_stream *xdr, u32 n)
++{
++ WARN_ON_ONCE(xdr_stream_encode_u32(xdr, n) < 0);
++}
++
++static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
++ size_t len)
++{
++ WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
++}
++
+ /*
+ * nfs_cb_opnum4
+ *
+@@ -121,7 +132,7 @@ static void encode_nfs_fh4(struct xdr_stream *xdr, const struct knfsd_fh *fh)
+
+ BUG_ON(length > NFS4_FHSIZE);
+ p = xdr_reserve_space(xdr, 4 + length);
+- xdr_encode_opaque(p, &fh->fh_base, length);
++ xdr_encode_opaque(p, &fh->fh_raw, length);
+ }
+
+ /*
+@@ -328,6 +339,24 @@ static void encode_cb_recall4args(struct xdr_stream *xdr,
+ hdr->nops++;
+ }
+
++/*
++ * CB_RECALLANY4args
++ *
++ * struct CB_RECALLANY4args {
++ * uint32_t craa_objects_to_keep;
++ * bitmap4 craa_type_mask;
++ * };
++ */
++static void
++encode_cb_recallany4args(struct xdr_stream *xdr,
++ struct nfs4_cb_compound_hdr *hdr, struct nfsd4_cb_recall_any *ra)
++{
++ encode_nfs_cb_opnum4(xdr, OP_CB_RECALL_ANY);
++ encode_uint32(xdr, ra->ra_keep);
++ encode_bitmap4(xdr, ra->ra_bmval, ARRAY_SIZE(ra->ra_bmval));
++ hdr->nops++;
++}
++
+ /*
+ * CB_SEQUENCE4args
+ *
+@@ -482,6 +511,26 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
+ encode_cb_nops(&hdr);
+ }
+
++/*
++ * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
++ */
++static void
++nfs4_xdr_enc_cb_recall_any(struct rpc_rqst *req,
++ struct xdr_stream *xdr, const void *data)
++{
++ const struct nfsd4_callback *cb = data;
++ struct nfsd4_cb_recall_any *ra;
++ struct nfs4_cb_compound_hdr hdr = {
++ .ident = cb->cb_clp->cl_cb_ident,
++ .minorversion = cb->cb_clp->cl_minorversion,
++ };
++
++ ra = container_of(cb, struct nfsd4_cb_recall_any, ra_cb);
++ encode_cb_compound4args(xdr, &hdr);
++ encode_cb_sequence4args(xdr, cb, &hdr);
++ encode_cb_recallany4args(xdr, &hdr, ra);
++ encode_cb_nops(&hdr);
++}
+
+ /*
+ * NFSv4.0 and NFSv4.1 XDR decode functions
+@@ -520,6 +569,28 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
+ return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status);
+ }
+
++/*
++ * 20.6. Operation 8: CB_RECALL_ANY - Keep Any N Recallable Objects
++ */
++static int
++nfs4_xdr_dec_cb_recall_any(struct rpc_rqst *rqstp,
++ struct xdr_stream *xdr,
++ void *data)
++{
++ struct nfsd4_callback *cb = data;
++ struct nfs4_cb_compound_hdr hdr;
++ int status;
++
++ status = decode_cb_compound4res(xdr, &hdr);
++ if (unlikely(status))
++ return status;
++ status = decode_cb_sequence4res(xdr, cb);
++ if (unlikely(status || cb->cb_seq_status))
++ return status;
++ status = decode_cb_op_status(xdr, OP_CB_RECALL_ANY, &cb->cb_status);
++ return status;
++}
++
+ #ifdef CONFIG_NFSD_PNFS
+ /*
+ * CB_LAYOUTRECALL4args
+@@ -679,7 +750,7 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
+ * case NFS4_OK:
+ * write_response4 coa_resok4;
+ * default:
+- * length4 coa_bytes_copied;
++ * length4 coa_bytes_copied;
+ * };
+ * struct CB_OFFLOAD4args {
+ * nfs_fh4 coa_fh;
+@@ -688,21 +759,22 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
+ * };
+ */
+ static void encode_offload_info4(struct xdr_stream *xdr,
+- __be32 nfserr,
+- const struct nfsd4_copy *cp)
++ const struct nfsd4_cb_offload *cbo)
+ {
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+- *p++ = nfserr;
+- if (!nfserr) {
++ *p = cbo->co_nfserr;
++ switch (cbo->co_nfserr) {
++ case nfs_ok:
+ p = xdr_reserve_space(xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+ p = xdr_encode_empty_array(p);
+- p = xdr_encode_hyper(p, cp->cp_res.wr_bytes_written);
+- *p++ = cpu_to_be32(cp->cp_res.wr_stable_how);
+- p = xdr_encode_opaque_fixed(p, cp->cp_res.wr_verifier.data,
++ p = xdr_encode_hyper(p, cbo->co_res.wr_bytes_written);
++ *p++ = cpu_to_be32(cbo->co_res.wr_stable_how);
++ p = xdr_encode_opaque_fixed(p, cbo->co_res.wr_verifier.data,
+ NFS4_VERIFIER_SIZE);
+- } else {
++ break;
++ default:
+ p = xdr_reserve_space(xdr, 8);
+ /* We always return success if bytes were written */
+ p = xdr_encode_hyper(p, 0);
+@@ -710,18 +782,16 @@ static void encode_offload_info4(struct xdr_stream *xdr,
+ }
+
+ static void encode_cb_offload4args(struct xdr_stream *xdr,
+- __be32 nfserr,
+- const struct knfsd_fh *fh,
+- const struct nfsd4_copy *cp,
++ const struct nfsd4_cb_offload *cbo,
+ struct nfs4_cb_compound_hdr *hdr)
+ {
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+- *p++ = cpu_to_be32(OP_CB_OFFLOAD);
+- encode_nfs_fh4(xdr, fh);
+- encode_stateid4(xdr, &cp->cp_res.cb_stateid);
+- encode_offload_info4(xdr, nfserr, cp);
++ *p = cpu_to_be32(OP_CB_OFFLOAD);
++ encode_nfs_fh4(xdr, &cbo->co_fh);
++ encode_stateid4(xdr, &cbo->co_res.cb_stateid);
++ encode_offload_info4(xdr, cbo);
+
+ hdr->nops++;
+ }
+@@ -731,8 +801,8 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
+ const void *data)
+ {
+ const struct nfsd4_callback *cb = data;
+- const struct nfsd4_copy *cp =
+- container_of(cb, struct nfsd4_copy, cp_cb);
++ const struct nfsd4_cb_offload *cbo =
++ container_of(cb, struct nfsd4_cb_offload, co_cb);
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_clp->cl_minorversion,
+@@ -740,7 +810,7 @@ static void nfs4_xdr_enc_cb_offload(struct rpc_rqst *req,
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+- encode_cb_offload4args(xdr, cp->nfserr, &cp->fh, cp, &hdr);
++ encode_cb_offload4args(xdr, cbo, &hdr);
+ encode_cb_nops(&hdr);
+ }
+
+@@ -784,6 +854,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
+ #endif
+ PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock),
+ PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload),
++ PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any),
+ };
+
+ static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
+@@ -941,37 +1012,43 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
+ clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
+ clp->cl_cb_client = client;
+ clp->cl_cb_cred = cred;
+- trace_nfsd_cb_setup(clp);
++ rcu_read_lock();
++ trace_nfsd_cb_setup(clp, rpc_peeraddr2str(client, RPC_DISPLAY_NETID),
++ args.authflavor);
++ rcu_read_unlock();
+ return 0;
+ }
+
++static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate)
++{
++ if (clp->cl_cb_state != newstate) {
++ clp->cl_cb_state = newstate;
++ trace_nfsd_cb_state(clp);
++ }
++}
++
+ static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+ {
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
+- clp->cl_cb_state = NFSD4_CB_DOWN;
+- trace_nfsd_cb_state(clp);
++ nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN);
+ }
+
+ static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+ {
+ if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+ return;
+- clp->cl_cb_state = NFSD4_CB_FAULT;
+- trace_nfsd_cb_state(clp);
++ nfsd4_mark_cb_state(clp, NFSD4_CB_FAULT);
+ }
+
+ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
+ {
+ struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
+
+- trace_nfsd_cb_done(clp, task->tk_status);
+ if (task->tk_status)
+ nfsd4_mark_cb_down(clp, task->tk_status);
+- else {
+- clp->cl_cb_state = NFSD4_CB_UP;
+- trace_nfsd_cb_state(clp);
+- }
++ else
++ nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
+ }
+
+ static void nfsd4_cb_probe_release(void *calldata)
+@@ -995,8 +1072,8 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = {
+ */
+ void nfsd4_probe_callback(struct nfs4_client *clp)
+ {
+- clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+- trace_nfsd_cb_state(clp);
++ trace_nfsd_cb_probe(clp);
++ nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN);
+ set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+ nfsd4_run_cb(&clp->cl_cb_null);
+ }
+@@ -1009,11 +1086,10 @@ void nfsd4_probe_callback_sync(struct nfs4_client *clp)
+
+ void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *conn)
+ {
+- clp->cl_cb_state = NFSD4_CB_UNKNOWN;
++ nfsd4_mark_cb_state(clp, NFSD4_CB_UNKNOWN);
+ spin_lock(&clp->cl_lock);
+ memcpy(&clp->cl_cb_conn, conn, sizeof(struct nfs4_cb_conn));
+ spin_unlock(&clp->cl_lock);
+- trace_nfsd_cb_state(clp);
+ }
+
+ /*
+@@ -1170,8 +1246,6 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
+ struct nfsd4_callback *cb = calldata;
+ struct nfs4_client *clp = cb->cb_clp;
+
+- trace_nfsd_cb_done(clp, task->tk_status);
+-
+ if (!nfsd4_cb_sequence_done(task, cb))
+ return;
+
+@@ -1231,6 +1305,9 @@ void nfsd4_destroy_callback_queue(void)
+ /* must be called under the state lock */
+ void nfsd4_shutdown_callback(struct nfs4_client *clp)
+ {
++ if (clp->cl_cb_state != NFSD4_CB_UNKNOWN)
++ trace_nfsd_cb_shutdown(clp);
++
+ set_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags);
+ /*
+ * Note this won't actually result in a null callback;
+@@ -1276,7 +1353,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
+ * kill the old client:
+ */
+ if (clp->cl_cb_client) {
+- trace_nfsd_cb_shutdown(clp);
+ rpc_shutdown_client(clp->cl_cb_client);
+ clp->cl_cb_client = NULL;
+ put_cred(clp->cl_cb_cred);
+@@ -1322,8 +1398,6 @@ nfsd4_run_cb_work(struct work_struct *work)
+ struct rpc_clnt *clnt;
+ int flags;
+
+- trace_nfsd_cb_work(clp, cb->cb_msg.rpc_proc->p_name);
+-
+ if (cb->cb_need_restart) {
+ cb->cb_need_restart = false;
+ } else {
+@@ -1345,7 +1419,7 @@ nfsd4_run_cb_work(struct work_struct *work)
+ * Don't send probe messages for 4.1 or later.
+ */
+ if (!cb->cb_ops && clp->cl_minorversion) {
+- clp->cl_cb_state = NFSD4_CB_UP;
++ nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
+ nfsd41_destroy_cb(cb);
+ return;
+ }
+@@ -1371,11 +1445,21 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+ cb->cb_holds_slot = false;
+ }
+
+-void nfsd4_run_cb(struct nfsd4_callback *cb)
++/**
++ * nfsd4_run_cb - queue up a callback job to run
++ * @cb: callback to queue
++ *
++ * Kick off a callback to do its thing. Returns false if it was already
++ * on a queue, true otherwise.
++ */
++bool nfsd4_run_cb(struct nfsd4_callback *cb)
+ {
+ struct nfs4_client *clp = cb->cb_clp;
++ bool queued;
+
+ nfsd41_cb_inflight_begin(clp);
+- if (!nfsd4_queue_cb(cb))
++ queued = nfsd4_queue_cb(cb);
++ if (!queued)
+ nfsd41_cb_inflight_end(clp);
++ return queued;
+ }
+diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c
+index f92161ce1f97d..5e9809aff37eb 100644
+--- a/fs/nfsd/nfs4idmap.c
++++ b/fs/nfsd/nfs4idmap.c
+@@ -41,6 +41,7 @@
+ #include "idmap.h"
+ #include "nfsd.h"
+ #include "netns.h"
++#include "vfs.h"
+
+ /*
+ * Turn off idmapping when using AUTH_SYS.
+@@ -82,8 +83,8 @@ ent_init(struct cache_head *cnew, struct cache_head *citm)
+ new->id = itm->id;
+ new->type = itm->type;
+
+- strlcpy(new->name, itm->name, sizeof(new->name));
+- strlcpy(new->authname, itm->authname, sizeof(new->authname));
++ strscpy(new->name, itm->name, sizeof(new->name));
++ strscpy(new->authname, itm->authname, sizeof(new->authname));
+ }
+
+ static void
+@@ -548,7 +549,7 @@ idmap_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen
+ return nfserr_badowner;
+ memcpy(key.name, name, namelen);
+ key.name[namelen] = '\0';
+- strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
++ strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ ret = idmap_lookup(rqstp, nametoid_lookup, &key, nn->nametoid_cache, &item);
+ if (ret == -ENOENT)
+ return nfserr_badowner;
+@@ -584,7 +585,7 @@ static __be32 idmap_id_to_name(struct xdr_stream *xdr,
+ int ret;
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+- strlcpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
++ strscpy(key.authname, rqst_authname(rqstp), sizeof(key.authname));
+ ret = idmap_lookup(rqstp, idtoname_lookup, &key, nn->idtoname_cache, &item);
+ if (ret == -ENOENT)
+ return encode_ascii_id(xdr, id);
+diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
+index 2673019d30ecd..e4e23b2a3e655 100644
+--- a/fs/nfsd/nfs4layouts.c
++++ b/fs/nfsd/nfs4layouts.c
+@@ -421,7 +421,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+ new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+ if (!new)
+ return nfserr_jukebox;
+- memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
++ memcpy(&new->lo_seg, seg, sizeof(new->lo_seg));
+ new->lo_state = ls;
+
+ spin_lock(&fp->fi_lock);
+@@ -657,7 +657,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
+ ktime_t now, cutoff;
+ const struct nfsd4_layout_ops *ops;
+
+-
++ trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task);
+ switch (task->tk_status) {
+ case 0:
+ case -NFS4ERR_DELAY:
+diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
+index e84996c3867c7..2c0de247083a9 100644
+--- a/fs/nfsd/nfs4proc.c
++++ b/fs/nfsd/nfs4proc.c
+@@ -37,6 +37,9 @@
+ #include <linux/falloc.h>
+ #include <linux/slab.h>
+ #include <linux/kthread.h>
++#include <linux/namei.h>
++#include <linux/freezer.h>
++
+ #include <linux/sunrpc/addr.h>
+ #include <linux/nfs_ssc.h>
+
+@@ -50,34 +53,16 @@
+ #include "pnfs.h"
+ #include "trace.h"
+
+-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+-#include <linux/security.h>
+-
+-static inline void
+-nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+-{
+- struct inode *inode = d_inode(resfh->fh_dentry);
+- int status;
+-
+- inode_lock(inode);
+- status = security_inode_setsecctx(resfh->fh_dentry,
+- label->data, label->len);
+- inode_unlock(inode);
+-
+- if (status)
+- /*
+- * XXX: We should really fail the whole open, but we may
+- * already have created a new file, so it may be too
+- * late. For now this seems the least of evils:
+- */
+- bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
++static bool inter_copy_offload_enable;
++module_param(inter_copy_offload_enable, bool, 0644);
++MODULE_PARM_DESC(inter_copy_offload_enable,
++ "Enable inter server to server copy offload. Default: false");
+
+- return;
+-}
+-#else
+-static inline void
+-nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval)
+-{ }
++#ifdef CONFIG_NFSD_V4_2_INTER_SSC
++static int nfsd4_ssc_umount_timeout = 900000; /* default to 15 mins */
++module_param(nfsd4_ssc_umount_timeout, int, 0644);
++MODULE_PARM_DESC(nfsd4_ssc_umount_timeout,
++ "idle msecs before unmount export from source server");
+ #endif
+
+ #define NFSDDBG_FACILITY NFSDDBG_PROC
+@@ -144,26 +129,6 @@ is_create_with_attrs(struct nfsd4_open *open)
+ || open->op_createmode == NFS4_CREATE_EXCLUSIVE4_1);
+ }
+
+-/*
+- * if error occurs when setting the acl, just clear the acl bit
+- * in the returned attr bitmap.
+- */
+-static void
+-do_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- struct nfs4_acl *acl, u32 *bmval)
+-{
+- __be32 status;
+-
+- status = nfsd4_set_nfs4_acl(rqstp, fhp, acl);
+- if (status)
+- /*
+- * We should probably fail the whole open at this point,
+- * but we've already created the file, so it's too late;
+- * So this seems the least of evils:
+- */
+- bmval[0] &= ~FATTR4_WORD0_ACL;
+-}
+-
+ static inline void
+ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
+ {
+@@ -177,7 +142,6 @@ fh_dup2(struct svc_fh *dst, struct svc_fh *src)
+ static __be32
+ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open, int accmode)
+ {
+- __be32 status;
+
+ if (open->op_truncate &&
+ !(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
+@@ -192,9 +156,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs
+ if (open->op_share_deny & NFS4_SHARE_DENY_READ)
+ accmode |= NFSD_MAY_WRITE;
+
+- status = fh_verify(rqstp, current_fh, S_IFREG, accmode);
+-
+- return status;
++ return fh_verify(rqstp, current_fh, S_IFREG, accmode);
+ }
+
+ static __be32 nfsd_check_obj_isreg(struct svc_fh *fh)
+@@ -223,6 +185,202 @@ static void nfsd4_set_open_owner_reply_cache(struct nfsd4_compound_state *cstate
+ &resfh->fh_handle);
+ }
+
++static inline bool nfsd4_create_is_exclusive(int createmode)
++{
++ return createmode == NFS4_CREATE_EXCLUSIVE ||
++ createmode == NFS4_CREATE_EXCLUSIVE4_1;
++}
++
++static __be32
++nfsd4_vfs_create(struct svc_fh *fhp, struct dentry *child,
++ struct nfsd4_open *open)
++{
++ struct file *filp;
++ struct path path;
++ int oflags;
++
++ oflags = O_CREAT | O_LARGEFILE;
++ switch (open->op_share_access & NFS4_SHARE_ACCESS_BOTH) {
++ case NFS4_SHARE_ACCESS_WRITE:
++ oflags |= O_WRONLY;
++ break;
++ case NFS4_SHARE_ACCESS_BOTH:
++ oflags |= O_RDWR;
++ break;
++ default:
++ oflags |= O_RDONLY;
++ }
++
++ path.mnt = fhp->fh_export->ex_path.mnt;
++ path.dentry = child;
++ filp = dentry_create(&path, oflags, open->op_iattr.ia_mode,
++ current_cred());
++ if (IS_ERR(filp))
++ return nfserrno(PTR_ERR(filp));
++
++ open->op_filp = filp;
++ return nfs_ok;
++}
++
++/*
++ * Implement NFSv4's unchecked, guarded, and exclusive create
++ * semantics for regular files. Open state for this new file is
++ * subsequently fabricated in nfsd4_process_open2().
++ *
++ * Upon return, caller must release @fhp and @resfhp.
++ */
++static __be32
++nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ struct svc_fh *resfhp, struct nfsd4_open *open)
++{
++ struct iattr *iap = &open->op_iattr;
++ struct nfsd_attrs attrs = {
++ .na_iattr = iap,
++ .na_seclabel = &open->op_label,
++ };
++ struct dentry *parent, *child;
++ __u32 v_mtime, v_atime;
++ struct inode *inode;
++ __be32 status;
++ int host_err;
++
++ if (isdotent(open->op_fname, open->op_fnamelen))
++ return nfserr_exist;
++ if (!(iap->ia_valid & ATTR_MODE))
++ iap->ia_mode = 0;
++
++ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
++ if (status != nfs_ok)
++ return status;
++ parent = fhp->fh_dentry;
++ inode = d_inode(parent);
++
++ host_err = fh_want_write(fhp);
++ if (host_err)
++ return nfserrno(host_err);
++
++ if (is_create_with_attrs(open))
++ nfsd4_acl_to_attr(NF4REG, open->op_acl, &attrs);
++
++ inode_lock_nested(inode, I_MUTEX_PARENT);
++
++ child = lookup_one_len(open->op_fname, parent, open->op_fnamelen);
++ if (IS_ERR(child)) {
++ status = nfserrno(PTR_ERR(child));
++ goto out;
++ }
++
++ if (d_really_is_negative(child)) {
++ status = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
++ if (status != nfs_ok)
++ goto out;
++ }
++
++ status = fh_compose(resfhp, fhp->fh_export, child, fhp);
++ if (status != nfs_ok)
++ goto out;
++
++ v_mtime = 0;
++ v_atime = 0;
++ if (nfsd4_create_is_exclusive(open->op_createmode)) {
++ u32 *verifier = (u32 *)open->op_verf.data;
++
++ /*
++ * Solaris 7 gets confused (bugid 4218508) if these have
++ * the high bit set, as do xfs filesystems without the
++ * "bigtime" feature. So just clear the high bits. If this
++ * is ever changed to use different attrs for storing the
++ * verifier, then do_open_lookup() will also need to be
++ * fixed accordingly.
++ */
++ v_mtime = verifier[0] & 0x7fffffff;
++ v_atime = verifier[1] & 0x7fffffff;
++ }
++
++ if (d_really_is_positive(child)) {
++ status = nfs_ok;
++
++ /* NFSv4 protocol requires change attributes even though
++ * no change happened.
++ */
++ fh_fill_both_attrs(fhp);
++
++ switch (open->op_createmode) {
++ case NFS4_CREATE_UNCHECKED:
++ if (!d_is_reg(child))
++ break;
++
++ /*
++ * In NFSv4, we don't want to truncate the file
++ * now. This would be wrong if the OPEN fails for
++ * some other reason. Furthermore, if the size is
++ * nonzero, we should ignore it according to spec!
++ */
++ open->op_truncate = (iap->ia_valid & ATTR_SIZE) &&
++ !iap->ia_size;
++ break;
++ case NFS4_CREATE_GUARDED:
++ status = nfserr_exist;
++ break;
++ case NFS4_CREATE_EXCLUSIVE:
++ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
++ d_inode(child)->i_atime.tv_sec == v_atime &&
++ d_inode(child)->i_size == 0) {
++ open->op_created = true;
++ break; /* subtle */
++ }
++ status = nfserr_exist;
++ break;
++ case NFS4_CREATE_EXCLUSIVE4_1:
++ if (d_inode(child)->i_mtime.tv_sec == v_mtime &&
++ d_inode(child)->i_atime.tv_sec == v_atime &&
++ d_inode(child)->i_size == 0) {
++ open->op_created = true;
++ goto set_attr; /* subtle */
++ }
++ status = nfserr_exist;
++ }
++ goto out;
++ }
++
++ if (!IS_POSIXACL(inode))
++ iap->ia_mode &= ~current_umask();
++
++ fh_fill_pre_attrs(fhp);
++ status = nfsd4_vfs_create(fhp, child, open);
++ if (status != nfs_ok)
++ goto out;
++ open->op_created = true;
++ fh_fill_post_attrs(fhp);
++
++ /* A newly created file already has a file size of zero. */
++ if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
++ iap->ia_valid &= ~ATTR_SIZE;
++ if (nfsd4_create_is_exclusive(open->op_createmode)) {
++ iap->ia_valid = ATTR_MTIME | ATTR_ATIME |
++ ATTR_MTIME_SET|ATTR_ATIME_SET;
++ iap->ia_mtime.tv_sec = v_mtime;
++ iap->ia_atime.tv_sec = v_atime;
++ iap->ia_mtime.tv_nsec = 0;
++ iap->ia_atime.tv_nsec = 0;
++ }
++
++set_attr:
++ status = nfsd_create_setattr(rqstp, fhp, resfhp, &attrs);
++
++ if (attrs.na_labelerr)
++ open->op_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
++ if (attrs.na_aclerr)
++ open->op_bmval[0] &= ~FATTR4_WORD0_ACL;
++out:
++ inode_unlock(inode);
++ nfsd_attrs_free(&attrs);
++ if (child && !IS_ERR(child))
++ dput(child);
++ fh_drop_write(fhp);
++ return status;
++}
++
+ static __be32
+ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open, struct svc_fh **resfh)
+ {
+@@ -252,47 +410,33 @@ do_open_lookup(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, stru
+ * yes | yes | GUARDED4 | GUARDED4
+ */
+
+- /*
+- * Note: create modes (UNCHECKED,GUARDED...) are the same
+- * in NFSv4 as in v3 except EXCLUSIVE4_1.
+- */
+ current->fs->umask = open->op_umask;
+- status = do_nfsd_create(rqstp, current_fh, open->op_fname.data,
+- open->op_fname.len, &open->op_iattr,
+- *resfh, open->op_createmode,
+- (u32 *)open->op_verf.data,
+- &open->op_truncate, &open->op_created);
++ status = nfsd4_create_file(rqstp, current_fh, *resfh, open);
+ current->fs->umask = 0;
+
+- if (!status && open->op_label.len)
+- nfsd4_security_inode_setsecctx(*resfh, &open->op_label, open->op_bmval);
+-
+ /*
+ * Following rfc 3530 14.2.16, and rfc 5661 18.16.4
+ * use the returned bitmask to indicate which attributes
+ * we used to store the verifier:
+ */
+- if (nfsd_create_is_exclusive(open->op_createmode) && status == 0)
++ if (nfsd4_create_is_exclusive(open->op_createmode) && status == 0)
+ open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+ FATTR4_WORD1_TIME_MODIFY);
+- } else
+- /*
+- * Note this may exit with the parent still locked.
+- * We will hold the lock until nfsd4_open's final
+- * lookup, to prevent renames or unlinks until we've had
+- * a chance to an acquire a delegation if appropriate.
+- */
++ } else {
+ status = nfsd_lookup(rqstp, current_fh,
+- open->op_fname.data, open->op_fname.len, *resfh);
++ open->op_fname, open->op_fnamelen, *resfh);
++ if (!status)
++ /* NFSv4 protocol requires change attributes even though
++ * no change happened.
++ */
++ fh_fill_both_attrs(current_fh);
++ }
+ if (status)
+ goto out;
+ status = nfsd_check_obj_isreg(*resfh);
+ if (status)
+ goto out;
+
+- if (is_create_with_attrs(open) && open->op_acl != NULL)
+- do_set_nfs4_acl(rqstp, *resfh, open->op_acl, open->op_bmval);
+-
+ nfsd4_set_open_owner_reply_cache(cstate, open, *resfh);
+ accmode = NFSD_MAY_NOP;
+ if (open->op_created ||
+@@ -308,7 +452,6 @@ static __be32
+ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_open *open)
+ {
+ struct svc_fh *current_fh = &cstate->current_fh;
+- __be32 status;
+ int accmode = 0;
+
+ /* We don't know the target directory, and therefore can not
+@@ -333,9 +476,7 @@ do_open_fhandle(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, str
+ if (open->op_claim_type == NFS4_OPEN_CLAIM_DELEG_CUR_FH)
+ accmode = NFSD_MAY_OWNER_OVERRIDE;
+
+- status = do_open_permission(rqstp, current_fh, open, accmode);
+-
+- return status;
++ return do_open_permission(rqstp, current_fh, open, accmode);
+ }
+
+ static void
+@@ -360,9 +501,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ bool reclaim = false;
+
+ dprintk("NFSD: nfsd4_open filename %.*s op_openowner %p\n",
+- (int)open->op_fname.len, open->op_fname.data,
++ (int)open->op_fnamelen, open->op_fname,
+ open->op_openowner);
+
++ open->op_filp = NULL;
++ open->op_rqstp = rqstp;
++
+ /* This check required by spec. */
+ if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
+ return nfserr_inval;
+@@ -373,8 +517,7 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ * Before RECLAIM_COMPLETE done, server should deny new lock
+ */
+ if (nfsd4_has_session(cstate) &&
+- !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
+- &cstate->session->se_client->cl_flags) &&
++ !test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags) &&
+ open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS)
+ return nfserr_grace;
+
+@@ -416,51 +559,46 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ goto out;
+
+ switch (open->op_claim_type) {
+- case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+- case NFS4_OPEN_CLAIM_NULL:
+- status = do_open_lookup(rqstp, cstate, open, &resfh);
+- if (status)
+- goto out;
+- break;
+- case NFS4_OPEN_CLAIM_PREVIOUS:
+- status = nfs4_check_open_reclaim(&open->op_clientid,
+- cstate, nn);
+- if (status)
+- goto out;
+- open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
+- reclaim = true;
+- fallthrough;
+- case NFS4_OPEN_CLAIM_FH:
+- case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+- status = do_open_fhandle(rqstp, cstate, open);
+- if (status)
+- goto out;
+- resfh = &cstate->current_fh;
+- break;
+- case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+- case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+- dprintk("NFSD: unsupported OPEN claim type %d\n",
+- open->op_claim_type);
+- status = nfserr_notsupp;
++ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
++ case NFS4_OPEN_CLAIM_NULL:
++ status = do_open_lookup(rqstp, cstate, open, &resfh);
++ if (status)
++ goto out;
++ break;
++ case NFS4_OPEN_CLAIM_PREVIOUS:
++ status = nfs4_check_open_reclaim(cstate->clp);
++ if (status)
+ goto out;
+- default:
+- dprintk("NFSD: Invalid OPEN claim type %d\n",
+- open->op_claim_type);
+- status = nfserr_inval;
++ open->op_openowner->oo_flags |= NFS4_OO_CONFIRMED;
++ reclaim = true;
++ fallthrough;
++ case NFS4_OPEN_CLAIM_FH:
++ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
++ status = do_open_fhandle(rqstp, cstate, open);
++ if (status)
+ goto out;
++ resfh = &cstate->current_fh;
++ break;
++ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
++ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
++ status = nfserr_notsupp;
++ goto out;
++ default:
++ status = nfserr_inval;
++ goto out;
+ }
+- /*
+- * nfsd4_process_open2() does the actual opening of the file. If
+- * successful, it (1) truncates the file if open->op_truncate was
+- * set, (2) sets open->op_stateid, (3) sets open->op_delegation.
+- */
++
+ status = nfsd4_process_open2(rqstp, resfh, open);
+- WARN(status && open->op_created,
+- "nfsd4_process_open2 failed to open newly-created file! status=%u\n",
+- be32_to_cpu(status));
++ if (status && open->op_created)
++ pr_warn("nfsd4_process_open2 failed to open newly-created file: status=%u\n",
++ be32_to_cpu(status));
+ if (reclaim && !status)
+ nn->somebody_reclaimed = true;
+ out:
++ if (open->op_filp) {
++ fput(open->op_filp);
++ open->op_filp = NULL;
++ }
+ if (resfh && resfh != &cstate->current_fh) {
+ fh_dup2(&cstate->current_fh, resfh);
+ fh_put(resfh);
+@@ -509,7 +647,7 @@ nfsd4_putfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+
+ fh_put(&cstate->current_fh);
+ cstate->current_fh.fh_handle.fh_size = putfh->pf_fhlen;
+- memcpy(&cstate->current_fh.fh_handle.fh_base, putfh->pf_fhval,
++ memcpy(&cstate->current_fh.fh_handle.fh_raw, putfh->pf_fhval,
+ putfh->pf_fhlen);
+ ret = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_BYPASS_GSS);
+ #ifdef CONFIG_NFSD_V4_2_INTER_SSC
+@@ -525,11 +663,9 @@ static __be32
+ nfsd4_putrootfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+ {
+- __be32 status;
+-
+ fh_put(&cstate->current_fh);
+- status = exp_pseudoroot(rqstp, &cstate->current_fh);
+- return status;
++
++ return exp_pseudoroot(rqstp, &cstate->current_fh);
+ }
+
+ static __be32
+@@ -588,7 +724,7 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net)
+
+ BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data));
+
+- nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id));
++ nfsd_copy_write_verifier(verf, net_generic(net, nfsd_net_id));
+ }
+
+ static __be32
+@@ -596,10 +732,19 @@ nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+ {
+ struct nfsd4_commit *commit = &u->commit;
++ struct nfsd_file *nf;
++ __be32 status;
++
++ status = nfsd_file_acquire(rqstp, &cstate->current_fh, NFSD_MAY_WRITE |
++ NFSD_MAY_NOT_BREAK_LEASE, &nf);
++ if (status != nfs_ok)
++ return status;
+
+- return nfsd_commit(rqstp, &cstate->current_fh, commit->co_offset,
++ status = nfsd_commit(rqstp, &cstate->current_fh, nf, commit->co_offset,
+ commit->co_count,
+ (__be32 *)commit->co_verf.data);
++ nfsd_file_put(nf);
++ return status;
+ }
+
+ static __be32
+@@ -607,6 +752,10 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+ {
+ struct nfsd4_create *create = &u->create;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &create->cr_iattr,
++ .na_seclabel = &create->cr_label,
++ };
+ struct svc_fh resfh;
+ __be32 status;
+ dev_t rdev;
+@@ -622,12 +771,13 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (status)
+ return status;
+
++ status = nfsd4_acl_to_attr(create->cr_type, create->cr_acl, &attrs);
+ current->fs->umask = create->cr_umask;
+ switch (create->cr_type) {
+ case NF4LNK:
+ status = nfsd_symlink(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+- create->cr_data, &resfh);
++ create->cr_data, &attrs, &resfh);
+ break;
+
+ case NF4BLK:
+@@ -638,7 +788,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ goto out_umask;
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+- &create->cr_iattr, S_IFBLK, rdev, &resfh);
++ &attrs, S_IFBLK, rdev, &resfh);
+ break;
+
+ case NF4CHR:
+@@ -649,26 +799,26 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ goto out_umask;
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+- &create->cr_iattr,S_IFCHR, rdev, &resfh);
++ &attrs, S_IFCHR, rdev, &resfh);
+ break;
+
+ case NF4SOCK:
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+- &create->cr_iattr, S_IFSOCK, 0, &resfh);
++ &attrs, S_IFSOCK, 0, &resfh);
+ break;
+
+ case NF4FIFO:
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+- &create->cr_iattr, S_IFIFO, 0, &resfh);
++ &attrs, S_IFIFO, 0, &resfh);
+ break;
+
+ case NF4DIR:
+ create->cr_iattr.ia_valid &= ~ATTR_SIZE;
+ status = nfsd_create(rqstp, &cstate->current_fh,
+ create->cr_name, create->cr_namelen,
+- &create->cr_iattr, S_IFDIR, 0, &resfh);
++ &attrs, S_IFDIR, 0, &resfh);
+ break;
+
+ default:
+@@ -678,20 +828,17 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (status)
+ goto out;
+
+- if (create->cr_label.len)
+- nfsd4_security_inode_setsecctx(&resfh, &create->cr_label, create->cr_bmval);
+-
+- if (create->cr_acl != NULL)
+- do_set_nfs4_acl(rqstp, &resfh, create->cr_acl,
+- create->cr_bmval);
+-
+- fh_unlock(&cstate->current_fh);
++ if (attrs.na_labelerr)
++ create->cr_bmval[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
++ if (attrs.na_aclerr)
++ create->cr_bmval[0] &= ~FATTR4_WORD0_ACL;
+ set_change_info(&create->cr_cinfo, &cstate->current_fh);
+ fh_dup2(&cstate->current_fh, &resfh);
+ out:
+ fh_put(&resfh);
+ out_umask:
+ current->fs->umask = 0;
++ nfsd_attrs_free(&attrs);
+ return status;
+ }
+
+@@ -772,12 +919,16 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ __be32 status;
+
+ read->rd_nf = NULL;
+- if (read->rd_offset >= OFFSET_MAX)
+- return nfserr_inval;
+
+ trace_nfsd_read_start(rqstp, &cstate->current_fh,
+ read->rd_offset, read->rd_length);
+
++ read->rd_length = min_t(u32, read->rd_length, svc_max_payload(rqstp));
++ if (read->rd_offset > (u64)OFFSET_MAX)
++ read->rd_offset = (u64)OFFSET_MAX;
++ if (read->rd_offset + read->rd_length > (u64)OFFSET_MAX)
++ read->rd_length = (u64)OFFSET_MAX - read->rd_offset;
++
+ /*
+ * If we do a zero copy read, then a client will see read data
+ * that reflects the state of the file *after* performing the
+@@ -793,12 +944,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &read->rd_stateid, RD_STATE,
+ &read->rd_nf, NULL);
+- if (status) {
+- dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
+- goto out;
+- }
+- status = nfs_ok;
+-out:
++
+ read->rd_rqstp = rqstp;
+ read->rd_fhp = &cstate->current_fh;
+ return status;
+@@ -860,10 +1006,8 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ return nfserr_grace;
+ status = nfsd_unlink(rqstp, &cstate->current_fh, 0,
+ remove->rm_name, remove->rm_namelen);
+- if (!status) {
+- fh_unlock(&cstate->current_fh);
++ if (!status)
+ set_change_info(&remove->rm_cinfo, &cstate->current_fh);
+- }
+ return status;
+ }
+
+@@ -903,7 +1047,6 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ &exp, &dentry);
+ if (err)
+ return err;
+- fh_unlock(&cstate->current_fh);
+ if (d_really_is_negative(dentry)) {
+ exp_put(exp);
+ err = nfserr_noent;
+@@ -958,17 +1101,21 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+ {
+ struct nfsd4_setattr *setattr = &u->setattr;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &setattr->sa_iattr,
++ .na_seclabel = &setattr->sa_label,
++ };
++ struct inode *inode;
+ __be32 status = nfs_ok;
++ bool save_no_wcc;
+ int err;
+
+ if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
+ status = nfs4_preprocess_stateid_op(rqstp, cstate,
+ &cstate->current_fh, &setattr->sa_stateid,
+ WR_STATE, NULL, NULL);
+- if (status) {
+- dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
++ if (status)
+ return status;
+- }
+ }
+ err = fh_want_write(&cstate->current_fh);
+ if (err)
+@@ -980,19 +1127,23 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (status)
+ goto out;
+
+- if (setattr->sa_acl != NULL)
+- status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
+- setattr->sa_acl);
+- if (status)
+- goto out;
+- if (setattr->sa_label.len)
+- status = nfsd4_set_nfs4_label(rqstp, &cstate->current_fh,
+- &setattr->sa_label);
++ inode = cstate->current_fh.fh_dentry->d_inode;
++ status = nfsd4_acl_to_attr(S_ISDIR(inode->i_mode) ? NF4DIR : NF4REG,
++ setattr->sa_acl, &attrs);
++
+ if (status)
+ goto out;
+- status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
++ save_no_wcc = cstate->current_fh.fh_no_wcc;
++ cstate->current_fh.fh_no_wcc = true;
++ status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs,
+ 0, (time64_t)0);
++ cstate->current_fh.fh_no_wcc = save_no_wcc;
++ if (!status)
++ status = nfserrno(attrs.na_labelerr);
++ if (!status)
++ status = nfserrno(attrs.na_aclerr);
+ out:
++ nfsd_attrs_free(&attrs);
+ fh_drop_write(&cstate->current_fh);
+ return status;
+ }
+@@ -1017,15 +1168,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ write->wr_offset, cnt);
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ stateid, WR_STATE, &nf, NULL);
+- if (status) {
+- dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
++ if (status)
+ return status;
+- }
+
+ write->wr_how_written = write->wr_stable_how;
+
+- nvecs = svc_fill_write_vector(rqstp, write->wr_pagelist,
+- &write->wr_head, write->wr_buflen);
++ nvecs = svc_fill_write_vector(rqstp, &write->wr_payload);
+ WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
+
+ status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
+@@ -1052,17 +1200,13 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
+ src_stateid, RD_STATE, src, NULL);
+- if (status) {
+- dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
++ if (status)
+ goto out;
+- }
+
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ dst_stateid, WR_STATE, dst, NULL);
+- if (status) {
+- dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
++ if (status)
+ goto out_put_src;
+- }
+
+ /* fix up for NFS-specific error code */
+ if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) ||
+@@ -1095,7 +1239,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (status)
+ goto out;
+
+- status = nfsd4_clone_file_range(src, clone->cl_src_pos,
++ status = nfsd4_clone_file_range(rqstp, src, clone->cl_src_pos,
+ dst, clone->cl_dst_pos, clone->cl_count,
+ EX_ISSYNC(cstate->current_fh.fh_export));
+
+@@ -1105,30 +1249,17 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ return status;
+ }
+
+-void nfs4_put_copy(struct nfsd4_copy *copy)
++static void nfs4_put_copy(struct nfsd4_copy *copy)
+ {
+ if (!refcount_dec_and_test(&copy->refcount))
+ return;
++ kfree(copy->cp_src);
+ kfree(copy);
+ }
+
+-static bool
+-check_and_set_stop_copy(struct nfsd4_copy *copy)
+-{
+- bool value;
+-
+- spin_lock(&copy->cp_clp->async_lock);
+- value = copy->stopped;
+- if (!copy->stopped)
+- copy->stopped = true;
+- spin_unlock(&copy->cp_clp->async_lock);
+- return value;
+-}
+-
+ static void nfsd4_stop_copy(struct nfsd4_copy *copy)
+ {
+- /* only 1 thread should stop the copy */
+- if (!check_and_set_stop_copy(copy))
++ if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags))
+ kthread_stop(copy->copy_task);
+ nfs4_put_copy(copy);
+ }
+@@ -1165,12 +1296,88 @@ extern void nfs_sb_deactive(struct super_block *sb);
+
+ #define NFSD42_INTERSSC_MOUNTOPS "vers=4.2,addr=%s,sec=sys"
+
++/*
++ * setup a work entry in the ssc delayed unmount list.
++ */
++static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr,
++ struct nfsd4_ssc_umount_item **nsui)
++{
++ struct nfsd4_ssc_umount_item *ni = NULL;
++ struct nfsd4_ssc_umount_item *work = NULL;
++ struct nfsd4_ssc_umount_item *tmp;
++ DEFINE_WAIT(wait);
++ __be32 status = 0;
++
++ *nsui = NULL;
++ work = kzalloc(sizeof(*work), GFP_KERNEL);
++try_again:
++ spin_lock(&nn->nfsd_ssc_lock);
++ list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
++ if (strncmp(ni->nsui_ipaddr, ipaddr, sizeof(ni->nsui_ipaddr)))
++ continue;
++ /* found a match */
++ if (ni->nsui_busy) {
++ /* wait - and try again */
++ prepare_to_wait(&nn->nfsd_ssc_waitq, &wait, TASK_IDLE);
++ spin_unlock(&nn->nfsd_ssc_lock);
++
++ /* allow 20secs for mount/unmount for now - revisit */
++ if (kthread_should_stop() ||
++ (freezable_schedule_timeout(20*HZ) == 0)) {
++ finish_wait(&nn->nfsd_ssc_waitq, &wait);
++ kfree(work);
++ return nfserr_eagain;
++ }
++ finish_wait(&nn->nfsd_ssc_waitq, &wait);
++ goto try_again;
++ }
++ *nsui = ni;
++ refcount_inc(&ni->nsui_refcnt);
++ spin_unlock(&nn->nfsd_ssc_lock);
++ kfree(work);
++
++ /* return vfsmount in (*nsui)->nsui_vfsmount */
++ return 0;
++ }
++ if (work) {
++ strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1);
++ refcount_set(&work->nsui_refcnt, 2);
++ work->nsui_busy = true;
++ list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list);
++ *nsui = work;
++ } else
++ status = nfserr_resource;
++ spin_unlock(&nn->nfsd_ssc_lock);
++ return status;
++}
++
++static void nfsd4_ssc_update_dul(struct nfsd_net *nn,
++ struct nfsd4_ssc_umount_item *nsui,
++ struct vfsmount *ss_mnt)
++{
++ spin_lock(&nn->nfsd_ssc_lock);
++ nsui->nsui_vfsmount = ss_mnt;
++ nsui->nsui_busy = false;
++ wake_up_all(&nn->nfsd_ssc_waitq);
++ spin_unlock(&nn->nfsd_ssc_lock);
++}
++
++static void nfsd4_ssc_cancel_dul(struct nfsd_net *nn,
++ struct nfsd4_ssc_umount_item *nsui)
++{
++ spin_lock(&nn->nfsd_ssc_lock);
++ list_del(&nsui->nsui_list);
++ wake_up_all(&nn->nfsd_ssc_waitq);
++ spin_unlock(&nn->nfsd_ssc_lock);
++ kfree(nsui);
++}
++
+ /*
+ * Support one copy source server for now.
+ */
+ static __be32
+ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
+- struct vfsmount **mount)
++ struct nfsd4_ssc_umount_item **nsui)
+ {
+ struct file_system_type *type;
+ struct vfsmount *ss_mnt;
+@@ -1181,12 +1388,14 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
+ char *ipaddr, *dev_name, *raw_data;
+ int len, raw_len;
+ __be32 status = nfserr_inval;
++ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ naddr = &nss->u.nl4_addr;
+ tmp_addrlen = rpc_uaddr2sockaddr(SVC_NET(rqstp), naddr->addr,
+ naddr->addr_len,
+ (struct sockaddr *)&tmp_addr,
+ sizeof(tmp_addr));
++ *nsui = NULL;
+ if (tmp_addrlen == 0)
+ goto out_err;
+
+@@ -1229,14 +1438,23 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
+ goto out_free_rawdata;
+ snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep);
+
++ status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui);
++ if (status)
++ goto out_free_devname;
++ if ((*nsui)->nsui_vfsmount)
++ goto out_done;
++
+ /* Use an 'internal' mount: SB_KERNMOUNT -> MNT_INTERNAL */
+ ss_mnt = vfs_kern_mount(type, SB_KERNMOUNT, dev_name, raw_data);
+ module_put(type->owner);
+- if (IS_ERR(ss_mnt))
++ if (IS_ERR(ss_mnt)) {
++ status = nfserr_nodev;
++ nfsd4_ssc_cancel_dul(nn, *nsui);
+ goto out_free_devname;
+-
++ }
++ nfsd4_ssc_update_dul(nn, *nsui, ss_mnt);
++out_done:
+ status = 0;
+- *mount = ss_mnt;
+
+ out_free_devname:
+ kfree(dev_name);
+@@ -1260,7 +1478,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp,
+ static __be32
+ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+- struct nfsd4_copy *copy, struct vfsmount **mount)
++ struct nfsd4_copy *copy)
+ {
+ struct svc_fh *s_fh = NULL;
+ stateid_t *s_stid = &copy->cp_src_stateid;
+@@ -1273,14 +1491,14 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
+ if (status)
+ goto out;
+
+- status = nfsd4_interssc_connect(&copy->cp_src, rqstp, mount);
++ status = nfsd4_interssc_connect(copy->cp_src, rqstp, &copy->ss_nsui);
+ if (status)
+ goto out;
+
+ s_fh = &cstate->save_fh;
+
+ copy->c_fh.size = s_fh->fh_handle.fh_size;
+- memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_base, copy->c_fh.size);
++ memcpy(copy->c_fh.data, &s_fh->fh_handle.fh_raw, copy->c_fh.size);
+ copy->stateid.seqid = cpu_to_be32(s_stid->si_generation);
+ memcpy(copy->stateid.other, (void *)&s_stid->si_opaque,
+ sizeof(stateid_opaque_t));
+@@ -1291,13 +1509,26 @@ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
+ }
+
+ static void
+-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
++nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp,
+ struct nfsd_file *dst)
+ {
+- nfs42_ssc_close(src->nf_file);
+- fput(src->nf_file);
+- nfsd_file_put(dst);
+- mntput(ss_mnt);
++ struct nfsd_net *nn = net_generic(dst->nf_net, nfsd_net_id);
++ long timeout = msecs_to_jiffies(nfsd4_ssc_umount_timeout);
++
++ nfs42_ssc_close(filp);
++ fput(filp);
++
++ spin_lock(&nn->nfsd_ssc_lock);
++ list_del(&nsui->nsui_list);
++ /*
++ * vfsmount can be shared by multiple exports,
++ * decrement refcnt. If the count drops to 1 it
++ * will be unmounted when nsui_expire expires.
++ */
++ refcount_dec(&nsui->nsui_refcnt);
++ nsui->nsui_expire = jiffies + timeout;
++ list_add_tail(&nsui->nsui_list, &nn->nfsd_ssc_mount_list);
++ spin_unlock(&nn->nfsd_ssc_lock);
+ }
+
+ #else /* CONFIG_NFSD_V4_2_INTER_SSC */
+@@ -1305,15 +1536,13 @@ nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
+ static __be32
+ nfsd4_setup_inter_ssc(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+- struct nfsd4_copy *copy,
+- struct vfsmount **mount)
++ struct nfsd4_copy *copy)
+ {
+- *mount = NULL;
+ return nfserr_inval;
+ }
+
+ static void
+-nfsd4_cleanup_inter_ssc(struct vfsmount *ss_mnt, struct nfsd_file *src,
++nfsd4_cleanup_inter_ssc(struct nfsd4_ssc_umount_item *nsui, struct file *filp,
+ struct nfsd_file *dst)
+ {
+ }
+@@ -1336,23 +1565,21 @@ nfsd4_setup_intra_ssc(struct svc_rqst *rqstp,
+ &copy->nf_dst);
+ }
+
+-static void
+-nfsd4_cleanup_intra_ssc(struct nfsd_file *src, struct nfsd_file *dst)
+-{
+- nfsd_file_put(src);
+- nfsd_file_put(dst);
+-}
+-
+ static void nfsd4_cb_offload_release(struct nfsd4_callback *cb)
+ {
+- struct nfsd4_copy *copy = container_of(cb, struct nfsd4_copy, cp_cb);
++ struct nfsd4_cb_offload *cbo =
++ container_of(cb, struct nfsd4_cb_offload, co_cb);
+
+- nfs4_put_copy(copy);
++ kfree(cbo);
+ }
+
+ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
+ struct rpc_task *task)
+ {
++ struct nfsd4_cb_offload *cbo =
++ container_of(cb, struct nfsd4_cb_offload, co_cb);
++
++ trace_nfsd_cb_offload_done(&cbo->co_res.cb_stateid, task);
+ return 1;
+ }
+
+@@ -1363,20 +1590,28 @@ static const struct nfsd4_callback_ops nfsd4_cb_offload_ops = {
+
+ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)
+ {
+- copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+- copy->cp_synchronous = sync;
++ copy->cp_res.wr_stable_how =
++ test_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags) ?
++ NFS_FILE_SYNC : NFS_UNSTABLE;
++ nfsd4_copy_set_sync(copy, sync);
+ gen_boot_verifier(&copy->cp_res.wr_verifier, copy->cp_clp->net);
+ }
+
+-static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
++static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy,
++ struct file *dst,
++ struct file *src)
+ {
+- struct file *dst = copy->nf_dst->nf_file;
+- struct file *src = copy->nf_src->nf_file;
++ errseq_t since;
+ ssize_t bytes_copied = 0;
+- size_t bytes_total = copy->cp_count;
++ u64 bytes_total = copy->cp_count;
+ u64 src_pos = copy->cp_src_pos;
+ u64 dst_pos = copy->cp_dst_pos;
++ int status;
++ loff_t end;
+
++ /* See RFC 7862 p.67: */
++ if (bytes_total == 0)
++ bytes_total = ULLONG_MAX;
+ do {
+ if (kthread_should_stop())
+ break;
+@@ -1388,16 +1623,29 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
+ copy->cp_res.wr_bytes_written += bytes_copied;
+ src_pos += bytes_copied;
+ dst_pos += bytes_copied;
+- } while (bytes_total > 0 && !copy->cp_synchronous);
++ } while (bytes_total > 0 && nfsd4_copy_is_async(copy));
++ /* for a non-zero asynchronous copy do a commit of data */
++ if (nfsd4_copy_is_async(copy) && copy->cp_res.wr_bytes_written > 0) {
++ since = READ_ONCE(dst->f_wb_err);
++ end = copy->cp_dst_pos + copy->cp_res.wr_bytes_written - 1;
++ status = vfs_fsync_range(dst, copy->cp_dst_pos, end, 0);
++ if (!status)
++ status = filemap_check_wb_err(dst->f_mapping, since);
++ if (!status)
++ set_bit(NFSD4_COPY_F_COMMITTED, &copy->cp_flags);
++ }
+ return bytes_copied;
+ }
+
+-static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
++static __be32 nfsd4_do_copy(struct nfsd4_copy *copy,
++ struct file *src, struct file *dst,
++ bool sync)
+ {
+ __be32 status;
+ ssize_t bytes;
+
+- bytes = _nfsd_copy_file_range(copy);
++ bytes = _nfsd_copy_file_range(copy, dst, src);
++
+ /* for async copy, we ignore the error, client can always retry
+ * to get the error
+ */
+@@ -1407,13 +1655,6 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync)
+ nfsd4_init_copy_res(copy, sync);
+ status = nfs_ok;
+ }
+-
+- if (!copy->cp_intra) /* Inter server SSC */
+- nfsd4_cleanup_inter_ssc(copy->ss_mnt, copy->nf_src,
+- copy->nf_dst);
+- else
+- nfsd4_cleanup_intra_ssc(copy->nf_src, copy->nf_dst);
+-
+ return status;
+ }
+
+@@ -1422,71 +1663,100 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst)
+ dst->cp_src_pos = src->cp_src_pos;
+ dst->cp_dst_pos = src->cp_dst_pos;
+ dst->cp_count = src->cp_count;
+- dst->cp_synchronous = src->cp_synchronous;
++ dst->cp_flags = src->cp_flags;
+ memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res));
+ memcpy(&dst->fh, &src->fh, sizeof(src->fh));
+ dst->cp_clp = src->cp_clp;
+ dst->nf_dst = nfsd_file_get(src->nf_dst);
+- dst->cp_intra = src->cp_intra;
+- if (src->cp_intra) /* for inter, file_src doesn't exist yet */
++ /* for inter, nf_src doesn't exist yet */
++ if (!nfsd4_ssc_is_inter(src))
+ dst->nf_src = nfsd_file_get(src->nf_src);
+
+ memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid));
+- memcpy(&dst->cp_src, &src->cp_src, sizeof(struct nl4_server));
++ memcpy(dst->cp_src, src->cp_src, sizeof(struct nl4_server));
+ memcpy(&dst->stateid, &src->stateid, sizeof(src->stateid));
+ memcpy(&dst->c_fh, &src->c_fh, sizeof(src->c_fh));
+- dst->ss_mnt = src->ss_mnt;
++ dst->ss_nsui = src->ss_nsui;
++}
++
++static void release_copy_files(struct nfsd4_copy *copy)
++{
++ if (copy->nf_src)
++ nfsd_file_put(copy->nf_src);
++ if (copy->nf_dst)
++ nfsd_file_put(copy->nf_dst);
+ }
+
+ static void cleanup_async_copy(struct nfsd4_copy *copy)
+ {
+ nfs4_free_copy_state(copy);
+- nfsd_file_put(copy->nf_dst);
+- if (copy->cp_intra)
+- nfsd_file_put(copy->nf_src);
+- spin_lock(&copy->cp_clp->async_lock);
+- list_del(&copy->copies);
+- spin_unlock(&copy->cp_clp->async_lock);
++ release_copy_files(copy);
++ if (copy->cp_clp) {
++ spin_lock(&copy->cp_clp->async_lock);
++ if (!list_empty(&copy->copies))
++ list_del_init(&copy->copies);
++ spin_unlock(&copy->cp_clp->async_lock);
++ }
+ nfs4_put_copy(copy);
+ }
+
++static void nfsd4_send_cb_offload(struct nfsd4_copy *copy, __be32 nfserr)
++{
++ struct nfsd4_cb_offload *cbo;
++
++ cbo = kzalloc(sizeof(*cbo), GFP_KERNEL);
++ if (!cbo)
++ return;
++
++ memcpy(&cbo->co_res, &copy->cp_res, sizeof(copy->cp_res));
++ memcpy(&cbo->co_fh, &copy->fh, sizeof(copy->fh));
++ cbo->co_nfserr = nfserr;
++
++ nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops,
++ NFSPROC4_CLNT_CB_OFFLOAD);
++ trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
++ &cbo->co_fh, copy->cp_count, nfserr);
++ nfsd4_run_cb(&cbo->co_cb);
++}
++
++/**
++ * nfsd4_do_async_copy - kthread function for background server-side COPY
++ * @data: arguments for COPY operation
++ *
++ * Return values:
++ * %0: Copy operation is done.
++ */
+ static int nfsd4_do_async_copy(void *data)
+ {
+ struct nfsd4_copy *copy = (struct nfsd4_copy *)data;
+- struct nfsd4_copy *cb_copy;
++ __be32 nfserr;
+
+- if (!copy->cp_intra) { /* Inter server SSC */
+- copy->nf_src = kzalloc(sizeof(struct nfsd_file), GFP_KERNEL);
+- if (!copy->nf_src) {
+- copy->nfserr = nfserr_serverfault;
+- /* ss_mnt will be unmounted by the laundromat */
+- goto do_callback;
+- }
+- copy->nf_src->nf_file = nfs42_ssc_open(copy->ss_mnt, &copy->c_fh,
+- &copy->stateid);
+- if (IS_ERR(copy->nf_src->nf_file)) {
+- copy->nfserr = nfserr_offload_denied;
++ if (nfsd4_ssc_is_inter(copy)) {
++ struct file *filp;
++
++ filp = nfs42_ssc_open(copy->ss_nsui->nsui_vfsmount,
++ &copy->c_fh, &copy->stateid);
++ if (IS_ERR(filp)) {
++ switch (PTR_ERR(filp)) {
++ case -EBADF:
++ nfserr = nfserr_wrong_type;
++ break;
++ default:
++ nfserr = nfserr_offload_denied;
++ }
+ /* ss_mnt will be unmounted by the laundromat */
+ goto do_callback;
+ }
++ nfserr = nfsd4_do_copy(copy, filp, copy->nf_dst->nf_file,
++ false);
++ nfsd4_cleanup_inter_ssc(copy->ss_nsui, filp, copy->nf_dst);
++ } else {
++ nfserr = nfsd4_do_copy(copy, copy->nf_src->nf_file,
++ copy->nf_dst->nf_file, false);
+ }
+
+- copy->nfserr = nfsd4_do_copy(copy, 0);
+ do_callback:
+- cb_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+- if (!cb_copy)
+- goto out;
+- refcount_set(&cb_copy->refcount, 1);
+- memcpy(&cb_copy->cp_res, &copy->cp_res, sizeof(copy->cp_res));
+- cb_copy->cp_clp = copy->cp_clp;
+- cb_copy->nfserr = copy->nfserr;
+- memcpy(&cb_copy->fh, &copy->fh, sizeof(copy->fh));
+- nfsd4_init_cb(&cb_copy->cp_cb, cb_copy->cp_clp,
+- &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD);
+- nfsd4_run_cb(&cb_copy->cp_cb);
+-out:
+- if (!copy->cp_intra)
+- kfree(copy->nf_src);
++ nfsd4_send_cb_offload(copy, nfserr);
+ cleanup_async_copy(copy);
+ return 0;
+ }
+@@ -1499,13 +1769,12 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ __be32 status;
+ struct nfsd4_copy *async_copy = NULL;
+
+- if (!copy->cp_intra) { /* Inter server SSC */
+- if (!inter_copy_offload_enable || copy->cp_synchronous) {
++ if (nfsd4_ssc_is_inter(copy)) {
++ if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) {
+ status = nfserr_notsupp;
+ goto out;
+ }
+- status = nfsd4_setup_inter_ssc(rqstp, cstate, copy,
+- &copy->ss_mnt);
++ status = nfsd4_setup_inter_ssc(rqstp, cstate, copy);
+ if (status)
+ return nfserr_offload_denied;
+ } else {
+@@ -1517,17 +1786,21 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ copy->cp_clp = cstate->clp;
+ memcpy(&copy->fh, &cstate->current_fh.fh_handle,
+ sizeof(struct knfsd_fh));
+- if (!copy->cp_synchronous) {
++ if (nfsd4_copy_is_async(copy)) {
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ status = nfserrno(-ENOMEM);
+ async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL);
+ if (!async_copy)
+ goto out_err;
++ INIT_LIST_HEAD(&async_copy->copies);
++ refcount_set(&async_copy->refcount, 1);
++ async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL);
++ if (!async_copy->cp_src)
++ goto out_err;
+ if (!nfs4_init_copy_state(nn, copy))
+ goto out_err;
+- refcount_set(&async_copy->refcount, 1);
+- memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.stid,
++ memcpy(&copy->cp_res.cb_stateid, &copy->cp_stateid.cs_stid,
+ sizeof(copy->cp_res.cb_stateid));
+ dup_copy_fields(copy, async_copy);
+ async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
+@@ -1541,18 +1814,24 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ wake_up_process(async_copy->copy_task);
+ status = nfs_ok;
+ } else {
+- status = nfsd4_do_copy(copy, 1);
++ status = nfsd4_do_copy(copy, copy->nf_src->nf_file,
++ copy->nf_dst->nf_file, true);
+ }
+ out:
++ release_copy_files(copy);
+ return status;
+ out_err:
++ if (nfsd4_ssc_is_inter(copy)) {
++ /*
++ * Source's vfsmount of inter-copy will be unmounted
++ * by the laundromat. Use copy instead of async_copy
++ * since async_copy->ss_nsui might not be set yet.
++ */
++ refcount_dec(&copy->ss_nsui->nsui_refcnt);
++ }
+ if (async_copy)
+ cleanup_async_copy(async_copy);
+ status = nfserrno(-ENOMEM);
+- /*
+- * source's vfsmount of inter-copy will be unmounted
+- * by the laundromat
+- */
+ goto out;
+ }
+
+@@ -1563,7 +1842,7 @@ find_async_copy(struct nfs4_client *clp, stateid_t *stateid)
+
+ spin_lock(&clp->async_lock);
+ list_for_each_entry(copy, &clp->async_copies, copies) {
+- if (memcmp(&copy->cp_stateid.stid, stateid, NFS4_STATEID_SIZE))
++ if (memcmp(&copy->cp_stateid.cs_stid, stateid, NFS4_STATEID_SIZE))
+ continue;
+ refcount_inc(&copy->refcount);
+ spin_unlock(&clp->async_lock);
+@@ -1617,16 +1896,16 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ cps = nfs4_alloc_init_cpntf_state(nn, stid);
+ if (!cps)
+ goto out;
+- memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.stid, sizeof(stateid_t));
++ memcpy(&cn->cpn_cnr_stateid, &cps->cp_stateid.cs_stid, sizeof(stateid_t));
+ memcpy(&cps->cp_p_stateid, &stid->sc_stateid, sizeof(stateid_t));
+ memcpy(&cps->cp_p_clid, &clp->cl_clientid, sizeof(clientid_t));
+
+ /* For now, only return one server address in cpn_src, the
+ * address used by the client to connect to this server.
+ */
+- cn->cpn_src.nl4_type = NL4_NETADDR;
++ cn->cpn_src->nl4_type = NL4_NETADDR;
+ status = nfsd4_set_netaddr((struct sockaddr *)&rqstp->rq_daddr,
+- &cn->cpn_src.u.nl4_addr);
++ &cn->cpn_src->u.nl4_addr);
+ WARN_ON_ONCE(status);
+ if (status) {
+ nfs4_put_cpntf_state(nn, cps);
+@@ -1647,10 +1926,8 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &fallocate->falloc_stateid,
+ WR_STATE, &nf, NULL);
+- if (status != nfs_ok) {
+- dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n");
++ if (status != nfs_ok)
+ return status;
+- }
+
+ status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file,
+ fallocate->falloc_offset,
+@@ -1706,10 +1983,8 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+ &seek->seek_stateid,
+ RD_STATE, &nf, NULL);
+- if (status) {
+- dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n");
++ if (status)
+ return status;
+- }
+
+ switch (seek->seek_whence) {
+ case NFS4_CONTENT_DATA:
+@@ -1877,7 +2152,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+ nfserr = nfs_ok;
+ if (gdp->gd_maxcount != 0) {
+ nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
+- rqstp, cstate->session->se_client, gdp);
++ rqstp, cstate->clp, gdp);
+ }
+
+ gdp->gd_notify_types &= ops->notify_types;
+@@ -2163,7 +2438,7 @@ nfsd4_proc_null(struct svc_rqst *rqstp)
+ static inline void nfsd4_increment_op_stats(u32 opnum)
+ {
+ if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
+- nfsdstats.nfs4_opcount[opnum]++;
++ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]);
+ }
+
+ static const struct nfsd4_operation nfsd4_ops[];
+@@ -2253,25 +2528,6 @@ static bool need_wrongsec_check(struct svc_rqst *rqstp)
+ return !(nextd->op_flags & OP_HANDLES_WRONGSEC);
+ }
+
+-static void svcxdr_init_encode(struct svc_rqst *rqstp,
+- struct nfsd4_compoundres *resp)
+-{
+- struct xdr_stream *xdr = &resp->xdr;
+- struct xdr_buf *buf = &rqstp->rq_res;
+- struct kvec *head = buf->head;
+-
+- xdr->buf = buf;
+- xdr->iov = head;
+- xdr->p = head->iov_base + head->iov_len;
+- xdr->end = head->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
+- /* Tail and page_len should be zero at this point: */
+- buf->len = buf->head[0].iov_len;
+- xdr->scratch.iov_len = 0;
+- xdr->page_ptr = buf->pages - 1;
+- buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages)
+- - rqstp->rq_auth_slack;
+-}
+-
+ #ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ static void
+ check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
+@@ -2299,7 +2555,7 @@ check_if_stalefh_allowed(struct nfsd4_compoundargs *args)
+ return;
+ }
+ putfh = (struct nfsd4_putfh *)&saved_op->u;
+- if (!copy->cp_intra)
++ if (nfsd4_ssc_is_inter(copy))
+ putfh->no_verify = true;
+ }
+ }
+@@ -2326,10 +2582,14 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ __be32 status;
+
+- svcxdr_init_encode(rqstp, resp);
+- resp->tagp = resp->xdr.p;
++ resp->xdr = &rqstp->rq_res_stream;
++ resp->statusp = resp->xdr->p;
++
++ /* reserve space for: NFS status code */
++ xdr_reserve_space(resp->xdr, XDR_UNIT);
++
+ /* reserve space for: taglen, tag, and opcnt */
+- xdr_reserve_space(&resp->xdr, 8 + args->taglen);
++ xdr_reserve_space(resp->xdr, XDR_UNIT * 2 + args->taglen);
+ resp->taglen = args->taglen;
+ resp->tag = args->tag;
+ resp->rqstp = rqstp;
+@@ -2348,9 +2608,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
+ status = nfserr_minor_vers_mismatch;
+ if (nfsd_minorversion(nn, args->minorversion, NFSD_TEST) <= 0)
+ goto out;
+- status = nfserr_resource;
+- if (args->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
+- goto out;
+
+ status = nfs41_check_op_ordering(args);
+ if (status) {
+@@ -2363,10 +2620,20 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
+
+ rqstp->rq_lease_breaker = (void **)&cstate->clp;
+
+- trace_nfsd_compound(rqstp, args->opcnt);
++ trace_nfsd_compound(rqstp, args->client_opcnt);
+ while (!status && resp->opcnt < args->opcnt) {
+ op = &args->ops[resp->opcnt++];
+
++ if (unlikely(resp->opcnt == NFSD_MAX_OPS_PER_COMPOUND)) {
++ /* If there are still more operations to process,
++ * stop here and report NFS4ERR_RESOURCE. */
++ if (cstate->minorversion == 0 &&
++ args->client_opcnt > resp->opcnt) {
++ op->status = nfserr_resource;
++ goto encode_op;
++ }
++ }
++
+ /*
+ * The XDR decode routines may have pre-set op->status;
+ * for example, if there is a miscellaneous XDR error
+@@ -2390,13 +2657,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
+ goto encode_op;
+ }
+
+- fh_clear_wcc(current_fh);
++ fh_clear_pre_post_attrs(current_fh);
+
+ /* If op is non-idempotent */
+ if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) {
+ /*
+ * Don't execute this op if we couldn't encode a
+- * succesful reply:
++ * successful reply:
+ */
+ u32 plen = op->opdesc->op_rsize_bop(rqstp, op);
+ /*
+@@ -2435,15 +2702,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
+ encode_op:
+ if (op->status == nfserr_replay_me) {
+ op->replay = &cstate->replay_owner->so_replay;
+- nfsd4_encode_replay(&resp->xdr, op);
++ nfsd4_encode_replay(resp->xdr, op);
+ status = op->status = op->replay->rp_status;
+ } else {
+ nfsd4_encode_operation(resp, op);
+ status = op->status;
+ }
+
+- trace_nfsd_compound_status(args->opcnt, resp->opcnt, status,
+- nfsd4_op_name(op->opnum));
++ trace_nfsd_compound_status(args->client_opcnt, resp->opcnt,
++ status, nfsd4_op_name(op->opnum));
+
+ nfsd4_cstate_clear_replay(cstate);
+ nfsd4_increment_op_stats(op->opnum);
+@@ -2477,28 +2744,49 @@ nfsd4_proc_compound(struct svc_rqst *rqstp)
+
+ #define op_encode_channel_attrs_maxsz (6 + 1 + 1)
+
+-static inline u32 nfsd4_only_status_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++/*
++ * The _rsize() helpers are invoked by the NFSv4 COMPOUND decoder, which
++ * is called before sunrpc sets rq_res.buflen. Thus we have to compute
++ * the maximum payload size here, based on transport limits and the size
++ * of the remaining space in the rq_pages array.
++ */
++static u32 nfsd4_max_payload(const struct svc_rqst *rqstp)
++{
++ u32 buflen;
++
++ buflen = (rqstp->rq_page_end - rqstp->rq_next_page) * PAGE_SIZE;
++ buflen -= rqstp->rq_auth_slack;
++ buflen -= rqstp->rq_res.head[0].iov_len;
++ return min_t(u32, buflen, svc_max_payload(rqstp));
++}
++
++static u32 nfsd4_only_status_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_status_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_status_stateid_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_stateid_maxsz)* sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_access_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_access_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ /* ac_supported, ac_resp_access */
+ return (op_encode_hdr_size + 2)* sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_commit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_commit_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_verifier_maxsz) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_create_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+@@ -2509,17 +2797,17 @@ static inline u32 nfsd4_create_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op
+ * the op prematurely if the estimate is too large. We may turn off splice
+ * reads unnecessarily.
+ */
+-static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_getattr_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+- u32 *bmap = op->u.getattr.ga_bmval;
++ const u32 *bmap = op->u.getattr.ga_bmval;
+ u32 bmap0 = bmap[0], bmap1 = bmap[1], bmap2 = bmap[2];
+ u32 ret = 0;
+
+ if (bmap0 & FATTR4_WORD0_ACL)
+- return svc_max_payload(rqstp);
++ return nfsd4_max_payload(rqstp);
+ if (bmap0 & FATTR4_WORD0_FS_LOCATIONS)
+- return svc_max_payload(rqstp);
++ return nfsd4_max_payload(rqstp);
+
+ if (bmap1 & FATTR4_WORD1_OWNER) {
+ ret += IDMAP_NAMESZ + 4;
+@@ -2547,24 +2835,28 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp,
+ return ret;
+ }
+
+-static inline u32 nfsd4_getfh_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_getfh_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + 1) * sizeof(__be32) + NFS4_FHSIZE;
+ }
+
+-static inline u32 nfsd4_link_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_link_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_lock_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_lock_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_lock_denied_maxsz)
+ * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_open_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_stateid_maxsz
+ + op_encode_change_info_maxsz + 1
+@@ -2572,20 +2864,18 @@ static inline u32 nfsd4_open_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+ + op_encode_delegation_maxsz) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_read_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_read_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+- u32 maxcount = 0, rlen = 0;
+-
+- maxcount = svc_max_payload(rqstp);
+- rlen = min(op->u.read.rd_length, maxcount);
++ u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
+
+ return (op_encode_hdr_size + 2 + XDR_QUADLEN(rlen)) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_read_plus_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+- u32 maxcount = svc_max_payload(rqstp);
+- u32 rlen = min(op->u.read.rd_length, maxcount);
++ u32 rlen = min(op->u.read.rd_length, nfsd4_max_payload(rqstp));
+ /*
+ * If we detect that the file changed during hole encoding, then we
+ * recover by encoding the remaining reply as data. This means we need
+@@ -2596,70 +2886,77 @@ static inline u32 nfsd4_read_plus_rsize(struct svc_rqst *rqstp, struct nfsd4_op
+ return (op_encode_hdr_size + 2 + seg_len + XDR_QUADLEN(rlen)) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_readdir_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_readdir_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+- u32 maxcount = 0, rlen = 0;
+-
+- maxcount = svc_max_payload(rqstp);
+- rlen = min(op->u.readdir.rd_maxcount, maxcount);
++ u32 rlen = min(op->u.readdir.rd_maxcount, nfsd4_max_payload(rqstp));
+
+ return (op_encode_hdr_size + op_encode_verifier_maxsz +
+ XDR_QUADLEN(rlen)) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_readlink_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_readlink_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + 1) * sizeof(__be32) + PAGE_SIZE;
+ }
+
+-static inline u32 nfsd4_remove_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_remove_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_rename_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_rename_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_change_info_maxsz
+ + op_encode_change_info_maxsz) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_sequence_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_sequence_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size
+ + XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_test_stateid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_test_stateid_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + 1 + op->u.test_stateid.ts_num_ids)
+ * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_setattr_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_setattr_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + nfs4_fattr_bitmap_maxsz) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_secinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_secinfo_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + RPC_AUTH_MAXFLAVOR *
+ (4 + XDR_QUADLEN(GSS_OID_MAX_LEN))) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_setclientid_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_setclientid_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + 2 + XDR_QUADLEN(NFS4_VERIFIER_SIZE)) *
+ sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_write_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_write_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + 2 + op_encode_verifier_maxsz) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_exchange_id_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + 2 + 1 + /* eir_clientid, eir_sequenceid */\
+ 1 + 1 + /* eir_flags, spr_how */\
+@@ -2673,14 +2970,16 @@ static inline u32 nfsd4_exchange_id_rsize(struct svc_rqst *rqstp, struct nfsd4_o
+ 0 /* ignored eir_server_impl_id contents */) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_bind_conn_to_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_bind_conn_to_session_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* bctsr_sessid */\
+ 2 /* bctsr_dir, use_conn_in_rdma_mode */) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_create_session_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + \
+ XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + /* sessionid */\
+@@ -2689,7 +2988,8 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
+ op_encode_channel_attrs_maxsz) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_copy_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size +
+ 1 /* wr_callback */ +
+@@ -2701,16 +3001,16 @@ static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+ 1 /* cr_synchronous */) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_offload_status_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_offload_status_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size +
+ 2 /* osr_count */ +
+ 1 /* osr_complete<1> optional 0 for now */) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_copy_notify_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size +
+ 3 /* cnr_lease_time */ +
+@@ -2725,12 +3025,10 @@ static inline u32 nfsd4_copy_notify_rsize(struct svc_rqst *rqstp,
+ }
+
+ #ifdef CONFIG_NFSD_PNFS
+-static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_getdeviceinfo_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+- u32 maxcount = 0, rlen = 0;
+-
+- maxcount = svc_max_payload(rqstp);
+- rlen = min(op->u.getdeviceinfo.gd_maxcount, maxcount);
++ u32 rlen = min(op->u.getdeviceinfo.gd_maxcount, nfsd4_max_payload(rqstp));
+
+ return (op_encode_hdr_size +
+ 1 /* gd_layout_type*/ +
+@@ -2743,7 +3041,8 @@ static inline u32 nfsd4_getdeviceinfo_rsize(struct svc_rqst *rqstp, struct nfsd4
+ * so we need to define an arbitrary upper bound here.
+ */
+ #define MAX_LAYOUT_SIZE 128
+-static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_layoutget_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size +
+ 1 /* logr_return_on_close */ +
+@@ -2752,14 +3051,16 @@ static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op
+ MAX_LAYOUT_SIZE) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_layoutcommit_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size +
+ 1 /* locr_newsize */ +
+ 2 /* ns_size */) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_layoutreturn_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size +
+ 1 /* lrs_stateid */ +
+@@ -2768,41 +3069,36 @@ static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_
+ #endif /* CONFIG_NFSD_PNFS */
+
+
+-static inline u32 nfsd4_seek_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
++static u32 nfsd4_seek_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + 3) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_getxattr_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_getxattr_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+- u32 maxcount, rlen;
+-
+- maxcount = svc_max_payload(rqstp);
+- rlen = min_t(u32, XATTR_SIZE_MAX, maxcount);
++ u32 rlen = min_t(u32, XATTR_SIZE_MAX, nfsd4_max_payload(rqstp));
+
+ return (op_encode_hdr_size + 1 + XDR_QUADLEN(rlen)) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_setxattr_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_setxattr_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+ }
+-static inline u32 nfsd4_listxattrs_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_listxattrs_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+- u32 maxcount, rlen;
+-
+- maxcount = svc_max_payload(rqstp);
+- rlen = min(op->u.listxattrs.lsxa_maxcount, maxcount);
++ u32 rlen = min(op->u.listxattrs.lsxa_maxcount, nfsd4_max_payload(rqstp));
+
+ return (op_encode_hdr_size + 4 + XDR_QUADLEN(rlen)) * sizeof(__be32);
+ }
+
+-static inline u32 nfsd4_removexattr_rsize(struct svc_rqst *rqstp,
+- struct nfsd4_op *op)
++static u32 nfsd4_removexattr_rsize(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op)
+ {
+ return (op_encode_hdr_size + op_encode_change_info_maxsz)
+ * sizeof(__be32);
+@@ -3235,7 +3531,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+ {
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+- struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
++ struct nfsd4_op *this;
+ struct nfsd4_compound_state *cstate = &resp->cstate;
+ struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
+ u32 opiter;
+@@ -3272,7 +3568,7 @@ int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
+ void warn_on_nonidempotent_op(struct nfsd4_op *op)
+ {
+ if (OPDESC(op)->op_flags & OP_MODIFIES_SOMETHING) {
+- pr_err("unable to encode reply to nonidempotent op %d (%s)\n",
++ pr_err("unable to encode reply to nonidempotent op %u (%s)\n",
+ op->opnum, nfsd4_op_name(op->opnum));
+ WARN_ON_ONCE(1);
+ }
+@@ -3285,28 +3581,29 @@ static const char *nfsd4_op_name(unsigned opnum)
+ return "unknown_operation";
+ }
+
+-#define nfsd4_voidres nfsd4_voidargs
+-struct nfsd4_voidargs { int dummy; };
+-
+ static const struct svc_procedure nfsd_procedures4[2] = {
+ [NFSPROC4_NULL] = {
+ .pc_func = nfsd4_proc_null,
+- .pc_decode = nfs4svc_decode_voidarg,
+- .pc_encode = nfs4svc_encode_voidres,
+- .pc_argsize = sizeof(struct nfsd4_voidargs),
+- .pc_ressize = sizeof(struct nfsd4_voidres),
++ .pc_decode = nfssvc_decode_voidarg,
++ .pc_encode = nfssvc_encode_voidres,
++ .pc_argsize = sizeof(struct nfsd_voidargs),
++ .pc_argzero = sizeof(struct nfsd_voidargs),
++ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 1,
++ .pc_name = "NULL",
+ },
+ [NFSPROC4_COMPOUND] = {
+ .pc_func = nfsd4_proc_compound,
+ .pc_decode = nfs4svc_decode_compoundargs,
+ .pc_encode = nfs4svc_encode_compoundres,
+ .pc_argsize = sizeof(struct nfsd4_compoundargs),
++ .pc_argzero = offsetof(struct nfsd4_compoundargs, iops),
+ .pc_ressize = sizeof(struct nfsd4_compoundres),
+ .pc_release = nfsd4_release_compoundargs,
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = NFSD_BUFSIZE/4,
++ .pc_name = "COMPOUND",
+ },
+ };
+
+diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
+index 83c4e68839537..189c622dde61c 100644
+--- a/fs/nfsd/nfs4recover.c
++++ b/fs/nfsd/nfs4recover.c
+@@ -626,7 +626,7 @@ nfsd4_legacy_tracking_init(struct net *net)
+ status = nfsd4_load_reboot_recovery_data(net);
+ if (status)
+ goto err;
+- printk("NFSD: Using legacy client tracking operations.\n");
++ pr_info("NFSD: Using legacy client tracking operations.\n");
+ return 0;
+
+ err:
+@@ -807,17 +807,17 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
+ if (get_user(namelen, &ci->cc_name.cn_len))
+ return -EFAULT;
+ name.data = memdup_user(&ci->cc_name.cn_id, namelen);
+- if (IS_ERR_OR_NULL(name.data))
+- return -EFAULT;
++ if (IS_ERR(name.data))
++ return PTR_ERR(name.data);
+ name.len = namelen;
+ get_user(princhashlen, &ci->cc_princhash.cp_len);
+ if (princhashlen > 0) {
+ princhash.data = memdup_user(
+ &ci->cc_princhash.cp_data,
+ princhashlen);
+- if (IS_ERR_OR_NULL(princhash.data)) {
++ if (IS_ERR(princhash.data)) {
+ kfree(name.data);
+- return -EFAULT;
++ return PTR_ERR(princhash.data);
+ }
+ princhash.len = princhashlen;
+ } else
+@@ -829,8 +829,8 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
+ if (get_user(namelen, &cnm->cn_len))
+ return -EFAULT;
+ name.data = memdup_user(&cnm->cn_id, namelen);
+- if (IS_ERR_OR_NULL(name.data))
+- return -EFAULT;
++ if (IS_ERR(name.data))
++ return PTR_ERR(name.data);
+ name.len = namelen;
+ }
+ if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
+@@ -1030,7 +1030,7 @@ nfsd4_init_cld_pipe(struct net *net)
+
+ status = __nfsd4_init_cld_pipe(net);
+ if (!status)
+- printk("NFSD: Using old nfsdcld client tracking operations.\n");
++ pr_info("NFSD: Using old nfsdcld client tracking operations.\n");
+ return status;
+ }
+
+@@ -1607,7 +1607,7 @@ nfsd4_cld_tracking_init(struct net *net)
+ nfs4_release_reclaim(nn);
+ goto err_remove;
+ } else
+- printk("NFSD: Using nfsdcld client tracking operations.\n");
++ pr_info("NFSD: Using nfsdcld client tracking operations.\n");
+ return 0;
+
+ err_remove:
+@@ -1866,7 +1866,7 @@ nfsd4_umh_cltrack_init(struct net *net)
+ ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
+ kfree(grace_start);
+ if (!ret)
+- printk("NFSD: Using UMH upcall client tracking operations.\n");
++ pr_info("NFSD: Using UMH upcall client tracking operations.\n");
+ return ret;
+ }
+
+diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
+index d402ca0b535f0..228560f3fd0e0 100644
+--- a/fs/nfsd/nfs4state.c
++++ b/fs/nfsd/nfs4state.c
+@@ -43,6 +43,10 @@
+ #include <linux/sunrpc/addr.h>
+ #include <linux/jhash.h>
+ #include <linux/string_helpers.h>
++#include <linux/fsnotify.h>
++#include <linux/rhashtable.h>
++#include <linux/nfs_ssc.h>
++
+ #include "xdr4.h"
+ #include "xdr4cb.h"
+ #include "vfs.h"
+@@ -82,6 +86,7 @@ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
+ static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
+ void nfsd4_end_grace(struct nfsd_net *nn);
+ static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
++static void nfsd4_file_hash_remove(struct nfs4_file *fi);
+
+ /* Locking: */
+
+@@ -123,6 +128,23 @@ static void free_session(struct nfsd4_session *);
+ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+
++static struct workqueue_struct *laundry_wq;
++
++int nfsd4_create_laundry_wq(void)
++{
++ int rc = 0;
++
++ laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
++ if (laundry_wq == NULL)
++ rc = -ENOMEM;
++ return rc;
++}
++
++void nfsd4_destroy_laundry_wq(void)
++{
++ destroy_workqueue(laundry_wq);
++}
++
+ static bool is_session_dead(struct nfsd4_session *ses)
+ {
+ return ses->se_flags & NFS4_SESSION_DEAD;
+@@ -141,6 +163,13 @@ static bool is_client_expired(struct nfs4_client *clp)
+ return clp->cl_time == 0;
+ }
+
++static void nfsd4_dec_courtesy_client_count(struct nfsd_net *nn,
++ struct nfs4_client *clp)
++{
++ if (clp->cl_state != NFSD4_ACTIVE)
++ atomic_add_unless(&nn->nfsd_courtesy_clients, -1, 0);
++}
++
+ static __be32 get_client_locked(struct nfs4_client *clp)
+ {
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+@@ -150,6 +179,8 @@ static __be32 get_client_locked(struct nfs4_client *clp)
+ if (is_client_expired(clp))
+ return nfserr_expired;
+ atomic_inc(&clp->cl_rpc_users);
++ nfsd4_dec_courtesy_client_count(nn, clp);
++ clp->cl_state = NFSD4_ACTIVE;
+ return nfs_ok;
+ }
+
+@@ -170,6 +201,8 @@ renew_client_locked(struct nfs4_client *clp)
+
+ list_move_tail(&clp->cl_lru, &nn->client_lru);
+ clp->cl_time = ktime_get_boottime_seconds();
++ nfsd4_dec_courtesy_client_count(nn, clp);
++ clp->cl_state = NFSD4_ACTIVE;
+ }
+
+ static void put_client_renew_locked(struct nfs4_client *clp)
+@@ -244,6 +277,7 @@ find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+ list_for_each_entry(cur, &lo->lo_blocked, nbl_list) {
+ if (fh_match(fh, &cur->nbl_fh)) {
+ list_del_init(&cur->nbl_list);
++ WARN_ON(list_empty(&cur->nbl_lru));
+ list_del_init(&cur->nbl_lru);
+ found = cur;
+ break;
+@@ -269,6 +303,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+ INIT_LIST_HEAD(&nbl->nbl_lru);
+ fh_copy_shallow(&nbl->nbl_fh, fh);
+ locks_init_lock(&nbl->nbl_lock);
++ kref_init(&nbl->nbl_kref);
+ nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
+ &nfsd4_cb_notify_lock_ops,
+ NFSPROC4_CLNT_CB_NOTIFY_LOCK);
+@@ -278,13 +313,22 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+ }
+
+ static void
+-free_blocked_lock(struct nfsd4_blocked_lock *nbl)
++free_nbl(struct kref *kref)
+ {
+- locks_delete_block(&nbl->nbl_lock);
++ struct nfsd4_blocked_lock *nbl;
++
++ nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref);
+ locks_release_private(&nbl->nbl_lock);
+ kfree(nbl);
+ }
+
++static void
++free_blocked_lock(struct nfsd4_blocked_lock *nbl)
++{
++ locks_delete_block(&nbl->nbl_lock);
++ kref_put(&nbl->nbl_kref, free_nbl);
++}
++
+ static void
+ remove_blocked_locks(struct nfs4_lockowner *lo)
+ {
+@@ -300,6 +344,7 @@ remove_blocked_locks(struct nfs4_lockowner *lo)
+ struct nfsd4_blocked_lock,
+ nbl_list);
+ list_del_init(&nbl->nbl_list);
++ WARN_ON(list_empty(&nbl->nbl_lru));
+ list_move(&nbl->nbl_lru, &reaplist);
+ }
+ spin_unlock(&nn->blocked_locks_lock);
+@@ -324,6 +369,8 @@ nfsd4_cb_notify_lock_prepare(struct nfsd4_callback *cb)
+ static int
+ nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
+ {
++ trace_nfsd_cb_notify_lock_done(&zero_stateid, task);
++
+ /*
+ * Since this is just an optimization, we don't try very hard if it
+ * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
+@@ -353,6 +400,130 @@ static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+ .release = nfsd4_cb_notify_lock_release,
+ };
+
++/*
++ * We store the NONE, READ, WRITE, and BOTH bits separately in the
++ * st_{access,deny}_bmap field of the stateid, in order to track not
++ * only what share bits are currently in force, but also what
++ * combinations of share bits previous opens have used. This allows us
++ * to enforce the recommendation in
++ * https://datatracker.ietf.org/doc/html/rfc7530#section-16.19.4 that
++ * the server return an error if the client attempt to downgrade to a
++ * combination of share bits not explicable by closing some of its
++ * previous opens.
++ *
++ * This enforcement is arguably incomplete, since we don't keep
++ * track of access/deny bit combinations; so, e.g., we allow:
++ *
++ * OPEN allow read, deny write
++ * OPEN allow both, deny none
++ * DOWNGRADE allow read, deny none
++ *
++ * which we should reject.
++ *
++ * But you could also argue that our current code is already overkill,
++ * since it only exists to return NFS4ERR_INVAL on incorrect client
++ * behavior.
++ */
++static unsigned int
++bmap_to_share_mode(unsigned long bmap)
++{
++ int i;
++ unsigned int access = 0;
++
++ for (i = 1; i < 4; i++) {
++ if (test_bit(i, &bmap))
++ access |= i;
++ }
++ return access;
++}
++
++/* set share access for a given stateid */
++static inline void
++set_access(u32 access, struct nfs4_ol_stateid *stp)
++{
++ unsigned char mask = 1 << access;
++
++ WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
++ stp->st_access_bmap |= mask;
++}
++
++/* clear share access for a given stateid */
++static inline void
++clear_access(u32 access, struct nfs4_ol_stateid *stp)
++{
++ unsigned char mask = 1 << access;
++
++ WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
++ stp->st_access_bmap &= ~mask;
++}
++
++/* test whether a given stateid has access */
++static inline bool
++test_access(u32 access, struct nfs4_ol_stateid *stp)
++{
++ unsigned char mask = 1 << access;
++
++ return (bool)(stp->st_access_bmap & mask);
++}
++
++/* set share deny for a given stateid */
++static inline void
++set_deny(u32 deny, struct nfs4_ol_stateid *stp)
++{
++ unsigned char mask = 1 << deny;
++
++ WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
++ stp->st_deny_bmap |= mask;
++}
++
++/* clear share deny for a given stateid */
++static inline void
++clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
++{
++ unsigned char mask = 1 << deny;
++
++ WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
++ stp->st_deny_bmap &= ~mask;
++}
++
++/* test whether a given stateid is denying specific access */
++static inline bool
++test_deny(u32 deny, struct nfs4_ol_stateid *stp)
++{
++ unsigned char mask = 1 << deny;
++
++ return (bool)(stp->st_deny_bmap & mask);
++}
++
++static int nfs4_access_to_omode(u32 access)
++{
++ switch (access & NFS4_SHARE_ACCESS_BOTH) {
++ case NFS4_SHARE_ACCESS_READ:
++ return O_RDONLY;
++ case NFS4_SHARE_ACCESS_WRITE:
++ return O_WRONLY;
++ case NFS4_SHARE_ACCESS_BOTH:
++ return O_RDWR;
++ }
++ WARN_ON_ONCE(1);
++ return O_RDONLY;
++}
++
++static inline int
++access_permit_read(struct nfs4_ol_stateid *stp)
++{
++ return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
++ test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
++ test_access(NFS4_SHARE_ACCESS_WRITE, stp);
++}
++
++static inline int
++access_permit_write(struct nfs4_ol_stateid *stp)
++{
++ return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
++ test_access(NFS4_SHARE_ACCESS_BOTH, stp);
++}
++
+ static inline struct nfs4_stateowner *
+ nfs4_get_stateowner(struct nfs4_stateowner *sop)
+ {
+@@ -420,11 +591,8 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
+ void
+ put_nfs4_file(struct nfs4_file *fi)
+ {
+- might_lock(&state_lock);
+-
+- if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) {
+- hlist_del_rcu(&fi->fi_hash);
+- spin_unlock(&state_lock);
++ if (refcount_dec_and_test(&fi->fi_ref)) {
++ nfsd4_file_hash_remove(fi);
+ WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate));
+ WARN_ON_ONCE(!list_empty(&fi->fi_delegations));
+ call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu);
+@@ -434,9 +602,7 @@ put_nfs4_file(struct nfs4_file *fi)
+ static struct nfsd_file *
+ __nfs4_get_fd(struct nfs4_file *f, int oflag)
+ {
+- if (f->fi_fds[oflag])
+- return nfsd_file_get(f->fi_fds[oflag]);
+- return NULL;
++ return nfsd_file_get(f->fi_fds[oflag]);
+ }
+
+ static struct nfsd_file *
+@@ -549,21 +715,71 @@ static unsigned int ownerstr_hashval(struct xdr_netobj *ownername)
+ return ret & OWNER_HASH_MASK;
+ }
+
+-/* hash table for nfs4_file */
+-#define FILE_HASH_BITS 8
+-#define FILE_HASH_SIZE (1 << FILE_HASH_BITS)
++static struct rhltable nfs4_file_rhltable ____cacheline_aligned_in_smp;
+
+-static unsigned int nfsd_fh_hashval(struct knfsd_fh *fh)
+-{
+- return jhash2(fh->fh_base.fh_pad, XDR_QUADLEN(fh->fh_size), 0);
+-}
++static const struct rhashtable_params nfs4_file_rhash_params = {
++ .key_len = sizeof_field(struct nfs4_file, fi_inode),
++ .key_offset = offsetof(struct nfs4_file, fi_inode),
++ .head_offset = offsetof(struct nfs4_file, fi_rlist),
+
+-static unsigned int file_hashval(struct knfsd_fh *fh)
++ /*
++ * Start with a single page hash table to reduce resizing churn
++ * on light workloads.
++ */
++ .min_size = 256,
++ .automatic_shrinking = true,
++};
++
++/*
++ * Check if courtesy clients have conflicting access and resolve it if possible
++ *
++ * access: is op_share_access if share_access is true.
++ * Check if access mode, op_share_access, would conflict with
++ * the current deny mode of the file 'fp'.
++ * access: is op_share_deny if share_access is false.
++ * Check if the deny mode, op_share_deny, would conflict with
++ * current access of the file 'fp'.
++ * stp: skip checking this entry.
++ * new_stp: normal open, not open upgrade.
++ *
++ * Function returns:
++ * false - access/deny mode conflict with normal client.
++ * true - no conflict or conflict with courtesy client(s) is resolved.
++ */
++static bool
++nfs4_resolve_deny_conflicts_locked(struct nfs4_file *fp, bool new_stp,
++ struct nfs4_ol_stateid *stp, u32 access, bool share_access)
+ {
+- return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
+-}
++ struct nfs4_ol_stateid *st;
++ bool resolvable = true;
++ unsigned char bmap;
++ struct nfsd_net *nn;
++ struct nfs4_client *clp;
+
+-static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
++ lockdep_assert_held(&fp->fi_lock);
++ list_for_each_entry(st, &fp->fi_stateids, st_perfile) {
++ /* ignore lock stateid */
++ if (st->st_openstp)
++ continue;
++ if (st == stp && new_stp)
++ continue;
++ /* check file access against deny mode or vice versa */
++ bmap = share_access ? st->st_deny_bmap : st->st_access_bmap;
++ if (!(access & bmap_to_share_mode(bmap)))
++ continue;
++ clp = st->st_stid.sc_client;
++ if (try_to_expire_client(clp))
++ continue;
++ resolvable = false;
++ break;
++ }
++ if (resolvable) {
++ clp = stp->st_stid.sc_client;
++ nn = net_generic(clp->net, nfsd_net_id);
++ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
++ }
++ return resolvable;
++}
+
+ static void
+ __nfs4_file_get_access(struct nfs4_file *fp, u32 access)
+@@ -768,23 +984,23 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla
+ * Create a unique stateid_t to represent each COPY.
+ */
+ static int nfs4_init_cp_state(struct nfsd_net *nn, copy_stateid_t *stid,
+- unsigned char sc_type)
++ unsigned char cs_type)
+ {
+ int new_id;
+
+- stid->stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
+- stid->stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+- stid->sc_type = sc_type;
++ stid->cs_stid.si_opaque.so_clid.cl_boot = (u32)nn->boot_time;
++ stid->cs_stid.si_opaque.so_clid.cl_id = nn->s2s_cp_cl_id;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&nn->s2s_cp_lock);
+ new_id = idr_alloc_cyclic(&nn->s2s_cp_stateids, stid, 0, 0, GFP_NOWAIT);
+- stid->stid.si_opaque.so_id = new_id;
+- stid->stid.si_generation = 1;
++ stid->cs_stid.si_opaque.so_id = new_id;
++ stid->cs_stid.si_generation = 1;
+ spin_unlock(&nn->s2s_cp_lock);
+ idr_preload_end();
+ if (new_id < 0)
+ return 0;
++ stid->cs_type = cs_type;
+ return 1;
+ }
+
+@@ -802,7 +1018,7 @@ struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
+ if (!cps)
+ return NULL;
+ cps->cpntf_time = ktime_get_boottime_seconds();
+- refcount_set(&cps->cp_stateid.sc_count, 1);
++ refcount_set(&cps->cp_stateid.cs_count, 1);
+ if (!nfs4_init_cp_state(nn, &cps->cp_stateid, NFS4_COPYNOTIFY_STID))
+ goto out_free;
+ spin_lock(&nn->s2s_cp_lock);
+@@ -818,11 +1034,12 @@ void nfs4_free_copy_state(struct nfsd4_copy *copy)
+ {
+ struct nfsd_net *nn;
+
+- WARN_ON_ONCE(copy->cp_stateid.sc_type != NFS4_COPY_STID);
++ if (copy->cp_stateid.cs_type != NFS4_COPY_STID)
++ return;
+ nn = net_generic(copy->cp_clp->net, nfsd_net_id);
+ spin_lock(&nn->s2s_cp_lock);
+ idr_remove(&nn->s2s_cp_stateids,
+- copy->cp_stateid.stid.si_opaque.so_id);
++ copy->cp_stateid.cs_stid.si_opaque.so_id);
+ spin_unlock(&nn->s2s_cp_lock);
+ }
+
+@@ -854,7 +1071,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp)
+
+ static void nfs4_free_deleg(struct nfs4_stid *stid)
+ {
+- WARN_ON(!list_empty(&stid->sc_cp_list));
++ struct nfs4_delegation *dp = delegstateid(stid);
++
++ WARN_ON_ONCE(!list_empty(&stid->sc_cp_list));
++ WARN_ON_ONCE(!list_empty(&dp->dl_perfile));
++ WARN_ON_ONCE(!list_empty(&dp->dl_perclnt));
++ WARN_ON_ONCE(!list_empty(&dp->dl_recall_lru));
+ kmem_cache_free(deleg_slab, stid);
+ atomic_long_dec(&num_delegations);
+ }
+@@ -904,7 +1126,7 @@ static int delegation_blocked(struct knfsd_fh *fh)
+ }
+ spin_unlock(&blocked_delegations_lock);
+ }
+- hash = jhash(&fh->fh_base, fh->fh_size, 0);
++ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
+ if (test_bit(hash&255, bd->set[0]) &&
+ test_bit((hash>>8)&255, bd->set[0]) &&
+ test_bit((hash>>16)&255, bd->set[0]))
+@@ -923,7 +1145,7 @@ static void block_delegations(struct knfsd_fh *fh)
+ u32 hash;
+ struct bloom_pair *bd = &blocked_delegations;
+
+- hash = jhash(&fh->fh_base, fh->fh_size, 0);
++ hash = jhash(&fh->fh_raw, fh->fh_size, 0);
+
+ spin_lock(&blocked_delegations_lock);
+ __set_bit(hash&255, bd->set[bd->new]);
+@@ -937,7 +1159,6 @@ static void block_delegations(struct knfsd_fh *fh)
+
+ static struct nfs4_delegation *
+ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+- struct svc_fh *current_fh,
+ struct nfs4_clnt_odstate *odstate)
+ {
+ struct nfs4_delegation *dp;
+@@ -947,7 +1168,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+ n = atomic_long_inc_return(&num_delegations);
+ if (n < 0 || n > max_delegations)
+ goto out_dec;
+- if (delegation_blocked(&current_fh->fh_handle))
++ if (delegation_blocked(&fp->fi_fhandle))
+ goto out_dec;
+ dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg));
+ if (dp == NULL)
+@@ -966,6 +1187,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+ get_clnt_odstate(odstate);
+ dp->dl_type = NFS4_OPEN_DELEGATE_READ;
+ dp->dl_retries = 1;
++ dp->dl_recalled = false;
+ nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+ &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+ get_nfs4_file(fp);
+@@ -1144,6 +1366,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
+
+ WARN_ON(!list_empty(&dp->dl_recall_lru));
+
++ trace_nfsd_stid_revoke(&dp->dl_stid);
++
+ if (clp->cl_minorversion) {
+ spin_lock(&clp->cl_lock);
+ dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
+@@ -1169,175 +1393,73 @@ static unsigned int clientstr_hashval(struct xdr_netobj name)
+ }
+
+ /*
+- * We store the NONE, READ, WRITE, and BOTH bits separately in the
+- * st_{access,deny}_bmap field of the stateid, in order to track not
+- * only what share bits are currently in force, but also what
+- * combinations of share bits previous opens have used. This allows us
+- * to enforce the recommendation of rfc 3530 14.2.19 that the server
+- * return an error if the client attempt to downgrade to a combination
+- * of share bits not explicable by closing some of its previous opens.
+- *
+- * XXX: This enforcement is actually incomplete, since we don't keep
+- * track of access/deny bit combinations; so, e.g., we allow:
+- *
+- * OPEN allow read, deny write
+- * OPEN allow both, deny none
+- * DOWNGRADE allow read, deny none
+- *
+- * which we should reject.
++ * A stateid that had a deny mode associated with it is being released
++ * or downgraded. Recalculate the deny mode on the file.
+ */
+-static unsigned int
+-bmap_to_share_mode(unsigned long bmap) {
++static void
++recalculate_deny_mode(struct nfs4_file *fp)
++{
++ struct nfs4_ol_stateid *stp;
++
++ spin_lock(&fp->fi_lock);
++ fp->fi_share_deny = 0;
++ list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
++ fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
++ spin_unlock(&fp->fi_lock);
++}
++
++static void
++reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
++{
+ int i;
+- unsigned int access = 0;
++ bool change = false;
+
+ for (i = 1; i < 4; i++) {
+- if (test_bit(i, &bmap))
+- access |= i;
++ if ((i & deny) != i) {
++ change = true;
++ clear_deny(i, stp);
++ }
+ }
+- return access;
++
++ /* Recalculate per-file deny mode if there was a change */
++ if (change)
++ recalculate_deny_mode(stp->st_stid.sc_file);
+ }
+
+-/* set share access for a given stateid */
+-static inline void
+-set_access(u32 access, struct nfs4_ol_stateid *stp)
++/* release all access and file references for a given stateid */
++static void
++release_all_access(struct nfs4_ol_stateid *stp)
+ {
+- unsigned char mask = 1 << access;
++ int i;
++ struct nfs4_file *fp = stp->st_stid.sc_file;
+
+- WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+- stp->st_access_bmap |= mask;
++ if (fp && stp->st_deny_bmap != 0)
++ recalculate_deny_mode(fp);
++
++ for (i = 1; i < 4; i++) {
++ if (test_access(i, stp))
++ nfs4_file_put_access(stp->st_stid.sc_file, i);
++ clear_access(i, stp);
++ }
+ }
+
+-/* clear share access for a given stateid */
+-static inline void
+-clear_access(u32 access, struct nfs4_ol_stateid *stp)
++static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop)
+ {
+- unsigned char mask = 1 << access;
+-
+- WARN_ON_ONCE(access > NFS4_SHARE_ACCESS_BOTH);
+- stp->st_access_bmap &= ~mask;
++ kfree(sop->so_owner.data);
++ sop->so_ops->so_free(sop);
+ }
+
+-/* test whether a given stateid has access */
+-static inline bool
+-test_access(u32 access, struct nfs4_ol_stateid *stp)
++static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
+ {
+- unsigned char mask = 1 << access;
++ struct nfs4_client *clp = sop->so_client;
+
+- return (bool)(stp->st_access_bmap & mask);
+-}
+-
+-/* set share deny for a given stateid */
+-static inline void
+-set_deny(u32 deny, struct nfs4_ol_stateid *stp)
+-{
+- unsigned char mask = 1 << deny;
+-
+- WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+- stp->st_deny_bmap |= mask;
+-}
+-
+-/* clear share deny for a given stateid */
+-static inline void
+-clear_deny(u32 deny, struct nfs4_ol_stateid *stp)
+-{
+- unsigned char mask = 1 << deny;
+-
+- WARN_ON_ONCE(deny > NFS4_SHARE_DENY_BOTH);
+- stp->st_deny_bmap &= ~mask;
+-}
+-
+-/* test whether a given stateid is denying specific access */
+-static inline bool
+-test_deny(u32 deny, struct nfs4_ol_stateid *stp)
+-{
+- unsigned char mask = 1 << deny;
+-
+- return (bool)(stp->st_deny_bmap & mask);
+-}
+-
+-static int nfs4_access_to_omode(u32 access)
+-{
+- switch (access & NFS4_SHARE_ACCESS_BOTH) {
+- case NFS4_SHARE_ACCESS_READ:
+- return O_RDONLY;
+- case NFS4_SHARE_ACCESS_WRITE:
+- return O_WRONLY;
+- case NFS4_SHARE_ACCESS_BOTH:
+- return O_RDWR;
+- }
+- WARN_ON_ONCE(1);
+- return O_RDONLY;
+-}
+-
+-/*
+- * A stateid that had a deny mode associated with it is being released
+- * or downgraded. Recalculate the deny mode on the file.
+- */
+-static void
+-recalculate_deny_mode(struct nfs4_file *fp)
+-{
+- struct nfs4_ol_stateid *stp;
+-
+- spin_lock(&fp->fi_lock);
+- fp->fi_share_deny = 0;
+- list_for_each_entry(stp, &fp->fi_stateids, st_perfile)
+- fp->fi_share_deny |= bmap_to_share_mode(stp->st_deny_bmap);
+- spin_unlock(&fp->fi_lock);
+-}
+-
+-static void
+-reset_union_bmap_deny(u32 deny, struct nfs4_ol_stateid *stp)
+-{
+- int i;
+- bool change = false;
+-
+- for (i = 1; i < 4; i++) {
+- if ((i & deny) != i) {
+- change = true;
+- clear_deny(i, stp);
+- }
+- }
+-
+- /* Recalculate per-file deny mode if there was a change */
+- if (change)
+- recalculate_deny_mode(stp->st_stid.sc_file);
+-}
+-
+-/* release all access and file references for a given stateid */
+-static void
+-release_all_access(struct nfs4_ol_stateid *stp)
+-{
+- int i;
+- struct nfs4_file *fp = stp->st_stid.sc_file;
+-
+- if (fp && stp->st_deny_bmap != 0)
+- recalculate_deny_mode(fp);
+-
+- for (i = 1; i < 4; i++) {
+- if (test_access(i, stp))
+- nfs4_file_put_access(stp->st_stid.sc_file, i);
+- clear_access(i, stp);
+- }
+-}
+-
+-static inline void nfs4_free_stateowner(struct nfs4_stateowner *sop)
+-{
+- kfree(sop->so_owner.data);
+- sop->so_ops->so_free(sop);
+-}
+-
+-static void nfs4_put_stateowner(struct nfs4_stateowner *sop)
+-{
+- struct nfs4_client *clp = sop->so_client;
+-
+- might_lock(&clp->cl_lock);
+-
+- if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock))
+- return;
+- sop->so_ops->so_unhash(sop);
+- spin_unlock(&clp->cl_lock);
+- nfs4_free_stateowner(sop);
++ might_lock(&clp->cl_lock);
++
++ if (!atomic_dec_and_lock(&sop->so_count, &clp->cl_lock))
++ return;
++ sop->so_ops->so_unhash(sop);
++ spin_unlock(&clp->cl_lock);
++ nfs4_free_stateowner(sop);
+ }
+
+ static bool
+@@ -1710,13 +1832,12 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
+ int numslots = fattrs->maxreqs;
+ int slotsize = slot_bytes(fattrs);
+ struct nfsd4_session *new;
+- int mem, i;
++ int i;
+
+- BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot *)
+- + sizeof(struct nfsd4_session) > PAGE_SIZE);
+- mem = numslots * sizeof(struct nfsd4_slot *);
++ BUILD_BUG_ON(struct_size(new, se_slots, NFSD_MAX_SLOTS_PER_SESSION)
++ > PAGE_SIZE);
+
+- new = kzalloc(sizeof(*new) + mem, GFP_KERNEL);
++ new = kzalloc(struct_size(new, se_slots, numslots), GFP_KERNEL);
+ if (!new)
+ return NULL;
+ /* allocate each struct nfsd4_slot and data cache in one piece */
+@@ -1748,6 +1869,8 @@ static void nfsd4_conn_lost(struct svc_xpt_user *u)
+ struct nfsd4_conn *c = container_of(u, struct nfsd4_conn, cn_xpt_user);
+ struct nfs4_client *clp = c->cn_session->se_client;
+
++ trace_nfsd_cb_lost(clp);
++
+ spin_lock(&clp->cl_lock);
+ if (!list_empty(&c->cn_persession)) {
+ list_del(&c->cn_persession);
+@@ -1959,11 +2082,16 @@ STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
+ * This type of memory management is somewhat inefficient, but we use it
+ * anyway since SETCLIENTID is not a common operation.
+ */
+-static struct nfs4_client *alloc_client(struct xdr_netobj name)
++static struct nfs4_client *alloc_client(struct xdr_netobj name,
++ struct nfsd_net *nn)
+ {
+ struct nfs4_client *clp;
+ int i;
+
++ if (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) {
++ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
++ return NULL;
++ }
+ clp = kmem_cache_zalloc(client_slab, GFP_KERNEL);
+ if (clp == NULL)
+ return NULL;
+@@ -1981,6 +2109,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
+ idr_init(&clp->cl_stateids);
+ atomic_set(&clp->cl_rpc_users, 0);
+ clp->cl_cb_state = NFSD4_CB_UNKNOWN;
++ clp->cl_state = NFSD4_ACTIVE;
++ atomic_inc(&nn->nfs4_client_count);
++ atomic_set(&clp->cl_delegs_in_recall, 0);
+ INIT_LIST_HEAD(&clp->cl_idhash);
+ INIT_LIST_HEAD(&clp->cl_openowners);
+ INIT_LIST_HEAD(&clp->cl_delegations);
+@@ -2012,6 +2143,7 @@ static void __free_client(struct kref *k)
+ kfree(clp->cl_nii_domain.data);
+ kfree(clp->cl_nii_name.data);
+ idr_destroy(&clp->cl_stateids);
++ kfree(clp->cl_ra);
+ kmem_cache_free(client_slab, clp);
+ }
+
+@@ -2087,6 +2219,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp)
+ static void
+ __destroy_client(struct nfs4_client *clp)
+ {
++ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ int i;
+ struct nfs4_openowner *oo;
+ struct nfs4_delegation *dp;
+@@ -2130,6 +2263,8 @@ __destroy_client(struct nfs4_client *clp)
+ nfsd4_shutdown_callback(clp);
+ if (clp->cl_cb_conn.cb_xprt)
+ svc_xprt_put(clp->cl_cb_conn.cb_xprt);
++ atomic_add_unless(&nn->nfs4_client_count, -1, 0);
++ nfsd4_dec_courtesy_client_count(nn, clp);
+ free_client(clp);
+ wake_up_all(&expiry_wq);
+ }
+@@ -2358,9 +2493,24 @@ static void seq_quote_mem(struct seq_file *m, char *data, int len)
+ seq_printf(m, "\"");
+ }
+
++static const char *cb_state2str(int state)
++{
++ switch (state) {
++ case NFSD4_CB_UP:
++ return "UP";
++ case NFSD4_CB_UNKNOWN:
++ return "UNKNOWN";
++ case NFSD4_CB_DOWN:
++ return "DOWN";
++ case NFSD4_CB_FAULT:
++ return "FAULT";
++ }
++ return "UNDEFINED";
++}
++
+ static int client_info_show(struct seq_file *m, void *v)
+ {
+- struct inode *inode = m->private;
++ struct inode *inode = file_inode(m->file);
+ struct nfs4_client *clp;
+ u64 clid;
+
+@@ -2370,6 +2520,17 @@ static int client_info_show(struct seq_file *m, void *v)
+ memcpy(&clid, &clp->cl_clientid, sizeof(clid));
+ seq_printf(m, "clientid: 0x%llx\n", clid);
+ seq_printf(m, "address: \"%pISpc\"\n", (struct sockaddr *)&clp->cl_addr);
++
++ if (clp->cl_state == NFSD4_COURTESY)
++ seq_puts(m, "status: courtesy\n");
++ else if (clp->cl_state == NFSD4_EXPIRABLE)
++ seq_puts(m, "status: expirable\n");
++ else if (test_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags))
++ seq_puts(m, "status: confirmed\n");
++ else
++ seq_puts(m, "status: unconfirmed\n");
++ seq_printf(m, "seconds from last renew: %lld\n",
++ ktime_get_boottime_seconds() - clp->cl_time);
+ seq_printf(m, "name: ");
+ seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
+ seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
+@@ -2382,22 +2543,14 @@ static int client_info_show(struct seq_file *m, void *v)
+ seq_printf(m, "\nImplementation time: [%lld, %ld]\n",
+ clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
+ }
++ seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state));
++ seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr);
+ drop_client(clp);
+
+ return 0;
+ }
+
+-static int client_info_open(struct inode *inode, struct file *file)
+-{
+- return single_open(file, client_info_show, inode);
+-}
+-
+-static const struct file_operations client_info_fops = {
+- .open = client_info_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = single_release,
+-};
++DEFINE_SHOW_ATTRIBUTE(client_info);
+
+ static void *states_start(struct seq_file *s, loff_t *pos)
+ __acquires(&clp->cl_lock)
+@@ -2440,7 +2593,7 @@ static void nfs4_show_fname(struct seq_file *s, struct nfsd_file *f)
+
+ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
+ {
+- struct inode *inode = f->nf_inode;
++ struct inode *inode = file_inode(f->nf_file);
+
+ seq_printf(s, "superblock: \"%02x:%02x:%ld\"",
+ MAJOR(inode->i_sb->s_dev),
+@@ -2668,6 +2821,8 @@ static void force_expire_client(struct nfs4_client *clp)
+ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+ bool already_expired;
+
++ trace_nfsd_clid_admin_expired(&clp->cl_clientid);
++
+ spin_lock(&nn->client_lock);
+ clp->cl_time = 0;
+ spin_unlock(&nn->client_lock);
+@@ -2716,6 +2871,36 @@ static const struct tree_descr client_files[] = {
+ [3] = {""},
+ };
+
++static int
++nfsd4_cb_recall_any_done(struct nfsd4_callback *cb,
++ struct rpc_task *task)
++{
++ switch (task->tk_status) {
++ case -NFS4ERR_DELAY:
++ rpc_delay(task, 2 * HZ);
++ return 0;
++ default:
++ return 1;
++ }
++}
++
++static void
++nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
++{
++ struct nfs4_client *clp = cb->cb_clp;
++ struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
++
++ spin_lock(&nn->client_lock);
++ clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
++ put_client_renew_locked(clp);
++ spin_unlock(&nn->client_lock);
++}
++
++static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
++ .done = nfsd4_cb_recall_any_done,
++ .release = nfsd4_cb_recall_any_release,
++};
++
+ static struct nfs4_client *create_client(struct xdr_netobj name,
+ struct svc_rqst *rqstp, nfs4_verifier *verf)
+ {
+@@ -2724,8 +2909,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
+ int ret;
+ struct net *net = SVC_NET(rqstp);
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
++ struct dentry *dentries[ARRAY_SIZE(client_files)];
+
+- clp = alloc_client(name);
++ clp = alloc_client(name, nn);
+ if (clp == NULL)
+ return NULL;
+
+@@ -2743,13 +2929,23 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
+ memcpy(&clp->cl_addr, sa, sizeof(struct sockaddr_storage));
+ clp->cl_cb_session = NULL;
+ clp->net = net;
+- clp->cl_nfsd_dentry = nfsd_client_mkdir(nn, &clp->cl_nfsdfs,
+- clp->cl_clientid.cl_id - nn->clientid_base,
+- client_files);
++ clp->cl_nfsd_dentry = nfsd_client_mkdir(
++ nn, &clp->cl_nfsdfs,
++ clp->cl_clientid.cl_id - nn->clientid_base,
++ client_files, dentries);
++ clp->cl_nfsd_info_dentry = dentries[0];
+ if (!clp->cl_nfsd_dentry) {
+ free_client(clp);
+ return NULL;
+ }
++ clp->cl_ra = kzalloc(sizeof(*clp->cl_ra), GFP_KERNEL);
++ if (!clp->cl_ra) {
++ free_client(clp);
++ return NULL;
++ }
++ clp->cl_ra_time = 0;
++ nfsd4_init_cb(&clp->cl_ra->ra_cb, clp, &nfsd4_cb_recall_any_ops,
++ NFSPROC4_CLNT_CB_RECALL_ANY);
+ return clp;
+ }
+
+@@ -2816,11 +3012,11 @@ move_to_confirmed(struct nfs4_client *clp)
+
+ lockdep_assert_held(&nn->client_lock);
+
+- dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
+ list_move(&clp->cl_idhash, &nn->conf_id_hashtbl[idhashval]);
+ rb_erase(&clp->cl_namenode, &nn->unconf_name_tree);
+ add_clp_to_name_tree(clp, &nn->conf_name_tree);
+ set_bit(NFSD4_CLIENT_CONFIRMED, &clp->cl_flags);
++ trace_nfsd_clid_confirmed(&clp->cl_clientid);
+ renew_client_locked(clp);
+ }
+
+@@ -2925,7 +3121,7 @@ gen_callback(struct nfs4_client *clp, struct nfsd4_setclientid *se, struct svc_r
+ static void
+ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
+ {
+- struct xdr_buf *buf = resp->xdr.buf;
++ struct xdr_buf *buf = resp->xdr->buf;
+ struct nfsd4_slot *slot = resp->cstate.slot;
+ unsigned int base;
+
+@@ -2995,7 +3191,7 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
+ struct nfsd4_sequence *seq)
+ {
+ struct nfsd4_slot *slot = resp->cstate.slot;
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+ __be32 status;
+
+@@ -3089,7 +3285,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+
+ rpc_ntop(sa, addr_str, sizeof(addr_str));
+ dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
+- "ip_addr=%s flags %x, spa_how %d\n",
++ "ip_addr=%s flags %x, spa_how %u\n",
+ __func__, rqstp, exid, exid->clname.len, exid->clname.data,
+ addr_str, exid->flags, exid->spa_how);
+
+@@ -3136,6 +3332,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ goto out_nolock;
+ }
+ new->cl_mach_cred = true;
++ break;
+ case SP4_NONE:
+ break;
+ default: /* checked by xdr code */
+@@ -3172,20 +3369,24 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ }
+ /* case 6 */
+ exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
++ trace_nfsd_clid_confirmed_r(conf);
+ goto out_copy;
+ }
+ if (!creds_match) { /* case 3 */
+ if (client_has_state(conf)) {
+ status = nfserr_clid_inuse;
++ trace_nfsd_clid_cred_mismatch(conf, rqstp);
+ goto out;
+ }
+ goto out_new;
+ }
+ if (verfs_match) { /* case 2 */
+ conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
++ trace_nfsd_clid_confirmed_r(conf);
+ goto out_copy;
+ }
+ /* case 5, client reboot */
++ trace_nfsd_clid_verf_mismatch(conf, rqstp, &verf);
+ conf = NULL;
+ goto out_new;
+ }
+@@ -3195,16 +3396,19 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ goto out;
+ }
+
+- unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
++ unconf = find_unconfirmed_client_by_name(&exid->clname, nn);
+ if (unconf) /* case 4, possible retry or client restart */
+ unhash_client_locked(unconf);
+
+- /* case 1 (normal case) */
++ /* case 1, new owner ID */
++ trace_nfsd_clid_fresh(new);
++
+ out_new:
+ if (conf) {
+ status = mark_client_expired_locked(conf);
+ if (status)
+ goto out;
++ trace_nfsd_clid_replaced(&conf->cl_clientid);
+ }
+ new->cl_minorversion = cstate->minorversion;
+ new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
+@@ -3228,8 +3432,10 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ out_nolock:
+ if (new)
+ expire_client(new);
+- if (unconf)
++ if (unconf) {
++ trace_nfsd_clid_expire_unconf(&unconf->cl_clientid);
+ expire_client(unconf);
++ }
+ return status;
+ }
+
+@@ -3421,9 +3627,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
+ goto out_free_conn;
+ }
+ } else if (unconf) {
++ status = nfserr_clid_inuse;
+ if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
+ !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
+- status = nfserr_clid_inuse;
++ trace_nfsd_clid_cred_mismatch(unconf, rqstp);
+ goto out_free_conn;
+ }
+ status = nfserr_wrong_cred;
+@@ -3443,6 +3650,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
+ old = NULL;
+ goto out_free_conn;
+ }
++ trace_nfsd_clid_replaced(&old->cl_clientid);
+ }
+ move_to_confirmed(unconf);
+ conf = unconf;
+@@ -3467,6 +3675,8 @@ nfsd4_create_session(struct svc_rqst *rqstp,
+ /* cache solo and embedded create sessions under the client_lock */
+ nfsd4_cache_create_session(cr_ses, cs_slot, status);
+ spin_unlock(&nn->client_lock);
++ if (conf == unconf)
++ fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY);
+ /* init connection and backchannel */
+ nfsd4_init_conn(rqstp, conn, new);
+ nfsd4_put_session(new);
+@@ -3740,7 +3950,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ {
+ struct nfsd4_sequence *seq = &u->sequence;
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+ struct nfsd4_session *session;
+ struct nfs4_client *clp;
+ struct nfsd4_slot *slot;
+@@ -3910,6 +4120,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp,
+ status = nfserr_wrong_cred;
+ goto out;
+ }
++ trace_nfsd_clid_destroyed(&clp->cl_clientid);
+ unhash_client_locked(clp);
+ out:
+ spin_unlock(&nn->client_lock);
+@@ -3923,6 +4134,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, union nfsd4_op_u *u)
+ {
+ struct nfsd4_reclaim_complete *rc = &u->reclaim_complete;
++ struct nfs4_client *clp = cstate->clp;
+ __be32 status = 0;
+
+ if (rc->rca_one_fs) {
+@@ -3936,12 +4148,11 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
+ }
+
+ status = nfserr_complete_already;
+- if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE,
+- &cstate->session->se_client->cl_flags))
++ if (test_and_set_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
+ goto out;
+
+ status = nfserr_stale_clientid;
+- if (is_client_expired(cstate->session->se_client))
++ if (is_client_expired(clp))
+ /*
+ * The following error isn't really legal.
+ * But we only get here if the client just explicitly
+@@ -3952,8 +4163,9 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp,
+ goto out;
+
+ status = nfs_ok;
+- nfsd4_client_record_create(cstate->session->se_client);
+- inc_reclaim_complete(cstate->session->se_client);
++ trace_nfsd_clid_reclaim_complete(&clp->cl_clientid);
++ nfsd4_client_record_create(clp);
++ inc_reclaim_complete(clp);
+ out:
+ return status;
+ }
+@@ -3973,27 +4185,29 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ new = create_client(clname, rqstp, &clverifier);
+ if (new == NULL)
+ return nfserr_jukebox;
+- /* Cases below refer to rfc 3530 section 14.2.33: */
+ spin_lock(&nn->client_lock);
+ conf = find_confirmed_client_by_name(&clname, nn);
+ if (conf && client_has_state(conf)) {
+- /* case 0: */
+ status = nfserr_clid_inuse;
+ if (clp_used_exchangeid(conf))
+ goto out;
+ if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
+- trace_nfsd_clid_inuse_err(conf);
++ trace_nfsd_clid_cred_mismatch(conf, rqstp);
+ goto out;
+ }
+ }
+ unconf = find_unconfirmed_client_by_name(&clname, nn);
+ if (unconf)
+ unhash_client_locked(unconf);
+- /* We need to handle only case 1: probable callback update */
+- if (conf && same_verf(&conf->cl_verifier, &clverifier)) {
+- copy_clid(new, conf);
+- gen_confirm(new, nn);
+- }
++ if (conf) {
++ if (same_verf(&conf->cl_verifier, &clverifier)) {
++ copy_clid(new, conf);
++ gen_confirm(new, nn);
++ } else
++ trace_nfsd_clid_verf_mismatch(conf, rqstp,
++ &clverifier);
++ } else
++ trace_nfsd_clid_fresh(new);
+ new->cl_minorversion = 0;
+ gen_callback(new, setclid, rqstp);
+ add_to_unconfirmed(new);
+@@ -4006,12 +4220,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ spin_unlock(&nn->client_lock);
+ if (new)
+ free_client(new);
+- if (unconf)
++ if (unconf) {
++ trace_nfsd_clid_expire_unconf(&unconf->cl_clientid);
+ expire_client(unconf);
++ }
+ return status;
+ }
+
+-
+ __be32
+ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+@@ -4040,25 +4255,27 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+ * Nevertheless, RFC 7530 recommends INUSE for this case:
+ */
+ status = nfserr_clid_inuse;
+- if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
++ if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
++ trace_nfsd_clid_cred_mismatch(unconf, rqstp);
+ goto out;
+- if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
++ }
++ if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
++ trace_nfsd_clid_cred_mismatch(conf, rqstp);
+ goto out;
+- /* cases below refer to rfc 3530 section 14.2.34: */
++ }
+ if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
+ if (conf && same_verf(&confirm, &conf->cl_confirm)) {
+- /* case 2: probable retransmit */
+ status = nfs_ok;
+- } else /* case 4: client hasn't noticed we rebooted yet? */
++ } else
+ status = nfserr_stale_clientid;
+ goto out;
+ }
+ status = nfs_ok;
+- if (conf) { /* case 1: callback update */
++ if (conf) {
+ old = unconf;
+ unhash_client_locked(old);
+ nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+- } else { /* case 3: normal case; new or rebooted client */
++ } else {
+ old = find_confirmed_client_by_name(&unconf->cl_name, nn);
+ if (old) {
+ status = nfserr_clid_inuse;
+@@ -4073,12 +4290,15 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
+ old = NULL;
+ goto out;
+ }
++ trace_nfsd_clid_replaced(&old->cl_clientid);
+ }
+ move_to_confirmed(unconf);
+ conf = unconf;
+ }
+ get_client_locked(conf);
+ spin_unlock(&nn->client_lock);
++ if (conf == unconf)
++ fsnotify_dentry(conf->cl_nfsd_info_dentry, FS_MODIFY);
+ nfsd4_probe_callback(conf);
+ spin_lock(&nn->client_lock);
+ put_client_renew_locked(conf);
+@@ -4095,27 +4315,26 @@ static struct nfs4_file *nfsd4_alloc_file(void)
+ }
+
+ /* OPEN Share state helper functions */
+-static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
+- struct nfs4_file *fp)
+-{
+- lockdep_assert_held(&state_lock);
+
++static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
++{
+ refcount_set(&fp->fi_ref, 1);
+ spin_lock_init(&fp->fi_lock);
+ INIT_LIST_HEAD(&fp->fi_stateids);
+ INIT_LIST_HEAD(&fp->fi_delegations);
+ INIT_LIST_HEAD(&fp->fi_clnt_odstate);
+- fh_copy_shallow(&fp->fi_fhandle, fh);
++ fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle);
+ fp->fi_deleg_file = NULL;
+ fp->fi_had_conflict = false;
+ fp->fi_share_deny = 0;
+ memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
+ memset(fp->fi_access, 0, sizeof(fp->fi_access));
++ fp->fi_aliased = false;
++ fp->fi_inode = d_inode(fh->fh_dentry);
+ #ifdef CONFIG_NFSD_PNFS
+ INIT_LIST_HEAD(&fp->fi_lo_states);
+ atomic_set(&fp->fi_lo_recalls, 0);
+ #endif
+- hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
+ }
+
+ void
+@@ -4179,6 +4398,51 @@ nfsd4_init_slabs(void)
+ return -ENOMEM;
+ }
+
++static unsigned long
++nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc)
++{
++ int count;
++ struct nfsd_net *nn = container_of(shrink,
++ struct nfsd_net, nfsd_client_shrinker);
++
++ count = atomic_read(&nn->nfsd_courtesy_clients);
++ if (!count)
++ count = atomic_long_read(&num_delegations);
++ if (count)
++ queue_work(laundry_wq, &nn->nfsd_shrinker_work);
++ return (unsigned long)count;
++}
++
++static unsigned long
++nfsd4_state_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc)
++{
++ return SHRINK_STOP;
++}
++
++void
++nfsd4_init_leases_net(struct nfsd_net *nn)
++{
++ struct sysinfo si;
++ u64 max_clients;
++
++ nn->nfsd4_lease = 90; /* default lease time */
++ nn->nfsd4_grace = 90;
++ nn->somebody_reclaimed = false;
++ nn->track_reclaim_completes = false;
++ nn->clverifier_counter = prandom_u32();
++ nn->clientid_base = prandom_u32();
++ nn->clientid_counter = nn->clientid_base + 1;
++ nn->s2s_cp_cl_id = nn->clientid_counter++;
++
++ atomic_set(&nn->nfs4_client_count, 0);
++ si_meminfo(&si);
++ max_clients = (u64)si.totalram * si.mem_unit / (1024 * 1024 * 1024);
++ max_clients *= NFS4_CLIENTS_PER_GB;
++ nn->nfs4_max_clients = max_t(int, max_clients, NFS4_CLIENTS_PER_GB);
++
++ atomic_set(&nn->nfsd_courtesy_clients, 0);
++}
++
+ static void init_nfs4_replay(struct nfs4_replay *rp)
+ {
+ rp->rp_status = nfserr_serverfault;
+@@ -4447,55 +4711,80 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net)
+ nfs4_put_stid(&last->st_stid);
+ }
+
+-/* search file_hashtbl[] for file */
+-static struct nfs4_file *
+-find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
++static noinline_for_stack struct nfs4_file *
++nfsd4_file_hash_lookup(const struct svc_fh *fhp)
+ {
+- struct nfs4_file *fp;
++ struct inode *inode = d_inode(fhp->fh_dentry);
++ struct rhlist_head *tmp, *list;
++ struct nfs4_file *fi;
+
+- hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash,
+- lockdep_is_held(&state_lock)) {
+- if (fh_match(&fp->fi_fhandle, fh)) {
+- if (refcount_inc_not_zero(&fp->fi_ref))
+- return fp;
++ rcu_read_lock();
++ list = rhltable_lookup(&nfs4_file_rhltable, &inode,
++ nfs4_file_rhash_params);
++ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
++ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
++ if (refcount_inc_not_zero(&fi->fi_ref)) {
++ rcu_read_unlock();
++ return fi;
++ }
+ }
+ }
++ rcu_read_unlock();
+ return NULL;
+ }
+
+-struct nfs4_file *
+-find_file(struct knfsd_fh *fh)
+-{
+- struct nfs4_file *fp;
+- unsigned int hashval = file_hashval(fh);
++/*
++ * On hash insertion, identify entries with the same inode but
++ * distinct filehandles. They will all be on the list returned
++ * by rhltable_lookup().
++ *
++ * inode->i_lock prevents racing insertions from adding an entry
++ * for the same inode/fhp pair twice.
++ */
++static noinline_for_stack struct nfs4_file *
++nfsd4_file_hash_insert(struct nfs4_file *new, const struct svc_fh *fhp)
++{
++ struct inode *inode = d_inode(fhp->fh_dentry);
++ struct rhlist_head *tmp, *list;
++ struct nfs4_file *ret = NULL;
++ bool alias_found = false;
++ struct nfs4_file *fi;
++ int err;
+
+ rcu_read_lock();
+- fp = find_file_locked(fh, hashval);
+- rcu_read_unlock();
+- return fp;
+-}
++ spin_lock(&inode->i_lock);
+
+-static struct nfs4_file *
+-find_or_add_file(struct nfs4_file *new, struct knfsd_fh *fh)
+-{
+- struct nfs4_file *fp;
+- unsigned int hashval = file_hashval(fh);
++ list = rhltable_lookup(&nfs4_file_rhltable, &inode,
++ nfs4_file_rhash_params);
++ rhl_for_each_entry_rcu(fi, tmp, list, fi_rlist) {
++ if (fh_match(&fi->fi_fhandle, &fhp->fh_handle)) {
++ if (refcount_inc_not_zero(&fi->fi_ref))
++ ret = fi;
++ } else
++ fi->fi_aliased = alias_found = true;
++ }
++ if (ret)
++ goto out_unlock;
+
+- rcu_read_lock();
+- fp = find_file_locked(fh, hashval);
+- rcu_read_unlock();
+- if (fp)
+- return fp;
++ nfsd4_file_init(fhp, new);
++ err = rhltable_insert(&nfs4_file_rhltable, &new->fi_rlist,
++ nfs4_file_rhash_params);
++ if (err)
++ goto out_unlock;
+
+- spin_lock(&state_lock);
+- fp = find_file_locked(fh, hashval);
+- if (likely(fp == NULL)) {
+- nfsd4_init_file(fh, hashval, new);
+- fp = new;
+- }
+- spin_unlock(&state_lock);
++ new->fi_aliased = alias_found;
++ ret = new;
+
+- return fp;
++out_unlock:
++ spin_unlock(&inode->i_lock);
++ rcu_read_unlock();
++ return ret;
++}
++
++static noinline_for_stack void nfsd4_file_hash_remove(struct nfs4_file *fi)
++{
++ rhltable_remove(&nfs4_file_rhltable, &fi->fi_rlist,
++ nfs4_file_rhash_params);
+ }
+
+ /*
+@@ -4508,9 +4797,10 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
+ struct nfs4_file *fp;
+ __be32 ret = nfs_ok;
+
+- fp = find_file(&current_fh->fh_handle);
++ fp = nfsd4_file_hash_lookup(current_fh);
+ if (!fp)
+ return ret;
++
+ /* Check for conflicting share reservations */
+ spin_lock(&fp->fi_lock);
+ if (fp->fi_share_deny & deny_type)
+@@ -4520,6 +4810,35 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
+ return ret;
+ }
+
++static bool nfsd4_deleg_present(const struct inode *inode)
++{
++ struct file_lock_context *ctx = locks_inode_context(inode);
++
++ return ctx && !list_empty_careful(&ctx->flc_lease);
++}
++
++/**
++ * nfsd_wait_for_delegreturn - wait for delegations to be returned
++ * @rqstp: the RPC transaction being executed
++ * @inode: in-core inode of the file being waited for
++ *
++ * The timeout prevents deadlock if all nfsd threads happen to be
++ * tied up waiting for returning delegations.
++ *
++ * Return values:
++ * %true: delegation was returned
++ * %false: timed out waiting for delegreturn
++ */
++bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode)
++{
++ long __maybe_unused timeo;
++
++ timeo = wait_var_event_timeout(inode, !nfsd4_deleg_present(inode),
++ NFSD_DELEGRETURN_TIMEOUT);
++ trace_nfsd_delegret_wakeup(rqstp, inode, timeo);
++ return timeo > 0;
++}
++
+ static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb)
+ {
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
+@@ -4548,6 +4867,8 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
+ {
+ struct nfs4_delegation *dp = cb_to_delegation(cb);
+
++ trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task);
++
+ if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
+ dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
+ return 1;
+@@ -4593,22 +4914,30 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
+ * We're assuming the state code never drops its reference
+ * without first removing the lease. Since we're in this lease
+ * callback (and since the lease code is serialized by the
+- * i_lock) we know the server hasn't removed the lease yet, and
++ * flc_lock) we know the server hasn't removed the lease yet, and
+ * we know it's safe to take a reference.
+ */
+ refcount_inc(&dp->dl_stid.sc_count);
+- nfsd4_run_cb(&dp->dl_recall);
++ WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall));
+ }
+
+-/* Called from break_lease() with i_lock held. */
++/* Called from break_lease() with flc_lock held. */
+ static bool
+ nfsd_break_deleg_cb(struct file_lock *fl)
+ {
+- bool ret = false;
+ struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+ struct nfs4_file *fp = dp->dl_stid.sc_file;
++ struct nfs4_client *clp = dp->dl_stid.sc_client;
++ struct nfsd_net *nn;
++
++ trace_nfsd_cb_recall(&dp->dl_stid);
+
+- trace_nfsd_deleg_break(&dp->dl_stid.sc_stateid);
++ dp->dl_recalled = true;
++ atomic_inc(&clp->cl_delegs_in_recall);
++ if (try_to_expire_client(clp)) {
++ nn = net_generic(clp->net, nfsd_net_id);
++ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
++ }
+
+ /*
+ * We don't want the locks code to timeout the lease for us;
+@@ -4617,11 +4946,9 @@ nfsd_break_deleg_cb(struct file_lock *fl)
+ */
+ fl->fl_break_time = 0;
+
+- spin_lock(&fp->fi_lock);
+ fp->fi_had_conflict = true;
+ nfsd_break_one_deleg(dp);
+- spin_unlock(&fp->fi_lock);
+- return ret;
++ return false;
+ }
+
+ /**
+@@ -4652,9 +4979,14 @@ static int
+ nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+ struct list_head *dispose)
+ {
+- if (arg & F_UNLCK)
++ struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner;
++ struct nfs4_client *clp = dp->dl_stid.sc_client;
++
++ if (arg & F_UNLCK) {
++ if (dp->dl_recalled)
++ atomic_dec(&clp->cl_delegs_in_recall);
+ return lease_modify(onlist, arg, dispose);
+- else
++ } else
+ return -EAGAIN;
+ }
+
+@@ -4675,40 +5007,37 @@ static __be32 nfsd4_check_seqid(struct nfsd4_compound_state *cstate, struct nfs4
+ return nfserr_bad_seqid;
+ }
+
+-static __be32 lookup_clientid(clientid_t *clid,
+- struct nfsd4_compound_state *cstate,
+- struct nfsd_net *nn,
+- bool sessions)
++static struct nfs4_client *lookup_clientid(clientid_t *clid, bool sessions,
++ struct nfsd_net *nn)
+ {
+ struct nfs4_client *found;
+
++ spin_lock(&nn->client_lock);
++ found = find_confirmed_client(clid, sessions, nn);
++ if (found)
++ atomic_inc(&found->cl_rpc_users);
++ spin_unlock(&nn->client_lock);
++ return found;
++}
++
++static __be32 set_client(clientid_t *clid,
++ struct nfsd4_compound_state *cstate,
++ struct nfsd_net *nn)
++{
+ if (cstate->clp) {
+- found = cstate->clp;
+- if (!same_clid(&found->cl_clientid, clid))
++ if (!same_clid(&cstate->clp->cl_clientid, clid))
+ return nfserr_stale_clientid;
+ return nfs_ok;
+ }
+-
+ if (STALE_CLIENTID(clid, nn))
+ return nfserr_stale_clientid;
+-
+ /*
+- * For v4.1+ we get the client in the SEQUENCE op. If we don't have one
+- * cached already then we know this is for is for v4.0 and "sessions"
+- * will be false.
++ * We're in the 4.0 case (otherwise the SEQUENCE op would have
++ * set cstate->clp), so session = false:
+ */
+- WARN_ON_ONCE(cstate->session);
+- spin_lock(&nn->client_lock);
+- found = find_confirmed_client(clid, sessions, nn);
+- if (!found) {
+- spin_unlock(&nn->client_lock);
++ cstate->clp = lookup_clientid(clid, false, nn);
++ if (!cstate->clp)
+ return nfserr_expired;
+- }
+- atomic_inc(&found->cl_rpc_users);
+- spin_unlock(&nn->client_lock);
+-
+- /* Cache the nfs4_client in cstate! */
+- cstate->clp = found;
+ return nfs_ok;
+ }
+
+@@ -4722,8 +5051,6 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+ struct nfs4_openowner *oo = NULL;
+ __be32 status;
+
+- if (STALE_CLIENTID(&open->op_clientid, nn))
+- return nfserr_stale_clientid;
+ /*
+ * In case we need it later, after we've already created the
+ * file and don't want to risk a further failure:
+@@ -4732,7 +5059,7 @@ nfsd4_process_open1(struct nfsd4_compound_state *cstate,
+ if (open->op_file == NULL)
+ return nfserr_jukebox;
+
+- status = lookup_clientid(clientid, cstate, nn, false);
++ status = set_client(clientid, cstate, nn);
+ if (status)
+ return status;
+ clp = cstate->clp;
+@@ -4856,16 +5183,19 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
+ .ia_valid = ATTR_SIZE,
+ .ia_size = 0,
+ };
++ struct nfsd_attrs attrs = {
++ .na_iattr = &iattr,
++ };
+ if (!open->op_truncate)
+ return 0;
+ if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
+ return nfserr_inval;
+- return nfsd_setattr(rqstp, fh, &iattr, 0, (time64_t)0);
++ return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0);
+ }
+
+ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
+- struct nfsd4_open *open)
++ struct nfsd4_open *open, bool new_stp)
+ {
+ struct nfsd_file *nf = NULL;
+ __be32 status;
+@@ -4881,6 +5211,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ */
+ status = nfs4_file_check_deny(fp, open->op_share_deny);
+ if (status != nfs_ok) {
++ if (status != nfserr_share_denied) {
++ spin_unlock(&fp->fi_lock);
++ goto out;
++ }
++ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
++ stp, open->op_share_deny, false))
++ status = nfserr_jukebox;
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+@@ -4888,6 +5225,13 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ /* set access to the file */
+ status = nfs4_file_get_access(fp, open->op_share_access);
+ if (status != nfs_ok) {
++ if (status != nfserr_share_denied) {
++ spin_unlock(&fp->fi_lock);
++ goto out;
++ }
++ if (nfs4_resolve_deny_conflicts_locked(fp, new_stp,
++ stp, open->op_share_access, true))
++ status = nfserr_jukebox;
+ spin_unlock(&fp->fi_lock);
+ goto out;
+ }
+@@ -4903,9 +5247,12 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+
+ if (!fp->fi_fds[oflag]) {
+ spin_unlock(&fp->fi_lock);
+- status = nfsd_file_acquire(rqstp, cur_fh, access, &nf);
+- if (status)
++
++ status = nfsd_file_acquire_opened(rqstp, cur_fh, access,
++ open->op_filp, &nf);
++ if (status != nfs_ok)
+ goto out_put_access;
++
+ spin_lock(&fp->fi_lock);
+ if (!fp->fi_fds[oflag]) {
+ fp->fi_fds[oflag] = nf;
+@@ -4934,21 +5281,30 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
+ }
+
+ static __be32
+-nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open)
++nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp,
++ struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp,
++ struct nfsd4_open *open)
+ {
+ __be32 status;
+ unsigned char old_deny_bmap = stp->st_deny_bmap;
+
+ if (!test_access(open->op_share_access, stp))
+- return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open);
++ return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open, false);
+
+ /* test and set deny mode */
+ spin_lock(&fp->fi_lock);
+ status = nfs4_file_check_deny(fp, open->op_share_deny);
+- if (status == nfs_ok) {
++ switch (status) {
++ case nfs_ok:
+ set_deny(open->op_share_deny, stp);
+ fp->fi_share_deny |=
+- (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
++ (open->op_share_deny & NFS4_SHARE_DENY_BOTH);
++ break;
++ case nfserr_share_denied:
++ if (nfs4_resolve_deny_conflicts_locked(fp, false,
++ stp, open->op_share_deny, false))
++ status = nfserr_jukebox;
++ break;
+ }
+ spin_unlock(&fp->fi_lock);
+
+@@ -4992,11 +5348,118 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
+ return fl;
+ }
+
++static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
++ struct nfs4_file *fp)
++{
++ struct nfs4_ol_stateid *st;
++ struct file *f = fp->fi_deleg_file->nf_file;
++ struct inode *ino = locks_inode(f);
++ int writes;
++
++ writes = atomic_read(&ino->i_writecount);
++ if (!writes)
++ return 0;
++ /*
++ * There could be multiple filehandles (hence multiple
++ * nfs4_files) referencing this file, but that's not too
++ * common; let's just give up in that case rather than
++ * trying to go look up all the clients using that other
++ * nfs4_file as well:
++ */
++ if (fp->fi_aliased)
++ return -EAGAIN;
++ /*
++ * If there's a close in progress, make sure that we see it
++ * clear any fi_fds[] entries before we see it decrement
++ * i_writecount:
++ */
++ smp_mb__after_atomic();
++
++ if (fp->fi_fds[O_WRONLY])
++ writes--;
++ if (fp->fi_fds[O_RDWR])
++ writes--;
++ if (writes > 0)
++ return -EAGAIN; /* There may be non-NFSv4 writers */
++ /*
++ * It's possible there are non-NFSv4 write opens in progress,
++ * but if they haven't incremented i_writecount yet then they
++ * also haven't called break lease yet; so, they'll break this
++ * lease soon enough. So, all that's left to check for is NFSv4
++ * opens:
++ */
++ spin_lock(&fp->fi_lock);
++ list_for_each_entry(st, &fp->fi_stateids, st_perfile) {
++ if (st->st_openstp == NULL /* it's an open */ &&
++ access_permit_write(st) &&
++ st->st_stid.sc_client != clp) {
++ spin_unlock(&fp->fi_lock);
++ return -EAGAIN;
++ }
++ }
++ spin_unlock(&fp->fi_lock);
++ /*
++ * There's a small chance that we could be racing with another
++ * NFSv4 open. However, any open that hasn't added itself to
++ * the fi_stateids list also hasn't called break_lease yet; so,
++ * they'll break this lease soon enough.
++ */
++ return 0;
++}
++
++/*
++ * It's possible that between opening the dentry and setting the delegation,
++ * that it has been renamed or unlinked. Redo the lookup to verify that this
++ * hasn't happened.
++ */
++static int
++nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp,
++ struct svc_fh *parent)
++{
++ struct svc_export *exp;
++ struct dentry *child;
++ __be32 err;
++
++ err = nfsd_lookup_dentry(open->op_rqstp, parent,
++ open->op_fname, open->op_fnamelen,
++ &exp, &child);
++
++ if (err)
++ return -EAGAIN;
++
++ exp_put(exp);
++ dput(child);
++ if (child != file_dentry(fp->fi_deleg_file->nf_file))
++ return -EAGAIN;
++
++ return 0;
++}
++
++/*
++ * We avoid breaking delegations held by a client due to its own activity, but
++ * clearing setuid/setgid bits on a write is an implicit activity and the client
++ * may not notice and continue using the old mode. Avoid giving out a delegation
++ * on setuid/setgid files when the client is requesting an open for write.
++ */
++static int
++nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf)
++{
++ struct inode *inode = file_inode(nf->nf_file);
++
++ if ((open->op_share_access & NFS4_SHARE_ACCESS_WRITE) &&
++ (inode->i_mode & (S_ISUID|S_ISGID)))
++ return -EAGAIN;
++ return 0;
++}
++
+ static struct nfs4_delegation *
+-nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
+- struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate)
++nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
++ struct svc_fh *parent)
+ {
+ int status = 0;
++ struct nfs4_client *clp = stp->st_stid.sc_client;
++ struct nfs4_file *fp = stp->st_stid.sc_file;
++ struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
+ struct nfs4_delegation *dp;
+ struct nfsd_file *nf;
+ struct file_lock *fl;
+@@ -5011,14 +5474,19 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
+
+ nf = find_readable_file(fp);
+ if (!nf) {
+- /* We should always have a readable file here */
+- WARN_ON_ONCE(1);
+- return ERR_PTR(-EBADF);
++ /*
++ * We probably could attempt another open and get a read
++ * delegation, but for now, don't bother until the
++ * client actually sends us one.
++ */
++ return ERR_PTR(-EAGAIN);
+ }
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ if (nfs4_delegation_exists(clp, fp))
+ status = -EAGAIN;
++ else if (nfsd4_verify_setuid_write(open, nf))
++ status = -EAGAIN;
+ else if (!fp->fi_deleg_file) {
+ fp->fi_deleg_file = nf;
+ /* increment early to prevent fi_deleg_file from being
+@@ -5035,7 +5503,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
+ return ERR_PTR(status);
+
+ status = -ENOMEM;
+- dp = alloc_init_deleg(clp, fp, fh, odstate);
++ dp = alloc_init_deleg(clp, fp, odstate);
+ if (!dp)
+ goto out_delegees;
+
+@@ -5049,12 +5517,31 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
+ if (status)
+ goto out_clnt_odstate;
+
++ if (parent) {
++ status = nfsd4_verify_deleg_dentry(open, fp, parent);
++ if (status)
++ goto out_unlock;
++ }
++
++ status = nfsd4_check_conflicting_opens(clp, fp);
++ if (status)
++ goto out_unlock;
++
++ /*
++ * Now that the deleg is set, check again to ensure that nothing
++ * raced in and changed the mode while we weren't lookng.
++ */
++ status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file);
++ if (status)
++ goto out_unlock;
++
++ status = -EAGAIN;
++ if (fp->fi_had_conflict)
++ goto out_unlock;
++
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+- if (fp->fi_had_conflict)
+- status = -EAGAIN;
+- else
+- status = hash_delegation_locked(dp, fp);
++ status = hash_delegation_locked(dp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+
+@@ -5100,12 +5587,13 @@ static void nfsd4_open_deleg_none_ext(struct nfsd4_open *open, int status)
+ * proper support for them.
+ */
+ static void
+-nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
+- struct nfs4_ol_stateid *stp)
++nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
++ struct svc_fh *currentfh)
+ {
+ struct nfs4_delegation *dp;
+ struct nfs4_openowner *oo = openowner(stp->st_stateowner);
+ struct nfs4_client *clp = stp->st_stid.sc_client;
++ struct svc_fh *parent = NULL;
+ int cb_up;
+ int status = 0;
+
+@@ -5119,6 +5607,8 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
+ goto out_no_deleg;
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
++ parent = currentfh;
++ fallthrough;
+ case NFS4_OPEN_CLAIM_FH:
+ /*
+ * Let's not give out any delegations till everyone's
+@@ -5129,22 +5619,11 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open,
+ goto out_no_deleg;
+ if (!cb_up || !(oo->oo_flags & NFS4_OO_CONFIRMED))
+ goto out_no_deleg;
+- /*
+- * Also, if the file was opened for write or
+- * create, there's a good chance the client's
+- * about to write to it, resulting in an
+- * immediate recall (since we don't support
+- * write delegations):
+- */
+- if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE)
+- goto out_no_deleg;
+- if (open->op_create == NFS4_OPEN_CREATE)
+- goto out_no_deleg;
+ break;
+ default:
+ goto out_no_deleg;
+ }
+- dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate);
++ dp = nfs4_set_delegation(open, stp, parent);
+ if (IS_ERR(dp))
+ goto out_no_deleg;
+
+@@ -5186,6 +5665,18 @@ static void nfsd4_deleg_xgrade_none_ext(struct nfsd4_open *open,
+ */
+ }
+
++/**
++ * nfsd4_process_open2 - finish open processing
++ * @rqstp: the RPC transaction being executed
++ * @current_fh: NFSv4 COMPOUND's current filehandle
++ * @open: OPEN arguments
++ *
++ * If successful, (1) truncate the file if open->op_truncate was
++ * set, (2) set open->op_stateid, (3) set open->op_delegation.
++ *
++ * Returns %nfs_ok on success; otherwise an nfs4stat value in
++ * network byte order is returned.
++ */
+ __be32
+ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
+ {
+@@ -5202,7 +5693,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
+ * and check for delegations in the process of being recalled.
+ * If not found, create the nfs4_file struct
+ */
+- fp = find_or_add_file(open->op_file, &current_fh->fh_handle);
++ fp = nfsd4_file_hash_insert(open->op_file, current_fh);
++ if (unlikely(!fp))
++ return nfserr_jukebox;
+ if (fp != open->op_file) {
+ status = nfs4_check_deleg(cl, open, &dp);
+ if (status)
+@@ -5235,7 +5728,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
+ goto out;
+ }
+ } else {
+- status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
++ status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true);
+ if (status) {
+ stp->st_stid.sc_type = NFS4_CLOSED_STID;
+ release_open_stateid(stp);
+@@ -5264,7 +5757,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
+ * Attempt to hand out a delegation. No error return, because the
+ * OPEN succeeds even if we fail.
+ */
+- nfs4_open_delegation(current_fh, open, stp);
++ nfs4_open_delegation(open, stp, &resp->cstate.current_fh);
+ nodeleg:
+ status = nfs_ok;
+ trace_nfsd_open(&stp->st_stid.sc_stateid);
+@@ -5322,137 +5815,313 @@ nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+ trace_nfsd_clid_renew(clid);
+- status = lookup_clientid(clid, cstate, nn, false);
++ status = set_client(clid, cstate, nn);
+ if (status)
+- goto out;
++ return status;
+ clp = cstate->clp;
+- status = nfserr_cb_path_down;
+ if (!list_empty(&clp->cl_delegations)
+ && clp->cl_cb_state != NFSD4_CB_UP)
+- goto out;
+- status = nfs_ok;
+-out:
+- return status;
++ return nfserr_cb_path_down;
++ return nfs_ok;
++}
++
++void
++nfsd4_end_grace(struct nfsd_net *nn)
++{
++ /* do nothing if grace period already ended */
++ if (nn->grace_ended)
++ return;
++
++ trace_nfsd_grace_complete(nn);
++ nn->grace_ended = true;
++ /*
++ * If the server goes down again right now, an NFSv4
++ * client will still be allowed to reclaim after it comes back up,
++ * even if it hasn't yet had a chance to reclaim state this time.
++ *
++ */
++ nfsd4_record_grace_done(nn);
++ /*
++ * At this point, NFSv4 clients can still reclaim. But if the
++ * server crashes, any that have not yet reclaimed will be out
++ * of luck on the next boot.
++ *
++ * (NFSv4.1+ clients are considered to have reclaimed once they
++ * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
++ * have reclaimed after their first OPEN.)
++ */
++ locks_end_grace(&nn->nfsd4_manager);
++ /*
++ * At this point, and once lockd and/or any other containers
++ * exit their grace period, further reclaims will fail and
++ * regular locking can resume.
++ */
++}
++
++/*
++ * If we've waited a lease period but there are still clients trying to
++ * reclaim, wait a little longer to give them a chance to finish.
++ */
++static bool clients_still_reclaiming(struct nfsd_net *nn)
++{
++ time64_t double_grace_period_end = nn->boot_time +
++ 2 * nn->nfsd4_lease;
++
++ if (nn->track_reclaim_completes &&
++ atomic_read(&nn->nr_reclaim_complete) ==
++ nn->reclaim_str_hashtbl_size)
++ return false;
++ if (!nn->somebody_reclaimed)
++ return false;
++ nn->somebody_reclaimed = false;
++ /*
++ * If we've given them *two* lease times to reclaim, and they're
++ * still not done, give up:
++ */
++ if (ktime_get_boottime_seconds() > double_grace_period_end)
++ return false;
++ return true;
++}
++
++struct laundry_time {
++ time64_t cutoff;
++ time64_t new_timeo;
++};
++
++static bool state_expired(struct laundry_time *lt, time64_t last_refresh)
++{
++ time64_t time_remaining;
++
++ if (last_refresh < lt->cutoff)
++ return true;
++ time_remaining = last_refresh - lt->cutoff;
++ lt->new_timeo = min(lt->new_timeo, time_remaining);
++ return false;
++}
++
++#ifdef CONFIG_NFSD_V4_2_INTER_SSC
++void nfsd4_ssc_init_umount_work(struct nfsd_net *nn)
++{
++ spin_lock_init(&nn->nfsd_ssc_lock);
++ INIT_LIST_HEAD(&nn->nfsd_ssc_mount_list);
++ init_waitqueue_head(&nn->nfsd_ssc_waitq);
++}
++EXPORT_SYMBOL_GPL(nfsd4_ssc_init_umount_work);
++
++/*
++ * This is called when nfsd is being shutdown, after all inter_ssc
++ * cleanup were done, to destroy the ssc delayed unmount list.
++ */
++static void nfsd4_ssc_shutdown_umount(struct nfsd_net *nn)
++{
++ struct nfsd4_ssc_umount_item *ni = NULL;
++ struct nfsd4_ssc_umount_item *tmp;
++
++ spin_lock(&nn->nfsd_ssc_lock);
++ list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
++ list_del(&ni->nsui_list);
++ spin_unlock(&nn->nfsd_ssc_lock);
++ mntput(ni->nsui_vfsmount);
++ kfree(ni);
++ spin_lock(&nn->nfsd_ssc_lock);
++ }
++ spin_unlock(&nn->nfsd_ssc_lock);
++}
++
++static void nfsd4_ssc_expire_umount(struct nfsd_net *nn)
++{
++ bool do_wakeup = false;
++ struct nfsd4_ssc_umount_item *ni = NULL;
++ struct nfsd4_ssc_umount_item *tmp;
++
++ spin_lock(&nn->nfsd_ssc_lock);
++ list_for_each_entry_safe(ni, tmp, &nn->nfsd_ssc_mount_list, nsui_list) {
++ if (time_after(jiffies, ni->nsui_expire)) {
++ if (refcount_read(&ni->nsui_refcnt) > 1)
++ continue;
++
++ /* mark being unmount */
++ ni->nsui_busy = true;
++ spin_unlock(&nn->nfsd_ssc_lock);
++ mntput(ni->nsui_vfsmount);
++ spin_lock(&nn->nfsd_ssc_lock);
++
++ /* waiters need to start from begin of list */
++ list_del(&ni->nsui_list);
++ kfree(ni);
++
++ /* wakeup ssc_connect waiters */
++ do_wakeup = true;
++ continue;
++ }
++ break;
++ }
++ if (do_wakeup)
++ wake_up_all(&nn->nfsd_ssc_waitq);
++ spin_unlock(&nn->nfsd_ssc_lock);
++}
++#endif
++
++/* Check if any lock belonging to this lockowner has any blockers */
++static bool
++nfs4_lockowner_has_blockers(struct nfs4_lockowner *lo)
++{
++ struct file_lock_context *ctx;
++ struct nfs4_ol_stateid *stp;
++ struct nfs4_file *nf;
++
++ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
++ nf = stp->st_stid.sc_file;
++ ctx = locks_inode_context(nf->fi_inode);
++ if (!ctx)
++ continue;
++ if (locks_owner_has_blockers(ctx, lo))
++ return true;
++ }
++ return false;
++}
++
++static bool
++nfs4_anylock_blockers(struct nfs4_client *clp)
++{
++ int i;
++ struct nfs4_stateowner *so;
++ struct nfs4_lockowner *lo;
++
++ if (atomic_read(&clp->cl_delegs_in_recall))
++ return true;
++ spin_lock(&clp->cl_lock);
++ for (i = 0; i < OWNER_HASH_SIZE; i++) {
++ list_for_each_entry(so, &clp->cl_ownerstr_hashtbl[i],
++ so_strhash) {
++ if (so->so_is_open_owner)
++ continue;
++ lo = lockowner(so);
++ if (nfs4_lockowner_has_blockers(lo)) {
++ spin_unlock(&clp->cl_lock);
++ return true;
++ }
++ }
++ }
++ spin_unlock(&clp->cl_lock);
++ return false;
+ }
+
+-void
+-nfsd4_end_grace(struct nfsd_net *nn)
++static void
++nfs4_get_client_reaplist(struct nfsd_net *nn, struct list_head *reaplist,
++ struct laundry_time *lt)
+ {
+- /* do nothing if grace period already ended */
+- if (nn->grace_ended)
+- return;
++ unsigned int maxreap, reapcnt = 0;
++ struct list_head *pos, *next;
++ struct nfs4_client *clp;
+
+- trace_nfsd_grace_complete(nn);
+- nn->grace_ended = true;
+- /*
+- * If the server goes down again right now, an NFSv4
+- * client will still be allowed to reclaim after it comes back up,
+- * even if it hasn't yet had a chance to reclaim state this time.
+- *
+- */
+- nfsd4_record_grace_done(nn);
+- /*
+- * At this point, NFSv4 clients can still reclaim. But if the
+- * server crashes, any that have not yet reclaimed will be out
+- * of luck on the next boot.
+- *
+- * (NFSv4.1+ clients are considered to have reclaimed once they
+- * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to
+- * have reclaimed after their first OPEN.)
+- */
+- locks_end_grace(&nn->nfsd4_manager);
+- /*
+- * At this point, and once lockd and/or any other containers
+- * exit their grace period, further reclaims will fail and
+- * regular locking can resume.
+- */
++ maxreap = (atomic_read(&nn->nfs4_client_count) >= nn->nfs4_max_clients) ?
++ NFSD_CLIENT_MAX_TRIM_PER_RUN : 0;
++ INIT_LIST_HEAD(reaplist);
++ spin_lock(&nn->client_lock);
++ list_for_each_safe(pos, next, &nn->client_lru) {
++ clp = list_entry(pos, struct nfs4_client, cl_lru);
++ if (clp->cl_state == NFSD4_EXPIRABLE)
++ goto exp_client;
++ if (!state_expired(lt, clp->cl_time))
++ break;
++ if (!atomic_read(&clp->cl_rpc_users)) {
++ if (clp->cl_state == NFSD4_ACTIVE)
++ atomic_inc(&nn->nfsd_courtesy_clients);
++ clp->cl_state = NFSD4_COURTESY;
++ }
++ if (!client_has_state(clp))
++ goto exp_client;
++ if (!nfs4_anylock_blockers(clp))
++ if (reapcnt >= maxreap)
++ continue;
++exp_client:
++ if (!mark_client_expired_locked(clp)) {
++ list_add(&clp->cl_lru, reaplist);
++ reapcnt++;
++ }
++ }
++ spin_unlock(&nn->client_lock);
+ }
+
+-/*
+- * If we've waited a lease period but there are still clients trying to
+- * reclaim, wait a little longer to give them a chance to finish.
+- */
+-static bool clients_still_reclaiming(struct nfsd_net *nn)
++static void
++nfs4_get_courtesy_client_reaplist(struct nfsd_net *nn,
++ struct list_head *reaplist)
+ {
+- time64_t double_grace_period_end = nn->boot_time +
+- 2 * nn->nfsd4_lease;
++ unsigned int maxreap = 0, reapcnt = 0;
++ struct list_head *pos, *next;
++ struct nfs4_client *clp;
+
+- if (nn->track_reclaim_completes &&
+- atomic_read(&nn->nr_reclaim_complete) ==
+- nn->reclaim_str_hashtbl_size)
+- return false;
+- if (!nn->somebody_reclaimed)
+- return false;
+- nn->somebody_reclaimed = false;
+- /*
+- * If we've given them *two* lease times to reclaim, and they're
+- * still not done, give up:
+- */
+- if (ktime_get_boottime_seconds() > double_grace_period_end)
+- return false;
+- return true;
++ maxreap = NFSD_CLIENT_MAX_TRIM_PER_RUN;
++ INIT_LIST_HEAD(reaplist);
++
++ spin_lock(&nn->client_lock);
++ list_for_each_safe(pos, next, &nn->client_lru) {
++ clp = list_entry(pos, struct nfs4_client, cl_lru);
++ if (clp->cl_state == NFSD4_ACTIVE)
++ break;
++ if (reapcnt >= maxreap)
++ break;
++ if (!mark_client_expired_locked(clp)) {
++ list_add(&clp->cl_lru, reaplist);
++ reapcnt++;
++ }
++ }
++ spin_unlock(&nn->client_lock);
++}
++
++static void
++nfs4_process_client_reaplist(struct list_head *reaplist)
++{
++ struct list_head *pos, *next;
++ struct nfs4_client *clp;
++
++ list_for_each_safe(pos, next, reaplist) {
++ clp = list_entry(pos, struct nfs4_client, cl_lru);
++ trace_nfsd_clid_purged(&clp->cl_clientid);
++ list_del_init(&clp->cl_lru);
++ expire_client(clp);
++ }
+ }
+
+ static time64_t
+ nfs4_laundromat(struct nfsd_net *nn)
+ {
+- struct nfs4_client *clp;
+ struct nfs4_openowner *oo;
+ struct nfs4_delegation *dp;
+ struct nfs4_ol_stateid *stp;
+ struct nfsd4_blocked_lock *nbl;
+ struct list_head *pos, *next, reaplist;
+- time64_t cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease;
+- time64_t t, new_timeo = nn->nfsd4_lease;
++ struct laundry_time lt = {
++ .cutoff = ktime_get_boottime_seconds() - nn->nfsd4_lease,
++ .new_timeo = nn->nfsd4_lease
++ };
+ struct nfs4_cpntf_state *cps;
+ copy_stateid_t *cps_t;
+ int i;
+
+ if (clients_still_reclaiming(nn)) {
+- new_timeo = 0;
++ lt.new_timeo = 0;
+ goto out;
+ }
+ nfsd4_end_grace(nn);
+- INIT_LIST_HEAD(&reaplist);
+
+ spin_lock(&nn->s2s_cp_lock);
+ idr_for_each_entry(&nn->s2s_cp_stateids, cps_t, i) {
+ cps = container_of(cps_t, struct nfs4_cpntf_state, cp_stateid);
+- if (cps->cp_stateid.sc_type == NFS4_COPYNOTIFY_STID &&
+- cps->cpntf_time < cutoff)
++ if (cps->cp_stateid.cs_type == NFS4_COPYNOTIFY_STID &&
++ state_expired(&lt, cps->cpntf_time))
+ _free_cpntf_state_locked(nn, cps);
+ }
+ spin_unlock(&nn->s2s_cp_lock);
++ nfs4_get_client_reaplist(nn, &reaplist, &lt);
++ nfs4_process_client_reaplist(&reaplist);
+
+- spin_lock(&nn->client_lock);
+- list_for_each_safe(pos, next, &nn->client_lru) {
+- clp = list_entry(pos, struct nfs4_client, cl_lru);
+- if (clp->cl_time > cutoff) {
+- t = clp->cl_time - cutoff;
+- new_timeo = min(new_timeo, t);
+- break;
+- }
+- if (mark_client_expired_locked(clp)) {
+- trace_nfsd_clid_expired(&clp->cl_clientid);
+- continue;
+- }
+- list_add(&clp->cl_lru, &reaplist);
+- }
+- spin_unlock(&nn->client_lock);
+- list_for_each_safe(pos, next, &reaplist) {
+- clp = list_entry(pos, struct nfs4_client, cl_lru);
+- trace_nfsd_clid_purged(&clp->cl_clientid);
+- list_del_init(&clp->cl_lru);
+- expire_client(clp);
+- }
+ spin_lock(&state_lock);
+ list_for_each_safe(pos, next, &nn->del_recall_lru) {
+ dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
+- if (dp->dl_time > cutoff) {
+- t = dp->dl_time - cutoff;
+- new_timeo = min(new_timeo, t);
++ if (!state_expired(&lt, dp->dl_time))
+ break;
+- }
+ WARN_ON(!unhash_delegation_locked(dp));
+ list_add(&dp->dl_recall_lru, &reaplist);
+ }
+@@ -5468,11 +6137,8 @@ nfs4_laundromat(struct nfsd_net *nn)
+ while (!list_empty(&nn->close_lru)) {
+ oo = list_first_entry(&nn->close_lru, struct nfs4_openowner,
+ oo_close_lru);
+- if (oo->oo_time > cutoff) {
+- t = oo->oo_time - cutoff;
+- new_timeo = min(new_timeo, t);
++ if (!state_expired(&lt, oo->oo_time))
+ break;
+- }
+ list_del_init(&oo->oo_close_lru);
+ stp = oo->oo_last_closed_stid;
+ oo->oo_last_closed_stid = NULL;
+@@ -5498,11 +6164,8 @@ nfs4_laundromat(struct nfsd_net *nn)
+ while (!list_empty(&nn->blocked_locks_lru)) {
+ nbl = list_first_entry(&nn->blocked_locks_lru,
+ struct nfsd4_blocked_lock, nbl_lru);
+- if (nbl->nbl_time > cutoff) {
+- t = nbl->nbl_time - cutoff;
+- new_timeo = min(new_timeo, t);
++ if (!state_expired(&lt, nbl->nbl_time))
+ break;
+- }
+ list_move(&nbl->nbl_lru, &reaplist);
+ list_del_init(&nbl->nbl_list);
+ }
+@@ -5514,12 +6177,14 @@ nfs4_laundromat(struct nfsd_net *nn)
+ list_del_init(&nbl->nbl_lru);
+ free_blocked_lock(nbl);
+ }
++#ifdef CONFIG_NFSD_V4_2_INTER_SSC
++ /* service the server-to-server copy delayed unmount list */
++ nfsd4_ssc_expire_umount(nn);
++#endif
+ out:
+- new_timeo = max_t(time64_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
+- return new_timeo;
++ return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
+ }
+
+-static struct workqueue_struct *laundry_wq;
+ static void laundromat_main(struct work_struct *);
+
+ static void
+@@ -5534,26 +6199,68 @@ laundromat_main(struct work_struct *laundry)
+ queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ);
+ }
+
+-static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
++static void
++courtesy_client_reaper(struct nfsd_net *nn)
+ {
+- if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle))
+- return nfserr_bad_stateid;
+- return nfs_ok;
++ struct list_head reaplist;
++
++ nfs4_get_courtesy_client_reaplist(nn, &reaplist);
++ nfs4_process_client_reaplist(&reaplist);
+ }
+
+-static inline int
+-access_permit_read(struct nfs4_ol_stateid *stp)
++static void
++deleg_reaper(struct nfsd_net *nn)
+ {
+- return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
+- test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
+- test_access(NFS4_SHARE_ACCESS_WRITE, stp);
++ struct list_head *pos, *next;
++ struct nfs4_client *clp;
++ struct list_head cblist;
++
++ INIT_LIST_HEAD(&cblist);
++ spin_lock(&nn->client_lock);
++ list_for_each_safe(pos, next, &nn->client_lru) {
++ clp = list_entry(pos, struct nfs4_client, cl_lru);
++ if (clp->cl_state != NFSD4_ACTIVE ||
++ list_empty(&clp->cl_delegations) ||
++ atomic_read(&clp->cl_delegs_in_recall) ||
++ test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) ||
++ (ktime_get_boottime_seconds() -
++ clp->cl_ra_time < 5)) {
++ continue;
++ }
++ list_add(&clp->cl_ra_cblist, &cblist);
++
++ /* release in nfsd4_cb_recall_any_release */
++ atomic_inc(&clp->cl_rpc_users);
++ set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
++ clp->cl_ra_time = ktime_get_boottime_seconds();
++ }
++ spin_unlock(&nn->client_lock);
++
++ while (!list_empty(&cblist)) {
++ clp = list_first_entry(&cblist, struct nfs4_client,
++ cl_ra_cblist);
++ list_del_init(&clp->cl_ra_cblist);
++ clp->cl_ra->ra_keep = 0;
++ clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG);
++ nfsd4_run_cb(&clp->cl_ra->ra_cb);
++ }
+ }
+
+-static inline int
+-access_permit_write(struct nfs4_ol_stateid *stp)
++static void
++nfsd4_state_shrinker_worker(struct work_struct *work)
+ {
+- return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
+- test_access(NFS4_SHARE_ACCESS_BOTH, stp);
++ struct nfsd_net *nn = container_of(work, struct nfsd_net,
++ nfsd_shrinker_work);
++
++ courtesy_client_reaper(nn);
++ deleg_reaper(nn);
++}
++
++static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp)
++{
++ if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle))
++ return nfserr_bad_stateid;
++ return nfs_ok;
+ }
+
+ static
+@@ -5692,6 +6399,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ struct nfs4_stid **s, struct nfsd_net *nn)
+ {
+ __be32 status;
++ struct nfs4_stid *stid;
+ bool return_revoked = false;
+
+ /*
+@@ -5706,8 +6414,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
+ CLOSE_STATEID(stateid))
+ return nfserr_bad_stateid;
+- status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn,
+- false);
++ status = set_client(&stateid->si_opaque.so_clid, cstate, nn);
+ if (status == nfserr_stale_clientid) {
+ if (cstate->session)
+ return nfserr_bad_stateid;
+@@ -5715,15 +6422,16 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ }
+ if (status)
+ return status;
+- *s = find_stateid_by_type(cstate->clp, stateid, typemask);
+- if (!*s)
++ stid = find_stateid_by_type(cstate->clp, stateid, typemask);
++ if (!stid)
+ return nfserr_bad_stateid;
+- if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+- nfs4_put_stid(*s);
++ if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
++ nfs4_put_stid(stid);
+ if (cstate->minorversion)
+ return nfserr_deleg_revoked;
+ return nfserr_bad_stateid;
+ }
++ *s = stid;
+ return nfs_ok;
+ }
+
+@@ -5788,12 +6496,12 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
+ static void
+ _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps)
+ {
+- WARN_ON_ONCE(cps->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID);
+- if (!refcount_dec_and_test(&cps->cp_stateid.sc_count))
++ WARN_ON_ONCE(cps->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID);
++ if (!refcount_dec_and_test(&cps->cp_stateid.cs_count))
+ return;
+ list_del(&cps->cp_list);
+ idr_remove(&nn->s2s_cp_stateids,
+- cps->cp_stateid.stid.si_opaque.so_id);
++ cps->cp_stateid.cs_stid.si_opaque.so_id);
+ kfree(cps);
+ }
+ /*
+@@ -5815,12 +6523,12 @@ __be32 manage_cpntf_state(struct nfsd_net *nn, stateid_t *st,
+ if (cps_t) {
+ state = container_of(cps_t, struct nfs4_cpntf_state,
+ cp_stateid);
+- if (state->cp_stateid.sc_type != NFS4_COPYNOTIFY_STID) {
++ if (state->cp_stateid.cs_type != NFS4_COPYNOTIFY_STID) {
+ state = NULL;
+ goto unlock;
+ }
+ if (!clp)
+- refcount_inc(&state->cp_stateid.sc_count);
++ refcount_inc(&state->cp_stateid.cs_count);
+ else
+ _free_cpntf_state_locked(nn, state);
+ }
+@@ -5838,21 +6546,27 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st,
+ {
+ __be32 status;
+ struct nfs4_cpntf_state *cps = NULL;
+- struct nfsd4_compound_state cstate;
++ struct nfs4_client *found;
+
+ status = manage_cpntf_state(nn, st, NULL, &cps);
+ if (status)
+ return status;
+
+ cps->cpntf_time = ktime_get_boottime_seconds();
+- memset(&cstate, 0, sizeof(cstate));
+- status = lookup_clientid(&cps->cp_p_clid, &cstate, nn, true);
+- if (status)
++
++ status = nfserr_expired;
++ found = lookup_clientid(&cps->cp_p_clid, true, nn);
++ if (!found)
+ goto out;
+- status = nfsd4_lookup_stateid(&cstate, &cps->cp_p_stateid,
+- NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
+- stid, nn);
+- put_client_renew(cstate.clp);
++
++ *stid = find_stateid_by_type(found, &cps->cp_p_stateid,
++ NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID);
++ if (*stid)
++ status = nfs_ok;
++ else
++ status = nfserr_bad_stateid;
++
++ put_client_renew(found);
+ out:
+ nfs4_put_cpntf_state(nn, cps);
+ return status;
+@@ -5887,7 +6601,11 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
+ return nfserr_grace;
+
+ if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) {
+- status = check_special_stateids(net, fhp, stateid, flags);
++ if (cstid)
++ status = nfserr_bad_stateid;
++ else
++ status = check_special_stateids(net, fhp, stateid,
++ flags);
+ goto done;
+ }
+
+@@ -5941,7 +6659,7 @@ nfsd4_test_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ {
+ struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
+ struct nfsd4_test_stateid_id *stateid;
+- struct nfs4_client *cl = cstate->session->se_client;
++ struct nfs4_client *cl = cstate->clp;
+
+ list_for_each_entry(stateid, &test_stateid->ts_stateid_list, ts_id_list)
+ stateid->ts_id_status =
+@@ -5987,7 +6705,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ stateid_t *stateid = &free_stateid->fr_stateid;
+ struct nfs4_stid *s;
+ struct nfs4_delegation *dp;
+- struct nfs4_client *cl = cstate->session->se_client;
++ struct nfs4_client *cl = cstate->clp;
+ __be32 ret = nfserr_bad_stateid;
+
+ spin_lock(&cl->cl_lock);
+@@ -6316,6 +7034,8 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (status)
+ goto put_stateid;
+
++ trace_nfsd_deleg_return(stateid);
++ wake_up_var(d_inode(cstate->current_fh.fh_dentry));
+ destroy_delegation(dp);
+ put_stateid:
+ nfs4_put_stid(&dp->dl_stid);
+@@ -6323,15 +7043,6 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ return status;
+ }
+
+-static inline u64
+-end_offset(u64 start, u64 len)
+-{
+- u64 end;
+-
+- end = start + len;
+- return end >= start ? end: NFS4_MAX_UINT64;
+-}
+-
+ /* last octet in a range */
+ static inline u64
+ last_byte_offset(u64 start, u64 len)
+@@ -6361,7 +7072,7 @@ nfs4_transform_lock_offset(struct file_lock *lock)
+ }
+
+ static fl_owner_t
+-nfsd4_fl_get_owner(fl_owner_t owner)
++nfsd4_lm_get_owner(fl_owner_t owner)
+ {
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
+
+@@ -6370,7 +7081,7 @@ nfsd4_fl_get_owner(fl_owner_t owner)
+ }
+
+ static void
+-nfsd4_fl_put_owner(fl_owner_t owner)
++nfsd4_lm_put_owner(fl_owner_t owner)
+ {
+ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner;
+
+@@ -6378,6 +7089,29 @@ nfsd4_fl_put_owner(fl_owner_t owner)
+ nfs4_put_stateowner(&lo->lo_owner);
+ }
+
++/* return pointer to struct nfs4_client if client is expirable */
++static bool
++nfsd4_lm_lock_expirable(struct file_lock *cfl)
++{
++ struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
++ struct nfs4_client *clp = lo->lo_owner.so_client;
++ struct nfsd_net *nn;
++
++ if (try_to_expire_client(clp)) {
++ nn = net_generic(clp->net, nfsd_net_id);
++ mod_delayed_work(laundry_wq, &nn->laundromat_work, 0);
++ return true;
++ }
++ return false;
++}
++
++/* schedule laundromat to run immediately and wait for it to complete */
++static void
++nfsd4_lm_expire_lock(void)
++{
++ flush_workqueue(laundry_wq);
++}
++
+ static void
+ nfsd4_lm_notify(struct file_lock *fl)
+ {
+@@ -6397,14 +7131,19 @@ nfsd4_lm_notify(struct file_lock *fl)
+ }
+ spin_unlock(&nn->blocked_locks_lock);
+
+- if (queue)
++ if (queue) {
++ trace_nfsd_cb_notify_lock(lo, nbl);
+ nfsd4_run_cb(&nbl->nbl_cb);
++ }
+ }
+
+ static const struct lock_manager_operations nfsd_posix_mng_ops = {
++ .lm_mod_owner = THIS_MODULE,
+ .lm_notify = nfsd4_lm_notify,
+- .lm_get_owner = nfsd4_fl_get_owner,
+- .lm_put_owner = nfsd4_fl_put_owner,
++ .lm_get_owner = nfsd4_lm_get_owner,
++ .lm_put_owner = nfsd4_lm_put_owner,
++ .lm_lock_expirable = nfsd4_lm_lock_expirable,
++ .lm_expire_lock = nfsd4_lm_expire_lock,
+ };
+
+ static inline void
+@@ -6719,13 +7458,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (nfsd4_has_session(cstate))
+ /* See rfc 5661 18.10.3: given clientid is ignored: */
+ memcpy(&lock->lk_new_clientid,
+- &cstate->session->se_client->cl_clientid,
++ &cstate->clp->cl_clientid,
+ sizeof(clientid_t));
+
+- status = nfserr_stale_clientid;
+- if (STALE_CLIENTID(&lock->lk_new_clientid, nn))
+- goto out;
+-
+ /* validate and update open stateid and open seqid */
+ status = nfs4_preprocess_confirmed_seqid_op(cstate,
+ lock->lk_new_open_seqid,
+@@ -6763,6 +7498,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ if (!locks_in_grace(net) && lock->lk_reclaim)
+ goto out;
+
++ if (lock->lk_reclaim)
++ fl_flags |= FL_RECLAIM;
++
+ fp = lock_stp->st_stid.sc_file;
+ switch (lock->lk_type) {
+ case NFS4_READW_LT:
+@@ -6799,6 +7537,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ goto out;
+ }
+
++ /*
++ * Most filesystems with their own ->lock operations will block
++ * the nfsd thread waiting to acquire the lock. That leads to
++ * deadlocks (we don't want every nfsd thread tied up waiting
++ * for file locks), so don't attempt blocking lock notifications
++ * on those filesystems:
++ */
++ if (nf->nf_file->f_op->lock)
++ fl_flags &= ~FL_SLEEP;
++
+ nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
+ if (!nbl) {
+ dprintk("NFSD: %s: unable to allocate block!\n", __func__);
+@@ -6829,6 +7577,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ spin_lock(&nn->blocked_locks_lock);
+ list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
+ list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
++ kref_get(&nbl->nbl_kref);
+ spin_unlock(&nn->blocked_locks_lock);
+ }
+
+@@ -6841,6 +7590,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ nn->somebody_reclaimed = true;
+ break;
+ case FILE_LOCK_DEFERRED:
++ kref_put(&nbl->nbl_kref, free_nbl);
+ nbl = NULL;
+ fallthrough;
+ case -EAGAIN: /* conflock holds conflicting lock */
+@@ -6861,8 +7611,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ /* dequeue it if we queued it before */
+ if (fl_flags & FL_SLEEP) {
+ spin_lock(&nn->blocked_locks_lock);
+- list_del_init(&nbl->nbl_list);
+- list_del_init(&nbl->nbl_lru);
++ if (!list_empty(&nbl->nbl_list) &&
++ !list_empty(&nbl->nbl_lru)) {
++ list_del_init(&nbl->nbl_list);
++ list_del_init(&nbl->nbl_lru);
++ kref_put(&nbl->nbl_kref, free_nbl);
++ }
++ /* nbl can use one of lists to be linked to reaplist */
+ spin_unlock(&nn->blocked_locks_lock);
+ }
+ free_blocked_lock(nbl);
+@@ -6903,21 +7658,22 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+ {
+ struct nfsd_file *nf;
++ struct inode *inode;
+ __be32 err;
+
+ err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
+ if (err)
+ return err;
+- fh_lock(fhp); /* to block new leases till after test_lock: */
+- err = nfserrno(nfsd_open_break_lease(fhp->fh_dentry->d_inode,
+- NFSD_MAY_READ));
++ inode = fhp->fh_dentry->d_inode;
++ inode_lock(inode); /* to block new leases till after test_lock: */
++ err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+ if (err)
+ goto out;
+ lock->fl_file = nf->nf_file;
+ err = nfserrno(vfs_test_lock(nf->nf_file, lock));
+ lock->fl_file = NULL;
+ out:
+- fh_unlock(fhp);
++ inode_unlock(inode);
+ nfsd_file_put(nf);
+ return err;
+ }
+@@ -6942,8 +7698,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+ return nfserr_inval;
+
+ if (!nfsd4_has_session(cstate)) {
+- status = lookup_clientid(&lockt->lt_clientid, cstate, nn,
+- false);
++ status = set_client(&lockt->lt_clientid, cstate, nn);
+ if (status)
+ goto out;
+ }
+@@ -7080,18 +7835,20 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
+ {
+ struct file_lock *fl;
+ int status = false;
+- struct nfsd_file *nf = find_any_file(fp);
++ struct nfsd_file *nf;
+ struct inode *inode;
+ struct file_lock_context *flctx;
+
++ spin_lock(&fp->fi_lock);
++ nf = find_any_file_locked(fp);
+ if (!nf) {
+ /* Any valid lock stateid should have some sort of access */
+ WARN_ON_ONCE(1);
+- return status;
++ goto out;
+ }
+
+ inode = locks_inode(nf->nf_file);
+- flctx = inode->i_flctx;
++ flctx = locks_inode_context(inode);
+
+ if (flctx && !list_empty_careful(&flctx->flc_posix)) {
+ spin_lock(&flctx->flc_lock);
+@@ -7103,57 +7860,62 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
+ }
+ spin_unlock(&flctx->flc_lock);
+ }
+- nfsd_file_put(nf);
++out:
++ spin_unlock(&fp->fi_lock);
+ return status;
+ }
+
++/**
++ * nfsd4_release_lockowner - process NFSv4.0 RELEASE_LOCKOWNER operations
++ * @rqstp: RPC transaction
++ * @cstate: NFSv4 COMPOUND state
++ * @u: RELEASE_LOCKOWNER arguments
++ *
++ * Check if theree are any locks still held and if not - free the lockowner
++ * and any lock state that is owned.
++ *
++ * Return values:
++ * %nfs_ok: lockowner released or not found
++ * %nfserr_locks_held: lockowner still in use
++ * %nfserr_stale_clientid: clientid no longer active
++ * %nfserr_expired: clientid not recognized
++ */
+ __be32
+ nfsd4_release_lockowner(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ union nfsd4_op_u *u)
+ {
+ struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
++ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ clientid_t *clid = &rlockowner->rl_clientid;
+- struct nfs4_stateowner *sop;
+- struct nfs4_lockowner *lo = NULL;
+ struct nfs4_ol_stateid *stp;
+- struct xdr_netobj *owner = &rlockowner->rl_owner;
+- unsigned int hashval = ownerstr_hashval(owner);
+- __be32 status;
+- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
++ struct nfs4_lockowner *lo;
+ struct nfs4_client *clp;
+- LIST_HEAD (reaplist);
++ LIST_HEAD(reaplist);
++ __be32 status;
+
+ dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
+ clid->cl_boot, clid->cl_id);
+
+- status = lookup_clientid(clid, cstate, nn, false);
++ status = set_client(clid, cstate, nn);
+ if (status)
+ return status;
+-
+ clp = cstate->clp;
+- /* Find the matching lock stateowner */
+- spin_lock(&clp->cl_lock);
+- list_for_each_entry(sop, &clp->cl_ownerstr_hashtbl[hashval],
+- so_strhash) {
+
+- if (sop->so_is_open_owner || !same_owner_str(sop, owner))
+- continue;
++ spin_lock(&clp->cl_lock);
++ lo = find_lockowner_str_locked(clp, &rlockowner->rl_owner);
++ if (!lo) {
++ spin_unlock(&clp->cl_lock);
++ return nfs_ok;
++ }
+
+- if (atomic_read(&sop->so_count) != 1) {
++ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
++ if (check_for_locks(stp->st_stid.sc_file, lo)) {
+ spin_unlock(&clp->cl_lock);
++ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
+ }
+-
+- lo = lockowner(sop);
+- nfs4_get_stateowner(sop);
+- break;
+- }
+- if (!lo) {
+- spin_unlock(&clp->cl_lock);
+- return status;
+ }
+-
+ unhash_lockowner_locked(lo);
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+@@ -7163,11 +7925,11 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
+ put_ol_stateid_locked(stp, &reaplist);
+ }
+ spin_unlock(&clp->cl_lock);
++
+ free_ol_stateid_reaplist(&reaplist);
+ remove_blocked_locks(lo);
+ nfs4_put_stateowner(&lo->lo_owner);
+-
+- return status;
++ return nfs_ok;
+ }
+
+ static inline struct nfs4_client_reclaim *
+@@ -7256,25 +8018,13 @@ nfsd4_find_reclaim_client(struct xdr_netobj name, struct nfsd_net *nn)
+ return NULL;
+ }
+
+-/*
+-* Called from OPEN. Look for clientid in reclaim list.
+-*/
+ __be32
+-nfs4_check_open_reclaim(clientid_t *clid,
+- struct nfsd4_compound_state *cstate,
+- struct nfsd_net *nn)
++nfs4_check_open_reclaim(struct nfs4_client *clp)
+ {
+- __be32 status;
+-
+- /* find clientid in conf_id_hashtbl */
+- status = lookup_clientid(clid, cstate, nn, false);
+- if (status)
+- return nfserr_reclaim_bad;
+-
+- if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags))
++ if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &clp->cl_flags))
+ return nfserr_no_grace;
+
+- if (nfsd4_client_record_check(cstate->clp))
++ if (nfsd4_client_record_check(clp))
+ return nfserr_reclaim_bad;
+
+ return nfs_ok;
+@@ -7345,10 +8095,20 @@ static int nfs4_state_create_net(struct net *net)
+ INIT_LIST_HEAD(&nn->blocked_locks_lru);
+
+ INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
++ INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker);
+ get_net(net);
+
++ nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan;
++ nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count;
++ nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS;
++
++ if (register_shrinker(&nn->nfsd_client_shrinker))
++ goto err_shrinker;
+ return 0;
+
++err_shrinker:
++ put_net(net);
++ kfree(nn->sessionid_hashtbl);
+ err_sessionid:
+ kfree(nn->unconf_id_hashtbl);
+ err_unconf_id:
+@@ -7420,22 +8180,18 @@ nfs4_state_start(void)
+ {
+ int ret;
+
+- laundry_wq = alloc_workqueue("%s", WQ_UNBOUND, 0, "nfsd4");
+- if (laundry_wq == NULL) {
+- ret = -ENOMEM;
+- goto out;
+- }
+- ret = nfsd4_create_callback_queue();
++ ret = rhltable_init(&nfs4_file_rhltable, &nfs4_file_rhash_params);
+ if (ret)
+- goto out_free_laundry;
++ return ret;
++
++ ret = nfsd4_create_callback_queue();
++ if (ret) {
++ rhltable_destroy(&nfs4_file_rhltable);
++ return ret;
++ }
+
+ set_max_delegations();
+ return 0;
+-
+-out_free_laundry:
+- destroy_workqueue(laundry_wq);
+-out:
+- return ret;
+ }
+
+ void
+@@ -7445,6 +8201,8 @@ nfs4_state_shutdown_net(struct net *net)
+ struct list_head *pos, *next, reaplist;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
++ unregister_shrinker(&nn->nfsd_client_shrinker);
++ cancel_work(&nn->nfsd_shrinker_work);
+ cancel_delayed_work_sync(&nn->laundromat_work);
+ locks_end_grace(&nn->nfsd4_manager);
+
+@@ -7464,13 +8222,16 @@ nfs4_state_shutdown_net(struct net *net)
+
+ nfsd4_client_tracking_exit(net);
+ nfs4_state_destroy_net(net);
++#ifdef CONFIG_NFSD_V4_2_INTER_SSC
++ nfsd4_ssc_shutdown_umount(nn);
++#endif
+ }
+
+ void
+ nfs4_state_shutdown(void)
+ {
+- destroy_workqueue(laundry_wq);
+ nfsd4_destroy_callback_queue();
++ rhltable_destroy(&nfs4_file_rhltable);
+ }
+
+ static void
+diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
+index dbfa24cf33906..5a68c62864925 100644
+--- a/fs/nfsd/nfs4xdr.c
++++ b/fs/nfsd/nfs4xdr.c
+@@ -42,6 +42,8 @@
+ #include <linux/sunrpc/svcauth_gss.h>
+ #include <linux/sunrpc/addr.h>
+ #include <linux/xattr.h>
++#include <linux/vmalloc.h>
++
+ #include <uapi/linux/xattr.h>
+
+ #include "idmap.h"
+@@ -54,6 +56,8 @@
+ #include "pnfs.h"
+ #include "filecache.h"
+
++#include "trace.h"
++
+ #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ #include <linux/security.h>
+ #endif
+@@ -90,6 +94,8 @@ check_filename(char *str, int len)
+
+ if (len == 0)
+ return nfserr_inval;
++ if (len > NFS4_MAXNAMLEN)
++ return nfserr_nametoolong;
+ if (isdotent(str, len))
+ return nfserr_badname;
+ for (i = 0; i < len; i++)
+@@ -98,122 +104,6 @@ check_filename(char *str, int len)
+ return 0;
+ }
+
+-#define DECODE_HEAD \
+- __be32 *p; \
+- __be32 status
+-#define DECODE_TAIL \
+- status = 0; \
+-out: \
+- return status; \
+-xdr_error: \
+- dprintk("NFSD: xdr error (%s:%d)\n", \
+- __FILE__, __LINE__); \
+- status = nfserr_bad_xdr; \
+- goto out
+-
+-#define READMEM(x,nbytes) do { \
+- x = (char *)p; \
+- p += XDR_QUADLEN(nbytes); \
+-} while (0)
+-#define SAVEMEM(x,nbytes) do { \
+- if (!(x = (p==argp->tmp || p == argp->tmpp) ? \
+- savemem(argp, p, nbytes) : \
+- (char *)p)) { \
+- dprintk("NFSD: xdr error (%s:%d)\n", \
+- __FILE__, __LINE__); \
+- goto xdr_error; \
+- } \
+- p += XDR_QUADLEN(nbytes); \
+-} while (0)
+-#define COPYMEM(x,nbytes) do { \
+- memcpy((x), p, nbytes); \
+- p += XDR_QUADLEN(nbytes); \
+-} while (0)
+-
+-/* READ_BUF, read_buf(): nbytes must be <= PAGE_SIZE */
+-#define READ_BUF(nbytes) do { \
+- if (nbytes <= (u32)((char *)argp->end - (char *)argp->p)) { \
+- p = argp->p; \
+- argp->p += XDR_QUADLEN(nbytes); \
+- } else if (!(p = read_buf(argp, nbytes))) { \
+- dprintk("NFSD: xdr error (%s:%d)\n", \
+- __FILE__, __LINE__); \
+- goto xdr_error; \
+- } \
+-} while (0)
+-
+-static void next_decode_page(struct nfsd4_compoundargs *argp)
+-{
+- argp->p = page_address(argp->pagelist[0]);
+- argp->pagelist++;
+- if (argp->pagelen < PAGE_SIZE) {
+- argp->end = argp->p + XDR_QUADLEN(argp->pagelen);
+- argp->pagelen = 0;
+- } else {
+- argp->end = argp->p + (PAGE_SIZE>>2);
+- argp->pagelen -= PAGE_SIZE;
+- }
+-}
+-
+-static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
+-{
+- /* We want more bytes than seem to be available.
+- * Maybe we need a new page, maybe we have just run out
+- */
+- unsigned int avail = (char *)argp->end - (char *)argp->p;
+- __be32 *p;
+-
+- if (argp->pagelen == 0) {
+- struct kvec *vec = &argp->rqstp->rq_arg.tail[0];
+-
+- if (!argp->tail) {
+- argp->tail = true;
+- avail = vec->iov_len;
+- argp->p = vec->iov_base;
+- argp->end = vec->iov_base + avail;
+- }
+-
+- if (avail < nbytes)
+- return NULL;
+-
+- p = argp->p;
+- argp->p += XDR_QUADLEN(nbytes);
+- return p;
+- }
+-
+- if (avail + argp->pagelen < nbytes)
+- return NULL;
+- if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */
+- return NULL;
+- /* ok, we can do it with the current plus the next page */
+- if (nbytes <= sizeof(argp->tmp))
+- p = argp->tmp;
+- else {
+- kfree(argp->tmpp);
+- p = argp->tmpp = kmalloc(nbytes, GFP_KERNEL);
+- if (!p)
+- return NULL;
+-
+- }
+- /*
+- * The following memcpy is safe because read_buf is always
+- * called with nbytes > avail, and the two cases above both
+- * guarantee p points to at least nbytes bytes.
+- */
+- memcpy(p, argp->p, avail);
+- next_decode_page(argp);
+- memcpy(((char*)p)+avail, argp->p, (nbytes - avail));
+- argp->p += XDR_QUADLEN(nbytes - avail);
+- return p;
+-}
+-
+-static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp)
+-{
+- unsigned int this = (char *)argp->end - (char *)argp->p;
+-
+- return this + argp->pagelen;
+-}
+-
+ static int zero_clientid(clientid_t *clid)
+ {
+ return (clid->cl_boot == 0) && (clid->cl_id == 0);
+@@ -259,118 +149,246 @@ svcxdr_dupstr(struct nfsd4_compoundargs *argp, void *buf, u32 len)
+ return p;
+ }
+
+-static __be32
+-svcxdr_construct_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
+- struct page ***pagelist, u32 buflen)
++static void *
++svcxdr_savemem(struct nfsd4_compoundargs *argp, __be32 *p, u32 len)
+ {
+- int avail;
+- int len;
+- int pages;
++ __be32 *tmp;
+
+- /* Sorry .. no magic macros for this.. *
+- * READ_BUF(write->wr_buflen);
+- * SAVEMEM(write->wr_buf, write->wr_buflen);
++ /*
++ * The location of the decoded data item is stable,
++ * so @p is OK to use. This is the common case.
+ */
+- avail = (char *)argp->end - (char *)argp->p;
+- if (avail + argp->pagelen < buflen) {
+- dprintk("NFSD: xdr error (%s:%d)\n",
+- __FILE__, __LINE__);
++ if (p != argp->xdr->scratch.iov_base)
++ return p;
++
++ tmp = svcxdr_tmpalloc(argp, len);
++ if (!tmp)
++ return NULL;
++ memcpy(tmp, p, len);
++ return tmp;
++}
++
++/*
++ * NFSv4 basic data type decoders
++ */
++
++/*
++ * This helper handles variable-length opaques which belong to protocol
++ * elements that this implementation does not support.
++ */
++static __be32
++nfsd4_decode_ignored_string(struct nfsd4_compoundargs *argp, u32 maxlen)
++{
++ u32 len;
++
++ if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
++ return nfserr_bad_xdr;
++ if (maxlen && len > maxlen)
++ return nfserr_bad_xdr;
++ if (!xdr_inline_decode(argp->xdr, len))
+ return nfserr_bad_xdr;
+- }
+- head->iov_base = argp->p;
+- head->iov_len = avail;
+- *pagelist = argp->pagelist;
+
+- len = XDR_QUADLEN(buflen) << 2;
+- if (len >= avail) {
+- len -= avail;
++ return nfs_ok;
++}
+
+- pages = len >> PAGE_SHIFT;
+- argp->pagelist += pages;
+- argp->pagelen -= pages * PAGE_SIZE;
+- len -= pages * PAGE_SIZE;
++static __be32
++nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
++{
++ __be32 *p;
++ u32 len;
+
+- next_decode_page(argp);
+- }
+- argp->p += XDR_QUADLEN(len);
++ if (xdr_stream_decode_u32(argp->xdr, &len) < 0)
++ return nfserr_bad_xdr;
++ if (len == 0 || len > NFS4_OPAQUE_LIMIT)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, len);
++ if (!p)
++ return nfserr_bad_xdr;
++ o->data = svcxdr_savemem(argp, p, len);
++ if (!o->data)
++ return nfserr_jukebox;
++ o->len = len;
+
+- return 0;
++ return nfs_ok;
+ }
+
+-/**
+- * savemem - duplicate a chunk of memory for later processing
+- * @argp: NFSv4 compound argument structure to be freed with
+- * @p: pointer to be duplicated
+- * @nbytes: length to be duplicated
+- *
+- * Returns a pointer to a copy of @nbytes bytes of memory at @p
+- * that are preserved until processing of the NFSv4 compound
+- * operation described by @argp finishes.
+- */
+-static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
++static __be32
++nfsd4_decode_component4(struct nfsd4_compoundargs *argp, char **namp, u32 *lenp)
+ {
+- void *ret;
++ __be32 *p, status;
+
+- ret = svcxdr_tmpalloc(argp, nbytes);
+- if (!ret)
+- return NULL;
+- memcpy(ret, p, nbytes);
+- return ret;
++ if (xdr_stream_decode_u32(argp->xdr, lenp) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, *lenp);
++ if (!p)
++ return nfserr_bad_xdr;
++ status = check_filename((char *)p, *lenp);
++ if (status)
++ return status;
++ *namp = svcxdr_savemem(argp, p, *lenp);
++ if (!*namp)
++ return nfserr_jukebox;
++
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec64 *tv)
++nfsd4_decode_nfstime4(struct nfsd4_compoundargs *argp, struct timespec64 *tv)
+ {
+- DECODE_HEAD;
++ __be32 *p;
+
+- READ_BUF(12);
++ p = xdr_inline_decode(argp->xdr, XDR_UNIT * 3);
++ if (!p)
++ return nfserr_bad_xdr;
+ p = xdr_decode_hyper(p, &tv->tv_sec);
+ tv->tv_nsec = be32_to_cpup(p++);
+ if (tv->tv_nsec >= (u32)1000000000)
+ return nfserr_inval;
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_verifier4(struct nfsd4_compoundargs *argp, nfs4_verifier *verf)
++{
++ __be32 *p;
++
++ p = xdr_inline_decode(argp->xdr, NFS4_VERIFIER_SIZE);
++ if (!p)
++ return nfserr_bad_xdr;
++ memcpy(verf->data, p, sizeof(verf->data));
++ return nfs_ok;
++}
++
++/**
++ * nfsd4_decode_bitmap4 - Decode an NFSv4 bitmap4
++ * @argp: NFSv4 compound argument structure
++ * @bmval: pointer to an array of u32's to decode into
++ * @bmlen: size of the @bmval array
++ *
++ * The server needs to return nfs_ok rather than nfserr_bad_xdr when
++ * encountering bitmaps containing bits it does not recognize. This
++ * includes bits in bitmap words past WORDn, where WORDn is the last
++ * bitmap WORD the implementation currently supports. Thus we are
++ * careful here to simply ignore bits in bitmap words that this
++ * implementation has yet to support explicitly.
++ *
++ * Return values:
++ * %nfs_ok: @bmval populated successfully
++ * %nfserr_bad_xdr: the encoded bitmap was invalid
++ */
++static __be32
++nfsd4_decode_bitmap4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen)
++{
++ ssize_t status;
+
+- DECODE_TAIL;
++ status = xdr_stream_decode_uint32_array(argp->xdr, bmval, bmlen);
++ return status == -EBADMSG ? nfserr_bad_xdr : nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
++nfsd4_decode_nfsace4(struct nfsd4_compoundargs *argp, struct nfs4_ace *ace)
++{
++ __be32 *p, status;
++ u32 length;
++
++ if (xdr_stream_decode_u32(argp->xdr, &ace->type) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &ace->flag) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &ace->access_mask) < 0)
++ return nfserr_bad_xdr;
++
++ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, length);
++ if (!p)
++ return nfserr_bad_xdr;
++ ace->whotype = nfs4_acl_get_whotype((char *)p, length);
++ if (ace->whotype != NFS4_ACL_WHO_NAMED)
++ status = nfs_ok;
++ else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
++ status = nfsd_map_name_to_gid(argp->rqstp,
++ (char *)p, length, &ace->who_gid);
++ else
++ status = nfsd_map_name_to_uid(argp->rqstp,
++ (char *)p, length, &ace->who_uid);
++
++ return status;
++}
++
++/* A counted array of nfsace4's */
++static noinline __be32
++nfsd4_decode_acl(struct nfsd4_compoundargs *argp, struct nfs4_acl **acl)
+ {
+- u32 bmlen;
+- DECODE_HEAD;
++ struct nfs4_ace *ace;
++ __be32 status;
++ u32 count;
++
++ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
++ return nfserr_bad_xdr;
++
++ if (count > xdr_stream_remaining(argp->xdr) / 20)
++ /*
++ * Even with 4-byte names there wouldn't be
++ * space for that many aces; something fishy is
++ * going on:
++ */
++ return nfserr_fbig;
++
++ *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(count));
++ if (*acl == NULL)
++ return nfserr_jukebox;
+
+- bmval[0] = 0;
+- bmval[1] = 0;
+- bmval[2] = 0;
++ (*acl)->naces = count;
++ for (ace = (*acl)->aces; ace < (*acl)->aces + count; ace++) {
++ status = nfsd4_decode_nfsace4(argp, ace);
++ if (status)
++ return status;
++ }
++
++ return nfs_ok;
++}
++
++static noinline __be32
++nfsd4_decode_security_label(struct nfsd4_compoundargs *argp,
++ struct xdr_netobj *label)
++{
++ u32 lfs, pi, length;
++ __be32 *p;
+
+- READ_BUF(4);
+- bmlen = be32_to_cpup(p++);
+- if (bmlen > 1000)
+- goto xdr_error;
++ if (xdr_stream_decode_u32(argp->xdr, &lfs) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &pi) < 0)
++ return nfserr_bad_xdr;
+
+- READ_BUF(bmlen << 2);
+- if (bmlen > 0)
+- bmval[0] = be32_to_cpup(p++);
+- if (bmlen > 1)
+- bmval[1] = be32_to_cpup(p++);
+- if (bmlen > 2)
+- bmval[2] = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
++ return nfserr_bad_xdr;
++ if (length > NFS4_MAXLABELLEN)
++ return nfserr_badlabel;
++ p = xdr_inline_decode(argp->xdr, length);
++ if (!p)
++ return nfserr_bad_xdr;
++ label->len = length;
++ label->data = svcxdr_dupstr(argp, p, length);
++ if (!label->data)
++ return nfserr_jukebox;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
+- struct iattr *iattr, struct nfs4_acl **acl,
+- struct xdr_netobj *label, int *umask)
++nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen,
++ struct iattr *iattr, struct nfs4_acl **acl,
++ struct xdr_netobj *label, int *umask)
+ {
+- int expected_len, len = 0;
+- u32 dummy32;
+- char *buf;
++ unsigned int starting_pos;
++ u32 attrlist4_count;
++ __be32 *p, status;
+
+- DECODE_HEAD;
+ iattr->ia_valid = 0;
+- if ((status = nfsd4_decode_bitmap(argp, bmval)))
+- return status;
++ status = nfsd4_decode_bitmap4(argp, bmval, bmlen);
++ if (status)
++ return nfserr_bad_xdr;
+
+ if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0
+ || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1
+@@ -380,96 +398,69 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
+ return nfserr_attrnotsupp;
+ }
+
+- READ_BUF(4);
+- expected_len = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &attrlist4_count) < 0)
++ return nfserr_bad_xdr;
++ starting_pos = xdr_stream_pos(argp->xdr);
+
+ if (bmval[0] & FATTR4_WORD0_SIZE) {
+- READ_BUF(8);
+- len += 8;
+- p = xdr_decode_hyper(p, &iattr->ia_size);
++ u64 size;
++
++ if (xdr_stream_decode_u64(argp->xdr, &size) < 0)
++ return nfserr_bad_xdr;
++ iattr->ia_size = size;
+ iattr->ia_valid |= ATTR_SIZE;
+ }
+ if (bmval[0] & FATTR4_WORD0_ACL) {
+- u32 nace;
+- struct nfs4_ace *ace;
+-
+- READ_BUF(4); len += 4;
+- nace = be32_to_cpup(p++);
+-
+- if (nace > compoundargs_bytes_left(argp)/20)
+- /*
+- * Even with 4-byte names there wouldn't be
+- * space for that many aces; something fishy is
+- * going on:
+- */
+- return nfserr_fbig;
+-
+- *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace));
+- if (*acl == NULL)
+- return nfserr_jukebox;
+-
+- (*acl)->naces = nace;
+- for (ace = (*acl)->aces; ace < (*acl)->aces + nace; ace++) {
+- READ_BUF(16); len += 16;
+- ace->type = be32_to_cpup(p++);
+- ace->flag = be32_to_cpup(p++);
+- ace->access_mask = be32_to_cpup(p++);
+- dummy32 = be32_to_cpup(p++);
+- READ_BUF(dummy32);
+- len += XDR_QUADLEN(dummy32) << 2;
+- READMEM(buf, dummy32);
+- ace->whotype = nfs4_acl_get_whotype(buf, dummy32);
+- status = nfs_ok;
+- if (ace->whotype != NFS4_ACL_WHO_NAMED)
+- ;
+- else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP)
+- status = nfsd_map_name_to_gid(argp->rqstp,
+- buf, dummy32, &ace->who_gid);
+- else
+- status = nfsd_map_name_to_uid(argp->rqstp,
+- buf, dummy32, &ace->who_uid);
+- if (status)
+- return status;
+- }
++ status = nfsd4_decode_acl(argp, acl);
++ if (status)
++ return status;
+ } else
+ *acl = NULL;
+ if (bmval[1] & FATTR4_WORD1_MODE) {
+- READ_BUF(4);
+- len += 4;
+- iattr->ia_mode = be32_to_cpup(p++);
++ u32 mode;
++
++ if (xdr_stream_decode_u32(argp->xdr, &mode) < 0)
++ return nfserr_bad_xdr;
++ iattr->ia_mode = mode;
+ iattr->ia_mode &= (S_IFMT | S_IALLUGO);
+ iattr->ia_valid |= ATTR_MODE;
+ }
+ if (bmval[1] & FATTR4_WORD1_OWNER) {
+- READ_BUF(4);
+- len += 4;
+- dummy32 = be32_to_cpup(p++);
+- READ_BUF(dummy32);
+- len += (XDR_QUADLEN(dummy32) << 2);
+- READMEM(buf, dummy32);
+- if ((status = nfsd_map_name_to_uid(argp->rqstp, buf, dummy32, &iattr->ia_uid)))
++ u32 length;
++
++ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, length);
++ if (!p)
++ return nfserr_bad_xdr;
++ status = nfsd_map_name_to_uid(argp->rqstp, (char *)p, length,
++ &iattr->ia_uid);
++ if (status)
+ return status;
+ iattr->ia_valid |= ATTR_UID;
+ }
+ if (bmval[1] & FATTR4_WORD1_OWNER_GROUP) {
+- READ_BUF(4);
+- len += 4;
+- dummy32 = be32_to_cpup(p++);
+- READ_BUF(dummy32);
+- len += (XDR_QUADLEN(dummy32) << 2);
+- READMEM(buf, dummy32);
+- if ((status = nfsd_map_name_to_gid(argp->rqstp, buf, dummy32, &iattr->ia_gid)))
++ u32 length;
++
++ if (xdr_stream_decode_u32(argp->xdr, &length) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, length);
++ if (!p)
++ return nfserr_bad_xdr;
++ status = nfsd_map_name_to_gid(argp->rqstp, (char *)p, length,
++ &iattr->ia_gid);
++ if (status)
+ return status;
+ iattr->ia_valid |= ATTR_GID;
+ }
+ if (bmval[1] & FATTR4_WORD1_TIME_ACCESS_SET) {
+- READ_BUF(4);
+- len += 4;
+- dummy32 = be32_to_cpup(p++);
+- switch (dummy32) {
++ u32 set_it;
++
++ if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0)
++ return nfserr_bad_xdr;
++ switch (set_it) {
+ case NFS4_SET_TO_CLIENT_TIME:
+- len += 12;
+- status = nfsd4_decode_time(argp, &iattr->ia_atime);
++ status = nfsd4_decode_nfstime4(argp, &iattr->ia_atime);
+ if (status)
+ return status;
+ iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
+@@ -478,17 +469,26 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
+ iattr->ia_valid |= ATTR_ATIME;
+ break;
+ default:
+- goto xdr_error;
++ return nfserr_bad_xdr;
+ }
+ }
++ if (bmval[1] & FATTR4_WORD1_TIME_CREATE) {
++ struct timespec64 ts;
++
++ /* No Linux filesystem supports setting this attribute. */
++ bmval[1] &= ~FATTR4_WORD1_TIME_CREATE;
++ status = nfsd4_decode_nfstime4(argp, &ts);
++ if (status)
++ return status;
++ }
+ if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) {
+- READ_BUF(4);
+- len += 4;
+- dummy32 = be32_to_cpup(p++);
+- switch (dummy32) {
++ u32 set_it;
++
++ if (xdr_stream_decode_u32(argp->xdr, &set_it) < 0)
++ return nfserr_bad_xdr;
++ switch (set_it) {
+ case NFS4_SET_TO_CLIENT_TIME:
+- len += 12;
+- status = nfsd4_decode_time(argp, &iattr->ia_mtime);
++ status = nfsd4_decode_nfstime4(argp, &iattr->ia_mtime);
+ if (status)
+ return status;
+ iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
+@@ -497,222 +497,335 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
+ iattr->ia_valid |= ATTR_MTIME;
+ break;
+ default:
+- goto xdr_error;
++ return nfserr_bad_xdr;
+ }
+ }
+-
+ label->len = 0;
+ if (IS_ENABLED(CONFIG_NFSD_V4_SECURITY_LABEL) &&
+ bmval[2] & FATTR4_WORD2_SECURITY_LABEL) {
+- READ_BUF(4);
+- len += 4;
+- dummy32 = be32_to_cpup(p++); /* lfs: we don't use it */
+- READ_BUF(4);
+- len += 4;
+- dummy32 = be32_to_cpup(p++); /* pi: we don't use it either */
+- READ_BUF(4);
+- len += 4;
+- dummy32 = be32_to_cpup(p++);
+- READ_BUF(dummy32);
+- if (dummy32 > NFS4_MAXLABELLEN)
+- return nfserr_badlabel;
+- len += (XDR_QUADLEN(dummy32) << 2);
+- READMEM(buf, dummy32);
+- label->len = dummy32;
+- label->data = svcxdr_dupstr(argp, buf, dummy32);
+- if (!label->data)
+- return nfserr_jukebox;
++ status = nfsd4_decode_security_label(argp, label);
++ if (status)
++ return status;
+ }
+ if (bmval[2] & FATTR4_WORD2_MODE_UMASK) {
++ u32 mode, mask;
++
+ if (!umask)
+- goto xdr_error;
+- READ_BUF(8);
+- len += 8;
+- dummy32 = be32_to_cpup(p++);
+- iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO);
+- dummy32 = be32_to_cpup(p++);
+- *umask = dummy32 & S_IRWXUGO;
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &mode) < 0)
++ return nfserr_bad_xdr;
++ iattr->ia_mode = mode & (S_IFMT | S_IALLUGO);
++ if (xdr_stream_decode_u32(argp->xdr, &mask) < 0)
++ return nfserr_bad_xdr;
++ *umask = mask & S_IRWXUGO;
+ iattr->ia_valid |= ATTR_MODE;
+ }
+- if (len != expected_len)
+- goto xdr_error;
+
+- DECODE_TAIL;
++ /* request sanity: did attrlist4 contain the expected number of words? */
++ if (attrlist4_count != xdr_stream_pos(argp->xdr) - starting_pos)
++ return nfserr_bad_xdr;
++
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid)
++nfsd4_decode_stateid4(struct nfsd4_compoundargs *argp, stateid_t *sid)
+ {
+- DECODE_HEAD;
++ __be32 *p;
+
+- READ_BUF(sizeof(stateid_t));
++ p = xdr_inline_decode(argp->xdr, NFS4_STATEID_SIZE);
++ if (!p)
++ return nfserr_bad_xdr;
+ sid->si_generation = be32_to_cpup(p++);
+- COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t));
+-
+- DECODE_TAIL;
++ memcpy(&sid->si_opaque, p, sizeof(sid->si_opaque));
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access)
++nfsd4_decode_clientid4(struct nfsd4_compoundargs *argp, clientid_t *clientid)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(4);
+- access->ac_req_access = be32_to_cpup(p++);
++ __be32 *p;
+
+- DECODE_TAIL;
++ p = xdr_inline_decode(argp->xdr, sizeof(__be64));
++ if (!p)
++ return nfserr_bad_xdr;
++ memcpy(clientid, p, sizeof(*clientid));
++ return nfs_ok;
+ }
+
+-static __be32 nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
++static __be32
++nfsd4_decode_state_owner4(struct nfsd4_compoundargs *argp,
++ clientid_t *clientid, struct xdr_netobj *owner)
+ {
+- DECODE_HEAD;
+- struct user_namespace *userns = nfsd_user_namespace(argp->rqstp);
+- u32 dummy, uid, gid;
+- char *machine_name;
+- int i;
+- int nr_secflavs;
++ __be32 status;
+
+- /* callback_sec_params4 */
+- READ_BUF(4);
+- nr_secflavs = be32_to_cpup(p++);
+- if (nr_secflavs)
+- cbs->flavor = (u32)(-1);
+- else
+- /* Is this legal? Be generous, take it to mean AUTH_NONE: */
+- cbs->flavor = 0;
+- for (i = 0; i < nr_secflavs; ++i) {
+- READ_BUF(4);
+- dummy = be32_to_cpup(p++);
+- switch (dummy) {
+- case RPC_AUTH_NULL:
+- /* Nothing to read */
+- if (cbs->flavor == (u32)(-1))
+- cbs->flavor = RPC_AUTH_NULL;
+- break;
+- case RPC_AUTH_UNIX:
+- READ_BUF(8);
+- /* stamp */
+- dummy = be32_to_cpup(p++);
+-
+- /* machine name */
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy);
+- SAVEMEM(machine_name, dummy);
+-
+- /* uid, gid */
+- READ_BUF(8);
+- uid = be32_to_cpup(p++);
+- gid = be32_to_cpup(p++);
+-
+- /* more gids */
+- READ_BUF(4);
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy * 4);
+- if (cbs->flavor == (u32)(-1)) {
+- kuid_t kuid = make_kuid(userns, uid);
+- kgid_t kgid = make_kgid(userns, gid);
+- if (uid_valid(kuid) && gid_valid(kgid)) {
+- cbs->uid = kuid;
+- cbs->gid = kgid;
+- cbs->flavor = RPC_AUTH_UNIX;
+- } else {
+- dprintk("RPC_AUTH_UNIX with invalid"
+- "uid or gid ignoring!\n");
+- }
+- }
+- break;
+- case RPC_AUTH_GSS:
+- dprintk("RPC_AUTH_GSS callback secflavor "
+- "not supported!\n");
+- READ_BUF(8);
+- /* gcbp_service */
+- dummy = be32_to_cpup(p++);
+- /* gcbp_handle_from_server */
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy);
+- p += XDR_QUADLEN(dummy);
+- /* gcbp_handle_from_client */
+- READ_BUF(4);
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy);
+- break;
+- default:
+- dprintk("Illegal callback secflavor\n");
+- return nfserr_inval;
+- }
+- }
+- DECODE_TAIL;
++ status = nfsd4_decode_clientid4(argp, clientid);
++ if (status)
++ return status;
++ return nfsd4_decode_opaque(argp, owner);
+ }
+
+-static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp, struct nfsd4_backchannel_ctl *bc)
++#ifdef CONFIG_NFSD_PNFS
++static __be32
++nfsd4_decode_deviceid4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_deviceid *devid)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(4);
+- bc->bc_cb_program = be32_to_cpup(p++);
+- nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
++ __be32 *p;
+
+- DECODE_TAIL;
++ p = xdr_inline_decode(argp->xdr, NFS4_DEVICEID4_SIZE);
++ if (!p)
++ return nfserr_bad_xdr;
++ memcpy(devid, p, sizeof(*devid));
++ return nfs_ok;
+ }
+
+-static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp, struct nfsd4_bind_conn_to_session *bcts)
++static __be32
++nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_layoutcommit *lcp)
+ {
+- DECODE_HEAD;
++ if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0)
++ return nfserr_bad_xdr;
++ if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES)
++ return nfserr_bad_xdr;
++ if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX)
++ return nfserr_bad_xdr;
++
++ if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0)
++ return nfserr_bad_xdr;
++ if (lcp->lc_up_len > 0) {
++ lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len);
++ if (!lcp->lc_up_layout)
++ return nfserr_bad_xdr;
++ }
+
+- READ_BUF(NFS4_MAX_SESSIONID_LEN + 8);
+- COPYMEM(bcts->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+- bcts->dir = be32_to_cpup(p++);
+- /* XXX: skipping ctsa_use_conn_in_rdma_mode. Perhaps Tom Tucker
+- * could help us figure out we should be using it. */
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close)
++nfsd4_decode_layoutreturn4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_layoutreturn *lrp)
+ {
+- DECODE_HEAD;
++ __be32 status;
+
+- READ_BUF(4);
+- close->cl_seqid = be32_to_cpup(p++);
+- return nfsd4_decode_stateid(argp, &close->cl_stateid);
++ if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_return_type) < 0)
++ return nfserr_bad_xdr;
++ switch (lrp->lr_return_type) {
++ case RETURN_FILE:
++ if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lrp->lr_seg.length) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_stateid4(argp, &lrp->lr_sid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &lrp->lrf_body_len) < 0)
++ return nfserr_bad_xdr;
++ if (lrp->lrf_body_len > 0) {
++ lrp->lrf_body = xdr_inline_decode(argp->xdr, lrp->lrf_body_len);
++ if (!lrp->lrf_body)
++ return nfserr_bad_xdr;
++ }
++ break;
++ case RETURN_FSID:
++ case RETURN_ALL:
++ lrp->lr_seg.offset = 0;
++ lrp->lr_seg.length = NFS4_MAX_UINT64;
++ break;
++ default:
++ return nfserr_bad_xdr;
++ }
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
++#endif /* CONFIG_NFSD_PNFS */
+
+ static __be32
+-nfsd4_decode_commit(struct nfsd4_compoundargs *argp, struct nfsd4_commit *commit)
++nfsd4_decode_sessionid4(struct nfsd4_compoundargs *argp,
++ struct nfs4_sessionid *sessionid)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(12);
+- p = xdr_decode_hyper(p, &commit->co_offset);
+- commit->co_count = be32_to_cpup(p++);
++ __be32 *p;
+
+- DECODE_TAIL;
++ p = xdr_inline_decode(argp->xdr, NFS4_MAX_SESSIONID_LEN);
++ if (!p)
++ return nfserr_bad_xdr;
++ memcpy(sessionid->data, p, sizeof(sessionid->data));
++ return nfs_ok;
+ }
+
++/* Defined in Appendix A of RFC 5531 */
+ static __be32
+-nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create)
++nfsd4_decode_authsys_parms(struct nfsd4_compoundargs *argp,
++ struct nfsd4_cb_sec *cbs)
+ {
+- DECODE_HEAD;
++ u32 stamp, gidcount, uid, gid;
++ __be32 *p, status;
+
+- READ_BUF(4);
+- create->cr_type = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &stamp) < 0)
++ return nfserr_bad_xdr;
++ /* machine name */
++ status = nfsd4_decode_ignored_string(argp, 255);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &uid) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &gid) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &gidcount) < 0)
++ return nfserr_bad_xdr;
++ if (gidcount > 16)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, gidcount << 2);
++ if (!p)
++ return nfserr_bad_xdr;
++ if (cbs->flavor == (u32)(-1)) {
++ struct user_namespace *userns = nfsd_user_namespace(argp->rqstp);
++
++ kuid_t kuid = make_kuid(userns, uid);
++ kgid_t kgid = make_kgid(userns, gid);
++ if (uid_valid(kuid) && gid_valid(kgid)) {
++ cbs->uid = kuid;
++ cbs->gid = kgid;
++ cbs->flavor = RPC_AUTH_UNIX;
++ } else {
++ dprintk("RPC_AUTH_UNIX with invalid uid or gid, ignoring!\n");
++ }
++ }
++
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_gss_cb_handles4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_cb_sec *cbs)
++{
++ __be32 status;
++ u32 service;
++
++ dprintk("RPC_AUTH_GSS callback secflavor not supported!\n");
++
++ if (xdr_stream_decode_u32(argp->xdr, &service) < 0)
++ return nfserr_bad_xdr;
++ if (service < RPC_GSS_SVC_NONE || service > RPC_GSS_SVC_PRIVACY)
++ return nfserr_bad_xdr;
++ /* gcbp_handle_from_server */
++ status = nfsd4_decode_ignored_string(argp, 0);
++ if (status)
++ return status;
++ /* gcbp_handle_from_client */
++ status = nfsd4_decode_ignored_string(argp, 0);
++ if (status)
++ return status;
++
++ return nfs_ok;
++}
++
++/* a counted array of callback_sec_parms4 items */
++static __be32
++nfsd4_decode_cb_sec(struct nfsd4_compoundargs *argp, struct nfsd4_cb_sec *cbs)
++{
++ u32 i, secflavor, nr_secflavs;
++ __be32 status;
++
++ /* callback_sec_params4 */
++ if (xdr_stream_decode_u32(argp->xdr, &nr_secflavs) < 0)
++ return nfserr_bad_xdr;
++ if (nr_secflavs)
++ cbs->flavor = (u32)(-1);
++ else
++ /* Is this legal? Be generous, take it to mean AUTH_NONE: */
++ cbs->flavor = 0;
++
++ for (i = 0; i < nr_secflavs; ++i) {
++ if (xdr_stream_decode_u32(argp->xdr, &secflavor) < 0)
++ return nfserr_bad_xdr;
++ switch (secflavor) {
++ case RPC_AUTH_NULL:
++ /* void */
++ if (cbs->flavor == (u32)(-1))
++ cbs->flavor = RPC_AUTH_NULL;
++ break;
++ case RPC_AUTH_UNIX:
++ status = nfsd4_decode_authsys_parms(argp, cbs);
++ if (status)
++ return status;
++ break;
++ case RPC_AUTH_GSS:
++ status = nfsd4_decode_gss_cb_handles4(argp, cbs);
++ if (status)
++ return status;
++ break;
++ default:
++ return nfserr_inval;
++ }
++ }
++
++ return nfs_ok;
++}
++
++
++/*
++ * NFSv4 operation argument decoders
++ */
++
++static __be32
++nfsd4_decode_access(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_access *access = &u->access;
++ if (xdr_stream_decode_u32(argp->xdr, &access->ac_req_access) < 0)
++ return nfserr_bad_xdr;
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_close(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
++{
++ struct nfsd4_close *close = &u->close;
++ if (xdr_stream_decode_u32(argp->xdr, &close->cl_seqid) < 0)
++ return nfserr_bad_xdr;
++ return nfsd4_decode_stateid4(argp, &close->cl_stateid);
++}
++
++
++static __be32
++nfsd4_decode_commit(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
++{
++ struct nfsd4_commit *commit = &u->commit;
++ if (xdr_stream_decode_u64(argp->xdr, &commit->co_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &commit->co_count) < 0)
++ return nfserr_bad_xdr;
++ memset(&commit->co_verf, 0, sizeof(commit->co_verf));
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_create(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
++{
++ struct nfsd4_create *create = &u->create;
++ __be32 *p, status;
++
++ memset(create, 0, sizeof(*create));
++ if (xdr_stream_decode_u32(argp->xdr, &create->cr_type) < 0)
++ return nfserr_bad_xdr;
+ switch (create->cr_type) {
+ case NF4LNK:
+- READ_BUF(4);
+- create->cr_datalen = be32_to_cpup(p++);
+- READ_BUF(create->cr_datalen);
++ if (xdr_stream_decode_u32(argp->xdr, &create->cr_datalen) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, create->cr_datalen);
++ if (!p)
++ return nfserr_bad_xdr;
+ create->cr_data = svcxdr_dupstr(argp, p, create->cr_datalen);
+ if (!create->cr_data)
+ return nfserr_jukebox;
+ break;
+ case NF4BLK:
+ case NF4CHR:
+- READ_BUF(8);
+- create->cr_specdata1 = be32_to_cpup(p++);
+- create->cr_specdata2 = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata1) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &create->cr_specdata2) < 0)
++ return nfserr_bad_xdr;
+ break;
+ case NF4SOCK:
+ case NF4FIFO:
+@@ -720,151 +833,221 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
+ default:
+ break;
+ }
+-
+- READ_BUF(4);
+- create->cr_namelen = be32_to_cpup(p++);
+- READ_BUF(create->cr_namelen);
+- SAVEMEM(create->cr_name, create->cr_namelen);
+- if ((status = check_filename(create->cr_name, create->cr_namelen)))
++ status = nfsd4_decode_component4(argp, &create->cr_name,
++ &create->cr_namelen);
++ if (status)
+ return status;
+-
+- status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr,
+- &create->cr_acl, &create->cr_label,
+- &create->cr_umask);
++ status = nfsd4_decode_fattr4(argp, create->cr_bmval,
++ ARRAY_SIZE(create->cr_bmval),
++ &create->cr_iattr, &create->cr_acl,
++ &create->cr_label, &create->cr_umask);
+ if (status)
+- goto out;
++ return status;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static inline __be32
+-nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr)
++nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- return nfsd4_decode_stateid(argp, &dr->dr_stateid);
++ struct nfsd4_delegreturn *dr = &u->delegreturn;
++ return nfsd4_decode_stateid4(argp, &dr->dr_stateid);
+ }
+
+ static inline __be32
+-nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, struct nfsd4_getattr *getattr)
++nfsd4_decode_getattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- return nfsd4_decode_bitmap(argp, getattr->ga_bmval);
++ struct nfsd4_getattr *getattr = &u->getattr;
++ memset(getattr, 0, sizeof(*getattr));
++ return nfsd4_decode_bitmap4(argp, getattr->ga_bmval,
++ ARRAY_SIZE(getattr->ga_bmval));
+ }
+
+ static __be32
+-nfsd4_decode_link(struct nfsd4_compoundargs *argp, struct nfsd4_link *link)
++nfsd4_decode_link(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_link *link = &u->link;
++ memset(link, 0, sizeof(*link));
++ return nfsd4_decode_component4(argp, &link->li_name, &link->li_namelen);
++}
+
+- READ_BUF(4);
+- link->li_namelen = be32_to_cpup(p++);
+- READ_BUF(link->li_namelen);
+- SAVEMEM(link->li_name, link->li_namelen);
+- if ((status = check_filename(link->li_name, link->li_namelen)))
+- return status;
++static __be32
++nfsd4_decode_open_to_lock_owner4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_lock *lock)
++{
++ __be32 status;
+
+- DECODE_TAIL;
++ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_open_seqid) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_stateid4(argp, &lock->lk_new_open_stateid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_new_lock_seqid) < 0)
++ return nfserr_bad_xdr;
++ return nfsd4_decode_state_owner4(argp, &lock->lk_new_clientid,
++ &lock->lk_new_owner);
+ }
+
+ static __be32
+-nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
++nfsd4_decode_exist_lock_owner4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_lock *lock)
+ {
+- DECODE_HEAD;
++ __be32 status;
+
+- /*
+- * type, reclaim(boolean), offset, length, new_lock_owner(boolean)
+- */
+- READ_BUF(28);
+- lock->lk_type = be32_to_cpup(p++);
+- if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
+- goto xdr_error;
+- lock->lk_reclaim = be32_to_cpup(p++);
+- p = xdr_decode_hyper(p, &lock->lk_offset);
+- p = xdr_decode_hyper(p, &lock->lk_length);
+- lock->lk_is_new = be32_to_cpup(p++);
+-
+- if (lock->lk_is_new) {
+- READ_BUF(4);
+- lock->lk_new_open_seqid = be32_to_cpup(p++);
+- status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid);
+- if (status)
+- return status;
+- READ_BUF(8 + sizeof(clientid_t));
+- lock->lk_new_lock_seqid = be32_to_cpup(p++);
+- COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t));
+- lock->lk_new_owner.len = be32_to_cpup(p++);
+- READ_BUF(lock->lk_new_owner.len);
+- READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len);
+- } else {
+- status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid);
+- if (status)
+- return status;
+- READ_BUF(4);
+- lock->lk_old_lock_seqid = be32_to_cpup(p++);
+- }
++ status = nfsd4_decode_stateid4(argp, &lock->lk_old_lock_stateid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_old_lock_seqid) < 0)
++ return nfserr_bad_xdr;
+
+- DECODE_TAIL;
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_locker4(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock)
++{
++ if (xdr_stream_decode_bool(argp->xdr, &lock->lk_is_new) < 0)
++ return nfserr_bad_xdr;
++ if (lock->lk_is_new)
++ return nfsd4_decode_open_to_lock_owner4(argp, lock);
++ return nfsd4_decode_exist_lock_owner4(argp, lock);
+ }
+
+ static __be32
+-nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
++nfsd4_decode_lock(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(32);
+- lockt->lt_type = be32_to_cpup(p++);
+- if((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
+- goto xdr_error;
+- p = xdr_decode_hyper(p, &lockt->lt_offset);
+- p = xdr_decode_hyper(p, &lockt->lt_length);
+- COPYMEM(&lockt->lt_clientid, 8);
+- lockt->lt_owner.len = be32_to_cpup(p++);
+- READ_BUF(lockt->lt_owner.len);
+- READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
++ struct nfsd4_lock *lock = &u->lock;
++ memset(lock, 0, sizeof(*lock));
++ if (xdr_stream_decode_u32(argp->xdr, &lock->lk_type) < 0)
++ return nfserr_bad_xdr;
++ if ((lock->lk_type < NFS4_READ_LT) || (lock->lk_type > NFS4_WRITEW_LT))
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_bool(argp->xdr, &lock->lk_reclaim) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lock->lk_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lock->lk_length) < 0)
++ return nfserr_bad_xdr;
++ return nfsd4_decode_locker4(argp, lock);
++}
+
+- DECODE_TAIL;
++static __be32
++nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
++{
++ struct nfsd4_lockt *lockt = &u->lockt;
++ memset(lockt, 0, sizeof(*lockt));
++ if (xdr_stream_decode_u32(argp->xdr, &lockt->lt_type) < 0)
++ return nfserr_bad_xdr;
++ if ((lockt->lt_type < NFS4_READ_LT) || (lockt->lt_type > NFS4_WRITEW_LT))
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lockt->lt_length) < 0)
++ return nfserr_bad_xdr;
++ return nfsd4_decode_state_owner4(argp, &lockt->lt_clientid,
++ &lockt->lt_owner);
+ }
+
+ static __be32
+-nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku)
++nfsd4_decode_locku(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_locku *locku = &u->locku;
++ __be32 status;
+
+- READ_BUF(8);
+- locku->lu_type = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &locku->lu_type) < 0)
++ return nfserr_bad_xdr;
+ if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT))
+- goto xdr_error;
+- locku->lu_seqid = be32_to_cpup(p++);
+- status = nfsd4_decode_stateid(argp, &locku->lu_stateid);
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &locku->lu_seqid) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_stateid4(argp, &locku->lu_stateid);
+ if (status)
+ return status;
+- READ_BUF(16);
+- p = xdr_decode_hyper(p, &locku->lu_offset);
+- p = xdr_decode_hyper(p, &locku->lu_length);
++ if (xdr_stream_decode_u64(argp->xdr, &locku->lu_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &locku->lu_length) < 0)
++ return nfserr_bad_xdr;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, struct nfsd4_lookup *lookup)
++nfsd4_decode_lookup(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_lookup *lookup = &u->lookup;
++ return nfsd4_decode_component4(argp, &lookup->lo_name, &lookup->lo_len);
++}
+
+- READ_BUF(4);
+- lookup->lo_len = be32_to_cpup(p++);
+- READ_BUF(lookup->lo_len);
+- SAVEMEM(lookup->lo_name, lookup->lo_len);
+- if ((status = check_filename(lookup->lo_name, lookup->lo_len)))
+- return status;
++static __be32
++nfsd4_decode_createhow4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
++{
++ __be32 status;
++
++ if (xdr_stream_decode_u32(argp->xdr, &open->op_createmode) < 0)
++ return nfserr_bad_xdr;
++ switch (open->op_createmode) {
++ case NFS4_CREATE_UNCHECKED:
++ case NFS4_CREATE_GUARDED:
++ status = nfsd4_decode_fattr4(argp, open->op_bmval,
++ ARRAY_SIZE(open->op_bmval),
++ &open->op_iattr, &open->op_acl,
++ &open->op_label, &open->op_umask);
++ if (status)
++ return status;
++ break;
++ case NFS4_CREATE_EXCLUSIVE:
++ status = nfsd4_decode_verifier4(argp, &open->op_verf);
++ if (status)
++ return status;
++ break;
++ case NFS4_CREATE_EXCLUSIVE4_1:
++ if (argp->minorversion < 1)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_verifier4(argp, &open->op_verf);
++ if (status)
++ return status;
++ status = nfsd4_decode_fattr4(argp, open->op_bmval,
++ ARRAY_SIZE(open->op_bmval),
++ &open->op_iattr, &open->op_acl,
++ &open->op_label, &open->op_umask);
++ if (status)
++ return status;
++ break;
++ default:
++ return nfserr_bad_xdr;
++ }
++
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_openflag4(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
++{
++ __be32 status;
++
++ if (xdr_stream_decode_u32(argp->xdr, &open->op_create) < 0)
++ return nfserr_bad_xdr;
++ switch (open->op_create) {
++ case NFS4_OPEN_NOCREATE:
++ break;
++ case NFS4_OPEN_CREATE:
++ status = nfsd4_decode_createhow4(argp, open);
++ if (status)
++ return status;
++ break;
++ default:
++ return nfserr_bad_xdr;
++ }
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *share_access, u32 *deleg_want, u32 *deleg_when)
+ {
+- __be32 *p;
+ u32 w;
+
+- READ_BUF(4);
+- w = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &w) < 0)
++ return nfserr_bad_xdr;
+ *share_access = w & NFS4_SHARE_ACCESS_MASK;
+ *deleg_want = w & NFS4_SHARE_WANT_MASK;
+ if (deleg_when)
+@@ -907,930 +1090,935 @@ static __be32 nfsd4_decode_share_access(struct nfsd4_compoundargs *argp, u32 *sh
+ NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED):
+ return nfs_ok;
+ }
+-xdr_error:
+ return nfserr_bad_xdr;
+ }
+
+ static __be32 nfsd4_decode_share_deny(struct nfsd4_compoundargs *argp, u32 *x)
+ {
+- __be32 *p;
+-
+- READ_BUF(4);
+- *x = be32_to_cpup(p++);
+- /* Note: unlinke access bits, deny bits may be zero. */
+- if (*x & ~NFS4_SHARE_DENY_BOTH)
++ if (xdr_stream_decode_u32(argp->xdr, x) < 0)
+ return nfserr_bad_xdr;
+- return nfs_ok;
+-xdr_error:
+- return nfserr_bad_xdr;
+-}
+-
+-static __be32 nfsd4_decode_opaque(struct nfsd4_compoundargs *argp, struct xdr_netobj *o)
+-{
+- __be32 *p;
+-
+- READ_BUF(4);
+- o->len = be32_to_cpup(p++);
+-
+- if (o->len == 0 || o->len > NFS4_OPAQUE_LIMIT)
++ /* Note: unlike access bits, deny bits may be zero. */
++ if (*x & ~NFS4_SHARE_DENY_BOTH)
+ return nfserr_bad_xdr;
+
+- READ_BUF(o->len);
+- SAVEMEM(o->data, o->len);
+ return nfs_ok;
+-xdr_error:
+- return nfserr_bad_xdr;
+ }
+
+ static __be32
+-nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
++nfsd4_decode_open_claim4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_open *open)
+ {
+- DECODE_HEAD;
+- u32 dummy;
+-
+- memset(open->op_bmval, 0, sizeof(open->op_bmval));
+- open->op_iattr.ia_valid = 0;
+- open->op_openowner = NULL;
+-
+- open->op_xdr_error = 0;
+- /* seqid, share_access, share_deny, clientid, ownerlen */
+- READ_BUF(4);
+- open->op_seqid = be32_to_cpup(p++);
+- /* decode, yet ignore deleg_when until supported */
+- status = nfsd4_decode_share_access(argp, &open->op_share_access,
+- &open->op_deleg_want, &dummy);
+- if (status)
+- goto xdr_error;
+- status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+- if (status)
+- goto xdr_error;
+- READ_BUF(sizeof(clientid_t));
+- COPYMEM(&open->op_clientid, sizeof(clientid_t));
+- status = nfsd4_decode_opaque(argp, &open->op_owner);
+- if (status)
+- goto xdr_error;
+- READ_BUF(4);
+- open->op_create = be32_to_cpup(p++);
+- switch (open->op_create) {
+- case NFS4_OPEN_NOCREATE:
+- break;
+- case NFS4_OPEN_CREATE:
+- READ_BUF(4);
+- open->op_createmode = be32_to_cpup(p++);
+- switch (open->op_createmode) {
+- case NFS4_CREATE_UNCHECKED:
+- case NFS4_CREATE_GUARDED:
+- status = nfsd4_decode_fattr(argp, open->op_bmval,
+- &open->op_iattr, &open->op_acl, &open->op_label,
+- &open->op_umask);
+- if (status)
+- goto out;
+- break;
+- case NFS4_CREATE_EXCLUSIVE:
+- READ_BUF(NFS4_VERIFIER_SIZE);
+- COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
+- break;
+- case NFS4_CREATE_EXCLUSIVE4_1:
+- if (argp->minorversion < 1)
+- goto xdr_error;
+- READ_BUF(NFS4_VERIFIER_SIZE);
+- COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE);
+- status = nfsd4_decode_fattr(argp, open->op_bmval,
+- &open->op_iattr, &open->op_acl, &open->op_label,
+- &open->op_umask);
+- if (status)
+- goto out;
+- break;
+- default:
+- goto xdr_error;
+- }
+- break;
+- default:
+- goto xdr_error;
+- }
++ __be32 status;
+
+- /* open_claim */
+- READ_BUF(4);
+- open->op_claim_type = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &open->op_claim_type) < 0)
++ return nfserr_bad_xdr;
+ switch (open->op_claim_type) {
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+- READ_BUF(4);
+- open->op_fname.len = be32_to_cpup(p++);
+- READ_BUF(open->op_fname.len);
+- SAVEMEM(open->op_fname.data, open->op_fname.len);
+- if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
++ status = nfsd4_decode_component4(argp, &open->op_fname,
++ &open->op_fnamelen);
++ if (status)
+ return status;
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+- READ_BUF(4);
+- open->op_delegate_type = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &open->op_delegate_type) < 0)
++ return nfserr_bad_xdr;
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+- status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
++ status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+- READ_BUF(4);
+- open->op_fname.len = be32_to_cpup(p++);
+- READ_BUF(open->op_fname.len);
+- SAVEMEM(open->op_fname.data, open->op_fname.len);
+- if ((status = check_filename(open->op_fname.data, open->op_fname.len)))
++ status = nfsd4_decode_component4(argp, &open->op_fname,
++ &open->op_fnamelen);
++ if (status)
+ return status;
+ break;
+ case NFS4_OPEN_CLAIM_FH:
+ case NFS4_OPEN_CLAIM_DELEG_PREV_FH:
+ if (argp->minorversion < 1)
+- goto xdr_error;
++ return nfserr_bad_xdr;
+ /* void */
+ break;
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
+ if (argp->minorversion < 1)
+- goto xdr_error;
+- status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid);
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_stateid4(argp, &open->op_delegate_stateid);
+ if (status)
+ return status;
+ break;
+ default:
+- goto xdr_error;
++ return nfserr_bad_xdr;
+ }
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_confirm *open_conf)
++nfsd4_decode_open(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_open *open = &u->open;
++ __be32 status;
++ u32 dummy;
+
+- if (argp->minorversion >= 1)
+- return nfserr_notsupp;
++ memset(open, 0, sizeof(*open));
+
+- status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid);
++ if (xdr_stream_decode_u32(argp->xdr, &open->op_seqid) < 0)
++ return nfserr_bad_xdr;
++ /* deleg_want is ignored */
++ status = nfsd4_decode_share_access(argp, &open->op_share_access,
++ &open->op_deleg_want, &dummy);
+ if (status)
+ return status;
+- READ_BUF(4);
+- open_conf->oc_seqid = be32_to_cpup(p++);
+-
+- DECODE_TAIL;
+-}
+-
+-static __be32
+-nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_downgrade *open_down)
+-{
+- DECODE_HEAD;
+-
+- status = nfsd4_decode_stateid(argp, &open_down->od_stateid);
++ status = nfsd4_decode_share_deny(argp, &open->op_share_deny);
+ if (status)
+ return status;
+- READ_BUF(4);
+- open_down->od_seqid = be32_to_cpup(p++);
+- status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
+- &open_down->od_deleg_want, NULL);
++ status = nfsd4_decode_state_owner4(argp, &open->op_clientid,
++ &open->op_owner);
+ if (status)
+ return status;
+- status = nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
++ status = nfsd4_decode_openflag4(argp, open);
+ if (status)
+ return status;
+- DECODE_TAIL;
++ return nfsd4_decode_open_claim4(argp, open);
+ }
+
+ static __be32
+-nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, struct nfsd4_putfh *putfh)
++nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(4);
+- putfh->pf_fhlen = be32_to_cpup(p++);
+- if (putfh->pf_fhlen > NFS4_FHSIZE)
+- goto xdr_error;
+- READ_BUF(putfh->pf_fhlen);
+- SAVEMEM(putfh->pf_fhval, putfh->pf_fhlen);
++ struct nfsd4_open_confirm *open_conf = &u->open_confirm;
++ __be32 status;
+
+- DECODE_TAIL;
+-}
++ if (argp->minorversion >= 1)
++ return nfserr_notsupp;
+
+-static __be32
+-nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, void *p)
+-{
+- if (argp->minorversion == 0)
+- return nfs_ok;
+- return nfserr_notsupp;
++ status = nfsd4_decode_stateid4(argp, &open_conf->oc_req_stateid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &open_conf->oc_seqid) < 0)
++ return nfserr_bad_xdr;
++
++ memset(&open_conf->oc_resp_stateid, 0,
++ sizeof(open_conf->oc_resp_stateid));
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read)
++nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_open_downgrade *open_down = &u->open_downgrade;
++ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &read->rd_stateid);
++ memset(open_down, 0, sizeof(*open_down));
++ status = nfsd4_decode_stateid4(argp, &open_down->od_stateid);
+ if (status)
+ return status;
+- READ_BUF(12);
+- p = xdr_decode_hyper(p, &read->rd_offset);
+- read->rd_length = be32_to_cpup(p++);
+-
+- DECODE_TAIL;
++ if (xdr_stream_decode_u32(argp->xdr, &open_down->od_seqid) < 0)
++ return nfserr_bad_xdr;
++ /* deleg_want is ignored */
++ status = nfsd4_decode_share_access(argp, &open_down->od_share_access,
++ &open_down->od_deleg_want, NULL);
++ if (status)
++ return status;
++ return nfsd4_decode_share_deny(argp, &open_down->od_share_deny);
+ }
+
+ static __be32
+-nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, struct nfsd4_readdir *readdir)
++nfsd4_decode_putfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_putfh *putfh = &u->putfh;
++ __be32 *p;
+
+- READ_BUF(24);
+- p = xdr_decode_hyper(p, &readdir->rd_cookie);
+- COPYMEM(readdir->rd_verf.data, sizeof(readdir->rd_verf.data));
+- readdir->rd_dircount = be32_to_cpup(p++);
+- readdir->rd_maxcount = be32_to_cpup(p++);
+- if ((status = nfsd4_decode_bitmap(argp, readdir->rd_bmval)))
+- goto out;
++ if (xdr_stream_decode_u32(argp->xdr, &putfh->pf_fhlen) < 0)
++ return nfserr_bad_xdr;
++ if (putfh->pf_fhlen > NFS4_FHSIZE)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, putfh->pf_fhlen);
++ if (!p)
++ return nfserr_bad_xdr;
++ putfh->pf_fhval = svcxdr_savemem(argp, p, putfh->pf_fhlen);
++ if (!putfh->pf_fhval)
++ return nfserr_jukebox;
+
+- DECODE_TAIL;
++ putfh->no_verify = false;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_remove(struct nfsd4_compoundargs *argp, struct nfsd4_remove *remove)
++nfsd4_decode_putpubfh(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(4);
+- remove->rm_namelen = be32_to_cpup(p++);
+- READ_BUF(remove->rm_namelen);
+- SAVEMEM(remove->rm_name, remove->rm_namelen);
+- if ((status = check_filename(remove->rm_name, remove->rm_namelen)))
+- return status;
+-
+- DECODE_TAIL;
++ if (argp->minorversion == 0)
++ return nfs_ok;
++ return nfserr_notsupp;
+ }
+
+ static __be32
+-nfsd4_decode_rename(struct nfsd4_compoundargs *argp, struct nfsd4_rename *rename)
++nfsd4_decode_read(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_read *read = &u->read;
++ __be32 status;
+
+- READ_BUF(4);
+- rename->rn_snamelen = be32_to_cpup(p++);
+- READ_BUF(rename->rn_snamelen);
+- SAVEMEM(rename->rn_sname, rename->rn_snamelen);
+- READ_BUF(4);
+- rename->rn_tnamelen = be32_to_cpup(p++);
+- READ_BUF(rename->rn_tnamelen);
+- SAVEMEM(rename->rn_tname, rename->rn_tnamelen);
+- if ((status = check_filename(rename->rn_sname, rename->rn_snamelen)))
+- return status;
+- if ((status = check_filename(rename->rn_tname, rename->rn_tnamelen)))
++ memset(read, 0, sizeof(*read));
++ status = nfsd4_decode_stateid4(argp, &read->rd_stateid);
++ if (status)
+ return status;
++ if (xdr_stream_decode_u64(argp->xdr, &read->rd_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &read->rd_length) < 0)
++ return nfserr_bad_xdr;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_renew(struct nfsd4_compoundargs *argp, clientid_t *clientid)
++nfsd4_decode_readdir(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_readdir *readdir = &u->readdir;
++ __be32 status;
+
+- if (argp->minorversion >= 1)
+- return nfserr_notsupp;
++ memset(readdir, 0, sizeof(*readdir));
++ if (xdr_stream_decode_u64(argp->xdr, &readdir->rd_cookie) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_verifier4(argp, &readdir->rd_verf);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_dircount) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &readdir->rd_maxcount) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_uint32_array(argp->xdr, readdir->rd_bmval,
++ ARRAY_SIZE(readdir->rd_bmval)) < 0)
++ return nfserr_bad_xdr;
+
+- READ_BUF(sizeof(clientid_t));
+- COPYMEM(clientid, sizeof(clientid_t));
++ return nfs_ok;
++}
+
+- DECODE_TAIL;
++static __be32
++nfsd4_decode_remove(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
++{
++ struct nfsd4_remove *remove = &u->remove;
++ memset(&remove->rm_cinfo, 0, sizeof(remove->rm_cinfo));
++ return nfsd4_decode_component4(argp, &remove->rm_name, &remove->rm_namelen);
+ }
+
+ static __be32
+-nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
+- struct nfsd4_secinfo *secinfo)
++nfsd4_decode_rename(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_rename *rename = &u->rename;
++ __be32 status;
+
+- READ_BUF(4);
+- secinfo->si_namelen = be32_to_cpup(p++);
+- READ_BUF(secinfo->si_namelen);
+- SAVEMEM(secinfo->si_name, secinfo->si_namelen);
+- status = check_filename(secinfo->si_name, secinfo->si_namelen);
++ memset(rename, 0, sizeof(*rename));
++ status = nfsd4_decode_component4(argp, &rename->rn_sname, &rename->rn_snamelen);
+ if (status)
+ return status;
+- DECODE_TAIL;
++ return nfsd4_decode_component4(argp, &rename->rn_tname, &rename->rn_tnamelen);
+ }
+
+ static __be32
+-nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
+- struct nfsd4_secinfo_no_name *sin)
++nfsd4_decode_renew(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ clientid_t *clientid = &u->renew;
++ return nfsd4_decode_clientid4(argp, clientid);
++}
+
+- READ_BUF(4);
+- sin->sin_style = be32_to_cpup(p++);
+- DECODE_TAIL;
++static __be32
++nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_secinfo *secinfo = &u->secinfo;
++ secinfo->si_exp = NULL;
++ return nfsd4_decode_component4(argp, &secinfo->si_name, &secinfo->si_namelen);
+ }
+
+ static __be32
+-nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr)
++nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
++ struct nfsd4_setattr *setattr = &u->setattr;
+ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
++ memset(setattr, 0, sizeof(*setattr));
++ status = nfsd4_decode_stateid4(argp, &setattr->sa_stateid);
+ if (status)
+ return status;
+- return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr,
+- &setattr->sa_acl, &setattr->sa_label, NULL);
++ return nfsd4_decode_fattr4(argp, setattr->sa_bmval,
++ ARRAY_SIZE(setattr->sa_bmval),
++ &setattr->sa_iattr, &setattr->sa_acl,
++ &setattr->sa_label, NULL);
+ }
+
+ static __be32
+-nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid *setclientid)
++nfsd4_decode_setclientid(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_setclientid *setclientid = &u->setclientid;
++ __be32 *p, status;
++
++ memset(setclientid, 0, sizeof(*setclientid));
+
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
+- READ_BUF(NFS4_VERIFIER_SIZE);
+- COPYMEM(setclientid->se_verf.data, NFS4_VERIFIER_SIZE);
+-
++ status = nfsd4_decode_verifier4(argp, &setclientid->se_verf);
++ if (status)
++ return status;
+ status = nfsd4_decode_opaque(argp, &setclientid->se_name);
+ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_prog) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_netid_len) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, setclientid->se_callback_netid_len);
++ if (!p)
+ return nfserr_bad_xdr;
+- READ_BUF(8);
+- setclientid->se_callback_prog = be32_to_cpup(p++);
+- setclientid->se_callback_netid_len = be32_to_cpup(p++);
+- READ_BUF(setclientid->se_callback_netid_len);
+- SAVEMEM(setclientid->se_callback_netid_val, setclientid->se_callback_netid_len);
+- READ_BUF(4);
+- setclientid->se_callback_addr_len = be32_to_cpup(p++);
++ setclientid->se_callback_netid_val = svcxdr_savemem(argp, p,
++ setclientid->se_callback_netid_len);
++ if (!setclientid->se_callback_netid_val)
++ return nfserr_jukebox;
+
+- READ_BUF(setclientid->se_callback_addr_len);
+- SAVEMEM(setclientid->se_callback_addr_val, setclientid->se_callback_addr_len);
+- READ_BUF(4);
+- setclientid->se_callback_ident = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_addr_len) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, setclientid->se_callback_addr_len);
++ if (!p)
++ return nfserr_bad_xdr;
++ setclientid->se_callback_addr_val = svcxdr_savemem(argp, p,
++ setclientid->se_callback_addr_len);
++ if (!setclientid->se_callback_addr_val)
++ return nfserr_jukebox;
++ if (xdr_stream_decode_u32(argp->xdr, &setclientid->se_callback_ident) < 0)
++ return nfserr_bad_xdr;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_setclientid_confirm *scd_c)
++nfsd4_decode_setclientid_confirm(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_setclientid_confirm *scd_c = &u->setclientid_confirm;
++ __be32 status;
+
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
+- READ_BUF(8 + NFS4_VERIFIER_SIZE);
+- COPYMEM(&scd_c->sc_clientid, 8);
+- COPYMEM(&scd_c->sc_confirm, NFS4_VERIFIER_SIZE);
+-
+- DECODE_TAIL;
++ status = nfsd4_decode_clientid4(argp, &scd_c->sc_clientid);
++ if (status)
++ return status;
++ return nfsd4_decode_verifier4(argp, &scd_c->sc_confirm);
+ }
+
+ /* Also used for NVERIFY */
+ static __be32
+-nfsd4_decode_verify(struct nfsd4_compoundargs *argp, struct nfsd4_verify *verify)
++nfsd4_decode_verify(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_verify *verify = &u->verify;
++ __be32 *p, status;
+
+- if ((status = nfsd4_decode_bitmap(argp, verify->ve_bmval)))
+- goto out;
++ memset(verify, 0, sizeof(*verify));
++
++ status = nfsd4_decode_bitmap4(argp, verify->ve_bmval,
++ ARRAY_SIZE(verify->ve_bmval));
++ if (status)
++ return status;
+
+ /* For convenience's sake, we compare raw xdr'd attributes in
+ * nfsd4_proc_verify */
+
+- READ_BUF(4);
+- verify->ve_attrlen = be32_to_cpup(p++);
+- READ_BUF(verify->ve_attrlen);
+- SAVEMEM(verify->ve_attrval, verify->ve_attrlen);
++ if (xdr_stream_decode_u32(argp->xdr, &verify->ve_attrlen) < 0)
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, verify->ve_attrlen);
++ if (!p)
++ return nfserr_bad_xdr;
++ verify->ve_attrval = svcxdr_savemem(argp, p, verify->ve_attrlen);
++ if (!verify->ve_attrval)
++ return nfserr_jukebox;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write)
++nfsd4_decode_write(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_write *write = &u->write;
++ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &write->wr_stateid);
++ status = nfsd4_decode_stateid4(argp, &write->wr_stateid);
+ if (status)
+ return status;
+- READ_BUF(16);
+- p = xdr_decode_hyper(p, &write->wr_offset);
+- write->wr_stable_how = be32_to_cpup(p++);
++ if (xdr_stream_decode_u64(argp->xdr, &write->wr_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &write->wr_stable_how) < 0)
++ return nfserr_bad_xdr;
+ if (write->wr_stable_how > NFS_FILE_SYNC)
+- goto xdr_error;
+- write->wr_buflen = be32_to_cpup(p++);
+-
+- status = svcxdr_construct_vector(argp, &write->wr_head,
+- &write->wr_pagelist, write->wr_buflen);
+- if (status)
+- return status;
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &write->wr_buflen) < 0)
++ return nfserr_bad_xdr;
++ if (!xdr_stream_subsegment(argp->xdr, &write->wr_payload, write->wr_buflen))
++ return nfserr_bad_xdr;
+
+- DECODE_TAIL;
++ write->wr_bytes_written = 0;
++ write->wr_how_written = 0;
++ memset(&write->wr_verifier, 0, sizeof(write->wr_verifier));
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_release_lockowner *rlockowner)
++nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_release_lockowner *rlockowner = &u->release_lockowner;
++ __be32 status;
+
+ if (argp->minorversion >= 1)
+ return nfserr_notsupp;
+
+- READ_BUF(12);
+- COPYMEM(&rlockowner->rl_clientid, sizeof(clientid_t));
+- rlockowner->rl_owner.len = be32_to_cpup(p++);
+- READ_BUF(rlockowner->rl_owner.len);
+- READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
++ status = nfsd4_decode_state_owner4(argp, &rlockowner->rl_clientid,
++ &rlockowner->rl_owner);
++ if (status)
++ return status;
+
+ if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
+ return nfserr_inval;
+- DECODE_TAIL;
++
++ return nfs_ok;
++}
++
++static __be32 nfsd4_decode_backchannel_ctl(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_backchannel_ctl *bc = &u->backchannel_ctl;
++ memset(bc, 0, sizeof(*bc));
++ if (xdr_stream_decode_u32(argp->xdr, &bc->bc_cb_program) < 0)
++ return nfserr_bad_xdr;
++ return nfsd4_decode_cb_sec(argp, &bc->bc_cb_sec);
++}
++
++static __be32 nfsd4_decode_bind_conn_to_session(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
++ u32 use_conn_in_rdma_mode;
++ __be32 status;
++
++ memset(bcts, 0, sizeof(*bcts));
++ status = nfsd4_decode_sessionid4(argp, &bcts->sessionid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &bcts->dir) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &use_conn_in_rdma_mode) < 0)
++ return nfserr_bad_xdr;
++
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
+- struct nfsd4_exchange_id *exid)
++nfsd4_decode_state_protect_ops(struct nfsd4_compoundargs *argp,
++ struct nfsd4_exchange_id *exid)
+ {
+- int dummy, tmp;
+- DECODE_HEAD;
++ __be32 status;
+
+- READ_BUF(NFS4_VERIFIER_SIZE);
+- COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
++ status = nfsd4_decode_bitmap4(argp, exid->spo_must_enforce,
++ ARRAY_SIZE(exid->spo_must_enforce));
++ if (status)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_bitmap4(argp, exid->spo_must_allow,
++ ARRAY_SIZE(exid->spo_must_allow));
++ if (status)
++ return nfserr_bad_xdr;
+
+- status = nfsd4_decode_opaque(argp, &exid->clname);
++ return nfs_ok;
++}
++
++/*
++ * This implementation currently does not support SP4_SSV.
++ * This decoder simply skips over these arguments.
++ */
++static noinline __be32
++nfsd4_decode_ssv_sp_parms(struct nfsd4_compoundargs *argp,
++ struct nfsd4_exchange_id *exid)
++{
++ u32 count, window, num_gss_handles;
++ __be32 status;
++
++ /* ssp_ops */
++ status = nfsd4_decode_state_protect_ops(argp, exid);
+ if (status)
++ return status;
++
++ /* ssp_hash_algs<> */
++ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
+ return nfserr_bad_xdr;
++ while (count--) {
++ status = nfsd4_decode_ignored_string(argp, 0);
++ if (status)
++ return status;
++ }
+
+- READ_BUF(4);
+- exid->flags = be32_to_cpup(p++);
++ /* ssp_encr_algs<> */
++ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
++ return nfserr_bad_xdr;
++ while (count--) {
++ status = nfsd4_decode_ignored_string(argp, 0);
++ if (status)
++ return status;
++ }
++
++ if (xdr_stream_decode_u32(argp->xdr, &window) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &num_gss_handles) < 0)
++ return nfserr_bad_xdr;
++
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_state_protect4_a(struct nfsd4_compoundargs *argp,
++ struct nfsd4_exchange_id *exid)
++{
++ __be32 status;
+
+- /* Ignore state_protect4_a */
+- READ_BUF(4);
+- exid->spa_how = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &exid->spa_how) < 0)
++ return nfserr_bad_xdr;
+ switch (exid->spa_how) {
+ case SP4_NONE:
+ break;
+ case SP4_MACH_CRED:
+- /* spo_must_enforce */
+- status = nfsd4_decode_bitmap(argp,
+- exid->spo_must_enforce);
+- if (status)
+- goto out;
+- /* spo_must_allow */
+- status = nfsd4_decode_bitmap(argp, exid->spo_must_allow);
++ status = nfsd4_decode_state_protect_ops(argp, exid);
+ if (status)
+- goto out;
++ return status;
+ break;
+ case SP4_SSV:
+- /* ssp_ops */
+- READ_BUF(4);
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy * 4);
+- p += dummy;
+-
+- READ_BUF(4);
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy * 4);
+- p += dummy;
+-
+- /* ssp_hash_algs<> */
+- READ_BUF(4);
+- tmp = be32_to_cpup(p++);
+- while (tmp--) {
+- READ_BUF(4);
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy);
+- p += XDR_QUADLEN(dummy);
+- }
+-
+- /* ssp_encr_algs<> */
+- READ_BUF(4);
+- tmp = be32_to_cpup(p++);
+- while (tmp--) {
+- READ_BUF(4);
+- dummy = be32_to_cpup(p++);
+- READ_BUF(dummy);
+- p += XDR_QUADLEN(dummy);
+- }
+-
+- /* ignore ssp_window and ssp_num_gss_handles: */
+- READ_BUF(8);
++ status = nfsd4_decode_ssv_sp_parms(argp, exid);
++ if (status)
++ return status;
+ break;
+ default:
+- goto xdr_error;
++ return nfserr_bad_xdr;
+ }
+
+- READ_BUF(4); /* nfs_impl_id4 array length */
+- dummy = be32_to_cpup(p++);
++ return nfs_ok;
++}
+
+- if (dummy > 1)
+- goto xdr_error;
++static __be32
++nfsd4_decode_nfs_impl_id4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_exchange_id *exid)
++{
++ __be32 status;
++ u32 count;
+
+- if (dummy == 1) {
++ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
++ return nfserr_bad_xdr;
++ switch (count) {
++ case 0:
++ break;
++ case 1:
++ /* Note that RFC 8881 places no length limit on
++ * nii_domain, but this implementation permits no
++ * more than NFS4_OPAQUE_LIMIT bytes */
+ status = nfsd4_decode_opaque(argp, &exid->nii_domain);
+ if (status)
+- goto xdr_error;
+-
+- /* nii_name */
++ return status;
++ /* Note that RFC 8881 places no length limit on
++ * nii_name, but this implementation permits no
++ * more than NFS4_OPAQUE_LIMIT bytes */
+ status = nfsd4_decode_opaque(argp, &exid->nii_name);
+ if (status)
+- goto xdr_error;
+-
+- /* nii_date */
+- status = nfsd4_decode_time(argp, &exid->nii_time);
++ return status;
++ status = nfsd4_decode_nfstime4(argp, &exid->nii_time);
+ if (status)
+- goto xdr_error;
++ return status;
++ break;
++ default:
++ return nfserr_bad_xdr;
+ }
+- DECODE_TAIL;
+-}
+
+-static __be32
+-nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
+- struct nfsd4_create_session *sess)
+-{
+- DECODE_HEAD;
+-
+- READ_BUF(16);
+- COPYMEM(&sess->clientid, 8);
+- sess->seqid = be32_to_cpup(p++);
+- sess->flags = be32_to_cpup(p++);
+-
+- /* Fore channel attrs */
+- READ_BUF(28);
+- p++; /* headerpadsz is always 0 */
+- sess->fore_channel.maxreq_sz = be32_to_cpup(p++);
+- sess->fore_channel.maxresp_sz = be32_to_cpup(p++);
+- sess->fore_channel.maxresp_cached = be32_to_cpup(p++);
+- sess->fore_channel.maxops = be32_to_cpup(p++);
+- sess->fore_channel.maxreqs = be32_to_cpup(p++);
+- sess->fore_channel.nr_rdma_attrs = be32_to_cpup(p++);
+- if (sess->fore_channel.nr_rdma_attrs == 1) {
+- READ_BUF(4);
+- sess->fore_channel.rdma_attrs = be32_to_cpup(p++);
+- } else if (sess->fore_channel.nr_rdma_attrs > 1) {
+- dprintk("Too many fore channel attr bitmaps!\n");
+- goto xdr_error;
+- }
+-
+- /* Back channel attrs */
+- READ_BUF(28);
+- p++; /* headerpadsz is always 0 */
+- sess->back_channel.maxreq_sz = be32_to_cpup(p++);
+- sess->back_channel.maxresp_sz = be32_to_cpup(p++);
+- sess->back_channel.maxresp_cached = be32_to_cpup(p++);
+- sess->back_channel.maxops = be32_to_cpup(p++);
+- sess->back_channel.maxreqs = be32_to_cpup(p++);
+- sess->back_channel.nr_rdma_attrs = be32_to_cpup(p++);
+- if (sess->back_channel.nr_rdma_attrs == 1) {
+- READ_BUF(4);
+- sess->back_channel.rdma_attrs = be32_to_cpup(p++);
+- } else if (sess->back_channel.nr_rdma_attrs > 1) {
+- dprintk("Too many back channel attr bitmaps!\n");
+- goto xdr_error;
+- }
+-
+- READ_BUF(4);
+- sess->callback_prog = be32_to_cpup(p++);
+- nfsd4_decode_cb_sec(argp, &sess->cb_sec);
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
+- struct nfsd4_destroy_session *destroy_session)
++nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+- READ_BUF(NFS4_MAX_SESSIONID_LEN);
+- COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
++ struct nfsd4_exchange_id *exid = &u->exchange_id;
++ __be32 status;
+
+- DECODE_TAIL;
++ memset(exid, 0, sizeof(*exid));
++ status = nfsd4_decode_verifier4(argp, &exid->verifier);
++ if (status)
++ return status;
++ status = nfsd4_decode_opaque(argp, &exid->clname);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &exid->flags) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_state_protect4_a(argp, exid);
++ if (status)
++ return status;
++ return nfsd4_decode_nfs_impl_id4(argp, exid);
+ }
+
+ static __be32
+-nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
+- struct nfsd4_free_stateid *free_stateid)
++nfsd4_decode_channel_attrs4(struct nfsd4_compoundargs *argp,
++ struct nfsd4_channel_attrs *ca)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(sizeof(stateid_t));
+- free_stateid->fr_stateid.si_generation = be32_to_cpup(p++);
+- COPYMEM(&free_stateid->fr_stateid.si_opaque, sizeof(stateid_opaque_t));
+-
+- DECODE_TAIL;
+-}
++ __be32 *p;
+
+-static __be32
+-nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
+- struct nfsd4_sequence *seq)
+-{
+- DECODE_HEAD;
++ p = xdr_inline_decode(argp->xdr, XDR_UNIT * 7);
++ if (!p)
++ return nfserr_bad_xdr;
+
+- READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
+- COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
+- seq->seqid = be32_to_cpup(p++);
+- seq->slotid = be32_to_cpup(p++);
+- seq->maxslots = be32_to_cpup(p++);
+- seq->cachethis = be32_to_cpup(p++);
++ /* headerpadsz is ignored */
++ p++;
++ ca->maxreq_sz = be32_to_cpup(p++);
++ ca->maxresp_sz = be32_to_cpup(p++);
++ ca->maxresp_cached = be32_to_cpup(p++);
++ ca->maxops = be32_to_cpup(p++);
++ ca->maxreqs = be32_to_cpup(p++);
++ ca->nr_rdma_attrs = be32_to_cpup(p);
++ switch (ca->nr_rdma_attrs) {
++ case 0:
++ break;
++ case 1:
++ if (xdr_stream_decode_u32(argp->xdr, &ca->rdma_attrs) < 0)
++ return nfserr_bad_xdr;
++ break;
++ default:
++ return nfserr_bad_xdr;
++ }
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_stateid *test_stateid)
++nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- int i;
+- __be32 *p, status;
+- struct nfsd4_test_stateid_id *stateid;
+-
+- READ_BUF(4);
+- test_stateid->ts_num_ids = ntohl(*p++);
+-
+- INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
+-
+- for (i = 0; i < test_stateid->ts_num_ids; i++) {
+- stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
+- if (!stateid) {
+- status = nfserrno(-ENOMEM);
+- goto out;
+- }
+-
+- INIT_LIST_HEAD(&stateid->ts_id_list);
+- list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
+-
+- status = nfsd4_decode_stateid(argp, &stateid->ts_id_stateid);
+- if (status)
+- goto out;
+- }
++ struct nfsd4_create_session *sess = &u->create_session;
++ __be32 status;
+
+- status = 0;
+-out:
+- return status;
+-xdr_error:
+- dprintk("NFSD: xdr error (%s:%d)\n", __FILE__, __LINE__);
+- status = nfserr_bad_xdr;
+- goto out;
++ memset(sess, 0, sizeof(*sess));
++ status = nfsd4_decode_clientid4(argp, &sess->clientid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &sess->seqid) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &sess->flags) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_channel_attrs4(argp, &sess->fore_channel);
++ if (status)
++ return status;
++ status = nfsd4_decode_channel_attrs4(argp, &sess->back_channel);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &sess->callback_prog) < 0)
++ return nfserr_bad_xdr;
++ return nfsd4_decode_cb_sec(argp, &sess->cb_sec);
+ }
+
+-static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp, struct nfsd4_destroy_clientid *dc)
++static __be32
++nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(8);
+- COPYMEM(&dc->clientid, 8);
+-
+- DECODE_TAIL;
++ struct nfsd4_destroy_session *destroy_session = &u->destroy_session;
++ return nfsd4_decode_sessionid4(argp, &destroy_session->sessionid);
+ }
+
+-static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, struct nfsd4_reclaim_complete *rc)
++static __be32
++nfsd4_decode_free_stateid(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(4);
+- rc->rca_one_fs = be32_to_cpup(p++);
+-
+- DECODE_TAIL;
++ struct nfsd4_free_stateid *free_stateid = &u->free_stateid;
++ return nfsd4_decode_stateid4(argp, &free_stateid->fr_stateid);
+ }
+
+ #ifdef CONFIG_NFSD_PNFS
+ static __be32
+ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+- struct nfsd4_getdeviceinfo *gdev)
+-{
+- DECODE_HEAD;
+- u32 num, i;
+-
+- READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+- COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+- gdev->gd_layout_type = be32_to_cpup(p++);
+- gdev->gd_maxcount = be32_to_cpup(p++);
+- num = be32_to_cpup(p++);
+- if (num) {
+- if (num > 1000)
+- goto xdr_error;
+- READ_BUF(4 * num);
+- gdev->gd_notify_types = be32_to_cpup(p++);
+- for (i = 1; i < num; i++) {
+- if (be32_to_cpup(p++)) {
+- status = nfserr_inval;
+- goto out;
+- }
+- }
+- }
+- DECODE_TAIL;
+-}
+-
+-static __be32
+-nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+- struct nfsd4_layoutget *lgp)
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+-
+- READ_BUF(36);
+- lgp->lg_signal = be32_to_cpup(p++);
+- lgp->lg_layout_type = be32_to_cpup(p++);
+- lgp->lg_seg.iomode = be32_to_cpup(p++);
+- p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+- p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+- p = xdr_decode_hyper(p, &lgp->lg_minlength);
++ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
++ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &lgp->lg_sid);
++ memset(gdev, 0, sizeof(*gdev));
++ status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid);
+ if (status)
+ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_layout_type) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_maxcount) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_uint32_array(argp->xdr,
++ &gdev->gd_notify_types, 1) < 0)
++ return nfserr_bad_xdr;
+
+- READ_BUF(4);
+- lgp->lg_maxcount = be32_to_cpup(p++);
+-
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+- struct nfsd4_layoutcommit *lcp)
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+- u32 timechange;
+-
+- READ_BUF(20);
+- p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+- p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+- lcp->lc_reclaim = be32_to_cpup(p++);
++ struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
++ __be32 *p, status;
+
+- status = nfsd4_decode_stateid(argp, &lcp->lc_sid);
++ memset(lcp, 0, sizeof(*lcp));
++ if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_seg.length) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_bool(argp->xdr, &lcp->lc_reclaim) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_stateid4(argp, &lcp->lc_sid);
+ if (status)
+ return status;
+-
+- READ_BUF(4);
+- lcp->lc_newoffset = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_newoffset) < 0)
++ return nfserr_bad_xdr;
+ if (lcp->lc_newoffset) {
+- READ_BUF(8);
+- p = xdr_decode_hyper(p, &lcp->lc_last_wr);
++ if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_last_wr) < 0)
++ return nfserr_bad_xdr;
+ } else
+ lcp->lc_last_wr = 0;
+- READ_BUF(4);
+- timechange = be32_to_cpup(p++);
+- if (timechange) {
+- status = nfsd4_decode_time(argp, &lcp->lc_mtime);
++ p = xdr_inline_decode(argp->xdr, XDR_UNIT);
++ if (!p)
++ return nfserr_bad_xdr;
++ if (xdr_item_is_present(p)) {
++ status = nfsd4_decode_nfstime4(argp, &lcp->lc_mtime);
+ if (status)
+ return status;
+ } else {
+ lcp->lc_mtime.tv_nsec = UTIME_NOW;
+ }
+- READ_BUF(8);
+- lcp->lc_layout_type = be32_to_cpup(p++);
++ return nfsd4_decode_layoutupdate4(argp, lcp);
++}
+
+- /*
+- * Save the layout update in XDR format and let the layout driver deal
+- * with it later.
+- */
+- lcp->lc_up_len = be32_to_cpup(p++);
+- if (lcp->lc_up_len > 0) {
+- READ_BUF(lcp->lc_up_len);
+- READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+- }
++static __be32
++nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_layoutget *lgp = &u->layoutget;
++ __be32 status;
+
+- DECODE_TAIL;
++ memset(lgp, 0, sizeof(*lgp));
++ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_signal) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_layout_type) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_seg.iomode) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_seg.length) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &lgp->lg_minlength) < 0)
++ return nfserr_bad_xdr;
++ status = nfsd4_decode_stateid4(argp, &lgp->lg_sid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u32(argp->xdr, &lgp->lg_maxcount) < 0)
++ return nfserr_bad_xdr;
++
++ return nfs_ok;
+ }
+
+ static __be32
+ nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+- struct nfsd4_layoutreturn *lrp)
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
++ memset(lrp, 0, sizeof(*lrp));
++ if (xdr_stream_decode_bool(argp->xdr, &lrp->lr_reclaim) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_layout_type) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &lrp->lr_seg.iomode) < 0)
++ return nfserr_bad_xdr;
++ return nfsd4_decode_layoutreturn4(argp, lrp);
++}
++#endif /* CONFIG_NFSD_PNFS */
+
+- READ_BUF(16);
+- lrp->lr_reclaim = be32_to_cpup(p++);
+- lrp->lr_layout_type = be32_to_cpup(p++);
+- lrp->lr_seg.iomode = be32_to_cpup(p++);
+- lrp->lr_return_type = be32_to_cpup(p++);
+- if (lrp->lr_return_type == RETURN_FILE) {
+- READ_BUF(16);
+- p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+- p = xdr_decode_hyper(p, &lrp->lr_seg.length);
++static __be32 nfsd4_decode_secinfo_no_name(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_secinfo_no_name *sin = &u->secinfo_no_name;
++ if (xdr_stream_decode_u32(argp->xdr, &sin->sin_style) < 0)
++ return nfserr_bad_xdr;
+
+- status = nfsd4_decode_stateid(argp, &lrp->lr_sid);
+- if (status)
+- return status;
++ sin->sin_exp = NULL;
++ return nfs_ok;
++}
+
+- READ_BUF(4);
+- lrp->lrf_body_len = be32_to_cpup(p++);
+- if (lrp->lrf_body_len > 0) {
+- READ_BUF(lrp->lrf_body_len);
+- READMEM(lrp->lrf_body, lrp->lrf_body_len);
+- }
+- } else {
+- lrp->lr_seg.offset = 0;
+- lrp->lr_seg.length = NFS4_MAX_UINT64;
++static __be32
++nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_sequence *seq = &u->sequence;
++ __be32 *p, status;
++
++ status = nfsd4_decode_sessionid4(argp, &seq->sessionid);
++ if (status)
++ return status;
++ p = xdr_inline_decode(argp->xdr, XDR_UNIT * 4);
++ if (!p)
++ return nfserr_bad_xdr;
++ seq->seqid = be32_to_cpup(p++);
++ seq->slotid = be32_to_cpup(p++);
++ seq->maxslots = be32_to_cpup(p++);
++ seq->cachethis = be32_to_cpup(p);
++
++ seq->status_flags = 0;
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
++ struct nfsd4_test_stateid_id *stateid;
++ __be32 status;
++ u32 i;
++
++ memset(test_stateid, 0, sizeof(*test_stateid));
++ if (xdr_stream_decode_u32(argp->xdr, &test_stateid->ts_num_ids) < 0)
++ return nfserr_bad_xdr;
++
++ INIT_LIST_HEAD(&test_stateid->ts_stateid_list);
++ for (i = 0; i < test_stateid->ts_num_ids; i++) {
++ stateid = svcxdr_tmpalloc(argp, sizeof(*stateid));
++ if (!stateid)
++ return nfserr_jukebox;
++ INIT_LIST_HEAD(&stateid->ts_id_list);
++ list_add_tail(&stateid->ts_id_list, &test_stateid->ts_stateid_list);
++ status = nfsd4_decode_stateid4(argp, &stateid->ts_id_stateid);
++ if (status)
++ return status;
+ }
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+-#endif /* CONFIG_NFSD_PNFS */
+
+-static __be32
+-nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
+- struct nfsd4_fallocate *fallocate)
++static __be32 nfsd4_decode_destroy_clientid(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
+-
+- status = nfsd4_decode_stateid(argp, &fallocate->falloc_stateid);
+- if (status)
+- return status;
+-
+- READ_BUF(16);
+- p = xdr_decode_hyper(p, &fallocate->falloc_offset);
+- xdr_decode_hyper(p, &fallocate->falloc_length);
++ struct nfsd4_destroy_clientid *dc = &u->destroy_clientid;
++ return nfsd4_decode_clientid4(argp, &dc->clientid);
++}
+
+- DECODE_TAIL;
++static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_reclaim_complete *rc = &u->reclaim_complete;
++ if (xdr_stream_decode_bool(argp->xdr, &rc->rca_one_fs) < 0)
++ return nfserr_bad_xdr;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
++nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_fallocate *fallocate = &u->allocate;
++ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
+- if (status)
+- return status;
+- status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
++ status = nfsd4_decode_stateid4(argp, &fallocate->falloc_stateid);
+ if (status)
+ return status;
++ if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &fallocate->falloc_length) < 0)
++ return nfserr_bad_xdr;
+
+- READ_BUF(8 + 8 + 8);
+- p = xdr_decode_hyper(p, &clone->cl_src_pos);
+- p = xdr_decode_hyper(p, &clone->cl_dst_pos);
+- p = xdr_decode_hyper(p, &clone->cl_count);
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32 nfsd4_decode_nl4_server(struct nfsd4_compoundargs *argp,
+ struct nl4_server *ns)
+ {
+- DECODE_HEAD;
+ struct nfs42_netaddr *naddr;
++ __be32 *p;
+
+- READ_BUF(4);
+- ns->nl4_type = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &ns->nl4_type) < 0)
++ return nfserr_bad_xdr;
+
+ /* currently support for 1 inter-server source server */
+ switch (ns->nl4_type) {
+ case NL4_NETADDR:
+ naddr = &ns->u.nl4_addr;
+
+- READ_BUF(4);
+- naddr->netid_len = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &naddr->netid_len) < 0)
++ return nfserr_bad_xdr;
+ if (naddr->netid_len > RPCBIND_MAXNETIDLEN)
+- goto xdr_error;
++ return nfserr_bad_xdr;
+
+- READ_BUF(naddr->netid_len + 4); /* 4 for uaddr len */
+- COPYMEM(naddr->netid, naddr->netid_len);
++ p = xdr_inline_decode(argp->xdr, naddr->netid_len);
++ if (!p)
++ return nfserr_bad_xdr;
++ memcpy(naddr->netid, p, naddr->netid_len);
+
+- naddr->addr_len = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &naddr->addr_len) < 0)
++ return nfserr_bad_xdr;
+ if (naddr->addr_len > RPCBIND_MAXUADDRLEN)
+- goto xdr_error;
++ return nfserr_bad_xdr;
+
+- READ_BUF(naddr->addr_len);
+- COPYMEM(naddr->addr, naddr->addr_len);
++ p = xdr_inline_decode(argp->xdr, naddr->addr_len);
++ if (!p)
++ return nfserr_bad_xdr;
++ memcpy(naddr->addr, p, naddr->addr_len);
+ break;
+ default:
+- goto xdr_error;
++ return nfserr_bad_xdr;
+ }
+- DECODE_TAIL;
++
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
++nfsd4_decode_copy(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_copy *copy = &u->copy;
++ u32 consecutive, i, count, sync;
+ struct nl4_server *ns_dummy;
+- int i, count;
++ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
++ memset(copy, 0, sizeof(*copy));
++ status = nfsd4_decode_stateid4(argp, &copy->cp_src_stateid);
+ if (status)
+ return status;
+- status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
++ status = nfsd4_decode_stateid4(argp, &copy->cp_dst_stateid);
+ if (status)
+ return status;
++ if (xdr_stream_decode_u64(argp->xdr, &copy->cp_src_pos) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &copy->cp_dst_pos) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &copy->cp_count) < 0)
++ return nfserr_bad_xdr;
++ /* ca_consecutive: we always do consecutive copies */
++ if (xdr_stream_decode_u32(argp->xdr, &consecutive) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_bool(argp->xdr, &sync) < 0)
++ return nfserr_bad_xdr;
++ nfsd4_copy_set_sync(copy, sync);
+
+- READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
+- p = xdr_decode_hyper(p, &copy->cp_src_pos);
+- p = xdr_decode_hyper(p, &copy->cp_dst_pos);
+- p = xdr_decode_hyper(p, &copy->cp_count);
+- p++; /* ca_consecutive: we always do consecutive copies */
+- copy->cp_synchronous = be32_to_cpup(p++);
+-
+- count = be32_to_cpup(p++);
+-
+- copy->cp_intra = false;
++ if (xdr_stream_decode_u32(argp->xdr, &count) < 0)
++ return nfserr_bad_xdr;
++ copy->cp_src = svcxdr_tmpalloc(argp, sizeof(*copy->cp_src));
++ if (copy->cp_src == NULL)
++ return nfserr_jukebox;
+ if (count == 0) { /* intra-server copy */
+- copy->cp_intra = true;
+- goto intra;
++ __set_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
++ return nfs_ok;
+ }
+
+- /* decode all the supplied server addresses but use first */
+- status = nfsd4_decode_nl4_server(argp, &copy->cp_src);
++ /* decode all the supplied server addresses but use only the first */
++ status = nfsd4_decode_nl4_server(argp, copy->cp_src);
+ if (status)
+ return status;
+
+ ns_dummy = kmalloc(sizeof(struct nl4_server), GFP_KERNEL);
+ if (ns_dummy == NULL)
+- return nfserrno(-ENOMEM);
++ return nfserr_jukebox;
+ for (i = 0; i < count - 1; i++) {
+ status = nfsd4_decode_nl4_server(argp, ns_dummy);
+ if (status) {
+@@ -1839,44 +2027,80 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+ }
+ }
+ kfree(ns_dummy);
+-intra:
+
+- DECODE_TAIL;
++ return nfs_ok;
++}
++
++static __be32
++nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
++ union nfsd4_op_u *u)
++{
++ struct nfsd4_copy_notify *cn = &u->copy_notify;
++ __be32 status;
++
++ memset(cn, 0, sizeof(*cn));
++ cn->cpn_src = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_src));
++ if (cn->cpn_src == NULL)
++ return nfserr_jukebox;
++ cn->cpn_dst = svcxdr_tmpalloc(argp, sizeof(*cn->cpn_dst));
++ if (cn->cpn_dst == NULL)
++ return nfserr_jukebox;
++
++ status = nfsd4_decode_stateid4(argp, &cn->cpn_src_stateid);
++ if (status)
++ return status;
++ return nfsd4_decode_nl4_server(argp, cn->cpn_dst);
+ }
+
+ static __be32
+ nfsd4_decode_offload_status(struct nfsd4_compoundargs *argp,
+- struct nfsd4_offload_status *os)
++ union nfsd4_op_u *u)
+ {
+- return nfsd4_decode_stateid(argp, &os->stateid);
++ struct nfsd4_offload_status *os = &u->offload_status;
++ os->count = 0;
++ os->status = 0;
++ return nfsd4_decode_stateid4(argp, &os->stateid);
+ }
+
+ static __be32
+-nfsd4_decode_copy_notify(struct nfsd4_compoundargs *argp,
+- struct nfsd4_copy_notify *cn)
++nfsd4_decode_seek(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
++ struct nfsd4_seek *seek = &u->seek;
+ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &cn->cpn_src_stateid);
++ status = nfsd4_decode_stateid4(argp, &seek->seek_stateid);
+ if (status)
+ return status;
+- return nfsd4_decode_nl4_server(argp, &cn->cpn_dst);
++ if (xdr_stream_decode_u64(argp->xdr, &seek->seek_offset) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u32(argp->xdr, &seek->seek_whence) < 0)
++ return nfserr_bad_xdr;
++
++ seek->seek_eof = 0;
++ seek->seek_pos = 0;
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
++nfsd4_decode_clone(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_clone *clone = &u->clone;
++ __be32 status;
+
+- status = nfsd4_decode_stateid(argp, &seek->seek_stateid);
++ status = nfsd4_decode_stateid4(argp, &clone->cl_src_stateid);
+ if (status)
+ return status;
++ status = nfsd4_decode_stateid4(argp, &clone->cl_dst_stateid);
++ if (status)
++ return status;
++ if (xdr_stream_decode_u64(argp->xdr, &clone->cl_src_pos) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &clone->cl_dst_pos) < 0)
++ return nfserr_bad_xdr;
++ if (xdr_stream_decode_u64(argp->xdr, &clone->cl_count) < 0)
++ return nfserr_bad_xdr;
+
+- READ_BUF(8 + 4);
+- p = xdr_decode_hyper(p, &seek->seek_offset);
+- seek->seek_whence = be32_to_cpup(p);
+-
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ /*
+@@ -1889,13 +2113,14 @@ nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
+ */
+
+ /*
+- * Decode data into buffer. Uses head and pages constructed by
+- * svcxdr_construct_vector.
++ * Decode data into buffer.
+ */
+ static __be32
+-nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
+- struct page **pages, char **bufp, u32 buflen)
++nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct xdr_buf *xdr,
++ char **bufp, u32 buflen)
+ {
++ struct page **pages = xdr->pages;
++ struct kvec *head = xdr->head;
+ char *tmp, *dp;
+ u32 len;
+
+@@ -1938,25 +2163,22 @@ nfsd4_vbuf_from_vector(struct nfsd4_compoundargs *argp, struct kvec *head,
+ static __be32
+ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
+ {
+- DECODE_HEAD;
+ char *name, *sp, *dp;
+ u32 namelen, cnt;
++ __be32 *p;
+
+- READ_BUF(4);
+- namelen = be32_to_cpup(p++);
+-
++ if (xdr_stream_decode_u32(argp->xdr, &namelen) < 0)
++ return nfserr_bad_xdr;
+ if (namelen > (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN))
+ return nfserr_nametoolong;
+-
+ if (namelen == 0)
+- goto xdr_error;
+-
+- READ_BUF(namelen);
+-
++ return nfserr_bad_xdr;
++ p = xdr_inline_decode(argp->xdr, namelen);
++ if (!p)
++ return nfserr_bad_xdr;
+ name = svcxdr_tmpalloc(argp, namelen + XATTR_USER_PREFIX_LEN + 1);
+ if (!name)
+ return nfserr_jukebox;
+-
+ memcpy(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
+
+ /*
+@@ -1969,14 +2191,14 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
+
+ while (cnt-- > 0) {
+ if (*sp == '\0')
+- goto xdr_error;
++ return nfserr_bad_xdr;
+ *dp++ = *sp++;
+ }
+ *dp = '\0';
+
+ *namep = name;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ /*
+@@ -1987,11 +2209,13 @@ nfsd4_decode_xattr_name(struct nfsd4_compoundargs *argp, char **namep)
+ */
+ static __be32
+ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
+- struct nfsd4_getxattr *getxattr)
++ union nfsd4_op_u *u)
+ {
++ struct nfsd4_getxattr *getxattr = &u->getxattr;
+ __be32 status;
+ u32 maxcount;
+
++ memset(getxattr, 0, sizeof(*getxattr));
+ status = nfsd4_decode_xattr_name(argp, &getxattr->getxa_name);
+ if (status)
+ return status;
+@@ -2000,21 +2224,21 @@ nfsd4_decode_getxattr(struct nfsd4_compoundargs *argp,
+ maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
+
+ getxattr->getxa_len = maxcount;
+-
+- return status;
++ return nfs_ok;
+ }
+
+ static __be32
+ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
+- struct nfsd4_setxattr *setxattr)
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_setxattr *setxattr = &u->setxattr;
+ u32 flags, maxcount, size;
+- struct kvec head;
+- struct page **pagelist;
++ __be32 status;
++
++ memset(setxattr, 0, sizeof(*setxattr));
+
+- READ_BUF(4);
+- flags = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &flags) < 0)
++ return nfserr_bad_xdr;
+
+ if (flags > SETXATTR4_REPLACE)
+ return nfserr_inval;
+@@ -2027,33 +2251,35 @@ nfsd4_decode_setxattr(struct nfsd4_compoundargs *argp,
+ maxcount = svc_max_payload(argp->rqstp);
+ maxcount = min_t(u32, XATTR_SIZE_MAX, maxcount);
+
+- READ_BUF(4);
+- size = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &size) < 0)
++ return nfserr_bad_xdr;
+ if (size > maxcount)
+ return nfserr_xattr2big;
+
+ setxattr->setxa_len = size;
+ if (size > 0) {
+- status = svcxdr_construct_vector(argp, &head, &pagelist, size);
+- if (status)
+- return status;
++ struct xdr_buf payload;
+
+- status = nfsd4_vbuf_from_vector(argp, &head, pagelist,
+- &setxattr->setxa_buf, size);
++ if (!xdr_stream_subsegment(argp->xdr, &payload, size))
++ return nfserr_bad_xdr;
++ status = nfsd4_vbuf_from_vector(argp, &payload,
++ &setxattr->setxa_buf, size);
+ }
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
+- struct nfsd4_listxattrs *listxattrs)
++ union nfsd4_op_u *u)
+ {
+- DECODE_HEAD;
++ struct nfsd4_listxattrs *listxattrs = &u->listxattrs;
+ u32 maxcount;
+
+- READ_BUF(12);
+- p = xdr_decode_hyper(p, &listxattrs->lsxa_cookie);
++ memset(listxattrs, 0, sizeof(*listxattrs));
++
++ if (xdr_stream_decode_u64(argp->xdr, &listxattrs->lsxa_cookie) < 0)
++ return nfserr_bad_xdr;
+
+ /*
+ * If the cookie is too large to have even one user.x attribute
+@@ -2063,7 +2289,8 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
+ (XATTR_LIST_MAX / (XATTR_USER_PREFIX_LEN + 2)))
+ return nfserr_badcookie;
+
+- maxcount = be32_to_cpup(p++);
++ if (xdr_stream_decode_u32(argp->xdr, &maxcount) < 0)
++ return nfserr_bad_xdr;
+ if (maxcount < 8)
+ /* Always need at least 2 words (length and one character) */
+ return nfserr_inval;
+@@ -2071,117 +2298,119 @@ nfsd4_decode_listxattrs(struct nfsd4_compoundargs *argp,
+ maxcount = min(maxcount, svc_max_payload(argp->rqstp));
+ listxattrs->lsxa_maxcount = maxcount;
+
+- DECODE_TAIL;
++ return nfs_ok;
+ }
+
+ static __be32
+ nfsd4_decode_removexattr(struct nfsd4_compoundargs *argp,
+- struct nfsd4_removexattr *removexattr)
++ union nfsd4_op_u *u)
+ {
++ struct nfsd4_removexattr *removexattr = &u->removexattr;
++ memset(removexattr, 0, sizeof(*removexattr));
+ return nfsd4_decode_xattr_name(argp, &removexattr->rmxa_name);
+ }
+
+ static __be32
+-nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
++nfsd4_decode_noop(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
+ {
+ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
++nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, union nfsd4_op_u *p)
+ {
+ return nfserr_notsupp;
+ }
+
+-typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
++typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, union nfsd4_op_u *u);
+
+ static const nfsd4_dec nfsd4_dec_ops[] = {
+- [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
+- [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
+- [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
+- [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
+- [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
+- [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
+- [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
+- [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
+- [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
+- [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
+- [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
+- [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
+- [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
+- [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+- [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
+- [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
+- [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
+- [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
+- [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_putpubfh,
+- [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
+- [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
+- [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
+- [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
+- [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
+- [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
+- [OP_RENEW] = (nfsd4_dec)nfsd4_decode_renew,
+- [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
+- [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
+- [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
+- [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
+- [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_setclientid,
+- [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_setclientid_confirm,
+- [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
+- [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
+- [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
++ [OP_ACCESS] = nfsd4_decode_access,
++ [OP_CLOSE] = nfsd4_decode_close,
++ [OP_COMMIT] = nfsd4_decode_commit,
++ [OP_CREATE] = nfsd4_decode_create,
++ [OP_DELEGPURGE] = nfsd4_decode_notsupp,
++ [OP_DELEGRETURN] = nfsd4_decode_delegreturn,
++ [OP_GETATTR] = nfsd4_decode_getattr,
++ [OP_GETFH] = nfsd4_decode_noop,
++ [OP_LINK] = nfsd4_decode_link,
++ [OP_LOCK] = nfsd4_decode_lock,
++ [OP_LOCKT] = nfsd4_decode_lockt,
++ [OP_LOCKU] = nfsd4_decode_locku,
++ [OP_LOOKUP] = nfsd4_decode_lookup,
++ [OP_LOOKUPP] = nfsd4_decode_noop,
++ [OP_NVERIFY] = nfsd4_decode_verify,
++ [OP_OPEN] = nfsd4_decode_open,
++ [OP_OPENATTR] = nfsd4_decode_notsupp,
++ [OP_OPEN_CONFIRM] = nfsd4_decode_open_confirm,
++ [OP_OPEN_DOWNGRADE] = nfsd4_decode_open_downgrade,
++ [OP_PUTFH] = nfsd4_decode_putfh,
++ [OP_PUTPUBFH] = nfsd4_decode_putpubfh,
++ [OP_PUTROOTFH] = nfsd4_decode_noop,
++ [OP_READ] = nfsd4_decode_read,
++ [OP_READDIR] = nfsd4_decode_readdir,
++ [OP_READLINK] = nfsd4_decode_noop,
++ [OP_REMOVE] = nfsd4_decode_remove,
++ [OP_RENAME] = nfsd4_decode_rename,
++ [OP_RENEW] = nfsd4_decode_renew,
++ [OP_RESTOREFH] = nfsd4_decode_noop,
++ [OP_SAVEFH] = nfsd4_decode_noop,
++ [OP_SECINFO] = nfsd4_decode_secinfo,
++ [OP_SETATTR] = nfsd4_decode_setattr,
++ [OP_SETCLIENTID] = nfsd4_decode_setclientid,
++ [OP_SETCLIENTID_CONFIRM] = nfsd4_decode_setclientid_confirm,
++ [OP_VERIFY] = nfsd4_decode_verify,
++ [OP_WRITE] = nfsd4_decode_write,
++ [OP_RELEASE_LOCKOWNER] = nfsd4_decode_release_lockowner,
+
+ /* new operations for NFSv4.1 */
+- [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_backchannel_ctl,
+- [OP_BIND_CONN_TO_SESSION]= (nfsd4_dec)nfsd4_decode_bind_conn_to_session,
+- [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
+- [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
+- [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
+- [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
+- [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
++ [OP_BACKCHANNEL_CTL] = nfsd4_decode_backchannel_ctl,
++ [OP_BIND_CONN_TO_SESSION] = nfsd4_decode_bind_conn_to_session,
++ [OP_EXCHANGE_ID] = nfsd4_decode_exchange_id,
++ [OP_CREATE_SESSION] = nfsd4_decode_create_session,
++ [OP_DESTROY_SESSION] = nfsd4_decode_destroy_session,
++ [OP_FREE_STATEID] = nfsd4_decode_free_stateid,
++ [OP_GET_DIR_DELEGATION] = nfsd4_decode_notsupp,
+ #ifdef CONFIG_NFSD_PNFS
+- [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+- [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
+- [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
+- [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
++ [OP_GETDEVICEINFO] = nfsd4_decode_getdeviceinfo,
++ [OP_GETDEVICELIST] = nfsd4_decode_notsupp,
++ [OP_LAYOUTCOMMIT] = nfsd4_decode_layoutcommit,
++ [OP_LAYOUTGET] = nfsd4_decode_layoutget,
++ [OP_LAYOUTRETURN] = nfsd4_decode_layoutreturn,
+ #else
+- [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
++ [OP_GETDEVICEINFO] = nfsd4_decode_notsupp,
++ [OP_GETDEVICELIST] = nfsd4_decode_notsupp,
++ [OP_LAYOUTCOMMIT] = nfsd4_decode_notsupp,
++ [OP_LAYOUTGET] = nfsd4_decode_notsupp,
++ [OP_LAYOUTRETURN] = nfsd4_decode_notsupp,
+ #endif
+- [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
+- [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
+- [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_test_stateid,
+- [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid,
+- [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete,
++ [OP_SECINFO_NO_NAME] = nfsd4_decode_secinfo_no_name,
++ [OP_SEQUENCE] = nfsd4_decode_sequence,
++ [OP_SET_SSV] = nfsd4_decode_notsupp,
++ [OP_TEST_STATEID] = nfsd4_decode_test_stateid,
++ [OP_WANT_DELEGATION] = nfsd4_decode_notsupp,
++ [OP_DESTROY_CLIENTID] = nfsd4_decode_destroy_clientid,
++ [OP_RECLAIM_COMPLETE] = nfsd4_decode_reclaim_complete,
+
+ /* new operations for NFSv4.2 */
+- [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
+- [OP_COPY] = (nfsd4_dec)nfsd4_decode_copy,
+- [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_copy_notify,
+- [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_fallocate,
+- [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_offload_status,
+- [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_offload_status,
+- [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_read,
+- [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek,
+- [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp,
+- [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone,
++ [OP_ALLOCATE] = nfsd4_decode_fallocate,
++ [OP_COPY] = nfsd4_decode_copy,
++ [OP_COPY_NOTIFY] = nfsd4_decode_copy_notify,
++ [OP_DEALLOCATE] = nfsd4_decode_fallocate,
++ [OP_IO_ADVISE] = nfsd4_decode_notsupp,
++ [OP_LAYOUTERROR] = nfsd4_decode_notsupp,
++ [OP_LAYOUTSTATS] = nfsd4_decode_notsupp,
++ [OP_OFFLOAD_CANCEL] = nfsd4_decode_offload_status,
++ [OP_OFFLOAD_STATUS] = nfsd4_decode_offload_status,
++ [OP_READ_PLUS] = nfsd4_decode_read,
++ [OP_SEEK] = nfsd4_decode_seek,
++ [OP_WRITE_SAME] = nfsd4_decode_notsupp,
++ [OP_CLONE] = nfsd4_decode_clone,
+ /* RFC 8276 extended atributes operations */
+- [OP_GETXATTR] = (nfsd4_dec)nfsd4_decode_getxattr,
+- [OP_SETXATTR] = (nfsd4_dec)nfsd4_decode_setxattr,
+- [OP_LISTXATTRS] = (nfsd4_dec)nfsd4_decode_listxattrs,
+- [OP_REMOVEXATTR] = (nfsd4_dec)nfsd4_decode_removexattr,
++ [OP_GETXATTR] = nfsd4_decode_getxattr,
++ [OP_SETXATTR] = nfsd4_decode_setxattr,
++ [OP_LISTXATTRS] = nfsd4_decode_listxattrs,
++ [OP_REMOVEXATTR] = nfsd4_decode_removexattr,
+ };
+
+ static inline bool
+@@ -2198,43 +2427,46 @@ nfsd4_opnum_in_range(struct nfsd4_compoundargs *argp, struct nfsd4_op *op)
+ return true;
+ }
+
+-static __be32
++static bool
+ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
+ {
+- DECODE_HEAD;
+ struct nfsd4_op *op;
+ bool cachethis = false;
+ int auth_slack= argp->rqstp->rq_auth_slack;
+ int max_reply = auth_slack + 8; /* opcnt, status */
+ int readcount = 0;
+ int readbytes = 0;
++ __be32 *p;
+ int i;
+
+- READ_BUF(4);
+- argp->taglen = be32_to_cpup(p++);
+- READ_BUF(argp->taglen);
+- SAVEMEM(argp->tag, argp->taglen);
+- READ_BUF(8);
+- argp->minorversion = be32_to_cpup(p++);
+- argp->opcnt = be32_to_cpup(p++);
+- max_reply += 4 + (XDR_QUADLEN(argp->taglen) << 2);
+-
+- if (argp->taglen > NFSD4_MAX_TAGLEN)
+- goto xdr_error;
+- /*
+- * NFS4ERR_RESOURCE is a more helpful error than GARBAGE_ARGS
+- * here, so we return success at the xdr level so that
+- * nfsd4_proc can handle this is an NFS-level error.
+- */
+- if (argp->opcnt > NFSD_MAX_OPS_PER_COMPOUND)
+- return 0;
++ if (xdr_stream_decode_u32(argp->xdr, &argp->taglen) < 0)
++ return false;
++ max_reply += XDR_UNIT;
++ argp->tag = NULL;
++ if (unlikely(argp->taglen)) {
++ if (argp->taglen > NFSD4_MAX_TAGLEN)
++ return false;
++ p = xdr_inline_decode(argp->xdr, argp->taglen);
++ if (!p)
++ return false;
++ argp->tag = svcxdr_savemem(argp, p, argp->taglen);
++ if (!argp->tag)
++ return false;
++ max_reply += xdr_align_size(argp->taglen);
++ }
++
++ if (xdr_stream_decode_u32(argp->xdr, &argp->minorversion) < 0)
++ return false;
++ if (xdr_stream_decode_u32(argp->xdr, &argp->client_opcnt) < 0)
++ return false;
++ argp->opcnt = min_t(u32, argp->client_opcnt,
++ NFSD_MAX_OPS_PER_COMPOUND);
+
+ if (argp->opcnt > ARRAY_SIZE(argp->iops)) {
+- argp->ops = kzalloc(argp->opcnt * sizeof(*argp->ops), GFP_KERNEL);
++ argp->ops = vcalloc(argp->opcnt, sizeof(*argp->ops));
+ if (!argp->ops) {
+ argp->ops = argp->iops;
+- dprintk("nfsd: couldn't allocate room for COMPOUND\n");
+- goto xdr_error;
++ return false;
+ }
+ }
+
+@@ -2244,17 +2476,23 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
+ for (i = 0; i < argp->opcnt; i++) {
+ op = &argp->ops[i];
+ op->replay = NULL;
++ op->opdesc = NULL;
+
+- READ_BUF(4);
+- op->opnum = be32_to_cpup(p++);
+-
+- if (nfsd4_opnum_in_range(argp, op))
++ if (xdr_stream_decode_u32(argp->xdr, &op->opnum) < 0)
++ return false;
++ if (nfsd4_opnum_in_range(argp, op)) {
++ op->opdesc = OPDESC(op);
+ op->status = nfsd4_dec_ops[op->opnum](argp, &op->u);
+- else {
++ if (op->status != nfs_ok)
++ trace_nfsd_compound_decode_err(argp->rqstp,
++ argp->opcnt, i,
++ op->opnum,
++ op->status);
++ } else {
+ op->opnum = OP_ILLEGAL;
+ op->status = nfserr_op_illegal;
+ }
+- op->opdesc = OPDESC(op);
++
+ /*
+ * We'll try to cache the result in the DRC if any one
+ * op in the compound wants to be cached:
+@@ -2289,7 +2527,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
+ if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
+ clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+
+- DECODE_TAIL;
++ return true;
+ }
+
+ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
+@@ -2298,15 +2536,25 @@ static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode,
+ if (exp->ex_flags & NFSEXP_V4ROOT) {
+ *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time));
+ *p++ = 0;
+- } else if (IS_I_VERSION(inode)) {
++ } else
+ p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode));
+- } else {
+- *p++ = cpu_to_be32(stat->ctime.tv_sec);
+- *p++ = cpu_to_be32(stat->ctime.tv_nsec);
+- }
+ return p;
+ }
+
++static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr,
++ struct timespec64 *tv)
++{
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, XDR_UNIT * 3);
++ if (!p)
++ return nfserr_resource;
++
++ p = xdr_encode_hyper(p, (s64)tv->tv_sec);
++ *p = cpu_to_be32(tv->tv_nsec);
++ return nfs_ok;
++}
++
+ /*
+ * ctime (in NFSv4, time_metadata) is not writeable, and the client
+ * doesn't really care what resolution could theoretically be stored by
+@@ -2335,15 +2583,8 @@ static __be32 *encode_time_delta(__be32 *p, struct inode *inode)
+ static __be32 *encode_cinfo(__be32 *p, struct nfsd4_change_info *c)
+ {
+ *p++ = cpu_to_be32(c->atomic);
+- if (c->change_supported) {
+- p = xdr_encode_hyper(p, c->before_change);
+- p = xdr_encode_hyper(p, c->after_change);
+- } else {
+- *p++ = cpu_to_be32(c->before_ctime_sec);
+- *p++ = cpu_to_be32(c->before_ctime_nsec);
+- *p++ = cpu_to_be32(c->after_ctime_sec);
+- *p++ = cpu_to_be32(c->after_ctime_nsec);
+- }
++ p = xdr_encode_hyper(p, c->before_change);
++ p = xdr_encode_hyper(p, c->after_change);
+ return p;
+ }
+
+@@ -2558,7 +2799,7 @@ static u32 nfs4_file_type(umode_t mode)
+ case S_IFREG: return NF4REG;
+ case S_IFSOCK: return NF4SOCK;
+ default: return NF4BAD;
+- };
++ }
+ }
+
+ static inline __be32
+@@ -2642,9 +2883,10 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32
+ }
+
+
+-static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
++static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino)
+ {
+ struct path path = exp->ex_path;
++ struct kstat stat;
+ int err;
+
+ path_get(&path);
+@@ -2652,8 +2894,10 @@ static int get_parent_attributes(struct svc_export *exp, struct kstat *stat)
+ if (path.dentry != path.mnt->mnt_root)
+ break;
+ }
+- err = vfs_getattr(&path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
++ err = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
+ path_put(&path);
++ if (!err)
++ *pino = stat.ino;
+ return err;
+ }
+
+@@ -2706,10 +2950,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ struct kstat stat;
+ struct svc_fh *tempfh = NULL;
+ struct kstatfs statfs;
+- __be32 *p;
++ __be32 *p, *attrlen_p;
+ int starting_len = xdr->buf->len;
+ int attrlen_offset;
+- __be32 attrlen;
+ u32 dummy;
+ u64 dummy64;
+ u32 rdattr_err = 0;
+@@ -2741,6 +2984,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ err = vfs_getattr(&path, &stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
+ if (err)
+ goto out_nfserr;
++ if (!(stat.result_mask & STATX_BTIME))
++ /* underlying FS does not offer btime so we can't share it */
++ bmval1 &= ~FATTR4_WORD1_TIME_CREATE;
+ if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+ FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
+ (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+@@ -2794,10 +3040,9 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ goto out;
+
+ attrlen_offset = xdr->buf->len;
+- p = xdr_reserve_space(xdr, 4);
+- if (!p)
++ attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
++ if (!attrlen_p)
+ goto out_resource;
+- p++; /* to be backfilled later */
+
+ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
+ u32 supp[3];
+@@ -2983,7 +3228,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4);
+ if (!p)
+ goto out_resource;
+- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base,
++ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw,
+ fhp->fh_handle.fh_size);
+ }
+ if (bmval0 & FATTR4_WORD0_FILEID) {
+@@ -3115,11 +3360,14 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ p = xdr_encode_hyper(p, dummy64);
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_ACCESS) {
+- p = xdr_reserve_space(xdr, 12);
+- if (!p)
+- goto out_resource;
+- p = xdr_encode_hyper(p, (s64)stat.atime.tv_sec);
+- *p++ = cpu_to_be32(stat.atime.tv_nsec);
++ status = nfsd4_encode_nfstime4(xdr, &stat.atime);
++ if (status)
++ goto out;
++ }
++ if (bmval1 & FATTR4_WORD1_TIME_CREATE) {
++ status = nfsd4_encode_nfstime4(xdr, &stat.btime);
++ if (status)
++ goto out;
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_DELTA) {
+ p = xdr_reserve_space(xdr, 12);
+@@ -3128,36 +3376,31 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ p = encode_time_delta(p, d_inode(dentry));
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_METADATA) {
+- p = xdr_reserve_space(xdr, 12);
+- if (!p)
+- goto out_resource;
+- p = xdr_encode_hyper(p, (s64)stat.ctime.tv_sec);
+- *p++ = cpu_to_be32(stat.ctime.tv_nsec);
++ status = nfsd4_encode_nfstime4(xdr, &stat.ctime);
++ if (status)
++ goto out;
+ }
+ if (bmval1 & FATTR4_WORD1_TIME_MODIFY) {
+- p = xdr_reserve_space(xdr, 12);
+- if (!p)
+- goto out_resource;
+- p = xdr_encode_hyper(p, (s64)stat.mtime.tv_sec);
+- *p++ = cpu_to_be32(stat.mtime.tv_nsec);
++ status = nfsd4_encode_nfstime4(xdr, &stat.mtime);
++ if (status)
++ goto out;
+ }
+ if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
+- struct kstat parent_stat;
+ u64 ino = stat.ino;
+
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ /*
+- * Get parent's attributes if not ignoring crossmount
+- * and this is the root of a cross-mounted filesystem.
++ * Get ino of mountpoint in parent filesystem, if not ignoring
++ * crossmount and this is the root of a cross-mounted
++ * filesystem.
+ */
+ if (ignore_crossmnt == 0 &&
+ dentry == exp->ex_path.mnt->mnt_root) {
+- err = get_parent_attributes(exp, &parent_stat);
++ err = nfsd4_get_mounted_on_ino(exp, &ino);
+ if (err)
+ goto out_nfserr;
+- ino = parent_stat.ino;
+ }
+ p = xdr_encode_hyper(p, ino);
+ }
+@@ -3194,16 +3437,6 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ goto out;
+ }
+
+- if (bmval2 & FATTR4_WORD2_CHANGE_ATTR_TYPE) {
+- p = xdr_reserve_space(xdr, 4);
+- if (!p)
+- goto out_resource;
+- if (IS_I_VERSION(d_inode(dentry)))
+- *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR);
+- else
+- *p++ = cpu_to_be32(NFS4_CHANGE_TYPE_IS_TIME_METADATA);
+- }
+-
+ #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
+ status = nfsd4_encode_security_label(xdr, rqstp, context,
+@@ -3222,8 +3455,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp,
+ *p++ = cpu_to_be32(err == 0);
+ }
+
+- attrlen = htonl(xdr->buf->len - attrlen_offset - 4);
+- write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, 4);
++ *attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
+ status = nfs_ok;
+
+ out:
+@@ -3392,7 +3624,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
+ p = xdr_reserve_space(xdr, 3*4 + namlen);
+ if (!p)
+ goto fail;
+- p = xdr_encode_hyper(p, NFS_OFFSET_MAX); /* offset of next entry */
++ p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */
+ p = xdr_encode_array(p, name, namlen); /* name length & name */
+
+ nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen);
+@@ -3476,9 +3708,11 @@ nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid)
+ }
+
+ static __be32
+-nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access)
++nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_access *access = &u->access;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8);
+@@ -3489,9 +3723,11 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
+ return 0;
+ }
+
+-static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts)
++static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8);
+@@ -3506,18 +3742,22 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp,
+ }
+
+ static __be32
+-nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close)
++nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_close *close = &u->close;
++ struct xdr_stream *xdr = resp->xdr;
+
+ return nfsd4_encode_stateid(xdr, &close->cl_stateid);
+ }
+
+
+ static __be32
+-nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_commit *commit)
++nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_commit *commit = &u->commit;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
+@@ -3529,9 +3769,11 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
+ }
+
+ static __be32
+-nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create)
++nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_create *create = &u->create;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 20);
+@@ -3543,19 +3785,23 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
+ }
+
+ static __be32
+-nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getattr *getattr)
++nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
++ struct nfsd4_getattr *getattr = &u->getattr;
+ struct svc_fh *fhp = getattr->ga_fhp;
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+
+ return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry,
+ getattr->ga_bmval, resp->rqstp, 0);
+ }
+
+ static __be32
+-nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh **fhpp)
++nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct svc_fh **fhpp = &u->getfh;
++ struct xdr_stream *xdr = resp->xdr;
+ struct svc_fh *fhp = *fhpp;
+ unsigned int len;
+ __be32 *p;
+@@ -3564,7 +3810,7 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh
+ p = xdr_reserve_space(xdr, len + 4);
+ if (!p)
+ return nfserr_resource;
+- p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len);
++ p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len);
+ return 0;
+ }
+
+@@ -3608,9 +3854,11 @@ nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld)
+ }
+
+ static __be32
+-nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lock *lock)
++nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_lock *lock = &u->lock;
++ struct xdr_stream *xdr = resp->xdr;
+
+ if (!nfserr)
+ nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid);
+@@ -3621,9 +3869,11 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo
+ }
+
+ static __be32
+-nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lockt *lockt)
++nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_lockt *lockt = &u->lockt;
++ struct xdr_stream *xdr = resp->xdr;
+
+ if (nfserr == nfserr_denied)
+ nfsd4_encode_lock_denied(xdr, &lockt->lt_denied);
+@@ -3631,18 +3881,22 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l
+ }
+
+ static __be32
+-nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku)
++nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_locku *locku = &u->locku;
++ struct xdr_stream *xdr = resp->xdr;
+
+ return nfsd4_encode_stateid(xdr, &locku->lu_stateid);
+ }
+
+
+ static __be32
+-nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_link *link)
++nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_link *link = &u->link;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 20);
+@@ -3654,9 +3908,11 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li
+
+
+ static __be32
+-nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open)
++nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_open *open = &u->open;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid);
+@@ -3748,17 +4004,21 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
+ }
+
+ static __be32
+-nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc)
++nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_open_confirm *oc = &u->open_confirm;
++ struct xdr_stream *xdr = resp->xdr;
+
+ return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid);
+ }
+
+ static __be32
+-nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od)
++nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_open_downgrade *od = &u->open_downgrade;
++ struct xdr_stream *xdr = resp->xdr;
+
+ return nfsd4_encode_stateid(xdr, &od->od_stateid);
+ }
+@@ -3768,33 +4028,28 @@ static __be32 nfsd4_encode_splice_read(
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+ struct xdr_buf *buf = xdr->buf;
+- u32 eof;
+- int space_left;
++ int status, space_left;
+ __be32 nfserr;
+- __be32 *p = xdr->p - 2;
+
+ /* Make sure there will be room for padding if needed */
+ if (xdr->end - xdr->p < 1)
+ return nfserr_resource;
+
+ nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp,
+- file, read->rd_offset, &maxcount, &eof);
++ file, read->rd_offset, &maxcount,
++ &read->rd_eof);
+ read->rd_length = maxcount;
+- if (nfserr) {
+- /*
+- * nfsd_splice_actor may have already messed with the
+- * page length; reset it so as not to confuse
+- * xdr_truncate_encode:
+- */
+- buf->page_len = 0;
+- return nfserr;
++ if (nfserr)
++ goto out_err;
++ status = svc_encode_result_payload(read->rd_rqstp,
++ buf->head[0].iov_len, maxcount);
++ if (status) {
++ nfserr = nfserrno(status);
++ goto out_err;
+ }
+
+- *(p++) = htonl(eof);
+- *(p++) = htonl(maxcount);
+-
+ buf->page_len = maxcount;
+ buf->len += maxcount;
+ xdr->page_ptr += (buf->page_base + maxcount + PAGE_SIZE - 1)
+@@ -3820,18 +4075,25 @@ static __be32 nfsd4_encode_splice_read(
+ xdr->end = (__be32 *)((void *)xdr->end + space_left);
+
+ return 0;
++
++out_err:
++ /*
++ * nfsd_splice_actor may have already messed with the
++ * page length; reset it so as not to confuse
++ * xdr_truncate_encode in our caller.
++ */
++ buf->page_len = 0;
++ return nfserr;
+ }
+
+ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
+ struct nfsd4_read *read,
+ struct file *file, unsigned long maxcount)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
+- u32 eof;
+- int starting_len = xdr->buf->len - 8;
++ struct xdr_stream *xdr = resp->xdr;
++ unsigned int starting_len = xdr->buf->len;
++ __be32 zero = xdr_zero;
+ __be32 nfserr;
+- __be32 tmp;
+- int pad;
+
+ read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, maxcount);
+ if (read->rd_vlen < 0)
+@@ -3839,33 +4101,27 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
+
+ nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+ resp->rqstp->rq_vec, read->rd_vlen, &maxcount,
+- &eof);
++ &read->rd_eof);
+ read->rd_length = maxcount;
+ if (nfserr)
+ return nfserr;
+- if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount))
++ if (svc_encode_result_payload(resp->rqstp, starting_len, maxcount))
+ return nfserr_io;
+- xdr_truncate_encode(xdr, starting_len + 8 + xdr_align_size(maxcount));
+-
+- tmp = htonl(eof);
+- write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4);
+- tmp = htonl(maxcount);
+- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
+-
+- tmp = xdr_zero;
+- pad = (maxcount&3) ? 4 - (maxcount&3) : 0;
+- write_bytes_to_xdr_buf(xdr->buf, starting_len + 8 + maxcount,
+- &tmp, pad);
+- return 0;
++ xdr_truncate_encode(xdr, starting_len + xdr_align_size(maxcount));
+
++ write_bytes_to_xdr_buf(xdr->buf, starting_len + maxcount, &zero,
++ xdr_pad_size(maxcount));
++ return nfs_ok;
+ }
+
+ static __be32
+ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_read *read)
++ union nfsd4_op_u *u)
+ {
++ struct nfsd4_read *read = &u->read;
++ bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
+ unsigned long maxcount;
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+ struct file *file;
+ int starting_len = xdr->buf->len;
+ __be32 *p;
+@@ -3876,45 +4132,44 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
+ if (!p) {
+- WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags));
++ WARN_ON_ONCE(splice_ok);
+ return nfserr_resource;
+ }
+- if (resp->xdr.buf->page_len &&
+- test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
++ if (resp->xdr->buf->page_len && splice_ok) {
+ WARN_ON_ONCE(1);
+ return nfserr_serverfault;
+ }
+ xdr_commit_encode(xdr);
+
+- maxcount = svc_max_payload(resp->rqstp);
+- maxcount = min_t(unsigned long, maxcount,
++ maxcount = min_t(unsigned long, read->rd_length,
+ (xdr->buf->buflen - xdr->buf->len));
+- maxcount = min_t(unsigned long, maxcount, read->rd_length);
+
+- if (file->f_op->splice_read &&
+- test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags))
++ if (file->f_op->splice_read && splice_ok)
+ nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
+ else
+ nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+-
+- if (nfserr)
++ if (nfserr) {
+ xdr_truncate_encode(xdr, starting_len);
++ return nfserr;
++ }
+
+- return nfserr;
++ p = xdr_encode_bool(p, read->rd_eof);
++ *p = cpu_to_be32(read->rd_length);
++ return nfs_ok;
+ }
+
+ static __be32
+-nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readlink *readlink)
++nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- int maxcount;
+- __be32 wire_count;
+- int zero = 0;
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_readlink *readlink = &u->readlink;
++ __be32 *p, *maxcount_p, zero = xdr_zero;
++ struct xdr_stream *xdr = resp->xdr;
+ int length_offset = xdr->buf->len;
+- __be32 *p;
++ int maxcount, status;
+
+- p = xdr_reserve_space(xdr, 4);
+- if (!p)
++ maxcount_p = xdr_reserve_space(xdr, XDR_UNIT);
++ if (!maxcount_p)
+ return nfserr_resource;
+ maxcount = PAGE_SIZE;
+
+@@ -3931,28 +4186,35 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd
+ (char *)p, &maxcount);
+ if (nfserr == nfserr_isdir)
+ nfserr = nfserr_inval;
+- if (nfserr) {
+- xdr_truncate_encode(xdr, length_offset);
+- return nfserr;
+- }
++ if (nfserr)
++ goto out_err;
++ status = svc_encode_result_payload(readlink->rl_rqstp, length_offset,
++ maxcount);
++ if (status) {
++ nfserr = nfserrno(status);
++ goto out_err;
++ }
++ *maxcount_p = cpu_to_be32(maxcount);
++ xdr_truncate_encode(xdr, length_offset + 4 + xdr_align_size(maxcount));
++ write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount, &zero,
++ xdr_pad_size(maxcount));
++ return nfs_ok;
+
+- wire_count = htonl(maxcount);
+- write_bytes_to_xdr_buf(xdr->buf, length_offset, &wire_count, 4);
+- xdr_truncate_encode(xdr, length_offset + 4 + ALIGN(maxcount, 4));
+- if (maxcount & 3)
+- write_bytes_to_xdr_buf(xdr->buf, length_offset + 4 + maxcount,
+- &zero, 4 - (maxcount&3));
+- return 0;
++out_err:
++ xdr_truncate_encode(xdr, length_offset);
++ return nfserr;
+ }
+
+ static __be32
+-nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_readdir *readdir)
++nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
++ struct nfsd4_readdir *readdir = &u->readdir;
+ int maxcount;
+ int bytes_left;
+ loff_t offset;
+ __be64 wire_offset;
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+ int starting_len = xdr->buf->len;
+ __be32 *p;
+
+@@ -3963,8 +4225,8 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
+ /* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
+ *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(0);
+- resp->xdr.buf->head[0].iov_len = ((char *)resp->xdr.p)
+- - (char *)resp->xdr.buf->head[0].iov_base;
++ xdr->buf->head[0].iov_len = (char *)xdr->p -
++ (char *)xdr->buf->head[0].iov_base;
+
+ /*
+ * Number of bytes left for directory entries allowing for the
+@@ -4037,9 +4299,11 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
+ }
+
+ static __be32
+-nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_remove *remove)
++nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_remove *remove = &u->remove;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 20);
+@@ -4050,9 +4314,11 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_
+ }
+
+ static __be32
+-nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_rename *rename)
++nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_rename *rename = &u->rename;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 40);
+@@ -4133,18 +4399,20 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp)
+
+ static __be32
+ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_secinfo *secinfo)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_secinfo *secinfo = &u->secinfo;
++ struct xdr_stream *xdr = resp->xdr;
+
+ return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp);
+ }
+
+ static __be32
+ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_secinfo_no_name *secinfo)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_secinfo_no_name *secinfo = &u->secinfo_no_name;
++ struct xdr_stream *xdr = resp->xdr;
+
+ return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp);
+ }
+@@ -4154,9 +4422,11 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr,
+ * regardless of the error status.
+ */
+ static __be32
+-nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setattr *setattr)
++nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_setattr *setattr = &u->setattr;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 16);
+@@ -4178,9 +4448,11 @@ nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
+ }
+
+ static __be32
+-nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_setclientid *scd)
++nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_setclientid *scd = &u->setclientid;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ if (!nfserr) {
+@@ -4202,9 +4474,11 @@ nfsd4_encode_setclientid(struct nfsd4_compoundres *resp, __be32 nfserr, struct n
+ }
+
+ static __be32
+-nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_write *write)
++nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_write *write = &u->write;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 16);
+@@ -4219,9 +4493,10 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
+
+ static __be32
+ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_exchange_id *exid)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_exchange_id *exid = &u->exchange_id;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+ char *major_id;
+ char *server_scope;
+@@ -4297,9 +4572,10 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_create_session *sess)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_create_session *sess = &u->create_session;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 24);
+@@ -4350,9 +4626,10 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_sequence *seq)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_sequence *seq = &u->sequence;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20);
+@@ -4373,9 +4650,10 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_test_stateid *test_stateid)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_test_stateid *test_stateid = &u->test_stateid;
++ struct xdr_stream *xdr = resp->xdr;
+ struct nfsd4_test_stateid_id *stateid, *next;
+ __be32 *p;
+
+@@ -4394,9 +4672,10 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
+ #ifdef CONFIG_NFSD_PNFS
+ static __be32
+ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_getdeviceinfo *gdev)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo;
++ struct xdr_stream *xdr = resp->xdr;
+ const struct nfsd4_layout_ops *ops;
+ u32 starting_len = xdr->buf->len, needed_len;
+ __be32 *p;
+@@ -4447,9 +4726,10 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_layoutget *lgp)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_layoutget *lgp = &u->layoutget;
++ struct xdr_stream *xdr = resp->xdr;
+ const struct nfsd4_layout_ops *ops;
+ __be32 *p;
+
+@@ -4474,9 +4754,10 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_layoutcommit *lcp)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_layoutcommit *lcp = &u->layoutcommit;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+@@ -4495,9 +4776,10 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_layoutreturn *lrp)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_layoutreturn *lrp = &u->layoutreturn;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+@@ -4515,7 +4797,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+ struct nfsd42_write_res *write, bool sync)
+ {
+ __be32 *p;
+- p = xdr_reserve_space(&resp->xdr, 4);
++ p = xdr_reserve_space(resp->xdr, 4);
+ if (!p)
+ return nfserr_resource;
+
+@@ -4524,11 +4806,11 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+ else {
+ __be32 nfserr;
+ *p++ = cpu_to_be32(1);
+- nfserr = nfsd4_encode_stateid(&resp->xdr, &write->cb_stateid);
++ nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid);
+ if (nfserr)
+ return nfserr;
+ }
+- p = xdr_reserve_space(&resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
++ p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE);
+ if (!p)
+ return nfserr_resource;
+
+@@ -4542,7 +4824,7 @@ nfsd42_encode_write_res(struct nfsd4_compoundres *resp,
+ static __be32
+ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+ struct nfs42_netaddr *addr;
+ __be32 *p;
+
+@@ -4581,26 +4863,28 @@ nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns)
+
+ static __be32
+ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_copy *copy)
++ union nfsd4_op_u *u)
+ {
++ struct nfsd4_copy *copy = &u->copy;
+ __be32 *p;
+
+ nfserr = nfsd42_encode_write_res(resp, &copy->cp_res,
+- copy->cp_synchronous);
++ nfsd4_copy_is_sync(copy));
+ if (nfserr)
+ return nfserr;
+
+- p = xdr_reserve_space(&resp->xdr, 4 + 4);
++ p = xdr_reserve_space(resp->xdr, 4 + 4);
+ *p++ = xdr_one; /* cr_consecutive */
+- *p++ = cpu_to_be32(copy->cp_synchronous);
++ *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero;
+ return 0;
+ }
+
+ static __be32
+ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_offload_status *os)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_offload_status *os = &u->offload_status;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8 + 4);
+@@ -4613,159 +4897,84 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
+- struct nfsd4_read *read,
+- unsigned long *maxcount, u32 *eof,
+- loff_t *pos)
++ struct nfsd4_read *read)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
+ struct file *file = read->rd_nf->nf_file;
+- int starting_len = xdr->buf->len;
+- loff_t hole_pos;
+- __be32 nfserr;
+- __be32 *p, tmp;
+- __be64 tmp64;
+-
+- hole_pos = pos ? *pos : vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+- if (hole_pos > read->rd_offset)
+- *maxcount = min_t(unsigned long, *maxcount, hole_pos - read->rd_offset);
+- *maxcount = min_t(unsigned long, *maxcount, (xdr->buf->buflen - xdr->buf->len));
++ struct xdr_stream *xdr = resp->xdr;
++ unsigned long maxcount;
++ __be32 nfserr, *p;
+
+ /* Content type, offset, byte count */
+ p = xdr_reserve_space(xdr, 4 + 8 + 4);
+ if (!p)
+- return nfserr_resource;
++ return nfserr_io;
++ if (resp->xdr->buf->page_len && splice_ok) {
++ WARN_ON_ONCE(splice_ok);
++ return nfserr_serverfault;
++ }
+
+- read->rd_vlen = xdr_reserve_space_vec(xdr, resp->rqstp->rq_vec, *maxcount);
+- if (read->rd_vlen < 0)
+- return nfserr_resource;
++ maxcount = min_t(unsigned long, read->rd_length,
++ (xdr->buf->buflen - xdr->buf->len));
+
+- nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset,
+- resp->rqstp->rq_vec, read->rd_vlen, maxcount, eof);
++ if (file->f_op->splice_read && splice_ok)
++ nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount);
++ else
++ nfserr = nfsd4_encode_readv(resp, read, file, maxcount);
+ if (nfserr)
+ return nfserr;
+- xdr_truncate_encode(xdr, starting_len + 16 + xdr_align_size(*maxcount));
+-
+- tmp = htonl(NFS4_CONTENT_DATA);
+- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
+- tmp64 = cpu_to_be64(read->rd_offset);
+- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp64, 8);
+- tmp = htonl(*maxcount);
+- write_bytes_to_xdr_buf(xdr->buf, starting_len + 12, &tmp, 4);
+-
+- tmp = xdr_zero;
+- write_bytes_to_xdr_buf(xdr->buf, starting_len + 16 + *maxcount, &tmp,
+- xdr_pad_size(*maxcount));
+- return nfs_ok;
+-}
+-
+-static __be32
+-nfsd4_encode_read_plus_hole(struct nfsd4_compoundres *resp,
+- struct nfsd4_read *read,
+- unsigned long *maxcount, u32 *eof)
+-{
+- struct file *file = read->rd_nf->nf_file;
+- loff_t data_pos = vfs_llseek(file, read->rd_offset, SEEK_DATA);
+- loff_t f_size = i_size_read(file_inode(file));
+- unsigned long count;
+- __be32 *p;
+-
+- if (data_pos == -ENXIO)
+- data_pos = f_size;
+- else if (data_pos <= read->rd_offset || (data_pos < f_size && data_pos % PAGE_SIZE))
+- return nfsd4_encode_read_plus_data(resp, read, maxcount, eof, &f_size);
+- count = data_pos - read->rd_offset;
+-
+- /* Content type, offset, byte count */
+- p = xdr_reserve_space(&resp->xdr, 4 + 8 + 8);
+- if (!p)
+- return nfserr_resource;
+
+- *p++ = htonl(NFS4_CONTENT_HOLE);
+- p = xdr_encode_hyper(p, read->rd_offset);
+- p = xdr_encode_hyper(p, count);
++ *p++ = cpu_to_be32(NFS4_CONTENT_DATA);
++ p = xdr_encode_hyper(p, read->rd_offset);
++ *p = cpu_to_be32(read->rd_length);
+
+- *eof = (read->rd_offset + count) >= f_size;
+- *maxcount = min_t(unsigned long, count, *maxcount);
+ return nfs_ok;
+ }
+
+ static __be32
+ nfsd4_encode_read_plus(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_read *read)
++ union nfsd4_op_u *u)
+ {
+- unsigned long maxcount, count;
+- struct xdr_stream *xdr = &resp->xdr;
+- struct file *file;
++ struct nfsd4_read *read = &u->read;
++ struct file *file = read->rd_nf->nf_file;
++ struct xdr_stream *xdr = resp->xdr;
+ int starting_len = xdr->buf->len;
+- int last_segment = xdr->buf->len;
+- int segments = 0;
+- __be32 *p, tmp;
+- bool is_data;
+- loff_t pos;
+- u32 eof;
++ u32 segments = 0;
++ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+- file = read->rd_nf->nf_file;
+
+ /* eof flag, segment count */
+ p = xdr_reserve_space(xdr, 4 + 4);
+ if (!p)
+- return nfserr_resource;
++ return nfserr_io;
+ xdr_commit_encode(xdr);
+
+- maxcount = svc_max_payload(resp->rqstp);
+- maxcount = min_t(unsigned long, maxcount,
+- (xdr->buf->buflen - xdr->buf->len));
+- maxcount = min_t(unsigned long, maxcount, read->rd_length);
+- count = maxcount;
+-
+- eof = read->rd_offset >= i_size_read(file_inode(file));
+- if (eof)
++ read->rd_eof = read->rd_offset >= i_size_read(file_inode(file));
++ if (read->rd_eof)
+ goto out;
+
+- pos = vfs_llseek(file, read->rd_offset, SEEK_HOLE);
+- is_data = pos > read->rd_offset;
+-
+- while (count > 0 && !eof) {
+- maxcount = count;
+- if (is_data)
+- nfserr = nfsd4_encode_read_plus_data(resp, read, &maxcount, &eof,
+- segments == 0 ? &pos : NULL);
+- else
+- nfserr = nfsd4_encode_read_plus_hole(resp, read, &maxcount, &eof);
+- if (nfserr)
+- goto out;
+- count -= maxcount;
+- read->rd_offset += maxcount;
+- is_data = !is_data;
+- last_segment = xdr->buf->len;
+- segments++;
+- }
+-
+-out:
+- if (nfserr && segments == 0)
++ nfserr = nfsd4_encode_read_plus_data(resp, read);
++ if (nfserr) {
+ xdr_truncate_encode(xdr, starting_len);
+- else {
+- if (nfserr) {
+- xdr_truncate_encode(xdr, last_segment);
+- nfserr = nfs_ok;
+- eof = 0;
+- }
+- tmp = htonl(eof);
+- write_bytes_to_xdr_buf(xdr->buf, starting_len, &tmp, 4);
+- tmp = htonl(segments);
+- write_bytes_to_xdr_buf(xdr->buf, starting_len + 4, &tmp, 4);
++ return nfserr;
+ }
+
++ segments++;
++
++out:
++ p = xdr_encode_bool(p, read->rd_eof);
++ *p = cpu_to_be32(segments);
+ return nfserr;
+ }
+
+ static __be32
+ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_copy_notify *cn)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_copy_notify *cn = &u->copy_notify;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ if (nfserr)
+@@ -4792,16 +5001,18 @@ nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ *p++ = cpu_to_be32(1);
+
+- return nfsd42_encode_nl4_server(resp, &cn->cpn_src);
++ nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src);
++ return nfserr;
+ }
+
+ static __be32
+ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_seek *seek)
++ union nfsd4_op_u *u)
+ {
++ struct nfsd4_seek *seek = &u->seek;
+ __be32 *p;
+
+- p = xdr_reserve_space(&resp->xdr, 4 + 8);
++ p = xdr_reserve_space(resp->xdr, 4 + 8);
+ *p++ = cpu_to_be32(seek->seek_eof);
+ p = xdr_encode_hyper(p, seek->seek_pos);
+
+@@ -4809,7 +5020,8 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
+ }
+
+ static __be32
+-nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
++nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr,
++ union nfsd4_op_u *p)
+ {
+ return nfserr;
+ }
+@@ -4860,9 +5072,10 @@ nfsd4_vbuf_to_stream(struct xdr_stream *xdr, char *buf, u32 buflen)
+
+ static __be32
+ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_getxattr *getxattr)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_getxattr *getxattr = &u->getxattr;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p, err;
+
+ p = xdr_reserve_space(xdr, 4);
+@@ -4884,9 +5097,10 @@ nfsd4_encode_getxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_setxattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_setxattr *setxattr)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_setxattr *setxattr = &u->setxattr;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 20);
+@@ -4925,9 +5139,10 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
+
+ static __be32
+ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_listxattrs *listxattrs)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_listxattrs *listxattrs = &u->listxattrs;
++ struct xdr_stream *xdr = resp->xdr;
+ u32 cookie_offset, count_offset, eof;
+ u32 left, xdrleft, slen, count;
+ u32 xdrlen, offset;
+@@ -5036,9 +5251,10 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
+
+ static __be32
+ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+- struct nfsd4_removexattr *removexattr)
++ union nfsd4_op_u *u)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct nfsd4_removexattr *removexattr = &u->removexattr;
++ struct xdr_stream *xdr = resp->xdr;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 20);
+@@ -5049,7 +5265,7 @@ nfsd4_encode_removexattr(struct nfsd4_compoundres *resp, __be32 nfserr,
+ return 0;
+ }
+
+-typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
++typedef __be32(*nfsd4_enc)(struct nfsd4_compoundres *, __be32, union nfsd4_op_u *u);
+
+ /*
+ * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
+@@ -5057,93 +5273,93 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
+ * done in the decoding phase.
+ */
+ static const nfsd4_enc nfsd4_enc_ops[] = {
+- [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
+- [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
+- [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit,
+- [OP_CREATE] = (nfsd4_enc)nfsd4_encode_create,
+- [OP_DELEGPURGE] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_DELEGRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_GETATTR] = (nfsd4_enc)nfsd4_encode_getattr,
+- [OP_GETFH] = (nfsd4_enc)nfsd4_encode_getfh,
+- [OP_LINK] = (nfsd4_enc)nfsd4_encode_link,
+- [OP_LOCK] = (nfsd4_enc)nfsd4_encode_lock,
+- [OP_LOCKT] = (nfsd4_enc)nfsd4_encode_lockt,
+- [OP_LOCKU] = (nfsd4_enc)nfsd4_encode_locku,
+- [OP_LOOKUP] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open,
+- [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm,
+- [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade,
+- [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_PUTPUBFH] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_PUTROOTFH] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_READ] = (nfsd4_enc)nfsd4_encode_read,
+- [OP_READDIR] = (nfsd4_enc)nfsd4_encode_readdir,
+- [OP_READLINK] = (nfsd4_enc)nfsd4_encode_readlink,
+- [OP_REMOVE] = (nfsd4_enc)nfsd4_encode_remove,
+- [OP_RENAME] = (nfsd4_enc)nfsd4_encode_rename,
+- [OP_RENEW] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_RESTOREFH] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_SAVEFH] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_SECINFO] = (nfsd4_enc)nfsd4_encode_secinfo,
+- [OP_SETATTR] = (nfsd4_enc)nfsd4_encode_setattr,
+- [OP_SETCLIENTID] = (nfsd4_enc)nfsd4_encode_setclientid,
+- [OP_SETCLIENTID_CONFIRM] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
+- [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
++ [OP_ACCESS] = nfsd4_encode_access,
++ [OP_CLOSE] = nfsd4_encode_close,
++ [OP_COMMIT] = nfsd4_encode_commit,
++ [OP_CREATE] = nfsd4_encode_create,
++ [OP_DELEGPURGE] = nfsd4_encode_noop,
++ [OP_DELEGRETURN] = nfsd4_encode_noop,
++ [OP_GETATTR] = nfsd4_encode_getattr,
++ [OP_GETFH] = nfsd4_encode_getfh,
++ [OP_LINK] = nfsd4_encode_link,
++ [OP_LOCK] = nfsd4_encode_lock,
++ [OP_LOCKT] = nfsd4_encode_lockt,
++ [OP_LOCKU] = nfsd4_encode_locku,
++ [OP_LOOKUP] = nfsd4_encode_noop,
++ [OP_LOOKUPP] = nfsd4_encode_noop,
++ [OP_NVERIFY] = nfsd4_encode_noop,
++ [OP_OPEN] = nfsd4_encode_open,
++ [OP_OPENATTR] = nfsd4_encode_noop,
++ [OP_OPEN_CONFIRM] = nfsd4_encode_open_confirm,
++ [OP_OPEN_DOWNGRADE] = nfsd4_encode_open_downgrade,
++ [OP_PUTFH] = nfsd4_encode_noop,
++ [OP_PUTPUBFH] = nfsd4_encode_noop,
++ [OP_PUTROOTFH] = nfsd4_encode_noop,
++ [OP_READ] = nfsd4_encode_read,
++ [OP_READDIR] = nfsd4_encode_readdir,
++ [OP_READLINK] = nfsd4_encode_readlink,
++ [OP_REMOVE] = nfsd4_encode_remove,
++ [OP_RENAME] = nfsd4_encode_rename,
++ [OP_RENEW] = nfsd4_encode_noop,
++ [OP_RESTOREFH] = nfsd4_encode_noop,
++ [OP_SAVEFH] = nfsd4_encode_noop,
++ [OP_SECINFO] = nfsd4_encode_secinfo,
++ [OP_SETATTR] = nfsd4_encode_setattr,
++ [OP_SETCLIENTID] = nfsd4_encode_setclientid,
++ [OP_SETCLIENTID_CONFIRM] = nfsd4_encode_noop,
++ [OP_VERIFY] = nfsd4_encode_noop,
++ [OP_WRITE] = nfsd4_encode_write,
++ [OP_RELEASE_LOCKOWNER] = nfsd4_encode_noop,
+
+ /* NFSv4.1 operations */
+- [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_bind_conn_to_session,
+- [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
+- [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
+- [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
++ [OP_BACKCHANNEL_CTL] = nfsd4_encode_noop,
++ [OP_BIND_CONN_TO_SESSION] = nfsd4_encode_bind_conn_to_session,
++ [OP_EXCHANGE_ID] = nfsd4_encode_exchange_id,
++ [OP_CREATE_SESSION] = nfsd4_encode_create_session,
++ [OP_DESTROY_SESSION] = nfsd4_encode_noop,
++ [OP_FREE_STATEID] = nfsd4_encode_noop,
++ [OP_GET_DIR_DELEGATION] = nfsd4_encode_noop,
+ #ifdef CONFIG_NFSD_PNFS
+- [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+- [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
+- [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
+- [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
++ [OP_GETDEVICEINFO] = nfsd4_encode_getdeviceinfo,
++ [OP_GETDEVICELIST] = nfsd4_encode_noop,
++ [OP_LAYOUTCOMMIT] = nfsd4_encode_layoutcommit,
++ [OP_LAYOUTGET] = nfsd4_encode_layoutget,
++ [OP_LAYOUTRETURN] = nfsd4_encode_layoutreturn,
+ #else
+- [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
++ [OP_GETDEVICEINFO] = nfsd4_encode_noop,
++ [OP_GETDEVICELIST] = nfsd4_encode_noop,
++ [OP_LAYOUTCOMMIT] = nfsd4_encode_noop,
++ [OP_LAYOUTGET] = nfsd4_encode_noop,
++ [OP_LAYOUTRETURN] = nfsd4_encode_noop,
+ #endif
+- [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
+- [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
+- [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_test_stateid,
+- [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
++ [OP_SECINFO_NO_NAME] = nfsd4_encode_secinfo_no_name,
++ [OP_SEQUENCE] = nfsd4_encode_sequence,
++ [OP_SET_SSV] = nfsd4_encode_noop,
++ [OP_TEST_STATEID] = nfsd4_encode_test_stateid,
++ [OP_WANT_DELEGATION] = nfsd4_encode_noop,
++ [OP_DESTROY_CLIENTID] = nfsd4_encode_noop,
++ [OP_RECLAIM_COMPLETE] = nfsd4_encode_noop,
+
+ /* NFSv4.2 operations */
+- [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_COPY] = (nfsd4_enc)nfsd4_encode_copy,
+- [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_copy_notify,
+- [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_offload_status,
+- [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_read_plus,
+- [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek,
+- [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop,
+- [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop,
++ [OP_ALLOCATE] = nfsd4_encode_noop,
++ [OP_COPY] = nfsd4_encode_copy,
++ [OP_COPY_NOTIFY] = nfsd4_encode_copy_notify,
++ [OP_DEALLOCATE] = nfsd4_encode_noop,
++ [OP_IO_ADVISE] = nfsd4_encode_noop,
++ [OP_LAYOUTERROR] = nfsd4_encode_noop,
++ [OP_LAYOUTSTATS] = nfsd4_encode_noop,
++ [OP_OFFLOAD_CANCEL] = nfsd4_encode_noop,
++ [OP_OFFLOAD_STATUS] = nfsd4_encode_offload_status,
++ [OP_READ_PLUS] = nfsd4_encode_read_plus,
++ [OP_SEEK] = nfsd4_encode_seek,
++ [OP_WRITE_SAME] = nfsd4_encode_noop,
++ [OP_CLONE] = nfsd4_encode_noop,
+
+ /* RFC 8276 extended atributes operations */
+- [OP_GETXATTR] = (nfsd4_enc)nfsd4_encode_getxattr,
+- [OP_SETXATTR] = (nfsd4_enc)nfsd4_encode_setxattr,
+- [OP_LISTXATTRS] = (nfsd4_enc)nfsd4_encode_listxattrs,
+- [OP_REMOVEXATTR] = (nfsd4_enc)nfsd4_encode_removexattr,
++ [OP_GETXATTR] = nfsd4_encode_getxattr,
++ [OP_SETXATTR] = nfsd4_encode_setxattr,
++ [OP_LISTXATTRS] = nfsd4_encode_listxattrs,
++ [OP_REMOVEXATTR] = nfsd4_encode_removexattr,
+ };
+
+ /*
+@@ -5178,7 +5394,7 @@ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 respsize)
+ void
+ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+ {
+- struct xdr_stream *xdr = &resp->xdr;
++ struct xdr_stream *xdr = resp->xdr;
+ struct nfs4_stateowner *so = resp->cstate.replay_owner;
+ struct svc_rqst *rqstp = resp->rqstp;
+ const struct nfsd4_operation *opdesc = op->opdesc;
+@@ -5187,10 +5403,8 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 8);
+- if (!p) {
+- WARN_ON_ONCE(1);
+- return;
+- }
++ if (!p)
++ goto release;
+ *p++ = cpu_to_be32(op->opnum);
+ post_err_offset = xdr->buf->len;
+
+@@ -5199,12 +5413,12 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+ if (op->status && opdesc &&
+ !(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE))
+ goto status;
+- BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
++ BUG_ON(op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
+ !nfsd4_enc_ops[op->opnum]);
+ encoder = nfsd4_enc_ops[op->opnum];
+ op->status = encoder(resp, op->status, &op->u);
+- if (opdesc && opdesc->op_release)
+- opdesc->op_release(&op->u);
++ if (op->status)
++ trace_nfsd_compound_encode_err(rqstp, op->opnum, op->status);
+ xdr_commit_encode(xdr);
+
+ /* nfsd4_check_resp_size guarantees enough room for error status */
+@@ -5244,8 +5458,10 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
+ so->so_replay.rp_buf, len);
+ }
+ status:
+- /* Note that op->status is already in network byte order: */
+- write_bytes_to_xdr_buf(xdr->buf, post_err_offset - 4, &op->status, 4);
++ *p = op->status;
++release:
++ if (opdesc && opdesc->op_release)
++ opdesc->op_release(&op->u);
+ }
+
+ /*
+@@ -5271,22 +5487,14 @@ nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
+ p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
+ }
+
+-int
+-nfs4svc_encode_voidres(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return xdr_ressize_check(rqstp, p);
+-}
+-
+ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
+ {
+ struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
+ if (args->ops != args->iops) {
+- kfree(args->ops);
++ vfree(args->ops);
+ args->ops = args->iops;
+ }
+- kfree(args->tmpp);
+- args->tmpp = NULL;
+ while (args->to_free) {
+ struct svcxdr_tmpbuf *tb = args->to_free;
+ args->to_free = tb->next;
+@@ -5294,57 +5502,44 @@ void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
+ }
+ }
+
+-int
+-nfs4svc_decode_voidarg(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return 1;
+-}
+-
+-int
+-nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd4_compoundargs *args = rqstp->rq_argp;
+
+- if (rqstp->rq_arg.head[0].iov_len % 4) {
+- /* client is nuts */
+- dprintk("%s: compound not properly padded! (peeraddr=%pISc xid=0x%x)",
+- __func__, svc_addr(rqstp), be32_to_cpu(rqstp->rq_xid));
+- return 0;
+- }
+- args->p = p;
+- args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len;
+- args->pagelist = rqstp->rq_arg.pages;
+- args->pagelen = rqstp->rq_arg.page_len;
+- args->tail = false;
+- args->tmpp = NULL;
++ /* svcxdr_tmp_alloc */
+ args->to_free = NULL;
++
++ args->xdr = xdr;
+ args->ops = args->iops;
+ args->rqstp = rqstp;
+
+- return !nfsd4_decode_compound(args);
++ return nfsd4_decode_compound(args);
+ }
+
+-int
+-nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+- struct xdr_buf *buf = resp->xdr.buf;
++ __be32 *p;
+
+- WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
+- buf->tail[0].iov_len);
++ /*
++ * Send buffer space for the following items is reserved
++ * at the top of nfsd4_proc_compound().
++ */
++ p = resp->statusp;
+
+- *p = resp->cstate.status;
++ *p++ = resp->cstate.status;
+
+- rqstp->rq_next_page = resp->xdr.page_ptr + 1;
++ rqstp->rq_next_page = xdr->page_ptr + 1;
+
+- p = resp->tagp;
+ *p++ = htonl(resp->taglen);
+ memcpy(p, resp->tag, resp->taglen);
+ p += XDR_QUADLEN(resp->taglen);
+ *p++ = htonl(resp->opcnt);
+
+ nfsd4_sequence_done(resp);
+- return 1;
++ return true;
+ }
+
+ /*
+diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
+index 80c90fc231a53..2b5417e06d80d 100644
+--- a/fs/nfsd/nfscache.c
++++ b/fs/nfsd/nfscache.c
+@@ -84,12 +84,6 @@ nfsd_hashsize(unsigned int limit)
+ return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
+ }
+
+-static u32
+-nfsd_cache_hash(__be32 xid, struct nfsd_net *nn)
+-{
+- return hash_32(be32_to_cpu(xid), nn->maskbits);
+-}
+-
+ static struct svc_cacherep *
+ nfsd_reply_cache_alloc(struct svc_rqst *rqstp, __wsum csum,
+ struct nfsd_net *nn)
+@@ -121,14 +115,14 @@ nfsd_reply_cache_free_locked(struct nfsd_drc_bucket *b, struct svc_cacherep *rp,
+ struct nfsd_net *nn)
+ {
+ if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
+- nn->drc_mem_usage -= rp->c_replvec.iov_len;
++ nfsd_stats_drc_mem_usage_sub(nn, rp->c_replvec.iov_len);
+ kfree(rp->c_replvec.iov_base);
+ }
+ if (rp->c_state != RC_UNUSED) {
+ rb_erase(&rp->c_node, &b->rb_head);
+ list_del(&rp->c_lru);
+ atomic_dec(&nn->num_drc_entries);
+- nn->drc_mem_usage -= sizeof(*rp);
++ nfsd_stats_drc_mem_usage_sub(nn, sizeof(*rp));
+ }
+ kmem_cache_free(drc_slab, rp);
+ }
+@@ -154,6 +148,16 @@ void nfsd_drc_slab_free(void)
+ kmem_cache_destroy(drc_slab);
+ }
+
++static int nfsd_reply_cache_stats_init(struct nfsd_net *nn)
++{
++ return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
++}
++
++static void nfsd_reply_cache_stats_destroy(struct nfsd_net *nn)
++{
++ nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
++}
++
+ int nfsd_reply_cache_init(struct nfsd_net *nn)
+ {
+ unsigned int hashsize;
+@@ -165,12 +169,16 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
+ hashsize = nfsd_hashsize(nn->max_drc_entries);
+ nn->maskbits = ilog2(hashsize);
+
++ status = nfsd_reply_cache_stats_init(nn);
++ if (status)
++ goto out_nomem;
++
+ nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan;
+ nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count;
+ nn->nfsd_reply_cache_shrinker.seeks = 1;
+ status = register_shrinker(&nn->nfsd_reply_cache_shrinker);
+ if (status)
+- goto out_nomem;
++ goto out_stats_destroy;
+
+ nn->drc_hashtbl = kvzalloc(array_size(hashsize,
+ sizeof(*nn->drc_hashtbl)), GFP_KERNEL);
+@@ -186,6 +194,8 @@ int nfsd_reply_cache_init(struct nfsd_net *nn)
+ return 0;
+ out_shrinker:
+ unregister_shrinker(&nn->nfsd_reply_cache_shrinker);
++out_stats_destroy:
++ nfsd_reply_cache_stats_destroy(nn);
+ out_nomem:
+ printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
+ return -ENOMEM;
+@@ -206,6 +216,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn)
+ rp, nn);
+ }
+ }
++ nfsd_reply_cache_stats_destroy(nn);
+
+ kvfree(nn->drc_hashtbl);
+ nn->drc_hashtbl = NULL;
+@@ -224,8 +235,16 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
+ list_move_tail(&rp->c_lru, &b->lru_head);
+ }
+
+-static long
+-prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
++static noinline struct nfsd_drc_bucket *
++nfsd_cache_bucket_find(__be32 xid, struct nfsd_net *nn)
++{
++ unsigned int hash = hash_32((__force u32)xid, nn->maskbits);
++
++ return &nn->drc_hashtbl[hash];
++}
++
++static long prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn,
++ unsigned int max)
+ {
+ struct svc_cacherep *rp, *tmp;
+ long freed = 0;
+@@ -241,11 +260,17 @@ prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
+ time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
+ break;
+ nfsd_reply_cache_free_locked(b, rp, nn);
+- freed++;
++ if (max && freed++ > max)
++ break;
+ }
+ return freed;
+ }
+
++static long nfsd_prune_bucket(struct nfsd_drc_bucket *b, struct nfsd_net *nn)
++{
++ return prune_bucket(b, nn, 3);
++}
++
+ /*
+ * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
+ * Also prune the oldest ones when the total exceeds the max number of entries.
+@@ -262,7 +287,7 @@ prune_cache_entries(struct nfsd_net *nn)
+ if (list_empty(&b->lru_head))
+ continue;
+ spin_lock(&b->cache_lock);
+- freed += prune_bucket(b, nn);
++ freed += prune_bucket(b, nn, 0);
+ spin_unlock(&b->cache_lock);
+ }
+ return freed;
+@@ -324,7 +349,7 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key,
+ {
+ if (key->c_key.k_xid == rp->c_key.k_xid &&
+ key->c_key.k_csum != rp->c_key.k_csum) {
+- ++nn->payload_misses;
++ nfsd_stats_payload_misses_inc(nn);
+ trace_nfsd_drc_mismatch(nn, key, rp);
+ }
+
+@@ -396,18 +421,16 @@ nfsd_cache_insert(struct nfsd_drc_bucket *b, struct svc_cacherep *key,
+ */
+ int nfsd_cache_lookup(struct svc_rqst *rqstp)
+ {
+- struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
++ struct nfsd_net *nn;
+ struct svc_cacherep *rp, *found;
+- __be32 xid = rqstp->rq_xid;
+ __wsum csum;
+- u32 hash = nfsd_cache_hash(xid, nn);
+- struct nfsd_drc_bucket *b = &nn->drc_hashtbl[hash];
++ struct nfsd_drc_bucket *b;
+ int type = rqstp->rq_cachetype;
+ int rtn = RC_DOIT;
+
+ rqstp->rq_cacherep = NULL;
+ if (type == RC_NOCACHE) {
+- nfsdstats.rcnocache++;
++ nfsd_stats_rc_nocache_inc();
+ goto out;
+ }
+
+@@ -417,27 +440,25 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
+ * Since the common case is a cache miss followed by an insert,
+ * preallocate an entry.
+ */
++ nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
+ if (!rp)
+ goto out;
+
++ b = nfsd_cache_bucket_find(rqstp->rq_xid, nn);
+ spin_lock(&b->cache_lock);
+ found = nfsd_cache_insert(b, rp, nn);
+- if (found != rp) {
+- nfsd_reply_cache_free_locked(NULL, rp, nn);
+- rp = found;
++ if (found != rp)
+ goto found_entry;
+- }
+
+- nfsdstats.rcmisses++;
++ nfsd_stats_rc_misses_inc();
+ rqstp->rq_cacherep = rp;
+ rp->c_state = RC_INPROG;
+
+ atomic_inc(&nn->num_drc_entries);
+- nn->drc_mem_usage += sizeof(*rp);
++ nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
+
+- /* go ahead and prune the cache */
+- prune_bucket(b, nn);
++ nfsd_prune_bucket(b, nn);
+
+ out_unlock:
+ spin_unlock(&b->cache_lock);
+@@ -446,8 +467,10 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp)
+
+ found_entry:
+ /* We found a matching entry which is either in progress or done. */
+- nfsdstats.rchits++;
++ nfsd_reply_cache_free_locked(NULL, rp, nn);
++ nfsd_stats_rc_hits_inc();
+ rtn = RC_DROPIT;
++ rp = found;
+
+ /* Request being processed */
+ if (rp->c_state == RC_INPROG)
+@@ -506,7 +529,6 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct svc_cacherep *rp = rqstp->rq_cacherep;
+ struct kvec *resv = &rqstp->rq_res.head[0], *cachv;
+- u32 hash;
+ struct nfsd_drc_bucket *b;
+ int len;
+ size_t bufsize = 0;
+@@ -514,8 +536,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+ if (!rp)
+ return;
+
+- hash = nfsd_cache_hash(rp->c_key.k_xid, nn);
+- b = &nn->drc_hashtbl[hash];
++ b = nfsd_cache_bucket_find(rp->c_key.k_xid, nn);
+
+ len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
+ len >>= 2;
+@@ -548,7 +569,7 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+ return;
+ }
+ spin_lock(&b->cache_lock);
+- nn->drc_mem_usage += bufsize;
++ nfsd_stats_drc_mem_usage_add(nn, bufsize);
+ lru_put_end(b, rp);
+ rp->c_secure = test_bit(RQ_SECURE, &rqstp->rq_flags);
+ rp->c_type = cachetype;
+@@ -582,28 +603,26 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
+ * scraping this file for info should test the labels to ensure they're
+ * getting the correct field.
+ */
+-static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
++int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
+ {
+- struct nfsd_net *nn = m->private;
++ struct nfsd_net *nn = net_generic(file_inode(m->file)->i_sb->s_fs_info,
++ nfsd_net_id);
+
+ seq_printf(m, "max entries: %u\n", nn->max_drc_entries);
+ seq_printf(m, "num entries: %u\n",
+- atomic_read(&nn->num_drc_entries));
++ atomic_read(&nn->num_drc_entries));
+ seq_printf(m, "hash buckets: %u\n", 1 << nn->maskbits);
+- seq_printf(m, "mem usage: %u\n", nn->drc_mem_usage);
+- seq_printf(m, "cache hits: %u\n", nfsdstats.rchits);
+- seq_printf(m, "cache misses: %u\n", nfsdstats.rcmisses);
+- seq_printf(m, "not cached: %u\n", nfsdstats.rcnocache);
+- seq_printf(m, "payload misses: %u\n", nn->payload_misses);
++ seq_printf(m, "mem usage: %lld\n",
++ percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE]));
++ seq_printf(m, "cache hits: %lld\n",
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]));
++ seq_printf(m, "cache misses: %lld\n",
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]));
++ seq_printf(m, "not cached: %lld\n",
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]));
++ seq_printf(m, "payload misses: %lld\n",
++ percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES]));
+ seq_printf(m, "longest chain len: %u\n", nn->longest_chain);
+ seq_printf(m, "cachesize at longest: %u\n", nn->longest_chain_cachesize);
+ return 0;
+ }
+-
+-int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
+-{
+- struct nfsd_net *nn = net_generic(file_inode(file)->i_sb->s_fs_info,
+- nfsd_net_id);
+-
+- return single_open(file, nfsd_reply_cache_stats_show, nn);
+-}
+diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
+index c4b11560ac1b6..f77f00c931723 100644
+--- a/fs/nfsd/nfsctl.c
++++ b/fs/nfsd/nfsctl.c
+@@ -25,6 +25,7 @@
+ #include "state.h"
+ #include "netns.h"
+ #include "pnfs.h"
++#include "filecache.h"
+
+ /*
+ * We have a single directory with several nodes in it.
+@@ -32,6 +33,7 @@
+ enum {
+ NFSD_Root = 1,
+ NFSD_List,
++ NFSD_Export_Stats,
+ NFSD_Export_features,
+ NFSD_Fh,
+ NFSD_FO_UnlockIP,
+@@ -44,6 +46,7 @@ enum {
+ NFSD_Ports,
+ NFSD_MaxBlkSize,
+ NFSD_MaxConnections,
++ NFSD_Filecache,
+ NFSD_SupportedEnctypes,
+ /*
+ * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
+@@ -182,17 +185,7 @@ static int export_features_show(struct seq_file *m, void *v)
+ return 0;
+ }
+
+-static int export_features_open(struct inode *inode, struct file *file)
+-{
+- return single_open(file, export_features_show, NULL);
+-}
+-
+-static const struct file_operations export_features_operations = {
+- .open = export_features_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = single_release,
+-};
++DEFINE_SHOW_ATTRIBUTE(export_features);
+
+ #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
+ static int supported_enctypes_show(struct seq_file *m, void *v)
+@@ -201,17 +194,7 @@ static int supported_enctypes_show(struct seq_file *m, void *v)
+ return 0;
+ }
+
+-static int supported_enctypes_open(struct inode *inode, struct file *file)
+-{
+- return single_open(file, supported_enctypes_show, NULL);
+-}
+-
+-static const struct file_operations supported_enctypes_ops = {
+- .open = supported_enctypes_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = single_release,
+-};
++DEFINE_SHOW_ATTRIBUTE(supported_enctypes);
+ #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
+
+ static const struct file_operations pool_stats_operations = {
+@@ -221,12 +204,9 @@ static const struct file_operations pool_stats_operations = {
+ .release = nfsd_pool_stats_release,
+ };
+
+-static const struct file_operations reply_cache_stats_operations = {
+- .open = nfsd_reply_cache_stats_open,
+- .read = seq_read,
+- .llseek = seq_lseek,
+- .release = single_release,
+-};
++DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats);
++
++DEFINE_SHOW_ATTRIBUTE(nfsd_file_cache_stats);
+
+ /*----------------------------------------------------------------------------*/
+ /*
+@@ -394,12 +374,12 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
+ auth_domain_put(dom);
+ if (len)
+ return len;
+-
++
+ mesg = buf;
+ len = SIMPLE_TRANSACTION_LIMIT;
+- qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size);
++ qword_addhex(&mesg, &len, fh.fh_raw, fh.fh_size);
+ mesg[-1] = '\n';
+- return mesg - buf;
++ return mesg - buf;
+ }
+
+ /*
+@@ -601,7 +581,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
+
+ cmd = sign == '-' ? NFSD_CLEAR : NFSD_SET;
+ switch(num) {
++#ifdef CONFIG_NFSD_V2
+ case 2:
++#endif
+ case 3:
+ nfsd_vers(nn, num, cmd);
+ break;
+@@ -621,7 +603,9 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
+ }
+ break;
+ default:
+- return -EINVAL;
++ /* Ignore requests to disable non-existent versions */
++ if (cmd == NFSD_SET)
++ return -EINVAL;
+ }
+ vers += len + 1;
+ } while ((len = qword_get(&mesg, vers, size)) > 0);
+@@ -632,7 +616,6 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
+ }
+
+ /* Now write current state into reply buffer */
+- len = 0;
+ sep = "";
+ remaining = SIMPLE_TRANSACTION_LIMIT;
+ for (num=2 ; num <= 4 ; num++) {
+@@ -726,28 +709,25 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
+ char *mesg = buf;
+ int fd, err;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
++ struct svc_serv *serv;
+
+ err = get_int(&mesg, &fd);
+ if (err != 0 || fd < 0)
+ return -EINVAL;
+
+- if (svc_alien_sock(net, fd)) {
+- printk(KERN_ERR "%s: socket net is different to NFSd's one\n", __func__);
+- return -EINVAL;
+- }
+-
+ err = nfsd_create_serv(net);
+ if (err != 0)
+ return err;
+
+- err = svc_addsock(nn->nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
+- if (err < 0) {
+- nfsd_destroy(net);
+- return err;
+- }
++ serv = nn->nfsd_serv;
++ err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
++
++ if (err < 0 && !serv->sv_nrthreads && !nn->keep_active)
++ nfsd_last_thread(net);
++ else if (err >= 0 && !serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
++ svc_get(serv);
+
+- /* Decrease the count, but don't shut down the service */
+- nn->nfsd_serv->sv_nrthreads--;
++ svc_put(serv);
+ return err;
+ }
+
+@@ -761,6 +741,7 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
+ struct svc_xprt *xprt;
+ int port, err;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
++ struct svc_serv *serv;
+
+ if (sscanf(buf, "%15s %5u", transport, &port) != 2)
+ return -EINVAL;
+@@ -772,30 +753,33 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
+ if (err != 0)
+ return err;
+
+- err = svc_create_xprt(nn->nfsd_serv, transport, net,
+- PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
++ serv = nn->nfsd_serv;
++ err = svc_xprt_create(serv, transport, net,
++ PF_INET, port, SVC_SOCK_ANONYMOUS, cred);
+ if (err < 0)
+ goto out_err;
+
+- err = svc_create_xprt(nn->nfsd_serv, transport, net,
+- PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
++ err = svc_xprt_create(serv, transport, net,
++ PF_INET6, port, SVC_SOCK_ANONYMOUS, cred);
+ if (err < 0 && err != -EAFNOSUPPORT)
+ goto out_close;
+
+- /* Decrease the count, but don't shut down the service */
+- nn->nfsd_serv->sv_nrthreads--;
++ if (!serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
++ svc_get(serv);
++
++ svc_put(serv);
+ return 0;
+ out_close:
+- xprt = svc_find_xprt(nn->nfsd_serv, transport, net, PF_INET, port);
++ xprt = svc_find_xprt(serv, transport, net, PF_INET, port);
+ if (xprt != NULL) {
+- svc_close_xprt(xprt);
++ svc_xprt_close(xprt);
+ svc_xprt_put(xprt);
+ }
+ out_err:
+- if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+- nn->nfsd_serv->sv_nrthreads--;
+- else
+- nfsd_destroy(net);
++ if (!serv->sv_nrthreads && !nn->keep_active)
++ nfsd_last_thread(net);
++
++ svc_put(serv);
+ return err;
+ }
+
+@@ -1168,6 +1152,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode)
+ inode->i_fop = &simple_dir_operations;
+ inode->i_op = &simple_dir_inode_operations;
+ inc_nlink(inode);
++ break;
+ default:
+ break;
+ }
+@@ -1269,7 +1254,8 @@ static void nfsdfs_remove_files(struct dentry *root)
+ /* XXX: cut'n'paste from simple_fill_super; figure out if we could share
+ * code instead. */
+ static int nfsdfs_create_files(struct dentry *root,
+- const struct tree_descr *files)
++ const struct tree_descr *files,
++ struct dentry **fdentries)
+ {
+ struct inode *dir = d_inode(root);
+ struct inode *inode;
+@@ -1278,8 +1264,6 @@ static int nfsdfs_create_files(struct dentry *root,
+
+ inode_lock(dir);
+ for (i = 0; files->name && files->name[0]; i++, files++) {
+- if (!files->name)
+- continue;
+ dentry = d_alloc_name(root, files->name);
+ if (!dentry)
+ goto out;
+@@ -1293,6 +1277,8 @@ static int nfsdfs_create_files(struct dentry *root,
+ inode->i_private = __get_nfsdfs_client(dir);
+ d_add(dentry, inode);
+ fsnotify_create(dir, dentry);
++ if (fdentries)
++ fdentries[i] = dentry;
+ }
+ inode_unlock(dir);
+ return 0;
+@@ -1304,8 +1290,9 @@ static int nfsdfs_create_files(struct dentry *root,
+
+ /* on success, returns positive number unique to that client. */
+ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
+- struct nfsdfs_client *ncl, u32 id,
+- const struct tree_descr *files)
++ struct nfsdfs_client *ncl, u32 id,
++ const struct tree_descr *files,
++ struct dentry **fdentries)
+ {
+ struct dentry *dentry;
+ char name[11];
+@@ -1316,7 +1303,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
+ dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
+ if (IS_ERR(dentry)) /* XXX: tossing errors? */
+ return NULL;
+- ret = nfsdfs_create_files(dentry, files);
++ ret = nfsdfs_create_files(dentry, files, fdentries);
+ if (ret) {
+ nfsd_client_rmdir(dentry);
+ return NULL;
+@@ -1352,8 +1339,10 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
+
+ static const struct tree_descr nfsd_files[] = {
+ [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO},
++ /* Per-export io stats use same ops as exports file */
++ [NFSD_Export_Stats] = {"export_stats", &exports_nfsd_operations, S_IRUGO},
+ [NFSD_Export_features] = {"export_features",
+- &export_features_operations, S_IRUGO},
++ &export_features_fops, S_IRUGO},
+ [NFSD_FO_UnlockIP] = {"unlock_ip",
+ &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_FO_UnlockFS] = {"unlock_filesystem",
+@@ -1362,13 +1351,16 @@ static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
+ [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
+- [NFSD_Reply_Cache_Stats] = {"reply_cache_stats", &reply_cache_stats_operations, S_IRUGO},
++ [NFSD_Reply_Cache_Stats] = {"reply_cache_stats",
++ &nfsd_reply_cache_stats_fops, S_IRUGO},
+ [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
+ [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
+ [NFSD_MaxConnections] = {"max_connections", &transaction_ops, S_IWUSR|S_IRUGO},
++ [NFSD_Filecache] = {"filecache", &nfsd_file_cache_stats_fops, S_IRUGO},
+ #if defined(CONFIG_SUNRPC_GSS) || defined(CONFIG_SUNRPC_GSS_MODULE)
+- [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes", &supported_enctypes_ops, S_IRUGO},
++ [NFSD_SupportedEnctypes] = {"supported_krb5_enctypes",
++ &supported_enctypes_fops, S_IRUGO},
+ #endif /* CONFIG_SUNRPC_GSS or CONFIG_SUNRPC_GSS_MODULE */
+ #ifdef CONFIG_NFSD_V4
+ [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR},
+@@ -1468,25 +1460,16 @@ static __net_init int nfsd_init_net(struct net *net)
+ goto out_idmap_error;
+ nn->nfsd_versions = NULL;
+ nn->nfsd4_minorversions = NULL;
++ nfsd4_init_leases_net(nn);
+ retval = nfsd_reply_cache_init(nn);
+ if (retval)
+- goto out_drc_error;
+- nn->nfsd4_lease = 90; /* default lease time */
+- nn->nfsd4_grace = 90;
+- nn->somebody_reclaimed = false;
+- nn->track_reclaim_completes = false;
+- nn->clverifier_counter = prandom_u32();
+- nn->clientid_base = prandom_u32();
+- nn->clientid_counter = nn->clientid_base + 1;
+- nn->s2s_cp_cl_id = nn->clientid_counter++;
+-
+- atomic_set(&nn->ntf_refcnt, 0);
+- init_waitqueue_head(&nn->ntf_wq);
+- seqlock_init(&nn->boot_lock);
++ goto out_cache_error;
++ get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
++ seqlock_init(&nn->writeverf_lock);
+
+ return 0;
+
+-out_drc_error:
++out_cache_error:
+ nfsd_idmap_shutdown(net);
+ out_idmap_error:
+ nfsd_export_shutdown(net);
+@@ -1514,7 +1497,6 @@ static struct pernet_operations nfsd_net_ops = {
+ static int __init init_nfsd(void)
+ {
+ int retval;
+- printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n");
+
+ retval = nfsd4_init_slabs();
+ if (retval)
+@@ -1522,7 +1504,9 @@ static int __init init_nfsd(void)
+ retval = nfsd4_init_pnfs();
+ if (retval)
+ goto out_free_slabs;
+- nfsd_stat_init(); /* Statistics */
++ retval = nfsd_stat_init(); /* Statistics */
++ if (retval)
++ goto out_free_pnfs;
+ retval = nfsd_drc_slab_create();
+ if (retval)
+ goto out_free_stat;
+@@ -1530,20 +1514,25 @@ static int __init init_nfsd(void)
+ retval = create_proc_exports_entry();
+ if (retval)
+ goto out_free_lockd;
+- retval = register_filesystem(&nfsd_fs_type);
+- if (retval)
+- goto out_free_exports;
+ retval = register_pernet_subsys(&nfsd_net_ops);
+ if (retval < 0)
+- goto out_free_filesystem;
++ goto out_free_exports;
+ retval = register_cld_notifier();
++ if (retval)
++ goto out_free_subsys;
++ retval = nfsd4_create_laundry_wq();
++ if (retval)
++ goto out_free_cld;
++ retval = register_filesystem(&nfsd_fs_type);
+ if (retval)
+ goto out_free_all;
+ return 0;
+ out_free_all:
++ nfsd4_destroy_laundry_wq();
++out_free_cld:
++ unregister_cld_notifier();
++out_free_subsys:
+ unregister_pernet_subsys(&nfsd_net_ops);
+-out_free_filesystem:
+- unregister_filesystem(&nfsd_fs_type);
+ out_free_exports:
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
+@@ -1552,6 +1541,7 @@ static int __init init_nfsd(void)
+ nfsd_drc_slab_free();
+ out_free_stat:
+ nfsd_stat_shutdown();
++out_free_pnfs:
+ nfsd4_exit_pnfs();
+ out_free_slabs:
+ nfsd4_free_slabs();
+@@ -1560,6 +1550,8 @@ static int __init init_nfsd(void)
+
+ static void __exit exit_nfsd(void)
+ {
++ unregister_filesystem(&nfsd_fs_type);
++ nfsd4_destroy_laundry_wq();
+ unregister_cld_notifier();
+ unregister_pernet_subsys(&nfsd_net_ops);
+ nfsd_drc_slab_free();
+@@ -1569,7 +1561,6 @@ static void __exit exit_nfsd(void)
+ nfsd_lockd_shutdown();
+ nfsd4_free_slabs();
+ nfsd4_exit_pnfs();
+- unregister_filesystem(&nfsd_fs_type);
+ }
+
+ MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
+diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
+index 4362d295ed340..013bfa24ced21 100644
+--- a/fs/nfsd/nfsd.h
++++ b/fs/nfsd/nfsd.h
+@@ -24,8 +24,8 @@
+ #include <uapi/linux/nfsd/debug.h>
+
+ #include "netns.h"
+-#include "stats.h"
+ #include "export.h"
++#include "stats.h"
+
+ #undef ifdebug
+ #ifdef CONFIG_SUNRPC_DEBUG
+@@ -64,8 +64,7 @@ struct readdir_cd {
+
+
+ extern struct svc_program nfsd_program;
+-extern const struct svc_version nfsd_version2, nfsd_version3,
+- nfsd_version4;
++extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4;
+ extern struct mutex nfsd_mutex;
+ extern spinlock_t nfsd_drc_lock;
+ extern unsigned long nfsd_drc_max_mem;
+@@ -73,6 +72,16 @@ extern unsigned long nfsd_drc_mem_used;
+
+ extern const struct seq_operations nfs_exports_op;
+
++/*
++ * Common void argument and result helpers
++ */
++struct nfsd_voidargs { };
++struct nfsd_voidres { };
++bool nfssvc_decode_voidarg(struct svc_rqst *rqstp,
++ struct xdr_stream *xdr);
++bool nfssvc_encode_voidres(struct svc_rqst *rqstp,
++ struct xdr_stream *xdr);
++
+ /*
+ * Function prototypes.
+ */
+@@ -87,8 +96,6 @@ int nfsd_pool_stats_open(struct inode *, struct file *);
+ int nfsd_pool_stats_release(struct inode *, struct file *);
+ void nfsd_shutdown_threads(struct net *net);
+
+-void nfsd_destroy(struct net *net);
+-
+ bool i_am_nfsd(void);
+
+ struct nfsdfs_client {
+@@ -98,7 +105,9 @@ struct nfsdfs_client {
+
+ struct nfsdfs_client *get_nfsdfs_client(struct inode *);
+ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
+- struct nfsdfs_client *ncl, u32 id, const struct tree_descr *);
++ struct nfsdfs_client *ncl, u32 id,
++ const struct tree_descr *,
++ struct dentry **fdentries);
+ void nfsd_client_rmdir(struct dentry *dentry);
+
+
+@@ -122,6 +131,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change);
+ int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change);
+ void nfsd_reset_versions(struct nfsd_net *nn);
+ int nfsd_create_serv(struct net *net);
++void nfsd_last_thread(struct net *net);
+
+ extern int nfsd_max_blksize;
+
+@@ -150,6 +160,9 @@ void nfs4_state_shutdown_net(struct net *net);
+ int nfs4_reset_recoverydir(char *recdir);
+ char * nfs4_recoverydir(void);
+ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
++int nfsd4_create_laundry_wq(void);
++void nfsd4_destroy_laundry_wq(void);
++bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp, struct inode *inode);
+ #else
+ static inline int nfsd4_init_slabs(void) { return 0; }
+ static inline void nfsd4_free_slabs(void) { }
+@@ -163,6 +176,13 @@ static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+ {
+ return false;
+ }
++static inline int nfsd4_create_laundry_wq(void) { return 0; };
++static inline void nfsd4_destroy_laundry_wq(void) {};
++static inline bool nfsd_wait_for_delegreturn(struct svc_rqst *rqstp,
++ struct inode *inode)
++{
++ return false;
++}
+ #endif
+
+ /*
+@@ -324,6 +344,10 @@ void nfsd_lockd_shutdown(void);
+ #define COMPOUND_ERR_SLACK_SPACE 16 /* OP_SETATTR */
+
+ #define NFSD_LAUNDROMAT_MINTIMEOUT 1 /* seconds */
++#define NFSD_COURTESY_CLIENT_TIMEOUT (24 * 60 * 60) /* seconds */
++#define NFSD_CLIENT_MAX_TRIM_PER_RUN 128
++#define NFS4_CLIENTS_PER_GB 1024
++#define NFSD_DELEGRETURN_TIMEOUT (HZ / 34) /* 30ms */
+
+ /*
+ * The following attributes are currently not supported by the NFSv4 server:
+@@ -352,7 +376,7 @@ void nfsd_lockd_shutdown(void);
+ | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV \
+ | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \
+ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \
+- | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \
++ | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_CREATE \
+ | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
+
+ #define NFSD4_SUPPORTED_ATTRS_WORD2 0
+@@ -386,7 +410,6 @@ void nfsd_lockd_shutdown(void);
+
+ #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \
+ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \
+- FATTR4_WORD2_CHANGE_ATTR_TYPE | \
+ FATTR4_WORD2_MODE_UMASK | \
+ NFSD4_2_SECURITY_ATTRS | \
+ FATTR4_WORD2_XATTR_SUPPORT)
+@@ -449,7 +472,8 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
+ (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL)
+ #define NFSD_WRITEABLE_ATTRS_WORD1 \
+ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \
+- | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET)
++ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_CREATE \
++ | FATTR4_WORD1_TIME_MODIFY_SET)
+ #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+ #define MAYBE_FATTR4_WORD2_SECURITY_LABEL \
+ FATTR4_WORD2_SECURITY_LABEL
+@@ -475,12 +499,20 @@ static inline bool nfsd_attrs_supported(u32 minorversion, const u32 *bmval)
+ extern int nfsd4_is_junction(struct dentry *dentry);
+ extern int register_cld_notifier(void);
+ extern void unregister_cld_notifier(void);
++#ifdef CONFIG_NFSD_V4_2_INTER_SSC
++extern void nfsd4_ssc_init_umount_work(struct nfsd_net *nn);
++#endif
++
++extern void nfsd4_init_leases_net(struct nfsd_net *nn);
++
+ #else /* CONFIG_NFSD_V4 */
+ static inline int nfsd4_is_junction(struct dentry *dentry)
+ {
+ return 0;
+ }
+
++static inline void nfsd4_init_leases_net(struct nfsd_net *nn) { };
++
+ #define register_cld_notifier() 0
+ #define unregister_cld_notifier() do { } while(0)
+
+diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
+index c81dbbad87920..db8d62632a5be 100644
+--- a/fs/nfsd/nfsfh.c
++++ b/fs/nfsd/nfsfh.c
+@@ -153,11 +153,12 @@ static inline __be32 check_pseudo_root(struct svc_rqst *rqstp,
+ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ {
+ struct knfsd_fh *fh = &fhp->fh_handle;
+- struct fid *fid = NULL, sfid;
++ struct fid *fid = NULL;
+ struct svc_export *exp;
+ struct dentry *dentry;
+ int fileid_type;
+ int data_left = fh->fh_size/4;
++ int len;
+ __be32 error;
+
+ error = nfserr_stale;
+@@ -166,48 +167,35 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ if (rqstp->rq_vers == 4 && fh->fh_size == 0)
+ return nfserr_nofilehandle;
+
+- if (fh->fh_version == 1) {
+- int len;
+-
+- if (--data_left < 0)
+- return error;
+- if (fh->fh_auth_type != 0)
+- return error;
+- len = key_len(fh->fh_fsid_type) / 4;
+- if (len == 0)
+- return error;
+- if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
+- /* deprecated, convert to type 3 */
+- len = key_len(FSID_ENCODE_DEV)/4;
+- fh->fh_fsid_type = FSID_ENCODE_DEV;
+- /*
+- * struct knfsd_fh uses host-endian fields, which are
+- * sometimes used to hold net-endian values. This
+- * confuses sparse, so we must use __force here to
+- * keep it from complaining.
+- */
+- fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
+- ntohl((__force __be32)fh->fh_fsid[1])));
+- fh->fh_fsid[1] = fh->fh_fsid[2];
+- }
+- data_left -= len;
+- if (data_left < 0)
+- return error;
+- exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
+- fid = (struct fid *)(fh->fh_fsid + len);
+- } else {
+- __u32 tfh[2];
+- dev_t xdev;
+- ino_t xino;
+-
+- if (fh->fh_size != NFS_FHSIZE)
+- return error;
+- /* assume old filehandle format */
+- xdev = old_decode_dev(fh->ofh_xdev);
+- xino = u32_to_ino_t(fh->ofh_xino);
+- mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL);
+- exp = rqst_exp_find(rqstp, FSID_DEV, tfh);
++ if (fh->fh_version != 1)
++ return error;
++
++ if (--data_left < 0)
++ return error;
++ if (fh->fh_auth_type != 0)
++ return error;
++ len = key_len(fh->fh_fsid_type) / 4;
++ if (len == 0)
++ return error;
++ if (fh->fh_fsid_type == FSID_MAJOR_MINOR) {
++ /* deprecated, convert to type 3 */
++ len = key_len(FSID_ENCODE_DEV)/4;
++ fh->fh_fsid_type = FSID_ENCODE_DEV;
++ /*
++ * struct knfsd_fh uses host-endian fields, which are
++ * sometimes used to hold net-endian values. This
++ * confuses sparse, so we must use __force here to
++ * keep it from complaining.
++ */
++ fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl((__force __be32)fh->fh_fsid[0]),
++ ntohl((__force __be32)fh->fh_fsid[1])));
++ fh->fh_fsid[1] = fh->fh_fsid[2];
+ }
++ data_left -= len;
++ if (data_left < 0)
++ return error;
++ exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_fsid);
++ fid = (struct fid *)(fh->fh_fsid + len);
+
+ error = nfserr_stale;
+ if (IS_ERR(exp)) {
+@@ -252,28 +240,25 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ if (rqstp->rq_vers > 2)
+ error = nfserr_badhandle;
+
+- if (fh->fh_version != 1) {
+- sfid.i32.ino = fh->ofh_ino;
+- sfid.i32.gen = fh->ofh_generation;
+- sfid.i32.parent_ino = fh->ofh_dirino;
+- fid = &sfid;
+- data_left = 3;
+- if (fh->ofh_dirino == 0)
+- fileid_type = FILEID_INO32_GEN;
+- else
+- fileid_type = FILEID_INO32_GEN_PARENT;
+- } else
+- fileid_type = fh->fh_fileid_type;
++ fileid_type = fh->fh_fileid_type;
+
+ if (fileid_type == FILEID_ROOT)
+ dentry = dget(exp->ex_path.dentry);
+ else {
+- dentry = exportfs_decode_fh(exp->ex_path.mnt, fid,
+- data_left, fileid_type,
+- nfsd_acceptable, exp);
+- if (IS_ERR_OR_NULL(dentry))
++ dentry = exportfs_decode_fh_raw(exp->ex_path.mnt, fid,
++ data_left, fileid_type,
++ nfsd_acceptable, exp);
++ if (IS_ERR_OR_NULL(dentry)) {
+ trace_nfsd_set_fh_dentry_badhandle(rqstp, fhp,
+ dentry ? PTR_ERR(dentry) : -ESTALE);
++ switch (PTR_ERR(dentry)) {
++ case -ENOMEM:
++ case -ETIMEDOUT:
++ break;
++ default:
++ dentry = ERR_PTR(-ESTALE);
++ }
++ }
+ }
+ if (dentry == NULL)
+ goto out;
+@@ -291,6 +276,20 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+
+ fhp->fh_dentry = dentry;
+ fhp->fh_export = exp;
++
++ switch (rqstp->rq_vers) {
++ case 4:
++ if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOATOMIC_ATTR)
++ fhp->fh_no_atomic_attr = true;
++ break;
++ case 3:
++ if (dentry->d_sb->s_export_op->flags & EXPORT_OP_NOWCC)
++ fhp->fh_no_wcc = true;
++ break;
++ case 2:
++ fhp->fh_no_wcc = true;
++ }
++
+ return 0;
+ out:
+ exp_put(exp);
+@@ -327,7 +326,7 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp)
+ __be32
+ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
+ {
+- struct svc_export *exp;
++ struct svc_export *exp = NULL;
+ struct dentry *dentry;
+ __be32 error;
+
+@@ -400,7 +399,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
+ }
+ out:
+ if (error == nfserr_stale)
+- nfsdstats.fh_stale++;
++ nfsd_stats_fh_stale_inc(exp);
+ return error;
+ }
+
+@@ -429,20 +428,6 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
+ }
+ }
+
+-/*
+- * for composing old style file handles
+- */
+-static inline void _fh_update_old(struct dentry *dentry,
+- struct svc_export *exp,
+- struct knfsd_fh *fh)
+-{
+- fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino);
+- fh->ofh_generation = d_inode(dentry)->i_generation;
+- if (d_is_dir(dentry) ||
+- (exp->ex_flags & NFSEXP_NOSUBTREECHECK))
+- fh->ofh_dirino = 0;
+-}
+-
+ static bool is_root_export(struct svc_export *exp)
+ {
+ return exp->ex_path.dentry == exp->ex_path.dentry->d_sb->s_root;
+@@ -539,9 +524,6 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+ /* ref_fh is a reference file handle.
+ * if it is non-null and for the same filesystem, then we should compose
+ * a filehandle which is of the same version, where possible.
+- * Currently, that means that if ref_fh->fh_handle.fh_version == 0xca
+- * Then create a 32byte filehandle using nfs_fhbase_old
+- *
+ */
+
+ struct inode * inode = d_inode(dentry);
+@@ -559,10 +541,13 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+ */
+ set_version_and_fsid_type(fhp, exp, ref_fh);
+
++ /* If we have a ref_fh, then copy the fh_no_wcc setting from it. */
++ fhp->fh_no_wcc = ref_fh ? ref_fh->fh_no_wcc : false;
++
+ if (ref_fh == fhp)
+ fh_put(ref_fh);
+
+- if (fhp->fh_locked || fhp->fh_dentry) {
++ if (fhp->fh_dentry) {
+ printk(KERN_ERR "fh_compose: fh %pd2 not initialized!\n",
+ dentry);
+ }
+@@ -574,35 +559,21 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
+ fhp->fh_dentry = dget(dentry); /* our internal copy */
+ fhp->fh_export = exp_get(exp);
+
+- if (fhp->fh_handle.fh_version == 0xca) {
+- /* old style filehandle please */
+- memset(&fhp->fh_handle.fh_base, 0, NFS_FHSIZE);
+- fhp->fh_handle.fh_size = NFS_FHSIZE;
+- fhp->fh_handle.ofh_dcookie = 0xfeebbaca;
+- fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev);
+- fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev;
+- fhp->fh_handle.ofh_xino =
+- ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino);
+- fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry));
+- if (inode)
+- _fh_update_old(dentry, exp, &fhp->fh_handle);
+- } else {
+- fhp->fh_handle.fh_size =
+- key_len(fhp->fh_handle.fh_fsid_type) + 4;
+- fhp->fh_handle.fh_auth_type = 0;
+-
+- mk_fsid(fhp->fh_handle.fh_fsid_type,
+- fhp->fh_handle.fh_fsid,
+- ex_dev,
+- d_inode(exp->ex_path.dentry)->i_ino,
+- exp->ex_fsid, exp->ex_uuid);
+-
+- if (inode)
+- _fh_update(fhp, exp, dentry);
+- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
+- fh_put(fhp);
+- return nfserr_opnotsupp;
+- }
++ fhp->fh_handle.fh_size =
++ key_len(fhp->fh_handle.fh_fsid_type) + 4;
++ fhp->fh_handle.fh_auth_type = 0;
++
++ mk_fsid(fhp->fh_handle.fh_fsid_type,
++ fhp->fh_handle.fh_fsid,
++ ex_dev,
++ d_inode(exp->ex_path.dentry)->i_ino,
++ exp->ex_fsid, exp->ex_uuid);
++
++ if (inode)
++ _fh_update(fhp, exp, dentry);
++ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID) {
++ fh_put(fhp);
++ return nfserr_opnotsupp;
+ }
+
+ return 0;
+@@ -623,16 +594,12 @@ fh_update(struct svc_fh *fhp)
+ dentry = fhp->fh_dentry;
+ if (d_really_is_negative(dentry))
+ goto out_negative;
+- if (fhp->fh_handle.fh_version != 1) {
+- _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle);
+- } else {
+- if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
+- return 0;
++ if (fhp->fh_handle.fh_fileid_type != FILEID_ROOT)
++ return 0;
+
+- _fh_update(fhp, fhp->fh_export, dentry);
+- if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
+- return nfserr_opnotsupp;
+- }
++ _fh_update(fhp, fhp->fh_export, dentry);
++ if (fhp->fh_handle.fh_fileid_type == FILEID_INVALID)
++ return nfserr_opnotsupp;
+ return 0;
+ out_bad:
+ printk(KERN_ERR "fh_update: fh not verified!\n");
+@@ -643,6 +610,85 @@ fh_update(struct svc_fh *fhp)
+ return nfserr_serverfault;
+ }
+
++/**
++ * fh_fill_pre_attrs - Fill in pre-op attributes
++ * @fhp: file handle to be updated
++ *
++ */
++void fh_fill_pre_attrs(struct svc_fh *fhp)
++{
++ bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
++ struct inode *inode;
++ struct kstat stat;
++ __be32 err;
++
++ if (fhp->fh_no_wcc || fhp->fh_pre_saved)
++ return;
++
++ inode = d_inode(fhp->fh_dentry);
++ err = fh_getattr(fhp, &stat);
++ if (err) {
++ /* Grab the times from inode anyway */
++ stat.mtime = inode->i_mtime;
++ stat.ctime = inode->i_ctime;
++ stat.size = inode->i_size;
++ }
++ if (v4)
++ fhp->fh_pre_change = nfsd4_change_attribute(&stat, inode);
++
++ fhp->fh_pre_mtime = stat.mtime;
++ fhp->fh_pre_ctime = stat.ctime;
++ fhp->fh_pre_size = stat.size;
++ fhp->fh_pre_saved = true;
++}
++
++/**
++ * fh_fill_post_attrs - Fill in post-op attributes
++ * @fhp: file handle to be updated
++ *
++ */
++void fh_fill_post_attrs(struct svc_fh *fhp)
++{
++ bool v4 = (fhp->fh_maxsize == NFS4_FHSIZE);
++ struct inode *inode = d_inode(fhp->fh_dentry);
++ __be32 err;
++
++ if (fhp->fh_no_wcc)
++ return;
++
++ if (fhp->fh_post_saved)
++ printk("nfsd: inode locked twice during operation.\n");
++
++ err = fh_getattr(fhp, &fhp->fh_post_attr);
++ if (err) {
++ fhp->fh_post_saved = false;
++ fhp->fh_post_attr.ctime = inode->i_ctime;
++ } else
++ fhp->fh_post_saved = true;
++ if (v4)
++ fhp->fh_post_change =
++ nfsd4_change_attribute(&fhp->fh_post_attr, inode);
++}
++
++/**
++ * fh_fill_both_attrs - Fill pre-op and post-op attributes
++ * @fhp: file handle to be updated
++ *
++ * This is used when the directory wasn't changed, but wcc attributes
++ * are needed anyway.
++ */
++void fh_fill_both_attrs(struct svc_fh *fhp)
++{
++ fh_fill_post_attrs(fhp);
++ if (!fhp->fh_post_saved)
++ return;
++ fhp->fh_pre_change = fhp->fh_post_change;
++ fhp->fh_pre_mtime = fhp->fh_post_attr.mtime;
++ fhp->fh_pre_ctime = fhp->fh_post_attr.ctime;
++ fhp->fh_pre_size = fhp->fh_post_attr.size;
++ fhp->fh_pre_saved = true;
++}
++
+ /*
+ * Release a file handle.
+ */
+@@ -652,16 +698,16 @@ fh_put(struct svc_fh *fhp)
+ struct dentry * dentry = fhp->fh_dentry;
+ struct svc_export * exp = fhp->fh_export;
+ if (dentry) {
+- fh_unlock(fhp);
+ fhp->fh_dentry = NULL;
+ dput(dentry);
+- fh_clear_wcc(fhp);
++ fh_clear_pre_post_attrs(fhp);
+ }
+ fh_drop_write(fhp);
+ if (exp) {
+ exp_put(exp);
+ fhp->fh_export = NULL;
+ }
++ fhp->fh_no_wcc = false;
+ return;
+ }
+
+@@ -671,20 +717,15 @@ fh_put(struct svc_fh *fhp)
+ char * SVCFH_fmt(struct svc_fh *fhp)
+ {
+ struct knfsd_fh *fh = &fhp->fh_handle;
++ static char buf[2+1+1+64*3+1];
+
+- static char buf[80];
+- sprintf(buf, "%d: %08x %08x %08x %08x %08x %08x",
+- fh->fh_size,
+- fh->fh_base.fh_pad[0],
+- fh->fh_base.fh_pad[1],
+- fh->fh_base.fh_pad[2],
+- fh->fh_base.fh_pad[3],
+- fh->fh_base.fh_pad[4],
+- fh->fh_base.fh_pad[5]);
++ if (fh->fh_size < 0 || fh->fh_size> 64)
++ return "bad-fh";
++ sprintf(buf, "%d: %*ph", fh->fh_size, fh->fh_size, fh->fh_raw);
+ return buf;
+ }
+
+-enum fsid_source fsid_source(struct svc_fh *fhp)
++enum fsid_source fsid_source(const struct svc_fh *fhp)
+ {
+ if (fhp->fh_handle.fh_version != 1)
+ return FSIDSOURCE_DEV;
+diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
+index 56cfbc3615618..513e028b0bbee 100644
+--- a/fs/nfsd/nfsfh.h
++++ b/fs/nfsd/nfsfh.h
+@@ -10,8 +10,56 @@
+
+ #include <linux/crc32.h>
+ #include <linux/sunrpc/svc.h>
+-#include <uapi/linux/nfsd/nfsfh.h>
+ #include <linux/iversion.h>
++#include <linux/exportfs.h>
++#include <linux/nfs4.h>
++
++/*
++ * The file handle starts with a sequence of four-byte words.
++ * The first word contains a version number (1) and three descriptor bytes
++ * that tell how the remaining 3 variable length fields should be handled.
++ * These three bytes are auth_type, fsid_type and fileid_type.
++ *
++ * All four-byte values are in host-byte-order.
++ *
++ * The auth_type field is deprecated and must be set to 0.
++ *
++ * The fsid_type identifies how the filesystem (or export point) is
++ * encoded.
++ * Current values:
++ * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
++ * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
++ * says we mustn't. We must break it up and reassemble.
++ * 1 - 4 byte user specified identifier
++ * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
++ * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
++ * 4 - 4 byte inode number and 4 byte uuid
++ * 5 - 8 byte uuid
++ * 6 - 16 byte uuid
++ * 7 - 8 byte inode number and 16 byte uuid
++ *
++ * The fileid_type identifies how the file within the filesystem is encoded.
++ * The values for this field are filesystem specific, exccept that
++ * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
++ * in include/linux/exportfs.h for currently registered values.
++ */
++
++struct knfsd_fh {
++ unsigned int fh_size; /*
++ * Points to the current size while
++ * building a new file handle.
++ */
++ union {
++ char fh_raw[NFS4_FHSIZE];
++ struct {
++ u8 fh_version; /* == 1 */
++ u8 fh_auth_type; /* deprecated */
++ u8 fh_fsid_type;
++ u8 fh_fileid_type;
++ u32 fh_fsid[]; /* flexible-array member */
++ };
++ };
++};
+
+ static inline __u32 ino_t_to_u32(ino_t ino)
+ {
+@@ -33,14 +81,18 @@ typedef struct svc_fh {
+ struct dentry * fh_dentry; /* validated dentry */
+ struct svc_export * fh_export; /* export pointer */
+
+- bool fh_locked; /* inode locked by us */
+ bool fh_want_write; /* remount protection taken */
++ bool fh_no_wcc; /* no wcc data needed */
++ bool fh_no_atomic_attr;
++ /*
++ * wcc data is not atomic with
++ * operation
++ */
+ int fh_flags; /* FH flags */
+-#ifdef CONFIG_NFSD_V3
+ bool fh_post_saved; /* post-op attrs saved */
+ bool fh_pre_saved; /* pre-op attrs saved */
+
+- /* Pre-op attributes saved during fh_lock */
++ /* Pre-op attributes saved when inode is locked */
+ __u64 fh_pre_size; /* size before operation */
+ struct timespec64 fh_pre_mtime; /* mtime before oper */
+ struct timespec64 fh_pre_ctime; /* ctime before oper */
+@@ -50,11 +102,9 @@ typedef struct svc_fh {
+ */
+ u64 fh_pre_change;
+
+- /* Post-op attributes saved in fh_unlock */
++ /* Post-op attributes saved in fh_fill_post_attrs() */
+ struct kstat fh_post_attr; /* full attrs after operation */
+ u64 fh_post_change; /* nfsv4 change; see above */
+-#endif /* CONFIG_NFSD_V3 */
+-
+ } svc_fh;
+ #define NFSD4_FH_FOREIGN (1<<0)
+ #define SET_FH_FLAG(c, f) ((c)->fh_flags |= (f))
+@@ -76,7 +126,7 @@ enum fsid_source {
+ FSIDSOURCE_FSID,
+ FSIDSOURCE_UUID,
+ };
+-extern enum fsid_source fsid_source(struct svc_fh *fhp);
++extern enum fsid_source fsid_source(const struct svc_fh *fhp);
+
+
+ /*
+@@ -170,19 +220,19 @@ __be32 fh_update(struct svc_fh *);
+ void fh_put(struct svc_fh *);
+
+ static __inline__ struct svc_fh *
+-fh_copy(struct svc_fh *dst, struct svc_fh *src)
++fh_copy(struct svc_fh *dst, const struct svc_fh *src)
+ {
+- WARN_ON(src->fh_dentry || src->fh_locked);
+-
++ WARN_ON(src->fh_dentry);
++
+ *dst = *src;
+ return dst;
+ }
+
+ static inline void
+-fh_copy_shallow(struct knfsd_fh *dst, struct knfsd_fh *src)
++fh_copy_shallow(struct knfsd_fh *dst, const struct knfsd_fh *src)
+ {
+ dst->fh_size = src->fh_size;
+- memcpy(&dst->fh_base, &src->fh_base, src->fh_size);
++ memcpy(&dst->fh_raw, &src->fh_raw, src->fh_size);
+ }
+
+ static __inline__ struct svc_fh *
+@@ -193,16 +243,18 @@ fh_init(struct svc_fh *fhp, int maxsize)
+ return fhp;
+ }
+
+-static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
++static inline bool fh_match(const struct knfsd_fh *fh1,
++ const struct knfsd_fh *fh2)
+ {
+ if (fh1->fh_size != fh2->fh_size)
+ return false;
+- if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
++ if (memcmp(fh1->fh_raw, fh2->fh_raw, fh1->fh_size) != 0)
+ return false;
+ return true;
+ }
+
+-static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
++static inline bool fh_fsid_match(const struct knfsd_fh *fh1,
++ const struct knfsd_fh *fh2)
+ {
+ if (fh1->fh_fsid_type != fh2->fh_fsid_type)
+ return false;
+@@ -219,27 +271,23 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+-
+-static inline u32
+-knfsd_fh_hash(struct knfsd_fh *fh)
++static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
+ {
+- return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
++ return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size);
+ }
+ #else
+-static inline u32
+-knfsd_fh_hash(struct knfsd_fh *fh)
++static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh)
+ {
+ return 0;
+ }
+ #endif
+
+-#ifdef CONFIG_NFSD_V3
+-/*
+- * The wcc data stored in current_fh should be cleared
+- * between compound ops.
++/**
++ * fh_clear_pre_post_attrs - Reset pre/post attributes
++ * @fhp: file handle to be updated
++ *
+ */
+-static inline void
+-fh_clear_wcc(struct svc_fh *fhp)
++static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp)
+ {
+ fhp->fh_post_saved = false;
+ fhp->fh_pre_saved = false;
+@@ -259,68 +307,21 @@ fh_clear_wcc(struct svc_fh *fhp)
+ static inline u64 nfsd4_change_attribute(struct kstat *stat,
+ struct inode *inode)
+ {
+- u64 chattr;
+-
+- chattr = stat->ctime.tv_sec;
+- chattr <<= 30;
+- chattr += stat->ctime.tv_nsec;
+- chattr += inode_query_iversion(inode);
+- return chattr;
+-}
+-
+-extern void fill_pre_wcc(struct svc_fh *fhp);
+-extern void fill_post_wcc(struct svc_fh *fhp);
+-#else
+-#define fh_clear_wcc(ignored)
+-#define fill_pre_wcc(ignored)
+-#define fill_post_wcc(notused)
+-#endif /* CONFIG_NFSD_V3 */
+-
+-
+-/*
+- * Lock a file handle/inode
+- * NOTE: both fh_lock and fh_unlock are done "by hand" in
+- * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once
+- * so, any changes here should be reflected there.
+- */
+-
+-static inline void
+-fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
+-{
+- struct dentry *dentry = fhp->fh_dentry;
+- struct inode *inode;
+-
+- BUG_ON(!dentry);
+-
+- if (fhp->fh_locked) {
+- printk(KERN_WARNING "fh_lock: %pd2 already locked!\n",
+- dentry);
+- return;
+- }
+-
+- inode = d_inode(dentry);
+- inode_lock_nested(inode, subclass);
+- fill_pre_wcc(fhp);
+- fhp->fh_locked = true;
+-}
+-
+-static inline void
+-fh_lock(struct svc_fh *fhp)
+-{
+- fh_lock_nested(fhp, I_MUTEX_NORMAL);
+-}
+-
+-/*
+- * Unlock a file handle/inode
+- */
+-static inline void
+-fh_unlock(struct svc_fh *fhp)
+-{
+- if (fhp->fh_locked) {
+- fill_post_wcc(fhp);
+- inode_unlock(d_inode(fhp->fh_dentry));
+- fhp->fh_locked = false;
+- }
++ if (inode->i_sb->s_export_op->fetch_iversion)
++ return inode->i_sb->s_export_op->fetch_iversion(inode);
++ else if (IS_I_VERSION(inode)) {
++ u64 chattr;
++
++ chattr = stat->ctime.tv_sec;
++ chattr <<= 30;
++ chattr += stat->ctime.tv_nsec;
++ chattr += inode_query_iversion(inode);
++ return chattr;
++ } else
++ return time_to_chattr(&stat->ctime);
+ }
+
++extern void fh_fill_pre_attrs(struct svc_fh *fhp);
++extern void fh_fill_post_attrs(struct svc_fh *fhp);
++extern void fh_fill_both_attrs(struct svc_fh *fhp);
+ #endif /* _LINUX_NFSD_NFSFH_H */
+diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
+index bbd01e8397f6e..96426dea7d412 100644
+--- a/fs/nfsd/nfsproc.c
++++ b/fs/nfsd/nfsproc.c
+@@ -51,6 +51,9 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
+ struct nfsd_sattrargs *argp = rqstp->rq_argp;
+ struct nfsd_attrstat *resp = rqstp->rq_resp;
+ struct iattr *iap = &argp->attrs;
++ struct nfsd_attrs attrs = {
++ .na_iattr = iap,
++ };
+ struct svc_fh *fhp;
+
+ dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n",
+@@ -100,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
+ }
+ }
+
+- resp->status = nfsd_setattr(rqstp, fhp, iap, 0, (time64_t)0);
++ resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0);
+ if (resp->status != nfs_ok)
+ goto out;
+
+@@ -149,14 +152,16 @@ nfsd_proc_lookup(struct svc_rqst *rqstp)
+ static __be32
+ nfsd_proc_readlink(struct svc_rqst *rqstp)
+ {
+- struct nfsd_readlinkargs *argp = rqstp->rq_argp;
++ struct nfsd_fhandle *argp = rqstp->rq_argp;
+ struct nfsd_readlinkres *resp = rqstp->rq_resp;
+
+ dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh));
+
+ /* Read the symlink. */
+ resp->len = NFS_MAXPATHLEN;
+- resp->status = nfsd_readlink(rqstp, &argp->fh, argp->buffer, &resp->len);
++ resp->page = *(rqstp->rq_next_page++);
++ resp->status = nfsd_readlink(rqstp, &argp->fh,
++ page_address(resp->page), &resp->len);
+
+ fh_put(&argp->fh);
+ return rpc_success;
+@@ -171,36 +176,42 @@ nfsd_proc_read(struct svc_rqst *rqstp)
+ {
+ struct nfsd_readargs *argp = rqstp->rq_argp;
+ struct nfsd_readres *resp = rqstp->rq_resp;
++ unsigned int len;
+ u32 eof;
++ int v;
+
+ dprintk("nfsd: READ %s %d bytes at %d\n",
+ SVCFH_fmt(&argp->fh),
+ argp->count, argp->offset);
+
++ argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
++ argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
++
++ v = 0;
++ len = argp->count;
++ resp->pages = rqstp->rq_next_page;
++ while (len > 0) {
++ struct page *page = *(rqstp->rq_next_page++);
++
++ rqstp->rq_vec[v].iov_base = page_address(page);
++ rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
++ len -= rqstp->rq_vec[v].iov_len;
++ v++;
++ }
++
+ /* Obtain buffer pointer for payload. 19 is 1 word for
+ * status, 17 words for fattr, and 1 word for the byte count.
+ */
+-
+- if (NFSSVC_MAXBLKSIZE_V2 < argp->count) {
+- char buf[RPC_MAX_ADDRBUFLEN];
+- printk(KERN_NOTICE
+- "oversized read request from %s (%d bytes)\n",
+- svc_print_addr(rqstp, buf, sizeof(buf)),
+- argp->count);
+- argp->count = NFSSVC_MAXBLKSIZE_V2;
+- }
+ svc_reserve_auth(rqstp, (19<<2) + argp->count + 4);
+
+ resp->count = argp->count;
+- resp->status = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh),
+- argp->offset,
+- rqstp->rq_vec, argp->vlen,
+- &resp->count,
+- &eof);
++ fh_copy(&resp->fh, &argp->fh);
++ resp->status = nfsd_read(rqstp, &resp->fh, argp->offset,
++ rqstp->rq_vec, v, &resp->count, &eof);
+ if (resp->status == nfs_ok)
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+ else if (resp->status == nfserr_jukebox)
+- return rpc_drop_reply;
++ set_bit(RQ_DROPME, &rqstp->rq_flags);
+ return rpc_success;
+ }
+
+@@ -227,12 +238,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
+ SVCFH_fmt(&argp->fh),
+ argp->len, argp->offset);
+
+- nvecs = svc_fill_write_vector(rqstp, rqstp->rq_arg.pages,
+- &argp->first, cnt);
+- if (!nvecs) {
+- resp->status = nfserr_io;
+- goto out;
+- }
++ nvecs = svc_fill_write_vector(rqstp, &argp->payload);
+
+ resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
+ argp->offset, rqstp->rq_vec, nvecs,
+@@ -240,8 +246,7 @@ nfsd_proc_write(struct svc_rqst *rqstp)
+ if (resp->status == nfs_ok)
+ resp->status = fh_getattr(&resp->fh, &resp->stat);
+ else if (resp->status == nfserr_jukebox)
+- return rpc_drop_reply;
+-out:
++ set_bit(RQ_DROPME, &rqstp->rq_flags);
+ return rpc_success;
+ }
+
+@@ -259,6 +264,9 @@ nfsd_proc_create(struct svc_rqst *rqstp)
+ svc_fh *dirfhp = &argp->fh;
+ svc_fh *newfhp = &resp->fh;
+ struct iattr *attr = &argp->attrs;
++ struct nfsd_attrs attrs = {
++ .na_iattr = attr,
++ };
+ struct inode *inode;
+ struct dentry *dchild;
+ int type, mode;
+@@ -284,7 +292,7 @@ nfsd_proc_create(struct svc_rqst *rqstp)
+ goto done;
+ }
+
+- fh_lock_nested(dirfhp, I_MUTEX_PARENT);
++ inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT);
+ dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len);
+ if (IS_ERR(dchild)) {
+ resp->status = nfserrno(PTR_ERR(dchild));
+@@ -383,9 +391,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
+ resp->status = nfs_ok;
+ if (!inode) {
+ /* File doesn't exist. Create it and set attrs */
+- resp->status = nfsd_create_locked(rqstp, dirfhp, argp->name,
+- argp->len, attr, type, rdev,
+- newfhp);
++ resp->status = nfsd_create_locked(rqstp, dirfhp, &attrs, type,
++ rdev, newfhp);
+ } else if (type == S_IFREG) {
+ dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
+ argp->name, attr->ia_valid, (long) attr->ia_size);
+@@ -395,13 +402,12 @@ nfsd_proc_create(struct svc_rqst *rqstp)
+ */
+ attr->ia_valid &= ATTR_SIZE;
+ if (attr->ia_valid)
+- resp->status = nfsd_setattr(rqstp, newfhp, attr, 0,
++ resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0,
+ (time64_t)0);
+ }
+
+ out_unlock:
+- /* We don't really need to unlock, as fh_put does it. */
+- fh_unlock(dirfhp);
++ inode_unlock(dirfhp->fh_dentry->d_inode);
+ fh_drop_write(dirfhp);
+ done:
+ fh_put(dirfhp);
+@@ -471,6 +477,9 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
+ {
+ struct nfsd_symlinkargs *argp = rqstp->rq_argp;
+ struct nfsd_stat *resp = rqstp->rq_resp;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &argp->attrs,
++ };
+ struct svc_fh newfh;
+
+ if (argp->tlen > NFS_MAXPATHLEN) {
+@@ -492,7 +501,7 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
+
+ fh_init(&newfh, NFS_FHSIZE);
+ resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
+- argp->tname, &newfh);
++ argp->tname, &attrs, &newfh);
+
+ kfree(argp->tname);
+ fh_put(&argp->ffh);
+@@ -510,6 +519,9 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
+ {
+ struct nfsd_createargs *argp = rqstp->rq_argp;
+ struct nfsd_diropres *resp = rqstp->rq_resp;
++ struct nfsd_attrs attrs = {
++ .na_iattr = &argp->attrs,
++ };
+
+ dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
+
+@@ -521,7 +533,7 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
+ argp->attrs.ia_valid &= ~ATTR_SIZE;
+ fh_init(&resp->fh, NFS_FHSIZE);
+ resp->status = nfsd_create(rqstp, &argp->fh, argp->name, argp->len,
+- &argp->attrs, S_IFDIR, 0, &resp->fh);
++ &attrs, S_IFDIR, 0, &resp->fh);
+ fh_put(&argp->fh);
+ if (resp->status != nfs_ok)
+ goto out;
+@@ -548,6 +560,24 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp)
+ return rpc_success;
+ }
+
++static void nfsd_init_dirlist_pages(struct svc_rqst *rqstp,
++ struct nfsd_readdirres *resp,
++ u32 count)
++{
++ struct xdr_buf *buf = &resp->dirlist;
++ struct xdr_stream *xdr = &resp->xdr;
++
++ memset(buf, 0, sizeof(*buf));
++
++ /* Reserve room for the NULL ptr & eof flag (-2 words) */
++ buf->buflen = clamp(count, (u32)(XDR_UNIT * 2), (u32)PAGE_SIZE);
++ buf->buflen -= XDR_UNIT * 2;
++ buf->pages = rqstp->rq_next_page;
++ rqstp->rq_next_page++;
++
++ xdr_init_encode_pages(xdr, buf, buf->pages, NULL);
++}
++
+ /*
+ * Read a portion of a directory.
+ */
+@@ -556,33 +586,20 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
+ {
+ struct nfsd_readdirargs *argp = rqstp->rq_argp;
+ struct nfsd_readdirres *resp = rqstp->rq_resp;
+- int count;
+ loff_t offset;
+
+ dprintk("nfsd: READDIR %s %d bytes at %d\n",
+ SVCFH_fmt(&argp->fh),
+ argp->count, argp->cookie);
+
+- /* Shrink to the client read size */
+- count = (argp->count >> 2) - 2;
++ nfsd_init_dirlist_pages(rqstp, resp, argp->count);
+
+- /* Make sure we've room for the NULL ptr & eof flag */
+- count -= 2;
+- if (count < 0)
+- count = 0;
+-
+- resp->buffer = argp->buffer;
+- resp->offset = NULL;
+- resp->buflen = count;
+ resp->common.err = nfs_ok;
+- /* Read directory and encode entries on the fly */
++ resp->cookie_offset = 0;
+ offset = argp->cookie;
+ resp->status = nfsd_readdir(rqstp, &argp->fh, &offset,
+ &resp->common, nfssvc_encode_entry);
+-
+- resp->count = resp->buffer - argp->buffer;
+- if (resp->offset)
+- *resp->offset = htonl(offset);
++ nfssvc_encode_nfscookie(resp, offset);
+
+ fh_put(&argp->fh);
+ return rpc_success;
+@@ -609,7 +626,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp)
+ * NFSv2 Server procedures.
+ * Only the results of non-idempotent operations are cached.
+ */
+-struct nfsd_void { int dummy; };
+
+ #define ST 1 /* status */
+ #define FH 8 /* filehandle */
+@@ -618,41 +634,49 @@ struct nfsd_void { int dummy; };
+ static const struct svc_procedure nfsd_procedures2[18] = {
+ [NFSPROC_NULL] = {
+ .pc_func = nfsd_proc_null,
+- .pc_decode = nfssvc_decode_void,
+- .pc_encode = nfssvc_encode_void,
+- .pc_argsize = sizeof(struct nfsd_void),
+- .pc_ressize = sizeof(struct nfsd_void),
++ .pc_decode = nfssvc_decode_voidarg,
++ .pc_encode = nfssvc_encode_voidres,
++ .pc_argsize = sizeof(struct nfsd_voidargs),
++ .pc_argzero = sizeof(struct nfsd_voidargs),
++ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 0,
++ .pc_name = "NULL",
+ },
+ [NFSPROC_GETATTR] = {
+ .pc_func = nfsd_proc_getattr,
+- .pc_decode = nfssvc_decode_fhandle,
+- .pc_encode = nfssvc_encode_attrstat,
++ .pc_decode = nfssvc_decode_fhandleargs,
++ .pc_encode = nfssvc_encode_attrstatres,
+ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_fhandle),
++ .pc_argzero = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT,
++ .pc_name = "GETATTR",
+ },
+ [NFSPROC_SETATTR] = {
+ .pc_func = nfsd_proc_setattr,
+ .pc_decode = nfssvc_decode_sattrargs,
+- .pc_encode = nfssvc_encode_attrstat,
++ .pc_encode = nfssvc_encode_attrstatres,
+ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_sattrargs),
++ .pc_argzero = sizeof(struct nfsd_sattrargs),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+AT,
++ .pc_name = "SETATTR",
+ },
+ [NFSPROC_ROOT] = {
+ .pc_func = nfsd_proc_root,
+- .pc_decode = nfssvc_decode_void,
+- .pc_encode = nfssvc_encode_void,
+- .pc_argsize = sizeof(struct nfsd_void),
+- .pc_ressize = sizeof(struct nfsd_void),
++ .pc_decode = nfssvc_decode_voidarg,
++ .pc_encode = nfssvc_encode_voidres,
++ .pc_argsize = sizeof(struct nfsd_voidargs),
++ .pc_argzero = sizeof(struct nfsd_voidargs),
++ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 0,
++ .pc_name = "ROOT",
+ },
+ [NFSPROC_LOOKUP] = {
+ .pc_func = nfsd_proc_lookup,
+@@ -660,18 +684,22 @@ static const struct svc_procedure nfsd_procedures2[18] = {
+ .pc_encode = nfssvc_encode_diropres,
+ .pc_release = nfssvc_release_diropres,
+ .pc_argsize = sizeof(struct nfsd_diropargs),
++ .pc_argzero = sizeof(struct nfsd_diropargs),
+ .pc_ressize = sizeof(struct nfsd_diropres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+FH+AT,
++ .pc_name = "LOOKUP",
+ },
+ [NFSPROC_READLINK] = {
+ .pc_func = nfsd_proc_readlink,
+- .pc_decode = nfssvc_decode_readlinkargs,
++ .pc_decode = nfssvc_decode_fhandleargs,
+ .pc_encode = nfssvc_encode_readlinkres,
+- .pc_argsize = sizeof(struct nfsd_readlinkargs),
++ .pc_argsize = sizeof(struct nfsd_fhandle),
++ .pc_argzero = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_readlinkres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+1+NFS_MAXPATHLEN/4,
++ .pc_name = "READLINK",
+ },
+ [NFSPROC_READ] = {
+ .pc_func = nfsd_proc_read,
+@@ -679,28 +707,34 @@ static const struct svc_procedure nfsd_procedures2[18] = {
+ .pc_encode = nfssvc_encode_readres,
+ .pc_release = nfssvc_release_readres,
+ .pc_argsize = sizeof(struct nfsd_readargs),
++ .pc_argzero = sizeof(struct nfsd_readargs),
+ .pc_ressize = sizeof(struct nfsd_readres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
++ .pc_name = "READ",
+ },
+ [NFSPROC_WRITECACHE] = {
+ .pc_func = nfsd_proc_writecache,
+- .pc_decode = nfssvc_decode_void,
+- .pc_encode = nfssvc_encode_void,
+- .pc_argsize = sizeof(struct nfsd_void),
+- .pc_ressize = sizeof(struct nfsd_void),
++ .pc_decode = nfssvc_decode_voidarg,
++ .pc_encode = nfssvc_encode_voidres,
++ .pc_argsize = sizeof(struct nfsd_voidargs),
++ .pc_argzero = sizeof(struct nfsd_voidargs),
++ .pc_ressize = sizeof(struct nfsd_voidres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = 0,
++ .pc_name = "WRITECACHE",
+ },
+ [NFSPROC_WRITE] = {
+ .pc_func = nfsd_proc_write,
+ .pc_decode = nfssvc_decode_writeargs,
+- .pc_encode = nfssvc_encode_attrstat,
++ .pc_encode = nfssvc_encode_attrstatres,
+ .pc_release = nfssvc_release_attrstat,
+ .pc_argsize = sizeof(struct nfsd_writeargs),
++ .pc_argzero = sizeof(struct nfsd_writeargs),
+ .pc_ressize = sizeof(struct nfsd_attrstat),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+AT,
++ .pc_name = "WRITE",
+ },
+ [NFSPROC_CREATE] = {
+ .pc_func = nfsd_proc_create,
+@@ -708,45 +742,55 @@ static const struct svc_procedure nfsd_procedures2[18] = {
+ .pc_encode = nfssvc_encode_diropres,
+ .pc_release = nfssvc_release_diropres,
+ .pc_argsize = sizeof(struct nfsd_createargs),
++ .pc_argzero = sizeof(struct nfsd_createargs),
+ .pc_ressize = sizeof(struct nfsd_diropres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+FH+AT,
++ .pc_name = "CREATE",
+ },
+ [NFSPROC_REMOVE] = {
+ .pc_func = nfsd_proc_remove,
+ .pc_decode = nfssvc_decode_diropargs,
+- .pc_encode = nfssvc_encode_stat,
++ .pc_encode = nfssvc_encode_statres,
+ .pc_argsize = sizeof(struct nfsd_diropargs),
++ .pc_argzero = sizeof(struct nfsd_diropargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
++ .pc_name = "REMOVE",
+ },
+ [NFSPROC_RENAME] = {
+ .pc_func = nfsd_proc_rename,
+ .pc_decode = nfssvc_decode_renameargs,
+- .pc_encode = nfssvc_encode_stat,
++ .pc_encode = nfssvc_encode_statres,
+ .pc_argsize = sizeof(struct nfsd_renameargs),
++ .pc_argzero = sizeof(struct nfsd_renameargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
++ .pc_name = "RENAME",
+ },
+ [NFSPROC_LINK] = {
+ .pc_func = nfsd_proc_link,
+ .pc_decode = nfssvc_decode_linkargs,
+- .pc_encode = nfssvc_encode_stat,
++ .pc_encode = nfssvc_encode_statres,
+ .pc_argsize = sizeof(struct nfsd_linkargs),
++ .pc_argzero = sizeof(struct nfsd_linkargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
++ .pc_name = "LINK",
+ },
+ [NFSPROC_SYMLINK] = {
+ .pc_func = nfsd_proc_symlink,
+ .pc_decode = nfssvc_decode_symlinkargs,
+- .pc_encode = nfssvc_encode_stat,
++ .pc_encode = nfssvc_encode_statres,
+ .pc_argsize = sizeof(struct nfsd_symlinkargs),
++ .pc_argzero = sizeof(struct nfsd_symlinkargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
++ .pc_name = "SYMLINK",
+ },
+ [NFSPROC_MKDIR] = {
+ .pc_func = nfsd_proc_mkdir,
+@@ -754,35 +798,43 @@ static const struct svc_procedure nfsd_procedures2[18] = {
+ .pc_encode = nfssvc_encode_diropres,
+ .pc_release = nfssvc_release_diropres,
+ .pc_argsize = sizeof(struct nfsd_createargs),
++ .pc_argzero = sizeof(struct nfsd_createargs),
+ .pc_ressize = sizeof(struct nfsd_diropres),
+ .pc_cachetype = RC_REPLBUFF,
+ .pc_xdrressize = ST+FH+AT,
++ .pc_name = "MKDIR",
+ },
+ [NFSPROC_RMDIR] = {
+ .pc_func = nfsd_proc_rmdir,
+ .pc_decode = nfssvc_decode_diropargs,
+- .pc_encode = nfssvc_encode_stat,
++ .pc_encode = nfssvc_encode_statres,
+ .pc_argsize = sizeof(struct nfsd_diropargs),
++ .pc_argzero = sizeof(struct nfsd_diropargs),
+ .pc_ressize = sizeof(struct nfsd_stat),
+ .pc_cachetype = RC_REPLSTAT,
+ .pc_xdrressize = ST,
++ .pc_name = "RMDIR",
+ },
+ [NFSPROC_READDIR] = {
+ .pc_func = nfsd_proc_readdir,
+ .pc_decode = nfssvc_decode_readdirargs,
+ .pc_encode = nfssvc_encode_readdirres,
+ .pc_argsize = sizeof(struct nfsd_readdirargs),
++ .pc_argzero = sizeof(struct nfsd_readdirargs),
+ .pc_ressize = sizeof(struct nfsd_readdirres),
+ .pc_cachetype = RC_NOCACHE,
++ .pc_name = "READDIR",
+ },
+ [NFSPROC_STATFS] = {
+ .pc_func = nfsd_proc_statfs,
+- .pc_decode = nfssvc_decode_fhandle,
++ .pc_decode = nfssvc_decode_fhandleargs,
+ .pc_encode = nfssvc_encode_statfsres,
+ .pc_argsize = sizeof(struct nfsd_fhandle),
++ .pc_argzero = sizeof(struct nfsd_fhandle),
+ .pc_ressize = sizeof(struct nfsd_statfsres),
+ .pc_cachetype = RC_NOCACHE,
+ .pc_xdrressize = ST+5,
++ .pc_name = "STATFS",
+ },
+ };
+
+@@ -796,61 +848,3 @@ const struct svc_version nfsd_version2 = {
+ .vs_dispatch = nfsd_dispatch,
+ .vs_xdrsize = NFS2_SVC_XDRSIZE,
+ };
+-
+-/*
+- * Map errnos to NFS errnos.
+- */
+-__be32
+-nfserrno (int errno)
+-{
+- static struct {
+- __be32 nfserr;
+- int syserr;
+- } nfs_errtbl[] = {
+- { nfs_ok, 0 },
+- { nfserr_perm, -EPERM },
+- { nfserr_noent, -ENOENT },
+- { nfserr_io, -EIO },
+- { nfserr_nxio, -ENXIO },
+- { nfserr_fbig, -E2BIG },
+- { nfserr_acces, -EACCES },
+- { nfserr_exist, -EEXIST },
+- { nfserr_xdev, -EXDEV },
+- { nfserr_mlink, -EMLINK },
+- { nfserr_nodev, -ENODEV },
+- { nfserr_notdir, -ENOTDIR },
+- { nfserr_isdir, -EISDIR },
+- { nfserr_inval, -EINVAL },
+- { nfserr_fbig, -EFBIG },
+- { nfserr_nospc, -ENOSPC },
+- { nfserr_rofs, -EROFS },
+- { nfserr_mlink, -EMLINK },
+- { nfserr_nametoolong, -ENAMETOOLONG },
+- { nfserr_notempty, -ENOTEMPTY },
+-#ifdef EDQUOT
+- { nfserr_dquot, -EDQUOT },
+-#endif
+- { nfserr_stale, -ESTALE },
+- { nfserr_jukebox, -ETIMEDOUT },
+- { nfserr_jukebox, -ERESTARTSYS },
+- { nfserr_jukebox, -EAGAIN },
+- { nfserr_jukebox, -EWOULDBLOCK },
+- { nfserr_jukebox, -ENOMEM },
+- { nfserr_io, -ETXTBSY },
+- { nfserr_notsupp, -EOPNOTSUPP },
+- { nfserr_toosmall, -ETOOSMALL },
+- { nfserr_serverfault, -ESERVERFAULT },
+- { nfserr_serverfault, -ENFILE },
+- { nfserr_io, -EUCLEAN },
+- { nfserr_perm, -ENOKEY },
+- };
+- int i;
+-
+- for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
+- if (nfs_errtbl[i].syserr == errno)
+- return nfs_errtbl[i].nfserr;
+- }
+- WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
+- return nfserr_io;
+-}
+-
+diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
+index 2e61a565cdbd8..3d4fd40c987bd 100644
+--- a/fs/nfsd/nfssvc.c
++++ b/fs/nfsd/nfssvc.c
+@@ -12,6 +12,7 @@
+ #include <linux/module.h>
+ #include <linux/fs_struct.h>
+ #include <linux/swap.h>
++#include <linux/siphash.h>
+
+ #include <linux/sunrpc/stats.h>
+ #include <linux/sunrpc/svcsock.h>
+@@ -29,13 +30,9 @@
+ #include "netns.h"
+ #include "filecache.h"
+
+-#define NFSDDBG_FACILITY NFSDDBG_SVC
++#include "trace.h"
+
+-bool inter_copy_offload_enable;
+-EXPORT_SYMBOL_GPL(inter_copy_offload_enable);
+-module_param(inter_copy_offload_enable, bool, 0644);
+-MODULE_PARM_DESC(inter_copy_offload_enable,
+- "Enable inter server to server copy offload. Default: false");
++#define NFSDDBG_FACILITY NFSDDBG_SVC
+
+ extern struct svc_program nfsd_program;
+ static int nfsd(void *vrqstp);
+@@ -59,18 +56,17 @@ static __be32 nfsd_init_request(struct svc_rqst *,
+ struct svc_process_info *);
+
+ /*
+- * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and the members
+- * of the svc_serv struct. In particular, ->sv_nrthreads but also to some
+- * extent ->sv_temp_socks and ->sv_permsocks. It also protects nfsdstats.th_cnt
++ * nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members
++ * of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks.
+ *
+ * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
+- * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0. That number
+- * of nfsd threads must exist and each must listed in ->sp_all_threads in each
+- * entry of ->sv_pools[].
++ * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless
++ * nn->keep_active is set). That number of nfsd threads must
++ * exist and each must be listed in ->sp_all_threads in some entry of
++ * ->sv_pools[].
+ *
+- * Transitions of the thread count between zero and non-zero are of particular
+- * interest since the svc_serv needs to be created and initialized at that
+- * point, or freed.
++ * Each active thread holds a counted reference on nn->nfsd_serv, as does
++ * the nn->keep_active flag and various transient calls to svc_get().
+ *
+ * Finally, the nfsd_mutex also protects some of the global variables that are
+ * accessed when nfsd starts and that are settable via the write_* routines in
+@@ -88,15 +84,19 @@ DEFINE_MUTEX(nfsd_mutex);
+ * version 4.1 DRC caches.
+ * nfsd_drc_pages_used tracks the current version 4.1 DRC memory usage.
+ */
+-spinlock_t nfsd_drc_lock;
++DEFINE_SPINLOCK(nfsd_drc_lock);
+ unsigned long nfsd_drc_max_mem;
+ unsigned long nfsd_drc_mem_used;
+
+ #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
+ static struct svc_stat nfsd_acl_svcstats;
+ static const struct svc_version *nfsd_acl_version[] = {
++# if defined(CONFIG_NFSD_V2_ACL)
+ [2] = &nfsd_acl_version2,
++# endif
++# if defined(CONFIG_NFSD_V3_ACL)
+ [3] = &nfsd_acl_version3,
++# endif
+ };
+
+ #define NFSD_ACL_MINVERS 2
+@@ -120,10 +120,10 @@ static struct svc_stat nfsd_acl_svcstats = {
+ #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
+
+ static const struct svc_version *nfsd_version[] = {
++#if defined(CONFIG_NFSD_V2)
+ [2] = &nfsd_version2,
+-#if defined(CONFIG_NFSD_V3)
+- [3] = &nfsd_version3,
+ #endif
++ [3] = &nfsd_version3,
+ #if defined(CONFIG_NFSD_V4)
+ [4] = &nfsd_version4,
+ #endif
+@@ -297,13 +297,13 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
+ if (!list_empty(&nn->nfsd_serv->sv_permsocks))
+ return 0;
+
+- error = svc_create_xprt(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
+- SVC_SOCK_DEFAULTS, cred);
++ error = svc_xprt_create(nn->nfsd_serv, "udp", net, PF_INET, NFS_PORT,
++ SVC_SOCK_DEFAULTS, cred);
+ if (error < 0)
+ return error;
+
+- error = svc_create_xprt(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
+- SVC_SOCK_DEFAULTS, cred);
++ error = svc_xprt_create(nn->nfsd_serv, "tcp", net, PF_INET, NFS_PORT,
++ SVC_SOCK_DEFAULTS, cred);
+ if (error < 0)
+ return error;
+
+@@ -312,7 +312,7 @@ static int nfsd_init_socks(struct net *net, const struct cred *cred)
+
+ static int nfsd_users = 0;
+
+-static int nfsd_startup_generic(int nrservs)
++static int nfsd_startup_generic(void)
+ {
+ int ret;
+
+@@ -349,36 +349,60 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn)
+ return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST);
+ }
+
+-void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn)
++/**
++ * nfsd_copy_write_verifier - Atomically copy a write verifier
++ * @verf: buffer in which to receive the verifier cookie
++ * @nn: NFS net namespace
++ *
++ * This function provides a wait-free mechanism for copying the
++ * namespace's write verifier without tearing it.
++ */
++void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn)
+ {
+ int seq = 0;
+
+ do {
+- read_seqbegin_or_lock(&nn->boot_lock, &seq);
+- /*
+- * This is opaque to client, so no need to byte-swap. Use
+- * __force to keep sparse happy. y2038 time_t overflow is
+- * irrelevant in this usage
+- */
+- verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec;
+- verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec;
+- } while (need_seqretry(&nn->boot_lock, seq));
+- done_seqretry(&nn->boot_lock, seq);
++ read_seqbegin_or_lock(&nn->writeverf_lock, &seq);
++ memcpy(verf, nn->writeverf, sizeof(nn->writeverf));
++ } while (need_seqretry(&nn->writeverf_lock, seq));
++ done_seqretry(&nn->writeverf_lock, seq);
+ }
+
+-static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn)
++static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn)
+ {
+- ktime_get_real_ts64(&nn->nfssvc_boot);
++ struct timespec64 now;
++ u64 verf;
++
++ /*
++ * Because the time value is hashed, y2038 time_t overflow
++ * is irrelevant in this usage.
++ */
++ ktime_get_raw_ts64(&now);
++ verf = siphash_2u64(now.tv_sec, now.tv_nsec, &nn->siphash_key);
++ memcpy(nn->writeverf, &verf, sizeof(nn->writeverf));
+ }
+
+-void nfsd_reset_boot_verifier(struct nfsd_net *nn)
++/**
++ * nfsd_reset_write_verifier - Generate a new write verifier
++ * @nn: NFS net namespace
++ *
++ * This function updates the ->writeverf field of @nn. This field
++ * contains an opaque cookie that, according to Section 18.32.3 of
++ * RFC 8881, "the client can use to determine whether a server has
++ * changed instance state (e.g., server restart) between a call to
++ * WRITE and a subsequent call to either WRITE or COMMIT. This
++ * cookie MUST be unchanged during a single instance of the NFSv4.1
++ * server and MUST be unique between instances of the NFSv4.1
++ * server."
++ */
++void nfsd_reset_write_verifier(struct nfsd_net *nn)
+ {
+- write_seqlock(&nn->boot_lock);
+- nfsd_reset_boot_verifier_locked(nn);
+- write_sequnlock(&nn->boot_lock);
++ write_seqlock(&nn->writeverf_lock);
++ nfsd_reset_write_verifier_locked(nn);
++ write_sequnlock(&nn->writeverf_lock);
+ }
+
+-static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred)
++static int nfsd_startup_net(struct net *net, const struct cred *cred)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int ret;
+@@ -386,7 +410,7 @@ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cre
+ if (nn->nfsd_net_up)
+ return 0;
+
+- ret = nfsd_startup_generic(nrservs);
++ ret = nfsd_startup_generic();
+ if (ret)
+ return ret;
+ ret = nfsd_init_socks(net, cred);
+@@ -407,6 +431,9 @@ static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cre
+ if (ret)
+ goto out_filecache;
+
++#ifdef CONFIG_NFSD_V4_2_INTER_SSC
++ nfsd4_ssc_init_umount_work(nn);
++#endif
+ nn->nfsd_net_up = true;
+ return 0;
+
+@@ -436,6 +463,7 @@ static void nfsd_shutdown_net(struct net *net)
+ nfsd_shutdown_generic();
+ }
+
++static DEFINE_SPINLOCK(nfsd_notifier_lock);
+ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+ {
+@@ -445,18 +473,17 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in sin;
+
+- if ((event != NETDEV_DOWN) ||
+- !atomic_inc_not_zero(&nn->ntf_refcnt))
++ if (event != NETDEV_DOWN || !nn->nfsd_serv)
+ goto out;
+
++ spin_lock(&nfsd_notifier_lock);
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = ifa->ifa_local;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
+ }
+- atomic_dec(&nn->ntf_refcnt);
+- wake_up(&nn->ntf_wq);
++ spin_unlock(&nfsd_notifier_lock);
+
+ out:
+ return NOTIFY_DONE;
+@@ -476,10 +503,10 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct sockaddr_in6 sin6;
+
+- if ((event != NETDEV_DOWN) ||
+- !atomic_inc_not_zero(&nn->ntf_refcnt))
++ if (event != NETDEV_DOWN || !nn->nfsd_serv)
+ goto out;
+
++ spin_lock(&nfsd_notifier_lock);
+ if (nn->nfsd_serv) {
+ dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
+ sin6.sin6_family = AF_INET6;
+@@ -488,8 +515,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this,
+ sin6.sin6_scope_id = ifa->idev->dev->ifindex;
+ svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
+ }
+- atomic_dec(&nn->ntf_refcnt);
+- wake_up(&nn->ntf_wq);
++ spin_unlock(&nfsd_notifier_lock);
++
+ out:
+ return NOTIFY_DONE;
+ }
+@@ -502,11 +529,15 @@ static struct notifier_block nfsd_inet6addr_notifier = {
+ /* Only used under nfsd_mutex, so this atomic may be overkill: */
+ static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
+
+-static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
++void nfsd_last_thread(struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
++ struct svc_serv *serv = nn->nfsd_serv;
++
++ spin_lock(&nfsd_notifier_lock);
++ nn->nfsd_serv = NULL;
++ spin_unlock(&nfsd_notifier_lock);
+
+- atomic_dec(&nn->ntf_refcnt);
+ /* check if the notifier still has clients */
+ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
+ unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+@@ -514,7 +545,8 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
+ unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+ #endif
+ }
+- wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0);
++
++ svc_xprt_destroy_all(serv, net);
+
+ /*
+ * write_ports can create the server without actually starting
+@@ -567,7 +599,6 @@ static void set_max_drc(void)
+ nfsd_drc_max_mem = (nr_free_buffer_pages()
+ >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE;
+ nfsd_drc_mem_used = 0;
+- spin_lock_init(&nfsd_drc_lock);
+ dprintk("%s nfsd_drc_max_mem %lu \n", __func__, nfsd_drc_max_mem);
+ }
+
+@@ -592,24 +623,6 @@ static int nfsd_get_default_max_blksize(void)
+ return ret;
+ }
+
+-static const struct svc_serv_ops nfsd_thread_sv_ops = {
+- .svo_shutdown = nfsd_last_thread,
+- .svo_function = nfsd,
+- .svo_enqueue_xprt = svc_xprt_do_enqueue,
+- .svo_setup = svc_set_num_threads,
+- .svo_module = THIS_MODULE,
+-};
+-
+-static void nfsd_complete_shutdown(struct net *net)
+-{
+- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+-
+- WARN_ON(!mutex_is_locked(&nfsd_mutex));
+-
+- nn->nfsd_serv = NULL;
+- complete(&nn->nfsd_shutdown_complete);
+-}
+-
+ void nfsd_shutdown_threads(struct net *net)
+ {
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+@@ -624,11 +637,10 @@ void nfsd_shutdown_threads(struct net *net)
+
+ svc_get(serv);
+ /* Kill outstanding nfsd threads */
+- serv->sv_ops->svo_setup(serv, NULL, 0);
+- nfsd_destroy(net);
++ svc_set_num_threads(serv, NULL, 0);
++ nfsd_last_thread(net);
++ svc_put(serv);
+ mutex_unlock(&nfsd_mutex);
+- /* Wait for shutdown of nfsd_serv to complete */
+- wait_for_completion(&nn->nfsd_shutdown_complete);
+ }
+
+ bool i_am_nfsd(void)
+@@ -640,6 +652,7 @@ int nfsd_create_serv(struct net *net)
+ {
+ int error;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
++ struct svc_serv *serv;
+
+ WARN_ON(!mutex_is_locked(&nfsd_mutex));
+ if (nn->nfsd_serv) {
+@@ -649,19 +662,19 @@ int nfsd_create_serv(struct net *net)
+ if (nfsd_max_blksize == 0)
+ nfsd_max_blksize = nfsd_get_default_max_blksize();
+ nfsd_reset_versions(nn);
+- nn->nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
+- &nfsd_thread_sv_ops);
+- if (nn->nfsd_serv == NULL)
++ serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
++ if (serv == NULL)
+ return -ENOMEM;
+- init_completion(&nn->nfsd_shutdown_complete);
+
+- nn->nfsd_serv->sv_maxconn = nn->max_connections;
+- error = svc_bind(nn->nfsd_serv, net);
++ serv->sv_maxconn = nn->max_connections;
++ error = svc_bind(serv, net);
+ if (error < 0) {
+- svc_destroy(nn->nfsd_serv);
+- nfsd_complete_shutdown(net);
++ svc_put(serv);
+ return error;
+ }
++ spin_lock(&nfsd_notifier_lock);
++ nn->nfsd_serv = serv;
++ spin_unlock(&nfsd_notifier_lock);
+
+ set_max_drc();
+ /* check if the notifier is already set */
+@@ -671,8 +684,7 @@ int nfsd_create_serv(struct net *net)
+ register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+ #endif
+ }
+- atomic_inc(&nn->ntf_refcnt);
+- nfsd_reset_boot_verifier(nn);
++ nfsd_reset_write_verifier(nn);
+ return 0;
+ }
+
+@@ -699,18 +711,6 @@ int nfsd_get_nrthreads(int n, int *nthreads, struct net *net)
+ return 0;
+ }
+
+-void nfsd_destroy(struct net *net)
+-{
+- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+- int destroy = (nn->nfsd_serv->sv_nrthreads == 1);
+-
+- if (destroy)
+- svc_shutdown_net(nn->nfsd_serv, net);
+- svc_destroy(nn->nfsd_serv);
+- if (destroy)
+- nfsd_complete_shutdown(net);
+-}
+-
+ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
+ {
+ int i = 0;
+@@ -735,7 +735,7 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
+ if (tot > NFSD_MAXSERVS) {
+ /* total too large: scale down requested numbers */
+ for (i = 0; i < n && tot > 0; i++) {
+- int new = nthreads[i] * NFSD_MAXSERVS / tot;
++ int new = nthreads[i] * NFSD_MAXSERVS / tot;
+ tot -= (nthreads[i] - new);
+ nthreads[i] = new;
+ }
+@@ -755,12 +755,13 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
+ /* apply the new numbers */
+ svc_get(nn->nfsd_serv);
+ for (i = 0; i < n; i++) {
+- err = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+- &nn->nfsd_serv->sv_pools[i], nthreads[i]);
++ err = svc_set_num_threads(nn->nfsd_serv,
++ &nn->nfsd_serv->sv_pools[i],
++ nthreads[i]);
+ if (err)
+ break;
+ }
+- nfsd_destroy(net);
++ svc_put(nn->nfsd_serv);
+ return err;
+ }
+
+@@ -775,6 +776,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
+ int error;
+ bool nfsd_up_before;
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
++ struct svc_serv *serv;
+
+ mutex_lock(&nfsd_mutex);
+ dprintk("nfsd: creating service\n");
+@@ -786,7 +788,7 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
+ if (nrservs == 0 && nn->nfsd_serv == NULL)
+ goto out;
+
+- strlcpy(nn->nfsd_name, utsname()->nodename,
++ strscpy(nn->nfsd_name, utsname()->nodename,
+ sizeof(nn->nfsd_name));
+
+ error = nfsd_create_serv(net);
+@@ -794,24 +796,25 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
+ goto out;
+
+ nfsd_up_before = nn->nfsd_net_up;
++ serv = nn->nfsd_serv;
+
+- error = nfsd_startup_net(nrservs, net, cred);
++ error = nfsd_startup_net(net, cred);
+ if (error)
+- goto out_destroy;
+- error = nn->nfsd_serv->sv_ops->svo_setup(nn->nfsd_serv,
+- NULL, nrservs);
++ goto out_put;
++ error = svc_set_num_threads(serv, NULL, nrservs);
+ if (error)
+ goto out_shutdown;
+- /* We are holding a reference to nn->nfsd_serv which
+- * we don't want to count in the return value,
+- * so subtract 1
+- */
+- error = nn->nfsd_serv->sv_nrthreads - 1;
++ error = serv->sv_nrthreads;
++ if (error == 0)
++ nfsd_last_thread(net);
+ out_shutdown:
+ if (error < 0 && !nfsd_up_before)
+ nfsd_shutdown_net(net);
+-out_destroy:
+- nfsd_destroy(net); /* Release server */
++out_put:
++ /* Threads now hold service active */
++ if (xchg(&nn->keep_active, 0))
++ svc_put(serv);
++ svc_put(serv);
+ out:
+ mutex_unlock(&nfsd_mutex);
+ return error;
+@@ -925,9 +928,6 @@ nfsd(void *vrqstp)
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int err;
+
+- /* Lock module and set up kernel thread */
+- mutex_lock(&nfsd_mutex);
+-
+ /* At this point, the thread shares current->fs
+ * with the init process. We need to create files with the
+ * umask as defined by the client instead of init's umask. */
+@@ -938,17 +938,7 @@ nfsd(void *vrqstp)
+
+ current->fs->umask = 0;
+
+- /*
+- * thread is spawned with all signals set to SIG_IGN, re-enable
+- * the ones that will bring down the thread
+- */
+- allow_signal(SIGKILL);
+- allow_signal(SIGHUP);
+- allow_signal(SIGINT);
+- allow_signal(SIGQUIT);
+-
+- nfsdstats.th_cnt++;
+- mutex_unlock(&nfsd_mutex);
++ atomic_inc(&nfsdstats.th_cnt);
+
+ set_freezable();
+
+@@ -972,57 +962,14 @@ nfsd(void *vrqstp)
+ validate_process_creds();
+ }
+
+- /* Clear signals before calling svc_exit_thread() */
+- flush_signals(current);
+-
+- mutex_lock(&nfsd_mutex);
+- nfsdstats.th_cnt --;
++ atomic_dec(&nfsdstats.th_cnt);
+
+ out:
+- rqstp->rq_server = NULL;
+-
+ /* Release the thread */
+ svc_exit_thread(rqstp);
+-
+- nfsd_destroy(net);
+-
+- /* Release module */
+- mutex_unlock(&nfsd_mutex);
+- module_put_and_exit(0);
+ return 0;
+ }
+
+-/*
+- * A write procedure can have a large argument, and a read procedure can
+- * have a large reply, but no NFSv2 or NFSv3 procedure has argument and
+- * reply that can both be larger than a page. The xdr code has taken
+- * advantage of this assumption to be a sloppy about bounds checking in
+- * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that
+- * problem, we enforce these assumptions here:
+- */
+-static bool nfs_request_too_big(struct svc_rqst *rqstp,
+- const struct svc_procedure *proc)
+-{
+- /*
+- * The ACL code has more careful bounds-checking and is not
+- * susceptible to this problem:
+- */
+- if (rqstp->rq_prog != NFS_PROGRAM)
+- return false;
+- /*
+- * Ditto NFSv4 (which can in theory have argument and reply both
+- * more than a page):
+- */
+- if (rqstp->rq_vers >= 4)
+- return false;
+- /* The reply will be small, we're OK: */
+- if (proc->pc_xdrressize > 0 &&
+- proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
+- return false;
+-
+- return rqstp->rq_arg.len > PAGE_SIZE;
+-}
+-
+ /**
+ * nfsd_dispatch - Process an NFS or NFSACL Request
+ * @rqstp: incoming request
+@@ -1037,22 +984,15 @@ static bool nfs_request_too_big(struct svc_rqst *rqstp,
+ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+ {
+ const struct svc_procedure *proc = rqstp->rq_procinfo;
+- struct kvec *argv = &rqstp->rq_arg.head[0];
+- struct kvec *resv = &rqstp->rq_res.head[0];
+- __be32 *p;
+-
+- dprintk("nfsd_dispatch: vers %d proc %d\n",
+- rqstp->rq_vers, rqstp->rq_proc);
+-
+- if (nfs_request_too_big(rqstp, proc))
+- goto out_too_large;
+
+ /*
+ * Give the xdr decoder a chance to change this if it wants
+ * (necessary in the NFSv4.0 compound case)
+ */
+ rqstp->rq_cachetype = proc->pc_cachetype;
+- if (!proc->pc_decode(rqstp, argv->iov_base))
++
++ svcxdr_init_decode(rqstp);
++ if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream))
+ goto out_decode_err;
+
+ switch (nfsd_cache_lookup(rqstp)) {
+@@ -1068,43 +1008,64 @@ int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+ * Need to grab the location to store the status, as
+ * NFSv4 does some encoding while processing
+ */
+- p = resv->iov_base + resv->iov_len;
+- resv->iov_len += sizeof(__be32);
++ svcxdr_init_encode(rqstp);
+
+ *statp = proc->pc_func(rqstp);
+- if (*statp == rpc_drop_reply || test_bit(RQ_DROPME, &rqstp->rq_flags))
++ if (test_bit(RQ_DROPME, &rqstp->rq_flags))
+ goto out_update_drop;
+
+- if (!proc->pc_encode(rqstp, p))
++ if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream))
+ goto out_encode_err;
+
+ nfsd_cache_update(rqstp, rqstp->rq_cachetype, statp + 1);
+ out_cached_reply:
+ return 1;
+
+-out_too_large:
+- dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+- *statp = rpc_garbage_args;
+- return 1;
+-
+ out_decode_err:
+- dprintk("nfsd: failed to decode arguments!\n");
++ trace_nfsd_garbage_args_err(rqstp);
+ *statp = rpc_garbage_args;
+ return 1;
+
+ out_update_drop:
+- dprintk("nfsd: Dropping request; may be revisited later\n");
+ nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ out_dropit:
+ return 0;
+
+ out_encode_err:
+- dprintk("nfsd: failed to encode result!\n");
++ trace_nfsd_cant_encode_err(rqstp);
+ nfsd_cache_update(rqstp, RC_NOCACHE, NULL);
+ *statp = rpc_system_err;
+ return 1;
+ }
+
++/**
++ * nfssvc_decode_voidarg - Decode void arguments
++ * @rqstp: Server RPC transaction context
++ * @xdr: XDR stream positioned at arguments to decode
++ *
++ * Return values:
++ * %false: Arguments were not valid
++ * %true: Decoding was successful
++ */
++bool nfssvc_decode_voidarg(struct svc_rqst *rqstp, struct xdr_stream *xdr)
++{
++ return true;
++}
++
++/**
++ * nfssvc_encode_voidres - Encode void results
++ * @rqstp: Server RPC transaction context
++ * @xdr: XDR stream into which to encode results
++ *
++ * Return values:
++ * %false: Local error while encoding
++ * %true: Encoding was successful
++ */
++bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
++{
++ return true;
++}
++
+ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+ {
+ int ret;
+@@ -1115,7 +1076,6 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+ mutex_unlock(&nfsd_mutex);
+ return -ENODEV;
+ }
+- /* bump up the psudo refcount while traversing */
+ svc_get(nn->nfsd_serv);
+ ret = svc_pool_stats_open(nn->nfsd_serv, file);
+ mutex_unlock(&nfsd_mutex);
+@@ -1124,12 +1084,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
+
+ int nfsd_pool_stats_release(struct inode *inode, struct file *file)
+ {
++ struct seq_file *seq = file->private_data;
++ struct svc_serv *serv = seq->private;
+ int ret = seq_release(inode, file);
+- struct net *net = inode->i_sb->s_fs_info;
+
+ mutex_lock(&nfsd_mutex);
+- /* this function really, really should have been called svc_put() */
+- nfsd_destroy(net);
++ svc_put(serv);
+ mutex_unlock(&nfsd_mutex);
+ return ret;
+ }
+diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
+index 8a288c8fcd57c..caf6355b18fa9 100644
+--- a/fs/nfsd/nfsxdr.c
++++ b/fs/nfsd/nfsxdr.c
+@@ -9,12 +9,10 @@
+ #include "xdr.h"
+ #include "auth.h"
+
+-#define NFSDDBG_FACILITY NFSDDBG_XDR
+-
+ /*
+ * Mapping of S_IF* types to NFS file types
+ */
+-static u32 nfs_ftypes[] = {
++static const u32 nfs_ftypes[] = {
+ NFNON, NFCHR, NFCHR, NFBAD,
+ NFDIR, NFBAD, NFBLK, NFBAD,
+ NFREG, NFBAD, NFLNK, NFBAD,
+@@ -23,93 +21,168 @@ static u32 nfs_ftypes[] = {
+
+
+ /*
+- * XDR functions for basic NFS types
++ * Basic NFSv2 data types (RFC 1094 Section 2.3)
+ */
+-static __be32 *
+-decode_fh(__be32 *p, struct svc_fh *fhp)
++
++/**
++ * svcxdr_encode_stat - Encode an NFSv2 status code
++ * @xdr: XDR stream
++ * @status: status value to encode
++ *
++ * Return values:
++ * %false: Send buffer space was exhausted
++ * %true: Success
++ */
++bool
++svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status)
+ {
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, sizeof(status));
++ if (!p)
++ return false;
++ *p = status;
++
++ return true;
++}
++
++/**
++ * svcxdr_decode_fhandle - Decode an NFSv2 file handle
++ * @xdr: XDR stream positioned at an encoded NFSv2 FH
++ * @fhp: OUT: filled-in server file handle
++ *
++ * Return values:
++ * %false: The encoded file handle was not valid
++ * %true: @fhp has been initialized
++ */
++bool
++svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp)
++{
++ __be32 *p;
++
++ p = xdr_inline_decode(xdr, NFS_FHSIZE);
++ if (!p)
++ return false;
+ fh_init(fhp, NFS_FHSIZE);
+- memcpy(&fhp->fh_handle.fh_base, p, NFS_FHSIZE);
++ memcpy(&fhp->fh_handle.fh_raw, p, NFS_FHSIZE);
+ fhp->fh_handle.fh_size = NFS_FHSIZE;
+
+- /* FIXME: Look up export pointer here and verify
+- * Sun Secure RPC if requested */
+- return p + (NFS_FHSIZE >> 2);
++ return true;
+ }
+
+-/* Helper function for NFSv2 ACL code */
+-__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp)
++static bool
++svcxdr_encode_fhandle(struct xdr_stream *xdr, const struct svc_fh *fhp)
+ {
+- return decode_fh(p, fhp);
++ __be32 *p;
++
++ p = xdr_reserve_space(xdr, NFS_FHSIZE);
++ if (!p)
++ return false;
++ memcpy(p, &fhp->fh_handle.fh_raw, NFS_FHSIZE);
++
++ return true;
+ }
+
+ static __be32 *
+-encode_fh(__be32 *p, struct svc_fh *fhp)
++encode_timeval(__be32 *p, const struct timespec64 *time)
+ {
+- memcpy(p, &fhp->fh_handle.fh_base, NFS_FHSIZE);
+- return p + (NFS_FHSIZE>> 2);
++ *p++ = cpu_to_be32((u32)time->tv_sec);
++ if (time->tv_nsec)
++ *p++ = cpu_to_be32(time->tv_nsec / NSEC_PER_USEC);
++ else
++ *p++ = xdr_zero;
++ return p;
+ }
+
+-/*
+- * Decode a file name and make sure that the path contains
+- * no slashes or null bytes.
+- */
+-static __be32 *
+-decode_filename(__be32 *p, char **namp, unsigned int *lenp)
++static bool
++svcxdr_decode_filename(struct xdr_stream *xdr, char **name, unsigned int *len)
+ {
+- char *name;
+- unsigned int i;
+-
+- if ((p = xdr_decode_string_inplace(p, namp, lenp, NFS_MAXNAMLEN)) != NULL) {
+- for (i = 0, name = *namp; i < *lenp; i++, name++) {
+- if (*name == '\0' || *name == '/')
+- return NULL;
+- }
+- }
++ u32 size, i;
++ __be32 *p;
++ char *c;
++
++ if (xdr_stream_decode_u32(xdr, &size) < 0)
++ return false;
++ if (size == 0 || size > NFS_MAXNAMLEN)
++ return false;
++ p = xdr_inline_decode(xdr, size);
++ if (!p)
++ return false;
+
+- return p;
++ *len = size;
++ *name = (char *)p;
++ for (i = 0, c = *name; i < size; i++, c++)
++ if (*c == '\0' || *c == '/')
++ return false;
++
++ return true;
+ }
+
+-static __be32 *
+-decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns)
++static bool
++svcxdr_decode_diropargs(struct xdr_stream *xdr, struct svc_fh *fhp,
++ char **name, unsigned int *len)
++{
++ return svcxdr_decode_fhandle(xdr, fhp) &&
++ svcxdr_decode_filename(xdr, name, len);
++}
++
++static bool
++svcxdr_decode_sattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ struct iattr *iap)
+ {
+- u32 tmp, tmp1;
++ u32 tmp1, tmp2;
++ __be32 *p;
++
++ p = xdr_inline_decode(xdr, XDR_UNIT * 8);
++ if (!p)
++ return false;
+
+ iap->ia_valid = 0;
+
+- /* Sun client bug compatibility check: some sun clients seem to
+- * put 0xffff in the mode field when they mean 0xffffffff.
+- * Quoting the 4.4BSD nfs server code: Nah nah nah nah na nah.
++ /*
++ * Some Sun clients put 0xffff in the mode field when they
++ * mean 0xffffffff.
+ */
+- if ((tmp = ntohl(*p++)) != (u32)-1 && tmp != 0xffff) {
++ tmp1 = be32_to_cpup(p++);
++ if (tmp1 != (u32)-1 && tmp1 != 0xffff) {
+ iap->ia_valid |= ATTR_MODE;
+- iap->ia_mode = tmp;
++ iap->ia_mode = tmp1;
+ }
+- if ((tmp = ntohl(*p++)) != (u32)-1) {
+- iap->ia_uid = make_kuid(userns, tmp);
++
++ tmp1 = be32_to_cpup(p++);
++ if (tmp1 != (u32)-1) {
++ iap->ia_uid = make_kuid(nfsd_user_namespace(rqstp), tmp1);
+ if (uid_valid(iap->ia_uid))
+ iap->ia_valid |= ATTR_UID;
+ }
+- if ((tmp = ntohl(*p++)) != (u32)-1) {
+- iap->ia_gid = make_kgid(userns, tmp);
++
++ tmp1 = be32_to_cpup(p++);
++ if (tmp1 != (u32)-1) {
++ iap->ia_gid = make_kgid(nfsd_user_namespace(rqstp), tmp1);
+ if (gid_valid(iap->ia_gid))
+ iap->ia_valid |= ATTR_GID;
+ }
+- if ((tmp = ntohl(*p++)) != (u32)-1) {
++
++ tmp1 = be32_to_cpup(p++);
++ if (tmp1 != (u32)-1) {
+ iap->ia_valid |= ATTR_SIZE;
+- iap->ia_size = tmp;
++ iap->ia_size = tmp1;
+ }
+- tmp = ntohl(*p++); tmp1 = ntohl(*p++);
+- if (tmp != (u32)-1 && tmp1 != (u32)-1) {
++
++ tmp1 = be32_to_cpup(p++);
++ tmp2 = be32_to_cpup(p++);
++ if (tmp1 != (u32)-1 && tmp2 != (u32)-1) {
+ iap->ia_valid |= ATTR_ATIME | ATTR_ATIME_SET;
+- iap->ia_atime.tv_sec = tmp;
+- iap->ia_atime.tv_nsec = tmp1 * 1000;
++ iap->ia_atime.tv_sec = tmp1;
++ iap->ia_atime.tv_nsec = tmp2 * NSEC_PER_USEC;
+ }
+- tmp = ntohl(*p++); tmp1 = ntohl(*p++);
+- if (tmp != (u32)-1 && tmp1 != (u32)-1) {
++
++ tmp1 = be32_to_cpup(p++);
++ tmp2 = be32_to_cpup(p++);
++ if (tmp1 != (u32)-1 && tmp2 != (u32)-1) {
+ iap->ia_valid |= ATTR_MTIME | ATTR_MTIME_SET;
+- iap->ia_mtime.tv_sec = tmp;
+- iap->ia_mtime.tv_nsec = tmp1 * 1000;
++ iap->ia_mtime.tv_sec = tmp1;
++ iap->ia_mtime.tv_nsec = tmp2 * NSEC_PER_USEC;
+ /*
+ * Passing the invalid value useconds=1000000 for mtime
+ * is a Sun convention for "set both mtime and atime to
+@@ -119,476 +192,447 @@ decode_sattr(__be32 *p, struct iattr *iap, struct user_namespace *userns)
+ * sattr in section 6.1 of "NFS Illustrated" by
+ * Brent Callaghan, Addison-Wesley, ISBN 0-201-32750-5
+ */
+- if (tmp1 == 1000000)
++ if (tmp2 == 1000000)
+ iap->ia_valid &= ~(ATTR_ATIME_SET|ATTR_MTIME_SET);
+ }
+- return p;
++
++ return true;
+ }
+
+-static __be32 *
+-encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp,
+- struct kstat *stat)
++/**
++ * svcxdr_encode_fattr - Encode NFSv2 file attributes
++ * @rqstp: Context of a completed RPC transaction
++ * @xdr: XDR stream
++ * @fhp: File handle to encode
++ * @stat: Attributes to encode
++ *
++ * Return values:
++ * %false: Send buffer space was exhausted
++ * %true: Success
++ */
++bool
++svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ const struct svc_fh *fhp, const struct kstat *stat)
+ {
+ struct user_namespace *userns = nfsd_user_namespace(rqstp);
+- struct dentry *dentry = fhp->fh_dentry;
+- int type;
++ struct dentry *dentry = fhp->fh_dentry;
++ int type = stat->mode & S_IFMT;
+ struct timespec64 time;
+- u32 f;
++ __be32 *p;
++ u32 fsid;
+
+- type = (stat->mode & S_IFMT);
++ p = xdr_reserve_space(xdr, XDR_UNIT * 17);
++ if (!p)
++ return false;
+
+- *p++ = htonl(nfs_ftypes[type >> 12]);
+- *p++ = htonl((u32) stat->mode);
+- *p++ = htonl((u32) stat->nlink);
+- *p++ = htonl((u32) from_kuid_munged(userns, stat->uid));
+- *p++ = htonl((u32) from_kgid_munged(userns, stat->gid));
++ *p++ = cpu_to_be32(nfs_ftypes[type >> 12]);
++ *p++ = cpu_to_be32((u32)stat->mode);
++ *p++ = cpu_to_be32((u32)stat->nlink);
++ *p++ = cpu_to_be32((u32)from_kuid_munged(userns, stat->uid));
++ *p++ = cpu_to_be32((u32)from_kgid_munged(userns, stat->gid));
+
+- if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN) {
+- *p++ = htonl(NFS_MAXPATHLEN);
+- } else {
+- *p++ = htonl((u32) stat->size);
+- }
+- *p++ = htonl((u32) stat->blksize);
++ if (S_ISLNK(type) && stat->size > NFS_MAXPATHLEN)
++ *p++ = cpu_to_be32(NFS_MAXPATHLEN);
++ else
++ *p++ = cpu_to_be32((u32) stat->size);
++ *p++ = cpu_to_be32((u32) stat->blksize);
+ if (S_ISCHR(type) || S_ISBLK(type))
+- *p++ = htonl(new_encode_dev(stat->rdev));
++ *p++ = cpu_to_be32(new_encode_dev(stat->rdev));
+ else
+- *p++ = htonl(0xffffffff);
+- *p++ = htonl((u32) stat->blocks);
++ *p++ = cpu_to_be32(0xffffffff);
++ *p++ = cpu_to_be32((u32)stat->blocks);
++
+ switch (fsid_source(fhp)) {
+- default:
+- case FSIDSOURCE_DEV:
+- *p++ = htonl(new_encode_dev(stat->dev));
+- break;
+ case FSIDSOURCE_FSID:
+- *p++ = htonl((u32) fhp->fh_export->ex_fsid);
++ fsid = (u32)fhp->fh_export->ex_fsid;
+ break;
+ case FSIDSOURCE_UUID:
+- f = ((u32*)fhp->fh_export->ex_uuid)[0];
+- f ^= ((u32*)fhp->fh_export->ex_uuid)[1];
+- f ^= ((u32*)fhp->fh_export->ex_uuid)[2];
+- f ^= ((u32*)fhp->fh_export->ex_uuid)[3];
+- *p++ = htonl(f);
++ fsid = ((u32 *)fhp->fh_export->ex_uuid)[0];
++ fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[1];
++ fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[2];
++ fsid ^= ((u32 *)fhp->fh_export->ex_uuid)[3];
++ break;
++ default:
++ fsid = new_encode_dev(stat->dev);
+ break;
+ }
+- *p++ = htonl((u32) stat->ino);
+- *p++ = htonl((u32) stat->atime.tv_sec);
+- *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0);
++ *p++ = cpu_to_be32(fsid);
++
++ *p++ = cpu_to_be32((u32)stat->ino);
++ p = encode_timeval(p, &stat->atime);
+ time = stat->mtime;
+- lease_get_mtime(d_inode(dentry), &time);
+- *p++ = htonl((u32) time.tv_sec);
+- *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0);
+- *p++ = htonl((u32) stat->ctime.tv_sec);
+- *p++ = htonl(stat->ctime.tv_nsec ? stat->ctime.tv_nsec / 1000 : 0);
++ lease_get_mtime(d_inode(dentry), &time);
++ p = encode_timeval(p, &time);
++ encode_timeval(p, &stat->ctime);
+
+- return p;
+-}
+-
+-/* Helper function for NFSv2 ACL code */
+-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat)
+-{
+- return encode_fattr(rqstp, p, fhp, stat);
++ return true;
+ }
+
+ /*
+ * XDR decode functions
+ */
+-int
+-nfssvc_decode_void(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return xdr_argsize_check(rqstp, p);
+-}
+
+-int
+-nfssvc_decode_fhandle(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_fhandle *args = rqstp->rq_argp;
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_fhandle(xdr, &args->fh);
+ }
+
+-int
+-nfssvc_decode_sattrargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_sattrargs *args = rqstp->rq_argp;
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_fhandle(xdr, &args->fh) &&
++ svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
+ }
+
+-int
+-nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_diropargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->fh))
+- || !(p = decode_filename(p, &args->name, &args->len)))
+- return 0;
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_diropargs(xdr, &args->fh, &args->name, &args->len);
+ }
+
+-int
+-nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_readargs *args = rqstp->rq_argp;
+- unsigned int len;
+- int v;
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+-
+- args->offset = ntohl(*p++);
+- len = args->count = ntohl(*p++);
+- p++; /* totalcount - unused */
+-
+- len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2);
+-
+- /* set up somewhere to store response.
+- * We take pages, put them on reslist and include in iovec
+- */
+- v=0;
+- while (len > 0) {
+- struct page *p = *(rqstp->rq_next_page++);
+-
+- rqstp->rq_vec[v].iov_base = page_address(p);
+- rqstp->rq_vec[v].iov_len = min_t(unsigned int, len, PAGE_SIZE);
+- len -= rqstp->rq_vec[v].iov_len;
+- v++;
+- }
+- args->vlen = v;
+- return xdr_argsize_check(rqstp, p);
++ u32 totalcount;
++
++ if (!svcxdr_decode_fhandle(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
++ return false;
++ /* totalcount is ignored */
++ if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
++ return false;
++
++ return true;
+ }
+
+-int
+-nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_writeargs *args = rqstp->rq_argp;
+- unsigned int len, hdr, dlen;
+- struct kvec *head = rqstp->rq_arg.head;
+-
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+-
+- p++; /* beginoffset */
+- args->offset = ntohl(*p++); /* offset */
+- p++; /* totalcount */
+- len = args->len = ntohl(*p++);
+- /*
+- * The protocol specifies a maximum of 8192 bytes.
+- */
+- if (len > NFSSVC_MAXBLKSIZE_V2)
+- return 0;
+-
+- /*
+- * Check to make sure that we got the right number of
+- * bytes.
+- */
+- hdr = (void*)p - head->iov_base;
+- if (hdr > head->iov_len)
+- return 0;
+- dlen = head->iov_len + rqstp->rq_arg.page_len - hdr;
+-
+- /*
+- * Round the length of the data which was specified up to
+- * the next multiple of XDR units and then compare that
+- * against the length which was actually received.
+- * Note that when RPCSEC/GSS (for example) is used, the
+- * data buffer can be padded so dlen might be larger
+- * than required. It must never be smaller.
+- */
+- if (dlen < XDR_QUADLEN(len)*4)
+- return 0;
+-
+- args->first.iov_base = (void *)p;
+- args->first.iov_len = head->iov_len - hdr;
+- return 1;
++ u32 beginoffset, totalcount;
++
++ if (!svcxdr_decode_fhandle(xdr, &args->fh))
++ return false;
++ /* beginoffset is ignored */
++ if (xdr_stream_decode_u32(xdr, &beginoffset) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->offset) < 0)
++ return false;
++ /* totalcount is ignored */
++ if (xdr_stream_decode_u32(xdr, &totalcount) < 0)
++ return false;
++
++ /* opaque data */
++ if (xdr_stream_decode_u32(xdr, &args->len) < 0)
++ return false;
++ if (args->len > NFSSVC_MAXBLKSIZE_V2)
++ return false;
++
++ return xdr_stream_subsegment(xdr, &args->payload, args->len);
+ }
+
+-int
+-nfssvc_decode_createargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_createargs *args = rqstp->rq_argp;
+
+- if ( !(p = decode_fh(p, &args->fh))
+- || !(p = decode_filename(p, &args->name, &args->len)))
+- return 0;
+- p = decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_diropargs(xdr, &args->fh,
++ &args->name, &args->len) &&
++ svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
+ }
+
+-int
+-nfssvc_decode_renameargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_renameargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->ffh))
+- || !(p = decode_filename(p, &args->fname, &args->flen))
+- || !(p = decode_fh(p, &args->tfh))
+- || !(p = decode_filename(p, &args->tname, &args->tlen)))
+- return 0;
+-
+- return xdr_argsize_check(rqstp, p);
+-}
+-
+-int
+-nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p)
+-{
+- struct nfsd_readlinkargs *args = rqstp->rq_argp;
+-
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- args->buffer = page_address(*(rqstp->rq_next_page++));
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_diropargs(xdr, &args->ffh,
++ &args->fname, &args->flen) &&
++ svcxdr_decode_diropargs(xdr, &args->tfh,
++ &args->tname, &args->tlen);
+ }
+
+-int
+-nfssvc_decode_linkargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_linkargs *args = rqstp->rq_argp;
+
+- if (!(p = decode_fh(p, &args->ffh))
+- || !(p = decode_fh(p, &args->tfh))
+- || !(p = decode_filename(p, &args->tname, &args->tlen)))
+- return 0;
+-
+- return xdr_argsize_check(rqstp, p);
++ return svcxdr_decode_fhandle(xdr, &args->ffh) &&
++ svcxdr_decode_diropargs(xdr, &args->tfh,
++ &args->tname, &args->tlen);
+ }
+
+-int
+-nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_symlinkargs *args = rqstp->rq_argp;
+- char *base = (char *)p;
+- size_t xdrlen;
+-
+- if ( !(p = decode_fh(p, &args->ffh))
+- || !(p = decode_filename(p, &args->fname, &args->flen)))
+- return 0;
++ struct kvec *head = rqstp->rq_arg.head;
+
+- args->tlen = ntohl(*p++);
++ if (!svcxdr_decode_diropargs(xdr, &args->ffh, &args->fname, &args->flen))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->tlen) < 0)
++ return false;
+ if (args->tlen == 0)
+- return 0;
+-
+- args->first.iov_base = p;
+- args->first.iov_len = rqstp->rq_arg.head[0].iov_len;
+- args->first.iov_len -= (char *)p - base;
++ return false;
+
+- /* This request is never larger than a page. Therefore,
+- * transport will deliver either:
+- * 1. pathname in the pagelist -> sattr is in the tail.
+- * 2. everything in the head buffer -> sattr is in the head.
+- */
+- if (rqstp->rq_arg.page_len) {
+- if (args->tlen != rqstp->rq_arg.page_len)
+- return 0;
+- p = rqstp->rq_arg.tail[0].iov_base;
+- } else {
+- xdrlen = XDR_QUADLEN(args->tlen);
+- if (xdrlen > args->first.iov_len - (8 * sizeof(__be32)))
+- return 0;
+- p += xdrlen;
+- }
+- decode_sattr(p, &args->attrs, nfsd_user_namespace(rqstp));
+-
+- return 1;
++ args->first.iov_len = head->iov_len - xdr_stream_pos(xdr);
++ args->first.iov_base = xdr_inline_decode(xdr, args->tlen);
++ if (!args->first.iov_base)
++ return false;
++ return svcxdr_decode_sattr(rqstp, xdr, &args->attrs);
+ }
+
+-int
+-nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_readdirargs *args = rqstp->rq_argp;
+
+- p = decode_fh(p, &args->fh);
+- if (!p)
+- return 0;
+- args->cookie = ntohl(*p++);
+- args->count = ntohl(*p++);
+- args->count = min_t(u32, args->count, PAGE_SIZE);
+- args->buffer = page_address(*(rqstp->rq_next_page++));
++ if (!svcxdr_decode_fhandle(xdr, &args->fh))
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->cookie) < 0)
++ return false;
++ if (xdr_stream_decode_u32(xdr, &args->count) < 0)
++ return false;
+
+- return xdr_argsize_check(rqstp, p);
++ return true;
+ }
+
+ /*
+ * XDR encode functions
+ */
+-int
+-nfssvc_encode_void(struct svc_rqst *rqstp, __be32 *p)
+-{
+- return xdr_ressize_check(rqstp, p);
+-}
+
+-int
+-nfssvc_encode_stat(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_stat *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- return xdr_ressize_check(rqstp, p);
++ return svcxdr_encode_stat(xdr, resp->status);
+ }
+
+-int
+-nfssvc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_attrstat *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- goto out;
+- p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+-out:
+- return xdr_ressize_check(rqstp, p);
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
++ return false;
++ break;
++ }
++
++ return true;
+ }
+
+-int
+-nfssvc_encode_diropres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_diropres *resp = rqstp->rq_resp;
+
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- goto out;
+- p = encode_fh(p, &resp->fh);
+- p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+-out:
+- return xdr_ressize_check(rqstp, p);
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_fhandle(xdr, &resp->fh))
++ return false;
++ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
++ return false;
++ break;
++ }
++
++ return true;
+ }
+
+-int
+-nfssvc_encode_readlinkres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_readlinkres *resp = rqstp->rq_resp;
+-
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- return xdr_ressize_check(rqstp, p);
+-
+- *p++ = htonl(resp->len);
+- xdr_ressize_check(rqstp, p);
+- rqstp->rq_res.page_len = resp->len;
+- if (resp->len & 3) {
+- /* need to pad the tail */
+- rqstp->rq_res.tail[0].iov_base = p;
+- *p = 0;
+- rqstp->rq_res.tail[0].iov_len = 4 - (resp->len&3);
++ struct kvec *head = rqstp->rq_res.head;
++
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (xdr_stream_encode_u32(xdr, resp->len) < 0)
++ return false;
++ xdr_write_pages(xdr, &resp->page, 0, resp->len);
++ if (svc_encode_result_payload(rqstp, head->iov_len, resp->len) < 0)
++ return false;
++ break;
+ }
+- return 1;
++
++ return true;
+ }
+
+-int
+-nfssvc_encode_readres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_readres *resp = rqstp->rq_resp;
+-
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- return xdr_ressize_check(rqstp, p);
+-
+- p = encode_fattr(rqstp, p, &resp->fh, &resp->stat);
+- *p++ = htonl(resp->count);
+- xdr_ressize_check(rqstp, p);
+-
+- /* now update rqstp->rq_res to reflect data as well */
+- rqstp->rq_res.page_len = resp->count;
+- if (resp->count & 3) {
+- /* need to pad the tail */
+- rqstp->rq_res.tail[0].iov_base = p;
+- *p = 0;
+- rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3);
++ struct kvec *head = rqstp->rq_res.head;
++
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ if (!svcxdr_encode_fattr(rqstp, xdr, &resp->fh, &resp->stat))
++ return false;
++ if (xdr_stream_encode_u32(xdr, resp->count) < 0)
++ return false;
++ xdr_write_pages(xdr, resp->pages, rqstp->rq_res.page_base,
++ resp->count);
++ if (svc_encode_result_payload(rqstp, head->iov_len, resp->count) < 0)
++ return false;
++ break;
+ }
+- return 1;
++
++ return true;
+ }
+
+-int
+-nfssvc_encode_readdirres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_readdirres *resp = rqstp->rq_resp;
++ struct xdr_buf *dirlist = &resp->dirlist;
++
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ xdr_write_pages(xdr, dirlist->pages, 0, dirlist->len);
++ /* no more entries */
++ if (xdr_stream_encode_item_absent(xdr) < 0)
++ return false;
++ if (xdr_stream_encode_bool(xdr, resp->common.err == nfserr_eof) < 0)
++ return false;
++ break;
++ }
+
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- return xdr_ressize_check(rqstp, p);
+-
+- xdr_ressize_check(rqstp, p);
+- p = resp->buffer;
+- *p++ = 0; /* no more entries */
+- *p++ = htonl((resp->common.err == nfserr_eof));
+- rqstp->rq_res.page_len = (((unsigned long)p-1) & ~PAGE_MASK)+1;
+-
+- return 1;
++ return true;
+ }
+
+-int
+-nfssvc_encode_statfsres(struct svc_rqst *rqstp, __be32 *p)
++bool
++nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
+ {
+ struct nfsd_statfsres *resp = rqstp->rq_resp;
+ struct kstatfs *stat = &resp->stats;
++ __be32 *p;
++
++ if (!svcxdr_encode_stat(xdr, resp->status))
++ return false;
++ switch (resp->status) {
++ case nfs_ok:
++ p = xdr_reserve_space(xdr, XDR_UNIT * 5);
++ if (!p)
++ return false;
++ *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2);
++ *p++ = cpu_to_be32(stat->f_bsize);
++ *p++ = cpu_to_be32(stat->f_blocks);
++ *p++ = cpu_to_be32(stat->f_bfree);
++ *p = cpu_to_be32(stat->f_bavail);
++ break;
++ }
+
+- *p++ = resp->status;
+- if (resp->status != nfs_ok)
+- return xdr_ressize_check(rqstp, p);
++ return true;
++}
+
+- *p++ = htonl(NFSSVC_MAXBLKSIZE_V2); /* max transfer size */
+- *p++ = htonl(stat->f_bsize);
+- *p++ = htonl(stat->f_blocks);
+- *p++ = htonl(stat->f_bfree);
+- *p++ = htonl(stat->f_bavail);
+- return xdr_ressize_check(rqstp, p);
++/**
++ * nfssvc_encode_nfscookie - Encode a directory offset cookie
++ * @resp: readdir result context
++ * @offset: offset cookie to encode
++ *
++ * The buffer space for the offset cookie has already been reserved
++ * by svcxdr_encode_entry_common().
++ */
++void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset)
++{
++ __be32 cookie = cpu_to_be32(offset);
++
++ if (!resp->cookie_offset)
++ return;
++
++ write_bytes_to_xdr_buf(&resp->dirlist, resp->cookie_offset, &cookie,
++ sizeof(cookie));
++ resp->cookie_offset = 0;
+ }
+
+-int
+-nfssvc_encode_entry(void *ccdv, const char *name,
+- int namlen, loff_t offset, u64 ino, unsigned int d_type)
++static bool
++svcxdr_encode_entry_common(struct nfsd_readdirres *resp, const char *name,
++ int namlen, loff_t offset, u64 ino)
+ {
+- struct readdir_cd *ccd = ccdv;
+- struct nfsd_readdirres *cd = container_of(ccd, struct nfsd_readdirres, common);
+- __be32 *p = cd->buffer;
+- int buflen, slen;
++ struct xdr_buf *dirlist = &resp->dirlist;
++ struct xdr_stream *xdr = &resp->xdr;
++
++ if (xdr_stream_encode_item_present(xdr) < 0)
++ return false;
++ /* fileid */
++ if (xdr_stream_encode_u32(xdr, (u32)ino) < 0)
++ return false;
++ /* name */
++ if (xdr_stream_encode_opaque(xdr, name, min(namlen, NFS2_MAXNAMLEN)) < 0)
++ return false;
++ /* cookie */
++ resp->cookie_offset = dirlist->len;
++ if (xdr_stream_encode_u32(xdr, ~0U) < 0)
++ return false;
++
++ return true;
++}
+
+- /*
+- dprintk("nfsd: entry(%.*s off %ld ino %ld)\n",
+- namlen, name, offset, ino);
+- */
++/**
++ * nfssvc_encode_entry - encode one NFSv2 READDIR entry
++ * @data: directory context
++ * @name: name of the object to be encoded
++ * @namlen: length of that name, in bytes
++ * @offset: the offset of the previous entry
++ * @ino: the fileid of this entry
++ * @d_type: unused
++ *
++ * Return values:
++ * %0: Entry was successfully encoded.
++ * %-EINVAL: An encoding problem occured, secondary status code in resp->common.err
++ *
++ * On exit, the following fields are updated:
++ * - resp->xdr
++ * - resp->common.err
++ * - resp->cookie_offset
++ */
++int nfssvc_encode_entry(void *data, const char *name, int namlen,
++ loff_t offset, u64 ino, unsigned int d_type)
++{
++ struct readdir_cd *ccd = data;
++ struct nfsd_readdirres *resp = container_of(ccd,
++ struct nfsd_readdirres,
++ common);
++ unsigned int starting_length = resp->dirlist.len;
+
+- if (offset > ~((u32) 0)) {
+- cd->common.err = nfserr_fbig;
+- return -EINVAL;
+- }
+- if (cd->offset)
+- *cd->offset = htonl(offset);
++ /* The offset cookie for the previous entry */
++ nfssvc_encode_nfscookie(resp, offset);
+
+- /* truncate filename */
+- namlen = min(namlen, NFS2_MAXNAMLEN);
+- slen = XDR_QUADLEN(namlen);
++ if (!svcxdr_encode_entry_common(resp, name, namlen, offset, ino))
++ goto out_toosmall;
+
+- if ((buflen = cd->buflen - slen - 4) < 0) {
+- cd->common.err = nfserr_toosmall;
+- return -EINVAL;
+- }
+- if (ino > ~((u32) 0)) {
+- cd->common.err = nfserr_fbig;
+- return -EINVAL;
+- }
+- *p++ = xdr_one; /* mark entry present */
+- *p++ = htonl((u32) ino); /* file id */
+- p = xdr_encode_array(p, name, namlen);/* name length & name */
+- cd->offset = p; /* remember pointer */
+- *p++ = htonl(~0U); /* offset of next entry */
+-
+- cd->buflen = buflen;
+- cd->buffer = p;
+- cd->common.err = nfs_ok;
++ xdr_commit_encode(&resp->xdr);
++ resp->common.err = nfs_ok;
+ return 0;
++
++out_toosmall:
++ resp->cookie_offset = 0;
++ resp->common.err = nfserr_toosmall;
++ resp->dirlist.len = starting_length;
++ return -EINVAL;
+ }
+
+ /*
+diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
+index 9eae11a9d21ca..e94634d305912 100644
+--- a/fs/nfsd/state.h
++++ b/fs/nfsd/state.h
+@@ -57,11 +57,11 @@ typedef struct {
+ } stateid_t;
+
+ typedef struct {
+- stateid_t stid;
++ stateid_t cs_stid;
+ #define NFS4_COPY_STID 1
+ #define NFS4_COPYNOTIFY_STID 2
+- unsigned char sc_type;
+- refcount_t sc_count;
++ unsigned char cs_type;
++ refcount_t cs_count;
+ } copy_stateid_t;
+
+ struct nfsd4_callback {
+@@ -149,6 +149,7 @@ struct nfs4_delegation {
+ /* For recall: */
+ int dl_retries;
+ struct nfsd4_callback dl_recall;
++ bool dl_recalled;
+ };
+
+ #define cb_to_delegation(cb) \
+@@ -174,7 +175,7 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s)
+ /* Maximum number of slots per session. 160 is useful for long haul TCP */
+ #define NFSD_MAX_SLOTS_PER_SESSION 160
+ /* Maximum number of operations per session compound */
+-#define NFSD_MAX_OPS_PER_COMPOUND 16
++#define NFSD_MAX_OPS_PER_COMPOUND 50
+ /* Maximum session per slot cache size */
+ #define NFSD_SLOT_CACHE_SIZE 2048
+ /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */
+@@ -282,6 +283,28 @@ struct nfsd4_sessionid {
+
+ #define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */
+
++/*
++ * State Meaning Where set
++ * --------------------------------------------------------------------------
++ * | NFSD4_ACTIVE | Confirmed, active | Default |
++ * |------------------- ----------------------------------------------------|
++ * | NFSD4_COURTESY | Courtesy state. | nfs4_get_client_reaplist |
++ * | | Lease/lock/share | |
++ * | | reservation conflict | |
++ * | | can cause Courtesy | |
++ * | | client to be expired | |
++ * |------------------------------------------------------------------------|
++ * | NFSD4_EXPIRABLE | Courtesy client to be| nfs4_laundromat |
++ * | | expired by Laundromat| try_to_expire_client |
++ * | | due to conflict | |
++ * |------------------------------------------------------------------------|
++ */
++enum {
++ NFSD4_ACTIVE = 0,
++ NFSD4_COURTESY,
++ NFSD4_EXPIRABLE,
++};
++
+ /*
+ * struct nfs4_client - one per client. Clientids live here.
+ *
+@@ -345,6 +368,7 @@ struct nfs4_client {
+ #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */
+ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \
+ 1 << NFSD4_CLIENT_CB_KILL)
++#define NFSD4_CLIENT_CB_RECALL_ANY (6)
+ unsigned long cl_flags;
+ const struct cred *cl_cb_cred;
+ struct rpc_clnt *cl_cb_client;
+@@ -371,6 +395,10 @@ struct nfs4_client {
+
+ /* debugging info directory under nfsd/clients/ : */
+ struct dentry *cl_nfsd_dentry;
++ /* 'info' file within that directory. Ref is not counted,
++ * but will remain valid iff cl_nfsd_dentry != NULL
++ */
++ struct dentry *cl_nfsd_info_dentry;
+
+ /* for nfs41 callbacks */
+ /* We currently support a single back channel with a single slot */
+@@ -381,6 +409,13 @@ struct nfs4_client {
+ struct list_head async_copies; /* list of async copies */
+ spinlock_t async_lock; /* lock for async copies */
+ atomic_t cl_cb_inflight; /* Outstanding callbacks */
++
++ unsigned int cl_state;
++ atomic_t cl_delegs_in_recall;
++
++ struct nfsd4_cb_recall_any *cl_ra;
++ time64_t cl_ra_time;
++ struct list_head cl_ra_cblist;
+ };
+
+ /* struct nfs4_client_reset
+@@ -506,14 +541,13 @@ struct nfs4_clnt_odstate {
+ * inode can have multiple filehandles associated with it, so there is
+ * (potentially) a many to one relationship between this struct and struct
+ * inode.
+- *
+- * These are hashed by filehandle in the file_hashtbl, which is protected by
+- * the global state_lock spinlock.
+ */
+ struct nfs4_file {
+ refcount_t fi_ref;
++ struct inode * fi_inode;
++ bool fi_aliased;
+ spinlock_t fi_lock;
+- struct hlist_node fi_hash; /* hash on fi_fhandle */
++ struct rhlist_head fi_rlist;
+ struct list_head fi_stateids;
+ union {
+ struct list_head fi_delegations;
+@@ -562,6 +596,10 @@ struct nfs4_ol_stateid {
+ struct list_head st_locks;
+ struct nfs4_stateowner *st_stateowner;
+ struct nfs4_clnt_odstate *st_clnt_odstate;
++/*
++ * These bitmasks use 3 separate bits for READ, ALLOW, and BOTH; see the
++ * comment above bmap_to_share_mode() for explanation:
++ */
+ unsigned char st_access_bmap;
+ unsigned char st_deny_bmap;
+ struct nfs4_ol_stateid *st_openstp;
+@@ -603,6 +641,7 @@ enum nfsd4_cb_op {
+ NFSPROC4_CLNT_CB_OFFLOAD,
+ NFSPROC4_CLNT_CB_SEQUENCE,
+ NFSPROC4_CLNT_CB_NOTIFY_LOCK,
++ NFSPROC4_CLNT_CB_RECALL_ANY,
+ };
+
+ /* Returns true iff a is later than b: */
+@@ -623,6 +662,7 @@ struct nfsd4_blocked_lock {
+ struct file_lock nbl_lock;
+ struct knfsd_fh nbl_fh;
+ struct nfsd4_callback nbl_cb;
++ struct kref nbl_kref;
+ };
+
+ struct nfsd4_compound_state;
+@@ -649,26 +689,22 @@ void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *)
+ extern void nfs4_release_reclaim(struct nfsd_net *);
+ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(struct xdr_netobj name,
+ struct nfsd_net *nn);
+-extern __be32 nfs4_check_open_reclaim(clientid_t *clid,
+- struct nfsd4_compound_state *cstate, struct nfsd_net *nn);
++extern __be32 nfs4_check_open_reclaim(struct nfs4_client *);
+ extern void nfsd4_probe_callback(struct nfs4_client *clp);
+ extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
+ extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+ extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
+ const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+-extern void nfsd4_run_cb(struct nfsd4_callback *cb);
++extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
+ extern int nfsd4_create_callback_queue(void);
+ extern void nfsd4_destroy_callback_queue(void);
+ extern void nfsd4_shutdown_callback(struct nfs4_client *);
+ extern void nfsd4_shutdown_copy(struct nfs4_client *clp);
+-extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp);
+ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
+ struct xdr_netobj princhash, struct nfsd_net *nn);
+ extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
+
+-struct nfs4_file *find_file(struct knfsd_fh *fh);
+ void put_nfs4_file(struct nfs4_file *fi);
+-extern void nfs4_put_copy(struct nfsd4_copy *copy);
+ extern struct nfsd4_copy *
+ find_async_copy(struct nfs4_client *clp, stateid_t *staetid);
+ extern void nfs4_put_cpntf_state(struct nfsd_net *nn,
+@@ -693,4 +729,9 @@ extern void nfsd4_client_record_remove(struct nfs4_client *clp);
+ extern int nfsd4_client_record_check(struct nfs4_client *clp);
+ extern void nfsd4_record_grace_done(struct nfsd_net *nn);
+
++static inline bool try_to_expire_client(struct nfs4_client *clp)
++{
++ cmpxchg(&clp->cl_state, NFSD4_COURTESY, NFSD4_EXPIRABLE);
++ return clp->cl_state == NFSD4_EXPIRABLE;
++}
+ #endif /* NFSD4_STATE_H */
+diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
+index b1bc582b0493e..777e24e5da33b 100644
+--- a/fs/nfsd/stats.c
++++ b/fs/nfsd/stats.c
+@@ -7,16 +7,14 @@
+ * Format:
+ * rc <hits> <misses> <nocache>
+ * Statistsics for the reply cache
+- * fh <stale> <total-lookups> <anonlookups> <dir-not-in-dcache> <nondir-not-in-dcache>
++ * fh <stale> <deprecated filehandle cache stats>
+ * statistics for filehandle lookup
+ * io <bytes-read> <bytes-written>
+ * statistics for IO throughput
+- * th <threads> <fullcnt> <10%-20%> <20%-30%> ... <90%-100%> <100%>
+- * time (seconds) when nfsd thread usage above thresholds
+- * and number of times that all threads were in use
+- * ra cache-size <10% <20% <30% ... <100% not-found
+- * number of times that read-ahead entry was found that deep in
+- * the cache.
++ * th <threads> <deprecated thread usage histogram stats>
++ * number of threads
++ * ra <deprecated ra-cache stats>
++ *
+ * plus generic RPC stats (see net/sunrpc/stats.c)
+ *
+ * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+@@ -34,35 +32,28 @@ struct svc_stat nfsd_svcstats = {
+ .program = &nfsd_program,
+ };
+
+-static int nfsd_proc_show(struct seq_file *seq, void *v)
++static int nfsd_show(struct seq_file *seq, void *v)
+ {
+ int i;
+
+- seq_printf(seq, "rc %u %u %u\nfh %u %u %u %u %u\nio %u %u\n",
+- nfsdstats.rchits,
+- nfsdstats.rcmisses,
+- nfsdstats.rcnocache,
+- nfsdstats.fh_stale,
+- nfsdstats.fh_lookup,
+- nfsdstats.fh_anon,
+- nfsdstats.fh_nocache_dir,
+- nfsdstats.fh_nocache_nondir,
+- nfsdstats.io_read,
+- nfsdstats.io_write);
++ seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n",
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]),
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]),
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]),
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]),
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]),
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE]));
++
+ /* thread usage: */
+- seq_printf(seq, "th %u %u", nfsdstats.th_cnt, nfsdstats.th_fullcnt);
+- for (i=0; i<10; i++) {
+- unsigned int jifs = nfsdstats.th_usage[i];
+- unsigned int sec = jifs / HZ, msec = (jifs % HZ)*1000/HZ;
+- seq_printf(seq, " %u.%03u", sec, msec);
+- }
++ seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt));
++
++ /* deprecated thread usage histogram stats */
++ for (i = 0; i < 10; i++)
++ seq_puts(seq, " 0.000");
++
++ /* deprecated ra-cache stats */
++ seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n");
+
+- /* newline and ra-cache */
+- seq_printf(seq, "\nra %u", nfsdstats.ra_size);
+- for (i=0; i<11; i++)
+- seq_printf(seq, " %u", nfsdstats.ra_depth[i]);
+- seq_putc(seq, '\n');
+-
+ /* show my rpc info */
+ svc_seq_show(seq, &nfsd_svcstats);
+
+@@ -70,8 +61,10 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
+ /* Show count for individual nfsv4 operations */
+ /* Writing operation numbers 0 1 2 also for maintaining uniformity */
+ seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1);
+- for (i = 0; i <= LAST_NFS4_OP; i++)
+- seq_printf(seq, " %u", nfsdstats.nfs4_opcount[i]);
++ for (i = 0; i <= LAST_NFS4_OP; i++) {
++ seq_printf(seq, " %lld",
++ percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
++ }
+
+ seq_putc(seq, '\n');
+ #endif
+@@ -79,26 +72,65 @@ static int nfsd_proc_show(struct seq_file *seq, void *v)
+ return 0;
+ }
+
+-static int nfsd_proc_open(struct inode *inode, struct file *file)
++DEFINE_PROC_SHOW_ATTRIBUTE(nfsd);
++
++int nfsd_percpu_counters_init(struct percpu_counter counters[], int num)
+ {
+- return single_open(file, nfsd_proc_show, NULL);
++ int i, err = 0;
++
++ for (i = 0; !err && i < num; i++)
++ err = percpu_counter_init(&counters[i], 0, GFP_KERNEL);
++
++ if (!err)
++ return 0;
++
++ for (; i > 0; i--)
++ percpu_counter_destroy(&counters[i-1]);
++
++ return err;
+ }
+
+-static const struct proc_ops nfsd_proc_ops = {
+- .proc_open = nfsd_proc_open,
+- .proc_read = seq_read,
+- .proc_lseek = seq_lseek,
+- .proc_release = single_release,
+-};
++void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num)
++{
++ int i;
++
++ for (i = 0; i < num; i++)
++ percpu_counter_set(&counters[i], 0);
++}
++
++void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
++{
++ int i;
++
++ for (i = 0; i < num; i++)
++ percpu_counter_destroy(&counters[i]);
++}
++
++static int nfsd_stat_counters_init(void)
++{
++ return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
++}
++
++static void nfsd_stat_counters_destroy(void)
++{
++ nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
++}
+
+-void
+-nfsd_stat_init(void)
++int nfsd_stat_init(void)
+ {
++ int err;
++
++ err = nfsd_stat_counters_init();
++ if (err)
++ return err;
++
+ svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
++
++ return 0;
+ }
+
+-void
+-nfsd_stat_shutdown(void)
++void nfsd_stat_shutdown(void)
+ {
++ nfsd_stat_counters_destroy();
+ svc_proc_unregister(&init_net, "nfsd");
+ }
+diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
+index b23fdac698201..9b43dc3d99913 100644
+--- a/fs/nfsd/stats.h
++++ b/fs/nfsd/stats.h
+@@ -8,37 +8,89 @@
+ #define _NFSD_STATS_H
+
+ #include <uapi/linux/nfsd/stats.h>
++#include <linux/percpu_counter.h>
+
+
+-struct nfsd_stats {
+- unsigned int rchits; /* repcache hits */
+- unsigned int rcmisses; /* repcache hits */
+- unsigned int rcnocache; /* uncached reqs */
+- unsigned int fh_stale; /* FH stale error */
+- unsigned int fh_lookup; /* dentry cached */
+- unsigned int fh_anon; /* anon file dentry returned */
+- unsigned int fh_nocache_dir; /* filehandle not found in dcache */
+- unsigned int fh_nocache_nondir; /* filehandle not found in dcache */
+- unsigned int io_read; /* bytes returned to read requests */
+- unsigned int io_write; /* bytes passed in write requests */
+- unsigned int th_cnt; /* number of available threads */
+- unsigned int th_usage[10]; /* number of ticks during which n perdeciles
+- * of available threads were in use */
+- unsigned int th_fullcnt; /* number of times last free thread was used */
+- unsigned int ra_size; /* size of ra cache */
+- unsigned int ra_depth[11]; /* number of times ra entry was found that deep
+- * in the cache (10percentiles). [10] = not found */
++enum {
++ NFSD_STATS_RC_HITS, /* repcache hits */
++ NFSD_STATS_RC_MISSES, /* repcache misses */
++ NFSD_STATS_RC_NOCACHE, /* uncached reqs */
++ NFSD_STATS_FH_STALE, /* FH stale error */
++ NFSD_STATS_IO_READ, /* bytes returned to read requests */
++ NFSD_STATS_IO_WRITE, /* bytes passed in write requests */
+ #ifdef CONFIG_NFSD_V4
+- unsigned int nfs4_opcount[LAST_NFS4_OP + 1]; /* count of individual nfsv4 operations */
++ NFSD_STATS_FIRST_NFS4_OP, /* count of individual nfsv4 operations */
++ NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
++#define NFSD_STATS_NFS4_OP(op) (NFSD_STATS_FIRST_NFS4_OP + (op))
+ #endif
+-
++ NFSD_STATS_COUNTERS_NUM
+ };
+
++struct nfsd_stats {
++ struct percpu_counter counter[NFSD_STATS_COUNTERS_NUM];
++
++ atomic_t th_cnt; /* number of available threads */
++};
+
+ extern struct nfsd_stats nfsdstats;
++
+ extern struct svc_stat nfsd_svcstats;
+
+-void nfsd_stat_init(void);
+-void nfsd_stat_shutdown(void);
++int nfsd_percpu_counters_init(struct percpu_counter counters[], int num);
++void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num);
++void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num);
++int nfsd_stat_init(void);
++void nfsd_stat_shutdown(void);
++
++static inline void nfsd_stats_rc_hits_inc(void)
++{
++ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]);
++}
++
++static inline void nfsd_stats_rc_misses_inc(void)
++{
++ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]);
++}
++
++static inline void nfsd_stats_rc_nocache_inc(void)
++{
++ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]);
++}
++
++static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
++{
++ percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
++ if (exp)
++ percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]);
++}
++
++static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
++{
++ percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
++ if (exp)
++ percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount);
++}
++
++static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
++{
++ percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
++ if (exp)
++ percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount);
++}
++
++static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
++{
++ percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]);
++}
++
++static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount)
++{
++ percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
++}
++
++static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
++{
++ percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
++}
+
+ #endif /* _NFSD_STATS_H */
+diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
+index 90967466a1e56..f008b95ceec2e 100644
+--- a/fs/nfsd/trace.c
++++ b/fs/nfsd/trace.c
+@@ -1,3 +1,4 @@
++// SPDX-License-Identifier: GPL-2.0
+
+ #define CREATE_TRACE_POINTS
+ #include "trace.h"
+diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
+index a952f4a9b2a68..445d00f00eab7 100644
+--- a/fs/nfsd/trace.h
++++ b/fs/nfsd/trace.h
+@@ -12,6 +12,86 @@
+ #include "export.h"
+ #include "nfsfh.h"
+
++#define NFSD_TRACE_PROC_ARG_FIELDS \
++ __field(unsigned int, netns_ino) \
++ __field(u32, xid) \
++ __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
++ __array(unsigned char, client, sizeof(struct sockaddr_in6))
++
++#define NFSD_TRACE_PROC_ARG_ASSIGNMENTS \
++ do { \
++ __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
++ __entry->xid = be32_to_cpu(rqstp->rq_xid); \
++ memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
++ rqstp->rq_xprt->xpt_locallen); \
++ memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
++ rqstp->rq_xprt->xpt_remotelen); \
++ } while (0);
++
++#define NFSD_TRACE_PROC_RES_FIELDS \
++ __field(unsigned int, netns_ino) \
++ __field(u32, xid) \
++ __field(unsigned long, status) \
++ __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
++ __array(unsigned char, client, sizeof(struct sockaddr_in6))
++
++#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \
++ do { \
++ __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
++ __entry->xid = be32_to_cpu(rqstp->rq_xid); \
++ __entry->status = be32_to_cpu(error); \
++ memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
++ rqstp->rq_xprt->xpt_locallen); \
++ memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
++ rqstp->rq_xprt->xpt_remotelen); \
++ } while (0);
++
++DECLARE_EVENT_CLASS(nfsd_xdr_err_class,
++ TP_PROTO(
++ const struct svc_rqst *rqstp
++ ),
++ TP_ARGS(rqstp),
++ TP_STRUCT__entry(
++ NFSD_TRACE_PROC_ARG_FIELDS
++
++ __field(u32, vers)
++ __field(u32, proc)
++ ),
++ TP_fast_assign(
++ NFSD_TRACE_PROC_ARG_ASSIGNMENTS
++
++ __entry->vers = rqstp->rq_vers;
++ __entry->proc = rqstp->rq_proc;
++ ),
++ TP_printk("xid=0x%08x vers=%u proc=%u",
++ __entry->xid, __entry->vers, __entry->proc
++ )
++);
++
++#define DEFINE_NFSD_XDR_ERR_EVENT(name) \
++DEFINE_EVENT(nfsd_xdr_err_class, nfsd_##name##_err, \
++ TP_PROTO(const struct svc_rqst *rqstp), \
++ TP_ARGS(rqstp))
++
++DEFINE_NFSD_XDR_ERR_EVENT(garbage_args);
++DEFINE_NFSD_XDR_ERR_EVENT(cant_encode);
++
++#define show_nfsd_may_flags(x) \
++ __print_flags(x, "|", \
++ { NFSD_MAY_EXEC, "EXEC" }, \
++ { NFSD_MAY_WRITE, "WRITE" }, \
++ { NFSD_MAY_READ, "READ" }, \
++ { NFSD_MAY_SATTR, "SATTR" }, \
++ { NFSD_MAY_TRUNC, "TRUNC" }, \
++ { NFSD_MAY_LOCK, "LOCK" }, \
++ { NFSD_MAY_OWNER_OVERRIDE, "OWNER_OVERRIDE" }, \
++ { NFSD_MAY_LOCAL_ACCESS, "LOCAL_ACCESS" }, \
++ { NFSD_MAY_BYPASS_GSS_ON_ROOT, "BYPASS_GSS_ON_ROOT" }, \
++ { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }, \
++ { NFSD_MAY_BYPASS_GSS, "BYPASS_GSS" }, \
++ { NFSD_MAY_READ_IF_EXEC, "READ_IF_EXEC" }, \
++ { NFSD_MAY_64BIT_COOKIE, "64BIT_COOKIE" })
++
+ TRACE_EVENT(nfsd_compound,
+ TP_PROTO(const struct svc_rqst *rqst,
+ u32 args_opcnt),
+@@ -51,6 +131,56 @@ TRACE_EVENT(nfsd_compound_status,
+ __get_str(name), __entry->status)
+ )
+
++TRACE_EVENT(nfsd_compound_decode_err,
++ TP_PROTO(
++ const struct svc_rqst *rqstp,
++ u32 args_opcnt,
++ u32 resp_opcnt,
++ u32 opnum,
++ __be32 status
++ ),
++ TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status),
++ TP_STRUCT__entry(
++ NFSD_TRACE_PROC_RES_FIELDS
++
++ __field(u32, args_opcnt)
++ __field(u32, resp_opcnt)
++ __field(u32, opnum)
++ ),
++ TP_fast_assign(
++ NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
++
++ __entry->args_opcnt = args_opcnt;
++ __entry->resp_opcnt = resp_opcnt;
++ __entry->opnum = opnum;
++ ),
++ TP_printk("op=%u/%u opnum=%u status=%lu",
++ __entry->resp_opcnt, __entry->args_opcnt,
++ __entry->opnum, __entry->status)
++);
++
++TRACE_EVENT(nfsd_compound_encode_err,
++ TP_PROTO(
++ const struct svc_rqst *rqstp,
++ u32 opnum,
++ __be32 status
++ ),
++ TP_ARGS(rqstp, opnum, status),
++ TP_STRUCT__entry(
++ NFSD_TRACE_PROC_RES_FIELDS
++
++ __field(u32, opnum)
++ ),
++ TP_fast_assign(
++ NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
++
++ __entry->opnum = opnum;
++ ),
++ TP_printk("opnum=%u status=%lu",
++ __entry->opnum, __entry->status)
++);
++
++
+ DECLARE_EVENT_CLASS(nfsd_fh_err_class,
+ TP_PROTO(struct svc_rqst *rqstp,
+ struct svc_fh *fhp,
+@@ -247,10 +377,106 @@ DEFINE_EVENT(nfsd_err_class, nfsd_##name, \
+ DEFINE_NFSD_ERR_EVENT(read_err);
+ DEFINE_NFSD_ERR_EVENT(write_err);
+
++TRACE_EVENT(nfsd_dirent,
++ TP_PROTO(struct svc_fh *fhp,
++ u64 ino,
++ const char *name,
++ int namlen),
++ TP_ARGS(fhp, ino, name, namlen),
++ TP_STRUCT__entry(
++ __field(u32, fh_hash)
++ __field(u64, ino)
++ __field(int, len)
++ __dynamic_array(unsigned char, name, namlen)
++ ),
++ TP_fast_assign(
++ __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
++ __entry->ino = ino;
++ __entry->len = namlen;
++ memcpy(__get_str(name), name, namlen);
++ ),
++ TP_printk("fh_hash=0x%08x ino=%llu name=%.*s",
++ __entry->fh_hash, __entry->ino,
++ __entry->len, __get_str(name))
++)
++
++DECLARE_EVENT_CLASS(nfsd_copy_err_class,
++ TP_PROTO(struct svc_rqst *rqstp,
++ struct svc_fh *src_fhp,
++ loff_t src_offset,
++ struct svc_fh *dst_fhp,
++ loff_t dst_offset,
++ u64 count,
++ int status),
++ TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, count, status),
++ TP_STRUCT__entry(
++ __field(u32, xid)
++ __field(u32, src_fh_hash)
++ __field(loff_t, src_offset)
++ __field(u32, dst_fh_hash)
++ __field(loff_t, dst_offset)
++ __field(u64, count)
++ __field(int, status)
++ ),
++ TP_fast_assign(
++ __entry->xid = be32_to_cpu(rqstp->rq_xid);
++ __entry->src_fh_hash = knfsd_fh_hash(&src_fhp->fh_handle);
++ __entry->src_offset = src_offset;
++ __entry->dst_fh_hash = knfsd_fh_hash(&dst_fhp->fh_handle);
++ __entry->dst_offset = dst_offset;
++ __entry->count = count;
++ __entry->status = status;
++ ),
++ TP_printk("xid=0x%08x src_fh_hash=0x%08x src_offset=%lld "
++ "dst_fh_hash=0x%08x dst_offset=%lld "
++ "count=%llu status=%d",
++ __entry->xid, __entry->src_fh_hash, __entry->src_offset,
++ __entry->dst_fh_hash, __entry->dst_offset,
++ (unsigned long long)__entry->count,
++ __entry->status)
++)
++
++#define DEFINE_NFSD_COPY_ERR_EVENT(name) \
++DEFINE_EVENT(nfsd_copy_err_class, nfsd_##name, \
++ TP_PROTO(struct svc_rqst *rqstp, \
++ struct svc_fh *src_fhp, \
++ loff_t src_offset, \
++ struct svc_fh *dst_fhp, \
++ loff_t dst_offset, \
++ u64 count, \
++ int status), \
++ TP_ARGS(rqstp, src_fhp, src_offset, dst_fhp, dst_offset, \
++ count, status))
++
++DEFINE_NFSD_COPY_ERR_EVENT(clone_file_range_err);
++
+ #include "state.h"
+ #include "filecache.h"
+ #include "vfs.h"
+
++TRACE_EVENT(nfsd_delegret_wakeup,
++ TP_PROTO(
++ const struct svc_rqst *rqstp,
++ const struct inode *inode,
++ long timeo
++ ),
++ TP_ARGS(rqstp, inode, timeo),
++ TP_STRUCT__entry(
++ __field(u32, xid)
++ __field(const void *, inode)
++ __field(long, timeo)
++ ),
++ TP_fast_assign(
++ __entry->xid = be32_to_cpu(rqstp->rq_xid);
++ __entry->inode = inode;
++ __entry->timeo = timeo;
++ ),
++ TP_printk("xid=0x%08x inode=%p%s",
++ __entry->xid, __entry->inode,
++ __entry->timeo == 0 ? " (timed out)" : ""
++ )
++);
++
+ DECLARE_EVENT_CLASS(nfsd_stateid_class,
+ TP_PROTO(stateid_t *stp),
+ TP_ARGS(stp),
+@@ -291,7 +517,7 @@ DEFINE_STATEID_EVENT(layout_recall_release);
+
+ DEFINE_STATEID_EVENT(open);
+ DEFINE_STATEID_EVENT(deleg_read);
+-DEFINE_STATEID_EVENT(deleg_break);
++DEFINE_STATEID_EVENT(deleg_return);
+ DEFINE_STATEID_EVENT(deleg_recall);
+
+ DECLARE_EVENT_CLASS(nfsd_stateseqid_class,
+@@ -324,6 +550,61 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
+ DEFINE_STATESEQID_EVENT(preprocess);
+ DEFINE_STATESEQID_EVENT(open_confirm);
+
++TRACE_DEFINE_ENUM(NFS4_OPEN_STID);
++TRACE_DEFINE_ENUM(NFS4_LOCK_STID);
++TRACE_DEFINE_ENUM(NFS4_DELEG_STID);
++TRACE_DEFINE_ENUM(NFS4_CLOSED_STID);
++TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID);
++TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID);
++TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID);
++
++#define show_stid_type(x) \
++ __print_flags(x, "|", \
++ { NFS4_OPEN_STID, "OPEN" }, \
++ { NFS4_LOCK_STID, "LOCK" }, \
++ { NFS4_DELEG_STID, "DELEG" }, \
++ { NFS4_CLOSED_STID, "CLOSED" }, \
++ { NFS4_REVOKED_DELEG_STID, "REVOKED" }, \
++ { NFS4_CLOSED_DELEG_STID, "CLOSED_DELEG" }, \
++ { NFS4_LAYOUT_STID, "LAYOUT" })
++
++DECLARE_EVENT_CLASS(nfsd_stid_class,
++ TP_PROTO(
++ const struct nfs4_stid *stid
++ ),
++ TP_ARGS(stid),
++ TP_STRUCT__entry(
++ __field(unsigned long, sc_type)
++ __field(int, sc_count)
++ __field(u32, cl_boot)
++ __field(u32, cl_id)
++ __field(u32, si_id)
++ __field(u32, si_generation)
++ ),
++ TP_fast_assign(
++ const stateid_t *stp = &stid->sc_stateid;
++
++ __entry->sc_type = stid->sc_type;
++ __entry->sc_count = refcount_read(&stid->sc_count);
++ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
++ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
++ __entry->si_id = stp->si_opaque.so_id;
++ __entry->si_generation = stp->si_generation;
++ ),
++ TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s",
++ __entry->cl_boot, __entry->cl_id,
++ __entry->si_id, __entry->si_generation,
++ __entry->sc_count, show_stid_type(__entry->sc_type)
++ )
++);
++
++#define DEFINE_STID_EVENT(name) \
++DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name, \
++ TP_PROTO(const struct nfs4_stid *stid), \
++ TP_ARGS(stid))
++
++DEFINE_STID_EVENT(revoke);
++
+ DECLARE_EVENT_CLASS(nfsd_clientid_class,
+ TP_PROTO(const clientid_t *clid),
+ TP_ARGS(clid),
+@@ -343,7 +624,12 @@ DEFINE_EVENT(nfsd_clientid_class, nfsd_clid_##name, \
+ TP_PROTO(const clientid_t *clid), \
+ TP_ARGS(clid))
+
+-DEFINE_CLIENTID_EVENT(expired);
++DEFINE_CLIENTID_EVENT(expire_unconf);
++DEFINE_CLIENTID_EVENT(reclaim_complete);
++DEFINE_CLIENTID_EVENT(confirmed);
++DEFINE_CLIENTID_EVENT(destroyed);
++DEFINE_CLIENTID_EVENT(admin_expired);
++DEFINE_CLIENTID_EVENT(replaced);
+ DEFINE_CLIENTID_EVENT(purged);
+ DEFINE_CLIENTID_EVENT(renew);
+ DEFINE_CLIENTID_EVENT(stale);
+@@ -368,56 +654,145 @@ DEFINE_EVENT(nfsd_net_class, nfsd_##name, \
+ DEFINE_NET_EVENT(grace_start);
+ DEFINE_NET_EVENT(grace_complete);
+
+-TRACE_EVENT(nfsd_clid_inuse_err,
++TRACE_EVENT(nfsd_writeverf_reset,
++ TP_PROTO(
++ const struct nfsd_net *nn,
++ const struct svc_rqst *rqstp,
++ int error
++ ),
++ TP_ARGS(nn, rqstp, error),
++ TP_STRUCT__entry(
++ __field(unsigned long long, boot_time)
++ __field(u32, xid)
++ __field(int, error)
++ __array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
++ ),
++ TP_fast_assign(
++ __entry->boot_time = nn->boot_time;
++ __entry->xid = be32_to_cpu(rqstp->rq_xid);
++ __entry->error = error;
++
++ /* avoid seqlock inside TP_fast_assign */
++ memcpy(__entry->verifier, nn->writeverf,
++ NFS4_VERIFIER_SIZE);
++ ),
++ TP_printk("boot_time=%16llx xid=0x%08x error=%d new verifier=0x%s",
++ __entry->boot_time, __entry->xid, __entry->error,
++ __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE)
++ )
++);
++
++TRACE_EVENT(nfsd_clid_cred_mismatch,
++ TP_PROTO(
++ const struct nfs4_client *clp,
++ const struct svc_rqst *rqstp
++ ),
++ TP_ARGS(clp, rqstp),
++ TP_STRUCT__entry(
++ __field(u32, cl_boot)
++ __field(u32, cl_id)
++ __field(unsigned long, cl_flavor)
++ __field(unsigned long, new_flavor)
++ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
++ ),
++ TP_fast_assign(
++ __entry->cl_boot = clp->cl_clientid.cl_boot;
++ __entry->cl_id = clp->cl_clientid.cl_id;
++ __entry->cl_flavor = clp->cl_cred.cr_flavor;
++ __entry->new_flavor = rqstp->rq_cred.cr_flavor;
++ memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
++ sizeof(struct sockaddr_in6));
++ ),
++ TP_printk("client %08x:%08x flavor=%s, conflict=%s from addr=%pISpc",
++ __entry->cl_boot, __entry->cl_id,
++ show_nfsd_authflavor(__entry->cl_flavor),
++ show_nfsd_authflavor(__entry->new_flavor), __entry->addr
++ )
++)
++
++TRACE_EVENT(nfsd_clid_verf_mismatch,
++ TP_PROTO(
++ const struct nfs4_client *clp,
++ const struct svc_rqst *rqstp,
++ const nfs4_verifier *verf
++ ),
++ TP_ARGS(clp, rqstp, verf),
++ TP_STRUCT__entry(
++ __field(u32, cl_boot)
++ __field(u32, cl_id)
++ __array(unsigned char, cl_verifier, NFS4_VERIFIER_SIZE)
++ __array(unsigned char, new_verifier, NFS4_VERIFIER_SIZE)
++ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
++ ),
++ TP_fast_assign(
++ __entry->cl_boot = clp->cl_clientid.cl_boot;
++ __entry->cl_id = clp->cl_clientid.cl_id;
++ memcpy(__entry->cl_verifier, (void *)&clp->cl_verifier,
++ NFS4_VERIFIER_SIZE);
++ memcpy(__entry->new_verifier, (void *)verf,
++ NFS4_VERIFIER_SIZE);
++ memcpy(__entry->addr, &rqstp->rq_xprt->xpt_remote,
++ sizeof(struct sockaddr_in6));
++ ),
++ TP_printk("client %08x:%08x verf=0x%s, updated=0x%s from addr=%pISpc",
++ __entry->cl_boot, __entry->cl_id,
++ __print_hex_str(__entry->cl_verifier, NFS4_VERIFIER_SIZE),
++ __print_hex_str(__entry->new_verifier, NFS4_VERIFIER_SIZE),
++ __entry->addr
++ )
++);
++
++DECLARE_EVENT_CLASS(nfsd_clid_class,
+ TP_PROTO(const struct nfs4_client *clp),
+ TP_ARGS(clp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+- __field(unsigned int, namelen)
+- __dynamic_array(unsigned char, name, clp->cl_name.len)
++ __field(unsigned long, flavor)
++ __array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
++ __dynamic_array(char, name, clp->cl_name.len + 1)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+ memcpy(__entry->addr, &clp->cl_addr,
+ sizeof(struct sockaddr_in6));
+- __entry->namelen = clp->cl_name.len;
+- memcpy(__get_dynamic_array(name), clp->cl_name.data,
+- clp->cl_name.len);
++ __entry->flavor = clp->cl_cred.cr_flavor;
++ memcpy(__entry->verifier, (void *)&clp->cl_verifier,
++ NFS4_VERIFIER_SIZE);
++ memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len);
++ __get_str(name)[clp->cl_name.len] = '\0';
+ ),
+- TP_printk("nfs4_clientid %.*s already in use by %pISpc, client %08x:%08x",
+- __entry->namelen, __get_str(name), __entry->addr,
++ TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
++ __entry->addr, __get_str(name),
++ __print_hex_str(__entry->verifier, NFS4_VERIFIER_SIZE),
++ show_nfsd_authflavor(__entry->flavor),
+ __entry->cl_boot, __entry->cl_id)
+-)
++);
+
+-TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
+-TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
+-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
+-TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE);
+-TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED);
++#define DEFINE_CLID_EVENT(name) \
++DEFINE_EVENT(nfsd_clid_class, nfsd_clid_##name, \
++ TP_PROTO(const struct nfs4_client *clp), \
++ TP_ARGS(clp))
++
++DEFINE_CLID_EVENT(fresh);
++DEFINE_CLID_EVENT(confirmed_r);
+
++/*
++ * from fs/nfsd/filecache.h
++ */
+ #define show_nf_flags(val) \
+ __print_flags(val, "|", \
+ { 1 << NFSD_FILE_HASHED, "HASHED" }, \
+ { 1 << NFSD_FILE_PENDING, "PENDING" }, \
+- { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \
+- { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \
+- { 1 << NFSD_FILE_REFERENCED, "REFERENCED"})
+-
+-/* FIXME: This should probably be fleshed out in the future. */
+-#define show_nf_may(val) \
+- __print_flags(val, "|", \
+- { NFSD_MAY_READ, "READ" }, \
+- { NFSD_MAY_WRITE, "WRITE" }, \
+- { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" })
++ { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \
++ { 1 << NFSD_FILE_GC, "GC" })
+
+ DECLARE_EVENT_CLASS(nfsd_file_class,
+ TP_PROTO(struct nfsd_file *nf),
+ TP_ARGS(nf),
+ TP_STRUCT__entry(
+- __field(unsigned int, nf_hashval)
+ __field(void *, nf_inode)
+ __field(int, nf_ref)
+ __field(unsigned long, nf_flags)
+@@ -425,19 +800,17 @@ DECLARE_EVENT_CLASS(nfsd_file_class,
+ __field(struct file *, nf_file)
+ ),
+ TP_fast_assign(
+- __entry->nf_hashval = nf->nf_hashval;
+ __entry->nf_inode = nf->nf_inode;
+ __entry->nf_ref = refcount_read(&nf->nf_ref);
+ __entry->nf_flags = nf->nf_flags;
+ __entry->nf_may = nf->nf_may;
+ __entry->nf_file = nf->nf_file;
+ ),
+- TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p",
+- __entry->nf_hashval,
++ TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p",
+ __entry->nf_inode,
+ __entry->nf_ref,
+ show_nf_flags(__entry->nf_flags),
+- show_nf_may(__entry->nf_may),
++ show_nfsd_may_flags(__entry->nf_may),
+ __entry->nf_file)
+ )
+
+@@ -446,34 +819,60 @@ DEFINE_EVENT(nfsd_file_class, name, \
+ TP_PROTO(struct nfsd_file *nf), \
+ TP_ARGS(nf))
+
+-DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc);
+-DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final);
++DEFINE_NFSD_FILE_EVENT(nfsd_file_free);
+ DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash);
+ DEFINE_NFSD_FILE_EVENT(nfsd_file_put);
+-DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked);
++DEFINE_NFSD_FILE_EVENT(nfsd_file_closing);
++DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_queue);
++
++TRACE_EVENT(nfsd_file_alloc,
++ TP_PROTO(
++ const struct nfsd_file *nf
++ ),
++ TP_ARGS(nf),
++ TP_STRUCT__entry(
++ __field(const void *, nf_inode)
++ __field(unsigned long, nf_flags)
++ __field(unsigned long, nf_may)
++ __field(unsigned int, nf_ref)
++ ),
++ TP_fast_assign(
++ __entry->nf_inode = nf->nf_inode;
++ __entry->nf_flags = nf->nf_flags;
++ __entry->nf_ref = refcount_read(&nf->nf_ref);
++ __entry->nf_may = nf->nf_may;
++ ),
++ TP_printk("inode=%p ref=%u flags=%s may=%s",
++ __entry->nf_inode, __entry->nf_ref,
++ show_nf_flags(__entry->nf_flags),
++ show_nfsd_may_flags(__entry->nf_may)
++ )
++);
+
+ TRACE_EVENT(nfsd_file_acquire,
+- TP_PROTO(struct svc_rqst *rqstp, unsigned int hash,
+- struct inode *inode, unsigned int may_flags,
+- struct nfsd_file *nf, __be32 status),
++ TP_PROTO(
++ const struct svc_rqst *rqstp,
++ const struct inode *inode,
++ unsigned int may_flags,
++ const struct nfsd_file *nf,
++ __be32 status
++ ),
+
+- TP_ARGS(rqstp, hash, inode, may_flags, nf, status),
++ TP_ARGS(rqstp, inode, may_flags, nf, status),
+
+ TP_STRUCT__entry(
+ __field(u32, xid)
+- __field(unsigned int, hash)
+- __field(void *, inode)
+- __field(unsigned int, may_flags)
+- __field(int, nf_ref)
++ __field(const void *, inode)
++ __field(unsigned long, may_flags)
++ __field(unsigned int, nf_ref)
+ __field(unsigned long, nf_flags)
+- __field(unsigned char, nf_may)
+- __field(struct file *, nf_file)
++ __field(unsigned long, nf_may)
++ __field(const void *, nf_file)
+ __field(u32, status)
+ ),
+
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+- __entry->hash = hash;
+ __entry->inode = inode;
+ __entry->may_flags = may_flags;
+ __entry->nf_ref = nf ? refcount_read(&nf->nf_ref) : 0;
+@@ -483,39 +882,131 @@ TRACE_EVENT(nfsd_file_acquire,
+ __entry->status = be32_to_cpu(status);
+ ),
+
+- TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u",
+- __entry->xid, __entry->hash, __entry->inode,
+- show_nf_may(__entry->may_flags), __entry->nf_ref,
+- show_nf_flags(__entry->nf_flags),
+- show_nf_may(__entry->nf_may), __entry->nf_file,
+- __entry->status)
++ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p status=%u",
++ __entry->xid, __entry->inode,
++ show_nfsd_may_flags(__entry->may_flags),
++ __entry->nf_ref, show_nf_flags(__entry->nf_flags),
++ show_nfsd_may_flags(__entry->nf_may),
++ __entry->nf_file, __entry->status
++ )
+ );
+
+-DECLARE_EVENT_CLASS(nfsd_file_search_class,
+- TP_PROTO(struct inode *inode, unsigned int hash, int found),
+- TP_ARGS(inode, hash, found),
++TRACE_EVENT(nfsd_file_insert_err,
++ TP_PROTO(
++ const struct svc_rqst *rqstp,
++ const struct inode *inode,
++ unsigned int may_flags,
++ long error
++ ),
++ TP_ARGS(rqstp, inode, may_flags, error),
+ TP_STRUCT__entry(
+- __field(struct inode *, inode)
+- __field(unsigned int, hash)
+- __field(int, found)
++ __field(u32, xid)
++ __field(const void *, inode)
++ __field(unsigned long, may_flags)
++ __field(long, error)
+ ),
+ TP_fast_assign(
++ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ __entry->inode = inode;
+- __entry->hash = hash;
+- __entry->found = found;
++ __entry->may_flags = may_flags;
++ __entry->error = error;
+ ),
+- TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash,
+- __entry->inode, __entry->found)
++ TP_printk("xid=0x%x inode=%p may_flags=%s error=%ld",
++ __entry->xid, __entry->inode,
++ show_nfsd_may_flags(__entry->may_flags),
++ __entry->error
++ )
+ );
+
+-#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \
+-DEFINE_EVENT(nfsd_file_search_class, name, \
+- TP_PROTO(struct inode *inode, unsigned int hash, int found), \
+- TP_ARGS(inode, hash, found))
++TRACE_EVENT(nfsd_file_cons_err,
++ TP_PROTO(
++ const struct svc_rqst *rqstp,
++ const struct inode *inode,
++ unsigned int may_flags,
++ const struct nfsd_file *nf
++ ),
++ TP_ARGS(rqstp, inode, may_flags, nf),
++ TP_STRUCT__entry(
++ __field(u32, xid)
++ __field(const void *, inode)
++ __field(unsigned long, may_flags)
++ __field(unsigned int, nf_ref)
++ __field(unsigned long, nf_flags)
++ __field(unsigned long, nf_may)
++ __field(const void *, nf_file)
++ ),
++ TP_fast_assign(
++ __entry->xid = be32_to_cpu(rqstp->rq_xid);
++ __entry->inode = inode;
++ __entry->may_flags = may_flags;
++ __entry->nf_ref = refcount_read(&nf->nf_ref);
++ __entry->nf_flags = nf->nf_flags;
++ __entry->nf_may = nf->nf_may;
++ __entry->nf_file = nf->nf_file;
++ ),
++ TP_printk("xid=0x%x inode=%p may_flags=%s ref=%u nf_flags=%s nf_may=%s nf_file=%p",
++ __entry->xid, __entry->inode,
++ show_nfsd_may_flags(__entry->may_flags), __entry->nf_ref,
++ show_nf_flags(__entry->nf_flags),
++ show_nfsd_may_flags(__entry->nf_may), __entry->nf_file
++ )
++);
++
++DECLARE_EVENT_CLASS(nfsd_file_open_class,
++ TP_PROTO(const struct nfsd_file *nf, __be32 status),
++ TP_ARGS(nf, status),
++ TP_STRUCT__entry(
++ __field(void *, nf_inode) /* cannot be dereferenced */
++ __field(int, nf_ref)
++ __field(unsigned long, nf_flags)
++ __field(unsigned long, nf_may)
++ __field(void *, nf_file) /* cannot be dereferenced */
++ ),
++ TP_fast_assign(
++ __entry->nf_inode = nf->nf_inode;
++ __entry->nf_ref = refcount_read(&nf->nf_ref);
++ __entry->nf_flags = nf->nf_flags;
++ __entry->nf_may = nf->nf_may;
++ __entry->nf_file = nf->nf_file;
++ ),
++ TP_printk("inode=%p ref=%d flags=%s may=%s file=%p",
++ __entry->nf_inode,
++ __entry->nf_ref,
++ show_nf_flags(__entry->nf_flags),
++ show_nfsd_may_flags(__entry->nf_may),
++ __entry->nf_file)
++)
++
++#define DEFINE_NFSD_FILE_OPEN_EVENT(name) \
++DEFINE_EVENT(nfsd_file_open_class, name, \
++ TP_PROTO( \
++ const struct nfsd_file *nf, \
++ __be32 status \
++ ), \
++ TP_ARGS(nf, status))
+
+-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync);
+-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode);
+-DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached);
++DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_open);
++DEFINE_NFSD_FILE_OPEN_EVENT(nfsd_file_opened);
++
++TRACE_EVENT(nfsd_file_is_cached,
++ TP_PROTO(
++ const struct inode *inode,
++ int found
++ ),
++ TP_ARGS(inode, found),
++ TP_STRUCT__entry(
++ __field(const struct inode *, inode)
++ __field(int, found)
++ ),
++ TP_fast_assign(
++ __entry->inode = inode;
++ __entry->found = found;
++ ),
++ TP_printk("inode=%p is %scached",
++ __entry->inode,
++ __entry->found ? "" : "not "
++ )
++);
+
+ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
+ TP_PROTO(struct inode *inode, u32 mask),
+@@ -532,10 +1023,95 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
+ __entry->mode = inode->i_mode;
+ __entry->mask = mask;
+ ),
+- TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
++ TP_printk("inode=%p nlink=%u mode=0%ho mask=0x%x", __entry->inode,
+ __entry->nlink, __entry->mode, __entry->mask)
+ );
+
++DECLARE_EVENT_CLASS(nfsd_file_gc_class,
++ TP_PROTO(
++ const struct nfsd_file *nf
++ ),
++ TP_ARGS(nf),
++ TP_STRUCT__entry(
++ __field(void *, nf_inode)
++ __field(void *, nf_file)
++ __field(int, nf_ref)
++ __field(unsigned long, nf_flags)
++ ),
++ TP_fast_assign(
++ __entry->nf_inode = nf->nf_inode;
++ __entry->nf_file = nf->nf_file;
++ __entry->nf_ref = refcount_read(&nf->nf_ref);
++ __entry->nf_flags = nf->nf_flags;
++ ),
++ TP_printk("inode=%p ref=%d nf_flags=%s nf_file=%p",
++ __entry->nf_inode, __entry->nf_ref,
++ show_nf_flags(__entry->nf_flags),
++ __entry->nf_file
++ )
++);
++
++#define DEFINE_NFSD_FILE_GC_EVENT(name) \
++DEFINE_EVENT(nfsd_file_gc_class, name, \
++ TP_PROTO( \
++ const struct nfsd_file *nf \
++ ), \
++ TP_ARGS(nf))
++
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add);
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_add_disposed);
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del);
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed);
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use);
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback);
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced);
++DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed);
++
++DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class,
++ TP_PROTO(
++ unsigned long removed,
++ unsigned long remaining
++ ),
++ TP_ARGS(removed, remaining),
++ TP_STRUCT__entry(
++ __field(unsigned long, removed)
++ __field(unsigned long, remaining)
++ ),
++ TP_fast_assign(
++ __entry->removed = removed;
++ __entry->remaining = remaining;
++ ),
++ TP_printk("%lu entries removed, %lu remaining",
++ __entry->removed, __entry->remaining)
++);
++
++#define DEFINE_NFSD_FILE_LRUWALK_EVENT(name) \
++DEFINE_EVENT(nfsd_file_lruwalk_class, name, \
++ TP_PROTO( \
++ unsigned long removed, \
++ unsigned long remaining \
++ ), \
++ TP_ARGS(removed, remaining))
++
++DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed);
++DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed);
++
++TRACE_EVENT(nfsd_file_close,
++ TP_PROTO(
++ const struct inode *inode
++ ),
++ TP_ARGS(inode),
++ TP_STRUCT__entry(
++ __field(const void *, inode)
++ ),
++ TP_fast_assign(
++ __entry->inode = inode;
++ ),
++ TP_printk("inode=%p",
++ __entry->inode
++ )
++);
++
+ #include "cache.h"
+
+ TRACE_DEFINE_ENUM(RC_DROPIT);
+@@ -616,9 +1192,9 @@ TRACE_EVENT(nfsd_cb_args,
+ memcpy(__entry->addr, &conn->cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+- TP_printk("client %08x:%08x callback addr=%pISpc prog=%u ident=%u",
+- __entry->cl_boot, __entry->cl_id,
+- __entry->addr, __entry->prog, __entry->ident)
++ TP_printk("addr=%pISpc client %08x:%08x prog=%u ident=%u",
++ __entry->addr, __entry->cl_boot, __entry->cl_id,
++ __entry->prog, __entry->ident)
+ );
+
+ TRACE_EVENT(nfsd_cb_nodelegs,
+@@ -635,11 +1211,6 @@ TRACE_EVENT(nfsd_cb_nodelegs,
+ TP_printk("client %08x:%08x", __entry->cl_boot, __entry->cl_id)
+ )
+
+-TRACE_DEFINE_ENUM(NFSD4_CB_UP);
+-TRACE_DEFINE_ENUM(NFSD4_CB_UNKNOWN);
+-TRACE_DEFINE_ENUM(NFSD4_CB_DOWN);
+-TRACE_DEFINE_ENUM(NFSD4_CB_FAULT);
+-
+ #define show_cb_state(val) \
+ __print_symbolic(val, \
+ { NFSD4_CB_UP, "UP" }, \
+@@ -673,10 +1244,53 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name, \
+ TP_PROTO(const struct nfs4_client *clp), \
+ TP_ARGS(clp))
+
+-DEFINE_NFSD_CB_EVENT(setup);
+ DEFINE_NFSD_CB_EVENT(state);
++DEFINE_NFSD_CB_EVENT(probe);
++DEFINE_NFSD_CB_EVENT(lost);
+ DEFINE_NFSD_CB_EVENT(shutdown);
+
++TRACE_DEFINE_ENUM(RPC_AUTH_NULL);
++TRACE_DEFINE_ENUM(RPC_AUTH_UNIX);
++TRACE_DEFINE_ENUM(RPC_AUTH_GSS);
++TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5);
++TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5I);
++TRACE_DEFINE_ENUM(RPC_AUTH_GSS_KRB5P);
++
++#define show_nfsd_authflavor(val) \
++ __print_symbolic(val, \
++ { RPC_AUTH_NULL, "none" }, \
++ { RPC_AUTH_UNIX, "sys" }, \
++ { RPC_AUTH_GSS, "gss" }, \
++ { RPC_AUTH_GSS_KRB5, "krb5" }, \
++ { RPC_AUTH_GSS_KRB5I, "krb5i" }, \
++ { RPC_AUTH_GSS_KRB5P, "krb5p" })
++
++TRACE_EVENT(nfsd_cb_setup,
++ TP_PROTO(const struct nfs4_client *clp,
++ const char *netid,
++ rpc_authflavor_t authflavor
++ ),
++ TP_ARGS(clp, netid, authflavor),
++ TP_STRUCT__entry(
++ __field(u32, cl_boot)
++ __field(u32, cl_id)
++ __field(unsigned long, authflavor)
++ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
++ __array(unsigned char, netid, 8)
++ ),
++ TP_fast_assign(
++ __entry->cl_boot = clp->cl_clientid.cl_boot;
++ __entry->cl_id = clp->cl_clientid.cl_id;
++ strlcpy(__entry->netid, netid, sizeof(__entry->netid));
++ __entry->authflavor = authflavor;
++ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
++ sizeof(struct sockaddr_in6));
++ ),
++ TP_printk("addr=%pISpc client %08x:%08x proto=%s flavor=%s",
++ __entry->addr, __entry->cl_boot, __entry->cl_id,
++ __entry->netid, show_nfsd_authflavor(__entry->authflavor))
++);
++
+ TRACE_EVENT(nfsd_cb_setup_err,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+@@ -700,54 +1314,138 @@ TRACE_EVENT(nfsd_cb_setup_err,
+ __entry->addr, __entry->cl_boot, __entry->cl_id, __entry->error)
+ );
+
+-TRACE_EVENT(nfsd_cb_work,
++TRACE_EVENT(nfsd_cb_recall,
+ TP_PROTO(
+- const struct nfs4_client *clp,
+- const char *procedure
++ const struct nfs4_stid *stid
++ ),
++ TP_ARGS(stid),
++ TP_STRUCT__entry(
++ __field(u32, cl_boot)
++ __field(u32, cl_id)
++ __field(u32, si_id)
++ __field(u32, si_generation)
++ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+- TP_ARGS(clp, procedure),
++ TP_fast_assign(
++ const stateid_t *stp = &stid->sc_stateid;
++ const struct nfs4_client *clp = stid->sc_client;
++
++ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
++ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
++ __entry->si_id = stp->si_opaque.so_id;
++ __entry->si_generation = stp->si_generation;
++ if (clp)
++ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
++ sizeof(struct sockaddr_in6));
++ else
++ memset(__entry->addr, 0, sizeof(struct sockaddr_in6));
++ ),
++ TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x",
++ __entry->addr, __entry->cl_boot, __entry->cl_id,
++ __entry->si_id, __entry->si_generation)
++);
++
++TRACE_EVENT(nfsd_cb_notify_lock,
++ TP_PROTO(
++ const struct nfs4_lockowner *lo,
++ const struct nfsd4_blocked_lock *nbl
++ ),
++ TP_ARGS(lo, nbl),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+- __string(procedure, procedure)
++ __field(u32, fh_hash)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
++ const struct nfs4_client *clp = lo->lo_owner.so_client;
++
+ __entry->cl_boot = clp->cl_clientid.cl_boot;
+ __entry->cl_id = clp->cl_clientid.cl_id;
+- __assign_str(procedure, procedure)
++ __entry->fh_hash = knfsd_fh_hash(&nbl->nbl_fh);
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+- TP_printk("addr=%pISpc client %08x:%08x procedure=%s",
++ TP_printk("addr=%pISpc client %08x:%08x fh_hash=0x%08x",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+- __get_str(procedure))
++ __entry->fh_hash)
+ );
+
+-TRACE_EVENT(nfsd_cb_done,
++TRACE_EVENT(nfsd_cb_offload,
+ TP_PROTO(
+ const struct nfs4_client *clp,
+- int status
++ const stateid_t *stp,
++ const struct knfsd_fh *fh,
++ u64 count,
++ __be32 status
+ ),
+- TP_ARGS(clp, status),
++ TP_ARGS(clp, stp, fh, count, status),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
++ __field(u32, si_id)
++ __field(u32, si_generation)
++ __field(u32, fh_hash)
+ __field(int, status)
++ __field(u64, count)
+ __array(unsigned char, addr, sizeof(struct sockaddr_in6))
+ ),
+ TP_fast_assign(
+- __entry->cl_boot = clp->cl_clientid.cl_boot;
+- __entry->cl_id = clp->cl_clientid.cl_id;
+- __entry->status = status;
++ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
++ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
++ __entry->si_id = stp->si_opaque.so_id;
++ __entry->si_generation = stp->si_generation;
++ __entry->fh_hash = knfsd_fh_hash(fh);
++ __entry->status = be32_to_cpu(status);
++ __entry->count = count;
+ memcpy(__entry->addr, &clp->cl_cb_conn.cb_addr,
+ sizeof(struct sockaddr_in6));
+ ),
+- TP_printk("addr=%pISpc client %08x:%08x status=%d",
++ TP_printk("addr=%pISpc client %08x:%08x stateid %08x:%08x fh_hash=0x%08x count=%llu status=%d",
+ __entry->addr, __entry->cl_boot, __entry->cl_id,
+- __entry->status)
++ __entry->si_id, __entry->si_generation,
++ __entry->fh_hash, __entry->count, __entry->status)
++);
++
++DECLARE_EVENT_CLASS(nfsd_cb_done_class,
++ TP_PROTO(
++ const stateid_t *stp,
++ const struct rpc_task *task
++ ),
++ TP_ARGS(stp, task),
++ TP_STRUCT__entry(
++ __field(u32, cl_boot)
++ __field(u32, cl_id)
++ __field(u32, si_id)
++ __field(u32, si_generation)
++ __field(int, status)
++ ),
++ TP_fast_assign(
++ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
++ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
++ __entry->si_id = stp->si_opaque.so_id;
++ __entry->si_generation = stp->si_generation;
++ __entry->status = task->tk_status;
++ ),
++ TP_printk("client %08x:%08x stateid %08x:%08x status=%d",
++ __entry->cl_boot, __entry->cl_id, __entry->si_id,
++ __entry->si_generation, __entry->status
++ )
+ );
+
++#define DEFINE_NFSD_CB_DONE_EVENT(name) \
++DEFINE_EVENT(nfsd_cb_done_class, name, \
++ TP_PROTO( \
++ const stateid_t *stp, \
++ const struct rpc_task *task \
++ ), \
++ TP_ARGS(stp, task))
++
++DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_recall_done);
++DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_notify_lock_done);
++DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_layout_done);
++DEFINE_NFSD_CB_DONE_EVENT(nfsd_cb_offload_done);
++
+ #endif /* _NFSD_TRACE_H */
+
+ #undef TRACE_INCLUDE_PATH
+diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
+index 31edb883afd0d..0ea05ddff0d08 100644
+--- a/fs/nfsd/vfs.c
++++ b/fs/nfsd/vfs.c
+@@ -32,14 +32,13 @@
+ #include <linux/writeback.h>
+ #include <linux/security.h>
+
+-#ifdef CONFIG_NFSD_V3
+ #include "xdr3.h"
+-#endif /* CONFIG_NFSD_V3 */
+
+ #ifdef CONFIG_NFSD_V4
+ #include "../internal.h"
+ #include "acl.h"
+ #include "idmap.h"
++#include "xdr4.h"
+ #endif /* CONFIG_NFSD_V4 */
+
+ #include "nfsd.h"
+@@ -49,6 +48,69 @@
+
+ #define NFSDDBG_FACILITY NFSDDBG_FILEOP
+
++/**
++ * nfserrno - Map Linux errnos to NFS errnos
++ * @errno: POSIX(-ish) error code to be mapped
++ *
++ * Returns the appropriate (net-endian) nfserr_* (or nfs_ok if errno is 0). If
++ * it's an error we don't expect, log it once and return nfserr_io.
++ */
++__be32
++nfserrno (int errno)
++{
++ static struct {
++ __be32 nfserr;
++ int syserr;
++ } nfs_errtbl[] = {
++ { nfs_ok, 0 },
++ { nfserr_perm, -EPERM },
++ { nfserr_noent, -ENOENT },
++ { nfserr_io, -EIO },
++ { nfserr_nxio, -ENXIO },
++ { nfserr_fbig, -E2BIG },
++ { nfserr_stale, -EBADF },
++ { nfserr_acces, -EACCES },
++ { nfserr_exist, -EEXIST },
++ { nfserr_xdev, -EXDEV },
++ { nfserr_mlink, -EMLINK },
++ { nfserr_nodev, -ENODEV },
++ { nfserr_notdir, -ENOTDIR },
++ { nfserr_isdir, -EISDIR },
++ { nfserr_inval, -EINVAL },
++ { nfserr_fbig, -EFBIG },
++ { nfserr_nospc, -ENOSPC },
++ { nfserr_rofs, -EROFS },
++ { nfserr_mlink, -EMLINK },
++ { nfserr_nametoolong, -ENAMETOOLONG },
++ { nfserr_notempty, -ENOTEMPTY },
++ { nfserr_dquot, -EDQUOT },
++ { nfserr_stale, -ESTALE },
++ { nfserr_jukebox, -ETIMEDOUT },
++ { nfserr_jukebox, -ERESTARTSYS },
++ { nfserr_jukebox, -EAGAIN },
++ { nfserr_jukebox, -EWOULDBLOCK },
++ { nfserr_jukebox, -ENOMEM },
++ { nfserr_io, -ETXTBSY },
++ { nfserr_notsupp, -EOPNOTSUPP },
++ { nfserr_toosmall, -ETOOSMALL },
++ { nfserr_serverfault, -ESERVERFAULT },
++ { nfserr_serverfault, -ENFILE },
++ { nfserr_io, -EREMOTEIO },
++ { nfserr_stale, -EOPENSTALE },
++ { nfserr_io, -EUCLEAN },
++ { nfserr_perm, -ENOKEY },
++ { nfserr_no_grace, -ENOGRACE},
++ };
++ int i;
++
++ for (i = 0; i < ARRAY_SIZE(nfs_errtbl); i++) {
++ if (nfs_errtbl[i].syserr == errno)
++ return nfs_errtbl[i].nfserr;
++ }
++ WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
++ return nfserr_io;
++}
++
+ /*
+ * Called from nfsd_lookup and encode_dirent. Check if we have crossed
+ * a mount point.
+@@ -199,27 +261,13 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ goto out_nfserr;
+ }
+ } else {
+- /*
+- * In the nfsd4_open() case, this may be held across
+- * subsequent open and delegation acquisition which may
+- * need to take the child's i_mutex:
+- */
+- fh_lock_nested(fhp, I_MUTEX_PARENT);
+- dentry = lookup_one_len(name, dparent, len);
++ dentry = lookup_one_len_unlocked(name, dparent, len);
+ host_err = PTR_ERR(dentry);
+ if (IS_ERR(dentry))
+ goto out_nfserr;
+ if (nfsd_mountpoint(dentry, exp)) {
+- /*
+- * We don't need the i_mutex after all. It's
+- * still possible we could open this (regular
+- * files can be mountpoints too), but the
+- * i_mutex is just there to prevent renames of
+- * something that we might be about to delegate,
+- * and a mountpoint won't be renamed:
+- */
+- fh_unlock(fhp);
+- if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
++ host_err = nfsd_cross_mnt(rqstp, &dentry, &exp);
++ if (host_err) {
+ dput(dentry);
+ goto out_nfserr;
+ }
+@@ -234,7 +282,15 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ return nfserrno(host_err);
+ }
+
+-/*
++/**
++ * nfsd_lookup - look up a single path component for nfsd
++ *
++ * @rqstp: the request context
++ * @fhp: the file handle of the directory
++ * @name: the component name, or %NULL to look up parent
++ * @len: length of name to examine
++ * @resfh: pointer to pre-initialised filehandle to hold result.
++ *
+ * Look up one component of a pathname.
+ * N.B. After this call _both_ fhp and resfh need an fh_put
+ *
+@@ -244,11 +300,11 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ * returned. Otherwise the covered directory is returned.
+ * NOTE: this mountpoint crossing is not supported properly by all
+ * clients and is explicitly disallowed for NFSv3
+- * NeilBrown <neilb@cse.unsw.edu.au>
++ *
+ */
+ __be32
+ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name,
+- unsigned int len, struct svc_fh *resfh)
++ unsigned int len, struct svc_fh *resfh)
+ {
+ struct svc_export *exp;
+ struct dentry *dentry;
+@@ -306,6 +362,10 @@ commit_metadata(struct svc_fh *fhp)
+ static void
+ nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap)
+ {
++ /* Ignore mode updates on symlinks */
++ if (S_ISLNK(inode->i_mode))
++ iap->ia_valid &= ~ATTR_MODE;
++
+ /* sanitize the mode change */
+ if (iap->ia_valid & ATTR_MODE) {
+ iap->ia_mode &= S_IALLUGO;
+@@ -359,21 +419,77 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ return nfserrno(host_err);
+ }
+
+-/*
+- * Set various file attributes. After this call fhp needs an fh_put.
++static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
++{
++ int host_err;
++
++ if (iap->ia_valid & ATTR_SIZE) {
++ /*
++ * RFC5661, Section 18.30.4:
++ * Changing the size of a file with SETATTR indirectly
++ * changes the time_modify and change attributes.
++ *
++ * (and similar for the older RFCs)
++ */
++ struct iattr size_attr = {
++ .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
++ .ia_size = iap->ia_size,
++ };
++
++ if (iap->ia_size < 0)
++ return -EFBIG;
++
++ host_err = notify_change(dentry, &size_attr, NULL);
++ if (host_err)
++ return host_err;
++ iap->ia_valid &= ~ATTR_SIZE;
++
++ /*
++ * Avoid the additional setattr call below if the only other
++ * attribute that the client sends is the mtime, as we update
++ * it as part of the size change above.
++ */
++ if ((iap->ia_valid & ~ATTR_MTIME) == 0)
++ return 0;
++ }
++
++ if (!iap->ia_valid)
++ return 0;
++
++ iap->ia_valid |= ATTR_CTIME;
++ return notify_change(dentry, iap, NULL);
++}
++
++/**
++ * nfsd_setattr - Set various file attributes.
++ * @rqstp: controlling RPC transaction
++ * @fhp: filehandle of target
++ * @attr: attributes to set
++ * @check_guard: set to 1 if guardtime is a valid timestamp
++ * @guardtime: do not act if ctime.tv_sec does not match this timestamp
++ *
++ * This call may adjust the contents of @attr (in particular, this
++ * call may change the bits in the na_iattr.ia_valid field).
++ *
++ * Returns nfs_ok on success, otherwise an NFS status code is
++ * returned. Caller must release @fhp by calling fh_put in either
++ * case.
+ */
+ __be32
+-nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
++nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ struct nfsd_attrs *attr,
+ int check_guard, time64_t guardtime)
+ {
+ struct dentry *dentry;
+ struct inode *inode;
++ struct iattr *iap = attr->na_iattr;
+ int accmode = NFSD_MAY_SATTR;
+ umode_t ftype = 0;
+ __be32 err;
+- int host_err;
++ int host_err = 0;
+ bool get_write_count;
+ bool size_change = (iap->ia_valid & ATTR_SIZE);
++ int retries;
+
+ if (iap->ia_valid & ATTR_SIZE) {
+ accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
+@@ -409,13 +525,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+ dentry = fhp->fh_dentry;
+ inode = d_inode(dentry);
+
+- /* Ignore any mode updates on symlinks */
+- if (S_ISLNK(inode->i_mode))
+- iap->ia_valid &= ~ATTR_MODE;
+-
+- if (!iap->ia_valid)
+- return 0;
+-
+ nfsd_sanitize_attrs(inode, iap);
+
+ if (check_guard && guardtime != inode->i_ctime.tv_sec)
+@@ -434,45 +543,41 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
+ return err;
+ }
+
+- fh_lock(fhp);
+- if (size_change) {
+- /*
+- * RFC5661, Section 18.30.4:
+- * Changing the size of a file with SETATTR indirectly
+- * changes the time_modify and change attributes.
+- *
+- * (and similar for the older RFCs)
+- */
+- struct iattr size_attr = {
+- .ia_valid = ATTR_SIZE | ATTR_CTIME | ATTR_MTIME,
+- .ia_size = iap->ia_size,
+- };
+-
+- host_err = notify_change(dentry, &size_attr, NULL);
+- if (host_err)
+- goto out_unlock;
+- iap->ia_valid &= ~ATTR_SIZE;
++ inode_lock(inode);
++ fh_fill_pre_attrs(fhp);
++ for (retries = 1;;) {
++ struct iattr attrs;
+
+ /*
+- * Avoid the additional setattr call below if the only other
+- * attribute that the client sends is the mtime, as we update
+- * it as part of the size change above.
++ * notify_change() can alter its iattr argument, making
++ * @iap unsuitable for submission multiple times. Make a
++ * copy for every loop iteration.
+ */
+- if ((iap->ia_valid & ~ATTR_MTIME) == 0)
+- goto out_unlock;
++ attrs = *iap;
++ host_err = __nfsd_setattr(dentry, &attrs);
++ if (host_err != -EAGAIN || !retries--)
++ break;
++ if (!nfsd_wait_for_delegreturn(rqstp, inode))
++ break;
+ }
+-
+- iap->ia_valid |= ATTR_CTIME;
+- host_err = notify_change(dentry, iap, NULL);
+-
+-out_unlock:
+- fh_unlock(fhp);
++ if (attr->na_seclabel && attr->na_seclabel->len)
++ attr->na_labelerr = security_inode_setsecctx(dentry,
++ attr->na_seclabel->data, attr->na_seclabel->len);
++ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && attr->na_pacl)
++ attr->na_aclerr = set_posix_acl(inode, ACL_TYPE_ACCESS,
++ attr->na_pacl);
++ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) &&
++ !attr->na_aclerr && attr->na_dpacl && S_ISDIR(inode->i_mode))
++ attr->na_aclerr = set_posix_acl(inode, ACL_TYPE_DEFAULT,
++ attr->na_dpacl);
++ fh_fill_post_attrs(fhp);
++ inode_unlock(inode);
+ if (size_change)
+ put_write_access(inode);
+ out:
+ if (!host_err)
+ host_err = commit_metadata(fhp);
+- return nfserrno(host_err);
++ return err != 0 ? err : nfserrno(host_err);
+ }
+
+ #if defined(CONFIG_NFSD_V4)
+@@ -503,35 +608,16 @@ int nfsd4_is_junction(struct dentry *dentry)
+ return 0;
+ return 1;
+ }
+-#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
+-__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- struct xdr_netobj *label)
+-{
+- __be32 error;
+- int host_error;
+- struct dentry *dentry;
+
+- error = fh_verify(rqstp, fhp, 0 /* S_IFREG */, NFSD_MAY_SATTR);
+- if (error)
+- return error;
+-
+- dentry = fhp->fh_dentry;
+-
+- inode_lock(d_inode(dentry));
+- host_error = security_inode_setsecctx(dentry, label->data, label->len);
+- inode_unlock(d_inode(dentry));
+- return nfserrno(host_error);
+-}
+-#else
+-__be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- struct xdr_netobj *label)
++static struct nfsd4_compound_state *nfsd4_get_cstate(struct svc_rqst *rqstp)
+ {
+- return nfserr_notsupp;
++ return &((struct nfsd4_compoundres *)rqstp->rq_resp)->cstate;
+ }
+-#endif
+
+-__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
+- struct nfsd_file *nf_dst, u64 dst_pos, u64 count, bool sync)
++__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
++ struct nfsd_file *nf_src, u64 src_pos,
++ struct nfsd_file *nf_dst, u64 dst_pos,
++ u64 count, bool sync)
+ {
+ struct file *src = nf_src->nf_file;
+ struct file *dst = nf_dst->nf_file;
+@@ -558,8 +644,17 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
+ if (!status)
+ status = commit_inode_metadata(file_inode(src));
+ if (status < 0) {
+- nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net,
+- nfsd_net_id));
++ struct nfsd_net *nn = net_generic(nf_dst->nf_net,
++ nfsd_net_id);
++
++ trace_nfsd_clone_file_range_err(rqstp,
++ &nfsd4_get_cstate(rqstp)->save_fh,
++ src_pos,
++ &nfsd4_get_cstate(rqstp)->current_fh,
++ dst_pos,
++ count, status);
++ nfsd_reset_write_verifier(nn);
++ trace_nfsd_writeverf_reset(nn, rqstp, status);
+ ret = nfserrno(status);
+ }
+ }
+@@ -606,7 +701,6 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ }
+ #endif /* defined(CONFIG_NFSD_V4) */
+
+-#ifdef CONFIG_NFSD_V3
+ /*
+ * Check server access rights to a file system object
+ */
+@@ -718,7 +812,6 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor
+ out:
+ return error;
+ }
+-#endif /* CONFIG_NFSD_V3 */
+
+ int nfsd_open_break_lease(struct inode *inode, int access)
+ {
+@@ -751,9 +844,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+ path.dentry = fhp->fh_dentry;
+ inode = d_inode(path.dentry);
+
+- /* Disallow write access to files with the append-only bit set
+- * or any access when mandatory locking enabled
+- */
+ err = nfserr_perm;
+ if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE))
+ goto out;
+@@ -808,6 +898,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+ int may_flags, struct file **filp)
+ {
+ __be32 err;
++ bool retried = false;
+
+ validate_process_creds();
+ /*
+@@ -823,21 +914,37 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+ */
+ if (type == S_IFREG)
+ may_flags |= NFSD_MAY_OWNER_OVERRIDE;
++retry:
+ err = fh_verify(rqstp, fhp, type, may_flags);
+- if (!err)
++ if (!err) {
+ err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
++ if (err == nfserr_stale && !retried) {
++ retried = true;
++ fh_put(fhp);
++ goto retry;
++ }
++ }
+ validate_process_creds();
+ return err;
+ }
+
++/**
++ * nfsd_open_verified - Open a regular file for the filecache
++ * @rqstp: RPC request
++ * @fhp: NFS filehandle of the file to open
++ * @may_flags: internal permission flags
++ * @filp: OUT: open "struct file *"
++ *
++ * Returns an nfsstat value in network byte order.
++ */
+ __be32
+-nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
+- int may_flags, struct file **filp)
++nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags,
++ struct file **filp)
+ {
+ __be32 err;
+
+ validate_process_creds();
+- err = __nfsd_open(rqstp, fhp, type, may_flags, filp);
++ err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp);
+ validate_process_creds();
+ return err;
+ }
+@@ -852,28 +959,24 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+ struct splice_desc *sd)
+ {
+ struct svc_rqst *rqstp = sd->u.data;
+- struct page **pp = rqstp->rq_next_page;
+- struct page *page = buf->page;
+- size_t size;
+-
+- size = sd->len;
+-
+- if (rqstp->rq_res.page_len == 0) {
+- get_page(page);
+- put_page(*rqstp->rq_next_page);
+- *(rqstp->rq_next_page++) = page;
+- rqstp->rq_res.page_base = buf->offset;
+- rqstp->rq_res.page_len = size;
+- } else if (page != pp[-1]) {
+- get_page(page);
+- if (*rqstp->rq_next_page)
+- put_page(*rqstp->rq_next_page);
+- *(rqstp->rq_next_page++) = page;
+- rqstp->rq_res.page_len += size;
+- } else
+- rqstp->rq_res.page_len += size;
++ struct page *page = buf->page; // may be a compound one
++ unsigned offset = buf->offset;
++ struct page *last_page;
+
+- return size;
++ last_page = page + (offset + sd->len - 1) / PAGE_SIZE;
++ for (page += offset / PAGE_SIZE; page <= last_page; page++) {
++ /*
++ * Skip page replacement when extending the contents
++ * of the current page.
++ */
++ if (page == *(rqstp->rq_next_page - 1))
++ continue;
++ svc_rqst_replace_page(rqstp, page);
++ }
++ if (rqstp->rq_res.page_len == 0) // first call
++ rqstp->rq_res.page_base = offset % PAGE_SIZE;
++ rqstp->rq_res.page_len += sd->len;
++ return sd->len;
+ }
+
+ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe,
+@@ -897,7 +1000,7 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ unsigned long *count, u32 *eof, ssize_t host_err)
+ {
+ if (host_err >= 0) {
+- nfsdstats.io_read += host_err;
++ nfsd_stats_io_read_add(fhp->fh_export, host_err);
+ *eof = nfsd_eof_on_read(file, offset, host_err, *count);
+ *count = host_err;
+ fsnotify_access(file);
+@@ -985,7 +1088,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ unsigned long *cnt, int stable,
+ __be32 *verf)
+ {
++ struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+ struct file *file = nf->nf_file;
++ struct super_block *sb = file_inode(file)->i_sb;
+ struct svc_export *exp;
+ struct iov_iter iter;
+ errseq_t since;
+@@ -993,12 +1098,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ int host_err;
+ int use_wgather;
+ loff_t pos = offset;
++ unsigned long exp_op_flags = 0;
+ unsigned int pflags = current->flags;
+ rwf_t flags = 0;
++ bool restore_flags = false;
+
+ trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
+
+- if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
++ if (sb->s_export_op)
++ exp_op_flags = sb->s_export_op->flags;
++
++ if (test_bit(RQ_LOCAL, &rqstp->rq_flags) &&
++ !(exp_op_flags & EXPORT_OP_REMOTE_FS)) {
+ /*
+ * We want throttling in balance_dirty_pages()
+ * and shrink_inactive_list() to only consider
+@@ -1007,6 +1118,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ * the client's dirty pages or its congested queue.
+ */
+ current->flags |= PF_LOCAL_THROTTLE;
++ restore_flags = true;
++ }
+
+ exp = fhp->fh_export;
+ use_wgather = (rqstp->rq_vers == 2) && EX_WGATHER(exp);
+@@ -1019,29 +1132,18 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+
+ iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
+ since = READ_ONCE(file->f_wb_err);
+- if (flags & RWF_SYNC) {
+- if (verf)
+- nfsd_copy_boot_verifier(verf,
+- net_generic(SVC_NET(rqstp),
+- nfsd_net_id));
+- host_err = vfs_iter_write(file, &iter, &pos, flags);
+- if (host_err < 0)
+- nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+- nfsd_net_id));
+- } else {
+- if (verf)
+- nfsd_copy_boot_verifier(verf,
+- net_generic(SVC_NET(rqstp),
+- nfsd_net_id));
+- host_err = vfs_iter_write(file, &iter, &pos, flags);
+- }
++ if (verf)
++ nfsd_copy_write_verifier(verf, nn);
++ file_start_write(file);
++ host_err = vfs_iter_write(file, &iter, &pos, flags);
++ file_end_write(file);
+ if (host_err < 0) {
+- nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+- nfsd_net_id));
++ nfsd_reset_write_verifier(nn);
++ trace_nfsd_writeverf_reset(nn, rqstp, host_err);
+ goto out_nfserr;
+ }
+ *cnt = host_err;
+- nfsdstats.io_write += *cnt;
++ nfsd_stats_io_write_add(exp, *cnt);
+ fsnotify_modify(file);
+ host_err = filemap_check_wb_err(file->f_mapping, since);
+ if (host_err < 0)
+@@ -1049,9 +1151,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+
+ if (stable && use_wgather) {
+ host_err = wait_for_concurrent_writes(file);
+- if (host_err < 0)
+- nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+- nfsd_net_id));
++ if (host_err < 0) {
++ nfsd_reset_write_verifier(nn);
++ trace_nfsd_writeverf_reset(nn, rqstp, host_err);
++ }
+ }
+
+ out_nfserr:
+@@ -1062,7 +1165,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
+ trace_nfsd_write_err(rqstp, fhp, offset, host_err);
+ nfserr = nfserrno(host_err);
+ }
+- if (test_bit(RQ_LOCAL, &rqstp->rq_flags))
++ if (restore_flags)
+ current_restore_flags(pflags, PF_LOCAL_THROTTLE);
+ return nfserr;
+ }
+@@ -1081,7 +1184,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ __be32 err;
+
+ trace_nfsd_read_start(rqstp, fhp, offset, *count);
+- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf);
++ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_READ, &nf);
+ if (err)
+ return err;
+
+@@ -1113,7 +1216,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+
+ trace_nfsd_write_start(rqstp, fhp, offset, *cnt);
+
+- err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf);
++ err = nfsd_file_acquire_gc(rqstp, fhp, NFSD_MAY_WRITE, &nf);
+ if (err)
+ goto out;
+
+@@ -1125,45 +1228,59 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
+ return err;
+ }
+
+-#ifdef CONFIG_NFSD_V3
+-/*
+- * Commit all pending writes to stable storage.
++/**
++ * nfsd_commit - Commit pending writes to stable storage
++ * @rqstp: RPC request being processed
++ * @fhp: NFS filehandle
++ * @nf: target file
++ * @offset: raw offset from beginning of file
++ * @count: raw count of bytes to sync
++ * @verf: filled in with the server's current write verifier
+ *
+- * Note: we only guarantee that data that lies within the range specified
+- * by the 'offset' and 'count' parameters will be synced.
++ * Note: we guarantee that data that lies within the range specified
++ * by the 'offset' and 'count' parameters will be synced. The server
++ * is permitted to sync data that lies outside this range at the
++ * same time.
+ *
+ * Unfortunately we cannot lock the file to make sure we return full WCC
+ * data to the client, as locking happens lower down in the filesystem.
++ *
++ * Return values:
++ * An nfsstat value in network byte order.
+ */
+ __be32
+-nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- loff_t offset, unsigned long count, __be32 *verf)
++nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
++ u64 offset, u32 count, __be32 *verf)
+ {
+- struct nfsd_file *nf;
+- loff_t end = LLONG_MAX;
+- __be32 err = nfserr_inval;
++ __be32 err = nfs_ok;
++ u64 maxbytes;
++ loff_t start, end;
++ struct nfsd_net *nn;
+
+- if (offset < 0)
+- goto out;
+- if (count != 0) {
+- end = offset + (loff_t)count - 1;
+- if (end < offset)
+- goto out;
++ /*
++ * Convert the client-provided (offset, count) range to a
++ * (start, end) range. If the client-provided range falls
++ * outside the maximum file size of the underlying FS,
++ * clamp the sync range appropriately.
++ */
++ start = 0;
++ end = LLONG_MAX;
++ maxbytes = (u64)fhp->fh_dentry->d_sb->s_maxbytes;
++ if (offset < maxbytes) {
++ start = offset;
++ if (count && (offset + count - 1 < maxbytes))
++ end = offset + count - 1;
+ }
+
+- err = nfsd_file_acquire(rqstp, fhp,
+- NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf);
+- if (err)
+- goto out;
++ nn = net_generic(nf->nf_net, nfsd_net_id);
+ if (EX_ISSYNC(fhp->fh_export)) {
+ errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
+ int err2;
+
+- err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
++ err2 = vfs_fsync_range(nf->nf_file, start, end, 0);
+ switch (err2) {
+ case 0:
+- nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
+- nfsd_net_id));
++ nfsd_copy_write_verifier(verf, nn);
+ err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
+ since);
+ err = nfserrno(err2);
+@@ -1172,28 +1289,37 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ err = nfserr_notsupp;
+ break;
+ default:
+- nfsd_reset_boot_verifier(net_generic(nf->nf_net,
+- nfsd_net_id));
++ nfsd_reset_write_verifier(nn);
++ trace_nfsd_writeverf_reset(nn, rqstp, err2);
+ err = nfserrno(err2);
+ }
+ } else
+- nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
+- nfsd_net_id));
++ nfsd_copy_write_verifier(verf, nn);
+
+- nfsd_file_put(nf);
+-out:
+ return err;
+ }
+-#endif /* CONFIG_NFSD_V3 */
+
+-static __be32
+-nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+- struct iattr *iap)
++/**
++ * nfsd_create_setattr - Set a created file's attributes
++ * @rqstp: RPC transaction being executed
++ * @fhp: NFS filehandle of parent directory
++ * @resfhp: NFS filehandle of new object
++ * @attrs: requested attributes of new object
++ *
++ * Returns nfs_ok on success, or an nfsstat in network byte order.
++ */
++__be32
++nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ struct svc_fh *resfhp, struct nfsd_attrs *attrs)
+ {
++ struct iattr *iap = attrs->na_iattr;
++ __be32 status;
++
+ /*
+- * Mode has already been set earlier in create:
++ * Mode has already been set by file creation.
+ */
+ iap->ia_valid &= ~ATTR_MODE;
++
+ /*
+ * Setting uid/gid works only for root. Irix appears to
+ * send along the gid on create when it tries to implement
+@@ -1201,10 +1327,31 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
+ */
+ if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
+ iap->ia_valid &= ~(ATTR_UID|ATTR_GID);
++
++ /*
++ * Callers expect new file metadata to be committed even
++ * if the attributes have not changed.
++ */
+ if (iap->ia_valid)
+- return nfsd_setattr(rqstp, resfhp, iap, 0, (time64_t)0);
+- /* Callers expect file metadata to be committed here */
+- return nfserrno(commit_metadata(resfhp));
++ status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0);
++ else
++ status = nfserrno(commit_metadata(resfhp));
++
++ /*
++ * Transactional filesystems had a chance to commit changes
++ * for both parent and child simultaneously making the
++ * following commit_metadata a noop in many cases.
++ */
++ if (!status)
++ status = nfserrno(commit_metadata(fhp));
++
++ /*
++ * Update the new filehandle to pick up the new attributes.
++ */
++ if (!status)
++ status = fh_update(resfhp);
++
++ return status;
+ }
+
+ /* HPUX client sometimes creates a file in mode 000, and sets size to 0.
+@@ -1225,26 +1372,19 @@ nfsd_check_ignore_resizing(struct iattr *iap)
+ /* The parent directory should already be locked: */
+ __be32
+ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- char *fname, int flen, struct iattr *iap,
+- int type, dev_t rdev, struct svc_fh *resfhp)
++ struct nfsd_attrs *attrs,
++ int type, dev_t rdev, struct svc_fh *resfhp)
+ {
+ struct dentry *dentry, *dchild;
+ struct inode *dirp;
++ struct iattr *iap = attrs->na_iattr;
+ __be32 err;
+- __be32 err2;
+ int host_err;
+
+ dentry = fhp->fh_dentry;
+ dirp = d_inode(dentry);
+
+ dchild = dget(resfhp->fh_dentry);
+- if (!fhp->fh_locked) {
+- WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
+- dentry);
+- err = nfserr_io;
+- goto out;
+- }
+-
+ err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+@@ -1257,7 +1397,6 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ iap->ia_mode &= ~current_umask();
+
+ err = 0;
+- host_err = 0;
+ switch (type) {
+ case S_IFREG:
+ host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+@@ -1303,22 +1442,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ if (host_err < 0)
+ goto out_nfserr;
+
+- err = nfsd_create_setattr(rqstp, resfhp, iap);
++ err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
+
+- /*
+- * nfsd_create_setattr already committed the child. Transactional
+- * filesystems had a chance to commit changes for both parent and
+- * child simultaneously making the following commit_metadata a
+- * noop.
+- */
+- err2 = nfserrno(commit_metadata(fhp));
+- if (err2)
+- err = err2;
+- /*
+- * Update the file handle to get the new inode info.
+- */
+- if (!err)
+- err = fh_update(resfhp);
+ out:
+ dput(dchild);
+ return err;
+@@ -1336,8 +1461,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ */
+ __be32
+ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- char *fname, int flen, struct iattr *iap,
+- int type, dev_t rdev, struct svc_fh *resfhp)
++ char *fname, int flen, struct nfsd_attrs *attrs,
++ int type, dev_t rdev, struct svc_fh *resfhp)
+ {
+ struct dentry *dentry, *dchild = NULL;
+ __be32 err;
+@@ -1356,11 +1481,13 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ if (host_err)
+ return nfserrno(host_err);
+
+- fh_lock_nested(fhp, I_MUTEX_PARENT);
++ inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
+ dchild = lookup_one_len(fname, dentry, flen);
+ host_err = PTR_ERR(dchild);
+- if (IS_ERR(dchild))
+- return nfserrno(host_err);
++ if (IS_ERR(dchild)) {
++ err = nfserrno(host_err);
++ goto out_unlock;
++ }
+ err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+ /*
+ * We unconditionally drop our ref to dchild as fh_compose will have
+@@ -1368,178 +1495,14 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ */
+ dput(dchild);
+ if (err)
+- return err;
+- return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
+- rdev, resfhp);
+-}
+-
+-#ifdef CONFIG_NFSD_V3
+-
+-/*
+- * NFSv3 and NFSv4 version of nfsd_create
+- */
+-__be32
+-do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- char *fname, int flen, struct iattr *iap,
+- struct svc_fh *resfhp, int createmode, u32 *verifier,
+- bool *truncp, bool *created)
+-{
+- struct dentry *dentry, *dchild = NULL;
+- struct inode *dirp;
+- __be32 err;
+- int host_err;
+- __u32 v_mtime=0, v_atime=0;
+-
+- err = nfserr_perm;
+- if (!flen)
+- goto out;
+- err = nfserr_exist;
+- if (isdotent(fname, flen))
+- goto out;
+- if (!(iap->ia_valid & ATTR_MODE))
+- iap->ia_mode = 0;
+- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_EXEC);
+- if (err)
+- goto out;
+-
+- dentry = fhp->fh_dentry;
+- dirp = d_inode(dentry);
+-
+- host_err = fh_want_write(fhp);
+- if (host_err)
+- goto out_nfserr;
+-
+- fh_lock_nested(fhp, I_MUTEX_PARENT);
+-
+- /*
+- * Compose the response file handle.
+- */
+- dchild = lookup_one_len(fname, dentry, flen);
+- host_err = PTR_ERR(dchild);
+- if (IS_ERR(dchild))
+- goto out_nfserr;
+-
+- /* If file doesn't exist, check for permissions to create one */
+- if (d_really_is_negative(dchild)) {
+- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
+- if (err)
+- goto out;
+- }
+-
+- err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+- if (err)
+- goto out;
+-
+- if (nfsd_create_is_exclusive(createmode)) {
+- /* solaris7 gets confused (bugid 4218508) if these have
+- * the high bit set, so just clear the high bits. If this is
+- * ever changed to use different attrs for storing the
+- * verifier, then do_open_lookup() will also need to be fixed
+- * accordingly.
+- */
+- v_mtime = verifier[0]&0x7fffffff;
+- v_atime = verifier[1]&0x7fffffff;
+- }
+-
+- if (d_really_is_positive(dchild)) {
+- err = 0;
+-
+- switch (createmode) {
+- case NFS3_CREATE_UNCHECKED:
+- if (! d_is_reg(dchild))
+- goto out;
+- else if (truncp) {
+- /* in nfsv4, we need to treat this case a little
+- * differently. we don't want to truncate the
+- * file now; this would be wrong if the OPEN
+- * fails for some other reason. furthermore,
+- * if the size is nonzero, we should ignore it
+- * according to spec!
+- */
+- *truncp = (iap->ia_valid & ATTR_SIZE) && !iap->ia_size;
+- }
+- else {
+- iap->ia_valid &= ATTR_SIZE;
+- goto set_attr;
+- }
+- break;
+- case NFS3_CREATE_EXCLUSIVE:
+- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
+- && d_inode(dchild)->i_atime.tv_sec == v_atime
+- && d_inode(dchild)->i_size == 0 ) {
+- if (created)
+- *created = true;
+- break;
+- }
+- fallthrough;
+- case NFS4_CREATE_EXCLUSIVE4_1:
+- if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime
+- && d_inode(dchild)->i_atime.tv_sec == v_atime
+- && d_inode(dchild)->i_size == 0 ) {
+- if (created)
+- *created = true;
+- goto set_attr;
+- }
+- fallthrough;
+- case NFS3_CREATE_GUARDED:
+- err = nfserr_exist;
+- }
+- fh_drop_write(fhp);
+- goto out;
+- }
+-
+- if (!IS_POSIXACL(dirp))
+- iap->ia_mode &= ~current_umask();
+-
+- host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
+- if (host_err < 0) {
+- fh_drop_write(fhp);
+- goto out_nfserr;
+- }
+- if (created)
+- *created = true;
+-
+- nfsd_check_ignore_resizing(iap);
+-
+- if (nfsd_create_is_exclusive(createmode)) {
+- /* Cram the verifier into atime/mtime */
+- iap->ia_valid = ATTR_MTIME|ATTR_ATIME
+- | ATTR_MTIME_SET|ATTR_ATIME_SET;
+- /* XXX someone who knows this better please fix it for nsec */
+- iap->ia_mtime.tv_sec = v_mtime;
+- iap->ia_atime.tv_sec = v_atime;
+- iap->ia_mtime.tv_nsec = 0;
+- iap->ia_atime.tv_nsec = 0;
+- }
+-
+- set_attr:
+- err = nfsd_create_setattr(rqstp, resfhp, iap);
+-
+- /*
+- * nfsd_create_setattr already committed the child
+- * (and possibly also the parent).
+- */
+- if (!err)
+- err = nfserrno(commit_metadata(fhp));
+-
+- /*
+- * Update the filehandle to get the new inode info.
+- */
+- if (!err)
+- err = fh_update(resfhp);
+-
+- out:
+- fh_unlock(fhp);
+- if (dchild && !IS_ERR(dchild))
+- dput(dchild);
+- fh_drop_write(fhp);
+- return err;
+-
+- out_nfserr:
+- err = nfserrno(host_err);
+- goto out;
++ goto out_unlock;
++ fh_fill_pre_attrs(fhp);
++ err = nfsd_create_locked(rqstp, fhp, attrs, type, rdev, resfhp);
++ fh_fill_post_attrs(fhp);
++out_unlock:
++ inode_unlock(dentry->d_inode);
++ return err;
+ }
+-#endif /* CONFIG_NFSD_V3 */
+
+ /*
+ * Read a symlink. On entry, *lenp must contain the maximum path length that
+@@ -1579,15 +1542,25 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
+ return 0;
+ }
+
+-/*
+- * Create a symlink and look up its inode
++/**
++ * nfsd_symlink - Create a symlink and look up its inode
++ * @rqstp: RPC transaction being executed
++ * @fhp: NFS filehandle of parent directory
++ * @fname: filename of the new symlink
++ * @flen: length of @fname
++ * @path: content of the new symlink (NUL-terminated)
++ * @attrs: requested attributes of new object
++ * @resfhp: NFS filehandle of new object
++ *
+ * N.B. After this call _both_ fhp and resfhp need an fh_put
++ *
++ * Returns nfs_ok on success, or an nfsstat in network byte order.
+ */
+ __be32
+ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
+- char *fname, int flen,
+- char *path,
+- struct svc_fh *resfhp)
++ char *fname, int flen,
++ char *path, struct nfsd_attrs *attrs,
++ struct svc_fh *resfhp)
+ {
+ struct dentry *dentry, *dnew;
+ __be32 err, cerr;
+@@ -1605,33 +1578,35 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ goto out;
+
+ host_err = fh_want_write(fhp);
+- if (host_err)
+- goto out_nfserr;
++ if (host_err) {
++ err = nfserrno(host_err);
++ goto out;
++ }
+
+- fh_lock(fhp);
+ dentry = fhp->fh_dentry;
++ inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT);
+ dnew = lookup_one_len(fname, dentry, flen);
+- host_err = PTR_ERR(dnew);
+- if (IS_ERR(dnew))
+- goto out_nfserr;
+-
++ if (IS_ERR(dnew)) {
++ err = nfserrno(PTR_ERR(dnew));
++ inode_unlock(dentry->d_inode);
++ goto out_drop_write;
++ }
++ fh_fill_pre_attrs(fhp);
+ host_err = vfs_symlink(d_inode(dentry), dnew, path);
+ err = nfserrno(host_err);
++ cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
++ if (!err)
++ nfsd_create_setattr(rqstp, fhp, resfhp, attrs);
++ fh_fill_post_attrs(fhp);
++ inode_unlock(dentry->d_inode);
+ if (!err)
+ err = nfserrno(commit_metadata(fhp));
+- fh_unlock(fhp);
+-
+- fh_drop_write(fhp);
+-
+- cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
+ dput(dnew);
+ if (err==0) err = cerr;
++out_drop_write:
++ fh_drop_write(fhp);
+ out:
+ return err;
+-
+-out_nfserr:
+- err = nfserrno(host_err);
+- goto out;
+ }
+
+ /*
+@@ -1669,21 +1644,25 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
+ goto out;
+ }
+
+- fh_lock_nested(ffhp, I_MUTEX_PARENT);
+ ddir = ffhp->fh_dentry;
+ dirp = d_inode(ddir);
++ inode_lock_nested(dirp, I_MUTEX_PARENT);
+
+ dnew = lookup_one_len(name, ddir, len);
+- host_err = PTR_ERR(dnew);
+- if (IS_ERR(dnew))
+- goto out_nfserr;
++ if (IS_ERR(dnew)) {
++ err = nfserrno(PTR_ERR(dnew));
++ goto out_unlock;
++ }
+
+ dold = tfhp->fh_dentry;
+
+ err = nfserr_noent;
+ if (d_really_is_negative(dold))
+ goto out_dput;
++ fh_fill_pre_attrs(ffhp);
+ host_err = vfs_link(dold, dirp, dnew, NULL);
++ fh_fill_post_attrs(ffhp);
++ inode_unlock(dirp);
+ if (!host_err) {
+ err = nfserrno(commit_metadata(ffhp));
+ if (!err)
+@@ -1694,17 +1673,17 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
+ else
+ err = nfserrno(host_err);
+ }
+-out_dput:
+ dput(dnew);
+-out_unlock:
+- fh_unlock(ffhp);
++out_drop_write:
+ fh_drop_write(tfhp);
+ out:
+ return err;
+
+-out_nfserr:
+- err = nfserrno(host_err);
+- goto out_unlock;
++out_dput:
++ dput(dnew);
++out_unlock:
++ inode_unlock(dirp);
++ goto out_drop_write;
+ }
+
+ static void
+@@ -1739,7 +1718,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ struct inode *fdir, *tdir;
+ __be32 err;
+ int host_err;
+- bool has_cached = false;
++ bool close_cached = false;
+
+ err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
+ if (err)
+@@ -1771,12 +1750,9 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ goto out;
+ }
+
+- /* cannot use fh_lock as we need deadlock protective ordering
+- * so do it by hand */
+ trap = lock_rename(tdentry, fdentry);
+- ffhp->fh_locked = tfhp->fh_locked = true;
+- fill_pre_wcc(ffhp);
+- fill_pre_wcc(tfhp);
++ fh_fill_pre_attrs(ffhp);
++ fh_fill_pre_attrs(tfhp);
+
+ odentry = lookup_one_len(fname, fdentry, flen);
+ host_err = PTR_ERR(odentry);
+@@ -1798,11 +1774,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ if (ndentry == trap)
+ goto out_dput_new;
+
+- if (nfsd_has_cached_files(ndentry)) {
+- has_cached = true;
++ if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) &&
++ nfsd_has_cached_files(ndentry)) {
++ close_cached = true;
+ goto out_dput_old;
+ } else {
+- host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0);
++ struct renamedata rd = {
++ .old_dir = fdir,
++ .old_dentry = odentry,
++ .new_dir = tdir,
++ .new_dentry = ndentry,
++ };
++ int retries;
++
++ for (retries = 1;;) {
++ host_err = vfs_rename(&rd);
++ if (host_err != -EAGAIN || !retries--)
++ break;
++ if (!nfsd_wait_for_delegreturn(rqstp, d_inode(odentry)))
++ break;
++ }
+ if (!host_err) {
+ host_err = commit_metadata(tfhp);
+ if (!host_err)
+@@ -1815,17 +1806,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ dput(odentry);
+ out_nfserr:
+ err = nfserrno(host_err);
+- /*
+- * We cannot rely on fh_unlock on the two filehandles,
+- * as that would do the wrong thing if the two directories
+- * were the same, so again we do it by hand.
+- */
+- if (!has_cached) {
+- fill_post_wcc(ffhp);
+- fill_post_wcc(tfhp);
++
++ if (!close_cached) {
++ fh_fill_post_attrs(ffhp);
++ fh_fill_post_attrs(tfhp);
+ }
+ unlock_rename(tdentry, fdentry);
+- ffhp->fh_locked = tfhp->fh_locked = false;
+ fh_drop_write(ffhp);
+
+ /*
+@@ -1834,8 +1820,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
+ * shouldn't be done with locks held however, so we delay it until this
+ * point and then reattempt the whole shebang.
+ */
+- if (has_cached) {
+- has_cached = false;
++ if (close_cached) {
++ close_cached = false;
+ nfsd_close_cached_files(ndentry);
+ dput(ndentry);
+ goto retry;
+@@ -1854,6 +1840,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+ {
+ struct dentry *dentry, *rdentry;
+ struct inode *dirp;
++ struct inode *rinode;
+ __be32 err;
+ int host_err;
+
+@@ -1868,34 +1855,50 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+ if (host_err)
+ goto out_nfserr;
+
+- fh_lock_nested(fhp, I_MUTEX_PARENT);
+ dentry = fhp->fh_dentry;
+ dirp = d_inode(dentry);
++ inode_lock_nested(dirp, I_MUTEX_PARENT);
+
+ rdentry = lookup_one_len(fname, dentry, flen);
+ host_err = PTR_ERR(rdentry);
+ if (IS_ERR(rdentry))
+- goto out_drop_write;
++ goto out_unlock;
+
+ if (d_really_is_negative(rdentry)) {
+ dput(rdentry);
+ host_err = -ENOENT;
+- goto out_drop_write;
++ goto out_unlock;
+ }
++ rinode = d_inode(rdentry);
++ ihold(rinode);
+
+ if (!type)
+ type = d_inode(rdentry)->i_mode & S_IFMT;
+
++ fh_fill_pre_attrs(fhp);
+ if (type != S_IFDIR) {
+- nfsd_close_cached_files(rdentry);
+- host_err = vfs_unlink(dirp, rdentry, NULL);
++ int retries;
++
++ if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK)
++ nfsd_close_cached_files(rdentry);
++
++ for (retries = 1;;) {
++ host_err = vfs_unlink(dirp, rdentry, NULL);
++ if (host_err != -EAGAIN || !retries--)
++ break;
++ if (!nfsd_wait_for_delegreturn(rqstp, rinode))
++ break;
++ }
+ } else {
+ host_err = vfs_rmdir(dirp, rdentry);
+ }
++ fh_fill_post_attrs(fhp);
+
++ inode_unlock(dirp);
+ if (!host_err)
+ host_err = commit_metadata(fhp);
+ dput(rdentry);
++ iput(rinode); /* truncate the inode here */
+
+ out_drop_write:
+ fh_drop_write(fhp);
+@@ -1913,6 +1916,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+ }
+ out:
+ return err;
++out_unlock:
++ inode_unlock(dirp);
++ goto out_drop_write;
+ }
+
+ /*
+@@ -1962,8 +1968,9 @@ static int nfsd_buffered_filldir(struct dir_context *ctx, const char *name,
+ return 0;
+ }
+
+-static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
+- struct readdir_cd *cdp, loff_t *offsetp)
++static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
++ nfsd_filldir_t func, struct readdir_cd *cdp,
++ loff_t *offsetp)
+ {
+ struct buffered_dirent *de;
+ int host_err;
+@@ -2009,6 +2016,8 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
+ if (cdp->err != nfs_ok)
+ break;
+
++ trace_nfsd_dirent(fhp, de->ino, de->name, de->namlen);
++
+ reclen = ALIGN(sizeof(*de) + de->namlen,
+ sizeof(u64));
+ size -= reclen;
+@@ -2056,7 +2065,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
+ goto out_close;
+ }
+
+- err = nfsd_buffered_readdir(file, func, cdp, offsetp);
++ err = nfsd_buffered_readdir(file, fhp, func, cdp, offsetp);
+
+ if (err == nfserr_eof || err == nfserr_toosmall)
+ err = nfs_ok; /* can still be found in ->err */
+@@ -2263,13 +2272,16 @@ nfsd_listxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char **bufp,
+ return err;
+ }
+
+-/*
+- * Removexattr and setxattr need to call fh_lock to both lock the inode
+- * and set the change attribute. Since the top-level vfs_removexattr
+- * and vfs_setxattr calls already do their own inode_lock calls, call
+- * the _locked variant. Pass in a NULL pointer for delegated_inode,
+- * and let the client deal with NFS4ERR_DELAY (same as with e.g.
+- * setattr and remove).
++/**
++ * nfsd_removexattr - Remove an extended attribute
++ * @rqstp: RPC transaction being executed
++ * @fhp: NFS filehandle of object with xattr to remove
++ * @name: name of xattr to remove (NUL-terminate)
++ *
++ * Pass in a NULL pointer for delegated_inode, and let the client deal
++ * with NFS4ERR_DELAY (same as with e.g. setattr and remove).
++ *
++ * Returns nfs_ok on success, or an nfsstat in network byte order.
+ */
+ __be32
+ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
+@@ -2285,11 +2297,13 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name)
+ if (ret)
+ return nfserrno(ret);
+
+- fh_lock(fhp);
++ inode_lock(fhp->fh_dentry->d_inode);
++ fh_fill_pre_attrs(fhp);
+
+ ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL);
+
+- fh_unlock(fhp);
++ fh_fill_post_attrs(fhp);
++ inode_unlock(fhp->fh_dentry->d_inode);
+ fh_drop_write(fhp);
+
+ return nfsd_xattr_errno(ret);
+@@ -2309,12 +2323,13 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name,
+ ret = fh_want_write(fhp);
+ if (ret)
+ return nfserrno(ret);
+- fh_lock(fhp);
++ inode_lock(fhp->fh_dentry->d_inode);
++ fh_fill_pre_attrs(fhp);
+
+ ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags,
+ NULL);
+-
+- fh_unlock(fhp);
++ fh_fill_post_attrs(fhp);
++ inode_unlock(fhp->fh_dentry->d_inode);
+ fh_drop_write(fhp);
+
+ return nfsd_xattr_errno(ret);
+diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
+index a2442ebe5acf6..dbdfef7ae85bb 100644
+--- a/fs/nfsd/vfs.h
++++ b/fs/nfsd/vfs.h
+@@ -6,6 +6,8 @@
+ #ifndef LINUX_NFSD_VFS_H
+ #define LINUX_NFSD_VFS_H
+
++#include <linux/fs.h>
++#include <linux/posix_acl.h>
+ #include "nfsfh.h"
+ #include "nfsd.h"
+
+@@ -42,6 +44,23 @@ struct nfsd_file;
+ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned);
+
+ /* nfsd/vfs.c */
++struct nfsd_attrs {
++ struct iattr *na_iattr; /* input */
++ struct xdr_netobj *na_seclabel; /* input */
++ struct posix_acl *na_pacl; /* input */
++ struct posix_acl *na_dpacl; /* input */
++
++ int na_labelerr; /* output */
++ int na_aclerr; /* output */
++};
++
++static inline void nfsd_attrs_free(struct nfsd_attrs *attrs)
++{
++ posix_acl_release(attrs->na_pacl);
++ posix_acl_release(attrs->na_dpacl);
++}
++
++__be32 nfserrno (int errno);
+ int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
+ struct svc_export **expp);
+ __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *,
+@@ -50,32 +69,28 @@ __be32 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
+ const char *, unsigned int,
+ struct svc_export **, struct dentry **);
+ __be32 nfsd_setattr(struct svc_rqst *, struct svc_fh *,
+- struct iattr *, int, time64_t);
++ struct nfsd_attrs *, int, time64_t);
+ int nfsd_mountpoint(struct dentry *, struct svc_export *);
+ #ifdef CONFIG_NFSD_V4
+-__be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
+- struct xdr_netobj *);
+ __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
+ struct file *, loff_t, loff_t, int);
+-__be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
++__be32 nfsd4_clone_file_range(struct svc_rqst *rqstp,
++ struct nfsd_file *nf_src, u64 src_pos,
+ struct nfsd_file *nf_dst, u64 dst_pos,
+ u64 count, bool sync);
+ #endif /* CONFIG_NFSD_V4 */
+ __be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
+- char *name, int len, struct iattr *attrs,
+- int type, dev_t rdev, struct svc_fh *res);
++ struct nfsd_attrs *attrs, int type, dev_t rdev,
++ struct svc_fh *res);
+ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
+- char *name, int len, struct iattr *attrs,
++ char *name, int len, struct nfsd_attrs *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
+-#ifdef CONFIG_NFSD_V3
+ __be32 nfsd_access(struct svc_rqst *, struct svc_fh *, u32 *, u32 *);
+-__be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *,
+- char *name, int len, struct iattr *attrs,
+- struct svc_fh *res, int createmode,
+- u32 *verifier, bool *truncp, bool *created);
+-__be32 nfsd_commit(struct svc_rqst *, struct svc_fh *,
+- loff_t, unsigned long, __be32 *verf);
+-#endif /* CONFIG_NFSD_V3 */
++__be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
++ struct svc_fh *resfhp, struct nfsd_attrs *iap);
++__be32 nfsd_commit(struct svc_rqst *rqst, struct svc_fh *fhp,
++ struct nfsd_file *nf, u64 offset, u32 count,
++ __be32 *verf);
+ #ifdef CONFIG_NFSD_V4
+ __be32 nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *name, void **bufp, int *lenp);
+@@ -89,7 +104,7 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ int nfsd_open_break_lease(struct inode *, int);
+ __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
+ int, struct file **);
+-__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t,
++__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *,
+ int, struct file **);
+ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct file *file, loff_t offset,
+@@ -113,8 +128,9 @@ __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
+ char *, int *);
+ __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
+- char *name, int len, char *path,
+- struct svc_fh *res);
++ char *name, int len, char *path,
++ struct nfsd_attrs *attrs,
++ struct svc_fh *res);
+ __be32 nfsd_link(struct svc_rqst *, struct svc_fh *,
+ char *, int, struct svc_fh *);
+ ssize_t nfsd_copy_file_range(struct file *, u64,
+@@ -152,7 +168,7 @@ static inline void fh_drop_write(struct svc_fh *fh)
+ }
+ }
+
+-static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
++static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat)
+ {
+ struct path p = {.mnt = fh->fh_export->ex_path.mnt,
+ .dentry = fh->fh_dentry};
+@@ -160,10 +176,4 @@ static inline __be32 fh_getattr(struct svc_fh *fh, struct kstat *stat)
+ AT_STATX_SYNC_AS_STAT));
+ }
+
+-static inline int nfsd_create_is_exclusive(int createmode)
+-{
+- return createmode == NFS3_CREATE_EXCLUSIVE
+- || createmode == NFS4_CREATE_EXCLUSIVE4_1;
+-}
+-
+ #endif /* LINUX_NFSD_VFS_H */
+diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
+index b8cc6a4b2e0ec..852f71580bd06 100644
+--- a/fs/nfsd/xdr.h
++++ b/fs/nfsd/xdr.h
+@@ -27,14 +27,13 @@ struct nfsd_readargs {
+ struct svc_fh fh;
+ __u32 offset;
+ __u32 count;
+- int vlen;
+ };
+
+ struct nfsd_writeargs {
+ svc_fh fh;
+ __u32 offset;
+ __u32 len;
+- struct kvec first;
++ struct xdr_buf payload;
+ };
+
+ struct nfsd_createargs {
+@@ -53,11 +52,6 @@ struct nfsd_renameargs {
+ unsigned int tlen;
+ };
+
+-struct nfsd_readlinkargs {
+- struct svc_fh fh;
+- char * buffer;
+-};
+-
+ struct nfsd_linkargs {
+ struct svc_fh ffh;
+ struct svc_fh tfh;
+@@ -79,7 +73,6 @@ struct nfsd_readdirargs {
+ struct svc_fh fh;
+ __u32 cookie;
+ __u32 count;
+- __be32 * buffer;
+ };
+
+ struct nfsd_stat {
+@@ -101,6 +94,7 @@ struct nfsd_diropres {
+ struct nfsd_readlinkres {
+ __be32 status;
+ int len;
++ struct page *page;
+ };
+
+ struct nfsd_readres {
+@@ -108,17 +102,20 @@ struct nfsd_readres {
+ struct svc_fh fh;
+ unsigned long count;
+ struct kstat stat;
++ struct page **pages;
+ };
+
+ struct nfsd_readdirres {
++ /* Components of the reply */
+ __be32 status;
+
+ int count;
+
++ /* Used to encode the reply's entry list */
++ struct xdr_stream xdr;
++ struct xdr_buf dirlist;
+ struct readdir_cd common;
+- __be32 * buffer;
+- int buflen;
+- __be32 * offset;
++ unsigned int cookie_offset;
+ };
+
+ struct nfsd_statfsres {
+@@ -144,36 +141,37 @@ union nfsd_xdrstore {
+ #define NFS2_SVC_XDRSIZE sizeof(union nfsd_xdrstore)
+
+
+-int nfssvc_decode_void(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_fhandle(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_sattrargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_diropargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_readargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_writeargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_createargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_renameargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_readlinkargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_linkargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_symlinkargs(struct svc_rqst *, __be32 *);
+-int nfssvc_decode_readdirargs(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_void(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_stat(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_attrstat(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_diropres(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_readlinkres(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_readres(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_statfsres(struct svc_rqst *, __be32 *);
+-int nfssvc_encode_readdirres(struct svc_rqst *, __be32 *);
+-
+-int nfssvc_encode_entry(void *, const char *name,
+- int namlen, loff_t offset, u64 ino, unsigned int);
++bool nfssvc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++
++bool nfssvc_encode_statres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_encode_attrstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_encode_diropres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfssvc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++
++void nfssvc_encode_nfscookie(struct nfsd_readdirres *resp, u32 offset);
++int nfssvc_encode_entry(void *data, const char *name, int namlen,
++ loff_t offset, u64 ino, unsigned int d_type);
+
+ void nfssvc_release_attrstat(struct svc_rqst *rqstp);
+ void nfssvc_release_diropres(struct svc_rqst *rqstp);
+ void nfssvc_release_readres(struct svc_rqst *rqstp);
+
+ /* Helper functions for NFSv2 ACL code */
+-__be32 *nfs2svc_encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, struct kstat *stat);
+-__be32 *nfs2svc_decode_fh(__be32 *p, struct svc_fh *fhp);
++bool svcxdr_decode_fhandle(struct xdr_stream *xdr, struct svc_fh *fhp);
++bool svcxdr_encode_stat(struct xdr_stream *xdr, __be32 status);
++bool svcxdr_encode_fattr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ const struct svc_fh *fhp, const struct kstat *stat);
+
+ #endif /* LINUX_NFSD_H */
+diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
+index ae6fa6c9cb467..03fe4e21306cb 100644
+--- a/fs/nfsd/xdr3.h
++++ b/fs/nfsd/xdr3.h
+@@ -25,14 +25,13 @@ struct nfsd3_diropargs {
+
+ struct nfsd3_accessargs {
+ struct svc_fh fh;
+- unsigned int access;
++ __u32 access;
+ };
+
+ struct nfsd3_readargs {
+ struct svc_fh fh;
+ __u64 offset;
+ __u32 count;
+- int vlen;
+ };
+
+ struct nfsd3_writeargs {
+@@ -41,7 +40,7 @@ struct nfsd3_writeargs {
+ __u32 count;
+ int stable;
+ __u32 len;
+- struct kvec first;
++ struct xdr_buf payload;
+ };
+
+ struct nfsd3_createargs {
+@@ -71,11 +70,6 @@ struct nfsd3_renameargs {
+ unsigned int tlen;
+ };
+
+-struct nfsd3_readlinkargs {
+- struct svc_fh fh;
+- char * buffer;
+-};
+-
+ struct nfsd3_linkargs {
+ struct svc_fh ffh;
+ struct svc_fh tfh;
+@@ -96,10 +90,8 @@ struct nfsd3_symlinkargs {
+ struct nfsd3_readdirargs {
+ struct svc_fh fh;
+ __u64 cookie;
+- __u32 dircount;
+ __u32 count;
+ __be32 * verf;
+- __be32 * buffer;
+ };
+
+ struct nfsd3_commitargs {
+@@ -110,13 +102,13 @@ struct nfsd3_commitargs {
+
+ struct nfsd3_getaclargs {
+ struct svc_fh fh;
+- int mask;
++ __u32 mask;
+ };
+
+ struct posix_acl;
+ struct nfsd3_setaclargs {
+ struct svc_fh fh;
+- int mask;
++ __u32 mask;
+ struct posix_acl *acl_access;
+ struct posix_acl *acl_default;
+ };
+@@ -145,6 +137,7 @@ struct nfsd3_readlinkres {
+ __be32 status;
+ struct svc_fh fh;
+ __u32 len;
++ struct page **pages;
+ };
+
+ struct nfsd3_readres {
+@@ -152,6 +145,7 @@ struct nfsd3_readres {
+ struct svc_fh fh;
+ unsigned long count;
+ __u32 eof;
++ struct page **pages;
+ };
+
+ struct nfsd3_writeres {
+@@ -175,19 +169,17 @@ struct nfsd3_linkres {
+ };
+
+ struct nfsd3_readdirres {
++ /* Components of the reply */
+ __be32 status;
+ struct svc_fh fh;
+- /* Just to save kmalloc on every readdirplus entry (svc_fh is a
+- * little large for the stack): */
+- struct svc_fh scratch;
+- int count;
+ __be32 verf[2];
+
++ /* Used to encode the reply's entry list */
++ struct xdr_stream xdr;
++ struct xdr_buf dirlist;
++ struct svc_fh scratch;
+ struct readdir_cd common;
+- __be32 * buffer;
+- int buflen;
+- __be32 * offset;
+- __be32 * offset1;
++ unsigned int cookie_offset;
+ struct svc_rqst * rqstp;
+
+ };
+@@ -273,52 +265,50 @@ union nfsd3_xdrstore {
+
+ #define NFS3_SVC_XDRSIZE sizeof(union nfsd3_xdrstore)
+
+-int nfs3svc_decode_voidarg(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_fhandle(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_sattrargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_diropargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_accessargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_readargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_writeargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_createargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_mkdirargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_mknodargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_renameargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_readlinkargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_linkargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_symlinkargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_readdirargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_readdirplusargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_decode_commitargs(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_voidres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_attrstat(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_wccstat(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_diropres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_accessres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_readlinkres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_readres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_writeres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_createres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_renameres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_linkres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_readdirres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_fsstatres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_fsinfores(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_pathconfres(struct svc_rqst *, __be32 *);
+-int nfs3svc_encode_commitres(struct svc_rqst *, __be32 *);
++bool nfs3svc_decode_fhandleargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_sattrargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_diropargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_accessargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_readargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_createargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_mkdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_mknodargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_renameargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_linkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_decode_commitargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++
++bool nfs3svc_encode_getattrres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_wccstat(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_lookupres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_accessres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_readlinkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_readres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_writeres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_createres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_renameres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_linkres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_readdirres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_fsstatres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_fsinfores(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_pathconfres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs3svc_encode_commitres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
+ void nfs3svc_release_fhandle(struct svc_rqst *);
+ void nfs3svc_release_fhandle2(struct svc_rqst *);
+-int nfs3svc_encode_entry(void *, const char *name,
+- int namlen, loff_t offset, u64 ino,
+- unsigned int);
+-int nfs3svc_encode_entry_plus(void *, const char *name,
+- int namlen, loff_t offset, u64 ino,
+- unsigned int);
+-/* Helper functions for NFSv3 ACL code */
+-__be32 *nfs3svc_encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p,
+- struct svc_fh *fhp);
+-__be32 *nfs3svc_decode_fh(__be32 *p, struct svc_fh *fhp);
+
++void nfs3svc_encode_cookie3(struct nfsd3_readdirres *resp, u64 offset);
++int nfs3svc_encode_entry3(void *data, const char *name, int namlen,
++ loff_t offset, u64 ino, unsigned int d_type);
++int nfs3svc_encode_entryplus3(void *data, const char *name, int namlen,
++ loff_t offset, u64 ino, unsigned int d_type);
++/* Helper functions for NFSv3 ACL code */
++bool svcxdr_decode_nfs_fh3(struct xdr_stream *xdr, struct svc_fh *fhp);
++bool svcxdr_encode_nfsstat3(struct xdr_stream *xdr, __be32 status);
++bool svcxdr_encode_post_op_attr(struct svc_rqst *rqstp, struct xdr_stream *xdr,
++ const struct svc_fh *fhp);
+
+ #endif /* _LINUX_NFSD_XDR3_H */
+diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
+index 679d40af1bbb1..a034b9b62137c 100644
+--- a/fs/nfsd/xdr4.h
++++ b/fs/nfsd/xdr4.h
+@@ -76,12 +76,7 @@ static inline bool nfsd4_has_session(struct nfsd4_compound_state *cs)
+
+ struct nfsd4_change_info {
+ u32 atomic;
+- bool change_supported;
+- u32 before_ctime_sec;
+- u32 before_ctime_nsec;
+ u64 before_change;
+- u32 after_ctime_sec;
+- u32 after_ctime_nsec;
+ u64 after_change;
+ };
+
+@@ -252,7 +247,8 @@ struct nfsd4_listxattrs {
+
+ struct nfsd4_open {
+ u32 op_claim_type; /* request */
+- struct xdr_netobj op_fname; /* request - everything but CLAIM_PREV */
++ u32 op_fnamelen;
++ char * op_fname; /* request - everything but CLAIM_PREV */
+ u32 op_delegate_type; /* request - CLAIM_PREV only */
+ stateid_t op_delegate_stateid; /* request - response */
+ u32 op_why_no_deleg; /* response - DELEG_NONE_EXT only */
+@@ -277,11 +273,13 @@ struct nfsd4_open {
+ bool op_truncate; /* used during processing */
+ bool op_created; /* used during processing */
+ struct nfs4_openowner *op_openowner; /* used during processing */
++ struct file *op_filp; /* used during processing */
+ struct nfs4_file *op_file; /* used during processing */
+ struct nfs4_ol_stateid *op_stp; /* used during processing */
+ struct nfs4_clnt_odstate *op_odstate; /* used during processing */
+ struct nfs4_acl *op_acl;
+ struct xdr_netobj op_label;
++ struct svc_rqst *op_rqstp;
+ };
+
+ struct nfsd4_open_confirm {
+@@ -305,9 +303,10 @@ struct nfsd4_read {
+ u32 rd_length; /* request */
+ int rd_vlen;
+ struct nfsd_file *rd_nf;
+-
++
+ struct svc_rqst *rd_rqstp; /* response */
+- struct svc_fh *rd_fhp; /* response */
++ struct svc_fh *rd_fhp; /* response */
++ u32 rd_eof; /* response */
+ };
+
+ struct nfsd4_readdir {
+@@ -385,13 +384,6 @@ struct nfsd4_setclientid_confirm {
+ nfs4_verifier sc_confirm;
+ };
+
+-struct nfsd4_saved_compoundargs {
+- __be32 *p;
+- __be32 *end;
+- int pagelen;
+- struct page **pagelist;
+-};
+-
+ struct nfsd4_test_stateid_id {
+ __be32 ts_id_status;
+ stateid_t ts_id_stateid;
+@@ -419,8 +411,7 @@ struct nfsd4_write {
+ u64 wr_offset; /* request */
+ u32 wr_stable_how; /* request */
+ u32 wr_buflen; /* request */
+- struct kvec wr_head;
+- struct page ** wr_pagelist; /* request */
++ struct xdr_buf wr_payload; /* request */
+
+ u32 wr_bytes_written; /* response */
+ u32 wr_how_written; /* response */
+@@ -433,7 +424,7 @@ struct nfsd4_exchange_id {
+ u32 flags;
+ clientid_t clientid;
+ u32 seqid;
+- int spa_how;
++ u32 spa_how;
+ u32 spo_must_enforce[3];
+ u32 spo_must_allow[3];
+ struct xdr_netobj nii_domain;
+@@ -543,6 +534,13 @@ struct nfsd42_write_res {
+ stateid_t cb_stateid;
+ };
+
++struct nfsd4_cb_offload {
++ struct nfsd4_callback co_cb;
++ struct nfsd42_write_res co_res;
++ __be32 co_nfserr;
++ struct knfsd_fh co_fh;
++};
++
+ struct nfsd4_copy {
+ /* request */
+ stateid_t cp_src_stateid;
+@@ -550,18 +548,16 @@ struct nfsd4_copy {
+ u64 cp_src_pos;
+ u64 cp_dst_pos;
+ u64 cp_count;
+- struct nl4_server cp_src;
+- bool cp_intra;
++ struct nl4_server *cp_src;
+
+- /* both */
+- bool cp_synchronous;
++ unsigned long cp_flags;
++#define NFSD4_COPY_F_STOPPED (0)
++#define NFSD4_COPY_F_INTRA (1)
++#define NFSD4_COPY_F_SYNCHRONOUS (2)
++#define NFSD4_COPY_F_COMMITTED (3)
+
+ /* response */
+ struct nfsd42_write_res cp_res;
+-
+- /* for cb_offload */
+- struct nfsd4_callback cp_cb;
+- __be32 nfserr;
+ struct knfsd_fh fh;
+
+ struct nfs4_client *cp_clp;
+@@ -574,13 +570,34 @@ struct nfsd4_copy {
+ struct list_head copies;
+ struct task_struct *copy_task;
+ refcount_t refcount;
+- bool stopped;
+
+- struct vfsmount *ss_mnt;
++ struct nfsd4_ssc_umount_item *ss_nsui;
+ struct nfs_fh c_fh;
+ nfs4_stateid stateid;
+ };
+-extern bool inter_copy_offload_enable;
++
++static inline void nfsd4_copy_set_sync(struct nfsd4_copy *copy, bool sync)
++{
++ if (sync)
++ set_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
++ else
++ clear_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
++}
++
++static inline bool nfsd4_copy_is_sync(const struct nfsd4_copy *copy)
++{
++ return test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
++}
++
++static inline bool nfsd4_copy_is_async(const struct nfsd4_copy *copy)
++{
++ return !test_bit(NFSD4_COPY_F_SYNCHRONOUS, &copy->cp_flags);
++}
++
++static inline bool nfsd4_ssc_is_inter(const struct nfsd4_copy *copy)
++{
++ return !test_bit(NFSD4_COPY_F_INTRA, &copy->cp_flags);
++}
+
+ struct nfsd4_seek {
+ /* request */
+@@ -605,19 +622,20 @@ struct nfsd4_offload_status {
+ struct nfsd4_copy_notify {
+ /* request */
+ stateid_t cpn_src_stateid;
+- struct nl4_server cpn_dst;
++ struct nl4_server *cpn_dst;
+
+ /* response */
+ stateid_t cpn_cnr_stateid;
+ u64 cpn_sec;
+ u32 cpn_nsec;
+- struct nl4_server cpn_src;
++ struct nl4_server *cpn_src;
+ };
+
+ struct nfsd4_op {
+- int opnum;
+- const struct nfsd4_operation * opdesc;
++ u32 opnum;
+ __be32 status;
++ const struct nfsd4_operation *opdesc;
++ struct nfs4_replay *replay;
+ union nfsd4_op_u {
+ struct nfsd4_access access;
+ struct nfsd4_close close;
+@@ -681,7 +699,6 @@ struct nfsd4_op {
+ struct nfsd4_listxattrs listxattrs;
+ struct nfsd4_removexattr removexattr;
+ } u;
+- struct nfs4_replay * replay;
+ };
+
+ bool nfsd4_cache_this_op(struct nfsd4_op *);
+@@ -696,35 +713,29 @@ struct svcxdr_tmpbuf {
+
+ struct nfsd4_compoundargs {
+ /* scratch variables for XDR decode */
+- __be32 * p;
+- __be32 * end;
+- struct page ** pagelist;
+- int pagelen;
+- bool tail;
+- __be32 tmp[8];
+- __be32 * tmpp;
++ struct xdr_stream *xdr;
+ struct svcxdr_tmpbuf *to_free;
+-
+ struct svc_rqst *rqstp;
+
+- u32 taglen;
+ char * tag;
++ u32 taglen;
+ u32 minorversion;
++ u32 client_opcnt;
+ u32 opcnt;
+ struct nfsd4_op *ops;
+ struct nfsd4_op iops[8];
+- int cachetype;
+ };
+
+ struct nfsd4_compoundres {
+ /* scratch variables for XDR encode */
+- struct xdr_stream xdr;
++ struct xdr_stream *xdr;
+ struct svc_rqst * rqstp;
+
+- u32 taglen;
++ __be32 *statusp;
+ char * tag;
++ u32 taglen;
+ u32 opcnt;
+- __be32 * tagp; /* tag, opcount encode location */
++
+ struct nfsd4_compound_state cstate;
+ };
+
+@@ -767,24 +778,16 @@ static inline void
+ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
+ {
+ BUG_ON(!fhp->fh_pre_saved);
+- cinfo->atomic = (u32)fhp->fh_post_saved;
+- cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
++ cinfo->atomic = (u32)(fhp->fh_post_saved && !fhp->fh_no_atomic_attr);
+
+ cinfo->before_change = fhp->fh_pre_change;
+ cinfo->after_change = fhp->fh_post_change;
+- cinfo->before_ctime_sec = fhp->fh_pre_ctime.tv_sec;
+- cinfo->before_ctime_nsec = fhp->fh_pre_ctime.tv_nsec;
+- cinfo->after_ctime_sec = fhp->fh_post_attr.ctime.tv_sec;
+- cinfo->after_ctime_nsec = fhp->fh_post_attr.ctime.tv_nsec;
+-
+ }
+
+
+ bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
+-int nfs4svc_decode_voidarg(struct svc_rqst *, __be32 *);
+-int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *);
+-int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *);
+-int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *);
++bool nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nfs4svc_encode_compoundres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+ __be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
+ void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
+ void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op);
+@@ -885,13 +888,19 @@ struct nfsd4_operation {
+ u32 op_flags;
+ char *op_name;
+ /* Try to get response size before operation */
+- u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *);
++ u32 (*op_rsize_bop)(const struct svc_rqst *rqstp,
++ const struct nfsd4_op *op);
+ void (*op_get_currentstateid)(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+ void (*op_set_currentstateid)(struct nfsd4_compound_state *,
+ union nfsd4_op_u *);
+ };
+
++struct nfsd4_cb_recall_any {
++ struct nfsd4_callback ra_cb;
++ u32 ra_keep;
++ u32 ra_bmval[1];
++};
+
+ #endif
+
+diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
+index 547cf07cf4e08..0d39af1b00a0f 100644
+--- a/fs/nfsd/xdr4cb.h
++++ b/fs/nfsd/xdr4cb.h
+@@ -48,3 +48,9 @@
+ #define NFS4_dec_cb_offload_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
++#define NFS4_enc_cb_recall_any_sz (cb_compound_enc_hdr_sz + \
++ cb_sequence_enc_sz + \
++ 1 + 1 + 1)
++#define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \
++ cb_sequence_dec_sz + \
++ op_dec_sz)
+diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
+index e45ca6ecba959..fa81c59a2ad41 100644
+--- a/fs/notify/dnotify/dnotify.c
++++ b/fs/notify/dnotify/dnotify.c
+@@ -150,7 +150,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
+ return;
+ dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark);
+
+- mutex_lock(&dnotify_group->mark_mutex);
++ fsnotify_group_lock(dnotify_group);
+
+ spin_lock(&fsn_mark->lock);
+ prev = &dn_mark->dn;
+@@ -173,7 +173,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
+ free = true;
+ }
+
+- mutex_unlock(&dnotify_group->mark_mutex);
++ fsnotify_group_unlock(dnotify_group);
+
+ if (free)
+ fsnotify_free_mark(fsn_mark);
+@@ -196,7 +196,7 @@ static __u32 convert_arg(unsigned long arg)
+ if (arg & DN_ATTRIB)
+ new_mask |= FS_ATTRIB;
+ if (arg & DN_RENAME)
+- new_mask |= FS_DN_RENAME;
++ new_mask |= FS_RENAME;
+ if (arg & DN_CREATE)
+ new_mask |= (FS_CREATE | FS_MOVED_TO);
+
+@@ -306,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+ new_dn_mark->dn = NULL;
+
+ /* this is needed to prevent the fcntl/close race described below */
+- mutex_lock(&dnotify_group->mark_mutex);
++ fsnotify_group_lock(dnotify_group);
+
+ /* add the new_fsn_mark or find an old one. */
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group);
+@@ -316,7 +316,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+ } else {
+ error = fsnotify_add_inode_mark_locked(new_fsn_mark, inode, 0);
+ if (error) {
+- mutex_unlock(&dnotify_group->mark_mutex);
++ fsnotify_group_unlock(dnotify_group);
+ goto out_err;
+ }
+ spin_lock(&new_fsn_mark->lock);
+@@ -327,7 +327,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+ }
+
+ rcu_read_lock();
+- f = fcheck(fd);
++ f = lookup_fd_rcu(fd);
+ rcu_read_unlock();
+
+ /* if (f != filp) means that we lost a race and another task/thread
+@@ -365,7 +365,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
+
+ if (destroy)
+ fsnotify_detach_mark(fsn_mark);
+- mutex_unlock(&dnotify_group->mark_mutex);
++ fsnotify_group_unlock(dnotify_group);
+ if (destroy)
+ fsnotify_free_mark(fsn_mark);
+ fsnotify_put_mark(fsn_mark);
+@@ -383,7 +383,8 @@ static int __init dnotify_init(void)
+ SLAB_PANIC|SLAB_ACCOUNT);
+ dnotify_mark_cache = KMEM_CACHE(dnotify_mark, SLAB_PANIC|SLAB_ACCOUNT);
+
+- dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops);
++ dnotify_group = fsnotify_alloc_group(&dnotify_fsnotify_ops,
++ FSNOTIFY_GROUP_NOFS);
+ if (IS_ERR(dnotify_group))
+ panic("unable to allocate fsnotify group for dnotify\n");
+ return 0;
+diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
+index c3af99e94f1d1..a2a15bc4df280 100644
+--- a/fs/notify/fanotify/fanotify.c
++++ b/fs/notify/fanotify/fanotify.c
+@@ -14,20 +14,33 @@
+ #include <linux/audit.h>
+ #include <linux/sched/mm.h>
+ #include <linux/statfs.h>
++#include <linux/stringhash.h>
+
+ #include "fanotify.h"
+
+-static bool fanotify_path_equal(struct path *p1, struct path *p2)
++static bool fanotify_path_equal(const struct path *p1, const struct path *p2)
+ {
+ return p1->mnt == p2->mnt && p1->dentry == p2->dentry;
+ }
+
++static unsigned int fanotify_hash_path(const struct path *path)
++{
++ return hash_ptr(path->dentry, FANOTIFY_EVENT_HASH_BITS) ^
++ hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS);
++}
++
+ static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
+ __kernel_fsid_t *fsid2)
+ {
+ return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
+ }
+
++static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid)
++{
++ return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^
++ hash_32(fsid->val[1], FANOTIFY_EVENT_HASH_BITS);
++}
++
+ static bool fanotify_fh_equal(struct fanotify_fh *fh1,
+ struct fanotify_fh *fh2)
+ {
+@@ -38,6 +51,16 @@ static bool fanotify_fh_equal(struct fanotify_fh *fh1,
+ !memcmp(fanotify_fh_buf(fh1), fanotify_fh_buf(fh2), fh1->len);
+ }
+
++static unsigned int fanotify_hash_fh(struct fanotify_fh *fh)
++{
++ long salt = (long)fh->type | (long)fh->len << 8;
++
++ /*
++ * full_name_hash() works long by long, so it handles fh buf optimally.
++ */
++ return full_name_hash((void *)salt, fanotify_fh_buf(fh), fh->len);
++}
++
+ static bool fanotify_fid_event_equal(struct fanotify_fid_event *ffe1,
+ struct fanotify_fid_event *ffe2)
+ {
+@@ -53,8 +76,10 @@ static bool fanotify_info_equal(struct fanotify_info *info1,
+ struct fanotify_info *info2)
+ {
+ if (info1->dir_fh_totlen != info2->dir_fh_totlen ||
++ info1->dir2_fh_totlen != info2->dir2_fh_totlen ||
+ info1->file_fh_totlen != info2->file_fh_totlen ||
+- info1->name_len != info2->name_len)
++ info1->name_len != info2->name_len ||
++ info1->name2_len != info2->name2_len)
+ return false;
+
+ if (info1->dir_fh_totlen &&
+@@ -62,14 +87,24 @@ static bool fanotify_info_equal(struct fanotify_info *info1,
+ fanotify_info_dir_fh(info2)))
+ return false;
+
++ if (info1->dir2_fh_totlen &&
++ !fanotify_fh_equal(fanotify_info_dir2_fh(info1),
++ fanotify_info_dir2_fh(info2)))
++ return false;
++
+ if (info1->file_fh_totlen &&
+ !fanotify_fh_equal(fanotify_info_file_fh(info1),
+ fanotify_info_file_fh(info2)))
+ return false;
+
+- return !info1->name_len ||
+- !memcmp(fanotify_info_name(info1), fanotify_info_name(info2),
+- info1->name_len);
++ if (info1->name_len &&
++ memcmp(fanotify_info_name(info1), fanotify_info_name(info2),
++ info1->name_len))
++ return false;
++
++ return !info1->name2_len ||
++ !memcmp(fanotify_info_name2(info1), fanotify_info_name2(info2),
++ info1->name2_len);
+ }
+
+ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
+@@ -88,16 +123,22 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1,
+ return fanotify_info_equal(info1, info2);
+ }
+
+-static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
+- struct fsnotify_event *new_fsn)
++static bool fanotify_error_event_equal(struct fanotify_error_event *fee1,
++ struct fanotify_error_event *fee2)
+ {
+- struct fanotify_event *old, *new;
++ /* Error events against the same file system are always merged. */
++ if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid))
++ return false;
+
+- pr_debug("%s: old=%p new=%p\n", __func__, old_fsn, new_fsn);
+- old = FANOTIFY_E(old_fsn);
+- new = FANOTIFY_E(new_fsn);
++ return true;
++}
+
+- if (old_fsn->objectid != new_fsn->objectid ||
++static bool fanotify_should_merge(struct fanotify_event *old,
++ struct fanotify_event *new)
++{
++ pr_debug("%s: old=%p new=%p\n", __func__, old, new);
++
++ if (old->hash != new->hash ||
+ old->type != new->type || old->pid != new->pid)
+ return false;
+
+@@ -112,6 +153,13 @@ static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
+ if ((old->mask & FS_ISDIR) != (new->mask & FS_ISDIR))
+ return false;
+
++ /*
++ * FAN_RENAME event is reported with special info record types,
++ * so we cannot merge it with other events.
++ */
++ if ((old->mask & FAN_RENAME) != (new->mask & FAN_RENAME))
++ return false;
++
+ switch (old->type) {
+ case FANOTIFY_EVENT_TYPE_PATH:
+ return fanotify_path_equal(fanotify_event_path(old),
+@@ -122,6 +170,9 @@ static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
+ case FANOTIFY_EVENT_TYPE_FID_NAME:
+ return fanotify_name_event_equal(FANOTIFY_NE(old),
+ FANOTIFY_NE(new));
++ case FANOTIFY_EVENT_TYPE_FS_ERROR:
++ return fanotify_error_event_equal(FANOTIFY_EE(old),
++ FANOTIFY_EE(new));
+ default:
+ WARN_ON_ONCE(1);
+ }
+@@ -133,14 +184,16 @@ static bool fanotify_should_merge(struct fsnotify_event *old_fsn,
+ #define FANOTIFY_MAX_MERGE_EVENTS 128
+
+ /* and the list better be locked by something too! */
+-static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
++static int fanotify_merge(struct fsnotify_group *group,
++ struct fsnotify_event *event)
+ {
+- struct fsnotify_event *test_event;
+- struct fanotify_event *new;
++ struct fanotify_event *old, *new = FANOTIFY_E(event);
++ unsigned int bucket = fanotify_event_hash_bucket(group, new);
++ struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket];
+ int i = 0;
+
+- pr_debug("%s: list=%p event=%p\n", __func__, list, event);
+- new = FANOTIFY_E(event);
++ pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
++ group, event, bucket);
+
+ /*
+ * Don't merge a permission event with any other event so that we know
+@@ -150,11 +203,15 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event)
+ if (fanotify_is_perm_event(new->mask))
+ return 0;
+
+- list_for_each_entry_reverse(test_event, list, list) {
++ hlist_for_each_entry(old, hlist, merge_list) {
+ if (++i > FANOTIFY_MAX_MERGE_EVENTS)
+ break;
+- if (fanotify_should_merge(test_event, event)) {
+- FANOTIFY_E(test_event)->mask |= new->mask;
++ if (fanotify_should_merge(old, new)) {
++ old->mask |= new->mask;
++
++ if (fanotify_is_error_event(old->mask))
++ FANOTIFY_EE(old)->err_count++;
++
+ return 1;
+ }
+ }
+@@ -190,8 +247,11 @@ static int fanotify_get_response(struct fsnotify_group *group,
+ return ret;
+ }
+ /* Event not yet reported? Just remove it. */
+- if (event->state == FAN_EVENT_INIT)
++ if (event->state == FAN_EVENT_INIT) {
+ fsnotify_remove_queued_event(group, &event->fae.fse);
++ /* Permission events are not supposed to be hashed */
++ WARN_ON_ONCE(!hlist_unhashed(&event->fae.merge_list));
++ }
+ /*
+ * Event may be also answered in case signal delivery raced
+ * with wakeup. In that case we have nothing to do besides
+@@ -231,15 +291,17 @@ static int fanotify_get_response(struct fsnotify_group *group,
+ */
+ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+ struct fsnotify_iter_info *iter_info,
+- u32 event_mask, const void *data,
+- int data_type, struct inode *dir)
++ u32 *match_mask, u32 event_mask,
++ const void *data, int data_type,
++ struct inode *dir)
+ {
+- __u32 marks_mask = 0, marks_ignored_mask = 0;
++ __u32 marks_mask = 0, marks_ignore_mask = 0;
+ __u32 test_mask, user_mask = FANOTIFY_OUTGOING_EVENTS |
+ FANOTIFY_EVENT_FLAGS;
+ const struct path *path = fsnotify_data_path(data, data_type);
+ unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ struct fsnotify_mark *mark;
++ bool ondir = event_mask & FAN_ONDIR;
+ int type;
+
+ pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n",
+@@ -254,37 +316,30 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+ return 0;
+ } else if (!(fid_mode & FAN_REPORT_FID)) {
+ /* Do we have a directory inode to report? */
+- if (!dir && !(event_mask & FS_ISDIR))
++ if (!dir && !ondir)
+ return 0;
+ }
+
+- fsnotify_foreach_obj_type(type) {
+- if (!fsnotify_iter_should_report_type(iter_info, type))
+- continue;
+- mark = iter_info->marks[type];
+-
+- /* Apply ignore mask regardless of ISDIR and ON_CHILD flags */
+- marks_ignored_mask |= mark->ignored_mask;
+-
++ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ /*
+- * If the event is on dir and this mark doesn't care about
+- * events on dir, don't send it!
++ * Apply ignore mask depending on event flags in ignore mask.
+ */
+- if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
+- continue;
++ marks_ignore_mask |=
++ fsnotify_effective_ignore_mask(mark, ondir, type);
+
+ /*
+- * If the event is on a child and this mark is on a parent not
+- * watching children, don't send it!
++ * Send the event depending on event flags in mark mask.
+ */
+- if (type == FSNOTIFY_OBJ_TYPE_PARENT &&
+- !(mark->mask & FS_EVENT_ON_CHILD))
++ if (!fsnotify_mask_applicable(mark->mask, ondir, type))
+ continue;
+
+ marks_mask |= mark->mask;
++
++ /* Record the mark types of this group that matched the event */
++ *match_mask |= 1U << type;
+ }
+
+- test_mask = event_mask & marks_mask & ~marks_ignored_mask;
++ test_mask = event_mask & marks_mask & ~marks_ignore_mask;
+
+ /*
+ * For dirent modification events (create/delete/move) that do not carry
+@@ -319,13 +374,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
+ static int fanotify_encode_fh_len(struct inode *inode)
+ {
+ int dwords = 0;
++ int fh_len;
+
+ if (!inode)
+ return 0;
+
+ exportfs_encode_inode_fh(inode, NULL, &dwords, NULL);
++ fh_len = dwords << 2;
+
+- return dwords << 2;
++ /*
++ * struct fanotify_error_event might be preallocated and is
++ * limited to MAX_HANDLE_SZ. This should never happen, but
++ * safeguard by forcing an invalid file handle.
++ */
++ if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ))
++ return 0;
++
++ return fh_len;
+ }
+
+ /*
+@@ -335,7 +400,8 @@ static int fanotify_encode_fh_len(struct inode *inode)
+ * Return 0 on failure to encode.
+ */
+ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
+- unsigned int fh_len, gfp_t gfp)
++ unsigned int fh_len, unsigned int *hash,
++ gfp_t gfp)
+ {
+ int dwords, type = 0;
+ char *ext_buf = NULL;
+@@ -345,15 +411,21 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
+ fh->type = FILEID_ROOT;
+ fh->len = 0;
+ fh->flags = 0;
++
++ /*
++ * Invalid FHs are used by FAN_FS_ERROR for errors not
++ * linked to any inode. The f_handle won't be reported
++ * back to userspace.
++ */
+ if (!inode)
+- return 0;
++ goto out;
+
+ /*
+ * !gpf means preallocated variable size fh, but fh_len could
+ * be zero in that case if encoding fh len failed.
+ */
+ err = -ENOENT;
+- if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4))
++ if (fh_len < 4 || WARN_ON_ONCE(fh_len % 4) || fh_len > MAX_HANDLE_SZ)
+ goto out_err;
+
+ /* No external buffer in a variable size allocated fh */
+@@ -378,6 +450,14 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
+ fh->type = type;
+ fh->len = fh_len;
+
++out:
++ /*
++ * Mix fh into event merge key. Hash might be NULL in case of
++ * unhashed FID events (i.e. FAN_FS_ERROR).
++ */
++ if (hash)
++ *hash ^= fanotify_hash_fh(fh);
++
+ return FANOTIFY_FH_HDR_LEN + fh_len;
+
+ out_err:
+@@ -392,17 +472,41 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
+ }
+
+ /*
+- * The inode to use as identifier when reporting fid depends on the event.
+- * Report the modified directory inode on dirent modification events.
+- * Report the "victim" inode otherwise.
++ * FAN_REPORT_FID is ambiguous in that it reports the fid of the child for
++ * some events and the fid of the parent for create/delete/move events.
++ *
++ * With the FAN_REPORT_TARGET_FID flag, the fid of the child is reported
++ * also in create/delete/move events in addition to the fid of the parent
++ * and the name of the child.
++ */
++static inline bool fanotify_report_child_fid(unsigned int fid_mode, u32 mask)
++{
++ if (mask & ALL_FSNOTIFY_DIRENT_EVENTS)
++ return (fid_mode & FAN_REPORT_TARGET_FID);
++
++ return (fid_mode & FAN_REPORT_FID) && !(mask & FAN_ONDIR);
++}
++
++/*
++ * The inode to use as identifier when reporting fid depends on the event
++ * and the group flags.
++ *
++ * With the group flag FAN_REPORT_TARGET_FID, always report the child fid.
++ *
++ * Without the group flag FAN_REPORT_TARGET_FID, report the modified directory
++ * fid on dirent events and the child fid otherwise.
++ *
+ * For example:
+- * FS_ATTRIB reports the child inode even if reported on a watched parent.
+- * FS_CREATE reports the modified dir inode and not the created inode.
++ * FS_ATTRIB reports the child fid even if reported on a watched parent.
++ * FS_CREATE reports the modified dir fid without FAN_REPORT_TARGET_FID.
++ * and reports the created child fid with FAN_REPORT_TARGET_FID.
+ */
+ static struct inode *fanotify_fid_inode(u32 event_mask, const void *data,
+- int data_type, struct inode *dir)
++ int data_type, struct inode *dir,
++ unsigned int fid_mode)
+ {
+- if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
++ if ((event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) &&
++ !(fid_mode & FAN_REPORT_TARGET_FID))
+ return dir;
+
+ return fsnotify_data_inode(data, data_type);
+@@ -424,13 +528,14 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data,
+ if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS)
+ return dir;
+
+- if (S_ISDIR(inode->i_mode))
++ if (inode && S_ISDIR(inode->i_mode))
+ return inode;
+
+ return dir;
+ }
+
+ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
++ unsigned int *hash,
+ gfp_t gfp)
+ {
+ struct fanotify_path_event *pevent;
+@@ -441,6 +546,7 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path,
+
+ pevent->fae.type = FANOTIFY_EVENT_TYPE_PATH;
+ pevent->path = *path;
++ *hash ^= fanotify_hash_path(path);
+ path_get(path);
+
+ return &pevent->fae;
+@@ -466,6 +572,7 @@ static struct fanotify_event *fanotify_alloc_perm_event(const struct path *path,
+
+ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id,
+ __kernel_fsid_t *fsid,
++ unsigned int *hash,
+ gfp_t gfp)
+ {
+ struct fanotify_fid_event *ffe;
+@@ -476,78 +583,153 @@ static struct fanotify_event *fanotify_alloc_fid_event(struct inode *id,
+
+ ffe->fae.type = FANOTIFY_EVENT_TYPE_FID;
+ ffe->fsid = *fsid;
++ *hash ^= fanotify_hash_fsid(fsid);
+ fanotify_encode_fh(&ffe->object_fh, id, fanotify_encode_fh_len(id),
+- gfp);
++ hash, gfp);
+
+ return &ffe->fae;
+ }
+
+-static struct fanotify_event *fanotify_alloc_name_event(struct inode *id,
++static struct fanotify_event *fanotify_alloc_name_event(struct inode *dir,
+ __kernel_fsid_t *fsid,
+- const struct qstr *file_name,
++ const struct qstr *name,
+ struct inode *child,
++ struct dentry *moved,
++ unsigned int *hash,
+ gfp_t gfp)
+ {
+ struct fanotify_name_event *fne;
+ struct fanotify_info *info;
+ struct fanotify_fh *dfh, *ffh;
+- unsigned int dir_fh_len = fanotify_encode_fh_len(id);
++ struct inode *dir2 = moved ? d_inode(moved->d_parent) : NULL;
++ const struct qstr *name2 = moved ? &moved->d_name : NULL;
++ unsigned int dir_fh_len = fanotify_encode_fh_len(dir);
++ unsigned int dir2_fh_len = fanotify_encode_fh_len(dir2);
+ unsigned int child_fh_len = fanotify_encode_fh_len(child);
+- unsigned int size;
+-
+- size = sizeof(*fne) + FANOTIFY_FH_HDR_LEN + dir_fh_len;
++ unsigned long name_len = name ? name->len : 0;
++ unsigned long name2_len = name2 ? name2->len : 0;
++ unsigned int len, size;
++
++ /* Reserve terminating null byte even for empty name */
++ size = sizeof(*fne) + name_len + name2_len + 2;
++ if (dir_fh_len)
++ size += FANOTIFY_FH_HDR_LEN + dir_fh_len;
++ if (dir2_fh_len)
++ size += FANOTIFY_FH_HDR_LEN + dir2_fh_len;
+ if (child_fh_len)
+ size += FANOTIFY_FH_HDR_LEN + child_fh_len;
+- if (file_name)
+- size += file_name->len + 1;
+ fne = kmalloc(size, gfp);
+ if (!fne)
+ return NULL;
+
+ fne->fae.type = FANOTIFY_EVENT_TYPE_FID_NAME;
+ fne->fsid = *fsid;
++ *hash ^= fanotify_hash_fsid(fsid);
+ info = &fne->info;
+ fanotify_info_init(info);
+- dfh = fanotify_info_dir_fh(info);
+- info->dir_fh_totlen = fanotify_encode_fh(dfh, id, dir_fh_len, 0);
++ if (dir_fh_len) {
++ dfh = fanotify_info_dir_fh(info);
++ len = fanotify_encode_fh(dfh, dir, dir_fh_len, hash, 0);
++ fanotify_info_set_dir_fh(info, len);
++ }
++ if (dir2_fh_len) {
++ dfh = fanotify_info_dir2_fh(info);
++ len = fanotify_encode_fh(dfh, dir2, dir2_fh_len, hash, 0);
++ fanotify_info_set_dir2_fh(info, len);
++ }
+ if (child_fh_len) {
+ ffh = fanotify_info_file_fh(info);
+- info->file_fh_totlen = fanotify_encode_fh(ffh, child, child_fh_len, 0);
++ len = fanotify_encode_fh(ffh, child, child_fh_len, hash, 0);
++ fanotify_info_set_file_fh(info, len);
++ }
++ if (name_len) {
++ fanotify_info_copy_name(info, name);
++ *hash ^= full_name_hash((void *)name_len, name->name, name_len);
++ }
++ if (name2_len) {
++ fanotify_info_copy_name2(info, name2);
++ *hash ^= full_name_hash((void *)name2_len, name2->name,
++ name2_len);
+ }
+- if (file_name)
+- fanotify_info_copy_name(info, file_name);
+
+- pr_debug("%s: ino=%lu size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n",
+- __func__, id->i_ino, size, dir_fh_len, child_fh_len,
++ pr_debug("%s: size=%u dir_fh_len=%u child_fh_len=%u name_len=%u name='%.*s'\n",
++ __func__, size, dir_fh_len, child_fh_len,
+ info->name_len, info->name_len, fanotify_info_name(info));
+
++ if (dir2_fh_len) {
++ pr_debug("%s: dir2_fh_len=%u name2_len=%u name2='%.*s'\n",
++ __func__, dir2_fh_len, info->name2_len,
++ info->name2_len, fanotify_info_name2(info));
++ }
++
+ return &fne->fae;
+ }
+
+-static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+- u32 mask, const void *data,
+- int data_type, struct inode *dir,
+- const struct qstr *file_name,
+- __kernel_fsid_t *fsid)
++static struct fanotify_event *fanotify_alloc_error_event(
++ struct fsnotify_group *group,
++ __kernel_fsid_t *fsid,
++ const void *data, int data_type,
++ unsigned int *hash)
++{
++ struct fs_error_report *report =
++ fsnotify_data_error_report(data, data_type);
++ struct inode *inode;
++ struct fanotify_error_event *fee;
++ int fh_len;
++
++ if (WARN_ON_ONCE(!report))
++ return NULL;
++
++ fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS);
++ if (!fee)
++ return NULL;
++
++ fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR;
++ fee->error = report->error;
++ fee->err_count = 1;
++ fee->fsid = *fsid;
++
++ inode = report->inode;
++ fh_len = fanotify_encode_fh_len(inode);
++
++ /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */
++ if (!fh_len && inode)
++ inode = NULL;
++
++ fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0);
++
++ *hash ^= fanotify_hash_fsid(fsid);
++
++ return &fee->fae;
++}
++
++static struct fanotify_event *fanotify_alloc_event(
++ struct fsnotify_group *group,
++ u32 mask, const void *data, int data_type,
++ struct inode *dir, const struct qstr *file_name,
++ __kernel_fsid_t *fsid, u32 match_mask)
+ {
+ struct fanotify_event *event = NULL;
+ gfp_t gfp = GFP_KERNEL_ACCOUNT;
+- struct inode *id = fanotify_fid_inode(mask, data, data_type, dir);
++ unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
++ struct inode *id = fanotify_fid_inode(mask, data, data_type, dir,
++ fid_mode);
+ struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir);
+ const struct path *path = fsnotify_data_path(data, data_type);
+- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+ struct mem_cgroup *old_memcg;
++ struct dentry *moved = NULL;
+ struct inode *child = NULL;
+ bool name_event = false;
++ unsigned int hash = 0;
++ bool ondir = mask & FAN_ONDIR;
++ struct pid *pid;
+
+ if ((fid_mode & FAN_REPORT_DIR_FID) && dirid) {
+ /*
+- * With both flags FAN_REPORT_DIR_FID and FAN_REPORT_FID, we
+- * report the child fid for events reported on a non-dir child
++ * For certain events and group flags, report the child fid
+ * in addition to reporting the parent fid and maybe child name.
+ */
+- if ((fid_mode & FAN_REPORT_FID) &&
+- id != dirid && !(mask & FAN_ONDIR))
++ if (fanotify_report_child_fid(fid_mode, mask) && id != dirid)
+ child = id;
+
+ id = dirid;
+@@ -568,10 +750,41 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ if (!(fid_mode & FAN_REPORT_NAME)) {
+ name_event = !!child;
+ file_name = NULL;
+- } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
+- !(mask & FAN_ONDIR)) {
++ } else if ((mask & ALL_FSNOTIFY_DIRENT_EVENTS) || !ondir) {
+ name_event = true;
+ }
++
++ /*
++ * In the special case of FAN_RENAME event, use the match_mask
++ * to determine if we need to report only the old parent+name,
++ * only the new parent+name or both.
++ * 'dirid' and 'file_name' are the old parent+name and
++ * 'moved' has the new parent+name.
++ */
++ if (mask & FAN_RENAME) {
++ bool report_old, report_new;
++
++ if (WARN_ON_ONCE(!match_mask))
++ return NULL;
++
++ /* Report both old and new parent+name if sb watching */
++ report_old = report_new =
++ match_mask & (1U << FSNOTIFY_ITER_TYPE_SB);
++ report_old |=
++ match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE);
++ report_new |=
++ match_mask & (1U << FSNOTIFY_ITER_TYPE_INODE2);
++
++ if (!report_old) {
++ /* Do not report old parent+name */
++ dirid = NULL;
++ file_name = NULL;
++ }
++ if (report_new) {
++ /* Report new parent+name */
++ moved = fsnotify_data_dentry(data, data_type);
++ }
++ }
+ }
+
+ /*
+@@ -590,28 +803,30 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+
+ if (fanotify_is_perm_event(mask)) {
+ event = fanotify_alloc_perm_event(path, gfp);
+- } else if (name_event && (file_name || child)) {
+- event = fanotify_alloc_name_event(id, fsid, file_name, child,
+- gfp);
++ } else if (fanotify_is_error_event(mask)) {
++ event = fanotify_alloc_error_event(group, fsid, data,
++ data_type, &hash);
++ } else if (name_event && (file_name || moved || child)) {
++ event = fanotify_alloc_name_event(dirid, fsid, file_name, child,
++ moved, &hash, gfp);
+ } else if (fid_mode) {
+- event = fanotify_alloc_fid_event(id, fsid, gfp);
++ event = fanotify_alloc_fid_event(id, fsid, &hash, gfp);
+ } else {
+- event = fanotify_alloc_path_event(path, gfp);
++ event = fanotify_alloc_path_event(path, &hash, gfp);
+ }
+
+ if (!event)
+ goto out;
+
+- /*
+- * Use the victim inode instead of the watching inode as the id for
+- * event queue, so event reported on parent is merged with event
+- * reported on child when both directory and child watches exist.
+- */
+- fanotify_init_event(event, (unsigned long)id, mask);
+ if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
+- event->pid = get_pid(task_pid(current));
++ pid = get_pid(task_pid(current));
+ else
+- event->pid = get_pid(task_tgid(current));
++ pid = get_pid(task_tgid(current));
++
++ /* Mix event info, FAN_ONDIR flag and pid into event merge key */
++ hash ^= hash_long((unsigned long)pid | ondir, FANOTIFY_EVENT_HASH_BITS);
++ fanotify_init_event(event, hash, mask);
++ event->pid = pid;
+
+ out:
+ set_active_memcg(old_memcg);
+@@ -625,16 +840,14 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
+ */
+ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+ {
++ struct fsnotify_mark *mark;
+ int type;
+ __kernel_fsid_t fsid = {};
+
+- fsnotify_foreach_obj_type(type) {
++ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
+ struct fsnotify_mark_connector *conn;
+
+- if (!fsnotify_iter_should_report_type(iter_info, type))
+- continue;
+-
+- conn = READ_ONCE(iter_info->marks[type]->connector);
++ conn = READ_ONCE(mark->connector);
+ /* Mark is just getting destroyed or created? */
+ if (!conn)
+ continue;
+@@ -651,6 +864,27 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
+ return fsid;
+ }
+
++/*
++ * Add an event to hash table for faster merge.
++ */
++static void fanotify_insert_event(struct fsnotify_group *group,
++ struct fsnotify_event *fsn_event)
++{
++ struct fanotify_event *event = FANOTIFY_E(fsn_event);
++ unsigned int bucket = fanotify_event_hash_bucket(group, event);
++ struct hlist_head *hlist = &group->fanotify_data.merge_hash[bucket];
++
++ assert_spin_locked(&group->notification_lock);
++
++ if (!fanotify_is_hashed_event(event->mask))
++ return;
++
++ pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
++ group, event, bucket);
++
++ hlist_add_head(&event->merge_list, hlist);
++}
++
+ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
+ const void *data, int data_type,
+ struct inode *dir,
+@@ -661,6 +895,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
+ struct fanotify_event *event;
+ struct fsnotify_event *fsn_event;
+ __kernel_fsid_t fsid = {};
++ u32 match_mask = 0;
+
+ BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS);
+ BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY);
+@@ -681,15 +916,18 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
+ BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR);
+ BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC);
+ BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM);
++ BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR);
++ BUILD_BUG_ON(FAN_RENAME != FS_RENAME);
+
+- BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19);
++ BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 21);
+
+- mask = fanotify_group_event_mask(group, iter_info, mask, data,
+- data_type, dir);
++ mask = fanotify_group_event_mask(group, iter_info, &match_mask,
++ mask, data, data_type, dir);
+ if (!mask)
+ return 0;
+
+- pr_debug("%s: group=%p mask=%x\n", __func__, group, mask);
++ pr_debug("%s: group=%p mask=%x report_mask=%x\n", __func__,
++ group, mask, match_mask);
+
+ if (fanotify_is_perm_event(mask)) {
+ /*
+@@ -708,7 +946,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
+ }
+
+ event = fanotify_alloc_event(group, mask, data, data_type, dir,
+- file_name, &fsid);
++ file_name, &fsid, match_mask);
+ ret = -ENOMEM;
+ if (unlikely(!event)) {
+ /*
+@@ -721,7 +959,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
+ }
+
+ fsn_event = &event->fse;
+- ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
++ ret = fsnotify_insert_event(group, fsn_event, fanotify_merge,
++ fanotify_insert_event);
+ if (ret) {
+ /* Permission events shouldn't be merged */
+ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS);
+@@ -742,11 +981,13 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
+
+ static void fanotify_free_group_priv(struct fsnotify_group *group)
+ {
+- struct user_struct *user;
++ kfree(group->fanotify_data.merge_hash);
++ if (group->fanotify_data.ucounts)
++ dec_ucount(group->fanotify_data.ucounts,
++ UCOUNT_FANOTIFY_GROUPS);
+
+- user = group->fanotify_data.user;
+- atomic_dec(&user->fanotify_listeners);
+- free_uid(user);
++ if (mempool_initialized(&group->fanotify_data.error_events_pool))
++ mempool_exit(&group->fanotify_data.error_events_pool);
+ }
+
+ static void fanotify_free_path_event(struct fanotify_event *event)
+@@ -775,7 +1016,16 @@ static void fanotify_free_name_event(struct fanotify_event *event)
+ kfree(FANOTIFY_NE(event));
+ }
+
+-static void fanotify_free_event(struct fsnotify_event *fsn_event)
++static void fanotify_free_error_event(struct fsnotify_group *group,
++ struct fanotify_event *event)
++{
++ struct fanotify_error_event *fee = FANOTIFY_EE(event);
++
++ mempool_free(fee, &group->fanotify_data.error_events_pool);
++}
++
++static void fanotify_free_event(struct fsnotify_group *group,
++ struct fsnotify_event *fsn_event)
+ {
+ struct fanotify_event *event;
+
+@@ -797,11 +1047,21 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event)
+ case FANOTIFY_EVENT_TYPE_OVERFLOW:
+ kfree(event);
+ break;
++ case FANOTIFY_EVENT_TYPE_FS_ERROR:
++ fanotify_free_error_event(group, event);
++ break;
+ default:
+ WARN_ON_ONCE(1);
+ }
+ }
+
++static void fanotify_freeing_mark(struct fsnotify_mark *mark,
++ struct fsnotify_group *group)
++{
++ if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
++ dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_MARKS);
++}
++
+ static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
+ {
+ kmem_cache_free(fanotify_mark_cache, fsn_mark);
+@@ -811,5 +1071,6 @@ const struct fsnotify_ops fanotify_fsnotify_ops = {
+ .handle_event = fanotify_handle_event,
+ .free_group_priv = fanotify_free_group_priv,
+ .free_event = fanotify_free_event,
++ .freeing_mark = fanotify_freeing_mark,
+ .free_mark = fanotify_free_mark,
+ };
+diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
+index 896c819a17863..57f51a9a3015d 100644
+--- a/fs/notify/fanotify/fanotify.h
++++ b/fs/notify/fanotify/fanotify.h
+@@ -3,6 +3,7 @@
+ #include <linux/path.h>
+ #include <linux/slab.h>
+ #include <linux/exportfs.h>
++#include <linux/hashtable.h>
+
+ extern struct kmem_cache *fanotify_mark_cache;
+ extern struct kmem_cache *fanotify_fid_event_cachep;
+@@ -39,15 +40,45 @@ struct fanotify_fh {
+ struct fanotify_info {
+ /* size of dir_fh/file_fh including fanotify_fh hdr size */
+ u8 dir_fh_totlen;
++ u8 dir2_fh_totlen;
+ u8 file_fh_totlen;
+ u8 name_len;
+- u8 pad;
++ u8 name2_len;
++ u8 pad[3];
+ unsigned char buf[];
+ /*
+ * (struct fanotify_fh) dir_fh starts at buf[0]
+- * (optional) file_fh starts at buf[dir_fh_totlen]
+- * name starts at buf[dir_fh_totlen + file_fh_totlen]
++ * (optional) dir2_fh starts at buf[dir_fh_totlen]
++ * (optional) file_fh starts at buf[dir_fh_totlen + dir2_fh_totlen]
++ * name starts at buf[dir_fh_totlen + dir2_fh_totlen + file_fh_totlen]
++ * ...
+ */
++#define FANOTIFY_DIR_FH_SIZE(info) ((info)->dir_fh_totlen)
++#define FANOTIFY_DIR2_FH_SIZE(info) ((info)->dir2_fh_totlen)
++#define FANOTIFY_FILE_FH_SIZE(info) ((info)->file_fh_totlen)
++#define FANOTIFY_NAME_SIZE(info) ((info)->name_len + 1)
++#define FANOTIFY_NAME2_SIZE(info) ((info)->name2_len + 1)
++
++#define FANOTIFY_DIR_FH_OFFSET(info) 0
++#define FANOTIFY_DIR2_FH_OFFSET(info) \
++ (FANOTIFY_DIR_FH_OFFSET(info) + FANOTIFY_DIR_FH_SIZE(info))
++#define FANOTIFY_FILE_FH_OFFSET(info) \
++ (FANOTIFY_DIR2_FH_OFFSET(info) + FANOTIFY_DIR2_FH_SIZE(info))
++#define FANOTIFY_NAME_OFFSET(info) \
++ (FANOTIFY_FILE_FH_OFFSET(info) + FANOTIFY_FILE_FH_SIZE(info))
++#define FANOTIFY_NAME2_OFFSET(info) \
++ (FANOTIFY_NAME_OFFSET(info) + FANOTIFY_NAME_SIZE(info))
++
++#define FANOTIFY_DIR_FH_BUF(info) \
++ ((info)->buf + FANOTIFY_DIR_FH_OFFSET(info))
++#define FANOTIFY_DIR2_FH_BUF(info) \
++ ((info)->buf + FANOTIFY_DIR2_FH_OFFSET(info))
++#define FANOTIFY_FILE_FH_BUF(info) \
++ ((info)->buf + FANOTIFY_FILE_FH_OFFSET(info))
++#define FANOTIFY_NAME_BUF(info) \
++ ((info)->buf + FANOTIFY_NAME_OFFSET(info))
++#define FANOTIFY_NAME2_BUF(info) \
++ ((info)->buf + FANOTIFY_NAME2_OFFSET(info))
+ } __aligned(4);
+
+ static inline bool fanotify_fh_has_ext_buf(struct fanotify_fh *fh)
+@@ -86,7 +117,21 @@ static inline struct fanotify_fh *fanotify_info_dir_fh(struct fanotify_info *inf
+ {
+ BUILD_BUG_ON(offsetof(struct fanotify_info, buf) % 4);
+
+- return (struct fanotify_fh *)info->buf;
++ return (struct fanotify_fh *)FANOTIFY_DIR_FH_BUF(info);
++}
++
++static inline int fanotify_info_dir2_fh_len(struct fanotify_info *info)
++{
++ if (!info->dir2_fh_totlen ||
++ WARN_ON_ONCE(info->dir2_fh_totlen < FANOTIFY_FH_HDR_LEN))
++ return 0;
++
++ return info->dir2_fh_totlen - FANOTIFY_FH_HDR_LEN;
++}
++
++static inline struct fanotify_fh *fanotify_info_dir2_fh(struct fanotify_info *info)
++{
++ return (struct fanotify_fh *)FANOTIFY_DIR2_FH_BUF(info);
+ }
+
+ static inline int fanotify_info_file_fh_len(struct fanotify_info *info)
+@@ -100,27 +145,90 @@ static inline int fanotify_info_file_fh_len(struct fanotify_info *info)
+
+ static inline struct fanotify_fh *fanotify_info_file_fh(struct fanotify_info *info)
+ {
+- return (struct fanotify_fh *)(info->buf + info->dir_fh_totlen);
++ return (struct fanotify_fh *)FANOTIFY_FILE_FH_BUF(info);
++}
++
++static inline char *fanotify_info_name(struct fanotify_info *info)
++{
++ if (!info->name_len)
++ return NULL;
++
++ return FANOTIFY_NAME_BUF(info);
+ }
+
+-static inline const char *fanotify_info_name(struct fanotify_info *info)
++static inline char *fanotify_info_name2(struct fanotify_info *info)
+ {
+- return info->buf + info->dir_fh_totlen + info->file_fh_totlen;
++ if (!info->name2_len)
++ return NULL;
++
++ return FANOTIFY_NAME2_BUF(info);
+ }
+
+ static inline void fanotify_info_init(struct fanotify_info *info)
+ {
++ BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN + MAX_HANDLE_SZ > U8_MAX);
++ BUILD_BUG_ON(NAME_MAX > U8_MAX);
++
+ info->dir_fh_totlen = 0;
++ info->dir2_fh_totlen = 0;
+ info->file_fh_totlen = 0;
+ info->name_len = 0;
++ info->name2_len = 0;
++}
++
++/* These set/copy helpers MUST be called by order */
++static inline void fanotify_info_set_dir_fh(struct fanotify_info *info,
++ unsigned int totlen)
++{
++ if (WARN_ON_ONCE(info->dir2_fh_totlen > 0) ||
++ WARN_ON_ONCE(info->file_fh_totlen > 0) ||
++ WARN_ON_ONCE(info->name_len > 0) ||
++ WARN_ON_ONCE(info->name2_len > 0))
++ return;
++
++ info->dir_fh_totlen = totlen;
++}
++
++static inline void fanotify_info_set_dir2_fh(struct fanotify_info *info,
++ unsigned int totlen)
++{
++ if (WARN_ON_ONCE(info->file_fh_totlen > 0) ||
++ WARN_ON_ONCE(info->name_len > 0) ||
++ WARN_ON_ONCE(info->name2_len > 0))
++ return;
++
++ info->dir2_fh_totlen = totlen;
++}
++
++static inline void fanotify_info_set_file_fh(struct fanotify_info *info,
++ unsigned int totlen)
++{
++ if (WARN_ON_ONCE(info->name_len > 0) ||
++ WARN_ON_ONCE(info->name2_len > 0))
++ return;
++
++ info->file_fh_totlen = totlen;
+ }
+
+ static inline void fanotify_info_copy_name(struct fanotify_info *info,
+ const struct qstr *name)
+ {
++ if (WARN_ON_ONCE(name->len > NAME_MAX) ||
++ WARN_ON_ONCE(info->name2_len > 0))
++ return;
++
+ info->name_len = name->len;
+- strcpy(info->buf + info->dir_fh_totlen + info->file_fh_totlen,
+- name->name);
++ strcpy(fanotify_info_name(info), name->name);
++}
++
++static inline void fanotify_info_copy_name2(struct fanotify_info *info,
++ const struct qstr *name)
++{
++ if (WARN_ON_ONCE(name->len > NAME_MAX))
++ return;
++
++ info->name2_len = name->len;
++ strcpy(fanotify_info_name2(info), name->name);
+ }
+
+ /*
+@@ -135,29 +243,48 @@ enum fanotify_event_type {
+ FANOTIFY_EVENT_TYPE_PATH,
+ FANOTIFY_EVENT_TYPE_PATH_PERM,
+ FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */
++ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */
++ __FANOTIFY_EVENT_TYPE_NUM
+ };
+
++#define FANOTIFY_EVENT_TYPE_BITS \
++ (ilog2(__FANOTIFY_EVENT_TYPE_NUM - 1) + 1)
++#define FANOTIFY_EVENT_HASH_BITS \
++ (32 - FANOTIFY_EVENT_TYPE_BITS)
++
+ struct fanotify_event {
+ struct fsnotify_event fse;
++ struct hlist_node merge_list; /* List for hashed merge */
+ u32 mask;
+- enum fanotify_event_type type;
++ struct {
++ unsigned int type : FANOTIFY_EVENT_TYPE_BITS;
++ unsigned int hash : FANOTIFY_EVENT_HASH_BITS;
++ };
+ struct pid *pid;
+ };
+
+ static inline void fanotify_init_event(struct fanotify_event *event,
+- unsigned long id, u32 mask)
++ unsigned int hash, u32 mask)
+ {
+- fsnotify_init_event(&event->fse, id);
++ fsnotify_init_event(&event->fse);
++ INIT_HLIST_NODE(&event->merge_list);
++ event->hash = hash;
+ event->mask = mask;
+ event->pid = NULL;
+ }
+
++#define FANOTIFY_INLINE_FH(name, size) \
++struct { \
++ struct fanotify_fh (name); \
++ /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
++ unsigned char _inline_fh_buf[(size)]; \
++}
++
+ struct fanotify_fid_event {
+ struct fanotify_event fae;
+ __kernel_fsid_t fsid;
+- struct fanotify_fh object_fh;
+- /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */
+- unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN];
++
++ FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN);
+ };
+
+ static inline struct fanotify_fid_event *
+@@ -178,12 +305,30 @@ FANOTIFY_NE(struct fanotify_event *event)
+ return container_of(event, struct fanotify_name_event, fae);
+ }
+
++struct fanotify_error_event {
++ struct fanotify_event fae;
++ s32 error; /* Error reported by the Filesystem. */
++ u32 err_count; /* Suppressed errors count */
++
++ __kernel_fsid_t fsid; /* FSID this error refers to. */
++
++ FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ);
++};
++
++static inline struct fanotify_error_event *
++FANOTIFY_EE(struct fanotify_event *event)
++{
++ return container_of(event, struct fanotify_error_event, fae);
++}
++
+ static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event)
+ {
+ if (event->type == FANOTIFY_EVENT_TYPE_FID)
+ return &FANOTIFY_FE(event)->fsid;
+ else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
+ return &FANOTIFY_NE(event)->fsid;
++ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
++ return &FANOTIFY_EE(event)->fsid;
+ else
+ return NULL;
+ }
+@@ -195,6 +340,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh(
+ return &FANOTIFY_FE(event)->object_fh;
+ else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME)
+ return fanotify_info_file_fh(&FANOTIFY_NE(event)->info);
++ else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
++ return &FANOTIFY_EE(event)->object_fh;
+ else
+ return NULL;
+ }
+@@ -226,6 +373,37 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event)
+ return info ? fanotify_info_dir_fh_len(info) : 0;
+ }
+
++static inline int fanotify_event_dir2_fh_len(struct fanotify_event *event)
++{
++ struct fanotify_info *info = fanotify_event_info(event);
++
++ return info ? fanotify_info_dir2_fh_len(info) : 0;
++}
++
++static inline bool fanotify_event_has_object_fh(struct fanotify_event *event)
++{
++ /* For error events, even zeroed fh are reported. */
++ if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR)
++ return true;
++ return fanotify_event_object_fh_len(event) > 0;
++}
++
++static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event)
++{
++ return fanotify_event_dir_fh_len(event) > 0;
++}
++
++static inline bool fanotify_event_has_dir2_fh(struct fanotify_event *event)
++{
++ return fanotify_event_dir2_fh_len(event) > 0;
++}
++
++static inline bool fanotify_event_has_any_dir_fh(struct fanotify_event *event)
++{
++ return fanotify_event_has_dir_fh(event) ||
++ fanotify_event_has_dir2_fh(event);
++}
++
+ struct fanotify_path_event {
+ struct fanotify_event fae;
+ struct path path;
+@@ -269,13 +447,12 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse)
+ return container_of(fse, struct fanotify_event, fse);
+ }
+
+-static inline bool fanotify_event_has_path(struct fanotify_event *event)
++static inline bool fanotify_is_error_event(u32 mask)
+ {
+- return event->type == FANOTIFY_EVENT_TYPE_PATH ||
+- event->type == FANOTIFY_EVENT_TYPE_PATH_PERM;
++ return mask & FAN_FS_ERROR;
+ }
+
+-static inline struct path *fanotify_event_path(struct fanotify_event *event)
++static inline const struct path *fanotify_event_path(struct fanotify_event *event)
+ {
+ if (event->type == FANOTIFY_EVENT_TYPE_PATH)
+ return &FANOTIFY_PE(event)->path;
+@@ -284,3 +461,40 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event)
+ else
+ return NULL;
+ }
++
++/*
++ * Use 128 size hash table to speed up events merge.
++ */
++#define FANOTIFY_HTABLE_BITS (7)
++#define FANOTIFY_HTABLE_SIZE (1 << FANOTIFY_HTABLE_BITS)
++#define FANOTIFY_HTABLE_MASK (FANOTIFY_HTABLE_SIZE - 1)
++
++/*
++ * Permission events and overflow event do not get merged - don't hash them.
++ */
++static inline bool fanotify_is_hashed_event(u32 mask)
++{
++ return !(fanotify_is_perm_event(mask) ||
++ fsnotify_is_overflow_event(mask));
++}
++
++static inline unsigned int fanotify_event_hash_bucket(
++ struct fsnotify_group *group,
++ struct fanotify_event *event)
++{
++ return event->hash & FANOTIFY_HTABLE_MASK;
++}
++
++static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
++{
++ unsigned int mflags = 0;
++
++ if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
++ mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
++ if (mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF)
++ mflags |= FAN_MARK_EVICTABLE;
++ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
++ mflags |= FAN_MARK_IGNORE;
++
++ return mflags;
++}
+diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
+index 84de9f97bbc09..5302313f28bed 100644
+--- a/fs/notify/fanotify/fanotify_user.c
++++ b/fs/notify/fanotify/fanotify_user.c
+@@ -1,6 +1,7 @@
+ // SPDX-License-Identifier: GPL-2.0
+ #include <linux/fanotify.h>
+ #include <linux/fcntl.h>
++#include <linux/fdtable.h>
+ #include <linux/file.h>
+ #include <linux/fs.h>
+ #include <linux/anon_inodes.h>
+@@ -27,8 +28,62 @@
+ #include "fanotify.h"
+
+ #define FANOTIFY_DEFAULT_MAX_EVENTS 16384
+-#define FANOTIFY_DEFAULT_MAX_MARKS 8192
+-#define FANOTIFY_DEFAULT_MAX_LISTENERS 128
++#define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192
++#define FANOTIFY_DEFAULT_MAX_GROUPS 128
++#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32
++
++/*
++ * Legacy fanotify marks limits (8192) is per group and we introduced a tunable
++ * limit of marks per user, similar to inotify. Effectively, the legacy limit
++ * of fanotify marks per user is <max marks per group> * <max groups per user>.
++ * This default limit (1M) also happens to match the increased limit of inotify
++ * max_user_watches since v5.10.
++ */
++#define FANOTIFY_DEFAULT_MAX_USER_MARKS \
++ (FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)
++
++/*
++ * Most of the memory cost of adding an inode mark is pinning the marked inode.
++ * The size of the filesystem inode struct is not uniform across filesystems,
++ * so double the size of a VFS inode is used as a conservative approximation.
++ */
++#define INODE_MARK_COST (2 * sizeof(struct inode))
++
++/* configurable via /proc/sys/fs/fanotify/ */
++static int fanotify_max_queued_events __read_mostly;
++
++#ifdef CONFIG_SYSCTL
++
++#include <linux/sysctl.h>
++
++struct ctl_table fanotify_table[] = {
++ {
++ .procname = "max_user_groups",
++ .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO,
++ },
++ {
++ .procname = "max_user_marks",
++ .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO,
++ },
++ {
++ .procname = "max_queued_events",
++ .data = &fanotify_max_queued_events,
++ .maxlen = sizeof(int),
++ .mode = 0644,
++ .proc_handler = proc_dointvec_minmax,
++ .extra1 = SYSCTL_ZERO
++ },
++ { }
++};
++#endif /* CONFIG_SYSCTL */
+
+ /*
+ * All flags that may be specified in parameter event_f_flags of fanotify_init.
+@@ -51,8 +106,12 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly;
+ struct kmem_cache *fanotify_perm_event_cachep __read_mostly;
+
+ #define FANOTIFY_EVENT_ALIGN 4
+-#define FANOTIFY_INFO_HDR_LEN \
++#define FANOTIFY_FID_INFO_HDR_LEN \
+ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
++#define FANOTIFY_PIDFD_INFO_HDR_LEN \
++ sizeof(struct fanotify_event_info_pidfd)
++#define FANOTIFY_ERROR_INFO_LEN \
++ (sizeof(struct fanotify_event_info_error))
+
+ static int fanotify_fid_info_len(int fh_len, int name_len)
+ {
+@@ -61,21 +120,45 @@ static int fanotify_fid_info_len(int fh_len, int name_len)
+ if (name_len)
+ info_len += name_len + 1;
+
+- return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN);
++ return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
++ FANOTIFY_EVENT_ALIGN);
+ }
+
+-static int fanotify_event_info_len(unsigned int fid_mode,
+- struct fanotify_event *event)
++/* FAN_RENAME may have one or two dir+name info records */
++static int fanotify_dir_name_info_len(struct fanotify_event *event)
+ {
+ struct fanotify_info *info = fanotify_event_info(event);
+ int dir_fh_len = fanotify_event_dir_fh_len(event);
+- int fh_len = fanotify_event_object_fh_len(event);
++ int dir2_fh_len = fanotify_event_dir2_fh_len(event);
+ int info_len = 0;
++
++ if (dir_fh_len)
++ info_len += fanotify_fid_info_len(dir_fh_len,
++ info->name_len);
++ if (dir2_fh_len)
++ info_len += fanotify_fid_info_len(dir2_fh_len,
++ info->name2_len);
++
++ return info_len;
++}
++
++static size_t fanotify_event_len(unsigned int info_mode,
++ struct fanotify_event *event)
++{
++ size_t event_len = FAN_EVENT_METADATA_LEN;
++ int fh_len;
+ int dot_len = 0;
+
+- if (dir_fh_len) {
+- info_len += fanotify_fid_info_len(dir_fh_len, info->name_len);
+- } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) {
++ if (!info_mode)
++ return event_len;
++
++ if (fanotify_is_error_event(event->mask))
++ event_len += FANOTIFY_ERROR_INFO_LEN;
++
++ if (fanotify_event_has_any_dir_fh(event)) {
++ event_len += fanotify_dir_name_info_len(event);
++ } else if ((info_mode & FAN_REPORT_NAME) &&
++ (event->mask & FAN_ONDIR)) {
+ /*
+ * With group flag FAN_REPORT_NAME, if name was not recorded in
+ * event on a directory, we will report the name ".".
+@@ -83,10 +166,32 @@ static int fanotify_event_info_len(unsigned int fid_mode,
+ dot_len = 1;
+ }
+
+- if (fh_len)
+- info_len += fanotify_fid_info_len(fh_len, dot_len);
++ if (info_mode & FAN_REPORT_PIDFD)
++ event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
+
+- return info_len;
++ if (fanotify_event_has_object_fh(event)) {
++ fh_len = fanotify_event_object_fh_len(event);
++ event_len += fanotify_fid_info_len(fh_len, dot_len);
++ }
++
++ return event_len;
++}
++
++/*
++ * Remove an hashed event from merge hash table.
++ */
++static void fanotify_unhash_event(struct fsnotify_group *group,
++ struct fanotify_event *event)
++{
++ assert_spin_locked(&group->notification_lock);
++
++ pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
++ group, event, fanotify_event_hash_bucket(group, event));
++
++ if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
++ return;
++
++ hlist_del_init(&event->merge_list);
+ }
+
+ /*
+@@ -98,34 +203,41 @@ static int fanotify_event_info_len(unsigned int fid_mode,
+ static struct fanotify_event *get_one_event(struct fsnotify_group *group,
+ size_t count)
+ {
+- size_t event_size = FAN_EVENT_METADATA_LEN;
++ size_t event_size;
+ struct fanotify_event *event = NULL;
+- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
++ struct fsnotify_event *fsn_event;
++ unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
+
+ pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
+
+ spin_lock(&group->notification_lock);
+- if (fsnotify_notify_queue_is_empty(group))
++ fsn_event = fsnotify_peek_first_event(group);
++ if (!fsn_event)
+ goto out;
+
+- if (fid_mode) {
+- event_size += fanotify_event_info_len(fid_mode,
+- FANOTIFY_E(fsnotify_peek_first_event(group)));
+- }
++ event = FANOTIFY_E(fsn_event);
++ event_size = fanotify_event_len(info_mode, event);
+
+ if (event_size > count) {
+ event = ERR_PTR(-EINVAL);
+ goto out;
+ }
+- event = FANOTIFY_E(fsnotify_remove_first_event(group));
++
++ /*
++ * Held the notification_lock the whole time, so this is the
++ * same event we peeked above.
++ */
++ fsnotify_remove_first_event(group);
+ if (fanotify_is_perm_event(event->mask))
+ FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
++ if (fanotify_is_hashed_event(event->mask))
++ fanotify_unhash_event(group, event);
+ out:
+ spin_unlock(&group->notification_lock);
+ return event;
+ }
+
+-static int create_fd(struct fsnotify_group *group, struct path *path,
++static int create_fd(struct fsnotify_group *group, const struct path *path,
+ struct file **file)
+ {
+ int client_fd;
+@@ -140,7 +252,7 @@ static int create_fd(struct fsnotify_group *group, struct path *path,
+ * originally opened O_WRONLY.
+ */
+ new_file = dentry_open(path,
+- group->fanotify_data.f_flags | FMODE_NONOTIFY,
++ group->fanotify_data.f_flags | __FMODE_NONOTIFY,
+ current_cred());
+ if (IS_ERR(new_file)) {
+ /*
+@@ -225,9 +337,31 @@ static int process_access_response(struct fsnotify_group *group,
+ return -ENOENT;
+ }
+
+-static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+- int info_type, const char *name, size_t name_len,
+- char __user *buf, size_t count)
++static size_t copy_error_info_to_user(struct fanotify_event *event,
++ char __user *buf, int count)
++{
++ struct fanotify_event_info_error info = { };
++ struct fanotify_error_event *fee = FANOTIFY_EE(event);
++
++ info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
++ info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
++
++ if (WARN_ON(count < info.hdr.len))
++ return -EFAULT;
++
++ info.error = fee->error;
++ info.error_count = fee->err_count;
++
++ if (copy_to_user(buf, &info, sizeof(info)))
++ return -EFAULT;
++
++ return info.hdr.len;
++}
++
++static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
++ int info_type, const char *name,
++ size_t name_len,
++ char __user *buf, size_t count)
+ {
+ struct fanotify_event_info_fid info = { };
+ struct file_handle handle = { };
+@@ -239,9 +373,6 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+ pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
+ __func__, fh_len, name_len, info_len, count);
+
+- if (!fh_len)
+- return 0;
+-
+ if (WARN_ON_ONCE(len < sizeof(info) || len > count))
+ return -EFAULT;
+
+@@ -256,6 +387,8 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+ return -EFAULT;
+ break;
+ case FAN_EVENT_INFO_TYPE_DFID_NAME:
++ case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
++ case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
+ if (WARN_ON_ONCE(!name || !name_len))
+ return -EFAULT;
+ break;
+@@ -276,6 +409,11 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+
+ handle.handle_type = fh->type;
+ handle.handle_bytes = fh_len;
++
++ /* Mangle handle_type for bad file_handle */
++ if (!fh_len)
++ handle.handle_type = FILEID_INVALID;
++
+ if (copy_to_user(buf, &handle, sizeof(handle)))
+ return -EFAULT;
+
+@@ -320,68 +458,79 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
+ return info_len;
+ }
+
+-static ssize_t copy_event_to_user(struct fsnotify_group *group,
+- struct fanotify_event *event,
+- char __user *buf, size_t count)
++static int copy_pidfd_info_to_user(int pidfd,
++ char __user *buf,
++ size_t count)
+ {
+- struct fanotify_event_metadata metadata;
+- struct path *path = fanotify_event_path(event);
+- struct fanotify_info *info = fanotify_event_info(event);
+- unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+- struct file *f = NULL;
+- int ret, fd = FAN_NOFD;
+- int info_type = 0;
++ struct fanotify_event_info_pidfd info = { };
++ size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
+
+- pr_debug("%s: group=%p event=%p\n", __func__, group, event);
++ if (WARN_ON_ONCE(info_len > count))
++ return -EFAULT;
+
+- metadata.event_len = FAN_EVENT_METADATA_LEN +
+- fanotify_event_info_len(fid_mode, event);
+- metadata.metadata_len = FAN_EVENT_METADATA_LEN;
+- metadata.vers = FANOTIFY_METADATA_VERSION;
+- metadata.reserved = 0;
+- metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
+- metadata.pid = pid_vnr(event->pid);
++ info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
++ info.hdr.len = info_len;
++ info.pidfd = pidfd;
+
+- if (path && path->mnt && path->dentry) {
+- fd = create_fd(group, path, &f);
+- if (fd < 0)
+- return fd;
+- }
+- metadata.fd = fd;
++ if (copy_to_user(buf, &info, info_len))
++ return -EFAULT;
++
++ return info_len;
++}
++
++static int copy_info_records_to_user(struct fanotify_event *event,
++ struct fanotify_info *info,
++ unsigned int info_mode, int pidfd,
++ char __user *buf, size_t count)
++{
++ int ret, total_bytes = 0, info_type = 0;
++ unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
++ unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
+
+- ret = -EFAULT;
+ /*
+- * Sanity check copy size in case get_one_event() and
+- * event_len sizes ever get out of sync.
++ * Event info records order is as follows:
++ * 1. dir fid + name
++ * 2. (optional) new dir fid + new name
++ * 3. (optional) child fid
+ */
+- if (WARN_ON_ONCE(metadata.event_len > count))
+- goto out_close_fd;
++ if (fanotify_event_has_dir_fh(event)) {
++ info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
++ FAN_EVENT_INFO_TYPE_DFID;
+
+- if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
+- goto out_close_fd;
++ /* FAN_RENAME uses special info types */
++ if (event->mask & FAN_RENAME)
++ info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;
+
+- buf += FAN_EVENT_METADATA_LEN;
+- count -= FAN_EVENT_METADATA_LEN;
++ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
++ fanotify_info_dir_fh(info),
++ info_type,
++ fanotify_info_name(info),
++ info->name_len, buf, count);
++ if (ret < 0)
++ return ret;
+
+- if (fanotify_is_perm_event(event->mask))
+- FANOTIFY_PERM(event)->fd = fd;
++ buf += ret;
++ count -= ret;
++ total_bytes += ret;
++ }
+
+- /* Event info records order is: dir fid + name, child fid */
+- if (fanotify_event_dir_fh_len(event)) {
+- info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
+- FAN_EVENT_INFO_TYPE_DFID;
+- ret = copy_info_to_user(fanotify_event_fsid(event),
+- fanotify_info_dir_fh(info),
+- info_type, fanotify_info_name(info),
+- info->name_len, buf, count);
++ /* New dir fid+name may be reported in addition to old dir fid+name */
++ if (fanotify_event_has_dir2_fh(event)) {
++ info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
++ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
++ fanotify_info_dir2_fh(info),
++ info_type,
++ fanotify_info_name2(info),
++ info->name2_len, buf, count);
+ if (ret < 0)
+- goto out_close_fd;
++ return ret;
+
+ buf += ret;
+ count -= ret;
++ total_bytes += ret;
+ }
+
+- if (fanotify_event_object_fh_len(event)) {
++ if (fanotify_event_has_object_fh(event)) {
+ const char *dot = NULL;
+ int dot_len = 0;
+
+@@ -395,8 +544,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
+ (event->mask & FAN_ONDIR)) {
+ /*
+ * With group flag FAN_REPORT_NAME, if name was not
+- * recorded in an event on a directory, report the
+- * name "." with info type DFID_NAME.
++ * recorded in an event on a directory, report the name
++ * "." with info type DFID_NAME.
+ */
+ info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
+ dot = ".";
+@@ -419,14 +568,132 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
+ info_type = FAN_EVENT_INFO_TYPE_FID;
+ }
+
+- ret = copy_info_to_user(fanotify_event_fsid(event),
+- fanotify_event_object_fh(event),
+- info_type, dot, dot_len, buf, count);
++ ret = copy_fid_info_to_user(fanotify_event_fsid(event),
++ fanotify_event_object_fh(event),
++ info_type, dot, dot_len,
++ buf, count);
+ if (ret < 0)
+- goto out_close_fd;
++ return ret;
++
++ buf += ret;
++ count -= ret;
++ total_bytes += ret;
++ }
++
++ if (pidfd_mode) {
++ ret = copy_pidfd_info_to_user(pidfd, buf, count);
++ if (ret < 0)
++ return ret;
++
++ buf += ret;
++ count -= ret;
++ total_bytes += ret;
++ }
+
++ if (fanotify_is_error_event(event->mask)) {
++ ret = copy_error_info_to_user(event, buf, count);
++ if (ret < 0)
++ return ret;
+ buf += ret;
+ count -= ret;
++ total_bytes += ret;
++ }
++
++ return total_bytes;
++}
++
++static ssize_t copy_event_to_user(struct fsnotify_group *group,
++ struct fanotify_event *event,
++ char __user *buf, size_t count)
++{
++ struct fanotify_event_metadata metadata;
++ const struct path *path = fanotify_event_path(event);
++ struct fanotify_info *info = fanotify_event_info(event);
++ unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
++ unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
++ struct file *f = NULL;
++ int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD;
++
++ pr_debug("%s: group=%p event=%p\n", __func__, group, event);
++
++ metadata.event_len = fanotify_event_len(info_mode, event);
++ metadata.metadata_len = FAN_EVENT_METADATA_LEN;
++ metadata.vers = FANOTIFY_METADATA_VERSION;
++ metadata.reserved = 0;
++ metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
++ metadata.pid = pid_vnr(event->pid);
++ /*
++ * For an unprivileged listener, event->pid can be used to identify the
++ * events generated by the listener process itself, without disclosing
++ * the pids of other processes.
++ */
++ if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
++ task_tgid(current) != event->pid)
++ metadata.pid = 0;
++
++ /*
++ * For now, fid mode is required for an unprivileged listener and
++ * fid mode does not report fd in events. Keep this check anyway
++ * for safety in case fid mode requirement is relaxed in the future
++ * to allow unprivileged listener to get events with no fd and no fid.
++ */
++ if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
++ path && path->mnt && path->dentry) {
++ fd = create_fd(group, path, &f);
++ if (fd < 0)
++ return fd;
++ }
++ metadata.fd = fd;
++
++ if (pidfd_mode) {
++ /*
++ * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
++ * exclusion is ever lifted. At the time of incoporating pidfd
++ * support within fanotify, the pidfd API only supported the
++ * creation of pidfds for thread-group leaders.
++ */
++ WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));
++
++ /*
++ * The PIDTYPE_TGID check for an event->pid is performed
++ * preemptively in an attempt to catch out cases where the event
++ * listener reads events after the event generating process has
++ * already terminated. Report FAN_NOPIDFD to the event listener
++ * in those cases, with all other pidfd creation errors being
++ * reported as FAN_EPIDFD.
++ */
++ if (metadata.pid == 0 ||
++ !pid_has_task(event->pid, PIDTYPE_TGID)) {
++ pidfd = FAN_NOPIDFD;
++ } else {
++ pidfd = pidfd_create(event->pid, 0);
++ if (pidfd < 0)
++ pidfd = FAN_EPIDFD;
++ }
++ }
++
++ ret = -EFAULT;
++ /*
++ * Sanity check copy size in case get_one_event() and
++ * event_len sizes ever get out of sync.
++ */
++ if (WARN_ON_ONCE(metadata.event_len > count))
++ goto out_close_fd;
++
++ if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
++ goto out_close_fd;
++
++ buf += FAN_EVENT_METADATA_LEN;
++ count -= FAN_EVENT_METADATA_LEN;
++
++ if (fanotify_is_perm_event(event->mask))
++ FANOTIFY_PERM(event)->fd = fd;
++
++ if (info_mode) {
++ ret = copy_info_records_to_user(event, info, info_mode, pidfd,
++ buf, count);
++ if (ret < 0)
++ goto out_close_fd;
+ }
+
+ if (f)
+@@ -439,6 +706,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
+ put_unused_fd(fd);
+ fput(f);
+ }
++
++ if (pidfd >= 0)
++ close_fd(pidfd);
++
+ return ret;
+ }
+
+@@ -573,6 +844,7 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
+ static int fanotify_release(struct inode *ignored, struct file *file)
+ {
+ struct fsnotify_group *group = file->private_data;
++ struct fsnotify_event *fsn_event;
+
+ /*
+ * Stop new events from arriving in the notification queue. since
+@@ -601,13 +873,12 @@ static int fanotify_release(struct inode *ignored, struct file *file)
+ * dequeue them and set the response. They will be freed once the
+ * response is consumed and fanotify_get_response() returns.
+ */
+- while (!fsnotify_notify_queue_is_empty(group)) {
+- struct fanotify_event *event;
++ while ((fsn_event = fsnotify_remove_first_event(group))) {
++ struct fanotify_event *event = FANOTIFY_E(fsn_event);
+
+- event = FANOTIFY_E(fsnotify_remove_first_event(group));
+ if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
+ spin_unlock(&group->notification_lock);
+- fsnotify_destroy_event(group, &event->fse);
++ fsnotify_destroy_event(group, fsn_event);
+ } else {
+ finish_permission_event(group, FANOTIFY_PERM(event),
+ FAN_ALLOW);
+@@ -702,7 +973,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
+ }
+
+ /* you can only watch an inode if you have read permissions on it */
+- ret = inode_permission(path->dentry->d_inode, MAY_READ);
++ ret = path_permission(path, MAY_READ);
+ if (ret) {
+ path_put(path);
+ goto out;
+@@ -720,27 +991,28 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
+ __u32 mask, unsigned int flags,
+ __u32 umask, int *destroy)
+ {
+- __u32 oldmask = 0;
++ __u32 oldmask, newmask;
+
+ /* umask bits cannot be removed by user */
+ mask &= ~umask;
+ spin_lock(&fsn_mark->lock);
+- if (!(flags & FAN_MARK_IGNORED_MASK)) {
+- oldmask = fsn_mark->mask;
++ oldmask = fsnotify_calc_mask(fsn_mark);
++ if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
+ fsn_mark->mask &= ~mask;
+ } else {
+- fsn_mark->ignored_mask &= ~mask;
++ fsn_mark->ignore_mask &= ~mask;
+ }
++ newmask = fsnotify_calc_mask(fsn_mark);
+ /*
+ * We need to keep the mark around even if remaining mask cannot
+ * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
+ * changes to the mask.
+ * Destroy mark when only umask bits remain.
+ */
+- *destroy = !((fsn_mark->mask | fsn_mark->ignored_mask) & ~umask);
++ *destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
+ spin_unlock(&fsn_mark->lock);
+
+- return mask & oldmask;
++ return oldmask & ~newmask;
+ }
+
+ static int fanotify_remove_mark(struct fsnotify_group *group,
+@@ -751,10 +1023,10 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
+ __u32 removed;
+ int destroy_mark;
+
+- mutex_lock(&group->mark_mutex);
++ fsnotify_group_lock(group);
+ fsn_mark = fsnotify_find_mark(connp, group);
+ if (!fsn_mark) {
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+ return -ENOENT;
+ }
+
+@@ -764,7 +1036,7 @@ static int fanotify_remove_mark(struct fsnotify_group *group,
+ fsnotify_recalc_mask(fsn_mark->connector);
+ if (destroy_mark)
+ fsnotify_detach_mark(fsn_mark);
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+ if (destroy_mark)
+ fsnotify_free_mark(fsn_mark);
+
+@@ -797,76 +1069,199 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group,
+ flags, umask);
+ }
+
+-static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
+- __u32 mask,
+- unsigned int flags)
++static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
++ unsigned int fan_flags)
++{
++ bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
++ unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
++ bool recalc = false;
++
++ /*
++ * When using FAN_MARK_IGNORE for the first time, mark starts using
++ * independent event flags in ignore mask. After that, trying to
++ * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
++ * will result in EEXIST error.
++ */
++ if (ignore == FAN_MARK_IGNORE)
++ fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
++
++ /*
++ * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
++ * the removal of the FS_MODIFY bit in calculated mask if it was set
++ * because of an ignore mask that is now going to survive FS_MODIFY.
++ */
++ if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
++ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
++ fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
++ if (!(fsn_mark->mask & FS_MODIFY))
++ recalc = true;
++ }
++
++ if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
++ want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
++ return recalc;
++
++ /*
++ * NO_IREF may be removed from a mark, but not added.
++ * When removed, fsnotify_recalc_mask() will take the inode ref.
++ */
++ WARN_ON_ONCE(!want_iref);
++ fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
++
++ return true;
++}
++
++static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
++ __u32 mask, unsigned int fan_flags)
+ {
+- __u32 oldmask = -1;
++ bool recalc;
+
+ spin_lock(&fsn_mark->lock);
+- if (!(flags & FAN_MARK_IGNORED_MASK)) {
+- oldmask = fsn_mark->mask;
++ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
+ fsn_mark->mask |= mask;
+- } else {
+- fsn_mark->ignored_mask |= mask;
+- if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
+- fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
+- }
++ else
++ fsn_mark->ignore_mask |= mask;
++
++ recalc = fsnotify_calc_mask(fsn_mark) &
++ ~fsnotify_conn_mask(fsn_mark->connector);
++
++ recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
+ spin_unlock(&fsn_mark->lock);
+
+- return mask & ~oldmask;
++ return recalc;
+ }
+
+ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
+ fsnotify_connp_t *connp,
+- unsigned int type,
++ unsigned int obj_type,
++ unsigned int fan_flags,
+ __kernel_fsid_t *fsid)
+ {
++ struct ucounts *ucounts = group->fanotify_data.ucounts;
+ struct fsnotify_mark *mark;
+ int ret;
+
+- if (atomic_read(&group->num_marks) > group->fanotify_data.max_marks)
++ /*
++ * Enforce per user marks limits per user in all containing user ns.
++ * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
++ * in the limited groups account.
++ */
++ if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
++ !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
+ return ERR_PTR(-ENOSPC);
+
+ mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+- if (!mark)
+- return ERR_PTR(-ENOMEM);
++ if (!mark) {
++ ret = -ENOMEM;
++ goto out_dec_ucounts;
++ }
+
+ fsnotify_init_mark(mark, group);
+- ret = fsnotify_add_mark_locked(mark, connp, type, 0, fsid);
++ if (fan_flags & FAN_MARK_EVICTABLE)
++ mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
++
++ ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
+ if (ret) {
+ fsnotify_put_mark(mark);
+- return ERR_PTR(ret);
++ goto out_dec_ucounts;
+ }
+
+ return mark;
++
++out_dec_ucounts:
++ if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
++ dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
++ return ERR_PTR(ret);
+ }
+
++static int fanotify_group_init_error_pool(struct fsnotify_group *group)
++{
++ if (mempool_initialized(&group->fanotify_data.error_events_pool))
++ return 0;
++
++ return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
++ FANOTIFY_DEFAULT_FEE_POOL_SIZE,
++ sizeof(struct fanotify_error_event));
++}
++
++static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
++ unsigned int fan_flags)
++{
++ /*
++ * Non evictable mark cannot be downgraded to evictable mark.
++ */
++ if (fan_flags & FAN_MARK_EVICTABLE &&
++ !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
++ return -EEXIST;
++
++ /*
++ * New ignore mask semantics cannot be downgraded to old semantics.
++ */
++ if (fan_flags & FAN_MARK_IGNORED_MASK &&
++ fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
++ return -EEXIST;
++
++ /*
++ * An ignore mask that survives modify could never be downgraded to not
++ * survive modify. With new FAN_MARK_IGNORE semantics we make that rule
++ * explicit and return an error when trying to update the ignore mask
++ * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
++ */
++ if (fan_flags & FAN_MARK_IGNORE &&
++ !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
++ fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
++ return -EEXIST;
++
++ return 0;
++}
+
+ static int fanotify_add_mark(struct fsnotify_group *group,
+- fsnotify_connp_t *connp, unsigned int type,
+- __u32 mask, unsigned int flags,
++ fsnotify_connp_t *connp, unsigned int obj_type,
++ __u32 mask, unsigned int fan_flags,
+ __kernel_fsid_t *fsid)
+ {
+ struct fsnotify_mark *fsn_mark;
+- __u32 added;
++ bool recalc;
++ int ret = 0;
+
+- mutex_lock(&group->mark_mutex);
++ fsnotify_group_lock(group);
+ fsn_mark = fsnotify_find_mark(connp, group);
+ if (!fsn_mark) {
+- fsn_mark = fanotify_add_new_mark(group, connp, type, fsid);
++ fsn_mark = fanotify_add_new_mark(group, connp, obj_type,
++ fan_flags, fsid);
+ if (IS_ERR(fsn_mark)) {
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+ return PTR_ERR(fsn_mark);
+ }
+ }
+- added = fanotify_mark_add_to_mask(fsn_mark, mask, flags);
+- if (added & ~fsnotify_conn_mask(fsn_mark->connector))
++
++ /*
++ * Check if requested mark flags conflict with an existing mark flags.
++ */
++ ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
++ if (ret)
++ goto out;
++
++ /*
++ * Error events are pre-allocated per group, only if strictly
++ * needed (i.e. FAN_FS_ERROR was requested).
++ */
++ if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
++ (mask & FAN_FS_ERROR)) {
++ ret = fanotify_group_init_error_pool(group);
++ if (ret)
++ goto out;
++ }
++
++ recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
++ if (recalc)
+ fsnotify_recalc_mask(fsn_mark->connector);
+- mutex_unlock(&group->mark_mutex);
++
++out:
++ fsnotify_group_unlock(group);
+
+ fsnotify_put_mark(fsn_mark);
+- return 0;
++ return ret;
+ }
+
+ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
+@@ -893,10 +1288,10 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group,
+
+ /*
+ * If some other task has this inode open for write we should not add
+- * an ignored mark, unless that ignored mark is supposed to survive
++ * an ignore mask, unless that ignore mask is supposed to survive
+ * modification changes anyway.
+ */
+- if ((flags & FAN_MARK_IGNORED_MASK) &&
++ if ((flags & FANOTIFY_MARK_IGNORE_BITS) &&
+ !(flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
+ inode_is_open_for_write(inode))
+ return 0;
+@@ -919,20 +1314,49 @@ static struct fsnotify_event *fanotify_alloc_overflow_event(void)
+ return &oevent->fse;
+ }
+
++static struct hlist_head *fanotify_alloc_merge_hash(void)
++{
++ struct hlist_head *hash;
++
++ hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
++ GFP_KERNEL_ACCOUNT);
++ if (!hash)
++ return NULL;
++
++ __hash_init(hash, FANOTIFY_HTABLE_SIZE);
++
++ return hash;
++}
++
+ /* fanotify syscalls */
+ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+ {
+ struct fsnotify_group *group;
+ int f_flags, fd;
+- struct user_struct *user;
+ unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
+ unsigned int class = flags & FANOTIFY_CLASS_BITS;
++ unsigned int internal_flags = 0;
+
+ pr_debug("%s: flags=%x event_f_flags=%x\n",
+ __func__, flags, event_f_flags);
+
+- if (!capable(CAP_SYS_ADMIN))
+- return -EPERM;
++ if (!capable(CAP_SYS_ADMIN)) {
++ /*
++ * An unprivileged user can setup an fanotify group with
++ * limited functionality - an unprivileged group is limited to
++ * notification events with file handles and it cannot use
++ * unlimited queue/marks.
++ */
++ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
++ return -EPERM;
++
++ /*
++ * Setting the internal flag FANOTIFY_UNPRIV on the group
++ * prevents setting mount/filesystem marks on this group and
++ * prevents reporting pid and open fd in events.
++ */
++ internal_flags |= FANOTIFY_UNPRIV;
++ }
+
+ #ifdef CONFIG_AUDITSYSCALL
+ if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
+@@ -941,6 +1365,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+ #endif
+ return -EINVAL;
+
++ /*
++ * A pidfd can only be returned for a thread-group leader; thus
++ * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
++ * exclusive.
++ */
++ if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
++ return -EINVAL;
++
+ if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
+ return -EINVAL;
+
+@@ -963,30 +1395,46 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+ if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
+ return -EINVAL;
+
+- user = get_current_user();
+- if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) {
+- free_uid(user);
+- return -EMFILE;
+- }
++ /*
++ * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
++ * and is used as an indication to report both dir and child fid on all
++ * dirent events.
++ */
++ if ((fid_mode & FAN_REPORT_TARGET_FID) &&
++ (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
++ return -EINVAL;
+
+- f_flags = O_RDWR | FMODE_NONOTIFY;
++ f_flags = O_RDWR | __FMODE_NONOTIFY;
+ if (flags & FAN_CLOEXEC)
+ f_flags |= O_CLOEXEC;
+ if (flags & FAN_NONBLOCK)
+ f_flags |= O_NONBLOCK;
+
+ /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */
+- group = fsnotify_alloc_group(&fanotify_fsnotify_ops);
++ group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
++ FSNOTIFY_GROUP_USER | FSNOTIFY_GROUP_NOFS);
+ if (IS_ERR(group)) {
+- free_uid(user);
+ return PTR_ERR(group);
+ }
+
+- group->fanotify_data.user = user;
+- group->fanotify_data.flags = flags;
+- atomic_inc(&user->fanotify_listeners);
++ /* Enforce groups limits per user in all containing user ns */
++ group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
++ current_euid(),
++ UCOUNT_FANOTIFY_GROUPS);
++ if (!group->fanotify_data.ucounts) {
++ fd = -EMFILE;
++ goto out_destroy_group;
++ }
++
++ group->fanotify_data.flags = flags | internal_flags;
+ group->memcg = get_mem_cgroup_from_mm(current->mm);
+
++ group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
++ if (!group->fanotify_data.merge_hash) {
++ fd = -ENOMEM;
++ goto out_destroy_group;
++ }
++
+ group->overflow_event = fanotify_alloc_overflow_event();
+ if (unlikely(!group->overflow_event)) {
+ fd = -ENOMEM;
+@@ -1019,16 +1467,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+ goto out_destroy_group;
+ group->max_events = UINT_MAX;
+ } else {
+- group->max_events = FANOTIFY_DEFAULT_MAX_EVENTS;
++ group->max_events = fanotify_max_queued_events;
+ }
+
+ if (flags & FAN_UNLIMITED_MARKS) {
+ fd = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out_destroy_group;
+- group->fanotify_data.max_marks = UINT_MAX;
+- } else {
+- group->fanotify_data.max_marks = FANOTIFY_DEFAULT_MAX_MARKS;
+ }
+
+ if (flags & FAN_ENABLE_AUDIT) {
+@@ -1048,16 +1493,15 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
+ return fd;
+ }
+
+-/* Check if filesystem can encode a unique fid */
+-static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
++static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+ {
+ __kernel_fsid_t root_fsid;
+ int err;
+
+ /*
+- * Make sure path is not in filesystem with zero fsid (e.g. tmpfs).
++ * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
+ */
+- err = vfs_get_fsid(path->dentry, fsid);
++ err = vfs_get_fsid(dentry, fsid);
+ if (err)
+ return err;
+
+@@ -1065,10 +1509,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+ return -ENODEV;
+
+ /*
+- * Make sure path is not inside a filesystem subvolume (e.g. btrfs)
++ * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
+ * which uses a different fsid than sb root.
+ */
+- err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid);
++ err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
+ if (err)
+ return err;
+
+@@ -1076,6 +1520,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+ root_fsid.val[1] != fsid->val[1])
+ return -EXDEV;
+
++ return 0;
++}
++
++/* Check if filesystem can encode a unique fid */
++static int fanotify_test_fid(struct dentry *dentry)
++{
+ /*
+ * We need to make sure that the file system supports at least
+ * encoding a file handle so user can use name_to_handle_at() to
+@@ -1083,17 +1533,22 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
+ * objects. However, name_to_handle_at() requires that the
+ * filesystem also supports decoding file handles.
+ */
+- if (!path->dentry->d_sb->s_export_op ||
+- !path->dentry->d_sb->s_export_op->fh_to_dentry)
++ if (!dentry->d_sb->s_export_op ||
++ !dentry->d_sb->s_export_op->fh_to_dentry)
+ return -EOPNOTSUPP;
+
+ return 0;
+ }
+
+-static int fanotify_events_supported(struct path *path, __u64 mask,
++static int fanotify_events_supported(struct fsnotify_group *group,
++ const struct path *path, __u64 mask,
+ unsigned int flags)
+ {
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
++ /* Strict validation of events in non-dir inode mask with v5.17+ APIs */
++ bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
++ (mask & FAN_RENAME) ||
++ (flags & FAN_MARK_IGNORE);
+
+ /*
+ * Some filesystems such as 'proc' acquire unusual locks when opening
+@@ -1121,6 +1576,15 @@ static int fanotify_events_supported(struct path *path, __u64 mask,
+ path->mnt->mnt_sb->s_flags & SB_NOUSER)
+ return -EINVAL;
+
++ /*
++ * We shouldn't have allowed setting dirent events and the directory
++ * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
++ * but because we always allowed it, error only when using new APIs.
++ */
++ if (strict_dir_events && mark_type == FAN_MARK_INODE &&
++ !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
++ return -ENOTDIR;
++
+ return 0;
+ }
+
+@@ -1135,7 +1599,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ __kernel_fsid_t __fsid, *fsid = NULL;
+ u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+- bool ignored = flags & FAN_MARK_IGNORED_MASK;
++ unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
++ unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
+ unsigned int obj_type, fid_mode;
+ u32 umask = 0;
+ int ret;
+@@ -1144,7 +1609,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ __func__, fanotify_fd, flags, dfd, pathname, mask);
+
+ /* we only use the lower 32 bits as of right now. */
+- if (mask & ((__u64)0xffffffff << 32))
++ if (upper_32_bits(mask))
+ return -EINVAL;
+
+ if (flags & ~FANOTIFY_MARK_FLAGS)
+@@ -1164,7 +1629,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ return -EINVAL;
+ }
+
+- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) {
++ switch (mark_cmd) {
+ case FAN_MARK_ADD:
+ case FAN_MARK_REMOVE:
+ if (!mask)
+@@ -1184,9 +1649,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ if (mask & ~valid_mask)
+ return -EINVAL;
+
+- /* Event flags (ONDIR, ON_CHILD) are meaningless in ignored mask */
+- if (ignored)
++
++ /* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
++ if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
++ return -EINVAL;
++
++ /*
++ * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
++ * FAN_MARK_IGNORED_MASK.
++ */
++ if (ignore == FAN_MARK_IGNORED_MASK) {
+ mask &= ~FANOTIFY_EVENT_FLAGS;
++ umask = FANOTIFY_EVENT_FLAGS;
++ }
+
+ f = fdget(fanotify_fd);
+ if (unlikely(!f.file))
+@@ -1198,6 +1673,17 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ goto fput_and_out;
+ group = f.file->private_data;
+
++ /*
++ * An unprivileged user is not allowed to setup mount nor filesystem
++ * marks. This also includes setting up such marks by a group that
++ * was initialized by an unprivileged user.
++ */
++ ret = -EPERM;
++ if ((!capable(CAP_SYS_ADMIN) ||
++ FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
++ mark_type != FAN_MARK_INODE)
++ goto fput_and_out;
++
+ /*
+ * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF. These are not
+ * allowed to set permissions events.
+@@ -1207,19 +1693,39 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ group->priority == FS_PRIO_0)
+ goto fput_and_out;
+
++ if (mask & FAN_FS_ERROR &&
++ mark_type != FAN_MARK_FILESYSTEM)
++ goto fput_and_out;
++
++ /*
++ * Evictable is only relevant for inode marks, because only inode object
++ * can be evicted on memory pressure.
++ */
++ if (flags & FAN_MARK_EVICTABLE &&
++ mark_type != FAN_MARK_INODE)
++ goto fput_and_out;
++
+ /*
+- * Events with data type inode do not carry enough information to report
+- * event->fd, so we do not allow setting a mask for inode events unless
+- * group supports reporting fid.
+- * inode events are not supported on a mount mark, because they do not
+- * carry enough information (i.e. path) to be filtered by mount point.
++ * Events that do not carry enough information to report
++ * event->fd require a group that supports reporting fid. Those
++ * events are not supported on a mount mark, because they do not
++ * carry enough information (i.e. path) to be filtered by mount
++ * point.
+ */
+ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
+- if (mask & FANOTIFY_INODE_EVENTS &&
++ if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
+ (!fid_mode || mark_type == FAN_MARK_MOUNT))
+ goto fput_and_out;
+
+- if (flags & FAN_MARK_FLUSH) {
++ /*
++ * FAN_RENAME uses special info type records to report the old and
++ * new parent+name. Reporting only old and new parent id is less
++ * useful and was not implemented.
++ */
++ if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
++ goto fput_and_out;
++
++ if (mark_cmd == FAN_MARK_FLUSH) {
+ ret = 0;
+ if (mark_type == FAN_MARK_MOUNT)
+ fsnotify_clear_vfsmount_marks_by_group(group);
+@@ -1235,14 +1741,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ if (ret)
+ goto fput_and_out;
+
+- if (flags & FAN_MARK_ADD) {
+- ret = fanotify_events_supported(&path, mask, flags);
++ if (mark_cmd == FAN_MARK_ADD) {
++ ret = fanotify_events_supported(group, &path, mask, flags);
+ if (ret)
+ goto path_put_and_out;
+ }
+
+ if (fid_mode) {
+- ret = fanotify_test_fid(&path, &__fsid);
++ ret = fanotify_test_fsid(path.dentry, &__fsid);
++ if (ret)
++ goto path_put_and_out;
++
++ ret = fanotify_test_fid(path.dentry);
+ if (ret)
+ goto path_put_and_out;
+
+@@ -1255,6 +1765,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ else
+ mnt = path.mnt;
+
++ ret = mnt ? -EINVAL : -EISDIR;
++ /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
++ if (mark_cmd == FAN_MARK_ADD && ignore == FAN_MARK_IGNORE &&
++ (mnt || S_ISDIR(inode->i_mode)) &&
++ !(flags & FAN_MARK_IGNORED_SURV_MODIFY))
++ goto path_put_and_out;
++
+ /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
+ if (mnt || !S_ISDIR(inode->i_mode)) {
+ mask &= ~FAN_EVENT_ON_CHILD;
+@@ -1264,12 +1781,12 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
+ * events with parent/name info for non-directory.
+ */
+ if ((fid_mode & FAN_REPORT_DIR_FID) &&
+- (flags & FAN_MARK_ADD) && !ignored)
++ (flags & FAN_MARK_ADD) && !ignore)
+ mask |= FAN_EVENT_ON_CHILD;
+ }
+
+ /* create/update an inode mark */
+- switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) {
++ switch (mark_cmd) {
+ case FAN_MARK_ADD:
+ if (mark_type == FAN_MARK_MOUNT)
+ ret = fanotify_add_vfsmount_mark(group, mnt, mask,
+@@ -1330,8 +1847,24 @@ SYSCALL32_DEFINE6(fanotify_mark,
+ */
+ static int __init fanotify_user_setup(void)
+ {
+- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10);
+- BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9);
++ struct sysinfo si;
++ int max_marks;
++
++ si_meminfo(&si);
++ /*
++ * Allow up to 1% of addressable memory to be accounted for per user
++ * marks limited to the range [8192, 1048576]. mount and sb marks are
++ * a lot cheaper than inode marks, but there is no reason for a user
++ * to have many of those, so calculate by the cost of inode marks.
++ */
++ max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
++ INODE_MARK_COST;
++ max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
++ FANOTIFY_DEFAULT_MAX_USER_MARKS);
++
++ BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
++ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
++ BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
+
+ fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+ SLAB_PANIC|SLAB_ACCOUNT);
+@@ -1344,6 +1877,11 @@ static int __init fanotify_user_setup(void)
+ KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
+ }
+
++ fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
++ init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
++ FANOTIFY_DEFAULT_MAX_GROUPS;
++ init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
++
+ return 0;
+ }
+ device_initcall(fanotify_user_setup);
+diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
+index 765b50aeadd28..55081ae3a6ec0 100644
+--- a/fs/notify/fdinfo.c
++++ b/fs/notify/fdinfo.c
+@@ -14,6 +14,7 @@
+ #include <linux/exportfs.h>
+
+ #include "inotify/inotify.h"
++#include "fanotify/fanotify.h"
+ #include "fdinfo.h"
+ #include "fsnotify.h"
+
+@@ -28,13 +29,13 @@ static void show_fdinfo(struct seq_file *m, struct file *f,
+ struct fsnotify_group *group = f->private_data;
+ struct fsnotify_mark *mark;
+
+- mutex_lock(&group->mark_mutex);
++ fsnotify_group_lock(group);
+ list_for_each_entry(mark, &group->marks_list, g_list) {
+ show(m, mark);
+ if (seq_has_overflowed(m))
+ break;
+ }
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+ }
+
+ #if defined(CONFIG_EXPORTFS)
+@@ -103,19 +104,16 @@ void inotify_show_fdinfo(struct seq_file *m, struct file *f)
+
+ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+ {
+- unsigned int mflags = 0;
++ unsigned int mflags = fanotify_mark_user_flags(mark);
+ struct inode *inode;
+
+- if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
+- mflags |= FAN_MARK_IGNORED_SURV_MODIFY;
+-
+ if (mark->connector->type == FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = igrab(fsnotify_conn_inode(mark->connector));
+ if (!inode)
+ return;
+ seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
+ inode->i_ino, inode->i_sb->s_dev,
+- mflags, mark->mask, mark->ignored_mask);
++ mflags, mark->mask, mark->ignore_mask);
+ show_mark_fhandle(m, inode);
+ seq_putc(m, '\n');
+ iput(inode);
+@@ -123,12 +121,12 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
+ struct mount *mnt = fsnotify_conn_mount(mark->connector);
+
+ seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
+- mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
++ mnt->mnt_id, mflags, mark->mask, mark->ignore_mask);
+ } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_SB) {
+ struct super_block *sb = fsnotify_conn_sb(mark->connector);
+
+ seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n",
+- sb->s_dev, mflags, mark->mask, mark->ignored_mask);
++ sb->s_dev, mflags, mark->mask, mark->ignore_mask);
+ }
+ }
+
+@@ -137,7 +135,8 @@ void fanotify_show_fdinfo(struct seq_file *m, struct file *f)
+ struct fsnotify_group *group = f->private_data;
+
+ seq_printf(m, "fanotify flags:%x event-flags:%x\n",
+- group->fanotify_data.flags, group->fanotify_data.f_flags);
++ group->fanotify_data.flags & FANOTIFY_INIT_FLAGS,
++ group->fanotify_data.f_flags);
+
+ show_fdinfo(m, f, fanotify_fdinfo);
+ }
+diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
+index 30d422b8c0fc7..7974e91ffe134 100644
+--- a/fs/notify/fsnotify.c
++++ b/fs/notify/fsnotify.c
+@@ -70,8 +70,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&sb->s_inode_list_lock);
+
+- if (iput_inode)
+- iput(iput_inode);
++ iput(iput_inode);
+
+ /* for each watch, send FS_UNMOUNT and then remove it */
+ fsnotify_inode(inode, FS_UNMOUNT);
+@@ -85,24 +84,23 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+
+- if (iput_inode)
+- iput(iput_inode);
+- /* Wait for outstanding inode references from connectors */
+- wait_var_event(&sb->s_fsnotify_inode_refs,
+- !atomic_long_read(&sb->s_fsnotify_inode_refs));
++ iput(iput_inode);
+ }
+
+ void fsnotify_sb_delete(struct super_block *sb)
+ {
+ fsnotify_unmount_inodes(sb);
+ fsnotify_clear_marks_by_sb(sb);
++ /* Wait for outstanding object references from connectors */
++ wait_var_event(&sb->s_fsnotify_connectors,
++ !atomic_long_read(&sb->s_fsnotify_connectors));
+ }
+
+ /*
+ * Given an inode, first check if we care what happens to our children. Inotify
+ * and dnotify both tell their parents about events. If we care about any event
+ * on a child we run all of our children and set a dentry flag saying that the
+- * parent cares. Thus when an event happens on a child it can quickly tell if
++ * parent cares. Thus when an event happens on a child it can quickly tell
+ * if there is a need to find a parent and send the event to the parent.
+ */
+ void __fsnotify_update_child_dentry_flags(struct inode *inode)
+@@ -252,7 +250,10 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group,
+ if (WARN_ON_ONCE(!ops->handle_inode_event))
+ return 0;
+
+- if ((inode_mark->mask & FS_EXCL_UNLINK) &&
++ if (WARN_ON_ONCE(!inode && !dir))
++ return 0;
++
++ if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) &&
+ path && d_unlinked(path->dentry))
+ return 0;
+
+@@ -276,23 +277,28 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask,
+ WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info)))
+ return 0;
+
+- if (parent_mark) {
+- /*
+- * parent_mark indicates that the parent inode is watching
+- * children and interested in this event, which is an event
+- * possible on child. But is *this mark* watching children and
+- * interested in this event?
+- */
+- if (parent_mark->mask & FS_EVENT_ON_CHILD) {
+- ret = fsnotify_handle_inode_event(group, parent_mark, mask,
+- data, data_type, dir, name, 0);
+- if (ret)
+- return ret;
+- }
+- if (!inode_mark)
++ /*
++ * For FS_RENAME, 'dir' is old dir and 'data' is new dentry.
++ * The only ->handle_inode_event() backend that supports FS_RENAME is
++ * dnotify, where it means file was renamed within same parent.
++ */
++ if (mask & FS_RENAME) {
++ struct dentry *moved = fsnotify_data_dentry(data, data_type);
++
++ if (dir != moved->d_parent->d_inode)
+ return 0;
+ }
+
++ if (parent_mark) {
++ ret = fsnotify_handle_inode_event(group, parent_mark, mask,
++ data, data_type, dir, name, 0);
++ if (ret)
++ return ret;
++ }
++
++ if (!inode_mark)
++ return 0;
++
+ if (mask & FS_EVENT_ON_CHILD) {
+ /*
+ * Some events can be sent on both parent dir and child marks
+@@ -318,42 +324,36 @@ static int send_to_group(__u32 mask, const void *data, int data_type,
+ struct fsnotify_group *group = NULL;
+ __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS);
+ __u32 marks_mask = 0;
+- __u32 marks_ignored_mask = 0;
++ __u32 marks_ignore_mask = 0;
++ bool is_dir = mask & FS_ISDIR;
+ struct fsnotify_mark *mark;
+ int type;
+
+- if (WARN_ON(!iter_info->report_mask))
++ if (!iter_info->report_mask)
+ return 0;
+
+ /* clear ignored on inode modification */
+ if (mask & FS_MODIFY) {
+- fsnotify_foreach_obj_type(type) {
+- if (!fsnotify_iter_should_report_type(iter_info, type))
+- continue;
+- mark = iter_info->marks[type];
+- if (mark &&
+- !(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
+- mark->ignored_mask = 0;
++ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
++ if (!(mark->flags &
++ FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
++ mark->ignore_mask = 0;
+ }
+ }
+
+- fsnotify_foreach_obj_type(type) {
+- if (!fsnotify_iter_should_report_type(iter_info, type))
+- continue;
+- mark = iter_info->marks[type];
+- /* does the object mark tell us to do something? */
+- if (mark) {
+- group = mark->group;
+- marks_mask |= mark->mask;
+- marks_ignored_mask |= mark->ignored_mask;
+- }
++ /* Are any of the group marks interested in this event? */
++ fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
++ group = mark->group;
++ marks_mask |= mark->mask;
++ marks_ignore_mask |=
++ fsnotify_effective_ignore_mask(mark, is_dir, type);
+ }
+
+- pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignored_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
+- __func__, group, mask, marks_mask, marks_ignored_mask,
++ pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n",
++ __func__, group, mask, marks_mask, marks_ignore_mask,
+ data, data_type, dir, cookie);
+
+- if (!(test_mask & marks_mask & ~marks_ignored_mask))
++ if (!(test_mask & marks_mask & ~marks_ignore_mask))
+ return 0;
+
+ if (group->ops->handle_event) {
+@@ -390,11 +390,11 @@ static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark)
+
+ /*
+ * iter_info is a multi head priority queue of marks.
+- * Pick a subset of marks from queue heads, all with the
+- * same group and set the report_mask for selected subset.
+- * Returns the report_mask of the selected subset.
++ * Pick a subset of marks from queue heads, all with the same group
++ * and set the report_mask to a subset of the selected marks.
++ * Returns false if there are no more groups to iterate.
+ */
+-static unsigned int fsnotify_iter_select_report_types(
++static bool fsnotify_iter_select_report_types(
+ struct fsnotify_iter_info *iter_info)
+ {
+ struct fsnotify_group *max_prio_group = NULL;
+@@ -402,7 +402,7 @@ static unsigned int fsnotify_iter_select_report_types(
+ int type;
+
+ /* Choose max prio group among groups of all queue heads */
+- fsnotify_foreach_obj_type(type) {
++ fsnotify_foreach_iter_type(type) {
+ mark = iter_info->marks[type];
+ if (mark &&
+ fsnotify_compare_groups(max_prio_group, mark->group) > 0)
+@@ -410,30 +410,49 @@ static unsigned int fsnotify_iter_select_report_types(
+ }
+
+ if (!max_prio_group)
+- return 0;
++ return false;
+
+ /* Set the report mask for marks from same group as max prio group */
++ iter_info->current_group = max_prio_group;
+ iter_info->report_mask = 0;
+- fsnotify_foreach_obj_type(type) {
++ fsnotify_foreach_iter_type(type) {
+ mark = iter_info->marks[type];
+- if (mark &&
+- fsnotify_compare_groups(max_prio_group, mark->group) == 0)
++ if (mark && mark->group == iter_info->current_group) {
++ /*
++ * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode
++ * is watching children and interested in this event,
++ * which is an event possible on child.
++ * But is *this mark* watching children?
++ */
++ if (type == FSNOTIFY_ITER_TYPE_PARENT &&
++ !(mark->mask & FS_EVENT_ON_CHILD) &&
++ !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD))
++ continue;
++
+ fsnotify_iter_set_report_type(iter_info, type);
++ }
+ }
+
+- return iter_info->report_mask;
++ return true;
+ }
+
+ /*
+- * Pop from iter_info multi head queue, the marks that were iterated in the
++ * Pop from iter_info multi head queue, the marks that belong to the group of
+ * current iteration step.
+ */
+ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
+ {
++ struct fsnotify_mark *mark;
+ int type;
+
+- fsnotify_foreach_obj_type(type) {
+- if (fsnotify_iter_should_report_type(iter_info, type))
++ /*
++ * We cannot use fsnotify_foreach_iter_mark_type() here because we
++ * may need to advance a mark of type X that belongs to current_group
++ * but was not selected for reporting.
++ */
++ fsnotify_foreach_iter_type(type) {
++ mark = iter_info->marks[type];
++ if (mark && mark->group == iter_info->current_group)
+ iter_info->marks[type] =
+ fsnotify_next_mark(iter_info->marks[type]);
+ }
+@@ -455,18 +474,20 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info)
+ * @file_name is relative to
+ * @file_name: optional file name associated with event
+ * @inode: optional inode associated with event -
+- * either @dir or @inode must be non-NULL.
+- * if both are non-NULL event may be reported to both.
++ * If @dir and @inode are both non-NULL, event may be
++ * reported to both.
+ * @cookie: inotify rename cookie
+ */
+ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
+ const struct qstr *file_name, struct inode *inode, u32 cookie)
+ {
+ const struct path *path = fsnotify_data_path(data, data_type);
++ struct super_block *sb = fsnotify_data_sb(data, data_type);
+ struct fsnotify_iter_info iter_info = {};
+- struct super_block *sb;
+ struct mount *mnt = NULL;
+- struct inode *parent = NULL;
++ struct inode *inode2 = NULL;
++ struct dentry *moved;
++ int inode2_type;
+ int ret = 0;
+ __u32 test_mask, marks_mask;
+
+@@ -476,14 +497,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
+ if (!inode) {
+ /* Dirent event - report on TYPE_INODE to dir */
+ inode = dir;
++ /* For FS_RENAME, inode is old_dir and inode2 is new_dir */
++ if (mask & FS_RENAME) {
++ moved = fsnotify_data_dentry(data, data_type);
++ inode2 = moved->d_parent->d_inode;
++ inode2_type = FSNOTIFY_ITER_TYPE_INODE2;
++ }
+ } else if (mask & FS_EVENT_ON_CHILD) {
+ /*
+ * Event on child - report on TYPE_PARENT to dir if it is
+ * watching children and on TYPE_INODE to child.
+ */
+- parent = dir;
++ inode2 = dir;
++ inode2_type = FSNOTIFY_ITER_TYPE_PARENT;
+ }
+- sb = inode->i_sb;
+
+ /*
+ * Optimization: srcu_read_lock() has a memory barrier which can
+@@ -495,7 +522,7 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
+ if (!sb->s_fsnotify_marks &&
+ (!mnt || !mnt->mnt_fsnotify_marks) &&
+ (!inode || !inode->i_fsnotify_marks) &&
+- (!parent || !parent->i_fsnotify_marks))
++ (!inode2 || !inode2->i_fsnotify_marks))
+ return 0;
+
+ marks_mask = sb->s_fsnotify_mask;
+@@ -503,33 +530,35 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
+ marks_mask |= mnt->mnt_fsnotify_mask;
+ if (inode)
+ marks_mask |= inode->i_fsnotify_mask;
+- if (parent)
+- marks_mask |= parent->i_fsnotify_mask;
++ if (inode2)
++ marks_mask |= inode2->i_fsnotify_mask;
+
+
+ /*
+- * if this is a modify event we may need to clear the ignored masks
+- * otherwise return if none of the marks care about this type of event.
++ * If this is a modify event we may need to clear some ignore masks.
++ * In that case, the object with ignore masks will have the FS_MODIFY
++ * event in its mask.
++ * Otherwise, return if none of the marks care about this type of event.
+ */
+ test_mask = (mask & ALL_FSNOTIFY_EVENTS);
+- if (!(mask & FS_MODIFY) && !(test_mask & marks_mask))
++ if (!(test_mask & marks_mask))
+ return 0;
+
+ iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
+
+- iter_info.marks[FSNOTIFY_OBJ_TYPE_SB] =
++ iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
+ fsnotify_first_mark(&sb->s_fsnotify_marks);
+ if (mnt) {
+- iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
++ iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
+ fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
+ }
+ if (inode) {
+- iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
++ iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
+ fsnotify_first_mark(&inode->i_fsnotify_marks);
+ }
+- if (parent) {
+- iter_info.marks[FSNOTIFY_OBJ_TYPE_PARENT] =
+- fsnotify_first_mark(&parent->i_fsnotify_marks);
++ if (inode2) {
++ iter_info.marks[inode2_type] =
++ fsnotify_first_mark(&inode2->i_fsnotify_marks);
+ }
+
+ /*
+@@ -558,7 +587,7 @@ static __init int fsnotify_init(void)
+ {
+ int ret;
+
+- BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 25);
++ BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);
+
+ ret = init_srcu_struct(&fsnotify_mark_srcu);
+ if (ret)
+diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
+index ff2063ec6b0f3..fde74eb333cc9 100644
+--- a/fs/notify/fsnotify.h
++++ b/fs/notify/fsnotify.h
+@@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb(
+ return container_of(conn->obj, struct super_block, s_fsnotify_marks);
+ }
+
++static inline struct super_block *fsnotify_connector_sb(
++ struct fsnotify_mark_connector *conn)
++{
++ switch (conn->type) {
++ case FSNOTIFY_OBJ_TYPE_INODE:
++ return fsnotify_conn_inode(conn)->i_sb;
++ case FSNOTIFY_OBJ_TYPE_VFSMOUNT:
++ return fsnotify_conn_mount(conn)->mnt.mnt_sb;
++ case FSNOTIFY_OBJ_TYPE_SB:
++ return fsnotify_conn_sb(conn);
++ default:
++ return NULL;
++ }
++}
++
+ /* destroy all events sitting in this groups notification queue */
+ extern void fsnotify_flush_notify(struct fsnotify_group *group);
+
+@@ -61,10 +76,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
+ */
+ extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
+
+-/* allocate and destroy and event holder to attach events to notification/access queues */
+-extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
+-extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
+-
+ extern struct kmem_cache *fsnotify_mark_connector_cachep;
+
+ #endif /* __FS_NOTIFY_FSNOTIFY_H_ */
+diff --git a/fs/notify/group.c b/fs/notify/group.c
+index a4a4b1c64d32a..1de6631a3925e 100644
+--- a/fs/notify/group.c
++++ b/fs/notify/group.c
+@@ -58,7 +58,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
+ fsnotify_group_stop_queueing(group);
+
+ /* Clear all marks for this group and queue them for destruction */
+- fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES_MASK);
++ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_ANY);
+
+ /*
+ * Some marks can still be pinned when waiting for response from
+@@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
+ * that deliberately ignores overflow events.
+ */
+ if (group->overflow_event)
+- group->ops->free_event(group->overflow_event);
++ group->ops->free_event(group, group->overflow_event);
+
+ fsnotify_put_group(group);
+ }
+@@ -111,20 +111,19 @@ void fsnotify_put_group(struct fsnotify_group *group)
+ }
+ EXPORT_SYMBOL_GPL(fsnotify_put_group);
+
+-/*
+- * Create a new fsnotify_group and hold a reference for the group returned.
+- */
+-struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
++static struct fsnotify_group *__fsnotify_alloc_group(
++ const struct fsnotify_ops *ops,
++ int flags, gfp_t gfp)
+ {
++ static struct lock_class_key nofs_marks_lock;
+ struct fsnotify_group *group;
+
+- group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
++ group = kzalloc(sizeof(struct fsnotify_group), gfp);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ /* set to 0 when there a no external references to this group */
+ refcount_set(&group->refcnt, 1);
+- atomic_set(&group->num_marks, 0);
+ atomic_set(&group->user_waits, 0);
+
+ spin_lock_init(&group->notification_lock);
+@@ -136,9 +135,32 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
+ INIT_LIST_HEAD(&group->marks_list);
+
+ group->ops = ops;
++ group->flags = flags;
++ /*
++ * For most backends, eviction of inode with a mark is not expected,
++ * because marks hold a refcount on the inode against eviction.
++ *
++ * Use a different lockdep class for groups that support evictable
++ * inode marks, because with evictable marks, mark_mutex is NOT
++ * fs-reclaim safe - the mutex is taken when evicting inodes.
++ */
++ if (flags & FSNOTIFY_GROUP_NOFS)
++ lockdep_set_class(&group->mark_mutex, &nofs_marks_lock);
+
+ return group;
+ }
++
++/*
++ * Create a new fsnotify_group and hold a reference for the group returned.
++ */
++struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops,
++ int flags)
++{
++ gfp_t gfp = (flags & FSNOTIFY_GROUP_USER) ? GFP_KERNEL_ACCOUNT :
++ GFP_KERNEL;
++
++ return __fsnotify_alloc_group(ops, flags, gfp);
++}
+ EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
+
+ int fsnotify_fasync(int fd, struct file *file, int on)
+diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
+index 8f00151eb731f..7d5df7a215397 100644
+--- a/fs/notify/inotify/inotify.h
++++ b/fs/notify/inotify/inotify.h
+@@ -27,11 +27,18 @@ static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse)
+ * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+-#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)
++#define INOTIFY_USER_MASK (IN_ALL_EVENTS)
+
+ static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
+ {
+- return fsn_mark->mask & INOTIFY_USER_MASK;
++ __u32 mask = fsn_mark->mask & INOTIFY_USER_MASK;
++
++ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK)
++ mask |= IN_EXCL_UNLINK;
++ if (fsn_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
++ mask |= IN_ONESHOT;
++
++ return mask;
+ }
+
+ extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
+diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
+index 66991c7fef9e2..993375f0db673 100644
+--- a/fs/notify/inotify/inotify_fsnotify.c
++++ b/fs/notify/inotify/inotify_fsnotify.c
+@@ -46,9 +46,10 @@ static bool event_compare(struct fsnotify_event *old_fsn,
+ return false;
+ }
+
+-static int inotify_merge(struct list_head *list,
+- struct fsnotify_event *event)
++static int inotify_merge(struct fsnotify_group *group,
++ struct fsnotify_event *event)
+ {
++ struct list_head *list = &group->notification_list;
+ struct fsnotify_event *last_event;
+
+ last_event = list_entry(list->prev, struct fsnotify_event, list);
+@@ -114,7 +115,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
+ mask &= ~IN_ISDIR;
+
+ fsn_event = &event->fse;
+- fsnotify_init_event(fsn_event, 0);
++ fsnotify_init_event(fsn_event);
+ event->mask = mask;
+ event->wd = wd;
+ event->sync_cookie = cookie;
+@@ -128,7 +129,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
+ fsnotify_destroy_event(group, fsn_event);
+ }
+
+- if (inode_mark->mask & IN_ONESHOT)
++ if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT)
+ fsnotify_destroy_mark(inode_mark, group);
+
+ return 0;
+@@ -183,7 +184,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group)
+ dec_inotify_instances(group->inotify_data.ucounts);
+ }
+
+-static void inotify_free_event(struct fsnotify_event *fsn_event)
++static void inotify_free_event(struct fsnotify_group *group,
++ struct fsnotify_event *fsn_event)
+ {
+ kfree(INOTIFY_E(fsn_event));
+ }
+diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
+index 32b6b97021bef..7360d16ce46d7 100644
+--- a/fs/notify/inotify/inotify_user.c
++++ b/fs/notify/inotify/inotify_user.c
+@@ -37,6 +37,15 @@
+
+ #include <asm/ioctls.h>
+
++/*
++ * An inotify watch requires allocating an inotify_inode_mark structure as
++ * well as pinning the watched inode. Doubling the size of a VFS inode
++ * should be more than enough to cover the additional filesystem inode
++ * size increase.
++ */
++#define INOTIFY_WATCH_COST (sizeof(struct inotify_inode_mark) + \
++ 2 * sizeof(struct inode))
++
+ /* configurable via /proc/sys/fs/inotify/ */
+ static int inotify_max_queued_events __read_mostly;
+
+@@ -80,10 +89,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
+ __u32 mask;
+
+ /*
+- * Everything should accept their own ignored and should receive events
+- * when the inode is unmounted. All directories care about children.
++ * Everything should receive events when the inode is unmounted.
++ * All directories care about children.
+ */
+- mask = (FS_IN_IGNORED | FS_UNMOUNT);
++ mask = (FS_UNMOUNT);
+ if (S_ISDIR(inode->i_mode))
+ mask |= FS_EVENT_ON_CHILD;
+
+@@ -93,13 +102,28 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
+ return mask;
+ }
+
++#define INOTIFY_MARK_FLAGS \
++ (FSNOTIFY_MARK_FLAG_EXCL_UNLINK | FSNOTIFY_MARK_FLAG_IN_ONESHOT)
++
++static inline unsigned int inotify_arg_to_flags(u32 arg)
++{
++ unsigned int flags = 0;
++
++ if (arg & IN_EXCL_UNLINK)
++ flags |= FSNOTIFY_MARK_FLAG_EXCL_UNLINK;
++ if (arg & IN_ONESHOT)
++ flags |= FSNOTIFY_MARK_FLAG_IN_ONESHOT;
++
++ return flags;
++}
++
+ static inline u32 inotify_mask_to_arg(__u32 mask)
+ {
+ return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
+ IN_Q_OVERFLOW);
+ }
+
+-/* intofiy userspace file descriptor functions */
++/* inotify userspace file descriptor functions */
+ static __poll_t inotify_poll(struct file *file, poll_table *wait)
+ {
+ struct fsnotify_group *group = file->private_data;
+@@ -137,10 +161,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+ size_t event_size = sizeof(struct inotify_event);
+ struct fsnotify_event *event;
+
+- if (fsnotify_notify_queue_is_empty(group))
+- return NULL;
+-
+ event = fsnotify_peek_first_event(group);
++ if (!event)
++ return NULL;
+
+ pr_debug("%s: group=%p event=%p\n", __func__, group, event);
+
+@@ -343,7 +366,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path,
+ if (error)
+ return error;
+ /* you can only watch an inode if you have read permissions on it */
+- error = inode_permission(path->dentry->d_inode, MAY_READ);
++ error = path_permission(path, MAY_READ);
+ if (error) {
+ path_put(path);
+ return error;
+@@ -505,13 +528,10 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
+ struct fsnotify_mark *fsn_mark;
+ struct inotify_inode_mark *i_mark;
+ __u32 old_mask, new_mask;
+- __u32 mask;
+- int add = (arg & IN_MASK_ADD);
++ int replace = !(arg & IN_MASK_ADD);
+ int create = (arg & IN_MASK_CREATE);
+ int ret;
+
+- mask = inotify_arg_to_mask(inode, arg);
+-
+ fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group);
+ if (!fsn_mark)
+ return -ENOENT;
+@@ -524,10 +544,12 @@ static int inotify_update_existing_watch(struct fsnotify_group *group,
+
+ spin_lock(&fsn_mark->lock);
+ old_mask = fsn_mark->mask;
+- if (add)
+- fsn_mark->mask |= mask;
+- else
+- fsn_mark->mask = mask;
++ if (replace) {
++ fsn_mark->mask = 0;
++ fsn_mark->flags &= ~INOTIFY_MARK_FLAGS;
++ }
++ fsn_mark->mask |= inotify_arg_to_mask(inode, arg);
++ fsn_mark->flags |= inotify_arg_to_flags(arg);
+ new_mask = fsn_mark->mask;
+ spin_unlock(&fsn_mark->lock);
+
+@@ -558,19 +580,17 @@ static int inotify_new_watch(struct fsnotify_group *group,
+ u32 arg)
+ {
+ struct inotify_inode_mark *tmp_i_mark;
+- __u32 mask;
+ int ret;
+ struct idr *idr = &group->inotify_data.idr;
+ spinlock_t *idr_lock = &group->inotify_data.idr_lock;
+
+- mask = inotify_arg_to_mask(inode, arg);
+-
+ tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+ if (unlikely(!tmp_i_mark))
+ return -ENOMEM;
+
+ fsnotify_init_mark(&tmp_i_mark->fsn_mark, group);
+- tmp_i_mark->fsn_mark.mask = mask;
++ tmp_i_mark->fsn_mark.mask = inotify_arg_to_mask(inode, arg);
++ tmp_i_mark->fsn_mark.flags = inotify_arg_to_flags(arg);
+ tmp_i_mark->wd = -1;
+
+ ret = inotify_add_to_idr(idr, idr_lock, tmp_i_mark);
+@@ -607,13 +627,13 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
+ {
+ int ret = 0;
+
+- mutex_lock(&group->mark_mutex);
++ fsnotify_group_lock(group);
+ /* try to update and existing watch with the new arg */
+ ret = inotify_update_existing_watch(group, inode, arg);
+ /* no mark present, try to add a new one */
+ if (ret == -ENOENT)
+ ret = inotify_new_watch(group, inode, arg);
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+
+ return ret;
+ }
+@@ -623,17 +643,18 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
+ struct fsnotify_group *group;
+ struct inotify_event_info *oevent;
+
+- group = fsnotify_alloc_group(&inotify_fsnotify_ops);
++ group = fsnotify_alloc_group(&inotify_fsnotify_ops,
++ FSNOTIFY_GROUP_USER);
+ if (IS_ERR(group))
+ return group;
+
+- oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL);
++ oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT);
+ if (unlikely(!oevent)) {
+ fsnotify_destroy_group(group);
+ return ERR_PTR(-ENOMEM);
+ }
+ group->overflow_event = &oevent->fse;
+- fsnotify_init_event(group->overflow_event, 0);
++ fsnotify_init_event(group->overflow_event);
+ oevent->mask = FS_Q_OVERFLOW;
+ oevent->wd = -1;
+ oevent->sync_cookie = 0;
+@@ -797,6 +818,18 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
+ */
+ static int __init inotify_user_setup(void)
+ {
++ unsigned long watches_max;
++ struct sysinfo si;
++
++ si_meminfo(&si);
++ /*
++ * Allow up to 1% of addressable memory to be allocated for inotify
++ * watches (per user) limited to the range [8192, 1048576].
++ */
++ watches_max = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
++ INOTIFY_WATCH_COST;
++ watches_max = clamp(watches_max, 8192UL, 1048576UL);
++
+ BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
+ BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
+ BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
+@@ -812,9 +845,7 @@ static int __init inotify_user_setup(void)
+ BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
+ BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
+ BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
+- BUILD_BUG_ON(IN_EXCL_UNLINK != FS_EXCL_UNLINK);
+ BUILD_BUG_ON(IN_ISDIR != FS_ISDIR);
+- BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
+
+ BUILD_BUG_ON(HWEIGHT32(ALL_INOTIFY_BITS) != 22);
+
+@@ -823,7 +854,7 @@ static int __init inotify_user_setup(void)
+
+ inotify_max_queued_events = 16384;
+ init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
+- init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = 8192;
++ init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max;
+
+ return 0;
+ }
+diff --git a/fs/notify/mark.c b/fs/notify/mark.c
+index 5b44be5f93dd8..c74ef947447d6 100644
+--- a/fs/notify/mark.c
++++ b/fs/notify/mark.c
+@@ -116,20 +116,64 @@ __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn)
+ return *fsnotify_conn_mask_p(conn);
+ }
+
+-static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
++static void fsnotify_get_inode_ref(struct inode *inode)
++{
++ ihold(inode);
++ atomic_long_inc(&inode->i_sb->s_fsnotify_connectors);
++}
++
++/*
++ * Grab or drop inode reference for the connector if needed.
++ *
++ * When it's time to drop the reference, we only clear the HAS_IREF flag and
++ * return the inode object. fsnotify_drop_object() will be resonsible for doing
++ * iput() outside of spinlocks. This happens when last mark that wanted iref is
++ * detached.
++ */
++static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
++ bool want_iref)
++{
++ bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
++ struct inode *inode = NULL;
++
++ if (conn->type != FSNOTIFY_OBJ_TYPE_INODE ||
++ want_iref == has_iref)
++ return NULL;
++
++ if (want_iref) {
++ /* Pin inode if any mark wants inode refcount held */
++ fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
++ conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
++ } else {
++ /* Unpin inode after detach of last mark that wanted iref */
++ inode = fsnotify_conn_inode(conn);
++ conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
++ }
++
++ return inode;
++}
++
++static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+ {
+ u32 new_mask = 0;
++ bool want_iref = false;
+ struct fsnotify_mark *mark;
+
+ assert_spin_locked(&conn->lock);
+ /* We can get detached connector here when inode is getting unlinked. */
+ if (!fsnotify_valid_obj_type(conn->type))
+- return;
++ return NULL;
+ hlist_for_each_entry(mark, &conn->list, obj_list) {
+- if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)
+- new_mask |= mark->mask;
++ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED))
++ continue;
++ new_mask |= fsnotify_calc_mask(mark);
++ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE &&
++ !(mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
++ want_iref = true;
+ }
+ *fsnotify_conn_mask_p(conn) = new_mask;
++
++ return fsnotify_update_iref(conn, want_iref);
+ }
+
+ /*
+@@ -169,6 +213,31 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work)
+ }
+ }
+
++static void fsnotify_put_inode_ref(struct inode *inode)
++{
++ struct super_block *sb = inode->i_sb;
++
++ iput(inode);
++ if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
++ wake_up_var(&sb->s_fsnotify_connectors);
++}
++
++static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn)
++{
++ struct super_block *sb = fsnotify_connector_sb(conn);
++
++ if (sb)
++ atomic_long_inc(&sb->s_fsnotify_connectors);
++}
++
++static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
++{
++ struct super_block *sb = fsnotify_connector_sb(conn);
++
++ if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors))
++ wake_up_var(&sb->s_fsnotify_connectors);
++}
++
+ static void *fsnotify_detach_connector_from_object(
+ struct fsnotify_mark_connector *conn,
+ unsigned int *type)
+@@ -182,13 +251,17 @@ static void *fsnotify_detach_connector_from_object(
+ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = fsnotify_conn_inode(conn);
+ inode->i_fsnotify_mask = 0;
+- atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs);
++
++ /* Unpin inode when detaching from connector */
++ if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
++ inode = NULL;
+ } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
+ } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) {
+ fsnotify_conn_sb(conn)->s_fsnotify_mask = 0;
+ }
+
++ fsnotify_put_sb_connectors(conn);
+ rcu_assign_pointer(*(conn->obj), NULL);
+ conn->obj = NULL;
+ conn->type = FSNOTIFY_OBJ_TYPE_DETACHED;
+@@ -209,19 +282,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
+ /* Drop object reference originally held by a connector */
+ static void fsnotify_drop_object(unsigned int type, void *objp)
+ {
+- struct inode *inode;
+- struct super_block *sb;
+-
+ if (!objp)
+ return;
+ /* Currently only inode references are passed to be dropped */
+ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
+ return;
+- inode = objp;
+- sb = inode->i_sb;
+- iput(inode);
+- if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs))
+- wake_up_var(&sb->s_fsnotify_inode_refs);
++ fsnotify_put_inode_ref(objp);
+ }
+
+ void fsnotify_put_mark(struct fsnotify_mark *mark)
+@@ -250,7 +316,8 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
+ objp = fsnotify_detach_connector_from_object(conn, &type);
+ free_conn = true;
+ } else {
+- __fsnotify_recalc_mask(conn);
++ objp = __fsnotify_recalc_mask(conn);
++ type = conn->type;
+ }
+ WRITE_ONCE(mark->connector, NULL);
+ spin_unlock(&conn->lock);
+@@ -329,7 +396,7 @@ bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info)
+ {
+ int type;
+
+- fsnotify_foreach_obj_type(type) {
++ fsnotify_foreach_iter_type(type) {
+ /* This can fail if mark is being removed */
+ if (!fsnotify_get_mark_safe(iter_info->marks[type])) {
+ __release(&fsnotify_mark_srcu);
+@@ -358,7 +425,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
+ int type;
+
+ iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
+- fsnotify_foreach_obj_type(type)
++ fsnotify_foreach_iter_type(type)
+ fsnotify_put_mark_wake(iter_info->marks[type]);
+ }
+
+@@ -374,9 +441,7 @@ void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info)
+ */
+ void fsnotify_detach_mark(struct fsnotify_mark *mark)
+ {
+- struct fsnotify_group *group = mark->group;
+-
+- WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
++ fsnotify_group_assert_locked(mark->group);
+ WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) &&
+ refcount_read(&mark->refcnt) < 1 +
+ !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED));
+@@ -391,8 +456,6 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
+ list_del_init(&mark->g_list);
+ spin_unlock(&mark->lock);
+
+- atomic_dec(&group->num_marks);
+-
+ /* Drop mark reference acquired in fsnotify_add_mark_locked() */
+ fsnotify_put_mark(mark);
+ }
+@@ -430,9 +493,9 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
+ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
+ struct fsnotify_group *group)
+ {
+- mutex_lock(&group->mark_mutex);
++ fsnotify_group_lock(group);
+ fsnotify_detach_mark(mark);
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+ fsnotify_free_mark(mark);
+ }
+ EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
+@@ -474,10 +537,9 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
+ }
+
+ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
+- unsigned int type,
++ unsigned int obj_type,
+ __kernel_fsid_t *fsid)
+ {
+- struct inode *inode = NULL;
+ struct fsnotify_mark_connector *conn;
+
+ conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL);
+@@ -485,7 +547,8 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
+ return -ENOMEM;
+ spin_lock_init(&conn->lock);
+ INIT_HLIST_HEAD(&conn->list);
+- conn->type = type;
++ conn->flags = 0;
++ conn->type = obj_type;
+ conn->obj = connp;
+ /* Cache fsid of filesystem containing the object */
+ if (fsid) {
+@@ -495,16 +558,15 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
+ conn->fsid.val[0] = conn->fsid.val[1] = 0;
+ conn->flags = 0;
+ }
+- if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
+- inode = igrab(fsnotify_conn_inode(conn));
++ fsnotify_get_sb_connectors(conn);
++
+ /*
+ * cmpxchg() provides the barrier so that readers of *connp can see
+ * only initialized structure
+ */
+ if (cmpxchg(connp, NULL, conn)) {
+ /* Someone else created list structure for us */
+- if (inode)
+- iput(inode);
++ fsnotify_put_sb_connectors(conn);
+ kmem_cache_free(fsnotify_mark_connector_cachep, conn);
+ }
+
+@@ -545,15 +607,16 @@ static struct fsnotify_mark_connector *fsnotify_grab_connector(
+ * priority, highest number first, and then by the group's location in memory.
+ */
+ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
+- fsnotify_connp_t *connp, unsigned int type,
+- int allow_dups, __kernel_fsid_t *fsid)
++ fsnotify_connp_t *connp,
++ unsigned int obj_type,
++ int add_flags, __kernel_fsid_t *fsid)
+ {
+ struct fsnotify_mark *lmark, *last = NULL;
+ struct fsnotify_mark_connector *conn;
+ int cmp;
+ int err = 0;
+
+- if (WARN_ON(!fsnotify_valid_obj_type(type)))
++ if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
+ return -EINVAL;
+
+ /* Backend is expected to check for zero fsid (e.g. tmpfs) */
+@@ -565,7 +628,8 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
+ conn = fsnotify_grab_connector(connp);
+ if (!conn) {
+ spin_unlock(&mark->lock);
+- err = fsnotify_attach_connector_to_object(connp, type, fsid);
++ err = fsnotify_attach_connector_to_object(connp, obj_type,
++ fsid);
+ if (err)
+ return err;
+ goto restart;
+@@ -604,7 +668,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
+
+ if ((lmark->group == mark->group) &&
+ (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) &&
+- !allow_dups) {
++ !(mark->group->flags & FSNOTIFY_GROUP_DUPS)) {
+ err = -EEXIST;
+ goto out_err;
+ }
+@@ -638,13 +702,13 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
+ * event types should be delivered to which group.
+ */
+ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
+- fsnotify_connp_t *connp, unsigned int type,
+- int allow_dups, __kernel_fsid_t *fsid)
++ fsnotify_connp_t *connp, unsigned int obj_type,
++ int add_flags, __kernel_fsid_t *fsid)
+ {
+ struct fsnotify_group *group = mark->group;
+ int ret = 0;
+
+- BUG_ON(!mutex_is_locked(&group->mark_mutex));
++ fsnotify_group_assert_locked(group);
+
+ /*
+ * LOCKING ORDER!!!!
+@@ -656,16 +720,14 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
+ mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED;
+
+ list_add(&mark->g_list, &group->marks_list);
+- atomic_inc(&group->num_marks);
+ fsnotify_get_mark(mark); /* for g_list */
+ spin_unlock(&mark->lock);
+
+- ret = fsnotify_add_mark_list(mark, connp, type, allow_dups, fsid);
++ ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
+ if (ret)
+ goto err;
+
+- if (mark->mask)
+- fsnotify_recalc_mask(mark->connector);
++ fsnotify_recalc_mask(mark->connector);
+
+ return ret;
+ err:
+@@ -674,21 +736,21 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
+ FSNOTIFY_MARK_FLAG_ATTACHED);
+ list_del_init(&mark->g_list);
+ spin_unlock(&mark->lock);
+- atomic_dec(&group->num_marks);
+
+ fsnotify_put_mark(mark);
+ return ret;
+ }
+
+ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
+- unsigned int type, int allow_dups, __kernel_fsid_t *fsid)
++ unsigned int obj_type, int add_flags,
++ __kernel_fsid_t *fsid)
+ {
+ int ret;
+ struct fsnotify_group *group = mark->group;
+
+- mutex_lock(&group->mark_mutex);
+- ret = fsnotify_add_mark_locked(mark, connp, type, allow_dups, fsid);
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_lock(group);
++ ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
++ fsnotify_group_unlock(group);
+ return ret;
+ }
+ EXPORT_SYMBOL_GPL(fsnotify_add_mark);
+@@ -722,14 +784,14 @@ EXPORT_SYMBOL_GPL(fsnotify_find_mark);
+
+ /* Clear any marks in a group with given type mask */
+ void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
+- unsigned int type_mask)
++ unsigned int obj_type)
+ {
+ struct fsnotify_mark *lmark, *mark;
+ LIST_HEAD(to_free);
+ struct list_head *head = &to_free;
+
+ /* Skip selection step if we want to clear all marks. */
+- if (type_mask == FSNOTIFY_OBJ_ALL_TYPES_MASK) {
++ if (obj_type == FSNOTIFY_OBJ_TYPE_ANY) {
+ head = &group->marks_list;
+ goto clear;
+ }
+@@ -742,24 +804,24 @@ void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
+ * move marks to free to to_free list in one go and then free marks in
+ * to_free list one by one.
+ */
+- mutex_lock(&group->mark_mutex);
++ fsnotify_group_lock(group);
+ list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
+- if ((1U << mark->connector->type) & type_mask)
++ if (mark->connector->type == obj_type)
+ list_move(&mark->g_list, &to_free);
+ }
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+
+ clear:
+ while (1) {
+- mutex_lock(&group->mark_mutex);
++ fsnotify_group_lock(group);
+ if (list_empty(head)) {
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+ break;
+ }
+ mark = list_first_entry(head, struct fsnotify_mark, g_list);
+ fsnotify_get_mark(mark);
+ fsnotify_detach_mark(mark);
+- mutex_unlock(&group->mark_mutex);
++ fsnotify_group_unlock(group);
+ fsnotify_free_mark(mark);
+ fsnotify_put_mark(mark);
+ }
+diff --git a/fs/notify/notification.c b/fs/notify/notification.c
+index 75d79d6d3ef09..9022ae650cf86 100644
+--- a/fs/notify/notification.c
++++ b/fs/notify/notification.c
+@@ -47,13 +47,6 @@ u32 fsnotify_get_cookie(void)
+ }
+ EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
+
+-/* return true if the notify queue is empty, false otherwise */
+-bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
+-{
+- assert_spin_locked(&group->notification_lock);
+- return list_empty(&group->notification_list) ? true : false;
+-}
+-
+ void fsnotify_destroy_event(struct fsnotify_group *group,
+ struct fsnotify_event *event)
+ {
+@@ -71,20 +64,26 @@ void fsnotify_destroy_event(struct fsnotify_group *group,
+ WARN_ON(!list_empty(&event->list));
+ spin_unlock(&group->notification_lock);
+ }
+- group->ops->free_event(event);
++ group->ops->free_event(group, event);
+ }
+
+ /*
+- * Add an event to the group notification queue. The group can later pull this
+- * event off the queue to deal with. The function returns 0 if the event was
+- * added to the queue, 1 if the event was merged with some other queued event,
++ * Try to add an event to the notification queue.
++ * The group can later pull this event off the queue to deal with.
++ * The group can use the @merge hook to merge the event with a queued event.
++ * The group can use the @insert hook to insert the event into hash table.
++ * The function returns:
++ * 0 if the event was added to a queue
++ * 1 if the event was merged with some other queued event
+ * 2 if the event was not queued - either the queue of events has overflown
+- * or the group is shutting down.
++ * or the group is shutting down.
+ */
+-int fsnotify_add_event(struct fsnotify_group *group,
+- struct fsnotify_event *event,
+- int (*merge)(struct list_head *,
+- struct fsnotify_event *))
++int fsnotify_insert_event(struct fsnotify_group *group,
++ struct fsnotify_event *event,
++ int (*merge)(struct fsnotify_group *,
++ struct fsnotify_event *),
++ void (*insert)(struct fsnotify_group *,
++ struct fsnotify_event *))
+ {
+ int ret = 0;
+ struct list_head *list = &group->notification_list;
+@@ -111,7 +110,7 @@ int fsnotify_add_event(struct fsnotify_group *group,
+ }
+
+ if (!list_empty(list) && merge) {
+- ret = merge(list, event);
++ ret = merge(group, event);
+ if (ret) {
+ spin_unlock(&group->notification_lock);
+ return ret;
+@@ -121,6 +120,8 @@ int fsnotify_add_event(struct fsnotify_group *group,
+ queue:
+ group->q_len++;
+ list_add_tail(&event->list, list);
++ if (insert)
++ insert(group, event);
+ spin_unlock(&group->notification_lock);
+
+ wake_up(&group->notification_waitq);
+@@ -141,33 +142,36 @@ void fsnotify_remove_queued_event(struct fsnotify_group *group,
+ }
+
+ /*
+- * Remove and return the first event from the notification list. It is the
+- * responsibility of the caller to destroy the obtained event
++ * Return the first event on the notification list without removing it.
++ * Returns NULL if the list is empty.
+ */
+-struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
++struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
+ {
+- struct fsnotify_event *event;
+-
+ assert_spin_locked(&group->notification_lock);
+
+- pr_debug("%s: group=%p\n", __func__, group);
++ if (fsnotify_notify_queue_is_empty(group))
++ return NULL;
+
+- event = list_first_entry(&group->notification_list,
+- struct fsnotify_event, list);
+- fsnotify_remove_queued_event(group, event);
+- return event;
++ return list_first_entry(&group->notification_list,
++ struct fsnotify_event, list);
+ }
+
+ /*
+- * This will not remove the event, that must be done with
+- * fsnotify_remove_first_event()
++ * Remove and return the first event from the notification list. It is the
++ * responsibility of the caller to destroy the obtained event
+ */
+-struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group)
++struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group)
+ {
+- assert_spin_locked(&group->notification_lock);
++ struct fsnotify_event *event = fsnotify_peek_first_event(group);
+
+- return list_first_entry(&group->notification_list,
+- struct fsnotify_event, list);
++ if (!event)
++ return NULL;
++
++ pr_debug("%s: group=%p event=%p\n", __func__, group, event);
++
++ fsnotify_remove_queued_event(group, event);
++
++ return event;
+ }
+
+ /*
+diff --git a/fs/open.c b/fs/open.c
+index 83f62cf1432c8..d69312a2d434b 100644
+--- a/fs/open.c
++++ b/fs/open.c
+@@ -492,7 +492,7 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename)
+ if (error)
+ goto out;
+
+- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
++ error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
+ if (error)
+ goto dput_and_out;
+
+@@ -521,7 +521,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
+ if (!d_can_lookup(f.file->f_path.dentry))
+ goto out_putf;
+
+- error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
++ error = file_permission(f.file, MAY_EXEC | MAY_CHDIR);
+ if (!error)
+ set_fs_pwd(current->fs, &f.file->f_path);
+ out_putf:
+@@ -540,7 +540,7 @@ SYSCALL_DEFINE1(chroot, const char __user *, filename)
+ if (error)
+ goto out;
+
+- error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
++ error = path_permission(&path, MAY_EXEC | MAY_CHDIR);
+ if (error)
+ goto dput_and_out;
+
+@@ -954,6 +954,47 @@ struct file *dentry_open(const struct path *path, int flags,
+ }
+ EXPORT_SYMBOL(dentry_open);
+
++/**
++ * dentry_create - Create and open a file
++ * @path: path to create
++ * @flags: O_ flags
++ * @mode: mode bits for new file
++ * @cred: credentials to use
++ *
++ * Caller must hold the parent directory's lock, and have prepared
++ * a negative dentry, placed in @path->dentry, for the new file.
++ *
++ * Caller sets @path->mnt to the vfsmount of the filesystem where
++ * the new file is to be created. The parent directory and the
++ * negative dentry must reside on the same filesystem instance.
++ *
++ * On success, returns a "struct file *". Otherwise a ERR_PTR
++ * is returned.
++ */
++struct file *dentry_create(const struct path *path, int flags, umode_t mode,
++ const struct cred *cred)
++{
++ struct file *f;
++ int error;
++
++ validate_creds(cred);
++ f = alloc_empty_file(flags, cred);
++ if (IS_ERR(f))
++ return f;
++
++ error = vfs_create(d_inode(path->dentry->d_parent),
++ path->dentry, mode, true);
++ if (!error)
++ error = vfs_open(path, f);
++
++ if (unlikely(error)) {
++ fput(f);
++ return ERR_PTR(error);
++ }
++ return f;
++}
++EXPORT_SYMBOL(dentry_create);
++
+ struct file *open_with_fake_path(const struct path *path, int flags,
+ struct inode *inode, const struct cred *cred)
+ {
+@@ -1310,7 +1351,7 @@ EXPORT_SYMBOL(filp_close);
+ */
+ SYSCALL_DEFINE1(close, unsigned int, fd)
+ {
+- int retval = __close_fd(current->files, fd);
++ int retval = close_fd(fd);
+
+ /* can't restart close syscall because file table entry was cleared */
+ if (unlikely(retval == -ERESTARTSYS ||
+diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
+index 26f91868fbdaf..87b7a4a74f4ed 100644
+--- a/fs/overlayfs/overlayfs.h
++++ b/fs/overlayfs/overlayfs.h
+@@ -212,9 +212,16 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
+ unsigned int flags)
+ {
+ int err;
++ struct renamedata rd = {
++ .old_dir = olddir,
++ .old_dentry = olddentry,
++ .new_dir = newdir,
++ .new_dentry = newdentry,
++ .flags = flags,
++ };
+
+ pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags);
+- err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
++ err = vfs_rename(&rd);
+ if (err) {
+ pr_debug("...rename(%pd2, %pd2, ...) = %i\n",
+ olddentry, newdentry, err);
+diff --git a/fs/proc/fd.c b/fs/proc/fd.c
+index 81882a13212d3..cb51763ed554b 100644
+--- a/fs/proc/fd.c
++++ b/fs/proc/fd.c
+@@ -28,14 +28,13 @@ static int seq_show(struct seq_file *m, void *v)
+ if (!task)
+ return -ENOENT;
+
+- files = get_files_struct(task);
+- put_task_struct(task);
+-
++ task_lock(task);
++ files = task->files;
+ if (files) {
+ unsigned int fd = proc_fd(m->private);
+
+ spin_lock(&files->file_lock);
+- file = fcheck_files(files, fd);
++ file = files_lookup_fd_locked(files, fd);
+ if (file) {
+ struct fdtable *fdt = files_fdtable(files);
+
+@@ -47,8 +46,9 @@ static int seq_show(struct seq_file *m, void *v)
+ ret = 0;
+ }
+ spin_unlock(&files->file_lock);
+- put_files_struct(files);
+ }
++ task_unlock(task);
++ put_task_struct(task);
+
+ if (ret)
+ return ret;
+@@ -57,6 +57,7 @@ static int seq_show(struct seq_file *m, void *v)
+ (long long)file->f_pos, f_flags,
+ real_mount(file->f_path.mnt)->mnt_id);
+
++ /* show_fd_locks() never deferences files so a stale value is safe */
+ show_fd_locks(m, file, files);
+ if (seq_has_overflowed(m))
+ goto out;
+@@ -83,18 +84,13 @@ static const struct file_operations proc_fdinfo_file_operations = {
+
+ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
+ {
+- struct files_struct *files = get_files_struct(task);
+ struct file *file;
+
+- if (!files)
+- return false;
+-
+ rcu_read_lock();
+- file = fcheck_files(files, fd);
++ file = task_lookup_fd_rcu(task, fd);
+ if (file)
+ *mode = file->f_mode;
+ rcu_read_unlock();
+- put_files_struct(files);
+ return !!file;
+ }
+
+@@ -146,29 +142,22 @@ static const struct dentry_operations tid_fd_dentry_operations = {
+
+ static int proc_fd_link(struct dentry *dentry, struct path *path)
+ {
+- struct files_struct *files = NULL;
+ struct task_struct *task;
+ int ret = -ENOENT;
+
+ task = get_proc_task(d_inode(dentry));
+ if (task) {
+- files = get_files_struct(task);
+- put_task_struct(task);
+- }
+-
+- if (files) {
+ unsigned int fd = proc_fd(d_inode(dentry));
+ struct file *fd_file;
+
+- spin_lock(&files->file_lock);
+- fd_file = fcheck_files(files, fd);
++ fd_file = fget_task(task, fd);
+ if (fd_file) {
+ *path = fd_file->f_path;
+ path_get(&fd_file->f_path);
+ ret = 0;
++ fput(fd_file);
+ }
+- spin_unlock(&files->file_lock);
+- put_files_struct(files);
++ put_task_struct(task);
+ }
+
+ return ret;
+@@ -229,7 +218,6 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+ instantiate_t instantiate)
+ {
+ struct task_struct *p = get_proc_task(file_inode(file));
+- struct files_struct *files;
+ unsigned int fd;
+
+ if (!p)
+@@ -237,22 +225,18 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+
+ if (!dir_emit_dots(file, ctx))
+ goto out;
+- files = get_files_struct(p);
+- if (!files)
+- goto out;
+
+ rcu_read_lock();
+- for (fd = ctx->pos - 2;
+- fd < files_fdtable(files)->max_fds;
+- fd++, ctx->pos++) {
++ for (fd = ctx->pos - 2;; fd++) {
+ struct file *f;
+ struct fd_data data;
+ char name[10 + 1];
+ unsigned int len;
+
+- f = fcheck_files(files, fd);
++ f = task_lookup_next_fd_rcu(p, &fd);
++ ctx->pos = fd + 2LL;
+ if (!f)
+- continue;
++ break;
+ data.mode = f->f_mode;
+ rcu_read_unlock();
+ data.fd = fd;
+@@ -261,13 +245,11 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
+ if (!proc_fill_cache(file, ctx,
+ name, len, instantiate, p,
+ &data))
+- goto out_fd_loop;
++ goto out;
+ cond_resched();
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+-out_fd_loop:
+- put_files_struct(files);
+ out:
+ put_task_struct(p);
+ return 0;
+diff --git a/fs/udf/file.c b/fs/udf/file.c
+index e283a62701b83..25f7c915f22b7 100644
+--- a/fs/udf/file.c
++++ b/fs/udf/file.c
+@@ -181,7 +181,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+ long old_block, new_block;
+ int result;
+
+- if (inode_permission(inode, MAY_READ) != 0) {
++ if (file_permission(filp, MAY_READ) != 0) {
+ udf_debug("no permission to access inode %lu\n", inode->i_ino);
+ return -EPERM;
+ }
+diff --git a/fs/verity/enable.c b/fs/verity/enable.c
+index 5ceae66e1ae02..29becb66d0d88 100644
+--- a/fs/verity/enable.c
++++ b/fs/verity/enable.c
+@@ -369,7 +369,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
+ * has verity enabled, and to stabilize the data being hashed.
+ */
+
+- err = inode_permission(inode, MAY_WRITE);
++ err = file_permission(filp, MAY_WRITE);
+ if (err)
+ return err;
+
+diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
+index 0aad774beaec4..b87c3b85a166c 100644
+--- a/include/linux/dnotify.h
++++ b/include/linux/dnotify.h
+@@ -26,7 +26,7 @@ struct dnotify_struct {
+ FS_MODIFY | FS_MODIFY_CHILD |\
+ FS_ACCESS | FS_ACCESS_CHILD |\
+ FS_ATTRIB | FS_ATTRIB_CHILD |\
+- FS_CREATE | FS_DN_RENAME |\
++ FS_CREATE | FS_RENAME |\
+ FS_MOVED_FROM | FS_MOVED_TO)
+
+ extern int dir_notify_enable;
+diff --git a/include/linux/errno.h b/include/linux/errno.h
+index d73f597a24849..8b0c754bab025 100644
+--- a/include/linux/errno.h
++++ b/include/linux/errno.h
+@@ -31,5 +31,6 @@
+ #define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
+ #define EIOCBQUEUED 529 /* iocb queued, will get completion event */
+ #define ERECALLCONFLICT 530 /* conflict with recalled state */
++#define ENOGRACE 531 /* NFS file lock reclaim refused */
+
+ #endif
+diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
+index 3ceb72b67a7aa..218fc5c54e901 100644
+--- a/include/linux/exportfs.h
++++ b/include/linux/exportfs.h
+@@ -213,12 +213,27 @@ struct export_operations {
+ bool write, u32 *device_generation);
+ int (*commit_blocks)(struct inode *inode, struct iomap *iomaps,
+ int nr_iomaps, struct iattr *iattr);
++ u64 (*fetch_iversion)(struct inode *);
++#define EXPORT_OP_NOWCC (0x1) /* don't collect v3 wcc data */
++#define EXPORT_OP_NOSUBTREECHK (0x2) /* no subtree checking */
++#define EXPORT_OP_CLOSE_BEFORE_UNLINK (0x4) /* close files before unlink */
++#define EXPORT_OP_REMOTE_FS (0x8) /* Filesystem is remote */
++#define EXPORT_OP_NOATOMIC_ATTR (0x10) /* Filesystem cannot supply
++ atomic attribute updates
++ */
++#define EXPORT_OP_FLUSH_ON_CLOSE (0x20) /* fs flushes file data on close */
++ unsigned long flags;
+ };
+
+ extern int exportfs_encode_inode_fh(struct inode *inode, struct fid *fid,
+ int *max_len, struct inode *parent);
+ extern int exportfs_encode_fh(struct dentry *dentry, struct fid *fid,
+ int *max_len, int connectable);
++extern struct dentry *exportfs_decode_fh_raw(struct vfsmount *mnt,
++ struct fid *fid, int fh_len,
++ int fileid_type,
++ int (*acceptable)(void *, struct dentry *),
++ void *context);
+ extern struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
+ int fh_len, int fileid_type, int (*acceptable)(void *, struct dentry *),
+ void *context);
+diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
+index 3e9c56ee651f7..558844c8d2598 100644
+--- a/include/linux/fanotify.h
++++ b/include/linux/fanotify.h
+@@ -2,8 +2,11 @@
+ #ifndef _LINUX_FANOTIFY_H
+ #define _LINUX_FANOTIFY_H
+
++#include <linux/sysctl.h>
+ #include <uapi/linux/fanotify.h>
+
++extern struct ctl_table fanotify_table[]; /* for sysctl */
++
+ #define FAN_GROUP_FLAG(group, flag) \
+ ((group)->fanotify_data.flags & (flag))
+
+@@ -15,27 +18,62 @@
+ * these constant, the programs may break if re-compiled with new uapi headers
+ * and then run on an old kernel.
+ */
+-#define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FAN_CLASS_CONTENT | \
++
++/* Group classes where permission events are allowed */
++#define FANOTIFY_PERM_CLASSES (FAN_CLASS_CONTENT | \
+ FAN_CLASS_PRE_CONTENT)
+
+-#define FANOTIFY_FID_BITS (FAN_REPORT_FID | FAN_REPORT_DFID_NAME)
++#define FANOTIFY_CLASS_BITS (FAN_CLASS_NOTIF | FANOTIFY_PERM_CLASSES)
++
++#define FANOTIFY_FID_BITS (FAN_REPORT_DFID_NAME_TARGET)
++
++#define FANOTIFY_INFO_MODES (FANOTIFY_FID_BITS | FAN_REPORT_PIDFD)
++
++/*
++ * fanotify_init() flags that require CAP_SYS_ADMIN.
++ * We do not allow unprivileged groups to request permission events.
++ * We do not allow unprivileged groups to get other process pid in events.
++ * We do not allow unprivileged groups to use unlimited resources.
++ */
++#define FANOTIFY_ADMIN_INIT_FLAGS (FANOTIFY_PERM_CLASSES | \
++ FAN_REPORT_TID | \
++ FAN_REPORT_PIDFD | \
++ FAN_UNLIMITED_QUEUE | \
++ FAN_UNLIMITED_MARKS)
++
++/*
++ * fanotify_init() flags that are allowed for user without CAP_SYS_ADMIN.
++ * FAN_CLASS_NOTIF is the only class we allow for unprivileged group.
++ * We do not allow unprivileged groups to get file descriptors in events,
++ * so one of the flags for reporting file handles is required.
++ */
++#define FANOTIFY_USER_INIT_FLAGS (FAN_CLASS_NOTIF | \
++ FANOTIFY_FID_BITS | \
++ FAN_CLOEXEC | FAN_NONBLOCK)
++
++#define FANOTIFY_INIT_FLAGS (FANOTIFY_ADMIN_INIT_FLAGS | \
++ FANOTIFY_USER_INIT_FLAGS)
+
+-#define FANOTIFY_INIT_FLAGS (FANOTIFY_CLASS_BITS | FANOTIFY_FID_BITS | \
+- FAN_REPORT_TID | \
+- FAN_CLOEXEC | FAN_NONBLOCK | \
+- FAN_UNLIMITED_QUEUE | FAN_UNLIMITED_MARKS)
++/* Internal group flags */
++#define FANOTIFY_UNPRIV 0x80000000
++#define FANOTIFY_INTERNAL_GROUP_FLAGS (FANOTIFY_UNPRIV)
+
+ #define FANOTIFY_MARK_TYPE_BITS (FAN_MARK_INODE | FAN_MARK_MOUNT | \
+ FAN_MARK_FILESYSTEM)
+
++#define FANOTIFY_MARK_CMD_BITS (FAN_MARK_ADD | FAN_MARK_REMOVE | \
++ FAN_MARK_FLUSH)
++
++#define FANOTIFY_MARK_IGNORE_BITS (FAN_MARK_IGNORED_MASK | \
++ FAN_MARK_IGNORE)
++
+ #define FANOTIFY_MARK_FLAGS (FANOTIFY_MARK_TYPE_BITS | \
+- FAN_MARK_ADD | \
+- FAN_MARK_REMOVE | \
++ FANOTIFY_MARK_CMD_BITS | \
++ FANOTIFY_MARK_IGNORE_BITS | \
+ FAN_MARK_DONT_FOLLOW | \
+ FAN_MARK_ONLYDIR | \
+- FAN_MARK_IGNORED_MASK | \
+ FAN_MARK_IGNORED_SURV_MODIFY | \
+- FAN_MARK_FLUSH)
++ FAN_MARK_EVICTABLE)
+
+ /*
+ * Events that can be reported with data type FSNOTIFY_EVENT_PATH.
+@@ -49,15 +87,23 @@
+ * Directory entry modification events - reported only to directory
+ * where entry is modified and not to a watching parent.
+ */
+-#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE)
++#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE | \
++ FAN_RENAME)
++
++/* Events that can be reported with event->fd */
++#define FANOTIFY_FD_EVENTS (FANOTIFY_PATH_EVENTS | FANOTIFY_PERM_EVENTS)
+
+ /* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */
+ #define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \
+ FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF)
+
++/* Events that can only be reported with data type FSNOTIFY_EVENT_ERROR */
++#define FANOTIFY_ERROR_EVENTS (FAN_FS_ERROR)
++
+ /* Events that user can request to be notified on */
+ #define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \
+- FANOTIFY_INODE_EVENTS)
++ FANOTIFY_INODE_EVENTS | \
++ FANOTIFY_ERROR_EVENTS)
+
+ /* Events that require a permission response from user */
+ #define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \
+@@ -71,6 +117,10 @@
+ FANOTIFY_PERM_EVENTS | \
+ FAN_Q_OVERFLOW | FAN_ONDIR)
+
++/* Events and flags relevant only for directories */
++#define FANOTIFY_DIRONLY_EVENT_BITS (FANOTIFY_DIRENT_EVENTS | \
++ FAN_EVENT_ON_CHILD | FAN_ONDIR)
++
+ #define ALL_FANOTIFY_EVENT_BITS (FANOTIFY_OUTGOING_EVENTS | \
+ FANOTIFY_EVENT_FLAGS)
+
+diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
+index f1a99d3e55707..4ed3589f9294e 100644
+--- a/include/linux/fdtable.h
++++ b/include/linux/fdtable.h
+@@ -80,7 +80,7 @@ struct dentry;
+ /*
+ * The caller must ensure that fd table isn't shared or hold rcu or file lock
+ */
+-static inline struct file *__fcheck_files(struct files_struct *files, unsigned int fd)
++static inline struct file *files_lookup_fd_raw(struct files_struct *files, unsigned int fd)
+ {
+ struct fdtable *fdt = rcu_dereference_raw(files->fdt);
+
+@@ -91,37 +91,40 @@ static inline struct file *__fcheck_files(struct files_struct *files, unsigned i
+ return NULL;
+ }
+
+-static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd)
++static inline struct file *files_lookup_fd_locked(struct files_struct *files, unsigned int fd)
+ {
+- RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
+- !lockdep_is_held(&files->file_lock),
++ RCU_LOCKDEP_WARN(!lockdep_is_held(&files->file_lock),
+ "suspicious rcu_dereference_check() usage");
+- return __fcheck_files(files, fd);
++ return files_lookup_fd_raw(files, fd);
+ }
+
+-/*
+- * Check whether the specified fd has an open file.
+- */
+-#define fcheck(fd) fcheck_files(current->files, fd)
++static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd)
++{
++ RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
++ "suspicious rcu_dereference_check() usage");
++ return files_lookup_fd_raw(files, fd);
++}
++
++static inline struct file *lookup_fd_rcu(unsigned int fd)
++{
++ return files_lookup_fd_rcu(current->files, fd);
++}
++
++struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd);
++struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *fd);
+
+ struct task_struct;
+
+ struct files_struct *get_files_struct(struct task_struct *);
+ void put_files_struct(struct files_struct *fs);
+-void reset_files_struct(struct files_struct *);
+-int unshare_files(struct files_struct **);
++int unshare_files(void);
+ struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
+ void do_close_on_exec(struct files_struct *);
+ int iterate_fd(struct files_struct *, unsigned,
+ int (*)(const void *, struct file *, unsigned),
+ const void *);
+
+-extern int __alloc_fd(struct files_struct *files,
+- unsigned start, unsigned end, unsigned flags);
+-extern void __fd_install(struct files_struct *files,
+- unsigned int fd, struct file *file);
+-extern int __close_fd(struct files_struct *files,
+- unsigned int fd);
++extern int close_fd(unsigned int fd);
+ extern int __close_range(unsigned int fd, unsigned int max_fd, unsigned int flags);
+ extern int close_fd_get_file(unsigned int fd, struct file **res);
+ extern int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index 6de70634e5471..6a26ef54ac25d 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -996,6 +996,7 @@ static inline struct file *get_file(struct file *f)
+ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */
+ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */
+ #define FL_LAYOUT 2048 /* outstanding pNFS layout */
++#define FL_RECLAIM 4096 /* reclaiming from a reboot server */
+
+ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
+
+@@ -1016,6 +1017,7 @@ struct file_lock_operations {
+ };
+
+ struct lock_manager_operations {
++ void *lm_mod_owner;
+ fl_owner_t (*lm_get_owner)(fl_owner_t);
+ void (*lm_put_owner)(fl_owner_t);
+ void (*lm_notify)(struct file_lock *); /* unblock callback */
+@@ -1024,6 +1026,8 @@ struct lock_manager_operations {
+ int (*lm_change)(struct file_lock *, int, struct list_head *);
+ void (*lm_setup)(struct file_lock *, void **);
+ bool (*lm_breaker_owns_lease)(struct file_lock *);
++ bool (*lm_lock_expirable)(struct file_lock *cfl);
++ void (*lm_expire_lock)(void);
+ };
+
+ struct lock_manager {
+@@ -1162,6 +1166,15 @@ extern void lease_unregister_notifier(struct notifier_block *);
+ struct files_struct;
+ extern void show_fd_locks(struct seq_file *f,
+ struct file *filp, struct files_struct *files);
++extern bool locks_owner_has_blockers(struct file_lock_context *flctx,
++ fl_owner_t owner);
++
++static inline struct file_lock_context *
++locks_inode_context(const struct inode *inode)
++{
++ return smp_load_acquire(&inode->i_flctx);
++}
++
+ #else /* !CONFIG_FILE_LOCKING */
+ static inline int fcntl_getlk(struct file *file, unsigned int cmd,
+ struct flock __user *user)
+@@ -1302,6 +1315,18 @@ static inline int lease_modify(struct file_lock *fl, int arg,
+ struct files_struct;
+ static inline void show_fd_locks(struct seq_file *f,
+ struct file *filp, struct files_struct *files) {}
++static inline bool locks_owner_has_blockers(struct file_lock_context *flctx,
++ fl_owner_t owner)
++{
++ return false;
++}
++
++static inline struct file_lock_context *
++locks_inode_context(const struct inode *inode)
++{
++ return NULL;
++}
++
+ #endif /* !CONFIG_FILE_LOCKING */
+
+ static inline struct inode *file_inode(const struct file *f)
+@@ -1512,8 +1537,11 @@ struct super_block {
+ /* Number of inodes with nlink == 0 but still referenced */
+ atomic_long_t s_remove_count;
+
+- /* Pending fsnotify inode refs */
+- atomic_long_t s_fsnotify_inode_refs;
++ /*
++ * Number of inode/mount/sb objects that are being watched, note that
++ * inodes objects are currently double-accounted.
++ */
++ atomic_long_t s_fsnotify_connectors;
+
+ /* Being remounted read-only */
+ int s_readonly_remount;
+@@ -1780,7 +1808,17 @@ extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
+ extern int vfs_rmdir(struct inode *, struct dentry *);
+ extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
+-extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
++
++struct renamedata {
++ struct inode *old_dir;
++ struct dentry *old_dentry;
++ struct inode *new_dir;
++ struct dentry *new_dentry;
++ struct inode **delegated_inode;
++ unsigned int flags;
++} __randomize_layout;
++
++int vfs_rename(struct renamedata *);
+
+ static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
+ {
+@@ -2594,6 +2632,8 @@ extern struct file *filp_open(const char *, int, umode_t);
+ extern struct file *file_open_root(struct dentry *, struct vfsmount *,
+ const char *, int, umode_t);
+ extern struct file * dentry_open(const struct path *, int, const struct cred *);
++extern struct file *dentry_create(const struct path *path, int flags,
++ umode_t mode, const struct cred *cred);
+ extern struct file * open_with_fake_path(const struct path *, int,
+ struct inode*, const struct cred *);
+ static inline struct file *file_clone_open(struct file *file)
+@@ -2824,6 +2864,14 @@ static inline int bmap(struct inode *inode, sector_t *block)
+ extern int notify_change(struct dentry *, struct iattr *, struct inode **);
+ extern int inode_permission(struct inode *, int);
+ extern int generic_permission(struct inode *, int);
++static inline int file_permission(struct file *file, int mask)
++{
++ return inode_permission(file_inode(file), mask);
++}
++static inline int path_permission(const struct path *path, int mask)
++{
++ return inode_permission(d_inode(path->dentry), mask);
++}
+ extern int __check_sticky(struct inode *dir, struct inode *inode);
+
+ static inline bool execute_ok(struct inode *inode)
+diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
+index 79add91eaa04e..bb8467cd11ae2 100644
+--- a/include/linux/fsnotify.h
++++ b/include/linux/fsnotify.h
+@@ -26,21 +26,27 @@
+ * FS_EVENT_ON_CHILD mask on the parent inode and will not be reported if only
+ * the child is interested and not the parent.
+ */
+-static inline void fsnotify_name(struct inode *dir, __u32 mask,
+- struct inode *child,
+- const struct qstr *name, u32 cookie)
++static inline int fsnotify_name(__u32 mask, const void *data, int data_type,
++ struct inode *dir, const struct qstr *name,
++ u32 cookie)
+ {
+- fsnotify(mask, child, FSNOTIFY_EVENT_INODE, dir, name, NULL, cookie);
++ if (atomic_long_read(&dir->i_sb->s_fsnotify_connectors) == 0)
++ return 0;
++
++ return fsnotify(mask, data, data_type, dir, name, NULL, cookie);
+ }
+
+ static inline void fsnotify_dirent(struct inode *dir, struct dentry *dentry,
+ __u32 mask)
+ {
+- fsnotify_name(dir, mask, d_inode(dentry), &dentry->d_name, 0);
++ fsnotify_name(mask, dentry, FSNOTIFY_EVENT_DENTRY, dir, &dentry->d_name, 0);
+ }
+
+ static inline void fsnotify_inode(struct inode *inode, __u32 mask)
+ {
++ if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0)
++ return;
++
+ if (S_ISDIR(inode->i_mode))
+ mask |= FS_ISDIR;
+
+@@ -53,6 +59,9 @@ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
+ {
+ struct inode *inode = d_inode(dentry);
+
++ if (atomic_long_read(&inode->i_sb->s_fsnotify_connectors) == 0)
++ return 0;
++
+ if (S_ISDIR(inode->i_mode)) {
+ mask |= FS_ISDIR;
+
+@@ -77,7 +86,7 @@ static inline int fsnotify_parent(struct dentry *dentry, __u32 mask,
+ */
+ static inline void fsnotify_dentry(struct dentry *dentry, __u32 mask)
+ {
+- fsnotify_parent(dentry, mask, d_inode(dentry), FSNOTIFY_EVENT_INODE);
++ fsnotify_parent(dentry, mask, dentry, FSNOTIFY_EVENT_DENTRY);
+ }
+
+ static inline int fsnotify_file(struct file *file, __u32 mask)
+@@ -135,18 +144,23 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
+ u32 fs_cookie = fsnotify_get_cookie();
+ __u32 old_dir_mask = FS_MOVED_FROM;
+ __u32 new_dir_mask = FS_MOVED_TO;
++ __u32 rename_mask = FS_RENAME;
+ const struct qstr *new_name = &moved->d_name;
+
+- if (old_dir == new_dir)
+- old_dir_mask |= FS_DN_RENAME;
+-
+ if (isdir) {
+ old_dir_mask |= FS_ISDIR;
+ new_dir_mask |= FS_ISDIR;
++ rename_mask |= FS_ISDIR;
+ }
+
+- fsnotify_name(old_dir, old_dir_mask, source, old_name, fs_cookie);
+- fsnotify_name(new_dir, new_dir_mask, source, new_name, fs_cookie);
++ /* Event with information about both old and new parent+name */
++ fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY,
++ old_dir, old_name, 0);
++
++ fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE,
++ old_dir, old_name, fs_cookie);
++ fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE,
++ new_dir, new_name, fs_cookie);
+
+ if (target)
+ fsnotify_link_count(target);
+@@ -181,16 +195,22 @@ static inline void fsnotify_inoderemove(struct inode *inode)
+
+ /*
+ * fsnotify_create - 'name' was linked in
++ *
++ * Caller must make sure that dentry->d_name is stable.
++ * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
++ * ->d_inode later
+ */
+-static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
++static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
+ {
+- audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
++ audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);
+
+- fsnotify_dirent(inode, dentry, FS_CREATE);
++ fsnotify_dirent(dir, dentry, FS_CREATE);
+ }
+
+ /*
+ * fsnotify_link - new hardlink in 'inode' directory
++ *
++ * Caller must make sure that new_dentry->d_name is stable.
+ * Note: We have to pass also the linked inode ptr as some filesystems leave
+ * new_dentry->d_inode NULL and instantiate inode pointer later
+ */
+@@ -200,7 +220,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode,
+ fsnotify_link_count(inode);
+ audit_inode_child(dir, new_dentry, AUDIT_TYPE_CHILD_CREATE);
+
+- fsnotify_name(dir, FS_CREATE, inode, &new_dentry->d_name, 0);
++ fsnotify_name(FS_CREATE, inode, FSNOTIFY_EVENT_INODE,
++ dir, &new_dentry->d_name, 0);
+ }
+
+ /*
+@@ -219,7 +240,8 @@ static inline void fsnotify_delete(struct inode *dir, struct inode *inode,
+ if (S_ISDIR(inode->i_mode))
+ mask |= FS_ISDIR;
+
+- fsnotify_name(dir, mask, inode, &dentry->d_name, 0);
++ fsnotify_name(mask, inode, FSNOTIFY_EVENT_INODE, dir, &dentry->d_name,
++ 0);
+ }
+
+ /**
+@@ -254,12 +276,16 @@ static inline void fsnotify_unlink(struct inode *dir, struct dentry *dentry)
+
+ /*
+ * fsnotify_mkdir - directory 'name' was created
++ *
++ * Caller must make sure that dentry->d_name is stable.
++ * Note: some filesystems (e.g. kernfs) leave @dentry negative and instantiate
++ * ->d_inode later
+ */
+-static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
++static inline void fsnotify_mkdir(struct inode *dir, struct dentry *dentry)
+ {
+- audit_inode_child(inode, dentry, AUDIT_TYPE_CHILD_CREATE);
++ audit_inode_child(dir, dentry, AUDIT_TYPE_CHILD_CREATE);
+
+- fsnotify_dirent(inode, dentry, FS_CREATE | FS_ISDIR);
++ fsnotify_dirent(dir, dentry, FS_CREATE | FS_ISDIR);
+ }
+
+ /*
+@@ -353,4 +379,17 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
+ fsnotify_dentry(dentry, mask);
+ }
+
++static inline int fsnotify_sb_error(struct super_block *sb, struct inode *inode,
++ int error)
++{
++ struct fs_error_report report = {
++ .error = error,
++ .inode = inode,
++ .sb = sb,
++ };
++
++ return fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR,
++ NULL, NULL, NULL, 0);
++}
++
+ #endif /* _LINUX_FS_NOTIFY_H */
+diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
+index a2e42d3cd87cf..d7d96c806bff2 100644
+--- a/include/linux/fsnotify_backend.h
++++ b/include/linux/fsnotify_backend.h
+@@ -19,6 +19,8 @@
+ #include <linux/atomic.h>
+ #include <linux/user_namespace.h>
+ #include <linux/refcount.h>
++#include <linux/mempool.h>
++#include <linux/sched/mm.h>
+
+ /*
+ * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
+@@ -42,13 +44,18 @@
+
+ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */
+ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
++#define FS_ERROR 0x00008000 /* Filesystem Error (fanotify) */
++
++/*
++ * FS_IN_IGNORED overloads FS_ERROR. It is only used internally by inotify
++ * which does not support FS_ERROR.
++ */
+ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */
+
+ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */
+ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */
+ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */
+
+-#define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */
+ /*
+ * Set on inode mark that cares about things that happen to its children.
+ * Always set for dnotify and inotify.
+@@ -56,10 +63,9 @@
+ */
+ #define FS_EVENT_ON_CHILD 0x08000000
+
+-#define FS_DN_RENAME 0x10000000 /* file renamed */
++#define FS_RENAME 0x10000000 /* File was renamed */
+ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */
+ #define FS_ISDIR 0x40000000 /* event occurred against dir */
+-#define FS_IN_ONESHOT 0x80000000 /* only send event once */
+
+ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO)
+
+@@ -69,7 +75,7 @@
+ * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event
+ * when a directory entry inside a child subdir changes.
+ */
+-#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE)
++#define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE | FS_RENAME)
+
+ #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \
+ FS_OPEN_EXEC_PERM)
+@@ -94,12 +100,12 @@
+ /* Events that can be reported to backends */
+ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \
+ FS_EVENTS_POSS_ON_CHILD | \
+- FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \
+- FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED)
++ FS_DELETE_SELF | FS_MOVE_SELF | \
++ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED | \
++ FS_ERROR)
+
+ /* Extra flags that may be reported with event or control handling of events */
+-#define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \
+- FS_DN_MULTISHOT | FS_EVENT_ON_CHILD)
++#define ALL_FSNOTIFY_FLAGS (FS_ISDIR | FS_EVENT_ON_CHILD | FS_DN_MULTISHOT)
+
+ #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS)
+
+@@ -136,6 +142,7 @@ struct mem_cgroup;
+ * @dir: optional directory associated with event -
+ * if @file_name is not NULL, this is the directory that
+ * @file_name is relative to.
++ * Either @inode or @dir must be non-NULL.
+ * @file_name: optional file name associated with event
+ * @cookie: inotify rename cookie
+ *
+@@ -155,7 +162,7 @@ struct fsnotify_ops {
+ const struct qstr *file_name, u32 cookie);
+ void (*free_group_priv)(struct fsnotify_group *group);
+ void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group);
+- void (*free_event)(struct fsnotify_event *event);
++ void (*free_event)(struct fsnotify_group *group, struct fsnotify_event *event);
+ /* called on final put+free to free memory */
+ void (*free_mark)(struct fsnotify_mark *mark);
+ };
+@@ -167,7 +174,6 @@ struct fsnotify_ops {
+ */
+ struct fsnotify_event {
+ struct list_head list;
+- unsigned long objectid; /* identifier for queue merges */
+ };
+
+ /*
+@@ -205,11 +211,14 @@ struct fsnotify_group {
+ unsigned int priority;
+ bool shutdown; /* group is being shut down, don't queue more events */
+
++#define FSNOTIFY_GROUP_USER 0x01 /* user allocated group */
++#define FSNOTIFY_GROUP_DUPS 0x02 /* allow multiple marks per object */
++#define FSNOTIFY_GROUP_NOFS 0x04 /* group lock is not direct reclaim safe */
++ int flags;
++ unsigned int owner_flags; /* stored flags of mark_mutex owner */
++
+ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */
+ struct mutex mark_mutex; /* protect marks_list */
+- atomic_t num_marks; /* 1 for each mark and 1 for not being
+- * past the point of no return when freeing
+- * a group */
+ atomic_t user_waits; /* Number of tasks waiting for user
+ * response */
+ struct list_head marks_list; /* all inode marks for this group */
+@@ -234,23 +243,58 @@ struct fsnotify_group {
+ #endif
+ #ifdef CONFIG_FANOTIFY
+ struct fanotify_group_private_data {
++ /* Hash table of events for merge */
++ struct hlist_head *merge_hash;
+ /* allows a group to block waiting for a userspace response */
+ struct list_head access_list;
+ wait_queue_head_t access_waitq;
+ int flags; /* flags from fanotify_init() */
+ int f_flags; /* event_f_flags from fanotify_init() */
+- unsigned int max_marks;
+- struct user_struct *user;
++ struct ucounts *ucounts;
++ mempool_t error_events_pool;
+ } fanotify_data;
+ #endif /* CONFIG_FANOTIFY */
+ };
+ };
+
++/*
++ * These helpers are used to prevent deadlock when reclaiming inodes with
++ * evictable marks of the same group that is allocating a new mark.
++ */
++static inline void fsnotify_group_lock(struct fsnotify_group *group)
++{
++ mutex_lock(&group->mark_mutex);
++ if (group->flags & FSNOTIFY_GROUP_NOFS)
++ group->owner_flags = memalloc_nofs_save();
++}
++
++static inline void fsnotify_group_unlock(struct fsnotify_group *group)
++{
++ if (group->flags & FSNOTIFY_GROUP_NOFS)
++ memalloc_nofs_restore(group->owner_flags);
++ mutex_unlock(&group->mark_mutex);
++}
++
++static inline void fsnotify_group_assert_locked(struct fsnotify_group *group)
++{
++ WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex));
++ if (group->flags & FSNOTIFY_GROUP_NOFS)
++ WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
++}
++
+ /* When calling fsnotify tell it if the data is a path or inode */
+ enum fsnotify_data_type {
+ FSNOTIFY_EVENT_NONE,
+ FSNOTIFY_EVENT_PATH,
+ FSNOTIFY_EVENT_INODE,
++ FSNOTIFY_EVENT_DENTRY,
++ FSNOTIFY_EVENT_ERROR,
++};
++
++struct fs_error_report {
++ int error;
++ struct inode *inode;
++ struct super_block *sb;
+ };
+
+ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
+@@ -258,8 +302,25 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
+ switch (data_type) {
+ case FSNOTIFY_EVENT_INODE:
+ return (struct inode *)data;
++ case FSNOTIFY_EVENT_DENTRY:
++ return d_inode(data);
+ case FSNOTIFY_EVENT_PATH:
+ return d_inode(((const struct path *)data)->dentry);
++ case FSNOTIFY_EVENT_ERROR:
++ return ((struct fs_error_report *)data)->inode;
++ default:
++ return NULL;
++ }
++}
++
++static inline struct dentry *fsnotify_data_dentry(const void *data, int data_type)
++{
++ switch (data_type) {
++ case FSNOTIFY_EVENT_DENTRY:
++ /* Non const is needed for dget() */
++ return (struct dentry *)data;
++ case FSNOTIFY_EVENT_PATH:
++ return ((const struct path *)data)->dentry;
+ default:
+ return NULL;
+ }
+@@ -276,58 +337,110 @@ static inline const struct path *fsnotify_data_path(const void *data,
+ }
+ }
+
++static inline struct super_block *fsnotify_data_sb(const void *data,
++ int data_type)
++{
++ switch (data_type) {
++ case FSNOTIFY_EVENT_INODE:
++ return ((struct inode *)data)->i_sb;
++ case FSNOTIFY_EVENT_DENTRY:
++ return ((struct dentry *)data)->d_sb;
++ case FSNOTIFY_EVENT_PATH:
++ return ((const struct path *)data)->dentry->d_sb;
++ case FSNOTIFY_EVENT_ERROR:
++ return ((struct fs_error_report *) data)->sb;
++ default:
++ return NULL;
++ }
++}
++
++static inline struct fs_error_report *fsnotify_data_error_report(
++ const void *data,
++ int data_type)
++{
++ switch (data_type) {
++ case FSNOTIFY_EVENT_ERROR:
++ return (struct fs_error_report *) data;
++ default:
++ return NULL;
++ }
++}
++
++/*
++ * Index to merged marks iterator array that correlates to a type of watch.
++ * The type of watched object can be deduced from the iterator type, but not
++ * the other way around, because an event can match different watched objects
++ * of the same object type.
++ * For example, both parent and child are watching an object of type inode.
++ */
++enum fsnotify_iter_type {
++ FSNOTIFY_ITER_TYPE_INODE,
++ FSNOTIFY_ITER_TYPE_VFSMOUNT,
++ FSNOTIFY_ITER_TYPE_SB,
++ FSNOTIFY_ITER_TYPE_PARENT,
++ FSNOTIFY_ITER_TYPE_INODE2,
++ FSNOTIFY_ITER_TYPE_COUNT
++};
++
++/* The type of object that a mark is attached to */
+ enum fsnotify_obj_type {
++ FSNOTIFY_OBJ_TYPE_ANY = -1,
+ FSNOTIFY_OBJ_TYPE_INODE,
+- FSNOTIFY_OBJ_TYPE_PARENT,
+ FSNOTIFY_OBJ_TYPE_VFSMOUNT,
+ FSNOTIFY_OBJ_TYPE_SB,
+ FSNOTIFY_OBJ_TYPE_COUNT,
+ FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT
+ };
+
+-#define FSNOTIFY_OBJ_TYPE_INODE_FL (1U << FSNOTIFY_OBJ_TYPE_INODE)
+-#define FSNOTIFY_OBJ_TYPE_PARENT_FL (1U << FSNOTIFY_OBJ_TYPE_PARENT)
+-#define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT)
+-#define FSNOTIFY_OBJ_TYPE_SB_FL (1U << FSNOTIFY_OBJ_TYPE_SB)
+-#define FSNOTIFY_OBJ_ALL_TYPES_MASK ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1)
+-
+-static inline bool fsnotify_valid_obj_type(unsigned int type)
++static inline bool fsnotify_valid_obj_type(unsigned int obj_type)
+ {
+- return (type < FSNOTIFY_OBJ_TYPE_COUNT);
++ return (obj_type < FSNOTIFY_OBJ_TYPE_COUNT);
+ }
+
+ struct fsnotify_iter_info {
+- struct fsnotify_mark *marks[FSNOTIFY_OBJ_TYPE_COUNT];
++ struct fsnotify_mark *marks[FSNOTIFY_ITER_TYPE_COUNT];
++ struct fsnotify_group *current_group;
+ unsigned int report_mask;
+ int srcu_idx;
+ };
+
+ static inline bool fsnotify_iter_should_report_type(
+- struct fsnotify_iter_info *iter_info, int type)
++ struct fsnotify_iter_info *iter_info, int iter_type)
+ {
+- return (iter_info->report_mask & (1U << type));
++ return (iter_info->report_mask & (1U << iter_type));
+ }
+
+ static inline void fsnotify_iter_set_report_type(
+- struct fsnotify_iter_info *iter_info, int type)
++ struct fsnotify_iter_info *iter_info, int iter_type)
++{
++ iter_info->report_mask |= (1U << iter_type);
++}
++
++static inline struct fsnotify_mark *fsnotify_iter_mark(
++ struct fsnotify_iter_info *iter_info, int iter_type)
+ {
+- iter_info->report_mask |= (1U << type);
++ if (fsnotify_iter_should_report_type(iter_info, iter_type))
++ return iter_info->marks[iter_type];
++ return NULL;
+ }
+
+-static inline void fsnotify_iter_set_report_type_mark(
+- struct fsnotify_iter_info *iter_info, int type,
+- struct fsnotify_mark *mark)
++static inline int fsnotify_iter_step(struct fsnotify_iter_info *iter, int type,
++ struct fsnotify_mark **markp)
+ {
+- iter_info->marks[type] = mark;
+- iter_info->report_mask |= (1U << type);
++ while (type < FSNOTIFY_ITER_TYPE_COUNT) {
++ *markp = fsnotify_iter_mark(iter, type);
++ if (*markp)
++ break;
++ type++;
++ }
++ return type;
+ }
+
+ #define FSNOTIFY_ITER_FUNCS(name, NAME) \
+ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \
+ struct fsnotify_iter_info *iter_info) \
+ { \
+- return (iter_info->report_mask & FSNOTIFY_OBJ_TYPE_##NAME##_FL) ? \
+- iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \
++ return fsnotify_iter_mark(iter_info, FSNOTIFY_ITER_TYPE_##NAME); \
+ }
+
+ FSNOTIFY_ITER_FUNCS(inode, INODE)
+@@ -335,8 +448,13 @@ FSNOTIFY_ITER_FUNCS(parent, PARENT)
+ FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT)
+ FSNOTIFY_ITER_FUNCS(sb, SB)
+
+-#define fsnotify_foreach_obj_type(type) \
+- for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++)
++#define fsnotify_foreach_iter_type(type) \
++ for (type = 0; type < FSNOTIFY_ITER_TYPE_COUNT; type++)
++#define fsnotify_foreach_iter_mark_type(iter, mark, type) \
++ for (type = 0; \
++ type = fsnotify_iter_step(iter, type, &mark), \
++ type < FSNOTIFY_ITER_TYPE_COUNT; \
++ type++)
+
+ /*
+ * fsnotify_connp_t is what we embed in objects which connector can be attached
+@@ -355,6 +473,7 @@ struct fsnotify_mark_connector {
+ spinlock_t lock;
+ unsigned short type; /* Type of object [lock] */
+ #define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01
++#define FSNOTIFY_CONN_FLAG_HAS_IREF 0x02
+ unsigned short flags; /* flags [lock] */
+ __kernel_fsid_t fsid; /* fsid of filesystem containing object */
+ union {
+@@ -399,11 +518,18 @@ struct fsnotify_mark {
+ struct hlist_node obj_list;
+ /* Head of list of marks for an object [mark ref] */
+ struct fsnotify_mark_connector *connector;
+- /* Events types to ignore [mark->lock, group->mark_mutex] */
+- __u32 ignored_mask;
+-#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01
+-#define FSNOTIFY_MARK_FLAG_ALIVE 0x02
+-#define FSNOTIFY_MARK_FLAG_ATTACHED 0x04
++ /* Events types and flags to ignore [mark->lock, group->mark_mutex] */
++ __u32 ignore_mask;
++ /* General fsnotify mark flags */
++#define FSNOTIFY_MARK_FLAG_ALIVE 0x0001
++#define FSNOTIFY_MARK_FLAG_ATTACHED 0x0002
++ /* inotify mark flags */
++#define FSNOTIFY_MARK_FLAG_EXCL_UNLINK 0x0010
++#define FSNOTIFY_MARK_FLAG_IN_ONESHOT 0x0020
++ /* fanotify mark flags */
++#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x0100
++#define FSNOTIFY_MARK_FLAG_NO_IREF 0x0200
++#define FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS 0x0400
+ unsigned int flags; /* flags [mark->lock] */
+ };
+
+@@ -469,7 +595,9 @@ static inline void fsnotify_update_flags(struct dentry *dentry)
+ /* called from fsnotify listeners, such as fanotify or dnotify */
+
+ /* create a new group */
+-extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops);
++extern struct fsnotify_group *fsnotify_alloc_group(
++ const struct fsnotify_ops *ops,
++ int flags);
+ /* get reference to a group */
+ extern void fsnotify_get_group(struct fsnotify_group *group);
+ /* drop reference on a group from fsnotify_alloc_group */
+@@ -484,17 +612,39 @@ extern int fsnotify_fasync(int fd, struct file *file, int on);
+ extern void fsnotify_destroy_event(struct fsnotify_group *group,
+ struct fsnotify_event *event);
+ /* attach the event to the group notification queue */
+-extern int fsnotify_add_event(struct fsnotify_group *group,
+- struct fsnotify_event *event,
+- int (*merge)(struct list_head *,
+- struct fsnotify_event *));
++extern int fsnotify_insert_event(struct fsnotify_group *group,
++ struct fsnotify_event *event,
++ int (*merge)(struct fsnotify_group *,
++ struct fsnotify_event *),
++ void (*insert)(struct fsnotify_group *,
++ struct fsnotify_event *));
++
++static inline int fsnotify_add_event(struct fsnotify_group *group,
++ struct fsnotify_event *event,
++ int (*merge)(struct fsnotify_group *,
++ struct fsnotify_event *))
++{
++ return fsnotify_insert_event(group, event, merge, NULL);
++}
++
+ /* Queue overflow event to a notification group */
+ static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
+ {
+ fsnotify_add_event(group, group->overflow_event, NULL);
+ }
+
+-/* true if the group notification queue is empty */
++static inline bool fsnotify_is_overflow_event(u32 mask)
++{
++ return mask & FS_Q_OVERFLOW;
++}
++
++static inline bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
++{
++ assert_spin_locked(&group->notification_lock);
++
++ return list_empty(&group->notification_list);
++}
++
+ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
+ /* return, but do not dequeue the first event on the notification queue */
+ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group);
+@@ -506,6 +656,101 @@ extern void fsnotify_remove_queued_event(struct fsnotify_group *group,
+
+ /* functions used to manipulate the marks attached to inodes */
+
++/*
++ * Canonical "ignore mask" including event flags.
++ *
++ * Note the subtle semantic difference from the legacy ->ignored_mask.
++ * ->ignored_mask traditionally only meant which events should be ignored,
++ * while ->ignore_mask also includes flags regarding the type of objects on
++ * which events should be ignored.
++ */
++static inline __u32 fsnotify_ignore_mask(struct fsnotify_mark *mark)
++{
++ __u32 ignore_mask = mark->ignore_mask;
++
++ /* The event flags in ignore mask take effect */
++ if (mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
++ return ignore_mask;
++
++ /*
++ * Legacy behavior:
++ * - Always ignore events on dir
++ * - Ignore events on child if parent is watching children
++ */
++ ignore_mask |= FS_ISDIR;
++ ignore_mask &= ~FS_EVENT_ON_CHILD;
++ ignore_mask |= mark->mask & FS_EVENT_ON_CHILD;
++
++ return ignore_mask;
++}
++
++/* Legacy ignored_mask - only event types to ignore */
++static inline __u32 fsnotify_ignored_events(struct fsnotify_mark *mark)
++{
++ return mark->ignore_mask & ALL_FSNOTIFY_EVENTS;
++}
++
++/*
++ * Check if mask (or ignore mask) should be applied depending if victim is a
++ * directory and whether it is reported to a watching parent.
++ */
++static inline bool fsnotify_mask_applicable(__u32 mask, bool is_dir,
++ int iter_type)
++{
++ /* Should mask be applied to a directory? */
++ if (is_dir && !(mask & FS_ISDIR))
++ return false;
++
++ /* Should mask be applied to a child? */
++ if (iter_type == FSNOTIFY_ITER_TYPE_PARENT &&
++ !(mask & FS_EVENT_ON_CHILD))
++ return false;
++
++ return true;
++}
++
++/*
++ * Effective ignore mask taking into account if event victim is a
++ * directory and whether it is reported to a watching parent.
++ */
++static inline __u32 fsnotify_effective_ignore_mask(struct fsnotify_mark *mark,
++ bool is_dir, int iter_type)
++{
++ __u32 ignore_mask = fsnotify_ignored_events(mark);
++
++ if (!ignore_mask)
++ return 0;
++
++ /* For non-dir and non-child, no need to consult the event flags */
++ if (!is_dir && iter_type != FSNOTIFY_ITER_TYPE_PARENT)
++ return ignore_mask;
++
++ ignore_mask = fsnotify_ignore_mask(mark);
++ if (!fsnotify_mask_applicable(ignore_mask, is_dir, iter_type))
++ return 0;
++
++ return ignore_mask & ALL_FSNOTIFY_EVENTS;
++}
++
++/* Get mask for calculating object interest taking ignore mask into account */
++static inline __u32 fsnotify_calc_mask(struct fsnotify_mark *mark)
++{
++ __u32 mask = mark->mask;
++
++ if (!fsnotify_ignored_events(mark))
++ return mask;
++
++ /* Interest in FS_MODIFY may be needed for clearing ignore mask */
++ if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY))
++ mask |= FS_MODIFY;
++
++ /*
++ * If mark is interested in ignoring events on children, the object must
++ * show interest in those events for fsnotify_parent() to notice it.
++ */
++ return mask | mark->ignore_mask;
++}
++
+ /* Get mask of events for a list of marks */
+ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn);
+ /* Calculate mask of events for a list of marks */
+@@ -520,27 +765,27 @@ extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn,
+ __kernel_fsid_t *fsid);
+ /* attach the mark to the object */
+ extern int fsnotify_add_mark(struct fsnotify_mark *mark,
+- fsnotify_connp_t *connp, unsigned int type,
+- int allow_dups, __kernel_fsid_t *fsid);
++ fsnotify_connp_t *connp, unsigned int obj_type,
++ int add_flags, __kernel_fsid_t *fsid);
+ extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
+ fsnotify_connp_t *connp,
+- unsigned int type, int allow_dups,
++ unsigned int obj_type, int add_flags,
+ __kernel_fsid_t *fsid);
+
+ /* attach the mark to the inode */
+ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
+ struct inode *inode,
+- int allow_dups)
++ int add_flags)
+ {
+ return fsnotify_add_mark(mark, &inode->i_fsnotify_marks,
+- FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL);
++ FSNOTIFY_OBJ_TYPE_INODE, add_flags, NULL);
+ }
+ static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark,
+ struct inode *inode,
+- int allow_dups)
++ int add_flags)
+ {
+ return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks,
+- FSNOTIFY_OBJ_TYPE_INODE, allow_dups,
++ FSNOTIFY_OBJ_TYPE_INODE, add_flags,
+ NULL);
+ }
+
+@@ -553,33 +798,32 @@ extern void fsnotify_detach_mark(struct fsnotify_mark *mark);
+ extern void fsnotify_free_mark(struct fsnotify_mark *mark);
+ /* Wait until all marks queued for destruction are destroyed */
+ extern void fsnotify_wait_marks_destroyed(void);
+-/* run all the marks in a group, and clear all of the marks attached to given object type */
+-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type);
++/* Clear all of the marks of a group attached to a given object type */
++extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group,
++ unsigned int obj_type);
+ /* run all the marks in a group, and clear all of the vfsmount marks */
+ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
+ {
+- fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL);
++ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT);
+ }
+ /* run all the marks in a group, and clear all of the inode marks */
+ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group)
+ {
+- fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE_FL);
++ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE);
+ }
+ /* run all the marks in a group, and clear all of the sn marks */
+ static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group)
+ {
+- fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB_FL);
++ fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB);
+ }
+ extern void fsnotify_get_mark(struct fsnotify_mark *mark);
+ extern void fsnotify_put_mark(struct fsnotify_mark *mark);
+ extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
+ extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);
+
+-static inline void fsnotify_init_event(struct fsnotify_event *event,
+- unsigned long objectid)
++static inline void fsnotify_init_event(struct fsnotify_event *event)
+ {
+ INIT_LIST_HEAD(&event->list);
+- event->objectid = objectid;
+ }
+
+ #else
+diff --git a/include/linux/iversion.h b/include/linux/iversion.h
+index 2917ef990d435..3bfebde5a1a6d 100644
+--- a/include/linux/iversion.h
++++ b/include/linux/iversion.h
+@@ -328,6 +328,19 @@ inode_query_iversion(struct inode *inode)
+ return cur >> I_VERSION_QUERIED_SHIFT;
+ }
+
++/*
++ * For filesystems without any sort of change attribute, the best we can
++ * do is fake one up from the ctime:
++ */
++static inline u64 time_to_chattr(struct timespec64 *t)
++{
++ u64 chattr = t->tv_sec;
++
++ chattr <<= 32;
++ chattr += t->tv_nsec;
++ return chattr;
++}
++
+ /**
+ * inode_eq_iversion_raw - check whether the raw i_version counter has changed
+ * @inode: inode to check
+diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
+index 481273f0c72d4..465060acc9816 100644
+--- a/include/linux/kallsyms.h
++++ b/include/linux/kallsyms.h
+@@ -71,15 +71,14 @@ static inline void *dereference_symbol_descriptor(void *ptr)
+ return ptr;
+ }
+
+-#ifdef CONFIG_KALLSYMS
+-/* Lookup the address for a symbol. Returns 0 if not found. */
+-unsigned long kallsyms_lookup_name(const char *name);
+-
+-/* Call a function on each kallsyms symbol in the core kernel */
+ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+ unsigned long),
+ void *data);
+
++#ifdef CONFIG_KALLSYMS
++/* Lookup the address for a symbol. Returns 0 if not found. */
++unsigned long kallsyms_lookup_name(const char *name);
++
+ extern int kallsyms_lookup_size_offset(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset);
+@@ -108,14 +107,6 @@ static inline unsigned long kallsyms_lookup_name(const char *name)
+ return 0;
+ }
+
+-static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+- struct module *,
+- unsigned long),
+- void *data)
+-{
+- return 0;
+-}
+-
+ static inline int kallsyms_lookup_size_offset(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset)
+diff --git a/include/linux/kthread.h b/include/linux/kthread.h
+index 2484ed97e72f5..9dae77a97a033 100644
+--- a/include/linux/kthread.h
++++ b/include/linux/kthread.h
+@@ -68,6 +68,7 @@ void *kthread_probe_data(struct task_struct *k);
+ int kthread_park(struct task_struct *k);
+ void kthread_unpark(struct task_struct *k);
+ void kthread_parkme(void);
++void kthread_exit(long result) __noreturn;
+
+ int kthreadd(void *unused);
+ extern struct task_struct *kthreadd_task;
+diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h
+index 0520c0cd73f42..3bc9f7410e213 100644
+--- a/include/linux/lockd/bind.h
++++ b/include/linux/lockd/bind.h
+@@ -27,7 +27,8 @@ struct rpc_task;
+ struct nlmsvc_binding {
+ __be32 (*fopen)(struct svc_rqst *,
+ struct nfs_fh *,
+- struct file **);
++ struct file **,
++ int mode);
+ void (*fclose)(struct file *);
+ };
+
+diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h
+index 666f5f310a041..70ce419e27093 100644
+--- a/include/linux/lockd/lockd.h
++++ b/include/linux/lockd/lockd.h
+@@ -10,6 +10,8 @@
+ #ifndef LINUX_LOCKD_LOCKD_H
+ #define LINUX_LOCKD_LOCKD_H
+
++/* XXX: a lot of this should really be under fs/lockd. */
++
+ #include <linux/in.h>
+ #include <linux/in6.h>
+ #include <net/ipv6.h>
+@@ -154,7 +156,8 @@ struct nlm_rqst {
+ struct nlm_file {
+ struct hlist_node f_list; /* linked list */
+ struct nfs_fh f_handle; /* NFS file handle */
+- struct file * f_file; /* VFS file pointer */
++ struct file * f_file[2]; /* VFS file pointers,
++ indexed by O_ flags */
+ struct nlm_share * f_shares; /* DOS shares */
+ struct list_head f_blocks; /* blocked locks */
+ unsigned int f_locks; /* guesstimate # of locks */
+@@ -267,6 +270,7 @@ typedef int (*nlm_host_match_fn_t)(void *cur, struct nlm_host *ref);
+ /*
+ * Server-side lock handling
+ */
++int lock_to_openmode(struct file_lock *);
+ __be32 nlmsvc_lock(struct svc_rqst *, struct nlm_file *,
+ struct nlm_host *, struct nlm_lock *, int,
+ struct nlm_cookie *, int);
+@@ -286,8 +290,9 @@ void nlmsvc_locks_init_private(struct file_lock *, struct nlm_host *, pid_t);
+ * File handling for the server personality
+ */
+ __be32 nlm_lookup_file(struct svc_rqst *, struct nlm_file **,
+- struct nfs_fh *);
++ struct nlm_lock *);
+ void nlm_release_file(struct nlm_file *);
++void nlmsvc_put_lockowner(struct nlm_lockowner *);
+ void nlmsvc_release_lockowner(struct nlm_lock *);
+ void nlmsvc_mark_resources(struct net *);
+ void nlmsvc_free_host_resources(struct nlm_host *);
+@@ -299,9 +304,15 @@ void nlmsvc_invalidate_all(void);
+ int nlmsvc_unlock_all_by_sb(struct super_block *sb);
+ int nlmsvc_unlock_all_by_ip(struct sockaddr *server_addr);
+
++static inline struct file *nlmsvc_file_file(struct nlm_file *file)
++{
++ return file->f_file[O_RDONLY] ?
++ file->f_file[O_RDONLY] : file->f_file[O_WRONLY];
++}
++
+ static inline struct inode *nlmsvc_file_inode(struct nlm_file *file)
+ {
+- return locks_inode(file->f_file);
++ return locks_inode(nlmsvc_file_file(file));
+ }
+
+ static inline int __nlm_privileged_request4(const struct sockaddr *sap)
+diff --git a/include/linux/lockd/xdr.h b/include/linux/lockd/xdr.h
+index 7ab9f264313f0..67e4a2c5500bd 100644
+--- a/include/linux/lockd/xdr.h
++++ b/include/linux/lockd/xdr.h
+@@ -41,6 +41,8 @@ struct nlm_lock {
+ struct nfs_fh fh;
+ struct xdr_netobj oh;
+ u32 svid;
++ u64 lock_start;
++ u64 lock_len;
+ struct file_lock fl;
+ };
+
+@@ -96,24 +98,19 @@ struct nlm_reboot {
+ */
+ #define NLMSVC_XDRSIZE sizeof(struct nlm_args)
+
+-int nlmsvc_decode_testargs(struct svc_rqst *, __be32 *);
+-int nlmsvc_encode_testres(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_lockargs(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_cancargs(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_unlockargs(struct svc_rqst *, __be32 *);
+-int nlmsvc_encode_res(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_res(struct svc_rqst *, __be32 *);
+-int nlmsvc_encode_void(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_void(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_shareargs(struct svc_rqst *, __be32 *);
+-int nlmsvc_encode_shareres(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_notify(struct svc_rqst *, __be32 *);
+-int nlmsvc_decode_reboot(struct svc_rqst *, __be32 *);
+-/*
+-int nlmclt_encode_testargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+-int nlmclt_encode_lockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+-int nlmclt_encode_cancargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+-int nlmclt_encode_unlockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+- */
++bool nlmsvc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++
++bool nlmsvc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlmsvc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
+ #endif /* LOCKD_XDR_H */
+diff --git a/include/linux/lockd/xdr4.h b/include/linux/lockd/xdr4.h
+index e709fe5924f2b..72831e35dca32 100644
+--- a/include/linux/lockd/xdr4.h
++++ b/include/linux/lockd/xdr4.h
+@@ -22,27 +22,22 @@
+ #define nlm4_fbig cpu_to_be32(NLM_FBIG)
+ #define nlm4_failed cpu_to_be32(NLM_FAILED)
+
++void nlm4svc_set_file_lock_range(struct file_lock *fl, u64 off, u64 len);
++bool nlm4svc_decode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_reboot(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_shareargs(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_decode_notify(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
++bool nlm4svc_encode_testres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_encode_res(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_encode_void(struct svc_rqst *rqstp, struct xdr_stream *xdr);
++bool nlm4svc_encode_shareres(struct svc_rqst *rqstp, struct xdr_stream *xdr);
+
+-int nlm4svc_decode_testargs(struct svc_rqst *, __be32 *);
+-int nlm4svc_encode_testres(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_lockargs(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_cancargs(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_unlockargs(struct svc_rqst *, __be32 *);
+-int nlm4svc_encode_res(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_res(struct svc_rqst *, __be32 *);
+-int nlm4svc_encode_void(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_void(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_shareargs(struct svc_rqst *, __be32 *);
+-int nlm4svc_encode_shareres(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_notify(struct svc_rqst *, __be32 *);
+-int nlm4svc_decode_reboot(struct svc_rqst *, __be32 *);
+-/*
+-int nlmclt_encode_testargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+-int nlmclt_encode_lockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+-int nlmclt_encode_cancargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+-int nlmclt_encode_unlockargs(struct rpc_rqst *, u32 *, struct nlm_args *);
+- */
+ extern const struct rpc_version nlm_version4;
+
+ #endif /* LOCKD_XDR4_H */
+diff --git a/include/linux/module.h b/include/linux/module.h
+index 6264617bab4d4..a55a40c28568e 100644
+--- a/include/linux/module.h
++++ b/include/linux/module.h
+@@ -582,7 +582,7 @@ static inline bool within_module(unsigned long addr, const struct module *mod)
+ return within_module_init(addr, mod) || within_module_core(addr, mod);
+ }
+
+-/* Search for module by name: must hold module_mutex. */
++/* Search for module by name: must be in a RCU-sched critical section. */
+ struct module *find_module(const char *name);
+
+ struct symsearch {
+@@ -604,13 +604,9 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+ /* Look for this name: can be of form module:name. */
+ unsigned long module_kallsyms_lookup_name(const char *name);
+
+-int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+- struct module *, unsigned long),
+- void *data);
+-
+-extern void __noreturn __module_put_and_exit(struct module *mod,
++extern void __noreturn __module_put_and_kthread_exit(struct module *mod,
+ long code);
+-#define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code)
++#define module_put_and_kthread_exit(code) __module_put_and_kthread_exit(THIS_MODULE, code)
+
+ #ifdef CONFIG_MODULE_UNLOAD
+ int module_refcount(struct module *mod);
+@@ -791,14 +787,6 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
+ return 0;
+ }
+
+-static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+- struct module *,
+- unsigned long),
+- void *data)
+-{
+- return 0;
+-}
+-
+ static inline int register_module_notifier(struct notifier_block *nb)
+ {
+ /* no events will happen anyway, so this can always succeed */
+@@ -810,7 +798,7 @@ static inline int unregister_module_notifier(struct notifier_block *nb)
+ return 0;
+ }
+
+-#define module_put_and_exit(code) do_exit(code)
++#define module_put_and_kthread_exit(code) kthread_exit(code)
+
+ static inline void print_modules(void)
+ {
+@@ -887,4 +875,8 @@ static inline bool module_sig_ok(struct module *module)
+ }
+ #endif /* CONFIG_MODULE_SIG */
+
++int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
++ struct module *, unsigned long),
++ void *data);
++
+ #endif /* _LINUX_MODULE_H */
+diff --git a/include/linux/nfs.h b/include/linux/nfs.h
+index 0dc7ad38a0da4..b06375e88e589 100644
+--- a/include/linux/nfs.h
++++ b/include/linux/nfs.h
+@@ -36,14 +36,6 @@ static inline void nfs_copy_fh(struct nfs_fh *target, const struct nfs_fh *sourc
+ memcpy(target->data, source->data, source->size);
+ }
+
+-
+-/*
+- * This is really a general kernel constant, but since nothing like
+- * this is defined in the kernel headers, I have to do it here.
+- */
+-#define NFS_OFFSET_MAX ((__s64)((~(__u64)0) >> 1))
+-
+-
+ enum nfs3_stable_how {
+ NFS_UNSTABLE = 0,
+ NFS_DATA_SYNC = 1,
+diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
+index 9dc7eeac924f0..ea88d0f462c9d 100644
+--- a/include/linux/nfs4.h
++++ b/include/linux/nfs4.h
+@@ -385,13 +385,6 @@ enum lock_type4 {
+ NFS4_WRITEW_LT = 4
+ };
+
+-enum change_attr_type4 {
+- NFS4_CHANGE_TYPE_IS_MONOTONIC_INCR = 0,
+- NFS4_CHANGE_TYPE_IS_VERSION_COUNTER = 1,
+- NFS4_CHANGE_TYPE_IS_VERSION_COUNTER_NOPNFS = 2,
+- NFS4_CHANGE_TYPE_IS_TIME_METADATA = 3,
+- NFS4_CHANGE_TYPE_IS_UNDEFINED = 4
+-};
+
+ /* Mandatory Attributes */
+ #define FATTR4_WORD0_SUPPORTED_ATTRS (1UL << 0)
+@@ -459,7 +452,6 @@ enum change_attr_type4 {
+ #define FATTR4_WORD2_LAYOUT_BLKSIZE (1UL << 1)
+ #define FATTR4_WORD2_MDSTHRESHOLD (1UL << 4)
+ #define FATTR4_WORD2_CLONE_BLKSIZE (1UL << 13)
+-#define FATTR4_WORD2_CHANGE_ATTR_TYPE (1UL << 15)
+ #define FATTR4_WORD2_SECURITY_LABEL (1UL << 16)
+ #define FATTR4_WORD2_MODE_UMASK (1UL << 17)
+ #define FATTR4_WORD2_XATTR_SUPPORT (1UL << 18)
+@@ -725,4 +717,17 @@ enum nfs4_setxattr_options {
+ SETXATTR4_CREATE = 1,
+ SETXATTR4_REPLACE = 2,
+ };
++
++enum {
++ RCA4_TYPE_MASK_RDATA_DLG = 0,
++ RCA4_TYPE_MASK_WDATA_DLG = 1,
++ RCA4_TYPE_MASK_DIR_DLG = 2,
++ RCA4_TYPE_MASK_FILE_LAYOUT = 3,
++ RCA4_TYPE_MASK_BLK_LAYOUT = 4,
++ RCA4_TYPE_MASK_OBJ_LAYOUT_MIN = 8,
++ RCA4_TYPE_MASK_OBJ_LAYOUT_MAX = 9,
++ RCA4_TYPE_MASK_OTHER_LAYOUT_MIN = 12,
++ RCA4_TYPE_MASK_OTHER_LAYOUT_MAX = 15,
++};
++
+ #endif
+diff --git a/include/linux/nfs_ssc.h b/include/linux/nfs_ssc.h
+index f5ba0fbff72fe..22265b1ff0800 100644
+--- a/include/linux/nfs_ssc.h
++++ b/include/linux/nfs_ssc.h
+@@ -8,6 +8,7 @@
+ */
+
+ #include <linux/nfs_fs.h>
++#include <linux/sunrpc/svc.h>
+
+ extern struct nfs_ssc_client_ops_tbl nfs_ssc_client_tbl;
+
+@@ -54,6 +55,19 @@ static inline void nfs42_ssc_close(struct file *filep)
+ }
+ #endif
+
++struct nfsd4_ssc_umount_item {
++ struct list_head nsui_list;
++ bool nsui_busy;
++ /*
++ * nsui_refcnt inited to 2, 1 on list and 1 for consumer. Entry
++ * is removed when refcnt drops to 1 and nsui_expire expires.
++ */
++ refcount_t nsui_refcnt;
++ unsigned long nsui_expire;
++ struct vfsmount *nsui_vfsmount;
++ char nsui_ipaddr[RPC_MAX_ADDRBUFLEN + 1];
++};
++
+ /*
+ * NFS_FS
+ */
+diff --git a/include/linux/nfsacl.h b/include/linux/nfsacl.h
+index 103d446953234..8e76a79cdc6ae 100644
+--- a/include/linux/nfsacl.h
++++ b/include/linux/nfsacl.h
+@@ -38,5 +38,11 @@ nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode,
+ extern int
+ nfsacl_decode(struct xdr_buf *buf, unsigned int base, unsigned int *aclcnt,
+ struct posix_acl **pacl);
++extern bool
++nfs_stream_decode_acl(struct xdr_stream *xdr, unsigned int *aclcnt,
++ struct posix_acl **pacl);
++extern bool
++nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode,
++ struct posix_acl *acl, int encode_entries, int typeflag);
+
+ #endif /* __LINUX_NFSACL_H */
+diff --git a/include/linux/pid.h b/include/linux/pid.h
+index fa10acb8d6a42..af308e15f174c 100644
+--- a/include/linux/pid.h
++++ b/include/linux/pid.h
+@@ -78,6 +78,7 @@ struct file;
+
+ extern struct pid *pidfd_pid(const struct file *file);
+ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags);
++int pidfd_create(struct pid *pid, unsigned int flags);
+
+ static inline struct pid *get_pid(struct pid *pid)
+ {
+diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h
+index a8ec3b6093fcb..3632c5d6ec559 100644
+--- a/include/linux/sched/user.h
++++ b/include/linux/sched/user.h
+@@ -14,9 +14,6 @@ struct user_struct {
+ refcount_t __count; /* reference count */
+ atomic_t processes; /* How many processes does this user have? */
+ atomic_t sigpending; /* How many pending signals does this user have? */
+-#ifdef CONFIG_FANOTIFY
+- atomic_t fanotify_listeners;
+-#endif
+ #ifdef CONFIG_EPOLL
+ atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
+ #endif
+diff --git a/include/linux/sunrpc/msg_prot.h b/include/linux/sunrpc/msg_prot.h
+index 43f854487539b..938c2bf29db88 100644
+--- a/include/linux/sunrpc/msg_prot.h
++++ b/include/linux/sunrpc/msg_prot.h
+@@ -10,9 +10,6 @@
+
+ #define RPC_VERSION 2
+
+-/* size of an XDR encoding unit in bytes, i.e. 32bit */
+-#define XDR_UNIT (4)
+-
+ /* spec defines authentication flavor as an unsigned 32 bit integer */
+ typedef u32 rpc_authflavor_t;
+
+diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
+index 386628b36bc75..1cf7a7799cc04 100644
+--- a/include/linux/sunrpc/svc.h
++++ b/include/linux/sunrpc/svc.h
+@@ -19,6 +19,7 @@
+ #include <linux/sunrpc/svcauth.h>
+ #include <linux/wait.h>
+ #include <linux/mm.h>
++#include <linux/pagevec.h>
+
+ /* statistics for svc_pool structures */
+ struct svc_pool_stats {
+@@ -51,25 +52,6 @@ struct svc_pool {
+ unsigned long sp_flags;
+ } ____cacheline_aligned_in_smp;
+
+-struct svc_serv;
+-
+-struct svc_serv_ops {
+- /* Callback to use when last thread exits. */
+- void (*svo_shutdown)(struct svc_serv *, struct net *);
+-
+- /* function for service threads to run */
+- int (*svo_function)(void *);
+-
+- /* queue up a transport for servicing */
+- void (*svo_enqueue_xprt)(struct svc_xprt *);
+-
+- /* set up thread (or whatever) execution context */
+- int (*svo_setup)(struct svc_serv *, struct svc_pool *, int);
+-
+- /* optional module to count when adding threads (pooled svcs only) */
+- struct module *svo_module;
+-};
+-
+ /*
+ * RPC service.
+ *
+@@ -84,6 +66,7 @@ struct svc_serv {
+ struct svc_program * sv_program; /* RPC program */
+ struct svc_stat * sv_stats; /* RPC statistics */
+ spinlock_t sv_lock;
++ struct kref sv_refcnt;
+ unsigned int sv_nrthreads; /* # of server threads */
+ unsigned int sv_maxconn; /* max connections allowed or
+ * '0' causing max to be based
+@@ -101,7 +84,8 @@ struct svc_serv {
+
+ unsigned int sv_nrpools; /* number of thread pools */
+ struct svc_pool * sv_pools; /* array of thread pools */
+- const struct svc_serv_ops *sv_ops; /* server operations */
++ int (*sv_threadfn)(void *data);
++
+ #if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ struct list_head sv_cb_list; /* queue for callback requests
+ * that arrive over the same
+@@ -113,15 +97,30 @@ struct svc_serv {
+ #endif /* CONFIG_SUNRPC_BACKCHANNEL */
+ };
+
+-/*
+- * We use sv_nrthreads as a reference count. svc_destroy() drops
+- * this refcount, so we need to bump it up around operations that
+- * change the number of threads. Horrible, but there it is.
+- * Should be called with the "service mutex" held.
++/**
++ * svc_get() - increment reference count on a SUNRPC serv
++ * @serv: the svc_serv to have count incremented
++ *
++ * Returns: the svc_serv that was passed in.
+ */
+-static inline void svc_get(struct svc_serv *serv)
++static inline struct svc_serv *svc_get(struct svc_serv *serv)
+ {
+- serv->sv_nrthreads++;
++ kref_get(&serv->sv_refcnt);
++ return serv;
++}
++
++void svc_destroy(struct kref *);
++
++/**
++ * svc_put - decrement reference count on a SUNRPC serv
++ * @serv: the svc_serv to have count decremented
++ *
++ * When the reference count reaches zero, svc_destroy()
++ * is called to clean up and free the serv.
++ */
++static inline void svc_put(struct svc_serv *serv)
++{
++ kref_put(&serv->sv_refcnt, svc_destroy);
+ }
+
+ /*
+@@ -247,12 +246,16 @@ struct svc_rqst {
+
+ size_t rq_xprt_hlen; /* xprt header len */
+ struct xdr_buf rq_arg;
++ struct xdr_stream rq_arg_stream;
++ struct xdr_stream rq_res_stream;
++ struct page *rq_scratch_page;
+ struct xdr_buf rq_res;
+ struct page *rq_pages[RPCSVC_MAXPAGES + 1];
+ struct page * *rq_respages; /* points into rq_pages */
+ struct page * *rq_next_page; /* next reply page to use */
+ struct page * *rq_page_end; /* one past the last page */
+
++ struct pagevec rq_pvec;
+ struct kvec rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */
+ struct bio_vec rq_bvec[RPCSVC_MAXPAGES];
+
+@@ -272,13 +275,13 @@ struct svc_rqst {
+ #define RQ_VICTIM (5) /* about to be shut down */
+ #define RQ_BUSY (6) /* request is busy */
+ #define RQ_DATA (7) /* request has data */
+-#define RQ_AUTHERR (8) /* Request status is auth error */
+ unsigned long rq_flags; /* flags field */
+ ktime_t rq_qtime; /* enqueue time */
+
+ void * rq_argp; /* decoded arguments */
+ void * rq_resp; /* xdr'd results */
+ void * rq_auth_data; /* flavor-specific data */
++ __be32 rq_auth_stat; /* authentication status */
+ int rq_auth_slack; /* extra space xdr code
+ * should leave in head
+ * for krb5i, krb5p.
+@@ -452,40 +455,21 @@ struct svc_procedure {
+ /* process the request: */
+ __be32 (*pc_func)(struct svc_rqst *);
+ /* XDR decode args: */
+- int (*pc_decode)(struct svc_rqst *, __be32 *data);
++ bool (*pc_decode)(struct svc_rqst *rqstp,
++ struct xdr_stream *xdr);
+ /* XDR encode result: */
+- int (*pc_encode)(struct svc_rqst *, __be32 *data);
++ bool (*pc_encode)(struct svc_rqst *rqstp,
++ struct xdr_stream *xdr);
+ /* XDR free result: */
+ void (*pc_release)(struct svc_rqst *);
+ unsigned int pc_argsize; /* argument struct size */
++ unsigned int pc_argzero; /* how much of argument to clear */
+ unsigned int pc_ressize; /* result struct size */
+ unsigned int pc_cachetype; /* cache info (NFS) */
+ unsigned int pc_xdrressize; /* maximum size of XDR reply */
++ const char * pc_name; /* for display */
+ };
+
+-/*
+- * Mode for mapping cpus to pools.
+- */
+-enum {
+- SVC_POOL_AUTO = -1, /* choose one of the others */
+- SVC_POOL_GLOBAL, /* no mapping, just a single global pool
+- * (legacy & UP mode) */
+- SVC_POOL_PERCPU, /* one pool per cpu */
+- SVC_POOL_PERNODE /* one pool per numa node */
+-};
+-
+-struct svc_pool_map {
+- int count; /* How many svc_servs use us */
+- int mode; /* Note: int not enum to avoid
+- * warnings about "enumeration value
+- * not handled in switch" */
+- unsigned int npools;
+- unsigned int *pool_to; /* maps pool id to cpu or node */
+- unsigned int *to_pool; /* maps cpu or node to pool id */
+-};
+-
+-extern struct svc_pool_map svc_pool_map;
+-
+ /*
+ * Function prototypes.
+ */
+@@ -493,22 +477,17 @@ int svc_rpcb_setup(struct svc_serv *serv, struct net *net);
+ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
+ int svc_bind(struct svc_serv *serv, struct net *net);
+ struct svc_serv *svc_create(struct svc_program *, unsigned int,
+- const struct svc_serv_ops *);
++ int (*threadfn)(void *data));
+ struct svc_rqst *svc_rqst_alloc(struct svc_serv *serv,
+ struct svc_pool *pool, int node);
+-struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
+- struct svc_pool *pool, int node);
++void svc_rqst_replace_page(struct svc_rqst *rqstp,
++ struct page *page);
+ void svc_rqst_free(struct svc_rqst *);
+ void svc_exit_thread(struct svc_rqst *);
+-unsigned int svc_pool_map_get(void);
+-void svc_pool_map_put(void);
+ struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
+- const struct svc_serv_ops *);
++ int (*threadfn)(void *data));
+ int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
+-int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int);
+ int svc_pool_stats_open(struct svc_serv *serv, struct file *file);
+-void svc_destroy(struct svc_serv *);
+-void svc_shutdown_net(struct svc_serv *, struct net *);
+ int svc_process(struct svc_rqst *);
+ int bc_svc_process(struct svc_serv *, struct rpc_rqst *,
+ struct svc_rqst *);
+@@ -519,16 +498,14 @@ void svc_wake_up(struct svc_serv *);
+ void svc_reserve(struct svc_rqst *rqstp, int space);
+ struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu);
+ char * svc_print_addr(struct svc_rqst *, char *, size_t);
+-int svc_encode_read_payload(struct svc_rqst *rqstp,
+- unsigned int offset,
+- unsigned int length);
++int svc_encode_result_payload(struct svc_rqst *rqstp,
++ unsigned int offset,
++ unsigned int length);
+ unsigned int svc_fill_write_vector(struct svc_rqst *rqstp,
+- struct page **pages,
+- struct kvec *first, size_t total);
++ struct xdr_buf *payload);
+ char *svc_fill_symlink_pathname(struct svc_rqst *rqstp,
+ struct kvec *first, void *p,
+ size_t total);
+-__be32 svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err);
+ __be32 svc_generic_init_request(struct svc_rqst *rqstp,
+ const struct svc_program *progp,
+ struct svc_process_info *procinfo);
+@@ -557,4 +534,42 @@ static inline void svc_reserve_auth(struct svc_rqst *rqstp, int space)
+ svc_reserve(rqstp, space + rqstp->rq_auth_slack);
+ }
+
++/**
++ * svcxdr_init_decode - Prepare an xdr_stream for svc Call decoding
++ * @rqstp: controlling server RPC transaction context
++ *
++ */
++static inline void svcxdr_init_decode(struct svc_rqst *rqstp)
++{
++ struct xdr_stream *xdr = &rqstp->rq_arg_stream;
++ struct kvec *argv = rqstp->rq_arg.head;
++
++ xdr_init_decode(xdr, &rqstp->rq_arg, argv->iov_base, NULL);
++ xdr_set_scratch_page(xdr, rqstp->rq_scratch_page);
++}
++
++/**
++ * svcxdr_init_encode - Prepare an xdr_stream for svc Reply encoding
++ * @rqstp: controlling server RPC transaction context
++ *
++ */
++static inline void svcxdr_init_encode(struct svc_rqst *rqstp)
++{
++ struct xdr_stream *xdr = &rqstp->rq_res_stream;
++ struct xdr_buf *buf = &rqstp->rq_res;
++ struct kvec *resv = buf->head;
++
++ xdr_reset_scratch_buffer(xdr);
++
++ xdr->buf = buf;
++ xdr->iov = resv;
++ xdr->p = resv->iov_base + resv->iov_len;
++ xdr->end = resv->iov_base + PAGE_SIZE - rqstp->rq_auth_slack;
++ buf->len = resv->iov_len;
++ xdr->page_ptr = buf->pages - 1;
++ buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages);
++ buf->buflen -= rqstp->rq_auth_slack;
++ xdr->rqst = NULL;
++}
++
+ #endif /* SUNRPC_SVC_H */
+diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
+index 9dc3a3b88391b..2b870a3f391b1 100644
+--- a/include/linux/sunrpc/svc_rdma.h
++++ b/include/linux/sunrpc/svc_rdma.h
+@@ -207,8 +207,8 @@ extern void svc_rdma_send_error_msg(struct svcxprt_rdma *rdma,
+ struct svc_rdma_recv_ctxt *rctxt,
+ int status);
+ extern int svc_rdma_sendto(struct svc_rqst *);
+-extern int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+- unsigned int length);
++extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
++ unsigned int length);
+
+ /* svc_rdma_transport.c */
+ extern struct svc_xprt_class svc_rdma_class;
+diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
+index aca35ab5cff24..dbffb92511ef2 100644
+--- a/include/linux/sunrpc/svc_xprt.h
++++ b/include/linux/sunrpc/svc_xprt.h
+@@ -21,8 +21,8 @@ struct svc_xprt_ops {
+ int (*xpo_has_wspace)(struct svc_xprt *);
+ int (*xpo_recvfrom)(struct svc_rqst *);
+ int (*xpo_sendto)(struct svc_rqst *);
+- int (*xpo_read_payload)(struct svc_rqst *, unsigned int,
+- unsigned int);
++ int (*xpo_result_payload)(struct svc_rqst *, unsigned int,
++ unsigned int);
+ void (*xpo_release_rqst)(struct svc_rqst *);
+ void (*xpo_detach)(struct svc_xprt *);
+ void (*xpo_free)(struct svc_xprt *);
+@@ -127,14 +127,16 @@ int svc_reg_xprt_class(struct svc_xprt_class *);
+ void svc_unreg_xprt_class(struct svc_xprt_class *);
+ void svc_xprt_init(struct net *, struct svc_xprt_class *, struct svc_xprt *,
+ struct svc_serv *);
+-int svc_create_xprt(struct svc_serv *, const char *, struct net *,
+- const int, const unsigned short, int,
+- const struct cred *);
+-void svc_xprt_do_enqueue(struct svc_xprt *xprt);
++int svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
++ struct net *net, const int family,
++ const unsigned short port, int flags,
++ const struct cred *cred);
++void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net);
++void svc_xprt_received(struct svc_xprt *xprt);
+ void svc_xprt_enqueue(struct svc_xprt *xprt);
+ void svc_xprt_put(struct svc_xprt *xprt);
+ void svc_xprt_copy_addrs(struct svc_rqst *rqstp, struct svc_xprt *xprt);
+-void svc_close_xprt(struct svc_xprt *xprt);
++void svc_xprt_close(struct svc_xprt *xprt);
+ int svc_port_is_privileged(struct sockaddr *sin);
+ int svc_print_xprts(char *buf, int maxlen);
+ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
+diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h
+index b0003866a2497..6d9cc9080aca7 100644
+--- a/include/linux/sunrpc/svcauth.h
++++ b/include/linux/sunrpc/svcauth.h
+@@ -127,7 +127,7 @@ struct auth_ops {
+ char * name;
+ struct module *owner;
+ int flavour;
+- int (*accept)(struct svc_rqst *rq, __be32 *authp);
++ int (*accept)(struct svc_rqst *rq);
+ int (*release)(struct svc_rqst *rq);
+ void (*domain_release)(struct auth_domain *);
+ int (*set_client)(struct svc_rqst *rq);
+@@ -149,7 +149,7 @@ struct auth_ops {
+
+ struct svc_xprt;
+
+-extern int svc_authenticate(struct svc_rqst *rqstp, __be32 *authp);
++extern int svc_authenticate(struct svc_rqst *rqstp);
+ extern int svc_authorise(struct svc_rqst *rqstp);
+ extern int svc_set_client(struct svc_rqst *rqstp);
+ extern int svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops);
+diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h
+index b7ac7fe683067..a366d3eb05315 100644
+--- a/include/linux/sunrpc/svcsock.h
++++ b/include/linux/sunrpc/svcsock.h
+@@ -57,10 +57,9 @@ int svc_recv(struct svc_rqst *, long);
+ int svc_send(struct svc_rqst *);
+ void svc_drop(struct svc_rqst *);
+ void svc_sock_update_bufs(struct svc_serv *serv);
+-bool svc_alien_sock(struct net *net, int fd);
+-int svc_addsock(struct svc_serv *serv, const int fd,
+- char *name_return, const size_t len,
+- const struct cred *cred);
++int svc_addsock(struct svc_serv *serv, struct net *net,
++ const int fd, char *name_return, const size_t len,
++ const struct cred *cred);
+ void svc_init_xprt_sock(void);
+ void svc_cleanup_xprt_sock(void);
+ struct svc_xprt *svc_sock_create(struct svc_serv *serv, int prot);
+diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
+index 6d9d1520612b8..c1c50eaae4726 100644
+--- a/include/linux/sunrpc/xdr.h
++++ b/include/linux/sunrpc/xdr.h
+@@ -19,6 +19,13 @@
+ struct bio_vec;
+ struct rpc_rqst;
+
++/*
++ * Size of an XDR encoding unit in bytes, i.e. 32 bits,
++ * as defined in Section 3 of RFC 4506. All encoded
++ * XDR data items are aligned on a boundary of 32 bits.
++ */
++#define XDR_UNIT sizeof(__be32)
++
+ /*
+ * Buffer adjustment
+ */
+@@ -232,10 +239,12 @@ typedef int (*kxdrdproc_t)(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
+
+ extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf,
+ __be32 *p, struct rpc_rqst *rqst);
++extern void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
++ struct page **pages, struct rpc_rqst *rqst);
+ extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
+ extern int xdr_reserve_space_vec(struct xdr_stream *xdr, struct kvec *vec,
+ size_t nbytes);
+-extern void xdr_commit_encode(struct xdr_stream *xdr);
++extern void __xdr_commit_encode(struct xdr_stream *xdr);
+ extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len);
+ extern int xdr_restrict_buflen(struct xdr_stream *xdr, int newbuflen);
+ extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
+@@ -246,13 +255,71 @@ extern void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf,
+ __be32 *p, struct rpc_rqst *rqst);
+ extern void xdr_init_decode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
+ struct page **pages, unsigned int len);
+-extern void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen);
+ extern __be32 *xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes);
+ extern unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len);
+ extern void xdr_enter_page(struct xdr_stream *xdr, unsigned int len);
+ extern int xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len, int (*actor)(struct scatterlist *, void *), void *data);
+ extern uint64_t xdr_align_data(struct xdr_stream *, uint64_t, uint32_t);
+ extern uint64_t xdr_expand_hole(struct xdr_stream *, uint64_t, uint64_t);
++extern bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
++ unsigned int len);
++
++/**
++ * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
++ * @xdr: pointer to xdr_stream struct
++ * @buf: pointer to an empty buffer
++ * @buflen: size of 'buf'
++ *
++ * The scratch buffer is used when decoding from an array of pages.
++ * If an xdr_inline_decode() call spans across page boundaries, then
++ * we copy the data into the scratch buffer in order to allow linear
++ * access.
++ */
++static inline void
++xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
++{
++ xdr->scratch.iov_base = buf;
++ xdr->scratch.iov_len = buflen;
++}
++
++/**
++ * xdr_set_scratch_page - Attach a scratch buffer for decoding data
++ * @xdr: pointer to xdr_stream struct
++ * @page: an anonymous page
++ *
++ * See xdr_set_scratch_buffer().
++ */
++static inline void
++xdr_set_scratch_page(struct xdr_stream *xdr, struct page *page)
++{
++ xdr_set_scratch_buffer(xdr, page_address(page), PAGE_SIZE);
++}
++
++/**
++ * xdr_reset_scratch_buffer - Clear scratch buffer information
++ * @xdr: pointer to xdr_stream struct
++ *
++ * See xdr_set_scratch_buffer().
++ */
++static inline void
++xdr_reset_scratch_buffer(struct xdr_stream *xdr)
++{
++ xdr_set_scratch_buffer(xdr, NULL, 0);
++}
++
++/**
++ * xdr_commit_encode - Ensure all data is written to xdr->buf
++ * @xdr: pointer to xdr_stream
++ *
++ * Handle encoding across page boundaries by giving the caller a
++ * temporary location to write to, then later copying the data into
++ * place. __xdr_commit_encode() does that copying.
++ */
++static inline void xdr_commit_encode(struct xdr_stream *xdr)
++{
++ if (unlikely(xdr->scratch.iov_len))
++ __xdr_commit_encode(xdr);
++}
+
+ /**
+ * xdr_stream_remaining - Return the number of bytes remaining in the stream
+@@ -285,7 +352,7 @@ ssize_t xdr_stream_decode_string_dup(struct xdr_stream *xdr, char **str,
+ static inline size_t
+ xdr_align_size(size_t n)
+ {
+- const size_t mask = sizeof(__u32) - 1;
++ const size_t mask = XDR_UNIT - 1;
+
+ return (n + mask) & ~mask;
+ }
+@@ -315,7 +382,7 @@ static inline size_t xdr_pad_size(size_t n)
+ */
+ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr)
+ {
+- const size_t len = sizeof(__be32);
++ const size_t len = XDR_UNIT;
+ __be32 *p = xdr_reserve_space(xdr, len);
+
+ if (unlikely(!p))
+@@ -334,7 +401,7 @@ static inline ssize_t xdr_stream_encode_item_present(struct xdr_stream *xdr)
+ */
+ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr)
+ {
+- const size_t len = sizeof(__be32);
++ const size_t len = XDR_UNIT;
+ __be32 *p = xdr_reserve_space(xdr, len);
+
+ if (unlikely(!p))
+@@ -343,6 +410,40 @@ static inline int xdr_stream_encode_item_absent(struct xdr_stream *xdr)
+ return len;
+ }
+
++/**
++ * xdr_encode_bool - Encode a boolean item
++ * @p: address in a buffer into which to encode
++ * @n: boolean value to encode
++ *
++ * Return value:
++ * Address of item following the encoded boolean
++ */
++static inline __be32 *xdr_encode_bool(__be32 *p, u32 n)
++{
++ *p++ = n ? xdr_one : xdr_zero;
++ return p;
++}
++
++/**
++ * xdr_stream_encode_bool - Encode a boolean item
++ * @xdr: pointer to xdr_stream
++ * @n: boolean value to encode
++ *
++ * Return values:
++ * On success, returns length in bytes of XDR buffer consumed
++ * %-EMSGSIZE on XDR buffer overflow
++ */
++static inline int xdr_stream_encode_bool(struct xdr_stream *xdr, __u32 n)
++{
++ const size_t len = XDR_UNIT;
++ __be32 *p = xdr_reserve_space(xdr, len);
++
++ if (unlikely(!p))
++ return -EMSGSIZE;
++ xdr_encode_bool(p, n);
++ return len;
++}
++
+ /**
+ * xdr_stream_encode_u32 - Encode a 32-bit integer
+ * @xdr: pointer to xdr_stream
+@@ -504,6 +605,27 @@ static inline bool xdr_item_is_present(const __be32 *p)
+ return *p != xdr_zero;
+ }
+
++/**
++ * xdr_stream_decode_bool - Decode a boolean
++ * @xdr: pointer to xdr_stream
++ * @ptr: pointer to a u32 in which to store the result
++ *
++ * Return values:
++ * %0 on success
++ * %-EBADMSG on XDR buffer overflow
++ */
++static inline ssize_t
++xdr_stream_decode_bool(struct xdr_stream *xdr, __u32 *ptr)
++{
++ const size_t count = sizeof(*ptr);
++ __be32 *p = xdr_inline_decode(xdr, count);
++
++ if (unlikely(!p))
++ return -EBADMSG;
++ *ptr = (*p != xdr_zero);
++ return 0;
++}
++
+ /**
+ * xdr_stream_decode_u32 - Decode a 32-bit integer
+ * @xdr: pointer to xdr_stream
+@@ -525,6 +647,27 @@ xdr_stream_decode_u32(struct xdr_stream *xdr, __u32 *ptr)
+ return 0;
+ }
+
++/**
++ * xdr_stream_decode_u64 - Decode a 64-bit integer
++ * @xdr: pointer to xdr_stream
++ * @ptr: location to store 64-bit integer
++ *
++ * Return values:
++ * %0 on success
++ * %-EBADMSG on XDR buffer overflow
++ */
++static inline ssize_t
++xdr_stream_decode_u64(struct xdr_stream *xdr, __u64 *ptr)
++{
++ const size_t count = sizeof(*ptr);
++ __be32 *p = xdr_inline_decode(xdr, count);
++
++ if (unlikely(!p))
++ return -EBADMSG;
++ xdr_decode_hyper(p, ptr);
++ return 0;
++}
++
+ /**
+ * xdr_stream_decode_opaque_fixed - Decode fixed length opaque xdr data
+ * @xdr: pointer to xdr_stream
+diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
+index 17a24e1180dad..1ea422b1a9f1c 100644
+--- a/include/linux/syscalls.h
++++ b/include/linux/syscalls.h
+@@ -1320,18 +1320,6 @@ static inline long ksys_ftruncate(unsigned int fd, loff_t length)
+ return do_sys_ftruncate(fd, length, 1);
+ }
+
+-extern int __close_fd(struct files_struct *files, unsigned int fd);
+-
+-/*
+- * In contrast to sys_close(), this stub does not check whether the syscall
+- * should or should not be restarted, but returns the raw error codes from
+- * __close_fd().
+- */
+-static inline int ksys_close(unsigned int fd)
+-{
+- return __close_fd(current->files, fd);
+-}
+-
+ extern long do_sys_truncate(const char __user *pathname, loff_t length);
+
+ static inline long ksys_truncate(const char __user *pathname, loff_t length)
+diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
+index c202a72e16906..47cf70c8eb93c 100644
+--- a/include/linux/sysctl.h
++++ b/include/linux/sysctl.h
+@@ -55,6 +55,8 @@ typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+
+ int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *);
++int proc_dobool(struct ctl_table *table, int write, void *buffer,
++ size_t *lenp, loff_t *ppos);
+ int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *);
+ int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *);
+ int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *);
+diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
+index 7616c7bf4b241..0793951867aef 100644
+--- a/include/linux/user_namespace.h
++++ b/include/linux/user_namespace.h
+@@ -49,6 +49,10 @@ enum ucount_type {
+ #ifdef CONFIG_INOTIFY_USER
+ UCOUNT_INOTIFY_INSTANCES,
+ UCOUNT_INOTIFY_WATCHES,
++#endif
++#ifdef CONFIG_FANOTIFY
++ UCOUNT_FANOTIFY_GROUPS,
++ UCOUNT_FANOTIFY_MARKS,
+ #endif
+ UCOUNT_COUNTS,
+ };
+diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h
+index 8220369ee6105..56e4a57d25382 100644
+--- a/include/trace/events/sunrpc.h
++++ b/include/trace/events/sunrpc.h
+@@ -394,6 +394,7 @@ DEFINE_RPC_RUNNING_EVENT(complete);
+ DEFINE_RPC_RUNNING_EVENT(timeout);
+ DEFINE_RPC_RUNNING_EVENT(signalled);
+ DEFINE_RPC_RUNNING_EVENT(end);
++DEFINE_RPC_RUNNING_EVENT(call_done);
+
+ DECLARE_EVENT_CLASS(rpc_task_queued,
+
+@@ -1480,8 +1481,7 @@ DEFINE_SVCXDRBUF_EVENT(sendto);
+ svc_rqst_flag(SPLICE_OK) \
+ svc_rqst_flag(VICTIM) \
+ svc_rqst_flag(BUSY) \
+- svc_rqst_flag(DATA) \
+- svc_rqst_flag_end(AUTHERR)
++ svc_rqst_flag_end(DATA)
+
+ #undef svc_rqst_flag
+ #undef svc_rqst_flag_end
+@@ -1547,9 +1547,9 @@ TRACE_DEFINE_ENUM(SVC_COMPLETE);
+ { SVC_COMPLETE, "SVC_COMPLETE" })
+
+ TRACE_EVENT(svc_authenticate,
+- TP_PROTO(const struct svc_rqst *rqst, int auth_res, __be32 auth_stat),
++ TP_PROTO(const struct svc_rqst *rqst, int auth_res),
+
+- TP_ARGS(rqst, auth_res, auth_stat),
++ TP_ARGS(rqst, auth_res),
+
+ TP_STRUCT__entry(
+ __field(u32, xid)
+@@ -1560,7 +1560,7 @@ TRACE_EVENT(svc_authenticate,
+ TP_fast_assign(
+ __entry->xid = be32_to_cpu(rqst->rq_xid);
+ __entry->svc_status = auth_res;
+- __entry->auth_stat = be32_to_cpu(auth_stat);
++ __entry->auth_stat = be32_to_cpu(rqst->rq_auth_stat);
+ ),
+
+ TP_printk("xid=0x%08x auth_res=%s auth_stat=%s",
+@@ -1578,6 +1578,7 @@ TRACE_EVENT(svc_process,
+ __field(u32, vers)
+ __field(u32, proc)
+ __string(service, name)
++ __string(procedure, rqst->rq_procinfo->pc_name)
+ __string(addr, rqst->rq_xprt ?
+ rqst->rq_xprt->xpt_remotebuf : "(null)")
+ ),
+@@ -1587,13 +1588,16 @@ TRACE_EVENT(svc_process,
+ __entry->vers = rqst->rq_vers;
+ __entry->proc = rqst->rq_proc;
+ __assign_str(service, name);
++ __assign_str(procedure, rqst->rq_procinfo->pc_name);
+ __assign_str(addr, rqst->rq_xprt ?
+ rqst->rq_xprt->xpt_remotebuf : "(null)");
+ ),
+
+- TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%u",
++ TP_printk("addr=%s xid=0x%08x service=%s vers=%u proc=%s",
+ __get_str(addr), __entry->xid,
+- __get_str(service), __entry->vers, __entry->proc)
++ __get_str(service), __entry->vers,
++ __get_str(procedure)
++ )
+ );
+
+ DECLARE_EVENT_CLASS(svc_rqst_event,
+@@ -1752,6 +1756,7 @@ DECLARE_EVENT_CLASS(svc_xprt_event,
+ ), \
+ TP_ARGS(xprt))
+
++DEFINE_SVC_XPRT_EVENT(received);
+ DEFINE_SVC_XPRT_EVENT(no_write_space);
+ DEFINE_SVC_XPRT_EVENT(close);
+ DEFINE_SVC_XPRT_EVENT(detach);
+@@ -1849,6 +1854,7 @@ TRACE_EVENT(svc_stats_latency,
+ TP_STRUCT__entry(
+ __field(u32, xid)
+ __field(unsigned long, execute)
++ __string(procedure, rqst->rq_procinfo->pc_name)
+ __string(addr, rqst->rq_xprt->xpt_remotebuf)
+ ),
+
+@@ -1856,11 +1862,13 @@ TRACE_EVENT(svc_stats_latency,
+ __entry->xid = be32_to_cpu(rqst->rq_xid);
+ __entry->execute = ktime_to_us(ktime_sub(ktime_get(),
+ rqst->rq_stime));
++ __assign_str(procedure, rqst->rq_procinfo->pc_name);
+ __assign_str(addr, rqst->rq_xprt->xpt_remotebuf);
+ ),
+
+- TP_printk("addr=%s xid=0x%08x execute-us=%lu",
+- __get_str(addr), __entry->xid, __entry->execute)
++ TP_printk("addr=%s xid=0x%08x proc=%s execute-us=%lu",
++ __get_str(addr), __entry->xid, __get_str(procedure),
++ __entry->execute)
+ );
+
+ DECLARE_EVENT_CLASS(svc_deferred_event,
+diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h
+index fbf9c5c7dd59a..d8536d77fea1c 100644
+--- a/include/uapi/linux/fanotify.h
++++ b/include/uapi/linux/fanotify.h
+@@ -20,6 +20,7 @@
+ #define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */
+
+ #define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */
++#define FAN_FS_ERROR 0x00008000 /* Filesystem error */
+
+ #define FAN_OPEN_PERM 0x00010000 /* File open in perm check */
+ #define FAN_ACCESS_PERM 0x00020000 /* File accessed in perm check */
+@@ -27,6 +28,8 @@
+
+ #define FAN_EVENT_ON_CHILD 0x08000000 /* Interested in child events */
+
++#define FAN_RENAME 0x10000000 /* File was renamed */
++
+ #define FAN_ONDIR 0x40000000 /* Event occurred against dir */
+
+ /* helper events */
+@@ -51,13 +54,18 @@
+ #define FAN_ENABLE_AUDIT 0x00000040
+
+ /* Flags to determine fanotify event format */
++#define FAN_REPORT_PIDFD 0x00000080 /* Report pidfd for event->pid */
+ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */
+ #define FAN_REPORT_FID 0x00000200 /* Report unique file id */
+ #define FAN_REPORT_DIR_FID 0x00000400 /* Report unique directory id */
+ #define FAN_REPORT_NAME 0x00000800 /* Report events with name */
++#define FAN_REPORT_TARGET_FID 0x00001000 /* Report dirent target id */
+
+ /* Convenience macro - FAN_REPORT_NAME requires FAN_REPORT_DIR_FID */
+ #define FAN_REPORT_DFID_NAME (FAN_REPORT_DIR_FID | FAN_REPORT_NAME)
++/* Convenience macro - FAN_REPORT_TARGET_FID requires all other FID flags */
++#define FAN_REPORT_DFID_NAME_TARGET (FAN_REPORT_DFID_NAME | \
++ FAN_REPORT_FID | FAN_REPORT_TARGET_FID)
+
+ /* Deprecated - do not use this in programs and do not add new flags here! */
+ #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \
+@@ -74,12 +82,21 @@
+ #define FAN_MARK_IGNORED_SURV_MODIFY 0x00000040
+ #define FAN_MARK_FLUSH 0x00000080
+ /* FAN_MARK_FILESYSTEM is 0x00000100 */
++#define FAN_MARK_EVICTABLE 0x00000200
++/* This bit is mutually exclusive with FAN_MARK_IGNORED_MASK bit */
++#define FAN_MARK_IGNORE 0x00000400
+
+ /* These are NOT bitwise flags. Both bits can be used togther. */
+ #define FAN_MARK_INODE 0x00000000
+ #define FAN_MARK_MOUNT 0x00000010
+ #define FAN_MARK_FILESYSTEM 0x00000100
+
++/*
++ * Convenience macro - FAN_MARK_IGNORE requires FAN_MARK_IGNORED_SURV_MODIFY
++ * for non-inode mark types.
++ */
++#define FAN_MARK_IGNORE_SURV (FAN_MARK_IGNORE | FAN_MARK_IGNORED_SURV_MODIFY)
++
+ /* Deprecated - do not use this in programs and do not add new flags here! */
+ #define FAN_ALL_MARK_FLAGS (FAN_MARK_ADD |\
+ FAN_MARK_REMOVE |\
+@@ -123,6 +140,14 @@ struct fanotify_event_metadata {
+ #define FAN_EVENT_INFO_TYPE_FID 1
+ #define FAN_EVENT_INFO_TYPE_DFID_NAME 2
+ #define FAN_EVENT_INFO_TYPE_DFID 3
++#define FAN_EVENT_INFO_TYPE_PIDFD 4
++#define FAN_EVENT_INFO_TYPE_ERROR 5
++
++/* Special info types for FAN_RENAME */
++#define FAN_EVENT_INFO_TYPE_OLD_DFID_NAME 10
++/* Reserved for FAN_EVENT_INFO_TYPE_OLD_DFID 11 */
++#define FAN_EVENT_INFO_TYPE_NEW_DFID_NAME 12
++/* Reserved for FAN_EVENT_INFO_TYPE_NEW_DFID 13 */
+
+ /* Variable length info record following event metadata */
+ struct fanotify_event_info_header {
+@@ -148,6 +173,21 @@ struct fanotify_event_info_fid {
+ unsigned char handle[0];
+ };
+
++/*
++ * This structure is used for info records of type FAN_EVENT_INFO_TYPE_PIDFD.
++ * It holds a pidfd for the pid that was responsible for generating an event.
++ */
++struct fanotify_event_info_pidfd {
++ struct fanotify_event_info_header hdr;
++ __s32 pidfd;
++};
++
++struct fanotify_event_info_error {
++ struct fanotify_event_info_header hdr;
++ __s32 error;
++ __u32 error_count;
++};
++
+ struct fanotify_response {
+ __s32 fd;
+ __u32 response;
+@@ -160,6 +200,8 @@ struct fanotify_response {
+
+ /* No fd set in event */
+ #define FAN_NOFD -1
++#define FAN_NOPIDFD FAN_NOFD
++#define FAN_EPIDFD -2
+
+ /* Helper functions to deal with fanotify_event_metadata buffers */
+ #define FAN_EVENT_METADATA_LEN (sizeof(struct fanotify_event_metadata))
+diff --git a/include/uapi/linux/nfs3.h b/include/uapi/linux/nfs3.h
+index 37e4b34e6b435..c22ab77713bd0 100644
+--- a/include/uapi/linux/nfs3.h
++++ b/include/uapi/linux/nfs3.h
+@@ -63,6 +63,12 @@ enum nfs3_ftype {
+ NF3BAD = 8
+ };
+
++enum nfs3_time_how {
++ DONT_CHANGE = 0,
++ SET_TO_SERVER_TIME = 1,
++ SET_TO_CLIENT_TIME = 2,
++};
++
+ struct nfs3_fh {
+ unsigned short size;
+ unsigned char data[NFS3_FHSIZE];
+diff --git a/include/uapi/linux/nfsd/nfsfh.h b/include/uapi/linux/nfsd/nfsfh.h
+deleted file mode 100644
+index ff0ca88b1c8f6..0000000000000
+--- a/include/uapi/linux/nfsd/nfsfh.h
++++ /dev/null
+@@ -1,105 +0,0 @@
+-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+-/*
+- * This file describes the layout of the file handles as passed
+- * over the wire.
+- *
+- * Copyright (C) 1995, 1996, 1997 Olaf Kirch <okir@monad.swb.de>
+- */
+-
+-#ifndef _UAPI_LINUX_NFSD_FH_H
+-#define _UAPI_LINUX_NFSD_FH_H
+-
+-#include <linux/types.h>
+-#include <linux/nfs.h>
+-#include <linux/nfs2.h>
+-#include <linux/nfs3.h>
+-#include <linux/nfs4.h>
+-
+-/*
+- * This is the old "dentry style" Linux NFSv2 file handle.
+- *
+- * The xino and xdev fields are currently used to transport the
+- * ino/dev of the exported inode.
+- */
+-struct nfs_fhbase_old {
+- __u32 fb_dcookie; /* dentry cookie - always 0xfeebbaca */
+- __u32 fb_ino; /* our inode number */
+- __u32 fb_dirino; /* dir inode number, 0 for directories */
+- __u32 fb_dev; /* our device */
+- __u32 fb_xdev;
+- __u32 fb_xino;
+- __u32 fb_generation;
+-};
+-
+-/*
+- * This is the new flexible, extensible style NFSv2/v3/v4 file handle.
+- * by Neil Brown <neilb@cse.unsw.edu.au> - March 2000
+- *
+- * The file handle starts with a sequence of four-byte words.
+- * The first word contains a version number (1) and three descriptor bytes
+- * that tell how the remaining 3 variable length fields should be handled.
+- * These three bytes are auth_type, fsid_type and fileid_type.
+- *
+- * All four-byte values are in host-byte-order.
+- *
+- * The auth_type field is deprecated and must be set to 0.
+- *
+- * The fsid_type identifies how the filesystem (or export point) is
+- * encoded.
+- * Current values:
+- * 0 - 4 byte device id (ms-2-bytes major, ls-2-bytes minor), 4byte inode number
+- * NOTE: we cannot use the kdev_t device id value, because kdev_t.h
+- * says we mustn't. We must break it up and reassemble.
+- * 1 - 4 byte user specified identifier
+- * 2 - 4 byte major, 4 byte minor, 4 byte inode number - DEPRECATED
+- * 3 - 4 byte device id, encoded for user-space, 4 byte inode number
+- * 4 - 4 byte inode number and 4 byte uuid
+- * 5 - 8 byte uuid
+- * 6 - 16 byte uuid
+- * 7 - 8 byte inode number and 16 byte uuid
+- *
+- * The fileid_type identified how the file within the filesystem is encoded.
+- * The values for this field are filesystem specific, exccept that
+- * filesystems must not use the values '0' or '0xff'. 'See enum fid_type'
+- * in include/linux/exportfs.h for currently registered values.
+- */
+-struct nfs_fhbase_new {
+- __u8 fb_version; /* == 1, even => nfs_fhbase_old */
+- __u8 fb_auth_type;
+- __u8 fb_fsid_type;
+- __u8 fb_fileid_type;
+- __u32 fb_auth[1];
+-/* __u32 fb_fsid[0]; floating */
+-/* __u32 fb_fileid[0]; floating */
+-};
+-
+-struct knfsd_fh {
+- unsigned int fh_size; /* significant for NFSv3.
+- * Points to the current size while building
+- * a new file handle
+- */
+- union {
+- struct nfs_fhbase_old fh_old;
+- __u32 fh_pad[NFS4_FHSIZE/4];
+- struct nfs_fhbase_new fh_new;
+- } fh_base;
+-};
+-
+-#define ofh_dcookie fh_base.fh_old.fb_dcookie
+-#define ofh_ino fh_base.fh_old.fb_ino
+-#define ofh_dirino fh_base.fh_old.fb_dirino
+-#define ofh_dev fh_base.fh_old.fb_dev
+-#define ofh_xdev fh_base.fh_old.fb_xdev
+-#define ofh_xino fh_base.fh_old.fb_xino
+-#define ofh_generation fh_base.fh_old.fb_generation
+-
+-#define fh_version fh_base.fh_new.fb_version
+-#define fh_fsid_type fh_base.fh_new.fb_fsid_type
+-#define fh_auth_type fh_base.fh_new.fb_auth_type
+-#define fh_fileid_type fh_base.fh_new.fb_fileid_type
+-#define fh_fsid fh_base.fh_new.fb_auth
+-
+-/* Do not use, provided for userspace compatiblity. */
+-#define fh_auth fh_base.fh_new.fb_auth
+-
+-#endif /* _UAPI_LINUX_NFSD_FH_H */
+diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
+index b2ebacd2f3097..691f90dd09d25 100644
+--- a/kernel/audit_fsnotify.c
++++ b/kernel/audit_fsnotify.c
+@@ -100,7 +100,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa
+ audit_update_mark(audit_mark, dentry->d_inode);
+ audit_mark->rule = krule;
+
+- ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, true);
++ ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0);
+ if (ret < 0) {
+ audit_mark->path = NULL;
+ fsnotify_put_mark(&audit_mark->mark);
+@@ -161,8 +161,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+
+ audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
+
+- if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
+- WARN_ON_ONCE(!inode))
++ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
+ return 0;
+
+ if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
+@@ -183,7 +182,8 @@ static const struct fsnotify_ops audit_mark_fsnotify_ops = {
+
+ static int __init audit_fsnotify_init(void)
+ {
+- audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
++ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops,
++ FSNOTIFY_GROUP_DUPS);
+ if (IS_ERR(audit_fsnotify_group)) {
+ audit_fsnotify_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
+index 39241207ec044..0c35879bbf7c3 100644
+--- a/kernel/audit_tree.c
++++ b/kernel/audit_tree.c
+@@ -1077,7 +1077,7 @@ static int __init audit_tree_init(void)
+
+ audit_tree_mark_cachep = KMEM_CACHE(audit_tree_mark, SLAB_PANIC);
+
+- audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
++ audit_tree_group = fsnotify_alloc_group(&audit_tree_ops, 0);
+ if (IS_ERR(audit_tree_group))
+ audit_panic("cannot initialize fsnotify group for rectree watches");
+
+diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
+index edbeffee64b8e..5cf22fe301493 100644
+--- a/kernel/audit_watch.c
++++ b/kernel/audit_watch.c
+@@ -472,8 +472,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
+
+ parent = container_of(inode_mark, struct audit_parent, mark);
+
+- if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
+- WARN_ON_ONCE(!inode))
++ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
+ return 0;
+
+ if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
+@@ -493,7 +492,7 @@ static const struct fsnotify_ops audit_watch_fsnotify_ops = {
+
+ static int __init audit_watch_init(void)
+ {
+- audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops);
++ audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops, 0);
+ if (IS_ERR(audit_watch_group)) {
+ audit_watch_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
+index 6b14b4c4068cc..5966013bc788b 100644
+--- a/kernel/bpf/inode.c
++++ b/kernel/bpf/inode.c
+@@ -507,7 +507,7 @@ static void *bpf_obj_do_get(const char __user *pathname,
+ return ERR_PTR(ret);
+
+ inode = d_backing_inode(path.dentry);
+- ret = inode_permission(inode, ACC_MODE(flags));
++ ret = path_permission(&path, ACC_MODE(flags));
+ if (ret)
+ goto out;
+
+diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
+index e1bee8cd34044..fbe7f8e2b022c 100644
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -3929,7 +3929,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
+ pid_t pid = attr->task_fd_query.pid;
+ u32 fd = attr->task_fd_query.fd;
+ const struct perf_event *event;
+- struct files_struct *files;
+ struct task_struct *task;
+ struct file *file;
+ int err;
+@@ -3949,23 +3948,11 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
+ if (!task)
+ return -ENOENT;
+
+- files = get_files_struct(task);
+- put_task_struct(task);
+- if (!files)
+- return -ENOENT;
+-
+ err = 0;
+- spin_lock(&files->file_lock);
+- file = fcheck_files(files, fd);
++ file = fget_task(task, fd);
++ put_task_struct(task);
+ if (!file)
+- err = -EBADF;
+- else
+- get_file(file);
+- spin_unlock(&files->file_lock);
+- put_files_struct(files);
+-
+- if (err)
+- goto out;
++ return -EBADF;
+
+ if (file->f_op == &bpf_link_fops) {
+ struct bpf_link *link = file->private_data;
+@@ -4005,7 +3992,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
+ err = -ENOTSUPP;
+ put_file:
+ fput(file);
+-out:
+ return err;
+ }
+
+diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
+index f3d3a562a802a..762b4d7c37795 100644
+--- a/kernel/bpf/task_iter.c
++++ b/kernel/bpf/task_iter.c
+@@ -185,7 +185,7 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info)
+ for (; curr_fd < max_fds; curr_fd++) {
+ struct file *f;
+
+- f = fcheck_files(curr_files, curr_fd);
++ f = files_lookup_fd_rcu(curr_files, curr_fd);
+ if (!f)
+ continue;
+ if (!get_file_rcu(f))
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 633b0af1d1a73..8b8a5a172b158 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -3077,21 +3077,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+ * the exec layer of the kernel.
+ */
+
+-int unshare_files(struct files_struct **displaced)
++int unshare_files(void)
+ {
+ struct task_struct *task = current;
+- struct files_struct *copy = NULL;
++ struct files_struct *old, *copy = NULL;
+ int error;
+
+ error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, &copy);
+- if (error || !copy) {
+- *displaced = NULL;
++ if (error || !copy)
+ return error;
+- }
+- *displaced = task->files;
++
++ old = task->files;
+ task_lock(task);
+ task->files = copy;
+ task_unlock(task);
++ put_files_struct(old);
+ return 0;
+ }
+
+diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
+index fe9de067771c3..8043a90aa50ed 100644
+--- a/kernel/kallsyms.c
++++ b/kernel/kallsyms.c
+@@ -177,6 +177,11 @@ unsigned long kallsyms_lookup_name(const char *name)
+ return module_kallsyms_lookup_name(name);
+ }
+
++#ifdef CONFIG_LIVEPATCH
++/*
++ * Iterate over all symbols in vmlinux. For symbols from modules use
++ * module_kallsyms_on_each_symbol instead.
++ */
+ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+ unsigned long),
+ void *data)
+@@ -192,8 +197,9 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+ if (ret != 0)
+ return ret;
+ }
+- return module_kallsyms_on_each_symbol(fn, data);
++ return 0;
+ }
++#endif /* CONFIG_LIVEPATCH */
+
+ static unsigned long get_symbol_pos(unsigned long addr,
+ unsigned long *symbolsize,
+diff --git a/kernel/kcmp.c b/kernel/kcmp.c
+index c0d2ad9b4705d..5353edfad8e11 100644
+--- a/kernel/kcmp.c
++++ b/kernel/kcmp.c
+@@ -61,16 +61,11 @@ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
+ static struct file *
+ get_file_raw_ptr(struct task_struct *task, unsigned int idx)
+ {
+- struct file *file = NULL;
++ struct file *file;
+
+- task_lock(task);
+ rcu_read_lock();
+-
+- if (task->files)
+- file = fcheck_files(task->files, idx);
+-
++ file = task_lookup_fd_rcu(task, idx);
+ rcu_read_unlock();
+- task_unlock(task);
+
+ return file;
+ }
+@@ -107,7 +102,6 @@ static int kcmp_epoll_target(struct task_struct *task1,
+ {
+ struct file *filp, *filp_epoll, *filp_tgt;
+ struct kcmp_epoll_slot slot;
+- struct files_struct *files;
+
+ if (copy_from_user(&slot, uslot, sizeof(slot)))
+ return -EFAULT;
+@@ -116,23 +110,12 @@ static int kcmp_epoll_target(struct task_struct *task1,
+ if (!filp)
+ return -EBADF;
+
+- files = get_files_struct(task2);
+- if (!files)
++ filp_epoll = fget_task(task2, slot.efd);
++ if (!filp_epoll)
+ return -EBADF;
+
+- spin_lock(&files->file_lock);
+- filp_epoll = fcheck_files(files, slot.efd);
+- if (filp_epoll)
+- get_file(filp_epoll);
+- else
+- filp_tgt = ERR_PTR(-EBADF);
+- spin_unlock(&files->file_lock);
+- put_files_struct(files);
+-
+- if (filp_epoll) {
+- filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+- fput(filp_epoll);
+- }
++ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
++ fput(filp_epoll);
+
+ if (IS_ERR(filp_tgt))
+ return PTR_ERR(filp_tgt);
+diff --git a/kernel/kthread.c b/kernel/kthread.c
+index 508fe52782857..9d6cc9c15a55e 100644
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -262,6 +262,21 @@ void kthread_parkme(void)
+ }
+ EXPORT_SYMBOL_GPL(kthread_parkme);
+
++/**
++ * kthread_exit - Cause the current kthread return @result to kthread_stop().
++ * @result: The integer value to return to kthread_stop().
++ *
++ * While kthread_exit can be called directly, it exists so that
++ * functions which do some additional work in non-modular code such as
++ * module_put_and_kthread_exit can be implemented.
++ *
++ * Does not return.
++ */
++void __noreturn kthread_exit(long result)
++{
++ do_exit(result);
++}
++
+ static int kthread(void *_create)
+ {
+ /* Copy data: it's on kthread's stack */
+@@ -279,13 +294,13 @@ static int kthread(void *_create)
+ done = xchg(&create->done, NULL);
+ if (!done) {
+ kfree(create);
+- do_exit(-EINTR);
++ kthread_exit(-EINTR);
+ }
+
+ if (!self) {
+ create->result = ERR_PTR(-ENOMEM);
+ complete(done);
+- do_exit(-ENOMEM);
++ kthread_exit(-ENOMEM);
+ }
+
+ self->threadfn = threadfn;
+@@ -312,7 +327,7 @@ static int kthread(void *_create)
+ __kthread_parkme(self);
+ ret = threadfn(data);
+ }
+- do_exit(ret);
++ kthread_exit(ret);
+ }
+
+ /* called from do_fork() to get node information for about to be created task */
+@@ -621,7 +636,7 @@ EXPORT_SYMBOL_GPL(kthread_park);
+ * instead of calling wake_up_process(): the thread will exit without
+ * calling threadfn().
+ *
+- * If threadfn() may call do_exit() itself, the caller must ensure
++ * If threadfn() may call kthread_exit() itself, the caller must ensure
+ * task_struct can't go away.
+ *
+ * Returns the result of threadfn(), or %-EINTR if wake_up_process()
+diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
+index f5faf935c2d8f..147ed154ebc77 100644
+--- a/kernel/livepatch/core.c
++++ b/kernel/livepatch/core.c
+@@ -19,6 +19,7 @@
+ #include <linux/moduleloader.h>
+ #include <linux/completion.h>
+ #include <linux/memory.h>
++#include <linux/rcupdate.h>
+ #include <asm/cacheflush.h>
+ #include "core.h"
+ #include "patch.h"
+@@ -57,7 +58,7 @@ static void klp_find_object_module(struct klp_object *obj)
+ if (!klp_is_module(obj))
+ return;
+
+- mutex_lock(&module_mutex);
++ rcu_read_lock_sched();
+ /*
+ * We do not want to block removal of patched modules and therefore
+ * we do not take a reference here. The patches are removed by
+@@ -74,7 +75,7 @@ static void klp_find_object_module(struct klp_object *obj)
+ if (mod && mod->klp_alive)
+ obj->mod = mod;
+
+- mutex_unlock(&module_mutex);
++ rcu_read_unlock_sched();
+ }
+
+ static bool klp_initialized(void)
+@@ -163,12 +164,10 @@ static int klp_find_object_symbol(const char *objname, const char *name,
+ .pos = sympos,
+ };
+
+- mutex_lock(&module_mutex);
+ if (objname)
+ module_kallsyms_on_each_symbol(klp_find_callback, &args);
+ else
+ kallsyms_on_each_symbol(klp_find_callback, &args);
+- mutex_unlock(&module_mutex);
+
+ /*
+ * Ensure an address was found. If sympos is 0, ensure symbol is unique;
+diff --git a/kernel/module.c b/kernel/module.c
+index 72a5dcdccf7b1..edc7b99cb16fa 100644
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -88,7 +88,6 @@
+ * 3) module_addr_min/module_addr_max.
+ * (delete and add uses RCU list operations). */
+ DEFINE_MUTEX(module_mutex);
+-EXPORT_SYMBOL_GPL(module_mutex);
+ static LIST_HEAD(modules);
+
+ /* Work queue for freeing init sections in success case */
+@@ -256,11 +255,6 @@ static void mod_update_bounds(struct module *mod)
+ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
+ #endif /* CONFIG_KGDB_KDB */
+
+-static void module_assert_mutex(void)
+-{
+- lockdep_assert_held(&module_mutex);
+-}
+-
+ static void module_assert_mutex_or_preempt(void)
+ {
+ #ifdef CONFIG_LOCKDEP
+@@ -340,14 +334,14 @@ static inline void add_taint_module(struct module *mod, unsigned flag,
+
+ /*
+ * A thread that wants to hold a reference to a module only while it
+- * is running can call this to safely exit. nfsd and lockd use this.
++ * is running can call this to safely exit.
+ */
+-void __noreturn __module_put_and_exit(struct module *mod, long code)
++void __noreturn __module_put_and_kthread_exit(struct module *mod, long code)
+ {
+ module_put(mod);
+- do_exit(code);
++ kthread_exit(code);
+ }
+-EXPORT_SYMBOL(__module_put_and_exit);
++EXPORT_SYMBOL(__module_put_and_kthread_exit);
+
+ /* Find a module section: 0 means not found. */
+ static unsigned int find_sec(const struct load_info *info, const char *name)
+@@ -642,10 +636,8 @@ static struct module *find_module_all(const char *name, size_t len,
+
+ struct module *find_module(const char *name)
+ {
+- module_assert_mutex();
+ return find_module_all(name, strlen(name), false);
+ }
+-EXPORT_SYMBOL_GPL(find_module);
+
+ #ifdef CONFIG_SMP
+
+@@ -4452,6 +4444,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
+ return ret;
+ }
+
++#ifdef CONFIG_LIVEPATCH
+ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+ struct module *, unsigned long),
+ void *data)
+@@ -4460,8 +4453,7 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+ unsigned int i;
+ int ret;
+
+- module_assert_mutex();
+-
++ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list) {
+ /* We hold module_mutex: no need for rcu_dereference_sched */
+ struct mod_kallsyms *kallsyms = mod->kallsyms;
+@@ -4477,11 +4469,13 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+ ret = fn(data, kallsyms_symbol_name(kallsyms, i),
+ mod, kallsyms_symbol_value(sym));
+ if (ret != 0)
+- return ret;
++ break;
+ }
+ }
+- return 0;
++ mutex_unlock(&module_mutex);
++ return ret;
+ }
++#endif /* CONFIG_LIVEPATCH */
+ #endif /* CONFIG_KALLSYMS */
+
+ /* Maximum number of characters written by module_flags() */
+diff --git a/kernel/pid.c b/kernel/pid.c
+index 4856818c9de1a..0820f2c50bb0c 100644
+--- a/kernel/pid.c
++++ b/kernel/pid.c
+@@ -550,13 +550,21 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
+ * Note, that this function can only be called after the fd table has
+ * been unshared to avoid leaking the pidfd to the new process.
+ *
++ * This symbol should not be explicitly exported to loadable modules.
++ *
+ * Return: On success, a cloexec pidfd is returned.
+ * On error, a negative errno number will be returned.
+ */
+-static int pidfd_create(struct pid *pid, unsigned int flags)
++int pidfd_create(struct pid *pid, unsigned int flags)
+ {
+ int fd;
+
++ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
++ return -EINVAL;
++
++ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
++ return -EINVAL;
++
+ fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
+ flags | O_RDWR | O_CLOEXEC);
+ if (fd < 0)
+@@ -596,10 +604,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
+ if (!p)
+ return -ESRCH;
+
+- if (pid_has_task(p, PIDTYPE_TGID))
+- fd = pidfd_create(p, flags);
+- else
+- fd = -EINVAL;
++ fd = pidfd_create(p, flags);
+
+ put_pid(p);
+ return fd;
+diff --git a/kernel/sys.c b/kernel/sys.c
+index efc213ae4c5ad..7a2cfb57fa9e7 100644
+--- a/kernel/sys.c
++++ b/kernel/sys.c
+@@ -1873,7 +1873,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+ if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
+ goto exit;
+
+- err = inode_permission(inode, MAY_EXEC);
++ err = file_permission(exe.file, MAY_EXEC);
+ if (err)
+ goto exit;
+
+diff --git a/kernel/sysctl.c b/kernel/sysctl.c
+index 99a19190196e0..abe0f16d53641 100644
+--- a/kernel/sysctl.c
++++ b/kernel/sysctl.c
+@@ -142,6 +142,9 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
+ #ifdef CONFIG_INOTIFY_USER
+ #include <linux/inotify.h>
+ #endif
++#ifdef CONFIG_FANOTIFY
++#include <linux/fanotify.h>
++#endif
+
+ #ifdef CONFIG_PROC_SYSCTL
+
+@@ -543,6 +546,21 @@ static void proc_put_char(void **buf, size_t *size, char c)
+ }
+ }
+
++static int do_proc_dobool_conv(bool *negp, unsigned long *lvalp,
++ int *valp,
++ int write, void *data)
++{
++ if (write) {
++ *(bool *)valp = *lvalp;
++ } else {
++ int val = *(bool *)valp;
++
++ *lvalp = (unsigned long)val;
++ *negp = false;
++ }
++ return 0;
++}
++
+ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+ int *valp,
+ int write, void *data)
+@@ -805,6 +823,26 @@ static int do_proc_douintvec(struct ctl_table *table, int write,
+ buffer, lenp, ppos, conv, data);
+ }
+
++/**
++ * proc_dobool - read/write a bool
++ * @table: the sysctl table
++ * @write: %TRUE if this is a write to the sysctl file
++ * @buffer: the user buffer
++ * @lenp: the size of the user buffer
++ * @ppos: file position
++ *
++ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
++ * values from/to the user buffer, treated as an ASCII string.
++ *
++ * Returns 0 on success.
++ */
++int proc_dobool(struct ctl_table *table, int write, void *buffer,
++ size_t *lenp, loff_t *ppos)
++{
++ return do_proc_dointvec(table, write, buffer, lenp, ppos,
++ do_proc_dobool_conv, NULL);
++}
++
+ /**
+ * proc_dointvec - read a vector of integers
+ * @table: the sysctl table
+@@ -1641,6 +1679,12 @@ int proc_dostring(struct ctl_table *table, int write,
+ return -ENOSYS;
+ }
+
++int proc_dobool(struct ctl_table *table, int write,
++ void *buffer, size_t *lenp, loff_t *ppos)
++{
++ return -ENOSYS;
++}
++
+ int proc_dointvec(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+ {
+@@ -3330,7 +3374,14 @@ static struct ctl_table fs_table[] = {
+ .mode = 0555,
+ .child = inotify_table,
+ },
+-#endif
++#endif
++#ifdef CONFIG_FANOTIFY
++ {
++ .procname = "fanotify",
++ .mode = 0555,
++ .child = fanotify_table,
++ },
++#endif
+ #ifdef CONFIG_EPOLL
+ {
+ .procname = "epoll",
+@@ -3493,6 +3544,7 @@ int __init sysctl_init(void)
+ * No sense putting this after each symbol definition, twice,
+ * exception granted :-)
+ */
++EXPORT_SYMBOL(proc_dobool);
+ EXPORT_SYMBOL(proc_dointvec);
+ EXPORT_SYMBOL(proc_douintvec);
+ EXPORT_SYMBOL(proc_dointvec_jiffies);
+diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
+index 7183572898998..5453af26ff764 100644
+--- a/kernel/trace/trace_kprobe.c
++++ b/kernel/trace/trace_kprobe.c
+@@ -124,9 +124,9 @@ static nokprobe_inline bool trace_kprobe_module_exist(struct trace_kprobe *tk)
+ if (!p)
+ return true;
+ *p = '\0';
+- mutex_lock(&module_mutex);
++ rcu_read_lock_sched();
+ ret = !!find_module(tk->symbol);
+- mutex_unlock(&module_mutex);
++ rcu_read_unlock_sched();
+ *p = ':';
+
+ return ret;
+diff --git a/kernel/ucount.c b/kernel/ucount.c
+index 11b1596e2542a..8d8874f1c35e2 100644
+--- a/kernel/ucount.c
++++ b/kernel/ucount.c
+@@ -73,6 +73,10 @@ static struct ctl_table user_table[] = {
+ #ifdef CONFIG_INOTIFY_USER
+ UCOUNT_ENTRY("max_inotify_instances"),
+ UCOUNT_ENTRY("max_inotify_watches"),
++#endif
++#ifdef CONFIG_FANOTIFY
++ UCOUNT_ENTRY("max_fanotify_groups"),
++ UCOUNT_ENTRY("max_fanotify_marks"),
+ #endif
+ { }
+ };
+diff --git a/mm/madvise.c b/mm/madvise.c
+index f71fc88f0b331..a63aa04ec7fa3 100644
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -543,7 +543,7 @@ static inline bool can_do_pageout(struct vm_area_struct *vma)
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+- inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
++ file_permission(vma->vm_file, MAY_WRITE) == 0;
+ }
+
+ static long madvise_pageout(struct vm_area_struct *vma,
+diff --git a/mm/memcontrol.c b/mm/memcontrol.c
+index ddc8ed096deca..186ae9dba0fd5 100644
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -4918,7 +4918,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
+
+ /* the process need read permission on control file */
+ /* AV: shouldn't we check that it's been opened for read instead? */
+- ret = inode_permission(file_inode(cfile.file), MAY_READ);
++ ret = file_permission(cfile.file, MAY_READ);
+ if (ret < 0)
+ goto out_put_cfile;
+
+diff --git a/mm/mincore.c b/mm/mincore.c
+index 02db1a834021b..7bdb4673f776a 100644
+--- a/mm/mincore.c
++++ b/mm/mincore.c
+@@ -167,7 +167,7 @@ static inline bool can_do_mincore(struct vm_area_struct *vma)
+ * mappings, which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+- inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
++ file_permission(vma->vm_file, MAY_WRITE) == 0;
+ }
+
+ static const struct mm_walk_ops mincore_walk_ops = {
+diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
+index 43c284158f63e..09b6d825124ee 100644
+--- a/net/bluetooth/bnep/core.c
++++ b/net/bluetooth/bnep/core.c
+@@ -535,7 +535,7 @@ static int bnep_session(void *arg)
+
+ up_write(&bnep_session_sem);
+ free_netdev(dev);
+- module_put_and_exit(0);
++ module_put_and_kthread_exit(0);
+ return 0;
+ }
+
+diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
+index 83eb84e8e688f..90d130588a3e5 100644
+--- a/net/bluetooth/cmtp/core.c
++++ b/net/bluetooth/cmtp/core.c
+@@ -323,7 +323,7 @@ static int cmtp_session(void *arg)
+ up_write(&cmtp_session_sem);
+
+ kfree(session);
+- module_put_and_exit(0);
++ module_put_and_kthread_exit(0);
+ return 0;
+ }
+
+diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
+index b946a6379433a..3ff870599eb77 100644
+--- a/net/bluetooth/hidp/core.c
++++ b/net/bluetooth/hidp/core.c
+@@ -1305,7 +1305,7 @@ static int hidp_session_thread(void *arg)
+ l2cap_unregister_user(session->conn, &session->user);
+ hidp_session_put(session);
+
+- module_put_and_exit(0);
++ module_put_and_kthread_exit(0);
+ return 0;
+ }
+
+diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
+index e265b8d38aa14..a857fc99431ce 100644
+--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
++++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
+@@ -800,7 +800,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
+ scratch = alloc_page(GFP_KERNEL);
+ if (!scratch)
+ return -ENOMEM;
+- xdr_set_scratch_buffer(xdr, page_address(scratch), PAGE_SIZE);
++ xdr_set_scratch_page(xdr, scratch);
+
+ /* res->status */
+ err = gssx_dec_status(xdr, &res->status);
+diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
+index 784c8b24f1640..329eac782cc5e 100644
+--- a/net/sunrpc/auth_gss/svcauth_gss.c
++++ b/net/sunrpc/auth_gss/svcauth_gss.c
+@@ -707,11 +707,11 @@ svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o)
+ /*
+ * Verify the checksum on the header and return SVC_OK on success.
+ * Otherwise, return SVC_DROP (in the case of a bad sequence number)
+- * or return SVC_DENIED and indicate error in authp.
++ * or return SVC_DENIED and indicate error in rqstp->rq_auth_stat.
+ */
+ static int
+ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
+- __be32 *rpcstart, struct rpc_gss_wire_cred *gc, __be32 *authp)
++ __be32 *rpcstart, struct rpc_gss_wire_cred *gc)
+ {
+ struct gss_ctx *ctx_id = rsci->mechctx;
+ struct xdr_buf rpchdr;
+@@ -725,7 +725,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
+ iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart;
+ xdr_buf_from_iov(&iov, &rpchdr);
+
+- *authp = rpc_autherr_badverf;
++ rqstp->rq_auth_stat = rpc_autherr_badverf;
+ if (argv->iov_len < 4)
+ return SVC_DENIED;
+ flavor = svc_getnl(argv);
+@@ -737,13 +737,13 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci,
+ if (rqstp->rq_deferred) /* skip verification of revisited request */
+ return SVC_OK;
+ if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) {
+- *authp = rpcsec_gsserr_credproblem;
++ rqstp->rq_auth_stat = rpcsec_gsserr_credproblem;
+ return SVC_DENIED;
+ }
+
+ if (gc->gc_seq > MAXSEQ) {
+ trace_rpcgss_svc_seqno_large(rqstp, gc->gc_seq);
+- *authp = rpcsec_gsserr_ctxproblem;
++ rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem;
+ return SVC_DENIED;
+ }
+ if (!gss_check_seq_num(rqstp, rsci, gc->gc_seq))
+@@ -1038,6 +1038,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp)
+ struct rpc_gss_wire_cred *gc = &svcdata->clcred;
+ int stat;
+
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
++
+ /*
+ * A gss export can be specified either by:
+ * export *(sec=krb5,rw)
+@@ -1053,6 +1055,8 @@ svcauth_gss_set_client(struct svc_rqst *rqstp)
+ stat = svcauth_unix_set_client(rqstp);
+ if (stat == SVC_DROP || stat == SVC_CLOSE)
+ return stat;
++
++ rqstp->rq_auth_stat = rpc_auth_ok;
+ return SVC_OK;
+ }
+
+@@ -1136,7 +1140,7 @@ static void gss_free_in_token_pages(struct gssp_in_token *in_token)
+ }
+
+ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
+- struct rpc_gss_wire_cred *gc, __be32 *authp,
++ struct rpc_gss_wire_cred *gc,
+ struct xdr_netobj *in_handle,
+ struct gssp_in_token *in_token)
+ {
+@@ -1145,7 +1149,7 @@ static int gss_read_proxy_verf(struct svc_rqst *rqstp,
+ int pages, i, res, pgto, pgfrom;
+ size_t inlen, to_offs, from_offs;
+
+- res = gss_read_common_verf(gc, argv, authp, in_handle);
++ res = gss_read_common_verf(gc, argv, &rqstp->rq_auth_stat, in_handle);
+ if (res)
+ return res;
+
+@@ -1226,7 +1230,7 @@ gss_write_resv(struct kvec *resv, size_t size_limit,
+ * Otherwise, drop the request pending an answer to the upcall.
+ */
+ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
+- struct rpc_gss_wire_cred *gc, __be32 *authp)
++ struct rpc_gss_wire_cred *gc)
+ {
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+@@ -1235,7 +1239,7 @@ static int svcauth_gss_legacy_init(struct svc_rqst *rqstp,
+ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
+
+ memset(&rsikey, 0, sizeof(rsikey));
+- ret = gss_read_verf(gc, argv, authp,
++ ret = gss_read_verf(gc, argv, &rqstp->rq_auth_stat,
+ &rsikey.in_handle, &rsikey.in_token);
+ if (ret)
+ return ret;
+@@ -1338,7 +1342,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd,
+ }
+
+ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
+- struct rpc_gss_wire_cred *gc, __be32 *authp)
++ struct rpc_gss_wire_cred *gc)
+ {
+ struct kvec *resv = &rqstp->rq_res.head[0];
+ struct xdr_netobj cli_handle;
+@@ -1350,8 +1354,7 @@ static int svcauth_gss_proxy_init(struct svc_rqst *rqstp,
+ struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
+
+ memset(&ud, 0, sizeof(ud));
+- ret = gss_read_proxy_verf(rqstp, gc, authp,
+- &ud.in_handle, &ud.in_token);
++ ret = gss_read_proxy_verf(rqstp, gc, &ud.in_handle, &ud.in_token);
+ if (ret)
+ return ret;
+
+@@ -1524,7 +1527,7 @@ static void destroy_use_gss_proxy_proc_entry(struct net *net) {}
+ * response here and return SVC_COMPLETE.
+ */
+ static int
+-svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
++svcauth_gss_accept(struct svc_rqst *rqstp)
+ {
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+@@ -1537,7 +1540,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
+ int ret;
+ struct sunrpc_net *sn = net_generic(SVC_NET(rqstp), sunrpc_net_id);
+
+- *authp = rpc_autherr_badcred;
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ if (!svcdata)
+ svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL);
+ if (!svcdata)
+@@ -1574,22 +1577,22 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
+ if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0))
+ goto auth_err;
+
+- *authp = rpc_autherr_badverf;
++ rqstp->rq_auth_stat = rpc_autherr_badverf;
+ switch (gc->gc_proc) {
+ case RPC_GSS_PROC_INIT:
+ case RPC_GSS_PROC_CONTINUE_INIT:
+ if (use_gss_proxy(SVC_NET(rqstp)))
+- return svcauth_gss_proxy_init(rqstp, gc, authp);
++ return svcauth_gss_proxy_init(rqstp, gc);
+ else
+- return svcauth_gss_legacy_init(rqstp, gc, authp);
++ return svcauth_gss_legacy_init(rqstp, gc);
+ case RPC_GSS_PROC_DATA:
+ case RPC_GSS_PROC_DESTROY:
+ /* Look up the context, and check the verifier: */
+- *authp = rpcsec_gsserr_credproblem;
++ rqstp->rq_auth_stat = rpcsec_gsserr_credproblem;
+ rsci = gss_svc_searchbyctx(sn->rsc_cache, &gc->gc_ctx);
+ if (!rsci)
+ goto auth_err;
+- switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) {
++ switch (gss_verify_header(rqstp, rsci, rpcstart, gc)) {
+ case SVC_OK:
+ break;
+ case SVC_DENIED:
+@@ -1599,7 +1602,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
+ }
+ break;
+ default:
+- *authp = rpc_autherr_rejectedcred;
++ rqstp->rq_auth_stat = rpc_autherr_rejectedcred;
+ goto auth_err;
+ }
+
+@@ -1615,13 +1618,13 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
+ svc_putnl(resv, RPC_SUCCESS);
+ goto complete;
+ case RPC_GSS_PROC_DATA:
+- *authp = rpcsec_gsserr_ctxproblem;
++ rqstp->rq_auth_stat = rpcsec_gsserr_ctxproblem;
+ svcdata->verf_start = resv->iov_base + resv->iov_len;
+ if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
+ goto auth_err;
+ rqstp->rq_cred = rsci->cred;
+ get_group_info(rsci->cred.cr_group_info);
+- *authp = rpc_autherr_badcred;
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ switch (gc->gc_svc) {
+ case RPC_GSS_SVC_NONE:
+ break;
+diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
+index a00890962e115..a4c9d410eb8d5 100644
+--- a/net/sunrpc/sched.c
++++ b/net/sunrpc/sched.c
+@@ -821,6 +821,7 @@ void rpc_exit_task(struct rpc_task *task)
+ else if (task->tk_client)
+ rpc_count_iostats(task, task->tk_client->cl_metrics);
+ if (task->tk_ops->rpc_call_done != NULL) {
++ trace_rpc_task_call_done(task, task->tk_ops->rpc_call_done);
+ task->tk_ops->rpc_call_done(task, task->tk_calldata);
+ if (task->tk_action != NULL) {
+ /* Always release the RPC slot and buffer memory */
+diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
+index cfe8b911ca013..26d972c54a593 100644
+--- a/net/sunrpc/svc.c
++++ b/net/sunrpc/svc.c
+@@ -35,18 +35,37 @@
+
+ static void svc_unregister(const struct svc_serv *serv, struct net *net);
+
+-#define svc_serv_is_pooled(serv) ((serv)->sv_ops->svo_function)
+-
+ #define SVC_POOL_DEFAULT SVC_POOL_GLOBAL
+
++/*
++ * Mode for mapping cpus to pools.
++ */
++enum {
++ SVC_POOL_AUTO = -1, /* choose one of the others */
++ SVC_POOL_GLOBAL, /* no mapping, just a single global pool
++ * (legacy & UP mode) */
++ SVC_POOL_PERCPU, /* one pool per cpu */
++ SVC_POOL_PERNODE /* one pool per numa node */
++};
++
+ /*
+ * Structure for mapping cpus to pools and vice versa.
+ * Setup once during sunrpc initialisation.
+ */
+-struct svc_pool_map svc_pool_map = {
++
++struct svc_pool_map {
++ int count; /* How many svc_servs use us */
++ int mode; /* Note: int not enum to avoid
++ * warnings about "enumeration value
++ * not handled in switch" */
++ unsigned int npools;
++ unsigned int *pool_to; /* maps pool id to cpu or node */
++ unsigned int *to_pool; /* maps cpu or node to pool id */
++};
++
++static struct svc_pool_map svc_pool_map = {
+ .mode = SVC_POOL_DEFAULT
+ };
+-EXPORT_SYMBOL_GPL(svc_pool_map);
+
+ static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
+
+@@ -217,10 +236,12 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
+
+ /*
+ * Add a reference to the global map of cpus to pools (and
+- * vice versa). Initialise the map if we're the first user.
+- * Returns the number of pools.
++ * vice versa) if pools are in use.
++ * Initialise the map if we're the first user.
++ * Returns the number of pools. If this is '1', no reference
++ * was taken.
+ */
+-unsigned int
++static unsigned int
+ svc_pool_map_get(void)
+ {
+ struct svc_pool_map *m = &svc_pool_map;
+@@ -230,6 +251,7 @@ svc_pool_map_get(void)
+
+ if (m->count++) {
+ mutex_unlock(&svc_pool_map_mutex);
++ WARN_ON_ONCE(m->npools <= 1);
+ return m->npools;
+ }
+
+@@ -245,30 +267,36 @@ svc_pool_map_get(void)
+ break;
+ }
+
+- if (npools < 0) {
++ if (npools <= 0) {
+ /* default, or memory allocation failure */
+ npools = 1;
+ m->mode = SVC_POOL_GLOBAL;
+ }
+ m->npools = npools;
+
++ if (npools == 1)
++ /* service is unpooled, so doesn't hold a reference */
++ m->count--;
++
+ mutex_unlock(&svc_pool_map_mutex);
+- return m->npools;
++ return npools;
+ }
+-EXPORT_SYMBOL_GPL(svc_pool_map_get);
+
+ /*
+- * Drop a reference to the global map of cpus to pools.
++ * Drop a reference to the global map of cpus to pools, if
++ * pools were in use, i.e. if npools > 1.
+ * When the last reference is dropped, the map data is
+ * freed; this allows the sysadmin to change the pool
+ * mode using the pool_mode module option without
+ * rebooting or re-loading sunrpc.ko.
+ */
+-void
+-svc_pool_map_put(void)
++static void
++svc_pool_map_put(int npools)
+ {
+ struct svc_pool_map *m = &svc_pool_map;
+
++ if (npools <= 1)
++ return;
+ mutex_lock(&svc_pool_map_mutex);
+
+ if (!--m->count) {
+@@ -281,7 +309,6 @@ svc_pool_map_put(void)
+
+ mutex_unlock(&svc_pool_map_mutex);
+ }
+-EXPORT_SYMBOL_GPL(svc_pool_map_put);
+
+ static int svc_pool_map_get_node(unsigned int pidx)
+ {
+@@ -338,21 +365,18 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
+ struct svc_pool_map *m = &svc_pool_map;
+ unsigned int pidx = 0;
+
+- /*
+- * An uninitialised map happens in a pure client when
+- * lockd is brought up, so silently treat it the
+- * same as SVC_POOL_GLOBAL.
+- */
+- if (svc_serv_is_pooled(serv)) {
+- switch (m->mode) {
+- case SVC_POOL_PERCPU:
+- pidx = m->to_pool[cpu];
+- break;
+- case SVC_POOL_PERNODE:
+- pidx = m->to_pool[cpu_to_node(cpu)];
+- break;
+- }
++ if (serv->sv_nrpools <= 1)
++ return serv->sv_pools;
++
++ switch (m->mode) {
++ case SVC_POOL_PERCPU:
++ pidx = m->to_pool[cpu];
++ break;
++ case SVC_POOL_PERNODE:
++ pidx = m->to_pool[cpu_to_node(cpu)];
++ break;
+ }
++
+ return &serv->sv_pools[pidx % serv->sv_nrpools];
+ }
+
+@@ -422,7 +446,7 @@ __svc_init_bc(struct svc_serv *serv)
+ */
+ static struct svc_serv *
+ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
+- const struct svc_serv_ops *ops)
++ int (*threadfn)(void *data))
+ {
+ struct svc_serv *serv;
+ unsigned int vers;
+@@ -433,13 +457,13 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
+ return NULL;
+ serv->sv_name = prog->pg_name;
+ serv->sv_program = prog;
+- serv->sv_nrthreads = 1;
++ kref_init(&serv->sv_refcnt);
+ serv->sv_stats = prog->pg_stats;
+ if (bufsize > RPCSVC_MAXPAYLOAD)
+ bufsize = RPCSVC_MAXPAYLOAD;
+ serv->sv_max_payload = bufsize? bufsize : 4096;
+ serv->sv_max_mesg = roundup(serv->sv_max_payload + PAGE_SIZE, PAGE_SIZE);
+- serv->sv_ops = ops;
++ serv->sv_threadfn = threadfn;
+ xdrsize = 0;
+ while (prog) {
+ prog->pg_lovers = prog->pg_nvers-1;
+@@ -485,59 +509,56 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
+ return serv;
+ }
+
+-struct svc_serv *
+-svc_create(struct svc_program *prog, unsigned int bufsize,
+- const struct svc_serv_ops *ops)
++/**
++ * svc_create - Create an RPC service
++ * @prog: the RPC program the new service will handle
++ * @bufsize: maximum message size for @prog
++ * @threadfn: a function to service RPC requests for @prog
++ *
++ * Returns an instantiated struct svc_serv object or NULL.
++ */
++struct svc_serv *svc_create(struct svc_program *prog, unsigned int bufsize,
++ int (*threadfn)(void *data))
+ {
+- return __svc_create(prog, bufsize, /*npools*/1, ops);
++ return __svc_create(prog, bufsize, 1, threadfn);
+ }
+ EXPORT_SYMBOL_GPL(svc_create);
+
+-struct svc_serv *
+-svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
+- const struct svc_serv_ops *ops)
++/**
++ * svc_create_pooled - Create an RPC service with pooled threads
++ * @prog: the RPC program the new service will handle
++ * @bufsize: maximum message size for @prog
++ * @threadfn: a function to service RPC requests for @prog
++ *
++ * Returns an instantiated struct svc_serv object or NULL.
++ */
++struct svc_serv *svc_create_pooled(struct svc_program *prog,
++ unsigned int bufsize,
++ int (*threadfn)(void *data))
+ {
+ struct svc_serv *serv;
+ unsigned int npools = svc_pool_map_get();
+
+- serv = __svc_create(prog, bufsize, npools, ops);
++ serv = __svc_create(prog, bufsize, npools, threadfn);
+ if (!serv)
+ goto out_err;
+ return serv;
+ out_err:
+- svc_pool_map_put();
++ svc_pool_map_put(npools);
+ return NULL;
+ }
+ EXPORT_SYMBOL_GPL(svc_create_pooled);
+
+-void svc_shutdown_net(struct svc_serv *serv, struct net *net)
+-{
+- svc_close_net(serv, net);
+-
+- if (serv->sv_ops->svo_shutdown)
+- serv->sv_ops->svo_shutdown(serv, net);
+-}
+-EXPORT_SYMBOL_GPL(svc_shutdown_net);
+-
+ /*
+ * Destroy an RPC service. Should be called with appropriate locking to
+- * protect the sv_nrthreads, sv_permsocks and sv_tempsocks.
++ * protect sv_permsocks and sv_tempsocks.
+ */
+ void
+-svc_destroy(struct svc_serv *serv)
++svc_destroy(struct kref *ref)
+ {
+- dprintk("svc: svc_destroy(%s, %d)\n",
+- serv->sv_program->pg_name,
+- serv->sv_nrthreads);
+-
+- if (serv->sv_nrthreads) {
+- if (--(serv->sv_nrthreads) != 0) {
+- svc_sock_update_bufs(serv);
+- return;
+- }
+- } else
+- printk("svc_destroy: no threads for serv=%p!\n", serv);
++ struct svc_serv *serv = container_of(ref, struct svc_serv, sv_refcnt);
+
++ dprintk("svc: svc_destroy(%s)\n", serv->sv_program->pg_name);
+ del_timer_sync(&serv->sv_temptimer);
+
+ /*
+@@ -549,8 +570,7 @@ svc_destroy(struct svc_serv *serv)
+
+ cache_clean_deferred(serv);
+
+- if (svc_serv_is_pooled(serv))
+- svc_pool_map_put();
++ svc_pool_map_put(serv->sv_nrpools);
+
+ kfree(serv->sv_pools);
+ kfree(serv);
+@@ -614,6 +634,10 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
+ rqstp->rq_server = serv;
+ rqstp->rq_pool = pool;
+
++ rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0);
++ if (!rqstp->rq_scratch_page)
++ goto out_enomem;
++
+ rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node);
+ if (!rqstp->rq_argp)
+ goto out_enomem;
+@@ -632,7 +656,7 @@ svc_rqst_alloc(struct svc_serv *serv, struct svc_pool *pool, int node)
+ }
+ EXPORT_SYMBOL_GPL(svc_rqst_alloc);
+
+-struct svc_rqst *
++static struct svc_rqst *
+ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
+ {
+ struct svc_rqst *rqstp;
+@@ -641,14 +665,17 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node)
+ if (!rqstp)
+ return ERR_PTR(-ENOMEM);
+
+- serv->sv_nrthreads++;
++ svc_get(serv);
++ spin_lock_bh(&serv->sv_lock);
++ serv->sv_nrthreads += 1;
++ spin_unlock_bh(&serv->sv_lock);
++
+ spin_lock_bh(&pool->sp_lock);
+ pool->sp_nrthreads++;
+ list_add_rcu(&rqstp->rq_all, &pool->sp_all_threads);
+ spin_unlock_bh(&pool->sp_lock);
+ return rqstp;
+ }
+-EXPORT_SYMBOL_GPL(svc_prepare_thread);
+
+ /*
+ * Choose a pool in which to create a new thread, for svc_set_num_threads
+@@ -722,11 +749,9 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+ if (IS_ERR(rqstp))
+ return PTR_ERR(rqstp);
+
+- __module_get(serv->sv_ops->svo_module);
+- task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
++ task = kthread_create_on_node(serv->sv_threadfn, rqstp,
+ node, "%s", serv->sv_name);
+ if (IS_ERR(task)) {
+- module_put(serv->sv_ops->svo_module);
+ svc_exit_thread(rqstp);
+ return PTR_ERR(task);
+ }
+@@ -742,59 +767,13 @@ svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+ return 0;
+ }
+
+-
+-/* destroy old threads */
+-static int
+-svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+-{
+- struct task_struct *task;
+- unsigned int state = serv->sv_nrthreads-1;
+-
+- /* destroy old threads */
+- do {
+- task = choose_victim(serv, pool, &state);
+- if (task == NULL)
+- break;
+- send_sig(SIGINT, task, 1);
+- nrservs++;
+- } while (nrservs < 0);
+-
+- return 0;
+-}
+-
+ /*
+ * Create or destroy enough new threads to make the number
+ * of threads the given number. If `pool' is non-NULL, applies
+ * only to threads in that pool, otherwise round-robins between
+ * all pools. Caller must ensure that mutual exclusion between this and
+ * server startup or shutdown.
+- *
+- * Destroying threads relies on the service threads filling in
+- * rqstp->rq_task, which only the nfs ones do. Assumes the serv
+- * has been created using svc_create_pooled().
+- *
+- * Based on code that used to be in nfsd_svc() but tweaked
+- * to be pool-aware.
+ */
+-int
+-svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+-{
+- if (pool == NULL) {
+- /* The -1 assumes caller has done a svc_get() */
+- nrservs -= (serv->sv_nrthreads-1);
+- } else {
+- spin_lock_bh(&pool->sp_lock);
+- nrservs -= pool->sp_nrthreads;
+- spin_unlock_bh(&pool->sp_lock);
+- }
+-
+- if (nrservs > 0)
+- return svc_start_kthreads(serv, pool, nrservs);
+- if (nrservs < 0)
+- return svc_signal_kthreads(serv, pool, nrservs);
+- return 0;
+-}
+-EXPORT_SYMBOL_GPL(svc_set_num_threads);
+
+ /* destroy old threads */
+ static int
+@@ -819,11 +798,10 @@ svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+ }
+
+ int
+-svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
++svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
+ {
+ if (pool == NULL) {
+- /* The -1 assumes caller has done a svc_get() */
+- nrservs -= (serv->sv_nrthreads-1);
++ nrservs -= serv->sv_nrthreads;
+ } else {
+ spin_lock_bh(&pool->sp_lock);
+ nrservs -= pool->sp_nrthreads;
+@@ -836,7 +814,28 @@ svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrser
+ return svc_stop_kthreads(serv, pool, nrservs);
+ return 0;
+ }
+-EXPORT_SYMBOL_GPL(svc_set_num_threads_sync);
++EXPORT_SYMBOL_GPL(svc_set_num_threads);
++
++/**
++ * svc_rqst_replace_page - Replace one page in rq_pages[]
++ * @rqstp: svc_rqst with pages to replace
++ * @page: replacement page
++ *
++ * When replacing a page in rq_pages, batch the release of the
++ * replaced pages to avoid hammering the page allocator.
++ */
++void svc_rqst_replace_page(struct svc_rqst *rqstp, struct page *page)
++{
++ if (*rqstp->rq_next_page) {
++ if (!pagevec_space(&rqstp->rq_pvec))
++ __pagevec_release(&rqstp->rq_pvec);
++ pagevec_add(&rqstp->rq_pvec, *rqstp->rq_next_page);
++ }
++
++ get_page(page);
++ *(rqstp->rq_next_page++) = page;
++}
++EXPORT_SYMBOL_GPL(svc_rqst_replace_page);
+
+ /*
+ * Called from a server thread as it's exiting. Caller must hold the "service
+@@ -846,6 +845,7 @@ void
+ svc_rqst_free(struct svc_rqst *rqstp)
+ {
+ svc_release_buffer(rqstp);
++ put_page(rqstp->rq_scratch_page);
+ kfree(rqstp->rq_resp);
+ kfree(rqstp->rq_argp);
+ kfree(rqstp->rq_auth_data);
+@@ -865,11 +865,14 @@ svc_exit_thread(struct svc_rqst *rqstp)
+ list_del_rcu(&rqstp->rq_all);
+ spin_unlock_bh(&pool->sp_lock);
+
++ spin_lock_bh(&serv->sv_lock);
++ serv->sv_nrthreads -= 1;
++ spin_unlock_bh(&serv->sv_lock);
++ svc_sock_update_bufs(serv);
++
+ svc_rqst_free(rqstp);
+
+- /* Release the server */
+- if (serv)
+- svc_destroy(serv);
++ svc_put(serv);
+ }
+ EXPORT_SYMBOL_GPL(svc_exit_thread);
+
+@@ -1161,22 +1164,6 @@ void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...)
+ static __printf(2,3) void svc_printk(struct svc_rqst *rqstp, const char *fmt, ...) {}
+ #endif
+
+-__be32
+-svc_return_autherr(struct svc_rqst *rqstp, __be32 auth_err)
+-{
+- set_bit(RQ_AUTHERR, &rqstp->rq_flags);
+- return auth_err;
+-}
+-EXPORT_SYMBOL_GPL(svc_return_autherr);
+-
+-static __be32
+-svc_get_autherr(struct svc_rqst *rqstp, __be32 *statp)
+-{
+- if (test_and_clear_bit(RQ_AUTHERR, &rqstp->rq_flags))
+- return *statp;
+- return rpc_auth_ok;
+-}
+-
+ static int
+ svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+ {
+@@ -1200,7 +1187,7 @@ svc_generic_dispatch(struct svc_rqst *rqstp, __be32 *statp)
+ test_bit(RQ_DROPME, &rqstp->rq_flags))
+ return 0;
+
+- if (test_bit(RQ_AUTHERR, &rqstp->rq_flags))
++ if (rqstp->rq_auth_stat != rpc_auth_ok)
+ return 1;
+
+ if (*statp != rpc_success)
+@@ -1250,7 +1237,7 @@ svc_generic_init_request(struct svc_rqst *rqstp,
+ rqstp->rq_procinfo = procp = &versp->vs_proc[rqstp->rq_proc];
+
+ /* Initialize storage for argp and resp */
+- memset(rqstp->rq_argp, 0, procp->pc_argsize);
++ memset(rqstp->rq_argp, 0, procp->pc_argzero);
+ memset(rqstp->rq_resp, 0, procp->pc_ressize);
+
+ /* Bump per-procedure stats counter */
+@@ -1279,7 +1266,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+ struct svc_process_info process;
+ __be32 *statp;
+ u32 prog, vers;
+- __be32 auth_stat, rpc_stat;
++ __be32 rpc_stat;
+ int auth_res;
+ __be32 *reply_statp;
+
+@@ -1322,14 +1309,12 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+ * We do this before anything else in order to get a decent
+ * auth verifier.
+ */
+- auth_res = svc_authenticate(rqstp, &auth_stat);
++ auth_res = svc_authenticate(rqstp);
+ /* Also give the program a chance to reject this call: */
+- if (auth_res == SVC_OK && progp) {
+- auth_stat = rpc_autherr_badcred;
++ if (auth_res == SVC_OK && progp)
+ auth_res = progp->pg_authenticate(rqstp);
+- }
+ if (auth_res != SVC_OK)
+- trace_svc_authenticate(rqstp, auth_res, auth_stat);
++ trace_svc_authenticate(rqstp, auth_res);
+ switch (auth_res) {
+ case SVC_OK:
+ break;
+@@ -1388,15 +1373,15 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+ goto release_dropit;
+ if (*statp == rpc_garbage_args)
+ goto err_garbage;
+- auth_stat = svc_get_autherr(rqstp, statp);
+- if (auth_stat != rpc_auth_ok)
+- goto err_release_bad_auth;
+ } else {
+ dprintk("svc: calling dispatcher\n");
+ if (!process.dispatch(rqstp, statp))
+ goto release_dropit; /* Release reply info */
+ }
+
++ if (rqstp->rq_auth_stat != rpc_auth_ok)
++ goto err_release_bad_auth;
++
+ /* Check RPC status result */
+ if (*statp != rpc_success)
+ resv->iov_len = ((void*)statp) - resv->iov_base + 4;
+@@ -1425,7 +1410,7 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+ svc_authorise(rqstp);
+ close_xprt:
+ if (rqstp->rq_xprt && test_bit(XPT_TEMP, &rqstp->rq_xprt->xpt_flags))
+- svc_close_xprt(rqstp->rq_xprt);
++ svc_xprt_close(rqstp->rq_xprt);
+ dprintk("svc: svc_process close\n");
+ return 0;
+
+@@ -1446,13 +1431,14 @@ svc_process_common(struct svc_rqst *rqstp, struct kvec *argv, struct kvec *resv)
+ if (procp->pc_release)
+ procp->pc_release(rqstp);
+ err_bad_auth:
+- dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat));
++ dprintk("svc: authentication failed (%d)\n",
++ be32_to_cpu(rqstp->rq_auth_stat));
+ serv->sv_stats->rpcbadauth++;
+ /* Restore write pointer to location of accept status: */
+ xdr_ressize_check(rqstp, reply_statp);
+ svc_putnl(resv, 1); /* REJECT */
+ svc_putnl(resv, 1); /* AUTH_ERROR */
+- svc_putnl(resv, ntohl(auth_stat)); /* status */
++ svc_putu32(resv, rqstp->rq_auth_stat); /* status */
+ goto sendit;
+
+ err_bad_prog:
+@@ -1626,7 +1612,7 @@ u32 svc_max_payload(const struct svc_rqst *rqstp)
+ EXPORT_SYMBOL_GPL(svc_max_payload);
+
+ /**
+- * svc_encode_read_payload - mark a range of bytes as a READ payload
++ * svc_encode_result_payload - mark a range of bytes as a result payload
+ * @rqstp: svc_rqst to operate on
+ * @offset: payload's byte offset in rqstp->rq_res
+ * @length: size of payload, in bytes
+@@ -1634,26 +1620,28 @@ EXPORT_SYMBOL_GPL(svc_max_payload);
+ * Returns zero on success, or a negative errno if a permanent
+ * error occurred.
+ */
+-int svc_encode_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+- unsigned int length)
++int svc_encode_result_payload(struct svc_rqst *rqstp, unsigned int offset,
++ unsigned int length)
+ {
+- return rqstp->rq_xprt->xpt_ops->xpo_read_payload(rqstp, offset, length);
++ return rqstp->rq_xprt->xpt_ops->xpo_result_payload(rqstp, offset,
++ length);
+ }
+-EXPORT_SYMBOL_GPL(svc_encode_read_payload);
++EXPORT_SYMBOL_GPL(svc_encode_result_payload);
+
+ /**
+ * svc_fill_write_vector - Construct data argument for VFS write call
+ * @rqstp: svc_rqst to operate on
+- * @pages: list of pages containing data payload
+- * @first: buffer containing first section of write payload
+- * @total: total number of bytes of write payload
++ * @payload: xdr_buf containing only the write data payload
+ *
+ * Fills in rqstp::rq_vec, and returns the number of elements.
+ */
+-unsigned int svc_fill_write_vector(struct svc_rqst *rqstp, struct page **pages,
+- struct kvec *first, size_t total)
++unsigned int svc_fill_write_vector(struct svc_rqst *rqstp,
++ struct xdr_buf *payload)
+ {
++ struct page **pages = payload->pages;
++ struct kvec *first = payload->head;
+ struct kvec *vec = rqstp->rq_vec;
++ size_t total = payload->len;
+ unsigned int i;
+
+ /* Some types of transport can present the write payload
+diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
+index 06e503466c32c..d1eacf3358b81 100644
+--- a/net/sunrpc/svc_xprt.c
++++ b/net/sunrpc/svc_xprt.c
+@@ -233,30 +233,35 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
+ return xprt;
+ }
+
+-/*
+- * svc_xprt_received conditionally queues the transport for processing
+- * by another thread. The caller must hold the XPT_BUSY bit and must
++/**
++ * svc_xprt_received - start next receiver thread
++ * @xprt: controlling transport
++ *
++ * The caller must hold the XPT_BUSY bit and must
+ * not thereafter touch transport data.
+ *
+ * Note: XPT_DATA only gets cleared when a read-attempt finds no (or
+ * insufficient) data.
+ */
+-static void svc_xprt_received(struct svc_xprt *xprt)
++void svc_xprt_received(struct svc_xprt *xprt)
+ {
+ if (!test_bit(XPT_BUSY, &xprt->xpt_flags)) {
+ WARN_ONCE(1, "xprt=0x%p already busy!", xprt);
+ return;
+ }
+
++ trace_svc_xprt_received(xprt);
++
+ /* As soon as we clear busy, the xprt could be closed and
+- * 'put', so we need a reference to call svc_enqueue_xprt with:
++ * 'put', so we need a reference to call svc_xprt_enqueue with:
+ */
+ svc_xprt_get(xprt);
+ smp_mb__before_atomic();
+ clear_bit(XPT_BUSY, &xprt->xpt_flags);
+- xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
++ svc_xprt_enqueue(xprt);
+ svc_xprt_put(xprt);
+ }
++EXPORT_SYMBOL_GPL(svc_xprt_received);
+
+ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
+ {
+@@ -267,7 +272,7 @@ void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new)
+ svc_xprt_received(new);
+ }
+
+-static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
++static int _svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
+ struct net *net, const int family,
+ const unsigned short port, int flags,
+ const struct cred *cred)
+@@ -303,21 +308,35 @@ static int _svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+ return -EPROTONOSUPPORT;
+ }
+
+-int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
++/**
++ * svc_xprt_create - Add a new listener to @serv
++ * @serv: target RPC service
++ * @xprt_name: transport class name
++ * @net: network namespace
++ * @family: network address family
++ * @port: listener port
++ * @flags: SVC_SOCK flags
++ * @cred: credential to bind to this transport
++ *
++ * Return values:
++ * %0: New listener added successfully
++ * %-EPROTONOSUPPORT: Requested transport type not supported
++ */
++int svc_xprt_create(struct svc_serv *serv, const char *xprt_name,
+ struct net *net, const int family,
+ const unsigned short port, int flags,
+ const struct cred *cred)
+ {
+ int err;
+
+- err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
++ err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
+ if (err == -EPROTONOSUPPORT) {
+ request_module("svc%s", xprt_name);
+- err = _svc_create_xprt(serv, xprt_name, net, family, port, flags, cred);
++ err = _svc_xprt_create(serv, xprt_name, net, family, port, flags, cred);
+ }
+ return err;
+ }
+-EXPORT_SYMBOL_GPL(svc_create_xprt);
++EXPORT_SYMBOL_GPL(svc_xprt_create);
+
+ /*
+ * Copy the local and remote xprt addresses to the rqstp structure
+@@ -393,6 +412,8 @@ static bool svc_xprt_ready(struct svc_xprt *xprt)
+ smp_rmb();
+ xpt_flags = READ_ONCE(xprt->xpt_flags);
+
++ if (xpt_flags & BIT(XPT_BUSY))
++ return false;
+ if (xpt_flags & (BIT(XPT_CONN) | BIT(XPT_CLOSE)))
+ return true;
+ if (xpt_flags & (BIT(XPT_DATA) | BIT(XPT_DEFERRED))) {
+@@ -405,7 +426,12 @@ static bool svc_xprt_ready(struct svc_xprt *xprt)
+ return false;
+ }
+
+-void svc_xprt_do_enqueue(struct svc_xprt *xprt)
++/**
++ * svc_xprt_enqueue - Queue a transport on an idle nfsd thread
++ * @xprt: transport with data pending
++ *
++ */
++void svc_xprt_enqueue(struct svc_xprt *xprt)
+ {
+ struct svc_pool *pool;
+ struct svc_rqst *rqstp = NULL;
+@@ -449,19 +475,6 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt)
+ put_cpu();
+ trace_svc_xprt_do_enqueue(xprt, rqstp);
+ }
+-EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue);
+-
+-/*
+- * Queue up a transport with data pending. If there are idle nfsd
+- * processes, wake 'em up.
+- *
+- */
+-void svc_xprt_enqueue(struct svc_xprt *xprt)
+-{
+- if (test_bit(XPT_BUSY, &xprt->xpt_flags))
+- return;
+- xprt->xpt_server->sv_ops->svo_enqueue_xprt(xprt);
+-}
+ EXPORT_SYMBOL_GPL(svc_xprt_enqueue);
+
+ /*
+@@ -520,6 +533,7 @@ static void svc_xprt_release(struct svc_rqst *rqstp)
+ kfree(rqstp->rq_deferred);
+ rqstp->rq_deferred = NULL;
+
++ pagevec_release(&rqstp->rq_pvec);
+ svc_free_res_pages(rqstp);
+ rqstp->rq_res.page_len = 0;
+ rqstp->rq_res.page_base = 0;
+@@ -646,6 +660,8 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
+ int pages;
+ int i;
+
++ pagevec_init(&rqstp->rq_pvec);
++
+ /* now allocate needed pages. If we get a failure, sleep briefly */
+ pages = (serv->sv_max_mesg + 2 * PAGE_SIZE) >> PAGE_SHIFT;
+ if (pages > RPCSVC_MAXPAGES) {
+@@ -658,13 +674,13 @@ static int svc_alloc_arg(struct svc_rqst *rqstp)
+ while (rqstp->rq_pages[i] == NULL) {
+ struct page *p = alloc_page(GFP_KERNEL);
+ if (!p) {
+- set_current_state(TASK_INTERRUPTIBLE);
+- if (signalled() || kthread_should_stop()) {
++ set_current_state(TASK_IDLE);
++ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ return -EINTR;
+ }
+- schedule_timeout(msecs_to_jiffies(500));
+ }
++ freezable_schedule_timeout(msecs_to_jiffies(500));
+ rqstp->rq_pages[i] = p;
+ }
+ rqstp->rq_page_end = &rqstp->rq_pages[i];
+@@ -697,7 +713,7 @@ rqst_should_sleep(struct svc_rqst *rqstp)
+ return false;
+
+ /* are we shutting down? */
+- if (signalled() || kthread_should_stop())
++ if (kthread_should_stop())
+ return false;
+
+ /* are we freezing? */
+@@ -719,18 +735,14 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
+ if (rqstp->rq_xprt)
+ goto out_found;
+
+- /*
+- * We have to be able to interrupt this wait
+- * to bring down the daemons ...
+- */
+- set_current_state(TASK_INTERRUPTIBLE);
++ set_current_state(TASK_IDLE);
+ smp_mb__before_atomic();
+ clear_bit(SP_CONGESTED, &pool->sp_flags);
+ clear_bit(RQ_BUSY, &rqstp->rq_flags);
+ smp_mb__after_atomic();
+
+ if (likely(rqst_should_sleep(rqstp)))
+- time_left = schedule_timeout(timeout);
++ time_left = freezable_schedule_timeout(timeout);
+ else
+ __set_current_state(TASK_RUNNING);
+
+@@ -745,7 +757,7 @@ static struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout)
+ if (!time_left)
+ atomic_long_inc(&pool->sp_stats.threads_timedout);
+
+- if (signalled() || kthread_should_stop())
++ if (kthread_should_stop())
+ return ERR_PTR(-EINTR);
+ return ERR_PTR(-EAGAIN);
+ out_found:
+@@ -844,7 +856,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
+ try_to_freeze();
+ cond_resched();
+ err = -EINTR;
+- if (signalled() || kthread_should_stop())
++ if (kthread_should_stop())
+ goto out;
+
+ xprt = svc_get_next_xprt(rqstp, timeout);
+@@ -1040,7 +1052,12 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
+ svc_xprt_put(xprt);
+ }
+
+-void svc_close_xprt(struct svc_xprt *xprt)
++/**
++ * svc_xprt_close - Close a client connection
++ * @xprt: transport to disconnect
++ *
++ */
++void svc_xprt_close(struct svc_xprt *xprt)
+ {
+ trace_svc_xprt_close(xprt);
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+@@ -1055,7 +1072,7 @@ void svc_close_xprt(struct svc_xprt *xprt)
+ */
+ svc_delete_xprt(xprt);
+ }
+-EXPORT_SYMBOL_GPL(svc_close_xprt);
++EXPORT_SYMBOL_GPL(svc_xprt_close);
+
+ static int svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net)
+ {
+@@ -1107,7 +1124,11 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
+ }
+ }
+
+-/*
++/**
++ * svc_xprt_destroy_all - Destroy transports associated with @serv
++ * @serv: RPC service to be shut down
++ * @net: target network namespace
++ *
+ * Server threads may still be running (especially in the case where the
+ * service is still running in other network namespaces).
+ *
+@@ -1119,7 +1140,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net)
+ * threads, we may need to wait a little while and then check again to
+ * see if they're done.
+ */
+-void svc_close_net(struct svc_serv *serv, struct net *net)
++void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net)
+ {
+ int delay = 0;
+
+@@ -1130,6 +1151,7 @@ void svc_close_net(struct svc_serv *serv, struct net *net)
+ msleep(delay++);
+ }
+ }
++EXPORT_SYMBOL_GPL(svc_xprt_destroy_all);
+
+ /*
+ * Handle defer and revisit of requests
+diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
+index 998b196b61767..5a8b8e03fdd42 100644
+--- a/net/sunrpc/svcauth.c
++++ b/net/sunrpc/svcauth.c
+@@ -59,12 +59,12 @@ svc_put_auth_ops(struct auth_ops *aops)
+ }
+
+ int
+-svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
++svc_authenticate(struct svc_rqst *rqstp)
+ {
+ rpc_authflavor_t flavor;
+ struct auth_ops *aops;
+
+- *authp = rpc_auth_ok;
++ rqstp->rq_auth_stat = rpc_auth_ok;
+
+ flavor = svc_getnl(&rqstp->rq_arg.head[0]);
+
+@@ -72,7 +72,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
+
+ aops = svc_get_auth_ops(flavor);
+ if (aops == NULL) {
+- *authp = rpc_autherr_badcred;
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+
+@@ -80,7 +80,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
+ init_svc_cred(&rqstp->rq_cred);
+
+ rqstp->rq_authop = aops;
+- return aops->accept(rqstp, authp);
++ return aops->accept(rqstp);
+ }
+ EXPORT_SYMBOL_GPL(svc_authenticate);
+
+diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
+index 60754a292589b..1868596259af5 100644
+--- a/net/sunrpc/svcauth_unix.c
++++ b/net/sunrpc/svcauth_unix.c
+@@ -699,8 +699,9 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
+
+ rqstp->rq_client = NULL;
+ if (rqstp->rq_proc == 0)
+- return SVC_OK;
++ goto out;
+
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ ipm = ip_map_cached_get(xprt);
+ if (ipm == NULL)
+ ipm = __ip_map_lookup(sn->ip_map_cache, rqstp->rq_server->sv_program->pg_class,
+@@ -737,13 +738,16 @@ svcauth_unix_set_client(struct svc_rqst *rqstp)
+ put_group_info(cred->cr_group_info);
+ cred->cr_group_info = gi;
+ }
++
++out:
++ rqstp->rq_auth_stat = rpc_auth_ok;
+ return SVC_OK;
+ }
+
+ EXPORT_SYMBOL_GPL(svcauth_unix_set_client);
+
+ static int
+-svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
++svcauth_null_accept(struct svc_rqst *rqstp)
+ {
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+@@ -754,12 +758,12 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
+
+ if (svc_getu32(argv) != 0) {
+ dprintk("svc: bad null cred\n");
+- *authp = rpc_autherr_badcred;
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+ if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
+ dprintk("svc: bad null verf\n");
+- *authp = rpc_autherr_badverf;
++ rqstp->rq_auth_stat = rpc_autherr_badverf;
+ return SVC_DENIED;
+ }
+
+@@ -803,7 +807,7 @@ struct auth_ops svcauth_null = {
+
+
+ static int
+-svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
++svcauth_unix_accept(struct svc_rqst *rqstp)
+ {
+ struct kvec *argv = &rqstp->rq_arg.head[0];
+ struct kvec *resv = &rqstp->rq_res.head[0];
+@@ -845,7 +849,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
+ }
+ groups_sort(cred->cr_group_info);
+ if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) {
+- *authp = rpc_autherr_badverf;
++ rqstp->rq_auth_stat = rpc_autherr_badverf;
+ return SVC_DENIED;
+ }
+
+@@ -857,7 +861,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
+ return SVC_OK;
+
+ badcred:
+- *authp = rpc_autherr_badcred;
++ rqstp->rq_auth_stat = rpc_autherr_badcred;
+ return SVC_DENIED;
+ }
+
+diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
+index 3d5ee042c5015..cb0cfcd8a8141 100644
+--- a/net/sunrpc/svcsock.c
++++ b/net/sunrpc/svcsock.c
+@@ -181,8 +181,8 @@ static void svc_set_cmsg_data(struct svc_rqst *rqstp, struct cmsghdr *cmh)
+ }
+ }
+
+-static int svc_sock_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+- unsigned int length)
++static int svc_sock_result_payload(struct svc_rqst *rqstp, unsigned int offset,
++ unsigned int length)
+ {
+ return 0;
+ }
+@@ -635,7 +635,7 @@ static const struct svc_xprt_ops svc_udp_ops = {
+ .xpo_create = svc_udp_create,
+ .xpo_recvfrom = svc_udp_recvfrom,
+ .xpo_sendto = svc_udp_sendto,
+- .xpo_read_payload = svc_sock_read_payload,
++ .xpo_result_payload = svc_sock_result_payload,
+ .xpo_release_rqst = svc_udp_release_rqst,
+ .xpo_detach = svc_sock_detach,
+ .xpo_free = svc_sock_free,
+@@ -1209,7 +1209,7 @@ static const struct svc_xprt_ops svc_tcp_ops = {
+ .xpo_create = svc_tcp_create,
+ .xpo_recvfrom = svc_tcp_recvfrom,
+ .xpo_sendto = svc_tcp_sendto,
+- .xpo_read_payload = svc_sock_read_payload,
++ .xpo_result_payload = svc_sock_result_payload,
+ .xpo_release_rqst = svc_tcp_release_rqst,
+ .xpo_detach = svc_tcp_sock_detach,
+ .xpo_free = svc_sock_free,
+@@ -1342,25 +1342,10 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
+ return svsk;
+ }
+
+-bool svc_alien_sock(struct net *net, int fd)
+-{
+- int err;
+- struct socket *sock = sockfd_lookup(fd, &err);
+- bool ret = false;
+-
+- if (!sock)
+- goto out;
+- if (sock_net(sock->sk) != net)
+- ret = true;
+- sockfd_put(sock);
+-out:
+- return ret;
+-}
+-EXPORT_SYMBOL_GPL(svc_alien_sock);
+-
+ /**
+ * svc_addsock - add a listener socket to an RPC service
+ * @serv: pointer to RPC service to which to add a new listener
++ * @net: caller's network namespace
+ * @fd: file descriptor of the new listener
+ * @name_return: pointer to buffer to fill in with name of listener
+ * @len: size of the buffer
+@@ -1370,8 +1355,8 @@ EXPORT_SYMBOL_GPL(svc_alien_sock);
+ * Name is terminated with '\n'. On error, returns a negative errno
+ * value.
+ */
+-int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
+- const size_t len, const struct cred *cred)
++int svc_addsock(struct svc_serv *serv, struct net *net, const int fd,
++ char *name_return, const size_t len, const struct cred *cred)
+ {
+ int err = 0;
+ struct socket *so = sockfd_lookup(fd, &err);
+@@ -1382,6 +1367,9 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return,
+
+ if (!so)
+ return err;
++ err = -EINVAL;
++ if (sock_net(so->sk) != net)
++ goto out;
+ err = -EAFNOSUPPORT;
+ if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6))
+ goto out;
+diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
+index d84bb5037bb5b..e2bd0cd391142 100644
+--- a/net/sunrpc/xdr.c
++++ b/net/sunrpc/xdr.c
+@@ -669,7 +669,7 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
+ struct kvec *iov = buf->head;
+ int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
+
+- xdr_set_scratch_buffer(xdr, NULL, 0);
++ xdr_reset_scratch_buffer(xdr);
+ BUG_ON(scratch_len < 0);
+ xdr->buf = buf;
+ xdr->iov = iov;
+@@ -691,7 +691,29 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
+ EXPORT_SYMBOL_GPL(xdr_init_encode);
+
+ /**
+- * xdr_commit_encode - Ensure all data is written to buffer
++ * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages
++ * @xdr: pointer to xdr_stream struct
++ * @buf: pointer to XDR buffer into which to encode data
++ * @pages: list of pages to decode into
++ * @rqst: pointer to controlling rpc_rqst, for debugging
++ *
++ */
++void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf,
++ struct page **pages, struct rpc_rqst *rqst)
++{
++ xdr_reset_scratch_buffer(xdr);
++
++ xdr->buf = buf;
++ xdr->page_ptr = pages;
++ xdr->iov = NULL;
++ xdr->p = page_address(*pages);
++ xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE);
++ xdr->rqst = rqst;
++}
++EXPORT_SYMBOL_GPL(xdr_init_encode_pages);
++
++/**
++ * __xdr_commit_encode - Ensure all data is written to buffer
+ * @xdr: pointer to xdr_stream
+ *
+ * We handle encoding across page boundaries by giving the caller a
+@@ -703,22 +725,25 @@ EXPORT_SYMBOL_GPL(xdr_init_encode);
+ * required at the end of encoding, or any other time when the xdr_buf
+ * data might be read.
+ */
+-inline void xdr_commit_encode(struct xdr_stream *xdr)
++void __xdr_commit_encode(struct xdr_stream *xdr)
+ {
+ int shift = xdr->scratch.iov_len;
+ void *page;
+
+- if (shift == 0)
+- return;
+ page = page_address(*xdr->page_ptr);
+ memcpy(xdr->scratch.iov_base, page, shift);
+ memmove(page, page + shift, (void *)xdr->p - page);
+- xdr->scratch.iov_len = 0;
++ xdr_reset_scratch_buffer(xdr);
+ }
+-EXPORT_SYMBOL_GPL(xdr_commit_encode);
++EXPORT_SYMBOL_GPL(__xdr_commit_encode);
+
+-static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
+- size_t nbytes)
++/*
++ * The buffer space to be reserved crosses the boundary between
++ * xdr->buf->head and xdr->buf->pages, or between two pages
++ * in xdr->buf->pages.
++ */
++static noinline __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
++ size_t nbytes)
+ {
+ __be32 *p;
+ int space_left;
+@@ -743,8 +768,7 @@ static __be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr,
+ * the "scratch" iov to track any temporarily unused fragment of
+ * space at the end of the previous buffer:
+ */
+- xdr->scratch.iov_base = xdr->p;
+- xdr->scratch.iov_len = frag1bytes;
++ xdr_set_scratch_buffer(xdr, xdr->p, frag1bytes);
+ p = page_address(*xdr->page_ptr);
+ /*
+ * Note this is where the next encode will start after we've
+@@ -1056,8 +1080,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p,
+ struct rpc_rqst *rqst)
+ {
+ xdr->buf = buf;
+- xdr->scratch.iov_base = NULL;
+- xdr->scratch.iov_len = 0;
++ xdr_reset_scratch_buffer(xdr);
+ xdr->nwords = XDR_QUADLEN(buf->len);
+ if (buf->head[0].iov_len != 0)
+ xdr_set_iov(xdr, buf->head, buf->len);
+@@ -1105,24 +1128,6 @@ static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
+ return p;
+ }
+
+-/**
+- * xdr_set_scratch_buffer - Attach a scratch buffer for decoding data.
+- * @xdr: pointer to xdr_stream struct
+- * @buf: pointer to an empty buffer
+- * @buflen: size of 'buf'
+- *
+- * The scratch buffer is used when decoding from an array of pages.
+- * If an xdr_inline_decode() call spans across page boundaries, then
+- * we copy the data into the scratch buffer in order to allow linear
+- * access.
+- */
+-void xdr_set_scratch_buffer(struct xdr_stream *xdr, void *buf, size_t buflen)
+-{
+- xdr->scratch.iov_base = buf;
+- xdr->scratch.iov_len = buflen;
+-}
+-EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
+-
+ static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
+ {
+ __be32 *p;
+@@ -1432,6 +1437,51 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
+ }
+ EXPORT_SYMBOL_GPL(xdr_buf_subsegment);
+
++/**
++ * xdr_stream_subsegment - set @subbuf to a portion of @xdr
++ * @xdr: an xdr_stream set up for decoding
++ * @subbuf: the result buffer
++ * @nbytes: length of @xdr to extract, in bytes
++ *
++ * Sets up @subbuf to represent a portion of @xdr. The portion
++ * starts at the current offset in @xdr, and extends for a length
++ * of @nbytes. If this is successful, @xdr is advanced to the next
++ * position following that portion.
++ *
++ * Return values:
++ * %true: @subbuf has been initialized, and @xdr has been advanced.
++ * %false: a bounds error has occurred
++ */
++bool xdr_stream_subsegment(struct xdr_stream *xdr, struct xdr_buf *subbuf,
++ unsigned int nbytes)
++{
++ unsigned int remaining, offset, len;
++
++ if (xdr_buf_subsegment(xdr->buf, subbuf, xdr_stream_pos(xdr), nbytes))
++ return false;
++
++ if (subbuf->head[0].iov_len)
++ if (!__xdr_inline_decode(xdr, subbuf->head[0].iov_len))
++ return false;
++
++ remaining = subbuf->page_len;
++ offset = subbuf->page_base;
++ while (remaining) {
++ len = min_t(unsigned int, remaining, PAGE_SIZE) - offset;
++
++ if (xdr->p == xdr->end && !xdr_set_next_buffer(xdr))
++ return false;
++ if (!__xdr_inline_decode(xdr, len))
++ return false;
++
++ remaining -= len;
++ offset = 0;
++ }
++
++ return true;
++}
++EXPORT_SYMBOL_GPL(xdr_stream_subsegment);
++
+ /**
+ * xdr_buf_trim - lop at most "len" bytes off the end of "buf"
+ * @buf: buf to be trimmed
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+index c5154bc38e129..feac8c26fb87d 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+@@ -186,7 +186,7 @@ static int xprt_rdma_bc_send_request(struct rpc_rqst *rqst)
+
+ ret = rpcrdma_bc_send_request(rdma, rqst);
+ if (ret == -ENOTCONN)
+- svc_close_xprt(sxprt);
++ svc_xprt_close(sxprt);
+ return ret;
+ }
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+index c3d588b149aaa..d6436c13d5c47 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+@@ -448,7 +448,6 @@ static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
+ * svc_rdma_encode_write_list - Encode RPC Reply's Write chunk list
+ * @rctxt: Reply context with information about the RPC Call
+ * @sctxt: Send context for the RPC Reply
+- * @length: size in bytes of the payload in the first Write chunk
+ *
+ * The client provides a Write chunk list in the Call message. Fill
+ * in the segments in the first Write chunk in the Reply's transport
+@@ -465,12 +464,12 @@ static ssize_t svc_rdma_encode_write_chunk(__be32 *src,
+ */
+ static ssize_t
+ svc_rdma_encode_write_list(const struct svc_rdma_recv_ctxt *rctxt,
+- struct svc_rdma_send_ctxt *sctxt,
+- unsigned int length)
++ struct svc_rdma_send_ctxt *sctxt)
+ {
+ ssize_t len, ret;
+
+- ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt, length);
++ ret = svc_rdma_encode_write_chunk(rctxt->rc_write_list, sctxt,
++ rctxt->rc_read_payload_length);
+ if (ret < 0)
+ return ret;
+ len = ret;
+@@ -923,21 +922,12 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
+ goto err0;
+ if (wr_lst) {
+ /* XXX: Presume the client sent only one Write chunk */
+- unsigned long offset;
+- unsigned int length;
+-
+- if (rctxt->rc_read_payload_length) {
+- offset = rctxt->rc_read_payload_offset;
+- length = rctxt->rc_read_payload_length;
+- } else {
+- offset = xdr->head[0].iov_len;
+- length = xdr->page_len;
+- }
+- ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr, offset,
+- length);
++ ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr,
++ rctxt->rc_read_payload_offset,
++ rctxt->rc_read_payload_length);
+ if (ret < 0)
+ goto err2;
+- if (svc_rdma_encode_write_list(rctxt, sctxt, length) < 0)
++ if (svc_rdma_encode_write_list(rctxt, sctxt) < 0)
+ goto err0;
+ } else {
+ if (xdr_stream_encode_item_absent(&sctxt->sc_stream) < 0)
+@@ -979,19 +969,19 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
+ }
+
+ /**
+- * svc_rdma_read_payload - special processing for a READ payload
++ * svc_rdma_result_payload - special processing for a result payload
+ * @rqstp: svc_rqst to operate on
+ * @offset: payload's byte offset in @xdr
+ * @length: size of payload, in bytes
+ *
+ * Returns zero on success.
+ *
+- * For the moment, just record the xdr_buf location of the READ
++ * For the moment, just record the xdr_buf location of the result
+ * payload. svc_rdma_sendto will use that location later when
+ * we actually send the payload.
+ */
+-int svc_rdma_read_payload(struct svc_rqst *rqstp, unsigned int offset,
+- unsigned int length)
++int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
++ unsigned int length)
+ {
+ struct svc_rdma_recv_ctxt *rctxt = rqstp->rq_xprt_ctxt;
+
+diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+index 5f7e3d12523fe..c895f80df659c 100644
+--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
++++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
+@@ -80,7 +80,7 @@ static const struct svc_xprt_ops svc_rdma_ops = {
+ .xpo_create = svc_rdma_create,
+ .xpo_recvfrom = svc_rdma_recvfrom,
+ .xpo_sendto = svc_rdma_sendto,
+- .xpo_read_payload = svc_rdma_read_payload,
++ .xpo_result_payload = svc_rdma_result_payload,
+ .xpo_release_rqst = svc_rdma_release_rqst,
+ .xpo_detach = svc_rdma_detach,
+ .xpo_free = svc_rdma_free,
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index 3ab726a668e8a..405bf3e6eb796 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -959,7 +959,7 @@ static struct sock *unix_find_other(struct net *net,
+ if (err)
+ goto fail;
+ inode = d_backing_inode(path.dentry);
+- err = inode_permission(inode, MAY_WRITE);
++ err = path_permission(&path, MAY_WRITE);
+ if (err)
+ goto put_fail;
+
+diff --git a/tools/objtool/check.c b/tools/objtool/check.c
+index 059b78d08f7af..0506a48f124c2 100644
+--- a/tools/objtool/check.c
++++ b/tools/objtool/check.c
+@@ -168,8 +168,9 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func,
+ "panic",
+ "do_exit",
+ "do_task_dead",
++ "kthread_exit",
+ "make_task_dead",
+- "__module_put_and_exit",
++ "__module_put_and_kthread_exit",
+ "complete_and_exit",
+ "__reiserfs_panic",
+ "lbug_with_loc",