commit:     af04b07db818faf519780e2e77f637a6419c5ab6
Author:     Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
AuthorDate: Sat Jul 31 10:33:59 2021 +0000
Commit:     Alice Ferrazzi <alicef <AT> gentoo <DOT> org>
CommitDate: Sat Jul 31 10:34:09 2021 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=af04b07d

Linux patch 4.19.200

Signed-off-by: Alice Ferrazzi <alicef <AT> gentoo.org>

 0000_README               |   4 +
 1199_linux-4.19.200.patch | 893 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 897 insertions(+)

diff --git a/0000_README b/0000_README
index f1619e0..58e7859 100644
--- a/0000_README
+++ b/0000_README
@@ -835,6 +835,10 @@ Patch:  1198_linux-4.19.199.patch
 From:   https://www.kernel.org
 Desc:   Linux 4.19.199
 
+Patch:  1199_linux-4.19.200.patch
+From:   https://www.kernel.org
+Desc:   Linux 4.19.200
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.

diff --git a/1199_linux-4.19.200.patch b/1199_linux-4.19.200.patch
new file mode 100644
index 0000000..da6c8c4
--- /dev/null
+++ b/1199_linux-4.19.200.patch
@@ -0,0 +1,893 @@
+diff --git a/Makefile b/Makefile
+index f3ad63a089a18..a4ea351c4e5d6 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 4
+ PATCHLEVEL = 19
+-SUBLEVEL = 199
++SUBLEVEL = 200
+ EXTRAVERSION =
+ NAME = "People's Front"
+ 
+diff --git a/arch/arm/boot/dts/versatile-ab.dts b/arch/arm/boot/dts/versatile-ab.dts
+index 6f4f60ba5429c..990b7ef1800e4 100644
+--- a/arch/arm/boot/dts/versatile-ab.dts
++++ b/arch/arm/boot/dts/versatile-ab.dts
+@@ -192,16 +192,15 @@
+               #size-cells = <1>;
+               ranges;
+ 
+-              vic: intc@10140000 {
++              vic: interrupt-controller@10140000 {
+                       compatible = "arm,versatile-vic";
+                       interrupt-controller;
+                       #interrupt-cells = <1>;
+                       reg = <0x10140000 0x1000>;
+-                      clear-mask = <0xffffffff>;
+                       valid-mask = <0xffffffff>;
+               };
+ 
+-              sic: intc@10003000 {
++              sic: interrupt-controller@10003000 {
+                       compatible = "arm,versatile-sic";
+                       interrupt-controller;
+                       #interrupt-cells = <1>;
+diff --git a/arch/arm/boot/dts/versatile-pb.dts b/arch/arm/boot/dts/versatile-pb.dts
+index 06a0fdf24026c..e7e751a858d81 100644
+--- a/arch/arm/boot/dts/versatile-pb.dts
++++ b/arch/arm/boot/dts/versatile-pb.dts
+@@ -7,7 +7,7 @@
+ 
+       amba {
+               /* The Versatile PB is using more SIC IRQ lines than the AB */
+-              sic: intc@10003000 {
++              sic: interrupt-controller@10003000 {
+                       clear-mask = <0xffffffff>;
+                       /*
+                        * Valid interrupt lines mask according to
+diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
+index 43fb4e296d8de..9cfc669b4a243 100644
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -416,8 +416,6 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
+ 
+       if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
+       queue:
+-              if (has_error && !is_protmode(vcpu))
+-                      has_error = false;
+               if (reinject) {
+                       /*
+                        * On vmentry, vcpu->arch.exception.pending is only
+@@ -7114,6 +7112,13 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+       kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+ }
+ 
++static void kvm_inject_exception(struct kvm_vcpu *vcpu)
++{
++       if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
++               vcpu->arch.exception.error_code = false;
++       kvm_x86_ops->queue_exception(vcpu);
++}
++
+ static int inject_pending_event(struct kvm_vcpu *vcpu)
+ {
+       int r;
+@@ -7121,7 +7126,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
+       /* try to reinject previous events if any */
+ 
+       if (vcpu->arch.exception.injected)
+-              kvm_x86_ops->queue_exception(vcpu);
++              kvm_inject_exception(vcpu);
+       /*
+        * Do not inject an NMI or interrupt if there is a pending
+        * exception.  Exceptions and interrupts are recognized at
+@@ -7175,7 +7180,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu)
+                       kvm_update_dr7(vcpu);
+               }
+ 
+-              kvm_x86_ops->queue_exception(vcpu);
++              kvm_inject_exception(vcpu);
+       }
+ 
+       /* Don't consider new event if we re-injected an event */
+diff --git a/drivers/firmware/arm_scmi/driver.c b/drivers/firmware/arm_scmi/driver.c
+index effc4c17e0fb9..af5139eb96b5d 100644
+--- a/drivers/firmware/arm_scmi/driver.c
++++ b/drivers/firmware/arm_scmi/driver.c
+@@ -48,7 +48,6 @@ enum scmi_error_codes {
+       SCMI_ERR_GENERIC = -8,  /* Generic Error */
+       SCMI_ERR_HARDWARE = -9, /* Hardware Error */
+       SCMI_ERR_PROTOCOL = -10,/* Protocol Error */
+-      SCMI_ERR_MAX
+ };
+ 
+ /* List of all SCMI devices active in system */
+@@ -168,8 +167,10 @@ static const int scmi_linux_errmap[] = {
+ 
+ static inline int scmi_to_linux_errno(int errno)
+ {
+-      if (errno < SCMI_SUCCESS && errno > SCMI_ERR_MAX)
+-              return scmi_linux_errmap[-errno];
++      int err_idx = -errno;
++
++      if (err_idx >= SCMI_SUCCESS && err_idx < ARRAY_SIZE(scmi_linux_errmap))
++              return scmi_linux_errmap[err_idx];
+       return -EIO;
+ }
+ 
+@@ -628,8 +629,9 @@ static int scmi_xfer_info_init(struct scmi_info *sinfo)
+       struct scmi_xfers_info *info = &sinfo->minfo;
+ 
+       /* Pre-allocated messages, no more than what hdr.seq can support */
+-      if (WARN_ON(desc->max_msg >= MSG_TOKEN_MAX)) {
+-              dev_err(dev, "Maximum message of %d exceeds supported %ld\n",
++      if (WARN_ON(!desc->max_msg || desc->max_msg > MSG_TOKEN_MAX)) {
++              dev_err(dev,
++                      "Invalid maximum messages %d, not in range [1 - %lu]\n",
+                       desc->max_msg, MSG_TOKEN_MAX);
+               return -EINVAL;
+       }
+diff --git a/drivers/iio/dac/ds4424.c b/drivers/iio/dac/ds4424.c
+index 714a97f913199..ae9be792693bf 100644
+--- a/drivers/iio/dac/ds4424.c
++++ b/drivers/iio/dac/ds4424.c
+@@ -236,12 +236,6 @@ static int ds4424_probe(struct i2c_client *client,
+       indio_dev->dev.of_node = client->dev.of_node;
+       indio_dev->dev.parent = &client->dev;
+ 
+-      if (!client->dev.of_node) {
+-              dev_err(&client->dev,
+-                              "Not found DT.\n");
+-              return -ENODEV;
+-      }
+-
+       data->vcc_reg = devm_regulator_get(&client->dev, "vcc");
+       if (IS_ERR(data->vcc_reg)) {
+               dev_err(&client->dev,
+diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
+index 5a14f518cd979..61955a7c838b4 100644
+--- a/fs/cifs/smb2ops.c
++++ b/fs/cifs/smb2ops.c
+@@ -386,8 +386,8 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
+       p = buf;
+       while (bytes_left >= sizeof(*p)) {
+               info->speed = le64_to_cpu(p->LinkSpeed);
+-              info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE);
+-              info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE);
++              info->rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
++              info->rss_capable = le32_to_cpu(p->Capability & RSS_CAPABLE) ? 1 : 0;
+ 
+               cifs_dbg(FYI, "%s: adding iface %zu\n", __func__, *iface_count);
+               cifs_dbg(FYI, "%s: speed %zu bps\n", __func__, info->speed);
+diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
+index 4af318fbda774..ef9498a6e88ac 100644
+--- a/fs/hfs/bfind.c
++++ b/fs/hfs/bfind.c
+@@ -25,7 +25,19 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
+       fd->key = ptr + tree->max_key_len + 2;
+       hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n",
+               tree->cnid, __builtin_return_address(0));
+-      mutex_lock(&tree->tree_lock);
++      switch (tree->cnid) {
++      case HFS_CAT_CNID:
++              mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX);
++              break;
++      case HFS_EXT_CNID:
++              mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX);
++              break;
++      case HFS_ATTR_CNID:
++              mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX);
++              break;
++      default:
++              return -EINVAL;
++      }
+       return 0;
+ }
+ 
+diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
+index b63a4df7327b6..c0a73a6ffb28b 100644
+--- a/fs/hfs/bnode.c
++++ b/fs/hfs/bnode.c
+@@ -15,16 +15,31 @@
+ 
+ #include "btree.h"
+ 
+-void hfs_bnode_read(struct hfs_bnode *node, void *buf,
+-              int off, int len)
++void hfs_bnode_read(struct hfs_bnode *node, void *buf, int off, int len)
+ {
+       struct page *page;
++      int pagenum;
++      int bytes_read;
++      int bytes_to_read;
++      void *vaddr;
+ 
+       off += node->page_offset;
+-      page = node->page[0];
++      pagenum = off >> PAGE_SHIFT;
++      off &= ~PAGE_MASK; /* compute page offset for the first page */
+ 
+-      memcpy(buf, kmap(page) + off, len);
+-      kunmap(page);
++      for (bytes_read = 0; bytes_read < len; bytes_read += bytes_to_read) {
++              if (pagenum >= node->tree->pages_per_bnode)
++                      break;
++              page = node->page[pagenum];
++              bytes_to_read = min_t(int, len - bytes_read, PAGE_SIZE - off);
++
++              vaddr = kmap_atomic(page);
++              memcpy(buf + bytes_read, vaddr + off, bytes_to_read);
++              kunmap_atomic(vaddr);
++
++              pagenum++;
++              off = 0; /* page offset only applies to the first page */
++      }
+ }
+ 
+ u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
+diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
+index dcc2aab1b2c43..25ac9a8bb57a7 100644
+--- a/fs/hfs/btree.h
++++ b/fs/hfs/btree.h
+@@ -13,6 +13,13 @@ typedef int (*btree_keycmp)(const btree_key *, const btree_key *);
+ 
+ #define NODE_HASH_SIZE  256
+ 
++/* B-tree mutex nested subclasses */
++enum hfs_btree_mutex_classes {
++      CATALOG_BTREE_MUTEX,
++      EXTENTS_BTREE_MUTEX,
++      ATTR_BTREE_MUTEX,
++};
++
+ /* A HFS BTree held in memory */
+ struct hfs_btree {
+       struct super_block *sb;
+diff --git a/fs/hfs/super.c b/fs/hfs/super.c
+index 173876782f73f..77b6f35a4aa93 100644
+--- a/fs/hfs/super.c
++++ b/fs/hfs/super.c
+@@ -427,14 +427,12 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
+       if (!res) {
+               if (fd.entrylength > sizeof(rec) || fd.entrylength < 0) {
+                       res =  -EIO;
+-                      goto bail;
++                      goto bail_hfs_find;
+               }
+               hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, fd.entrylength);
+       }
+-      if (res) {
+-              hfs_find_exit(&fd);
+-              goto bail_no_root;
+-      }
++      if (res)
++              goto bail_hfs_find;
+       res = -EINVAL;
+       root_inode = hfs_iget(sb, &fd.search_key->cat, &rec);
+       hfs_find_exit(&fd);
+@@ -450,6 +448,8 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent)
+       /* everything's okay */
+       return 0;
+ 
++bail_hfs_find:
++      hfs_find_exit(&fd);
+ bail_no_root:
+       pr_err("get root inode failed\n");
+ bail:
+diff --git a/include/net/af_unix.h b/include/net/af_unix.h
+index a5ba41b3b8673..7ec1cdb66be8d 100644
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -10,6 +10,7 @@
+ 
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
++void unix_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+ void wait_for_unix_gc(void);
+ struct sock *unix_get_socket(struct file *filp);
+diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h
+index cf8f792743ec2..c76a5e9894dac 100644
+--- a/include/net/busy_poll.h
++++ b/include/net/busy_poll.h
+@@ -48,7 +48,7 @@ static inline bool net_busy_loop_on(void)
+ 
+ static inline bool sk_can_busy_loop(const struct sock *sk)
+ {
+-      return sk->sk_ll_usec && !signal_pending(current);
++      return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current);
+ }
+ 
+ bool sk_busy_loop_end(void *p, unsigned long start_time);
+diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
+index 48d74674d5e95..bc22e44ffcdf7 100644
+--- a/include/net/sctp/constants.h
++++ b/include/net/sctp/constants.h
+@@ -348,8 +348,7 @@ enum {
+ #define SCTP_SCOPE_POLICY_MAX SCTP_SCOPE_POLICY_LINK
+ 
+ /* Based on IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>,
+- * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24,
+- * 192.88.99.0/24.
++ * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 192.88.99.0/24.
+  * Also, RFC 8.4, non-unicast addresses are not considered valid SCTP
+  * addresses.
+  */
+@@ -357,7 +356,6 @@ enum {
+       ((htonl(INADDR_BROADCAST) == a) ||  \
+        ipv4_is_multicast(a) ||            \
+        ipv4_is_zeronet(a) ||              \
+-       ipv4_is_test_198(a) ||             \
+        ipv4_is_anycast_6to4(a))
+ 
+ /* Flags used for the bind address copy functions.  */
+diff --git a/kernel/workqueue.c b/kernel/workqueue.c
+index f278e2f584fd2..1573d1bf63007 100644
+--- a/kernel/workqueue.c
++++ b/kernel/workqueue.c
+@@ -3498,15 +3498,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
+                                                 unbound_release_work);
+       struct workqueue_struct *wq = pwq->wq;
+       struct worker_pool *pool = pwq->pool;
+-      bool is_last;
++      bool is_last = false;
+ 
+-      if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+-              return;
++      /*
++       * when @pwq is not linked, it doesn't hold any reference to the
++       * @wq, and @wq is invalid to access.
++       */
++      if (!list_empty(&pwq->pwqs_node)) {
++              if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
++                      return;
+ 
+-      mutex_lock(&wq->mutex);
+-      list_del_rcu(&pwq->pwqs_node);
+-      is_last = list_empty(&wq->pwqs);
+-      mutex_unlock(&wq->mutex);
++              mutex_lock(&wq->mutex);
++              list_del_rcu(&pwq->pwqs_node);
++              is_last = list_empty(&wq->pwqs);
++              mutex_unlock(&wq->mutex);
++      }
+ 
+       mutex_lock(&wq_pool_mutex);
+       put_unbound_pool(pool);
+diff --git a/net/802/garp.c b/net/802/garp.c
+index 7f50d47470bd4..8e19f51833d6f 100644
+--- a/net/802/garp.c
++++ b/net/802/garp.c
+@@ -206,6 +206,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr
+       kfree(attr);
+ }
+ 
++static void garp_attr_destroy_all(struct garp_applicant *app)
++{
++      struct rb_node *node, *next;
++      struct garp_attr *attr;
++
++      for (node = rb_first(&app->gid);
++           next = node ? rb_next(node) : NULL, node != NULL;
++           node = next) {
++              attr = rb_entry(node, struct garp_attr, node);
++              garp_attr_destroy(app, attr);
++      }
++}
++
+ static int garp_pdu_init(struct garp_applicant *app)
+ {
+       struct sk_buff *skb;
+@@ -612,6 +625,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl
+ 
+       spin_lock_bh(&app->lock);
+       garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU);
++      garp_attr_destroy_all(app);
+       garp_pdu_queue(app);
+       spin_unlock_bh(&app->lock);
+ 
+diff --git a/net/802/mrp.c b/net/802/mrp.c
+index a808dd5bbb27a..32f87d458f054 100644
+--- a/net/802/mrp.c
++++ b/net/802/mrp.c
+@@ -295,6 +295,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr)
+       kfree(attr);
+ }
+ 
++static void mrp_attr_destroy_all(struct mrp_applicant *app)
++{
++      struct rb_node *node, *next;
++      struct mrp_attr *attr;
++
++      for (node = rb_first(&app->mad);
++           next = node ? rb_next(node) : NULL, node != NULL;
++           node = next) {
++              attr = rb_entry(node, struct mrp_attr, node);
++              mrp_attr_destroy(app, attr);
++      }
++}
++
+ static int mrp_pdu_init(struct mrp_applicant *app)
+ {
+       struct sk_buff *skb;
+@@ -898,6 +911,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
+ 
+       spin_lock_bh(&app->lock);
+       mrp_mad_event(app, MRP_EVENT_TX);
++      mrp_attr_destroy_all(app);
+       mrp_pdu_queue(app);
+       spin_unlock_bh(&app->lock);
+ 
+diff --git a/net/Makefile b/net/Makefile
+index bdaf53925acd5..449fc0b221f83 100644
+--- a/net/Makefile
++++ b/net/Makefile
+@@ -18,7 +18,7 @@ obj-$(CONFIG_NETFILTER)              += netfilter/
+ obj-$(CONFIG_INET)            += ipv4/
+ obj-$(CONFIG_TLS)             += tls/
+ obj-$(CONFIG_XFRM)            += xfrm/
+-obj-$(CONFIG_UNIX)            += unix/
++obj-$(CONFIG_UNIX_SCM)                += unix/
+ obj-$(CONFIG_NET)             += ipv6/
+ obj-$(CONFIG_BPFILTER)                += bpfilter/
+ obj-$(CONFIG_PACKET)          += packet/
+diff --git a/net/core/sock.c b/net/core/sock.c
+index e6cbe137cb6fc..956af38aa0d6e 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -989,7 +989,7 @@ set_rcvbuf:
+                       if (val < 0)
+                               ret = -EINVAL;
+                       else
+-                              sk->sk_ll_usec = val;
++                              WRITE_ONCE(sk->sk_ll_usec, val);
+               }
+               break;
+ #endif
+diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
+index dd51256582556..7207a9769f1a9 100644
+--- a/net/sctp/protocol.c
++++ b/net/sctp/protocol.c
+@@ -412,7 +412,8 @@ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr)
+               retval = SCTP_SCOPE_LINK;
+       } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) ||
+                  ipv4_is_private_172(addr->v4.sin_addr.s_addr) ||
+-                 ipv4_is_private_192(addr->v4.sin_addr.s_addr)) {
++                 ipv4_is_private_192(addr->v4.sin_addr.s_addr) ||
++                 ipv4_is_test_198(addr->v4.sin_addr.s_addr)) {
+               retval = SCTP_SCOPE_PRIVATE;
+       } else {
+               retval = SCTP_SCOPE_GLOBAL;
+diff --git a/net/unix/Kconfig b/net/unix/Kconfig
+index 8b31ab85d050f..3b9e450656a4d 100644
+--- a/net/unix/Kconfig
++++ b/net/unix/Kconfig
+@@ -19,6 +19,11 @@ config UNIX
+ 
+         Say Y unless you know what you are doing.
+ 
++config UNIX_SCM
++      bool
++      depends on UNIX
++      default y
++
+ config UNIX_DIAG
+       tristate "UNIX: socket monitoring interface"
+       depends on UNIX
+diff --git a/net/unix/Makefile b/net/unix/Makefile
+index ffd0a275c3a79..54e58cc4f9450 100644
+--- a/net/unix/Makefile
++++ b/net/unix/Makefile
+@@ -10,3 +10,5 @@ unix-$(CONFIG_SYSCTL)        += sysctl_net_unix.o
+ 
+ obj-$(CONFIG_UNIX_DIAG)       += unix_diag.o
+ unix_diag-y           := diag.o
++
++obj-$(CONFIG_UNIX_SCM)        += scm.o
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index 53fe5ada5a83a..98c253afa0db2 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -119,6 +119,8 @@
+ #include <linux/freezer.h>
+ #include <linux/file.h>
+ 
++#include "scm.h"
++
+ struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
+ EXPORT_SYMBOL_GPL(unix_socket_table);
+ DEFINE_SPINLOCK(unix_table_lock);
+@@ -1515,65 +1517,51 @@ out:
+       return err;
+ }
+ 
+-static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+-{
+-      int i;
+-
+-      scm->fp = UNIXCB(skb).fp;
+-      UNIXCB(skb).fp = NULL;
+-
+-      for (i = scm->fp->count-1; i >= 0; i--)
+-              unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+-}
+-
+-static void unix_destruct_scm(struct sk_buff *skb)
+-{
+-      struct scm_cookie scm;
+-      memset(&scm, 0, sizeof(scm));
+-      scm.pid  = UNIXCB(skb).pid;
+-      if (UNIXCB(skb).fp)
+-              unix_detach_fds(&scm, skb);
+-
+-      /* Alas, it calls VFS */
+-      /* So fscking what? fput() had been SMP-safe since the last Summer */
+-      scm_destroy(&scm);
+-      sock_wfree(skb);
+-}
+-
+-/*
+- * The "user->unix_inflight" variable is protected by the garbage
+- * collection lock, and we just read it locklessly here. If you go
+- * over the limit, there might be a tiny race in actually noticing
+- * it across threads. Tough.
+- */
+-static inline bool too_many_unix_fds(struct task_struct *p)
+-{
+-      struct user_struct *user = current_user();
+-
+-      if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+-              return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+-      return false;
+-}
+-
+-static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+-      int i;
+-
+-      if (too_many_unix_fds(current))
+-              return -ETOOMANYREFS;
++      scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+ 
+       /*
+-       * Need to duplicate file references for the sake of garbage
+-       * collection.  Otherwise a socket in the fps might become a
+-       * candidate for GC while the skb is not yet queued.
++       * Garbage collection of unix sockets starts by selecting a set of
++       * candidate sockets which have reference only from being in flight
++       * (total_refs == inflight_refs).  This condition is checked once during
++       * the candidate collection phase, and candidates are marked as such, so
++       * that non-candidates can later be ignored.  While inflight_refs is
++       * protected by unix_gc_lock, total_refs (file count) is not, hence this
++       * is an instantaneous decision.
++       *
++       * Once a candidate, however, the socket must not be reinstalled into a
++       * file descriptor while the garbage collection is in progress.
++       *
++       * If the above conditions are met, then the directed graph of
++       * candidates (*) does not change while unix_gc_lock is held.
++       *
++       * Any operations that changes the file count through file descriptors
++       * (dup, close, sendmsg) does not change the graph since candidates are
++       * not installed in fds.
++       *
++       * Dequeing a candidate via recvmsg would install it into an fd, but
++       * that takes unix_gc_lock to decrement the inflight count, so it's
++       * serialized with garbage collection.
++       *
++       * MSG_PEEK is special in that it does not change the inflight count,
++       * yet does install the socket into an fd.  The following lock/unlock
++       * pair is to ensure serialization with garbage collection.  It must be
++       * done between incrementing the file count and installing the file into
++       * an fd.
++       *
++       * If garbage collection starts after the barrier provided by the
++       * lock/unlock, then it will see the elevated refcount and not mark this
++       * as a candidate.  If a garbage collection is already in progress
++       * before the file count was incremented, then the lock/unlock pair will
++       * ensure that garbage collection is finished before progressing to
++       * installing the fd.
++       *
++       * (*) A -> B where B is on the queue of A or B is on the queue of C
++       * which is on the queue of listening socket A.
+        */
+-      UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+-      if (!UNIXCB(skb).fp)
+-              return -ENOMEM;
+-
+-      for (i = scm->fp->count - 1; i >= 0; i--)
+-              unix_inflight(scm->fp->user, scm->fp->fp[i]);
+-      return 0;
++      spin_lock(&unix_gc_lock);
++      spin_unlock(&unix_gc_lock);
+ }
+ 
+ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+@@ -2201,7 +2189,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
+               sk_peek_offset_fwd(sk, size);
+ 
+               if (UNIXCB(skb).fp)
+-                      scm.fp = scm_fp_dup(UNIXCB(skb).fp);
++                      unix_peek_fds(&scm, skb);
+       }
+       err = (flags & MSG_TRUNC) ? skb->len - skip : size;
+ 
+@@ -2442,7 +2430,7 @@ unlock:
+                       /* It is questionable, see note in unix_dgram_recvmsg.
+                        */
+                       if (UNIXCB(skb).fp)
+-                              scm.fp = scm_fp_dup(UNIXCB(skb).fp);
++                              unix_peek_fds(&scm, skb);
+ 
+                       sk_peek_offset_fwd(sk, chunk);
+ 
+diff --git a/net/unix/garbage.c b/net/unix/garbage.c
+index c36757e728442..8bbe1b8e4ff7f 100644
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -86,77 +86,13 @@
+ #include <net/scm.h>
+ #include <net/tcp_states.h>
+ 
++#include "scm.h"
++
+ /* Internal data structures and random procedures: */
+ 
+-static LIST_HEAD(gc_inflight_list);
+ static LIST_HEAD(gc_candidates);
+-static DEFINE_SPINLOCK(unix_gc_lock);
+ static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
+ 
+-unsigned int unix_tot_inflight;
+-
+-struct sock *unix_get_socket(struct file *filp)
+-{
+-      struct sock *u_sock = NULL;
+-      struct inode *inode = file_inode(filp);
+-
+-      /* Socket ? */
+-      if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+-              struct socket *sock = SOCKET_I(inode);
+-              struct sock *s = sock->sk;
+-
+-              /* PF_UNIX ? */
+-              if (s && sock->ops && sock->ops->family == PF_UNIX)
+-                      u_sock = s;
+-      }
+-      return u_sock;
+-}
+-
+-/* Keep the number of times in flight count for the file
+- * descriptor if it is for an AF_UNIX socket.
+- */
+-
+-void unix_inflight(struct user_struct *user, struct file *fp)
+-{
+-      struct sock *s = unix_get_socket(fp);
+-
+-      spin_lock(&unix_gc_lock);
+-
+-      if (s) {
+-              struct unix_sock *u = unix_sk(s);
+-
+-              if (atomic_long_inc_return(&u->inflight) == 1) {
+-                      BUG_ON(!list_empty(&u->link));
+-                      list_add_tail(&u->link, &gc_inflight_list);
+-              } else {
+-                      BUG_ON(list_empty(&u->link));
+-              }
+-              unix_tot_inflight++;
+-      }
+-      user->unix_inflight++;
+-      spin_unlock(&unix_gc_lock);
+-}
+-
+-void unix_notinflight(struct user_struct *user, struct file *fp)
+-{
+-      struct sock *s = unix_get_socket(fp);
+-
+-      spin_lock(&unix_gc_lock);
+-
+-      if (s) {
+-              struct unix_sock *u = unix_sk(s);
+-
+-              BUG_ON(!atomic_long_read(&u->inflight));
+-              BUG_ON(list_empty(&u->link));
+-
+-              if (atomic_long_dec_and_test(&u->inflight))
+-                      list_del_init(&u->link);
+-              unix_tot_inflight--;
+-      }
+-      user->unix_inflight--;
+-      spin_unlock(&unix_gc_lock);
+-}
+-
+ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+                         struct sk_buff_head *hitlist)
+ {
+diff --git a/net/unix/scm.c b/net/unix/scm.c
+new file mode 100644
+index 0000000000000..83413ade79838
+--- /dev/null
++++ b/net/unix/scm.c
+@@ -0,0 +1,148 @@
++// SPDX-License-Identifier: GPL-2.0
++#include <linux/module.h>
++#include <linux/kernel.h>
++#include <linux/string.h>
++#include <linux/socket.h>
++#include <linux/net.h>
++#include <linux/fs.h>
++#include <net/af_unix.h>
++#include <net/scm.h>
++#include <linux/init.h>
++
++#include "scm.h"
++
++unsigned int unix_tot_inflight;
++EXPORT_SYMBOL(unix_tot_inflight);
++
++LIST_HEAD(gc_inflight_list);
++EXPORT_SYMBOL(gc_inflight_list);
++
++DEFINE_SPINLOCK(unix_gc_lock);
++EXPORT_SYMBOL(unix_gc_lock);
++
++struct sock *unix_get_socket(struct file *filp)
++{
++      struct sock *u_sock = NULL;
++      struct inode *inode = file_inode(filp);
++
++      /* Socket ? */
++      if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
++              struct socket *sock = SOCKET_I(inode);
++              struct sock *s = sock->sk;
++
++              /* PF_UNIX ? */
++              if (s && sock->ops && sock->ops->family == PF_UNIX)
++                      u_sock = s;
++      }
++      return u_sock;
++}
++EXPORT_SYMBOL(unix_get_socket);
++
++/* Keep the number of times in flight count for the file
++ * descriptor if it is for an AF_UNIX socket.
++ */
++void unix_inflight(struct user_struct *user, struct file *fp)
++{
++      struct sock *s = unix_get_socket(fp);
++
++      spin_lock(&unix_gc_lock);
++
++      if (s) {
++              struct unix_sock *u = unix_sk(s);
++
++              if (atomic_long_inc_return(&u->inflight) == 1) {
++                      BUG_ON(!list_empty(&u->link));
++                      list_add_tail(&u->link, &gc_inflight_list);
++              } else {
++                      BUG_ON(list_empty(&u->link));
++              }
++              unix_tot_inflight++;
++      }
++      user->unix_inflight++;
++      spin_unlock(&unix_gc_lock);
++}
++
++void unix_notinflight(struct user_struct *user, struct file *fp)
++{
++      struct sock *s = unix_get_socket(fp);
++
++      spin_lock(&unix_gc_lock);
++
++      if (s) {
++              struct unix_sock *u = unix_sk(s);
++
++              BUG_ON(!atomic_long_read(&u->inflight));
++              BUG_ON(list_empty(&u->link));
++
++              if (atomic_long_dec_and_test(&u->inflight))
++                      list_del_init(&u->link);
++              unix_tot_inflight--;
++      }
++      user->unix_inflight--;
++      spin_unlock(&unix_gc_lock);
++}
++
++/*
++ * The "user->unix_inflight" variable is protected by the garbage
++ * collection lock, and we just read it locklessly here. If you go
++ * over the limit, there might be a tiny race in actually noticing
++ * it across threads. Tough.
++ */
++static inline bool too_many_unix_fds(struct task_struct *p)
++{
++      struct user_struct *user = current_user();
++
++      if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
++              return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
++      return false;
++}
++
++int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++{
++      int i;
++
++      if (too_many_unix_fds(current))
++              return -ETOOMANYREFS;
++
++      /*
++       * Need to duplicate file references for the sake of garbage
++       * collection.  Otherwise a socket in the fps might become a
++       * candidate for GC while the skb is not yet queued.
++       */
++      UNIXCB(skb).fp = scm_fp_dup(scm->fp);
++      if (!UNIXCB(skb).fp)
++              return -ENOMEM;
++
++      for (i = scm->fp->count - 1; i >= 0; i--)
++              unix_inflight(scm->fp->user, scm->fp->fp[i]);
++      return 0;
++}
++EXPORT_SYMBOL(unix_attach_fds);
++
++void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++{
++      int i;
++
++      scm->fp = UNIXCB(skb).fp;
++      UNIXCB(skb).fp = NULL;
++
++      for (i = scm->fp->count-1; i >= 0; i--)
++              unix_notinflight(scm->fp->user, scm->fp->fp[i]);
++}
++EXPORT_SYMBOL(unix_detach_fds);
++
++void unix_destruct_scm(struct sk_buff *skb)
++{
++      struct scm_cookie scm;
++
++      memset(&scm, 0, sizeof(scm));
++      scm.pid  = UNIXCB(skb).pid;
++      if (UNIXCB(skb).fp)
++              unix_detach_fds(&scm, skb);
++
++      /* Alas, it calls VFS */
++      /* So fscking what? fput() had been SMP-safe since the last Summer */
++      scm_destroy(&scm);
++      sock_wfree(skb);
++}
++EXPORT_SYMBOL(unix_destruct_scm);
+diff --git a/net/unix/scm.h b/net/unix/scm.h
+new file mode 100644
+index 0000000000000..5a255a477f160
+--- /dev/null
++++ b/net/unix/scm.h
+@@ -0,0 +1,10 @@
++#ifndef NET_UNIX_SCM_H
++#define NET_UNIX_SCM_H
++
++extern struct list_head gc_inflight_list;
++extern spinlock_t unix_gc_lock;
++
++int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
++void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
++
++#endif
+diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
+index 16d42b2de424e..1963440f67251 100644
+--- a/tools/testing/selftests/vm/userfaultfd.c
++++ b/tools/testing/selftests/vm/userfaultfd.c
+@@ -131,7 +131,7 @@ static void anon_allocate_area(void **alloc_area)
+ {
+       *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+                          MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+-      if (*alloc_area == MAP_FAILED)
++      if (*alloc_area == MAP_FAILED) {
+               fprintf(stderr, "mmap of anonymous memory failed");
+               *alloc_area = NULL;
+       }

Reply via email to