commit:     971b8702719cf2ed697a6017deeff1f2facc0080
Author:     Kerin Millar <kfm <AT> plushkava <DOT> net>
AuthorDate: Thu Aug  1 19:16:02 2024 +0000
Commit:     Sam James <sam <AT> gentoo <DOT> org>
CommitDate: Fri Aug  2 16:21:14 2024 +0000
URL:        
https://gitweb.gentoo.org/proj/gentoo-functions.git/commit/?id=971b8702

Render contains_all() and contains_any() faster

Re-implement the contains_all() and contains_any() functions in such a
way that they are faster than their forebears by an order of magnitude.
In order to achieve this level of performance, the value of IFS is no
longer taken into account. Instead, words are always presumed to be
separated by characters matching the [[:space:]] character class.

Consider a scenario in which the FEATURES variable is comprised of 33
words.

$ FEATURES="assume-digests binpkg-docompress binpkg-dostrip binpkg-logs
buildpkg buildpkg-live config-protect-if-modified distlocks ebuild-locks
fixlafiles ipc-sandbox merge-sync merge-wait multilib-strict
network-sandbox news parallel-fetch pid-sandbox pkgdir-index-trusted
preserve-libs protect-owned qa-unresolved-soname-deps sandbox sfperms
strict unknown-features-warn unmerge-logs unmerge-orphans userfetch
userpriv usersandbox usersync xattr"

Let's say that the contains_any function is used to search for 10 words,
where only the 10th can be matched and where FEATURES must be scanned in
its entirety exactly 10 times.

$ contains_any "$FEATURES" the quick brown fox jumped over the lazy hen xattr

The following benchmarks show how long it took to call the function
50,000 times consecutively on a system with an Apple M1 CPU for both the
original and new implementations. This is with the dash shell.

contains_any (BEFORE)

real    0m19.135s
user    0m16.781s
sys     0m2.258s

contains_any (AFTER)

real    0m1.571s
user    0m1.497s
sys     0m0.063s

Now let's say that the contains_all function is used to search for 3
words, where all can be matched while requiring for FEATURES to be
scanned in its entirety at least once.

$ contains_all "$FEATURES" assume-digests news xattr

Again, The following benchmarks show how long it took to call the
function 50,000 times consecutively.

contains_all (BEFORE)

real    1m8.052s
user    0m19.363s
sys     0m42.742s

contains_all (AFTER)

real    0m0.689s
user    0m0.627s
sys     0m0.057s

The performance improvements are similarly impressive if using bash.

Signed-off-by: Kerin Millar <kfm <AT> plushkava.net>

 functions.sh   | 138 ++++++++++++++++++---------------------------------------
 test-functions | 125 +++++++++++++--------------------------------------
 2 files changed, 74 insertions(+), 189 deletions(-)

diff --git a/functions.sh b/functions.sh
index 036e3a7..f120564 100644
--- a/functions.sh
+++ b/functions.sh
@@ -22,7 +22,7 @@
 # COLUMNS          : may be used by _update_columns() to get the column count
 # EPOCHREALTIME    : potentially used by _update_time() to get the time
 # GENFUN_MODULES   : which of the optional function collections must be sourced
-# IFS              : affects contains_all(), contains_any() and warn()
+# IFS              : warn() operands are joined by its first character
 # INVOCATION_ID    : used by from_unit()
 # PORTAGE_BIN_PATH : used by from_portage()
 # RC_OPENRC_PID    : used by from_runscript()
@@ -54,112 +54,60 @@ chdir()
 }
 
 #
-# Takes the first parameter as a string comprising zero or more words, composes
-# a set consisting of the intersection of those words, then determines whether
-# the intersection of the remaining parameters forms a subset thereof. The
-# words shall be collected by splitting the string into individual fields, in
-# accordance with section 2.6.5 of the Shell Command Language specification.
-# Therefore, the value of IFS shall be taken into account. If fewer than two
-# parameters are provided, or if the first parameter yields no fields, or if 
the
-# second set is disjoint from - or a superset of - the first, the return value
-# shall be greater than 0.
+# Considers the first parameter as a string comprising zero or more
+# whitespace-separated words then determines whether all of the remaining
+# parameters can be found within the resulting list in their capacity as
+# discrete words. If they cannot be, or if fewer than two parameters were 
given,
+# the return value shall be 1. Of the words to be searched for, any which are
+# empty or which contain whitespace characters shall be deemed unfindable.
 #
 contains_all()
 {
-       # shellcheck disable=2097,2098
-       [ "$#" -ge 2 ] &&
-       IFS=${IFS} awk -v ifs_set=${IFS+1} -f - -- "$@" <<-'EOF'
-       BEGIN {
-               haystack = ARGV[1]
-               argc = ARGC
-               ARGC = 1
-               if (ifs_set) {
-                       ifs = ENVIRON["IFS"]
-               } else {
-                       ifs = " \t\n"
-               }
-               # Translate IFS to FS, in accordance with section 2.6.5.
-               if (length(ifs) == 0) {
-                       FS = "^"
-               } else {
-                       fs = "("
-                       for (i = 1; i <= length(ifs); i++) {
-                               char = substr(ifs, i, 1)
-                               if (seen[char]++) {
-                                       continue
-                               } else if (char ~ /[ \t\n]/) {
-                                       whitespace = whitespace char
-                                       fs = fs "[" char "]+|"
-                               } else {
-                                       fs = fs "[" char "]|"
-                               }
-                       }
-                       sub(/\|$/, "", fs)
-                       FS = fs = fs ")"
-               }
-               # Leading whitespace characters must be removed.
-               if (length(whitespace) > 0) {
-                       sub("^[" whitespace "]+", "", haystack)
-               }
-               # In sh, fields are terminated, not separated.
-               sub(FS "$", "", haystack)
-               len = split(haystack, words)
-               for (i = 1; i <= len; i++) {
-                       set2[words[i]]
-               }
-               for (i = 2; i < argc; i++) {
-                       set1[ARGV[i]]
-               }
-               for (word in set2) {
-                       delete set1[word]
-               }
-               for (word in set1) {
-                       exit 1
-               }
-       }
-       EOF
+       local arg haystack
+
+       [ "$#" -gt 1 ] || return
+       haystack=" $1 "
+       shift
+       for arg; do
+               case ${arg} in
+                       ''|*[[:space:]]*)
+                               return 1
+               esac
+               case ${haystack} in
+                       *" ${arg} "*)
+                               ;;
+                       *)
+                               return 1
+               esac
+       done
 }
 
 #
-# Takes the first parameter as a string comprising zero or more words then
-# determines whether at least one of the remaining parameters can be matched
-# against any of those words. The words shall be collected by splitting the
-# string into individual fields, in accordance with section 2.6.5 of the Shell
-# Command Language specification. Therefore, the value of IFS shall be taken
-# into account. If fewer than two parameters are provided, or if the first
-# parameter yields no fields, or if none of the following parameters can be
-# matched, the return value shall be greater than 0.
+# Considers the first parameter as a string comprising zero or more
+# whitespace-separated words then determines whether any of the remaining
+# parameters can be found within the resulting list in their capacity as
+# discrete words. If none can be, or if no parameters were given, the return
+# value shall be greater than 0. Of the words to be searched for, any which are
+# empty or which contain whitespace characters shall be disregarded.
 #
 contains_any()
 {
-       local had_noglob haystack i item needle retval
+       local arg haystack
 
-       [ "$#" -ge 2 ] || return
-       haystack=$1
+       [ "$#" -gt 0 ] || return
+       haystack=" $1 "
        shift
-       i=0
-       case $- in
-               *f*)
-                       had_noglob=1
-                       ;;
-               *)
-                       had_noglob=0
-       esac
-       set -f
-       for needle; do
-               if [ "$(( i += 1 ))" -eq 1 ]; then
-                       # shellcheck disable=2086
-                       set -- ${haystack}
-               fi
-               for item; do
-                       [ "${item}" = "${needle}" ] && break 2
-               done
+       for arg; do
+               case ${arg} in
+                       ''|*[[:space:]]*)
+                               continue
+               esac
+               case ${haystack} in
+                       *" ${arg} "*)
+                               return 0
+               esac
        done
-       retval=$?
-       if [ "${had_noglob}" -eq 0 ]; then
-               set +f
-       fi
-       return "${retval}"
+       false
 }
 
 #

diff --git a/test-functions b/test-functions
index ef2aa98..f11234a 100755
--- a/test-functions
+++ b/test-functions
@@ -744,105 +744,42 @@ test_substr() {
 
 test_contains_all() {
        set -- \
-               ge  1  default  N/A           N/A         N/A         N/A  \
-               ge  1  default  ' foo  bar '  ''          N/A         N/A  \
-               ge  1  default  ' foo  bar '  ''          ' '         N/A  \
-               ge  1  default  ' foo  bar '  ''          ' bar'      N/A  \
-               ge  1  default  ' foo  bar '  ''          'foo '      N/A  \
-               ge  1  default  ' foo  bar '  ''          'foo  bar'  N/A  \
-               ge  1  default  ' foo  bar '  ' '         ''          N/A  \
-               ge  1  default  ' foo  bar '  ' '         ' '         N/A  \
-               ge  1  default  ' foo  bar '  ' '         N/A         N/A  \
-               ge  1  default  ' foo  bar '  ' bar'      ''          N/A  \
-               ge  1  default  ' foo  bar '  ' bar'      N/A         N/A  \
-               ge  1  default  ' foo  bar '  'foo '      ''          N/A  \
-               ge  1  default  ' foo  bar '  'foo '      ' bar'      N/A  \
-               ge  1  default  ' foo  bar '  'foo '      N/A         N/A  \
-               ge  1  default  ' foo  bar '  'foo  bar'  ''          N/A  \
-               ge  1  default  ' foo  bar '  'foo  bar'  N/A         N/A  \
-               ge  1  default  ' foo  bar '  N/A         N/A         N/A  \
-               ge  1  default  ' foo  bar '  bar         foo         ''   \
-               ge  1  default  ' foo  bar '  bar         foo         ' '  \
-               ge  1  default  ' foo  bar '  baz         bar         foo  \
-               ge  1  default  ' foo  bar '  fo          ba          N/A  \
-               ge  1  default  ' foo  bar '  foo         bar         ''   \
-               ge  1  default  ' foo  bar '  foo         bar         ' '  \
-               ge  1  default  ' foo  bar '  foo         bar         baz  \
-               ge  1  default  ' foo  bar '  o           a           N/A  \
-               ge  1  default  ' foo  bar '  oo          ar          N/A  \
-               eq  0  default  ' foo  bar '  foo         bar         N/A  \
-               eq  0  default  ' foo  bar '  bar         foo         N/A  \
-               ge  1  ', '     N/A           N/A         N/A         N/A  \
-               ge  1  ', '     ' foo  bar '  ''          N/A         N/A  \
-               ge  1  ', '     ' foo  bar '  ''          ' '         N/A  \
-               ge  1  ', '     ' foo ,bar, ' ''          ','         N/A  \
-               ge  1  ', '     ' foo  bar '  ''          ' bar'      N/A  \
-               ge  1  ', '     ' foo ,bar '  ''          ',bar'      N/A  \
-               ge  1  ', '     ' foo  bar '  ''          'foo '      N/A  \
-               ge  1  ', '     ' foo, bar '  ''          'foo,'      N/A  \
-               ge  1  ', '     ' foo  bar '  ''          'foo  bar'  N/A  \
-               ge  1  ', '     ' foo  bar '  ' '         ''          N/A  \
-               ge  1  ', '     ' foo,bar, '  ','         ''          N/A  \
-               ge  1  ', '     ' foo  bar '  ' '         ' '         N/A  \
-               ge  1  ', '     ',foo,,bar,'  ','         ','         N/A  \
-               ge  1  ', '     ' foo  bar '  ' '         N/A         N/A  \
-               ge  1  ', '     ',foo,,bar,'  ','         N/A         N/A  \
-               ge  1  ', '     ' foo  bar '  ' bar'      ''          N/A  \
-               ge  1  ', '     'foo,bar,'    ',bar'      ''          N/A  \
-               ge  1  ', '     ' foo  bar '  ' bar'      N/A         N/A  \
-               ge  1  ', '     ',foo,,bar,'  ',bar'      N/A         N/A  \
-               ge  1  ', '     ' foo  bar '  'foo '      ''          N/A  \
-               ge  1  ', '     'foo,bar,'    'foo,'      ''          N/A  \
-               ge  1  ', '     ' foo  bar '  'foo '      ' bar'      N/A  \
-               ge  1  ', '     ',foo,,bar,'  'foo,'      ',bar'      N/A  \
-               ge  1  ', '     ' foo  bar '  'foo '      N/A         N/A  \
-               ge  1  ', '     ',foo,,bar,'  'foo,'      N/A         N/A  \
-               ge  1  ', '     ' foo  bar '  'foo  bar'  ''          N/A  \
-               ge  1  ', '     'foo,bar,'    'foo,bar'   ''          N/A  \
-               ge  1  ', '     ' foo  bar '  'foo  bar'  N/A         N/A  \
-               ge  1  ', '     ',foo,,bar,'  'foo,,bar'  N/A         N/A  \
-               ge  1  ', '     ' foo  bar '  N/A         N/A         N/A  \
-               ge  1  ', '     ' foo  bar '  bar         foo         ''   \
-               ge  1  ', '     ' foo,bar  '  bar         foo         ''   \
-               ge  1  ', '     ' foo  bar '  bar         foo         ' '  \
-               ge  1  ', '     ' foo,,bar '  bar         foo         ','  \
-               ge  1  ', '     ' foo  bar '  baz         bar         foo  \
-               ge  1  ', '     ',foo,,bar,'  baz         bar         foo  \
-               ge  1  ', '     ' foo  bar '  fo          ba          N/A  \
-               ge  1  ', '     ',foo,,bar,'  fo          ba          N/A  \
-               ge  1  ', '     ' foo  bar '  foo         bar         ''   \
-               ge  1  ', '     ' foo,bar '   foo         bar         ''   \
-               ge  1  ', '     ' foo  bar '  foo         bar         ' '  \
-               ge  1  ', '     ' foo,,bar '  foo         bar         ','  \
-               ge  1  ', '     ' foo  bar '  foo         bar         baz  \
-               ge  1  ', '     ',foo,,bar,'  foo         bar         baz  \
-               ge  1  ', '     ' foo  bar '  o           a           N/A  \
-               ge  1  ', '     ',foo,,bar,'  o           a           N/A  \
-               ge  1  ', '     ' foo  bar '  oo          ar          N/A  \
-               ge  1  ', '     ',foo,,bar,'  oo          ar          N/A  \
-               eq  0  ', '     ' foo  bar '  foo         bar         N/A  \
-               eq  0  ', '     ',foo  bar,'  foo         bar         N/A  \
-               eq  0  ', '     ' foo,,bar '  foo         bar         N/A  \
-               eq  0  ', '     ',foo,,bar,'  foo         bar         N/A  \
-               eq  0  ', '     ' foo  bar '  bar         foo         N/A  \
-               eq  0  ', '     ',foo  bar,'  bar         foo         N/A  \
-               eq  0  ', '     ' foo,,bar '  bar         foo         N/A  \
-               eq  0  ', '     ',foo,,bar,'  bar         foo         N/A
+               ge  1  N/A           N/A         N/A         N/A  \
+               ge  1  ' foo  bar '  ''          N/A         N/A  \
+               ge  1  ' foo  bar '  ''          ' '         N/A  \
+               ge  1  ' foo  bar '  ''          ' bar'      N/A  \
+               ge  1  ' foo  bar '  ''          'foo '      N/A  \
+               ge  1  ' foo  bar '  ''          'foo  bar'  N/A  \
+               ge  1  ' foo  bar '  ' '         ''          N/A  \
+               ge  1  ' foo  bar '  ' '         ' '         N/A  \
+               ge  1  ' foo  bar '  ' '         N/A         N/A  \
+               ge  1  ' foo  bar '  ' bar'      ''          N/A  \
+               ge  1  ' foo  bar '  ' bar'      N/A         N/A  \
+               ge  1  ' foo  bar '  'foo '      ''          N/A  \
+               ge  1  ' foo  bar '  'foo '      ' bar'      N/A  \
+               ge  1  ' foo  bar '  'foo '      N/A         N/A  \
+               ge  1  ' foo  bar '  'foo  bar'  ''          N/A  \
+               ge  1  ' foo  bar '  'foo  bar'  N/A         N/A  \
+               ge  1  ' foo  bar '  N/A         N/A         N/A  \
+               ge  1  ' foo  bar '  bar         foo         ''   \
+               ge  1  ' foo  bar '  bar         foo         ' '  \
+               ge  1  ' foo  bar '  baz         bar         foo  \
+               ge  1  ' foo  bar '  fo          ba          N/A  \
+               ge  1  ' foo  bar '  foo         bar         ''   \
+               ge  1  ' foo  bar '  foo         bar         ' '  \
+               ge  1  ' foo  bar '  foo         bar         baz  \
+               ge  1  ' foo  bar '  o           a           N/A  \
+               ge  1  ' foo  bar '  oo          ar          N/A  \
+               eq  0  ' foo  bar '  foo         bar         N/A  \
+               eq  0  ' foo  bar '  bar         foo         N/A
 
        callback() {
                shift
-               ifs=$1
-               shift
-               if [ "${ifs}" = "default" ]; then
-                       test_description="contains_all $(quote_args "$@")"
-                       contains_all "$@"
-               else
-                       test_description="IFS='${ifs}' contains_all 
$(quote_args "$@")"
-                       IFS=${ifs} contains_all "$@"
-               fi
+               test_description="contains_all $(quote_args "$@")"
+               contains_all "$@"
        }
 
-       iterate_tests 7 "$@"
+       iterate_tests 6 "$@"
 }
 
 test_contains_any() {

Reply via email to