branch: externals/xr commit 19d52a7edbb5f22d471481cca8e6cca70a465641 Author: Mattias Engdegård <matti...@acm.org> Commit: Mattias Engdegård <matti...@acm.org>
Add branch subsumption check Look for alternatives containing a branch that matches a superset of another branch, like ".\\|a". This is wasteful, or indicates a mistake. --- xr-test.el | 32 +++++++++ xr.el | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 262 insertions(+), 2 deletions(-) diff --git a/xr-test.el b/xr-test.el index 99c85fa..a3219dd 100644 --- a/xr-test.el +++ b/xr-test.el @@ -378,6 +378,38 @@ '((2 . "Repetition of zero-width assertion") (5 . "Repetition of zero-width assertion") (13 . "Repetition of zero-width assertion")))) + (should (equal (xr-lint "a.cde*f?g\\|g\\|abcdefg") + '((14 . "Branch matches subset of a previous branch")))) + (should (equal (xr-lint "abcd\\|e\\|[aA].[^0-9z]d") + '((9 . "Branch matches superset of a previous branch")))) + (should (equal (xr-lint "\\(?:\\(a\\)\\|.\\)\\(?:a\\|\\(.\\)\\)") + '((21 . "Branch matches superset of a previous branch")))) + (should (equal (xr-lint ".\\|\n\\|\r") + '((6 . "Branch matches subset of a previous branch")))) + (should (equal (xr-lint "[^mM]\\|[^a-zA-Z]") + '((7 . "Branch matches subset of a previous branch")))) + (should (equal (xr-lint "[^mM]\\|[^A-LN-Z]") + nil)) + (should (equal (xr-lint "[ab]\\|[^bcd]") + nil)) + (should (equal (xr-lint "[ab]\\|[^cd]") + '((6 . "Branch matches superset of a previous branch")))) + (should (equal (xr-lint ".\\|[a\n]") + nil)) + (should (equal (xr-lint ".\\|[[:space:]\r]") + '((3 . "Branch matches subset of a previous branch")))) + (should (equal (xr-lint "ab?c+\\|a?b*c*") + '((7 . "Branch matches superset of a previous branch")))) + (should (equal (xr-lint "\\(?:[aA]\\|b\\)\\|a") + '((15 . "Branch matches subset of a previous branch")))) + (should (equal (xr-lint "\\(?:a\\|b\\)\\|[abc]") + '((12 . "Branch matches superset of a previous branch")))) + (should (equal (xr-lint "\\(?:a\\|b\\)\\|\\(?:[abd]\\|[abc]\\)") + '((12 . "Branch matches superset of a previous branch")))) + (should (equal (xr-lint "ab\\|abc?") + '((4 . "Branch matches superset of a previous branch")))) + (should (equal (xr-lint "abc\\|abcd*e?") + '((5 . "Branch matches superset of a previous branch")))) ) (ert-deftest xr-skip-set () diff --git a/xr.el b/xr.el index 28545a9..99239fb 100644 --- a/xr.el +++ b/xr.el @@ -667,6 +667,224 @@ UPPER may be nil, meaning infinity." (t (cons 'seq item-seq)))))) +(defun xr--range-string-to-items (str) + "Convert a string of ranges to a list of pairs of their endpoints." + (let ((len (length str)) + (ranges nil) + (i 0)) + (while (< i len) + (push (cons (aref str i) (aref str (+ i 2))) + ranges) + (setq i (+ i 3))) + ranges)) + +(defun xr--any-arg-to-items (arg) + "Convert an `any' argument to a list of characters, ranges (as pairs), +and classes (symbols)." + ;; We know (since we built it) that x is either a symbol or + ;; a string, and that the string does not mix ranges and chars. + (cond ((symbolp arg) (list arg)) + ((and (>= (length arg) 3) + (eq (aref arg 1) ?-)) + (xr--range-string-to-items arg)) + (t (string-to-list arg)))) + +(defun xr--any-item-superset-p (a b) + "Whether A is a superset of B, both being `any' items: a character, +a range (pair of chars), or a class (symbol)." + (cond + ((symbolp a) + (cond ((symbolp b) (eq a b)) + ((eq b ?\n) + (memq a '(alnum alpha blank digit graph + lower multibyte nonascii print punct space + upper word xdigit))))) + ((consp a) + (or (and (characterp b) + (<= (car a) b (cdr a))) + (and (consp b) + (<= (car a) (car b) (cdr b) (cdr a))))) + (t + (and (characterp b) (eq a b))))) + +(defun xr--any-item-may-intersect-p (a b) + "Whether A intersects B, both being `any' items: a character, +a range (pair of chars), or a class (symbol). If in doubt, return t." + (cond + ((symbolp a) + (cond ((eq b ?\n) + (not (memq a '(alnum alpha blank digit graph + lower multibyte nonascii print punct space + upper word xdigit)))) + (t t))) + ((consp a) + (or (and (characterp b) + (<= (car a) b (cdr a))) + (and (consp b) + (<= (car a) (cdr b)) + (<= (car b) (cdr a))) + (symbolp b))) + ;; Now a must be a character. + ((characterp b) (eq a b)) + (t (xr--any-item-may-intersect-p b a)))) + +(defun xr--char-superset-of-char-set-p (a-sets negated b-sets) + "Whether A-SETS, possibly NEGATED, is a superset of B-SETS. +A-SETS and B-SETS are arguments to `any'." + (let ((a-items (mapcan #'xr--any-arg-to-items a-sets)) + (b-items (mapcan #'xr--any-arg-to-items b-sets))) + (cl-every (lambda (b-item) + (if negated + (not (cl-some + (lambda (a-item) + (xr--any-item-may-intersect-p b-item a-item)) + a-items)) + (cl-some (lambda (a-item) + (xr--any-item-superset-p a-item b-item)) + a-items))) + b-items))) + +(defun xr--char-superset-of-rx-p (sets negated rx) + "Whether SETS, possibly NEGATED, is a superset of RX." + (pcase rx + (`(any . ,b-sets) + (xr--char-superset-of-char-set-p sets negated b-sets)) + (`(not (any . ,b-sets)) + (and negated + (xr--char-superset-of-char-set-p b-sets nil sets))) + ((or 'ascii 'alnum 'alpha 'blank 'cntrl 'digit 'graph + 'lower 'multibyte 'nonascii 'print 'punct 'space + 'unibyte 'upper 'word 'xdigit) + (xr--char-superset-of-char-set-p sets negated `(any ,rx))) + ((pred stringp) + (and (= (length rx) 1) + (xr--char-superset-of-char-set-p sets negated (list rx)))))) + +(defun xr--single-non-newline-char-p (rx) + "Whether RX only matches single characters none of which is newline." + (pcase rx + ((or 'nonl 'wordchar) t) + (`(category ,_) t) + (`(syntax ,s) (not (eq s ?>))) ; comment-end often matches newline + (_ (xr--char-superset-of-rx-p '("\n") t rx)))) + +(defun xr--syntax-superset-of-rx-p (syntax negated rx) + "Whether SYNTAX, possibly NEGATED, is a superset of RX." + ;; Syntax tables vary, but we make a (quite conservative) guess. + (let* ((always-set + ;; Characters we think always will be in the syntax set. + '((whitespace " \t") + (word "A-Za-z0-9") + (open-parenthesis "([") + (close-parenthesis "])"))) + (never-set + ;; Characters we think never will be in the syntax set. + '((whitespace "!-~") + (punctuation "A-Za-z0-9") + (open-parenthesis "\x00- A-Za-z0-9") + (close-parenthesis "\x00- A-Za-z0-9"))) + (set (assq syntax (if negated never-set always-set)))) + (and set + (xr--char-superset-of-rx-p (cdr set) nil rx)))) + +(defun xr--string-to-chars (str) + (mapcar #'char-to-string (string-to-list str))) + +(defun xr--expand-strings (rx) + "If RX is a string or a seq of strings, convert them to seqs of +single-character strings." + (cond ((consp rx) + (if (eq (car rx) 'seq) + (cons 'seq (mapcan (lambda (x) + (if (and (stringp x) + (> (length x) 1)) + (xr--string-to-chars x) + (list x))) + (cdr rx))) + rx)) + ((and (stringp rx) + (> (length rx) 1)) + (cons 'seq (xr--string-to-chars rx))) + (t rx))) + +(defun xr--superset-seq-p (a b) + "Whether A matches all that B matches, both lists of expressions." + (while (and a b (xr--superset-p (car a) (car b))) + (setq a (cdr a)) + (setq b (cdr b))) + (and (not b) + (or (not a) + (xr--matches-empty-p (cons 'seq a))))) + +(defun xr--make-seq (body) + (if (> (length body) 1) + (cons 'seq body) + (car body))) + +(defun xr--superset-p (a b) + "Whether A matches all that B matches." + (setq a (xr--expand-strings a)) + (setq b (xr--expand-strings b)) + + (pcase b + (`(or . ,b-body) + (cl-every (lambda (b-expr) (xr--superset-p a b-expr)) b-body)) + (_ + (pcase a + (`(any . ,sets) + (xr--char-superset-of-rx-p sets nil b)) + (`(not (any . ,sets)) + (xr--char-superset-of-rx-p sets t b)) + ('nonl (xr--single-non-newline-char-p b)) + + (`(seq . ,a-body) + (pcase b + (`(seq . ,b-body) + (xr--superset-seq-p a-body b-body)) + (_ + (xr--superset-seq-p a-body (list b))))) + (`(or . ,a-body) + (cl-some (lambda (a-expr) (xr--superset-p a-expr b)) a-body)) + + (`(zero-or-more . ,a-body) + (pcase b + (`(,(or 'opt 'zero-or-more 'one-or-more) . ,b-body) + (xr--superset-p (xr--make-seq a-body) (xr--make-seq b-body))) + (_ (xr--superset-p (xr--make-seq a-body) b)))) + (`(one-or-more . ,a-body) + (pcase b + (`(one-or-more . ,b-body) + (xr--superset-p (xr--make-seq a-body) (xr--make-seq b-body))) + (_ (xr--superset-p (xr--make-seq a-body) b)))) + (`(opt . ,a-body) + (pcase b + (`(opt . ,b-body) + (xr--superset-p (xr--make-seq a-body) (xr--make-seq b-body))) + (_ (xr--superset-p (xr--make-seq a-body) b)))) + (`(repeat ,lo ,_ . ,a-body) + (if (<= lo 1) + (xr--superset-p (xr--make-seq a-body) b) + (equal a b))) + + ;; We do not expand through groups on the subset (b) side to + ;; avoid false positives; "\\(a\\)\\|." should be without warning. + (`(group . ,body) + (xr--superset-p (xr--make-seq body) b)) + (`(group-n ,_ . ,body) + (xr--superset-p (xr--make-seq body) b)) + + (`(syntax ,syn) + (or (equal a b) (xr--syntax-superset-of-rx-p syn nil b))) + (`(not (syntax ,syn)) + (or (equal a b) (xr--syntax-superset-of-rx-p syn t b))) + + ((or `(category ,_) `(not (category ,cat))) + (or (equal a b) + (and (stringp b) + (string-match-p (rx-to-string a) b)))) + + (_ (equal a b)))))) + (defun xr--parse-alt (warnings) (let ((alternatives nil)) ; reversed (push (xr--parse-seq warnings) alternatives) @@ -674,8 +892,18 @@ UPPER may be nil, meaning infinity." (forward-char 2) ; skip \| (let ((pos (point)) (seq (xr--parse-seq warnings))) - (when (and warnings (member seq alternatives)) - (xr--report warnings pos "Duplicated alternative branch")) + (when warnings + (cond + ((member seq alternatives) + (xr--report warnings pos "Duplicated alternative branch")) + ((cl-some (lambda (branch) (xr--superset-p seq branch)) + alternatives) + (xr--report warnings pos + "Branch matches superset of a previous branch")) + ((cl-some (lambda (branch) (xr--superset-p branch seq)) + alternatives) + (xr--report warnings pos + "Branch matches subset of a previous branch")))) (push seq alternatives))) (if (cdr alternatives) ;; Simplify (or nonl "\n") to anything