branch: externals/xr commit 6d02ab45b6e8a99b65d792c3e91b95827f0ac3e5 Author: Mattias Engdegård <matti...@acm.org> Commit: Mattias Engdegård <matti...@acm.org>
Recognise implicit gap in ASCII-raw ranges Character ranges between ASCII and raw bytes, such as "[a-\xc0]", do not include the intervening codes 80..3fff7f. That is, "[a-\xc0]" really means "[a-\x7f\x80-\xc0]" and does not match any non-ASCII Unicode character. This is also true for skip-sets. This change does two things: xr-lint no longer complains about "[a-\xc0λ]" since the λ is in the gap not matched by the range, and the conversion to rx now represents the gap explicitly: (any "a-\x7f\x80-\xc0" "λ"). --- xr-test.el | 24 ++++++--- xr.el | 174 ++++++++++++++++++++++++++++++++++++------------------------- 2 files changed, 120 insertions(+), 78 deletions(-) diff --git a/xr-test.el b/xr-test.el index 2d5d8859a4..e78c72d4f8 100644 --- a/xr-test.el +++ b/xr-test.el @@ -441,6 +441,13 @@ (xr-lint "\\`\\{2\\}\\(a\\|\\|b\\)\\{,8\\}") '((2 . "Repetition of zero-width assertion") (17 . "Repetition of expression matching an empty string")))) + ;; The range "[\x70-\x8f]" only includes 70..7f and 3fff80..3fff8f; + ;; the gap 80..3fff7f is excluded. + (should (equal (xr-lint "[\x70-\x8f∃]") nil)) + (should (equal (xr-lint "[\x70-\x8f\x7e-å]") + '((4 . "Ranges `\x70-\\x7f' and `\x7e-å' overlap")))) + (should (equal (xr-lint "[\x70-\x8få-\x82]") + '((4 . "Ranges `å-\\x82' and `\\x80-\\x8f' overlap")))) )) (ert-deftest xr-lint-repetition-of-empty () @@ -714,11 +721,11 @@ (ert-deftest xr-skip-set () (should (equal (xr-skip-set "0-9a-fA-F+*") - '(any "0-9a-fA-F" "+*"))) + '(any "0-9A-Fa-f" "*+"))) (should (equal (xr-skip-set "^ab-ex-") '(not (any "b-e" "ax-")))) (should (equal (xr-skip-set "-^][\\") - '(any "^][-"))) + '(any "[]^-"))) (should (equal (xr-skip-set "\\^a\\-bc-\\fg") '(any "c-f" "^abg-"))) (should (equal (xr-skip-set "\\") @@ -728,15 +735,15 @@ (should (equal (xr-skip-set "^Q-\\c-\\n") '(not (any "Q-c" "n-")))) (should (equal (xr-skip-set "\\\\A-") - '(any "\\A-"))) + '(any "A\\-"))) (should (equal (xr-skip-set "[a-z]") '(any "a-z" "[]"))) (should (equal (xr-skip-set "[:ascii:]-[:digit:]") '(any "-" ascii digit))) (should (equal (xr-skip-set "A-[:blank:]") - '(any "A-[" ":blank]"))) + '(any "A-[" ":]abkln"))) (should (equal (xr-skip-set "\\[:xdigit:]-b") - '(any "]-b" "[:xdigt"))) + '(any "]-b" ":[dgitx"))) (should (equal (xr-skip-set "^a-z+" 'terse) '(not (in "a-z" "+")))) (should-error (xr-skip-set "[::]")) @@ -763,7 +770,7 @@ (should (equal (xr-skip-set-lint "A-Fa-z3D-KM-N!3-7\\!b") '((7 . "Ranges `A-F' and `D-K' overlap") (10 . "Two-element range `M-N'") - (14 . "Range `3-7' includes character `3'") + (14 . "Character `3' included in range `3-7'") (17 . "Duplicated character `!'") (17 . "Unnecessarily escaped `!'") (19 . "Character `b' included in range `a-z'")))) @@ -787,6 +794,11 @@ nil)) (should (equal (xr-skip-set-lint "A-Z-z") '((3 . "Literal `-' not first or last")))) + (should (equal (xr-skip-set-lint "\x70-\x8f∃") nil)) + (should (equal (xr-skip-set-lint "\x70-\x8f\x7e-å") + '((3 . "Ranges `\x70-\\x7f' and `\x7e-å' overlap")))) + (should (equal (xr-skip-set-lint "\x70-\x8få-\x82") + '((3 . "Ranges `å-\\x82' and `\\x80-\\x8f' overlap")))) )) (provide 'xr-test) diff --git a/xr.el b/xr.el index aa4089d552..95c0df27b2 100644 --- a/xr.el +++ b/xr.el @@ -129,6 +129,11 @@ (let ((start ch) (end (char-after (+ (point) 2)))) (cond + ((<= start #x7f #x3fff80 end) + ;; Intervals that go from ASCII (0-7f) to raw bytes + ;; (3fff80-3fffff) always exclude the intervening (Unicode) points. + (push (vector start #x7f (point)) intervals) + (push (vector #x3fff80 end (point)) intervals)) ((<= start end) (push (vector start end (point)) intervals)) ;; It's unlikely that anyone writes z-a by mistake; don't complain. @@ -1458,7 +1463,7 @@ A-SETS and B-SETS are arguments to `any'." (let ((negated (eq (following-char) ?^)) (start-pos (point)) - (ranges nil) + (intervals nil) (classes nil)) (when negated (forward-char) @@ -1531,39 +1536,16 @@ A-SETS and B-SETS are arguments to `any'." (format-message "Two-element range `%c-%c'" start end) nil)))) - (let ((tail ranges)) - (while tail - (let ((range (car tail))) - (if (and (<= (car range) (or end start)) - (<= start (cdr range))) - (let ((msg - (cond - ((and end (< start end) - (< (car range) (cdr range))) - (format-message - "Ranges `%c-%c' and `%c-%c' overlap" - (car range) (cdr range) start end)) - ((and end (< start end)) - (format-message - "Range `%c-%c' includes character `%c'" - start end (car range))) - ((< (car range) (cdr range)) - (format-message - "Character `%c' included in range `%c-%c'" - start (car range) (cdr range))) - (t - (format-message "Duplicated character `%c'" - start))))) - (xr--report warnings (point) - (xr--escape-string msg nil)) - ;; Expand previous interval to include this range. - (setcar range (min (car range) start)) - (setcdr range (max (cdr range) (or end start))) - (setq start nil) - (setq tail nil)) - (setq tail (cdr tail)))))) - (when start - (push (cons start (or end start)) ranges))))) + (cond + ((not end) + (push (vector start start (point)) intervals)) + ((<= start #x7f #x3fff80 end) + ;; Intervals that go from ASCII (0-7f) to raw bytes + ;; (3fff80-3fffff) always exclude the intervening (Unicode) points. + (push (vector start #x7f (point)) intervals) + (push (vector #x3fff80 end (point)) intervals)) + (t + (push (vector start end (point)) intervals)))))) ((looking-at (rx "\\" eos)) (xr--report warnings (point) @@ -1571,51 +1553,99 @@ A-SETS and B-SETS are arguments to `any'." (goto-char (match-end 0))) - (when (and (null ranges) (null classes)) + (when (and (null intervals) (null classes)) (xr--report warnings (point-min) (if negated "Negated empty set matches anything" "Empty set matches nothing"))) - (cond - ;; Single non-negated character, like "-": make a string. - ((and (not negated) - (null classes) - (= (length ranges) 1) - (eq (caar ranges) (cdar ranges))) - (regexp-quote (char-to-string (caar ranges)))) - ;; Negated empty set, like "^": anything. - ((and negated - (null classes) - (null ranges)) - 'anything) - ;; Single named class, like "[:nonascii:]": use the symbol. - ((and (= (length classes) 1) - (null ranges)) - (if negated - (list 'not (car classes)) - (car classes))) - ;; Anything else: produce (any ...) - (t - (let ((intervals nil) + (let* ((sorted (sort (nreverse intervals) + (lambda (a b) (< (aref a 0) (aref b 0))))) + (s sorted)) + (while (cdr s) + (let ((this (car s)) + (next (cadr s))) + (if (>= (aref this 1) (aref next 0)) + ;; Overlap. + (let ((message + (cond + ;; Duplicate character: drop it and warn. + ((and (eq (aref this 0) (aref this 1)) + (eq (aref next 0) (aref next 1))) + (format-message + "Duplicated character `%c'" + (aref this 0))) + ;; Duplicate range: drop it and warn. + ((and (eq (aref this 0) (aref next 0)) + (eq (aref this 1) (aref next 1))) + (format-message + "Duplicated range `%c-%c'" + (aref this 0) (aref this 1))) + ;; Character in range: drop it and warn. + ((eq (aref this 0) (aref this 1)) + (setcar s next) + (format-message + "Character `%c' included in range `%c-%c'" + (aref this 0) (aref next 0) (aref next 1))) + ;; Same but other way around. + ((eq (aref next 0) (aref next 1)) + (format-message + "Character `%c' included in range `%c-%c'" + (aref next 0) (aref this 0) (aref this 1))) + ;; Overlapping ranges: merge and warn. + (t + (let ((this-end (aref this 1))) + (aset this 1 (max (aref this 1) (aref next 1))) + (format-message "Ranges `%c-%c' and `%c-%c' overlap" + (aref this 0) this-end + (aref next 0) (aref next 1))))))) + (xr--report warnings (max (aref this 2) (aref next 2)) + (xr--escape-string message nil)) + (setcdr s (cddr s))) + ;; No overlap. + (setq s (cdr s))))) + + (let ((ranges nil) (chars nil)) - (dolist (range ranges) - (if (eq (car range) (cdr range)) - (push (car range) chars) - (push (string (car range) ?- (cdr range)) intervals))) - ;; Put a single `-' last. - (when (memq ?- chars) - (setq chars (append (delq ?- chars) (list ?-)))) - (let ((set (cons 'any - (append - (and intervals - (list (apply #'concat intervals))) - (and chars - (list (apply #'string chars))) - (nreverse classes))))) + (dolist (interv sorted) + (if (eq (aref interv 0) (aref interv 1)) + (push (aref interv 0) chars) + (push (string (aref interv 0) ?- (aref interv 1)) + ranges))) + + (cond + ;; Single non-negated character, like "-": make a string. + ((and (not negated) + (null classes) + (null ranges) + (= (length chars) 1)) + (regexp-quote (char-to-string (car chars)))) + ;; Negated empty set, like "^": anything. + ((and negated + (null classes) + (null intervals)) + 'anything) + ;; Single named class, like "[:nonascii:]": use the symbol. + ((and (= (length classes) 1) + (null intervals)) (if negated - (list 'not set) - set))))))) + (list 'not (car classes)) + (car classes))) + ;; Anything else: produce (any ...) + (t + ;; Put a single `-' last. + (when (memq ?- chars) + (setq chars (cons ?- (delq ?- chars)))) + (let ((set (cons 'any + (append + (and ranges + (list (apply #'concat (nreverse ranges)))) + (and chars + (list (apply #'string (nreverse chars)))) + (nreverse classes))))) + (if negated + (list 'not set) + set)))))))) (defun xr--parse-skip-set (skip-string warnings) (with-temp-buffer