branch: master commit 25be5093eb7cf078af0b099f2d438f6b9262c572 Author: Stefan Monnier <monn...@iro.umontreal.ca> Commit: Stefan Monnier <monn...@iro.umontreal.ca>
* externals-list: Convert `tNFA` from subtree to external --- externals-list | 1 + packages/tNFA/tNFA.el | 1111 ------------------------------------------------- 2 files changed, 1 insertion(+), 1111 deletions(-) diff --git a/externals-list b/externals-list index 93fe9a3..b9a64ca 100644 --- a/externals-list +++ b/externals-list @@ -286,6 +286,7 @@ ("test-simple" :external "https://github.com/rocky/emacs-test-simple") ("timerfunctions" :external nil) ("tiny" :external "https://github.com/abo-abo/tiny") + ("tNFA" :external "http://www.dr-qubit.org/git/predictive.git") ("tramp" :external "https://git.savannah.gnu.org/cgit/tramp.git/?h=elpa") ("tramp-theme" :external nil) ("transcribe" :external nil) diff --git a/packages/tNFA/tNFA.el b/packages/tNFA/tNFA.el deleted file mode 100644 index 5a13e41..0000000 --- a/packages/tNFA/tNFA.el +++ /dev/null @@ -1,1111 +0,0 @@ -;;; tNFA.el --- Tagged non-deterministic finite-state automata - -;; Copyright (C) 2008-2010, 2012, 2014 Free Software Foundation, Inc - -;; Author: Toby Cubitt <toby-predict...@dr-qubit.org> -;; Version: 0.1.1 -;; Keywords: extensions, matching, data structures -;; tNFA, NFA, DFA, finite state automata, automata, regexp -;; Package-Requires: ((queue "0.1")) -;; URL: http://www.dr-qubit.org/emacs.php -;; Repository: http://www.dr-qubit.org/git/predictive.git - -;; This file is part of Emacs. -;; -;; GNU Emacs is free software: you can redistribute it and/or modify it under -;; the terms of the GNU General Public License as published by the Free -;; Software Foundation, either version 3 of the License, or (at your option) -;; any later version. -;; -;; GNU Emacs is distributed in the hope that it will be useful, but WITHOUT -;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -;; FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -;; more details. -;; -;; You should have received a copy of the GNU General Public License along -;; with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. - - -;;; Commentary: -;; -;; A tagged, non-deterministic finite state automata (NFA) is an abstract -;; computing machine that recognises regular languages. In layman's terms, -;; they are used to decide whether a string matches a regular expression. The -;; "tagged" part allows the NFA to do group-capture: it returns information -;; about which parts of a string matched which subgroup of the regular -;; expression. -;; -;; Why re-implement regular expression matching when Emacs comes with -;; extensive built-in support for regexps? Primarily, because some algorithms -;; require access to the NFA states produced part way through the regular -;; expression matching process (see the trie.el package for an -;; example). Secondarily, because Emacs regexps only work on strings, whereas -;; regular expressions can usefully be used in Elisp code to match other -;; sequence types, not just strings. -;; -;; A tagged NFA can be created from a regular expression using -;; `tNFA-from-regexp', and its state can be updated using -;; `tNFA-next-state'. You can discover whether a state is a matching state -;; using `tNFA-match-p', extract subgroup capture data from it using -;; `tNFA-group-data', check whether a state has any wildcard transitions using -;; `tNFA-wildcard-p', and get a list of non-wildcard transitions using -;; `tNFA-transitions'. Finally, `tNFA-regexp-match' uses tagged NFAs to decide -;; whether a regexp matches a given string. -;; -;; Note that Emacs' regexps are not regular expressions in the original -;; meaning of that phrase. Emacs regexps implement additional features (in -;; particular, back-references) that allow them to match far more than just -;; regular languages. This comes at a cost: regexp matching can potentially be -;; very slow (NP-hard in fact, though the hard cases rarely crop up in -;; practise), whereas there are efficient (polynomial-time) algorithms for -;; matching regular expressions (in the original sense). Therefore, this -;; package only supports a subset of the full Emacs regular expression -;; syntax. See the function docstrings for more information. -;; -;; This package essentially implements Laurikari's algorithm, as described in -;; his master's thesis, but it builds the corresponding tagged deterministic -;; finite state automaton (DFA) on-the-fly as needed. -;; -;; This package uses the queue package queue.el. - - -;;; Change Log: -;; -;; Version 0.1.1 -;; * work-around mysterious byte-compiler bug by defining -;; `tNFA--NFA-state-create' and `tNFA--NFA-state-create-tag' via `defun' -;; instead of directly in `defstruct' -;; -;; Version 0.1 -;; * initial version - - - -;;; Code: - -(eval-when-compile (require 'cl)) -(require 'queue) - - - -;;; ================================================================ -;;; Replcements for CL functions - -(defun* tNFA--assoc (item alist &key (test 'eq)) - ;; Return first cons cell in ALIST whose CAR matches ITEM according to - ;; :test function (defaulting to `eq') - (while (and alist - (or (not (consp (car alist))) - (not (funcall test item (caar alist))))) - (setq alist (cdr alist))) - (car alist)) - - - -;;; ================================================================ -;;; Data structures - -;;; ---------------------------------------------------------------- -;;; tagged NFA states - -(defstruct - (tNFA--state - (:constructor nil) - (:constructor tNFA--state-create-initial - (NFA-state num-tags min-tags max-tags - &aux - (tags (tNFA--tags-create num-tags min-tags max-tags)))) - (:constructor tNFA--state-create (NFA-state tags)) - (:copier nil)) - NFA-state tags) - -(defmacro tNFA--state-id (state) - `(tNFA--NFA-state-id (tNFA--state-NFA-state ,state))) - -(defmacro tNFA--state-type (state) - `(tNFA--NFA-state-type (tNFA--state-NFA-state ,state))) - -(defmacro tNFA--state-label (state) - `(tNFA--NFA-state-label (tNFA--state-NFA-state ,state))) - -(defmacro tNFA--state-in-degree (state) - `(tNFA--NFA-state-in-degree (tNFA--state-NFA-state ,state))) - -(defmacro tNFA--state-next (state) - `(tNFA--NFA-state-next (tNFA--state-NFA-state ,state))) - -(defmacro tNFA--state-count (state) - `(tNFA--NFA-state-count (tNFA--state-NFA-state ,state))) - - - -;;; ---------------------------------------------------------------- -;;; NFA states - -(declare (special NFA--state-id)) - -(defstruct - (tNFA--NFA-state - (:type vector) - (:constructor nil) - (:constructor tNFA---NFA-state-create - (&optional type label next - &aux - (in-degree 0) - (count 0) - (id (incf NFA--state-id)) - ;; (dummy - ;; (when next - ;; (setf (tNFA--NFA-state-count next) - ;; (incf (tNFA--NFA-state-in-degree next))))) - )) - (:constructor tNFA--NFA-state-create-branch - (&rest next - &aux - (type 'branch) - (in-degree 0) - (count 0) - (id (incf NFA--state-id)))) - (:constructor tNFA---NFA-state-create-tag - (tag &optional next - &aux - (type 'tag) - (label tag) - (in-degree 0) - (count 0) - (id (incf NFA--state-id)) - ;; (dummy - ;; (when next - ;; (setf (tNFA--NFA-state-count next) - ;; (incf (tNFA--NFA-state-in-degree next))))) - )) - (:copier nil)) - id type label in-degree - count tNFA-state ; used internally in NFA evolution algorithms - next) - - -;; Define these via defun instead of using the dummy argument in the -;; above defstruct to work around a mysterious byte-compiler bug. - -(defun tNFA--NFA-state-create (&optional type label next) - (when next - (setf (tNFA--NFA-state-count next) - (incf (tNFA--NFA-state-in-degree next)))) - (tNFA---NFA-state-create type label next)) - -(defun tNFA--NFA-state-create-tag (tag &optional next) - (when next - (setf (tNFA--NFA-state-count next) - (incf (tNFA--NFA-state-in-degree next)))) - (tNFA---NFA-state-create-tag tag next)) - - -;; tag number for a tagged epsilon transition is stored in label slot -(defalias 'tNFA--NFA-state-tag 'tNFA--NFA-state-label) - -(defmacro tNFA--NFA-state-tags (state) - `(tNFA--state-tags (tNFA--NFA-state-tNFA-state ,state))) - - -(defun tNFA--NFA-state-patch (attach state) - ;; patch STATE onto ATTACH. Return value is meaningless - (setf - (tNFA--NFA-state-type attach) - (tNFA--NFA-state-type state) - (tNFA--NFA-state-label attach) - (tNFA--NFA-state-label state) - (tNFA--NFA-state-next attach) - (tNFA--NFA-state-next state) - (tNFA--NFA-state-count state) - (incf (tNFA--NFA-state-in-degree state)))) - - -(defun tNFA--NFA-state-make-epsilon (state next) - ;; create an epsilon transition from STATE to NEXT - (setf - (tNFA--NFA-state-type state) 'epsilon - (tNFA--NFA-state-label state) nil - (tNFA--NFA-state-next state) next - (tNFA--NFA-state-count next) - (incf (tNFA--NFA-state-in-degree next)))) - - -(defun tNFA--NFA-state-make-branch (state next) - ;; create a branch from STATE to all states in NEXT list - (setf (tNFA--NFA-state-type state) 'branch - (tNFA--NFA-state-label state) nil - (tNFA--NFA-state-next state) next) - (dolist (n next) - (setf (tNFA--NFA-state-count n) - (incf (tNFA--NFA-state-in-degree n))))) - - -(defun tNFA--NFA-state-copy (state) - ;; Return a copy of STATE. The next link is *not* copied, it is `eq' - ;; to the original next link. Use `tNFA--fragment-copy' if you want to - ;; recursively copy a chain of states. Note: NFA--state-id must be - ;; bound to something appropriate when this function is called. - (let ((copy (copy-sequence state))) - (setf (tNFA--NFA-state-id copy) (incf NFA--state-id)) - copy)) - - - -;;; ---------------------------------------------------------------- -;;; NFA fragments - -(defstruct - (tNFA--fragment - (:type vector) - (:constructor nil) - (:constructor tNFA--fragment-create (initial final)) - (:copier nil)) - initial final) - - -(defun tNFA--fragment-patch (frag1 frag2) - ;; patch FRAG2 onto end of FRAG1; return value is meaningless - (tNFA--NFA-state-patch (tNFA--fragment-final frag1) - (tNFA--fragment-initial frag2)) - (setf (tNFA--fragment-final frag1) (tNFA--fragment-final frag2))) - - -(defun tNFA--fragment-copy (fragment) - ;; return a copy of FRAGMENT. - (declare (special copied-states)) - (let (copied-states) - (tNFA--fragment-create - (tNFA--do-fragment-copy (tNFA--fragment-initial fragment)) - (cdr (assq (tNFA--fragment-final fragment) copied-states))))) - - -(defun tNFA--do-fragment-copy (state) - ;; return a copy of STATE, recursively following and copying links - ;; (note: NFA--state-id must be bound to something appropriate when - ;; this is called) - (declare (special copied-states)) - (let ((copy (tNFA--NFA-state-copy state))) - (push (cons state copy) copied-states) - - ;; if STATE is a branch, copy all links - (cond - ((eq (tNFA--NFA-state-type copy) 'branch) - (setf (tNFA--NFA-state-next copy) - (mapcar (lambda (next) - (or (cdr (assq next copied-states)) - (tNFA--do-fragment-copy next))) - (tNFA--NFA-state-next copy)))) - - ;; if state doesn't have a next link, return - ((or (eq (tNFA--NFA-state-type copy) 'match) - (null (tNFA--NFA-state-type copy)))) - - ;; otherwise, copy next link - ((tNFA--NFA-state-type copy) - ;; for a non-branch STATE, copy next link - (setf (tNFA--NFA-state-next copy) - ;; if we've already copied next state, set next link to that - (or (cdr (assq (tNFA--NFA-state-next copy) copied-states)) - ;; otherwise, recursively copy next state - (tNFA--do-fragment-copy (tNFA--NFA-state-next copy)))))) - copy)) - - - -;;; ---------------------------------------------------------------- -;;; DFA states - -(defstruct - (tNFA--DFA-state - :named - (:constructor nil) - (:constructor tNFA--DFA-state--create - (list pool - &key - (test 'eq) - &aux - (transitions ()))) - (:copier nil)) - list transitions test wildcard match pool) - - -(defun* tNFA--DFA-state-create (state-list state-pool &key (test 'eq)) - ;; create DFA state and add it to the state pool - (let ((DFA-state (tNFA--DFA-state--create - state-list state-pool :test test))) - (puthash state-list DFA-state (tNFA--DFA-state-pool DFA-state)) - - (dolist (state state-list) - ;; if state in state list is... - (cond - ;; literal state: add literal transition - ((eq (tNFA--state-type state) 'literal) - (pushnew (cons (tNFA--state-label state) t) - (tNFA--DFA-state-transitions DFA-state) - :test #'equal)) - - ;; character alternative: add transitions for all alternatives - ((eq (tNFA--state-type state) 'char-alt) - (dolist (c (tNFA--state-label state)) - (pushnew (cons c t) (tNFA--DFA-state-transitions DFA-state) - :test #'equal))) - - ;; wildcard or negated character alternative: add wildcard - ;; transistion - ((or (eq (tNFA--state-type state) 'wildcard) - (eq (tNFA--state-type state) 'neg-char-alt)) - (setf (tNFA--DFA-state-wildcard DFA-state) t)) - - ;; match state: set match tags - ((eq (tNFA--state-type state) 'match) - (setf (tNFA--DFA-state-match DFA-state) - (tNFA--state-tags state))))) - - ;; return constructed state - DFA-state)) - - -(defun* tNFA--DFA-state-create-initial (state-list &key (test 'eq)) - ;; create initial DFA state from initial tNFA state INITIAL-STATE - (tNFA--DFA-state-create state-list - (make-hash-table :test 'equal) - :test test)) - - -(defalias 'tNFA-match-p 'tNFA--DFA-state-match - "Return non-nil if STATE is a matching state, otherwise return nil.") - - -(defalias 'tNFA-wildcard-p 'tNFA--DFA-state-wildcard - "Return non-nil if STATE has a wildcard transition, - otherwise return nil.") - - -(defun tNFA-transitions (state) - "Return list of literal transitions from tNFA state STATE." - (mapcar 'car (tNFA--DFA-state-transitions state))) - - - -;;; ---------------------------------------------------------------- -;;; tag tables - -(defun tNFA--tags-create (num-tags min-tags max-tags) - ;; construct a new tags table - (let ((vec (make-vector num-tags nil))) - (dolist (tag min-tags) - (aset vec tag (cons -1 'min))) - (dolist (tag max-tags) - (aset vec tag (cons -1 'max))) - vec)) - - -(defun tNFA--tags-copy (tags) - ;; return a copy of TAGS table - (let* ((len (length tags)) - (vec (make-vector len nil))) - (dotimes (i len) - (aset vec i (cons (car (aref tags i)) - (cdr (aref tags i))))) - vec)) - - -(defmacro tNFA--tags-set (tags tag val) - ;; set value of TAG in TAGS table to VAL - `(setcar (aref ,tags ,tag) ,val)) - - -(defmacro tNFA--tags-get (tags tag) - ;; get value of TAG in TAGS table - `(car (aref ,tags ,tag))) - - -(defmacro tNFA--tags-type (tags tag) - ;; return tag type ('min or 'max) - `(cdr (aref ,tags ,tag))) - - -(defun tNFA--tags< (val tag tags) - ;; return non-nil if VAL takes precedence over the value of TAG in - ;; TAGS table, nil otherwise - (setq tag (aref tags tag)) - (or (and (eq (cdr tag) 'min) - (< val (car tag))) - ;;(and (eq (cdr tag) 'max) - (> val (car tag));) - )) - - -(defun tNFA--tags-to-groups (tags) - ;; Convert TAGS table to a list of indices of group matches. The n'th - ;; element of the list is a cons cell, whose car is the starting index - ;; of the nth group and whose cdr is its end index. If a group didn't - ;; match, the corresponding list element will be null." - (let ((groups (make-list (/ (length tags) 2) nil)) - group-stack - (grp 0)) - (dotimes (i (length tags)) - (if (eq (tNFA--tags-type tags i) 'max) - (unless (= (tNFA--tags-get tags i) -1) - (setf (nth (caar group-stack) groups) - (cons (cdr (pop group-stack)) - (tNFA--tags-get tags i)))) - (unless (= (tNFA--tags-get tags i) -1) - (push (cons grp (tNFA--tags-get tags i)) group-stack)) - (incf grp))) - groups)) - - - - -;;; ================================================================ -;;; Regexp -> tNFA - -;;;###autoload -(defun* tNFA-from-regexp (regexp &key (test 'eq)) - "Create a tagged NFA that recognizes the regular expression REGEXP. -The return value is the initial state of the tagged NFA. - -REGEXP can be any sequence type (vector, list, or string); it -need not be an actual string. Special characters in REGEXP are -still just that: elements of the sequence that are characters -which have a special meaning in regexps. - -The :test keyword argument specifies how to test whether two -individual elements of STRING are identical. The default is `eq'. - -Only a subset of the full Emacs regular expression syntax is -supported. There is no support for regexp constructs that are -only meaningful for strings (character ranges and character -classes inside character alternatives, and syntax-related -backslash constructs). Back-references and non-greedy postfix -operators are not supported, so `?' after a postfix operator -loses its special meaning. Also, matches are always anchored, so -`$' and `^' lose their special meanings (use `.*' at the -beginning and end of the regexp to get an unanchored match)." - - ;; convert regexp to list, build NFA, and return initial state - (declare (special NFA--state-id)) - (destructuring-bind (fragment num-tags min-tags max-tags regexp) - (let ((NFA--state-id -1)) - (tNFA--from-regexp (append regexp nil) 0 '() '() 'top-level)) - (if regexp - (error "Syntax error in regexp: missing \"(\"") - (setf (tNFA--NFA-state-type (tNFA--fragment-final fragment)) - 'match) - (tNFA--DFA-state-create-initial - (tNFA--epsilon-boundary - (list - (tNFA--state-create-initial - (tNFA--fragment-initial fragment) num-tags min-tags max-tags)) - 0) - :test test) - ))) - - -(defmacro tNFA--regexp-postfix-p (regexp) - ;; return t if next token in REGEXP is a postfix operator, nil - ;; otherwise - `(or (eq (car ,regexp) ?*) - (eq (car ,regexp) ?+) - (eq (car ,regexp) ??) - (and (eq (car ,regexp) ?\\) - (cdr ,regexp) - (eq (cadr ,regexp) ?{)))) - - -(defun tNFA--from-regexp (regexp num-tags min-tags max-tags - &optional top-level shy-group) - ;; Construct a tagged NFA fragment from REGEXP, up to first end-group - ;; character or end of REGEXP. The TAGS arguments are used to pass the - ;; tags created so far. A non-nil TOP-LEVEL indicates that REGEXP is - ;; the complete regexp, so we're constructing the entire tNFA. A - ;; non-nil SHY-GROUP indicates that we're constructing a shy subgroup - ;; fragment. (Both optional arguments are only used for spotting - ;; syntax errors in REGEXP.) - ;; - ;; Returns a list: (FRAGMENT NUM-TAGS MIN-TAGS MAX-TAGS - ;; REGEXP). FRAGMENT is the constructed tNFA fragment, REGEXP is the - ;; remaining, unused portion of the regexp, and the TAGS return values - ;; give the tags created so far. - - (let* ((new (tNFA--NFA-state-create)) - (fragment-stack (list (tNFA--fragment-create new new))) - fragment copy attach token type group-end-tag) - - (catch 'constructed - (while t - (setq regexp (tNFA--regexp-next-token regexp) - type (nth 0 regexp) - token (nth 1 regexp) - regexp (nth 2 regexp)) - (setq fragment nil - group-end-tag nil) - - ;; ----- construct new fragment ----- - (cond - ;; syntax error: missing ) - ((and (null type) (not top-level)) - (error "Syntax error in regexp:\ - extra \"(\" or missing \")\"")) - - ;; syntax error: extra ) - ((and (eq type 'shy-group-end) top-level) - (error "Syntax error in regexp:\ - extra \")\" or missing \"(\"")) - - ;; syntax error: ) ending a shy group - ((and (eq type 'shy-group-end) (not shy-group)) - (error "Syntax error in regexp: \"(\" matched with \")?\"")) - - ;; syntax error: )? ending a group - ((and (eq type 'group-end) shy-group) - (error "Syntax error in regexp: \"(?\" matched with \")\"")) - - ;; syntax error: postfix operator not after atom - ((eq type 'postfix) - (error "Syntax error in regexp: unexpected \"%s\"" - (char-to-string token))) - - - ;; regexp atom: construct new literal fragment - ((or (eq type 'literal) (eq type 'wildcard) - (eq type 'char-alt) (eq type 'neg-char-alt)) - (setq new (tNFA--NFA-state-create - type token (tNFA--NFA-state-create)) - fragment (tNFA--fragment-create - new (tNFA--NFA-state-next new)))) - - ;; shy subgroup start: recursively construct subgroup fragment - ((eq type 'shy-group-start) - (setq new (tNFA--from-regexp - regexp num-tags min-tags max-tags nil t) - num-tags (nth 1 new) - min-tags (nth 2 new) - max-tags (nth 3 new) - regexp (nth 4 new) - fragment (nth 0 new))) - - ;; subgroup start: add minimize tag to current fragment, and - ;; recursively construct subgroup fragment - ((eq type 'group-start) - (setq new (tNFA--NFA-state-create)) - (setq fragment - (tNFA--fragment-create - (tNFA--NFA-state-create-tag - (car (push (1- (incf num-tags)) min-tags)) - new) - new)) - (tNFA--fragment-patch (car fragment-stack) fragment) - ;; reserve next tag number for subgroup end tag - (setq group-end-tag num-tags) - (incf num-tags) - ;; recursively construct subgroup fragment - (setq new (tNFA--from-regexp - regexp num-tags min-tags max-tags) - num-tags (nth 1 new) - min-tags (nth 2 new) - max-tags (nth 3 new) - regexp (nth 4 new) - fragment (nth 0 new))) - - - ;; end of regexp or subgroup: ... - ((or (null type) (eq type 'shy-group-end) (eq type 'group-end)) - - ;; if fragment-stack contains only one fragment, throw - ;; fragment up to recursion level above - (cond - ((null (nth 1 fragment-stack)) - (throw 'constructed - (list (car fragment-stack) - num-tags min-tags max-tags regexp))) - - ;; if fragment-stack contains multiple alternation fragments, - ;; attach them all together - ;; - ;; .--fragment--. - ;; / \ - ;; /----fragment----\ - ;; / \ - ;; ---o------fragment------o---> - ;; \ . / - ;; \ . / - ;; . - (t - ;; create a new fragment containing start and end of - ;; alternation - (setq fragment - (tNFA--fragment-create - (tNFA--NFA-state-create-branch) - (tNFA--NFA-state-create))) - ;; patch alternation fragments into new fragment - (dolist (frag fragment-stack) - (push (tNFA--fragment-initial frag) - (tNFA--NFA-state-next - (tNFA--fragment-initial fragment))) - (setf (tNFA--NFA-state-count - (tNFA--fragment-initial frag)) - (incf (tNFA--NFA-state-in-degree - (tNFA--fragment-initial frag)))) - (tNFA--NFA-state-make-epsilon (tNFA--fragment-final frag) - (tNFA--fragment-final fragment))) - ;; throw constructed fragment up to recursion level above - (throw 'constructed - (list fragment num-tags min-tags max-tags regexp))) - )) - - ;; | alternation: start new fragment - ((eq type 'alternation) - (setq new (tNFA--NFA-state-create)) - (push (tNFA--fragment-create new new) fragment-stack))) - - - ;; ----- attach new fragment ----- - (when fragment - ;; if next token is not a postfix operator, attach new - ;; fragment onto end of current NFA fragment - (if (not (tNFA--regexp-postfix-p regexp)) - (tNFA--fragment-patch (car fragment-stack) fragment) - - ;; if next token is a postfix operator, splice new fragment - ;; into NFA as appropriate - (when (eq type 'alternation) - (error "Syntax error in regexp: unexpected \"%s\"" - (char-to-string token))) - (setq regexp (tNFA--regexp-next-token regexp) - type (nth 0 regexp) - token (nth 1 regexp) - regexp (nth 2 regexp)) - - (while fragment - (setq attach (tNFA--fragment-final (car fragment-stack))) - (setq new (tNFA--NFA-state-create)) - (cond - - ;; * postfix = \{0,\}: - ;; - ;; .--fragment--. - ;; / \ - ;; \ ______/ - ;; \ / - ;; ---attach-----new--- - ;; - ((and (eq (car token) 0) (null (cdr token))) - (tNFA--NFA-state-make-branch - attach (list (tNFA--fragment-initial fragment) new)) - (tNFA--NFA-state-make-epsilon - (tNFA--fragment-final fragment) attach) - (setf (tNFA--fragment-final (car fragment-stack)) new) - (setq fragment nil)) - - ;; + postfix = \{1,\}: - ;; - ;; .----. - ;; / \ - ;; / \ - ;; \ / - ;; ---fragment-----new--- - ;; - ((and (eq (car token) 1) (null (cdr token))) - (tNFA--NFA-state-patch - attach (tNFA--fragment-initial fragment)) - (tNFA--NFA-state-make-branch - (tNFA--fragment-final fragment) (list attach new)) - (setf (tNFA--fragment-final (car fragment-stack)) new) - (setq fragment nil)) - - ;; \{0,n\} (note: ? postfix = \{0,1\}): - ;; - ;; .--fragment--. - ;; / \ - ;; ---attach new--- - ;; \______________/ - ;; - ((eq (car token) 0) - ;; ? postfix = \{0,1\}: after this we're done - (if (eq (cdr token) 1) - (setq copy nil) - (setq copy (tNFA--fragment-copy fragment))) - ;; attach fragment - (tNFA--NFA-state-make-branch - attach (list (tNFA--fragment-initial fragment) new)) - (tNFA--NFA-state-make-epsilon - (tNFA--fragment-final fragment) new) - (setf (tNFA--fragment-final (car fragment-stack)) new) - ;; prepare for next iteration - (decf (cdr token)) - (setq fragment copy)) - - ;; \{n,\} or \{n,m\}: - ;; - ;; ---attach----fragment----new--- - ;; - (t - (setq copy (tNFA--fragment-copy fragment)) - (tNFA--fragment-patch (car fragment-stack) fragment) - ;; prepare for next iteration - (decf (car token)) - (when (cdr token) (decf (cdr token))) - (if (eq (cdr token) 0) - (setq fragment nil) - (setq fragment copy))) - ))) - - - ;; if ending a group, add a maximize tag to end - (when group-end-tag - (setq new (tNFA--NFA-state-create) - fragment (tNFA--fragment-create - (tNFA--NFA-state-create-tag - group-end-tag new) - new)) - (push group-end-tag max-tags) - (tNFA--fragment-patch (car fragment-stack) fragment))) - )) ; end of infinite loop and catch - )) - - - -;; Note: hard-coding the parsing like this is ugly, though sufficient -;; for our purposes. Perhaps it would be more elegant to implement -;; this in terms of a proper parser... - -(defun tNFA--regexp-next-token (regexp) - ;; if regexp is empty, return null values for next token type, token - ;; and remaining regexp - (if (null regexp) - (list nil nil nil) - - (let ((token (pop regexp)) - (type 'literal)) ; assume token is literal initially - (cond - - ;; [: gobble up to closing ] - ((eq token ?\[) - ;; character alternatives are stored in lists - (setq token '()) - (cond - ;; gobble ] appearing straight after [ - ((eq (car regexp) ?\]) (push (pop regexp) token)) - ;; gobble ] appearing straight after [^ - ((and (eq (car regexp) ?^) (eq (nth 1 regexp) ?\])) - (push (pop regexp) token) - (push (pop regexp) token))) - ;; gobble everything up to closing ] - (while (not (eq (car regexp) ?\])) - (push (pop regexp) token) - (unless regexp - (error "Syntax error in regexp: missing \"]\""))) - (pop regexp) ; dump closing ] - (if (not (eq (car (last token)) ?^)) - (setq type 'char-alt) - (setq type 'neg-char-alt) - (setq token (butlast token)))) - - ;; ]: syntax error (always gobbled when parsing [) - ((eq token ?\]) - (error "Syntax error in regexp: missing \"[\"")) - - ;; . * + ?: set appropriate type - ((eq token ?*) (setq type 'postfix token (cons 0 nil))) - ((eq token ?+) (setq type 'postfix token (cons 1 nil))) - ((eq token ??) (setq type 'postfix token (cons 0 1))) - ((eq token ?.) (setq type 'wildcard)) - - ;; \: look at next character - ((eq token ?\\) - (unless (setq token (pop regexp)) - (error "Syntax error in regexp:\ - missing character after \"\\\"")) - (cond - ;; |: alternation - ((eq token ?|) (setq type 'alternation)) - ;; \(?: shy group start - ((and (eq token ?\() (eq (car regexp) ??)) - (setq type 'shy-group-start) - (pop regexp)) - ;; \)?: shy group end - ((and (eq token ?\)) (eq (car regexp) ??)) - (setq type 'shy-group-end) - (pop regexp)) - ;; \(: group start - ((eq token ?\() (setq type 'group-start)) - ;; \): group end - ((eq token ?\)) (setq type 'group-end)) - - ;; \{: postfix repetition operator - ((eq token ?{) - (setq type 'postfix token (cons nil nil)) - ;; extract first number from repetition operator - (while (if (null regexp) - (error "Syntax error in regexp:\ - malformed \\{...\\}") - (not (or (eq (car regexp) ?,) - (eq (car regexp) ?\\)))) - (setcar token - (concat (car token) (char-to-string (pop regexp))))) - (if (null (car token)) - (setcar token 0) - (unless (string-match "[0-9]+" (car token)) - (error "Syntax error in regexp: malformed \\{...\\}")) - (setcar token (string-to-number (car token)))) - (cond - ;; if next character is "\", we expect "}" to follow - ((eq (car regexp) ?\\) - (pop regexp) - (unless (eq (car regexp) ?}) - (error "Syntax error in regexp: expected \"}\"")) - (pop regexp) - (unless (car token) - (error "Syntax error in regexp: malformed \\{...\\}")) - (setcdr token (car token))) - ;; if next character is ",", we expect a second number to - ;; follow - ((eq (car regexp) ?,) - (pop regexp) - (while (if (null regexp) - (error "Syntax error in regexp:\ - malformed \\{...\\}") - (not (eq (car regexp) ?\\))) - (setcdr token - (concat (cdr token) - (char-to-string (pop regexp))))) - (unless (null (cdr token)) - (unless (string-match "[0-9]+" (cdr token)) - (error "Syntax error in regexp: malformed \\{...\\}")) - (setcdr token (string-to-number (cdr token)))) - (pop regexp) - (unless (eq (car regexp) ?}) - (error "Syntax error in regexp: expected \"}\"")) - (pop regexp)))) - )) - ) - - ;; return first token type, token, and remaining regexp - (list type token regexp)))) - - - -;;; ================================================================ -;;; tNFA evolution - -(defun tNFA-next-state (tNFA chr pos) - "Evolve tNFA according to CHR, which corresponds to position -POS in a string." - (let (elem state) - ;; if there is a transition for character CHR... - (cond - ((setq elem (tNFA--assoc chr (tNFA--DFA-state-transitions tNFA) - :test (tNFA--DFA-state-test tNFA))) - ;; if next state has not already been computed, do so - (unless (tNFA--DFA-state-p (setq state (cdr elem))) - (setq state (tNFA--DFA-next-state tNFA chr pos nil)) - (setcdr elem state))) - - ;; if there's a wildcard transition... - ((setq state (tNFA--DFA-state-wildcard tNFA)) - ;; if next state has not already been computed, do so - (unless (tNFA--DFA-state-p state) - (setq state (tNFA--DFA-next-state tNFA chr pos t)) - (setf (tNFA--DFA-state-wildcard tNFA) state)))) - state)) - - - -(defun tNFA--DFA-next-state (DFA-state chr pos wildcard) - (let (state-list state) - ;; add all states reached by a CHR transition from DFA-STATE to - ;; state list - (if wildcard - (dolist (state (tNFA--DFA-state-list DFA-state)) - (when (or (eq (tNFA--state-type state) 'wildcard) - (and (eq (tNFA--state-type state) 'neg-char-alt) - (not (memq chr (tNFA--state-label state))))) - (push (tNFA--state-create - (tNFA--state-next state) - (tNFA--tags-copy (tNFA--state-tags state))) - state-list))) - (dolist (state (tNFA--DFA-state-list DFA-state)) - (when (or (and (eq (tNFA--state-type state) 'literal) - (eq chr (tNFA--state-label state))) - (and (eq (tNFA--state-type state) 'char-alt) - (memq chr (tNFA--state-label state))) - (and (eq (tNFA--state-type state) 'neg-char-alt) - (not (memq chr (tNFA--state-label state)))) - (eq (tNFA--state-type state) 'wildcard)) - (push (tNFA--state-create - (tNFA--state-next state) - (tNFA--tags-copy (tNFA--state-tags state))) - state-list)))) - - ;; if state list is empty, return empty, failure DFA state - (when state-list - ;; otherwise, construct new DFA state and add it to the pool if - ;; it's not already there - (setq state-list (tNFA--epsilon-boundary state-list (1+ pos))) - (setq state - (or (gethash state-list (tNFA--DFA-state-pool DFA-state)) - (tNFA--DFA-state-create - state-list - (tNFA--DFA-state-pool DFA-state) - :test (tNFA--DFA-state-test DFA-state)))) - ;; return next state - state))) - - - -(defun tNFA--epsilon-boundary (state-set pos) - ;; Return the tagged epsilon-boundary of the NFA states listed in - ;; STATE-SET, that is the set of all states that can be reached via - ;; epsilon transitions from some state in STATE-SET (not including - ;; states in STATE-SET itself). - (let ((queue (queue-create)) - (result '()) - (reset '()) - state next tags) - ;; temporarily link the NFA states to their corresponding tNFA - ;; states, and add them to the queue - (dolist (t-state state-set) - (setf state (tNFA--state-NFA-state t-state) - (tNFA--NFA-state-tNFA-state state) t-state) - (push state reset) - (queue-enqueue queue state)) - - (while (setq state (queue-dequeue queue)) - (cond - ;; branch or epsilon: add next states as necessary, copying tags - ;; across - ((or (eq (tNFA--NFA-state-type state) 'branch) - (eq (tNFA--NFA-state-type state) 'epsilon)) - (dolist (next (if (eq (tNFA--NFA-state-type state) 'epsilon) - (list (tNFA--NFA-state-next state)) - (tNFA--NFA-state-next state))) - (unless (tNFA--NFA-state-tNFA-state next) - (setf (tNFA--NFA-state-tNFA-state next) - (tNFA--state-create - next (tNFA--tags-copy (tNFA--NFA-state-tags state)))) - (push next reset) - ;; if next state hasn't already been seen in-degree times, - ;; add it to the end of the queue - (if (/= (decf (tNFA--NFA-state-count next)) 0) - (queue-enqueue queue next) - ;; if it has now been seen in-degree times, reset count - ;; and add it back to the front of the queue - (setf (tNFA--NFA-state-count next) - (tNFA--NFA-state-in-degree next)) - (queue-prepend queue next))))) - - ;; tag: add next state if necessary, updating tags if necessary - ((eq (tNFA--NFA-state-type state) 'tag) - (setq next (tNFA--NFA-state-next state)) - ;; if next state is not already in results list, or it is - ;; already in results but new tag value takes precedence... - (when (or (not (tNFA--NFA-state-tNFA-state next)) - (tNFA--tags< pos (tNFA--NFA-state-tag state) - (tNFA--NFA-state-tags next))) - ;; if next state is already in results, update tag value - (if (tNFA--NFA-state-tNFA-state next) - (tNFA--tags-set (tNFA--NFA-state-tags next) - (tNFA--NFA-state-tag state) pos) - ;; if state is not already in results, copy tags, updating - ;; tag value, and add next state to results list - (setq tags (tNFA--tags-copy (tNFA--NFA-state-tags state))) - (tNFA--tags-set tags (tNFA--NFA-state-tag state) pos) - (setf (tNFA--NFA-state-tNFA-state next) - (tNFA--state-create next tags)) - (push next reset)) - ;; if next state hasn't already been seen in-degree times, add - ;; it to the end of the queue - (if (/= (decf (tNFA--NFA-state-count next)) 0) - (queue-enqueue queue next) - ;; if it has now been seen in-degree times, reset count and - ;; add it back to the front of the queue - (setf (tNFA--NFA-state-count next) - (tNFA--NFA-state-in-degree next)) - (queue-prepend queue next)))) - - ;; anything else is a non-epsilon-transition state, so add it to - ;; result - (t (push (tNFA--NFA-state-tNFA-state state) result)) - )) - - ;; reset temporary NFA state link and count - (dolist (state reset) - (setf (tNFA--NFA-state-tNFA-state state) nil - (tNFA--NFA-state-count state) - (tNFA--NFA-state-in-degree state))) - ;; sort result states - (sort result - (lambda (a b) (< (tNFA--state-id a) (tNFA--state-id b)))) - )) - - - -;;; ================================================================ -;;; tNFA matching - -;;;###autoload -(defun* tNFA-regexp-match (regexp string &key (test 'eq)) - "Return non-nil if STRING matches REGEXP, nil otherwise. -Sets the match data if there was a match; see `match-beginning', -`match-end' and `match-string'. - -REGEXP and STRING can be any sequence type (vector, list, or -string); they need not be actual strings. Special characters in -REGEXP are still just that: elements of the sequence that are -characters which have a special meaning in regexps. - -The :test keyword argument specifies how to test whether two -individual elements of STRING are identical. The default is `eq'. - -Only a subset of the full Emacs regular expression syntax is -supported. There is no support for regexp constructs that are -only meaningful for strings (character ranges and character -classes inside character alternatives, and syntax-related -backslash constructs). Back-references and non-greedy postfix -operators are not supported, so `?' after a postfix operator -loses its special meaning. Also, matches are always anchored, so -`$' and `^' lose their special meanings (use `.*' at the -beginning and end of the regexp to get an unanchored match)." - - (let ((tNFA (tNFA-from-regexp regexp :test test)) - (i -1) tags match-data group-stack (grp 0)) - - ;; evolve tNFA according to characters of STRING - (catch 'fail - (dolist (chr (append string nil)) - (unless (setq tNFA (tNFA-next-state tNFA chr (incf i))) - (throw 'fail nil))) - - ;; if REGEXP matched... - (when (setq tags (tNFA--DFA-state-match tNFA)) - (setq match-data (make-list (+ (length tags) 2) nil)) - ;; set match data - (setf (nth 0 match-data) 0 - (nth 1 match-data) (length string)) - ;; set group match data if there were any groups - (dotimes (i (length tags)) - (if (eq (tNFA--tags-type tags i) 'max) - (unless (= (tNFA--tags-get tags i) -1) - (setf (nth (1+ (* 2 (pop group-stack))) match-data) - (tNFA--tags-get tags i))) - (incf grp) - (unless (= (tNFA--tags-get tags i) -1) - (push grp group-stack) - (setf (nth (* 2 grp) match-data) - (tNFA--tags-get tags i))))) - (set-match-data match-data) - tags)))) - - -(defun tNFA-group-data (tNFA) - "Return the group match data associated with a tNFA state." - (tNFA--tags-to-groups (tNFA--DFA-state-match tNFA))) - - - -(provide 'tNFA) - -;;; tNFA.el ends here