branch: externals/parser-generator commit 5f65cfcb4d8ab927551227b2df18c844d7741f76 Author: Christian Johansson <christ...@cvj.se> Commit: Christian Johansson <christ...@cvj.se>
More refactoring, using lists instead of string as grammar data type --- parser.el | 232 +++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 132 insertions(+), 100 deletions(-) diff --git a/parser.el b/parser.el index 102138f..ca57f8f 100644 --- a/parser.el +++ b/parser.el @@ -30,6 +30,10 @@ nil "Current look-ahead number used.") +(defvar parser--f-sets + nil + "Generated F-sets for grammar.") + ;; Macros @@ -104,14 +108,15 @@ (error "Invalid look-ahead number k!")) (setq parser--grammar G) (setq parser--look-ahead-number k) + (setq parser--f-sets nil) (parser--load-symbols)) -(defun parser--valid-empty-p (symbol) - "Return whether SYMBOL is empty identifier or not." - (eq symbol "e")) +(defun parser--valid-e-p (symbol) + "Return whether SYMBOL is the e identifier or not." + (eq symbol 'e)) (defun parser--valid-grammar-p (G) - "Return if grammar G is valid or not. Grammar should contain list with 4 elements: non-terminals (N), terminals (T), productions (P), start (S) where N, T and P are lists and S is a symbol." + "Return if grammar G is valid or not. Grammar should contain list with 4 elements: non-terminals (N), terminals (T), productions (P), start (S) where N, T and P are lists containing symbols and/or strings and S is a symbol or string." (let ((valid-p t)) (unless (listp G) (setq valid-p nil)) @@ -125,7 +130,9 @@ (not (listp (nth 0 G))) (not (listp (nth 1 G))) (not (listp (nth 2 G))) - (not (stringp (nth 3 G))))) + (not (or + (stringp (nth 3 G)) + (symbolp (nth 3 G)))))) (setq valid-p nil)) valid-p)) @@ -146,18 +153,17 @@ (defun parser--valid-sentential-form-p (symbols) "Return whether SYMBOLS is a valid sentential form in grammar or not." (let ((is-valid t)) - (let ((symbols-string (symbol-name symbols))) - (let ((symbols-length (length symbols-string)) - (symbol-index 0)) - (while (and - is-valid - (< symbol-index symbols-length)) - (let ((symbol-string (substring symbols-string symbol-index (1+ symbol-index)))) - (unless (or - (parser--valid-empty-p symbol-string) - (parser--valid-non-terminal-p symbol-string) - (parser--valid-terminal-p symbol-string)) - (setq is-valid nil)))))) + (let ((symbols-length (length symbols)) + (symbol-index 0)) + (while (and + is-valid + (< symbol-index symbols-length)) + (let ((symbol (nth symbol-index symbols))) + (unless (or + (parser--valid-e-p symbol) + (parser--valid-non-terminal-p symbol) + (parser--valid-terminal-p symbol)) + (setq is-valid nil))))) is-valid)) (defun parser--valid-terminal-p (symbol) @@ -191,7 +197,7 @@ (k (nth 0 state)) (i (nth 1 state)) (f-sets (nth 2 state)) - (disallow-empty-first (nth 3 state))) + (disallow-e-first (nth 3 state))) (parser--debug (message "input-tape-length: %s" input-tape-length) (message "k: %s" k) @@ -203,22 +209,22 @@ (let ((leading-terminals (nth 0 stack-symbol)) (all-leading-terminals-p (nth 1 stack-symbol)) (input-tape-index (nth 2 stack-symbol)) - (empty-first-p nil)) + (e-first-p nil)) (parser--debug (message "leading-terminals: %s" leading-terminals) (message "all-leading-terminals-p: %s" all-leading-terminals-p) (message "input-tape-index: %s" input-tape-index)) ;; Flag whether leading-terminal is empty or not - (when (string= leading-terminals "e") - (setq empty-first-p t)) + (when (parser--valid-e-p leading-terminals) + (setq e-first-p t)) - (parser--debug (message "empty-first-p: %s" empty-first-p)) + (parser--debug (message "e-first-p: %s" e-first-p)) ;; If leading terminal is empty and we have input-tape left, disregard it (when (and - (not disallow-empty-first) - empty-first-p + (not disallow-e-first) + e-first-p (< input-tape-index input-tape-length)) (parser--debug (message "Disregarding empty first terminal")) (setq leading-terminals "")) @@ -229,16 +235,18 @@ (< input-tape-index input-tape-length) (< leading-terminals-count k) all-leading-terminals-p) - (let ((rhs-element (substring input-tape input-tape-index (1+ input-tape-index))) + (let ((rhs-element (nth input-tape-index input-tape)) (rhs-type)) (parser--debug (message "rhs-element: %s" rhs-element)) ;; Determine symbol type - (if (string= rhs-element (upcase rhs-element)) - (setq rhs-type 'NON-TERMINAL) - (if (string= rhs-element "e") - (setq rhs-type 'EMPTY) - (setq rhs-type 'TERMINAL))) + (cond + ((parser--valid-non-terminal-p rhs-element) + (setq rhs-type 'NON-TERMINAL)) + ((parser--valid-e-p rhs-element) + (setq rhs-type 'EMPTY)) + ((parser--valid-terminal-p rhs-element) + (setq rhs-type 'TERMINAL))) (parser--debug (message "rhs-type: %s" rhs-type)) (cond @@ -262,23 +270,23 @@ ;; When we have a leading terminal and sub-terminal set is empty, don't append it (when (and (> leading-terminals-count 0) - (string= sub-terminal-set "e")) - (setq sub-terminal-set "")) + (parser--valid-e-p sub-terminal-set)) + (setq sub-terminal-set nil)) - (let ((sub-rhs-leading-terminals (concat leading-terminals sub-terminal-set))) + (let ((sub-rhs-leading-terminals (append leading-terminals sub-terminal-set))) (when (> (length sub-rhs-leading-terminals) k) - (setq sub-rhs-leading-terminals (substring sub-rhs-leading-terminals 0 k))) + (setq sub-rhs-leading-terminals (butlast sub-rhs-leading-terminals (- (length sub-rhs-leading-terminals) k)))) (push `(,sub-rhs-leading-terminals ,all-leading-terminals-p ,(1+ input-tape-index)) stack))) (setq sub-terminal-index (1+ sub-terminal-index))))) (parser--debug (message "Sub-terminal-set: %s" sub-terminal-set)) (when (or - (not (string= sub-terminal-set "e")) + (not (parser--valid-e-p sub-terminal-set)) (= input-tape-index (1- input-tape-length))) - (setq leading-terminals (concat leading-terminals sub-terminal-set)) + (setq leading-terminals (append leading-terminals sub-terminal-set)) (setq leading-terminals-count (+ leading-terminals-count (length sub-terminal-set))) (when (> leading-terminals-count k) - (setq leading-terminals (substring leading-terminals 0 k)) + (setq leading-terminals (butlast leading-terminals (- leading-terminals-count k))) (setq leading-terminals-count k))))) (parser--debug (message "Found no subsets for %s %s" rhs-element (1- i))))) @@ -286,19 +294,19 @@ ((equal rhs-type 'EMPTY) (if all-leading-terminals-p - (if disallow-empty-first + (if disallow-e-first (when (= leading-terminals-count 0) (setq all-leading-terminals-p nil)) (when (and (= leading-terminals-count 0) (= input-tape-index (1- input-tape-length))) - (setq leading-terminals (concat leading-terminals rhs-element)) + (setq leading-terminals (append leading-terminals rhs-element)) (setq leading-terminals-count (1+ leading-terminals-count)))) (setq all-leading-terminals-p nil))) ((equal rhs-type 'TERMINAL) (when all-leading-terminals-p - (setq leading-terminals (concat leading-terminals rhs-element)) + (setq leading-terminals (append leading-terminals rhs-element)) (setq leading-terminals-count (1+ leading-terminals-count)))))) (setq input-tape-index (1+ input-tape-index))) (when (> leading-terminals-count 0) @@ -306,70 +314,94 @@ f-set)) ;; Algorithm 5.5, p. 357 -;; TODO Make this work on strings instead of symbols (defun parser--first (β &optional disallow-e-first) "For sentential-form Β, in grammar, calculate first k terminals, optionally DISALLOW-E-FIRST." - (unless (parser--sentential-form-p β) + (unless (parser--valid-sentential-form-p β) (error "Invalid sentential form β!")) - (let ((productions (parser--get-grammar-productions)) - (k parser--look-ahead-number)) - (let ((f-sets (make-hash-table :test 'equal)) - (i 0) - (i-max (length productions))) - (while (< i i-max) - (parser--debug (message "i = %s" i)) - (let ((f-set (make-hash-table :test 'equal))) - - ;; Iterate all productions, set F_i - (dolist (p productions) - (let ((production-lhs (symbol-name (car p))) - (production-rhs (cdr p))) - (parser--debug - (message "Production-LHS: %s" production-lhs) - (message "Production-RHS: %s" production-rhs)) - - ;; Iterate all blocks in RHS - (let ((f-p-set)) - (dolist (rhs-p production-rhs) - (let ((rhs-string (symbol-name rhs-p))) - (let ((rhs-leading-terminals - (parser--f-set rhs-string `(,k ,i ,f-sets ,disallow-e-first) '(("" t 0))))) - (parser--debug - (message "Leading %d terminals at index %s (%s) -> %s = %s" k i production-lhs rhs-string rhs-leading-terminals)) - (when rhs-leading-terminals - (when (and - (listp rhs-leading-terminals) - (> (length rhs-leading-terminals) 0)) - (dolist (rhs-leading-terminals-string rhs-leading-terminals) - (when (and - (stringp rhs-leading-terminals-string) - (> (length rhs-leading-terminals-string) 0)) - (push rhs-leading-terminals-string f-p-set)))))))) - - ;; Make set distinct - (setq f-p-set (parser--distinct f-p-set)) + (let* ((productions (parser--get-grammar-productions)) + (k parser--look-ahead-number) + (i-max (length productions))) + + ;; Generate F-sets only once per grammar + (unless parser--f-sets + (let ((f-sets (make-hash-table :test 'equal)) + (i 0)) + (while (< i i-max) + (parser--debug (message "i = %s" i)) + (let ((f-set (make-hash-table :test 'equal))) + + ;; Iterate all productions, set F_i + (dolist (p productions) + (let ((production-lhs (car p)) + (production-rhs (cdr p))) (parser--debug - (message "F_%s_%s(%s) = %s" i k production-lhs f-p-set)) - (puthash production-lhs (nreverse f-p-set) f-set)))) - (puthash i f-set f-sets) - (setq i (+ i 1)))) - - ;; TODO Iterate each symbol in β using a PDA algorithm - (let ((symbol-length (length β)) - (symbol-index 0) - (first-string "") - (first-length 0)) - (while (and - (< symbol-index symbol-length) - (< first-length k)) - (let ((symbol-string (substring β symbol-index (1+ symbol-index)))) - (cond - ((parser--valid-terminal-p symbol-string) - (setq first-string (concat first-string symbol-string)) - (setq first-length (1+ first-length))) - ((parser--valid-non-terminal-p symbol-string) - ;; TODO Handle this scenario here were a non-terminal can result in different FIRST sets - (sort (gethash (symbol-name production) (gethash (1- i-max) f-sets)) 'string<)))) + (message "Production-LHS: %s" production-lhs) + (message "Production-RHS: %s" production-rhs)) + + ;; Iterate all blocks in RHS + (let ((f-p-set)) + (dolist (rhs-p production-rhs) + (let ((rhs-string rhs-p)) + (let ((rhs-leading-terminals + (parser--f-set rhs-string `(,k ,i ,f-sets ,disallow-e-first) '(("" t 0))))) + (parser--debug + (message "Leading %d terminals at index %s (%s) -> %s = %s" k i production-lhs rhs-string rhs-leading-terminals)) + (when rhs-leading-terminals + (when (and + (listp rhs-leading-terminals) + (> (length rhs-leading-terminals) 0)) + (dolist (rhs-leading-terminals-string rhs-leading-terminals) + (when (and + (stringp rhs-leading-terminals-string) + (> (length rhs-leading-terminals-string) 0)) + (push rhs-leading-terminals-string f-p-set)))))))) + + ;; Make set distinct + (setq f-p-set (parser--distinct f-p-set)) + (parser--debug + (message "F_%s_%s(%s) = %s" i k production-lhs f-p-set)) + (puthash production-lhs (nreverse f-p-set) f-set)))) + (puthash i f-set f-sets) + (setq i (+ i 1)))) + (setq parser--f-sets f-sets))) + + ;; Iterate each symbol in β using a PDA algorithm + (let ((state 'parsing) + (input-tape β) + (input-tape-length (length β)) + (stack '((0 0 nil))) + (first-list nil)) + (while stack + (let ((stack-topmost (pop stack))) + (let ((input-tape-index (car stack-topmost)) + (first-length (car (cdr stack-topmost))) + (first (car (cdr (cdr stack-topmost))))) + (while (and + (< input-tape-index input-tape-length) + (< first-length k)) + (let ((symbol (nth input-tape-index input-tape))) + (cond + ((parser--valid-terminal-p symbol) + (push symbol first) + (setq first-length (1+ first-length))) + ((parser--valid-non-terminal-p symbol) + (let ((symbol-f-set (sort (gethash symbol (gethash (1- i-max) parser--f-sets)) 'string<))) + (when (> (length symbol-f-set) 0) + ;; Handle this scenario here were a non-terminal can result in different FIRST sets + (let ((symbol-f-set-index 1) + (symbol-f-set-length (length symbol-f-set))) + (while (< symbol-f-set-index symbol-f-set-length) + (let ((symbol-f-set-element (nth symbol-f-set-index symbol-f-set))) + (let ((alternative-first-length (+ first-length (length symbol-f-set-element))) + (alternative-first (append first symbol-f-set-element)) + (alternative-tape-index (1+ input-tape-index))) + (push `(,alternative-tape-index ,alternative-first-length ,alternative-first) stack)))))) + (setq first-length (+ first-length (length (car symbol-f-set)))) + (setq first (append first (car symbol-f-set))))))) + (setq input-tape-index (1+ input-tape-index))) + (when (> first-length 0) + (push first first-list))))) + first-list))) (defun parser--v-set (y) "Calculate valid LRk-sets for the viable-prefix Y in grammar G with look-ahead K."