branch: externals/pyim commit 172e8e14e44c91c471dcabdbd9798d211f90bca6 Author: Feng Shu <tuma...@163.com> Commit: Feng Shu <tuma...@163.com>
Add pyim-pymap-cchars2pys-get and use it. --- pyim-cstring.el | 83 +++++++++++------------------------------------------ pyim-pymap.el | 69 ++++++++++++++++++++++++++++++++++++++++++++ tests/pyim-tests.el | 10 +++++-- 3 files changed, 92 insertions(+), 70 deletions(-) diff --git a/pyim-cstring.el b/pyim-cstring.el index 1d1e4df051..2d718a72dd 100644 --- a/pyim-cstring.el +++ b/pyim-cstring.el @@ -131,79 +131,28 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结 "从 Dcache 中搜索 CSTRING 对应的拼音。" (let* ((string-parts (pyim-cstring--partition cstring)) (pinyins-list - (mapcar (lambda (str) - (if (pyim-string-match-p "\\cc" str) - (when-let ((code (cl-find-if-not - (lambda (c) - ;; 注意:Pinyin 词库中不包含 "/" 字符。 - (string-match-p c "/")) - (pyim-dcache-get str '(word2code))))) - (split-string code "-")) - (list str))) + (mapcar #'pyim-cstring--get-pinyin-code string-parts))) (unless (member nil pinyins-list) (list (apply #'append pinyins-list))))) +(defun pyim-cstring--get-pinyin-code (str) + "从 Dcache 中获取中文字符串 STR 对应的拼音。 + +如果 STR 不包含中文,不做特殊处理。" + (if (pyim-string-match-p "\\cc" str) + (when-let ((code (cl-find-if-not + (lambda (c) + ;; 注意:Pinyin 词库中不包含 "/" 字符。 + (string-match-p c "/")) + (pyim-dcache-get str '(word2code))))) + (split-string code "-")) + (list str))) + (defun pyim-cstring-to-pinyin--from-pymap (cstring) "使用 PYMAP 提供的工具来搜索 CSTRING 对应的拼音。" - (let* ((string-parts (pyim-cstring--partition cstring t)) - (pinyins-list - ;; ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing")) - (mapcar (lambda (str) - (if (pyim-string-match-p "\\cc" str) - (pyim-pymap-cchar2py-get str) - (list str))) - string-parts))) - ;; 通过排列组合的方式, 重排 pinyins-list。 - ;; 比如:(("Hello") ("yin") ("hang")) -> (("Hello" "yin" "hang")) - (pyim-permutate-list - (pyim-cstring--adjust-duoyinzi - string-parts pinyins-list)))) - -(defun pyim-cstring--adjust-duoyinzi (string-parts pinyins-list) - "根据 STRING-PARTS 对 PINYINS-LIST 进行校正。 - -比如: - -1. STRING-PARTS: (\"人\" \"民\" \"银\" \"行\") -2. PINYINS-LIST: ((\"ren\") (\"min\") (\"yin\") (\"hang\" \"xing\")) -3. 输出结果为: ((\"ren\") (\"min\") (\"yin\") (\"hang\")) - -这个函数依赖 `pyim-pymap-duoyinzi' 提供的多音字数据。" - (let ((n (length pinyins-list)) - output) - (dotimes (i n) - (let ((pinyins (nth i pinyins-list)) - ;; 当前位置对应的汉字和位置前后汉字组成的两字词语。 - (words-list (list (when (>= (- i 1) 0) - (concat (nth (- i 1) string-parts) - (nth i string-parts))) - (when (< (+ i 1) n) - (concat (nth i string-parts) - (nth (+ i 1) string-parts))))) - ;; 当前位置汉字 - (char-list (list (nth i string-parts)))) - (if (= (length pinyins) 1) - (push pinyins output) - (let ((py-adjusted - (or - ;; NOTE: 多音字校正规则: - ;; 1. 首先通过 pyim 自带的多音字词语来校正,具体见: - ;; `pyim-pymap-duoyinzi-words' - (pyim-pymap-possible-cchar-pinyin pinyins words-list) - ;; 2. 然后通过 pyim 自带的多音字常用读音进行校正, 具体见: - ;; `pyim-pymap-duoyinzi-chars', - ;; - ;; NOTE: 如果用户想要使用某个汉字的偏僻读音,这样处理是有问题 - ;; 的,但大多数情况我们还是使用汉字的常用读音,让偏僻的读音进 - ;; 入用户个人词库似乎也没有什么好处。 - (pyim-pymap-possible-cchar-pinyin pinyins char-list t)))) - ;; 3. 如果多音字校正没有结果,就使用未校正的信息。 - (push (if py-adjusted - (list py-adjusted) - pinyins) - output))))) - (reverse output))) + (pyim-pymap-cchars2pys-get + (pyim-cstring--partition cstring t))) ;;;###autoload (defun pyim-cstring-to-pinyin-simple (string &optional shou-zi-mu separator return-list) diff --git a/pyim-pymap.el b/pyim-pymap.el index 4d014d9025..66dfae4069 100644 --- a/pyim-pymap.el +++ b/pyim-pymap.el @@ -1044,6 +1044,30 @@ If FORCE is non-nil, FORCE build." output (remove "|" output))))) +(defun pyim-pymap-cchars2pys-get (cchars) + "将汉字列表转换为拼音列表,转换过程中矫正多音字。 + +比如: +1. CCHARS: (\"你\" \"好\") +2. OUTPUTS: ((\"ni\" \"hao\")) + +注意事项: +1. 这个函数遇到非汉字字符串时,原样输出。 +2. 多音字矫正依赖 pymap 自带的多音字矫正信息的完善程度,可能会出 + 现矫正不正确的情况,这个函数为了保证性能,只处理常用多音字。" + (let* ((pinyins-list + ;; ("Hello" "银" "行") -> (("Hello") ("yin") ("hang" "xing")) + (mapcar (lambda (str) + (if (pyim-string-match-p "\\cc" str) + (pyim-pymap-cchar2py-get str) + (list str))) + cchars))) + ;; 通过排列组合的方式, 重排 pinyins-list。 + ;; 比如:(("Hello") ("yin") ("hang")) -> (("Hello" "yin" "hang")) + (pyim-permutate-list + (pyim-pymap--adjust-duoyinzi + cchars pinyins-list)))) + (defun pyim-pymap-cchar2py-get (char-or-str) "获取字符或者字符串 CHAR-OR-STR 对应的拼音 code. @@ -1062,6 +1086,51 @@ pyim 在特定的时候需要读取一个汉字的拼音,这个工作由此完 (when (= (length key) 1) (gethash key pyim-pymap--cchar2py-cache)))) +(defun pyim-pymap--adjust-duoyinzi (cchars-list pinyins-list) + "根据 CCHARS-LIST 对 PINYINS-LIST 进行校正。 + +比如: + +1. CCHARS-LIST: (\"人\" \"民\" \"银\" \"行\") +2. PINYINS-LIST: ((\"ren\") (\"min\") (\"yin\") (\"hang\" \"xing\")) +3. 输出结果为: ((\"ren\") (\"min\") (\"yin\") (\"hang\")) + +这个函数依赖 `pyim-pymap-duoyinzi' 提供的多音字数据。" + (let ((n (length pinyins-list)) + output) + (dotimes (i n) + (let ((pinyins (nth i pinyins-list)) + ;; 当前位置对应的汉字和位置前后汉字组成的两字词语。 + (words-list (list (when (>= (- i 1) 0) + (concat (nth (- i 1) cchars-list) + (nth i cchars-list))) + (when (< (+ i 1) n) + (concat (nth i cchars-list) + (nth (+ i 1) cchars-list))))) + ;; 当前位置汉字 + (char-list (list (nth i cchars-list)))) + (if (= (length pinyins) 1) + (push pinyins output) + (let ((py-adjusted + (or + ;; NOTE: 多音字校正规则: + ;; 1. 首先通过 pyim 自带的多音字词语来校正,具体见: + ;; `pyim-pymap-duoyinzi-words' + (pyim-pymap-possible-cchar-pinyin pinyins words-list) + ;; 2. 然后通过 pyim 自带的多音字常用读音进行校正, 具体见: + ;; `pyim-pymap-duoyinzi-chars', + ;; + ;; NOTE: 如果用户想要使用某个汉字的偏僻读音,这样处理是有问题 + ;; 的,但大多数情况我们还是使用汉字的常用读音,让偏僻的读音进 + ;; 入用户个人词库似乎也没有什么好处。 + (pyim-pymap-possible-cchar-pinyin pinyins char-list t)))) + ;; 3. 如果多音字校正没有结果,就使用未校正的信息。 + (push (if py-adjusted + (list py-adjusted) + pinyins) + output))))) + (reverse output))) + (defun pyim-pymap-possible-cchar-pinyin (cchar-pinyins cchar-words &optional search-char) "寻找一个汉字当前最可能的读音。 diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el index 87f145f496..818ab5ce31 100644 --- a/tests/pyim-tests.el +++ b/tests/pyim-tests.el @@ -297,6 +297,10 @@ (should-not (pyim-pymap-duoyinzi-include-p "银子")) (should (equal (pyim-pymap-py2duoyinzi-get "ai" t) '("艾"))) + + (should (equal (pyim-pymap-cchars2pys-get '("hello" "你" "好" "ma")) + '(("hello" "ni" "hao" "ma")))) + (should (equal (mapcar (lambda (x) (concat (substring x 0 1) (substring x -1))) @@ -808,19 +812,19 @@ '("bu" "pi") '("不") t) "bu")) - (should (equal (pyim-cstring--adjust-duoyinzi + (should (equal (pyim-pymap--adjust-duoyinzi '("银" "行" "传" "说") '(("yin") ("xing" "heng" "hang") ("zhuan" "chuan") ("yue" "shuo" "shui"))) '(("yin") ("hang") ("chuan") ("shuo")))) - (should (equal (pyim-cstring--adjust-duoyinzi + (should (equal (pyim-pymap--adjust-duoyinzi '("银" "行" "很" "行") '(("yin") ("xing" "heng" "hang") ("hen") ("xing" "heng" "hang"))) '(("yin") ("hang") ("hen") ("xing")))) - (should (equal (pyim-cstring--adjust-duoyinzi + (should (equal (pyim-pymap--adjust-duoyinzi '("银" "行" "行" "业" "很" "行" "不" "行" "也" "行" "行" "也" "行")