branch: externals/pyim commit e0f80e5d36926f269174cc16385754ec6141f791 Merge: 1e0834c456 7288242a6d Author: Feng Shu <tuma...@163.com> Commit: Feng Shu <tuma...@163.com>
Merge branch 'dcache' --- README.org | 2 +- pyim-candidates.el | 2 +- pyim-cstring.el | 2 +- pyim-dcache.el | 206 ++++++------ pyim-dhashcache.el | 922 ++++++++++++++++++++++++++++------------------------ pyim-dregcache.el | 575 +++++++++++++++++--------------- pyim-process.el | 8 +- pyim.el | 27 +- tests/pyim-tests.el | 10 +- 9 files changed, 927 insertions(+), 827 deletions(-) diff --git a/README.org b/README.org index d278dc124e..81e423139b 100644 --- a/README.org +++ b/README.org @@ -33,7 +33,7 @@ 1. 五笔用户 1. 需要 (require 'pyim-wbdict), 加载五笔 scheme 设置。 2. 需要将自己的五笔词库文件中的 code-prefix "." 替换为 "wubi/". - 3. 运行 `pyim-dcache-upgrade' 命令,升级 icode2word 词库缓存。 + 3. 运行 `pyim-upgrade' 命令,升级 icode2word 词库缓存。 2. 仓颉用户 1. 需要 (require 'pyim-cangjie5dict), 加载仓颉 scheme 设置。 2. 需要将自己的五笔词库文件中的 code-prefix "@" 替换为 "cangjie/". diff --git a/pyim-candidates.el b/pyim-candidates.el index 3f035fddbe..38675707b6 100644 --- a/pyim-candidates.el +++ b/pyim-candidates.el @@ -46,7 +46,7 @@ ;; ** 获取备选词列表 (defun pyim-candidates-sort (candidates) "对 CANDIDATES 进行排序。" - (pyim-dcache-call-api 'sort-words candidates)) + (pyim-dcache-sort-words candidates)) (cl-defgeneric pyim-candidates-get-chief (scheme &optional personal-words common-words) "PYIM 输入法第一位候选词的获取策略。") diff --git a/pyim-cstring.el b/pyim-cstring.el index c90b78afec..7e9dbff6ba 100644 --- a/pyim-cstring.el +++ b/pyim-cstring.el @@ -182,7 +182,7 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结 (mapcar (lambda (x) (when (string-prefix-p prefix x) (string-remove-prefix prefix x))) - (sort (cl-copy-list (pyim-dcache-call-api 'search-word-code string)) + (sort (cl-copy-list (pyim-dcache-search-word-code string)) (lambda (a b) (> (length a) (length b)))))) (codes (remove nil dcache-codes))) diff --git a/pyim-dcache.el b/pyim-dcache.el index 79e0236bab..7dff11ebb8 100644 --- a/pyim-dcache.el +++ b/pyim-dcache.el @@ -65,26 +65,7 @@ pyim 对资源的消耗。 2. 自动更新功能无法正常工作,用户通过手工从其他机器上拷贝 dcache 文件的方法让 pyim 正常工作。") -;; ** Dcache API 调用功能 -(defun pyim-dcache-call-api (api-name &rest api-args) - "Get backend API named API-NAME then call it with arguments API-ARGS." - ;; make sure the backend is load - (unless (featurep pyim-dcache-backend) - (require pyim-dcache-backend)) - (let ((func (intern (concat (symbol-name pyim-dcache-backend) - "-" (symbol-name api-name))))) - (if (functionp func) - (apply func api-args) - (when pyim-debug - (message "%S 不是一个有效的 dcache api 函数." (symbol-name func)) - ;; Need to return nil - nil)))) - -;; ** Dcache 变量处理相关功能 -(defun pyim-dcache-init-variables () - "初始化 dcache 缓存相关变量." - (pyim-dcache-call-api 'init-variables)) - +;; ** Dcache 变量初始化相关函数 (defmacro pyim-dcache-init-variable (variable &optional fallback-value) "初始化 VARIABLE. @@ -96,18 +77,22 @@ dcache 文件的方法让 pyim 正常工作。") ,fallback-value (make-hash-table :test #'equal))))) -(defmacro pyim-dcache-reload-variable (variable) - "从 `pyim-dcache-directory' 重新读取并设置 VARIABLE 的值." - `(when (symbolp ',variable) - (setq ,variable (or (pyim-dcache-get-value ',variable) - (make-hash-table :test #'equal))))) - (defun pyim-dcache-get-value (variable) "从 `pyim-dcache-directory' 中读取与 VARIABLE 对应的文件中保存的值." (let ((file (expand-file-name (url-hexify-string (symbol-name variable)) pyim-dcache-directory))) (pyim-dcache-get-value-from-file file))) +(defun pyim-dcache-get-value-from-file (file) + "读取保存到 FILE 里面的 value." + (when (and (> (length file) 0) + (file-exists-p file)) + (with-temp-buffer + (insert-file-contents file) + (ignore-errors + (read (current-buffer)))))) + +;; ** Dcache 保存变量相关函数 (defun pyim-dcache-save-variable (variable value &optional auto-backup-threshold) "将 VARIABLE 变量的取值保存到 `pyim-dcache-directory' 中对应文件中. @@ -117,14 +102,6 @@ dcache 文件的方法让 pyim 正常工作。") pyim-dcache-directory))) (pyim-dcache-save-value-to-file value file auto-backup-threshold))) -(defun pyim-dcache-value-length (value) - "获取 VALUE 的某个可以作为长度的值." - (or (ignore-errors - (if (hash-table-p value) - (hash-table-count value) - (length value))) - 0)) - (defun pyim-dcache-save-value-to-file (value file &optional auto-backup-threshold) "将 VALUE 保存到 FILE 文件中. @@ -155,16 +132,14 @@ AUTO-BACKUP-THRESHOLD 倍, 那么原值将自动备份到 FILE 对应的备份 (insert ";; End:") (pyim-dcache-write-file file))))) -(defun pyim-dcache-get-value-from-file (file) - "读取保存到 FILE 里面的 value." - (when (and (> (length file) 0) - (file-exists-p file)) - (with-temp-buffer - (insert-file-contents file) - (ignore-errors - (read (current-buffer)))))) +(defun pyim-dcache-value-length (value) + "获取 VALUE 的某个可以作为长度的值." + (or (ignore-errors + (if (hash-table-p value) + (hash-table-count value) + (length value))) + 0)) -;; ** Dcache 文件处理功能 (defun pyim-dcache-write-file (filename &optional confirm) "A helper function to write dcache files." (let ((coding-system-for-write 'utf-8-unix) @@ -181,98 +156,99 @@ AUTO-BACKUP-THRESHOLD 倍, 那么原值将自动备份到 FILE 对应的备份 (write-region (point-min) (point-max) filename nil :silent) (message "Saving file %s..." filename))) -(defun pyim-dcache-save-caches () - "保存 dcache. +(defun pyim-dcache-create-files-md5 (files) + "为 FILES 生成 md5 字符串。" + ;; 当需要强制更新 dict 缓存时,更改这个字符串。 + (let ((version "v1")) + (md5 (prin1-to-string + (mapcar (lambda (file) + (list version file (nth 5 (file-attributes file 'string)))) + files))))) - 将用户选择过的词生成的缓存和词频缓存的取值 - 保存到它们对应的文件中. +;; ** Dcache 重新加载变量相关函数 +(defmacro pyim-dcache-reload-variable (variable) + "从 `pyim-dcache-directory' 重新读取并设置 VARIABLE 的值." + `(when (symbolp ',variable) + (setq ,variable (or (pyim-dcache-get-value ',variable) + (make-hash-table :test #'equal))))) - 这个函数默认作为 `kill-emacs-hook' 使用。" - (interactive) - (pyim-dcache-call-api 'save-personal-dcache-to-file) - t) +;; ** Dcache 初始化功能接口 +(cl-defgeneric pyim-dcache-init-variables () + "初始化 dcache 缓存相关变量." + nil) -;; ** Dcache 导出功能 -(defun pyim-dcache-export-words-and-counts (file &optional confirm ignore-counts) - "将个人词条以及词条对应的词频信息导出到文件 FILE. +(cl-defmethod pyim-dcache-init-variables :before () + (unless (featurep pyim-dcache-backend) + (require pyim-dcache-backend))) -如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 -non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式" - (interactive "F将词条和词频信息导出到文件: ") - (pyim-dcache-init-variables) - (pyim-dcache-call-api 'export-words-and-counts file confirm ignore-counts) - (message "PYIM: 词条和词频信息导出完成。")) +;; ** Dcache 检索词条功能接口 +(cl-defgeneric pyim-dcache-get (_code &optional _from) + "从 FROM 对应的 dcache 中搜索 CODE, 得到对应的词条. -(defun pyim-dcache-export-personal-words (file &optional confirm) - "将用户的个人词条导出为 pyim 词库文件. +当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个 +code 对应的中文词条了." + nil) -如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil, -文件存在时将会提示用户是否覆盖,默认为覆盖模式。" - (interactive "F将个人词条导出到文件:") - (pyim-dcache-init-variables) - (pyim-dcache-call-api 'export-personal-words file confirm) - (message "PYIM: 个人词条导出完成。")) +(cl-defmethod pyim-dcache-get :before (_code &optional _from) + (unless (featurep pyim-dcache-backend) + (require pyim-dcache-backend))) -;; ** Dcache 更新功能 -(defun pyim-dcache-update (&optional force) - "读取并加载所有相关词库 dcache. +;; ** Dcache 代码反查功能接口 +(cl-defgeneric pyim-dcache-search-word-code (word) + "从 dcache 中搜索 WROD 对应的 code.") -如果 FORCE 为真,强制加载。" - (pyim-dcache-call-api 'update force)) +;; ** Dcache 加词功能接口 +(cl-defgeneric pyim-dcache-insert-word (word code prepend) + "将词条 WORD 插入到 dcache 中。 -(defun pyim-dcache-create-files-md5 (files) - "为 FILES 生成 md5 字符串。" - ;; 当需要强制更新 dict 缓存时,更改这个字符串。 - (let ((version "v1")) - (md5 (prin1-to-string - (mapcar (lambda (file) - (list version file (nth 5 (file-attributes file 'string)))) - files))))) +如果 PREPEND 为 non-nil, 词条将放到已有词条的最前面。 +内部函数会根据 CODE 来确定插入对应的 hash key.") -(defun pyim-dcache-update-wordcount (word &optional wordcount-handler) +;; ** Dcache 删词功能 +(cl-defgeneric pyim-dcache-delete-word (word) + "将中文词条 WORD 从个人词库中删除") + +;; ** Dcache 更新功能接口 +(cl-defgeneric pyim-dcache-update (&optional force) + "读取并加载所有相关词库 dcache, 如果 FORCE 为真,强制加载。") + +;; ** Dcache 更新词条统计量功能接口 +(cl-defgeneric pyim-dcache-update-wordcount (word &optional wordcount-handler) "保存 WORD 词频. 1. 如果 WORDCOUNT-HANDLER 是一个函数:那么其返回值将作为词频保存, 参数为原有词频。 2. 如果 WORDCOUNT-HANDLER 是一个数值:那么这个数值直接作为词频保存。 -3. 如果 WORDCOUNT-HANDLER 为其他值:词频不变." - (pyim-dcache-call-api 'update-iword2count word wordcount-handler)) +3. 如果 WORDCOUNT-HANDLER 为其他值:词频不变.") -;; ** Dcache 加词功能 -(defun pyim-dcache-insert-word (word code prepend) - "将词条 WORD 插入到 dcache 中。 +;; ** Dcache 升级功能接口 +(cl-defgeneric pyim-dcache-upgrade () + "升级词库缓存.") -如果 PREPEND 为 non-nil, 词条将放到已有词条的最前面。 -内部函数会根据 CODE 来确定插入对应的 hash key." - (pyim-dcache-call-api 'insert-word-into-icode2word word code prepend) - ;; NOTE: 保存词条到 icode2word 词库缓存的同时,也在 ishortcode2word 词库缓存中 - ;; 临时写入一份,供当前 Emacs session 使用,但退出时 pyim 不会保存 - ;; ishortcode2word 词库缓存到文件,因为下次启动 Emacs 的时候,ishortcode2word - ;; 词库缓存会从 icode2word 再次重建。 - (pyim-dcache-call-api 'insert-word-into-ishortcode2word word code prepend)) - -;; ** Dcache 升级功能 -(defun pyim-dcache-upgrade () - "升级词库缓存. - -当前已有的功能: -1. 基于 :code-prefix-history 信息,升级为新的 code-prefix。" - (interactive) - (pyim-dcache-call-api 'upgrade-icode2word)) +;; ** Dcache 排序功能接口 +(cl-defgeneric pyim-dcache-sort-words (words) + "对 WORDS 进行排序。" + words) -;; ** Dcache 删词功能 -(defun pyim-dcache-delete-word (word) - "将中文词条 WORD 从个人词库中删除" - (pyim-dcache-call-api 'delete-word word)) +;; ** Dcache 保存功能接口 +(cl-defgeneric pyim-dcache-save-caches () + "保存 dcache. -;; ** Dcache 检索功能 -(defun pyim-dcache-get (code &optional from) - "从 FROM 对应的 dcache 中搜索 CODE, 得到对应的词条. +将用户选择过的词生成的缓存和词频缓存的取值 +保存到它们对应的文件中.") -当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个 -code 对应的中文词条了." - (when code - (pyim-dcache-call-api 'get code from))) +;; ** Dcache 导出功能接口 +(cl-defgeneric pyim-dcache-export-words-and-counts (file &optional confirm ignore-counts) + "将个人词条以及词条对应的词频信息导出到文件 FILE. + +如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 +non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式") + +(cl-defgeneric pyim-dcache-export-personal-words (file &optional confirm) + "将用户的个人词条导出为 pyim 词库文件. + +如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil, +文件存在时将会提示用户是否覆盖,默认为覆盖模式。") ;; * Footer (provide 'pyim-dcache) diff --git a/pyim-dhashcache.el b/pyim-dhashcache.el index db2b0a8ad6..f07cdf7811 100644 --- a/pyim-dhashcache.el +++ b/pyim-dhashcache.el @@ -79,11 +79,167 @@ (defvar pyim-dhashcache-update-iword2priority-p nil) (defvar pyim-dhashcache-update-code2word-running-p nil) -(defun pyim-dhashcache-update (&optional force) +;; ** 初始化 dhashcache 相关函数 +(cl-defmethod pyim-dcache-init-variables + (&context (pyim-dcache-backend (eql pyim-dhashcache))) + "初始化 dcache 缓存相关变量." + (when (and (not pyim-dhashcache-icode2word) + pyim-dcache-directory + (file-directory-p pyim-dcache-directory) + (directory-files pyim-dcache-directory nil "-backup-")) + (message "PYIM: 在 %S 目录中发现备份文件的存在,可能是词库缓存文件损坏导致,请抓紧检查处理!!!" + pyim-dcache-directory)) + (pyim-dhashcache-init-count-and-priority-variables) + (pyim-dcache-init-variable pyim-dhashcache-code2word) + (pyim-dcache-init-variable pyim-dhashcache-word2code) + (pyim-dcache-init-variable pyim-dhashcache-shortcode2word) + (pyim-dcache-init-variable pyim-dhashcache-icode2word) + (pyim-dcache-init-variable pyim-dhashcache-ishortcode2word)) + +(defun pyim-dhashcache-init-count-and-priority-variables () + "初始化 count 相关的变量。" + (pyim-dcache-init-variable pyim-dhashcache-iword2count) + (pyim-dcache-init-variable pyim-dhashcache-iword2count-log) + (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-10-words) + (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-50-words) + (pyim-dcache-init-variable pyim-dhashcache-iword2priority)) + +;; ** 从 dhashcache 搜索词条相关函数 +(cl-defmethod pyim-dcache-get + (code &context (pyim-dcache-backend (eql pyim-dhashcache)) + &optional from) + "从 FROM 对应的 dcaches 中搜索 CODE, 得到对应的词条. + +当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个 +code 对应的中文词条了。 + +如果 FROM 为 nil, 则默认搜索 `pyim-dhashcache-icode2word' 和 +`pyim-dhashcache-code2word' 两个 dcache." + (when code + (let* ((caches (mapcar (lambda (x) + (intern (concat "pyim-dhashcache-" (symbol-name x)))) + (or (and from + (if (listp from) + from + (list from))) + '(icode2word code2word)))) + result) + (dolist (cache caches) + (let* ((cache (ignore-errors (symbol-value cache))) + (value (and cache (gethash code cache)))) + ;; 处理 iword2count. + (unless (listp value) + (setq value (list value))) + (when value + (setq result (append result value))))) + result))) + +;; ** 从 dhashcache 搜索代码相关函数 +(cl-defmethod pyim-dcache-search-word-code + (string &context (pyim-dcache-backend (eql pyim-dhashcache))) + (gethash string pyim-dhashcache-word2code)) + +;; ** 给 dhashcache 添加词条相关函数 +(cl-defmethod pyim-dcache-insert-word + (word code prepend + &context (pyim-dcache-backend (eql pyim-dhashcache))) + "将词条 WORD 插入到下面两个词库缓存中。 + +1. `pyim-dhashcache-icode2word' +2. `pyim-dhashcache-insert-word-into-ishortcode2word'." + (pyim-dhashcache-insert-word-into-icode2word word code prepend) + ;; NOTE: 保存词条到 icode2word 词库缓存的同时,也在 ishortcode2word 词库缓存中 + ;; 临时写入一份,供当前 Emacs session 使用,但退出时 pyim 不会保存 + ;; ishortcode2word 词库缓存到文件,因为下次启动 Emacs 的时候,ishortcode2word + ;; 词库缓存会从 icode2word 再次重建。 + (pyim-dhashcache-insert-word-into-ishortcode2word word code prepend)) + +(defmacro pyim-dhashcache-put (cache code &rest body) + "将 BODY 的返回值保存到 CACHE 对应的 CODE 中。 + +注意事项:这个宏是一个指代宏,其中 orig-value 在这个宏中有特殊含 +义,代表原来 code 对应的取值。" + (declare (indent 0)) + (let ((key (make-symbol "key")) + (table (make-symbol "table")) + (new-value (make-symbol "new-value"))) + `(let* ((,key ,code) + (,table ,cache) + (orig-value (gethash ,key ,table)) + ,new-value) + (setq ,new-value (progn ,@body)) + (puthash ,key ,new-value ,table)))) + +(defun pyim-dhashcache-insert-word-into-icode2word (word code prepend) + "将词条 WORD 插入到 icode2word 词库缓存 CODE 键对应的位置. + +默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放 +到已有词条的最前面。" + (pyim-dhashcache-put + pyim-dhashcache-icode2word code + (if prepend + `(,word ,@(remove word orig-value)) + `(,@(remove word orig-value) ,word)))) + +(defun pyim-dhashcache-insert-word-into-ishortcode2word (word code prepend) + "将词条 WORD 插入到 ishortcode2word 词库缓存 CODE 首字母字符串对应的位置. + +默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放 +到已有词条的最前面。" + (dolist (newcode (pyim-dhashcache-get-ishortcodes code)) + (pyim-dhashcache-put + pyim-dhashcache-ishortcode2word + newcode + (if prepend + `(,word ,@(remove word orig-value)) + `(,@(remove word orig-value) ,word))))) + +(defun pyim-dhashcache-get-ishortcodes (code) + "获取CODE 所有的简写 ishortcodes. + +比如: ni-hao -> (n-h) + +注意事项:这个函数用于全拼输入法。" + (when (and (> (length code) 0) + (not (pyim-string-match-p "/" code)) + (not (pyim-string-match-p "[^a-z-]" code))) + (list (mapconcat + (lambda (x) + (substring x 0 1)) + (split-string code "-") "-")))) + +;; ** 从 dhashcache 删除词条相关函数 +(cl-defmethod pyim-dcache-delete-word + (word &context (pyim-dcache-backend (eql pyim-dhashcache))) + "将中文词条 WORD 从个人词库中删除" + (maphash + (lambda (key value) + (when (member word value) + (let ((new-value (remove word value))) + (if new-value + (puthash key new-value pyim-dhashcache-icode2word) + (remhash key pyim-dhashcache-icode2word))))) + pyim-dhashcache-icode2word) + (maphash + (lambda (key value) + (when (member word value) + (print value) + (let ((new-value (remove word value))) + (if new-value + (puthash key new-value pyim-dhashcache-ishortcode2word) + (remhash key pyim-dhashcache-ishortcode2word))))) + pyim-dhashcache-ishortcode2word) + (remhash word pyim-dhashcache-iword2count) + (remhash word pyim-dhashcache-iword2count-log) + (remhash word pyim-dhashcache-iword2priority)) + +;; ** 更新 dhashcache 相关函数 +(cl-defmethod pyim-dcache-update + (&context (pyim-dcache-backend (eql pyim-dhashcache)) &optional force) "读取并加载所有相关词库 dcache. 如果 FORCE 为真,强制加载。" - (pyim-dhashcache-init-variables) + (pyim-dcache-init-variables) (when pyim-dcache-auto-update (pyim-dhashcache-update-iword2priority force) (pyim-dhashcache-update-personal-words force) @@ -91,22 +247,57 @@ (dicts-md5 (pyim-dcache-create-files-md5 dict-files))) (pyim-dhashcache-update-code2word dict-files dicts-md5 force)))) -(defun pyim-dhashcache-sort-words (words-list) - "对 WORDS-LIST 排序" - (let ((iword2count pyim-dhashcache-iword2count) - (iword2priority pyim-dhashcache-iword2priority)) - (sort words-list - (lambda (a b) - (let ((p1 (gethash a iword2priority)) - (p2 (gethash b iword2priority))) - (cond - ((and (listp p1) - (listp p2) - (not (equal p1 p2))) - (pyim-numbers> p1 p2)) - (t (let ((n1 (or (gethash a iword2count) 0)) - (n2 (or (gethash b iword2count) 0))) - (> n1 n2))))))))) +(defun pyim-dhashcache-update-iword2priority (&optional force) + "更新词条优先级表,如果 FORCE 为真,强制更新。" + (interactive) + (when (or force (not pyim-dhashcache-update-iword2priority-p)) + ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死, + ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。 + (setq pyim-dhashcache-update-iword2priority-p t) + (async-start + `(lambda () + ,@(pyim-dhashcache-async-inject-variables) + (require 'pyim-dhashcache) + (pyim-dhashcache-init-count-and-priority-variables) + (maphash + (lambda (key value) + (puthash key + (pyim-dhashcache-calculate-priority + (pyim-dhashcache-get-counts-from-log + value)) + pyim-dhashcache-iword2priority)) + pyim-dhashcache-iword2count-log) + (pyim-dcache-save-variable + 'pyim-dhashcache-iword2priority + pyim-dhashcache-iword2priority) + nil) + (lambda (_) + (pyim-dcache-reload-variable pyim-dhashcache-iword2priority))))) + +(defun pyim-dhashcache-async-inject-variables () + "pyim's async-inject-variables." + (list (async-inject-variables "^load-path$") + (async-inject-variables "^exec-path$") + (async-inject-variables "^pyim-.+?directory$"))) + +(defun pyim-dhashcache-calculate-priority (counts-info) + "根据 COUNTS-INFO 计算优先级(优先级是多个数字组成的一个列表), +用于对词条进行排序。COUNTS-INFO 是一个 alist, 其结构类似: + + ((day n1 n2 n3 ...)) + +其中 (n1 n2 n3 ...) 代表从当前日期逐日倒推,每日 count 所组成的列表。" + (mapcar (lambda (x) + (let* ((label (car x)) + (plist (cdr x)) + (weights (plist-get plist :weights)) + (factor (plist-get plist :factor))) + (round (* (apply #'+ (cl-mapcar (lambda (a b) + (* (or a 0) b)) + (cdr (assoc label counts-info)) + weights)) + factor)))) + pyim-dhashcache-count-types)) (defun pyim-dhashcache-get-counts-from-log (log-info &optional time) "从 LOG-INFO 中获取所有的 count 值。 @@ -131,190 +322,105 @@ `(,label ,@(reverse output)))) pyim-dhashcache-count-types)) -(defun pyim-dhashcache-calculate-priority (counts-info) - "根据 COUNTS-INFO 计算优先级(优先级是多个数字组成的一个列表), -用于对词条进行排序。COUNTS-INFO 是一个 alist, 其结构类似: - - ((day n1 n2 n3 ...)) - -其中 (n1 n2 n3 ...) 代表从当前日期逐日倒推,每日 count 所组成的列表。" - (mapcar (lambda (x) - (let* ((label (car x)) - (plist (cdr x)) - (weights (plist-get plist :weights)) - (factor (plist-get plist :factor))) - (round (* (apply #'+ (cl-mapcar (lambda (a b) - (* (or a 0) b)) - (cdr (assoc label counts-info)) - weights)) - factor)))) - pyim-dhashcache-count-types)) - -(defun pyim-dhashcache-get-shortcodes (code) - "获取 CODE 所有的 shortcodes. - -比如:wubi/aaaa -> (wubi/aaa wubi/aa) - -注意事项:这个函数目前只用于五笔等型码输入法,不用于拼音输入法, -因为拼音输入法词库太大,这样处理之后,会生成一个特别大的哈希表, -占用太多内存资源,拼音输入法使用 ishortcode 机制。" - (when (and (pyim-string-match-p "/" code) - (not (pyim-string-match-p "-" code))) - (let* ((x (split-string code "/")) - (prefix (concat (nth 0 x) "/")) - (code1 (nth 1 x)) - (n (length code1)) - results) - (dotimes (i n) - (when (> i 1) - (push (concat prefix (substring code1 0 i)) results))) - results))) - -(defun pyim-dhashcache-get-ishortcodes (code) - "获取CODE 所有的简写 ishortcodes. - -比如: ni-hao -> (n-h) - -注意事项:这个函数用于全拼输入法。" - (when (and (> (length code) 0) - (not (pyim-string-match-p "/" code)) - (not (pyim-string-match-p "[^a-z-]" code))) - (list (mapconcat - (lambda (x) - (substring x 0 1)) - (split-string code "-") "-")))) - -(defun pyim-dhashcache-async-inject-variables () - "pyim's async-inject-variables." - (list (async-inject-variables "^load-path$") - (async-inject-variables "^exec-path$") - (async-inject-variables "^pyim-.+?directory$"))) +(defun pyim-dhashcache-update-personal-words (&optional force) + (pyim-dhashcache-update-icode2word force)) -(defun pyim-dhashcache-update-ishortcode2word (&optional force) - "读取 `pyim-dhashcache-icode2word' 中的词库,创建 *简拼* 缓存,然后加载这个缓存. +(defun pyim-dhashcache-update-icode2word (&optional force) + "对 personal 缓存中的词条进行排序,加载排序后的结果. -如果 FORCE 为真,强制加载缓存。" +在这个过程中使用了 `pyim-dhashcache-iword2count' 中记录的词频信息。 +如果 FORCE 为真,强制排序。" (interactive) - (when (or force (not pyim-dhashcache-update-ishortcode2word-p)) + (when (or force (not pyim-dhashcache-update-icode2word-p)) ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死, ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。 - (setq pyim-dhashcache-update-ishortcode2word-p t) + (setq pyim-dhashcache-update-icode2word-p t) (async-start `(lambda () ,@(pyim-dhashcache-async-inject-variables) (require 'pyim-dhashcache) (pyim-dcache-init-variable pyim-dhashcache-icode2word) (pyim-dhashcache-init-count-and-priority-variables) + (maphash + (lambda (key value) + (puthash key (pyim-dcache-sort-words value) + pyim-dhashcache-icode2word)) + pyim-dhashcache-icode2word) (pyim-dcache-save-variable - 'pyim-dhashcache-ishortcode2word - (pyim-dhashcache-update-ishortcode2word-1 - pyim-dhashcache-icode2word))) + 'pyim-dhashcache-icode2word + pyim-dhashcache-icode2word) + nil) (lambda (_) - (pyim-dcache-reload-variable pyim-dhashcache-ishortcode2word))))) - -(defun pyim-dhashcache-update-ishortcode2word-1 (icode2word) - "`pyim-dhashcache-update-ishortcode2word' 内部函数." - (let ((ishortcode2word (make-hash-table :test #'equal))) - (maphash - (lambda (key value) - (dolist (newkey (pyim-dhashcache-get-ishortcodes key)) - (puthash newkey - (delete-dups - `(,@(gethash newkey ishortcode2word) - ,@value)) - ishortcode2word))) - icode2word) - (maphash - (lambda (key value) - (puthash key (pyim-dhashcache-sort-words value) - ishortcode2word)) - ishortcode2word) - ishortcode2word)) + (pyim-dcache-reload-variable pyim-dhashcache-icode2word) + (pyim-dhashcache-update-ishortcode2word force))))) -(defun pyim-dhashcache-update-shortcode2word (&optional force) - "使用 `pyim-dhashcache-code2word' 中的词条,创建简写 code 词库缓存并加载. +(defun pyim-dhashcache-update-ishortcode2word (&optional force) + "读取 `pyim-dhashcache-icode2word' 中的词库,创建 *简拼* 缓存,然后加载这个缓存. -如果 FORCE 为真,强制运行。" +如果 FORCE 为真,强制加载缓存。" (interactive) - (when (or force (not pyim-dhashcache-update-shortcode2word-p)) + (when (or force (not pyim-dhashcache-update-ishortcode2word-p)) ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死, ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。 - (setq pyim-dhashcache-update-shortcode2word-p t) + (setq pyim-dhashcache-update-ishortcode2word-p t) (async-start `(lambda () ,@(pyim-dhashcache-async-inject-variables) (require 'pyim-dhashcache) - (pyim-dcache-init-variable pyim-dhashcache-code2word) + (pyim-dcache-init-variable pyim-dhashcache-icode2word) (pyim-dhashcache-init-count-and-priority-variables) (pyim-dcache-save-variable - 'pyim-dhashcache-shortcode2word - (pyim-dhashcache-update-shortcode2word-1 - pyim-dhashcache-code2word))) + 'pyim-dhashcache-ishortcode2word + (pyim-dhashcache-update-ishortcode2word-1 + pyim-dhashcache-icode2word))) (lambda (_) - (pyim-dcache-reload-variable pyim-dhashcache-shortcode2word))))) - -(defun pyim-dhashcache-update-shortcode2word-1 (code2word) - "`pyim-dhashcache-update-shortcode2word' 的内部函数" - (let ((shortcode2word (make-hash-table :test #'equal))) + (pyim-dcache-reload-variable pyim-dhashcache-ishortcode2word))))) + +(defun pyim-dhashcache-update-ishortcode2word-1 (icode2word) + "`pyim-dhashcache-update-ishortcode2word' 内部函数." + (let ((ishortcode2word (make-hash-table :test #'equal))) (maphash (lambda (key value) - (dolist (x (pyim-dhashcache-get-shortcodes key)) - (puthash x - (mapcar - (lambda (word) - ;; 这个地方的代码用于实现五笔 code 自动提示功能, - ;; 比如输入 'aa' 后得到选词框: - ;; ---------------------- - ;; | 1. 莁aa 2.匶wv ... | - ;; ---------------------- - (if (get-text-property 0 :comment word) - word - (propertize word :comment (substring key (length x))))) - (delete-dups `(,@(gethash x shortcode2word) ,@value))) - shortcode2word))) - code2word) + (dolist (newkey (pyim-dhashcache-get-ishortcodes key)) + (puthash newkey + (delete-dups + `(,@(gethash newkey ishortcode2word) + ,@value)) + ishortcode2word))) + icode2word) (maphash (lambda (key value) - (puthash key (pyim-dhashcache-sort-words value) - shortcode2word)) - shortcode2word) - shortcode2word)) - -(defun pyim-dhashcache-get-path (variable) - "获取保存 VARIABLE 取值的文件的路径." - (when (symbolp variable) - (concat (file-name-as-directory pyim-dcache-directory) - (symbol-name variable)))) - -(defun pyim-dhashcache-generate-dcache-file (dict-files dcache-file) - "读取词库文件列表:DICT-FILES, 生成一个词库缓冲文件 DCACHE-FILE. + (puthash key (pyim-dcache-sort-words value) + ishortcode2word)) + ishortcode2word) + ishortcode2word)) -pyim 使用的词库文件是简单的文本文件,编码 *强制* 为 \\='utf-8-unix, -其结构类似: +(defun pyim-dhashcache-update-code2word (dict-files dicts-md5 &optional force) + "读取并加载词库. - ni-bu-hao 你不好 - ni-hao 你好 妮好 你豪 +读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。 -第一个空白字符之前的内容为 code,空白字符之后为中文词条列表。词库 -*不处理* 中文标点符号。" - (let ((hashtable (make-hash-table :size 1000000 :test #'equal))) - (dolist (file dict-files) - (with-temp-buffer - (let ((coding-system-for-read 'utf-8-unix)) - (insert-file-contents file)) - (goto-char (point-min)) - (forward-line 1) - (while (not (eobp)) - (let* ((content (pyim-dline-parse)) - (code (car content)) - (words (cdr content))) - (when (and code words) - (puthash code - (delete-dups `(,@(gethash code hashtable) ,@words)) - hashtable))) - (forward-line 1)))) - (pyim-dcache-save-value-to-file hashtable dcache-file) - hashtable)) +如果 FORCE 为真,强制加载。" + (interactive) + (let* ((code2word-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word)) + (word2code-file (pyim-dhashcache-get-path 'pyim-dhashcache-word2code)) + (code2word-md5-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word-md5))) + (when (or force (and (not (equal dicts-md5 (pyim-dcache-get-value-from-file code2word-md5-file))) + (not pyim-dhashcache-update-code2word-running-p))) + (setq pyim-dhashcache-update-code2word-running-p t) + ;; use hashtable + (async-start + `(lambda () + ,@(pyim-dhashcache-async-inject-variables) + (require 'pyim-dhashcache) + (let ((dcache (pyim-dhashcache-generate-dcache-file ',dict-files ,code2word-file))) + (pyim-dhashcache-generate-word2code-dcache-file dcache ,word2code-file)) + (pyim-dcache-save-value-to-file ',dicts-md5 ,code2word-md5-file)) + (lambda (_) + (pyim-dcache-reload-variable pyim-dhashcache-code2word) + (pyim-dcache-reload-variable pyim-dhashcache-word2code) + (pyim-dhashcache-update-shortcode2word force) + (setq pyim-dhashcache-update-code2word-running-p nil)))))) (defun pyim-dhashcache-generate-word2code-dcache-file (dcache file) "从 DCACHE 生成一个 word -> code 的反向查询表. @@ -346,217 +452,116 @@ DCACHE 是一个 code -> words 的 hashtable. dcache) (pyim-dcache-save-value-to-file hashtable file)))) -(defun pyim-dhashcache-update-code2word (dict-files dicts-md5 &optional force) - "读取并加载词库. - -读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。 - -如果 FORCE 为真,强制加载。" - (interactive) - (let* ((code2word-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word)) - (word2code-file (pyim-dhashcache-get-path 'pyim-dhashcache-word2code)) - (code2word-md5-file (pyim-dhashcache-get-path 'pyim-dhashcache-code2word-md5))) - (when (or force (and (not (equal dicts-md5 (pyim-dcache-get-value-from-file code2word-md5-file))) - (not pyim-dhashcache-update-code2word-running-p))) - (setq pyim-dhashcache-update-code2word-running-p t) - ;; use hashtable - (async-start - `(lambda () - ,@(pyim-dhashcache-async-inject-variables) - (require 'pyim-dhashcache) - (let ((dcache (pyim-dhashcache-generate-dcache-file ',dict-files ,code2word-file))) - (pyim-dhashcache-generate-word2code-dcache-file dcache ,word2code-file)) - (pyim-dcache-save-value-to-file ',dicts-md5 ,code2word-md5-file)) - (lambda (_) - (pyim-dcache-reload-variable pyim-dhashcache-code2word) - (pyim-dcache-reload-variable pyim-dhashcache-word2code) - (pyim-dhashcache-update-shortcode2word force) - (setq pyim-dhashcache-update-code2word-running-p nil)))))) - -(defun pyim-dhashcache-export (dcache file &optional confirm) - "将一个 pyim DCACHE 导出为文件 FILE. +(defun pyim-dhashcache-get-path (variable) + "获取保存 VARIABLE 取值的文件的路径." + (when (symbolp variable) + (concat (file-name-as-directory pyim-dcache-directory) + (symbol-name variable)))) -如果 CONFIRM 为 non-nil,文件存在时将会提示用户是否覆盖, -默认为覆盖模式" - (with-temp-buffer - (insert ";;; -*- coding: utf-8-unix -*-\n") - (maphash - (lambda (key value) - (let ((value (cl-remove-if - (lambda (x) - ;; 如果某个词条的 text 属性 :noexport 设置为 t, 在导出的 - ;; 时候自动忽略这个词条。 - (and (stringp x) - (get-text-property 0 :noexport x))) - (if (listp value) - value - (list value))))) - (when value - (insert (format "%s %s\n" key (mapconcat #'identity value " ")))))) - dcache) - (pyim-dcache-write-file file confirm))) +(defun pyim-dhashcache-generate-dcache-file (dict-files dcache-file) + "读取词库文件列表:DICT-FILES, 生成一个词库缓冲文件 DCACHE-FILE. -(defun pyim-dhashcache-get (code &optional from) - "从 FROM 对应的 dcaches 中搜索 CODE, 得到对应的词条. +pyim 使用的词库文件是简单的文本文件,编码 *强制* 为 \\='utf-8-unix, +其结构类似: -当词库文件加载完成后,pyim 就可以用这个函数从词库缓存中搜索某个 -code 对应的中文词条了。 + ni-bu-hao 你不好 + ni-hao 你好 妮好 你豪 -如果 FROM 为 nil, 则默认搜索 `pyim-dhashcache-icode2word' 和 -`pyim-dhashcache-code2word' 两个 dcache." - (let* ((caches (mapcar (lambda (x) - (intern (concat "pyim-dhashcache-" (symbol-name x)))) - (or (and from - (if (listp from) - from - (list from))) - '(icode2word code2word)))) - result) - (dolist (cache caches) - (let* ((cache (ignore-errors (symbol-value cache))) - (value (and cache (gethash code cache)))) - ;; 处理 iword2count. - (unless (listp value) - (setq value (list value))) - (when value - (setq result (append result value))))) - result)) +第一个空白字符之前的内容为 code,空白字符之后为中文词条列表。词库 +*不处理* 中文标点符号。" + (let ((hashtable (make-hash-table :size 1000000 :test #'equal))) + (dolist (file dict-files) + (with-temp-buffer + (let ((coding-system-for-read 'utf-8-unix)) + (insert-file-contents file)) + (goto-char (point-min)) + (forward-line 1) + (while (not (eobp)) + (let* ((content (pyim-dline-parse)) + (code (car content)) + (words (cdr content))) + (when (and code words) + (puthash code + (delete-dups `(,@(gethash code hashtable) ,@words)) + hashtable))) + (forward-line 1)))) + (pyim-dcache-save-value-to-file hashtable dcache-file) + hashtable)) -(defun pyim-dhashcache-update-icode2word (&optional force) - "对 personal 缓存中的词条进行排序,加载排序后的结果. +(defun pyim-dhashcache-update-shortcode2word (&optional force) + "使用 `pyim-dhashcache-code2word' 中的词条,创建简写 code 词库缓存并加载. -在这个过程中使用了 `pyim-dhashcache-iword2count' 中记录的词频信息。 -如果 FORCE 为真,强制排序。" +如果 FORCE 为真,强制运行。" (interactive) - (when (or force (not pyim-dhashcache-update-icode2word-p)) + (when (or force (not pyim-dhashcache-update-shortcode2word-p)) ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死, ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。 - (setq pyim-dhashcache-update-icode2word-p t) + (setq pyim-dhashcache-update-shortcode2word-p t) (async-start `(lambda () ,@(pyim-dhashcache-async-inject-variables) (require 'pyim-dhashcache) - (pyim-dcache-init-variable pyim-dhashcache-icode2word) + (pyim-dcache-init-variable pyim-dhashcache-code2word) (pyim-dhashcache-init-count-and-priority-variables) - (maphash - (lambda (key value) - (puthash key (pyim-dhashcache-sort-words value) - pyim-dhashcache-icode2word)) - pyim-dhashcache-icode2word) (pyim-dcache-save-variable - 'pyim-dhashcache-icode2word - pyim-dhashcache-icode2word) - nil) + 'pyim-dhashcache-shortcode2word + (pyim-dhashcache-update-shortcode2word-1 + pyim-dhashcache-code2word))) (lambda (_) - (pyim-dcache-reload-variable pyim-dhashcache-icode2word) - (pyim-dhashcache-update-ishortcode2word force))))) - -(defun pyim-dhashcache-upgrade-icode2word () - "升级 icode2word 缓存。" - (let ((delete-old-key-p (yes-or-no-p "Delete old key after upgrade? ")) - (ruler-list (delete-dups - (remove nil - (mapcar - (lambda (scheme) - (let ((code-prefix (plist-get (cdr scheme) :code-prefix)) - (code-prefix-history (plist-get (cdr scheme) :code-prefix-history))) - (when code-prefix-history - (cons code-prefix-history code-prefix)))) - pyim-schemes))))) - (dolist (ruler ruler-list) - (let ((old-prefix-list (car ruler)) - (new-prefix (cdr ruler))) - (dolist (old-prefix old-prefix-list) - (maphash - (lambda (key _value) - (when (string-prefix-p old-prefix key) - (let* ((key-words (gethash key pyim-dhashcache-icode2word)) - (new-key (concat new-prefix (string-remove-prefix old-prefix key))) - (new-key-words (gethash new-key pyim-dhashcache-icode2word)) - (merged-value (delete-dups `(,@new-key-words ,@key-words)))) - (puthash new-key merged-value pyim-dhashcache-icode2word) - (message "PYIM icode2word upgrade: %S %S -> %S %S" key key-words new-key merged-value) - (when delete-old-key-p - (remhash key pyim-dhashcache-icode2word) - (message "PYIM icode2word upgrade: %S has been deleted." key))))) - pyim-dhashcache-icode2word)))))) - -(defun pyim-dhashcache-update-personal-words (&optional force) - (pyim-dhashcache-update-icode2word force)) - -(defun pyim-dhashcache-init-variables () - "初始化 dcache 缓存相关变量." - (when (and (not pyim-dhashcache-icode2word) - pyim-dcache-directory - (file-directory-p pyim-dcache-directory) - (directory-files pyim-dcache-directory nil "-backup-")) - (message "PYIM: 在 %S 目录中发现备份文件的存在,可能是词库缓存文件损坏导致,请抓紧检查处理!!!" - pyim-dcache-directory)) - (pyim-dhashcache-init-count-and-priority-variables) - (pyim-dcache-init-variable pyim-dhashcache-code2word) - (pyim-dcache-init-variable pyim-dhashcache-word2code) - (pyim-dcache-init-variable pyim-dhashcache-shortcode2word) - (pyim-dcache-init-variable pyim-dhashcache-icode2word) - (pyim-dcache-init-variable pyim-dhashcache-ishortcode2word)) + (pyim-dcache-reload-variable pyim-dhashcache-shortcode2word))))) -(defun pyim-dhashcache-init-count-and-priority-variables () - "初始化 count 相关的变量。" - (pyim-dcache-init-variable pyim-dhashcache-iword2count) - (pyim-dcache-init-variable pyim-dhashcache-iword2count-log) - (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-10-words) - (pyim-dcache-init-variable pyim-dhashcache-iword2count-recent-50-words) - (pyim-dcache-init-variable pyim-dhashcache-iword2priority)) +(defun pyim-dhashcache-update-shortcode2word-1 (code2word) + "`pyim-dhashcache-update-shortcode2word' 的内部函数" + (let ((shortcode2word (make-hash-table :test #'equal))) + (maphash + (lambda (key value) + (dolist (x (pyim-dhashcache-get-shortcodes key)) + (puthash x + (mapcar + (lambda (word) + ;; 这个地方的代码用于实现五笔 code 自动提示功能, + ;; 比如输入 'aa' 后得到选词框: + ;; ---------------------- + ;; | 1. 莁aa 2.匶wv ... | + ;; ---------------------- + (if (get-text-property 0 :comment word) + word + (propertize word :comment (substring key (length x))))) + (delete-dups `(,@(gethash x shortcode2word) ,@value))) + shortcode2word))) + code2word) + (maphash + (lambda (key value) + (puthash key (pyim-dcache-sort-words value) + shortcode2word)) + shortcode2word) + shortcode2word)) -(defun pyim-dhashcache-save-personal-dcache-to-file () - ;; 用户选择过的词 - (pyim-dcache-save-variable - 'pyim-dhashcache-icode2word - pyim-dhashcache-icode2word 0.8) - ;; 词条总 count - (pyim-dcache-save-variable - 'pyim-dhashcache-iword2count - pyim-dhashcache-iword2count 0.8) - ;; 词条 count 日志 - (pyim-dcache-save-variable - 'pyim-dhashcache-iword2count-log - pyim-dhashcache-iword2count-log 0.8) - ;; 词条优先级 - (pyim-dcache-save-variable - 'pyim-dhashcache-iword2priority - pyim-dhashcache-iword2priority 0.8)) +(defun pyim-dhashcache-get-shortcodes (code) + "获取 CODE 所有的 shortcodes. -(defmacro pyim-dhashcache-put (cache code &rest body) - "将 BODY 的返回值保存到 CACHE 对应的 CODE 中。 +比如:wubi/aaaa -> (wubi/aaa wubi/aa) -注意事项:这个宏是一个指代宏,其中 orig-value 在这个宏中有特殊含 -义,代表原来 code 对应的取值。" - (declare (indent 0)) - (let ((key (make-symbol "key")) - (table (make-symbol "table")) - (new-value (make-symbol "new-value"))) - `(let* ((,key ,code) - (,table ,cache) - (orig-value (gethash ,key ,table)) - ,new-value) - (setq ,new-value (progn ,@body)) - (puthash ,key ,new-value ,table)))) +注意事项:这个函数目前只用于五笔等型码输入法,不用于拼音输入法, +因为拼音输入法词库太大,这样处理之后,会生成一个特别大的哈希表, +占用太多内存资源,拼音输入法使用 ishortcode 机制。" + (when (and (pyim-string-match-p "/" code) + (not (pyim-string-match-p "-" code))) + (let* ((x (split-string code "/")) + (prefix (concat (nth 0 x) "/")) + (code1 (nth 1 x)) + (n (length code1)) + results) + (dotimes (i n) + (when (> i 1) + (push (concat prefix (substring code1 0 i)) results))) + results))) -(defun pyim-dhashcache-update-iword2count-recent (word n hash-table) - (let (words-need-remove) - (pyim-dhashcache-put - hash-table :all-words - (setq orig-value (remove word orig-value)) - (push word orig-value) - (if (<= (length orig-value) n) - orig-value - (setq words-need-remove (nthcdr n orig-value)) - (cl-subseq orig-value 0 n))) - (dolist (w words-need-remove) - (remhash w hash-table)) - (pyim-dhashcache-put - hash-table word - (+ (or orig-value 0) 1)) - hash-table)) +;; ** 更新 dhashcache 词条计数 +(cl-defmethod pyim-dcache-update-wordcount + (word &context (pyim-dcache-backend (eql pyim-dhashcache)) + &optional wordcount-handler) + (pyim-dhashcache-update-iword2count word wordcount-handler)) (defun pyim-dhashcache-update-iword2count (word &optional wordcount-handler) "保存词频到缓存." @@ -603,88 +608,139 @@ code 对应的中文词条了。 (pyim-dhashcache-get-counts-from-log (gethash word pyim-dhashcache-iword2count-log))))) -(defun pyim-dhashcache-update-iword2priority (&optional force) - "更新词条优先级表,如果 FORCE 为真,强制更新。" - (interactive) - (when (or force (not pyim-dhashcache-update-iword2priority-p)) - ;; NOTE: 这个变量按理说应该在回调函数里面设置,但 async 在某些情况下会卡死, - ;; 这个变量无法设置为 t, 导致后续产生大量的 emacs 进程,极其影响性能。 - (setq pyim-dhashcache-update-iword2priority-p t) - (async-start - `(lambda () - ,@(pyim-dhashcache-async-inject-variables) - (require 'pyim-dhashcache) - (pyim-dhashcache-init-count-and-priority-variables) - (maphash - (lambda (key value) - (puthash key - (pyim-dhashcache-calculate-priority - (pyim-dhashcache-get-counts-from-log - value)) - pyim-dhashcache-iword2priority)) - pyim-dhashcache-iword2count-log) - (pyim-dcache-save-variable - 'pyim-dhashcache-iword2priority - pyim-dhashcache-iword2priority) - nil) - (lambda (_) - (pyim-dcache-reload-variable pyim-dhashcache-iword2priority))))) +(defun pyim-dhashcache-update-iword2count-recent (word n hash-table) + (let (words-need-remove) + (pyim-dhashcache-put + hash-table :all-words + (setq orig-value (remove word orig-value)) + (push word orig-value) + (if (<= (length orig-value) n) + orig-value + (setq words-need-remove (nthcdr n orig-value)) + (cl-subseq orig-value 0 n))) + (dolist (w words-need-remove) + (remhash w hash-table)) + (pyim-dhashcache-put + hash-table word + (+ (or orig-value 0) 1)) + hash-table)) -(defun pyim-dhashcache-delete-word (word) - "将中文词条 WORD 从个人词库中删除" - (maphash - (lambda (key value) - (when (member word value) - (let ((new-value (remove word value))) - (if new-value - (puthash key new-value pyim-dhashcache-icode2word) - (remhash key pyim-dhashcache-icode2word))))) - pyim-dhashcache-icode2word) - (maphash - (lambda (key value) - (when (member word value) - (print value) - (let ((new-value (remove word value))) - (if new-value - (puthash key new-value pyim-dhashcache-ishortcode2word) - (remhash key pyim-dhashcache-ishortcode2word))))) - pyim-dhashcache-ishortcode2word) - (remhash word pyim-dhashcache-iword2count) - (remhash word pyim-dhashcache-iword2count-log) - (remhash word pyim-dhashcache-iword2priority)) +;; ** 根据 dhashcache 信息对词条进行排序 +(cl-defmethod pyim-dcache-sort-words + (words-list &context (pyim-dcache-backend (eql pyim-dhashcache))) + "对 WORDS-LIST 排序" + (let ((iword2count pyim-dhashcache-iword2count) + (iword2priority pyim-dhashcache-iword2priority)) + (sort words-list + (lambda (a b) + (let ((p1 (gethash a iword2priority)) + (p2 (gethash b iword2priority))) + (cond + ((and (listp p1) + (listp p2) + (not (equal p1 p2))) + (pyim-numbers> p1 p2)) + (t (let ((n1 (or (gethash a iword2count) 0)) + (n2 (or (gethash b iword2count) 0))) + (> n1 n2))))))))) -(defun pyim-dhashcache-insert-word-into-icode2word (word code prepend) - "将词条 WORD 插入到 icode2word 词库缓存 CODE 键对应的位置. +;; ** 升级 dhashcache 相关函数 +(cl-defmethod pyim-dcache-upgrade + (&context (pyim-dcache-backend (eql pyim-dhashcache))) + "升级词库缓存. -默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放 -到已有词条的最前面。" - (pyim-dhashcache-put - pyim-dhashcache-icode2word code - (if prepend - `(,word ,@(remove word orig-value)) - `(,@(remove word orig-value) ,word)))) +当前已有的功能: +1. 基于 :code-prefix-history 信息,升级为新的 code-prefix。" + (pyim-dhashcache-upgrade-icode2word)) -(defun pyim-dhashcache-insert-word-into-ishortcode2word (word code prepend) - "将词条 WORD 插入到 ishortcode2word 词库缓存 CODE 首字母字符串对应的位置. +(defun pyim-dhashcache-upgrade-icode2word () + "升级 icode2word 缓存。" + (let ((delete-old-key-p (yes-or-no-p "Delete old key after upgrade? ")) + (ruler-list (delete-dups + (remove nil + (mapcar + (lambda (scheme) + (let ((code-prefix (plist-get (cdr scheme) :code-prefix)) + (code-prefix-history (plist-get (cdr scheme) :code-prefix-history))) + (when code-prefix-history + (cons code-prefix-history code-prefix)))) + pyim-schemes))))) + (dolist (ruler ruler-list) + (let ((old-prefix-list (car ruler)) + (new-prefix (cdr ruler))) + (dolist (old-prefix old-prefix-list) + (maphash + (lambda (key _value) + (when (string-prefix-p old-prefix key) + (let* ((key-words (gethash key pyim-dhashcache-icode2word)) + (new-key (concat new-prefix (string-remove-prefix old-prefix key))) + (new-key-words (gethash new-key pyim-dhashcache-icode2word)) + (merged-value (delete-dups `(,@new-key-words ,@key-words)))) + (puthash new-key merged-value pyim-dhashcache-icode2word) + (message "PYIM icode2word upgrade: %S %S -> %S %S" key key-words new-key merged-value) + (when delete-old-key-p + (remhash key pyim-dhashcache-icode2word) + (message "PYIM icode2word upgrade: %S has been deleted." key))))) + pyim-dhashcache-icode2word)))))) -默认 WORD 放到已有词条的最后,如果 PREPEND 为 non-nil, WORD 将放 -到已有词条的最前面。" - (dolist (newcode (pyim-dhashcache-get-ishortcodes code)) - (pyim-dhashcache-put - pyim-dhashcache-ishortcode2word - newcode - (if prepend - `(,word ,@(remove word orig-value)) - `(,@(remove word orig-value) ,word))))) +;; ** 保存 dhashcache 相关函数 +(cl-defmethod pyim-dcache-save-caches + (&context (pyim-dcache-backend (eql pyim-dhashcache))) + (pyim-dhashcache-save-personal-dcache-to-file)) -(defun pyim-dhashcache-search-word-code (string) - (gethash string pyim-dhashcache-word2code)) +(defun pyim-dhashcache-save-personal-dcache-to-file () + ;; 用户选择过的词 + (pyim-dcache-save-variable + 'pyim-dhashcache-icode2word + pyim-dhashcache-icode2word 0.8) + ;; 词条总 count + (pyim-dcache-save-variable + 'pyim-dhashcache-iword2count + pyim-dhashcache-iword2count 0.8) + ;; 词条 count 日志 + (pyim-dcache-save-variable + 'pyim-dhashcache-iword2count-log + pyim-dhashcache-iword2count-log 0.8) + ;; 词条优先级 + (pyim-dcache-save-variable + 'pyim-dhashcache-iword2priority + pyim-dhashcache-iword2priority 0.8)) -(defun pyim-dhashcache-export-personal-words (file &optional confirm) +;; ** 导出相关函数 +(cl-defmethod pyim-dcache-export-personal-words + (file &context (pyim-dcache-backend (eql pyim-dhashcache)) + &optional confirm) "导出个人词库到 FILE." + (pyim-dcache-init-variables) (pyim-dhashcache-export pyim-dhashcache-icode2word file confirm)) -(defun pyim-dhashcache-export-words-and-counts (file &optional confirm ignore-counts) +(defun pyim-dhashcache-export (dcache file &optional confirm) + "将一个 pyim DCACHE 导出为文件 FILE. + +如果 CONFIRM 为 non-nil,文件存在时将会提示用户是否覆盖, +默认为覆盖模式" + (with-temp-buffer + (insert ";;; -*- coding: utf-8-unix -*-\n") + (maphash + (lambda (key value) + (let ((value (cl-remove-if + (lambda (x) + ;; 如果某个词条的 text 属性 :noexport 设置为 t, 在导出的 + ;; 时候自动忽略这个词条。 + (and (stringp x) + (get-text-property 0 :noexport x))) + (if (listp value) + value + (list value))))) + (when value + (insert (format "%s %s\n" key (mapconcat #'identity value " ")))))) + dcache) + (pyim-dcache-write-file file confirm))) + +(cl-defmethod pyim-dcache-export-words-and-counts + (file &context (pyim-dcache-backend (eql pyim-dhashcache)) + &optional confirm ignore-counts) + (pyim-dcache-init-variables) (with-temp-buffer (insert ";;; -*- coding: utf-8-unix -*-\n") (maphash @@ -710,6 +766,6 @@ code 对应的中文词条了。 (pyim-dcache-write-file file confirm))) ;; * Footer - (provide 'pyim-dhashcache) + ;;; pyim-dhashcache.el ends here diff --git a/pyim-dregcache.el b/pyim-dregcache.el index 4686b074ed..ef4d5d6e8c 100644 --- a/pyim-dregcache.el +++ b/pyim-dregcache.el @@ -44,133 +44,87 @@ (defvar pyim-dregcache-iword2count nil) (defvar pyim-dregcache-dicts-md5 nil) -(defun pyim-dregcache-update (&optional force) - "读取并加载所有相关词库 dcache. - -如果 FORCE 为真,强制加载。" - (pyim-dregcache-init-variables) - (when pyim-dcache-auto-update - (pyim-dregcache-update-personal-words force) - (let* ((dict-files (pyim-dict-get-enabled-dict-files)) - (dicts-md5 (pyim-dcache-create-files-md5 dict-files))) - (when pyim-debug - (message "pyim-dregcache-update: pyim-dicts=%s pyim-extra-dicts=%s dict-files=%s" - pyim-dicts - pyim-extra-dicts - dict-files)) - (pyim-dregcache-update-code2word dict-files dicts-md5 force)))) - -(defun pyim-dregcache-variable-file (variable) - "Get VARIABLE dcache file path." - (concat (file-name-as-directory pyim-dcache-directory) - (symbol-name variable))) - -(defun pyim-dregcache-save-variable (variable value) - "Save VARIABLE with its VALUE." - (let* ((file (pyim-dregcache-variable-file variable)) - (save-silently t)) - (make-directory (file-name-directory file) t) - (with-temp-buffer - (insert value) - (pyim-dcache-write-file file)))) - -(defun pyim-dregcache-load-variable (variable) - "载入 VARIABLE 对应的文件内容." - (let* ((file (pyim-dregcache-variable-file variable))) - (when (and file (file-exists-p file)) - (with-temp-buffer - (insert-file-contents file) - (buffer-string))))) - -(defun pyim-dregcache-sort-words (words-list) - "对 WORDS-LIST 排序,词频大的排在前面." - (let ((iword2count pyim-dregcache-iword2count)) - (sort words-list - (lambda (a b) - (let ((a (car (split-string a ":"))) - (b (car (split-string b ":")))) - (> (or (gethash a iword2count) 0) - (or (gethash b iword2count) 0))))))) - -(defun pyim-dregcache-sort-icode2word () - "对个人词库排序." - ;; https://github.com/redguardtoo/zhfreq - (with-temp-buffer - (dolist (l (split-string pyim-dregcache-icode2word "\n")) - (cond - ((string-match "^\\([a-z-]+ \\)\\(.*\\)" l) - ;; 3字以上词很少,如果只处理单字,2字词,3字词 - ;; ((string-match "^\\([a-z]+ \\|[a-z]+-[a-z]+ \\|[a-z]+-[a-z]+-[a-z]+ \\)\\(.*\\)" l) - (let* ((pinyin (match-string 1 l)) - (words (pyim-dregcache-sort-words (split-string (match-string 2 l) " ")))) - (insert (format "%s\n" (concat pinyin (string-join words " ")))))) - ;; 其他词 - ((string= l "") - ;; skip empty line - ) - (t - (insert (format "%s\n" l))))) - (setq pyim-dregcache-icode2word (buffer-string)))) - -(defun pyim-dregcache-create-cache-content (raw-content) - "将 RAW-CONTENT 划分成可以更高效搜索的缓冲区." - (let ((chars "bcdefghjklmnopqrstwxyz") - (i 0) - content-segments - (start (string-match "^a" raw-content)) - chunk - end) - ;; 将字典缓存划分成多个"子搜索区域" - (while (< i (length chars)) - (when (setq end (string-match (string ?^ (elt chars i)) - raw-content - start)) - (setq chunk (substring-no-properties raw-content start end)) - (push chunk content-segments) - (setq start end)) - (setq i (1+ i))) - - ;; last chunk - (setq chunk (substring-no-properties raw-content end (length raw-content))) - (push chunk content-segments) - (list :content (nreverse content-segments)))) - -(defun pyim-dregcache-load-dictionary-file (dict-file) - "READ from DICT-FILE." - (let* ((raw-content (with-temp-buffer - (insert-file-contents dict-file) - (buffer-string)))) - (setq pyim-dregcache-cache - ;; use string type as key, so have to use `lax-plist-put' - ;; @see https://www.gnu.org/software/emacs/manual/html_node/elisp/Plist-Access.html#Plist-Access - (lax-plist-put pyim-dregcache-cache - (file-truename dict-file) - (pyim-dregcache-create-cache-content raw-content))))) - -(defun pyim-dregcache-update-code2word (dict-files dicts-md5 &optional force) - "读取并加载词库. +;; ** 初始化 dregcache 相关函数 +(cl-defmethod pyim-dcache-init-variables + (&context (pyim-dcache-backend (eql pyim-dregcache))) + "初始化 cache 缓存相关变量." + (pyim-dcache-init-variable + pyim-dregcache-iword2count + ;; dregcache 引擎也需要词频信息,第一次使用 dregcache 引擎的时候, + ;; 自动导入 dhashcache 引擎的词频信息,以后两个引擎的词频信息就 + ;; 完全分开了。 + (pyim-dcache-get-value 'pyim-dhashcache-iword2count)) + (unless pyim-dregcache-icode2word + (pyim-dregcache-update-personal-words t))) -读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。 +;; ** 从 dregcache 搜索词条相关函数 +(cl-defmethod pyim-dcache-get + (code &context (pyim-dcache-backend (eql pyim-dregcache)) + &optional from) + "从 `pyim-dregcache-cache' 搜索 CODE, 得到对应的词条." + (when code + (cond ((or (memq 'icode2word from) + (memq 'ishortcode2word from)) + (pyim-dregcache-get-icode2word-ishortcode2word code)) + ;; FIXME: pyim-dregcache 暂时不支持 iword2count-recent-10-words 和 + ;; iword2count-recent-50-words. + ((or (memq 'iword2count-recent-10-words from) + (memq 'iword2count-recent-50-words from)) + nil) + (t (let ((dict-files (pyim-dregcache-all-dict-files)) + result) + + (when pyim-debug (message "pyim-dregcache-get is called. code=%s" code)) + (when dict-files + (dolist (file dict-files) + (let* ((file-info (lax-plist-get pyim-dregcache-cache file)) + (content (pyim-dregcache-get-content code file-info))) + (setq result (append (pyim-dregcache-get-1 content code) result))))) + ;; `push' plus `nreverse' is more efficient than `add-to-list' + ;; Many examples exist in Emacs' own code + (nreverse result)))))) -DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码. +(defun pyim-dregcache-get-icode2word-ishortcode2word (code) + "以 CODE 搜索个人词和个人联想词. 正则表达式搜索词库,不需要为联想词开单独缓存." + (when pyim-debug (message "pyim-dregcache-get-icode2word-ishortcode2word called => %s" code)) + (when pyim-dregcache-icode2word + (nreverse (pyim-dregcache-get-1 pyim-dregcache-icode2word code)))) -如果 FORCE 为真,强制加载。" - (interactive) - (when (or force (not (equal dicts-md5 pyim-dregcache-dicts-md5))) - ;; no hashtable i file mapping algorithm - (dolist (file dict-files) - (pyim-dregcache-load-dictionary-file file)) - (setq pyim-dregcache-dicts-md5 dicts-md5))) +(defmacro pyim-dregcache-match-line (code) + `(concat "^" (pyim-dregcache-code2regexp ,code) " \\(.+\\)")) -(defmacro pyim-dregcache-shenmu2regexp (char) - "将声母 CHAR 转换为通用正则表达式匹配所有以该声母开头的汉字." - `(concat ,char "[a-z]*")) +(defun pyim-dregcache-get-1 (content code) + (let ((case-fold-search t) + (start 0) + (pattern (pyim-dregcache-match-line code)) + (content-length (length content)) + word + output) + (while (and (< start content-length) + (setq start (string-match pattern content start))) + ;; 提取词 + (setq word (match-string-no-properties 1 content)) + (when word + (cond + ((string-match "^[^ ]+$" word) + ;; 单个词 + (push word output)) + (t + ;; 多个字 + (setq output (append (nreverse (split-string word " +")) output))))) + ;; 继续搜索 + (setq start (+ start 2 (length code) (length word)))) + output)) (defmacro pyim-dregcache-is-shenmu (code) "判断CODE 是否是一个声母." `(and (eq (length ,code) 1) (not (string-match ,code "aeo")))) +(defmacro pyim-dregcache-shenmu2regexp (char) + "将声母 CHAR 转换为通用正则表达式匹配所有以该声母开头的汉字." + `(concat ,char "[a-z]*")) + (defun pyim-dregcache-code2regexp (code) "将 CODE 转换成正则表达式用来搜索辞典缓存中的匹配项目. 单个声母会匹配所有以此生母开头的单个汉字." @@ -209,9 +163,6 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码. ;; tian-an-men => tian-an-men[a-z-]* (concat s "[a-z-]*")))))))) -(defmacro pyim-dregcache-match-line (code) - `(concat "^" (pyim-dregcache-code2regexp ,code) " \\(.+\\)")) - (defun pyim-dregcache-all-dict-files () "所有词典文件." (let* (rlt) @@ -238,57 +189,109 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码. ;; fetch segment using the first character of pinyin code (nth idx rlt))) -(defun pyim-dregcache-get-1 (content code) - (let ((case-fold-search t) - (start 0) - (pattern (pyim-dregcache-match-line code)) - (content-length (length content)) - word - output) - (while (and (< start content-length) - (setq start (string-match pattern content start))) - ;; 提取词 - (setq word (match-string-no-properties 1 content)) - (when word - (cond - ((string-match "^[^ ]+$" word) - ;; 单个词 - (push word output)) - (t - ;; 多个字 - (setq output (append (nreverse (split-string word " +")) output))))) - ;; 继续搜索 - (setq start (+ start 2 (length code) (length word)))) - output)) +;; ** 从 dregcache 搜索代码相关函数 +(cl-defmethod pyim-dcache-search-word-code + (word &context (pyim-dcache-backend (eql pyim-dregcache))) + "从 `pyim-dregcache-cache' 和 `pyim-dregcache-icode2word' 搜索 word, 得到对应的code." + (when pyim-debug (message "pyim-dregcache-search-word-code word=%s" word)) + (when pyim-dregcache-cache + (catch 'result + (let ((dict-files (pyim-dregcache-all-dict-files)) + code) + (when pyim-dregcache-icode2word + (setq code (pyim-dregcache-search-word-code-1 word pyim-dregcache-icode2word)) + (when code (throw 'result (list code)))) + (dolist (file dict-files) + (let* ((file-info (lax-plist-get pyim-dregcache-cache file)) + (contents (lax-plist-get file-info :content))) + (dolist (content contents) + (setq code (pyim-dregcache-search-word-code-1 word content)) + (when code (throw 'result (list code)))))))))) -(defun pyim-dregcache-get (code &optional from) - "从 `pyim-dregcache-cache' 搜索 CODE, 得到对应的词条." - (cond ((or (memq 'icode2word from) - (memq 'ishortcode2word from)) - (pyim-dregcache-get-icode2word-ishortcode2word code)) - ;; FIXME: pyim-dregcache 暂时不支持 iword2count-recent-10-words 和 - ;; iword2count-recent-50-words. - ((or (memq 'iword2count-recent-10-words from) - (memq 'iword2count-recent-50-words from)) - nil) - (t (let ((dict-files (pyim-dregcache-all-dict-files)) - result) - - (when pyim-debug (message "pyim-dregcache-get is called. code=%s" code)) - (when dict-files - (dolist (file dict-files) - (let* ((file-info (lax-plist-get pyim-dregcache-cache file)) - (content (pyim-dregcache-get-content code file-info))) - (setq result (append (pyim-dregcache-get-1 content code) result))))) - ;; `push' plus `nreverse' is more efficient than `add-to-list' - ;; Many examples exist in Emacs' own code - (nreverse result))))) +(defun pyim-dregcache-search-word-code-1 (word content) + (let* ((case-fold-search t) + (regexp (concat "^\\([a-z-]+\\)\\(.*\\) " "\\(" word " \\|" word "$\\)"))) + (when (string-match regexp content) + (match-string-no-properties 1 content)))) -(defun pyim-dregcache-get-icode2word-ishortcode2word (code) - "以 CODE 搜索个人词和个人联想词. 正则表达式搜索词库,不需要为联想词开单独缓存." - (when pyim-debug (message "pyim-dregcache-get-icode2word-ishortcode2word called => %s" code)) - (when pyim-dregcache-icode2word - (nreverse (pyim-dregcache-get-1 pyim-dregcache-icode2word code)))) +;; ** 给 dregcache 添加词条相关函数 +(cl-defmethod pyim-dcache-insert-word + (word code prepend + &context (pyim-dcache-backend (eql pyim-dregcache))) + "将词条 WORD 插入到 `pyim-dregcache-icode2word'." + (pyim-dregcache-insert-word-into-icode2word word code prepend)) + +(defun pyim-dregcache-insert-word-into-icode2word (word code prepend) + "保存个人词到缓存,和其他词库格式一样以共享正则搜索算法." + (when pyim-debug + (message "pyim-dregcache-insert-word-into-icode2word called => %s %s %s" + word + code + prepend)) + (with-temp-buffer + (when pyim-dregcache-icode2word + (insert pyim-dregcache-icode2word)) + (goto-char (point-min)) + (let* ((case-fold-search t) + substring replace-string beg end old-word-list) + (if (re-search-forward (concat "^" code " \\(.*\\)") nil t) + (progn + (setq beg (match-beginning 0)) + (setq end (match-end 0)) + (setq substring (match-string-no-properties 1)) + (delete-region beg end) + ;; 这里不进行排序,在pyim-dregcache-update-personal-words排序 + (setq old-word-list (pyim-dregcache-sort-words (split-string substring " "))) + (setq replace-string (concat code " " (string-join (delete-dups `(,@old-word-list ,word)) " ")))) + (setq replace-string (concat code " " (or replace-string word) "\n"))) + (goto-char (or beg (point-max))) + (insert replace-string)) + (setq pyim-dregcache-icode2word + (buffer-string)))) + +;; ** 从 dregcache 删除词条相关函数 +(cl-defmethod pyim-dcache-delete-word + (word &context (pyim-dcache-backend (eql pyim-dregcache))) + "将中文词条 WORD 从个人词库中删除." + (with-temp-buffer + (insert pyim-dregcache-icode2word) + (goto-char (point-min)) + (let* ((case-fold-search t) + substring beg end) + (while (re-search-forward (concat "^\\([a-z-]+\\) \\(.*\\)" word "\\(.*\\)$") nil t) + (setq beg (match-beginning 0)) + (setq end (match-end 0)) + (setq substring (concat (match-string-no-properties 1) + (match-string-no-properties 2) + (match-string-no-properties 3))) + + ;; delete string and the newline char + (delete-region beg (+ 1 end)) + (when (> (length (split-string substring " ")) 1) + (goto-char beg) + (insert substring))) + (setq pyim-dregcache-icode2word + (buffer-string)))) + ;; 删除对应词条的词频 + (remhash word pyim-dregcache-iword2count)) + +;; ** 更新 dhashcache 相关函数 +(cl-defmethod pyim-dcache-update + (&context (pyim-dcache-backend (eql pyim-dregcache)) &optional force) + "读取并加载所有相关词库 dcache. + +如果 FORCE 为真,强制加载。" + (pyim-dcache-init-variables) + (when pyim-dcache-auto-update + (pyim-dregcache-update-personal-words force) + (let* ((dict-files (pyim-dict-get-enabled-dict-files)) + (dicts-md5 (pyim-dcache-create-files-md5 dict-files))) + (when pyim-debug + (message "pyim-dregcache-update: pyim-dicts=%s pyim-extra-dicts=%s dict-files=%s" + pyim-dicts + pyim-extra-dicts + dict-files)) + (pyim-dregcache-update-code2word dict-files dicts-md5 force)))) (defun pyim-dregcache-update-personal-words (&optional force) "合并 `pyim-dregcache-icode2word' 磁盘文件. 加载排序后的结果. @@ -327,33 +330,74 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码. (when (and force pyim-dregcache-icode2word) (pyim-dregcache-sort-icode2word))) -(defun pyim-dregcache-init-variables () - "初始化 cache 缓存相关变量." - (pyim-dcache-init-variable - pyim-dregcache-iword2count - ;; dregcache 引擎也需要词频信息,第一次使用 dregcache 引擎的时候, - ;; 自动导入 dhashcache 引擎的词频信息,以后两个引擎的词频信息就 - ;; 完全分开了。 - (pyim-dcache-get-value 'pyim-dhashcache-iword2count)) - (unless pyim-dregcache-icode2word - (pyim-dregcache-update-personal-words t))) +(defun pyim-dregcache-load-variable (variable) + "载入 VARIABLE 对应的文件内容." + (let* ((file (pyim-dregcache-variable-file variable))) + (when (and file (file-exists-p file)) + (with-temp-buffer + (insert-file-contents file) + (buffer-string))))) -(defun pyim-dregcache-save-personal-dcache-to-file () - "保存缓存内容到默认目录." - (when pyim-debug (message "pyim-dregcache-save-personal-dcache-to-file called")) - ;; 用户选择过的词存为标准辞典格式保存 - (when pyim-dregcache-icode2word - (pyim-dregcache-save-variable - 'pyim-dregcache-icode2word - pyim-dregcache-icode2word)) - ;; 词频 - (pyim-dcache-save-variable - 'pyim-dregcache-iword2count - pyim-dregcache-iword2count)) +(defun pyim-dregcache-variable-file (variable) + "Get VARIABLE dcache file path." + (concat (file-name-as-directory pyim-dcache-directory) + (symbol-name variable))) -(defun pyim-dregcache-export-words-and-counts () - "TODO" - ) +(defun pyim-dregcache-update-code2word (dict-files dicts-md5 &optional force) + "读取并加载词库. + +读取词库文件 DICT-FILES,生成对应的词库缓冲文件,然后加载词库缓存。 + +DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码. + +如果 FORCE 为真,强制加载。" + (interactive) + (when (or force (not (equal dicts-md5 pyim-dregcache-dicts-md5))) + ;; no hashtable i file mapping algorithm + (dolist (file dict-files) + (pyim-dregcache-load-dictionary-file file)) + (setq pyim-dregcache-dicts-md5 dicts-md5))) + +(defun pyim-dregcache-load-dictionary-file (dict-file) + "READ from DICT-FILE." + (let* ((raw-content (with-temp-buffer + (insert-file-contents dict-file) + (buffer-string)))) + (setq pyim-dregcache-cache + ;; use string type as key, so have to use `lax-plist-put' + ;; @see https://www.gnu.org/software/emacs/manual/html_node/elisp/Plist-Access.html#Plist-Access + (lax-plist-put pyim-dregcache-cache + (file-truename dict-file) + (pyim-dregcache-create-cache-content raw-content))))) + +(defun pyim-dregcache-create-cache-content (raw-content) + "将 RAW-CONTENT 划分成可以更高效搜索的缓冲区." + (let ((chars "bcdefghjklmnopqrstwxyz") + (i 0) + content-segments + (start (string-match "^a" raw-content)) + chunk + end) + ;; 将字典缓存划分成多个"子搜索区域" + (while (< i (length chars)) + (when (setq end (string-match (string ?^ (elt chars i)) + raw-content + start)) + (setq chunk (substring-no-properties raw-content start end)) + (push chunk content-segments) + (setq start end)) + (setq i (1+ i))) + + ;; last chunk + (setq chunk (substring-no-properties raw-content end (length raw-content))) + (push chunk content-segments) + (list :content (nreverse content-segments)))) + +;; ** 更新 dregcache 词条计数。 +(cl-defmethod pyim-dcache-update-wordcount + (word &context (pyim-dcache-backend (eql pyim-dregcache)) + &optional wordcount-handler) + (pyim-dregcache-update-iword2count word wordcount-handler)) (defun pyim-dregcache-update-iword2count (word &optional wordcount-handler) "保存词频到缓存." @@ -369,57 +413,13 @@ DICT-FILES 是词库文件列表. DICTS-MD5 是词库的MD5校验码. (unless (equal orig-value new-value) (puthash word new-value pyim-dregcache-iword2count)))) -(defun pyim-dregcache-delete-word (word) - "将中文词条 WORD 从个人词库中删除." - (with-temp-buffer - (insert pyim-dregcache-icode2word) - (goto-char (point-min)) - (let* ((case-fold-search t) - substring beg end) - (while (re-search-forward (concat "^\\([a-z-]+\\) \\(.*\\)" word "\\(.*\\)$") nil t) - (setq beg (match-beginning 0)) - (setq end (match-end 0)) - (setq substring (concat (match-string-no-properties 1) - (match-string-no-properties 2) - (match-string-no-properties 3))) - - ;; delete string and the newline char - (delete-region beg (+ 1 end)) - (when (> (length (split-string substring " ")) 1) - (goto-char beg) - (insert substring))) - (setq pyim-dregcache-icode2word - (buffer-string)))) - ;; 删除对应词条的词频 - (remhash word pyim-dregcache-iword2count)) +;; ** 升级 dhashcache 相关函数 +(cl-defmethod pyim-dcache-upgrade (&context (pyim-dcache-backend (eql pyim-dregcache))) + "升级词库缓存. -(defun pyim-dregcache-insert-word-into-icode2word (word code prepend) - "保存个人词到缓存,和其他词库格式一样以共享正则搜索算法." - (when pyim-debug - (message "pyim-dregcache-insert-word-into-icode2word called => %s %s %s" - word - code - prepend)) - (with-temp-buffer - (when pyim-dregcache-icode2word - (insert pyim-dregcache-icode2word)) - (goto-char (point-min)) - (let* ((case-fold-search t) - substring replace-string beg end old-word-list) - (if (re-search-forward (concat "^" code " \\(.*\\)") nil t) - (progn - (setq beg (match-beginning 0)) - (setq end (match-end 0)) - (setq substring (match-string-no-properties 1)) - (delete-region beg end) - ;; 这里不进行排序,在pyim-dregcache-update-personal-words排序 - (setq old-word-list (pyim-dregcache-sort-words (split-string substring " "))) - (setq replace-string (concat code " " (string-join (delete-dups `(,@old-word-list ,word)) " ")))) - (setq replace-string (concat code " " (or replace-string word) "\n"))) - (goto-char (or beg (point-max))) - (insert replace-string)) - (setq pyim-dregcache-icode2word - (buffer-string)))) +当前已有的功能: +1. 基于 :code-prefix-history 信息,升级为新的 code-prefix。" + (pyim-dregcache-upgrade-icode2word)) (defun pyim-dregcache-upgrade-icode2word () "升级 icode2word 缓存。 @@ -428,31 +428,50 @@ dregcache 只支持全拼和双拼,不能用于五笔之类的型码输入法 update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所 以不需要具体实现细节。") -(defun pyim-dregcache-search-word-code-1 (word content) - (let* ((case-fold-search t) - (regexp (concat "^\\([a-z-]+\\)\\(.*\\) " "\\(" word " \\|" word "$\\)"))) - (when (string-match regexp content) - (match-string-no-properties 1 content)))) +;; ** 根据 dregcache 信息对词条进行排序 +(defun pyim-dregcache-sort-words (words-list) + "对 WORDS-LIST 排序,词频大的排在前面." + (let ((iword2count pyim-dregcache-iword2count)) + (sort words-list + (lambda (a b) + (let ((a (car (split-string a ":"))) + (b (car (split-string b ":")))) + (> (or (gethash a iword2count) 0) + (or (gethash b iword2count) 0))))))) -(defun pyim-dregcache-search-word-code (word) - "从 `pyim-dregcache-cache' 和 `pyim-dregcache-icode2word' 搜索 word, 得到对应的code." - (when pyim-debug (message "pyim-dregcache-search-word-code word=%s" word)) - (when pyim-dregcache-cache - (catch 'result - (let ((dict-files (pyim-dregcache-all-dict-files)) - code) - (when pyim-dregcache-icode2word - (setq code (pyim-dregcache-search-word-code-1 word pyim-dregcache-icode2word)) - (when code (throw 'result (list code)))) - (dolist (file dict-files) - (let* ((file-info (lax-plist-get pyim-dregcache-cache file)) - (contents (lax-plist-get file-info :content))) - (dolist (content contents) - (setq code (pyim-dregcache-search-word-code-1 word content)) - (when code (throw 'result (list code)))))))))) +;; ** 保存 dregcache 相关函数 +(cl-defmethod pyim-dcache-save-caches + (&context (pyim-dcache-backend (eql pyim-dregcache))) + (pyim-dregcache-save-personal-dcache-to-file)) + +(defun pyim-dregcache-save-personal-dcache-to-file () + "保存缓存内容到默认目录." + (when pyim-debug (message "pyim-dregcache-save-personal-dcache-to-file called")) + ;; 用户选择过的词存为标准辞典格式保存 + (when pyim-dregcache-icode2word + (pyim-dregcache-save-variable + 'pyim-dregcache-icode2word + pyim-dregcache-icode2word)) + ;; 词频 + (pyim-dcache-save-variable + 'pyim-dregcache-iword2count + pyim-dregcache-iword2count)) + +(defun pyim-dregcache-save-variable (variable value) + "Save VARIABLE with its VALUE." + (let* ((file (pyim-dregcache-variable-file variable)) + (save-silently t)) + (make-directory (file-name-directory file) t) + (with-temp-buffer + (insert value) + (pyim-dcache-write-file file)))) -(defun pyim-dregcache-export-personal-words (file &optional confirm) +;; ** 导出 dregcache 相关函数 +(cl-defmethod pyim-dcache-export-personal-words + (file &context (pyim-dcache-backend (eql pyim-dregcache)) + &optional confirm) "将个人词库存入 FILE." + (pyim-dcache-init-variables) (when pyim-dregcache-icode2word ;; 按词频排序,把词频信息保存到用户词典 (pyim-dregcache-sort-icode2word) @@ -466,6 +485,30 @@ update-icode2word 目前只要是用于更新型码输入法的 code-prefix, 所 (sort-lines nil (point-min) (point-max)) (pyim-dcache-write-file file confirm)))) +(defun pyim-dregcache-sort-icode2word () + "对个人词库排序." + ;; https://github.com/redguardtoo/zhfreq + (with-temp-buffer + (dolist (l (split-string pyim-dregcache-icode2word "\n")) + (cond + ((string-match "^\\([a-z-]+ \\)\\(.*\\)" l) + ;; 3字以上词很少,如果只处理单字,2字词,3字词 + ;; ((string-match "^\\([a-z]+ \\|[a-z]+-[a-z]+ \\|[a-z]+-[a-z]+-[a-z]+ \\)\\(.*\\)" l) + (let* ((pinyin (match-string 1 l)) + (words (pyim-dregcache-sort-words (split-string (match-string 2 l) " ")))) + (insert (format "%s\n" (concat pinyin (string-join words " ")))))) + ;; 其他词 + ((string= l "") + ;; skip empty line + ) + (t + (insert (format "%s\n" l))))) + (setq pyim-dregcache-icode2word (buffer-string)))) + +(defun pyim-dregcache-export-words-and-counts () + "TODO" + ) + ;; * Footer (provide 'pyim-dregcache) diff --git a/pyim-process.el b/pyim-process.el index cafea2b7f7..23345dba4d 100644 --- a/pyim-process.el +++ b/pyim-process.el @@ -205,15 +205,17 @@ imobj 组合构成在一起,构成了 imobjs 这个概念。比如: "PYIM 流程,词库相关的初始化工作。" (pyim-recreate-local-variables) (pyim-pymap-cache-create) + (pyim-dcache-init-variables) (pyim-dcache-update force)) (defun pyim-process-save-dcaches (&optional force) "PYIM 流程,保存 dcache." (when force - (pyim-dcache-save-caches))) + (pyim-dcache-save-caches)) + t) -(defun pyim-process-update-personal-words () - (pyim-dcache-call-api 'update-personal-words t)) +(defun pyim-process-update (&optional force) + (pyim-dcache-update force)) (defun pyim-process-start-daemon () "启动 pyim 流程需要的 daemon." diff --git a/pyim.el b/pyim.el index f6c69017d1..bff910eaf2 100644 --- a/pyim.el +++ b/pyim.el @@ -311,6 +311,12 @@ REFRESH-COMMON-DCACHE 已经废弃,不要再使用了。" (pyim-process-save-dcaches save-personal-dcache) (pyim-process-init-dcaches :force)) +;; ** 升级功能 +(defun pyim-upgrade () + "升级 pyim 功能。" + (interactive) + (pyim-dcache-upgrade)) + ;; ** 键盘输入处理功能 (defun pyim-self-insert-command () "Pyim 默认的 self-insert-command." @@ -424,13 +430,28 @@ MERGE-METHOD 是一个函数,这个函数需要两个数字参数,代表词 ;; 有这一步骤,导入的词条就会被覆盖。 (pyim-process-save-dcaches t) ;; 更新相关的 dcache - (pyim-process-update-personal-words) + (pyim-process-update t) (message "PYIM: 词条和词频信息导入完成!"))) ;; ** 导出功能 -(defalias 'pyim-export-words-and-counts 'pyim-dcache-export-words-and-counts) -(defalias 'pyim-export-personal-words 'pyim-dcache-export-personal-words) +(defun pyim-export-words-and-counts (file &optional confirm ignore-counts) + "将个人词条以及词条对应的词频信息导出到文件 FILE. + +如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 +non-nil,文件存在时将会提示用户是否覆盖,默认为覆盖模式" + (interactive "F将词条和词频信息导出到文件: ") + (pyim-dcache-export-words-and-counts file confirm ignore-counts) + (message "PYIM: 词条和词频信息导出完成。")) + +(defun pyim-export-personal-words (file &optional confirm) + "将用户的个人词条导出为 pyim 词库文件. + +如果 FILE 为 nil, 提示用户指定导出文件位置, 如果 CONFIRM 为 non-nil, +文件存在时将会提示用户是否覆盖,默认为覆盖模式。" + (interactive "F将个人词条导出到文件:") + (pyim-dcache-export-personal-words file confirm) + (message "PYIM: 个人词条导出完成。")) ;; ** 删词功能 (defun pyim-delete-words-in-file (file) diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el index 3b366d296b..6eb2a246d9 100644 --- a/tests/pyim-tests.el +++ b/tests/pyim-tests.el @@ -982,7 +982,8 @@ (should (equal my/test:1 "hello")))) (ert-deftest pyim-tests-pyim-dcache-export () - (let ((pyim-dhashcache-iword2count (make-hash-table :test #'equal)) + (let ((pyim-dcache-backend 'pyim-dhashcache) + (pyim-dhashcache-iword2count (make-hash-table :test #'equal)) (pyim-dhashcache-icode2word (make-hash-table :test #'equal)) (file (pyim-tests-make-temp-file))) (puthash "你好" 10 pyim-dhashcache-iword2count) @@ -1196,15 +1197,16 @@ yin-xing 因行 (should (equal (gethash "n-h" pyim-dhashcache-ishortcode2word) '("你慌" "你好" "你坏"))))) -(ert-deftest pyim-tests-pyim-dhashcache-sort-words () - (let ((pyim-dhashcache-iword2count (make-hash-table :test #'equal)) +(ert-deftest pyim-tests-pyim-dcache-sort-words () + (let ((pyim-dcache-backend 'pyim-dhashcache) + (pyim-dhashcache-iword2count (make-hash-table :test #'equal)) words) (puthash "你好" 3 pyim-dhashcache-iword2count) (puthash "呢耗" 2 pyim-dhashcache-iword2count) (puthash "你豪" 1 pyim-dhashcache-iword2count) (setq words (list "呢耗" "你豪" "你好")) - (should (equal (pyim-dhashcache-sort-words words) + (should (equal (pyim-dcache-sort-words words) '("你好" "呢耗" "你豪"))))) (ert-deftest pyim-tests-pyim-dhashcache-get-counts-from-log ()