branch: externals/pyim commit a1e5df62ca06fa20126af633bf1ad2841350b5f4 Author: Feng Shu <tuma...@163.com> Commit: Feng Shu <tuma...@163.com>
Add duoyinzi adjust feature --- pyim-cstring.el | 62 ++++--- pyim-pymap.el | 454 +++++++++++++++++++++++++++++++++++++++++++++++++++- tests/pyim-tests.el | 52 +++--- 3 files changed, 513 insertions(+), 55 deletions(-) diff --git a/pyim-cstring.el b/pyim-cstring.el index e55a502090..b3e03e7565 100644 --- a/pyim-cstring.el +++ b/pyim-cstring.el @@ -98,18 +98,13 @@ NUMBER 用于递归,表示子字符串在 CSTRING 中的位置。" ;; ** 中文字符串到拼音的转换工具 ;;;###autoload (defun pyim-cstring-to-pinyin (string &optional shou-zi-mu separator - return-list ignore-duo-yin-zi adjust-duo-yin-zi) + return-list ignore-duo-yin-zi _) "将汉字字符串转换为对应的拼音字符串的工具. 如果 SHOU-ZI-MU 设置为 t, 转换仅得到拼音首字母字符串。当 RETURN-LIST 设置为 t 时,返回一个拼音列表,这个列表包含词条的一个 或者多个拼音(词条包含多音字时);如果 IGNORE-DUO-YIN-ZI 设置为 -t, 遇到多音字时,只使用第一个拼音,其它拼音忽略;当 -ADJUST-DUO-YIN-Zi 设置为 t 时, `pyim-cstring-to-pinyin' 会使用 pyim 已 -安装的词库来校正多音字,但这个功能有一定的限制: - -1. pyim 普通词库中不存在的词条不能较正 -2. 多音字校正速度比较慢,实时转换会产生卡顿。 +t, 遇到多音字时,只使用第一个拼音,其它拼音忽略。 BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结果会 包含多余的连接符:比如: \"你=好\" --> \"ni-=-hao\"" @@ -127,22 +122,11 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结 (pyim-cstring--partition string t))) ;; 通过排列组合的方式, 重排 pinyins-list。 - ;; 比如:(("Hello") ("yin") ("hang" "xing")) -> (("Hello" "yin" "hang") ("Hello" "yin" "xing")) + ;; 比如:(("Hello") ("yin") ("hang")) -> (("Hello" "yin" "hang")) (setq pinyins-list - (pyim-permutate-list pinyins-list)) - - ;; 使用 pyim 的安装的词库来校正多音字。 - ;; FIXME:如果 string 包含非中文的字符,那么多音字矫正将不起作用。 - (when adjust-duo-yin-zi - (pyim-dcache-init-variables) - (dolist (pylist pinyins-list) - (let* ((py-str (mapconcat #'identity pylist "-")) - (words-from-dicts - (pyim-dcache-get py-str '(code2word)))) - (when (member string words-from-dicts) - (push pylist pinyins-list-adjusted)))) - (setq pinyins-list-adjusted - (nreverse pinyins-list-adjusted))) + (pyim-permutate-list + (pyim-cstring--adjust-duoyinzi + string pinyins-list))) ;; 返回拼音字符串或者拼音列表 (let* ((pinyins-list @@ -161,6 +145,40 @@ BUG: 当 STRING 中包含其它标点符号,并且设置 SEPERATER 时,结 list (string-join list " ")))))) +(defun pyim-cstring--adjust-duoyinzi (word pinyins-list) + "根据 WORD 对 PINYINS-LIST 进行校正。 + +比如: + +1. WORD: 人民银行 +2. PINYINS-LIST: ((\"ren\") (\"min\") (\"yin\") (\"hang\" \"xing\")) +3. 输出结果为: ((\"ren\") (\"min\") (\"yin\") (\"hang\")) + +这个函数依赖 `pyim-pymap-duoyinzi' 提供的多音字数据。" + (mapcar (lambda (pinyins) + (if (= (length pinyins) 1) + pinyins + (let ((py-adjusted + ;; NOTE: 多音字校正规则: + ;; 1. 首先通过在 WORD 中搜索多音字组成的词条来校正。 + ;; 2. 如果多音字组成的词条无法搜索到,就使用这个多音字最常用的读音, + ;; 这样处理有可能校正错误,但大多数情况还是适用的。 + (or (cl-find-if + (lambda (pinyin) + (when-let* ((x (pyim-pymap-py2duoyinzi-get pinyin))) + (string-match-p (string-join x "\\|") word))) + pinyins) + (cl-find-if + (lambda (pinyin) + (when-let* ((x (pyim-pymap-py2duoyinzi-get pinyin t))) + (string-match-p (string-join x "\\|") word))) + pinyins)))) + ;; 如果多音字校正没有任何结果,就用校正前的信息。 + (if py-adjusted + (list py-adjusted) + pinyins)))) + pinyins-list)) + ;;;###autoload (defun pyim-cstring-to-pinyin-simple (string &optional shou-zi-mu separator return-list) "简化版的 `pyim-cstring-to-pinyin', 不处理多音字。" diff --git a/pyim-pymap.el b/pyim-pymap.el index 22f3ff98b4..c3a3f2367d 100644 --- a/pyim-pymap.el +++ b/pyim-pymap.el @@ -458,6 +458,421 @@ 但不是完全一致。") +(defvar pyim-pymap-duoyinzi-chars + '(("ai" "艾") + ("ao" "坳") + ("ba" "扒") + ("bai" "柏") + ("bang" "磅") + ("bao" "薄" "曝" "暴" "堡" "剥" "刨") + ("beng" "蚌" "泵") + ("bi" "裨" "臂") + ("bian" "扁" "匾") + ("bu" "卜" "不") + ("ce" "侧") + ("ceng" "曾") + ("cha" "查") + ("chai" "拆") + ("chan" "禅" "掺") + ("chang" "厂") + ("chao" "嘲") + ("che" "辙" "车") + ("cheng" "乘") + ("chi" "尺") + ("chou" "臭" "仇") + ("chuo" "绰") + ("ci" "伺") + ("dai" "呆") + ("dan" "单") + ("dao" "叨") + ("di" "底") + ("ding" "丁") + ("dong" "洞") + ("du" "读" "度") + ("dun" "顿" "蹲" "沌" "敦") + ("duo" "堕") + ("fan" "繁" "番") + ("feng" "冯") + ("fo" "佛") + ("fou" "否") + ("ga" "旮") + ("gai" "盖") + ("gang" "扛") + ("gao" "镐") + ("ge" "革" "铬" "蛤") + ("gei" "给") + ("gong" "汞") + ("gou" "枸") + ("gu" "谷") + ("gua" "括" "呱") + ("guan" "莞") + ("guang" "广") + ("gui" "龟" "柜") + ("hang" "夯") + ("hao" "蒿") + ("he" "菏" "核" "和" "呵" "合") + ("hei" "嘿") + ("hong" "虹" "红") + ("hu" "鹄" "浒") + ("hua" "划") + ("huai" "徊") + ("huan" "还") + ("hun" "荤") + ("huo" "豁") + ("ji" "藉" "缉" "稽" "祭" "瘠" "亟") + ("jia" "贾" "家" "夹" "伽" "价") + ("jiao" "脚" "缴") + ("jie" "芥" "秸") + ("jing" "颈" "荆") + ("ju" "桔" "咀" "句") + ("jue" "jiao" "觉") + ("jun" "浚") + ("ka" "咖" "卡") + ("kai" "楷") + ("kan" "槛") + ("kang" "亢") + ("ke" "坷" "咳") + ("keng" "吭") + ("kui" "馈" "溃" "匮") + ("la" "蜡" "腊") + ("lao" "烙" "姥") + ("le" "勒") + ("lei" "肋") + ("lin" "赁") + ("ling" "棱") + ("liu" "六") + ("lu" "陆" "碌") + ("luo" "络") + ("lv" "缕" "绿") + ("mai" "脉" "埋") + ("mang" "氓") + ("mao" "冒") + ("me" "么") + ("mei" "酶" "没") + ("meng" "萌") + ("mi" "谜" "秘" "泌") + ("mian" "娩") + ("miu" "缪") + ("mo" "模" "摩" "抹") + ("mou" "牟") + ("mu" "沐") + ("na" "那" "哪" "呐") + ("nan" "南") + ("ne" "呢") + ("ni" "溺") + ("niao" "鸟" "尿") + ("nong" "弄") + ("nuo" "娜") + ("nve" "疟") + ("pan" "ban" "扳") + ("pang" "膀" "胖" "旁") + ("pao" "炮") + ("pi" "辟") + ("pin" "拚") + ("ping" "屏") + ("po" "魄" "迫") + ("pu" "璞" "莆" "脯" "瀑" "朴" "埔") + ("qi" "骑" "蹊" "栖" "期" "契" "其") + ("qian" "乾" "铅" "茜" "浅" "嵌" "堑") + ("qiao" "荞") + ("qie" "且" "茄") + ("qu" "区") + ("quan" "券") + ("que" "雀") + ("sai" "塞") + ("sao" "缫") + ("se" "色") + ("sha" "莎" "厦") + ("shan" "杉") + ("shang" "裳") + ("she" "赊" "蛇" "摄") + ("shen" "莘" "沈" "参" "什") + ("shi" "食" "适" "识" "石" "氏") + ("shu" "熟" "漱" "术" "数" "戌" "属") + ("shuai" "衰" "率") + ("shui" "谁") + ("shuo" "说") + ("si" "似") + ("su" "宿") + ("suo" "缩" "梭") + ("tang" "汤" "倘") + ("tao" "陶") + ("ti" "提") + ("ting" "铤" "烃") + ("tui" "褪") + ("tun" "屯" "囤") + ("tuo" "拓" "沱") + ("wan" "万") + ("wang" "亡") + ("wei" "蔚" "圩") + ("wo" "涡" "挝") + ("wu" "鹜" "无") + ("xi" "铣" "洗" "戏") + ("xia" "虾" "吓") + ("xian" "癣" "县") + ("xiang" "降" "巷") + ("xiao" "嚣" "削") + ("xie" "邪" "挟" "偕") + ("xu" "嘘" "许") + ("xue" "血") + ("xun" "寻") + ("yan" "腌" "咽") + ("yao" "钥" "窑" "侥") + ("ye" "曳" "叶") + ("yi" "遗") + ("yin" "殷") + ("yong" "甬") + ("you" "柚") + ("yu" "尉" "吁") + ("yuan" "员") + ("yue" "约") + ("yun" "蕴" "熨") + ("za" "咋") + ("zai" "仔") + ("zang" "赃" "臧") + ("zao" "皂") + ("ze" "择") + ("zha" "楂" "栅" "扎") + ("zhan" "辗" "斩") + ("zhao" "召") + ("zhe" "着") + ("zhi" "炙" "殖" "挚" "峙" "吱") + ("zhong" "种") + ("zhou" "粥") + ("zhu" "著") + ("zhua" "爪") + ("zhuai" "拽") + ("zhuan" "赚") + ("zhui" "缀" "椎") + ("zhuo" "琢") + ("zi" "兹") + ("zu" "卒")) + "多音字最常用的读音。") + +(defvar pyim-pymap-duoyinzi-words + '(("a" "阿姨" "阿富" "阿门" "阿拉" "阿林" "黑阿" "麦阿密" "鹿城阿岙" "阿福") + ("ao" "拗口" "违拗") + ("ai" "艾滋" "艾蒿" "未艾") + ("bang" "翅膀" "臂膀" "重磅" "磅秤" "黄泥磅店" "蛤蚌" "蚌壳" "河蚌" "鹬蚌" "珠蚌") + ("bai" "叔伯" "百万") + ("bao" "剥皮" "超薄" "薄脆" "薄板" "薄饼" "暴晒" "暴发" "暴雨" "暴力" "风暴" "暴露" "暴风" "汉堡" "古堡" "地堡" "城堡" "龍堡" "卡斯堡" "麻家堡" "麦芬堡" "汉堡" "麦得堡" "麦尔堡" "曝光" "瀑河") + ("beng" "蚌埠") + ("bi" "复辟" "臂章" "螳臂" "交臂" "前臂" "一臂" "奋臂" "膀臂" "臂膀" "秘鲁" "泌阳") + ("bing" "屏弃" "屏气" "屏除" "屏退" "屏息") + ("bian" "扁桃" "方便" "方便面" "便当" "便捷") + ("bo" "薄荷" "单薄" "伯仲" "伯乐" "伯劳" "伯父" "大伯" "老伯" "伯母" "黄伯" "伯爵" "停泊" "淡泊" "尼泊" "漂泊" "鸿波" "柏林") + ("bu" "大埔") + ("can" "参谋" "参事" "总参" "参数" "参议" "参观" "参拜" "参股") + ("cang" "埋藏" "藏头" "秘藏" "雪藏" "藏匿" "收藏" "馆藏" "矿藏" "隐藏" "蕴藏" "藏袍" "储藏" "窖藏" "藏龙" "藏胞" "冷藏" "珍藏" "私藏" "藏掖" "西藏" "藏书" "藏品" "伧俗" "龙藏寺") + ("cen" "参差") + ("ceng" "不曾" "似曾" "几曾" "何曾" "曾经" "曾几" "未曾" "噌的" "一声") + ("cha" "刹那" "宝刹" "一刹" "喳喳") + ("chai" "公差" "差役" "专差" "官差" "听差" "美差" "办差" "差事" "差使" "肥差" "当差" "钦差") + ("chan" "颤悠" "单于" "禅学" "班禅" "禅宗" "禅堂" "禅门" "禅机" "禅杖" "禅房" "禅师" "坐禅" "参禅" "禅院") + ("chang" "周长" "细长" "长发" "三长" "长河" "长袖" "长衫" "天长" "长短" "超长" "长沙" "长春" "长远" "长度" "长江" "长处" "长假" "长街" "长征" "全长" "长城" "波长" "身长" "长途" "长吁" "长虹" "长方") + ("chao" "朝阳" "朝阳区" "朝鲜" "朝廷" "王朝" "历朝" "解嘲" "讥嘲" "自嘲" "嘲笑" "嘲弄" "冷嘲" "嘲讽" "绰绰" "绰起" "绰家" "剿袭" "剿说") + ("che" "汽车" "停车场" "车车" "黑车" "车饰") + ("chen" "称职" "匀称" "称心" "相称" "对称") + ("cheng" "职称" "简称" "总称" "官称" "代称" "称号" "称谓" "昵称" "谦称" "全称" "名称" "噌吰") + ("chu" "六畜" "家畜" "耕畜" "畜生" "牲畜") + ("chui" "椎心") + ("chuan" "文传" "传媒" "传销" "传情" "真传" "祖传" "传闻" "传家" "秘传" "传单" "传说") + ("chi" "匙子" "茶匙" "羹匙" "汤匙" "尺度" "英尺" "咫尺" "尺码" "公尺" "卡尺" "米尺" "卷尺") + ("chong" "重庆" "重重") + ("chou" "汗臭" "臭氧" "口臭" "腋臭" "臭虫" "臭骂" "臭美" "酸臭" "腐臭" "臭气" "腥臭" "臭名" "遗臭" "恶臭" "臭豆" "狐臭" "臭味" "臭架") + ("chuang" "经幢") + ("chuo" "绰约" "阔绰" "绰号" "宽绰") + ("ci" "参差" "伺候" "龟兹") + ("cuan" "攒钱" "攒聚" "攒动") + ("cuo" "撮儿" "撮要" "撮合") + ("da" "大街" "沓子" "龙大" "大西洋" "大昌" "大圣" "福大" "黑大" "大华" "大包" "大厦") + ("dao" "叨唠" "絮叨" "叨念" "叨咕" "念叨" "唠叨" "叨叨" "磨叨") + ("dai" "大夫") + ("dan" "西单" "东单" "清单" "报单" "单利" "名单" "单姓" "单亲" "单线" "单科" "单间" "单挑" "单价" "单词" "子弹") + ("de" "似的" "总的" "中的" "别的") + ("deng" "澄清") + ("di" "怎的" "无的" "有的" "目的" "标的" "打的" "的确" "的当" "的士" "上地" "大地" "天地" "提防") + ("diao" "蓝调" "蓝调吧" "调调" "音调" "论调" "格调" "调令" "低调" "笔调" "基调" "强调" "声调" "滥调" "老调" "色调" "单调" "腔调" "跑调" "曲调" "步调" "语调" "主调" "情调") + ("du" "都会" "国都" "都城" "古都" "故都" "大都" "首都" "成都" "旧都" "都市" "龙都" "鼎都" "鹤都" "鹏都" "鸿都" "麦度" "态度" "读书" "法度" "宽度" "进度") + ("dou" "全都" "句读") + ("duo" "测度" "忖度" "揣度" "猜度") + ("dun" "粮囤") + ("e" "阿谀" "阿胶" "阿弥" "恶心") + ("fan" "番茄") + ("fo" "佛塔" "佛徒" "佛牙" "佛教") + ("fu" "仿佛" "果脯") + ("fou" "是否" "与否") + ("ga" "咖喱" "伽马") + ("gang" "扛鼎") + ("ge" "革命" "皮革" "鹰革" "蛤蚧" "文蛤" "蛤蜊" "咯吱" "咯噔" "咯咯") + ("geng" "脖颈") + ("gong" "女红") + ("gu" "布谷" "谷物" "谷地" "硅谷" "中鹄" "麦谷" "麓谷" "鹭谷") + ("gui" "龟山" "龟士" "龟博" "龟仔" "鹿龟" "龟汁" "龟苓" "龟顶") + ("gua" "挺括" "顶呱" "呱呱" "呱唧" "呱嗒") + ("guan" "纶巾" "东莞") + ("guang" "广州" "广东" "广播") + ("ha" "蛤蟆" "癞蛤" "虾蟆") + ("hai" "还是" "还有") + ("hao" "貉子" "貉绒") + ("hang" "总行" "分行" "支行" "行业" "排行" "行情" "央行" "商行" "外行" "银行" "商行" "酒行" "麻行" "琴行" "巷道") + ("he" "嘉和" "和睦" "亲和" "龙和" "之貉" "威吓" "恫吓" "恐吓" "鼎和" "锦和" "麒和苑" "合资" "鸿合") + ("heng" "道行") + ("hu" "鹄望" "鸿鹄" "鹄立") + ("huan" "鹂还") + ("hui" "会馆" "会展" "会所" "协会" "国会" "会堂") + ("hong" "红装" "红牌" "红木" "红人") + ("huo" "软和" "热和" "暖和") + ("ji" "病革" "给养" "自给" "给水" "薪给" "给予" "供给" "稽考" "稽查" "稽核" "滑稽" "稽留" "缉获" "缉查" "缉私" "缉捕" "狼藉" "奇数" "亟待" "亟须" "亟亟" "亟需" "诘屈" "荠菜") + ("jia" "雪茄" "瑜伽" "伽利略") + ("jian" "龙见") + ("jiang" "降温" "降低" "降旗" "下降" "倔强") + ("jiao" "嚼舌" "嚼子" "细嚼" "平角" "视角" "海角" "龙角" "鹿角" "围剿" "征剿" "饺子") + ("jie" "解放" "慰藉" "蕴藉" "盘诘" "诘难" "诘问" "反诘") + ("jin" "矜夸" "矜持" "骄矜" "自矜") + ("jing" "颈项" "颈椎" "引颈" "长颈" "宫颈" "瓶颈" "龙颈" "黑颈鹤" "鹿颈" "景色" "帝景" "劲松") + ("ju" "咀嚼" "桔汁") + ("jun" "平均" "鸿均") + ("juan" "棚圈" "圈养") + ("jv" "咀嚼" "趑趄") + ("jvan" "猪圈" "羊圈") + ("jue" "主角" "角色" "旦角" "女角" "丑角" "角力" "名角" "配角" "咀嚼" "直觉" "感觉" "错觉" "触觉" "幻觉") + ("jun" "龟裂") + ("jvn" "龟裂") + ("ka" "咖啡" "磁卡" "贺卡" "卡拉" "胸卡" "声卡" "卡片" "绿卡" "卡通" "网卡" "卡口" "龙卡" "咯痰" "咯血") + ("ke" "咳嗽" "干咳" "贝壳" "蚌壳" "外壳" "蛋壳" "脑壳" "弹壳") + ("keng" "吭声" "吭气" "吭哧") + ("kuai" "会计" "财会") + ("la" "癞痢") + ("lai" "癞疮" "癞子" "癞蛤" "癞皮") + ("lao" "积潦" "络子" "落枕" "落价" "麻粩") + ("le" "娱乐" "玩乐" "乐趣" "美乐" "乐缘") + ("lei" "勒紧") + ("lo" "然咯") + ("lou" "佝偻") + ("long" "里弄" "弄堂") + ("liao" "了解" "了结" "明了" "了得" "末了" "未了" "了如" "了如指掌" "潦草" "潦倒") + ("liu" "碌碡" "碌碌" "劳碌" "忙碌" "庸碌") + ("lu" "绿林") + ("luo" "络腮" "部落" "落花" "日落") + ("lv" "频率" "机率" "比率" "效率" "胜率" "概率" "汇率" "功率" "倍率" "绿叶" "淡绿" "绿色" "绿豆" "伛偻" "绿洲") + ("lun" "丙纶" "锦纶" "经纶" "涤纶") + ("man" "埋怨") + ("mai" "山脉" "动脉" "命脉" "筋脉" "脉象" "气脉" "脉动" "脉息" "脉络" "一脉" "经脉") + ("mang" "流氓") + ("me" "黛么") + ("meng" "群氓") + ("mo" "埋没" "隐没" "脉脉" "航模" "模糊" "男模" "楷模" "规模" "劳模" "模型" "模范" "模特" "名模") + ("mou" "绸缪") + ("mi" "秘密" "秘方" "奥秘" "神秘" "泌尿" "分泌") + ("miu" "谬论" "纰缪") + ("mu" "人模" "字模" "模板" "模样" "模具" "装模" "装模做样" "模子") + ("na" "安娜" "娜娜" "丽娜" "黛尔娜" "黛娜" "海娜" "黑娜" "黄丽娜" "麦香娜" "优娜" "麦娜" "麟娜") + ("nan" "南方" "湖南") + ("ne" "哪吒") + ("ni" "毛呢" "花呢" "呢绒" "线呢" "呢料" "呢子" "呢喃") + ("niao" "便溺") + ("niu" "执拗" "拗不") + ("nue" "疟疾") + ("nuo" "婀娜" "袅娜") + ("nv" "女人") + ("nve" "疟原" "疟蚊") + ("pai" "迫击" "迫击炮") + ("pang" "膀胱" "膀肿" "磅礴") + ("pi" "否极" "臧否" "龙陂" "黄陂") + ("pian" "扁舟" "便宜") + ("piao" "朴姓") + ("ping" "屏幕" "荧屏" "银屏") + ("po" "朴刀") + ("pu" "暴十" "一曝十寒" "里堡" "十里堡" "胸脯" "肉脯" "脯子" "杏脯" "简朴" "朴质" "古朴" "朴厚" "纯朴" "朴素" "诚朴" "俭朴" "朴实" "淳朴" "曝晒" "瀑布" "飞瀑" "黄埔") + ("qiu" "龟兹") + ("qi" "稽首" "缉鞋" "奇妙" "传奇" "亟来" "荸荠" "蹊跷" "林栖" "鹿奇" "鹭奇" "齐天大圣" "齐天") + ("qia" "卡脖" "卡子" "关卡" "卡壳" "哨卡" "边卡" "发卡") + ("qiao" "雀盲" "雀子" "地壳" "甲壳" "躯壳") + ("qian" "纤手" "拉纤" "纤夫" "纤绳") + ("qiang" "强颜" "强人" "自强" "强烈" "强风" "强大" "黎强" "麒强" "鹤强" "龚强") + ("qie" "茄子" "颠茄" "番茄" "趔趄") + ("qin" "亲和" "亲亲" "棘矜" "矜锄") + ("qing" "干亲" "亲家") + ("qu" "小区") + ("quan" "转圈" "钢圈" "圆圈" "罗圈" "弧圈" "垫圈" "小圈" "眼圈") + ("que" "麻雀" "鸟雀" "燕雀" "孔雀" "云雀" "雀巢、") + ("re" "般若") + ("sai" "麦迪塞姆" "活塞") + ("se" "堵塞" "搪塞" "茅塞" "闭塞" "鼻塞" "梗塞" "阻塞" "淤塞" "拥塞" "哽塞" "月色" "彩色" "特色" "深色" "声色" "黛色" "黛色" "黑色瞳" "色坊") + ("sha" "刹车" "急刹" "急刹车" "广厦" "大厦" "商厦" "鹰大厦") + ("shai" "色子") + ("shan" "姓单" "单县" "铁杉" "杉树" "封禅" "禅让" "黒杉") + ("shang" "衣裳") + ("she" "拾级" "折本") + ("shen" "沙参" "野参" "参王" "人参" "红参" "丹参" "山参" "海参" "刺参" "没什" "什么" "为什" "鹿参") + ("sheng" "野乘" "千乘" "史乘" "盛大" "鸿盛") + ("shi" "钥匙" "拾荒" "捡拾" "拾物" "家什" "什物" "什锦" "麻什" "麦什" "喀什" "牛什" "见识" "知识" "似的" "骨殖" "饮食" "副食" "石业" "石头" "石艺" "姓氏" "上栅" "下栅") + ("shuai" "表率" "率性" "率直" "率真" "粗率" "率领" "轻率" "直率" "草率" "大率" "坦率" "数字" "招数" "基数" "数码") + ("shuang" "泷水") + ("shu" "金属" "气数" "岁数" "度数" "数据" "级数" "数控" "数学" "参数" "次数" "正数" "代数" "实数" "系数" "分数" "辈数") + ("shui" "游说") + ("shuo" "数见" "数见不鲜" "传说" "听说" "妄说" "实说" "胡说" "评说" "分说" "小说") + ("si" "窥伺" "伺弄" "伺机" "疑似" "似是" "好似" "似曾" "形似" "酷似" "貌似" "似懂" "胜似" "恰似" "近似" "神似" "赛似" "看似" "活似" "强似" "似乎" "类似" "相似") + ("su" "宿主" "宿命" "归宿" "住宿" "借宿" "寄宿" "宿营" "夜宿" "露宿" "投宿" "宿舍" "名宿" "整宿" "食宿") + ("sui" "尿泡") + ("ta" "拓本" "拓片" "碑拓" "疲沓" "拖沓" "杂沓" "鸿塔") + ("tang" "鸭汤" "鸡汤") + ("tao" "叨扰" "叨光" "陶器") + ("tan" "弹性" "弹力" "反弹") + ("ti" "手提" "提速" "提意" "提前" "提早" "提升" "提议" "提款" "提婚" "提包" "耳提" "提供" "麦麦提") + ("tiao" "空调" "调教" "烹调" "调羹" "调料" "调皮" "调控" "调节" "调整" "调价" "谐调" "协调" "调色" "调侃" "调味" "失调" "调治" "调频" "调剂" "调停" "调休" "调解") + ("ting" "域町") + ("tui" "褪色" "褪毛") + ("tuo" "拓宽" "拓荒" "开拓" "落拓" "拓展" "拓印") + ("tun" "囤积" "囤聚") + ("wei" "响尾" "尾巴" "尾灯" "船尾" "追尾" "尾椎" "月尾" "燕尾" "尾数" "年尾" "岁尾" "鸢尾" "凤尾" "彗尾" "尾翼" "结尾" "遗之" "龙尾" "齐鑫尾" "麻尾" "麦度" "鹿尾") + ("wu" "可恶" "交恶" "好恶" "厌恶" "憎恶" "嫌恶" "痛恶" "深恶") + ("wan" "藤蔓" "枝蔓" "瓜蔓" "蔓儿" "莞尔" "百万") + ("xia" "虾仁" "青虾" "大虾" "虾皮" "对虾" "虾子" "虾酱" "虾兵" "虾米" "龙虾" "噶厦" "厦门" "吓唬" "吓人" "惊吓" "天虾" "龙虾" "皮皮虾" "麦虾") + ("xi" "栖栖" "关系" "星系" "水系" "系念" "体系" "联系" "系列" "菜系" "世系" "蹊径") + ("xiao" "学校" "切削" "削面" "刀削" "刮削") + ("xian" "纤细" "光纤" "纤巧" "纤柔" "纤小" "纤维" "纤瘦" "纤纤" "化纤" "纤秀" "棉纤" "纤尘") + ("xiang" "街巷" "僻巷" "巷子" "龙门巷" "六巷" "龙湾巷" "龙港巷" "龙泉巷" "龙巷" "龙妙巷" "龄巷" "齐家巷" "鼓楼巷" "鼓巷" "黎明巷" "麻子巷" "麻园巷" "麦子巷" "鹊巷") + ("xie" "解数" "出血" "采血" "换血" "血糊" "尿血" "淤血" "放血" "血晕" "血淋" "便血" "吐血" "咯血" "叶韵" "蝎子") + ("xiu" "铜臭" "乳臭" "成宿" "星宿") + ("xin" "鸿信") + ("xing" "深省" "省视" "内省" "不省人事" "省悟" "省察" "旅行" "例行" "行程" "行乐" "龙行" "人行" "流行" "先行" "行星" "品行" "发行" "行政" "风行" "龙行" "龍行" "麟行") + ("xu" "牧畜" "畜产" "畜牧" "畜养" "气吁" "喘吁" "吁吁" "麦埂圩") + ("xue" "削减" "削弱" "削瘦" "削球" "削平" "削价" "瘦削" "剥削" "削职" "删削" "削肩" "吸血") + ("xun" "荨麻" "荨麻疹") + ("yao" "发疟" "疟子" "约斤" "称约" "钥匙" "金钥") + ("yan" "吞咽" "咽气" "咽喉" "殷红" "腌制" "腌肉" "腌菜" "烟草" "名烟" "烟酒") + ("ye" "抽咽" "哽咽" "咽炎" "下咽" "呜咽" "幽咽" "悲咽" "绿叶" "叶子" "荷叶" "落叶" "菜叶" "红叶" "树叶" "枫叶" "茶叶" "鸿葉") + ("yi" "自艾" "惩艾" "后尾") + ("yin" "殷勤" "殷墟" "殷切" "殷鉴") + ("yo" "杭育") + ("yu" "谷浑" "呼吁" "吁请" "吁求" "体育" "教育" "育儿" "熨帖" "熨烫") + ("yun" "熨斗" "电熨斗") + ("yue" "乐音" "器乐" "乐律" "乐章" "音乐" "乐理" "民乐" "乐队" "声乐" "奏乐" "弦乐" "乐坛" "管乐" "配乐" "乐曲" "乐谱" "锁钥" "密钥" "乐团" "鼓乐社" "乐器" "栎阳" "约会") + ("zan" "积攒") + ("zang" "宝藏" "藏历" "藏文" "藏香" "藏语" "藏青" "藏族" "藏医" "藏戏" "藏药" "藏蓝") + ("ze" "择善") + ("zeng" "曾孙" "曾祖") + ("za" "绑扎" "结扎" "包扎" "捆扎") + ("zai" "牛仔" "龟仔" "龙仔" "鼻仔" "羊仔" "仔仔" "麻仔" "麵包仔" "麦旺仔" "鸿仔" "煲仔" "福仔") + ("zha" "马扎" "挣扎" "扎啤" "扎根" "扎手" "扎针" "扎花" "扎堆" "扎营" "扎实" "稳扎" "柞水" "麻扎镇" "麻扎乡" "栅栏") + ("zhai" "择菜") + ("zhan" "不粘" "粘贴" "粘连") + ("zhao" "朝朝" "明朝" "朝晖" "朝夕" "朝思" "有朝" "今朝" "朝气" "朝三" "朝秦" "朝霞" "鹰爪" "龙爪" "魔爪" "爪牙" "失着" "着数" "龙爪槐") + ("zhe" "破折" "打折" "曲折" "折冲" "存折" "折合" "折旧" "折纸" "骨折" "折返" "折价" "折算" "波折" "折扇" "对折" "不折" "折扣" "七折" "折中" "拙著" "要著" "著文" "新著" "本着" "对着") + ("zhi" "标识" "嘎吱" "咯吱" "吱扭" "吱吱" "繁殖" "增殖" "生殖" "殖民") + ("zhong" "重量" "鹏重") + ("zhu" "属意" "著录" "撰著" "名著" "专著" "著述" "著作" "显著" "昭著" "原著" "著名" "著书" "遗著" "论著" "著者" "编著" "卓著" "译著" "著称") + ("zhui" "椎骨" "尾椎" "椎间" "腰椎" "胸椎" "颈椎" "脊椎") + ("zhuo" "执著" "着装" "着落" "着意" "着力" "附着" "着笔" "胶着" "着手" "着重" "穿着" "衣着" "执着" "着眼" "着墨" "着实" "沉着" "着陆" "着想" "着色") + ("zhuang" "幢房" "一幢" "幢楼") + ("zi" "吱声" "来兹" "今兹" "仔细" "仔猪") + ("zu" "沐足" "足道") + ("zuo" "撮毛" "小撮" "柞绸" "柞蚕" "柞树" "柞木") + ("zui" "咀唇" "尖沙咀" "黄达咀" "黄土咀" "鹰咀")) + "多音字对应的词组。") + (defvar pyim-pymap--py2cchar-cache1 nil "拼音查汉字功能需要的变量. @@ -478,11 +893,18 @@ 类似: \"艾\" -> (\"yi\" \"ai\")") +(defvar pyim-pymap--py2duoyinzi-cache1 nil + "汉字转拼音功能需要的变量") + +(defvar pyim-pymap--py2duoyinzi-cache2 nil + "汉字转拼音功能需要的变量") + ;; ** "汉字 -> 拼音" 以及 "拼音 -> 汉字" 的转换函数 (defun pyim-pymap-cache-create (&optional force) "创建 pymap 相关的 cache." (pyim-pymap--cchar2py-cache-create force) - (pyim-pymap--py2cchar-cache-create force)) + (pyim-pymap--py2cchar-cache-create force) + (pyim-pymap--py2duoyinzi-cache-create force)) (defun pyim-pymap--cchar2py-cache-create (&optional force) "Build pinyin cchar->pinyin hashtable from `pyim-pymap'. @@ -533,6 +955,28 @@ If FORCE is non-nil, FORCE build." (puthash key (delete-dups `(,@orig-value ,@cchars)) pyim-pymap--py2cchar-cache3))))))) +(defun pyim-pymap--py2duoyinzi-cache-create (&optional force) + "构建 pinyin 到多音字的缓存,如果 FORCE 设置为 t, 强制更新索引。" + (when (or force + (not pyim-pymap--py2duoyinzi-cache1) + (not pyim-pymap--py2duoyinzi-cache2)) + (setq pyim-pymap--py2duoyinzi-cache1 + (make-hash-table :size 50000 :test #'equal)) + (setq pyim-pymap--py2duoyinzi-cache2 + (make-hash-table :size 50000 :test #'equal)) + + (dolist (x pyim-pymap-duoyinzi-chars) + (let* ((py (car x)) + (chars (delete-dups + `(,@(cdr x) ,@(gethash py pyim-pymap--py2duoyinzi-cache1))))) + (puthash py chars pyim-pymap--py2duoyinzi-cache1))) + + (dolist (x pyim-pymap-duoyinzi-words) + (let* ((py (car x)) + (words (delete-dups + `(,@(cdr x) ,@(gethash py pyim-pymap--py2duoyinzi-cache2))))) + (puthash py words pyim-pymap--py2duoyinzi-cache2))))) + (defun pyim-pymap-py2cchar-get (pinyin &optional equal-match return-list include-seperator) "获取拼音与 PINYIN 想匹配的所有汉字. @@ -578,6 +1022,14 @@ pyim 在特定的时候需要读取一个汉字的拼音,这个工作由此完 (when (= (length key) 1) (gethash key pyim-pymap--cchar2py-cache)))) +(defun pyim-pymap-py2duoyinzi-get (pinyin &optional return-chars) + "获取与 PINYIN 想匹配的多音字(词)。" + (pyim-pymap--py2duoyinzi-cache-create) + (when (and pinyin (stringp pinyin)) + (if return-chars + (gethash pinyin pyim-pymap--py2duoyinzi-cache1) + (gethash pinyin pyim-pymap--py2duoyinzi-cache2)))) + ;; * Footer (provide 'pyim-pymap) diff --git a/tests/pyim-tests.el b/tests/pyim-tests.el index 502b7f26d1..8ba7446dd8 100644 --- a/tests/pyim-tests.el +++ b/tests/pyim-tests.el @@ -291,6 +291,10 @@ '("阿" "啊" "呵" "腌" "|" "嗄" "吖" "锕" "|" "|" "錒"))) (should (equal (pyim-pymap-py2cchar-get "zhua" t) '("抓挝爪||髽|膼撾檛簻"))) + (should (equal (pyim-pymap-py2duoyinzi-get "ai") + '("艾滋" "艾蒿" "未艾"))) + (should (equal (pyim-pymap-py2duoyinzi-get "ai" t) + '("艾"))) (should (equal (mapcar (lambda (x) (concat (substring x 0 1) (substring x -1))) @@ -791,38 +795,22 @@ "我爱-北京-天安-门")))) (ert-deftest pyim-tests-pyim-cstring-to-pinyin () - (let ((pyim-dhashcache-code2word (make-hash-table :test #'equal)) - (str "银行很行")) - ;; Create code2word dcache. - (puthash "yin-hang-hen-xing" (list "银行很行") pyim-dhashcache-code2word) - ;; pyim-cstring-split-to-list - (should (equal (pyim-cstring-to-pinyin "银行很行") - (concat "yinxinghenxing yinxinghenheng yinxinghenhang " - "yinhenghenxing yinhenghenheng yinhenghenhang " - "yinhanghenxing yinhanghenheng yinhanghenhang"))) - (should (equal (pyim-cstring-to-pinyin "银行很行" t) - "yxhx yxhh yxhh yhhx yhhh yhhh yhhx yhhh yhhh")) - (should (equal (pyim-cstring-to-pinyin "银行很行" nil "-") - (concat "yin-xing-hen-xing yin-xing-hen-heng yin-xing-hen-hang " - "yin-heng-hen-xing yin-heng-hen-heng yin-heng-hen-hang " - "yin-hang-hen-xing yin-hang-hen-heng yin-hang-hen-hang"))) - (should (equal (pyim-cstring-to-pinyin "银行很行" nil "-" t) - '("yin-xing-hen-xing" "yin-xing-hen-heng" "yin-xing-hen-hang" - "yin-heng-hen-xing" "yin-heng-hen-heng" "yin-heng-hen-hang" - "yin-hang-hen-xing" "yin-hang-hen-heng" "yin-hang-hen-hang"))) - (should (equal (pyim-cstring-to-pinyin "银行很行" nil "-" t t) - '("yin-xing-hen-xing"))) - (should (equal (pyim-cstring-to-pinyin "银行很行" nil "-" nil nil t) - "yin-hang-hen-xing")) - (should (equal (pyim-cstring-to-pinyin "Hello 银行很行 Hi" nil "-" nil t) - "Hello -yin-xing-hen-xing- Hi")) - ;; FIXME: 这个 test 是不合理的,不过暂时找不到简单的修复方式。 - (should (equal (pyim-cstring-to-pinyin "Hello 银行很行 Hi" nil "-" nil nil t) - (concat "Hello -yin-xing-hen-xing- Hi Hello -yin-xing-hen-heng- Hi " - "Hello -yin-xing-hen-hang- Hi Hello -yin-heng-hen-xing- Hi " - "Hello -yin-heng-hen-heng- Hi Hello -yin-heng-hen-hang- Hi " - "Hello -yin-hang-hen-xing- Hi Hello -yin-hang-hen-heng- Hi " - "Hello -yin-hang-hen-hang- Hi"))))) + (should (equal (pyim-cstring--adjust-duoyinzi + "银行传说" '(("yin") ("xing" "heng" "hang") + ("zhuan" "chuan") ("yue" "shuo" "shui"))) + '(("yin") ("hang") ("chuan") ("shuo")))) + + ;; pyim-cstring-split-to-list + (should (equal (pyim-cstring-to-pinyin "银行传说") "yinhangchuanshuo")) + (should (equal (pyim-cstring-to-pinyin "银行传说" t) "yhcs")) + (should (equal (pyim-cstring-to-pinyin "银行传说" nil "-") "yin-hang-chuan-shuo")) + (should (equal (pyim-cstring-to-pinyin "银行传说" nil "-" t) '("yin-hang-chuan-shuo"))) + (should (equal (pyim-cstring-to-pinyin "银行传说" nil "-" t t) '("yin-hang-chuan-shuo"))) + (should (equal (pyim-cstring-to-pinyin "Hello 银行传说 Hi" nil "-" nil t) + "Hello -yin-hang-chuan-shuo- Hi")) + ;; FIXME: 这个 test 是不合理的,不过暂时找不到简单的修复方式。 + (should (equal (pyim-cstring-to-pinyin "Hello 银行传说 Hi" nil "-" nil nil t) + "Hello -yin-hang-chuan-shuo- Hi"))) (ert-deftest pyim-tests-pyim-cstring-to-xingma () (let ((pyim-dhashcache-word2code (make-hash-table :test #'equal))