dict: 拆字词典更新 (#709)

* update radical dict
* update search.lua
* chore: move search.lua down ( after simplifier ), remove shadow cand condition in lua
This commit is contained in:
mirtlecn 2024-02-27 22:15:12 +08:00 committed by GitHub
parent 4dde31f156
commit ef1eb01d42
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 412 additions and 808 deletions

View File

@ -83,11 +83,11 @@ engine:
filters:
- lua_filter@corrector # 错音错字提示
- reverse_lookup_filter@radical_reverse_lookup # 部件拆字滤镜
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@autocap_filter # 英文自动大写
- lua_filter@pin_cand_filter # 置顶候选项(顺序要求:置顶候选项 > Emoji > 简繁切换)
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -83,11 +83,11 @@ engine:
filters:
- lua_filter@corrector # 错音错字提示
- reverse_lookup_filter@radical_reverse_lookup # 部件拆字滤镜
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@autocap_filter # 英文自动大写
- lua_filter@pin_cand_filter # 置顶候选项(顺序要求:置顶候选项 > Emoji > 简繁切换)
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -83,11 +83,11 @@ engine:
filters:
- lua_filter@corrector # 错音错字提示
- reverse_lookup_filter@radical_reverse_lookup # 部件拆字滤镜
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@autocap_filter # 英文自动大写
- lua_filter@pin_cand_filter # 置顶候选项(顺序要求:置顶候选项 > Emoji > 简繁切换)
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -83,11 +83,11 @@ engine:
filters:
- lua_filter@corrector # 错音错字提示
- reverse_lookup_filter@radical_reverse_lookup # 部件拆字滤镜
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@autocap_filter # 英文自动大写
- lua_filter@pin_cand_filter # 置顶候选项(顺序要求:置顶候选项 > Emoji > 简繁切换)
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -83,11 +83,11 @@ engine:
filters:
- lua_filter@corrector # 错音错字提示
- reverse_lookup_filter@radical_reverse_lookup # 部件拆字滤镜
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@autocap_filter # 英文自动大写
- lua_filter@pin_cand_filter # 置顶候选项(顺序要求:置顶候选项 > Emoji > 简繁切换)
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -83,11 +83,11 @@ engine:
filters:
- lua_filter@corrector # 错音错字提示
- reverse_lookup_filter@radical_reverse_lookup # 部件拆字滤镜
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@autocap_filter # 英文自动大写
- lua_filter@pin_cand_filter # 置顶候选项(顺序要求:置顶候选项 > Emoji > 简繁切换)
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -1,10 +1,8 @@
-- Copyright (C) Mirtle <mirtle.cn@outlook.com>
-- [CC BY 3.0 DEED](https://creativecommons.org/licenses/by/3.0/deed)
-- License: CC BY-SA 4.0 DEED (https://creativecommons.org/licenses/by-sa/4.0/)
-- 使用说明:<https://github.com/mirtlecn/rime-radical-pinyin/blob/master/search.lua.md>
-- 感谢 [AuxFilter](https://github.com/HowcanoeWang/rime-lua-aux-code/blob/main/lua/aux_code.lua) 提供参考
local function alt_lua_punc(s)
if s then
return s:gsub("([%.%+%-%*%?%[%]%^%$%(%)%%])", "%%%1")
@ -15,202 +13,206 @@ end
local f = {}
-- 逻辑
-- 当在 engine 处直接指定了 namespace,则使用该 namespace 进行 schema 匹配
-- 当在 search_in_cand 节点下指定了 schema 和 db,则进行相应匹配
-- 当该节点下 schema 为 0 或者 false,或者不存在时,不进行相应匹配
function f.init(env)
local config = env.engine.schema.config
local ns = 'search'
f.if_schema_lookup = false
f.if_reverse_lookup = false
-- f.mem_main = Memory(env.engine, env.engine.schema)
-- local rules = config:get_list('preedit_rules')
-- if rules then
-- f.projection = Projection()
-- f.projection:load(rules)
-- end
-- 配置:辅码查字方法
-- --
-- 当在 engine 出直接指定了 namespace 则使用该 namespace 进行 schema 匹配
-- 当在 search_in_cand 节点下指定了 schema 和 db 则进行相应匹配
-- 当该节点下 schema 为 0 或者 false或者不存在时不进行相应匹配
-- --
f.schema = config:get_string(ns .. '/schema')
if f.schema == 'false' or f.schema == '0' then
goto checkdb
end
if not env.name_space:find('^%*') then
f.schema = env.name_space
end
if f.schema then
f.mem = Memory(env.engine, Schema(f.schema))
if not f.schema or f.schema == 'false' or f.schema == '0' or #f.schema == 0 then
goto checkdb
end
f.schema_search_limit = config:get_int(ns .. "/schema_search_limit") or 1000
::checkdb::
f.db = config:get_list(ns .. '/db')
f.if_schema_lookup = false
f.if_reverse_lookup = false
f.mem = Memory(env.engine, Schema(f.schema))
if f.schema and f.mem then
f.if_schema_lookup = true
-- log.error('if_schema_lookup: ' .. 'true')
f.schema_search_limit = config:get_int(ns .. "/schema_search_limit") or 1000
end
if f.db then
f.wildcard = config:get_string(ns .. "/wildcard") or "'"
::checkdb::
local db = config:get_list(ns .. '/db')
if db and db.size > 0 then
f.wildcard = alt_lua_punc(config:get_string(ns .. "/wildcard")) or "*"
f.db_table = {}
for i = 0, db.size - 1 do
table.insert(f.db_table, ReverseLookup(db:get_value_at(i).value))
end
f.if_reverse_lookup = true
-- log.error('if_reverse_lookup: ' .. 'true')
end
f.sort = config:get_bool(ns .. "/show_other_cands")
-- 反引号作为查找的引导符号,需要加入 speller 的字母表当中
f.search_key = config:get_string("key_binder/search") or config:get_string(ns .. "/key") or '`'
-- 处理一下输入码,如果还有没有上屏的词,保留辅助码,否则,清除上屏码
f.search_key_string = alt_lua_punc(f.search_key)
-- 如果不使用任何反查手段,则不接管选词逻辑
if not f.if_reverse_lookup and not f.if_schema_lookup then
return
end
-- 配置:辅码转换规则
-- --
-- 例如:- xlit/ABCD/1234/ 就可以用 ABCD 来输入 1234地球拼音音调
local fuma_format = config:get_list(ns .. '/fuma_format')
if fuma_format and fuma_format.size > 0 then
f.code_projection = Projection()
f.code_projection:load(fuma_format)
end
-- 配置:是否显示不符合辅码的候选
f.show_other_cands = config:get_bool(ns .. "/show_other_cands")
-- 配置:辅码引导符号,默认为反引号 `
f.search_key = config:get_string("key_binder/search") or config:get_string(ns .. "/key") or '`'
f.search_key_string = alt_lua_punc(f.search_key)
-- 配置seg tag
local tag = config:get_list(ns .. '/tags')
if tag and tag.size > 0 then
f.tag = {}
for i = 0, tag.size - 1 do
table.insert(f.tag, tag:get_value_at(i).value)
end
else
f.tag = {'abc'}
end
-- 配置:手动写入用户词库
local rules = config:get_list(ns .. '/input2code_format')
if rules and rules.size > 0 then
f.projection = Projection()
f.projection:load(rules)
f.mem_main = Memory(env.engine, env.engine.schema)
env.commit_notifier = env.engine.context.commit_notifier:connect(function(ctx)
if env.have_select_commit and env.commit_code then
local commit_text = ctx:get_commit_text()
f.update_dict_entry(commit_text, env.commit_code)
ctx.commit_history:push("user_phrase", commit_text)
env.have_select_commit = false
else
return
end
end)
end
-- 接管选词逻辑,是词组则始终保留引导码,否则直接上屏
env.notifier = env.engine.context.select_notifier:connect(function(ctx)
if not ctx.input:find("^[a-z;]+" .. f.search_key_string) then
return
end
local preedit = ctx:get_preedit()
local no_search_string = ctx.input:match("^(.-)" .. f.search_key_string)
-- log.warning('[no_search_string]: '..no_search_string)
local edit = preedit.text:match('^(.-)' .. f.search_key_string)
-- log.warning('[edit]: ' .. edit)
env.have_select_commit = true
ctx.input = no_search_string
if edit and edit:match('[a-z;]') then
ctx.input = ctx.input .. f.search_key
if edit and #edit > 0 and edit:match('[a-z;]') then
ctx.input = no_search_string .. f.search_key
else
ctx.input = no_search_string
env.commit_code = no_search_string
ctx:commit()
-- local t = f.entry()
-- log.warning(edit .. '|' .. no_search_string)
-- 手动推入历史记录
-- ctx.commit_history:push("user_phrase", edit)
-- 手动写入用户词库
-- f.update_dict_entry(edit, no_search_string)
end
end)
end
-- function f.update_dict_entry(s, code)
-- local codeLen = #code
-- if s == '' or (#code % 2 ~= 0) then
-- log.warning('Ignored!' .. s)
-- return 0
-- end
-- local e = DictEntry()
-- e.text = s
-- local custom_code = {}
-- for i = 1, #code, 2 do
-- local s = code:sub(i, i + 1)
-- local c = f.projection:apply(s, true)
-- table.insert(custom_code, c)
-- end
-- e.custom_code = table.concat(custom_code, " ") .. ' '
-- log.info("[search.lua]: " .. e.text .. ' ' .. e.custom_code)
-- f.mem_main:update_userdict(e, 1, "")
-- end
-- 此函数用于手动写入用户词库,目前仅对定长码(如双拼)有效
--- Manually write a committed phrase into the user dictionary.
-- Only works for fixed-length codes (e.g. double pinyin): `code` is cut into
-- two-byte chunks, each chunk is converted through `f.projection`, and the
-- converted chunks are joined with spaces to form the entry's custom code.
-- @param s    committed text to store
-- @param code raw input code typed for `s`; must be non-empty and of even length
-- @return 0 when the entry is ignored; nothing otherwise
function f.update_dict_entry(s, code)
    -- An empty code would pass the parity check below and produce an entry
    -- whose custom code is a single blank, so it is rejected explicitly.
    if #s == 0 or #code == 0 or (#code % 2 ~= 0) then
        log.warning('Ignored!' .. s)
        return 0
    end

    local e = DictEntry()
    e.text = s

    -- Convert the input code two bytes at a time.
    local custom_code = {}
    for i = 1, #code, 2 do
        custom_code[#custom_code + 1] = f.projection:apply(code:sub(i, i + 1), true)
    end
    -- The trailing blank is kept exactly as the original code wrote it.
    e.custom_code = table.concat(custom_code, " ") .. ' '

    local if_success = f.mem_main:update_userdict(e, 1, "")
    if if_success then
        log.info("[search.lua]: " .. e.text .. '|' .. e.custom_code .. 'was written into user_dict')
    else
        -- It was observed that an entry is sometimes logged but not actually
        -- written to the dictionary, hence the explicit failure branch.
        log.error('[search.lua]: ' .. e.text .. '|' .. e.custom_code .. 'update entry failed')
    end
end
-- 查询反查词典当中的匹配项,并且返回字表
-- 通过 schema 的方式查询(以辅码查字,然后对比候选,慢,但能够匹配到算法转换过的码)
-- 查询方案中的匹配项,并返回字表
--- Query the auxiliary schema's dictionary and collect the matching texts.
-- The search string is first run through `f.code_projection` when one is
-- configured, then looked up through `f.mem` with `f.schema_search_limit`
-- as the limit argument.
-- @param search_string auxiliary code typed after the search key
-- @return a set-like table: every matching entry text maps to `true`
--         (empty when the lookup yields nothing)
function f.dict_init(search_string)
    local query = search_string
    if f.code_projection then
        query = f.code_projection:apply(query, true)
    end

    local matched = {}
    if not f.mem:dict_lookup(query, true, f.schema_search_limit) then
        return matched
    end

    for entry in f.mem:iter_dict() do
        matched[entry.text] = true
    end
    return matched
end
-- 通过 schema 的方式查询(以码查字,然后轮询匹配,非常慢,但能够匹配到算法转换过的码)
-- 匹配候选
--- Tell whether a candidate text is present in a lookup set.
-- The first parameter used to be called `table`, which shadowed the standard
-- `table` library inside this function; Lua arguments are positional, so the
-- rename does not affect callers.
-- @param dict_table set-like table mapping text to `true` (as built by
--                   `f.dict_init`); a nil set is treated as "no match"
-- @param text       candidate text to test
-- @return `true` when `dict_table[text]` is exactly `true`, `false` otherwise
function f.dict_match(dict_table, text)
    if dict_table == nil then
        return false
    end
    return dict_table[text] == true
end
-- 通过 reverse db 查询(以字查码,然后比对辅码是否相同,比校快,但只能匹配未经算法转换的码)
-- 通过 reverse db 查询(以字查码,然后比对辅码是否相同,快,但只能匹配未经算法转换的码)
function f.reverse_lookup(text, s)
local list = f.db
s = s:gsub(f.wildcard, '.*')
s = s:gsub(f.wildcard, '.+')
if f.code_projection then
s = f.code_projection:apply(s, true)
end
-- log.error(s)
for i = 0, list.size - 1 do
local code = ReverseLookup(list:get_value_at(i).value):lookup(text)
if code:find(' ' .. s) or code:find('^' .. s) then
return true
for _, db in ipairs(f.db_table) do
local code = db:lookup(text)
for part in code:gmatch("%S+") do
if part:find(' ' .. s) or part:find('^' .. s) then
return true
end
end
end
return false
end
function f.func(input, env)
local input_code = env.engine.context.input
-- 当且仅当当输入码中含有辅码引导符号,并有有辅码存在,进入匹配逻辑
-- 当无任何查询方式存在,直接上屏
if (input_code:find("^[a-z;]+" .. f.search_key_string .. '.+$')) and (f.if_reverse_lookup or f.if_schema_lookup) then
f.search_string = input_code:match("^.*" .. f.search_key_string .. "(.*)$")
else
local fuma = env.engine.context.input:match("^[a-z;]+" .. f.search_key_string .. "(.+)$")
if not fuma or #fuma == 0 or (not f.if_reverse_lookup and not f.if_schema_lookup) then
for cand in input:iter() do
yield(cand)
end
return
end
-- 查字时是否单字优先
local if_single_char_first = env.engine.context:get_option("search_single_char")
local dict_table
if f.if_schema_lookup then
dict_table = f.dict_init(f.search_string)
end
local other_cand = {}
local long_word_cands = {}
if f.if_schema_lookup then
dict_table = f.dict_init(fuma)
end
for cand in input:iter() do
local type = cand.type -- 类型
local text = cand.text -- 候选文字
local comment = cand.comment
-- if utf8.len(text) > 1 and if_single_char_first then
-- table.insert(other_cand_last, cand)
-- goto skip
-- end
-- 处理经过 simplify 转化过的候选,使之能够正确匹配
if cand:get_dynamic_type() == "Shadow" then
local originalCand = cand:get_genuine()
cand = ShadowCandidate(originalCand, originalCand.type, cand.text, cand.comment)
type = cand.type
text = cand.text
end
-- 只有 script_translator 下的用户词和词才去匹配
if (type == 'phrase' or type == 'user_phrase') then
-- 当候选多于一个汉字,则取第一个匹配
if utf8.len(text) > 1 then
text = text:sub(1, utf8.offset(text, 2) - 1)
end
else
table.insert(other_cand, cand)
if cand.type == 'sentence' then
goto skip
end
local text = cand.text
-- 当候选多于一个字,则取第一个匹配
if utf8.len(text) and utf8.len(text) > 1 then
text = text:sub(1, utf8.offset(text, 2) - 1)
end
-- 匹配逻辑
if (f.if_reverse_lookup and f.reverse_lookup(text, f.search_string)) or
if (f.if_reverse_lookup and f.reverse_lookup(text, fuma)) or
(f.if_schema_lookup and f.dict_match(dict_table, text)) then
if if_single_char_first and utf8.len(cand.text) > 1 then
table.insert(long_word_cands, cand)
@ -222,24 +224,34 @@ function f.func(input, env)
end
::skip::
end
-- 上屏其余的候选
for i, cand in ipairs(long_word_cands) do
yield(cand)
end
if f.sort then
if f.show_other_cands then
for i, cand in ipairs(other_cand) do
yield(cand)
end
end
end
function f.fini(env)
if not f.if_reverse_lookup and not f.if_schema_lookup then
return
--- Check whether a segment carries at least one of the configured tags.
-- Walks the names listed in `f.tag` and tests each against `seg.tags`.
-- @param seg segment whose `tags` field is indexed by tag name
-- @param env filter environment (not used by this function)
-- @return `true` on the first tag found in `seg.tags`, `false` if none match
function f.tags_match(seg, env)
    for _, tag_name in ipairs(f.tag) do
        local present = seg.tags[tag_name]
        if present then
            return true
        end
    end
    return false
end
function f.fini(env)
if f.if_reverse_lookup or f.if_schema_lookup then
env.notifier:disconnect()
if f.projection then
env.commit_notifier:disconnect()
end
end
env.notifier:disconnect()
-- env.commit_notifier:disconnect()
end
return f

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,7 @@
# Rime schema settings
# encoding: utf-8
# Copyright (C) Mirtle <mirtle.cn@outlook.com>
# License: CC BY-SA 4.0 DEED (https://creativecommons.org/licenses/by-sa/4.0/)
schema:
schema_id: radical_pinyin

View File

@ -72,13 +72,13 @@ engine:
filters:
- lua_filter@corrector # 错音错字提示
- reverse_lookup_filter@radical_reverse_lookup # 部件拆字滤镜
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@autocap_filter # 英文自动大写
- lua_filter@v_filter # v 模式 symbols 优先
- lua_filter@pin_cand_filter # 置顶候选项(顺序要求:置顶候选项 > Emoji > 简繁切换)
- lua_filter@long_word_filter # 长词优先(顺序要求:长词优先 > Emoji)
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
- lua_filter@search@radical_pinyin # 部件拆字辅码
- lua_filter@reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重