From e5d122935f9629b9df1cb1ed1ec29ba12eee6041 Mon Sep 17 00:00:00 2001 From: Dvel Date: Tue, 2 Jan 2024 20:51:41 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E5=AE=8C=E5=96=84=E9=95=BF=E8=AF=8D?= =?UTF-8?q?=E4=BC=98=E5=85=88=E7=9A=84=E9=80=BB=E8=BE=91=20close=20#592?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 不提升包含英文、数字、emoji、假名的候选项 --- lua/long_word_filter.lua | 106 ++++++++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/lua/long_word_filter.lua b/lua/long_word_filter.lua index a8f199f..0711132 100644 --- a/lua/long_word_filter.lua +++ b/lua/long_word_filter.lua @@ -1,45 +1,81 @@ -- 长词优先(提升「西安」「提案」「图案」「饥饿」等词汇的优先级) -- 感谢&参考于: https://github.com/tumuyan/rime-melt --- 修改:不提升英文和中英混输的 +-- 不提升包含英文、数字、emoji、假名的候选项 + +local function isEmoji(text) + for _, char in utf8.codes(text) do + if (char >= 0x1F600 and char <= 0x1F64F) or -- Emoticons + (char >= 0x1F300 and char <= 0x1F5FF) or -- Misc Symbols and Pictographs + (char >= 0x1F680 and char <= 0x1F6FF) or -- Transport and Map + (char >= 0x2600 and char <= 0x26FF) or -- Misc symbols + (char >= 0x2700 and char <= 0x27BF) or -- Dingbats + (char >= 0xFE00 and char <= 0xFE0F) or -- Variation Selectors + (char >= 0x1F900 and char <= 0x1F9FF) or -- Supplemental Symbols and Pictographs + (char >= 0x1F1E6 and char <= 0x1F1FF) then -- Flags (iOS) + return true + end + end + return false +end + +local function containsJapaneseKana(str) + for _, code in utf8.codes(str) do + -- 检查平假名的 Unicode 范围(0x3040 至 0x309F) + if code >= 0x3040 and code <= 0x309F then + return true + end + -- 检查片假名的 Unicode 范围(0x30A0 至 0x30FF) + if code >= 0x30A0 and code <= 0x30FF then + return true + end + end + return false +end + local M = {} function M.init(env) - -- 提升 count 个词语,插入到第 idx 个位置,默认 2、4。 - local config = env.engine.schema.config - env.name_space = env.name_space:gsub("^*", "") - M.idx = config:get_int(env.name_space .. "/idx") or 4 - M.count = config:get_int(env.name_space .. "/count") or 2 + -- 提升 count 个词语,插入到第 idx 个位置,默认 2、4。 + local config = env.engine.schema.config + env.name_space = env.name_space:gsub("^*", "") + M.count = config:get_int(env.name_space .. "/count") or 2 + M.idx = config:get_int(env.name_space .. "/idx") or 4 end -function M.func(input, env) - local l = {} - local firstWordLength = 0 -- 记录第一个候选词的长度,提前的候选词至少要比第一个候选词长 - local done = 0 -- 记录筛选了多少个词条(只提升 count 个词的权重) - local i = 1 - for cand in input:iter() do - -- 找到要提升的词 - local leng = utf8.len(cand.text) - if (firstWordLength < 1 or i < M.idx) then - i = i + 1 - firstWordLength = leng - yield(cand) - elseif ((leng > firstWordLength) and (done < M.count)) and (string.find(cand.text, "[%w%p%s]+") == nil) then - yield(cand) - done = done + 1 - else - table.insert(l, cand) - end - -- 找齐了或者 l 太大了,就不找了 - if (done == M.count) or (#l > 50) then - break - end - end - for _, cand in ipairs(l) do - yield(cand) - end - for cand in input:iter() do - yield(cand) - end +function M.func(input) + local l = {} + local firstWordLength = 0 -- 记录第一个候选词的长度,提前的候选词至少要比第一个候选词长 + local done = 0 -- 记录筛选了多少个词条(只提升 count 个词的权重) + local i = 1 + for cand in input:iter() do + local leng = utf8.len(cand.text) + -- 只以第一个候选项的长度作为参考 + if firstWordLength < 1 then + firstWordLength = leng + end + -- 不处理 M.idx 之前的候选项 + if i < M.idx then + i = i + 1 + yield(cand) + -- 长词直接 yield,其余的放到 l 里 + elseif leng <= firstWordLength or cand.text:find("[%a%d]") or containsJapaneseKana(cand.text) or isEmoji(cand.text) then + table.insert(l, cand) + else + yield(cand) + done = done + 1 + end + -- 找齐了或者 l 太大了,就不找了,一般前 50 个就够了 + if done == M.count or #l > 50 then + break + end + end + -- yield l 及后续的候选项 + for _, cand in ipairs(l) do + yield(cand) + end + for cand in input:iter() do + yield(cand) + end end return M