feat: 词条隐藏、降频 (#249) close #242

This commit is contained in:
Shingo 2023-05-07 18:20:42 +08:00 committed by GitHub
parent 299ae93309
commit b70a2a7cb7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 516 additions and 7 deletions

View File

@ -168,6 +168,9 @@ key_binder:
# Lua 配置: 以词定字(上屏当前词句的第一个或最后一个字)
select_first_character:
select_last_character: "grave"
# Lua 配置: 词条隐藏、降频
# turn_down_cand: "Control+j" # 匹配当前输入码后隐藏指定的候选字词 或候选词条放到第四候选位置
# drop_cand: "Control+d" # 强制删词, 无视输入的编码
bindings:
# Tab / Shift+Tab 切换光标至下/上一个拼音

View File

@ -51,7 +51,8 @@ switches:
# 输入引擎
engine:
processors:
- lua_processor@*select_character # 以词定字
- lua_processor@*select_character # 以词定字
# - lua_processor@*cold_word_drop.processor # 词条隐藏、降频
- ascii_composer
- recognizer
- key_binder
@ -78,7 +79,8 @@ engine:
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
# - lua_filter@*is_in_user_dict # 为用户词典中(输入过)的内容结尾加上一个星号 *
- lua_filter@*reduce_english_filter # 降低部分英语单词在候选项的位置
- lua_filter@*cold_word_drop.filter # 强制删词, 词条降频(选中高亮的词条放到第四位)
- lua_filter@*reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -51,7 +51,8 @@ switches:
# 输入引擎
engine:
processors:
- lua_processor@*select_character # 以词定字
- lua_processor@*select_character # 以词定字
# - lua_processor@*cold_word_drop.processor # 词条隐藏、降频
- ascii_composer
- recognizer
- key_binder
@ -78,6 +79,7 @@ engine:
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
# - lua_filter@*is_in_user_dict # 为用户词典中(输入过)的内容结尾加上一个星号 *
# - lua_filter@*cold_word_drop.filter # 词条隐藏、降频
- lua_filter@*reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -51,7 +51,8 @@ switches:
# 输入引擎
engine:
processors:
- lua_processor@*select_character # 以词定字
- lua_processor@*select_character # 以词定字
# - lua_processor@*cold_word_drop.processor # 词条隐藏、降频
- ascii_composer
- recognizer
- key_binder
@ -78,6 +79,7 @@ engine:
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
# - lua_filter@*is_in_user_dict # 为用户词典中(输入过)的内容结尾加上一个星号 *
# - lua_filter@*cold_word_drop.filter # 词条隐藏、降频
- lua_filter@*reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -51,7 +51,8 @@ switches:
# 输入引擎
engine:
processors:
- lua_processor@*select_character # 以词定字
- lua_processor@*select_character # 以词定字
# - lua_processor@*cold_word_drop.processor # 词条隐藏、降频
- ascii_composer
- recognizer
- key_binder
@ -78,7 +79,8 @@ engine:
- simplifier@emoji # Emoji
- simplifier@traditionalize # 简繁切换
# - lua_filter@*is_in_user_dict # 为用户词典中(输入过)的内容结尾加上一个星号 *
- lua_filter@*reduce_english_filter # 降低部分英语单词在候选项的位置
- lua_filter@*cold_word_drop.filter # 强制删词, 词条降频(选中高亮的词条放到第四位)
- lua_filter@*reduce_english_filter # 降低部分英语单词在候选项的位置
- uniquifier # 去重

View File

@ -0,0 +1,80 @@
#! /usr/bin/env lua
--
-- debugtool.lua
-- Copyright (C) 2021 Shewer Lu <shewer@gmail.com>
--
-- Distributed under terms of the MIT license.
--
-- puts(tag,...)
-- DEBUG --> log.error
-- WARN --> log.warning
-- INFO --> log.info
-- CONSOLE --> print
--
-- ex:
-- test.lua
--
-- local puts = require 'tools/debugtool'
-- --set tag D103 C102
-- local D103= DEBUG .. "103"
-- local C102= CONSOLE .. "102"
-- local C103= nil
--
--
-- puts(ERROR,__FILE__(),__LINE__(),__FUNC__(), 1, 2 , 3 )
-- --> log.error( "error" .. tran_msg(...))
--
-- puts(DEBUG,__FILE__(),__LINE__(),__FUNC__(), 1, 2 , 3 )
-- --> log.error( DEBUG .. tran_msg(...))
--
-- puts(D103,__FILE__(),__LINE__(),__FUNC__(), 1 2 3)
-- --> log.error("trace103" .. tran_msg(...)
--
-- puts(C102,__FILE__(),__LINE__(),__FUNC__(), 1 2 3)
-- --> print("console102" .. tran_msg(...)
--
-- puts(C103,__FILE__(),__LINE__(),__FUNC__(), 1 2 3)
-- --> pass
--
--
--
-- puts(DEBUG,__FILE__(),__LINE__(),__FUNC__() , ...)
-- puts(INFO,__FILE__(),__LINE__(),__FUNC__() , ...)
--
-- global variable
--- Return the source name of the chunk running at a given stack level.
-- The original read the misspelled field `soruce`, so it always returned nil.
-- @param n stack level handed to debug.getinfo; defaults to 2, i.e. the
--          function that called __FILE__
-- @return the `source` field reported by debug.getinfo, or nil when the
--         requested level does not exist
function __FILE__(n)
  local info = debug.getinfo(n or 2, 'S')
  return info and info.source
end
--- Return the line currently executing at a given stack level.
-- @param n stack level handed to debug.getinfo; defaults to 2, i.e. the
--          function that called __LINE__
-- @return the `currentline` field reported by debug.getinfo
function __LINE__(n)
  local level = n or 2
  local info = debug.getinfo(level, 'l')
  return info.currentline
end
--- Return the name of the function running at a given stack level.
-- @param n stack level handed to debug.getinfo; defaults to 2, i.e. the
--          function that called __FUNC__
-- @return the `name` field reported by debug.getinfo (may be nil when Lua
--         cannot infer a name)
function __FUNC__(n)
  local level = n or 2
  local info = debug.getinfo(level, 'n')
  return info.name
end
-- Global tag prefixes understood by puts(). A tag is routed by the prefix it
-- starts with, so callers may append a suffix (e.g. DEBUG .. "103").
-- Setting one of these globals to nil/false disables that branch in puts().
INFO="log"
WARN="warn"
ERROR="error"
DEBUG="trace"
CONSOLE="console"
--- Join all arguments into one message string.
-- The result starts with a tab, followed by ": " .. tostring(arg) for every
-- argument visited by next() over the packed argument table.
-- @return the formatted message ("\t" alone when called without arguments)
local function tran_msg(...)
  local pieces = { "\t" }
  for _, value in next, { ... } do
    pieces[#pieces + 1] = ": " .. tostring(value)
  end
  return table.concat(pieces)
end
--- Emit a message through the sink selected by the tag prefix.
-- INFO -> log.info, WARN -> log.warning, ERROR and DEBUG -> log.error,
-- CONSOLE -> print. When the global `log` (or the needed field) is missing,
-- print is used instead. Non-string tags and unknown prefixes are ignored.
-- @param tag string starting with one of the global tag prefixes
-- @param ... values appended to the tag via tran_msg
local function puts(tag, ...)
  if type(tag) ~= "string" then
    return
  end

  -- A prefix global set to nil/false disables its level.
  local function tagged(prefix)
    return prefix and tag:match("^" .. prefix)
  end

  local sink
  if tagged(INFO) then
    sink = log and log.info or print
  elseif tagged(WARN) then
    sink = log and log.warning or print
  elseif tagged(ERROR) then
    sink = log and log.error or print
  elseif tagged(DEBUG) then
    sink = log and log.error or print
  elseif tagged(CONSOLE) then
    sink = print
  else
    return
  end

  sink(tag .. tran_msg(...))
end
return puts

View File

@ -0,0 +1,4 @@
-- Candidate texts that the cold_word_drop filter never shows, whatever the
-- input code is (the filter only compares cand.text against this list).
-- The cold_word_drop processor rewrites this file whenever a word is dropped,
-- so manual edits and comments may be overwritten.
local drop_words =
{ "示~例~",
}
return drop_words

View File

@ -0,0 +1,55 @@
local drop_list = require("cold_word_drop.drop_words")
local hide_list = require("cold_word_drop.hide_words")
local turndown_freq_list = require("cold_word_drop.turndown_freq_words")
--- Linear membership test on an array-like table.
-- Kept local so the filter does not depend on table.find_index, which is only
-- defined once cold_word_drop.metatable has been loaded by another module.
-- @param list array-like table, or nil
-- @param value value to look for
-- @return true when `value` is one of the array entries
local function list_contains(list, value)
  if not list then
    return false
  end
  for _, item in ipairs(list) do
    if item == value then
      return true
    end
  end
  return false
end

--- Candidate filter implementing word dropping, hiding and frequency turndown.
--
-- * A candidate whose text is in drop_list is never yielded.
-- * A candidate listed in hide_list under the current code (cand.preedit with
--   spaces removed) is never yielded.
-- * A candidate listed in turndown_freq_list under the current code is kept
--   out of the first `idx` positions and emitted right after them.
--
-- Changes compared with the previous implementation:
-- * no dependency on the global table.find_index helper;
-- * candidates that come after the 50-entry look-ahead window are also
--   checked against the drop/hide lists instead of being passed through;
-- * unused locals removed.
--
-- @param input translation object; candidates are read through input:iter()
-- @param env   filter environment (unused, kept for the filter signature)
local function filter(input, env)
  local idx = 3 -- demoted entries go after the third candidate, i.e. fourth place
  local i = 1
  local cands = {} -- candidates postponed until the first `idx` slots are filled

  local function is_removed(cand, code)
    return list_contains(drop_list, cand.text)
      or list_contains(hide_list[cand.text], code)
  end

  for cand in input:iter() do
    local code = string.gsub(cand.preedit, ' ', '')
    if not is_removed(cand, code) then
      if i <= idx and not list_contains(turndown_freq_list[cand.text], code) then
        i = i + 1
        ---@diagnostic disable-next-line: undefined-global
        yield(cand)
      else
        cands[#cands + 1] = cand
      end
    end
    -- Bound the look-ahead so the whole translation is not consumed eagerly.
    if #cands > 50 then
      break
    end
  end

  for _, cand in ipairs(cands) do
    ---@diagnostic disable-next-line: undefined-global
    yield(cand)
  end

  -- Remaining candidates (only reached after the look-ahead break above).
  for cand in input:iter() do
    local code = string.gsub(cand.preedit, ' ', '')
    if not is_removed(cand, code) then
      ---@diagnostic disable-next-line: undefined-global
      yield(cand)
    end
  end
end
return filter

View File

@ -0,0 +1,4 @@
-- Maps a candidate text to the list of input codes (spaces removed) under
-- which the cold_word_drop filter hides that candidate.
-- The cold_word_drop processor rewrites this file whenever a word is hidden,
-- so manual edits and comments may be overwritten.
local hide_words =
{ ["示~例~"] = { "shil", "shili", },
}
return hide_words

View File

@ -0,0 +1,163 @@
-- create metatable
-- Keep the built-in type() reachable as the global `orgtype`, then replace the
-- global type() with a variant that reports a table's `_cname` field (when it
-- is truthy) instead of "table".
orgtype = type
function type(obj)
  local builtin = orgtype(obj)
  if builtin ~= "table" then
    return builtin
  end
  local cname = obj._cname
  if cname then
    return cname
  end
  return builtin
end
--- Give a table access to the `table` library through method syntax.
-- When the first argument is a table (as reported by the global type()), that
-- table itself receives the metatable and is returned; otherwise all arguments
-- are packed into a new table which receives the metatable.
-- @return the table carrying { __index = table } as metatable
function metatable(...)
  local first = ...
  local mt = { __index = table }
  if first and type(first) == "table" then
    return setmetatable(first, mt)
  end
  return setmetatable({ ... }, mt)
end
--- Check whether a table already exposes the table-library methods.
-- Non-table values are returned untouched. A table whose `each` field is
-- truthy is returned as is; any other table is passed through metatable().
function metatable_chk(tab)
  if type(tab) ~= "table" then
    return tab
  end
  return (tab.each and tab) or metatable(tab)
end
--- Call func(value, index) for every index from 1 to #tab.
-- The length is read once before the loop starts.
-- @return tab, to allow chaining
function table.eachi(tab, func)
  local count = #tab
  local index = 1
  while index <= count do
    func(tab[index], index)
    index = index + 1
  end
  return tab
end
--- Call func(value, index) for the array part of tab, stopping at the first
-- nil element (same traversal as ipairs).
-- @return tab, to allow chaining
function table.eacha(tab, func)
  local index = 1
  local value = tab[index]
  while value ~= nil do
    func(value, index)
    index = index + 1
    value = tab[index]
  end
  return tab
end
--- Call func(value, key) for every pair visited by pairs(tab).
-- Visiting order follows pairs() and is therefore unspecified.
-- @return tab, to allow chaining
function table.each(tab, func)
  for key, value in pairs(tab) do
    func(value, key)
  end
  return tab
end
--- Return the array index of the first element equal to elm, or nil.
-- Delegates to table.find and keeps only its second result (the index).
function table.find_index(tab, elm, ...)
  return (select(2, table.find(tab, elm, ...)))
end
--- Find the first array element equal to elm.
-- @param tab  array-like table, scanned from index 1 up to the first nil
-- @param elm  value compared with ==
-- @param func unused, kept for signature compatibility
-- @return value, index of the match; nothing when there is no match
function table.find(tab, elm, func)
  local index = 1
  local value = tab[index]
  while value ~= nil do
    if value == elm then
      return value, index
    end
    index = index + 1
    value = tab[index]
  end
end
--- Find the first array element accepted by a predicate.
-- The previous body computed a result and discarded it, so the function
-- always returned nil. It now mirrors table.find_all's convention: when `elm`
-- is a function it is called as elm(value, index, ...) and the first element
-- for which it returns a truthy value wins; otherwise elements are compared
-- with `==`.
-- @param tab array-like table
-- @param elm predicate function or plain value
-- @param ... extra arguments forwarded to the predicate
-- @return value, index of the first match; nothing when there is no match
table.find_with_func = function (tab, elm, ...)
  local is_predicate = type(elm) == "function"
  for i, v in ipairs(tab) do
    local hit
    if is_predicate then
      hit = elm(v, i, ...)
    else
      hit = (v == elm)
    end
    if hit then
      return v, i
    end
  end
end
--- Remove the first array element equal to elm.
-- @return the removed element, or nil when elm is not present
function table.delete(tab, elm, ...)
  local position = table.find_index(tab, elm)
  if not position then
    return position
  end
  return table.remove(tab, position)
end
--- Collect every value of tab accepted by a predicate.
-- When elm is a function it is called as elm(value, key, ...); otherwise a
-- value is accepted when it equals elm. Traversal uses pairs(), so the order
-- of the result is unspecified for non-array keys.
-- @return a new array with { __index = table } as metatable
function table.find_all(tab, elm, ...)
  local accepts
  if type(elm) == "function" then
    accepts = elm
  else
    accepts = function (value, key, ...) return value == elm end
  end

  local matches = setmetatable({}, { __index = table })
  for key, value in pairs(tab) do
    if accepts(value, key, ...) then
      matches:insert(value)
    end
  end
  return matches
end
-- table.select is an alias of table.find_all.
table.select = table.find_all
--- Fold the array part of tab into a single value.
-- func is called as func(value, accumulator); its first result becomes the
-- next accumulator.
-- @param arg initial accumulator
-- @return final accumulator, followed by the initial value `arg`
function table.reduce(tab, func, arg)
  local acc = arg
  for _, value in ipairs(tab) do
    acc = func(value, acc)
  end
  return acc, arg
end
--- Build a new array by applying func(value, index) to the array part of tab.
-- Without func the values are copied unchanged. Only the first result of
-- func is stored.
-- @return a new array with { __index = table } as metatable
function table.map(tab, func)
  local transform = func or function (v, i) return v, i end
  local mapped = setmetatable({}, { __index = table })
  for index, value in ipairs(tab) do
    mapped[index] = transform(value, index)
  end
  return mapped
end
--- Turn a key/value table into an array.
-- func is called as func(key, value) for every pair visited by pairs(tab) and
-- its first result is appended; the default produces { key, value } entries.
-- @return a new array with { __index = table } as metatable
function table.map_hash(tab, func)
  local convert = func or function (k, v) return { k, v } end
  local entries = setmetatable({}, { __index = table })
  for key, value in pairs(tab) do
    entries:insert(convert(key, value))
  end
  return entries
end
--- Append elm to the end of the receiver.
-- The receiver must resolve `insert` through its metatable (e.g. a table
-- created by metatable()).
table.push = function (self, elm)
  self:insert(elm)
end
-- table.append is an alias of table.push.
table.append = table.push
--- Remove and return the last element of the receiver.
-- The receiver must resolve `remove` through its metatable.
table.pop = function (self)
  local last = #self
  return self:remove(last)
end
--- Remove the first element of the receiver and return it.
-- The previous version discarded the removed element, unlike table:pop();
-- returning it is backward-compatible for callers that ignore the result.
-- The receiver must resolve `remove` through its metatable.
-- @return the removed element (nil for an empty table)
function table:shift()
  return self:remove(1)
end
--- Insert elm at the front of the receiver, shifting existing elements up.
-- The receiver must resolve `insert` through its metatable.
table.unshift = function (self, elm)
  self:insert(1, elm)
end
--- Count every key/value pair of t (array and hash part alike).
-- @return number of pairs visited by pairs(t)
function table.len(t)
  local count = 0
  for _ in pairs(t) do
    count = count + 1
  end
  return count
end
--- Serialize a value into Lua source text.
--
-- Numbers and booleans are written with tostring, strings with "%q", tables
-- recursively. Entries with a numeric key are written positionally (the key
-- itself is not emitted); every other key is written as `[key] = value`.
--
-- Fixes compared with the previous version:
-- * array entries are serialized recursively instead of being pasted between
--   double quotes, so nested tables and booleans no longer raise a
--   concatenation error, strings containing quotes or backslashes are
--   escaped, and numbers keep their type;
-- * the output is assembled with table.concat instead of repeated `..`.
--
-- @param obj number, boolean, string, table or nil
-- @return the serialized text, or nil when obj is nil
-- Raises an error for any other type (functions, userdata, ...).
function table.serialize(obj)
  local t = type(obj)
  if t == "number" or t == "boolean" then
    return tostring(obj)
  elseif t == "string" then
    return string.format("%q", obj)
  elseif t == "nil" then
    return nil
  elseif t ~= "table" then
    error("can not serialize a " .. t .. " type.")
  end

  -- Short arrays stay on one line, longer ones get one entry per line.
  local short = #obj < 4
  local record_sep = short and ", " or ",\n"
  local record_prefix = short and "" or "\t"

  local parts = { "{ " }
  for k, v in pairs(obj) do
    if type(k) == "number" then
      parts[#parts + 1] = record_prefix .. table.serialize(v) .. record_sep
    else
      parts[#parts + 1] = "\t[" .. table.serialize(k) .. "] = " .. table.serialize(v) .. ",\n"
    end
  end
  parts[#parts + 1] = "}"
  return table.concat(parts)
end

View File

@ -0,0 +1,146 @@
require('cold_word_drop.string')
require("cold_word_drop.metatable")
-- local puts = require("tools/debugtool")
local drop_list = require("cold_word_drop.drop_words")
local hide_list = require("cold_word_drop.hide_words")
local turndown_freq_list = require("cold_word_drop.turndown_freq_words")
-- Lookup from "<record_type>_list" to the in-memory word table of that type;
-- write_word_to_file uses it to pick the table to serialize.
local tbls = {
['drop_list'] = drop_list,
['hide_list'] = hide_list,
['turndown_freq_list'] = turndown_freq_list
}
-- local cold_word_drop = {}
--- Build the path of the Lua file that stores one word list.
--
-- Resolution order:
-- 1. the user data directory reported by librime-lua
--    (rime_api.get_user_data_dir), when that API is available;
-- 2. on Windows (detected through package.config, without spawning a shell)
--    the expanded %APPDATA%\Rime directory;
-- 3. on macOS ~/Library/Rime;
-- 4. otherwise the ibus location ~/.config/ibus/rime.
--
-- Fixes compared with the previous version: the `uname` pipe is closed, a
-- missing io.popen no longer raises, "%APPDATA%" is expanded instead of being
-- handed to io.open literally, and Windows is not probed with `uname`.
--
-- @param record_type "drop", "hide" or "turndown_freq"
-- @return path of "<record_type>_words.lua"
local function get_record_filername(record_type)
  ---@diagnostic disable-next-line: undefined-global
  local api = rime_api
  if api and api.get_user_data_dir then
    local ok, dir = pcall(api.get_user_data_dir)
    if ok and type(dir) == "string" and dir ~= "" then
      return string.format("%s/lua/cold_word_drop/%s_words.lua", dir, record_type)
    end
  end

  -- package.config starts with the directory separator of the host system.
  if package.config:sub(1, 1) == "\\" then
    local appdata = os.getenv("APPDATA") or "%APPDATA%"
    return string.format("%s\\Rime\\lua\\cold_word_drop\\%s_words.lua", appdata, record_type)
  end

  local system
  local ok, pipe = pcall(io.popen, "uname -s")
  if ok and pipe then
    system = pipe:read("*l")
    pipe:close()
  end

  local home = os.getenv('HOME') or ""
  if system == "Darwin" then
    return string.format("%s/Library/Rime/lua/cold_word_drop/%s_words.lua", home, record_type)
  end
  return string.format("%s/.config/ibus/rime/lua/cold_word_drop/%s_words.lua", home, record_type)
end
--- Persist one in-memory word list as a Lua module.
--
-- The file content is `local <type>_words =\n<table>\nreturn <type>_words`.
-- The table is serialized BEFORE the file is opened: the previous version
-- truncated the file first, so a serialization error left a half-written
-- module on disk and an open file handle behind.
--
-- @param record_type "drop", "hide" or "turndown_freq"
-- Raises an error when the list is unknown, cannot be serialized, or the file
-- cannot be opened or written.
local function write_word_to_file(record_type)
  local filename = get_record_filername(record_type)
  local list_name = string.format("%s_list", record_type)
  local record = table.serialize(tbls[list_name]) -- Lua table -> source text
  if record == nil then
    error("cold_word_drop: unknown word list " .. list_name)
  end
  local content = string.format("local %s_words =\n%s\nreturn %s_words", record_type, record, record_type)

  local fd = assert(io.open(filename, "w"))
  local ok, err = fd:write(content)
  fd:close() -- close before reporting a write failure so the handle never leaks
  assert(ok, err)
end
--- Check a multi-character word against the typed codes, character by character.
--
-- The per-character check only runs when the whole-word reverse lookup result
-- (cand_code) is empty and the word has more than one character. Each
-- character is looked up in reversedb, a `[`-introduced two-letter auxiliary
-- code is stripped from the lookup result and a `[`-introduced auxiliary code
-- from the typed code, then the typed code is matched (as a Lua pattern)
-- against the lookup result.
--
-- @param cand_code      reverse lookup result for the whole word
-- @param word           candidate text
-- @param input_code_tbl typed codes, one entry per character
-- @param reversedb      object providing :lookup(text)
-- @return false when some character does not match its typed code (the caller
--         hides the entry), true otherwise (the caller turns its frequency down)
local function check_encode_matched(cand_code, word, input_code_tbl, reversedb)
  if #cand_code >= 1 or utf8.len(word) <= 1 then
    return true
  end

  for position, char in ipairs(string.split(word, "")) do
    local looked_up = string.gsub(reversedb:lookup(char), '%[%l%l', '')
    local typed = string.gsub(input_code_tbl[position] or " ", '%[%l+', '')
    if not string.match(looked_up, typed) then
      -- Polyphones, fuzzy pinyin and typo correction can all cause a mismatch.
      return false
    end
  end
  return true
end
--- Record the highlighted word in the drop, hide or turndown list.
--
-- * action_type == 'drop': the word is added to drop_list.
-- * otherwise: when the typed code matches the word's reverse lookup code the
--   word/code pair is added to turndown_freq_list, else to hide_list.
--
-- Fixes compared with the previous version:
-- * drop_list and turndown_freq_list no longer collect duplicate entries when
--   the same key is pressed repeatedly;
-- * the typed code is searched as plain text rather than used as a Lua
--   pattern, so characters such as `[`, `-` or `.` in the input cannot raise a
--   "malformed pattern" error or match unintentionally.
--
-- @param ctx         table with `word` (candidate text) and `code` (typed code)
-- @param action_type 'drop' or 'hide'
-- @param reversedb   object providing :lookup(text)
-- @return 'turndown_freq' when the turndown list is the one to persist;
--         true when the drop/hide list is; false when the hide entry already
--         existed
local function append_word_to_droplist(ctx, action_type, reversedb)
  local word = ctx.word
  local input_code = ctx.code

  if action_type == 'drop' then
    if not table.find_index(drop_list, word) then
      table.insert(drop_list, word)
    end
    return true
  end

  local input_code_tbl = string.split(input_code, " ")
  local cand_code = reversedb:lookup(word) or "" -- reverse lookup of the candidate text
  -- Per-character check for multi-character words.
  local match_result = check_encode_matched(cand_code, word, input_code_tbl, reversedb)
  -- Strip `[`-introduced auxiliary codes from both sides before comparing.
  local ccand_code = string.gsub(cand_code, '%[%l%l', '')
  local input_str = string.gsub(input_code, '%[%l+', '')
  local input_code_str = table.concat(input_code_tbl, '')

  -- Code matches the word: turn its frequency down.
  if string.find(ccand_code, input_str, 1, true) or match_result then
    local codes = turndown_freq_list[word]
    if not codes then
      turndown_freq_list[word] = { input_code_str }
    elseif not table.find_index(codes, input_code_str) then
      table.insert(codes, input_code_str)
    end
    return 'turndown_freq'
  end

  -- Code does not match the word: hide it for this code,
  -- e.g. ['藏'] = { 'chang', 'zhang' }.
  local hidden_codes = hide_list[word]
  if not hidden_codes then
    hide_list[word] = { input_code_str }
    return true
  end
  if table.find_index(hidden_codes, input_code_str) then
    return false
  end
  table.insert(hidden_codes, input_code_str)
  return true
end
--- Key processor for dropping, hiding or demoting the highlighted candidate.
--
-- The two hot keys are read from the schema config ("key_binder/turn_down_cand",
-- default "Control+j"; "key_binder/drop_cand", default "Control+d").
--
-- Fixes compared with the previous version:
-- * when no candidate is selected the key is passed on (kNoop) instead of
--   failing on `cand.text`;
-- * the reverse lookup object and the script text are only obtained after one
--   of the hot keys was recognised, not on every key event.
--
-- @param key key event; compared through key:repr()
-- @param env processor environment providing env.engine
-- @return 1 (kAccept) when the key was handled, 2 (kNoop) otherwise
local function processor(key, env)
  local engine = env.engine
  local config = engine.schema.config
  local context = engine.context

  local turndown_cand_key = config:get_string("key_binder/turn_down_cand") or "Control+j"
  local drop_cand_key = config:get_string("key_binder/drop_cand") or "Control+d"

  local key_repr = key:repr()
  local action_type
  -- 'drop' wins when both actions are bound to the same key, as before.
  if key_repr == drop_cand_key then
    action_type = 'drop'
  elseif key_repr == turndown_cand_key then
    action_type = 'hide'
  end
  if not action_type then
    return 2 -- kNoop: let the next component handle the key
  end

  local cand = context:get_selected_candidate()
  if not cand then
    return 2 -- nothing highlighted, nothing to record
  end

  -- Schemas sharing one dictionary use the dictionary name for reverse lookup.
  local schema_id = config:get_string("translator/dictionary")
  ---@diagnostic disable-next-line: undefined-global
  local reversedb = ReverseLookup(schema_id)

  local ctx_map = {
    ['word'] = cand.text,
    ['code'] = context:get_script_text()
  }
  local res = append_word_to_droplist(ctx_map, action_type, reversedb)

  context:refresh_non_confirmed_composition() -- refresh the menu so the effect shows at once

  if type(res) == "boolean" then
    -- Dropped and hidden words go to drop_words.lua / hide_words.lua.
    write_word_to_file(action_type)
  else
    -- Demoted words go to turndown_freq_words.lua.
    write_word_to_file(res)
  end
  return 1 -- kAccept
end
return processor

View File

@ -0,0 +1,40 @@
-- wrap utf8.sub(str,head_index, tail_index)
-- wrap string.split(str,sp,sp1)
-- string.utf8_len = utf8.len
-- string.utf8_offset= utf8.offset
-- string.utf8_sub= utf8.sub
--- Split a string into an array of fields.
--
-- * sp == "": split into UTF-8 characters.
-- * #sp == 1: split on that single character (taken literally).
-- * #sp  > 1: sp is used as a Lua pattern; every match is first replaced by
--   sp1 (default "^"), then the string is split on any character of sp1.
-- * sp not a string: split on a single space.
--
-- Empty fields are kept ("a,,b" gives { "a", "", "b" }) and a string without
-- separator gives a single field.
--
-- The previous implementation iterated `[^x]*` with gmatch. That pattern can
-- match the empty string, and older Lua releases report an extra empty match
-- after every field ("ni hao" became { "ni", "", "hao", "" }), which shifted
-- the per-character indices used by callers. An explicit find loop gives the
-- same result on every Lua version.
--
-- @param str string to split
-- @param sp  separator (see above)
-- @param sp1 replacement separator, only used when #sp > 1
-- @return array of fields
function string.split(str, sp, sp1)
  sp = type(sp) == "string" and sp or " "
  local tab = {}

  if #sp == 0 then
    for char in str:gmatch("([%z\1-\127\194-\244][\128-\191]*)") do
      tab[#tab + 1] = char
    end
    return tab
  end

  local delimiters = sp
  if #sp > 1 then
    sp1 = sp1 or "^"
    str = str:gsub(sp, sp1)
    delimiters = sp1
  end

  -- Escape every non-alphanumeric byte so the delimiters are taken literally
  -- inside the character set.
  local escaped = delimiters:gsub("%W", "%%%0")
  local set = "[" .. escaped .. "]"

  local from = 1
  while true do
    local first, last = str:find(set, from)
    if not first then
      tab[#tab + 1] = str:sub(from)
      break
    end
    tab[#tab + 1] = str:sub(from, first - 1)
    from = last + 1
  end
  return tab
end
--- Substring by character position instead of byte position.
-- Despite its name this is a "sub" operation: it returns the characters of
-- str from character index si up to ei. A non-negative ei is inclusive and
-- defaults to the character count of str; a negative ei is passed to
-- utf8.offset unchanged and the byte before the resulting position closes
-- the range.
-- @param str UTF-8 string
-- @param si  first character index (negative counts from the end)
-- @param ei  last character index, optional
-- @return the selected substring
function utf8.gsub(str, si, ei)
  -- Byte position of character index i; when utf8.offset yields nothing,
  -- fall back to one past the end (i >= 0) or to the first byte (i < 0).
  local function byte_position(ustr, i)
    local fallback
    if i >= 0 then
      fallback = ustr:len() + 1
    else
      fallback = 1
    end
    return ustr:utf8_offset(i) or fallback
  end

  local first_byte = byte_position(str, si)
  ei = ei or str:utf8_len()
  if ei >= 0 then
    ei = ei + 1
  end
  local last_byte = byte_position(str, ei) - 1
  return str:sub(first_byte, last_byte)
end
-- Method-style aliases, so that any string can call s:utf8_len() etc.
string.utf8_len = utf8.len
string.utf8_offset = utf8.offset
string.utf8_sub = utf8.gsub
return true

View File

@ -0,0 +1,4 @@
-- Maps a candidate text to the list of input codes (spaces removed) for which
-- the cold_word_drop filter keeps that candidate out of the first three
-- positions.
-- The cold_word_drop processor rewrites this file whenever a word is demoted,
-- so manual edits and comments may be overwritten.
local turndown_freq_words =
{ ["示~例~"] = { "shili", },
}
return turndown_freq_words

View File

@ -42,7 +42,8 @@ switches:
# 输入引擎
engine:
processors:
- lua_processor@*select_character # 以词定字
- lua_processor@*select_character # 以词定字
# - lua_processor@*cold_word_drop.processor # 词条隐藏、降频
- ascii_composer
- recognizer
- key_binder
@ -71,6 +72,7 @@ engine:
# - lua_filter@*is_in_user_dict # 为用户词典中(输入过)的内容结尾加上一个星号 *
- lua_filter@*v_filter # v 模式 symbols 优先(否则是英文优先)
- lua_filter@*reduce_english_filter # 降低部分英语单词在候选项的位置
# - lua_filter@*cold_word_drop.filter # 词条隐藏、降频
- lua_filter@*long_word_filter # 长词优先
- uniquifier # 去重