2022-10-30 16:47:40 +01:00
|
|
|
|
package rime
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bufio"
|
|
|
|
|
"fmt"
|
2023-04-13 11:04:44 +02:00
|
|
|
|
mapset "github.com/deckarep/golang-set/v2"
|
2022-10-30 16:47:40 +01:00
|
|
|
|
"log"
|
|
|
|
|
"os"
|
2023-04-13 11:04:44 +02:00
|
|
|
|
"os/user"
|
2023-01-17 17:47:12 +01:00
|
|
|
|
"path"
|
2023-04-13 11:04:44 +02:00
|
|
|
|
"path/filepath"
|
2023-01-17 17:47:12 +01:00
|
|
|
|
"strconv"
|
2022-10-30 16:47:40 +01:00
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
)
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// lemma holds the parts that make up one dictionary entry.
type lemma struct {
	// text is the Chinese-character form of the entry; code is its encoding.
	text, code string
	// weight is the entry's weight.
	weight int
}
|
|
|
|
|
|
|
|
|
|
var (
|
2023-04-13 11:04:44 +02:00
|
|
|
|
mark = "# +_+" // 词库中的标记符号,表示从这行开始进行检查或排序
|
|
|
|
|
DefaultWeight = 100 // ext、tencent 词库中默认的权重
|
|
|
|
|
RimeDir = getRimeDir() // Rime 配置目录
|
|
|
|
|
|
|
|
|
|
EmojiMapPath = filepath.Join(RimeDir, "others/emoji-map.txt")
|
|
|
|
|
EmojiPath = filepath.Join(RimeDir, "opencc/emoji.txt")
|
2022-10-30 16:47:40 +01:00
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
HanziPath = filepath.Join(RimeDir, "cn_dicts/8105.dict.yaml")
|
|
|
|
|
BasePath = filepath.Join(RimeDir, "cn_dicts/base.dict.yaml")
|
|
|
|
|
ExtPath = filepath.Join(RimeDir, "cn_dicts/ext.dict.yaml")
|
|
|
|
|
TencentPath = filepath.Join(RimeDir, "cn_dicts/tencent.dict.yaml")
|
|
|
|
|
|
|
|
|
|
HanziSet = readToSet(HanziPath)
|
|
|
|
|
BaseSet = readToSet(BasePath)
|
|
|
|
|
ExtSet = readToSet(ExtPath)
|
2023-01-17 17:47:12 +01:00
|
|
|
|
TencentSet = readToSet(TencentPath)
|
2023-04-13 11:04:44 +02:00
|
|
|
|
|
|
|
|
|
需要注音TXT = filepath.Join(RimeDir, "others/script/rime/需要注音.txt")
|
|
|
|
|
错别字TXT = filepath.Join(RimeDir, "others/script/rime/错别字.txt")
|
|
|
|
|
汉字拼音映射TXT = filepath.Join(RimeDir, "others/script/rime/汉字拼音映射.txt")
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// getRimeDir returns the macOS Rime configuration directory: "Library/Rime"
// under the current user's home directory. If the current user cannot be
// looked up, the error is logged and the process exits via log.Fatalln.
func getRimeDir() string {
	current, err := user.Current()
	if err != nil {
		log.Fatalln(err)
	}

	home := current.HomeDir
	return filepath.Join(home, "Library/Rime")
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 将所有词库读入 set,供检查或排序使用
|
2023-01-17 17:47:12 +01:00
|
|
|
|
func readToSet(dictPath string) mapset.Set[string] {
|
2022-10-30 16:47:40 +01:00
|
|
|
|
set := mapset.NewSet[string]()
|
|
|
|
|
|
|
|
|
|
file, err := os.Open(dictPath)
|
|
|
|
|
if err != nil {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
log.Fatalln(err)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
isMark := false
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if !isMark {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
if strings.HasPrefix(line, mark) {
|
2022-10-30 16:47:40 +01:00
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
parts := strings.Split(line, "\t")
|
|
|
|
|
set.Add(parts[0])
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return set
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 打印耗时时间
|
2022-10-30 16:47:40 +01:00
|
|
|
|
func printlnTimeCost(content string, start time.Time) {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// fmt.Printf("%s:\t%.2fs\n", content, time.Since(start).Seconds())
|
|
|
|
|
printfTimeCost(content, start)
|
|
|
|
|
fmt.Println()
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// printfTimeCost prints content followed by the time elapsed since start, in
// seconds with two decimal places, to standard output. No trailing newline
// is written.
func printfTimeCost(content string, start time.Time) {
	elapsed := time.Since(start).Seconds()
	fmt.Printf("%s:\t%.2fs", content, elapsed)
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// contains reports whether item is an element of arr.
func contains(arr []string, item string) bool {
	for i := 0; i < len(arr); i++ {
		if arr[i] == item {
			return true
		}
	}
	return false
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// AddWeight 为 ext、tencent 没权重的词条加上权重,有权重的改为 weight
|
2023-01-17 17:47:12 +01:00
|
|
|
|
func AddWeight(dictPath string, weight int) {
|
|
|
|
|
// 控制台输出
|
|
|
|
|
printlnTimeCost("加权重\t"+path.Base(dictPath), time.Now())
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 读取到 lines 数组
|
2023-01-17 17:47:12 +01:00
|
|
|
|
file, err := os.ReadFile(dictPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
lines := strings.Split(string(file), "\n")
|
|
|
|
|
|
|
|
|
|
isMark := false
|
|
|
|
|
for i, line := range lines {
|
|
|
|
|
if !isMark {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
if strings.HasPrefix(line, mark) {
|
2023-01-17 17:47:12 +01:00
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 过滤空行
|
|
|
|
|
if line == "" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 修改权重为传入的 weight,没有就加上
|
|
|
|
|
parts := strings.Split(line, "\t")
|
|
|
|
|
_, err := strconv.Atoi(parts[len(parts)-1])
|
|
|
|
|
if err != nil {
|
|
|
|
|
lines[i] = line + "\t" + strconv.Itoa(weight)
|
|
|
|
|
} else {
|
|
|
|
|
lines[i] = strings.Join(parts[:len(parts)-1], "\t") + "\t" + strconv.Itoa(weight)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 写入
|
2023-01-17 17:47:12 +01:00
|
|
|
|
resultString := strings.Join(lines, "\n")
|
|
|
|
|
err = os.WriteFile(dictPath, []byte(resultString), 0644)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|