rime-ice/others/script/rime/rime.go

135 lines
3.1 KiB
Go
Raw Normal View History

2022-10-30 16:47:40 +01:00
package rime
import (
"bufio"
"fmt"
2023-04-13 11:04:44 +02:00
mapset "github.com/deckarep/golang-set/v2"
2022-10-30 16:47:40 +01:00
"log"
"os"
"path"
2023-04-13 11:04:44 +02:00
"path/filepath"
"strconv"
2022-10-30 16:47:40 +01:00
"strings"
"time"
)
2023-04-13 11:04:44 +02:00
// 一个词的组成部分
2022-10-30 16:47:40 +01:00
// lemma is one dictionary entry: the word itself, its code and its weight.
type lemma struct {
	// text holds the Chinese characters; code holds their encoding.
	text, code string
	// weight is the weight of the entry.
	weight int
}
var (
mark = "# +_+" // 词库中的标记符号,表示从这行开始进行检查或排序
RimeDir = getRimeDir() // Rime 配置目录
2023-04-13 11:04:44 +02:00
EmojiMapPath = filepath.Join(RimeDir, "others/emoji-map.txt")
EmojiPath = filepath.Join(RimeDir, "opencc/emoji.txt")
2022-10-30 16:47:40 +01:00
2023-04-13 11:04:44 +02:00
HanziPath = filepath.Join(RimeDir, "cn_dicts/8105.dict.yaml")
BasePath = filepath.Join(RimeDir, "cn_dicts/base.dict.yaml")
ExtPath = filepath.Join(RimeDir, "cn_dicts/ext.dict.yaml")
TencentPath = filepath.Join(RimeDir, "cn_dicts/tencent.dict.yaml")
HanziSet = readToSet(HanziPath)
BaseSet = readToSet(BasePath)
ExtSet = readToSet(ExtPath)
TencentSet = readToSet(TencentPath)
2023-04-13 11:04:44 +02:00
需要注音TXT = filepath.Join(RimeDir, "others/script/rime/需要注音.txt")
错别字TXT = filepath.Join(RimeDir, "others/script/rime/错别字.txt")
汉字拼音映射TXT = filepath.Join(RimeDir, "others/script/rime/汉字拼音映射.txt")
)
// 将所有词库读入 set供检查或排序使用
func readToSet(dictPath string) mapset.Set[string] {
2022-10-30 16:47:40 +01:00
set := mapset.NewSet[string]()
file, err := os.Open(dictPath)
if err != nil {
2023-04-13 11:04:44 +02:00
log.Fatalln(err)
2022-10-30 16:47:40 +01:00
}
defer file.Close()
sc := bufio.NewScanner(file)
isMark := false
for sc.Scan() {
line := sc.Text()
if !isMark {
2023-04-13 11:04:44 +02:00
if strings.HasPrefix(line, mark) {
2022-10-30 16:47:40 +01:00
isMark = true
}
continue
}
parts := strings.Split(line, "\t")
set.Add(parts[0])
}
return set
}
2023-04-13 11:04:44 +02:00
// 打印耗时时间
2022-10-30 16:47:40 +01:00
func printlnTimeCost(content string, start time.Time) {
2023-04-13 11:04:44 +02:00
// fmt.Printf("%s\t%.2fs\n", content, time.Since(start).Seconds())
printfTimeCost(content, start)
fmt.Println()
2022-10-30 16:47:40 +01:00
}
2023-04-13 11:04:44 +02:00
// 打印耗时时间
2022-10-30 16:47:40 +01:00
// printfTimeCost prints content, a tab, and the number of seconds elapsed
// since start with two decimals and an "s" suffix. No newline is written.
func printfTimeCost(content string, start time.Time) {
	elapsed := time.Since(start)
	seconds := elapsed.Seconds()
	fmt.Printf("%s\t%.2fs", content, seconds)
}
2023-04-13 11:04:44 +02:00
// slice 是否包含 item
2022-10-30 16:47:40 +01:00
// contains reports whether item is one of the elements of arr.
func contains(arr []string, item string) bool {
	for i := 0; i < len(arr); i++ {
		if arr[i] == item {
			return true
		}
	}
	return false
}
2023-04-13 11:04:44 +02:00
// AddWeight 为 ext、tencent 没权重的词条加上权重,有权重的改为 weight
func AddWeight(dictPath string, weight int) {
// 控制台输出
printlnTimeCost("加权重\t"+path.Base(dictPath), time.Now())
2023-04-13 11:04:44 +02:00
// 读取到 lines 数组
file, err := os.ReadFile(dictPath)
if err != nil {
log.Fatal(err)
}
lines := strings.Split(string(file), "\n")
isMark := false
for i, line := range lines {
if !isMark {
2023-04-13 11:04:44 +02:00
if strings.HasPrefix(line, mark) {
isMark = true
}
continue
}
// 过滤空行
if line == "" {
continue
}
// 修改权重为传入的 weight没有就加上
parts := strings.Split(line, "\t")
_, err := strconv.Atoi(parts[len(parts)-1])
if err != nil {
lines[i] = line + "\t" + strconv.Itoa(weight)
} else {
lines[i] = strings.Join(parts[:len(parts)-1], "\t") + "\t" + strconv.Itoa(weight)
}
}
2023-04-13 11:04:44 +02:00
// 写入
resultString := strings.Join(lines, "\n")
err = os.WriteFile(dictPath, []byte(resultString), 0644)
if err != nil {
log.Fatal(err)
}
}