2022-10-30 16:47:40 +01:00
|
|
|
|
package rime
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bufio"
|
|
|
|
|
"crypto/sha1"
|
|
|
|
|
"encoding/hex"
|
|
|
|
|
"fmt"
|
|
|
|
|
mapset "github.com/deckarep/golang-set/v2"
|
|
|
|
|
"io"
|
|
|
|
|
"log"
|
|
|
|
|
"os"
|
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
// lemma holds the components of a single dictionary entry.
type lemma struct {
	text, code string // text: the Chinese characters; code: the input code
	weight     int    // weight of the entry
}
|
|
|
|
|
|
|
|
|
|
// mark is the marker found inside a dictionary file; it indicates where
// checking or sorting starts.
const mark = "# +_+"

// Absolute paths of the dictionary files and of the emoji mapping file.
const (
	HanziPath   = "/Users/dvel/Library/Rime/cn_dicts/8105.dict.yaml"
	BasePath    = "/Users/dvel/Library/Rime/cn_dicts/base.dict.yaml"
	SogouPath   = "/Users/dvel/Library/Rime/cn_dicts/sogou.dict.yaml"
	ExtPath     = "/Users/dvel/Library/Rime/cn_dicts/ext.dict.yaml"
	TencentPath = "/Users/dvel/Library/Rime/cn_dicts/tencent.dict.yaml"
	EmojiPath   = "/Users/dvel/Library/Rime/opencc/emoji-map.txt"
)

// DefaultWeight is the default weight value used in the sogou, ext and
// tencent dictionaries.
const DefaultWeight = 100
|
|
|
|
|
|
|
|
|
|
var (
|
2022-11-10 12:30:00 +01:00
|
|
|
|
BaseSet mapset.Set[string]
|
|
|
|
|
SogouSet mapset.Set[string]
|
2022-10-30 16:47:40 +01:00
|
|
|
|
ExtSet mapset.Set[string]
|
|
|
|
|
TencentSet mapset.Set[string]
|
|
|
|
|
SogouSetWithCode mapset.Set[string]
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func init() {
|
2022-11-10 12:30:00 +01:00
|
|
|
|
BaseSet = readAndSet(BasePath)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
SogouSet = readAndSet(SogouPath)
|
|
|
|
|
ExtSet = readAndSet(ExtPath)
|
|
|
|
|
TencentSet = readAndSet(TencentPath)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// readAndSet 读取词库文件为 set
|
|
|
|
|
func readAndSet(dictPath string) mapset.Set[string] {
|
|
|
|
|
set := mapset.NewSet[string]()
|
|
|
|
|
|
|
|
|
|
file, err := os.Open(dictPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(set)
|
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
isMark := false
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if !isMark {
|
|
|
|
|
if strings.Contains(line, mark) {
|
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
parts := strings.Split(line, "\t")
|
|
|
|
|
set.Add(parts[0])
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return set
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// printlnTimeCost prints content followed by the seconds elapsed since start
// (two decimal places) and a trailing newline.
func printlnTimeCost(content string, start time.Time) {
	elapsed := time.Since(start).Seconds()
	fmt.Printf("%s:\t%.2fs\n", content, elapsed)
}
|
|
|
|
|
|
|
|
|
|
// printfTimeCost prints content followed by the seconds elapsed since start
// (two decimal places), without a trailing newline.
func printfTimeCost(content string, start time.Time) {
	elapsed := time.Since(start).Seconds()
	fmt.Printf("%s:\t%.2fs", content, elapsed)
}
|
|
|
|
|
|
|
|
|
|
// contains reports whether item is an element of arr.
func contains(arr []string, item string) bool {
	for i := range arr {
		if arr[i] == item {
			return true
		}
	}
	return false
}
|
|
|
|
|
|
|
|
|
|
// getSha1 returns the hex-encoded SHA-1 digest of the file at dictPath.
// The process exits via log.Fatal if the file cannot be opened or read.
func getSha1(dictPath string) string {
	src, openErr := os.Open(dictPath)
	if openErr != nil {
		log.Fatal(openErr)
	}
	defer src.Close()

	// Stream the file through the hash instead of loading it into memory.
	digest := sha1.New()
	_, copyErr := io.Copy(digest, src)
	if copyErr != nil {
		log.Fatal(copyErr)
	}

	sum := digest.Sum(nil)
	return hex.EncodeToString(sum)
}
|
|
|
|
|
|
|
|
|
|
// updateVersion 排序后,如果文件有改动,则修改 version 日期
|
|
|
|
|
func updateVersion(dictPath string, oldSha1 string) {
|
|
|
|
|
// 判断文件是否有改变
|
|
|
|
|
newSha1 := getSha1(dictPath)
|
|
|
|
|
if newSha1 == oldSha1 {
|
|
|
|
|
fmt.Println()
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
fmt.Println(" ...sorted")
|
|
|
|
|
|
|
|
|
|
// 打开文件
|
|
|
|
|
file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
// 修改那一行
|
|
|
|
|
arr := make([]string, 0)
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if strings.HasPrefix(line, "version:") {
|
|
|
|
|
s := fmt.Sprintf("version: \"%s\"", time.Now().Format("2006-01-02"))
|
|
|
|
|
arr = append(arr, s)
|
|
|
|
|
} else {
|
|
|
|
|
arr = append(arr, line)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 重新写入
|
|
|
|
|
err = file.Truncate(0)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
_, err = file.Seek(0, 0)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
for _, line := range arr {
|
|
|
|
|
_, err := file.WriteString(line + "\n")
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
err = file.Sync()
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|