rime-ice/others/script/rime/sort.go
Dvel fe4af3a86e 萌娘词库 (★≧▽^))★☆
commit da4acc764651ee2b5bb83ae9c4afb0e910f49ce9
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 02:00:30 2023 +0800

    萌萌哒

commit 0fb437d852e19b11a89cd2778e41f0252d216f2f
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 01:40:00 2023 +0800

    搞定了啦

commit 52d2fecc25424cd82eca876fb21276725c3891da
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 01:16:02 2023 +0800

    fix

commit 656162e42c879f914ca31a4cf049141efc7628de
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 00:59:29 2023 +0800

    基本逻辑完成

commit 8594874035a84016496b15357ebbd6ac54b52725
Author: Dvel <git@dvel.me>
Date:   Thu Feb 23 23:56:44 2023 +0800

    引入萌娘词库
2023-02-24 02:09:29 +08:00

220 lines
5.9 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rime
import (
"bufio"
"fmt"
mapset "github.com/deckarep/golang-set/v2"
"log"
"os"
"path"
"sort"
"strconv"
"strings"
"time"
)
// Sort sorts the dictionary file at dictPath in place and removes duplicate
// entries along the way.
//
// Every line up to and including the line equal to mark is kept verbatim.
// The lines after it are split on tabs into entries, de-duplicated, sorted
// (code ascending, then weight descending, then text ascending) and written
// back to the same file.
//
// flag selects the column layout of the entries:
//
//	1: text
//	2: text, code
//	3: text, code, weight
//	4: text, weight
//
// Output depends on dictPath:
//   - HanziPath and BasePath are always written as three columns.
//   - SogouPath, MoegirlPath, ExtPath and TencentPath additionally drop every
//     entry whose text also appears in the dictionaries listed before them.
//   - Any other path is only sorted and de-duplicated against itself.
//
// I/O errors, an unknown flag, or an entry with fewer columns than the layout
// requires terminate the program through the log package. All of these checks
// happen before the file is truncated, so a rejected input is left untouched.
func Sort(dictPath string, flag int) {
	// Deferred calls run in reverse order: the elapsed time is reported first,
	// then updateVersion. Arguments of a deferred call are evaluated at the
	// defer statement, so getSha1 sees the file as it was before sorting.
	defer updateVersion(dictPath, getSha1(dictPath))
	defer printfTimeCost("排序 "+path.Base(dictPath), time.Now())

	file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// Number of tab-separated columns an entry must have for this layout.
	wantCols := flag
	if flag == 4 {
		wantCols = 2
	}

	// parseWeight converts a weight column to an int. A malformed weight is
	// reported instead of being silently rewritten; the value returned by
	// strconv.Atoi is still used so the sort result is unchanged.
	parseWeight := func(s, line string) int {
		w, err := strconv.Atoi(s)
		if err != nil {
			fmt.Println("权重错误:", line)
		}
		return w
	}

	var prefixContents []string     // lines up to and including mark, written back unchanged
	var contents []lemma            // parsed entries, sorted before being written back
	seen := mapset.NewSet[string]() // keys of the entries already accepted
	isMark := false
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		line := sc.Text()
		if !isMark {
			prefixContents = append(prefixContents, line)
			if line == mark {
				isMark = true
			}
			continue
		}

		parts := strings.Split(line, "\t")
		switch flag {
		case 1, 2, 3:
			if len(parts) != wantCols {
				fmt.Println("分割错误123:", line)
			}
		case 4:
			if len(parts) != wantCols {
				fmt.Println("分割错误4:", line)
			}
		default:
			log.Fatal("分割错误:", line)
		}
		// Extra columns are only reported above, but missing columns cannot be
		// processed: stop with a clear message instead of an index panic.
		if len(parts) < wantCols {
			log.Fatalln("列数不足:", line)
		}

		// Entries in base that were commented out with "# " but not deleted
		// get weight 0.
		if dictPath == BasePath && strings.HasPrefix(line, "# ") && len(parts) > 2 {
			parts[2] = "0"
		}

		text, code, weightStr := parts[0], "", ""
		switch flag {
		case 2:
			code = parts[1]
		case 3:
			code, weightStr = parts[1], parts[2]
		case 4:
			weightStr = parts[1]
		}

		// Neither column can contain a tab, so joining with one gives a key
		// that cannot collide the way plain concatenation can
		// ("ab"+"c" vs "a"+"bc").
		key := text + "\t" + code
		if seen.Contains(key) {
			fmt.Println("重复:", line)
			continue
		}
		seen.Add(key)

		entry := lemma{text: text, code: code}
		if flag == 3 || flag == 4 {
			entry.weight = parseWeight(weightStr, line)
		}
		contents = append(contents, entry)
	}
	// A scanner failure (for example an over-long line) ends the loop early.
	// Abort before truncating so the original file is not replaced by a
	// partial copy.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	// Order: code ascending, weight descending, then text by byte order.
	sort.Slice(contents, func(i, j int) bool {
		a, b := contents[i], contents[j]
		if a.code != b.code {
			return a.code < b.code
		}
		if a.weight != b.weight {
			return a.weight > b.weight
		}
		return a.text < b.text
	})

	// The character table and base are always written with three columns;
	// every other dictionary is written back in the layout it was read with.
	layout := flag
	if dictPath == HanziPath || dictPath == BasePath {
		layout = 3
	}
	formatEntry := func(l lemma) string {
		switch layout {
		case 1:
			return l.text + "\n"
		case 2:
			return l.text + "\t" + l.code + "\n"
		case 3:
			return l.text + "\t" + l.code + "\t" + strconv.Itoa(l.weight) + "\n"
		case 4:
			return l.text + "\t" + strconv.Itoa(l.weight) + "\n"
		}
		return ""
	}

	// Texts that also occur in a dictionary listed earlier; such entries are
	// dropped. It stays nil for dictionaries that are not de-duplicated
	// against others.
	var intersect mapset.Set[string]
	if contains([]string{SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
		switch dictPath {
		case SogouPath: // sogou must not overlap base
			intersect = SogouSet.Intersect(BaseSet)
		case MoegirlPath: // moegirl must not overlap base+sogou
			intersect = MoegirlSet.Intersect(BaseSet.Union(SogouSet))
		case ExtPath: // ext must not overlap base+sogou+moegirl
			intersect = ExtSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet))
		case TencentPath: // tencent must not overlap base+sogou+moegirl+ext
			intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet).Union(ExtSet))
		}
	}
	dictName := strings.Split(path.Base(dictPath), ".")[0]

	// Rewrite the file from the start.
	if err := file.Truncate(0); err != nil {
		log.Fatalln(err)
	}
	if _, err := file.Seek(0, 0); err != nil {
		log.Fatalln(err)
	}

	// Buffer the output so each entry does not cost a separate write call.
	w := bufio.NewWriter(file)
	write := func(s string) {
		if _, err := w.WriteString(s); err != nil {
			log.Fatal(err)
		}
	}

	for _, line := range prefixContents {
		write(line + "\n")
	}
	for _, l := range contents {
		if intersect != nil && intersect.Contains(l.text) {
			fmt.Printf("%s 重复于其他词库:%s\n", dictName, l.text)
			continue
		}
		write(formatEntry(l))
	}

	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}
	if err := file.Sync(); err != nil {
		log.Fatal(err)
	}
}