rime-ice/others/script/rime/sort.go
Dvel 196a0ea7a6 日常更新
commit c0e8fce9c547e08baa2690be038a0b22dba9e2a2
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 09:49:58 2023 +0800

    update recipes

commit b94b5e2b6b1a6e5968a4b79020c4797d01773317
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 09:48:52 2023 +0800

    Update README.md

commit 09b2fac8fdb91691154a87a1e9401633ca45b3e8
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 09:42:24 2023 +0800

    完善萌娘相关脚本

commit 0cedf23ffc796d8d18f042d2dbe963ad578d5860
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 08:28:35 2023 +0800

    Emoji 映射移动到 base 里
2023-02-24 09:52:38 +08:00

220 lines
5.9 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rime
import (
"bufio"
"fmt"
mapset "github.com/deckarep/golang-set/v2"
"log"
"os"
"path"
"sort"
"strconv"
"strings"
"time"
)
// Sort sorts the dictionary file at dictPath in place, dropping duplicate
// entries along the way.
//
// Every line up to and including the mark line is preserved verbatim. The
// lines after it are parsed as tab-separated entries, de-duplicated, sorted
// (code ascending, then weight descending, then text ascending) and written
// back after the preserved prefix.
//
// flag selects the column layout of the entries:
//
//	1: text
//	2: text, code
//	3: text, code, weight
//	4: text, weight
//
// Any I/O or parse failure terminates the process via log.Fatal. All parsing
// happens before the file is truncated, so a malformed or unreadable input
// leaves the file on disk untouched.
func Sort(dictPath string, flag int) {
	// Arguments of deferred calls are evaluated here, so getSha1 hashes the
	// file as it is before sorting and time.Now() marks the start time.
	defer updateVersion(dictPath, getSha1(dictPath))
	defer printfTimeCost("排序 "+path.Base(dictPath), time.Now())

	file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	prefixContents := make([]string, 0) // lines up to and including mark, kept as-is
	contents := make([]lemma, 0)        // parsed entries to be sorted
	aSet := mapset.NewSet[string]()     // keys already seen, for de-duplication
	isMark := false

	// Number of columns that the chosen layout actually indexes.
	minCols := flag
	if flag == 4 {
		minCols = 2
	}

	// parseWeight converts a weight column to int. A malformed weight is
	// reported instead of being silently swallowed; the value Atoi returns
	// alongside the error is still used, as before.
	parseWeight := func(s, line string) int {
		n, err := strconv.Atoi(s)
		if err != nil {
			fmt.Println("权重解析错误:", line)
		}
		return n
	}

	sc := bufio.NewScanner(file)
	for sc.Scan() {
		line := sc.Text()

		// Everything before (and including) the mark goes to the prefix.
		if !isMark {
			prefixContents = append(prefixContents, line)
			if line == mark {
				isMark = true
			}
			continue
		}

		parts := strings.Split(line, "\t")
		text := parts[0]

		// Report lines whose column count does not match the layout.
		if (flag == 1 || flag == 2 || flag == 3) && len(parts) != flag {
			fmt.Println("分割错误123:", line)
		}
		if flag == 4 && len(parts) != 2 {
			fmt.Println("分割错误4:", line)
		}
		// Too few columns cannot be parsed at all: stop with a clear message
		// rather than panicking with an index-out-of-range below.
		if flag >= 1 && flag <= 4 && len(parts) < minCols {
			log.Fatalf("分割错误,列数不足: %q", line)
		}

		// Entries in base that were commented out but not deleted get weight 0.
		if dictPath == BasePath && strings.HasPrefix(line, "# ") && len(parts) > 2 {
			parts[2] = "0"
		}

		// Skip entries that duplicate an earlier one in this file; keep the rest.
		switch flag {
		case 1: // one column: text
			if aSet.Contains(text) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text)
			contents = append(contents, lemma{text: text})
		case 2: // two columns: text + code
			code := parts[1]
			if aSet.Contains(text + code) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text + code)
			contents = append(contents, lemma{text: text, code: code})
		case 3: // three columns: text + code + weight
			code := parts[1]
			if aSet.Contains(text + code) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text + code)
			contents = append(contents, lemma{text: text, code: code, weight: parseWeight(parts[2], line)})
		case 4: // two columns: text + weight
			if aSet.Contains(text) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text)
			contents = append(contents, lemma{text: text, weight: parseWeight(parts[1], line)})
		default:
			log.Fatal("分割错误:", line)
		}
	}
	// A scanner failure (I/O error, over-long line) must abort before the
	// file is truncated; otherwise only the entries read so far would be
	// written back and the rest of the dictionary would be lost.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	// Order: code ascending, weight descending, then text by byte order.
	sort.Slice(contents, func(i, j int) bool {
		if contents[i].code != contents[j].code {
			return contents[i].code < contents[j].code
		}
		if contents[i].weight != contents[j].weight {
			return contents[i].weight > contents[j].weight
		}
		if contents[i].text != contents[j].text {
			return contents[i].text < contents[j].text
		}
		return false
	})

	// Empty the file and rewind so the sorted content replaces the old one.
	err = file.Truncate(0)
	if err != nil {
		log.Fatalln(err)
	}
	_, err = file.Seek(0, 0)
	if err != nil {
		log.Fatalln(err)
	}

	// Buffer the output: one syscall per entry is needlessly slow for large
	// dictionaries. The buffer is flushed before Sync at the end.
	w := bufio.NewWriter(file)
	write := func(s string) {
		if _, err := w.WriteString(s); err != nil {
			log.Fatal(err)
		}
	}

	// The prefix is written back unchanged.
	for _, line := range prefixContents {
		write(line + "\n")
	}

	// The character table and base are written directly in three-column
	// form; they are not de-duplicated against other dictionaries.
	if dictPath == HanziPath || dictPath == BasePath {
		for _, line := range contents {
			write(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")
		}
	}

	// The remaining built-in dictionaries drop entries that already appear
	// in the dictionaries ranked before them.
	if contains([]string{SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
		var intersect mapset.Set[string] // entries shared with earlier dictionaries
		switch dictPath {
		case SogouPath: // sogou must not overlap base
			intersect = SogouSet.Intersect(BaseSet)
		case MoegirlPath: // moegirl must not overlap base+sogou
			intersect = MoegirlSet.Intersect(BaseSet.Union(SogouSet))
		case ExtPath: // ext must not overlap base+sogou+moegirl
			intersect = ExtSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet))
		case TencentPath: // tencent must not overlap base+sogou+moegirl+ext
			intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet).Union(ExtSet))
		}

		for _, line := range contents {
			if intersect.Contains(line.text) {
				fmt.Printf("%s 重复于其他词库:%s\n", strings.Split(path.Base(dictPath), ".")[0], line.text)
				continue
			}
			str := ""
			if flag == 3 { // sogou moegirl
				str = line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n"
			} else if flag == 4 { // ext tencent
				str = line.text + "\t" + strconv.Itoa(line.weight) + "\n"
			}
			write(str)
		}
	}

	// External dictionaries and temporary files are only sorted, using the
	// column layout given by flag.
	if !contains([]string{HanziPath, BasePath, SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
		switch flag {
		case 1:
			for _, line := range contents {
				write(line.text + "\n")
			}
		case 2:
			for _, line := range contents {
				write(line.text + "\t" + line.code + "\n")
			}
		case 3:
			for _, line := range contents {
				write(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")
			}
		case 4:
			for _, line := range contents {
				write(line.text + "\t" + strconv.Itoa(line.weight) + "\n")
			}
		}
	}

	// Push buffered data to the file, then to stable storage.
	if err := w.Flush(); err != nil {
		log.Fatal(err)
	}
	if err := file.Sync(); err != nil {
		log.Fatal(err)
	}
}