package rime

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"path"
	"sort"
	"strconv"
	"strings"
	"time"

	mapset "github.com/deckarep/golang-set/v2"
)
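
// Referenced package-level helpers and values (mark, lemma, MainPath,
// HanziPath, SogouPath, ExtPath, TencentPath, MainSet, SogouSet, ExtSet,
// TencentSet, updateVersion, getSha1, printfTimeCost, contains) are defined
// elsewhere in this package; mark is the sentinel line that separates a
// dictionary's header from its entries. For orientation only, lemma is
// presumably a struct along these lines (a sketch inferred from the
// composite literals below, not the package's actual definition):
//
//	type lemma struct {
//		text   string
//		code   string
//		weight int
//	}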

// Sort sorts a dictionary file in place and deduplicates it along the way.
// flag: 1 = text only, 2 = text+code, 3 = text+code+weight, 4 = text+weight.
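//
// Illustrative tab-separated input lines for each flag (hypothetical
// entries, not taken from any real dictionary):
//
//	flag 1: 你好
//	flag 2: 你好\tni hao
//	flag 3: 你好\tni hao\t100
//	flag 4: 你好\t100
//
// A typical call might look like (hypothetical path):
//
//	Sort("cn_dicts/ext.dict.yaml", 4)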
func Sort(dictPath string, flag int) {
	// On return: refresh the version line and report the elapsed time.
	defer updateVersion(dictPath, getSha1(dictPath))
	defer printfTimeCost("排序 "+path.Base(dictPath), time.Now())

	// Open the file for reading and writing.
	file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	// Header content and dictionary entries: the former is written back
	// untouched, the latter is sorted before being written.
	prefixContents := make([]string, 0) // lines before the mark
	contents := make([]lemma, 0)        // dictionary entries
	aSet := mapset.NewSet[string]()     // set used for deduplication

	isMark := false
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		line := sc.Text()

		// Everything up to and including the mark goes into prefixContents.
		if !isMark {
			prefixContents = append(prefixContents, line)
			if line == mark {
				isMark = true
			}
			continue
		}

		// Split into text, code, weight.
		parts := strings.Split(line, "\t")
		text, code, weight := parts[0], "", ""

		// Check the field count.
		if (flag == 1 || flag == 2 || flag == 3) && len(parts) != flag {
			fmt.Println("分割错误123:", line)
		}
		if flag == 4 && len(parts) != 2 {
			fmt.Println("分割错误4:", line)
		}

		// Entries in main that are commented out but not deleted get weight 0.
		if dictPath == MainPath && strings.HasPrefix(line, "# ") && len(parts) == 3 {
			parts[2] = "0"
		}

		// Everything after the mark goes into contents: lines that repeat an
		// earlier entry are skipped, the rest are appended.
		switch flag {
		case 1: // one column: text
			if aSet.Contains(text) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text)
			contents = append(contents, lemma{text: text})
		case 2: // two columns: text + code
			text, code = parts[0], parts[1]
			if aSet.Contains(text + code) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text + code)
			contents = append(contents, lemma{text: text, code: code})
		case 3: // three columns: text + code + weight
			text, code, weight = parts[0], parts[1], parts[2]
			if aSet.Contains(text + code) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text + code)
			w, _ := strconv.Atoi(weight) // malformed weights fall back to 0
			contents = append(contents, lemma{text: text, code: code, weight: w})
		case 4: // two columns: text + weight
			text, weight = parts[0], parts[1]
			if aSet.Contains(text) {
				fmt.Println("重复:", line)
				continue
			}
			aSet.Add(text)
			w, _ := strconv.Atoi(weight) // malformed weights fall back to 0
			contents = append(contents, lemma{text: text, weight: w})
		default:
			log.Fatal("未知 flag: ", flag)
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
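
	// Note: bufio.Scanner's default buffer caps tokens at 64 KiB
	// (bufio.MaxScanTokenSize); dictionary lines are far shorter than that,
	// so the default is fine here.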

	// Sort: code ascending, then weight descending, then by the Unicode code
	// points of the text.
	sort.Slice(contents, func(i, j int) bool {
		if contents[i].code != contents[j].code {
			return contents[i].code < contents[j].code
		}
		if contents[i].weight != contents[j].weight {
			return contents[i].weight > contents[j].weight
		}
		if contents[i].text != contents[j].text {
			return contents[i].text < contents[j].text
		}
		return false
	})
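
	// For example, three hypothetical entries would come out in this order:
	//
	//	阿	a	100
	//	啊	a	50
	//	八	ba	80
	//
	// "a" sorts before "ba"; within the same code, the higher weight wins;
	// ties fall back to Unicode order of the text.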

	// Prepare to rewrite the file in place: truncate it and rewind.
	err = file.Truncate(0)
	if err != nil {
		log.Fatalln(err)
	}
	_, err = file.Seek(0, 0)
	if err != nil {
		log.Fatalln(err)
	}

	// Write the header back first.
	for _, line := range prefixContents {
		_, err := file.WriteString(line + "\n")
		if err != nil {
			log.Fatal(err)
		}
	}

	// The hanzi table and main are written directly; they are not
	// deduplicated against other dictionaries.
	if dictPath == HanziPath || dictPath == MainPath {
		for _, line := range contents {
			_, err := file.WriteString(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")
			if err != nil {
				log.Fatal(err)
			}
		}
	}

	// The other dictionaries are deduplicated against one or more
	// higher-priority dictionaries before being written.
	if dictPath == SogouPath || dictPath == ExtPath || dictPath == TencentPath {
		var intersect mapset.Set[string] // anything in the intersection is a duplicate and is dropped
		switch dictPath {
		case SogouPath: // sogou must not overlap with main
			intersect = SogouSet.Intersect(MainSet)
		case ExtPath: // ext must not overlap with main+sogou
			intersect = ExtSet.Intersect(MainSet.Union(SogouSet))
		case TencentPath: // tencent must not overlap with main+sogou+ext
			intersect = TencentSet.Intersect(MainSet.Union(SogouSet).Union(ExtSet))
		}
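
		// Cross-dictionary dedup keys on the text alone: an entry is dropped
		// if the same word already appears in any higher-priority dictionary,
		// regardless of its code or weight.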
		for _, line := range contents {
			if intersect.Contains(line.text) {
				fmt.Printf("%s 重复于其他词库:%s\n", strings.Split(path.Base(dictPath), ".")[0], line.text)
				continue
			}
			str := ""
			if flag == 3 { // sogou
				str = line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n"
			} else if flag == 4 { // ext, tencent
				str = line.text + "\t" + strconv.Itoa(line.weight) + "\n"
			}
			_, err := file.WriteString(str)
			if err != nil {
				log.Fatal(err)
			}
		}
	}

	// External dictionaries and temporary files are only sorted, not
	// deduplicated against the others.
	if !contains([]string{HanziPath, MainPath, SogouPath, ExtPath, TencentPath}, dictPath) {
		switch flag {
		case 1:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		case 2:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\t" + line.code + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		case 3:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		case 4:
			for _, line := range contents {
				_, err := file.WriteString(line.text + "\t" + strconv.Itoa(line.weight) + "\n")
				if err != nil {
					log.Fatalln(err)
				}
			}
		}
	}

	// Flush everything to disk.
	err = file.Sync()
	if err != nil {
		log.Fatal(err)
	}
}