2023-02-23 19:09:29 +01:00
|
|
|
|
package rime
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bufio"
|
|
|
|
|
"fmt"
|
|
|
|
|
mapset "github.com/deckarep/golang-set/v2"
|
|
|
|
|
"log"
|
|
|
|
|
"os"
|
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
"unicode/utf8"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
func UpdateMoegirl() {
|
|
|
|
|
// 使用和 UpdateSogou 一样的方法
|
|
|
|
|
filterList = mapset.NewSet[string]() // 重置过滤列表
|
|
|
|
|
|
|
|
|
|
// 控制台输出
|
2023-02-24 02:52:38 +01:00
|
|
|
|
defer printlnTimeCost("更新萌娘百科", time.Now())
|
2023-02-23 19:09:29 +01:00
|
|
|
|
|
2023-02-24 02:52:38 +01:00
|
|
|
|
// 0. 下载新的萌娘词库(暂时手动操作)
|
2023-02-23 19:09:29 +01:00
|
|
|
|
newMoegirlFile := "/Users/dvel/Downloads/moegirl.dict.yaml"
|
2023-02-24 02:52:38 +01:00
|
|
|
|
// 1. 准备好过滤列表
|
|
|
|
|
makeFilterList(MoegirlPath, newMoegirlFile)
|
|
|
|
|
// 2. 将新的词汇加入到末尾,并打印新词
|
2023-02-23 19:09:29 +01:00
|
|
|
|
appendNewDict(MoegirlPath, newMoegirlFile)
|
|
|
|
|
}
|
|
|
|
|
|
2023-02-24 02:52:38 +01:00
|
|
|
|
func makeFilterList(dictPath string, newPath string) {
|
|
|
|
|
// 读取目前词库 +_+ 和 *_* 之间的内容,加入过滤列表
|
|
|
|
|
file, err := os.Open(dictPath)
|
2023-02-23 19:09:29 +01:00
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
isFilterMark := false
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if line == mark {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if !isFilterMark {
|
|
|
|
|
if strings.Contains(line, fileterMark) {
|
|
|
|
|
isFilterMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 过滤列表有两种情况:
|
|
|
|
|
// 【# 测试一】取【测试一】
|
|
|
|
|
// 【测试二 ce shi er 100】取【测试二】
|
|
|
|
|
if strings.HasPrefix(line, "# ") {
|
2023-02-24 02:52:38 +01:00
|
|
|
|
filterList.Add(strings.TrimLeft(line, "# "))
|
2023-02-23 19:09:29 +01:00
|
|
|
|
} else {
|
2023-02-24 02:52:38 +01:00
|
|
|
|
filterList.Add(strings.Split(line, "\t")[0])
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 读取新词库,有问题的直接特么不要了
|
|
|
|
|
newFile, err := os.Open(newPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer newFile.Close()
|
|
|
|
|
|
|
|
|
|
sc = bufio.NewScanner(newFile)
|
|
|
|
|
isMark := false
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if !isMark {
|
|
|
|
|
if line == "..." {
|
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
parts := strings.Split(line, "\t")
|
|
|
|
|
text, code := parts[0], parts[1]
|
|
|
|
|
|
|
|
|
|
// 过滤掉有注音问题的:
|
|
|
|
|
// 把汉字和拼音弄成一一对应关系,「拼音:pin yin」→「拼:pin」「音:yin」
|
|
|
|
|
pinyins := strings.Split(code, " ")
|
|
|
|
|
i := 0
|
|
|
|
|
for _, zi := range text {
|
|
|
|
|
if !contains(hanPinyinMap[string(zi)], pinyins[i]) {
|
|
|
|
|
filterList.Add(text)
|
|
|
|
|
}
|
|
|
|
|
i++
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 过滤掉有异形词问题的
|
|
|
|
|
for _, wrongWord := range wrongWords.ToSlice() {
|
|
|
|
|
if strings.Contains(text, wrongWord) {
|
|
|
|
|
filterList.Add(text)
|
|
|
|
|
}
|
2023-02-23 19:09:29 +01:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
func appendNewDict(dictPath string, newPath string) {
|
|
|
|
|
// 逐行读取 newPath,有新词则加入到 dictPath 末尾
|
|
|
|
|
moegirlFile, err := os.OpenFile(dictPath, os.O_APPEND|os.O_WRONLY, 0644)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer moegirlFile.Close()
|
2023-02-24 02:52:38 +01:00
|
|
|
|
|
2023-02-23 19:09:29 +01:00
|
|
|
|
newFile, err := os.Open(newPath)
|
|
|
|
|
if err != nil {
|
2023-02-24 02:52:38 +01:00
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
|
return
|
|
|
|
|
}
|
2023-02-23 19:09:29 +01:00
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer newFile.Close()
|
|
|
|
|
|
|
|
|
|
// 需要过滤的: base+sogou+moegirl+过滤列表 filterList
|
|
|
|
|
set := BaseSet.Union(SogouSet).Union(MoegirlSet).Union(filterList)
|
|
|
|
|
// 新词列表
|
|
|
|
|
newWords := make([]string, 0)
|
|
|
|
|
|
|
|
|
|
sc := bufio.NewScanner(newFile)
|
|
|
|
|
isMark := false
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
// 只读取 ... 这行以下的词汇
|
|
|
|
|
if !isMark {
|
|
|
|
|
if line == "..." {
|
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
text := strings.Split(line, "\t")[0]
|
|
|
|
|
// 过滤
|
|
|
|
|
if set.Contains(text) {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 过滤两字词
|
|
|
|
|
if utf8.RuneCountInString(text) <= 2 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 写入末尾
|
|
|
|
|
_, err := moegirlFile.WriteString(line + "\n")
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
newWords = append(newWords, line)
|
|
|
|
|
}
|
|
|
|
|
err = moegirlFile.Sync()
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 打印新词
|
|
|
|
|
fmt.Println("新增词汇:")
|
|
|
|
|
for _, word := range newWords {
|
|
|
|
|
fmt.Println(word)
|
|
|
|
|
}
|
|
|
|
|
fmt.Println("count: ", len(newWords))
|
|
|
|
|
}
|