rime-ice/others/script/rime/moegirl.go
Dvel 196a0ea7a6 日常更新
commit c0e8fce9c547e08baa2690be038a0b22dba9e2a2
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 09:49:58 2023 +0800

    update recipes

commit b94b5e2b6b1a6e5968a4b79020c4797d01773317
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 09:48:52 2023 +0800

    Update README.md

commit 09b2fac8fdb91691154a87a1e9401633ca45b3e8
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 09:42:24 2023 +0800

    完善萌娘相关脚本

commit 0cedf23ffc796d8d18f042d2dbe963ad578d5860
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 08:28:35 2023 +0800

    Emoji 映射移动到 base 里
2023-02-24 09:52:38 +08:00

165 lines
3.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rime
import (
"bufio"
"fmt"
mapset "github.com/deckarep/golang-set/v2"
"log"
"os"
"strings"
"time"
"unicode/utf8"
)
// UpdateMoegirl merges a manually downloaded Moegirl dictionary into the
// dictionary stored at MoegirlPath. Per the original author's note it uses
// the same approach as UpdateSogou.
//
// The package-level filterList is replaced with a fresh empty set first, then
// makeFilterList populates it and appendNewDict appends the new entries.
func UpdateMoegirl() {
	// Step 0: downloading the new dictionary is still a manual step, so the
	// location of the downloaded file is fixed.
	const downloadedDict = "/Users/dvel/Downloads/moegirl.dict.yaml"

	// Start from an empty filter list on every run.
	filterList = mapset.NewSet[string]()

	// Console output: the start time is captured here and handed to
	// printlnTimeCost when this function returns.
	startedAt := time.Now()
	defer printlnTimeCost("更新萌娘百科", startedAt)

	// Step 1: build the filter list.
	makeFilterList(MoegirlPath, downloadedDict)

	// Step 2: append the new words to the end of the dictionary and print them.
	appendNewDict(MoegirlPath, downloadedDict)
}
// makeFilterList adds to the package-level filterList every word that must
// not be appended to the dictionary.
//
// Two sources feed the list:
//
//  1. dictPath: every line located after the first line containing
//     fileterMark and before the line equal to mark. A line of the form
//     "# word" contributes "word"; any other line contributes its first
//     tab-separated column.
//  2. newPath: every entry below the "..." line whose code column fails the
//     per-character lookup in hanPinyinMap (including entries with fewer
//     syllables than characters), or whose text contains one of wrongWords.
//
// A missing newPath is not an error: the function simply returns after
// processing dictPath. Any other I/O failure, including a scanner error,
// terminates the program through log.Fatal.
func makeFilterList(dictPath string, newPath string) {
	// Part 1: collect the manually maintained filter section of the current
	// dictionary.
	file, err := os.Open(dictPath)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	sc := bufio.NewScanner(file)
	inFilterSection := false
	for sc.Scan() {
		line := sc.Text()
		if line == mark {
			break
		}
		if !inFilterSection {
			if strings.Contains(line, fileterMark) {
				inFilterSection = true
			}
			continue
		}
		// Two shapes of filter entries are supported:
		//   "# word"              -> "word"
		//   "word<TAB>code<TAB>n" -> "word"
		if strings.HasPrefix(line, "# ") {
			// TrimPrefix removes exactly the "# " marker. TrimLeft would treat
			// "# " as a character set and also eat '#' or ' ' characters that
			// belong to the word itself.
			filterList.Add(strings.TrimPrefix(line, "# "))
		} else {
			filterList.Add(strings.Split(line, "\t")[0])
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	// Part 2: scan the new dictionary and filter out every problematic entry.
	newFile, err := os.Open(newPath)
	if err != nil {
		if os.IsNotExist(err) {
			return
		}
		log.Fatal(err)
	}
	defer newFile.Close()

	// The set of wrong words does not change while scanning, so materialize
	// it once instead of once per line.
	badWords := wrongWords.ToSlice()

	sc = bufio.NewScanner(newFile)
	inBody := false
	for sc.Scan() {
		line := sc.Text()
		// Only the lines below "..." are dictionary entries.
		if !inBody {
			if line == "..." {
				inBody = true
			}
			continue
		}
		if line == "" {
			continue
		}

		parts := strings.Split(line, "\t")
		text := parts[0]

		// Pronunciation check: pair every character with the syllable at the
		// same position ("拼音: pin yin" -> "拼: pin", "音: yin") and filter
		// the entry when a pair is not found in hanPinyinMap. Entries without
		// a code column cannot be checked and are left to the check below.
		if len(parts) > 1 {
			pinyins := strings.Split(parts[1], " ")
			i := 0
			for _, zi := range text {
				// Running out of syllables means the annotation is broken;
				// filter the entry rather than indexing out of range.
				if i >= len(pinyins) || !contains(hanPinyinMap[string(zi)], pinyins[i]) {
					filterList.Add(text)
					break
				}
				i++
			}
		}

		// Variant-form check: filter entries containing a known wrong word.
		for _, wrongWord := range badWords {
			if strings.Contains(text, wrongWord) {
				filterList.Add(text)
				break
			}
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}
// appendNewDict reads newPath line by line and appends every new entry to the
// end of dictPath, then prints the appended lines and their count.
//
// Only lines below the "..." line of newPath are considered. An entry is
// skipped when its text (the first tab-separated column) is already in
// BaseSet, SogouSet, MoegirlSet or filterList, or when the text is at most
// two runes long.
//
// A missing newPath is not an error: the function returns without writing
// anything. Any other I/O failure, including a scanner error, terminates the
// program through log.Fatal.
func appendNewDict(dictPath string, newPath string) {
	moegirlFile, err := os.OpenFile(dictPath, os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer moegirlFile.Close()

	newFile, err := os.Open(newPath)
	if err != nil {
		if os.IsNotExist(err) {
			return
		}
		log.Fatal(err)
	}
	defer newFile.Close()

	// Everything that must not be added again: base + sogou + moegirl + the
	// filter list.
	known := BaseSet.Union(SogouSet).Union(MoegirlSet).Union(filterList)

	// Lines that were actually appended, kept for the report below.
	var newWords []string

	sc := bufio.NewScanner(newFile)
	inBody := false
	for sc.Scan() {
		line := sc.Text()
		// Only the lines below "..." are dictionary entries.
		if !inBody {
			if line == "..." {
				inBody = true
			}
			continue
		}

		text := strings.Split(line, "\t")[0]
		if known.Contains(text) {
			continue
		}
		// Skip words of one or two characters.
		if utf8.RuneCountInString(text) <= 2 {
			continue
		}

		if _, err := moegirlFile.WriteString(line + "\n"); err != nil {
			log.Fatal(err)
		}
		newWords = append(newWords, line)
	}
	// A scanner stops silently on read errors or over-long lines; surface
	// that instead of reporting a partial append as complete.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	if err := moegirlFile.Sync(); err != nil {
		log.Fatal(err)
	}

	// Report the new words.
	fmt.Println("新增词汇:")
	for _, word := range newWords {
		fmt.Println(word)
	}
	fmt.Println("count: ", len(newWords))
}