rime-ice/others/script/rime/moegirl.go
Dvel fe4af3a86e 萌娘词库 (★≧▽^))★☆
commit da4acc764651ee2b5bb83ae9c4afb0e910f49ce9
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 02:00:30 2023 +0800

    萌萌哒

commit 0fb437d852e19b11a89cd2778e41f0252d216f2f
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 01:40:00 2023 +0800

    搞定了啦

commit 52d2fecc25424cd82eca876fb21276725c3891da
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 01:16:02 2023 +0800

    fix

commit 656162e42c879f914ca31a4cf049141efc7628de
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 00:59:29 2023 +0800

    基本逻辑完成

commit 8594874035a84016496b15357ebbd6ac54b52725
Author: Dvel <git@dvel.me>
Date:   Thu Feb 23 23:56:44 2023 +0800

    引入萌娘词库
2023-02-24 02:09:29 +08:00

121 lines
2.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rime
import (
"bufio"
"fmt"
mapset "github.com/deckarep/golang-set/v2"
"log"
"os"
"strings"
"time"
"unicode/utf8"
)
func UpdateMoegirl() {
// 使用和 UpdateSogou 一样的方法
filterList = mapset.NewSet[string]() // 重置过滤列表
// 控制台输出
defer updateVersion(MoegirlPath, getSha1(MoegirlPath))
defer printfTimeCost("更新萌娘百科", time.Now())
// 0. 准备好过滤列表
makeFilterList(MoegirlPath)
// 1. 下载新的萌娘词库(暂时手动操作)
newMoegirlFile := "/Users/dvel/Downloads/moegirl.dict.yaml"
// 2. 将新的词汇加入到末尾,并且打印新词
appendNewDict(MoegirlPath, newMoegirlFile)
}
func makeFilterList(dictPath string) {
file, err := os.Open(MoegirlPath)
if err != nil {
log.Fatal(err)
}
defer file.Close()
sc := bufio.NewScanner(file)
isFilterMark := false
for sc.Scan() {
line := sc.Text()
// 只读取 +_+ 和 *_* 之间的内容作为过滤列表
if line == mark {
break
}
if !isFilterMark {
if strings.Contains(line, fileterMark) {
isFilterMark = true
}
continue
}
// 过滤列表有两种情况:
// 【# 测试一】取【测试一】
// 【测试二 ce shi er 100】取【测试二】
if strings.HasPrefix(line, "# ") {
text := strings.TrimLeft(line, "# ")
filterList.Add(text)
} else {
text := strings.Split(line, "\t")[0]
filterList.Add(text)
}
}
}
func appendNewDict(dictPath string, newPath string) {
// 逐行读取 newPath有新词则加入到 dictPath 末尾
moegirlFile, err := os.OpenFile(dictPath, os.O_APPEND|os.O_WRONLY, 0644)
if err != nil {
log.Fatal(err)
}
defer moegirlFile.Close()
newFile, err := os.Open(newPath)
if err != nil {
log.Fatal(err)
}
defer newFile.Close()
// 需要过滤的: base+sogou+moegirl+过滤列表 filterList
set := BaseSet.Union(SogouSet).Union(MoegirlSet).Union(filterList)
// 新词列表
newWords := make([]string, 0)
sc := bufio.NewScanner(newFile)
isMark := false
for sc.Scan() {
line := sc.Text()
// 只读取 ... 这行以下的词汇
if !isMark {
if line == "..." {
isMark = true
}
continue
}
text := strings.Split(line, "\t")[0]
// 过滤
if set.Contains(text) {
continue
}
// 过滤两字词
if utf8.RuneCountInString(text) <= 2 {
continue
}
// 写入末尾
_, err := moegirlFile.WriteString(line + "\n")
if err != nil {
log.Fatal(err)
}
newWords = append(newWords, line)
}
err = moegirlFile.Sync()
if err != nil {
log.Fatal(err)
}
// 打印新词
fmt.Println("新增词汇:")
for _, word := range newWords {
fmt.Println(word)
}
fmt.Println("count: ", len(newWords))
}