萌娘词库 (★≧▽^))★☆

commit da4acc764651ee2b5bb83ae9c4afb0e910f49ce9
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 02:00:30 2023 +0800

    萌萌哒

commit 0fb437d852e19b11a89cd2778e41f0252d216f2f
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 01:40:00 2023 +0800

    搞定了啦

commit 52d2fecc25424cd82eca876fb21276725c3891da
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 01:16:02 2023 +0800

    fix

commit 656162e42c879f914ca31a4cf049141efc7628de
Author: Dvel <git@dvel.me>
Date:   Fri Feb 24 00:59:29 2023 +0800

    基本逻辑完成

commit 8594874035a84016496b15357ebbd6ac54b52725
Author: Dvel <git@dvel.me>
Date:   Thu Feb 23 23:56:44 2023 +0800

    引入萌娘词库
This commit is contained in:
Dvel 2023-02-24 02:09:29 +08:00
parent 5e9ee058cb
commit fe4af3a86e
9 changed files with 77772 additions and 86383 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -26,6 +26,7 @@ func main() {
// 为 sogou、ext、tencent 没权重的词条加上权重,有权重的改为下面设置的权重
rime.AddWeight(rime.SogouPath, rime.DefaultWeight)
rime.AddWeight(rime.MoegirlPath, rime.DefaultWeight)
rime.AddWeight(rime.ExtPath, rime.DefaultWeight)
rime.AddWeight(rime.TencentPath, rime.DefaultWeight)
fmt.Println("--------------------------------------------------")
@ -35,6 +36,7 @@ func main() {
go rime.Check(rime.HanziPath, 3)
go rime.Check(rime.BasePath, 3)
go rime.Check(rime.SogouPath, 3)
go rime.Check(rime.MoegirlPath, 3)
go rime.Check(rime.ExtPath, 4)
go rime.Check(rime.TencentPath, 4)
@ -44,8 +46,9 @@ func main() {
rime.Sort(rime.HanziPath, 3)
rime.Sort(rime.BasePath, 3)
rime.Sort(rime.SogouPath, 3) // 对 base 中已经有的,去重
rime.Sort(rime.ExtPath, 4) // 对 base、sogou 中已经有的,去重
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、ext 中已经有的,去重
rime.Sort(rime.MoegirlPath, 3) // 对 base、sogou 中已经有的,去重
rime.Sort(rime.ExtPath, 4) // 对 base、sogou、moegirl 中已经有的,去重
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、moegirl、ext 中已经有的,去重
// rime.SortEnDict(rime.EnPath)
}

View File

@ -0,0 +1,120 @@
package rime
import (
"bufio"
"fmt"
mapset "github.com/deckarep/golang-set/v2"
"log"
"os"
"strings"
"time"
"unicode/utf8"
)
// UpdateMoegirl merges new entries from a manually downloaded Moegirl
// dictionary into the dictionary at MoegirlPath, following the same overall
// approach as UpdateSogou.
func UpdateMoegirl() {
	// Start from an empty filter list.
	filterList = mapset.NewSet[string]()

	// Capture the pre-update checksum and the start time now; both are
	// consumed by the deferred reporting below once the update has finished.
	oldSha1 := getSha1(MoegirlPath)
	startedAt := time.Now()
	defer func() {
		// Console output: elapsed time first, then the version update.
		printfTimeCost("更新萌娘百科", startedAt)
		updateVersion(MoegirlPath, oldSha1)
	}()

	// 0. Collect the filter list.
	makeFilterList(MoegirlPath)

	// 1. The new Moegirl dictionary is downloaded by hand for now.
	downloadedDict := "/Users/dvel/Downloads/moegirl.dict.yaml"

	// 2. Append unseen entries to the end of the dictionary and print them.
	appendNewDict(MoegirlPath, downloadedDict)
}
func makeFilterList(dictPath string) {
file, err := os.Open(MoegirlPath)
if err != nil {
log.Fatal(err)
}
defer file.Close()
sc := bufio.NewScanner(file)
isFilterMark := false
for sc.Scan() {
line := sc.Text()
// 只读取 +_+ 和 *_* 之间的内容作为过滤列表
if line == mark {
break
}
if !isFilterMark {
if strings.Contains(line, fileterMark) {
isFilterMark = true
}
continue
}
// 过滤列表有两种情况:
// 【# 测试一】取【测试一】
// 【测试二 ce shi er 100】取【测试二】
if strings.HasPrefix(line, "# ") {
text := strings.TrimLeft(line, "# ")
filterList.Add(text)
} else {
text := strings.Split(line, "\t")[0]
filterList.Add(text)
}
}
}
// appendNewDict reads the dictionary at newPath line by line and appends every
// entry that is considered new to the end of the dictionary at dictPath, then
// prints the appended lines and their count.
//
// Lines up to and including the first line equal to "..." are skipped. For the
// remaining lines the text before the first tab is the word; a line is skipped
// when the word is already in BaseSet, SogouSet, MoegirlSet or filterList, or
// when the word is two runes long or shorter.
//
// The process is terminated with log.Fatal on any open, read, write or sync
// error.
func appendNewDict(dictPath string, newPath string) {
	moegirlFile, err := os.OpenFile(dictPath, os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer moegirlFile.Close()

	newFile, err := os.Open(newPath)
	if err != nil {
		log.Fatal(err)
	}
	defer newFile.Close()

	// Everything that must not be added again: base + sogou + moegirl + filterList.
	known := BaseSet.Union(SogouSet).Union(MoegirlSet).Union(filterList)

	// Lines that were appended, kept for the report at the end.
	var newWords []string

	sc := bufio.NewScanner(newFile)
	pastHeader := false
	for sc.Scan() {
		line := sc.Text()

		// Entries start below the "..." line.
		if !pastHeader {
			if line == "..." {
				pastHeader = true
			}
			continue
		}

		text, _, _ := strings.Cut(line, "\t")

		// Already known or explicitly filtered.
		if known.Contains(text) {
			continue
		}
		// Drop words of two characters or fewer (this also drops blank lines).
		if utf8.RuneCountInString(text) <= 2 {
			continue
		}

		if _, err := moegirlFile.WriteString(line + "\n"); err != nil {
			log.Fatal(err)
		}
		newWords = append(newWords, line)
	}

	// Without this check a read failure would look like a short but
	// successful import.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	if err := moegirlFile.Sync(); err != nil {
		log.Fatal(err)
	}

	// Report what was added.
	fmt.Println("新增词汇:")
	for _, word := range newWords {
		fmt.Println(word)
	}
	fmt.Println("count: ", len(newWords))
}

View File

@ -6,6 +6,7 @@ import (
"log"
"os"
"strings"
"unicode/utf8"
)
// 临时用的或一次性的方法集
@ -52,6 +53,34 @@ func enDictsIntersect(dict1, dict2 string) {
}
}
// processNewDict copies the Rime dictionary at dictPath to
// /Users/dvel/Downloads/1.dict.yaml, dropping every entry whose word is two
// runes long or shorter.
//
// Lines up to and including the first line equal to "..." are not copied. The
// output file is created if missing and truncated otherwise. The process is
// terminated with log.Fatal on any open, read, write or sync error.
func processNewDict(dictPath string) {
	file, err := os.Open(dictPath)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	outFile, err := os.OpenFile("/Users/dvel/Downloads/1.dict.yaml", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer outFile.Close()

	sc := bufio.NewScanner(file)
	pastHeader := false
	for sc.Scan() {
		line := sc.Text()

		// Entries start below the "..." line.
		if !pastHeader {
			if line == "..." {
				pastHeader = true
			}
			continue
		}

		// The word is the text before the first tab.
		text, _, _ := strings.Cut(line, "\t")
		if utf8.RuneCountInString(text) <= 2 {
			continue
		}

		if _, err := outFile.WriteString(line + "\n"); err != nil {
			log.Fatal(err)
		}
	}

	// Surface read failures instead of leaving a silently truncated output.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	if err := outFile.Sync(); err != nil {
		log.Fatal(err)
	}
}
func get字表汉字拼音映射() {
file, err := os.Open(HanziPath)
if err != nil {

View File

@ -24,28 +24,30 @@ type lemma struct {
const (
mark = "# +_+" // 词库中的标记符号,表示从开始检查或排序
HanziPath = "/Users/dvel/Library/Rime/cn_dicts/8105.dict.yaml"
BasePath = "/Users/dvel/Library/Rime/cn_dicts/base.dict.yaml"
SogouPath = "/Users/dvel/Library/Rime/cn_dicts/sogou.dict.yaml"
HanziPath = "/Users/dvel/Library/Rime/cn_dicts/8105.dict.yaml"
BasePath = "/Users/dvel/Library/Rime/cn_dicts/base.dict.yaml"
SogouPath = "/Users/dvel/Library/Rime/cn_dicts/sogou.dict.yaml"
MoegirlPath = "/Users/dvel/Library/Rime/cn_dicts/moegirl.dict.yaml"
ExtPath = "/Users/dvel/Library/Rime/cn_dicts/ext.dict.yaml"
TencentPath = "/Users/dvel/Library/Rime/cn_dicts/tencent.dict.yaml"
EmojiPath = "/Users/dvel/Library/Rime/opencc/emoji-map.txt"
EnPath = "/Users/dvel/Library/Rime/en_dicts/en.dict.yaml"
EnPath = "/Users/dvel/Library/Rime/en_dicts/en.dict.yaml"
DefaultWeight = 100 // sogou、ext、tencet 词库中默认的权重数值
DefaultWeight = 100 // sogou、moegirl、ext、tencent 词库中默认的权重数值
)
var (
BaseSet mapset.Set[string]
SogouSet mapset.Set[string]
ExtSet mapset.Set[string]
TencentSet mapset.Set[string]
SogouSetWithCode mapset.Set[string]
BaseSet mapset.Set[string]
SogouSet mapset.Set[string]
MoegirlSet mapset.Set[string]
ExtSet mapset.Set[string]
TencentSet mapset.Set[string]
)
// init fills the package-level word sets at program start by passing each
// dictionary path to readToSet.
// NOTE(review): readToSet is defined elsewhere; presumably it reads the file
// and returns the set of its words — confirm, including how it handles a
// missing file, since this runs before main.
func init() {
BaseSet = readToSet(BasePath)
SogouSet = readToSet(SogouPath)
MoegirlSet = readToSet(MoegirlPath)
ExtSet = readToSet(ExtPath)
TencentSet = readToSet(TencentPath)
}
@ -116,12 +118,14 @@ func getSha1(dictPath string) string {
// updateVersion 排序后,如果文件有改动,则修改 version 日期
func updateVersion(dictPath string, oldSha1 string) {
// 判断文件是否有改变
newSha1 := getSha1(dictPath)
if newSha1 == oldSha1 {
fmt.Println()
return
if dictPath != MoegirlPath {
newSha1 := getSha1(dictPath)
if newSha1 == oldSha1 {
fmt.Println()
return
}
fmt.Println(" ...sorted")
}
fmt.Println(" ...sorted")
// 打开文件
file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)

View File

@ -25,10 +25,10 @@ func UpdateSogou() {
defer updateVersion(SogouPath, getSha1(SogouPath))
defer printfTimeCost("更新搜狗流行词", time.Now())
makeFilterList() // 0. 准备好过滤词列表
downloadSogou() // 1. 下载搜狗流行词加入到文件末尾
checkAndWrite() // 2. 过滤、去重、排序
PrintNewWords() // 3. 打印新增词汇
makeSogouFilterList() // 0. 准备好过滤词列表
downloadSogou() // 1. 下载搜狗流行词加入到文件末尾
checkAndWrite() // 2. 过滤、去重、排序
PrintNewWords() // 3. 打印新增词汇
// 弄完了删除临时用的文件,否则 VSCode 全局搜索词汇时会搜索到,影响体验
err := os.Remove("./scel2txt/scel/sogou.scel")
@ -45,8 +45,8 @@ func UpdateSogou() {
}
}
// makeFilterList 准备好过滤词列表 filterList
func makeFilterList() {
// 准备好过滤词列表 filterList
func makeSogouFilterList() {
file, err := os.Open(SogouPath)
if err != nil {
log.Fatal(err)

View File

@ -147,15 +147,17 @@ func Sort(dictPath string, flag int) {
}
// 其他词库需要从一个或多个词库中去重后再写入
if dictPath == SogouPath || dictPath == ExtPath || dictPath == TencentPath {
if contains([]string{SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
var intersect mapset.Set[string] // 交集,有交集的就是重复的,去掉
switch dictPath {
case SogouPath: // sogou 不和 base 有重复
intersect = SogouSet.Intersect(BaseSet)
case ExtPath: // ext 不和 base+sogou 有重复
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet))
case TencentPath:
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(ExtSet))
case MoegirlPath: // moegirl 不和 base+sogou 有重复
intersect = MoegirlSet.Intersect(BaseSet.Union(SogouSet))
case ExtPath: // ext 不和 base+sogou+moegirl 有重复
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet))
case TencentPath: // tencent 不和 base+sogou+moegirl+ext 有重复
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet).Union(ExtSet))
}
for _, line := range contents {
@ -177,7 +179,7 @@ func Sort(dictPath string, flag int) {
}
// 外部词库或临时文件,只排序,不去重
if !contains([]string{HanziPath, BasePath, SogouPath, ExtPath, TencentPath}, dictPath) {
if !contains([]string{HanziPath, BasePath, SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
switch flag {
case 1:
for _, line := range contents {