萌娘词库 (★≧▽^))★☆
commit da4acc764651ee2b5bb83ae9c4afb0e910f49ce9 Author: Dvel <git@dvel.me> Date: Fri Feb 24 02:00:30 2023 +0800 萌萌哒 commit 0fb437d852e19b11a89cd2778e41f0252d216f2f Author: Dvel <git@dvel.me> Date: Fri Feb 24 01:40:00 2023 +0800 搞定了啦 commit 52d2fecc25424cd82eca876fb21276725c3891da Author: Dvel <git@dvel.me> Date: Fri Feb 24 01:16:02 2023 +0800 fix commit 656162e42c879f914ca31a4cf049141efc7628de Author: Dvel <git@dvel.me> Date: Fri Feb 24 00:59:29 2023 +0800 基本逻辑完成 commit 8594874035a84016496b15357ebbd6ac54b52725 Author: Dvel <git@dvel.me> Date: Thu Feb 23 23:56:44 2023 +0800 引入萌娘词库
This commit is contained in:
parent
5e9ee058cb
commit
fe4af3a86e
File diff suppressed because it is too large
Load Diff
154398
cn_dicts/moegirl.dict.yaml
154398
cn_dicts/moegirl.dict.yaml
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -26,6 +26,7 @@ func main() {
|
||||
|
||||
// 为 sogou、moegirl、ext、tencent 没权重的词条加上权重,有权重的改为下面设置的权重
|
||||
rime.AddWeight(rime.SogouPath, rime.DefaultWeight)
|
||||
rime.AddWeight(rime.MoegirlPath, rime.DefaultWeight)
|
||||
rime.AddWeight(rime.ExtPath, rime.DefaultWeight)
|
||||
rime.AddWeight(rime.TencentPath, rime.DefaultWeight)
|
||||
fmt.Println("--------------------------------------------------")
|
||||
@ -35,6 +36,7 @@ func main() {
|
||||
go rime.Check(rime.HanziPath, 3)
|
||||
go rime.Check(rime.BasePath, 3)
|
||||
go rime.Check(rime.SogouPath, 3)
|
||||
go rime.Check(rime.MoegirlPath, 3)
|
||||
go rime.Check(rime.ExtPath, 4)
|
||||
go rime.Check(rime.TencentPath, 4)
|
||||
|
||||
@ -44,8 +46,9 @@ func main() {
|
||||
rime.Sort(rime.HanziPath, 3)
|
||||
rime.Sort(rime.BasePath, 3)
|
||||
rime.Sort(rime.SogouPath, 3) // 对 base 中已经有的,去重
|
||||
rime.Sort(rime.ExtPath, 4) // 对 base、sogou 中已经有的,去重
|
||||
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、ext 中已经有的,去重
|
||||
rime.Sort(rime.MoegirlPath, 3) // 对 base、sogou 中已经有的,去重
|
||||
rime.Sort(rime.ExtPath, 4) // 对 base、sogou、moegirl 中已经有的,去重
|
||||
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、moegirl、ext 中已经有的,去重
|
||||
// rime.SortEnDict(rime.EnPath)
|
||||
}
|
||||
|
||||
|
120
others/script/rime/moegirl.go
Normal file
120
others/script/rime/moegirl.go
Normal file
@ -0,0 +1,120 @@
|
||||
package rime
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
mapset "github.com/deckarep/golang-set/v2"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func UpdateMoegirl() {
|
||||
// 使用和 UpdateSogou 一样的方法
|
||||
filterList = mapset.NewSet[string]() // 重置过滤列表
|
||||
|
||||
// 控制台输出
|
||||
defer updateVersion(MoegirlPath, getSha1(MoegirlPath))
|
||||
defer printfTimeCost("更新萌娘百科", time.Now())
|
||||
|
||||
// 0. 准备好过滤列表
|
||||
makeFilterList(MoegirlPath)
|
||||
// 1. 下载新的萌娘词库(暂时手动操作)
|
||||
newMoegirlFile := "/Users/dvel/Downloads/moegirl.dict.yaml"
|
||||
// 2. 将新的词汇加入到末尾,并且打印新词
|
||||
appendNewDict(MoegirlPath, newMoegirlFile)
|
||||
}
|
||||
|
||||
func makeFilterList(dictPath string) {
|
||||
file, err := os.Open(MoegirlPath)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
sc := bufio.NewScanner(file)
|
||||
isFilterMark := false
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
// 只读取 +_+ 和 *_* 之间的内容作为过滤列表
|
||||
if line == mark {
|
||||
break
|
||||
}
|
||||
if !isFilterMark {
|
||||
if strings.Contains(line, fileterMark) {
|
||||
isFilterMark = true
|
||||
}
|
||||
continue
|
||||
}
|
||||
// 过滤列表有两种情况:
|
||||
// 【# 测试一】取【测试一】
|
||||
// 【测试二 ce shi er 100】取【测试二】
|
||||
if strings.HasPrefix(line, "# ") {
|
||||
text := strings.TrimLeft(line, "# ")
|
||||
filterList.Add(text)
|
||||
} else {
|
||||
text := strings.Split(line, "\t")[0]
|
||||
filterList.Add(text)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func appendNewDict(dictPath string, newPath string) {
|
||||
// 逐行读取 newPath,有新词则加入到 dictPath 末尾
|
||||
moegirlFile, err := os.OpenFile(dictPath, os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer moegirlFile.Close()
|
||||
newFile, err := os.Open(newPath)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer newFile.Close()
|
||||
|
||||
// 需要过滤的: base+sogou+moegirl+过滤列表 filterList
|
||||
set := BaseSet.Union(SogouSet).Union(MoegirlSet).Union(filterList)
|
||||
// 新词列表
|
||||
newWords := make([]string, 0)
|
||||
|
||||
sc := bufio.NewScanner(newFile)
|
||||
isMark := false
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
// 只读取 ... 这行以下的词汇
|
||||
if !isMark {
|
||||
if line == "..." {
|
||||
isMark = true
|
||||
}
|
||||
continue
|
||||
}
|
||||
text := strings.Split(line, "\t")[0]
|
||||
// 过滤
|
||||
if set.Contains(text) {
|
||||
continue
|
||||
}
|
||||
// 过滤两字词
|
||||
if utf8.RuneCountInString(text) <= 2 {
|
||||
continue
|
||||
}
|
||||
// 写入末尾
|
||||
_, err := moegirlFile.WriteString(line + "\n")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
newWords = append(newWords, line)
|
||||
}
|
||||
err = moegirlFile.Sync()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// 打印新词
|
||||
fmt.Println("新增词汇:")
|
||||
for _, word := range newWords {
|
||||
fmt.Println(word)
|
||||
}
|
||||
fmt.Println("count: ", len(newWords))
|
||||
}
|
@ -6,6 +6,7 @@ import (
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
// 临时用的或一次性的方法集
|
||||
@ -52,6 +53,34 @@ func enDictsIntersect(dict1, dict2 string) {
|
||||
}
|
||||
}
|
||||
|
||||
// 处理一个 Rime 词库,去除掉它两个字及以下的词汇
|
||||
// processNewDict copies the Rime dictionary at dictPath to a fixed scratch
// file, dropping every entry whose word is two characters long or shorter.
//
// Only lines below the "..." separator line are considered; the header is not
// copied. The word is the text before the first tab. The output file is
// created or truncated on every run. Any I/O error terminates the program via
// log.Fatal instead of being silently ignored.
func processNewDict(dictPath string) {
	file, err := os.Open(dictPath)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	outFile, err := os.OpenFile("/Users/dvel/Downloads/1.dict.yaml", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer outFile.Close()

	sc := bufio.NewScanner(file)
	pastHeader := false
	for sc.Scan() {
		line := sc.Text()
		// Skip the YAML header, up to and including the "..." line.
		if !pastHeader {
			pastHeader = line == "..."
			continue
		}

		word, _, _ := strings.Cut(line, "\t")
		// Words of at most two runes are dropped (this also drops blank lines).
		if utf8.RuneCountInString(word) <= 2 {
			continue
		}
		if _, err := outFile.WriteString(line + "\n"); err != nil {
			log.Fatal(err)
		}
	}
	// Surface read errors and over-long lines rather than writing a silently
	// truncated result.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
	if err := outFile.Sync(); err != nil {
		log.Fatal(err)
	}
}
|
||||
|
||||
func get字表汉字拼音映射() {
|
||||
file, err := os.Open(HanziPath)
|
||||
if err != nil {
|
||||
|
@ -24,28 +24,30 @@ type lemma struct {
|
||||
|
||||
const (
|
||||
mark = "# +_+" // 词库中的标记符号,表示从开始检查或排序
|
||||
HanziPath = "/Users/dvel/Library/Rime/cn_dicts/8105.dict.yaml"
|
||||
BasePath = "/Users/dvel/Library/Rime/cn_dicts/base.dict.yaml"
|
||||
SogouPath = "/Users/dvel/Library/Rime/cn_dicts/sogou.dict.yaml"
|
||||
HanziPath = "/Users/dvel/Library/Rime/cn_dicts/8105.dict.yaml"
|
||||
BasePath = "/Users/dvel/Library/Rime/cn_dicts/base.dict.yaml"
|
||||
SogouPath = "/Users/dvel/Library/Rime/cn_dicts/sogou.dict.yaml"
|
||||
MoegirlPath = "/Users/dvel/Library/Rime/cn_dicts/moegirl.dict.yaml"
|
||||
ExtPath = "/Users/dvel/Library/Rime/cn_dicts/ext.dict.yaml"
|
||||
TencentPath = "/Users/dvel/Library/Rime/cn_dicts/tencent.dict.yaml"
|
||||
EmojiPath = "/Users/dvel/Library/Rime/opencc/emoji-map.txt"
|
||||
EnPath = "/Users/dvel/Library/Rime/en_dicts/en.dict.yaml"
|
||||
EnPath = "/Users/dvel/Library/Rime/en_dicts/en.dict.yaml"
|
||||
|
||||
DefaultWeight = 100 // sogou、ext、tencet 词库中默认的权重数值
|
||||
DefaultWeight = 100 // sogou、moegirl、ext、tencent 词库中默认的权重数值
|
||||
)
|
||||
|
||||
var (
|
||||
BaseSet mapset.Set[string]
|
||||
SogouSet mapset.Set[string]
|
||||
ExtSet mapset.Set[string]
|
||||
TencentSet mapset.Set[string]
|
||||
SogouSetWithCode mapset.Set[string]
|
||||
BaseSet mapset.Set[string]
|
||||
SogouSet mapset.Set[string]
|
||||
MoegirlSet mapset.Set[string]
|
||||
ExtSet mapset.Set[string]
|
||||
TencentSet mapset.Set[string]
|
||||
)
|
||||
|
||||
func init() {
|
||||
BaseSet = readToSet(BasePath)
|
||||
SogouSet = readToSet(SogouPath)
|
||||
MoegirlSet = readToSet(MoegirlPath)
|
||||
ExtSet = readToSet(ExtPath)
|
||||
TencentSet = readToSet(TencentPath)
|
||||
}
|
||||
@ -116,12 +118,14 @@ func getSha1(dictPath string) string {
|
||||
// updateVersion 排序后,如果文件有改动,则修改 version 日期
|
||||
func updateVersion(dictPath string, oldSha1 string) {
|
||||
// 判断文件是否有改变
|
||||
newSha1 := getSha1(dictPath)
|
||||
if newSha1 == oldSha1 {
|
||||
fmt.Println()
|
||||
return
|
||||
if dictPath != MoegirlPath {
|
||||
newSha1 := getSha1(dictPath)
|
||||
if newSha1 == oldSha1 {
|
||||
fmt.Println()
|
||||
return
|
||||
}
|
||||
fmt.Println(" ...sorted")
|
||||
}
|
||||
fmt.Println(" ...sorted")
|
||||
|
||||
// 打开文件
|
||||
file, err := os.OpenFile(dictPath, os.O_RDWR, 0644)
|
||||
|
@ -25,10 +25,10 @@ func UpdateSogou() {
|
||||
defer updateVersion(SogouPath, getSha1(SogouPath))
|
||||
defer printfTimeCost("更新搜狗流行词", time.Now())
|
||||
|
||||
makeFilterList() // 0. 准备好过滤词列表
|
||||
downloadSogou() // 1. 下载搜狗流行词加入到文件末尾
|
||||
checkAndWrite() // 2. 过滤、去重、排序
|
||||
PrintNewWords() // 3. 打印新增词汇
|
||||
makeSogouFilterList() // 0. 准备好过滤词列表
|
||||
downloadSogou() // 1. 下载搜狗流行词加入到文件末尾
|
||||
checkAndWrite() // 2. 过滤、去重、排序
|
||||
PrintNewWords() // 3. 打印新增词汇
|
||||
|
||||
// 弄完了删除临时用的文件,否则 VSCode 全局搜索词汇时会搜索到,影响体验
|
||||
err := os.Remove("./scel2txt/scel/sogou.scel")
|
||||
@ -45,8 +45,8 @@ func UpdateSogou() {
|
||||
}
|
||||
}
|
||||
|
||||
// makeFilterList 准备好过滤词列表 filterList
|
||||
func makeFilterList() {
|
||||
// 准备好过滤词列表 filterList
|
||||
func makeSogouFilterList() {
|
||||
file, err := os.Open(SogouPath)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
|
@ -147,15 +147,17 @@ func Sort(dictPath string, flag int) {
|
||||
}
|
||||
|
||||
// 其他词库需要从一个或多个词库中去重后再写入
|
||||
if dictPath == SogouPath || dictPath == ExtPath || dictPath == TencentPath {
|
||||
if contains([]string{SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
|
||||
var intersect mapset.Set[string] // 交集,有交集的就是重复的,去掉
|
||||
switch dictPath {
|
||||
case SogouPath: // sogou 不和 base 有重复
|
||||
intersect = SogouSet.Intersect(BaseSet)
|
||||
case ExtPath: // ext 不和 base+sogou 有重复
|
||||
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet))
|
||||
case TencentPath:
|
||||
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(ExtSet))
|
||||
case MoegirlPath: // moegirl 不和 base+sogou 有重复
|
||||
intersect = MoegirlSet.Intersect(BaseSet.Union(SogouSet))
|
||||
case ExtPath: // ext 不和 base+sogou+moegirl 有重复
|
||||
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet))
|
||||
case TencentPath: // tencent 不和 base+sogou+moegirl+ext 有重复
|
||||
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet).Union(ExtSet))
|
||||
}
|
||||
|
||||
for _, line := range contents {
|
||||
@ -177,7 +179,7 @@ func Sort(dictPath string, flag int) {
|
||||
}
|
||||
|
||||
// 外部词库或临时文件,只排序,不去重
|
||||
if !contains([]string{HanziPath, BasePath, SogouPath, ExtPath, TencentPath}, dictPath) {
|
||||
if !contains([]string{HanziPath, BasePath, SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
|
||||
switch flag {
|
||||
case 1:
|
||||
for _, line := range contents {
|
||||
|
Loading…
x
Reference in New Issue
Block a user