删除萌娘百科词库

This commit is contained in:
Dvel 2023-03-04 13:25:37 +08:00
parent 66cc744332
commit 6998278741
10 changed files with 9512 additions and 76565 deletions

View File

@ -247374,6 +247374,7 @@ sort: by_weight
洛神花茶 luo shen hua cha 218
锣声 luo sheng 2600
落生 luo sheng 1
洛圣都 luo sheng du 100
罗盛教 luo sheng jiao 136
罗生门 luo sheng men 1810
落实 luo shi 500563
@ -304178,6 +304179,7 @@ sort: by_weight
青田县 qing tian xian 4855
庆天香 qing tian xiang 1
请填写 qing tian xie 239940
清田信长 qing tian xin chang 100
青天削出金芙蓉 qing tian xue chu jin fu rong 4
擎天一柱 qing tian yi zhu 515
青天有月来几时 qing tian you yue lai ji shi 4
@ -308116,6 +308118,7 @@ sort: by_weight
圈地围栏网 quan di wei lan wang 111
圈地运动 quan di yun dong 4645
圈地战 quan di zhan 95
圈地自萌 quan di zi meng 100
圈点 quan dian 4090
全垫升式气垫船 quan dian sheng shi qi dian chuan 1180
圈定 quan ding 9005
@ -346792,6 +346795,7 @@ sort: by_weight
首都机场集团 shou du ji chang ji tuan 111
首都机场集团公司 shou du ji chang ji tuan gong si 111
首都机场派出所 shou du ji chang pai chu suo 272
首都机场线 shou du ji chang xian 100
首都吉隆坡 shou du ji long po 111
首都健康 shou du jian kang 111
首都建设 shou du jian she 111
@ -428719,6 +428723,7 @@ sort: by_weight
新场 xin chang 3970
信昌 xin chang 1540
心肠歹毒 xin chang dai du 170
信长的忍者 xin chang de ren zhe 100
新长发 xin chang fa 10
新长发糖炒栗子 xin chang fa tang chao li zi 40
新昌胡同 xin chang hu tong 2
@ -428734,10 +428739,12 @@ sort: by_weight
新昌县 xin chang xian 5780
新昌县司法局 xin chang xian si fa ju 26
新昌小区 xin chang xiao qu 2
信长协奏曲 xin chang xie zou qu 100
信长野望 xin chang ye wang 100
心肠硬 xin chang ying 265
新长征 xin chang zheng 7080
新长征花苑 xin chang zheng hua yuan 3
信长之野望 xin chang zhi ye wang 111
新潮 xin chao 43280
心潮 xin chao 4015
心潮翻腾 xin chao fan teng 15
@ -460878,6 +460885,7 @@ sort: by_weight
已收录 yi shou lu 2005
益寿路 yi shou lu 28
异兽录 yi shou lu 1
异兽魔都 yi shou mo du 100
易守难攻 yi shou nan gong 3295
益寿宁 yi shou ning 1
一受其成形 yi shou qi cheng xing 1

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -23,13 +23,8 @@ func main() {
rime.UpdateSogou()
fmt.Println("--------------------------------------------------")
// 更新萌娘百科
rime.UpdateMoegirl()
fmt.Println("--------------------------------------------------")
// 为 sogou、ext、tencent 没权重的词条加上权重,有权重的改为下面设置的权重
rime.AddWeight(rime.SogouPath, rime.DefaultWeight)
rime.AddWeight(rime.MoegirlPath, rime.DefaultWeight)
rime.AddWeight(rime.ExtPath, rime.DefaultWeight)
rime.AddWeight(rime.TencentPath, rime.DefaultWeight)
fmt.Println("--------------------------------------------------")
@ -39,7 +34,6 @@ func main() {
go rime.Check(rime.HanziPath, 3)
go rime.Check(rime.BasePath, 3)
go rime.Check(rime.SogouPath, 3)
go rime.Check(rime.MoegirlPath, 3)
go rime.Check(rime.ExtPath, 4)
go rime.Check(rime.TencentPath, 4)
@ -49,9 +43,8 @@ func main() {
rime.Sort(rime.HanziPath, 3)
rime.Sort(rime.BasePath, 3)
rime.Sort(rime.SogouPath, 3) // 对 base 中已经有的,去重
rime.Sort(rime.MoegirlPath, 3) // 对 base、sogou 中已经有的,去重
rime.Sort(rime.ExtPath, 4) // 对 base、sogou、moegirl 中已经有的,去重
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、moegirl、ext 中已经有的,去重
rime.Sort(rime.ExtPath, 4) // 对 base、sogou 中已经有的,去重
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、ext 中已经有的,去重
// rime.SortEnDict(rime.EnPath)
}

View File

@ -1,164 +0,0 @@
package rime
import (
"bufio"
"fmt"
mapset "github.com/deckarep/golang-set/v2"
"log"
"os"
"strings"
"time"
"unicode/utf8"
)
// UpdateMoegirl imports new entries from a freshly downloaded Moegirl
// dictionary into the dictionary at MoegirlPath. It follows the same
// two-step approach as UpdateSogou: build the filter list, then append
// whatever survives the filter.
//
// The elapsed time is reported through printlnTimeCost when the function
// returns.
func UpdateMoegirl() {
	// Start from an empty filter list so nothing leaks in from a previous
	// update run.
	filterList = mapset.NewSet[string]()

	// Console output: report how long the whole update took.
	startedAt := time.Now()
	defer printlnTimeCost("更新萌娘百科", startedAt)

	// Step 0: the new Moegirl dictionary is downloaded by hand for now, so
	// its location is fixed.
	const downloaded = "/Users/dvel/Downloads/moegirl.dict.yaml"

	// Step 1: collect the words that must not be imported.
	makeFilterList(MoegirlPath, downloaded)

	// Step 2: append the remaining new words to the end of the dictionary
	// and print them.
	appendNewDict(MoegirlPath, downloaded)
}
// makeFilterList adds to the package-level filterList every word that must
// not be imported from newPath into dictPath.
//
// Two sources feed the list:
//
//  1. dictPath: the lines after the first line containing fileterMark, up to
//     (but not including) the first line equal to mark. A line of the form
//     "# word" contributes "word"; any other line contributes its first
//     tab-separated field.
//  2. newPath: the entries below the "..." line that look wrong, i.e. a
//     character's pinyin is not listed in hanPinyinMap, the entry has fewer
//     pinyin syllables than characters, the entry has no code column at all,
//     or the text contains one of wrongWords.
//
// A missing newPath is not an error; the function simply returns after
// processing dictPath. Any other I/O error terminates the program via
// log.Fatal.
func makeFilterList(dictPath string, newPath string) {
	file, err := os.Open(dictPath)
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	sc := bufio.NewScanner(file)
	isFilterMark := false
	for sc.Scan() {
		line := sc.Text()
		// The end marker is checked first, so it stops the scan even if the
		// filter marker was never seen.
		if line == mark {
			break
		}
		if !isFilterMark {
			if strings.Contains(line, fileterMark) {
				isFilterMark = true
			}
			continue
		}
		// Filter entries come in two shapes:
		//   "# 测试一"            -> "测试一"
		//   "测试二\tce shi er\t100" -> "测试二"
		if strings.HasPrefix(line, "# ") {
			// TrimLeft treats "# " as a character set, so every leading '#'
			// and space is stripped, not just one "# " prefix.
			filterList.Add(strings.TrimLeft(line, "# "))
		} else {
			text, _, _ := strings.Cut(line, "\t")
			filterList.Add(text)
		}
	}
	// Scan returning false may mean a read error rather than EOF.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	// Read the new dictionary; problematic entries are simply dropped by
	// putting them on the filter list.
	newFile, err := os.Open(newPath)
	if err != nil {
		if os.IsNotExist(err) {
			return
		}
		log.Fatal(err)
	}
	defer newFile.Close()

	// The wrong-word list does not change while scanning, so build the slice
	// once instead of once per line.
	wrongList := wrongWords.ToSlice()

	sc = bufio.NewScanner(newFile)
	isMark := false
	for sc.Scan() {
		line := sc.Text()
		// Only the lines below "..." are dictionary entries.
		if !isMark {
			if line == "..." {
				isMark = true
			}
			continue
		}

		parts := strings.Split(line, "\t")
		text := parts[0]
		if len(parts) < 2 {
			// No code column (blank line, comment, malformed entry). Such a
			// line cannot be validated, so filter it out instead of
			// indexing past the end of parts.
			if text != "" {
				filterList.Add(text)
			}
			continue
		}

		// Filter out entries with wrong pinyin: pair each character with the
		// syllable at the same position, "拼音: pin yin" -> "拼: pin", "音: yin".
		pinyins := strings.Split(parts[1], " ")
		i := 0
		for _, zi := range text {
			// Running out of syllables is itself an annotation problem and
			// must not index past the end of pinyins.
			if i >= len(pinyins) || !contains(hanPinyinMap[string(zi)], pinyins[i]) {
				filterList.Add(text)
				break
			}
			i++
		}

		// Filter out entries that contain a known wrong variant form.
		for _, wrongWord := range wrongList {
			if strings.Contains(text, wrongWord) {
				filterList.Add(text)
				break
			}
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}
}
// appendNewDict reads newPath line by line and appends every genuinely new
// entry to the end of dictPath, then prints the appended entries and their
// count.
//
// Only lines below the "..." line of newPath are considered. An entry is
// skipped when its text (the first tab-separated field) is already present in
// BaseSet, SogouSet, MoegirlSet or filterList, or when the text is two
// characters or shorter (which also skips blank lines).
//
// A missing newPath is not an error; the function returns without writing.
// Any other I/O error terminates the program via log.Fatal.
func appendNewDict(dictPath string, newPath string) {
	dictFile, err := os.OpenFile(dictPath, os.O_APPEND|os.O_WRONLY, 0644)
	if err != nil {
		log.Fatal(err)
	}
	defer dictFile.Close()

	newFile, err := os.Open(newPath)
	if err != nil {
		if os.IsNotExist(err) {
			return
		}
		log.Fatal(err)
	}
	defer newFile.Close()

	// Everything that must be filtered out: base + sogou + moegirl + filterList.
	set := BaseSet.Union(SogouSet).Union(MoegirlSet).Union(filterList)

	// Lines that were actually appended, kept for the report at the end.
	var newWords []string

	sc := bufio.NewScanner(newFile)
	isMark := false
	for sc.Scan() {
		line := sc.Text()
		// Only read the entries below the "..." line.
		if !isMark {
			if line == "..." {
				isMark = true
			}
			continue
		}

		text, _, _ := strings.Cut(line, "\t")
		// Skip known or filtered words.
		if set.Contains(text) {
			continue
		}
		// Skip words of two characters or fewer.
		if utf8.RuneCountInString(text) <= 2 {
			continue
		}

		// Append to the end of the dictionary.
		if _, err := dictFile.WriteString(line + "\n"); err != nil {
			log.Fatal(err)
		}
		newWords = append(newWords, line)
	}
	// Scan returning false may mean a read error rather than EOF; do not
	// report a partial import as a success.
	if err := sc.Err(); err != nil {
		log.Fatal(err)
	}

	if err := dictFile.Sync(); err != nil {
		log.Fatal(err)
	}

	// Print the new words.
	fmt.Println("新增词汇:")
	for _, word := range newWords {
		fmt.Println(word)
	}
	fmt.Println("count: ", len(newWords))
}

View File

@ -27,19 +27,17 @@ const (
HanziPath = "/Users/dvel/Library/Rime/cn_dicts/8105.dict.yaml"
BasePath = "/Users/dvel/Library/Rime/cn_dicts/base.dict.yaml"
SogouPath = "/Users/dvel/Library/Rime/cn_dicts/sogou.dict.yaml"
MoegirlPath = "/Users/dvel/Library/Rime/cn_dicts/moegirl.dict.yaml"
ExtPath = "/Users/dvel/Library/Rime/cn_dicts/ext.dict.yaml"
TencentPath = "/Users/dvel/Library/Rime/cn_dicts/tencent.dict.yaml"
EmojiPath = "/Users/dvel/Library/Rime/opencc/emoji-map.txt"
EnPath = "/Users/dvel/Library/Rime/en_dicts/en.dict.yaml"
DefaultWeight = 100 // sogou、moegirl、ext、tencet 词库中默认的权重数值
DefaultWeight = 100 // sogou、ext、tencet 词库中默认的权重数值
)
var (
BaseSet mapset.Set[string]
SogouSet mapset.Set[string]
MoegirlSet mapset.Set[string]
ExtSet mapset.Set[string]
TencentSet mapset.Set[string]
)
@ -47,7 +45,6 @@ var (
func init() {
BaseSet = readToSet(BasePath)
SogouSet = readToSet(SogouPath)
MoegirlSet = readToSet(MoegirlPath)
ExtSet = readToSet(ExtPath)
TencentSet = readToSet(TencentPath)
}

View File

@ -147,17 +147,15 @@ func Sort(dictPath string, flag int) {
}
// 其他词库需要从一个或多个词库中去重后再写入
if contains([]string{SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
if contains([]string{SogouPath, ExtPath, TencentPath}, dictPath) {
var intersect mapset.Set[string] // 交集,有交集的就是重复的,去掉
switch dictPath {
case SogouPath: // sogou 不和 base 有重复
case SogouPath:
intersect = SogouSet.Intersect(BaseSet)
case MoegirlPath: // moegirl 不和 base+sogou 有重复
intersect = MoegirlSet.Intersect(BaseSet.Union(SogouSet))
case ExtPath: // ext 不和 base+sogou+moegirl 有重复
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet))
case TencentPath: // tencent 不和 base+sogou+moegirl+ext 有重复
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet).Union(ExtSet))
case ExtPath:
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet))
case TencentPath:
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(ExtSet))
}
for _, line := range contents {
@ -166,7 +164,7 @@ func Sort(dictPath string, flag int) {
continue
}
str := ""
if flag == 3 { // sogou moegirl
if flag == 3 { // sogou
str = line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n"
} else if flag == 4 { // ext tencent
str = line.text + "\t" + strconv.Itoa(line.weight) + "\n"
@ -179,7 +177,7 @@ func Sort(dictPath string, flag int) {
}
// 外部词库或临时文件,只排序,不去重
if !contains([]string{HanziPath, BasePath, SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
if !contains([]string{HanziPath, BasePath, SogouPath, ExtPath, TencentPath}, dictPath) {
switch flag {
case 1:
for _, line := range contents {

View File

@ -1,5 +1,6 @@
截至到
截至日期
截至时间
合壁
飞行旗
做月子

View File

@ -3,13 +3,15 @@
---
name: rime_ice
version: "2023-02-26"
version: "2023-03-04"
import_tables:
- cn_dicts/8105 # 字表
- cn_dicts/base # 基础词库
- cn_dicts/sogou # 搜狗流行词
- cn_dicts/moegirl # 萌娘百科
- cn_dicts/ext # 扩展词库
- cn_dicts/tencent # 腾讯词向量(大词库,部署时间较长)
- cn_dicts/others # 一些杂项
# 建议把扩展词库放到下面,有重复词条时,最上面的权重生效
# - cn_dicts/my_dict
...