删除萌娘百科词库
This commit is contained in:
parent
66cc744332
commit
6998278741
@ -247374,6 +247374,7 @@ sort: by_weight
|
||||
洛神花茶 luo shen hua cha 218
|
||||
锣声 luo sheng 2600
|
||||
落生 luo sheng 1
|
||||
洛圣都 luo sheng du 100
|
||||
罗盛教 luo sheng jiao 136
|
||||
罗生门 luo sheng men 1810
|
||||
落实 luo shi 500563
|
||||
@ -304178,6 +304179,7 @@ sort: by_weight
|
||||
青田县 qing tian xian 4855
|
||||
庆天香 qing tian xiang 1
|
||||
请填写 qing tian xie 239940
|
||||
清田信长 qing tian xin chang 100
|
||||
青天削出金芙蓉 qing tian xue chu jin fu rong 4
|
||||
擎天一柱 qing tian yi zhu 515
|
||||
青天有月来几时 qing tian you yue lai ji shi 4
|
||||
@ -308116,6 +308118,7 @@ sort: by_weight
|
||||
圈地围栏网 quan di wei lan wang 111
|
||||
圈地运动 quan di yun dong 4645
|
||||
圈地战 quan di zhan 95
|
||||
圈地自萌 quan di zi meng 100
|
||||
圈点 quan dian 4090
|
||||
全垫升式气垫船 quan dian sheng shi qi dian chuan 1180
|
||||
圈定 quan ding 9005
|
||||
@ -346792,6 +346795,7 @@ sort: by_weight
|
||||
首都机场集团 shou du ji chang ji tuan 111
|
||||
首都机场集团公司 shou du ji chang ji tuan gong si 111
|
||||
首都机场派出所 shou du ji chang pai chu suo 272
|
||||
首都机场线 shou du ji chang xian 100
|
||||
首都吉隆坡 shou du ji long po 111
|
||||
首都健康 shou du jian kang 111
|
||||
首都建设 shou du jian she 111
|
||||
@ -428719,6 +428723,7 @@ sort: by_weight
|
||||
新场 xin chang 3970
|
||||
信昌 xin chang 1540
|
||||
心肠歹毒 xin chang dai du 170
|
||||
信长的忍者 xin chang de ren zhe 100
|
||||
新长发 xin chang fa 10
|
||||
新长发糖炒栗子 xin chang fa tang chao li zi 40
|
||||
新昌胡同 xin chang hu tong 2
|
||||
@ -428734,10 +428739,12 @@ sort: by_weight
|
||||
新昌县 xin chang xian 5780
|
||||
新昌县司法局 xin chang xian si fa ju 26
|
||||
新昌小区 xin chang xiao qu 2
|
||||
信长协奏曲 xin chang xie zou qu 100
|
||||
信长野望 xin chang ye wang 100
|
||||
心肠硬 xin chang ying 265
|
||||
新长征 xin chang zheng 7080
|
||||
新长征花苑 xin chang zheng hua yuan 3
|
||||
信长之野望 xin chang zhi ye wang 111
|
||||
新潮 xin chao 43280
|
||||
心潮 xin chao 4015
|
||||
心潮翻腾 xin chao fan teng 15
|
||||
@ -460878,6 +460885,7 @@ sort: by_weight
|
||||
已收录 yi shou lu 2005
|
||||
益寿路 yi shou lu 28
|
||||
异兽录 yi shou lu 1
|
||||
异兽魔都 yi shou mo du 100
|
||||
易守难攻 yi shou nan gong 3295
|
||||
益寿宁 yi shou ning 1
|
||||
一受其成形 yi shou qi cheng xing 1
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -23,13 +23,8 @@ func main() {
|
||||
rime.UpdateSogou()
|
||||
fmt.Println("--------------------------------------------------")
|
||||
|
||||
// 更新萌娘百科
|
||||
rime.UpdateMoegirl()
|
||||
fmt.Println("--------------------------------------------------")
|
||||
|
||||
// 为 sogou、ext、tencent 没权重的词条加上权重,有权重的改为下面设置的权重
|
||||
rime.AddWeight(rime.SogouPath, rime.DefaultWeight)
|
||||
rime.AddWeight(rime.MoegirlPath, rime.DefaultWeight)
|
||||
rime.AddWeight(rime.ExtPath, rime.DefaultWeight)
|
||||
rime.AddWeight(rime.TencentPath, rime.DefaultWeight)
|
||||
fmt.Println("--------------------------------------------------")
|
||||
@ -39,7 +34,6 @@ func main() {
|
||||
go rime.Check(rime.HanziPath, 3)
|
||||
go rime.Check(rime.BasePath, 3)
|
||||
go rime.Check(rime.SogouPath, 3)
|
||||
go rime.Check(rime.MoegirlPath, 3)
|
||||
go rime.Check(rime.ExtPath, 4)
|
||||
go rime.Check(rime.TencentPath, 4)
|
||||
|
||||
@ -49,9 +43,8 @@ func main() {
|
||||
rime.Sort(rime.HanziPath, 3)
|
||||
rime.Sort(rime.BasePath, 3)
|
||||
rime.Sort(rime.SogouPath, 3) // 对 base 中已经有的,去重
|
||||
rime.Sort(rime.MoegirlPath, 3) // 对 base、sogou 中已经有的,去重
|
||||
rime.Sort(rime.ExtPath, 4) // 对 base、sogou、moegirl 中已经有的,去重
|
||||
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、moegirl、ext 中已经有的,去重
|
||||
rime.Sort(rime.ExtPath, 4) // 对 base、sogou 中已经有的,去重
|
||||
rime.Sort(rime.TencentPath, 4) // 对 base、sogou、ext 中已经有的,去重
|
||||
// rime.SortEnDict(rime.EnPath)
|
||||
}
|
||||
|
||||
|
@ -1,164 +0,0 @@
|
||||
package rime
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
mapset "github.com/deckarep/golang-set/v2"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode/utf8"
|
||||
)
|
||||
|
||||
func UpdateMoegirl() {
|
||||
// 使用和 UpdateSogou 一样的方法
|
||||
filterList = mapset.NewSet[string]() // 重置过滤列表
|
||||
|
||||
// 控制台输出
|
||||
defer printlnTimeCost("更新萌娘百科", time.Now())
|
||||
|
||||
// 0. 下载新的萌娘词库(暂时手动操作)
|
||||
newMoegirlFile := "/Users/dvel/Downloads/moegirl.dict.yaml"
|
||||
// 1. 准备好过滤列表
|
||||
makeFilterList(MoegirlPath, newMoegirlFile)
|
||||
// 2. 将新的词汇加入到末尾,并打印新词
|
||||
appendNewDict(MoegirlPath, newMoegirlFile)
|
||||
}
|
||||
|
||||
func makeFilterList(dictPath string, newPath string) {
|
||||
// 读取目前词库 +_+ 和 *_* 之间的内容,加入过滤列表
|
||||
file, err := os.Open(dictPath)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
sc := bufio.NewScanner(file)
|
||||
isFilterMark := false
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
if line == mark {
|
||||
break
|
||||
}
|
||||
if !isFilterMark {
|
||||
if strings.Contains(line, fileterMark) {
|
||||
isFilterMark = true
|
||||
}
|
||||
continue
|
||||
}
|
||||
// 过滤列表有两种情况:
|
||||
// 【# 测试一】取【测试一】
|
||||
// 【测试二 ce shi er 100】取【测试二】
|
||||
if strings.HasPrefix(line, "# ") {
|
||||
filterList.Add(strings.TrimLeft(line, "# "))
|
||||
} else {
|
||||
filterList.Add(strings.Split(line, "\t")[0])
|
||||
}
|
||||
}
|
||||
|
||||
// 读取新词库,有问题的直接特么不要了
|
||||
newFile, err := os.Open(newPath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return
|
||||
}
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer newFile.Close()
|
||||
|
||||
sc = bufio.NewScanner(newFile)
|
||||
isMark := false
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
if !isMark {
|
||||
if line == "..." {
|
||||
isMark = true
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
parts := strings.Split(line, "\t")
|
||||
text, code := parts[0], parts[1]
|
||||
|
||||
// 过滤掉有注音问题的:
|
||||
// 把汉字和拼音弄成一一对应关系,「拼音:pin yin」→「拼:pin」「音:yin」
|
||||
pinyins := strings.Split(code, " ")
|
||||
i := 0
|
||||
for _, zi := range text {
|
||||
if !contains(hanPinyinMap[string(zi)], pinyins[i]) {
|
||||
filterList.Add(text)
|
||||
}
|
||||
i++
|
||||
}
|
||||
|
||||
// 过滤掉有异形词问题的
|
||||
for _, wrongWord := range wrongWords.ToSlice() {
|
||||
if strings.Contains(text, wrongWord) {
|
||||
filterList.Add(text)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func appendNewDict(dictPath string, newPath string) {
|
||||
// 逐行读取 newPath,有新词则加入到 dictPath 末尾
|
||||
moegirlFile, err := os.OpenFile(dictPath, os.O_APPEND|os.O_WRONLY, 0644)
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer moegirlFile.Close()
|
||||
|
||||
newFile, err := os.Open(newPath)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
return
|
||||
}
|
||||
log.Fatal(err)
|
||||
}
|
||||
defer newFile.Close()
|
||||
|
||||
// 需要过滤的: base+sogou+moegirl+过滤列表 filterList
|
||||
set := BaseSet.Union(SogouSet).Union(MoegirlSet).Union(filterList)
|
||||
// 新词列表
|
||||
newWords := make([]string, 0)
|
||||
|
||||
sc := bufio.NewScanner(newFile)
|
||||
isMark := false
|
||||
for sc.Scan() {
|
||||
line := sc.Text()
|
||||
// 只读取 ... 这行以下的词汇
|
||||
if !isMark {
|
||||
if line == "..." {
|
||||
isMark = true
|
||||
}
|
||||
continue
|
||||
}
|
||||
text := strings.Split(line, "\t")[0]
|
||||
// 过滤
|
||||
if set.Contains(text) {
|
||||
continue
|
||||
}
|
||||
// 过滤两字词
|
||||
if utf8.RuneCountInString(text) <= 2 {
|
||||
continue
|
||||
}
|
||||
// 写入末尾
|
||||
_, err := moegirlFile.WriteString(line + "\n")
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
newWords = append(newWords, line)
|
||||
}
|
||||
err = moegirlFile.Sync()
|
||||
if err != nil {
|
||||
log.Fatal(err)
|
||||
}
|
||||
|
||||
// 打印新词
|
||||
fmt.Println("新增词汇:")
|
||||
for _, word := range newWords {
|
||||
fmt.Println(word)
|
||||
}
|
||||
fmt.Println("count: ", len(newWords))
|
||||
}
|
@ -27,19 +27,17 @@ const (
|
||||
HanziPath = "/Users/dvel/Library/Rime/cn_dicts/8105.dict.yaml"
|
||||
BasePath = "/Users/dvel/Library/Rime/cn_dicts/base.dict.yaml"
|
||||
SogouPath = "/Users/dvel/Library/Rime/cn_dicts/sogou.dict.yaml"
|
||||
MoegirlPath = "/Users/dvel/Library/Rime/cn_dicts/moegirl.dict.yaml"
|
||||
ExtPath = "/Users/dvel/Library/Rime/cn_dicts/ext.dict.yaml"
|
||||
TencentPath = "/Users/dvel/Library/Rime/cn_dicts/tencent.dict.yaml"
|
||||
EmojiPath = "/Users/dvel/Library/Rime/opencc/emoji-map.txt"
|
||||
EnPath = "/Users/dvel/Library/Rime/en_dicts/en.dict.yaml"
|
||||
|
||||
DefaultWeight = 100 // sogou、moegirl、ext、tencet 词库中默认的权重数值
|
||||
DefaultWeight = 100 // sogou、ext、tencet 词库中默认的权重数值
|
||||
)
|
||||
|
||||
var (
|
||||
BaseSet mapset.Set[string]
|
||||
SogouSet mapset.Set[string]
|
||||
MoegirlSet mapset.Set[string]
|
||||
ExtSet mapset.Set[string]
|
||||
TencentSet mapset.Set[string]
|
||||
)
|
||||
@ -47,7 +45,6 @@ var (
|
||||
func init() {
|
||||
BaseSet = readToSet(BasePath)
|
||||
SogouSet = readToSet(SogouPath)
|
||||
MoegirlSet = readToSet(MoegirlPath)
|
||||
ExtSet = readToSet(ExtPath)
|
||||
TencentSet = readToSet(TencentPath)
|
||||
}
|
||||
|
@ -147,17 +147,15 @@ func Sort(dictPath string, flag int) {
|
||||
}
|
||||
|
||||
// 其他词库需要从一个或多个词库中去重后再写入
|
||||
if contains([]string{SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
|
||||
if contains([]string{SogouPath, ExtPath, TencentPath}, dictPath) {
|
||||
var intersect mapset.Set[string] // 交集,有交集的就是重复的,去掉
|
||||
switch dictPath {
|
||||
case SogouPath: // sogou 不和 base 有重复
|
||||
case SogouPath:
|
||||
intersect = SogouSet.Intersect(BaseSet)
|
||||
case MoegirlPath: // moegirl 不和 base+sogou 有重复
|
||||
intersect = MoegirlSet.Intersect(BaseSet.Union(SogouSet))
|
||||
case ExtPath: // ext 不和 base+sogou+moegirl 有重复
|
||||
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet))
|
||||
case TencentPath: // tencent 不和 base+sogou+moegirl+ext 有重复
|
||||
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(MoegirlSet).Union(ExtSet))
|
||||
case ExtPath:
|
||||
intersect = ExtSet.Intersect(BaseSet.Union(SogouSet))
|
||||
case TencentPath:
|
||||
intersect = TencentSet.Intersect(BaseSet.Union(SogouSet).Union(ExtSet))
|
||||
}
|
||||
|
||||
for _, line := range contents {
|
||||
@ -166,7 +164,7 @@ func Sort(dictPath string, flag int) {
|
||||
continue
|
||||
}
|
||||
str := ""
|
||||
if flag == 3 { // sogou moegirl
|
||||
if flag == 3 { // sogou
|
||||
str = line.text + "\t" + line.code + "\t" + strconv.Itoa(line.weight) + "\n"
|
||||
} else if flag == 4 { // ext tencent
|
||||
str = line.text + "\t" + strconv.Itoa(line.weight) + "\n"
|
||||
@ -179,7 +177,7 @@ func Sort(dictPath string, flag int) {
|
||||
}
|
||||
|
||||
// 外部词库或临时文件,只排序,不去重
|
||||
if !contains([]string{HanziPath, BasePath, SogouPath, MoegirlPath, ExtPath, TencentPath}, dictPath) {
|
||||
if !contains([]string{HanziPath, BasePath, SogouPath, ExtPath, TencentPath}, dictPath) {
|
||||
switch flag {
|
||||
case 1:
|
||||
for _, line := range contents {
|
||||
|
@ -1,5 +1,6 @@
|
||||
截至到
|
||||
截至日期
|
||||
截至时间
|
||||
合壁
|
||||
飞行旗
|
||||
做月子
|
||||
|
@ -3,13 +3,15 @@
|
||||
|
||||
---
|
||||
name: rime_ice
|
||||
version: "2023-02-26"
|
||||
version: "2023-03-04"
|
||||
import_tables:
|
||||
- cn_dicts/8105 # 字表
|
||||
- cn_dicts/base # 基础词库
|
||||
- cn_dicts/sogou # 搜狗流行词
|
||||
- cn_dicts/moegirl # 萌娘百科
|
||||
- cn_dicts/ext # 扩展词库
|
||||
- cn_dicts/tencent # 腾讯词向量(大词库,部署时间较长)
|
||||
- cn_dicts/others # 一些杂项
|
||||
|
||||
# 建议把扩展词库放到下面,有重复词条时,最上面的权重生效
|
||||
# - cn_dicts/my_dict
|
||||
...
|
||||
|
Loading…
x
Reference in New Issue
Block a user