2022-10-30 16:47:40 +01:00
|
|
|
|
package rime
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bufio"
|
|
|
|
|
"fmt"
|
|
|
|
|
"github.com/commander-cli/cmd"
|
|
|
|
|
mapset "github.com/deckarep/golang-set/v2"
|
|
|
|
|
"io"
|
|
|
|
|
"log"
|
|
|
|
|
"net/http"
|
|
|
|
|
"os"
|
2023-03-28 22:33:01 +02:00
|
|
|
|
"path/filepath"
|
2022-10-30 16:47:40 +01:00
|
|
|
|
"sort"
|
|
|
|
|
"strconv"
|
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
"unicode/utf8"
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
var fileterMark = "# *_*" // "# *_*" 和 mark 之间是过滤词列表
|
|
|
|
|
var filterList = mapset.NewSet[string]() // 过滤词列表,在这个列表里的词汇,不再写入
|
|
|
|
|
|
|
|
|
|
// UpdateSogou 更新搜狗流行词
|
|
|
|
|
func UpdateSogou() {
|
|
|
|
|
// 控制台输出
|
|
|
|
|
defer updateVersion(SogouPath, getSha1(SogouPath))
|
|
|
|
|
defer printfTimeCost("更新搜狗流行词", time.Now())
|
|
|
|
|
|
2023-02-23 19:09:29 +01:00
|
|
|
|
makeSogouFilterList() // 0. 准备好过滤词列表
|
|
|
|
|
downloadSogou() // 1. 下载搜狗流行词加入到文件末尾
|
|
|
|
|
checkAndWrite() // 2. 过滤、去重、排序
|
|
|
|
|
PrintNewWords() // 3. 打印新增词汇
|
2022-10-30 16:47:40 +01:00
|
|
|
|
|
|
|
|
|
// 弄完了删除临时用的文件,否则 VSCode 全局搜索词汇时会搜索到,影响体验
|
|
|
|
|
err := os.Remove("./scel2txt/scel/sogou.scel")
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
err = os.Remove("./scel2txt/out/luna_pinyin.sogou.dict.yaml")
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
err = os.Remove("./scel2txt/out/sogou.txt")
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-02-23 19:09:29 +01:00
|
|
|
|
// 准备好过滤词列表 filterList
|
|
|
|
|
func makeSogouFilterList() {
|
2022-10-30 16:47:40 +01:00
|
|
|
|
file, err := os.Open(SogouPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
isFilterMark := false
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if line == mark {
|
|
|
|
|
break
|
|
|
|
|
}
|
|
|
|
|
if !isFilterMark {
|
|
|
|
|
if strings.Contains(line, fileterMark) {
|
|
|
|
|
isFilterMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 判断一些可能出错的情况
|
|
|
|
|
if !strings.HasPrefix(line, "# ") {
|
|
|
|
|
log.Fatal("sogou 过滤列表 无效行:", line)
|
|
|
|
|
}
|
|
|
|
|
text := strings.TrimPrefix(line, "# ")
|
|
|
|
|
if strings.ContainsAny(text, " \t") {
|
|
|
|
|
log.Fatal("sogou 过滤列表 包含空字符:", line)
|
|
|
|
|
}
|
|
|
|
|
// 加入过滤词列表
|
|
|
|
|
filterList.Add(text)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// downloadSogou 下载搜狗流行词加入到文件末尾,如果是新词且不在过滤列表,打印出来
|
|
|
|
|
func downloadSogou() {
|
|
|
|
|
// 下载
|
|
|
|
|
url := "https://pinyin.sogou.com/d/dict/download_cell.php?id=4&name=%E7%BD%91%E7%BB%9C%E6%B5%81%E8%A1%8C%E6%96%B0%E8%AF%8D%E3%80%90%E5%AE%98%E6%96%B9%E6%8E%A8%E8%8D%90%E3%80%91&f=detail"
|
|
|
|
|
|
2023-03-28 22:33:01 +02:00
|
|
|
|
// 创建 scel/ 和 out/ 文件夹
|
|
|
|
|
scelDir := "./scel2txt/scel/"
|
|
|
|
|
if _, err := os.Stat(scelDir); os.IsNotExist(err) {
|
|
|
|
|
err := os.MkdirAll(scelDir, 0755)
|
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
outDir := "./scel2txt/out/"
|
|
|
|
|
if _, err := os.Stat(outDir); os.IsNotExist(err) {
|
|
|
|
|
err := os.MkdirAll(outDir, 0755)
|
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-10-30 16:47:40 +01:00
|
|
|
|
// Get the data
|
|
|
|
|
resp, err := http.Get(url)
|
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
defer resp.Body.Close()
|
|
|
|
|
|
|
|
|
|
// 创建一个文件用于保存
|
2023-03-28 22:33:01 +02:00
|
|
|
|
out, err := os.Create(filepath.Join(scelDir, "sogou.scel"))
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
defer out.Close()
|
|
|
|
|
|
|
|
|
|
// 然后将响应流和文件流对接起来
|
|
|
|
|
_, err = io.Copy(out, resp.Body)
|
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 用 Python 进行转换
|
|
|
|
|
c := cmd.NewCommand("python3 scel2txt.py", cmd.WithWorkingDir("./scel2txt"))
|
|
|
|
|
err = c.Execute()
|
|
|
|
|
if err != nil {
|
|
|
|
|
fmt.Println(c.Stderr())
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
fmt.Printf(c.Stdout())
|
|
|
|
|
|
|
|
|
|
// 加入到现有词库的末尾
|
|
|
|
|
sogouFile, err := os.OpenFile(SogouPath, os.O_WRONLY|os.O_APPEND, 0644)
|
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
defer sogouFile.Close()
|
2023-03-28 22:33:01 +02:00
|
|
|
|
download, err := os.ReadFile(filepath.Join(outDir, "sogou.txt"))
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
_, err = sogouFile.Write(download)
|
|
|
|
|
if err != nil {
|
|
|
|
|
panic(err)
|
|
|
|
|
}
|
|
|
|
|
err = sogouFile.Sync()
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// checkAndWrite 过滤、去重、排序
|
|
|
|
|
func checkAndWrite() {
|
|
|
|
|
// 打开文件
|
|
|
|
|
file, err := os.OpenFile(SogouPath, os.O_RDWR, 0644)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
// 前缀内容和词库切片,以 mark 隔开
|
|
|
|
|
prefixContents := make([]string, 0) // 前置内容切片
|
|
|
|
|
contents := make([]lemma, 0) // 词库切片
|
|
|
|
|
|
|
|
|
|
isMark := false
|
|
|
|
|
set := mapset.NewSet[string]() // 去重用的 set
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if !isMark {
|
|
|
|
|
prefixContents = append(prefixContents, line)
|
|
|
|
|
if strings.Contains(line, mark) {
|
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 分割
|
|
|
|
|
parts := strings.Split(line, "\t")
|
|
|
|
|
var text, code, weight string
|
|
|
|
|
switch len(parts) {
|
|
|
|
|
case 2:
|
|
|
|
|
text, code = parts[0], parts[1]
|
|
|
|
|
case 3:
|
|
|
|
|
text, code, weight = parts[0], parts[1], parts[2]
|
|
|
|
|
default:
|
|
|
|
|
log.Fatal("分割错误:", line)
|
|
|
|
|
}
|
|
|
|
|
// 过滤:两个字及以下的
|
|
|
|
|
if utf8.RuneCountInString(text) <= 2 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 过滤:从过滤列表过滤掉
|
|
|
|
|
if filterList.Contains(text) {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 过滤:去重
|
|
|
|
|
if set.Contains(text) {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
set.Add(text)
|
2022-11-10 12:30:00 +01:00
|
|
|
|
// 过滤:base 中已经有的就不要了
|
|
|
|
|
if BaseSet.Contains(text) {
|
2022-10-30 16:47:40 +01:00
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// nue → nve,lue → lve
|
|
|
|
|
if strings.Contains(code, "nue") {
|
|
|
|
|
code = strings.ReplaceAll(code, "nue", "nve")
|
|
|
|
|
}
|
|
|
|
|
if strings.Contains(code, "lue") {
|
|
|
|
|
code = strings.ReplaceAll(code, "lue", "lve")
|
|
|
|
|
}
|
|
|
|
|
// 加入数组,没权重的默认给 DefaultWeight
|
|
|
|
|
if weight == "" {
|
|
|
|
|
contents = append(contents, lemma{text, code, DefaultWeight})
|
|
|
|
|
} else {
|
|
|
|
|
weightInt, err := strconv.Atoi(weight)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err, line)
|
|
|
|
|
}
|
|
|
|
|
contents = append(contents, lemma{text, code, weightInt})
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 排序
|
|
|
|
|
sort.Slice(contents, func(i, j int) bool {
|
|
|
|
|
if contents[i].code != contents[j].code {
|
|
|
|
|
return contents[i].code < contents[j].code
|
|
|
|
|
}
|
|
|
|
|
if contents[i].text != contents[j].text {
|
|
|
|
|
return contents[i].text < contents[j].text
|
|
|
|
|
}
|
|
|
|
|
return false
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
// 准备写入
|
|
|
|
|
err = file.Truncate(0)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
_, err = file.Seek(0, 0)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 写入前缀
|
|
|
|
|
for _, content := range prefixContents {
|
|
|
|
|
_, err := file.WriteString(content + "\n")
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 写入词库
|
|
|
|
|
for _, content := range contents {
|
|
|
|
|
_, err := file.WriteString(content.text + "\t" + content.code + "\t" + strconv.Itoa(content.weight) + "\n")
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
err = file.Sync()
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// PrintNewWords 打印新增词汇
|
|
|
|
|
func PrintNewWords() {
|
|
|
|
|
// 对比 sogou 的新旧 set,找出新词汇
|
2023-01-17 17:47:12 +01:00
|
|
|
|
newSet := readToSet(SogouPath)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
newWords := newSet.Difference(SogouSet)
|
|
|
|
|
if newWords.Cardinality() == 0 {
|
|
|
|
|
return
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fmt.Println("新增词汇:")
|
|
|
|
|
|
|
|
|
|
// 打印无注音的
|
|
|
|
|
// for _, word := range newWords.ToSlice() {
|
|
|
|
|
// fmt.Println(word)
|
|
|
|
|
// }
|
|
|
|
|
|
|
|
|
|
// 把注音也打出来,方便直接校对
|
|
|
|
|
file, err := os.Open(SogouPath)
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatal(err)
|
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
isMark := false
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if !isMark {
|
|
|
|
|
if strings.Contains(line, mark) {
|
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
text := strings.Split(line, "\t")[0]
|
|
|
|
|
if newWords.Contains(text) {
|
|
|
|
|
fmt.Println(line)
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fmt.Println("count: ", newWords.Cardinality())
|
|
|
|
|
|
|
|
|
|
// 更新全局的 set,方便后续的检查
|
|
|
|
|
SogouSet = newSet
|
|
|
|
|
}
|