2022-10-30 16:47:40 +01:00
|
|
|
|
package rime
|
|
|
|
|
|
|
|
|
|
import (
|
|
|
|
|
"bufio"
|
2023-02-06 20:32:24 +01:00
|
|
|
|
"fmt"
|
2023-04-13 11:04:44 +02:00
|
|
|
|
mapset "github.com/deckarep/golang-set/v2"
|
2022-10-30 16:47:40 +01:00
|
|
|
|
"log"
|
|
|
|
|
"os"
|
2023-05-03 18:21:34 +02:00
|
|
|
|
"path/filepath"
|
2022-10-30 16:47:40 +01:00
|
|
|
|
"regexp"
|
|
|
|
|
"strings"
|
|
|
|
|
"time"
|
|
|
|
|
"unicode/utf8"
|
|
|
|
|
)
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// CheckAndGenerateEmoji
|
|
|
|
|
// 检查 emoji-map.txt 是否合法,检查中文映射是否存在于 base 词库中
|
|
|
|
|
// 生成 Rime 格式的 emoji.txt
|
2023-05-03 18:21:34 +02:00
|
|
|
|
// 检查 other.txt 中文映射是否存在于 base 词库中
|
2023-04-13 11:04:44 +02:00
|
|
|
|
func CheckAndGenerateEmoji() {
|
|
|
|
|
// 控制台输出
|
|
|
|
|
defer printlnTimeCost("检查、更新 Emoji", time.Now())
|
2022-10-30 16:47:40 +01:00
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
checkEmoji()
|
|
|
|
|
generateEmoji()
|
2023-05-03 18:21:34 +02:00
|
|
|
|
checkOthersTXT()
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 检查 emoji-map.txt 是否合法,检查中文映射是否存在于 base 词库中
|
|
|
|
|
func checkEmoji() {
|
2022-10-30 16:47:40 +01:00
|
|
|
|
// 打开文件
|
2023-04-13 11:04:44 +02:00
|
|
|
|
file, err := os.Open(EmojiMapPath)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if err != nil {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
log.Fatalln(err)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 将 Emoji 加入一个 set,为检测差集做准备
|
2022-10-30 16:47:40 +01:00
|
|
|
|
emojiSet := mapset.NewSet[string]()
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
2023-02-06 20:32:24 +01:00
|
|
|
|
// 过滤空行
|
|
|
|
|
if line == "" {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2022-10-30 16:47:40 +01:00
|
|
|
|
// 过滤注释
|
|
|
|
|
if strings.Contains(line, "#") {
|
|
|
|
|
continue
|
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 检查:是否包含 Tab
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if strings.Contains(line, "\t") {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
fmt.Println("❌ contains Tab:", line)
|
|
|
|
|
continue
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 检查:开头结尾无效的空格
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if strings.HasPrefix(line, " ") || strings.HasSuffix(line, " ") {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
fmt.Println("❌ unexpected space:", line)
|
|
|
|
|
continue
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 开始分割
|
2022-10-30 16:47:40 +01:00
|
|
|
|
parts := strings.Split(line, " ")
|
|
|
|
|
if len(parts) < 2 {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
fmt.Println("❌ invalid line:", line)
|
|
|
|
|
continue
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
// 加入 emojiSet,顺便用一个 tempSet 查重
|
|
|
|
|
tempSet := mapset.NewSet[string]()
|
2023-04-13 11:04:44 +02:00
|
|
|
|
for _, text := range parts[1:] {
|
|
|
|
|
emojiSet.Add(text)
|
|
|
|
|
if tempSet.Contains(text) {
|
|
|
|
|
fmt.Println("❌ duplicate mapping:", text)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
} else {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
tempSet.Add(text)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
if err := sc.Err(); err != nil {
|
|
|
|
|
log.Fatalln(err)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 检查: emoji-map.txt 中的映射是否存在于 base 词库中,有差集即不存在
|
|
|
|
|
for _, text := range emojiSet.Difference(BaseSet).ToSlice() {
|
|
|
|
|
// 不检查英文
|
|
|
|
|
if match, _ := regexp.MatchString(`[a-zA-Z]`, text); match {
|
2022-10-30 16:47:40 +01:00
|
|
|
|
continue
|
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 不检查 1 个字的
|
|
|
|
|
if utf8.RuneCountInString(text) == 1 {
|
2022-10-30 16:47:40 +01:00
|
|
|
|
continue
|
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
fmt.Println("❌ Emoji 与 base 的差集:", text)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 从 emoji-map.txt 生成或更新 emoji.txt
|
|
|
|
|
func generateEmoji() {
|
|
|
|
|
// 打开文件
|
|
|
|
|
file, err := os.Open(EmojiMapPath)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if err != nil {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
log.Fatalln(err)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
defer file.Close()
|
2022-10-30 16:47:40 +01:00
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 模拟有序字典
|
|
|
|
|
OmKeys := make([]string, 0)
|
|
|
|
|
OmMap := make(map[string][]string)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
// 将映射读取到字典里
|
|
|
|
|
sc := bufio.NewScanner(file)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
2023-04-13 11:04:44 +02:00
|
|
|
|
if strings.HasPrefix(line, "#") && !strings.Contains(line, "井号") { // 井号的 Emoji 被判断为以 # 开头了。。。
|
2022-10-30 16:47:40 +01:00
|
|
|
|
continue
|
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
parts := strings.Split(line, " ")
|
|
|
|
|
for _, text := range parts[1:] {
|
|
|
|
|
if !contains(OmKeys, text) {
|
|
|
|
|
OmKeys = append(OmKeys, text)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
OmMap[text] = append(OmMap[text], parts[0])
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
if err := sc.Err(); err != nil {
|
|
|
|
|
log.Fatalln(err)
|
|
|
|
|
}
|
|
|
|
|
|
2022-10-30 16:47:40 +01:00
|
|
|
|
// 写入 emoji.txt
|
2023-04-13 11:04:44 +02:00
|
|
|
|
file, err = os.OpenFile(EmojiPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatalln(err)
|
|
|
|
|
}
|
2023-04-13 11:04:44 +02:00
|
|
|
|
defer file.Close()
|
2022-10-30 16:47:40 +01:00
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
for _, key := range OmKeys {
|
|
|
|
|
line := key + "\t" + key + " " + strings.Join(OmMap[key], " ") + "\n"
|
|
|
|
|
_, err := file.WriteString(line)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
if err != nil {
|
2023-04-13 11:04:44 +02:00
|
|
|
|
log.Fatalln(err)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-04-13 11:04:44 +02:00
|
|
|
|
if err := file.Sync(); err != nil {
|
|
|
|
|
log.Fatalln(err)
|
2022-10-30 16:47:40 +01:00
|
|
|
|
}
|
|
|
|
|
}
|
2023-05-03 18:21:34 +02:00
|
|
|
|
|
|
|
|
|
// 检查 others.txt 里的中文映射是否存在于 base 词库中
|
|
|
|
|
func checkOthersTXT() {
|
|
|
|
|
// 打开文件
|
|
|
|
|
file, err := os.Open(filepath.Join(RimeDir, "opencc/others.txt"))
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatalln(err)
|
|
|
|
|
}
|
|
|
|
|
defer file.Close()
|
|
|
|
|
|
|
|
|
|
// 将 Emoji 加入一个 set,为检测差集做准备
|
|
|
|
|
set := mapset.NewSet[string]()
|
|
|
|
|
sc := bufio.NewScanner(file)
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
// 不能有空行
|
|
|
|
|
if line == "" {
|
|
|
|
|
fmt.Println("❌ empty line")
|
|
|
|
|
}
|
|
|
|
|
// 过滤注释
|
|
|
|
|
if strings.HasPrefix(line, "----------") {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
text := strings.Split(line, "\t")[0]
|
2023-12-08 16:26:33 +01:00
|
|
|
|
if set.Contains(text) {
|
|
|
|
|
fmt.Println("❌ duplicate key", text)
|
|
|
|
|
} else {
|
|
|
|
|
set.Add(text)
|
|
|
|
|
}
|
2023-05-03 18:21:34 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// base+cn_dicts/others.dict.yaml
|
|
|
|
|
othersDictYamlSet := mapset.NewSet[string]()
|
|
|
|
|
othersDictYaml, err := os.Open(filepath.Join(RimeDir, "cn_dicts/others.dict.yaml"))
|
|
|
|
|
if err != nil {
|
|
|
|
|
log.Fatalln(err)
|
|
|
|
|
}
|
|
|
|
|
defer othersDictYaml.Close()
|
|
|
|
|
sc = bufio.NewScanner(othersDictYaml)
|
|
|
|
|
isMark := false
|
|
|
|
|
for sc.Scan() {
|
|
|
|
|
line := sc.Text()
|
|
|
|
|
if !isMark {
|
|
|
|
|
if strings.HasPrefix(line, "##### 叠字") {
|
|
|
|
|
isMark = true
|
|
|
|
|
}
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
if strings.HasPrefix(line, "#") {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
text := strings.Split(line, "\t")[0]
|
|
|
|
|
othersDictYamlSet.Add(text)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// 检查: emoji-map.txt 中的映射是否存在于 base+cn_dicts/others.dict.yaml 词库中,有差集即不存在
|
|
|
|
|
dictSet := BaseSet.Union(othersDictYamlSet)
|
|
|
|
|
for _, text := range set.Difference(dictSet).ToSlice() {
|
|
|
|
|
// 不检查英文
|
|
|
|
|
if match, _ := regexp.MatchString(`[a-zA-Z]`, text); match {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
// 不检查 1 个字的
|
|
|
|
|
if utf8.RuneCountInString(text) == 1 {
|
|
|
|
|
continue
|
|
|
|
|
}
|
|
|
|
|
fmt.Println("❌ others.txt 与 base 的差集:", text)
|
|
|
|
|
}
|
|
|
|
|
}
|