rime-ice/others/script/rime/cn_en.go

576 lines
13 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package rime
import (
"bufio"
"fmt"
"log"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"unicode"
)
// polyphones holds the hand-picked reading for polyphonic characters.
//
// Each key has the form "<entry> > <character>", where <entry> is the complete
// dictionary entry and <character> is the polyphonic character inside it. The
// value is the toneless pinyin to use for that character in that entry.
var polyphones = map[string]string{
	"Eul的神圣法杖 > 的": "de",
	"艾AA > 艾": "ai",
	"大V > 大": "da",
	"QQ音乐 > 乐": "yue",
	"QQ会员 > 会": "hui",
	"QQ会员 > 员": "yuan",
	"阿Q精神 > 阿": "a",
	"G胖 > 胖": "pang",
	"阿Q > 阿": "a",
	"阿Q正传 > 阿": "a",
	"阿Q正传 > 传": "zhuan",
	"单边z变换 > 单": "dan",
	"卡拉OK > 卡": "ka",
	"IP地址 > 地": "di",
	"IP卡 > 卡": "ka",
	"SIM卡 > 卡": "ka",
	"UIM卡 > 卡": "ka",
	"USIM卡 > 卡": "ka",
	"X染色体 > 色": "se",
	"Y染色体 > 色": "se",
	"蒙奇·D·路飞 > 奇": "qi",
	"蒙奇·D·龙 > 奇": "qi",
	"马歇尔·D·蒂奇 > 奇": "qi",
	"蒙奇·D·卡普 > 奇": "qi",
	"蒙奇·D·卡普 > 卡": "ka",
	"波特卡斯·D·艾斯 > 卡": "ka",
	"波特卡斯·D·艾斯 > 艾": "ai",
	"A壳 > 壳": "ke",
	"B壳 > 壳": "ke",
	"C壳 > 壳": "ke",
	"D壳 > 壳": "ke",
	"芭比Q了 > 了": "le",
	"江南Style > 南": "nan",
	"三无Marblue > 无": "wu",
	"V字仇杀队 > 仇": "chou",
	"Q弹 > 弹": "tan",
	"M系列 > 系": "xi",
	"阿Sir > 阿": "a",
	"MAC地址 > 地": "di",
	"OK了 > 了": "le",
	"OK了吗 > 了": "le",
	"圈X > 圈": "quan",
	"A型血 > 血": "xue",
	"A血型 > 血": "xue",
	"B型血 > 血": "xue",
	"B血型 > 血": "xue",
	"AB型血 > 血": "xue",
	"AB血型 > 血": "xue",
	"O型血 > 血": "xue",
	"O血型 > 血": "xue",
	"没Bug > 没": "mei",
	"没有Bug > 没": "mei",
	"卡Bug > 卡": "ka",
	"提Bug > 提": "ti",
	"CT检查 > 查": "cha",
	"N卡 > 卡": "ka",
	"A卡 > 卡": "ka",
	"A区 > 区": "qu",
	"B区 > 区": "qu",
	"C区 > 区": "qu",
	"D区 > 区": "qu",
	"E区 > 区": "qu",
	"F区 > 区": "qu",
	"IT行业 > 行": "hang",
	"TF卡 > 卡": "ka",
	"A屏 > 屏": "ping",
	"A和B > 和": "he",
	"X和Y > 和": "he",
	"查IP > 查": "cha",
	"VIP卡 > 卡": "ka",
	"Chromium系 > 系": "xi",
	"Chrome系 > 系": "xi",
}
// digitMap gives the toneless pinyin reading of each ASCII decimal digit.
var digitMap = map[string]string{
	"0": "ling",
	"1": "yi",
	"2": "er",
	"3": "san",
	"4": "si",
	"5": "wu",
	"6": "liu",
	"7": "qi",
	"8": "ba",
	"9": "jiu",
}
// doublePinyinMap is the conversion table for the Ziranma (自然码)
// double-pinyin scheme.
//
// Keys wrapped in dashes ("-a-") are whole zero-initial syllables; the other
// keys are the two-letter initials zh/ch/sh and multi-letter finals. Values
// are the codes that replace them.
var doublePinyinMap = map[string]string{
	// Zero-initial syllables.
	"-a-":   "aa",
	"-e-":   "ee",
	"-o-":   "oo",
	"-ai-":  "ai",
	"-ei-":  "ei",
	"-ou-":  "ou",
	"-an-":  "an",
	"-en-":  "en",
	"-ang-": "ah",
	"-eng-": "eg",
	"-ao-":  "ao",
	"-er-":  "er",
	// Initials zh, ch, sh.
	"zh": "v",
	"ch": "i",
	"sh": "u",
	// Finals.
	"iu":   "q",
	"ia":   "w",
	"ua":   "w",
	"uan":  "r",
	"ue":   "t",
	"ve":   "t",
	"ing":  "y",
	"uai":  "y",
	"uo":   "o",
	"un":   "p",
	"iong": "s",
	"ong":  "s",
	"iang": "d",
	"uang": "d",
	"en":   "f",
	"eng":  "g",
	"ang":  "h",
	"an":   "j",
	"ao":   "k",
	"ai":   "l",
	"ei":   "z",
	"ie":   "x",
	"iao":  "c",
	"ui":   "v",
	"ou":   "b",
	"in":   "n",
	"ian":  "m",
}
// doublePinyinFlypyMap is the conversion table for the Flypy (小鹤)
// double-pinyin scheme.
//
// Keys wrapped in dashes ("-a-") are whole zero-initial syllables; the other
// keys are the two-letter initials zh/ch/sh and multi-letter finals. Values
// are the codes that replace them.
var doublePinyinFlypyMap = map[string]string{
	// Zero-initial syllables.
	"-a-":   "aa",
	"-e-":   "ee",
	"-o-":   "oo",
	"-ai-":  "ai",
	"-ei-":  "ei",
	"-ou-":  "ou",
	"-an-":  "an",
	"-en-":  "en",
	"-ang-": "ah",
	"-eng-": "eg",
	"-ao-":  "ao",
	"-er-":  "er",
	// Initials zh, ch, sh.
	"zh": "v",
	"ch": "i",
	"sh": "u",
	// Finals.
	"iu":   "q",
	"ei":   "w",
	"uan":  "r",
	"ue":   "t",
	"ve":   "t",
	"un":   "y",
	"uo":   "o",
	"ie":   "p",
	"iong": "s",
	"ong":  "s",
	"ai":   "d",
	"en":   "f",
	"eng":  "g",
	"ang":  "h",
	"an":   "j",
	"ing":  "k",
	"uai":  "k",
	"iang": "l",
	"uang": "l",
	"ou":   "z",
	"ia":   "x",
	"ua":   "x",
	"ao":   "c",
	"ui":   "v",
	"in":   "b",
	"iao":  "n",
	"ian":  "m",
}
// doublePinyinMSPYMap is the conversion table for the Microsoft (微软)
// double-pinyin scheme.
//
// Keys wrapped in dashes ("-a-") are whole zero-initial syllables; the other
// keys are the two-letter initials zh/ch/sh and multi-letter finals. Values
// are the codes that replace them.
var doublePinyinMSPYMap = map[string]string{
	// Zero-initial syllables.
	"-a-":   "oa",
	"-e-":   "oe",
	"-o-":   "oo",
	"-ai-":  "ol",
	"-ei-":  "oz",
	"-ou-":  "ob",
	"-an-":  "oj",
	"-en-":  "of",
	"-ang-": "oh",
	"-eng-": "og",
	"-ao-":  "ok",
	"-er-":  "or",
	// Initials zh, ch, sh.
	"zh": "v",
	"ch": "i",
	"sh": "u",
	// Finals.
	"iu":   "q",
	"ia":   "w",
	"ua":   "w",
	"er":   "r",
	"uan":  "r",
	"ue":   "t",
	"uai":  "y",
	"uo":   "o",
	"un":   "p",
	"iong": "s",
	"ong":  "s",
	"iang": "d",
	"uang": "d",
	"en":   "f",
	"eng":  "g",
	"ang":  "h",
	"an":   "j",
	"ao":   "k",
	"ai":   "l",
	"ing":  ";",
	"ei":   "z",
	"ie":   "x",
	"iao":  "c",
	"ui":   "v",
	"ve":   "v",
	"ou":   "b",
	"in":   "n",
	"ian":  "m",
}
// doublePinyinZiGuangMap is the conversion table for the ZiGuang (紫光)
// double-pinyin scheme.
//
// Keys wrapped in dashes ("-a-") are whole zero-initial syllables; the other
// keys are the two-letter initials zh/ch/sh and multi-letter finals. Values
// are the codes that replace them.
var doublePinyinZiGuangMap = map[string]string{
	// Zero-initial syllables.
	"-a-":   "oa",
	"-e-":   "oe",
	"-o-":   "oo",
	"-ai-":  "op",
	"-ei-":  "ok",
	"-ou-":  "oz",
	"-an-":  "or",
	"-en-":  "ow",
	"-ang-": "os",
	"-eng-": "ot",
	"-ao-":  "oq",
	"-er-":  "oj",
	// Initials zh, ch, sh.
	"zh": "u",
	"ch": "a",
	"sh": "i",
	// Finals.
	"ao":   "q",
	"en":   "w",
	"an":   "r",
	"eng":  "t",
	"in":   "y",
	"uai":  "y",
	"uo":   "o",
	"ai":   "p",
	"ang":  "s",
	"ie":   "d",
	"ian":  "f",
	"iang": "g",
	"uang": "g",
	"iong": "h",
	"ong":  "h",
	"er":   "j",
	"iu":   "j",
	"ei":   "k",
	"uan":  "l",
	"ing":  ";",
	"ou":   "z",
	"ia":   "x",
	"ua":   "x",
	"iao":  "b",
	"ue":   "n",
	// "ve" (üe as in lve/nve) shares the key of "ue". Without this entry the
	// final is looked up as a missing key and silently dropped from the code.
	"ve": "n",
	"ui": "n",
	"un": "m",
}
// doublePinyinABCMap is the conversion table for the Intelligent ABC
// (智能ABC) double-pinyin scheme.
//
// Keys wrapped in dashes ("-a-") are whole zero-initial syllables; the other
// keys are the two-letter initials zh/ch/sh and multi-letter finals. Values
// are the codes that replace them.
var doublePinyinABCMap = map[string]string{
	// Zero-initial syllables.
	"-a-":   "oa",
	"-e-":   "oe",
	"-o-":   "oo",
	"-ai-":  "ol",
	"-ei-":  "oq",
	"-ou-":  "ob",
	"-an-":  "oj",
	"-en-":  "of",
	"-ang-": "oh",
	"-eng-": "og",
	"-ao-":  "ok",
	"-er-":  "or",
	// Initials zh, ch, sh.
	"zh": "a",
	"ch": "e",
	"sh": "v",
	// Finals.
	"ao":   "k",
	"en":   "f",
	"an":   "j",
	"eng":  "g",
	"in":   "c",
	"uai":  "c",
	"uo":   "o",
	"ai":   "l",
	"ang":  "h",
	"ie":   "x",
	"ian":  "w",
	"iang": "t",
	"uang": "t",
	"iong": "s",
	"ong":  "s",
	"er":   "r",
	"iu":   "r",
	"ei":   "q",
	"uan":  "p",
	"ing":  "y",
	"ou":   "b",
	"ia":   "d",
	"ua":   "d",
	"iao":  "z",
	"ue":   "m",
	// "ve" (üe as in lve/nve) shares the key of "ue". Without this entry the
	// final is looked up as a missing key and silently dropped from the code.
	"ve": "m",
	"ui": "m",
	"un": "n",
}
// CnEn generates the mixed Chinese/English dictionaries for full pinyin and
// every double-pinyin scheme from others/cn_en.txt.
//
// Each non-empty, non-comment line of the source file is encoded with
// textToPinyin and written as "<entry>\t<code>" to one output file per
// scheme under en_dicts/. Lines with leading or trailing whitespace are
// reported on stdout but still processed. Any I/O error aborts the program
// through log.Fatalln.
func CnEn() {
	// Open the source word list.
	file, err := os.Open(filepath.Join(RimeDir, "others/cn_en.txt"))
	if err != nil {
		log.Fatalln(err)
	}
	defer file.Close()

	// Output dictionaries, listed in the same order as the codes returned by
	// textToPinyin: full pinyin, Ziranma, Flypy, MSPY, ZiGuang, ABC.
	targets := []string{
		"en_dicts/cn_en.dict.yaml",
		"en_dicts/cn_en_double_pinyin.dict.yaml",
		"en_dicts/cn_en_double_pinyin_flypy.dict.yaml",
		"en_dicts/cn_en_double_pinyin_mspy.dict.yaml",
		"en_dicts/cn_en_double_pinyin_ziguang.dict.yaml",
		"en_dicts/cn_en_double_pinyin_abc.dict.yaml",
	}
	outputs := make([]*os.File, 0, len(targets))
	writers := make([]*bufio.Writer, 0, len(targets))
	for _, target := range targets {
		out, err := os.OpenFile(filepath.Join(RimeDir, target), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
		if err != nil {
			log.Fatalln(err)
		}
		// Deferring inside the loop is intentional: every output must stay
		// open until the function returns.
		defer out.Close()
		// The header goes straight to the file before any buffered data.
		writePrefix(out)
		outputs = append(outputs, out)
		writers = append(writers, bufio.NewWriter(out))
	}

	// Scan the entries, encode each one and append it to every dictionary.
	sc := bufio.NewScanner(file)
	for sc.Scan() {
		line := sc.Text()
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		if strings.TrimSpace(line) != line {
			fmt.Println("❌ 前后有空格", line)
		}
		codes := textToPinyin(line)
		for i, w := range writers {
			if _, err := w.WriteString(line + "\t" + codes[i] + "\n"); err != nil {
				log.Fatalln(err)
			}
		}
	}
	if err := sc.Err(); err != nil {
		log.Fatalln(err)
	}

	// Push buffered data to the files, then force it to stable storage.
	for i, w := range writers {
		if err := w.Flush(); err != nil {
			log.Fatalln(err)
		}
		if err := outputs[i].Sync(); err != nil {
			log.Fatalln(err)
		}
	}
}
// writePrefix writes the Rime dictionary header to file.
//
// The dictionary name is the base name of file with the ".dict.yaml" suffix
// removed; it is printed to stdout and placed in the header's "name:" field.
// The header comment also carries a human-readable scheme label, which is
// empty for names that are not recognised. A write error aborts the program
// through log.Fatalln.
func writePrefix(file *os.File) {
	const header = `# Rime dictionary
# encoding: utf-8
#
#
# https://github.com/iDvel/rime-ice
# ------- 中英混输词库 for %s -------
# 由 others/cn_en.txt 生成
#
---
name: %s
version: "1"
sort: by_weight
...
`
	dictName := strings.TrimSuffix(filepath.Base(file.Name()), ".dict.yaml")
	fmt.Println(dictName)

	// Scheme label shown in the header comment.
	var label string
	switch dictName {
	case "cn_en":
		label = "全拼"
	case "cn_en_double_pinyin":
		label = "自然码双拼"
	case "cn_en_double_pinyin_flypy":
		label = "小鹤双拼"
	case "cn_en_double_pinyin_mspy":
		label = "微软双拼"
	case "cn_en_double_pinyin_ziguang":
		label = "紫光双拼"
	case "cn_en_double_pinyin_abc":
		label = "智能ABC双拼"
	}

	if _, err := file.WriteString(fmt.Sprintf(header, label, dictName)); err != nil {
		log.Fatalln(err)
	}
}
// 转换编码汉字转为拼音英文不变。拼音分别转为全拼、自然码、小鹤、微软、紫光、智能ABC
func textToPinyin(text string) []string {
pinyin := ""
doublePinyin := ""
doublePinyinFlypy := ""
doublePinyinMSPY := ""
doublePinyinZiGuang := ""
doublePinyinABC := ""
parts := splitMixedWords(text)
for _, part := range parts {
// 特殊情况,数字转为拼音
if _, err := strconv.Atoi(part); err == nil {
part = digitMap[part]
}
if len(hanPinyin[part]) == 0 { // 英文数字,不做转换
pinyin += part
doublePinyin += part
doublePinyinFlypy += part
doublePinyinMSPY += part
doublePinyinZiGuang += part
doublePinyinABC += part
} else if len(hanPinyin[part]) > 1 { // 多音字,按字典指定的读音
if value, ok := polyphones[text+" > "+part]; ok {
pinyin += value
doublePinyin += convertToDoublePinyin(value, doublePinyinMap)
doublePinyinFlypy += convertToDoublePinyin(value, doublePinyinFlypyMap)
doublePinyinMSPY += convertToDoublePinyin(value, doublePinyinMSPYMap)
doublePinyinZiGuang += convertToDoublePinyin(value, doublePinyinZiGuangMap)
doublePinyinABC += convertToDoublePinyin(value, doublePinyinABCMap)
} else {
log.Fatalln("❌ 未处理的多音字", text, part)
}
} else { // 其他,按唯一的读音
pinyin += hanPinyin[part][0]
doublePinyin += convertToDoublePinyin(hanPinyin[part][0], doublePinyinMap)
doublePinyinFlypy += convertToDoublePinyin(hanPinyin[part][0], doublePinyinFlypyMap)
doublePinyinMSPY += convertToDoublePinyin(hanPinyin[part][0], doublePinyinMSPYMap)
doublePinyinZiGuang += convertToDoublePinyin(hanPinyin[part][0], doublePinyinZiGuangMap)
doublePinyinABC += convertToDoublePinyin(hanPinyin[part][0], doublePinyinABCMap)
}
}
return []string{
pinyin,
doublePinyin,
doublePinyinFlypy,
doublePinyinMSPY,
doublePinyinZiGuang,
doublePinyinABC,
}
}
// splitMixedWords splits an entry into consecutive Latin-letter runs and
// single non-Latin characters, dropping interpuncts ("·") and hyphens ("-").
// A dropped separator does not end the Latin run it sits in.
//
//	"哆啦A梦" → ["哆", "啦", "A", "梦"]
//	"QQ号" → ["QQ", "号"]
//	"Wi-Fi密码" → ["WiFi", "密", "码"]
//	"特拉法尔加·D·瓦铁尔·罗" → ["特", "拉", "法", "尔", "加", "D", "瓦", "铁", "尔", "罗"]
func splitMixedWords(input string) []string {
	var (
		parts []string
		latin []rune // Latin run collected so far
	)
	// flush emits the pending Latin run, if any, as one part.
	flush := func() {
		if len(latin) > 0 {
			parts = append(parts, string(latin))
			latin = latin[:0]
		}
	}
	for _, ch := range input {
		switch {
		case ch == '·' || ch == '-':
			// Separator: skipped without closing the current run.
		case unicode.Is(unicode.Latin, ch):
			latin = append(latin, ch)
		default:
			flush()
			parts = append(parts, string(ch))
		}
	}
	flush()
	return parts
}
// doublePinyinInitialRe matches the initial (leading consonant) of a pinyin
// syllable. It is compiled once here instead of on every conversion.
var doublePinyinInitialRe = regexp.MustCompile(`^(b|p|m|f|d|t|n|l|g|k|h|j|q|x|zh|ch|sh|r|z|c|s|y|w)`)

// convertToDoublePinyin converts one toneless pinyin syllable to its
// double-pinyin code using the scheme table m.
//
// Zero-initial syllables (a, e, o, ai, ...) are looked up as a whole under
// the key "-<code>-". Any other syllable is split into initial and final:
// zh/ch/sh and finals longer than one letter are replaced through m, while
// single-letter initials and finals are kept as they are. A multi-letter
// final that is missing from m contributes an empty string.
func convertToDoublePinyin(code string, m map[string]string) string {
	// Zero-initial syllables have dedicated whole-syllable entries.
	switch code {
	case "a", "e", "o", "ai", "ei", "ou", "an", "en", "ang", "eng", "ao", "er":
		return m["-"+code+"-"]
	}

	// Split into initial and final. The pattern is anchored at the start, so
	// the final is simply whatever follows the matched initial.
	initial := doublePinyinInitialRe.FindString(code)
	final := code[len(initial):]

	// Only the two-letter initials need a substitute key.
	if initial == "zh" || initial == "ch" || initial == "sh" {
		initial = m[initial]
	}
	// Single-letter finals already occupy one key.
	if len(final) > 1 {
		final = m[final]
	}
	return initial + final
}