Browse Source

update: traditional vocabulary white list

master
Dnomd343 2 years ago
parent
commit
948f3f957d
  1. 41
      src/chinese/convert.py

41
src/chinese/convert.py

@ -1,10 +1,36 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import sys
import json
from snownlp import SnowNLP
# Default data directory: '../../release/' joined onto the folder that holds
# this script (the script path is resolved with os.path.realpath first).
defaultPath = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../release/')
# Phrases that traditionalCheck() deletes from a sentence (one str.replace per
# entry, in list order) before the sentence is compared with its SnowNLP
# simplified form.
# NOTE(review): presumably these are legitimate simplified-Chinese words or
# fragments that the converter would otherwise rewrite and thus report as
# traditional text — confirm before adding or removing entries.
traditionalWhiteList = [
'其余', '残余', '有余', '多余', '之余', '梦余', '空余',
'业余', '剩余', '余地', '余光', '余音', '余后', '余下',
'余生', '余力', '余毒', '余出', '余晖', '余钱', '余脉',
'余痛', '余年', '余温', '余额', '余人', '余先生',
'上乾下坤', '一览无余', '茶余饭后', '著鞭跨马',
'慰藉', '狼藉', '蕴藉', '碟片', '哪吒', '括弧', '瞭望',
'覆盖', '覆舟', '覆到', '覆碗', '幺蛾子', '雪糕', '共用',
'翻来覆去', '覆水难收', '覆手为雨', '下覆昆仑', '翻天覆地',
'乾兑', '乾旋', '大乾', '为乾', '乾元',
'登乾', '乾卦', '丁乾', '乾金', '战乎乾', '乾三连',
'乾西北', '乾在上', '乾为天', '乾代表天', '乾为上卦',
'的士兵', '吒为正义', '金吒木吒郎', '连夜学一学', '乾、',
]
def traditionalCheck(sentence: str) -> None:
for c in traditionalWhiteList: # skip white list
sentence = sentence.replace(c, '')
simplified = SnowNLP(sentence).han # convert into simplified chinese
if simplified == sentence: # simplified chinese already
return
@ -19,5 +45,18 @@ def traditionalCheck(sentence: str) -> None:
))
# Sample call: runs the check on a sentence written in traditional characters.
traditionalCheck('繁體中文的叫法在臺灣亦很常見')
def loadContent(filename: str) -> list:  # load json content
    """Load a JSON file from ``defaultPath`` and flatten it into one list.

    The ``.json`` suffix is appended when ``filename`` does not already end
    with it. The parsed document is iterated with ``.items()``, so the top
    level must be a JSON object.

    :param filename: file name relative to ``defaultPath``, with or without
        the ``.json`` suffix.
    :return: for every entry, in file order, the key followed by the items
        of its value (the value is concatenated onto the list with ``+=``).
    :raises OSError: if the file cannot be opened.
    :raises json.JSONDecodeError: if the file is not valid JSON.
    """
    if not filename.endswith('.json'):
        filename += '.json'  # add file suffix
    # Read as explicit UTF-8 so the Chinese text decodes identically on every
    # platform, and use a context manager so the handle is always closed.
    with open(os.path.join(defaultPath, filename), encoding='utf-8') as fp:
        raw = json.load(fp)
    combine = []
    for title, content in raw.items():
        combine.append(title)
        combine += content
    return combine
# Script driver: load the file named by the first command-line argument and
# run the traditional-character check on every string it yields.
for row in loadContent(sys.argv[1]):
    traditionalCheck(row)

Loading…
Cancel
Save