From 948f3f957d02a72ed77ecd360108980ea131798b Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Tue, 13 Dec 2022 16:59:04 +0800 Subject: [PATCH] update: traditional vocabulary white list --- src/chinese/convert.py | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/src/chinese/convert.py b/src/chinese/convert.py index bd0b296..e551357 100755 --- a/src/chinese/convert.py +++ b/src/chinese/convert.py @@ -1,10 +1,36 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +import os +import sys +import json from snownlp import SnowNLP +defaultPath = os.path.join( + os.path.dirname(os.path.realpath(__file__)), '../../release/' +) + +traditionalWhiteList = [ + '其余', '残余', '有余', '多余', '之余', '梦余', '空余', + '业余', '剩余', '余地', '余光', '余音', '余后', '余下', + '余生', '余力', '余毒', '余出', '余晖', '余钱', '余脉', + '余痛', '余年', '余温', '余额', '余人', '余先生', + + '上乾下坤', '一览无余', '茶余饭后', '著鞭跨马', + '慰藉', '狼藉', '蕴藉', '碟片', '哪吒', '括弧', '瞭望', + '覆盖', '覆舟', '覆到', '覆碗', '幺蛾子', '雪糕', '共用', + '翻来覆去', '覆水难收', '覆手为雨', '下覆昆仑', '翻天覆地', + + '乾兑', '乾旋', '大乾', '为乾', '乾元', + '登乾', '乾卦', '丁乾', '乾金', '战乎乾', '乾三连', + '乾西北', '乾在上', '乾为天', '乾代表天', '乾为上卦', + '的士兵', '吒为正义', '金吒木吒郎', '连夜学一学', '乾、', +] + def traditionalCheck(sentence: str) -> None: + for c in traditionalWhiteList: # skip white list + sentence = sentence.replace(c, '') simplified = SnowNLP(sentence).han # convert into simplified chinese if simplified == sentence: # simplified chinese already return @@ -19,5 +45,18 @@ def traditionalCheck(sentence: str) -> None: )) -traditionalCheck('繁體中文的叫法在臺灣亦很常見') +def loadContent(filename: str) -> list: # load json content + if not filename.endswith('.json'): + filename += '.json' # add file suffix + raw = json.loads(open( + os.path.join(defaultPath, filename) + ).read()) + combine = [] + for (title, content) in raw.items(): + combine.append(title) + combine += content + return combine + +for row in loadContent(sys.argv[1]): + traditionalCheck(row)