#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""Reduce sentences to their punctuation skeleton.

Recovered from the patch "feat: punctuation abstraction"
(commit 1182ee4448758c602d26050a3d3edc616bef3587), which creates
``src/punctuation/sentence.py``.

Each sentence is rewritten so that every punctuation mark is kept and every
run of other characters is replaced by a single placeholder. Run as a
script, the module prints the distinct skeletons found in ``rc-5.json``.
"""

import os
import json

# Characters that survive abstraction; everything else is collapsed.
# NOTE(review): the last seven entries appear as ASCII here, but the list is
# otherwise in code-point order, which suggests they may originally have been
# the fullwidth forms -- confirm against the upstream file.
punctuations = [
    ' ', '-', '.', '~', '·', '—',
    '‘', '’', '“', '”', '…',
    '、', '。', '《', '》', '!', '(', ')',
    ',', ':', ';', '?',
]

# Directory that relative file names passed to loadContent are resolved in.
defaultPath = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), '../../release/'
)


def loadContent(filename: str) -> list:  # load json content
    """Load a JSON file and flatten it into one list.

    Args:
        filename: File name resolved against ``defaultPath``; the ``.json``
            suffix is appended when missing.

    Returns:
        For every key/value pair, in file order, the key followed by the
        items of its value.
        Assumes the document is an object whose values are lists of
        strings -- TODO confirm against the release files.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if not filename.endswith('.json'):
        filename += '.json'  # add file suffix
    # Close the handle deterministically and do not depend on the platform's
    # default encoding: the data contains non-ASCII text.
    with open(os.path.join(defaultPath, filename), encoding='utf-8') as fp:
        raw = json.load(fp)
    combine = []
    for title, content in raw.items():
        combine.append(title)
        combine.extend(content)
    return combine


def abstract(raw: str) -> str:  # keep only punctuation in sentence
    """Return the punctuation skeleton of ``raw``.

    Characters listed in ``punctuations`` are kept as they are; each maximal
    run of any other characters becomes a single '➕'.

    Args:
        raw: The sentence to abstract.

    Returns:
        The skeleton string; an empty input gives an empty string.
    """
    marks = frozenset(punctuations)  # read per call so edits to the list apply
    pieces = []
    inText = False  # True while inside a run that was already replaced
    for char in raw:
        if char in marks:
            pieces.append(char)
            inText = False
        elif not inText:
            pieces.append('➕')
            inText = True
    return ''.join(pieces)


def sentenceType(content: list) -> list:
    """Return the sorted, de-duplicated skeletons of all rows in ``content``.

    Args:
        content: Sentences to abstract.

    Returns:
        Distinct results of ``abstract`` in ascending string order.
    """
    return sorted({abstract(row) for row in content})


def main() -> None:
    """Print every distinct sentence skeleton found in ``rc-5.json``."""
    print('\n'.join(
        sentenceType(loadContent('rc-5'))
    ))


if __name__ == '__main__':
    # Only touch the filesystem when executed as a script, so that importing
    # this module has no side effects.
    main()