From f5727f7a59173c52cf8df8e9ac08fbca9b53914a Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 12 Mar 2023 23:01:21 +0800
Subject: [PATCH] update: symbol sequence repetition rules

---
 src/punctuation/sentence.py | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/src/punctuation/sentence.py b/src/punctuation/sentence.py
index 976b5e2..93c195f 100755
--- a/src/punctuation/sentence.py
+++ b/src/punctuation/sentence.py
@@ -2,8 +2,11 @@
 # -*- coding: utf-8 -*-
 
 import os
+import sys
 import json
 
+delimiter = '➕'
+
 punctuations = [
     ' ', '-', '.', '~', '·', '—',
     '‘', '’', '“', '”', '…',
@@ -11,6 +14,23 @@ punctuations = [
     ',', ':', ';', '?',
 ]
 
+duplicates = [
+    '~', '!', '?',
+    delimiter + ',',
+    delimiter + '!',
+    delimiter + '?',
+    delimiter + '、',
+    delimiter + '~',
+    delimiter + '……',
+    delimiter + '——',
+    delimiter + ',' + delimiter + '!',
+    delimiter + ',' + delimiter + '?',
+    delimiter + ',' + delimiter + '、',
+    delimiter + ',' + delimiter + '~',
+    delimiter + ',' + delimiter + '……',
+    delimiter + ',' + delimiter + '。',
+]
+
 defaultPath = os.path.join(
     os.path.dirname(os.path.realpath(__file__)), '../../release/'
 )
@@ -39,16 +59,26 @@ def abstract(raw: str) -> str: # keep only punctuation in sentence
         if c == '' and result[-1] == '':
             continue
         result.append(c)
-    return ''.join(['➕' if x == '' else x for x in result])
+    return ''.join([delimiter if x == '' else x for x in result])
+
+
+def removeDuplicate(sentence: str) -> str:
+    for duplicate in duplicates:
+        while True:
+            tmp = sentence.replace(duplicate + duplicate, duplicate)
+            if tmp == sentence:
+                break
+            sentence = tmp
+    return sentence
 
 
 def sentenceType(content: list) -> list:
     result = set()
     for row in content:
-        result.add(abstract(row))
+        result.add(removeDuplicate(abstract(row)))
     return list(sorted(result))
 
 
 print('\n'.join(
-    sentenceType(loadContent('rc-5'))
+    sentenceType(loadContent(sys.argv[1]))
 ))