|
@ -2,8 +2,11 @@ |
|
|
# -*- coding: utf-8 -*- |
|
|
# -*- coding: utf-8 -*- |
|
|
|
|
|
|
|
|
import os |
|
|
import os |
|
|
|
|
|
import sys |
|
|
import json |
|
|
import json |
|
|
|
|
|
|
|
|
|
|
|
# Placeholder that abstract() writes in place of each empty entry of its
# result; it is also the prefix of most patterns listed in `duplicates`.
delimiter = '➕'
|
|
|
|
|
|
|
|
punctuations = [ |
|
|
punctuations = [ |
|
|
' ', '-', '.', '~', '·', '—', |
|
|
' ', '-', '.', '~', '·', '—', |
|
|
'‘', '’', '“', '”', '…', |
|
|
'‘', '’', '“', '”', '…', |
|
@ -11,6 +14,23 @@ punctuations = [ |
|
|
',', ':', ';', '?', |
|
|
',', ':', ';', '?', |
|
|
] |
|
|
] |
|
|
|
|
|
|
|
|
|
|
|
# Patterns whose back-to-back repetitions removeDuplicate() collapses into a
# single occurrence. The patterns are processed in list order.
duplicates = [
    '~', '!', '?',
    delimiter + ',',
    delimiter + '!',
    delimiter + '?',
    delimiter + '、',
    delimiter + '~',
    delimiter + '……',
    delimiter + '——',
    delimiter + ',' + delimiter + '!',
    delimiter + ',' + delimiter + '?',
    delimiter + ',' + delimiter + '、',
    delimiter + ',' + delimiter + '~',
    delimiter + ',' + delimiter + '……',
    delimiter + ',' + delimiter + '。',
]
|
|
|
|
|
|
|
|
# Release directory, addressed relative to the real (symlink-resolved)
# location of this file: two directory levels up, then `release/`.
defaultPath = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), '../../release/'
)
|
@ -39,16 +59,26 @@ def abstract(raw: str) -> str: # keep only punctuation in sentence |
|
|
if c == '' and result[-1] == '': |
|
|
if c == '' and result[-1] == '': |
|
|
continue |
|
|
continue |
|
|
result.append(c) |
|
|
result.append(c) |
|
|
return ''.join(['➕' if x == '' else x for x in result]) |
|
|
return ''.join([delimiter if x == '' else x for x in result]) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def removeDuplicate(sentence: str, patterns=None) -> str:
    """Collapse consecutive repetitions of each pattern into one occurrence.

    Patterns are handled one after another, in order. For each pattern,
    every doubled occurrence is replaced by a single one until the string
    stops changing, so a run of any length shrinks to exactly one copy.

    Args:
        sentence: The string to normalise.
        patterns: Iterable of pattern strings to collapse. When omitted,
            the module-level `duplicates` list is used.

    Returns:
        The sentence with all repeated patterns collapsed.
    """
    if patterns is None:
        patterns = duplicates
    for token in patterns:
        doubled = token + token
        collapsed = sentence.replace(doubled, token)
        # A single replace() pass only halves a run, so repeat until the
        # string reaches a fixed point.
        while collapsed != sentence:
            sentence = collapsed
            collapsed = sentence.replace(doubled, token)
    return sentence
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sentenceType(content: list) -> list:
    """Return the sorted, de-duplicated punctuation patterns of `content`.

    Each row is passed through abstract() and the result through
    removeDuplicate(); identical patterns are merged.

    Args:
        content: Iterable of rows accepted by abstract().

    Returns:
        A list of the distinct patterns in ascending order.
    """
    # A set merges identical patterns; sorted() already returns a list.
    return sorted({removeDuplicate(abstract(row)) for row in content})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Print every distinct sentence pattern, one per line, for the content that
# loadContent() returns for the first command-line argument.
print('\n'.join(
    sentenceType(loadContent(sys.argv[1]))
))
|
|