Browse Source

update: symbol sequence repetition rules

master
Dnomd343 2 years ago
parent
commit
f5727f7a59
  1. 36
      src/punctuation/sentence.py

36
src/punctuation/sentence.py

@ -2,8 +2,11 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import os import os
import sys
import json import json
delimiter = ''
punctuations = [ punctuations = [
' ', '-', '.', '~', '·', '', ' ', '-', '.', '~', '·', '',
'', '', '', '', '', '', '', '', '', '',
@ -11,6 +14,23 @@ punctuations = [
'', '', '', '', '', '', '', '',
] ]
# Symbol sequences whose back-to-back repetitions are collapsed into a single
# occurrence by removeDuplicate(). Entries are processed in list order.
# NOTE(review): several literals show up as '' in this view -- presumably
# non-ASCII punctuation lost during extraction; confirm against the repository.
duplicates = [
'~', '', '',
delimiter + '',
delimiter + '',
delimiter + '',
delimiter + '',
delimiter + '~',
delimiter + '……',
delimiter + '——',
delimiter + '' + delimiter + '',
delimiter + '' + delimiter + '',
delimiter + '' + delimiter + '',
delimiter + '' + delimiter + '~',
delimiter + '' + delimiter + '……',
delimiter + '' + delimiter + '',
]
defaultPath = os.path.join( defaultPath = os.path.join(
os.path.dirname(os.path.realpath(__file__)), '../../release/' os.path.dirname(os.path.realpath(__file__)), '../../release/'
) )
@ -39,16 +59,26 @@ def abstract(raw: str) -> str: # keep only punctuation in sentence
if c == '' and result[-1] == '': if c == '' and result[-1] == '':
continue continue
result.append(c) result.append(c)
return ''.join(['' if x == '' else x for x in result]) return ''.join([delimiter if x == '' else x for x in result])
def removeDuplicate(sentence: str) -> str:
    """Collapse consecutive repeats of each configured symbol sequence.

    For every entry of the module-level ``duplicates`` list, taken in list
    order, any two adjacent copies of the entry are replaced by a single copy.
    The replacement is repeated until the string stops changing, then the
    next entry is handled.

    :param sentence: the string to normalise.
    :return: the string with no entry of ``duplicates`` occurring twice in a
        row at the time that entry was processed.
    """
    for pattern in duplicates:
        doubled = pattern + pattern
        collapsed = sentence.replace(doubled, pattern)
        # Keep replacing until a pass leaves the string unchanged; a single
        # pass can leave new adjacent pairs behind (e.g. four copies -> two).
        while collapsed != sentence:
            sentence = collapsed
            collapsed = sentence.replace(doubled, pattern)
    return sentence
def sentenceType(content: list) -> list: def sentenceType(content: list) -> list:
result = set() result = set()
for row in content: for row in content:
result.add(abstract(row)) result.add(removeDuplicate(abstract(row)))
return list(sorted(result)) return list(sorted(result))
print('\n'.join( print('\n'.join(
sentenceType(loadContent('rc-5')) sentenceType(loadContent(sys.argv[1]))
)) ))

Loading…
Cancel
Save