Dnomd343
2 years ago
1 changed files with 91 additions and 0 deletions
@ -0,0 +1,91 @@ |
|||
#!/usr/bin/env python3 |
|||
# -*- coding: utf-8 -*- |
|||
|
|||
""" |
|||
Extract data from raw json content. |
|||
|
|||
USAGE: python3 extract.py [JSON_FILE] |
|||
""" |
|||
|
|||
import re |
|||
import sys |
|||
import json |
|||
sys.path.append('..') |
|||
from utils import logger |
|||
from bs4 import BeautifulSoup |
|||
|
|||
|
|||
def loadData() -> list:
    """
    Load the article records from the JSON file named by `sys.argv[1]`.

    The file must contain an object whose `data` key holds a list of records;
    only the `id`, `title` and `content` keys of each record are kept.

    Returns:
        The records sorted by the number that follows `栩栩若生` in the title.

    Raises:
        ValueError: if a title suffix is neither empty, `(全文完)` nor a number.
    """
    # The context manager closes the handle (the bare `open(...).read()` leaked it),
    # and the explicit UTF-8 keeps the Chinese text independent of the locale default.
    with open(sys.argv[1], encoding='utf-8') as fp:
        rawData = json.load(fp)

    data = [{
        'id': x['id'],
        'title': x['title'],
        'content': x['content'],
    } for x in rawData['data']]

    def sortFunc(x: dict) -> int:
        """Return the numeric order of a record, taken from its title suffix."""
        suffix = x['title'].replace('栩栩若生', '')
        suffix = '1' if suffix == '' else suffix  # `栩栩若生` -> `栩栩若生1`
        suffix = '22' if suffix == '(全文完)' else suffix  # `栩栩若生(全文完)` -> `栩栩若生22`
        return int(suffix)

    return sorted(data, key=sortFunc)
|||
|
|||
|
|||
def splitHtml(rawHtml: str) -> None:
    """
    Print the normalized caption of every chapter heading found in `rawHtml`.

    The HTML is parsed with BeautifulSoup (`lxml` parser) and the direct
    children of `<body>` are scanned; each child recognized as a chapter
    caption is rewritten as `第<number>章 <title>` and printed.

    Raises:
        ValueError: if a node accepted as a caption does not look like
            `第…章…`, so the offending text shows up in the error message.
    """
    html = BeautifulSoup(rawHtml, 'lxml')
    if html.body is None:  # nothing was parsed (e.g. empty input), so nothing to scan
        return

    # Heading texts that use <h2>/<h3> markup but are not chapter captions.
    notCaptions = (
        '人生第一次如此无语。',
        '第三棒是伍哥。',
        '东风初送第一船。',
    )

    zhDigits = {
        '零': 0, '〇': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
        '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    }
    zhUnits = {'十': 10, '百': 100, '千': 1000}

    def isCaption(obj) -> bool:
        """Tell whether `obj` (a direct child node of `<body>`) is a chapter caption."""
        if obj.name in ('h2', 'h3') and obj.text not in notCaptions:
            return True
        # This one caption is matched by its text, whatever the node's tag is.
        return obj.text == "正文第870章对手"

    def zhToInt(zhStr: str):
        """
        Convert a Chinese numeral (e.g. `十`, `二十一`, `一百零五`) to an int.

        Digit-only strings are read positionally (`一二` -> 12). Returns None
        if `zhStr` contains a character that is not a Chinese numeral.
        """
        total, digit = 0, 0
        for char in zhStr:
            if char in zhDigits:
                digit = digit * 10 + zhDigits[char]
            elif char in zhUnits:
                total += (digit or 1) * zhUnits[char]  # a bare `十` counts as `一十`
                digit = 0
            else:
                return None
        return total + digit

    def formatCaption(raw: str) -> str:
        """Normalize a caption to `第<arabic number>章 <title>`."""
        if raw.startswith('正文'):
            raw = raw[len('正文'):]  # drop the prefix only, not later occurrences
        match = re.search(r'^第(\d+)章(.*)', raw)
        if match is not None:
            return '第%s章 %s' % (match[1], match[2].strip())
        match = re.search(r'^第(\S+)章 (.*)', raw)
        if match is None:
            # Fail with the offending text instead of an opaque TypeError on `None[1]`.
            raise ValueError('unrecognized caption: %r' % raw)
        number = zhToInt(match[1])
        # Text that is not a Chinese numeral is passed through unchanged.
        numStr = match[1] if number is None else str(number)
        return '第%s章 %s' % (numStr, match[2].strip())

    for item in html.body.contents:
        if isCaption(item):
            print(formatCaption(item.text))
|||
|
|||
|
|||
logger.warning('Extract info of `zhihu.com`')

# Default input path: `loadData` reads `sys.argv[1]`, so this appended value is
# only used when no JSON_FILE argument was given on the command line.
sys.argv.append('./data/content.json')

# Plain loop rather than a throwaway list comprehension: `splitHtml` is called
# only for its printed output.
for article in loadData():
    splitHtml(article['content'])
Loading…
Reference in new issue