#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Extract data from raw json content.

USAGE: python3 extract.py [JSON_FILE]
"""

import re
import sys
import json
sys.path.append('..')
from utils import logger
from bs4 import BeautifulSoup


# Texts of h2/h3 elements that must NOT be treated as chapter captions.
_NON_CAPTION_HEADINGS = (
    '人生第一次如此无语。',
    '第三棒是伍哥。',
    '东风初送第一船。',
)

# Chinese numeral characters understood by `_zhToArabic`.
_ZH_DIGITS = {
    '零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
}
_ZH_UNITS = {'十': 10, '百': 100, '千': 1000}
_ZH_DIGIT_TABLE = str.maketrans({zh: str(num) for zh, num in _ZH_DIGITS.items()})


def _zhToArabic(zhStr: str) -> str:
    """Convert a Chinese numeral such as `二十三` into the string `23`.

    Numerals without a unit character (十/百/千) are translated digit by
    digit and any other character is left untouched, e.g. `一二` -> `12`.
    Numerals containing a unit character are evaluated by value, e.g.
    `十` -> `10`, `四十` -> `40`, `一百零五` -> `105`.

    Raises:
        ValueError: a numeral containing a unit character also contains a
            character that is neither a digit nor a unit.
    """
    if not any(ch in _ZH_UNITS for ch in zhStr):
        return zhStr.translate(_ZH_DIGIT_TABLE)

    total = 0
    pending = None  # digit read but not yet multiplied by a unit
    for ch in zhStr:
        if ch in _ZH_DIGITS:
            pending = _ZH_DIGITS[ch]
        elif ch in _ZH_UNITS:
            # A bare unit (as in `十五`) has an implicit leading `一`.
            total += (1 if pending is None else pending) * _ZH_UNITS[ch]
            pending = None
        else:
            raise ValueError(
                'unsupported character %r in chapter number %r' % (ch, zhStr)
            )
    if pending is not None:
        total += pending
    return str(total)


def loadData() -> list:
    """Load the entries of the JSON file named by `sys.argv[1]`.

    The file must hold an object whose `data` member is a list of objects
    with `id`, `title` and `content` members. Only those three members are
    kept, and the entries are returned sorted by the number that follows
    `栩栩若生` in the title.

    Raises:
        ValueError: a title suffix cannot be converted to an integer.
    """
    # JSON text is UTF-8; do not depend on the platform default encoding,
    # and make sure the handle is closed again.
    with open(sys.argv[1], encoding='utf-8') as jsonFile:
        rawData = json.load(jsonFile)

    data = [{
        'id': x['id'],
        'title': x['title'],
        'content': x['content'],
    } for x in rawData['data']]

    def sortFunc(x: dict) -> int:
        suffix = x['title'].replace('栩栩若生', '')
        suffix = '1' if suffix == '' else suffix  # `栩栩若生` -> `栩栩若生1`
        suffix = '22' if suffix == '(全文完)' else suffix  # `栩栩若生(全文完)` -> `栩栩若生22`
        return int(suffix)

    return sorted(data, key=sortFunc)


def splitHtml(rawHtml: str) -> None:
    """Print the normalised caption of every chapter found in `rawHtml`.

    The markup is parsed with the `lxml` parser and the direct children of
    `<body>` are scanned. Each child recognised as a chapter caption is
    printed as `第<number>章 <title>` with an Arabic chapter number.

    Raises:
        ValueError: a recognised caption does not have the `第…章` shape.
    """
    html = BeautifulSoup(rawHtml, 'lxml')

    def isCaption(obj: BeautifulSoup) -> bool:
        """Tell whether a child node of the body is a chapter caption."""
        if obj.name in ('h2', 'h3') and obj.text not in _NON_CAPTION_HEADINGS:
            return True
        # This one caption is accepted whatever element carries it.
        return obj.text == "正文第870章对手"

    def formatCaption(raw: str) -> str:
        """Normalise a caption text to `第<arabic number>章 <title>`."""
        if raw.startswith('正文'):
            # Strip the prefix only; `正文` inside the title must survive.
            raw = raw[len('正文'):]

        match = re.search(r'^第(\d+)章(.*)', raw)
        if match is not None:
            return '第%s章 %s' % (match[1], match[2].strip())

        match = re.search(r'^第(\S+)章 (.*)', raw)
        if match is None:
            raise ValueError('unrecognised chapter caption: %r' % raw)
        return '第%s章 %s' % (_zhToArabic(match[1]), match[2].strip())

    if html.body is None:
        # Nothing was parsed into a body (e.g. empty input): no captions.
        return

    for item in html.body.contents:
        if isCaption(item):
            caption = formatCaption(item.text)
            print(caption)
logger.warning('Extract info of `zhihu.com`')

# Appending leaves a path given on the command line in sys.argv[1]; this
# default only lands in that slot when no argument was supplied.
sys.argv.append('./data/content.json')

# Process every loaded entry in order.
# To debug a single entry: splitHtml(loadData()[0]['content'])
for entry in loadData():
    splitHtml(entry['content'])