#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Extract data from raw json content.

    USAGE: python3 extract.py [JSON_FILE]

Every chapter caption found in the article HTML is normalised to the form
`第<number>章 <title>` and printed to stdout, one caption per line.
"""

# Provenance: src/crawler/zhihu.com/extract.py, introduced by commit
# 6cc718a6dd42cbc18a87ec8398caa229486c8946 (Dnomd343, Tue, 18 Oct 2022
# 04:46:09 +0800), subject "feat: extract caption of `zhihu.com`".

import re
import sys
import json
from typing import Optional

from bs4 import BeautifulSoup

# `utils` is not on the default import path; it is presumably a project-local
# module in the parent directory, so the path must be extended before import.
sys.path.append('..')
from utils import logger

# Input file used when no JSON_FILE argument is given on the command line.
DEFAULT_JSON_FILE = './data/content.json'

# Common prefix of every article title; what follows it is the article index.
SERIES_TITLE = '栩栩若生'

# <h2>/<h3> headings that appear in the articles but are NOT chapter captions.
NON_CAPTION_HEADINGS = (
    '人生第一次如此无语。',
    '第三棒是伍哥。',
    '东风初送第一船。',
)

# Captions that are not marked up as <h2>/<h3> in the raw HTML.
EXTRA_CAPTIONS = ('正文第870章对手',)

# Prefix that some captions carry in front of `第…章`.
CAPTION_PREFIX = '正文'

_ZH_DIGITS = {
    '零': 0, '〇': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
}
_ZH_UNITS = {'十': 10, '百': 100, '千': 1000}


def loadData(jsonFile: Optional[str] = None) -> list:
    """Load the articles from the raw JSON file, sorted by article index.

    :param jsonFile: path of the JSON file. When omitted, the first command
        line argument is used, falling back to `DEFAULT_JSON_FILE`.
    :return: list of `{'id', 'title', 'content'}` dicts built from the
        `data` array of the JSON document, in ascending article order.
    :raises ValueError: if a title suffix is not a known article index.
    """
    if jsonFile is None:
        jsonFile = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_JSON_FILE

    # Explicit encoding: the content is Chinese text and must not depend on
    # the locale default; `with` guarantees the handle is closed.
    with open(jsonFile, encoding='utf-8') as fp:
        rawData = json.load(fp)

    data = [{
        'id': x['id'],
        'title': x['title'],
        'content': x['content'],
    } for x in rawData['data']]

    def sortFunc(x: dict) -> int:
        suffix = x['title'].replace(SERIES_TITLE, '')
        suffix = {
            '': '1',  # `栩栩若生` -> `栩栩若生1`
            '(全文完)': '22',  # `栩栩若生(全文完)` -> `栩栩若生22`
        }.get(suffix, suffix)
        return int(suffix)

    return sorted(data, key=sortFunc)


def zhToArabic(zhStr: str) -> str:
    """Convert a Chinese numeral such as `二十一` into Arabic digits (`21`).

    Supports the digits 零-九 (plus 〇 and 两) and the units 十/百/千. A string
    made only of digits is read positionally (`一二` -> `12`). A string that
    contains any other character is returned unchanged.
    """
    if any(ch not in _ZH_DIGITS and ch not in _ZH_UNITS for ch in zhStr):
        return zhStr
    if not any(ch in _ZH_UNITS for ch in zhStr):
        return ''.join(str(_ZH_DIGITS[ch]) for ch in zhStr)

    total = 0
    pending = None  # digit waiting for its unit, e.g. the `二` of `二十`
    for ch in zhStr:
        if ch in _ZH_DIGITS:
            pending = _ZH_DIGITS[ch]
        else:
            # A unit without a leading digit counts once: `十` == `一十`.
            total += (1 if pending is None else pending) * _ZH_UNITS[ch]
            pending = None
    return str(total + (pending or 0))


def isCaption(obj) -> bool:
    """Tell whether a child node of `<body>` is a chapter caption.

    A node is a caption when it is an `<h2>`/`<h3>` whose text is not listed
    in `NON_CAPTION_HEADINGS`, or when its text is one of `EXTRA_CAPTIONS`.
    """
    text = obj.text
    if obj.name in ('h2', 'h3') and text not in NON_CAPTION_HEADINGS:
        return True
    return text in EXTRA_CAPTIONS


def formatCaption(raw: str) -> str:
    """Normalise a raw caption to `第<number>章 <title>`.

    :param raw: caption text, optionally prefixed with `正文`; the chapter
        number may be written in Arabic digits or as a Chinese numeral.
    :return: the normalised caption with exactly one space before the title.
    :raises ValueError: if `raw` does not look like `第…章 …`.
    """
    if raw.startswith(CAPTION_PREFIX):
        # Strip the leading marker only; the title itself may contain it.
        raw = raw[len(CAPTION_PREFIX):]

    match = re.search(r'^第(\d+)章(.*)', raw)
    if match is not None:
        return '第%s章 %s' % (match[1], match[2].strip())

    match = re.search(r'^第(\S+)章 (.*)', raw)
    if match is None:
        raise ValueError('unrecognized caption: %r' % raw)
    return '第%s章 %s' % (zhToArabic(match[1]), match[2].strip())


def splitHtml(rawHtml: str):
    """Print the normalised caption of every chapter found in `rawHtml`.

    Only the direct children of `<body>` are inspected.
    """
    html = BeautifulSoup(rawHtml, 'lxml')
    if html.body is None:  # empty document: nothing to extract
        return
    for item in html.body.contents:
        if isCaption(item):
            print(formatCaption(item.text))


def main() -> None:
    """Extract and print the captions of every article in the JSON file."""
    logger.warning('Extract info of `zhihu.com`')
    for article in loadData():
        splitHtml(article['content'])


if __name__ == '__main__':
    main()