xxrs-crawler/src/crawler/zhihu.com/extract.py


								#!/usr/bin/env python3

								# -*- coding: utf-8 -*-


								"""

								Extract data from raw json content.


								    USAGE: python3 extract.py [JSON_FILE]

								"""


								import re

								import sys

								import json

								sys.path.append('..')

								from utils import logger

								from bs4 import BeautifulSoup


								def loadData() -> list:
								    """Load the posts from the JSON file named by ``sys.argv[1]``, in reading order.

								    The file must contain an object whose ``data`` entry is a list of posts,
								    each having ``id``, ``title`` and ``content`` keys; other keys are dropped.

								    Returns:
								        A list of ``{'id', 'title', 'content'}`` dicts sorted by the part
								        number derived from each title (see ``sortFunc``).

								    Raises:
								        ValueError: if a title, once ``栩栩若生`` is removed, is neither empty,
								            ``（全文完）`` nor something ``int()`` accepts.
								    """
								    # Decode explicitly as UTF-8 (the payload is Chinese text, so the platform
								    # default encoding is not reliable) and close the file once it is parsed.
								    with open(sys.argv[1], encoding='utf-8') as jsonFile:
								        rawData = json.load(jsonFile)

								    data = [{
								        'id': x['id'],
								        'title': x['title'],
								        'content': x['content'],
								    } for x in rawData['data']]

								    def sortFunc(x: dict) -> int:
								        """Return the part number encoded in a post title."""
								        suffix = x['title'].replace('栩栩若生', '')
								        if suffix == '':
								            return 1  # `栩栩若生` -> `栩栩若生1`
								        if suffix == '（全文完）':
								            return 22  # `栩栩若生（全文完）` -> `栩栩若生22`
								        return int(suffix)

								    return sorted(data, key=sortFunc)


								# Text of <h2>/<h3> elements that must NOT be treated as chapter captions.
								_NON_CAPTION_HEADINGS = (
								    '人生第一次如此无语。',
								    '第三棒是伍哥。',
								    '东风初送第一船。',
								)

								# Text that is always treated as a caption, whatever element carries it.
								_PLAIN_TEXT_CAPTIONS = ('正文第870章对手',)

								_CAPTION_PREFIX = '正文'

								_ZH_DIGITS = {
								    '零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
								    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
								}
								_ZH_UNITS = {'十': 10, '百': 100, '千': 1000}


								def _zhNumeralToStr(zhStr: str) -> str:
								    """Convert a Chinese numeral such as ``二十一`` to its decimal string ``21``.

								    A string without any unit character (十/百/千) is translated digit by
								    digit, leaving unknown characters in place. A string with units is
								    evaluated by value, so ``四十`` gives ``40`` and ``一百零五`` gives ``105``.
								    A string that mixes units with unknown characters is returned unchanged.
								    """
								    if not any(ch in _ZH_UNITS for ch in zhStr):
								        return ''.join(str(_ZH_DIGITS.get(ch, ch)) for ch in zhStr)
								    if any(ch not in _ZH_DIGITS and ch not in _ZH_UNITS for ch in zhStr):
								        return zhStr

								    total = 0
								    pending = 0  # digit waiting for the unit that scales it
								    for ch in zhStr:
								        if ch in _ZH_DIGITS:
								            pending = _ZH_DIGITS[ch]
								        else:
								            # A bare unit (as in `十` or `十五`) implies a leading one.
								            total += (pending or 1) * _ZH_UNITS[ch]
								            pending = 0
								    return str(total + pending)


								def _isCaption(obj) -> bool:
								    """Tell whether a child node of the parsed body is a chapter caption."""
								    text = obj.text
								    if obj.name in ('h2', 'h3') and text not in _NON_CAPTION_HEADINGS:
								        return True
								    return text in _PLAIN_TEXT_CAPTIONS


								def _formatCaption(raw: str) -> str:
								    """Normalise a caption to ``第<arabic number>章 <title>``.

								    A leading ``正文`` is dropped first. Captions numbered with Chinese
								    numerals are converted to Arabic digits. Text that does not look like
								    ``第…章…`` is returned as is (minus the prefix) instead of raising.
								    """
								    if raw.startswith(_CAPTION_PREFIX):
								        # Strip the prefix only; later occurrences belong to the title.
								        raw = raw[len(_CAPTION_PREFIX):]

								    match = re.search(r'^第(\d+)章(.*)', raw)
								    if match is not None:
								        return '第%s章 %s' % (match[1], match[2].strip())

								    # Chinese-numeral form; the space after `章` is optional, as it is above.
								    match = re.search(r'^第(\S+?)章(.*)', raw)
								    if match is None:
								        return raw
								    return '第%s章 %s' % (_zhNumeralToStr(match[1]), match[2].strip())


								def splitHtml(rawHtml: str) -> None:
								    """Print the normalised caption of every chapter heading in ``rawHtml``.

								    Only the direct children of ``<body>`` are inspected. Input that yields
								    no ``<body>`` (for example an empty string) prints nothing.

								    Args:
								        rawHtml: HTML source of one post, parsed with the ``lxml`` parser.
								    """
								    html = BeautifulSoup(rawHtml, 'lxml')
								    body = html.body
								    if body is None:
								        return

								    for item in body.contents:
								        if _isCaption(item):
								            print(_formatCaption(item.text))


								# Script body: announce the run, then print the captions found in every post.
								logger.warning('Extract info of `zhihu.com`')

								# Default input file. Appending leaves a user-supplied JSON_FILE at
								# sys.argv[1] and only fills that slot when no argument was given.
								sys.argv.append('./data/content.json')

								# A plain loop: splitHtml is called for its printing side effect only, so
								# there is no result list worth building.
								for post in loadData():
								    splitHtml(post['content'])