Browse Source

feat: extract caption of `zhihu.com`

master
Dnomd343 2 years ago
parent
commit
6cc718a6dd
  1. 91
      src/crawler/zhihu.com/extract.py

91
src/crawler/zhihu.com/extract.py

@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract data from raw json content.
USAGE: python3 extract.py [JSON_FILE]
"""
import re
import sys
import json
sys.path.append('..')
from utils import logger
from bs4 import BeautifulSoup
def loadData() -> list:
    """Load the article records from the JSON file named by ``sys.argv[1]``.

    The file must contain an object whose ``data`` key is a list of records,
    each providing ``id``, ``title`` and ``content``; any other key of a
    record is dropped.

    Returns:
        A list of ``{'id', 'title', 'content'}`` dicts sorted in ascending
        order of the number that follows the book name in each title.

    Raises:
        ValueError: if a title suffix is not an integer once the two special
            cases handled in ``sortFunc`` have been applied.
    """
    # Decode explicitly as UTF-8 (the titles are Chinese, so the locale default
    # is not reliable) and close the handle deterministically.
    with open(sys.argv[1], encoding='utf-8') as fp:
        rawData = json.load(fp)

    data = [{
        'id': x['id'],
        'title': x['title'],
        'content': x['content'],
    } for x in rawData['data']]

    def sortFunc(x: dict) -> int:
        """Return the numeric position encoded in the record's title."""
        suffix = x['title'].replace('栩栩若生', '')
        suffix = '1' if suffix == '' else suffix  # bare book name counts as part 1
        suffix = '22' if suffix == '(全文完)' else suffix  # "full text end" marker counts as part 22
        return int(suffix)

    return sorted(data, key=sortFunc)
def splitHtml(rawHtml: str):
    """Print the normalized caption of every chapter heading in `rawHtml`.

    The markup is parsed with BeautifulSoup using the `lxml` parser and only
    the direct children of `<body>` are inspected. Every caption is printed as
    the chapter number in Arabic digits immediately followed by the stripped
    chapter title. Nothing is printed when the markup has no `<body>`.

    :param rawHtml: HTML text of one article.
    :return: None; the captions are written to stdout.
    """
    html = BeautifulSoup(rawHtml, 'lxml')

    # Headings that are ordinary sentences rather than chapter captions.
    notCaptions = (
        '人生第一次如此无语。',
        '第三棒是伍哥。',
        '东风初送第一船。',
    )

    # NOTE(review): the single-character numeral literals were unreadable in
    # the reviewed copy of this file; they are reconstructed here from the
    # surviving '三十零' / '二十零' / '十零' literals -- confirm against the
    # repository before merging.
    zhDigits = str.maketrans('零一二三四五六七八九', '0123456789')

    def isCaption(obj: BeautifulSoup) -> bool:
        """Tell whether a child node of `<body>` is a chapter caption."""
        if obj.name in ['h2', 'h3'] and obj.text not in notCaptions:
            return True
        # This caption is accepted whatever its tag is, so it is matched by
        # its exact text.
        return obj.text == "正文第870章对手"

    def formatCaption(raw: str) -> str:
        """Turn a raw caption into `<arabic number><title>`."""
        if raw.startswith('正文'):
            # Drop the prefix only; the title itself may contain the same word.
            raw = raw[len('正文'):]
        match = re.search(r'^第(\d+)章(.*)', raw)
        if match is not None:  # chapter number already written with digits
            return '%s%s' % (match[1], match[2].strip())
        match = re.search(r'^第(\S+)章 (.*)', raw)
        if match is None:
            # Unknown heading shape: report it and keep the text as-is rather
            # than failing on `None[1]`.
            logger.warning('Unrecognized caption `%s`' % raw)
            return raw.strip()
        zhStr = match[1]
        tens, ten, ones = zhStr.partition('十')
        if ten:
            # Make the numeral positional: a missing tens digit means one and
            # a missing units digit means zero (ten -> "10", twenty -> "20").
            zhStr = (tens or '一') + (ones or '零')
        return '%s%s' % (zhStr.translate(zhDigits), match[2].strip())

    if html.body is None:  # empty markup yields no <body> element
        return
    for item in html.body.contents:
        if isCaption(item):
            print(formatCaption(item.text))
# Script body: announce the run, then print the captions of every article.
logger.warning('Extract info of `zhihu.com`')
# Use the default data file when no JSON_FILE argument was supplied; the
# loader reads the path from `sys.argv[1]`.
if len(sys.argv) < 2:
    sys.argv.append('./data/content.json')
# Plain loop: `splitHtml` is called for its printing side effect only, so
# there is no result list worth building.
for article in loadData():
    splitHtml(article['content'])
Loading…
Cancel
Save