
feat: crawler script of `zhihu.com`

master
Dnomd343 2 years ago
commit 11e46f3549
  1. src/crawler/zhihu.com/crawler.sh  (8 changed lines)
  2. src/crawler/zhihu.com/extract.py  (4 changed lines)
  3. src/crawler/zhihu.com/fetch.py  (2 changed lines)

src/crawler/zhihu.com/crawler.sh  (8 changed lines)

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+cd "$(dirname "$0")"
+rm -rf ./data/
+mkdir -p ./data/
+python3 fetch.py ./data/content.json
+python3 extract.py ./data/content.json > ./data/xxrs.json
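
The new crawler.sh just chains the two Python scripts: fetch.py downloads the column data into ./data/content.json, and extract.py converts it into ./data/xxrs.json. As a quick sanity check of the result, a minimal sketch, assuming (per extract.py below) that xxrs.json holds a JSON object mapping chapter captions to their content:

    import json

    # Sketch only: read the file produced by crawler.sh, assuming extract.py
    # dumps `ret` (a caption -> content mapping) to stdout as JSON.
    with open('./data/xxrs.json', encoding='utf-8') as fp:
        chapters = json.load(fp)

    for caption in chapters:
        print(caption)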

src/crawler/zhihu.com/extract.py  (4 changed lines)

@@ -82,10 +82,8 @@ def splitHtml(rawHtml: str) -> list:
     return result
-logger.warning('Extract info of `zhihu.com`')
-sys.argv.append('./data/content.json')
 ret = {}
+logger.warning('Extract info of `zhihu.com`')
 for dat in loadData():
     for chapter in splitHtml(dat['content']):
         ret[chapter['caption']] = chapter['content']
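
This change drops the hard-coded sys.argv.append('./data/content.json') fallback and moves the start-up log after ret = {}; the input path now always comes from crawler.sh. loadData() itself is not part of this hunk; a plausible sketch, assuming it reads the raw column response written by fetch.py and returns its 'data' items, each carrying the article HTML consumed by splitHtml():

    import json
    import sys

    # Sketch only -- loadData() is not shown in this commit. Assumes sys.argv[1]
    # points at the zhihu.com column API response saved by fetch.py, whose 'data'
    # array holds items with an HTML 'content' field.
    def loadData() -> list:
        with open(sys.argv[1], encoding='utf-8') as fileObj:
            return json.load(fileObj)['data']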

src/crawler/zhihu.com/fetch.py  (2 changed lines)

@@ -12,7 +12,7 @@ sys.path.append('..')
 from utils import logger
 from utils import httpRequest
-logger.warning('Fetch html of `zhihu.com`')
+logger.warning('Fetch json of `zhihu.com`')
 jsonRaw = httpRequest('https://www.zhihu.com/api/v4/columns/c_1553471910075449344/items?limit=%d&offset=0' % 23)
 with open(sys.argv[1], 'wb') as fileObj:
     fileObj.write(jsonRaw)
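
The only change in fetch.py is the log message: the script pulls the column items as JSON, not HTML. httpRequest comes from the shared utils module and, judging by the binary write, returns the raw response bytes; a rough stand-in, an assumption rather than the project's actual implementation, could look like:

    import urllib.request

    # Sketch only -- the real httpRequest lives in the crawler's utils module and
    # is not shown here. Assumes a plain GET that returns the body as bytes; the
    # User-Agent header is a guess, since zhihu.com tends to reject the default
    # Python one.
    def httpRequest(url: str) -> bytes:
        request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urllib.request.urlopen(request) as response:
            return response.read()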
