Browse Source

feat: crawler script of `zhihu.com`

master
Dnomd343 2 years ago
parent
commit
11e46f3549
  1. 8
      src/crawler/zhihu.com/crawler.sh
  2. 4
      src/crawler/zhihu.com/extract.py
  3. 2
      src/crawler/zhihu.com/fetch.py

8
src/crawler/zhihu.com/crawler.sh

@ -0,0 +1,8 @@
#!/usr/bin/env bash
# Crawl the `zhihu.com` column: fetch the raw article JSON, then extract
# chapter captions/content into ./data/xxrs.json.
#
# Fix: fail fast — without `set -e`, a failed fetch.py would let extract.py
# run against missing/partial data and silently emit a broken xxrs.json.
set -euo pipefail

# Run relative to this script's own directory, regardless of caller CWD.
cd "$(dirname "$0")"

# Start from a clean data directory on every run.
rm -rf ./data/
mkdir -p ./data/

python3 fetch.py ./data/content.json
python3 extract.py ./data/content.json > ./data/xxrs.json

4
src/crawler/zhihu.com/extract.py

@ -82,10 +82,8 @@ def splitHtml(rawHtml: str) -> list:
return result
logger.warning('Extract info of `zhihu.com`')
sys.argv.append('./data/content.json')
ret = {}
logger.warning('Extract info of `zhihu.com`')
for dat in loadData():
for chapter in splitHtml(dat['content']):
ret[chapter['caption']] = chapter['content']

2
src/crawler/zhihu.com/fetch.py

@ -12,7 +12,7 @@ sys.path.append('..')
from utils import logger
from utils import httpRequest
logger.warning('Fetch html of `zhihu.com`')
logger.warning('Fetch json of `zhihu.com`')
jsonRaw = httpRequest('https://www.zhihu.com/api/v4/columns/c_1553471910075449344/items?limit=%d&offset=0' % 23)
with open(sys.argv[1], 'wb') as fileObj:
fileObj.write(jsonRaw)

Loading…
Cancel
Save