From 11e46f35497f09e73f3bfc306a320c7c10224fc8 Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Tue, 18 Oct 2022 13:06:12 +0800
Subject: [PATCH] feat: crawler script of `zhihu.com`

---
 src/crawler/zhihu.com/crawler.sh | 8 ++++++++
 src/crawler/zhihu.com/extract.py | 4 +---
 src/crawler/zhihu.com/fetch.py   | 2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)
 create mode 100755 src/crawler/zhihu.com/crawler.sh

diff --git a/src/crawler/zhihu.com/crawler.sh b/src/crawler/zhihu.com/crawler.sh
new file mode 100755
index 0000000..8a7e813
--- /dev/null
+++ b/src/crawler/zhihu.com/crawler.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+cd "$(dirname "$0")"
+rm -rf ./data/
+mkdir -p ./data/
+
+python3 fetch.py ./data/content.json
+python3 extract.py ./data/content.json > ./data/xxrs.json
diff --git a/src/crawler/zhihu.com/extract.py b/src/crawler/zhihu.com/extract.py
index 3d5cb05..2225dbc 100644
--- a/src/crawler/zhihu.com/extract.py
+++ b/src/crawler/zhihu.com/extract.py
@@ -82,10 +82,8 @@ def splitHtml(rawHtml: str) -> list:
     return result
 
 
-logger.warning('Extract info of `zhihu.com`')
-sys.argv.append('./data/content.json')
-
 ret = {}
+logger.warning('Extract info of `zhihu.com`')
 for dat in loadData():
     for chapter in splitHtml(dat['content']):
         ret[chapter['caption']] = chapter['content']
diff --git a/src/crawler/zhihu.com/fetch.py b/src/crawler/zhihu.com/fetch.py
index 5627189..775ca32 100644
--- a/src/crawler/zhihu.com/fetch.py
+++ b/src/crawler/zhihu.com/fetch.py
@@ -12,7 +12,7 @@ sys.path.append('..')
 from utils import logger
 from utils import httpRequest
 
-logger.warning('Fetch html of `zhihu.com`')
+logger.warning('Fetch json of `zhihu.com`')
 jsonRaw = httpRequest('https://www.zhihu.com/api/v4/columns/c_1553471910075449344/items?limit=%d&offset=0' % 23)
 with open(sys.argv[1], 'wb') as fileObj:
     fileObj.write(jsonRaw)