From afc75d6bce89f39c7a2d8a26cd3bcb786671e47d Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Tue, 18 Oct 2022 05:40:43 +0800
Subject: [PATCH] update: release `zhihu.com`

---
 src/crawler/zhihu.com/extract.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/src/crawler/zhihu.com/extract.py b/src/crawler/zhihu.com/extract.py
index e263fce..3d5cb05 100644
--- a/src/crawler/zhihu.com/extract.py
+++ b/src/crawler/zhihu.com/extract.py
@@ -85,15 +85,8 @@ def splitHtml(rawHtml: str) -> list:
 
 logger.warning('Extract info of `zhihu.com`')
 sys.argv.append('./data/content.json')
-dat = loadData()
-ret = []
-[ret.extend(splitHtml(x['content'])) for x in dat]
-
-for r in ret:
-    print(r['caption'])
-
-# for r in ret[0]['content']:
-#     print(r)
-
-# [splitHtml(x['content']) for x in loadData()]
-# splitHtml(loadData()[0]['content'])
+ret = {}
+for dat in loadData():
+    for chapter in splitHtml(dat['content']):
+        ret[chapter['caption']] = chapter['content']
+print(json.dumps(ret))