diff --git a/src/crawler/zhihu.com/extract.py b/src/crawler/zhihu.com/extract.py index a6c6f4b..e263fce 100644 --- a/src/crawler/zhihu.com/extract.py +++ b/src/crawler/zhihu.com/extract.py @@ -66,7 +66,7 @@ def splitHtml(rawHtml: str) -> list: content = [] for item in html.body.contents: if not isCaption(item): - content.append(item) + content.append(item.text) continue result.append({ 'caption': caption, @@ -74,37 +74,26 @@ def splitHtml(rawHtml: str) -> list: }) content = [] caption = formatCaption(item.text) + result.append({ + 'caption': caption, + 'content': content, + }) + result.pop(0) return result - # for item in html.body.contents: - # if not isCaption(item): - # content.append(item) - # continue - # yield { - # 'caption': formatCaption(item.text), - # 'content': content - # } - # content.clear() - - logger.warning('Extract info of `zhihu.com`') sys.argv.append('./data/content.json') dat = loadData() -for r in splitHtml(dat[0]['content']): +ret = [] +[ret.extend(splitHtml(x['content'])) for x in dat] + +for r in ret: print(r['caption']) -# while True: # traverse generator -# try: -# d = next(s) -# print(d['caption']) -# if d['caption'] in ['第1章 此女一生福名扬', '第2章 有人']: -# for r in d['content']: -# print(r) -# # print(next(s)) -# except StopIteration: -# break - -[splitHtml(x['content']) for x in loadData()] +# for r in ret[0]['content']: +# print(r) + +# [splitHtml(x['content']) for x in loadData()] # splitHtml(loadData()[0]['content'])