Browse Source

update: release `zhihu.com`

master
Dnomd343 2 years ago
parent
commit
afc75d6bce
  1. 17
      src/crawler/zhihu.com/extract.py

17
src/crawler/zhihu.com/extract.py

@@ -85,15 +85,8 @@ def splitHtml(rawHtml: str) -> list:
logger.warning('Extract info of `zhihu.com`') logger.warning('Extract info of `zhihu.com`')
sys.argv.append('./data/content.json') sys.argv.append('./data/content.json')
dat = loadData() ret = {}
ret = [] for dat in loadData():
[ret.extend(splitHtml(x['content'])) for x in dat] for chapter in splitHtml(dat['content']):
ret[chapter['caption']] = chapter['content']
for r in ret: print(json.dumps(ret))
print(r['caption'])
# for r in ret[0]['content']:
# print(r)
# [splitHtml(x['content']) for x in loadData()]
# splitHtml(loadData()[0]['content'])

Loading…
Cancel
Save