
update: extract info in all chapters

Dnomd343, 2 years ago (master)
commit 1e377fbbf7
src/crawler/zhihu.com/extract.py (39 lines changed)

@@ -66,7 +66,7 @@ def splitHtml(rawHtml: str) -> list:
     content = []
     for item in html.body.contents:
         if not isCaption(item):
-            content.append(item)
+            content.append(item.text)
             continue
         result.append({
             'caption': caption,
@@ -74,37 +74,26 @@ def splitHtml(rawHtml: str) -> list:
         })
         content = []
         caption = formatCaption(item.text)
+    result.append({
+        'caption': caption,
+        'content': content,
+    })
+    result.pop(0)
     return result
-# for item in html.body.contents:
-#     if not isCaption(item):
-#         content.append(item)
-#         continue
-#     yield {
-#         'caption': formatCaption(item.text),
-#         'content': content
-#     }
-#     content.clear()
 
 logger.warning('Extract info of `zhihu.com`')
 sys.argv.append('./data/content.json')
 dat = loadData()
-for r in splitHtml(dat[0]['content']):
+ret = []
+[ret.extend(splitHtml(x['content'])) for x in dat]
+for r in ret:
     print(r['caption'])
-# while True:  # traverse generator
-#     try:
-#         d = next(s)
-#         print(d['caption'])
-#         if d['caption'] in ['第1章 此女一生福名扬', '第2章 有人']:
-#             for r in d['content']:
-#                 print(r)
-#             # print(next(s))
-#     except StopIteration:
-#         break
-[splitHtml(x['content']) for x in loadData()]
+# for r in ret[0]['content']:
+#     print(r)
+# [splitHtml(x['content']) for x in loadData()]
 # splitHtml(loadData()[0]['content'])
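
For reference, a minimal sketch of what the updated splitHtml now does, pieced together from the two hunks above. The BeautifulSoup construction, the 'lxml' parser choice, and the isCaption / formatCaption helpers live outside this diff and are assumptions; only the grouping logic is taken from the change itself.

from bs4 import BeautifulSoup

def splitHtml(rawHtml: str) -> list:
    html = BeautifulSoup(rawHtml, 'lxml')  # assumption: parsing happens elsewhere in extract.py
    result = []
    content = []
    caption = ''  # collects any text that appears before the first caption
    for item in html.body.contents:
        if not isCaption(item):  # ordinary node: keep its text in the current section
            content.append(item.text)
            continue
        result.append({  # caption node: close the current section...
            'caption': caption,
            'content': content,
        })
        content = []  # ...and start a new one
        caption = formatCaption(item.text)
    result.append({  # new in this commit: flush the final section after the loop
        'caption': caption,
        'content': content,
    })
    result.pop(0)  # new in this commit: drop the empty section before the first caption
    return result

The driver change below the function switches from processing only the first chapter to every chapter returned by loadData(); the list comprehension is used purely for its ret.extend side effect, so a plain loop is equivalent:

ret = []
for chapter in loadData():
    ret.extend(splitHtml(chapter['content']))
for r in ret:
    print(r['caption'])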
