
update: extract content

master
Dnomd343 2 years ago
parent commit c455827851
1 changed file: src/crawler/zhihu.com/extract.py (61 lines changed)

@@ -31,7 +31,7 @@ def loadData() -> list:
     return sorted(data, key = sortFunc)
 
 
-def splitHtml(rawHtml: str):
+def splitHtml(rawHtml: str) -> list:
     html = BeautifulSoup(rawHtml, 'lxml')
 
     def isCaption(obj: BeautifulSoup) -> bool:
@@ -61,31 +61,50 @@ def splitHtml(rawHtml: str):
             .replace('', '5').replace('', '6').replace('', '7').replace('', '8').replace('', '9')
         return '%s%s' % (numStr, match[2].strip())
 
+    result = []
+    caption = ''
+    content = []
     for item in html.body.contents:
-        # print(item)
-        # continue
-        if isCaption(item):
-            caption = formatCaption(item.text)
-            print(caption)
-            # caption = item.text
-            # match = re.search(r'^第(\d+)章', caption)
-            # if match is not None:
-            #     caption = match[1]
-            # elif re.search(r'^第(\S+)章', caption) is not None:
-            #     caption = caption.replace('一', '1')
-            #     print('ok')
-            # print(caption)
-            # print(item)
+        if not isCaption(item):
+            content.append(item)
+            continue
+        result.append({
+            'caption': caption,
+            'content': content,
+        })
+        content = []
+        caption = formatCaption(item.text)
+    return result
+
+    # for item in html.body.contents:
+    #     if not isCaption(item):
+    #         content.append(item)
+    #         continue
+    #     yield {
+    #         'caption': formatCaption(item.text),
+    #         'content': content
+    #     }
+    #     content.clear()
 
 
 logger.warning('Extract info of `zhihu.com`')
 sys.argv.append('./data/content.json')
 
+dat = loadData()
+for r in splitHtml(dat[0]['content']):
+    print(r['caption'])
+
+# while True:  # traverse generator
+#     try:
+#         d = next(s)
+#         print(d['caption'])
+#         if d['caption'] in ['第1章 此女一生福名扬', '第2章 有人']:
+#             for r in d['content']:
+#                 print(r)
+#         # print(next(s))
+#     except StopIteration:
+#         break
+
 [splitHtml(x['content']) for x in loadData()]
 # splitHtml(loadData()[0]['content'])
+# splitHtml(loadData()[1]['content'])
+# splitHtml(loadData()[0]['content'])
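
The new splitHtml walks html.body.contents, buffers sibling nodes into content, and emits the buffered group each time the next caption line is reached. A minimal standalone sketch of the same grouping pattern follows; the sample HTML, the h2-based caption check, and the bare item.text caption are stand-ins for the crawler's isCaption and formatCaption, and, unlike the committed loop, the sketch also flushes the final chapter after the loop:

from bs4 import BeautifulSoup

RAW = '<h2>第1章</h2><p>aaa</p><p>bbb</p><h2>第2章</h2><p>ccc</p>'  # hypothetical sample input

def splitDemo(rawHtml: str) -> list:
    html = BeautifulSoup(rawHtml, 'lxml')
    result, caption, content = [], '', []
    for item in html.body.contents:
        if item.name != 'h2':           # stand-in for isCaption(item)
            content.append(item)        # buffer nodes under the current caption
            continue
        result.append({'caption': caption, 'content': content})   # emit the previous group
        content = []
        caption = item.text             # stand-in for formatCaption(item.text)
    result.append({'caption': caption, 'content': content})       # flush the last chapter (not done in the commit)
    return result

for chapter in splitDemo(RAW):
    print(chapter['caption'], [str(x) for x in chapter['content']])

As in the committed loop, the first emitted group carries an empty caption and no content when the document starts with a heading; callers that only want real chapters would skip it.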
