From c455827851cebd614725143214aba19d653875a2 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Tue, 18 Oct 2022 05:19:41 +0800 Subject: [PATCH] update: extract content --- src/crawler/zhihu.com/extract.py | 61 +++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 21 deletions(-) diff --git a/src/crawler/zhihu.com/extract.py b/src/crawler/zhihu.com/extract.py index 632089f..a6c6f4b 100644 --- a/src/crawler/zhihu.com/extract.py +++ b/src/crawler/zhihu.com/extract.py @@ -31,7 +31,7 @@ def loadData() -> list: return sorted(data, key = sortFunc) -def splitHtml(rawHtml: str): +def splitHtml(rawHtml: str) -> list: html = BeautifulSoup(rawHtml, 'lxml') def isCaption(obj: BeautifulSoup) -> bool: @@ -61,31 +61,50 @@ def splitHtml(rawHtml: str): .replace('五', '5').replace('六', '6').replace('七', '7').replace('八', '8').replace('九', '9') return '第%s章 %s' % (numStr, match[2].strip()) + result = [] + caption = '' + content = [] for item in html.body.contents: - # print(item) - # continue - - if isCaption(item): - caption = formatCaption(item.text) - print(caption) - - # caption = item.text - # match = re.search(r'^第(\d+)章', caption) - # if match is not None: - # caption = match[1] - # elif re.search(r'^第(\S+)章', caption) is not None: - # caption = caption.replace('一', '1') - # print('ok') - - # print(caption) - - # print(item) + if not isCaption(item): + content.append(item) + continue + result.append({ + 'caption': caption, + 'content': content, + }) + content = [] + caption = formatCaption(item.text) + return result + + + # for item in html.body.contents: + # if not isCaption(item): + # content.append(item) + # continue + # yield { + # 'caption': formatCaption(item.text), + # 'content': content + # } + # content.clear() logger.warning('Extract info of `zhihu.com`') sys.argv.append('./data/content.json') +dat = loadData() +for r in splitHtml(dat[0]['content']): + print(r['caption']) + +# while True: # traverse generator +# try: +# d = next(s) +# print(d['caption']) +# if d['caption'] in ['第1章 此女一生福名扬', '第2章 有人']: +# for r in d['content']: +# print(r) +# # print(next(s)) +# except StopIteration: +# break + [splitHtml(x['content']) for x in loadData()] # splitHtml(loadData()[0]['content']) -# splitHtml(loadData()[1]['content']) -# splitHtml(loadData()[0]['content'])