
update: extract content

master
Dnomd343 2 years ago
parent commit c455827851
1 changed file: src/crawler/zhihu.com/extract.py (61 lines changed)

@@ -31,7 +31,7 @@ def loadData() -> list:
     return sorted(data, key = sortFunc)
 
 
-def splitHtml(rawHtml: str):
+def splitHtml(rawHtml: str) -> list:
     html = BeautifulSoup(rawHtml, 'lxml')
 
     def isCaption(obj: BeautifulSoup) -> bool:
@@ -61,31 +61,50 @@ def splitHtml(rawHtml: str):
             .replace('', '5').replace('', '6').replace('', '7').replace('', '8').replace('', '9')
         return '%s%s' % (numStr, match[2].strip())
 
+    result = []
+    caption = ''
+    content = []
     for item in html.body.contents:
-        # print(item)
-        # continue
-        if isCaption(item):
-            caption = formatCaption(item.text)
-            print(caption)
-            # caption = item.text
-            # match = re.search(r'^第(\d+)章', caption)
-            # if match is not None:
-            #     caption = match[1]
-            # elif re.search(r'^第(\S+)章', caption) is not None:
-            #     caption = caption.replace('一', '1')
-            #     print('ok')
-            # print(caption)
-            # print(item)
+        if not isCaption(item):
+            content.append(item)
+            continue
+        result.append({
+            'caption': caption,
+            'content': content,
+        })
+        content = []
+        caption = formatCaption(item.text)
+    return result
+
+    # for item in html.body.contents:
+    #     if not isCaption(item):
+    #         content.append(item)
+    #         continue
+    #     yield {
+    #         'caption': formatCaption(item.text),
+    #         'content': content
+    #     }
+    #     content.clear()
 
 
 logger.warning('Extract info of `zhihu.com`')
 sys.argv.append('./data/content.json')
 
+dat = loadData()
+for r in splitHtml(dat[0]['content']):
+    print(r['caption'])
+
+# while True:  # traverse generator
+#     try:
+#         d = next(s)
+#         print(d['caption'])
+#         if d['caption'] in ['第1章 此女一生福名扬', '第2章 有人']:
+#             for r in d['content']:
+#                 print(r)
+#         # print(next(s))
+#     except StopIteration:
+#         break
+
 [splitHtml(x['content']) for x in loadData()]
 # splitHtml(loadData()[0]['content'])
+# splitHtml(loadData()[1]['content'])
+# splitHtml(loadData()[0]['content'])
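
The new splitHtml walks html.body.contents, buffers sibling nodes into content, and emits the buffered group each time the next caption line is reached. A minimal standalone sketch of the same grouping pattern follows; the sample HTML, the h2-based caption check, and the bare item.text caption are stand-ins for the crawler's isCaption and formatCaption, and, unlike the committed loop, the sketch also flushes the final chapter after the loop:

from bs4 import BeautifulSoup

RAW = '<h2>第1章</h2><p>aaa</p><p>bbb</p><h2>第2章</h2><p>ccc</p>'  # hypothetical sample input

def splitDemo(rawHtml: str) -> list:
    html = BeautifulSoup(rawHtml, 'lxml')
    result, caption, content = [], '', []
    for item in html.body.contents:
        if item.name != 'h2':           # stand-in for isCaption(item)
            content.append(item)        # buffer nodes under the current caption
            continue
        result.append({'caption': caption, 'content': content})   # emit the previous group
        content = []
        caption = item.text             # stand-in for formatCaption(item.text)
    result.append({'caption': caption, 'content': content})       # flush the last chapter (not done in the commit)
    return result

for chapter in splitDemo(RAW):
    print(chapter['caption'], [str(x) for x in chapter['content']])

As in the committed loop, the first emitted group carries an empty caption and no content when the document starts with a heading; callers that only want real chapters would skip it.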
