update: extract content

3 years ago · c455827851
1 changed file with 40 additions and 21 deletions
--- a/src/crawler/zhihu.com/extract.py
+++ b/src/crawler/zhihu.com/extract.py
@@ -31,7 +31,7 @@ def loadData() -> list:
    return sorted(data, key = sortFunc)


-def splitHtml(rawHtml: str):
+def splitHtml(rawHtml: str) -> list:
    html = BeautifulSoup(rawHtml, 'lxml')

    def isCaption(obj: BeautifulSoup) -> bool:
@@ -61,31 +61,50 @@ def splitHtml(rawHtml: str):
            .replace('五', '5').replace('六', '6').replace('七', '7').replace('八', '8').replace('九', '9')
        return '第%s章 %s' % (numStr, match[2].strip())

+    result = []
+    caption = ''
+    content = []
    for item in html.body.contents:
-        # print(item)
-        # continue
-
-        if isCaption(item):
-            caption = formatCaption(item.text)
-            print(caption)
-
-            # caption = item.text
-            # match = re.search(r'^第(\d+)章', caption)
-            # if match is not None:
-            #     caption = match[1]
-            # elif re.search(r'^第(\S+)章', caption) is not None:
-            #     caption = caption.replace('一', '1')
-                # print('ok')
-
-            # print(caption)
-
-            # print(item)
+        if not isCaption(item):
+            content.append(item)
+            continue
+        result.append({
+            'caption': caption,
+            'content': content,
+        })
+        content = []
+        caption = formatCaption(item.text)
+    return result
+
+
+    # for item in html.body.contents:
+    #     if not isCaption(item):
+    #         content.append(item)
+    #         continue
+    #     yield {
+    #         'caption': formatCaption(item.text),
+    #         'content': content
+    #     }
+    #     content.clear()


 logger.warning('Extract info of `zhihu.com`')
 sys.argv.append('./data/content.json')

+dat = loadData()
+for r in splitHtml(dat[0]['content']):
+    print(r['caption'])
+
+# while True:  # traverse generator
+#     try:
+#         d = next(s)
+#         print(d['caption'])
+#         if d['caption'] in ['第1章 此女一生福名扬', '第2章 有人']:
+#             for r in d['content']:
+#                 print(r)
+#         # print(next(s))
+#     except StopIteration:
+#         break
+
 [splitHtml(x['content']) for x in loadData()]
 # splitHtml(loadData()[0]['content'])
-# splitHtml(loadData()[1]['content'])
-# splitHtml(loadData()[0]['content'])