From c455827851cebd614725143214aba19d653875a2 Mon Sep 17 00:00:00 2001
From: Dnomd343 <i@343.re>
Date: Tue, 18 Oct 2022 05:19:41 +0800
Subject: [PATCH] update: extract content

---
 src/crawler/zhihu.com/extract.py | 61 +++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 21 deletions(-)

diff --git a/src/crawler/zhihu.com/extract.py b/src/crawler/zhihu.com/extract.py
index 632089f..a6c6f4b 100644
--- a/src/crawler/zhihu.com/extract.py
+++ b/src/crawler/zhihu.com/extract.py
@@ -31,7 +31,7 @@ def loadData() -> list:
     return sorted(data, key = sortFunc)
 
 
-def splitHtml(rawHtml: str):
+def splitHtml(rawHtml: str) -> list:
     html = BeautifulSoup(rawHtml, 'lxml')
 
     def isCaption(obj: BeautifulSoup) -> bool:
@@ -61,31 +61,50 @@ def splitHtml(rawHtml: str):
             .replace('五', '5').replace('六', '6').replace('七', '7').replace('八', '8').replace('九', '9')
         return '第%s章 %s' % (numStr, match[2].strip())
 
+    result = []
+    caption = ''
+    content = []
     for item in html.body.contents:
-        # print(item)
-        # continue
-
-        if isCaption(item):
-            caption = formatCaption(item.text)
-            print(caption)
-
-            # caption = item.text
-            # match = re.search(r'^第(\d+)章', caption)
-            # if match is not None:
-            #     caption = match[1]
-            # elif re.search(r'^第(\S+)章', caption) is not None:
-            #     caption = caption.replace('一', '1')
-                # print('ok')
-
-            # print(caption)
-
-            # print(item)
+        if not isCaption(item):
+            content.append(item)
+            continue
+        result.append({
+            'caption': caption,
+            'content': content,
+        })
+        content = []
+        caption = formatCaption(item.text)
+    result.append({'caption': caption, 'content': content})  # flush the last section: the loop only emits one when the NEXT caption appears
+    return result
+
+    # for item in html.body.contents:
+    #     if not isCaption(item):
+    #         content.append(item)
+    #         continue
+    #     yield {
+    #         'caption': formatCaption(item.text),
+    #         'content': content
+    #     }
+    #     content.clear()
 
 
 logger.warning('Extract info of `zhihu.com`')
 sys.argv.append('./data/content.json')
 
+dat = loadData()  # list returned by loadData(), already sorted there via sortFunc
+for r in splitHtml(dat[0]['content']):  # only the first record's content is split here
+    print(r['caption'])  # each entry is a dict with 'caption' and 'content' keys
+
+# while True:  # traverse generator
+#     try:
+#         d = next(s)
+#         print(d['caption'])
+#         if d['caption'] in ['第1章 此女一生福名扬', '第2章 有人']:
+#             for r in d['content']:
+#                 print(r)
+#         # print(next(s))
+#     except StopIteration:
+#         break
+
 [splitHtml(x['content']) for x in loadData()]
 # splitHtml(loadData()[0]['content'])
-# splitHtml(loadData()[1]['content'])
-# splitHtml(loadData()[0]['content'])