
fix: missing second page
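Chapters on 108shu.com are split across two sub-pages, but only the first page was being fetched and parsed, so the second half of every chapter was silently dropped. This commit downloads both sub-pages per chapter in fetch.py and adds a combinePage helper in extract.py that concatenates their content.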

master
Dnomd343 committed 2 years ago · commit a5c3d48178
Changed files:
src/108shu.com/extract.py (14 changes)
src/108shu.com/fetch.py (15 changes)

src/108shu.com/extract.py (14 changes)

@@ -25,13 +25,23 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     }
 
+def combinePage(chapterId: str) -> dict: # combine sub pages
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % chapterId)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % chapterId)).read())
+    if page_1['title'] != page_2['title']:
+        logger.error('Title error -> `%s`' % page_1['title'])
+    return {
+        'title': page_1['title'],
+        'content': page_1['content'] + page_2['content'],
+    }
+
 result = {}
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for chapterName, chapterId in catalog.items(): # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
-    htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    info = splitHtml(open(htmlFile).read())
+    info = combinePage(chapterId)
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
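For context, the new combinePage relies on the splitHtml helper named in the hunk header; its body sits above this diff. A minimal sketch of such a helper, assuming a hypothetical page layout with the chapter title in an h1 tag and the body in a content div (the real markup of 108shu.com is not visible in this diff):

    import re

    def splitHtml(rawHtml: str) -> dict: # hypothetical sketch; the repo's real parser is not shown here
        # assumed layout: chapter title inside <h1>, chapter body inside <div id="content">
        title = re.search(r'<h1>(.*?)</h1>', rawHtml, re.S)
        content = re.search(r'<div id="content">(.*?)</div>', rawHtml, re.S)
        return {
            'title': title.group(1).strip() if title else '',
            'content': content.group(1).strip() if content else '',
        }

Note that combinePage also logs an error when the two halves carry different titles, which catches pagination glitches where `<id>-2.html` belongs to a different chapter.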

src/108shu.com/fetch.py (15 changes)

@@ -44,10 +44,11 @@ def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for _, chapterId in catalog.items(): # traverse all chapters
-    pageUrl = 'http://www.108shu.com/book/54247/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1) # avoid being blocked by the server
+    for subPage in [1, 2]:
+        pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage)
+        pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage))
+        if httpRequest(pageUrl, pageFile): # save html content
+            logger.info('Page request success -> %s' % pageUrl)
+        else:
+            logger.error('Page request failed -> %s' % pageUrl)
+        time.sleep(1) # avoid being blocked by the server
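The loop depends on httpRequest, whose signature appears only in the hunk header. A sketch consistent with that signature, assuming a plain requests call with no retry logic (the actual implementation is outside this diff):

    import requests

    def httpRequest(fileUrl: str, fileName: str) -> bool: # sketch only; assumes the requests library
        # download one page and persist it; the boolean return drives the logging above
        try:
            response = requests.get(fileUrl, timeout=30)
            response.raise_for_status()
            with open(fileName, 'w', encoding='utf-8') as fileObj:
                fileObj.write(response.text)
            return True
        except (requests.RequestException, OSError):
            return False

Note the two naming schemes: the site serves sub-pages as `<id>_<n>.html`, while the fetcher stores them locally as `<id>-<n>.html`, which is exactly the pattern combinePage reads back in extract.py.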
