From a5c3d481783d3bfbc1938dfa800107efa61ef219 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Sat, 15 Oct 2022 20:41:07 +0800 Subject: [PATCH] fix: missing second page --- [Review notes, not part of the change: extract.py gains combinePage(chapterId), which reads '<chapterId>-1.html' and '<chapterId>-2.html' under sys.argv[2], passes each through splitHtml, logs 'Title error' when the two titles differ, and returns page 1's title with both contents concatenated; the chapter loop calls it in place of reading '<chapterId>.html'. fetch.py now requests '<chapterId>_1.html' and '<chapterId>_2.html' (underscore in the URL) and saves them as '<chapterId>-1.html' and '<chapterId>-2.html' (hyphen in the file name); time.sleep(1) is removed and re-added, so presumably it was re-indented to run after each sub-page request - confirm. NOTE(review): exactly two sub-pages are hard-coded in both scripts, and the files opened in combinePage are not explicitly closed - verify both are acceptable. The line breaks of this patch appear to have been collapsed, so this copy probably cannot be applied with git am as-is.] src/108shu.com/extract.py | 14 ++++++++++++-- src/108shu.com/fetch.py | 15 ++++++++------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/src/108shu.com/extract.py b/src/108shu.com/extract.py index 0b7eb18..e0766e3 100644 --- a/src/108shu.com/extract.py +++ b/src/108shu.com/extract.py @@ -25,13 +25,23 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content } +def combinePage(chapterId: str) -> dict: # combine sub pages + page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % chapterId)).read()) + page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % chapterId)).read()) + if page_1['title'] != page_2['title']: + logger.error('Title error -> `%s`' % page_1['title']) + return { + 'title': page_1['title'], + 'content': page_1['content'] + page_2['content'], + } + + result = {} catalog = json.loads(open(sys.argv[1]).read()) # load catalog for chapterName, chapterId in catalog.items(): # traverse all chapters logger.info('Analyse chapter `%s`' % chapterId) - htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId) - info = splitHtml(open(htmlFile).read()) + info = combinePage(chapterId) if chapterName != info['title']: logger.error('Title error -> %s' % info['title']) result[chapterName] = info['content'] diff --git a/src/108shu.com/fetch.py b/src/108shu.com/fetch.py index a8ade3c..3fe9f3c 100644 --- a/src/108shu.com/fetch.py +++ b/src/108shu.com/fetch.py @@ -44,10 +44,11 @@ def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content catalog = json.loads(open(sys.argv[1]).read()) # load catalog for _, chapterId in catalog.items(): # traverse all chapters - pageUrl = 'http://www.108shu.com/book/54247/%s.html' % chapterId - pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId) - if httpRequest(pageUrl, pageFile): # save html content 
- logger.info('Page request success -> %s' % pageUrl) - else: - logger.error('Page request failed -> %s' % pageUrl) - time.sleep(1) # avoid being blocked by the server + for subPage in [1, 2]: + pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage) + pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)) + if httpRequest(pageUrl, pageFile): # save html content + logger.info('Page request success -> %s' % pageUrl) + else: + logger.error('Page request failed -> %s' % pageUrl) + time.sleep(1) # avoid being blocked by the server