
fix: missing second page
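Chapters on 108shu.com are split across two sub-pages, but only the first page was being fetched and parsed, so the second half of every chapter was silently dropped. This commit downloads both sub-pages per chapter in fetch.py and adds a combinePage helper in extract.py that concatenates their content.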

master
Dnomd343 committed 2 years ago · commit a5c3d48178
Changed files:
src/108shu.com/extract.py (14 changes)
src/108shu.com/fetch.py (15 changes)

src/108shu.com/extract.py (14 changes)

@@ -25,13 +25,23 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     }
 
+def combinePage(chapterId: str) -> dict: # combine sub pages
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % chapterId)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % chapterId)).read())
+    if page_1['title'] != page_2['title']:
+        logger.error('Title error -> `%s`' % page_1['title'])
+    return {
+        'title': page_1['title'],
+        'content': page_1['content'] + page_2['content'],
+    }
+
 result = {}
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for chapterName, chapterId in catalog.items(): # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
-    htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    info = splitHtml(open(htmlFile).read())
+    info = combinePage(chapterId)
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
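For context, the new combinePage relies on the splitHtml helper named in the hunk header; its body sits above this diff. A minimal sketch of such a helper, assuming a hypothetical page layout with the chapter title in an h1 tag and the body in a content div (the real markup of 108shu.com is not visible in this diff):

    import re

    def splitHtml(rawHtml: str) -> dict: # hypothetical sketch; the repo's real parser is not shown here
        # assumed layout: chapter title inside <h1>, chapter body inside <div id="content">
        title = re.search(r'<h1>(.*?)</h1>', rawHtml, re.S)
        content = re.search(r'<div id="content">(.*?)</div>', rawHtml, re.S)
        return {
            'title': title.group(1).strip() if title else '',
            'content': content.group(1).strip() if content else '',
        }

Note that combinePage also logs an error when the two halves carry different titles, which catches pagination glitches where `<id>-2.html` belongs to a different chapter.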

src/108shu.com/fetch.py (15 changes)

@@ -44,10 +44,11 @@ def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for _, chapterId in catalog.items(): # traverse all chapters
-    pageUrl = 'http://www.108shu.com/book/54247/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1) # avoid being blocked by the server
+    for subPage in [1, 2]:
+        pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage)
+        pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage))
+        if httpRequest(pageUrl, pageFile): # save html content
+            logger.info('Page request success -> %s' % pageUrl)
+        else:
+            logger.error('Page request failed -> %s' % pageUrl)
+        time.sleep(1) # avoid being blocked by the server
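The loop depends on httpRequest, whose signature appears only in the hunk header. A sketch consistent with that signature, assuming a plain requests call with no retry logic (the actual implementation is outside this diff):

    import requests

    def httpRequest(fileUrl: str, fileName: str) -> bool: # sketch only; assumes the requests library
        # download one page and persist it; the boolean return drives the logging above
        try:
            response = requests.get(fileUrl, timeout=30)
            response.raise_for_status()
            with open(fileName, 'w', encoding='utf-8') as fileObj:
                fileObj.write(response.text)
            return True
        except (requests.RequestException, OSError):
            return False

Note the two naming schemes: the site serves sub-pages as `<id>_<n>.html`, while the fetcher stores them locally as `<id>-<n>.html`, which is exactly the pattern combinePage reads back in extract.py.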
