Browse Source

fix: missing second page

master
Dnomd343 2 years ago
parent
commit
a5c3d48178
  1. 14
      src/108shu.com/extract.py
  2. 15
      src/108shu.com/fetch.py

14
src/108shu.com/extract.py

@ -25,13 +25,23 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
} }
def combinePage(chapterId: str) -> dict:  # combine sub pages
    """Merge the two saved sub pages of one chapter into a single record.

    Reads `<chapterId>-1.html` and `<chapterId>-2.html` from the directory
    given as the second command line argument (`sys.argv[2]`), passes each
    file's text through `splitHtml`, and joins the results.

    Args:
        chapterId: Chapter identifier used as the file name prefix.

    Returns:
        A dict with two keys: `title` (taken from the first sub page) and
        `content` (first sub page content followed by the second, joined
        with `+`).

    Raises:
        OSError: If either sub page file cannot be opened.
    """
    pages = []
    for subPage in (1, 2):  # every chapter is stored as exactly two sub pages
        pagePath = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage))
        # Use a context manager so the handle is closed right after reading,
        # instead of relying on garbage collection to release it.
        with open(pagePath) as pageFile:
            pages.append(splitHtml(pageFile.read()))
    page_1, page_2 = pages

    # A title mismatch is only reported; the merged record is still returned
    # with the first sub page's title.
    if page_1['title'] != page_2['title']:
        logger.error('Title error -> `%s`' % page_1['title'])
    return {
        'title': page_1['title'],
        'content': page_1['content'] + page_2['content'],
    }
result = {} result = {}
catalog = json.loads(open(sys.argv[1]).read()) # load catalog catalog = json.loads(open(sys.argv[1]).read()) # load catalog
for chapterName, chapterId in catalog.items(): # traverse all chapters for chapterName, chapterId in catalog.items(): # traverse all chapters
logger.info('Analyse chapter `%s`' % chapterId) logger.info('Analyse chapter `%s`' % chapterId)
htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId) info = combinePage(chapterId)
info = splitHtml(open(htmlFile).read())
if chapterName != info['title']: if chapterName != info['title']:
logger.error('Title error -> %s' % info['title']) logger.error('Title error -> %s' % info['title'])
result[chapterName] = info['content'] result[chapterName] = info['content']

15
src/108shu.com/fetch.py

@ -44,10 +44,11 @@ def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
catalog = json.loads(open(sys.argv[1]).read()) # load catalog catalog = json.loads(open(sys.argv[1]).read()) # load catalog
for _, chapterId in catalog.items(): # traverse all chapters for _, chapterId in catalog.items(): # traverse all chapters
pageUrl = 'http://www.108shu.com/book/54247/%s.html' % chapterId for subPage in [1, 2]:
pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId) pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage)
if httpRequest(pageUrl, pageFile): # save html content pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage))
logger.info('Page request success -> %s' % pageUrl) if httpRequest(pageUrl, pageFile): # save html content
else: logger.info('Page request success -> %s' % pageUrl)
logger.error('Page request failed -> %s' % pageUrl) else:
time.sleep(1) # avoid being blocked by the server logger.error('Page request failed -> %s' % pageUrl)
time.sleep(1) # avoid being blocked by the server

Loading…
Cancel
Save