diff --git a/src/crawler/wxsy.net/fetch.py b/src/crawler/wxsy.net/fetch.py
index 699b542..58518f9 100644
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -14,14 +14,27 @@ import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
+from concurrent.futures import ThreadPoolExecutor
 
 
+def pageFetch(info: dict, delay: float):
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlFetch(info['url'], info['file']): # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)
+
+
+pages = []
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for _, chapterId in catalog.items(): # traverse all chapters
-    pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if htmlFetch(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> `%s`' % pageUrl)
-    else:
-        logger.error('Page request failed -> `%s`' % pageUrl)
-    time.sleep(1) # avoid being blocked by the server
+    pages.append({
+        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+    })
+
+
+with ThreadPoolExecutor(max_workers = 2) as pool:
+    for page in pages:
+        pool.submit(pageFetch, page, 5)
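
One thing to note about the new loop: `pool.submit()` is called without keeping the returned futures, so any exception raised inside `pageFetch` is silently dropped. Below is a minimal sketch (not part of the commit) of the same thread-pool pattern with the futures collected so worker errors are surfaced; the `fetchAll` helper is hypothetical, and it assumes the `pageFetch(info, delay)` signature from the diff above.

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def fetchAll(pages: list, delay: float = 5, workers: int = 2):
    """Run pageFetch over all pages, reporting any worker exception."""
    with ThreadPoolExecutor(max_workers=workers) as pool:
        # keep a handle on every submitted job so failures are not lost
        futures = {pool.submit(pageFetch, page, delay): page for page in pages}
        for future in as_completed(futures):
            page = futures[future]
            try:
                future.result()  # re-raises any exception from the worker thread
            except Exception as err:  # report the failed page and keep going
                logger.error('Worker crashed on `%s`: %s' % (page['url'], err))
```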