diff --git a/src/crawler/utils/__init__.py b/src/crawler/utils/__init__.py
index d7d7f4c..1e0743b 100644
--- a/src/crawler/utils/__init__.py
+++ b/src/crawler/utils/__init__.py
@@ -2,5 +2,7 @@
 # -*- coding: utf-8 -*-
 
 from .logger import logger
+
+from .fetch import htmlSave
 from .fetch import htmlFetch
 from .fetch import httpRequest
diff --git a/src/crawler/utils/fetch.py b/src/crawler/utils/fetch.py
index 79885c7..8c7097c 100644
--- a/src/crawler/utils/fetch.py
+++ b/src/crawler/utils/fetch.py
@@ -3,6 +3,10 @@
 
 import requests
+import time
 from .logger import logger
+from concurrent import futures
+from concurrent.futures import ALL_COMPLETED
+from concurrent.futures import ThreadPoolExecutor
 
 userAgent = (  # default user agent
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
@@ -21,7 +25,7 @@ def httpRequest(url: str) -> bytes:  # fetch raw html content
     return request.content
 
 
-def htmlFetch(url: str, file: str) -> bool:  # save html content
+def htmlSave(url: str, file: str) -> bool:  # save html content
     logger.debug('Html fetch `%s` -> `%s`' % (url, file))
     try:
         content = httpRequest(url)  # http request
@@ -41,3 +45,25 @@ def htmlFetch(url: str, file: str) -> bool:  # save html content
         return False  # save failed
     logger.debug('Html save success -> `%s`' % file)
     return True
+
+
+def pageFetch(info: dict, delay: float):  # fetch html content into file
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlSave(info['url'], info['file']):  # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)
+
+
+def htmlFetch(page, thread: int = 1, delay: float = 0):  # fetch pages with thread pool
+    logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
+    threadPool = ThreadPoolExecutor(max_workers = thread)
+    threads = []
+    while True:
+        try:
+            threads.append(threadPool.submit(pageFetch, next(page), delay))
+        except StopIteration:  # page generator exhausted
+            break
+    futures.wait(threads, return_when = ALL_COMPLETED)  # wait for all pages
+    logger.info('Html fetch complete')
diff --git a/src/crawler/wxsy.net/fetch.py b/src/crawler/wxsy.net/fetch.py
index 58518f9..87578e5 100644
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -14,27 +14,15 @@
 import time
 
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
-from concurrent.futures import ThreadPoolExecutor
 
 
-def pageFetch(info: dict, delay: float):
-    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
-    if htmlFetch(info['url'], info['file']):  # save html content
-        logger.info('Page fetch success -> `%s`' % info['url'])
-    else:
-        logger.error('Page fetch failed -> `%s`' % info['url'])
-    time.sleep(delay)
+def loadChapter():  # yield url and target file of each chapter
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        yield {
+            'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
 
 
-pages = []
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-for _, chapterId in catalog.items():  # traverse all chapters
-    pages.append({
-        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
-        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
-    })
-
-
-with ThreadPoolExecutor(max_workers = 2) as pool:
-    for page in pages:
-        pool.submit(pageFetch, page, 5)
+htmlFetch(loadChapter(), 2)
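
Usage sketch (not part of the patch above): with this refactor a site-specific crawler no longer manages its own ThreadPoolExecutor; it only yields {'url', 'file'} dicts and passes the generator to utils.htmlFetch, which owns the thread pool and the per-page delay. Below is a minimal hypothetical script in the same layout as src/crawler/wxsy.net/fetch.py, assuming it sits in a sibling directory of the utils package; the example.org URL, page ids and ./html output directory are placeholders, not part of the project.

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-

    import os
    import sys

    sys.path.append('..')  # reach the shared utils package, as wxsy.net/fetch.py does
    from utils import htmlFetch


    def loadPages():  # yield download target of each page
        for pageId in ['0001', '0002', '0003']:  # placeholder page ids
            yield {
                'url': 'https://example.org/read_%s.html' % pageId,
                'file': os.path.join('./html', '%s.html' % pageId),
            }


    os.makedirs('./html', exist_ok = True)  # placeholder output directory must exist before fetching
    htmlFetch(loadPages(), thread = 2, delay = 5)  # 2 worker threads, 5s pause after each page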