feat: thread pool demo

3 years ago · fb215b3398
1 changed file with 20 additions and 7 deletions
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -14,14 +14,27 @@ import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
 from concurrent.futures import ThreadPoolExecutor
 def pageFetch(info: dict, delay: float):
    """Fetch a single page through `htmlFetch`, log the result, then pause.

    :param info: mapping holding a 'url' entry (address to request) and a
        'file' entry (passed to `htmlFetch` as its second argument).
    :param delay: number of seconds to sleep once `htmlFetch` has returned.
    """
    url, target = info['url'], info['file']
    logger.debug('Page fetch: `%s` -> `%s`' % (url, target))
    saved = htmlFetch(url, target)  # presumably stores the html at `target` -- confirm in utils
    if not saved:
        logger.error('Page fetch failed -> `%s`' % url)
    else:
        logger.info('Page fetch success -> `%s`' % url)
    time.sleep(delay)  # hold this call for `delay` seconds before returning
 pages = []
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
+    pages.append({
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
+        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
-    if htmlFetch(pageUrl, pageFile):  # save html content
+        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
-        logger.info('Page request success -> `%s`' % pageUrl)
+    })
-    else:
+
-        logger.error('Page request failed -> `%s`' % pageUrl)
+
-    time.sleep(1)  # avoid being blocked by the server
+with ThreadPoolExecutor(max_workers = 2) as pool:
    for page in pages:
        pool.submit(pageFetch, page, 5)