feat: thread pool demo

4 years ago · fb215b3398
1 changed file with 20 additions and 7 deletions
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -14,14 +14,27 @@ import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
+from concurrent.futures import ThreadPoolExecutor


+def pageFetch(info: dict, delay: float):
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlFetch(info['url'], info['file']):  # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)
+
+
+pages = []
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if htmlFetch(pageUrl, pageFile):  # save html content
-        logger.info('Page request success -> `%s`' % pageUrl)
-    else:
-        logger.error('Page request failed -> `%s`' % pageUrl)
-    time.sleep(1)  # avoid being blocked by the server
+    pages.append({
+        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+    })
+
+
+with ThreadPoolExecutor(max_workers = 2) as pool:
+    for page in pages:
+        pool.submit(pageFetch, page, 5)