|
|
@ -3,6 +3,9 @@ |
|
|
|
|
|
|
|
import requests |
|
|
|
from .logger import logger |
|
|
|
from concurrent import futures |
|
|
|
from concurrent.futures import ALL_COMPLETED |
|
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
|
|
|
|
userAgent = ( # default user agent |
|
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' |
|
|
@ -21,7 +24,7 @@ def httpRequest(url: str) -> bytes: # fetch raw html content |
|
|
|
return request.content |
|
|
|
|
|
|
|
|
|
|
|
def htmlFetch(url: str, file: str) -> bool: # save html content |
|
|
|
def htmlSave(url: str, file: str) -> bool: # save html content |
|
|
|
logger.debug('Html fetch `%s` -> `%s`' % (url, file)) |
|
|
|
try: |
|
|
|
content = httpRequest(url) # http request |
|
|
@ -41,3 +44,25 @@ def htmlFetch(url: str, file: str) -> bool: # save html content |
|
|
|
return False # save failed |
|
|
|
logger.debug('Html save success -> `%s`' % file) |
|
|
|
return True |
|
|
|
|
|
|
|
|
|
|
|
def pageFetch(info: dict, delay: float):  # fetch html content into file
    """Fetch one page described by `info` into its target file, then pause.

    `info` is expected to carry 'url' and 'file' keys; success/failure is
    reported through the module logger, and `delay` seconds are slept after
    every attempt to throttle request rate.
    """
    url, file = info['url'], info['file']
    logger.debug('Page fetch: `%s` -> `%s`' % (url, file))
    saved = htmlSave(url, file)  # save html content to disk
    if not saved:
        logger.error('Page fetch failed -> `%s`' % url)
    else:
        logger.info('Page fetch success -> `%s`' % url)
    time.sleep(delay)  # throttle between fetches
|
|
|
|
|
|
|
|
|
|
|
def htmlFetch(page, thread: int = 1, delay: float = 0):
    """Drain the `page` iterator, fetching every page via a thread pool.

    Args:
        page: iterator yielding info dicts consumed by pageFetch
              (presumably with 'url' and 'file' keys — see pageFetch).
        thread: number of worker threads in the pool.
        delay: per-task sleep, passed through to pageFetch.
    """
    logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
    # Fix: the original never shut the executor down, leaking worker threads.
    # The `with` block guarantees shutdown even if submission raises.
    with ThreadPoolExecutor(max_workers = thread) as threadPool:
        threads = []
        while True:
            try:
                threads.append(threadPool.submit(pageFetch, next(page), delay))
            except StopIteration:  # iterator drained -> everything submitted
                break
        # Block until every submitted fetch has finished before reporting.
        futures.wait(threads, return_when = ALL_COMPLETED)
    logger.info('Html fetch complete')
|
|
|