From 926508d7043396a3026dcf19432dfbd6b7a1e968 Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 16 Oct 2022 13:53:08 +0800
Subject: [PATCH] feat: multi thread fetch

---
 src/crawler/utils/__init__.py |  2 ++
 src/crawler/utils/fetch.py    | 28 +++++++++++++++++++++++++++-
 src/crawler/wxsy.net/fetch.py | 28 ++++++++--------------------
 3 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/src/crawler/utils/__init__.py b/src/crawler/utils/__init__.py
index d7d7f4c..1e0743b 100644
--- a/src/crawler/utils/__init__.py
+++ b/src/crawler/utils/__init__.py
@@ -2,5 +2,7 @@
 # -*- coding: utf-8 -*-
 
 from .logger import logger
+
+from .fetch import htmlSave
 from .fetch import htmlFetch
 from .fetch import httpRequest
diff --git a/src/crawler/utils/fetch.py b/src/crawler/utils/fetch.py
index 79885c7..8c7097c 100644
--- a/src/crawler/utils/fetch.py
+++ b/src/crawler/utils/fetch.py
@@ -3,6 +3,10 @@
 
+import time
 import requests
 from .logger import logger
+from concurrent import futures
+from concurrent.futures import ALL_COMPLETED
+from concurrent.futures import ThreadPoolExecutor
 
 userAgent = (  # default user agent
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
@@ -21,7 +25,7 @@ def httpRequest(url: str) -> bytes:  # fetch raw html content
     return request.content
 
 
-def htmlFetch(url: str, file: str) -> bool:  # save html content
+def htmlSave(url: str, file: str) -> bool:  # save html content
     logger.debug('Html fetch `%s` -> `%s`' % (url, file))
     try:
         content = httpRequest(url)  # http request
@@ -41,3 +45,25 @@ def htmlFetch(url: str, file: str) -> bool:  # save html content
         return False  # save failed
     logger.debug('Html save success -> `%s`' % file)
     return True
+
+
+def pageFetch(info: dict, delay: float):  # fetch html content into file
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlSave(info['url'], info['file']):  # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)  # wait before this worker takes the next page
+
+
+def htmlFetch(page, thread: int = 1, delay: float = 0):  # fetch pages from generator
+    logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
+    threadPool = ThreadPoolExecutor(max_workers = thread)
+    threads = []
+    while True:
+        try:
+            threads.append(threadPool.submit(pageFetch, next(page), delay))  # submit task
+        except StopIteration:  # page generator exhausted
+            break
+    futures.wait(threads, return_when = ALL_COMPLETED)  # wait until all tasks finish
+    logger.info('Html fetch complete')
diff --git a/src/crawler/wxsy.net/fetch.py b/src/crawler/wxsy.net/fetch.py
index 58518f9..87578e5 100644
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -14,27 +14,15 @@ import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
-from concurrent.futures import ThreadPoolExecutor
 
 
-def pageFetch(info: dict, delay: float):
-    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
-    if htmlFetch(info['url'], info['file']):  # save html content
-        logger.info('Page fetch success -> `%s`' % info['url'])
-    else:
-        logger.error('Page fetch failed -> `%s`' % info['url'])
-    time.sleep(delay)
+def loadChapter():  # yield url and file info of each chapter
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        yield {
+            'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
 
 
-pages = []
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-for _, chapterId in catalog.items():  # traverse all chapters
-    pages.append({
-        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
-        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
-    })
-
-
-with ThreadPoolExecutor(max_workers = 2) as pool:
-    for page in pages:
-        pool.submit(pageFetch, page, 5)
+htmlFetch(loadChapter(), 2)
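
--
A minimal sketch of driving the new generator-based htmlFetch entry point:
any generator yielding dicts with 'url' and 'file' keys works, as loadChapter
does above. The examplePages generator, the example.com URLs and the /tmp
paths below are placeholders, not part of the patch.

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-

    import sys

    sys.path.append('..')  # same import pattern as src/crawler/wxsy.net/fetch.py
    from utils import htmlFetch


    def examplePages():  # hypothetical task generator
        for pageId in range(1, 4):
            yield {
                'url': 'https://example.com/page_%d.html' % pageId,  # page to fetch
                'file': '/tmp/page_%d.html' % pageId,  # save location
            }


    htmlFetch(examplePages(), thread = 2, delay = 1)  # two workers, 1s pause per page

Since each worker sleeps `delay` seconds after finishing a page, the request
rate is capped at roughly thread / delay pages per second when fetches are fast.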