From 2564342c0591a8b07d6f1c4f416ce3b5f7fdf99d Mon Sep 17 00:00:00 2001
From: Dnomd343 <i@343.re>
Date: Sun, 16 Oct 2022 20:34:45 +0800
Subject: [PATCH] update: enhance crawler of `wxsy.net`

---
 src/crawler/utils/__init__.py   | 2 --
 src/crawler/utils/fetch.py      | 6 +++---
 src/crawler/wxsy.net/crawler.sh | 3 +--
 src/crawler/wxsy.net/extract.py | 2 +-
 src/crawler/wxsy.net/release.py | 1 +
 5 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/crawler/utils/__init__.py b/src/crawler/utils/__init__.py
index 1e0743b..d7d7f4c 100644
--- a/src/crawler/utils/__init__.py
+++ b/src/crawler/utils/__init__.py
@@ -2,7 +2,5 @@
 # -*- coding: utf-8 -*-
 
 from .logger import logger
-
-from .fetch import htmlSave
 from .fetch import htmlFetch
 from .fetch import httpRequest
diff --git a/src/crawler/utils/fetch.py b/src/crawler/utils/fetch.py
index c3508c2..755d397 100644
--- a/src/crawler/utils/fetch.py
+++ b/src/crawler/utils/fetch.py
@@ -69,10 +69,10 @@ def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''):  # fetc
         logger.info('Html fetch proxy -> `%s`' % proxy)
     threadPool = ThreadPoolExecutor(max_workers = thread)
     threads = []
-    while True:
+    while True:  # traverse generator
         try:
-            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))
+            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))  # add task
         except StopIteration:
             break
-    futures.wait(threads, return_when = ALL_COMPLETED)
+    futures.wait(threads, return_when = ALL_COMPLETED)  # wait for all tasks to complete
     logger.info('Html fetch complete')
diff --git a/src/crawler/wxsy.net/crawler.sh b/src/crawler/wxsy.net/crawler.sh
index b4cb98e..f701f12 100755
--- a/src/crawler/wxsy.net/crawler.sh
+++ b/src/crawler/wxsy.net/crawler.sh
@@ -4,9 +4,8 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
 
-[ -z ${PROXY} ] && PROXY=
-[ -z ${THREAD} ] && THREAD=1
 [ -z ${DELAY} ] && DELAY=1
+[ -z ${THREAD} ] && THREAD=1
 
 python3 catalog.py > ./data/catalog.json
 python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
diff --git a/src/crawler/wxsy.net/extract.py b/src/crawler/wxsy.net/extract.py
index f022e8c..c91bc9e 100644
--- a/src/crawler/wxsy.net/extract.py
+++ b/src/crawler/wxsy.net/extract.py
@@ -38,8 +38,8 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
     return info
 
 
+logger.warning('Extract info of `wxsy.net`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-
 for _, chapterId in catalog.items():  # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj:
diff --git a/src/crawler/wxsy.net/release.py b/src/crawler/wxsy.net/release.py
index fc0a98d..100ec7a 100644
--- a/src/crawler/wxsy.net/release.py
+++ b/src/crawler/wxsy.net/release.py
@@ -65,4 +65,5 @@ def combine() -> dict:  # combine all chapters
     return result
 
 
+logger.warning('Release info of `wxsy.net`')
 print(json.dumps(combine()))