update: enhance crawler of `wxsy.net`

3 years ago · 2564342c05
5 changed files with 6 additions and 8 deletions
--- a/src/crawler/utils/init.py
+++ b/src/crawler/utils/init.py
@@ -2,7 +2,5 @@
 # -*- coding: utf-8 -*-
 from .logger import logger
 from .fetch import htmlSave
 from .fetch import htmlFetch
 from .fetch import httpRequest
--- a/src/crawler/utils/fetch.py
+++ b/src/crawler/utils/fetch.py
@@ -69,10 +69,10 @@ def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''):  # fetc
        logger.info('Html fetch proxy -> `%s`' % proxy)
    threadPool = ThreadPoolExecutor(max_workers = thread)
    threads = []
-    while True:
+    while True:  # traverse generator
        try:
-            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))
+            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))  # add task
        except StopIteration:
            break
-    futures.wait(threads, return_when = ALL_COMPLETED)
+    futures.wait(threads, return_when = ALL_COMPLETED)  # wait for all tasks to complete
    logger.info('Html fetch complete')
--- a/src/crawler/wxsy.net/crawler.sh
+++ b/src/crawler/wxsy.net/crawler.sh
@@ -4,9 +4,8 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
 [ -z ${PROXY} ] && PROXY=
 [ -z ${THREAD} ] && THREAD=1
 [ -z ${DELAY} ] && DELAY=1
 [ -z ${THREAD} ] && THREAD=1
 python3 catalog.py > ./data/catalog.json
 python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
--- a/src/crawler/wxsy.net/extract.py
+++ b/src/crawler/wxsy.net/extract.py
@@ -38,8 +38,8 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
    return info
 logger.warning('Extract info of `wxsy.net`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
    logger.info('Analyse chapter `%s`' % chapterId)
    with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj:
--- a/src/crawler/wxsy.net/release.py
+++ b/src/crawler/wxsy.net/release.py
@@ -65,4 +65,5 @@ def combine() -> dict:  # combine all chapters
    return result
 logger.warning('Release info of `wxsy.net`')
 print(json.dumps(combine()))