From 2564342c0591a8b07d6f1c4f416ce3b5f7fdf99d Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Sun, 16 Oct 2022 20:34:45 +0800 Subject: [PATCH] update: enhance crawler of `wxsy.net` --- src/crawler/utils/__init__.py | 2 -- src/crawler/utils/fetch.py | 6 +++--- src/crawler/wxsy.net/crawler.sh | 3 +-- src/crawler/wxsy.net/extract.py | 2 +- src/crawler/wxsy.net/release.py | 1 + 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/crawler/utils/__init__.py b/src/crawler/utils/__init__.py index 1e0743b..d7d7f4c 100644 --- a/src/crawler/utils/__init__.py +++ b/src/crawler/utils/__init__.py @@ -2,7 +2,5 @@ # -*- coding: utf-8 -*- from .logger import logger - -from .fetch import htmlSave from .fetch import htmlFetch from .fetch import httpRequest diff --git a/src/crawler/utils/fetch.py b/src/crawler/utils/fetch.py index c3508c2..755d397 100644 --- a/src/crawler/utils/fetch.py +++ b/src/crawler/utils/fetch.py @@ -69,10 +69,10 @@ def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''): # fetc logger.info('Html fetch proxy -> `%s`' % proxy) threadPool = ThreadPoolExecutor(max_workers = thread) threads = [] - while True: + while True: # traverse generator try: - threads.append(threadPool.submit(pageFetch, next(page), delay, proxy)) + threads.append(threadPool.submit(pageFetch, next(page), delay, proxy)) # add task except StopIteration: break - futures.wait(threads, return_when = ALL_COMPLETED) + futures.wait(threads, return_when = ALL_COMPLETED) # wait all task complete logger.info('Html fetch complete') diff --git a/src/crawler/wxsy.net/crawler.sh b/src/crawler/wxsy.net/crawler.sh index b4cb98e..f701f12 100755 --- a/src/crawler/wxsy.net/crawler.sh +++ b/src/crawler/wxsy.net/crawler.sh @@ -4,9 +4,8 @@ cd `dirname $0` mkdir -p ./data/html/ mkdir -p ./data/json/ -[ -z ${PROXY} ] && PROXY= -[ -z ${THREAD} ] && THREAD=1 [ -z ${DELAY} ] && DELAY=1 +[ -z ${THREAD} ] && THREAD=1 python3 catalog.py > ./data/catalog.json python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY} diff --git a/src/crawler/wxsy.net/extract.py b/src/crawler/wxsy.net/extract.py index f022e8c..c91bc9e 100644 --- a/src/crawler/wxsy.net/extract.py +++ b/src/crawler/wxsy.net/extract.py @@ -38,8 +38,8 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content return info +logger.warning('Extract info of `wxsy.net`') catalog = json.loads(open(sys.argv[1]).read()) # load catalog - for _, chapterId in catalog.items(): # traverse all chapters logger.info('Analyse chapter `%s`' % chapterId) with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj: diff --git a/src/crawler/wxsy.net/release.py b/src/crawler/wxsy.net/release.py index fc0a98d..100ec7a 100644 --- a/src/crawler/wxsy.net/release.py +++ b/src/crawler/wxsy.net/release.py @@ -65,4 +65,5 @@ def combine() -> dict: # combine all chapters return result +logger.warning('Release info of `wxsy.net`') print(json.dumps(combine()))