commit 2564342c05 (master)
Author: Dnomd343
Date: 2 years ago

update: enhance crawler of `wxsy.net`
5 changed files:

  src/crawler/utils/__init__.py    (2 changes)
  src/crawler/utils/fetch.py       (6 changes)
  src/crawler/wxsy.net/crawler.sh  (3 changes)
  src/crawler/wxsy.net/extract.py  (2 changes)
  src/crawler/wxsy.net/release.py  (1 change)

src/crawler/utils/__init__.py

@@ -2,7 +2,5 @@
 # -*- coding: utf-8 -*-
 from .logger import logger
-from .fetch import htmlSave
 from .fetch import htmlFetch
 from .fetch import httpRequest
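With htmlSave gone from the re-export list, the package surface is just logger, htmlFetch and httpRequest. A hypothetical caller under the new surface (the import path is an assumption based on the src/crawler/utils/ layout, not something this diff shows):

# Hypothetical caller; the package path is assumed from the repository layout.
from crawler.utils import logger, htmlFetch, httpRequest

logger.info('utils loaded')  # htmlSave is no longer re-exported at package level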

src/crawler/utils/fetch.py

@@ -69,10 +69,10 @@ def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''): # fetc
     logger.info('Html fetch proxy -> `%s`' % proxy)
     threadPool = ThreadPoolExecutor(max_workers = thread)
     threads = []
-    while True:
+    while True:  # traverse generator
         try:
-            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))
+            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))  # add task
         except StopIteration:
             break
-    futures.wait(threads, return_when = ALL_COMPLETED)
+    futures.wait(threads, return_when = ALL_COMPLETED)  # wait all task complete
     logger.info('Html fetch complete')
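The changed lines here only add comments, but they mark the core pattern of htmlFetch: drain a page generator into a ThreadPoolExecutor, then block until every submitted task finishes. A minimal self-contained sketch of that pattern, assuming a stand-in pageFetch body (the project's real worker is not part of this hunk):

import time
from concurrent import futures
from concurrent.futures import ThreadPoolExecutor, ALL_COMPLETED

def pageFetch(page: str, delay: float, proxy: str) -> None:  # stand-in worker
    time.sleep(delay)  # simulate the polite per-request delay
    print('fetched %s via `%s`' % (page, proxy or 'direct'))

def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''):
    threadPool = ThreadPoolExecutor(max_workers = thread)
    threads = []
    while True:  # traverse generator
        try:
            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))  # add task
        except StopIteration:  # generator exhausted -> stop submitting
            break
    futures.wait(threads, return_when = ALL_COMPLETED)  # wait all task complete

htmlFetch(iter(['page-1', 'page-2', 'page-3']), thread = 2, delay = 0.1)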

src/crawler/wxsy.net/crawler.sh

@@ -4,9 +4,8 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
-[ -z ${PROXY} ] && PROXY=
-[ -z ${THREAD} ] && THREAD=1
 [ -z ${DELAY} ] && DELAY=1
+[ -z ${THREAD} ] && THREAD=1
 python3 catalog.py > ./data/catalog.json
 python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
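Dropping `[ -z ${PROXY} ] && PROXY=` is safe because the later call quotes the expansion: an unset PROXY still yields an empty third argument instead of shifting the positional parameters. A hedged sketch of how fetch.py presumably consumes the five positional arguments (the real parsing code is not part of this diff):

import sys

# Hypothetical argument handling mirroring the crawler.sh invocation:
#   python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
catalogFile = sys.argv[1]   # path of catalog json
htmlDir = sys.argv[2]       # output folder for raw html
proxy = sys.argv[3]         # empty string when PROXY is unset
thread = int(sys.argv[4])   # defaults to 1 via crawler.sh
delay = float(sys.argv[5])  # defaults to 1 via crawler.sh
print(catalogFile, htmlDir, proxy or 'direct', thread, delay)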

src/crawler/wxsy.net/extract.py

@@ -38,8 +38,8 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     return info
-
+logger.warning('Extract info of `wxsy.net`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj:
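The new warning is a stage marker emitted once before the per-chapter loop. Below is a sketch of the loop's shape with a stub splitHtml; the sys.argv[2] html directory is an assumption, since only sys.argv[1] and sys.argv[3] are visible in this hunk:

import os
import sys
import json

def splitHtml(rawHtml: str) -> dict:  # stub for the real html parser
    return {'length': len(rawHtml)}

catalog = json.loads(open(sys.argv[1]).read())  # load catalog
for _, chapterId in catalog.items():  # traverse all chapters
    rawHtml = open(os.path.join(sys.argv[2], '%s.html' % chapterId)).read()  # assumed layout
    with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj:
        fileObj.write(json.dumps(splitHtml(rawHtml)))  # one json file per chapter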

src/crawler/wxsy.net/release.py

@@ -65,4 +65,5 @@ def combine() -> dict: # combine all chapters
     return result
 
+logger.warning('Release info of `wxsy.net`')
 print(json.dumps(combine()))
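combine() itself lies outside this diff; the script's visible contract is only that it returns a dict which is dumped to stdout. A hypothetical implementation matching the signature in the hunk header, assuming extract.py writes its per-chapter files into ./data/json/ as created by crawler.sh:

import os
import json

def combine() -> dict:  # combine all chapters
    result = {}
    jsonDir = './data/json/'  # assumed: where extract.py writes per-chapter json
    for fileName in sorted(os.listdir(jsonDir)):
        chapterId = fileName.rsplit('.', 1)[0]
        result[chapterId] = json.loads(open(os.path.join(jsonDir, fileName)).read())
    return result

print(json.dumps(combine()))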
