From ac588484f2d7865454d7125c240d193fc7184b40 Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 16 Oct 2022 16:42:50 +0800
Subject: [PATCH] update: enhance crawler process

---
 src/crawler/utils/fetch.py      | 44 ++++++++++++++++++++-------------
 src/crawler/wxsy.net/catalog.py |  2 +-
 src/crawler/wxsy.net/crawler.sh |  6 ++++-
 src/crawler/wxsy.net/extract.py |  3 ++-
 src/crawler/wxsy.net/fetch.py   | 11 ++++++---
 src/crawler/wxsy.net/release.py |  3 ++-
 6 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/src/crawler/utils/fetch.py b/src/crawler/utils/fetch.py
index 8c7097c..c3508c2 100644
--- a/src/crawler/utils/fetch.py
+++ b/src/crawler/utils/fetch.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import requests
+from retry import retry
 from .logger import logger
 from concurrent import futures
 from concurrent.futures import ALL_COMPLETED
@@ -14,27 +15,34 @@ userAgent = (  # default user agent
 )
 
 
-def httpRequest(url: str) -> bytes:  # fetch raw html content
-    request = requests.get(url, timeout = 30, headers = {  # timeout -> 30s
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
+@retry(tries = 10, delay = 2, logger = None)
+def httpRequest(url: str, proxy: str = '') -> bytes:  # fetch raw html content
+    proxyStr = '' if proxy == '' else ' (via %s)' % proxy
+    logger.debug('Http request `%s`%s' % (url, proxyStr))
+    proxy = None if proxy == '' else proxy  # empty string -> None
+    request = requests.get(
+        url, timeout = 10,  # timeout -> 10s
+        proxies = {  # request via socks or http proxy
+            'http': proxy,
+            'https': proxy
+        },
+        headers = {
+            'user-agent': userAgent,  # with fake user-agent
+            'accept-encoding': 'gzip, deflate',  # allow content compress
+        }
+    )
     if request.status_code not in range(200, 300):  # http status code 2xx
         raise RuntimeError('Http request failed')
     return request.content
 
 
-def htmlSave(url: str, file: str) -> bool:  # save html content
+def htmlSave(url: str, file: str, proxy: str = '') -> bool:  # save html content
     logger.debug('Html fetch `%s` -> `%s`' % (url, file))
     try:
-        content = httpRequest(url)  # http request
+        content = httpRequest(url, proxy)  # http request
     except:
-        logger.debug('Html fetch retry -> `%s`' % url)
-        try:
-            content = httpRequest(url)  # retry
-        except:
-            logger.debug('Html fetch failed -> `%s`' % url)
-            return False  # request failed
+        logger.debug('Html fetch failed -> `%s`' % url)
+        return False  # request failed
     logger.debug('Html fetch success -> `%s`' % url)
     try:
         with open(file, 'wb') as fileObj:  # save html content
@@ -46,22 +54,24 @@ def htmlSave(url: str, file: str) -> bool:  # save html content
     return True
 
 
-def pageFetch(info: dict, delay: float):  # fetch html content into file
+def pageFetch(info: dict, delay: float, proxy: str = ''):  # fetch html content into file
     logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
-    if htmlSave(info['url'], info['file']):  # save html content
+    if htmlSave(info['url'], info['file'], proxy):  # save html content
         logger.info('Page fetch success -> `%s`' % info['url'])
     else:
         logger.error('Page fetch failed -> `%s`' % info['url'])
     time.sleep(delay)
 
 
-def htmlFetch(page, thread: int = 1, delay: float = 0):
+def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''):  # fetch html with generator
     logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
+    if proxy != '':
+        logger.info('Html fetch proxy -> `%s`' % proxy)
     threadPool = ThreadPoolExecutor(max_workers = thread)
     threads = []
     while True:
         try:
-            threads.append(threadPool.submit(pageFetch, next(page), delay))
+            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))
         except StopIteration:
             break
     futures.wait(threads, return_when = ALL_COMPLETED)
diff --git a/src/crawler/wxsy.net/catalog.py b/src/crawler/wxsy.net/catalog.py
index 378f735..718a393 100644
--- a/src/crawler/wxsy.net/catalog.py
+++ b/src/crawler/wxsy.net/catalog.py
@@ -28,7 +28,7 @@ def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
     return {x[0]: x[1] for x in catalog}  # formatted output
 
 
-logger.info('Fetch catalog of `wxsy.net`')
+logger.warning('Fetch catalog of `wxsy.net`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
 ))
diff --git a/src/crawler/wxsy.net/crawler.sh b/src/crawler/wxsy.net/crawler.sh
index d7d1850..b4cb98e 100755
--- a/src/crawler/wxsy.net/crawler.sh
+++ b/src/crawler/wxsy.net/crawler.sh
@@ -4,7 +4,11 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
 
+[ -z ${PROXY} ] && PROXY=
+[ -z ${THREAD} ] && THREAD=1
+[ -z ${DELAY} ] && DELAY=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ ./data/json
 python3 release.py ./data/catalog.json ./data/json/ > ./data/xxrs.json
diff --git a/src/crawler/wxsy.net/extract.py b/src/crawler/wxsy.net/extract.py
index 773e8c7..f022e8c 100644
--- a/src/crawler/wxsy.net/extract.py
+++ b/src/crawler/wxsy.net/extract.py
@@ -11,7 +11,8 @@
 import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 
diff --git a/src/crawler/wxsy.net/fetch.py b/src/crawler/wxsy.net/fetch.py
index 87578e5..f7438fa 100644
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -4,13 +4,12 @@
 """
 Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
 import os
 import sys
 import json
-import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
@@ -25,4 +24,10 @@ def loadChapter():
         }
 
 
-htmlFetch(loadChapter(), 2)
+logger.warning('Fetch html of `wxsy.net`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
diff --git a/src/crawler/wxsy.net/release.py b/src/crawler/wxsy.net/release.py
index dd135d1..fc0a98d 100644
--- a/src/crawler/wxsy.net/release.py
+++ b/src/crawler/wxsy.net/release.py
@@ -10,7 +10,8 @@ Combine all chapters from json files.
 import os
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 
 
 def loadData(catalog: dict) -> dict:  # load data from json files
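
Usage note: a minimal sketch of driving the updated pipeline, assuming the
`retry` package is installed and, for a socks proxy, `requests[socks]`; the
proxy address, thread count, and delay below are example values only:

    # fetch through a local socks5 proxy, 4 worker threads, 0.5s delay per page
    PROXY=socks5://127.0.0.1:1080 THREAD=4 DELAY=0.5 ./crawler.sh

    # equivalent direct call of the fetch stage (argv order fixed by fetch.py)
    python3 fetch.py ./data/catalog.json ./data/html/ socks5://127.0.0.1:1080 4 0.5

Unset variables fall back to the defaults in crawler.sh: no proxy, a single
thread, and a 1-second delay between pages.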