update: enhance crawler process

Dnomd343 committed 2 years ago · branch master · commit ac588484f2
Changed files:

  1. src/crawler/utils/fetch.py (36 changed lines)
  2. src/crawler/wxsy.net/catalog.py (2 changed lines)
  3. src/crawler/wxsy.net/crawler.sh (6 changed lines)
  4. src/crawler/wxsy.net/extract.py (3 changed lines)
  5. src/crawler/wxsy.net/fetch.py (11 changed lines)
  6. src/crawler/wxsy.net/release.py (3 changed lines)

src/crawler/utils/fetch.py (36 changed lines)

@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 import requests
+from retry import retry
 from .logger import logger
 from concurrent import futures
 from concurrent.futures import ALL_COMPLETED
@@ -14,24 +15,31 @@ userAgent = (  # default user agent
 )
 
 
-def httpRequest(url: str) -> bytes:  # fetch raw html content
-    request = requests.get(url, timeout = 30, headers = {  # timeout -> 30s
+@retry(tries = 10, delay = 2, logger = None)
+def httpRequest(url: str, proxy: str = '') -> bytes:  # fetch raw html content
+    proxyStr = '' if proxy == '' else ' (via %s)' % proxy
+    logger.debug('Http request `%s`%s' % (url, proxyStr))
+    proxy = None if proxy == '' else proxy  # empty string -> None
+    request = requests.get(
+        url, timeout = 10,  # timeout -> 10s
+        proxies = {  # request via socks or http proxy
+            'http': proxy,
+            'https': proxy
+        },
+        headers = {
             'user-agent': userAgent,  # with fake user-agent
             'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
+        }
+    )
     if request.status_code not in range(200, 300):  # http status code 2xx
         raise RuntimeError('Http request failed')
     return request.content
 
 
-def htmlSave(url: str, file: str) -> bool:  # save html content
+def htmlSave(url: str, file: str, proxy: str = '') -> bool:  # save html content
     logger.debug('Html fetch `%s` -> `%s`' % (url, file))
     try:
-        content = httpRequest(url)  # http request
-    except:
-        logger.debug('Html fetch retry -> `%s`' % url)
-        try:
-            content = httpRequest(url)  # retry
+        content = httpRequest(url, proxy)  # http request
     except:
         logger.debug('Html fetch failed -> `%s`' % url)
         return False  # request failed
@@ -46,22 +54,24 @@ def htmlSave(url: str, file: str) -> bool:  # save html content
     return True
 
 
-def pageFetch(info: dict, delay: float):  # fetch html content into file
+def pageFetch(info: dict, delay: float, proxy: str = ''):  # fetch html content into file
     logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
-    if htmlSave(info['url'], info['file']):  # save html content
+    if htmlSave(info['url'], info['file'], proxy):  # save html content
         logger.info('Page fetch success -> `%s`' % info['url'])
     else:
         logger.error('Page fetch failed -> `%s`' % info['url'])
     time.sleep(delay)
 
 
-def htmlFetch(page, thread: int = 1, delay: float = 0):
+def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''):  # fetch html with generator
     logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
+    if proxy != '':
+        logger.info('Html fetch proxy -> `%s`' % proxy)
     threadPool = ThreadPoolExecutor(max_workers = thread)
     threads = []
     while True:
         try:
-            threads.append(threadPool.submit(pageFetch, next(page), delay))
+            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))
         except StopIteration:
             break
     futures.wait(threads, return_when = ALL_COMPLETED)

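With this change the hand-rolled retry in `htmlSave` is replaced by the `retry` decorator on `httpRequest` (up to 10 tries, 2 s apart), and an optional proxy string travels down the chain `htmlFetch` -> `pageFetch` -> `htmlSave` -> `httpRequest`, where it is handed to `requests.get` via the `proxies` mapping (an empty string means a direct connection). A rough usage sketch of the updated `htmlFetch` follows; names and values not taken from the diff are assumptions:

# Minimal sketch of driving the new interface; the chapter URLs, output paths
# and proxy address are hypothetical, and it assumes the script sits next to
# the `utils` package (as src/crawler/wxsy.net/fetch.py does).
import os
import sys

sys.path.append('..')
from utils import htmlFetch


def demoPages():  # generator of {'url': ..., 'file': ...} dicts consumed by pageFetch
    for pageId in range(1, 4):
        yield {
            'url': 'https://example.com/chapter/%d.html' % pageId,  # hypothetical URL
            'file': './data/html/%d.html' % pageId,  # hypothetical output file
        }


os.makedirs('./data/html/', exist_ok = True)  # make sure the output dir exists
htmlFetch(
    demoPages(),
    thread = 2,  # two concurrent workers
    delay = 1,  # sleep 1s after each page
    proxy = 'socks5://127.0.0.1:1080',  # hypothetical proxy; '' keeps a direct connection
)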
src/crawler/wxsy.net/catalog.py (2 changed lines)

@@ -28,7 +28,7 @@ def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
     return {x[0]: x[1] for x in catalog}  # formatted output
 
 
-logger.info('Fetch catalog of `wxsy.net`')
+logger.warning('Fetch catalog of `wxsy.net`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
 ))

src/crawler/wxsy.net/crawler.sh (6 changed lines)

@@ -4,7 +4,11 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
 
+[ -z ${PROXY} ] && PROXY=
+[ -z ${THREAD} ] && THREAD=1
+[ -z ${DELAY} ] && DELAY=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ ./data/json
 python3 release.py ./data/catalog.json ./data/json/ > ./data/xxrs.json

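The wrapper script now takes its tuning from the environment instead of hard-coding it: PROXY defaults to empty (direct connection), THREAD to 1 worker and DELAY to 1 second, and all three are forwarded to fetch.py as positional arguments. For example, something like PROXY=socks5://127.0.0.1:1080 THREAD=4 DELAY=0.5 ./crawler.sh would route requests through a local proxy with four workers; the proxy address and values here are only an illustration.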
src/crawler/wxsy.net/extract.py (3 changed lines)

@@ -11,7 +11,8 @@ import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup

src/crawler/wxsy.net/fetch.py (11 changed lines)

@@ -4,13 +4,12 @@
 """
 Download raw html content as `.html` files.
 
-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
 import os
 import sys
 import json
-import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
@@ -25,4 +24,10 @@ def loadChapter():
         }
 
 
-htmlFetch(loadChapter(), 2)
+logger.warning('Fetch html of `wxsy.net`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)

src/crawler/wxsy.net/release.py (3 changed lines)

@@ -10,7 +10,8 @@ Combine all chapters from json files.
 import os
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 
 
 def loadData(catalog: dict) -> dict:  # load data from json files
