From ac588484f2d7865454d7125c240d193fc7184b40 Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 16 Oct 2022 16:42:50 +0800
Subject: [PATCH] update: enhance crawler process

---
 src/crawler/utils/fetch.py      | 44 ++++++++++++++++++++-------------
 src/crawler/wxsy.net/catalog.py |  2 +-
 src/crawler/wxsy.net/crawler.sh |  6 ++++-
 src/crawler/wxsy.net/extract.py |  3 ++-
 src/crawler/wxsy.net/fetch.py   | 11 ++++++---
 src/crawler/wxsy.net/release.py |  3 ++-
 6 files changed, 45 insertions(+), 24 deletions(-)

diff --git a/src/crawler/utils/fetch.py b/src/crawler/utils/fetch.py
index 8c7097c..c3508c2 100644
--- a/src/crawler/utils/fetch.py
+++ b/src/crawler/utils/fetch.py
@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 
 import requests
+from retry import retry
 from .logger import logger
 from concurrent import futures
 from concurrent.futures import ALL_COMPLETED
@@ -14,27 +15,34 @@ userAgent = (  # default user agent
 )
 
 
-def httpRequest(url: str) -> bytes:  # fetch raw html content
-    request = requests.get(url, timeout = 30, headers = {  # timeout -> 30s
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
+@retry(tries = 10, delay = 2, logger = None)
+def httpRequest(url: str, proxy: str = '') -> bytes:  # fetch raw html content
+    proxyStr = '' if proxy == '' else ' (via %s)' % proxy
+    logger.debug('Http request `%s`%s' % (url, proxyStr))
+    proxy = None if proxy == '' else proxy  # empty string -> None
+    request = requests.get(
+        url, timeout = 10,  # timeout -> 10s
+        proxies = {  # request via socks or http proxy
+            'http': proxy,
+            'https': proxy
+        },
+        headers = {
+            'user-agent': userAgent,  # with fake user-agent
+            'accept-encoding': 'gzip, deflate',  # allow content compress
+        }
+    )
     if request.status_code not in range(200, 300):  # http status code 2xx
         raise RuntimeError('Http request failed')
     return request.content
 
 
-def htmlSave(url: str, file: str) -> bool:  # save html content
+def htmlSave(url: str, file: str, proxy: str = '') -> bool:  # save html content
     logger.debug('Html fetch `%s` -> `%s`' % (url, file))
     try:
-        content = httpRequest(url)  # http request
+        content = httpRequest(url, proxy)  # http request
     except:
-        logger.debug('Html fetch retry -> `%s`' % url)
-        try:
-            content = httpRequest(url)  # retry
-        except:
-            logger.debug('Html fetch failed -> `%s`' % url)
-            return False  # request failed
+        logger.debug('Html fetch failed -> `%s`' % url)
+        return False  # request failed
     logger.debug('Html fetch success -> `%s`' % url)
     try:
         with open(file, 'wb') as fileObj:  # save html content
@@ -46,22 +54,24 @@ def htmlSave(url: str, file: str) -> bool:  # save html content
     return True
 
 
-def pageFetch(info: dict, delay: float):  # fetch html content into file
+def pageFetch(info: dict, delay: float, proxy: str = ''):  # fetch html content into file
     logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
-    if htmlSave(info['url'], info['file']):  # save html content
+    if htmlSave(info['url'], info['file'], proxy):  # save html content
         logger.info('Page fetch success -> `%s`' % info['url'])
     else:
         logger.error('Page fetch failed -> `%s`' % info['url'])
     time.sleep(delay)
 
 
-def htmlFetch(page, thread: int = 1, delay: float = 0):
+def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''):  # fetch html with generator
     logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
+    if proxy != '':
+        logger.info('Html fetch proxy -> `%s`' % proxy)
     threadPool = ThreadPoolExecutor(max_workers = thread)
     threads = []
     while True:
         try:
-            threads.append(threadPool.submit(pageFetch, next(page), delay))
+            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))
         except StopIteration:
             break
     futures.wait(threads, return_when = ALL_COMPLETED)
diff --git a/src/crawler/wxsy.net/catalog.py b/src/crawler/wxsy.net/catalog.py
index 378f735..718a393 100644
--- a/src/crawler/wxsy.net/catalog.py
+++ b/src/crawler/wxsy.net/catalog.py
@@ -28,7 +28,7 @@ def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
     return {x[0]: x[1] for x in catalog}  # formatted output
 
 
-logger.info('Fetch catalog of `wxsy.net`')
+logger.warning('Fetch catalog of `wxsy.net`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
 ))
diff --git a/src/crawler/wxsy.net/crawler.sh b/src/crawler/wxsy.net/crawler.sh
index d7d1850..b4cb98e 100755
--- a/src/crawler/wxsy.net/crawler.sh
+++ b/src/crawler/wxsy.net/crawler.sh
@@ -4,7 +4,11 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
 
+[ -z ${PROXY} ] && PROXY=
+[ -z ${THREAD} ] && THREAD=1
+[ -z ${DELAY} ] && DELAY=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ ./data/json
 python3 release.py ./data/catalog.json ./data/json/ > ./data/xxrs.json
diff --git a/src/crawler/wxsy.net/extract.py b/src/crawler/wxsy.net/extract.py
index 773e8c7..f022e8c 100644
--- a/src/crawler/wxsy.net/extract.py
+++ b/src/crawler/wxsy.net/extract.py
@@ -11,7 +11,8 @@
 import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 
diff --git a/src/crawler/wxsy.net/fetch.py b/src/crawler/wxsy.net/fetch.py
index 87578e5..f7438fa 100644
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -4,13 +4,12 @@
 """
 Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
 import os
 import sys
 import json
-import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
@@ -25,4 +24,10 @@ def loadChapter():
         }
 
 
-htmlFetch(loadChapter(), 2)
+logger.warning('Fetch html of `wxsy.net`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
diff --git a/src/crawler/wxsy.net/release.py b/src/crawler/wxsy.net/release.py
index dd135d1..fc0a98d 100644
--- a/src/crawler/wxsy.net/release.py
+++ b/src/crawler/wxsy.net/release.py
@@ -10,7 +10,8 @@ Combine all chapters from json files.
 import os
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 
 
 def loadData(catalog: dict) -> dict:  # load data from json files
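
Usage note: a minimal sketch of driving the updated pipeline, assuming the
`retry` package is installed and, for a socks proxy, `requests[socks]`; the
proxy address, thread count, and delay below are example values only:

    # fetch through a local socks5 proxy, 4 worker threads, 0.5s delay per page
    PROXY=socks5://127.0.0.1:1080 THREAD=4 DELAY=0.5 ./crawler.sh

    # equivalent direct call of the fetch stage (argv order fixed by fetch.py)
    python3 fetch.py ./data/catalog.json ./data/html/ socks5://127.0.0.1:1080 4 0.5

Unset variables fall back to the defaults in crawler.sh: no proxy, a single
thread, and a 1-second delay between pages.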