From 8d2f06018819d0e1c994fe42d47f16b8bea8e04a Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 16 Oct 2022 22:30:03 +0800
Subject: [PATCH] update: enhance crawler of `aidusk.com`

---
 src/crawler/aidusk.com/check.sh   |  6 +--
 src/crawler/aidusk.com/crawler.sh |  7 +++-
 src/crawler/aidusk.com/extract.py |  6 +--
 src/crawler/aidusk.com/fetch.py   | 62 +++++++++++--------------------
 4 files changed, 32 insertions(+), 49 deletions(-)

diff --git a/src/crawler/aidusk.com/check.sh b/src/crawler/aidusk.com/check.sh
index ab2bc6b..5c16c22 100755
--- a/src/crawler/aidusk.com/check.sh
+++ b/src/crawler/aidusk.com/check.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-cd `dirname $0`
+cd "$(dirname "$0")"
 
-diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(cat ./data/catalog.json | jq .)
-diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(cat ./data/xxrs.json | jq .)
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
diff --git a/src/crawler/aidusk.com/crawler.sh b/src/crawler/aidusk.com/crawler.sh
index 4a5836b..354e0a7 100755
--- a/src/crawler/aidusk.com/crawler.sh
+++ b/src/crawler/aidusk.com/crawler.sh
@@ -1,8 +1,11 @@
 #!/usr/bin/env bash
 
-cd `dirname $0`
+cd "$(dirname "$0")"
 mkdir -p ./data/html/
 
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
diff --git a/src/crawler/aidusk.com/extract.py b/src/crawler/aidusk.com/extract.py
index 4e6e1ae..35789df 100644
--- a/src/crawler/aidusk.com/extract.py
+++ b/src/crawler/aidusk.com/extract.py
@@ -11,7 +11,8 @@
 import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 
@@ -28,8 +29,8 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
 
 
 result = {}
+logger.warning('Extract info of `aidusk.com`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-
 for chapterName, chapterId in catalog.items():  # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
@@ -37,5 +38,4 @@ for chapterName, chapterId in catalog.items():  # traverse all chapters
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
-
 print(json.dumps(result))
diff --git a/src/crawler/aidusk.com/fetch.py b/src/crawler/aidusk.com/fetch.py
index 3651fc0..1b72289 100644
--- a/src/crawler/aidusk.com/fetch.py
+++ b/src/crawler/aidusk.com/fetch.py
@@ -4,50 +4,30 @@
 """
 Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-userAgent = (  # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        yield {
+            'url': 'http://www.aidusk.com/t/134659/%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
+
+
+logger.warning('Fetch html of `aidusk.com`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool:  # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30,  # timeout -> 30s
-            headers = {
-                'user-agent': userAgent,  # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300):  # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'wb') as fileObj:  # save html content
-            fileObj.write(request.content)
-            logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-
-
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-
-for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'http://www.aidusk.com/t/134659/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile):  # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1)  # avoid being blocked by the server
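
Note: the shared `utils` module that `extract.py` and `fetch.py` now import from the parent directory (`logger`, `htmlFetch`) is not included in this patch. Judging purely from the call site -- `htmlFetch` consumes an iterable of `{'url', 'file'}` dicts and takes `proxy`, `thread` and `delay` keyword arguments -- a minimal sketch of what such a helper could look like follows. The signature, the thread pool, the 30-second timeout and the proxy handling are all assumptions inferred from `fetch.py`, not the project's actual implementation:

    # hypothetical stand-in for ../utils.py -- not the code this patch ships
    import time
    import logging
    import requests
    from concurrent.futures import ThreadPoolExecutor

    logging.basicConfig(level = logging.INFO)
    logger = logging.getLogger('crawler')


    def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1.0) -> None:
        def fetchPage(page: dict) -> None:  # download one {'url', 'file'} item
            try:
                request = requests.get(page['url'], timeout = 30,  # 30s timeout assumed
                    proxies = {'http': proxy, 'https': proxy} if proxy else None,
                )
                if request.status_code not in range(200, 300):  # expect http 2xx
                    logger.error('Page request failed -> %s' % page['url'])
                else:
                    with open(page['file'], 'wb') as fileObj:  # save html content
                        fileObj.write(request.content)
                    logger.info('Page request success -> %s' % page['url'])
            except requests.RequestException:
                logger.error('Page request error -> %s' % page['url'])
            time.sleep(delay)  # per-worker pause, generalizing the old fixed time.sleep(1)

        with ThreadPoolExecutor(max_workers = thread) as executor:
            list(executor.map(fetchPage, pages))  # drain results so worker errors surface

With `crawler.sh` from this patch, these knobs are driven by environment variables: `DELAY` and `THREAD` fall back to 1 when unset, and an unset `PROXY` passes an empty string (treated above as "no proxy"), e.g.:

    PROXY=http://127.0.0.1:8080 THREAD=4 DELAY=0.5 ./crawler.sh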