From 8d2f06018819d0e1c994fe42d47f16b8bea8e04a Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 16 Oct 2022 22:30:03 +0800
Subject: [PATCH] update: enhance crawler of `aidusk.com`

---
 src/crawler/aidusk.com/check.sh   |  6 +--
 src/crawler/aidusk.com/crawler.sh |  7 +++-
 src/crawler/aidusk.com/extract.py |  6 +--
 src/crawler/aidusk.com/fetch.py   | 62 +++++++++++--------------------
 4 files changed, 32 insertions(+), 49 deletions(-)

diff --git a/src/crawler/aidusk.com/check.sh b/src/crawler/aidusk.com/check.sh
index ab2bc6b..5c16c22 100755
--- a/src/crawler/aidusk.com/check.sh
+++ b/src/crawler/aidusk.com/check.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-cd `dirname $0`
+cd "$(dirname "$0")"
 
-diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(cat ./data/catalog.json | jq .)
-diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(cat ./data/xxrs.json | jq .)
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
diff --git a/src/crawler/aidusk.com/crawler.sh b/src/crawler/aidusk.com/crawler.sh
index 4a5836b..354e0a7 100755
--- a/src/crawler/aidusk.com/crawler.sh
+++ b/src/crawler/aidusk.com/crawler.sh
@@ -1,8 +1,11 @@
 #!/usr/bin/env bash
 
-cd `dirname $0`
+cd "$(dirname "$0")"
 mkdir -p ./data/html/
 
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
diff --git a/src/crawler/aidusk.com/extract.py b/src/crawler/aidusk.com/extract.py
index 4e6e1ae..35789df 100644
--- a/src/crawler/aidusk.com/extract.py
+++ b/src/crawler/aidusk.com/extract.py
@@ -11,7 +11,8 @@
 import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 
@@ -28,8 +29,8 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
 
 
 result = {}
+logger.warning('Extract info of `aidusk.com`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-
 for chapterName, chapterId in catalog.items():  # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
@@ -37,5 +38,4 @@ for chapterName, chapterId in catalog.items():  # traverse all chapters
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
-
 print(json.dumps(result))
diff --git a/src/crawler/aidusk.com/fetch.py b/src/crawler/aidusk.com/fetch.py
index 3651fc0..1b72289 100644
--- a/src/crawler/aidusk.com/fetch.py
+++ b/src/crawler/aidusk.com/fetch.py
@@ -4,50 +4,30 @@
 """
 Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-userAgent = (  # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        yield {
+            'url': 'http://www.aidusk.com/t/134659/%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
+
+
+logger.warning('Fetch html of `aidusk.com`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool:  # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30,  # timeout -> 30s
-            headers = {
-                'user-agent': userAgent,  # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300):  # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'wb') as fileObj:  # save html content
-            fileObj.write(request.content)
-            logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-
-
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-
-for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'http://www.aidusk.com/t/134659/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile):  # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1)  # avoid being blocked by the server
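
Note: the shared `utils` module that `extract.py` and `fetch.py` now import from the parent directory (`logger`, `htmlFetch`) is not included in this patch. Judging purely from the call site -- `htmlFetch` consumes an iterable of `{'url', 'file'}` dicts and takes `proxy`, `thread` and `delay` keyword arguments -- a minimal sketch of what such a helper could look like follows. The signature, the thread pool, the 30-second timeout and the proxy handling are all assumptions inferred from `fetch.py`, not the project's actual implementation:

    # hypothetical stand-in for ../utils.py -- not the code this patch ships
    import time
    import logging
    import requests
    from concurrent.futures import ThreadPoolExecutor

    logging.basicConfig(level = logging.INFO)
    logger = logging.getLogger('crawler')


    def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1.0) -> None:
        def fetchPage(page: dict) -> None:  # download one {'url', 'file'} item
            try:
                request = requests.get(page['url'], timeout = 30,  # 30s timeout assumed
                    proxies = {'http': proxy, 'https': proxy} if proxy else None,
                )
                if request.status_code not in range(200, 300):  # expect http 2xx
                    logger.error('Page request failed -> %s' % page['url'])
                else:
                    with open(page['file'], 'wb') as fileObj:  # save html content
                        fileObj.write(request.content)
                    logger.info('Page request success -> %s' % page['url'])
            except requests.RequestException:
                logger.error('Page request error -> %s' % page['url'])
            time.sleep(delay)  # per-worker pause, generalizing the old fixed time.sleep(1)

        with ThreadPoolExecutor(max_workers = thread) as executor:
            list(executor.map(fetchPage, pages))  # drain results so worker errors surface

With `crawler.sh` from this patch, these knobs are driven by environment variables: `DELAY` and `THREAD` fall back to 1 when unset, and an unset `PROXY` passes an empty string (treated above as "no proxy"), e.g.:

    PROXY=http://127.0.0.1:8080 THREAD=4 DELAY=0.5 ./crawler.sh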