diff --git a/src/crawler/ixsw.la/catalog.py b/src/crawler/ixsw.la/catalog.py
index 3025972..861b97f 100644
--- a/src/crawler/ixsw.la/catalog.py
+++ b/src/crawler/ixsw.la/catalog.py
@@ -8,30 +8,17 @@ Fetch catalog and output as JSON format.
 """
 import re
+import sys
 import json
-import requests
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
 
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return str(request.content, encoding = 'utf-8')
-
-
-def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding='utf-8'), 'lxml')
     for item in html.select('dd'):
         item = item.select('a')[0]
         name = re.search(r'^(第\d+章)(.*)$', item.text)
@@ -43,6 +30,7 @@ def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
     return {x[0]: x[1] for x in catalog} # formatted output
 
 
+logger.warning('Fetch catalog of `ixsw.la`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.ixsw.la/ks82668/'))
 ))
diff --git a/src/crawler/ixsw.la/check.sh b/src/crawler/ixsw.la/check.sh
index ab2bc6b..5c16c22 100755
--- a/src/crawler/ixsw.la/check.sh
+++ b/src/crawler/ixsw.la/check.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
 
-diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(cat ./data/catalog.json | jq .)
-diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(cat ./data/xxrs.json | jq .)
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
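Note: the refactored catalog.py now pulls `logger` and `httpRequest` from a shared `utils` module that is not part of this diff. A minimal sketch of what that helper presumably looks like, derived from the `httpRequest()` removed above — with the return type changed to `bytes`, since the new `extractCatalog()` decodes the payload itself:

# Hypothetical sketch of the shared `utils` helper -- the real module is
# not included in this diff. Based on the httpRequest() removed from
# catalog.py, returning raw bytes instead of a decoded str.
import requests

userAgent = ( # default user agent
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)

def httpRequest(url: str) -> bytes: # fetch raw html content
    request = requests.get(url, headers = {
        'user-agent': userAgent, # with fake user-agent
        'accept-encoding': 'gzip, deflate', # allow content compress
    })
    if request.status_code not in range(200, 300): # http status code 2xx
        raise RuntimeError('Http request failed')
    return request.content # raw bytes; caller decodes (catalog.py uses utf-8)
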
diff --git a/src/crawler/ixsw.la/crawler.sh b/src/crawler/ixsw.la/crawler.sh
index 4a5836b..354e0a7 100755
--- a/src/crawler/ixsw.la/crawler.sh
+++ b/src/crawler/ixsw.la/crawler.sh
@@ -1,8 +1,11 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
 
 mkdir -p ./data/html/
 
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
diff --git a/src/crawler/ixsw.la/extract.py b/src/crawler/ixsw.la/extract.py
index 44cb0ef..723825a 100644
--- a/src/crawler/ixsw.la/extract.py
+++ b/src/crawler/ixsw.la/extract.py
@@ -11,7 +11,8 @@ import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 
 
@@ -28,8 +29,8 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
 
 
 result = {}
+logger.warning('Extract info of `ixsw.la`')
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-
 for chapterName, chapterId in catalog.items(): # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
@@ -37,5 +38,4 @@ for chapterName, chapterId in catalog.items(): # traverse all chapters
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
-
 print(json.dumps(result))
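Note: extract.py previously imported its logger from a sibling `logger.py`; it now shares the `utils` logger. A rough sketch of that setup, assuming the shared module logs to stderr — stdout must stay clean, since crawler.sh redirects it into the `./data/*.json` output files:

# Hypothetical sketch -- the shared `utils` logger is not included in this
# diff. Logging goes to stderr because catalog.py and extract.py print JSON
# on stdout, which crawler.sh redirects into ./data/*.json.
import sys
import logging

logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler(sys.stderr) # keep stdout clean for JSON
handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s %(message)s'))
logger.addHandler(handler)
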
diff --git a/src/crawler/ixsw.la/fetch.py b/src/crawler/ixsw.la/fetch.py
index 0c235ac..b1a8adb 100644
--- a/src/crawler/ixsw.la/fetch.py
+++ b/src/crawler/ixsw.la/fetch.py
@@ -4,50 +4,30 @@
 """
 Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 
 """
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read()) # load catalog
+    for _, chapterId in catalog.items(): # traverse all chapters
+        yield {
+            'url': 'https://www.ixsw.la/ks82668/%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
+
+
+logger.warning('Fetch html of `ixsw.la`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
 )
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'wb') as fileObj: # save html content
-            fileObj.write(request.content)
-            logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-
-
-catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-
-for _, chapterId in catalog.items(): # traverse all chapters
-    pageUrl = 'https://www.ixsw.la/ks82668/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1) # avoid being blocked by the server
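Note: the single-threaded download loop removed above moves into a shared `htmlFetch` helper that is not part of this diff. From the call site, it consumes an iterable of `{'url': ..., 'file': ...}` jobs and takes `proxy`, `thread`, and `delay` keyword arguments. A minimal sketch under those assumptions:

# Hypothetical sketch of utils.htmlFetch -- the real implementation is not
# included in this diff. Reuses the `logger` and `userAgent` names sketched
# above (same assumed utils module).
import time
import requests
from concurrent.futures import ThreadPoolExecutor

def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1) -> None:
    proxies = {'http': proxy, 'https': proxy} if proxy else None # PROXY may be empty

    def fetchPage(page: dict) -> None: # download one chapter into its file
        try:
            request = requests.get(page['url'], timeout = 30, proxies = proxies,
                headers = {'user-agent': userAgent}, # with fake user-agent
            )
            if request.status_code not in range(200, 300): # http status code 2xx
                logger.error('Page request failed -> %s' % page['url'])
                return
            with open(page['file'], 'wb') as fileObj: # save html content
                fileObj.write(request.content)
            logger.info('Page request success -> %s' % page['url'])
        except requests.RequestException:
            logger.error('Page request failed -> %s' % page['url'])
        time.sleep(delay) # avoid being blocked by the server

    with ThreadPoolExecutor(max_workers = thread) as executor: # `thread` workers
        executor.map(fetchPage, pages)

With crawler.sh's new defaults (`THREAD=1`, `DELAY=1`), behaviour matches the old loop; callers can export `THREAD`, `DELAY`, and `PROXY` to fan out, and an unset `PROXY` arrives as an empty `sys.argv[3]`, which the sketch treats as "no proxy".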