From 3698f50c972e0d59375105ef939a78c2f177f99f Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 16 Oct 2022 23:59:52 +0800
Subject: [PATCH] update: enhance crawler of `xswang.com`

---
 src/crawler/xswang.com/catalog.py | 29 +++++----------
 src/crawler/xswang.com/check.sh   |  6 +++
 src/crawler/xswang.com/crawler.sh |  7 +++-
 src/crawler/xswang.com/extract.py |  6 +--
 src/crawler/xswang.com/fetch.py   | 62 +++++++++++--------------------
 5 files changed, 44 insertions(+), 66 deletions(-)
 create mode 100755 src/crawler/xswang.com/check.sh

diff --git a/src/crawler/xswang.com/catalog.py b/src/crawler/xswang.com/catalog.py
index 2e40854..4d40581 100644
--- a/src/crawler/xswang.com/catalog.py
+++ b/src/crawler/xswang.com/catalog.py
@@ -8,39 +8,28 @@ Fetch catalog and output as JSON format.
 """
 
 import re
+import sys
 import json
-import requests
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
 
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-
-
-def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
     for item in [x.select('a')[0] for x in html.select('dd')]:
+        title = re.search(r'^(第\d+章)(.*)', item.text.strip())
         pageId = item.attrs['href'].replace('/book/56718/', '').replace('.html', '')
-        catalog[item.text.strip()] = pageId
+        catalog['%s %s' % (title[1], title[2].strip())] = pageId
     catalog = sorted(catalog.items(), key = lambda d: int(
         re.search(r'^第(\d+)章', d[0])[1] # sort by chapter
     ))
     return {x[0]: x[1] for x in catalog} # formatted output
 
 
+logger.warning('Fetch catalog of `xswang.com`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.xswang.com/book/56718/'))
 ))
diff --git a/src/crawler/xswang.com/check.sh b/src/crawler/xswang.com/check.sh
new file mode 100755
index 0000000..5c16c22
--- /dev/null
+++ b/src/crawler/xswang.com/check.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+cd "$(dirname "$0")"
+
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
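Note: the shared `utils` package that `catalog.py` now imports ships outside this patch, so its `httpRequest` is not shown here. Judging from the inline helper being removed and from `extractCatalog` now taking `bytes`, it is presumably the same downloader returning raw bytes instead of decoded text; a minimal sketch under that assumption, not the actual `utils` code:

    import requests

    userAgent = ( # default user agent, same as the removed inline copy
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
    )

    def httpRequest(url: str) -> bytes: # fetch raw html content
        request = requests.get(url, headers = {
            'user-agent': userAgent, # with fake user-agent
            'accept-encoding': 'gzip, deflate', # allow content compress
        })
        if request.status_code not in range(200, 300): # http status code 2xx
            raise RuntimeError('Http request failed')
        return request.content # raw bytes, decoded by the caller

Returning bytes rather than `request.text` avoids a double decode: `catalog.py` now decodes once with an explicit utf-8 instead of trusting the encoding guessed by `requests`.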
diff --git a/src/crawler/xswang.com/crawler.sh b/src/crawler/xswang.com/crawler.sh
index 4a5836b..354e0a7 100755
--- a/src/crawler/xswang.com/crawler.sh
+++ b/src/crawler/xswang.com/crawler.sh
@@ -1,8 +1,11 @@
 #!/usr/bin/env bash
 
-cd `dirname $0`
+cd "$(dirname "$0")"
 mkdir -p ./data/html/
 
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
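Note: with the two defaults above, `crawler.sh` still runs with no environment set, while a tuned run only passes the knobs it cares about, e.g. `PROXY=http://127.0.0.1:1080 THREAD=4 DELAY=0.5 ./crawler.sh` (example values). Since `"${PROXY}"` is quoted, an unset proxy reaches `fetch.py` as an empty third argument, so the shared `htmlFetch` helper presumably treats an empty proxy string as a direct connection.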
diff --git a/src/crawler/xswang.com/extract.py b/src/crawler/xswang.com/extract.py
index b1f3793..1bd0184 100644
--- a/src/crawler/xswang.com/extract.py
+++ b/src/crawler/xswang.com/extract.py
@@ -11,7 +11,8 @@ import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 
 
@@ -32,8 +33,8 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
 
 result = {}
 
+logger.warning('Extract info of `xswang.com`')
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-
 for chapterName, chapterId in catalog.items(): # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
@@ -41,5 +42,4 @@ for chapterName, chapterId in catalog.items(): # traverse all chapters
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
-
 print(json.dumps(result))
diff --git a/src/crawler/xswang.com/fetch.py b/src/crawler/xswang.com/fetch.py
index 9aa2773..788a0de 100644
--- a/src/crawler/xswang.com/fetch.py
+++ b/src/crawler/xswang.com/fetch.py
@@ -4,50 +4,30 @@
 """
     Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read()) # load catalog
+    for _, chapterId in catalog.items(): # traverse all chapters
+        yield {
+            'url': 'https://www.xswang.com/book/56718/%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
+
+
+logger.warning('Fetch html of `xswang.com`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
 )
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'wb') as fileObj: # save html content
-            fileObj.write(request.content)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-
-
-catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-
-for _, chapterId in catalog.items(): # traverse all chapters
-    pageUrl = 'https://www.xswang.com/book/56718/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1) # avoid being blocked by the server
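Note: `htmlFetch` also lives in the shared `utils` package and is not part of this diff. From the call site above it consumes an iterable of jobs shaped like `{'url': ..., 'file': ...}` plus `proxy`, `thread` and `delay` keywords; a minimal compatible sketch, with every name and behavior inferred from the call site rather than taken from the real module:

    import time
    import requests
    from concurrent.futures import ThreadPoolExecutor

    def htmlFetch(jobs, proxy: str = '', thread: int = 1, delay: float = 1) -> None:
        proxies = {'http': proxy, 'https': proxy} if proxy else None # '' -> direct
        def fetchPage(job): # download one chapter and save it to disk
            request = requests.get(job['url'], timeout = 30, proxies = proxies)
            if request.status_code not in range(200, 300): # http status code 2xx
                return # missing page will surface in the extract step
            with open(job['file'], 'wb') as fileObj: # save html content
                fileObj.write(request.content)
            time.sleep(delay) # keep the per-page politeness pause
        with ThreadPoolExecutor(max_workers = thread) as executor:
            executor.map(fetchPage, jobs)

Compared with the removed sequential loop, a worker pool of `thread` downloads runs in parallel, while `delay` preserves the old rate limit per worker to avoid being blocked by the server.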