diff --git a/src/crawler/108shu.com/catalog.py b/src/crawler/108shu.com/catalog.py
index 1e7c0cf..2d50718 100644
--- a/src/crawler/108shu.com/catalog.py
+++ b/src/crawler/108shu.com/catalog.py
@@ -8,31 +8,17 @@
 Fetch catalog and output as JSON format.
 """
 import re
+import sys
 import json
-import requests
-from logger import logger
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
 
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-
-
-def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
     items = html.select('div[class="section-box"]')[1]
     for item in items.select('a'):
         name = re.search(r'^(第\d+章)(.*)$', item.text)
@@ -45,9 +31,10 @@ def fetchCatalog(pageNum: int) -> dict: # fetch all catalog
     catalog = {}
     for pageId in range(1, pageNum + 1): # traverse all pages
         pageUrl = 'http://www.108shu.com/book/54247/index_%d.html' % pageId
-        logger.info('Page: %d -> `%s`' % (pageId, pageUrl))
+        logger.info('Catalog page -> %d' % pageId)
         catalog.update(extractCatalog(httpRequest(pageUrl)))
     return catalog
 
 
+logger.warning('Fetch catalog of `108shu.com`')
 print(json.dumps(fetchCatalog(45)))
diff --git a/src/crawler/108shu.com/check.sh b/src/crawler/108shu.com/check.sh
new file mode 100755
index 0000000..5c16c22
--- /dev/null
+++ b/src/crawler/108shu.com/check.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+cd "$(dirname "$0")"
+
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
diff --git a/src/crawler/108shu.com/crawler.sh b/src/crawler/108shu.com/crawler.sh
index 4f53aca..a30f770 100755
--- a/src/crawler/108shu.com/crawler.sh
+++ b/src/crawler/108shu.com/crawler.sh
@@ -1,17 +1,11 @@
 #!/usr/bin/env bash
 
-cd `dirname $0`
+cd "$(dirname "$0")"
 
 mkdir -p ./data/html/
-python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
-python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
-
-cd ./data/
-xz -k9 catalog.json
-tar cJf html.tar.xz html/
-xz -k9 xxrs.json
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
 
-mkdir -p ../archive/
-mv *.xz ../archive/
-cd ../
+python3 catalog.py "" > ./data/catalog.json
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
+python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
diff --git a/src/crawler/108shu.com/extract.py b/src/crawler/108shu.com/extract.py
index e0766e3..5061ce2 100644
--- a/src/crawler/108shu.com/extract.py
+++ b/src/crawler/108shu.com/extract.py
@@ -11,13 +11,13 @@
 import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 
 
 def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     html = BeautifulSoup(rawHtml, 'lxml')
-    content = [x.text.strip() for x in html.select('div[class="content"]')[0].select('p')]
     title = re.search(r'^(第\d+章)(.*)$', html.select('h1')[0].text)
     return {
         'title': '%s %s' % (title[1], title[2].strip()),
@@ -25,9 +25,9 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     }
 
 
-def combinePage(chapterId: str) -> dict: # combine sub pages
-    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % chapterId)).read())
-    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % chapterId)).read())
+def combinePage(pageId: str) -> dict: # combine sub pages
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % pageId)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % pageId)).read())
     if page_1['title'] != page_2['title']:
         logger.error('Title error -> `%s`' % page_1['title'])
     return {
@@ -37,13 +37,12 @@ def combinePage(chapterId: str) -> dict: # combine sub pages
 
 
 result = {}
+logger.warning('Extract info of `108shu.com`')
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-
 for chapterName, chapterId in catalog.items(): # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     info = combinePage(chapterId)
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
-
 print(json.dumps(result))
diff --git a/src/crawler/108shu.com/fetch.py b/src/crawler/108shu.com/fetch.py
index 3fe9f3c..7367d7f 100644
--- a/src/crawler/108shu.com/fetch.py
+++ b/src/crawler/108shu.com/fetch.py
@@ -4,51 +4,31 @@
 """
 Download raw html content as `.html` files.
 
-  USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+  USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read()) # load catalog
+    for _, chapterId in catalog.items(): # traverse all chapters
+        for subPage in [1, 2]: # two sub-pages in one chapter
+            yield {
+                'url': 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage),
+                'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),
             }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'w') as fileObj: # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-
-catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-for _, chapterId in catalog.items(): # traverse all chapters
-    for subPage in [1, 2]:
-        pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage)
-        pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage))
-        if httpRequest(pageUrl, pageFile): # save html content
-            logger.info('Page request success -> %s' % pageUrl)
-        else:
-            logger.error('Page request failed -> %s' % pageUrl)
-        time.sleep(1) # avoid being blocked by the server
+logger.warning('Fetch html of `108shu.com`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
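Note on the shared helpers: all three scripts now do sys.path.append('..') and import logger, httpRequest and htmlFetch from a utils package one directory up, which is not part of this diff. The sketch below is only a guess at its shape, inferred from the call sites — httpRequest(url) returning the raw bytes that extractCatalog decodes, and htmlFetch(pages, proxy=..., thread=..., delay=...) consuming a generator of {'url': ..., 'file': ...} jobs. The queue-plus-threads layout and every name beyond the three imported ones are assumptions, not the repository's actual code.

# utils.py - hypothetical sketch only; the real shared module is not shown in this diff
import time
import queue
import logging
import threading
import requests

logging.basicConfig(level = logging.INFO)
logger = logging.getLogger('crawler')

userAgent = ( # default user agent, same value the old per-site scripts carried
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)


def httpRequest(url: str, proxy: str = '') -> bytes: # fetch raw html content as bytes
    request = requests.get(url, timeout = 30, headers = {'user-agent': userAgent},
        proxies = {'http': proxy, 'https': proxy} if proxy else None)
    if request.status_code not in range(200, 300): # http status code 2xx
        raise RuntimeError('Http request failed')
    return request.content # raw bytes, decoded by the caller


def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1) -> None:
    jobs = queue.Queue()
    for page in pages: # each job -> {'url': ..., 'file': ...}
        jobs.put(page)

    def worker(): # one download loop per thread
        while True:
            try:
                page = jobs.get_nowait()
            except queue.Empty:
                return
            try:
                with open(page['file'], 'wb') as fileObj: # save html content
                    fileObj.write(httpRequest(page['url'], proxy))
                logger.info('Page fetch success -> %s' % page['url'])
            except Exception:
                logger.error('Page fetch failed -> %s' % page['url'])
            time.sleep(delay) # avoid being blocked by the server

    workers = [threading.Thread(target = worker) for _ in range(thread)]
    for x in workers:
        x.start()
    for x in workers:
        x.join()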
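With the environment-variable defaults added to crawler.sh, the pipeline can be tuned without editing the script: unset variables fall back to THREAD=1 and DELAY=1 (one worker, one second between requests), and an empty PROXY passes no proxy through to fetch.py. A possible invocation (the proxy address is only a placeholder):

    PROXY='http://127.0.0.1:1080' THREAD=4 DELAY=0.5 ./crawler.sh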