From 20dbf4c3d14f02c131f34c6c1983ff2ef5f1a0b3 Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Sun, 16 Oct 2022 21:04:36 +0800
Subject: [PATCH] update: enhance crawler of `m.wxsy.net`

Move the per-script HTTP code into the shared `utils` module
(`httpRequest` and `htmlFetch`), pass the PROXY, THREAD and DELAY
settings through `crawler.sh`, rename the `contents` field to
`content`, and build file paths with `os.path.join`.

---
 src/crawler/m.wxsy.net/catalog.py | 42 +++++++--------------
 src/crawler/m.wxsy.net/crawler.sh |  7 +++-
 src/crawler/m.wxsy.net/extract.py | 16 ++++----
 src/crawler/m.wxsy.net/fetch.py   | 63 +++++++++++--------------
 src/crawler/m.wxsy.net/release.py |  9 +++--
 src/crawler/wxsy.net/fetch.py     |  2 +-
 6 files changed, 55 insertions(+), 84 deletions(-)

diff --git a/src/crawler/m.wxsy.net/catalog.py b/src/crawler/m.wxsy.net/catalog.py
index 1028536..a1d2e42 100644
--- a/src/crawler/m.wxsy.net/catalog.py
+++ b/src/crawler/m.wxsy.net/catalog.py
@@ -4,37 +4,17 @@
 """
   Fetch catalog and output as JSON format.
 
-    USAGE: python3 catalog.py
+    USAGE: python3 catalog.py [PROXY]
 """
 
-import sys
-sys.path.append('..')
-
 import re
+import sys
 import json
-import time
-import requests
-from logger import logger
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
-basicUrl = 'https://m.wxsy.net/novel/57104/all.html'
-
-userAgent = (  # default user-agent
-    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/100.0.4896.79 Mobile Safari/537.36'
-)
-
-
-def httpRequest(url: str) -> str:  # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
-    if request.status_code not in range(200, 300):  # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-
 
 def analysePage(rawHtml: str) -> list:  # extract catalog from html content
     analyseRet = []
@@ -51,10 +31,13 @@ def fetchCatalog(pageNum: int) -> list:  # fetch raw catalog
     catalog = []
     for pageIndex in range(1, pageNum + 1):  # traverse all pages (1 ~ pageNum)
-        logger.info('Page: %d' % pageIndex)
-        pageUrl = '%s?sort=1&page=%d' % (basicUrl, pageIndex)
-        catalog.append(analysePage(httpRequest(pageUrl)))
-        time.sleep(1)  # avoid being blocked by the server
+        logger.info('Catalog page -> %d' % pageIndex)
+        catalog.append(analysePage(
+            httpRequest(
+                'https://m.wxsy.net/novel/57104/all.html?sort=1&page=%d' % pageIndex,
+                proxy = sys.argv[1]
+            )
+        ))
     return catalog
@@ -70,5 +53,6 @@ def formatCatalog(rawCatalog: list) -> dict:
     return {x[0]: x[1] for x in catalog}  # formatted output
 
 
+logger.warning('Fetch catalog of `m.wxsy.net`')
 release = formatCatalog(fetchCatalog(18))  # 18 pages in total
 print(json.dumps(release))  # output as JSON format
diff --git a/src/crawler/m.wxsy.net/crawler.sh b/src/crawler/m.wxsy.net/crawler.sh
index d7d1850..a7710b1 100755
--- a/src/crawler/m.wxsy.net/crawler.sh
+++ b/src/crawler/m.wxsy.net/crawler.sh
@@ -4,7 +4,10 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
 
-python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+[ -z ${DELAY} ] && DELAY=1
+[ -z ${THREAD} ] && THREAD=1
+
+python3 catalog.py "${PROXY}" > ./data/catalog.json
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ ./data/json
 python3 release.py ./data/catalog.json ./data/json/ > ./data/xxrs.json
diff --git a/src/crawler/m.wxsy.net/extract.py b/src/crawler/m.wxsy.net/extract.py
index daf5fd0..86d7a76 100644
--- a/src/crawler/m.wxsy.net/extract.py
+++ b/src/crawler/m.wxsy.net/extract.py
@@ -7,10 +7,12 @@
   Extract data from raw html content.
 
     USAGE: python3 extract.py [CATALOG] [HTML_DIR] [OUTPUT_DIR]
 """
 
+import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
@@ -19,7 +21,7 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
     script = body.select('script')[5].text  # js code with chapter info
     info = {
         'title': body.select('div[class="size18 w100 text-center lh100 pt30 pb15"]')[0].text.strip(),
-        'contents': [x.text.strip() for x in body.select('p[class="content_detail"]')],
+        'content': [x.text.strip() for x in body.select('p[class="content_detail"]')],
         'prePage': body.select('div[class="pt-prechapter"]')[0].a.attrs['href'],
         'nextPage': body.select('div[class="pt-nextchapter"]')[0].a.attrs['href'],
         'preId': re.search(r'window\.__PREVPAGE = "(\d*)"', script)[1],
@@ -35,8 +37,8 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
 
 
 def combinePage(id: str) -> dict:  # combine sub pages
-    page_1 = splitHtml(open('%s/%s-1.html' % (sys.argv[2], id)).read())
-    page_2 = splitHtml(open('%s/%s-2.html' % (sys.argv[2], id)).read())
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % id)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % id)).read())
 
     # page info check
     if not page_1['index'] == '[1/2页]' or not page_2['index'] == '[2/2页]':
@@ -66,13 +68,13 @@ def combinePage(id: str) -> dict:  # combine sub pages
         'preId': page_1['preId'],
         'myId': page_1['myId'],
         'nextId': page_1['nextId'],
-        'contents': page_1['contents'] + page_2['contents']
+        'content': page_1['content'] + page_2['content']
     }
 
 
+logger.warning('Extract info of `m.wxsy.net`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-
 for _, chapterId in catalog.items():  # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
-    with open('%s/%s.json' % (sys.argv[3], chapterId), 'w') as fileObj:
+    with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj:
         fileObj.write(json.dumps(combinePage(chapterId)))
diff --git a/src/crawler/m.wxsy.net/fetch.py b/src/crawler/m.wxsy.net/fetch.py
index 4480aff..0d59507 100644
--- a/src/crawler/m.wxsy.net/fetch.py
+++ b/src/crawler/m.wxsy.net/fetch.py
@@ -4,52 +4,31 @@
 """
   Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
+import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-basicUrl = 'https://m.wxsy.net/novel/57104'
-
-userAgent = (  # default user-agent
-    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/100.0.4896.79 Mobile Safari/537.36'
-)
-
-
-def httpRequest(url: str, fileName: str) -> bool:  # save html content
-    try:
-        logger.debug('Http request `%s` -> %s' % (url, fileName))
-        request = requests.get(url, timeout = 30,  # timeout -> 30s
-            headers = {
-                'user-agent': userAgent,  # with fake user-agent
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        for subPage in [1, 2]:  # two sub pages in one chapter
+            yield {
+                'url': 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (chapterId, subPage),
+                'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),
             }
-        )
-        if request.status_code not in range(200, 300):  # http status code 2xx
-            logger.warning('Http request failed -> %s' % url)
-            return False
-        logger.debug('Http request success -> %s' % url)
-        with open(fileName, 'w') as fileObj:  # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> %s' % fileName)
-    except:
-        return False
-    return True
-
-
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-for _, chapterId in catalog.items():  # traverse all chapters
-    for subPage in [1, 2]:  # two sub pages in one chapter
-        pageUrl = '%s/read_%s/%d.html' % (basicUrl, chapterId, subPage)
-        pageFile = '%s/%s-%d.html' % (sys.argv[2], chapterId, subPage)
-        if httpRequest(pageUrl, pageFile):  # save html content
-            logger.info('Page request success -> %s' % pageUrl)
-        else:
-            logger.error('Page request failed -> %s' % pageUrl)
-        time.sleep(1)  # avoid being blocked by the server
+
+
+logger.warning('Fetch html of `m.wxsy.net`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
diff --git a/src/crawler/m.wxsy.net/release.py b/src/crawler/m.wxsy.net/release.py
index f8df941..4224667 100644
--- a/src/crawler/m.wxsy.net/release.py
+++ b/src/crawler/m.wxsy.net/release.py
@@ -7,16 +7,18 @@
   Combine all chapters from json files.
 
     USAGE: python3 release.py [CATALOG] [JSON_DIR]
 """
 
+import os
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 
 def loadData(catalog: dict) -> dict:  # load data from json files
     data = {}
     for _, chapterId in catalog.items():
         data[chapterId] = json.loads(
-            open('%s/%s.json' % (sys.argv[2], chapterId)).read()  # read json content
+            open(os.path.join(sys.argv[2], '%s.json' % chapterId)).read()  # read json content
         )
     return data
@@ -59,8 +61,9 @@ def combine() -> dict:  # combine all chapters
 
     result = {}
     for _, info in data.items():  # combine contents
-        result[info['title']] = info['contents']
+        result[info['title']] = info['content']
     return result
 
 
+logger.warning('Release info of `m.wxsy.net`')
 print(json.dumps(combine()))
diff --git a/src/crawler/wxsy.net/fetch.py b/src/crawler/wxsy.net/fetch.py
index f7438fa..ba6daaf 100644
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -4,7 +4,7 @@
 """
   Download raw html content as `.html` files.
 
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 
 import os
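
Note: the rewritten scripts all import `logger`, `httpRequest`, and `htmlFetch` from a shared `../utils` module that this patch does not include. The sketch below is a hypothetical reconstruction inferred only from the call sites above, where `httpRequest(url, proxy=...)` returns the page text and `htmlFetch(pages, proxy=..., thread=..., delay=...)` consumes dicts carrying `url` and `file` keys; the defaults, the thread-pool choice, and the logger setup are assumptions, not the project's actual code.

# utils.py -- hypothetical sketch, NOT part of this patch
import time
import logging
import requests
from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger('crawler')  # assumed logger setup

userAgent = (  # fake mobile user-agent, carried over from the deleted code
    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/100.0.4896.79 Mobile Safari/537.36'
)


def httpRequest(url: str, proxy: str = '') -> str:  # fetch raw html content
    proxies = {'http': proxy, 'https': proxy} if proxy else None
    request = requests.get(url, timeout = 30, proxies = proxies,
        headers = {'user-agent': userAgent})
    if request.status_code not in range(200, 300):  # http status code 2xx
        raise RuntimeError('Http request failed -> %s' % url)
    return request.text


def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1) -> None:
    def fetchPage(page: dict) -> None:  # download one page into its target file
        try:
            with open(page['file'], 'w') as fileObj:
                fileObj.write(httpRequest(page['url'], proxy = proxy))
            logger.info('Page request success -> %s' % page['url'])
        except Exception:
            logger.error('Page request failed -> %s' % page['url'])
        time.sleep(delay)  # avoid being blocked by the server

    with ThreadPoolExecutor(max_workers = thread) as executor:  # bounded worker pool
        executor.map(fetchPage, pages)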
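
With the new `crawler.sh`, behaviour is tuned through environment variables rather than edits to the script: `THREAD` and `DELAY` fall back to 1 when unset, and `PROXY` may be left empty for a direct connection. A typical invocation (values illustrative) would be `PROXY='http://127.0.0.1:1080' THREAD=4 DELAY=0.5 ./crawler.sh`.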