diff --git a/src/crawler/utils/__init__.py b/src/crawler/utils/__init__.py
new file mode 100644
index 0000000..d7d7f4c
--- /dev/null
+++ b/src/crawler/utils/__init__.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from .logger import logger
+from .fetch import htmlFetch
+from .fetch import httpRequest
diff --git a/src/crawler/utils/fetch.py b/src/crawler/utils/fetch.py
new file mode 100644
index 0000000..79885c7
--- /dev/null
+++ b/src/crawler/utils/fetch.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import requests
+from .logger import logger
+
+userAgent = ( # default user agent
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+    'AppleWebKit/537.36 (KHTML, like Gecko) '
+    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
+)
+
+
+def httpRequest(url: str) -> bytes: # fetch raw html content
+    request = requests.get(url, timeout = 30, headers = { # timeout -> 30s
+        'user-agent': userAgent, # with fake user-agent
+        'accept-encoding': 'gzip, deflate', # allow content compress
+    })
+    if request.status_code not in range(200, 300): # http status code 2xx
+        raise RuntimeError('Http request failed')
+    return request.content
+
+
+def htmlFetch(url: str, file: str) -> bool: # save html content
+    logger.debug('Html fetch `%s` -> `%s`' % (url, file))
+    try:
+        content = httpRequest(url) # http request
+    except:
+        logger.debug('Html fetch retry -> `%s`' % url)
+        try:
+            content = httpRequest(url) # retry
+        except:
+            logger.debug('Html fetch failed -> `%s`' % url)
+            return False # request failed
+    logger.debug('Html fetch success -> `%s`' % url)
+    try:
+        with open(file, 'wb') as fileObj: # save html content
+            fileObj.write(content)
+    except:
+        logger.debug('Html save failed -> `%s`' % file)
+        return False # save failed
+    logger.debug('Html save success -> `%s`' % file)
+    return True
diff --git a/src/crawler/logger.py b/src/crawler/utils/logger.py
similarity index 100%
rename from src/crawler/logger.py
rename to src/crawler/utils/logger.py
diff --git a/src/crawler/wxsy.net/catalog.py b/src/crawler/wxsy.net/catalog.py
index 5f5d150..378f735 100644
--- a/src/crawler/wxsy.net/catalog.py
+++ b/src/crawler/wxsy.net/catalog.py
@@ -8,32 +8,19 @@ Fetch catalog and output as JSON format.
 """
 
 import re
+import sys
 import json
-import requests
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
 
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-
-
-def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
-    detail = html.select('div[class="pt-chapter-cont-detail full"]')[0]
-    for item in detail.select('a'):
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
+    div = html.select('div[class="pt-chapter-cont-detail full"]')[0]
+    for item in div.select('a'):
         catalog[item.attrs['title']] = re.search(r'/novel/57104/read_(\d+).html', item.attrs['href'])[1]
     catalog = sorted(catalog.items(), key = lambda d: int(
         re.search(r'^第(\d+)章', d[0])[1] # sort by chapter
@@ -41,6 +28,7 @@ def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
     return {x[0]: x[1] for x in catalog} # formatted output
 
 
+logger.info('Fetch catalog of `wxsy.net`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
 ))
diff --git a/src/crawler/wxsy.net/fetch.py b/src/crawler/wxsy.net/fetch.py
index eb057b6..699b542 100644
--- a/src/crawler/wxsy.net/fetch.py
+++ b/src/crawler/wxsy.net/fetch.py
@@ -11,43 +11,17 @@ import os
 import sys
 import json
 import time
-import requests
-from logger import logger
-
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'w') as fileObj: # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
 
 
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-
 for _, chapterId in catalog.items(): # traverse all chapters
     pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
     pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
+    if htmlFetch(pageUrl, pageFile): # save html content
+        logger.info('Page request success -> `%s`' % pageUrl)
     else:
-        logger.error('Page request failed -> `%s`' % pageUrl)
+        logger.error('Page request failed -> `%s`' % pageUrl)
     time.sleep(1) # avoid being blocked by the server