From fae554fe25a42953fd8e094d900e378899514dc9 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Sat, 15 Oct 2022 15:50:07 +0800 Subject: [PATCH] update: enhance fetch script --- src/wxsy.net/fetch.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/wxsy.net/fetch.py b/src/wxsy.net/fetch.py index 5d7e142..eb057b6 100644 --- a/src/wxsy.net/fetch.py +++ b/src/wxsy.net/fetch.py @@ -7,6 +7,7 @@ Download raw html content as `.html` files. USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] """ +import os import sys import json import time @@ -20,21 +21,21 @@ userAgent = ( # default user agent ) -def httpRequest(url: str, fileName: str) -> bool: # save html content +def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content try: - logger.debug('Http request `%s` -> %s' % (url, fileName)) - request = requests.get(url, timeout = 30, # timeout -> 30s + logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName)) + request = requests.get(fileUrl, timeout = 30, # timeout -> 30s headers = { 'user-agent': userAgent, # with fake user-agent } ) if request.status_code not in range(200, 300): # http status code 2xx - logger.warning('Http request failed -> %s' % url) + logger.warning('Http request failed -> `%s`' % fileUrl) return False - logger.debug('Http request success -> %s' % url) + logger.debug('Http request success -> `%s`' % fileUrl) with open(fileName, 'w') as fileObj: # save html content fileObj.write(request.text) - logger.debug('File save success -> %s' % fileName) + logger.debug('File save success -> `%s`' % fileName) except: return False return True @@ -44,7 +45,7 @@ catalog = json.loads(open(sys.argv[1]).read()) # load catalog for _, chapterId in catalog.items(): # traverse all chapters pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId - pageFile = '%s/%s.html' % (sys.argv[2], chapterId) + pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId) if httpRequest(pageUrl, pageFile): # save html content logger.info('Page request success -> %s' % pageUrl) else: