From 0a412eab0a8298b1e0171b5e167c6e59d9e27182 Mon Sep 17 00:00:00 2001
From: Dnomd343
Date: Tue, 11 Oct 2022 13:17:35 +0800
Subject: [PATCH] feat: html file fetch

---
 demo.py  | 10 ----------
 fetch.py | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 10 deletions(-)
 delete mode 100644 demo.py
 create mode 100644 fetch.py

diff --git a/demo.py b/demo.py
deleted file mode 100644
index 802f415..0000000
--- a/demo.py
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-from logger import logger
-
-logger.debug('debug')
-logger.info('info')
-logger.warning('warning')
-logger.error('error')
-logger.critical('critical')
diff --git a/fetch.py b/fetch.py
new file mode 100644
index 0000000..f6d276f
--- /dev/null
+++ b/fetch.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Download the raw html pages of the novel listed in `./catalog/catalog.json`.
+
+Every value of the catalog is used as a page id, each page id has two sub
+pages, and each sub page is saved as `./html/<pageId>-<subPage>.html`.
+"""
+
+import time
+import json
+import requests
+from logger import logger
+
+userAgent = (  # default user-agent
+    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
+    'AppleWebKit/537.36 (KHTML, like Gecko) '
+    'Chrome/100.0.4896.79 Mobile Safari/537.36'
+)
+
+
+def httpRequest(url: str, fileName: str) -> bool:
+    """
+    Fetch `url` and save the response text into `fileName`.
+
+    :param url: address of the page to request
+    :param fileName: path of the file that receives the html content
+    :return: True if the status code is 2xx and the file is saved, else False
+    """
+    try:
+        logger.debug('Http request `%s` -> %s' % (url, fileName))
+        response = requests.get(
+            url, timeout = 30,
+            headers = {
+                'user-agent': userAgent,  # with fake user-agent
+            }
+        )
+        if response.status_code not in range(200, 300):  # http status code 2xx
+            logger.warning('Http request failed -> %s' % url)
+            return False
+        logger.debug('Http request success -> %s' % url)
+        # explicit utf-8: the platform default encoding may reject page content
+        with open(fileName, 'w', encoding = 'utf-8') as fileObj:  # save html content
+            fileObj.write(response.text)
+        logger.debug('File save success -> %s' % fileName)
+    except Exception as exp:  # bare `except:` would also swallow Ctrl-C
+        logger.warning('Http request error -> %s (%s)' % (url, exp))
+        return False
+    return True
+
+
+with open('./catalog/catalog.json', encoding = 'utf-8') as catalogFile:
+    catalog = json.load(catalogFile)
+
+for pageId in catalog.values():  # only the values (page ids) are needed
+    for subPage in [1, 2]:
+        pageUrl = 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (pageId, subPage)
+        pageFile = './html/%s-%d.html' % (pageId, subPage)
+        if httpRequest(pageUrl, pageFile):
+            logger.info('Page request success -> %s' % pageUrl)
+        else:
+            logger.error('Page request failed -> %s' % pageUrl)
+        time.sleep(1)  # pause between two requests