
update: http fetch module

Branch: master
Author: Dnomd343, 2 years ago
Commit: 05b2f6235e
1. src/crawler/utils/__init__.py (+6)
2. src/crawler/utils/fetch.py (+43)
3. src/crawler/utils/logger.py (renamed, 0 changes)
4. src/crawler/wxsy.net/catalog.py (30 lines changed)
5. src/crawler/wxsy.net/fetch.py (38 lines changed)

src/crawler/utils/__init__.py (new file, +6)

@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from .logger import logger
+from .fetch import htmlFetch
+from .fetch import httpRequest
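
The package root re-exports the logger plus both fetch helpers, so the site-specific crawlers need only one import source. A minimal sketch of the calling side, assuming a script runs from a sibling directory of utils (the same sys.path.append('..') trick the wxsy.net scripts below rely on):

    import sys
    sys.path.append('..')  # make the utils package importable from src/crawler

    from utils import logger       # re-exported from utils/logger.py
    from utils import httpRequest  # re-exported from utils/fetch.py
    from utils import htmlFetch    # ditto

    logger.debug('utils package ready')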

src/crawler/utils/fetch.py (new file, +43)

@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import requests
+from .logger import logger
+
+userAgent = ( # default user agent
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+    'AppleWebKit/537.36 (KHTML, like Gecko) '
+    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
+)
+
+
+def httpRequest(url: str) -> bytes: # fetch raw html content
+    request = requests.get(url, timeout = 30, headers = { # timeout -> 30s
+        'user-agent': userAgent, # with fake user-agent
+        'accept-encoding': 'gzip, deflate', # allow content compress
+    })
+    if request.status_code not in range(200, 300): # http status code 2xx
+        raise RuntimeError('Http request failed')
+    return request.content
+
+
+def htmlFetch(url: str, file: str) -> bool: # save html content
+    logger.debug('Html fetch `%s` -> `%s`' % (url, file))
+    try:
+        content = httpRequest(url) # http request
+    except:
+        logger.debug('Html fetch retry -> `%s`' % url)
+        try:
+            content = httpRequest(url) # retry
+        except:
+            logger.debug('Html fetch failed -> `%s`' % url)
+            return False # request failed
+    logger.debug('Html fetch success -> `%s`' % url)
+    try:
+        with open(file, 'wb') as fileObj: # save html content
+            fileObj.write(content)
+    except:
+        logger.debug('Html save failed -> `%s`' % file)
+        return False # save failed
+    logger.debug('Html save success -> `%s`' % file)
+    return True
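
htmlFetch gives every page two chances: one initial request and one retry, with each failure logged at debug level before the function gives up and returns False. The same control flow written as a loop, purely as an illustration (htmlFetchLoop and its attempts parameter are not part of this commit):

    def htmlFetchLoop(url: str, file: str, attempts: int = 2) -> bool:
        for count in range(attempts):
            try:
                content = httpRequest(url)  # may raise on non-2xx or network error
                break  # fetched successfully -> stop retrying
            except Exception:
                logger.debug('Html fetch retry (%d) -> `%s`' % (count + 1, url))
        else:
            return False  # every attempt failed
        with open(file, 'wb') as fileObj:  # save html content
            fileObj.write(content)
        return True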

src/crawler/logger.py → src/crawler/utils/logger.py (renamed, 0 changes)

src/crawler/wxsy.net/catalog.py (30 lines changed)

@@ -8,32 +8,19 @@ Fetch catalog and output as JSON format.
 """
 import re
+import sys
 import json
-import requests
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
 
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-
-
-def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
-    detail = html.select('div[class="pt-chapter-cont-detail full"]')[0]
-    for item in detail.select('a'):
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
+    div = html.select('div[class="pt-chapter-cont-detail full"]')[0]
+    for item in div.select('a'):
         catalog[item.attrs['title']] = re.search(r'/novel/57104/read_(\d+).html', item.attrs['href'])[1]
     catalog = sorted(catalog.items(), key = lambda d: int(
         re.search(r'^第(\d+)章', d[0])[1] # sort by chapter

@@ -41,6 +28,7 @@ def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
     return {x[0]: x[1] for x in catalog} # formatted output
 
 
+logger.info('Fetch catalog of `wxsy.net`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
 ))
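
Since the shared httpRequest returns bytes rather than str, extractCatalog now decodes the payload before parsing. It maps chapter titles to chapter IDs, sorted by the 第N章 ("Chapter N") number in each title, so the script prints JSON shaped roughly like this (titles and IDs here are made-up placeholders, not real data from wxsy.net):

    {
        "第1章 ...": "2827538",
        "第2章 ...": "2827539",
        "第3章 ...": "2827540"
    }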

src/crawler/wxsy.net/fetch.py (38 lines changed)

@@ -11,43 +11,17 @@ import os
 import sys
 import json
 import time
-import requests
-from logger import logger
-
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'w') as fileObj: # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
 
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for _, chapterId in catalog.items(): # traverse all chapters
     pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
     pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
+    if htmlFetch(pageUrl, pageFile): # save html content
+        logger.info('Page request success -> `%s`' % pageUrl)
     else:
-        logger.error('Page request failed -> %s' % pageUrl)
+        logger.error('Page request failed -> `%s`' % pageUrl)
     time.sleep(1) # avoid being blocked by the server
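
With the module split in place, fetch.py is just a driver: it reads the catalog JSON produced by catalog.py, saves each chapter page through the shared htmlFetch, and sleeps one second between requests to avoid being blocked. A hypothetical invocation (the file and directory names are examples only, not part of the commit):

    cd src/crawler/wxsy.net
    python3 catalog.py > catalog.json      # dump chapter title -> chapter id
    python3 fetch.py catalog.json ./html   # write one <id>.html per chapter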
