fix: catalog fetch

4 years ago · 5a68b91709
2 changed files with 10 additions and 22 deletions
--- a/src/crawler/aidusk.com/catalog.py
+++ b/src/crawler/aidusk.com/catalog.py
@@ -8,30 +8,17 @@ Fetch catalog and output as JSON format.
 """
 import re
 import sys
 import json
-import requests
+sys.path.append('..')
 from utils import logger
 from utils import httpRequest
 from bs4 import BeautifulSoup
 userAgent = (  # default user agent
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
 )
-
+def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
-def httpRequest(url: str) -> str:  # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
-    if request.status_code not in range(200, 300):  # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return str(request.content, encoding = 'utf-8')
-def extractCatalog(rawHtml: str) -> dict:  # extract catalog from html content
    catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
    div = html.select('div[class="book_con_list"]')[1]
    for item in div.select('a'):
        name = re.search(r'^(第\d+章)(.*)$', item.text)
@@ -39,6 +26,7 @@ def extractCatalog(rawHtml: str) -> dict:  # extract catalog from html content
    return catalog
 logger.warning('Fetch catalog of `aidusk.com`')
 print(json.dumps(
    extractCatalog(httpRequest('http://www.aidusk.com/t/134659/'))
 ))
--- a/src/crawler/m.wxsy.net/catalog.py
+++ b/src/crawler/m.wxsy.net/catalog.py
@@ -16,10 +16,10 @@ from utils import httpRequest
 from bs4 import BeautifulSoup
-def analysePage(rawHtml: str) -> list:  # extract catalog from html content
+def analysePage(rawHtml: bytes) -> list:  # extract catalog from html content
    analyseRet = []
-    soup = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
-    div = soup.select('div[class="border-b"]')[0]
+    div = html.select('div[class="border-b"]')[0]
    for row in div.select('a[class="w100 flex-wrp flex-align-center flex-between pt10 pb10"]'):
        analyseRet.append({
            'name': row.attrs['title'],