From 5a68b91709b4b4f17dacc91e0c4c0296423ee2c3 Mon Sep 17 00:00:00 2001
From: Dnomd343 <i@343.re>
Date: Sun, 16 Oct 2022 21:12:50 +0800
Subject: [PATCH] fix: catalog fetch

---
 src/crawler/aidusk.com/catalog.py | 26 +++++++-------------------
 src/crawler/m.wxsy.net/catalog.py |  6 +++---
 2 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/src/crawler/aidusk.com/catalog.py b/src/crawler/aidusk.com/catalog.py
index 35c871e..080fc40 100644
--- a/src/crawler/aidusk.com/catalog.py
+++ b/src/crawler/aidusk.com/catalog.py
@@ -8,30 +8,17 @@ Fetch catalog and output as JSON format.
 """
 
 import re
+import sys
 import json
-import requests
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
-userAgent = (  # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
 
-
-def httpRequest(url: str) -> str:  # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
-    if request.status_code not in range(200, 300):  # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return str(request.content, encoding = 'utf-8')
-
-
-def extractCatalog(rawHtml: str) -> dict:  # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
     div = html.select('div[class="book_con_list"]')[1]
     for item in div.select('a'):
         name = re.search(r'^(第\d+章)(.*)$', item.text)
@@ -39,6 +26,7 @@ def extractCatalog(rawHtml: str) -> dict:  # extract catalog from html content
     return catalog
 
 
+logger.warning('Fetch catalog of `aidusk.com`')
 print(json.dumps(
     extractCatalog(httpRequest('http://www.aidusk.com/t/134659/'))
 ))
diff --git a/src/crawler/m.wxsy.net/catalog.py b/src/crawler/m.wxsy.net/catalog.py
index a1d2e42..60f0f8c 100644
--- a/src/crawler/m.wxsy.net/catalog.py
+++ b/src/crawler/m.wxsy.net/catalog.py
@@ -16,10 +16,10 @@ from utils import httpRequest
 from bs4 import BeautifulSoup
 
 
-def analysePage(rawHtml: str) -> list:  # extract catalog from html content
+def analysePage(rawHtml: bytes) -> list:  # extract catalog from html content
     analyseRet = []
-    soup = BeautifulSoup(rawHtml, 'lxml')
-    div = soup.select('div[class="border-b"]')[0]
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
+    div = html.select('div[class="border-b"]')[0]
     for row in div.select('a[class="w100 flex-wrp flex-align-center flex-between pt10 pb10"]'):
         analyseRet.append({
             'name': row.attrs['title'],