Browse Source

fix: catalog fetch

master
Dnomd343 2 years ago
parent
commit
5a68b91709
  1. 26
      src/crawler/aidusk.com/catalog.py
  2. 6
      src/crawler/m.wxsy.net/catalog.py

26
src/crawler/aidusk.com/catalog.py

@ -8,30 +8,17 @@ Fetch catalog and output as JSON format.
""" """
import re import re
import sys
import json import json
import requests sys.path.append('..')
from utils import logger
from utils import httpRequest
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
userAgent = ( # default user agent
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)
def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
def httpRequest(url: str) -> str: # fetch raw html content
request = requests.get(url, headers = {
'user-agent': userAgent, # with fake user-agent
'accept-encoding': 'gzip, deflate', # allow content compress
})
if request.status_code not in range(200, 300): # http status code 2xx
raise RuntimeError('Http request failed')
return str(request.content, encoding = 'utf-8')
def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
catalog = {} catalog = {}
html = BeautifulSoup(rawHtml, 'lxml') html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
div = html.select('div[class="book_con_list"]')[1] div = html.select('div[class="book_con_list"]')[1]
for item in div.select('a'): for item in div.select('a'):
name = re.search(r'^(第\d+章)(.*)$', item.text) name = re.search(r'^(第\d+章)(.*)$', item.text)
@ -39,6 +26,7 @@ def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
return catalog return catalog
logger.warning('Fetch catalog of `aidusk.com`')
print(json.dumps( print(json.dumps(
extractCatalog(httpRequest('http://www.aidusk.com/t/134659/')) extractCatalog(httpRequest('http://www.aidusk.com/t/134659/'))
)) ))

6
src/crawler/m.wxsy.net/catalog.py

@ -16,10 +16,10 @@ from utils import httpRequest
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
def analysePage(rawHtml: str) -> list: # extract catalog from html content def analysePage(rawHtml: bytes) -> list: # extract catalog from html content
analyseRet = [] analyseRet = []
soup = BeautifulSoup(rawHtml, 'lxml') html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
div = soup.select('div[class="border-b"]')[0] div = html.select('div[class="border-b"]')[0]
for row in div.select('a[class="w100 flex-wrp flex-align-center flex-between pt10 pb10"]'): for row in div.select('a[class="w100 flex-wrp flex-align-center flex-between pt10 pb10"]'):
analyseRet.append({ analyseRet.append({
'name': row.attrs['title'], 'name': row.attrs['title'],

Loading…
Cancel
Save