Browse Source

feat: fetch catalog

master
Dnomd343 2 years ago
commit
3cbc91e31a
  1. 51
      catalog/fetch.py

51
catalog/fetch.py

@ -0,0 +1,51 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import json
import time
import requests
from bs4 import BeautifulSoup
# Catalog page URL; the query string `?sort=1&page=N` is appended per page.
basicUrl = 'https://m.wxsy.net/novel/57104/all.html'

# Default user-agent header value sent with every request.
userAgent = ''.join([
    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) ',
    'AppleWebKit/537.36 (KHTML, like Gecko) ',
    'Chrome/100.0.4896.79 Mobile Safari/537.36',
])
def httpRequest(url: str, timeout: float = 30) -> str:
    """Fetch `url` with a GET request and return the response body as text.

    The request carries the module's fake user-agent and allows gzip/deflate
    content compression.

    :param url: address to fetch.
    :param timeout: seconds to wait for the server before giving up; without
        a timeout `requests` would wait forever on a stalled connection.
    :return: decoded response body.
    :raises RuntimeError: if the HTTP status code is not 2xx.
    """
    response = requests.get(
        url,
        headers={
            'user-agent': userAgent,  # with fake user-agent
            'accept-encoding': 'gzip, deflate',  # allow content compress
        },
        timeout=timeout,
    )
    if response.status_code not in range(200, 300):  # http status code 2xx
        raise RuntimeError('Http request failed')
    return response.text
def analysePage(rawHtml: str) -> list:
    """Extract the catalog entries from the HTML of one catalog page.

    Looks at the first `div` whose class attribute is exactly `border-b` and
    collects every matching anchor inside it.

    :param rawHtml: HTML source of the page.
    :return: list of dicts, each with `name` (the anchor's `title`
        attribute) and `url` (the anchor's `href` attribute).
    :raises IndexError: if the page has no matching `div`.
    :raises KeyError: if a matching anchor lacks `title` or `href`.
    """
    document = BeautifulSoup(rawHtml, 'lxml')
    container = document.select('div[class="border-b"]')[0]
    links = container.select(
        'a[class="w100 flex-wrp flex-align-center flex-between pt10 pb10"]'
    )
    return [
        {
            'name': link.attrs['title'],
            'url': link.attrs['href'],
        }
        for link in links
    ]
def fetchCatalog(pageNum: int, delay: float = 3) -> list:
    """Download and parse catalog pages 1 through `pageNum`.

    Progress is reported on stderr, one line per page.

    :param pageNum: number of pages to fetch; values below 1 fetch nothing.
    :param delay: seconds to pause between two consecutive requests.
    :return: list with one element per page, in page order; each element is
        the list returned by `analysePage` for that page.
    """
    catalog = []
    for pageIndex in range(1, pageNum + 1):
        if pageIndex > 1:
            # Throttle between requests only: there is nothing to wait for
            # before the first page or after the last one.
            time.sleep(delay)
        print('Page: %d' % pageIndex, file=sys.stderr)
        pageUrl = '%s?sort=1&page=%d' % (basicUrl, pageIndex)
        catalog.append(analysePage(httpRequest(pageUrl)))
    return catalog
# Script entry: crawl 18 catalog pages and write the result to stdout as JSON.
catalogJson = json.dumps(fetchCatalog(18))
print(catalogJson)
Loading…
Cancel
Save