You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

58 lines
1.8 KiB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Fetch the catalog and output it in JSON format.
USAGE: python3 catalog.py [PROXY]
"""
import re
import sys
import json
sys.path.append('..')
from utils import logger
from utils import httpRequest
from bs4 import BeautifulSoup
def analysePage(rawHtml: bytes) -> list:
    """Extract the catalog entries from the HTML of one catalog page.

    Args:
        rawHtml: Raw page content as bytes; it is decoded as UTF-8 before
            being parsed with the ``lxml`` parser.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, one per matching link,
        built from each link's ``title`` and ``href`` attributes.

    Raises:
        IndexError: If the page contains no ``div[class="border-b"]`` element.
        KeyError: If a matching link has no ``title`` or ``href`` attribute.
    """
    document = BeautifulSoup(str(rawHtml, 'utf-8'), 'lxml')

    # Only the first matching container is inspected.
    container = document.select('div[class="border-b"]')[0]
    links = container.select(
        'a[class="w100 flex-wrp flex-align-center flex-between pt10 pb10"]'
    )
    return [
        {'name': link.attrs['title'], 'url': link.attrs['href']}
        for link in links
    ]
def fetchCatalog(pageNum: int) -> list:
    """Fetch and parse every catalog page.

    Pages ``1`` to ``pageNum`` of the hard-coded novel catalog are requested
    in order through ``httpRequest`` and each response is handed to
    ``analysePage``.

    The proxy is read from the first command-line argument. The script's
    usage line documents it as optional (``[PROXY]``), so when it is missing
    the request is issued without a ``proxy`` argument instead of failing
    with ``IndexError``.

    Args:
        pageNum: Number of catalog pages to fetch. Values below 1 fetch
            nothing.

    Returns:
        A list with one element per page, each element being the list
        returned by ``analysePage`` for that page.
    """
    # Resolve the optional proxy once; it does not change between pages.
    # NOTE(review): assumes `httpRequest` has a usable default for `proxy`
    # when the keyword is omitted -- confirm against utils.
    requestOptions = {'proxy': sys.argv[1]} if len(sys.argv) > 1 else {}

    catalog = []
    for pageIndex in range(1, pageNum + 1):  # traverse all pages (1 ~ pageNum)
        logger.info('Catalog page -> %d' % pageIndex)
        pageUrl = 'https://m.wxsy.net/novel/57104/all.html?sort=1&page=%d' % pageIndex
        catalog.append(analysePage(httpRequest(pageUrl, **requestOptions)))
    return catalog
def formatCatalog(rawCatalog: list) -> dict:
    """Flatten the per-page catalog into a chapter-ordered mapping.

    Args:
        rawCatalog: A list of pages, each page being a list of dicts with
            ``'name'`` (chapter title) and ``'url'`` keys.

    Returns:
        A dict mapping each chapter title to its page id (the digits captured
        from the URL, kept as a string). Keys are ordered by the integer
        chapter number at the start of the title. When a title occurs more
        than once, the last page id seen is kept.

    Raises:
        TypeError: If a URL or a title does not match the expected pattern
            (the failed match is ``None`` and cannot be indexed).
        KeyError: If an item lacks the ``'name'`` or ``'url'`` key.
    """
    urlPattern = re.compile(r'^/novel/57104/read_(\d+)\.html$')
    titlePattern = re.compile(r'^第(\d+)章')

    pageIds = {}
    for page in rawCatalog:
        for entry in page:
            # The page id is the numeric part of the chapter URL.
            pageIds[entry['name']] = urlPattern.search(entry['url'])[1]

    def chapterNumber(pair):
        # pair is a (title, pageId) tuple; sort by the number in the title.
        return int(titlePattern.search(pair[0])[1])

    # Dicts preserve insertion order, so the sorted pairs fix the key order.
    return dict(sorted(pageIds.items(), key=chapterNumber))
# Script body: announce the crawl, download every catalog page, then print
# the chapter-title -> page-id mapping as JSON on standard output.
logger.warning('Fetch catalog of `m.wxsy.net`')
rawCatalog = fetchCatalog(18)  # the catalog spans 18 pages in total
release = formatCatalog(rawCatalog)
print(json.dumps(release))  # emit the result as JSON