xxrs-crawler/src/crawler/xswang.com/catalog.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Fetch catalog and output as JSON format.

    USAGE: python3 catalog.py
"""

import re
import sys
import json
sys.path.append('..')
from utils import logger
from utils import httpRequest
from bs4 import BeautifulSoup


def extractCatalog(rawHtml: bytes) -> dict:
    """Extract the chapter catalog from the raw HTML of a book index page.

    Every `<dd>` element is inspected. Its first `<a>` supplies the chapter
    caption (the link text) and the page id (the `href` with `/book/56718/`
    and `.html` removed).

    Entries that cannot be parsed as a chapter are skipped instead of
    aborting the whole extraction: a `<dd>` without any link is ignored, and
    a link without `href` or whose text does not start with `第<number>章`
    is skipped with a warning.

    Args:
        rawHtml: UTF-8 encoded HTML content of the index page.

    Returns:
        A dict mapping `'第N章 <title>'` to the page id string, ordered by
        ascending chapter number. If the same caption occurs more than once,
        the page id of the last occurrence is kept.
    """
    chapters = {}  # caption -> (chapter number, page id)
    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
    for entry in html.select('dd'):
        links = entry.select('a')
        if not links:  # `<dd>` without a link carries no chapter
            continue
        link = links[0]
        text = link.text.strip()
        title = re.search(r'^(第(\d+)章)(.*)', text)
        href = link.attrs.get('href')
        if title is None or href is None:
            # Not a numbered chapter (or no target): skip it rather than crash.
            logger.warning('Skip non-chapter entry `%s`' % text)
            continue
        pageId = href.replace('/book/56718/', '').replace('.html', '')
        caption = '%s %s' % (title[1], title[3].strip())
        # Keep the number parsed here so sorting needs no second regex pass.
        chapters[caption] = (int(title[2]), pageId)
    ordered = sorted(chapters.items(), key = lambda d: d[1][0])  # sort by chapter
    return {caption: pageId for caption, (_, pageId) in ordered}


# Script body: emit a log line, request the index page of book 56718 and
# print the catalog extracted from the response as a JSON document.
logger.warning('Fetch catalog of `xswang.com`')
indexPage = httpRequest('https://www.xswang.com/book/56718/')
catalogJson = json.dumps(extractCatalog(indexPage))
print(catalogJson)
feat: catalog script for `xswang.com` 2 years ago			`#!/usr/bin/env python3`
			`# -- coding: utf-8 --`

			`"""`
			`Fetch catalog and output as JSON format.`

			`USAGE: python3 catalog.py`
			`"""`

			`import re`
update: enhance crawler of `xswang.com` 2 years ago			`import sys`
feat: catalog script for `xswang.com` 2 years ago			`import json`
update: enhance crawler of `xswang.com` 2 years ago			`sys.path.append('..')`
			`from utils import logger`
			`from utils import httpRequest`
feat: catalog script for `xswang.com` 2 years ago			`from bs4 import BeautifulSoup`


update: enhance crawler of `xswang.com` 2 years ago			`def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content`
feat: catalog script for `xswang.com` 2 years ago			`catalog = {}`
update: enhance crawler of `xswang.com` 2 years ago			`html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')`
feat: catalog script for `xswang.com` 2 years ago			`for item in [x.select('a')[0] for x in html.select('dd')]:`
update: enhance crawler of `xswang.com` 2 years ago			`title = re.search(r'^(第\d+章)(.*)', item.text.strip())`
feat: catalog script for `xswang.com` 2 years ago			`pageId = item.attrs['href'].replace('/book/56718/', '').replace('.html', '')`
update: enhance crawler of `xswang.com` 2 years ago			`catalog['%s %s' % (title[1], title[2].strip())] = pageId`
feat: catalog script for `xswang.com` 2 years ago			`catalog = sorted(catalog.items(), key = lambda d: int(`
			`re.search(r'^第(\d+)章', d[0])[1] # sort by chapter`
			`))`
			`return {x[0]: x[1] for x in catalog} # formatted output`


update: enhance crawler of `xswang.com` 2 years ago			logger.warning('Fetch catalog of `xswang.com`')
feat: catalog script for `xswang.com` 2 years ago			`print(json.dumps(`
			`extractCatalog(httpRequest('https://www.xswang.com/book/56718/'))`
			`))`