
update: enhance crawler of `xswang.com`

commit 3698f50c97 on master
Dnomd343, 2 years ago

Changed files:

1. src/crawler/xswang.com/catalog.py (29 changes)
2. src/crawler/xswang.com/check.sh (6 changes)
3. src/crawler/xswang.com/crawler.sh (7 changes)
4. src/crawler/xswang.com/extract.py (6 changes)
5. src/crawler/xswang.com/fetch.py (62 changes)

src/crawler/xswang.com/catalog.py (29 changes)

@@ -8,39 +8,28 @@ Fetch catalog and output as JSON format.
 """
 import re
 import sys
 import json
-import requests
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup

-userAgent = (  # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-def httpRequest(url: str) -> str:  # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
-    if request.status_code not in range(200, 300):  # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-
-def extractCatalog(rawHtml: str) -> dict:  # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
     for item in [x.select('a')[0] for x in html.select('dd')]:
+        title = re.search(r'^(第\d+章)(.*)', item.text.strip())
         pageId = item.attrs['href'].replace('/book/56718/', '').replace('.html', '')
-        catalog[item.text.strip()] = pageId
+        catalog['%s %s' % (title[1], title[2].strip())] = pageId
     catalog = sorted(catalog.items(), key = lambda d: int(
         re.search(r'^第(\d+)章', d[0])[1]  # sort by chapter
     ))
     return {x[0]: x[1] for x in catalog}  # formatted output

+logger.warning('Fetch catalog of `xswang.com`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.xswang.com/book/56718/'))
 ))
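The shared `utils.httpRequest` helper that replaces the inline version above is not part of this diff. A minimal sketch of what it plausibly looks like, assuming it keeps the removed request logic but returns raw bytes (since `extractCatalog` now accepts `bytes` and decodes UTF-8 itself):

# Hypothetical sketch of httpRequest in the shared src/crawler/utils
# module (not shown in this commit); mirrors the removed inline version.
import requests

userAgent = (  # default user agent
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)

def httpRequest(url: str) -> bytes:  # fetch raw html content
    request = requests.get(url, headers = {
        'user-agent': userAgent,  # with fake user-agent
        'accept-encoding': 'gzip, deflate',  # allow content compress
    })
    if request.status_code not in range(200, 300):  # expect http 2xx
        raise RuntimeError('Http request failed')
    return request.content  # raw bytes; the caller handles decoding

Returning bytes instead of `request.text` sidesteps requests' charset guessing and leaves decoding to the caller, which matches the `str(rawHtml, encoding = 'utf-8')` call added in `extractCatalog`.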

src/crawler/xswang.com/check.sh (6 changes)

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+cd "$(dirname "$0")"
+
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
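check.sh is a new consistency check: `xz -cdk` decompresses the archived snapshots to stdout without deleting the `.xz` files, and piping both sides through `jq .` normalizes JSON whitespace and indentation before `diff` compares them, so only real data differences, not formatting, produce a mismatch.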

src/crawler/xswang.com/crawler.sh (7 changes)

@@ -1,8 +1,11 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"

 mkdir -p ./data/html/

+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
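The crawl is now tunable from the environment, e.g. `PROXY=http://127.0.0.1:1080 THREAD=4 DELAY=0.5 ./crawler.sh` (proxy address hypothetical) fetches with four workers and a half-second delay, while unset variables fall back to one thread and a one-second delay. `PROXY` has no default, so `fetch.py` receives an empty string in `sys.argv[3]` when no proxy is configured.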

src/crawler/xswang.com/extract.py (6 changes)

@@ -11,7 +11,8 @@ import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup

@@ -32,8 +33,8 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
 result = {}

+logger.warning('Extract info of `xswang.com`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for chapterName, chapterId in catalog.items():  # traverse all chapters
-    logger.info('Analyse chapter `%s`' % chapterId)
     htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)

@@ -41,5 +42,4 @@ for chapterName, chapterId in catalog.items():  # traverse all chapters
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
-
 print(json.dumps(result))
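All three scripts now import `logger` from the shared `utils` package instead of a per-directory `logger` module. That module is not in this diff; a minimal sketch, assuming it wraps the standard `logging` package and writes to stderr so stdout stays free for the JSON these scripts print:

# Hypothetical sketch of the shared logger setup (not shown in this
# commit); assumes a plain stderr handler from the logging package.
import logging

logger = logging.getLogger('crawler')
logger.setLevel(logging.DEBUG)
handler = logging.StreamHandler()  # StreamHandler defaults to stderr
handler.setFormatter(logging.Formatter('[%(levelname)s] %(message)s'))
logger.addHandler(handler)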

src/crawler/xswang.com/fetch.py (62 changes)

@@ -4,50 +4,30 @@
 """
 Download raw html content as `.html` files.

-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-userAgent = (  # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-def httpRequest(fileUrl: str, fileName: str) -> bool:  # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30,  # timeout -> 30s
-            headers = {
-                'user-agent': userAgent,  # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300):  # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'wb') as fileObj:  # save html content
-            fileObj.write(request.content)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'https://www.xswang.com/book/56718/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile):  # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1)  # avoid being blocked by the server
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch

+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        yield {
+            'url': 'https://www.xswang.com/book/56718/%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }

+logger.warning('Fetch html of `xswang.com`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
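`utils.htmlFetch` absorbs the whole download loop but lives outside this diff. Judging from the call site (an iterable of `{'url', 'file'}` dicts plus `proxy`, `thread`, and `delay` keywords), a minimal sketch might wrap the removed per-page logic in a thread pool:

# Hypothetical sketch of the shared htmlFetch helper (not shown in this
# commit); reuses the removed download logic inside a thread pool.
import time
import logging
import requests
from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger('crawler')

def fetchPage(page: dict, proxy: str, delay: float) -> None:
    proxies = {'http': proxy, 'https': proxy} if proxy else None
    request = requests.get(page['url'], timeout = 30, proxies = proxies)
    if request.status_code not in range(200, 300):  # expect http 2xx
        logger.error('Page request failed -> %s' % page['url'])
    else:
        with open(page['file'], 'wb') as fileObj:  # save html content
            fileObj.write(request.content)
        logger.info('Page request success -> %s' % page['url'])
    time.sleep(delay)  # per-worker delay to avoid being blocked

def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1.0) -> None:
    with ThreadPoolExecutor(max_workers = thread) as executor:
        for page in pages:  # each page is a {'url': ..., 'file': ...} dict
            executor.submit(fetchPage, page, proxy, delay)

Under this sketch, the defaults set in crawler.sh (`THREAD=1`, `DELAY=1`) reproduce the old serial behavior of one request per second.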
