Browse Source

update: enhance crawler of `xswang.com`

master
Dnomd343 2 years ago
parent
commit
3698f50c97
  1. 29
      src/crawler/xswang.com/catalog.py
  2. 6
      src/crawler/xswang.com/check.sh
  3. 7
      src/crawler/xswang.com/crawler.sh
  4. 6
      src/crawler/xswang.com/extract.py
  5. 54
      src/crawler/xswang.com/fetch.py

29
src/crawler/xswang.com/catalog.py

@@ -8,39 +8,28 @@ Fetch catalog and output as JSON format.
""" """
import re import re
import sys
import json import json
import requests sys.path.append('..')
from utils import logger
from utils import httpRequest
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
userAgent = ( # default user agent
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)
def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
def httpRequest(url: str) -> str: # fetch raw html content
request = requests.get(url, headers = {
'user-agent': userAgent, # with fake user-agent
'accept-encoding': 'gzip, deflate', # allow content compress
})
if request.status_code not in range(200, 300): # http status code 2xx
raise RuntimeError('Http request failed')
return request.text
def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
catalog = {} catalog = {}
html = BeautifulSoup(rawHtml, 'lxml') html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
for item in [x.select('a')[0] for x in html.select('dd')]: for item in [x.select('a')[0] for x in html.select('dd')]:
title = re.search(r'^(第\d+章)(.*)', item.text.strip())
pageId = item.attrs['href'].replace('/book/56718/', '').replace('.html', '') pageId = item.attrs['href'].replace('/book/56718/', '').replace('.html', '')
catalog[item.text.strip()] = pageId catalog['%s %s' % (title[1], title[2].strip())] = pageId
catalog = sorted(catalog.items(), key = lambda d: int( catalog = sorted(catalog.items(), key = lambda d: int(
re.search(r'^第(\d+)章', d[0])[1] # sort by chapter re.search(r'^第(\d+)章', d[0])[1] # sort by chapter
)) ))
return {x[0]: x[1] for x in catalog} # formatted output return {x[0]: x[1] for x in catalog} # formatted output
logger.warning('Fetch catalog of `xswang.com`')
print(json.dumps( print(json.dumps(
extractCatalog(httpRequest('https://www.xswang.com/book/56718/')) extractCatalog(httpRequest('https://www.xswang.com/book/56718/'))
)) ))

6
src/crawler/xswang.com/check.sh

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
cd "$(dirname "$0")"
diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)

7
src/crawler/xswang.com/crawler.sh

@@ -1,8 +1,11 @@
#!/usr/bin/env bash #!/usr/bin/env bash
cd `dirname $0` cd "$(dirname "$0")"
mkdir -p ./data/html/ mkdir -p ./data/html/
[ -z "${DELAY}" ] && DELAY=1
[ -z "${THREAD}" ] && THREAD=1
python3 catalog.py > ./data/catalog.json python3 catalog.py > ./data/catalog.json
python3 fetch.py ./data/catalog.json ./data/html/ python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json

6
src/crawler/xswang.com/extract.py

@@ -11,7 +11,8 @@ import os
import re import re
import sys import sys
import json import json
from logger import logger sys.path.append('..')
from utils import logger
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -32,8 +33,8 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
result = {} result = {}
logger.warning('Extract info of `xswang.com`')
catalog = json.loads(open(sys.argv[1]).read()) # load catalog catalog = json.loads(open(sys.argv[1]).read()) # load catalog
for chapterName, chapterId in catalog.items(): # traverse all chapters for chapterName, chapterId in catalog.items(): # traverse all chapters
logger.info('Analyse chapter `%s`' % chapterId) logger.info('Analyse chapter `%s`' % chapterId)
htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId) htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
@@ -41,5 +42,4 @@ for chapterName, chapterId in catalog.items(): # traverse all chapters
if chapterName != info['title']: if chapterName != info['title']:
logger.error('Title error -> %s' % info['title']) logger.error('Title error -> %s' % info['title'])
result[chapterName] = info['content'] result[chapterName] = info['content']
print(json.dumps(result)) print(json.dumps(result))

54
src/crawler/xswang.com/fetch.py

@@ -4,50 +4,30 @@
""" """
Download raw html content as `.html` files. Download raw html content as `.html` files.
USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
""" """
import os import os
import sys import sys
import json import json
import time sys.path.append('..')
import requests from utils import logger
from logger import logger from utils import htmlFetch
userAgent = ( # default user agent
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)
def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content def loadChapter():
try: catalog = json.loads(open(sys.argv[1]).read()) # load catalog
logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName)) for _, chapterId in catalog.items(): # traverse all chapters
request = requests.get(fileUrl, timeout = 30, # timeout -> 30s yield {
headers = { 'url': 'https://www.xswang.com/book/56718/%s.html' % chapterId,
'user-agent': userAgent, # with fake user-agent 'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
} }
)
if request.status_code not in range(200, 300): # http status code 2xx
logger.warning('Http request failed -> `%s`' % fileUrl)
return False
logger.debug('Http request success -> `%s`' % fileUrl)
with open(fileName, 'wb') as fileObj: # save html content
fileObj.write(request.content)
logger.debug('File save success -> `%s`' % fileName)
except:
return False
return True
catalog = json.loads(open(sys.argv[1]).read()) # load catalog logger.warning('Fetch html of `xswang.com`')
htmlFetch(
for _, chapterId in catalog.items(): # traverse all chapters loadChapter(),
pageUrl = 'https://www.xswang.com/book/56718/%s.html' % chapterId proxy = sys.argv[3],
pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId) thread = int(sys.argv[4]),
if httpRequest(pageUrl, pageFile): # save html content delay = float(sys.argv[5]),
logger.info('Page request success -> %s' % pageUrl) )
else:
logger.error('Page request failed -> %s' % pageUrl)
time.sleep(1) # avoid being blocked by the server

Loading…
Cancel
Save