
update: enhance crawler of `108shu.com`

Dnomd343 committed 2 years ago (branch master, commit f9ae7908e1)
Changed files:
  1. src/crawler/108shu.com/catalog.py (29 lines changed)
  2. src/crawler/108shu.com/check.sh (6 lines changed)
  3. src/crawler/108shu.com/crawler.sh (18 lines changed)
  4. src/crawler/108shu.com/extract.py (13 lines changed)
  5. src/crawler/108shu.com/fetch.py (60 lines changed)

src/crawler/108shu.com/catalog.py (29 lines changed)

@@ -8,31 +8,17 @@ Fetch catalog and output as JSON format.
 """
 import re
 import sys
 import json
-import requests
-from logger import logger
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
     items = html.select('div[class="section-box"]')[1]
     for item in items.select('a'):
         name = re.search(r'^(第\d+章)(.*)$', item.text)
@@ -45,9 +31,10 @@ def fetchCatalog(pageNum: int) -> dict: # fetch all catalog
     catalog = {}
     for pageId in range(1, pageNum + 1): # traverse all pages
         pageUrl = 'http://www.108shu.com/book/54247/index_%d.html' % pageId
-        logger.info('Page: %d -> `%s`' % (pageId, pageUrl))
+        logger.info('Catalog page -> %d' % pageId)
         catalog.update(extractCatalog(httpRequest(pageUrl)))
     return catalog
+logger.warning('Fetch catalog of `108shu.com`')
 print(json.dumps(fetchCatalog(45)))
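Both the user-agent constant and the local `httpRequest` wrapper disappear from catalog.py in favour of a shared helper imported from the crawler-level `utils` package, and `extractCatalog` now receives the raw bytes that helper returns and decodes them itself. The `utils` module is not part of this diff (and the empty string crawler.sh now passes to catalog.py hints that the real helper also accepts a proxy argument, omitted here), so the following is only a minimal sketch, assuming it keeps the behaviour of the deleted local version but returns `request.content` instead of `request.text`:

# sketch of the assumed src/crawler/utils.py helper (not shown in this commit)
import requests

userAgent = ( # same fake user-agent the old per-script code embedded
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)

def httpRequest(url: str) -> bytes: # fetch raw html content as bytes
    request = requests.get(url, headers = {
        'user-agent': userAgent, # with fake user-agent
        'accept-encoding': 'gzip, deflate', # allow content compress
    })
    if request.status_code not in range(200, 300): # http status code 2xx
        raise RuntimeError('Http request failed')
    return request.content # raw bytes, decoded by the caller (extractCatalog)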

src/crawler/108shu.com/check.sh (new file, 6 lines)

@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+cd "$(dirname "$0")"
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
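The new check.sh is a consistency check against the previously archived crawl: `xz -cdk` streams the decompressed `.json.xz` snapshots from `./archive/`, `jq .` reformats both sides into a canonical layout, and `diff` compares them with the freshly generated files under `./data/`, so a run that prints nothing means the new crawl matches the archive.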

src/crawler/108shu.com/crawler.sh (18 lines changed)

@@ -1,17 +1,11 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"

 mkdir -p ./data/html/
-python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
-python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
-cd ./data/
-xz -k9 catalog.json
-tar cJf html.tar.xz html/
-xz -k9 xxrs.json
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1

-mkdir -p ../archive/
-mv *.xz ../archive/
-cd ../
+python3 catalog.py "" > ./data/catalog.json
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
+python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
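crawler.sh is now parameterised through environment variables rather than fixed behaviour: `THREAD` and `DELAY` default to 1 when unset and are forwarded to fetch.py together with `PROXY` (catalog.py gets an empty proxy argument), so an invocation along the lines of `THREAD=4 DELAY=0.5 ./crawler.sh` is plausible, with these particular values chosen only for illustration. The xz/tar archiving steps no longer run from this script, which lines up with the separate check.sh comparing fresh output against `./archive/`.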

src/crawler/108shu.com/extract.py (13 lines changed)

@@ -11,13 +11,13 @@ import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
 def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     html = BeautifulSoup(rawHtml, 'lxml')
     content = [x.text.strip() for x in html.select('div[class="content"]')[0].select('p')]
     title = re.search(r'^(第\d+章)(.*)$', html.select('h1')[0].text)
     return {
         'title': '%s %s' % (title[1], title[2].strip()),
@@ -25,9 +25,9 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     }
-def combinePage(chapterId: str) -> dict: # combine sub pages
-    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % chapterId)).read())
-    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % chapterId)).read())
+def combinePage(pageId: str) -> dict: # combine sub pages
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % pageId)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % pageId)).read())
     if page_1['title'] != page_2['title']:
         logger.error('Title error -> `%s`' % page_1['title'])
     return {
@@ -37,13 +37,12 @@ def combinePage(chapterId: str) -> dict: # combine sub pages
 result = {}
+logger.warning('Extract info of `108shu.com`')
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for chapterName, chapterId in catalog.items(): # traverse all chapters
-    logger.info('Analyse chapter `%s`' % chapterId)
     info = combinePage(chapterId)
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
 print(json.dumps(result))
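extract.py (and fetch.py below) drive their loops from the catalog.json produced by catalog.py, treating it as a flat JSON object that maps the full chapter title to the chapter id used in page URLs and file names. The real titles and ids are not part of this diff; an illustration with made-up entries:

# illustrative shape of ./data/catalog.json (titles and ids below are fabricated)
catalog = {
    '第1章 某某某': '100001', # chapter title -> chapter id
    '第2章 某某某': '100002',
}
# fetch.py then saves two sub-pages per chapter, e.g. for id 100001:
#   http://www.108shu.com/book/54247/100001_1.html -> <OUTPUT_DIR>/100001-1.html
#   http://www.108shu.com/book/54247/100001_2.html -> <OUTPUT_DIR>/100001-2.html
# and extract.py recombines <id>-1.html / <id>-2.html via combinePage().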

src/crawler/108shu.com/fetch.py (60 lines changed)

@@ -4,51 +4,31 @@
 """
 Download raw html content as `.html` files.
-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read()) # load catalog
+    for _, chapterId in catalog.items(): # traverse all chapters
+        for subPage in [1, 2]: # two sub-pages in one chapter
+            yield {
+                'url': 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage),
+                'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),
             }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'w') as fileObj: # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-for _, chapterId in catalog.items(): # traverse all chapters
-    for subPage in [1, 2]:
-        pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage)
-        pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage))
-        if httpRequest(pageUrl, pageFile): # save html content
-            logger.info('Page request success -> %s' % pageUrl)
-        else:
-            logger.error('Page request failed -> %s' % pageUrl)
-        time.sleep(1) # avoid being blocked by the server
+logger.warning('Fetch html of `108shu.com`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
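The per-page download loop, the local `httpRequest` saver and the fixed `time.sleep(1)` are all replaced by a call to `htmlFetch` from the shared `utils` package, fed by a generator of `{'url': ..., 'file': ...}` jobs plus the proxy, thread count and delay taken from the command line. `htmlFetch` itself is outside this diff; the sketch below is only a guess at its shape, assuming it keeps the old behaviour (fake user-agent, 30s timeout, per-request delay) and adds a thread pool and an optional proxy:

# sketch of the assumed utils.htmlFetch (not shown in this commit)
import time
import requests
from concurrent.futures import ThreadPoolExecutor

userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ...' # fake user-agent, truncated here

def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1) -> None:
    proxies = {'http': proxy, 'https': proxy} if proxy != '' else None
    def fetchPage(page: dict) -> None: # download one job and save html into its target file
        request = requests.get(page['url'], timeout = 30, proxies = proxies,
            headers = {'user-agent': userAgent})
        if request.status_code not in range(200, 300): # http status code 2xx
            raise RuntimeError('Http request failed -> `%s`' % page['url'])
        with open(page['file'], 'w') as fileObj: # save html content
            fileObj.write(request.text)
        time.sleep(delay) # avoid being blocked by the server
    with ThreadPoolExecutor(max_workers = thread) as executor:
        list(executor.map(fetchPage, pages)) # consume the iterator so worker errors surface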
