Browse Source

update: enhance crawler of `108shu.com`

master
Dnomd343 2 years ago
parent
commit
f9ae7908e1
  1. 29
      src/crawler/108shu.com/catalog.py
  2. 6
      src/crawler/108shu.com/check.sh
  3. 18
      src/crawler/108shu.com/crawler.sh
  4. 13
      src/crawler/108shu.com/extract.py
  5. 60
      src/crawler/108shu.com/fetch.py

29
src/crawler/108shu.com/catalog.py

@ -8,31 +8,17 @@ Fetch catalog and output as JSON format.
""" """
import re import re
import sys
import json import json
import requests sys.path.append('..')
from logger import logger from utils import logger
from utils import httpRequest
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
userAgent = ( # default user agent
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)
def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
def httpRequest(url: str) -> str: # fetch raw html content
request = requests.get(url, headers = {
'user-agent': userAgent, # with fake user-agent
'accept-encoding': 'gzip, deflate', # allow content compress
})
if request.status_code not in range(200, 300): # http status code 2xx
raise RuntimeError('Http request failed')
return request.text
def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
catalog = {} catalog = {}
html = BeautifulSoup(rawHtml, 'lxml') html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
items = html.select('div[class="section-box"]')[1] items = html.select('div[class="section-box"]')[1]
for item in items.select('a'): for item in items.select('a'):
name = re.search(r'^(第\d+章)(.*)$', item.text) name = re.search(r'^(第\d+章)(.*)$', item.text)
@ -45,9 +31,10 @@ def fetchCatalog(pageNum: int) -> dict: # fetch all catalog
catalog = {} catalog = {}
for pageId in range(1, pageNum + 1): # traverse all pages for pageId in range(1, pageNum + 1): # traverse all pages
pageUrl = 'http://www.108shu.com/book/54247/index_%d.html' % pageId pageUrl = 'http://www.108shu.com/book/54247/index_%d.html' % pageId
logger.info('Page: %d -> `%s`' % (pageId, pageUrl)) logger.info('Catalog page -> %d' % pageId)
catalog.update(extractCatalog(httpRequest(pageUrl))) catalog.update(extractCatalog(httpRequest(pageUrl)))
return catalog return catalog
logger.warning('Fetch catalog of `108shu.com`')
print(json.dumps(fetchCatalog(45))) print(json.dumps(fetchCatalog(45)))

6
src/crawler/108shu.com/check.sh

@ -0,0 +1,6 @@
#!/usr/bin/env bash

# Compare the crawled output under ./data/ with the xz-compressed copies
# under ./archive/. Both sides are passed through `jq .` first so that only
# differences in the JSON content show up, not differences in formatting.

# All paths below are relative to this script's directory; stop here if the
# directory cannot be entered instead of diffing files in the wrong place.
cd "$(dirname "$0")" || exit 1

# `xz -cdk` decompresses to stdout and keeps the archive file untouched.
diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)

18
src/crawler/108shu.com/crawler.sh

@ -1,17 +1,11 @@
#!/usr/bin/env bash #!/usr/bin/env bash
cd `dirname $0` cd "$(dirname "$0")"
mkdir -p ./data/html/ mkdir -p ./data/html/
python3 catalog.py > ./data/catalog.json [ -z "${DELAY}" ] && DELAY=1
python3 fetch.py ./data/catalog.json ./data/html/ [ -z "${THREAD}" ] && THREAD=1
python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
cd ./data/
xz -k9 catalog.json
tar cJf html.tar.xz html/
xz -k9 xxrs.json
mkdir -p ../archive/ python3 catalog.py "" > ./data/catalog.json
mv *.xz ../archive/ python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
cd ../ python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json

13
src/crawler/108shu.com/extract.py

@ -11,13 +11,13 @@ import os
import re import re
import sys import sys
import json import json
from logger import logger sys.path.append('..')
from utils import logger
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
def splitHtml(rawHtml: str) -> dict: # extract from raw html content def splitHtml(rawHtml: str) -> dict: # extract from raw html content
html = BeautifulSoup(rawHtml, 'lxml') html = BeautifulSoup(rawHtml, 'lxml')
content = [x.text.strip() for x in html.select('div[class="content"]')[0].select('p')]
title = re.search(r'^(第\d+章)(.*)$', html.select('h1')[0].text) title = re.search(r'^(第\d+章)(.*)$', html.select('h1')[0].text)
return { return {
'title': '%s %s' % (title[1], title[2].strip()), 'title': '%s %s' % (title[1], title[2].strip()),
@ -25,9 +25,9 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
} }
def combinePage(chapterId: str) -> dict: # combine sub pages def combinePage(pageId: str) -> dict: # combine sub pages
page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % chapterId)).read()) page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % pageId)).read())
page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % chapterId)).read()) page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % pageId)).read())
if page_1['title'] != page_2['title']: if page_1['title'] != page_2['title']:
logger.error('Title error -> `%s`' % page_1['title']) logger.error('Title error -> `%s`' % page_1['title'])
return { return {
@ -37,13 +37,12 @@ def combinePage(chapterId: str) -> dict: # combine sub pages
result = {} result = {}
logger.warning('Extract info of `108shu.com`')
catalog = json.loads(open(sys.argv[1]).read()) # load catalog catalog = json.loads(open(sys.argv[1]).read()) # load catalog
for chapterName, chapterId in catalog.items(): # traverse all chapters for chapterName, chapterId in catalog.items(): # traverse all chapters
logger.info('Analyse chapter `%s`' % chapterId) logger.info('Analyse chapter `%s`' % chapterId)
info = combinePage(chapterId) info = combinePage(chapterId)
if chapterName != info['title']: if chapterName != info['title']:
logger.error('Title error -> %s' % info['title']) logger.error('Title error -> %s' % info['title'])
result[chapterName] = info['content'] result[chapterName] = info['content']
print(json.dumps(result)) print(json.dumps(result))

60
src/crawler/108shu.com/fetch.py

@ -4,51 +4,31 @@
""" """
Download raw html content as `.html` files. Download raw html content as `.html` files.
USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
""" """
import os import os
import sys import sys
import json import json
import time sys.path.append('..')
import requests from utils import logger
from logger import logger from utils import htmlFetch
userAgent = ( # default user agent
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' def loadChapter():
'AppleWebKit/537.36 (KHTML, like Gecko) ' catalog = json.loads(open(sys.argv[1]).read()) # load catalog
'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47' for _, chapterId in catalog.items(): # traverse all chapters
) for subPage in [1, 2]: # two sub-pages in one chapter
yield {
'url': 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage),
def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content 'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),
try:
logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
headers = {
'user-agent': userAgent, # with fake user-agent
} }
)
if request.status_code not in range(200, 300): # http status code 2xx
logger.warning('Http request failed -> `%s`' % fileUrl)
return False
logger.debug('Http request success -> `%s`' % fileUrl)
with open(fileName, 'w') as fileObj: # save html content
fileObj.write(request.text)
logger.debug('File save success -> `%s`' % fileName)
except:
return False
return True
catalog = json.loads(open(sys.argv[1]).read()) # load catalog
for _, chapterId in catalog.items(): # traverse all chapters logger.warning('Fetch html of `108shu.com`')
for subPage in [1, 2]: htmlFetch(
pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage) loadChapter(),
pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)) proxy = sys.argv[3],
if httpRequest(pageUrl, pageFile): # save html content thread = int(sys.argv[4]),
logger.info('Page request success -> %s' % pageUrl) delay = float(sys.argv[5]),
else: )
logger.error('Page request failed -> %s' % pageUrl)
time.sleep(1) # avoid being blocked by the server

Loading…
Cancel
Save