update: enhance crawler of `108shu.com`

4 years ago · f9ae7908e1
5 changed files with 46 additions and 80 deletions
--- a/src/crawler/108shu.com/catalog.py
+++ b/src/crawler/108shu.com/catalog.py
@ -8,31 +8,17 @@ Fetch catalog and output as JSON format.
 """
 import re
 import sys
 import json
-import requests
+sys.path.append('..')
-from logger import logger
+from utils import logger
 from utils import httpRequest
 from bs4 import BeautifulSoup
 userAgent = (  # default user agent
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
 )
-
+def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
 def httpRequest(url: str) -> str:  # fetch raw html content
    """
    Send an HTTP GET to `url` and return the response body as text.

    The request carries the module-level `userAgent` string and advertises
    gzip/deflate support. A RuntimeError is raised when the status code is
    outside the 200-299 range.
    """
    # NOTE(review): despite its name, `request` holds the value returned by
    # `requests.get`, i.e. the response.
    # NOTE(review): no `timeout` argument is passed -- confirm that a stalled
    # server cannot block the crawl indefinitely.
    request = requests.get(url, headers = {
        'user-agent': userAgent,  # present a browser-like user agent
        'accept-encoding': 'gzip, deflate',  # allow compressed responses
    })
    if request.status_code not in range(200, 300):  # accept only http 2xx
        raise RuntimeError('Http request failed')
    return request.text
 def extractCatalog(rawHtml: str) -> dict:  # extract catalog from html content
    catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
    items = html.select('div[class="section-box"]')[1]
    for item in items.select('a'):
        name = re.search(r'^(第\d+章)(.*)$', item.text)
@ -45,9 +31,10 @@ def fetchCatalog(pageNum: int) -> dict:  # fetch all catalog
    catalog = {}
    for pageId in range(1, pageNum + 1):  # traverse all pages
        pageUrl = 'http://www.108shu.com/book/54247/index_%d.html' % pageId
-        logger.info('Page: %d -> `%s`' % (pageId, pageUrl))
+        logger.info('Catalog page -> %d' % pageId)
        catalog.update(extractCatalog(httpRequest(pageUrl)))
    return catalog
 # Script entry: announce the run through the logger, then write the merged
 # catalog to stdout as a single JSON document.
 logger.warning('Fetch catalog of `108shu.com`')
 # 45 is passed as `pageNum`, so index pages 1..45 are traversed.
 # NOTE(review): the page count is hard-coded -- verify it still matches the site.
 print(json.dumps(fetchCatalog(45)))
--- a/src/crawler/108shu.com/check.sh
+++ b/src/crawler/108shu.com/check.sh
@ -0,0 +1,6 @@
 #!/usr/bin/env bash
 # Compare the archived crawl results with the freshly generated ones.
 # Both sides are passed through `jq .` so the comparison is made on
 # re-formatted JSON rather than on the raw file bytes.

 # Run from the directory that contains this script so the relative paths resolve.
 cd "$(dirname "$0")"

 # Left side: archive decompressed to stdout (-c), archive file kept (-k).
 # Right side: the current file under ./data/.
 diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
 diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
--- a/src/crawler/108shu.com/crawler.sh
+++ b/src/crawler/108shu.com/crawler.sh
@ -1,17 +1,11 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
 mkdir -p ./data/html/
-python3 catalog.py > ./data/catalog.json
+[ -z "${DELAY}" ] && DELAY=1
-python3 fetch.py ./data/catalog.json ./data/html/
+[ -z "${THREAD}" ] && THREAD=1
 python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
 cd ./data/
 xz -k9 catalog.json
 tar cJf html.tar.xz html/
 xz -k9 xxrs.json
-mkdir -p ../archive/
+python3 catalog.py "" > ./data/catalog.json
-mv *.xz ../archive/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
-cd ../
+python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
--- a/src/crawler/108shu.com/extract.py
+++ b/src/crawler/108shu.com/extract.py
@ -11,13 +11,13 @@ import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
 from utils import logger
 from bs4 import BeautifulSoup
 def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
    html = BeautifulSoup(rawHtml, 'lxml')
    content = [x.text.strip() for x in html.select('div[class="content"]')[0].select('p')]
    title = re.search(r'^(第\d+章)(.*)$', html.select('h1')[0].text)
    return {
        'title': '%s %s' % (title[1], title[2].strip()),
@ -25,9 +25,9 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
    }
-def combinePage(chapterId: str) -> dict:  # combine sub pages
+def combinePage(pageId: str) -> dict:  # combine sub pages
-    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % chapterId)).read())
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % pageId)).read())
-    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % chapterId)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % pageId)).read())
    if page_1['title'] != page_2['title']:
        logger.error('Title error -> `%s`' % page_1['title'])
    return {
@ -37,13 +37,12 @@ def combinePage(chapterId: str) -> dict:  # combine sub pages
 # Script entry: walk the catalog, merge each chapter's sub-pages and print
 # a JSON object that maps the catalog key to the chapter content.
 result = {}
 logger.warning('Extract info of `108shu.com`')
 # sys.argv[1] is the path of the catalog JSON file.
 # NOTE(review): the file object is never closed explicitly.
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for chapterName, chapterId in catalog.items():  # traverse all chapters
    logger.info('Analyse chapter `%s`' % chapterId)
    info = combinePage(chapterId)  # indexed below by 'title' and 'content'
    # A title mismatch is only logged; the content is still stored under the
    # catalog's own key.
    if chapterName != info['title']:
        logger.error('Title error -> %s' % info['title'])
    result[chapterName] = info['content']
 print(json.dumps(result))
--- a/src/crawler/108shu.com/fetch.py
+++ b/src/crawler/108shu.com/fetch.py
@ -4,51 +4,31 @@
 """
 Download raw html content as `.html` files.
-    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+    USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
 import sys
 import json
-import time
+sys.path.append('..')
-import requests
+from utils import logger
-from logger import logger
+from utils import htmlFetch
 userAgent = (  # default user agent
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
 )
-def httpRequest(fileUrl: str, fileName: str) -> bool:  # save html content
+def loadChapter():
-    try:
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
+    for _, chapterId in catalog.items():  # traverse all chapters
-        request = requests.get(fileUrl, timeout = 30,  # timeout -> 30s
+        for subPage in [1, 2]:  # two sub-pages in one chapter
-            headers = {
+            yield {
-                'user-agent': userAgent,  # with fake user-agent
+                'url': 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage),
                'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),
            }
        )
        if request.status_code not in range(200, 300):  # http status code 2xx
            logger.warning('Http request failed -> `%s`' % fileUrl)
            return False
        logger.debug('Http request success -> `%s`' % fileUrl)
        with open(fileName, 'w') as fileObj:  # save html content
            fileObj.write(request.text)
        logger.debug('File save success -> `%s`' % fileName)
    except:
        return False
    return True
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+logger.warning('Fetch html of `108shu.com`')
-
+htmlFetch(
-for _, chapterId in catalog.items():  # traverse all chapters
+    loadChapter(),
-    for subPage in [1, 2]:
+    proxy = sys.argv[3],
-        pageUrl = 'http://www.108shu.com/book/54247/%s_%d.html' % (chapterId, subPage)
+    thread = int(sys.argv[4]),
-        pageFile = os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage))
+    delay = float(sys.argv[5]),
-        if httpRequest(pageUrl, pageFile):  # save html content
+)
            logger.info('Page request success -> %s' % pageUrl)
        else:
            logger.error('Page request failed -> %s' % pageUrl)
        time.sleep(1)  # avoid being blocked by the server