
update: enhance crawler process

master
Dnomd343 2 years ago
commit ac588484f2
6 changed files:

1. src/crawler/utils/fetch.py (44 changes)
2. src/crawler/wxsy.net/catalog.py (2 changes)
3. src/crawler/wxsy.net/crawler.sh (6 changes)
4. src/crawler/wxsy.net/extract.py (3 changes)
5. src/crawler/wxsy.net/fetch.py (11 changes)
6. src/crawler/wxsy.net/release.py (3 changes)

src/crawler/utils/fetch.py (44 changes)

@@ -2,6 +2,7 @@
 # -*- coding: utf-8 -*-
 import requests
+from retry import retry
 from .logger import logger
 from concurrent import futures
 from concurrent.futures import ALL_COMPLETED
@@ -14,27 +15,34 @@ userAgent = (  # default user agent
 )
 
 
-def httpRequest(url: str) -> bytes:  # fetch raw html content
-    request = requests.get(url, timeout = 30, headers = {  # timeout -> 30s
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
+@retry(tries = 10, delay = 2, logger = None)
+def httpRequest(url: str, proxy: str = '') -> bytes:  # fetch raw html content
+    proxyStr = '' if proxy == '' else ' (via %s)' % proxy
+    logger.debug('Http request `%s`%s' % (url, proxyStr))
+    proxy = None if proxy == '' else proxy  # empty string -> None
+    request = requests.get(
+        url, timeout = 10,  # timeout -> 10s
+        proxies = {  # request via socks or http proxy
+            'http': proxy,
+            'https': proxy
+        },
+        headers = {
+            'user-agent': userAgent,  # with fake user-agent
+            'accept-encoding': 'gzip, deflate',  # allow content compress
+        }
+    )
     if request.status_code not in range(200, 300):  # http status code 2xx
         raise RuntimeError('Http request failed')
     return request.content
 
 
-def htmlSave(url: str, file: str) -> bool:  # save html content
+def htmlSave(url: str, file: str, proxy: str = '') -> bool:  # save html content
     logger.debug('Html fetch `%s` -> `%s`' % (url, file))
     try:
-        content = httpRequest(url)  # http request
+        content = httpRequest(url, proxy)  # http request
     except:
-        logger.debug('Html fetch retry -> `%s`' % url)
-        try:
-            content = httpRequest(url)  # retry
-        except:
-            logger.debug('Html fetch failed -> `%s`' % url)
-            return False  # request failed
+        logger.debug('Html fetch failed -> `%s`' % url)
+        return False  # request failed
     logger.debug('Html fetch success -> `%s`' % url)
     try:
         with open(file, 'wb') as fileObj:  # save html content
@@ -46,22 +54,24 @@ def htmlSave(url: str, file: str) -> bool:  # save html content
     return True
 
 
-def pageFetch(info: dict, delay: float):  # fetch html content into file
+def pageFetch(info: dict, delay: float, proxy: str = ''):  # fetch html content into file
     logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
-    if htmlSave(info['url'], info['file']):  # save html content
+    if htmlSave(info['url'], info['file'], proxy):  # save html content
         logger.info('Page fetch success -> `%s`' % info['url'])
     else:
         logger.error('Page fetch failed -> `%s`' % info['url'])
     time.sleep(delay)
 
 
-def htmlFetch(page, thread: int = 1, delay: float = 0):
+def htmlFetch(page, thread: int = 1, delay: float = 1, proxy: str = ''):  # fetch html with generator
     logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
+    if proxy != '':
+        logger.info('Html fetch proxy -> `%s`' % proxy)
     threadPool = ThreadPoolExecutor(max_workers = thread)
     threads = []
     while True:
         try:
-            threads.append(threadPool.submit(pageFetch, next(page), delay))
+            threads.append(threadPool.submit(pageFetch, next(page), delay, proxy))
         except StopIteration:
             break
     futures.wait(threads, return_when = ALL_COMPLETED)
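With this change, htmlFetch drives a generator of {'url': ..., 'file': ...} dicts through a thread pool and forwards an optional proxy URL straight into the requests proxies mapping. A minimal sketch of calling the new signature (the chapter ids, URL layout and socks5 address are placeholders, not taken from this repository; socks proxies also need the requests[socks] extra installed):

import sys
sys.path.append('..')                 # as in the site scripts, so `utils` resolves
from utils import htmlFetch

def loadChapter():                    # yield one {'url': ..., 'file': ...} dict per page
    for cid in ['1001', '1002']:      # hypothetical chapter ids
        yield {
            'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % cid,  # assumed URL layout
            'file': './data/html/%s.html' % cid,
        }

htmlFetch(
    loadChapter(),
    thread = 2,                         # two download workers
    delay = 1.5,                        # per-page sleep in seconds
    proxy = 'socks5://127.0.0.1:1080',  # optional; '' (the default) disables the proxy
)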

src/crawler/wxsy.net/catalog.py (2 changes)

@@ -28,7 +28,7 @@ def extractCatalog(rawHtml: bytes) -> dict:  # extract catalog from html content
     return {x[0]: x[1] for x in catalog}  # formatted output
 
 
-logger.info('Fetch catalog of `wxsy.net`')
+logger.warning('Fetch catalog of `wxsy.net`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
 ))

src/crawler/wxsy.net/crawler.sh (6 changes)

@@ -4,7 +4,11 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/
 
+[ -z ${PROXY} ] && PROXY=
+[ -z ${THREAD} ] && THREAD=1
+[ -z ${DELAY} ] && DELAY=1
+
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ ./data/json
 python3 release.py ./data/catalog.json ./data/json/ > ./data/xxrs.json
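All three environment variables are optional: PROXY defaults to empty (direct connection) and THREAD and DELAY both default to 1, so a bare ./crawler.sh behaves as before, while an invocation along the lines of PROXY=socks5://127.0.0.1:1080 THREAD=4 DELAY=2 ./crawler.sh (placeholder values) would route the fetch step through a local socks proxy with four workers and a two-second per-page delay.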

src/crawler/wxsy.net/extract.py (3 changes)

@@ -11,7 +11,8 @@ import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup
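Both extract.py and release.py now take the shared logger from the src/crawler/utils package one level up instead of a per-site logger module, with sys.path.append('..') making that package importable when the script runs from inside src/crawler/wxsy.net/. A sketch of the re-exports this relies on, assuming utils/__init__.py is laid out along these lines:

# src/crawler/utils/__init__.py (assumed layout)
from .logger import logger                  # shared logger instance
from .fetch import httpRequest, htmlFetch   # http helpers changed in this commit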

src/crawler/wxsy.net/fetch.py (11 changes)

@@ -4,13 +4,12 @@
 """
 Download raw html content as `.html` files.
-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
 import sys
 import json
-import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
@@ -25,4 +24,10 @@ def loadChapter():
     }
 
 
-htmlFetch(loadChapter(), 2)
+logger.warning('Fetch html of `wxsy.net`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
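Note that fetch.py now reads all three new arguments unconditionally, so when it is run outside crawler.sh every position must be supplied, e.g. python3 fetch.py ./data/catalog.json ./data/html/ '' 4 1.5 (placeholder thread and delay values), with an empty third argument meaning no proxy, mirroring how crawler.sh passes "${PROXY}" ${THREAD} ${DELAY}.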

src/crawler/wxsy.net/release.py (3 changes)

@@ -10,7 +10,8 @@ Combine all chapters from json files.
 import os
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 
 
 def loadData(catalog: dict) -> dict:  # load data from json files
