
feat: multi thread fetch

master · Dnomd343 · 2 years ago · commit 926508d704
3 changed files:

 src/crawler/utils/__init__.py |  2
 src/crawler/utils/fetch.py    | 27
 src/crawler/wxsy.net/fetch.py | 28

src/crawler/utils/__init__.py

@@ -2,5 +2,7 @@
 # -*- coding: utf-8 -*-
 from .logger import logger
+from .fetch import htmlSave
 from .fetch import htmlFetch
 from .fetch import httpRequest
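
With the re-export in place, caller scripts can pull both helpers straight from the package. A minimal import sketch, assuming the caller sits in a sibling directory of utils as the repo's crawler scripts do:

import sys
sys.path.append('..')            # make the sibling utils package importable
from utils import htmlSave       # fetch a single url into a file (the old htmlFetch)
from utils import htmlFetch      # new multi-thread driver over a page generator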

src/crawler/utils/fetch.py

@@ -3,6 +3,10 @@
 import requests
 from .logger import logger
+import time  # needed below: pageFetch sleeps between requests
+from concurrent import futures
+from concurrent.futures import ALL_COMPLETED
+from concurrent.futures import ThreadPoolExecutor
 
 userAgent = (  # default user agent
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
@@ -21,7 +25,7 @@ def httpRequest(url: str) -> bytes:  # fetch raw html content
     return request.content
 
-def htmlFetch(url: str, file: str) -> bool:  # save html content
+def htmlSave(url: str, file: str) -> bool:  # save html content
     logger.debug('Html fetch `%s` -> `%s`' % (url, file))
     try:
         content = httpRequest(url)  # http request
@@ -41,3 +45,25 @@ def htmlFetch(url: str, file: str) -> bool:  # save html content
         return False  # save failed
     logger.debug('Html save success -> `%s`' % file)
     return True
+
+
+def pageFetch(info: dict, delay: float):  # fetch html content into file
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlSave(info['url'], info['file']):  # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)  # pause before this worker takes the next page
+
+
+def htmlFetch(page, thread: int = 1, delay: float = 0):  # multi-thread page fetch
+    logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
+    threadPool = ThreadPoolExecutor(max_workers = thread)
+    threads = []
+    while True:
+        try:  # submit one pageFetch task per item from the generator
+            threads.append(threadPool.submit(pageFetch, next(page), delay))
+        except StopIteration:  # generator exhausted
+            break
+    futures.wait(threads, return_when = ALL_COMPLETED)  # block until every task finishes
+    logger.info('Html fetch complete')
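
The new htmlFetch drains the page generator up front, submitting one pageFetch task per page, then blocks on futures.wait until every worker has finished. A standalone sketch of the same submit-then-wait pattern (the work function and its values are illustrative, not part of this commit):

from concurrent import futures
from concurrent.futures import ALL_COMPLETED
from concurrent.futures import ThreadPoolExecutor

def work(n: int) -> int:  # stand-in for pageFetch
    return n * n

pool = ThreadPoolExecutor(max_workers = 2)        # two worker threads
tasks = [pool.submit(work, n) for n in range(8)]  # one future per job
futures.wait(tasks, return_when = ALL_COMPLETED)  # block until all complete
print([t.result() for t in tasks])                # [0, 1, 4, 9, 16, 25, 36, 49]

Note that time.sleep(delay) runs inside each worker after its page is saved, so delay throttles each thread's request rate rather than delaying task submission.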

src/crawler/wxsy.net/fetch.py

@@ -14,27 +14,15 @@ import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
-from concurrent.futures import ThreadPoolExecutor
 
 
-def pageFetch(info: dict, delay: float):
-    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
-    if htmlFetch(info['url'], info['file']):  # save html content
-        logger.info('Page fetch success -> `%s`' % info['url'])
-    else:
-        logger.error('Page fetch failed -> `%s`' % info['url'])
-    time.sleep(delay)
-
-
-pages = []
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-for _, chapterId in catalog.items():  # traverse all chapters
-    pages.append({
-        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
-        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
-    })
-with ThreadPoolExecutor(max_workers = 2) as pool:
-    for page in pages:
-        pool.submit(pageFetch, page, 5)
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        yield {
+            'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
+
+
+htmlFetch(loadChapter(), 2)
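
The script now hands htmlFetch a generator instead of a prebuilt list, and the threading moved into the shared helper. A usage sketch with hypothetical pages (loadPages, the example URL, and the /tmp paths are made up for illustration):

import sys
sys.path.append('..')
from utils import htmlFetch

def loadPages():  # yields the same {url, file} dict shape as loadChapter
    for n in range(3):
        yield {
            'url': 'https://example.com/page_%d.html' % n,
            'file': '/tmp/page_%d.html' % n,
        }

htmlFetch(loadPages(), thread = 2, delay = 0.5)  # 2 workers, 0.5s pause per page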
