
feat: thread pool demo

master
Dnomd343 committed 2 years ago
commit fb215b3398
1 changed file with 27 changes: src/crawler/wxsy.net/fetch.py

@@ -14,14 +14,27 @@ import time
 sys.path.append('..')
 
 from utils import logger
 from utils import htmlFetch
+from concurrent.futures import ThreadPoolExecutor
 
+
+def pageFetch(info: dict, delay: float):
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlFetch(info['url'], info['file']):  # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)
+
+
+pages = []
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if htmlFetch(pageUrl, pageFile):  # save html content
-        logger.info('Page request success -> `%s`' % pageUrl)
-    else:
-        logger.error('Page request failed -> `%s`' % pageUrl)
-    time.sleep(1)  # avoid being blocked by the server
+    pages.append({
+        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+    })
+
+with ThreadPoolExecutor(max_workers = 2) as pool:
+    for page in pages:
+        pool.submit(pageFetch, page, 5)
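For context on the pattern this commit introduces: concurrent.futures.ThreadPoolExecutor keeps at most max_workers threads running, queues every job passed to submit(), and the with-block waits for all submitted jobs before exiting. Combined with the time.sleep(delay) at the end of each job, this caps the crawler at two in-flight requests while pacing each worker between requests. Below is a minimal self-contained sketch of the same pattern; fetch_page and the example URLs are hypothetical stand-ins for the repo's htmlFetch helper and the real chapter pages.

from concurrent.futures import ThreadPoolExecutor
import time

def fetch_page(info: dict, delay: float) -> None:
    # Hypothetical stand-in for htmlFetch(): pretend to save info['url'] to info['file'].
    print('fetch %s -> %s' % (info['url'], info['file']))
    time.sleep(delay)  # keep this worker idle before its next job, pacing requests

pages = [{'url': 'https://example.com/read_%d.html' % i, 'file': '%d.html' % i}
         for i in range(6)]

with ThreadPoolExecutor(max_workers = 2) as pool:  # at most 2 downloads at once
    for page in pages:
        pool.submit(fetch_page, page, 0.5)  # queued jobs wait for a free worker
# leaving the with-block joins the pool, so all six jobs finish before this point

One caveat of fire-and-forget submit(): an exception raised inside a job is stored on the returned Future, and since the Future is discarded here it is never re-raised, which is presumably why pageFetch logs failures itself instead of raising.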
