feat: thread pool demo

Branch: master
Author: Dnomd343 · 2 years ago
Commit: fb215b3398
Changed file: src/crawler/wxsy.net/fetch.py · 27 changed lines (+20 −7)

@@ -14,14 +14,27 @@ import time
 sys.path.append('..')
 from utils import logger
 from utils import htmlFetch
+from concurrent.futures import ThreadPoolExecutor
+
+
+def pageFetch(info: dict, delay: float):
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlFetch(info['url'], info['file']):  # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)
+
+
+pages = []
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if htmlFetch(pageUrl, pageFile):  # save html content
-        logger.info('Page request success -> `%s`' % pageUrl)
-    else:
-        logger.error('Page request failed -> `%s`' % pageUrl)
-    time.sleep(1)  # avoid being blocked by the server
+    pages.append({
+        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+    })
+with ThreadPoolExecutor(max_workers = 2) as pool:
+    for page in pages:
+        pool.submit(pageFetch, page, 5)
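For reference, the core pattern in this commit is independent of the crawler: build the list of work items first, then drain it with a small thread pool. Below is a minimal, self-contained sketch of that pattern. It is not the repo's code: the utils helpers (htmlFetch, logger) are not part of this diff, so the sketch substitutes a plain urllib download and print logging; fetchPage, PAGES, and the example.com URLs are hypothetical stand-ins.

    import time
    from concurrent.futures import ThreadPoolExecutor
    from urllib.request import urlopen

    # Illustrative stand-ins for the crawler's catalog and output files.
    PAGES = [
        {'url': 'https://example.com/page/%d' % i, 'file': '%d.html' % i}
        for i in range(1, 6)
    ]


    def fetchPage(info: dict, delay: float) -> None:
        """Download one page; mirrors the shape of pageFetch in the commit."""
        try:
            with urlopen(info['url'], timeout=10) as rsp:
                data = rsp.read()
            with open(info['file'], 'wb') as fp:  # save html content
                fp.write(data)
            print('Page fetch success -> `%s`' % info['url'])
        except OSError as err:  # URLError is a subclass of OSError
            print('Page fetch failed -> `%s` (%s)' % (info['url'], err))
        time.sleep(delay)  # keep each worker rate-limited

    if __name__ == '__main__':
        # Two workers, as in the commit: at most two in-flight requests,
        # each sleeping after its download to avoid hammering the server.
        with ThreadPoolExecutor(max_workers=2) as pool:
            for page in PAGES:
                pool.submit(fetchPage, page, 5)
        # Leaving the `with` block calls pool.shutdown(wait=True), so the
        # script exits only after every submitted task has finished.

One design point worth noting: because the delay happens inside each worker rather than between submissions, max_workers = 2 with a 5-second delay caps the crawler at roughly two downloads per five seconds, bounding the request rate without a separate rate limiter.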
