|
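The change below replaces the sequential download loop with a `ThreadPoolExecutor`: chapter URLs and output paths are first collected into `pages`, then submitted as `pageFetch` jobs to a two-worker pool, each job sleeping 5 seconds after its request.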
@@ -14,14 +14,27 @@ import time
 sys.path.append('..')
 
 from utils import logger
 from utils import htmlFetch
+from concurrent.futures import ThreadPoolExecutor
+
+
+def pageFetch(info: dict, delay: float):
+    logger.debug('Page fetch: `%s` -> `%s`' % (info['url'], info['file']))
+    if htmlFetch(info['url'], info['file']):  # save html content
+        logger.info('Page fetch success -> `%s`' % info['url'])
+    else:
+        logger.error('Page fetch failed -> `%s`' % info['url'])
+    time.sleep(delay)
+
 
+pages = []
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
-    pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if htmlFetch(pageUrl, pageFile):  # save html content
-        logger.info('Page request success -> `%s`' % pageUrl)
-    else:
-        logger.error('Page request failed -> `%s`' % pageUrl)
-    time.sleep(1)  # avoid being blocked by the server
+    pages.append({
+        'url': 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId,
+        'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+    })
+
+with ThreadPoolExecutor(max_workers = 2) as pool:
+    for page in pages:
+        pool.submit(pageFetch, page, 5)
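For reference, a minimal self-contained sketch of the throttling pattern the new code relies on; `fakeFetch`, `job`, and the example URLs are placeholders invented for illustration, while the real script uses `utils.htmlFetch` and `pageFetch`:

import time
from concurrent.futures import ThreadPoolExecutor


def fakeFetch(url: str) -> bool:  # stand-in for utils.htmlFetch
    print('fetching %s' % url)
    return True


def job(url: str, delay: float):
    fakeFetch(url)
    time.sleep(delay)  # keep this worker idle before its next job


urls = ['https://example.com/page_%d.html' % i for i in range(6)]
with ThreadPoolExecutor(max_workers = 2) as pool:  # at most 2 fetches in flight
    for url in urls:
        pool.submit(job, url, 5)
# exiting the `with` block waits for all submitted jobs to complete

Since each worker sleeps after its fetch, throughput is capped at roughly max_workers / delay requests per second (2 / 5 = 0.4 req/s here), which is gentler on the server than the old loop's one request per second while still overlapping network latency across two connections.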