|
@ -3,6 +3,9 @@ |
|
|
|
|
|
|
|
|
import requests |
|
|
import requests |
|
|
from .logger import logger |
|
|
from .logger import logger |
|
|
|
|
|
from concurrent import futures |
|
|
|
|
|
from concurrent.futures import ALL_COMPLETED |
|
|
|
|
|
from concurrent.futures import ThreadPoolExecutor |
|
|
|
|
|
|
|
|
userAgent = ( # default user agent |
|
|
userAgent = ( # default user agent |
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' |
|
|
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' |
|
@ -21,7 +24,7 @@ def httpRequest(url: str) -> bytes: # fetch raw html content |
|
|
return request.content |
|
|
return request.content |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def htmlFetch(url: str, file: str) -> bool: # save html content |
|
|
def htmlSave(url: str, file: str) -> bool: # save html content |
|
|
logger.debug('Html fetch `%s` -> `%s`' % (url, file)) |
|
|
logger.debug('Html fetch `%s` -> `%s`' % (url, file)) |
|
|
try: |
|
|
try: |
|
|
content = httpRequest(url) # http request |
|
|
content = httpRequest(url) # http request |
|
@ -41,3 +44,25 @@ def htmlFetch(url: str, file: str) -> bool: # save html content |
|
|
return False # save failed |
|
|
return False # save failed |
|
|
logger.debug('Html save success -> `%s`' % file) |
|
|
logger.debug('Html save success -> `%s`' % file) |
|
|
return True |
|
|
return True |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def pageFetch(info: dict, delay: float):  # fetch html content into file
    """Download one page into its target file, log the outcome, then pause.

    :param info: mapping that provides the `url` to download and the `file`
        path the content is written to (both are read from this dict).
    :param delay: number of seconds to sleep after the attempt, regardless
        of whether the save succeeded.
    """
    url = info['url']
    file = info['file']
    logger.debug('Page fetch: `%s` -> `%s`' % (url, file))

    saved = htmlSave(url, file)  # save html content
    if not saved:
        logger.error('Page fetch failed -> `%s`' % url)
    else:
        logger.info('Page fetch success -> `%s`' % url)

    time.sleep(delay)  # throttle this worker before it takes the next page
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def htmlFetch(page, thread: int = 1, delay: float = 0):
    """Fetch every page description produced by `page` through a thread pool.

    :param page: iterable or iterator of dicts; each item is handed unchanged
        to `pageFetch` as its `info` argument.
    :param thread: maximum number of worker threads in the pool.
    :param delay: seconds each worker sleeps after a page, forwarded to
        `pageFetch`.

    The call blocks until every submitted task has finished.
    """
    logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))

    # The `with` block guarantees the pool is shut down (and its worker
    # threads released) even if iterating `page` raises part-way through.
    with ThreadPoolExecutor(max_workers = thread) as threadPool:
        # Plain iteration accepts lists and other iterables as well as
        # iterators, unlike a bare `next(page)` loop.
        threads = [threadPool.submit(pageFetch, info, delay) for info in page]
        futures.wait(threads, return_when = ALL_COMPLETED)

    # `futures.wait` never re-raises task errors, so surface them here
    # instead of letting a crashed worker go unnoticed.
    for task in threads:
        error = task.exception()
        if error is not None:
            logger.error('Page fetch task crashed -> %r' % error)

    logger.info('Html fetch complete')
|
|