| 
						
						
							
								
							
						
						
					 | 
				
				 | 
				
					@ -3,6 +3,9 @@ | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					import requests | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					from .logger import logger | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					from concurrent import futures | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					from concurrent.futures import ALL_COMPLETED | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					from concurrent.futures import ThreadPoolExecutor | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					userAgent = (  # default user agent | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' | 
				
			
			
		
	
	
		
			
				
					| 
						
						
						
							
								
							
						
					 | 
				
				 | 
				
					@ -21,7 +24,7 @@ def httpRequest(url: str) -> bytes:  # fetch raw html content | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					    return request.content | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					def htmlFetch(url: str, file: str) -> bool:  # save html content | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					def htmlSave(url: str, file: str) -> bool:  # save html content | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					    logger.debug('Html fetch `%s` -> `%s`' % (url, file)) | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					    try: | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					        content = httpRequest(url)  # http request | 
				
			
			
		
	
	
		
			
				
					| 
						
						
						
							
								
							
						
					 | 
				
				 | 
				
					@ -41,3 +44,25 @@ def htmlFetch(url: str, file: str) -> bool:  # save html content | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					        return False  # save failed | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					    logger.debug('Html save success -> `%s`' % file) | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					    return True | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					def pageFetch(info: dict, delay: float):  # fetch html content into file
					    """Fetch a single page into its file, log the outcome, then pause.

					    :param info: mapping read through its `url` and `file` entries
					    :param delay: seconds handed to `time.sleep` once the attempt is over
					    """
					    source, target = info['url'], info['file']
					    logger.debug('Page fetch: `%s` -> `%s`' % (source, target))
					    saved = htmlSave(source, target)  # save html content
					    if not saved:
					        logger.error('Page fetch failed -> `%s`' % source)
					    else:
					        logger.info('Page fetch success -> `%s`' % source)
					    time.sleep(delay)  # pause after both success and failure
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					
 | 
				
			
			
		
	
		
			
				
					 | 
					 | 
				
				 | 
				
					def htmlFetch(page, thread: int = 1, delay: float = 0):
					    """Run `pageFetch` for every item of `page` on a thread pool.

					    :param page: iterable or iterator whose items are passed to `pageFetch`
					    :param thread: `max_workers` of the thread pool
					    :param delay: forwarded unchanged to every `pageFetch` call
					    """
					    logger.info('Start html fetch process (thread = %d, delay = %f)' % (thread, delay))
					    # the context manager shuts the pool down even when iterating `page` raises
					    with ThreadPoolExecutor(max_workers=thread) as threadPool:
					        # plain iteration accepts lists as well as generators (next() needs an iterator)
					        threads = [threadPool.submit(pageFetch, info, delay) for info in page]
					        futures.wait(threads, return_when=ALL_COMPLETED)
					    for task in threads:
					        error = task.exception()  # every task is done, so this returns at once
					        if error is not None:  # report a crashed worker instead of dropping it
					            logger.error('Page fetch crashed -> %r' % (error,))
					    logger.info('Html fetch complete')
				
			
			
		
	
	
		
			
				
					| 
						
						
						
					 | 
				
				 | 
				
					
  |