Browse Source

update: http fetch module

master
Dnomd343 2 years ago
parent
commit
05b2f6235e
  1. 6
      src/crawler/utils/__init__.py
  2. 43
      src/crawler/utils/fetch.py
  3. 0
      src/crawler/utils/logger.py
  4. 30
      src/crawler/wxsy.net/catalog.py
  5. 38
      src/crawler/wxsy.net/fetch.py

6
src/crawler/utils/__init__.py

@@ -0,0 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from .logger import logger
from .fetch import htmlFetch
from .fetch import httpRequest

43
src/crawler/utils/fetch.py

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from .logger import logger
# Default desktop-browser user agent (Edge 106 on Windows 10); makes the
# crawler's requests look like they come from a regular browser.
userAgent = ' '.join([
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'AppleWebKit/537.36 (KHTML, like Gecko)',
    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47',
])
def httpRequest(url: str) -> bytes:  # fetch raw html content
    """Fetch the raw body of `url` with a fake browser user-agent.

    Uses a 30s timeout and allows gzip/deflate compression.

    Raises:
        RuntimeError: on any non-2xx HTTP status (status code included
            in the message so failures are diagnosable from the logs).

    Returns:
        The raw response body as bytes.
    """
    request = requests.get(url, timeout = 30, headers = {  # timeout -> 30s
        'user-agent': userAgent,  # with fake user-agent
        'accept-encoding': 'gzip, deflate',  # allow content compress
    })
    if request.status_code not in range(200, 300):  # http status code 2xx
        # include the status code — the bare 'Http request failed' message
        # gave no way to tell a 404 from a 503 when debugging the crawler
        raise RuntimeError('Http request failed (status %d)' % request.status_code)
    return request.content
def htmlFetch(url: str, file: str) -> bool:  # save html content
    """Download `url` and save the raw bytes into `file`.

    The HTTP request is retried once on failure; the result is written
    in binary mode so the bytes are stored exactly as received.

    Returns:
        True on success; False when both request attempts fail or the
        file cannot be written (failures are logged, never raised).
    """
    logger.debug('Html fetch `%s` -> `%s`' % (url, file))
    try:
        content = httpRequest(url)  # http request
    except Exception:  # was bare `except:` — let KeyboardInterrupt/SystemExit propagate
        logger.debug('Html fetch retry -> `%s`' % url)
        try:
            content = httpRequest(url)  # retry once
        except Exception:
            logger.debug('Html fetch failed -> `%s`' % url)
            return False  # request failed
    logger.debug('Html fetch success -> `%s`' % url)
    try:
        with open(file, 'wb') as fileObj:  # save html content
            fileObj.write(content)
    except OSError:  # disk-full / permission / bad-path errors only
        logger.debug('Html save failed -> `%s`' % file)
        return False  # save failed
    logger.debug('Html save success -> `%s`' % file)
    return True

0
src/crawler/logger.py → src/crawler/utils/logger.py

30
src/crawler/wxsy.net/catalog.py

@@ -8,32 +8,19 @@ Fetch catalog and output as JSON format.
"""
import re
import sys
import json
import requests
sys.path.append('..')
from utils import logger
from utils import httpRequest
from bs4 import BeautifulSoup
userAgent = ( # default user agent
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)
def httpRequest(url: str) -> str: # fetch raw html content
request = requests.get(url, headers = {
'user-agent': userAgent, # with fake user-agent
'accept-encoding': 'gzip, deflate', # allow content compress
})
if request.status_code not in range(200, 300): # http status code 2xx
raise RuntimeError('Http request failed')
return request.text
def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
catalog = {}
html = BeautifulSoup(rawHtml, 'lxml')
detail = html.select('div[class="pt-chapter-cont-detail full"]')[0]
for item in detail.select('a'):
html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
div = html.select('div[class="pt-chapter-cont-detail full"]')[0]
for item in div.select('a'):
catalog[item.attrs['title']] = re.search(r'/novel/57104/read_(\d+).html', item.attrs['href'])[1]
catalog = sorted(catalog.items(), key = lambda d: int(
re.search(r'^第(\d+)章', d[0])[1] # sort by chapter
@@ -41,6 +28,7 @@ def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
return {x[0]: x[1] for x in catalog} # formatted output
logger.info('Fetch catalog of `wxsy.net`')
print(json.dumps(
extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
))

38
src/crawler/wxsy.net/fetch.py

@@ -11,43 +11,17 @@ import os
import sys
import json
import time
import requests
from logger import logger
userAgent = ( # default user agent
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
)
def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
try:
logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
headers = {
'user-agent': userAgent, # with fake user-agent
}
)
if request.status_code not in range(200, 300): # http status code 2xx
logger.warning('Http request failed -> `%s`' % fileUrl)
return False
logger.debug('Http request success -> `%s`' % fileUrl)
with open(fileName, 'w') as fileObj: # save html content
fileObj.write(request.text)
logger.debug('File save success -> `%s`' % fileName)
except:
return False
return True
sys.path.append('..')
from utils import logger
from utils import htmlFetch
catalog = json.loads(open(sys.argv[1]).read()) # load catalog
for _, chapterId in catalog.items(): # traverse all chapters
pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
if httpRequest(pageUrl, pageFile): # save html content
logger.info('Page request success -> %s' % pageUrl)
if htmlFetch(pageUrl, pageFile): # save html content
logger.info('Page request success -> `%s`' % pageUrl)
else:
logger.error('Page request failed -> %s' % pageUrl)
logger.error('Page request failed -> `%s`' % pageUrl)
time.sleep(1) # avoid being blocked by the server

Loading…
Cancel
Save