
update: http fetch module

Branch: master
Author: Dnomd343, 2 years ago
Commit: 05b2f6235e
1. src/crawler/utils/__init__.py (+6)
2. src/crawler/utils/fetch.py (+43)
3. src/crawler/utils/logger.py (renamed, 0 changes)
4. src/crawler/wxsy.net/catalog.py (30 lines changed)
5. src/crawler/wxsy.net/fetch.py (38 lines changed)

src/crawler/utils/__init__.py (new file, +6)

@@ -0,0 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+from .logger import logger
+from .fetch import htmlFetch
+from .fetch import httpRequest
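
The package root re-exports the logger plus both fetch helpers, so the site-specific crawlers need only one import source. A minimal sketch of the calling side, assuming a script runs from a sibling directory of utils (the same sys.path.append('..') trick the wxsy.net scripts below rely on):

    import sys
    sys.path.append('..')  # make the utils package importable from src/crawler

    from utils import logger       # re-exported from utils/logger.py
    from utils import httpRequest  # re-exported from utils/fetch.py
    from utils import htmlFetch    # ditto

    logger.debug('utils package ready')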

src/crawler/utils/fetch.py (new file, +43)

@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import requests
+from .logger import logger
+
+userAgent = ( # default user agent
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+    'AppleWebKit/537.36 (KHTML, like Gecko) '
+    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
+)
+
+
+def httpRequest(url: str) -> bytes: # fetch raw html content
+    request = requests.get(url, timeout = 30, headers = { # timeout -> 30s
+        'user-agent': userAgent, # with fake user-agent
+        'accept-encoding': 'gzip, deflate', # allow content compress
+    })
+    if request.status_code not in range(200, 300): # http status code 2xx
+        raise RuntimeError('Http request failed')
+    return request.content
+
+
+def htmlFetch(url: str, file: str) -> bool: # save html content
+    logger.debug('Html fetch `%s` -> `%s`' % (url, file))
+    try:
+        content = httpRequest(url) # http request
+    except:
+        logger.debug('Html fetch retry -> `%s`' % url)
+        try:
+            content = httpRequest(url) # retry
+        except:
+            logger.debug('Html fetch failed -> `%s`' % url)
+            return False # request failed
+    logger.debug('Html fetch success -> `%s`' % url)
+    try:
+        with open(file, 'wb') as fileObj: # save html content
+            fileObj.write(content)
+    except:
+        logger.debug('Html save failed -> `%s`' % file)
+        return False # save failed
+    logger.debug('Html save success -> `%s`' % file)
+    return True
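
htmlFetch gives every page two chances: one initial request and one retry, with each failure logged at debug level before the function gives up and returns False. The same control flow written as a loop, purely as an illustration (htmlFetchLoop and its attempts parameter are not part of this commit):

    def htmlFetchLoop(url: str, file: str, attempts: int = 2) -> bool:
        for count in range(attempts):
            try:
                content = httpRequest(url)  # may raise on non-2xx or network error
                break  # fetched successfully -> stop retrying
            except Exception:
                logger.debug('Html fetch retry (%d) -> `%s`' % (count + 1, url))
        else:
            return False  # every attempt failed
        with open(file, 'wb') as fileObj:  # save html content
            fileObj.write(content)
        return True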

src/crawler/logger.py → src/crawler/utils/logger.py (renamed, 0 changes)

src/crawler/wxsy.net/catalog.py (30 lines changed)

@@ -8,32 +8,19 @@ Fetch catalog and output as JSON format.
 """
 import re
+import sys
 import json
-import requests
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
 
 
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text
-
-
-def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
+def extractCatalog(rawHtml: bytes) -> dict: # extract catalog from html content
     catalog = {}
-    html = BeautifulSoup(rawHtml, 'lxml')
-    detail = html.select('div[class="pt-chapter-cont-detail full"]')[0]
-    for item in detail.select('a'):
+    html = BeautifulSoup(str(rawHtml, encoding = 'utf-8'), 'lxml')
+    div = html.select('div[class="pt-chapter-cont-detail full"]')[0]
+    for item in div.select('a'):
         catalog[item.attrs['title']] = re.search(r'/novel/57104/read_(\d+).html', item.attrs['href'])[1]
     catalog = sorted(catalog.items(), key = lambda d: int(
         re.search(r'^第(\d+)章', d[0])[1] # sort by chapter

@@ -41,6 +28,7 @@ def extractCatalog(rawHtml: str) -> dict: # extract catalog from html content
     return {x[0]: x[1] for x in catalog} # formatted output
 
 
+logger.info('Fetch catalog of `wxsy.net`')
 print(json.dumps(
     extractCatalog(httpRequest('https://www.wxsy.net/novel/57104/'))
 ))
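
Since the shared httpRequest returns bytes rather than str, extractCatalog now decodes the payload before parsing. It maps chapter titles to chapter IDs, sorted by the 第N章 ("Chapter N") number in each title, so the script prints JSON shaped roughly like this (titles and IDs here are made-up placeholders, not real data from wxsy.net):

    {
        "第1章 ...": "2827538",
        "第2章 ...": "2827539",
        "第3章 ...": "2827540"
    }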

src/crawler/wxsy.net/fetch.py (38 lines changed)

@@ -11,43 +11,17 @@ import os
 import sys
 import json
 import time
-import requests
-from logger import logger
-
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'w') as fileObj: # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
 
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for _, chapterId in catalog.items(): # traverse all chapters
     pageUrl = 'https://www.wxsy.net/novel/57104/read_%s.html' % chapterId
     pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
+    if htmlFetch(pageUrl, pageFile): # save html content
+        logger.info('Page request success -> `%s`' % pageUrl)
     else:
-        logger.error('Page request failed -> %s' % pageUrl)
+        logger.error('Page request failed -> `%s`' % pageUrl)
     time.sleep(1) # avoid being blocked by the server
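
With the module split in place, fetch.py is just a driver: it reads the catalog JSON produced by catalog.py, saves each chapter page through the shared htmlFetch, and sleeps one second between requests to avoid being blocked. A hypothetical invocation (the file and directory names are examples only, not part of the commit):

    cd src/crawler/wxsy.net
    python3 catalog.py > catalog.json      # dump chapter title -> chapter id
    python3 fetch.py catalog.json ./html   # write one <id>.html per chapter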
