
update: enhance crawler of `m.wxsy.net`

master
Dnomd343 2 years ago
commit 20dbf4c3d1
Changed files:
  1. src/crawler/m.wxsy.net/catalog.py (42 changes)
  2. src/crawler/m.wxsy.net/crawler.sh (7 changes)
  3. src/crawler/m.wxsy.net/extract.py (16 changes)
  4. src/crawler/m.wxsy.net/fetch.py (59 changes)
  5. src/crawler/m.wxsy.net/release.py (9 changes)

src/crawler/m.wxsy.net/catalog.py (42 changes)

@@ -4,37 +4,17 @@
 """
 Fetch catalog and output as JSON format.

-USAGE: python3 catalog.py
+USAGE: python3 catalog.py [PROXY]
 """

 import re
 import sys
 import json
-import time
-import requests
-from logger import logger
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup
-
-basicUrl = 'https://m.wxsy.net/novel/57104/all.html'
-
-userAgent = (  # default user-agent
-    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/100.0.4896.79 Mobile Safari/537.36'
-)
-
-
-def httpRequest(url: str) -> str:  # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent,  # with fake user-agent
-        'accept-encoding': 'gzip, deflate',  # allow content compress
-    })
-    if request.status_code not in range(200, 300):  # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text

 def analysePage(rawHtml: str) -> list:  # extract catalog from html content
     analyseRet = []

@@ -51,10 +31,13 @@ def analysePage(rawHtml: str) -> list:  # extract catalog from html content
 def fetchCatalog(pageNum: int) -> list:  # fetch raw catalog
     catalog = []
     for pageIndex in range(1, pageNum + 1):  # traverse all pages (1 ~ pageNum)
-        logger.info('Page: %d' % pageIndex)
-        pageUrl = '%s?sort=1&page=%d' % (basicUrl, pageIndex)
-        catalog.append(analysePage(httpRequest(pageUrl)))
-        time.sleep(1)  # avoid being blocked by the server
+        logger.info('Catalog page -> %d' % pageIndex)
+        catalog.append(analysePage(
+            httpRequest(
+                'https://m.wxsy.net/novel/57104/all.html?sort=1&page=%d' % pageIndex,
+                proxy = sys.argv[1]
+            )
+        ))
     return catalog

@@ -70,5 +53,6 @@ def formatCatalog(rawCatalog: list) -> dict:
     return {x[0]: x[1] for x in catalog}  # formatted output

+logger.warning('Fetch catalog of `m.wxsy.net`')
 release = formatCatalog(fetchCatalog(18))  # 18 pages in total
 print(json.dumps(release))  # output as JSON format
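Note: the shared `utils` module that this commit switches to is not part of the diff. For orientation, here is a minimal sketch of what `utils.httpRequest` is assumed to look like, pieced together from the call sites above and the removed in-file helper; the real implementation in `src/crawler/utils` may differ.

    import requests

    userAgent = (  # fake mobile user-agent, same as the removed in-file helper
        'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.79 Mobile Safari/537.36'
    )

    def httpRequest(url: str, proxy: str = '') -> str:  # fetch raw html content
        proxies = {'http': proxy, 'https': proxy} if proxy else None  # optional proxy
        request = requests.get(url, timeout = 30, proxies = proxies, headers = {
            'user-agent': userAgent,
            'accept-encoding': 'gzip, deflate',  # allow content compress
        })
        if request.status_code not in range(200, 300):  # expect http 2xx
            raise RuntimeError('Http request failed')
        return request.text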

src/crawler/m.wxsy.net/crawler.sh (7 changes)

@@ -4,7 +4,10 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/

-python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+[ -z ${DELAY} ] && DELAY=1
+[ -z ${THREAD} ] && THREAD=1
+
+python3 catalog.py "${PROXY}" > ./data/catalog.json
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ ./data/json
 python3 release.py ./data/catalog.json ./data/json/ > ./data/xxrs.json
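With these defaults, a plain `bash crawler.sh` still runs single-threaded with a one-second delay and no proxy. An invocation such as `PROXY='http://127.0.0.1:1080' THREAD=4 DELAY=0.5 bash crawler.sh` (the proxy address and numbers are only illustrative) would route requests through a proxy and fetch pages in parallel.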

src/crawler/m.wxsy.net/extract.py (16 changes)

@@ -7,10 +7,12 @@ Extract data from raw html content.
 USAGE: python3 extract.py [CATALOG] [HTML_DIR] [OUTPUT_DIR]
 """

+import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup

@@ -19,7 +21,7 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
     script = body.select('script')[5].text  # js code with chapter info
     info = {
         'title': body.select('div[class="size18 w100 text-center lh100 pt30 pb15"]')[0].text.strip(),
-        'contents': [x.text.strip() for x in body.select('p[class="content_detail"]')],
+        'content': [x.text.strip() for x in body.select('p[class="content_detail"]')],
         'prePage': body.select('div[class="pt-prechapter"]')[0].a.attrs['href'],
         'nextPage': body.select('div[class="pt-nextchapter"]')[0].a.attrs['href'],
         'preId': re.search(r'window\.__PREVPAGE = "(\d*)"', script)[1],

@@ -35,8 +37,8 @@ def splitHtml(rawHtml: str) -> dict:  # extract from raw html content
 def combinePage(id: str) -> dict:  # combine sub pages
-    page_1 = splitHtml(open('%s/%s-1.html' % (sys.argv[2], id)).read())
-    page_2 = splitHtml(open('%s/%s-2.html' % (sys.argv[2], id)).read())
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % id)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % id)).read())

     # page info check
     if not page_1['index'] == '[1/2页]' or not page_2['index'] == '[2/2页]':

@@ -66,13 +68,13 @@ def combinePage(id: str) -> dict:  # combine sub pages
         'preId': page_1['preId'],
         'myId': page_1['myId'],
         'nextId': page_1['nextId'],
-        'contents': page_1['contents'] + page_2['contents']
+        'content': page_1['content'] + page_2['content']
     }

+logger.warning('Extract info of `m.wxsy.net`')
 catalog = json.loads(open(sys.argv[1]).read())  # load catalog
 for _, chapterId in catalog.items():  # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
-    with open('%s/%s.json' % (sys.argv[3], chapterId), 'w') as fileObj:
+    with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj:
         fileObj.write(json.dumps(combinePage(chapterId)))
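For orientation, each per-chapter JSON file written by extract.py now contains at least the fields visible in the hunks above, roughly of this shape (the title, ids and paragraphs are purely illustrative placeholders):

    {
        'title': '<chapter title>',
        'preId': '<previous chapter id>',
        'myId': '<this chapter id>',
        'nextId': '<next chapter id>',
        'content': ['<paragraph 1>', '<paragraph 2>']
    }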

src/crawler/m.wxsy.net/fetch.py (59 changes)

@@ -4,52 +4,31 @@
 """
 Download raw html content as `.html` files.

-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """

+import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-basicUrl = 'https://m.wxsy.net/novel/57104'
-
-userAgent = (  # default user-agent
-    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/100.0.4896.79 Mobile Safari/537.36'
-)
-
-
-def httpRequest(url: str, fileName: str) -> bool:  # save html content
-    try:
-        logger.debug('Http request `%s` -> %s' % (url, fileName))
-        request = requests.get(url, timeout = 30,  # timeout -> 30s
-            headers = {
-                'user-agent': userAgent,  # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300):  # http status code 2xx
-            logger.warning('Http request failed -> %s' % url)
-            return False
-        logger.debug('Http request success -> %s' % url)
-        with open(fileName, 'w') as fileObj:  # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> %s' % fileName)
-    except:
-        return False
-    return True
-
-
-catalog = json.loads(open(sys.argv[1]).read())  # load catalog
-for _, chapterId in catalog.items():  # traverse all chapters
-    for subPage in [1, 2]:  # two sub pages in one chapter
-        pageUrl = '%s/read_%s/%d.html' % (basicUrl, chapterId, subPage)
-        pageFile = '%s/%s-%d.html' % (sys.argv[2], chapterId, subPage)
-        if httpRequest(pageUrl, pageFile):  # save html content
-            logger.info('Page request success -> %s' % pageUrl)
-        else:
-            logger.error('Page request failed -> %s' % pageUrl)
-        time.sleep(1)  # avoid being blocked by the server
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read())  # load catalog
+    for _, chapterId in catalog.items():  # traverse all chapters
+        for subPage in [1, 2]:  # two sub pages in one chapter
+            yield {
+                'url': 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (chapterId, subPage),
+                'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),
+            }
+
+
+logger.warning('Fetch html of `m.wxsy.net`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
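As with `httpRequest`, the shared `utils.htmlFetch` helper is not shown in this commit. A minimal sketch of the behaviour assumed by the call above (consume a generator of url/file pairs, download through an optional proxy with a bounded worker pool, and pause `delay` seconds after each request); the real helper may differ.

    import time
    import requests
    from concurrent.futures import ThreadPoolExecutor

    def fetchPage(page: dict, proxy: str, delay: float) -> None:  # download one sub page
        proxies = {'http': proxy, 'https': proxy} if proxy else None  # optional proxy
        request = requests.get(page['url'], timeout = 30, proxies = proxies)
        if request.status_code not in range(200, 300):  # expect http 2xx
            raise RuntimeError('Http request failed -> %s' % page['url'])
        with open(page['file'], 'w') as fileObj:  # save html content
            fileObj.write(request.text)
        time.sleep(delay)  # avoid being blocked by the server

    def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1.0) -> None:
        with ThreadPoolExecutor(max_workers = thread) as executor:  # bounded worker pool
            for page in pages:
                executor.submit(fetchPage, page, proxy, delay)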

src/crawler/m.wxsy.net/release.py (9 changes)

@@ -7,16 +7,18 @@ Combine all chapters from json files.
 USAGE: python3 release.py [CATALOG] [JSON_DIR]
 """

+import os
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger


 def loadData(catalog: dict) -> dict:  # load data from json files
     data = {}
     for _, chapterId in catalog.items():
         data[chapterId] = json.loads(
-            open('%s/%s.json' % (sys.argv[2], chapterId)).read()  # read json content
+            open(os.path.join(sys.argv[2], '%s.json' % chapterId)).read()  # read json content
         )
     return data

@@ -59,8 +61,9 @@ def combine() -> dict:  # combine all chapters
     result = {}
     for _, info in data.items():  # combine contents
-        result[info['title']] = info['contents']
+        result[info['title']] = info['content']
     return result

+logger.warning('Release info of `m.wxsy.net`')
 print(json.dumps(combine()))
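The final `./data/xxrs.json` emitted by release.py is therefore a single JSON object mapping each chapter title to its list of paragraphs, roughly of this form (titles and text are illustrative placeholders):

    {
        '<chapter title 1>': ['<paragraph 1>', '<paragraph 2>'],
        '<chapter title 2>': ['<paragraph 1>']
    }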
