
update: enhance crawler of `m.wxsy.net`

master
Dnomd343 2 years ago
commit 20dbf4c3d1
  1. src/crawler/m.wxsy.net/catalog.py (42 lines changed)
  2. src/crawler/m.wxsy.net/crawler.sh (7 lines changed)
  3. src/crawler/m.wxsy.net/extract.py (16 lines changed)
  4. src/crawler/m.wxsy.net/fetch.py (63 lines changed)
  5. src/crawler/m.wxsy.net/release.py (9 lines changed)
  6. src/crawler/wxsy.net/fetch.py (2 lines changed)

src/crawler/m.wxsy.net/catalog.py (42 lines changed)

@@ -4,37 +4,17 @@
 """
 Fetch catalog and output as JSON format.
-USAGE: python3 catalog.py
+USAGE: python3 catalog.py [PROXY]
 """
-import sys
-sys.path.append('..')
 import re
+import sys
 import json
-import time
-import requests
-from logger import logger
+sys.path.append('..')
+from utils import logger
+from utils import httpRequest
 from bs4 import BeautifulSoup

-basicUrl = 'https://m.wxsy.net/novel/57104/all.html'
-userAgent = ( # default user-agent
-    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/100.0.4896.79 Mobile Safari/537.36'
-)
-
-def httpRequest(url: str) -> str: # fetch raw html content
-    request = requests.get(url, headers = {
-        'user-agent': userAgent, # with fake user-agent
-        'accept-encoding': 'gzip, deflate', # allow content compress
-    })
-    if request.status_code not in range(200, 300): # http status code 2xx
-        raise RuntimeError('Http request failed')
-    return request.text

 def analysePage(rawHtml: str) -> list: # extract catalog from html content
     analyseRet = []

@@ -51,10 +31,13 @@ def analysePage(rawHtml: str) -> list: # extract catalog from html content
 def fetchCatalog(pageNum: int) -> list: # fetch raw catalog
     catalog = []
     for pageIndex in range(1, pageNum + 1): # traverse all pages (1 ~ pageNum)
-        logger.info('Page: %d' % pageIndex)
-        pageUrl = '%s?sort=1&page=%d' % (basicUrl, pageIndex)
-        catalog.append(analysePage(httpRequest(pageUrl)))
-        time.sleep(1) # avoid being blocked by the server
+        logger.info('Catalog page -> %d' % pageIndex)
+        catalog.append(analysePage(
+            httpRequest(
+                'https://m.wxsy.net/novel/57104/all.html?sort=1&page=%d' % pageIndex,
+                proxy = sys.argv[1]
+            )
+        ))
     return catalog

@@ -70,5 +53,6 @@ def formatCatalog(rawCatalog: list) -> dict:
     return {x[0]: x[1] for x in catalog} # formatted output

+logger.warning('Fetch catalog of `m.wxsy.net`')
 release = formatCatalog(fetchCatalog(18)) # 18 pages in total
 print(json.dumps(release)) # output as JSON format
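
Both `logger` and `httpRequest` now come from a shared `utils` module in the parent directory, which this commit does not show. A minimal sketch of what `utils.httpRequest` might look like, assuming it keeps the fake mobile user-agent and 2xx check of the deleted local copy and adds the `proxy` keyword seen at the call site (the `proxies` handling is an assumption, not code from the repo):

```python
import requests

userAgent = ( # same fake mobile user-agent as the removed local helper
    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/100.0.4896.79 Mobile Safari/537.36'
)

def httpRequest(url: str, proxy: str = '') -> str: # hypothetical shared helper
    request = requests.get(url, timeout = 30, headers = {
        'user-agent': userAgent, # with fake user-agent
        'accept-encoding': 'gzip, deflate', # allow content compress
    }, proxies = {'http': proxy, 'https': proxy} if proxy else None)
    if request.status_code not in range(200, 300): # http status code 2xx
        raise RuntimeError('Http request failed')
    return request.text
```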

src/crawler/m.wxsy.net/crawler.sh (7 lines changed)

@@ -4,7 +4,10 @@ cd `dirname $0`
 mkdir -p ./data/html/
 mkdir -p ./data/json/

-python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+[ -z ${DELAY} ] && DELAY=1
+[ -z ${THREAD} ] && THREAD=1
+
+python3 catalog.py "${PROXY}" > ./data/catalog.json
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ ./data/json
 python3 release.py ./data/catalog.json ./data/json/ > ./data/xxrs.json

src/crawler/m.wxsy.net/extract.py (16 lines changed)

@@ -7,10 +7,12 @@ Extract data from raw html content.
 USAGE: python3 extract.py [CATALOG] [HTML_DIR] [OUTPUT_DIR]
 """
+import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup

@@ -19,7 +21,7 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     script = body.select('script')[5].text # js code with chapter info
     info = {
         'title': body.select('div[class="size18 w100 text-center lh100 pt30 pb15"]')[0].text.strip(),
-        'contents': [x.text.strip() for x in body.select('p[class="content_detail"]')],
+        'content': [x.text.strip() for x in body.select('p[class="content_detail"]')],
         'prePage': body.select('div[class="pt-prechapter"]')[0].a.attrs['href'],
         'nextPage': body.select('div[class="pt-nextchapter"]')[0].a.attrs['href'],
         'preId': re.search(r'window\.__PREVPAGE = "(\d*)"', script)[1],

@@ -35,8 +37,8 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
 def combinePage(id: str) -> dict: # combine sub pages
-    page_1 = splitHtml(open('%s/%s-1.html' % (sys.argv[2], id)).read())
-    page_2 = splitHtml(open('%s/%s-2.html' % (sys.argv[2], id)).read())
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % id)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % id)).read())
     # page info check
     if not page_1['index'] == '[1/2页]' or not page_2['index'] == '[2/2页]':

@@ -66,13 +68,13 @@ def combinePage(id: str) -> dict: # combine sub pages
         'preId': page_1['preId'],
         'myId': page_1['myId'],
         'nextId': page_1['nextId'],
-        'contents': page_1['contents'] + page_2['contents']
+        'content': page_1['content'] + page_2['content']
     }

 logger.warning('Extract info of `m.wxsy.net`')
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for _, chapterId in catalog.items(): # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
-    with open('%s/%s.json' % (sys.argv[3], chapterId), 'w') as fileObj:
+    with open(os.path.join(sys.argv[3], '%s.json' % chapterId), 'w') as fileObj:
         fileObj.write(json.dumps(combinePage(chapterId)))
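
The `preId`/`nextId` fields are scraped from inline JavaScript rather than the DOM. A self-contained illustration of the regex that `splitHtml` relies on (the sample script text and the `__NEXTPAGE` pattern are assumptions for illustration; only the `__PREVPAGE` pattern appears in the diff):

```python
import re

# Made-up sample of the inline script block that carries the chapter ids.
script = 'window.__PREVPAGE = "12345";window.__NEXTPAGE = "12347";'

preId = re.search(r'window\.__PREVPAGE = "(\d*)"', script)[1]
nextId = re.search(r'window\.__NEXTPAGE = "(\d*)"', script)[1]
print(preId, nextId) # -> 12345 12347
```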

src/crawler/m.wxsy.net/fetch.py (63 lines changed)

@@ -4,52 +4,31 @@
 """
 Download raw html content as `.html` files.
-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-basicUrl = 'https://m.wxsy.net/novel/57104'
-userAgent = ( # default user-agent
-    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/100.0.4896.79 Mobile Safari/537.36'
-)
-
-def httpRequest(url: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> %s' % (url, fileName))
-        request = requests.get(url, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read()) # load catalog
+    for _, chapterId in catalog.items(): # traverse all chapters
+        for subPage in [1, 2]: # two sub pages in one chapter
+            yield {
+                'url': 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (chapterId, subPage),
+                'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),
             }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> %s' % url)
-            return False
-        logger.debug('Http request success -> %s' % url)
-        with open(fileName, 'w') as fileObj: # save html content
-            fileObj.write(request.text)
-        logger.debug('File save success -> %s' % fileName)
-    except:
-        return False
-    return True
-
-catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-for _, chapterId in catalog.items(): # traverse all chapters
-    for subPage in [1, 2]: # two sub pages in one chapter
-        pageUrl = '%s/read_%s/%d.html' % (basicUrl, chapterId, subPage)
-        pageFile = '%s/%s-%d.html' % (sys.argv[2], chapterId, subPage)
-        if httpRequest(pageUrl, pageFile): # save html content
-            logger.info('Page request success -> %s' % pageUrl)
-        else:
-            logger.error('Page request failed -> %s' % pageUrl)
-        time.sleep(1) # avoid being blocked by the server
+
+logger.warning('Fetch html of `m.wxsy.net`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
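
`utils.htmlFetch` itself is outside this diff; judging from the call site it consumes the `loadChapter()` generator of `{'url': ..., 'file': ...}` dicts and downloads them with a worker pool. A minimal sketch, assuming `requests` plus `concurrent.futures` (names and structure are guesses; only the signature is taken from the call site):

```python
import time
import requests
from concurrent.futures import ThreadPoolExecutor

def fetchPage(page: dict, proxy: str, delay: float) -> None: # hypothetical worker
    request = requests.get(page['url'], timeout = 30,
        proxies = {'http': proxy, 'https': proxy} if proxy else None)
    with open(page['file'], 'w') as fileObj: # save raw html content
        fileObj.write(request.text)
    time.sleep(delay) # avoid being blocked by the server

def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1) -> None:
    with ThreadPoolExecutor(max_workers = thread) as executor:
        for page in pages: # `pages` is a generator of {'url': ..., 'file': ...}
            executor.submit(fetchPage, page, proxy, delay)
```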

src/crawler/m.wxsy.net/release.py (9 lines changed)

@@ -7,16 +7,18 @@ Combine all chapters from json files.
 USAGE: python3 release.py [CATALOG] [JSON_DIR]
 """
+import os
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger

 def loadData(catalog: dict) -> dict: # load data from json files
     data = {}
     for _, chapterId in catalog.items():
         data[chapterId] = json.loads(
-            open('%s/%s.json' % (sys.argv[2], chapterId)).read() # read json content
+            open(os.path.join(sys.argv[2], '%s.json' % chapterId)).read() # read json content
         )
     return data

@@ -59,8 +61,9 @@ def combine() -> dict: # combine all chapters
     result = {}
     for _, info in data.items(): # combine contents
-        result[info['title']] = info['contents']
+        result[info['title']] = info['content']
     return result

+logger.warning('Release info of `m.wxsy.net`')
 print(json.dumps(combine()))
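
With the rename from `contents` to `content` applied consistently, release.py still emits one JSON object mapping chapter titles to paragraph lists. Roughly, with hypothetical data:

```python
import json

# Hypothetical excerpt of the ./data/xxrs.json layout: title -> paragraphs.
sample = {'Chapter title': ['first paragraph', 'second paragraph']}
print(json.dumps(sample, ensure_ascii = False))
```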

src/crawler/wxsy.net/fetch.py (2 lines changed)

@@ -4,7 +4,7 @@
 """
 Download raw html content as `.html` files.
-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
