update: remove legacy code

Branch: master
Author: Dnomd343, 2 years ago
Commit: e1b3fd69fa
9 changed files (all deletions):

1. analyse.py (68 lines)
2. catalog/catalog.json (1 line)
3. catalog/fetch.py (51 lines)
4. catalog/raw.json (1 line)
5. catalog/sort.py (16 lines)
6. combine.py (58 lines)
7. fetch.py (46 lines)
8. logger.py (41 lines)
9. release.py (11 lines)
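Taken together, the deleted files formed a small scraping pipeline for the novel at m.wxsy.net/novel/57104 (judging from the code below): catalog/fetch.py pulls the chapter catalog, catalog/sort.py orders it into catalog.json, fetch.py downloads each chapter's two HTML sub-pages, analyse.py parses them into per-chapter JSON, combine.py merges everything into one map, and release.py renders that map as plain text; logger.py is the shared logging setup. Several stages print JSON to stdout, so raw.json and xxrs.json were presumably captured by shell redirection.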

analyse.py (68 lines deleted)

@@ -1,68 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import json
from logger import logger
from bs4 import BeautifulSoup


def splitPage(rawHtml: str) -> dict:
    body = BeautifulSoup(rawHtml, 'lxml').body
    script = body.select('script')[5].text
    info = {
        'title': body.select('div[class="size18 w100 text-center lh100 pt30 pb15"]')[0].text.strip(),
        'contents': [x.text.strip() for x in body.select('p[class="content_detail"]')],
        'prePage': body.select('div[class="pt-prechapter"]')[0].a.attrs['href'],
        'nextPage': body.select('div[class="pt-nextchapter"]')[0].a.attrs['href'],
        'preId': re.search(r'window\.__PREVPAGE = "(\d*)"', script)[1],
        'nextId': re.search(r'window\.__NEXTPAGE = "(\d*)"', script)[1],
        'myId': re.search(r'window\.chapterNum = (\d+)', script)[1],
        'caption': re.search(r'window\.chapterName = \'(.+)\'', script)[1],
    }
    if not info['title'].startswith(info['caption']):
        logger.error('Title error -> %s' % info['caption'])
    info['index'] = info['title'].replace(info['caption'], '')
    info.pop('title')
    return info


def combinePage(id: str) -> dict:
    page_1 = splitPage(open('./html/%s-1.html' % id).read())
    page_2 = splitPage(open('./html/%s-2.html' % id).read())
    if not page_1['index'] == '[1/2页]' or not page_2['index'] == '[2/2页]':
        logger.error('Sub page error -> `%s` <-> `%s`' % (page_1['index'], page_2['index']))
    if not page_1['caption'] == page_2['caption']:
        logger.error('Caption error -> `%s` <-> `%s`' % (page_1['caption'], page_2['caption']))
    if not page_1['myId'] == page_2['myId']:
        logger.error('Page ID error -> `%s` <-> `%s`' % (page_1['myId'], page_2['myId']))
    if not page_1['preId'] == page_2['preId']:
        logger.error('Pre page ID error -> `%s` <-> `%s`' % (page_1['preId'], page_2['preId']))
    if not page_1['nextId'] == page_2['nextId']:
        logger.error('Next page ID error -> `%s` <-> `%s`' % (page_1['nextId'], page_2['nextId']))
    if not page_1['prePage'] == '/novel/57104/read_%s.html' % page_1['preId']:
        logger.warning('Page-1 pre url -> `%s` (ID = %s)' % (page_1['prePage'], id))
    if not page_1['nextPage'] == '/novel/57104/read_%s/2.html' % page_1['myId']:
        logger.warning('Page-1 next url -> `%s` (ID = %s)' % (page_1['nextPage'], id))
    if not page_2['prePage'] == '/novel/57104/read_%s.html' % page_2['myId']:
        logger.warning('Page-2 pre url -> `%s` (ID = %s)' % (page_2['prePage'], id))
    if not page_2['nextPage'] == '/novel/57104/read_%s.html' % page_2['nextId']:
        logger.warning('Page-2 next url -> `%s` (ID = %s)' % (page_2['nextPage'], id))
    return {
        'title': page_1['caption'],
        'preId': page_1['preId'],
        'myId': page_1['myId'],
        'nextId': page_1['nextId'],
        'contents': page_1['contents'] + page_2['contents']
    }


catalog = json.loads(open('./catalog/catalog.json').read())
for _, pageId in catalog.items():
    logger.info('Analyse page `%s`' % pageId)
    with open('./json/%s.json' % pageId, 'w') as fileObj:
        fileObj.write(json.dumps(combinePage(pageId)))
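Each chapter on the source site is split across two sub-pages, tagged [1/2页] and [2/2页] in the page title; splitPage parses one sub-page and combinePage stitches the pair together after cross-checking their IDs and navigation links. The record written to ./json/<id>.json has this shape (the values here are illustrative placeholders, not real site data):

{
    "title": "第1章 ...",
    "preId": "1234",
    "myId": "1235",
    "nextId": "1236",
    "contents": ["first paragraph", "second paragraph", "..."]
}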

catalog/catalog.json (1 line deleted)

File diff suppressed because one or more lines are too long

catalog/fetch.py (51 lines deleted)

@@ -1,51 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import json
import time
import requests
from bs4 import BeautifulSoup

basicUrl = 'https://m.wxsy.net/novel/57104/all.html'

userAgent = (  # default user-agent
    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/100.0.4896.79 Mobile Safari/537.36'
)


def httpRequest(url: str) -> str:
    request = requests.get(url, headers = {
        'user-agent': userAgent,  # with fake user-agent
        'accept-encoding': 'gzip, deflate',  # allow content compress
    })
    if request.status_code not in range(200, 300):  # http status code 2xx
        raise RuntimeError('Http request failed')
    return request.text


def analysePage(rawHtml: str) -> list:
    analyseRet = []
    soup = BeautifulSoup(rawHtml, 'lxml')
    div = soup.select('div[class="border-b"]')[0]
    for row in div.select('a[class="w100 flex-wrp flex-align-center flex-between pt10 pb10"]'):
        analyseRet.append({
            'name': row.attrs['title'],
            'url': row.attrs['href'],
        })
    return analyseRet


def fetchCatalog(pageNum: int) -> list:
    catalog = []
    for pageIndex in range(1, pageNum + 1):
        print('Page: %d' % pageIndex, file = sys.stderr)
        pageUrl = '%s?sort=1&page=%d' % (basicUrl, pageIndex)
        catalog.append(analysePage(httpRequest(pageUrl)))
        time.sleep(3)
    return catalog


print(json.dumps(fetchCatalog(18)))
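fetchCatalog appends one list per catalog page rather than extending a flat list, so the printed JSON (presumably captured as raw.json) is a list of lists that catalog/sort.py later flattens. Schematically, with placeholder values:

[
    [{"name": "第1章 ...", "url": "/novel/57104/read_1234.html"}, ...],
    ...
]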

catalog/raw.json (1 line deleted)

File diff suppressed because one or more lines are too long

catalog/sort.py (16 lines deleted)

@@ -1,16 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
import json

catalog = {}
for catalogPage in json.loads(open('raw.json').read()):
    for pageInfo in catalogPage:
        pageId = re.search(r'^/novel/57104/read_(\d+)\.html$', pageInfo['url'])[1]
        catalog[pageInfo['name']] = pageId

catalog = sorted(catalog.items(), key = lambda d: int(re.search(r'^第(\d+)章', d[0])[1]))
catalog = {x[0]: x[1] for x in catalog}

print(json.dumps(catalog))
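The ordering key is the chapter number embedded at the start of each title (第N章, "Chapter N"), so the output is sorted by story order regardless of how the catalog pages listed the entries. A quick illustration of the key with a made-up title:

>>> int(re.search(r'^第(\d+)章', '第12章 夜色')[1])
12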

combine.py (58 lines deleted)

@@ -1,58 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
from logger import logger


def loadData(catalog: dict) -> dict:
    data = {}
    for _, pageId in catalog.items():
        data[pageId] = json.loads(
            open('./json/%s.json' % pageId).read()
        )
    return data


def listDiff(list_1: list, list_2: list) -> bool:
    diffFlag = False
    if len(list_1) != len(list_2):
        diffFlag = True
        logger.error('List with different length')
    for i in range(0, len(list_1)):
        if list_1[i] != list_2[i]:
            diffFlag = True
            logger.error('List diff: `%s` <-> `%s`' % (list_1[i], list_2[i]))
    return diffFlag


def check(catalog: dict, data: dict) -> None:
    titles = [x['title'] for _, x in data.items()]
    ids = [x['myId'] for _, x in data.items()]
    preIds = [x['preId'] for _, x in data.items()]
    nextIds = [x['nextId'] for _, x in data.items()]
    nextIds.pop(-1)
    preIds.pop(0)
    # if listDiff(ids, preIds + [ids[-1]]):
    #     logger.warning('Pre IDs mismatch')
    # if listDiff(ids, [ids[0]] + nextIds):
    #     logger.warning('Next IDs mismatch')
    if listDiff(ids, [x for _, x in catalog.items()]):
        logger.warning('IDs mismatch')
    if listDiff(titles, [x for x in catalog]):
        logger.warning('Titles mismatch')


def combine() -> dict:
    catalog = json.loads(open('./catalog/catalog.json').read())
    data = loadData(catalog)
    check(catalog, data)
    result = {}
    for _, info in data.items():
        result[info['title']] = info['contents']
    return result


print(json.dumps(combine()))
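The two commented-out checks would have validated the doubly-linked chapter chain: with three chapters whose IDs are A, B, C, ids is [A, B, C] and preIds (after dropping its first entry) should be [A, B], so listDiff(ids, preIds + [ids[-1]]) compares [A, B, C] against [A, B, C] and flags any broken link. Only the positional comparison against catalog.json stayed active, which quietly relies on dicts preserving insertion order (guaranteed since Python 3.7).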

fetch.py (46 lines deleted)

@@ -1,46 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time
import json
import requests
from logger import logger

userAgent = (  # default user-agent
    'Mozilla/5.0 (Linux; Android 10; moto g(7) play) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/100.0.4896.79 Mobile Safari/537.36'
)


def httpRequest(url: str, fileName: str) -> bool:
    try:
        logger.debug('Http request `%s` -> %s' % (url, fileName))
        request = requests.get(url, timeout = 30,
            headers = {
                'user-agent': userAgent,  # with fake user-agent
            }
        )
        if request.status_code not in range(200, 300):  # http status code 2xx
            logger.warning('Http request failed -> %s' % url)
            return False
        logger.debug('Http request success -> %s' % url)
        with open(fileName, 'w') as fileObj:  # save html content
            fileObj.write(request.text)
        logger.debug('File save success -> %s' % fileName)
    except:
        return False
    return True


catalog = json.loads(open('./catalog/catalog.json').read())
for _, pageId in catalog.items():
    for subPage in [1, 2]:
        pageUrl = 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (pageId, subPage)
        pageFile = './html/%s-%d.html' % (pageId, subPage)
        if httpRequest(pageUrl, pageFile):
            logger.info('Page request success -> %s' % pageUrl)
        else:
            logger.error('Page request failed -> %s' % pageUrl)
        time.sleep(1)
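httpRequest deliberately swallows every exception (the bare except) and signals failure through its bool return, so one bad page is logged and skipped instead of killing the whole crawl; failed pages would need a manual re-run. If automatic recovery were wanted, a small retry wrapper would do (a sketch only, not part of this codebase):

def fetchWithRetry(url: str, fileName: str, retry: int = 3) -> bool:
    for _ in range(retry):  # hypothetical helper, reusing httpRequest above
        if httpRequest(url, fileName):
            return True
        time.sleep(5)  # back off before the next attempt
    return False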

logger.py (41 lines deleted)

@@ -1,41 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import logging
import colorlog

logColor = {  # log color
    'DEBUG': 'white',
    'INFO': 'green',
    'WARNING': 'yellow',
    'ERROR': 'red',
    'CRITICAL': 'bold_red',
}

dateFormat = '%Y-%m-%d %H:%M:%S'
timeFormat = '%(asctime)s.%(msecs)03d'
logFormat = '[%(levelname)s] %(message)s (%(module)s.%(funcName)s:%(lineno)d)'

# load fileHandler -> log file
fileHandler = logging.FileHandler('runtime.log', encoding = 'utf-8')
fileHandler.setFormatter(logging.Formatter(
    '[' + timeFormat + '] ' + logFormat,
    datefmt = dateFormat
))
fileHandler.setLevel(logging.DEBUG)  # debug level for log file

# load stdHandler -> stderr
stdHandler = colorlog.StreamHandler()
stdHandler.setFormatter(colorlog.ColoredFormatter(
    '%(light_black)s' + timeFormat + '%(log_color)s ' + logFormat,
    datefmt = dateFormat,
    log_colors = logColor,
    stream = sys.stderr
))
stdHandler.setLevel(logging.INFO)  # info level for stderr

logger = logging.getLogger()
logger.addHandler(stdHandler)
logger.addHandler(fileHandler)
logger.setLevel(logging.DEBUG)  # pass everything through; each handler filters by its own level
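All the other scripts simply import this module's root logger, so DEBUG messages go only to runtime.log while INFO and above also reach the colored stderr stream:

from logger import logger

logger.debug('written to runtime.log only')
logger.info('written to stderr and runtime.log')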

release.py (11 lines deleted)

@@ -1,11 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json

strData = ''
data = json.loads(open('xxrs.json').read())
for title, content in data.items():
    strData += '%s\n\n' % title
    strData += '%s\n\n\n' % '\n\n'.join(content)

print(strData.strip())
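This turns the title-to-paragraphs map from xxrs.json into a plain-text book: each title, a blank line, the chapter's paragraphs separated by blank lines, then two blank lines before the next chapter. Schematically:

第1章 ...

first paragraph

second paragraph


第2章 ...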