|
@ -15,6 +15,12 @@ from logger import logger |
|
|
from bs4 import BeautifulSoup |
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clearContent(raw: str) -> str:  # remove popularize
    """Return only the text that precedes the first newline in *raw*.

    Anything from the first newline onward is discarded. A string without
    a newline is returned unchanged.

    Args:
        raw: A single piece of content text.

    Returns:
        The part of ``raw`` before its first newline; an empty string when
        ``raw`` is empty or begins with a newline.
    """
    # str.partition always yields a 3-tuple, so this cannot fail the way the
    # earlier ``re.search(r'^(.+?)\n', raw)[1]`` did: that pattern needs at
    # least one non-newline character at the start, so input beginning with
    # a newline made ``re.search`` return None and the subscript raise
    # TypeError.
    return raw.partition('\n')[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def splitHtml(rawHtml: str) -> dict: # extract from raw html content |
|
|
def splitHtml(rawHtml: str) -> dict: # extract from raw html content |
|
|
html = BeautifulSoup(rawHtml, 'lxml') |
|
|
html = BeautifulSoup(rawHtml, 'lxml') |
|
|
script = html.select('script')[9].text # js code with chapter info |
|
|
script = html.select('script')[9].text # js code with chapter info |
|
@ -27,6 +33,7 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content |
|
|
} |
|
|
} |
|
|
if info['title'] != re.search(r'window\.chapterName = \'(.+)\'', script)[1]: # chapter title check |
|
|
if info['title'] != re.search(r'window\.chapterName = \'(.+)\'', script)[1]: # chapter title check |
|
|
logger.error('Title error -> %s' % info['title']) |
|
|
logger.error('Title error -> %s' % info['title']) |
|
|
|
|
|
info['content'] = [clearContent(x) for x in info['content']] |
|
|
return info |
|
|
return info |
|
|
|
|
|
|
|
|
|
|
|
|
|
|