Browse Source

fix: extract script of `aidusk.com`

master
Dnomd343 2 years ago
parent
commit
cc815c0fb7
  1. 7
      src/crawler/aidusk.com/extract.py

7
src/crawler/aidusk.com/extract.py

@@ -18,13 +18,12 @@ from bs4 import BeautifulSoup
 def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     html = BeautifulSoup(rawHtml, 'lxml')
+    div = html.select('div[class="book_content"]')[0]
     title = re.search(r'^(第\d+章)(.*)$', html.select('h1')[0].text)
-    div = html.select('div[class="book_content"]')[0].prettify().split('\n', 1)[1]
-    content = [x.strip() for x in div.split('\n <br/>\n <br/>')]
-    content.pop(-1) # remove last item
+    [x.decompose() for x in div.select('div')] # remove extraneous items
     return {
         'title': '%s %s' % (title[1], title[2].strip()),
-        'content': content
+        'content': [x.strip() for x in div.text.split('\n') if x.strip() != '']
     }

Loading…
Cancel
Save