From cc815c0fb79e6e110057fe93e13bf73f6ef53c55 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Tue, 18 Oct 2022 12:23:06 +0800 Subject: [PATCH] fix: extract script of `aidusk.com` --- src/crawler/aidusk.com/extract.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/crawler/aidusk.com/extract.py b/src/crawler/aidusk.com/extract.py index 35789df..dcba8e0 100644 --- a/src/crawler/aidusk.com/extract.py +++ b/src/crawler/aidusk.com/extract.py @@ -18,13 +18,12 @@ from bs4 import BeautifulSoup def splitHtml(rawHtml: str) -> dict: # extract from raw html content html = BeautifulSoup(rawHtml, 'lxml') + div = html.select('div[class="book_content"]')[0] title = re.search(r'^(第\d+章)(.*)$', html.select('h1')[0].text) - div = html.select('div[class="book_content"]')[0].prettify().split('\n', 1)[1] - content = [x.strip() for x in div.split('\n
\n
')] - content.pop(-1) # remove last item + [x.decompose() for x in div.select('div')] # remove extraneous items return { 'title': '%s %s' % (title[1], title[2].strip()), - 'content': content + 'content': [x.strip() for x in div.text.split('\n') if x.strip() != ''] }