
update: enhance crawler of `aidusk.com`

Dnomd343 committed 2 years ago (commit 8d2f060188, branch master)
4 changed files:

  src/crawler/aidusk.com/check.sh    (6 changes)
  src/crawler/aidusk.com/crawler.sh  (7 changes)
  src/crawler/aidusk.com/extract.py  (6 changes)
  src/crawler/aidusk.com/fetch.py    (62 changes)

src/crawler/aidusk.com/check.sh (6 changes)

@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
-diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(cat ./data/catalog.json | jq .)
-diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(cat ./data/xxrs.json | jq .)
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
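Both edits are hardening rather than behavior changes: the quoted `cd "$(dirname "$0")"` survives script paths containing spaces, where the old backtick form word-splits, and `jq . FILE` reads the file directly instead of piping it through a needless `cat`. A minimal sketch of the failure the quoting fix avoids (the directory name is hypothetical):

    # with the script at '/tmp/my repo/check.sh' (hypothetical path):
    cd `dirname $0`        # expands unquoted: cd /tmp/my repo  -> "cd: too many arguments"
    cd "$(dirname "$0")"   # expands to a single word: cd '/tmp/my repo'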

src/crawler/aidusk.com/crawler.sh (7 changes)

@@ -1,8 +1,11 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
 mkdir -p ./data/html/
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
 python3 catalog.py > ./data/catalog.json
-python3 fetch.py ./data/catalog.json ./data/html/
+python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
 python3 extract.py ./data/catalog.json ./data/html/ > ./data/xxrs.json
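The defaults keep a plain `./crawler.sh` behaving as before (one worker, 1s delay, no proxy); the new knobs are read from the environment. Note that `"${PROXY}"` is quoted, so an unset proxy still reaches fetch.py as an empty `sys.argv[3]` instead of shifting the thread and delay arguments. A possible invocation, with an assumed local SOCKS5 proxy:

    # hypothetical example: 4 workers, 0.5s delay, local proxy
    PROXY='socks5://127.0.0.1:1080' THREAD=4 DELAY=0.5 ./crawler.sh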

src/crawler/aidusk.com/extract.py (6 changes)

@@ -11,7 +11,8 @@
 import os
 import re
 import sys
 import json
-from logger import logger
+sys.path.append('..')
+from utils import logger
 from bs4 import BeautifulSoup

@@ -28,8 +29,8 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
 result = {}
 logger.warning('Extract info of `aidusk.com`')
 catalog = json.loads(open(sys.argv[1]).read()) # load catalog
 for chapterName, chapterId in catalog.items(): # traverse all chapters
     logger.info('Analyse chapter `%s`' % chapterId)
     htmlFile = os.path.join(sys.argv[2], '%s.html' % chapterId)

@@ -37,5 +38,4 @@
     if chapterName != info['title']:
         logger.error('Title error -> %s' % info['title'])
     result[chapterName] = info['content']
 print(json.dumps(result))
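The per-crawler `logger` module is replaced by a shared one imported from the parent `src/crawler/` directory, reachable via `sys.path.append('..')` because crawler.sh cds into the script's own directory first. The `utils` module itself is not part of this diff; a minimal sketch of a compatible `logger` export, with the name and format being assumptions:

    # utils.py (sketch only; the real shared module is not shown in this commit)
    import logging

    logger = logging.getLogger('crawler')  # assumed logger name
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s %(message)s'))
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)  # the scripts log from debug up to error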

src/crawler/aidusk.com/fetch.py (62 changes)

@@ -4,50 +4,30 @@
 """
 Download raw html content as `.html` files.
-USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR]
+USAGE: python3 fetch.py [CATALOG] [OUTPUT_DIR] [PROXY] [THREAD] [DELAY]
 """
 import os
 import sys
 import json
-import time
-import requests
-from logger import logger
-
-userAgent = ( # default user agent
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
-    'AppleWebKit/537.36 (KHTML, like Gecko) '
-    'Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47'
-)
-
-def httpRequest(fileUrl: str, fileName: str) -> bool: # save html content
-    try:
-        logger.debug('Http request `%s` -> `%s`' % (fileUrl, fileName))
-        request = requests.get(fileUrl, timeout = 30, # timeout -> 30s
-            headers = {
-                'user-agent': userAgent, # with fake user-agent
-            }
-        )
-        if request.status_code not in range(200, 300): # http status code 2xx
-            logger.warning('Http request failed -> `%s`' % fileUrl)
-            return False
-        logger.debug('Http request success -> `%s`' % fileUrl)
-        with open(fileName, 'wb') as fileObj: # save html content
-            fileObj.write(request.content)
-        logger.debug('File save success -> `%s`' % fileName)
-    except:
-        return False
-    return True
-
-catalog = json.loads(open(sys.argv[1]).read()) # load catalog
-for _, chapterId in catalog.items(): # traverse all chapters
-    pageUrl = 'http://www.aidusk.com/t/134659/%s.html' % chapterId
-    pageFile = os.path.join(sys.argv[2], '%s.html' % chapterId)
-    if httpRequest(pageUrl, pageFile): # save html content
-        logger.info('Page request success -> %s' % pageUrl)
-    else:
-        logger.error('Page request failed -> %s' % pageUrl)
-    time.sleep(1) # avoid being blocked by the server
+sys.path.append('..')
+from utils import logger
+from utils import htmlFetch
+
+def loadChapter():
+    catalog = json.loads(open(sys.argv[1]).read()) # load catalog
+    for _, chapterId in catalog.items(): # traverse all chapters
+        yield {
+            'url': 'http://www.aidusk.com/t/134659/%s.html' % chapterId,
+            'file': os.path.join(sys.argv[2], '%s.html' % chapterId),
+        }
+
+logger.warning('Fetch html of `aidusk.com`')
+htmlFetch(
+    loadChapter(),
+    proxy = sys.argv[3],
+    thread = int(sys.argv[4]),
+    delay = float(sys.argv[5]),
+)
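The rewrite turns fetch.py into a thin job producer: `loadChapter()` yields `{'url', 'file'}` pairs and the shared `htmlFetch` does the downloading, replacing the old single-threaded loop with its hard-coded user agent and fixed `time.sleep(1)`. `htmlFetch` is not included in this diff; a sketch of an implementation matching the call site (thread pool, optional proxy, per-request delay), with every internal detail assumed:

    # Sketch of a compatible htmlFetch; the real utils implementation may differ.
    import time
    import logging
    import requests
    from concurrent.futures import ThreadPoolExecutor

    logger = logging.getLogger('crawler')  # assumed shared logger

    def htmlFetch(pages, proxy: str = '', thread: int = 1, delay: float = 1):
        proxies = {'http': proxy, 'https': proxy} if proxy else None

        def fetch(page):  # download one job -> {'url': ..., 'file': ...}
            try:
                rsp = requests.get(page['url'], timeout = 30, proxies = proxies)
                if rsp.status_code not in range(200, 300):  # expect http 2xx
                    logger.warning('Http request failed -> `%s`' % page['url'])
                    return
                with open(page['file'], 'wb') as fileObj:  # save html content
                    fileObj.write(rsp.content)
                logger.info('Page request success -> %s' % page['url'])
            except requests.RequestException:
                logger.error('Page request failed -> %s' % page['url'])
            time.sleep(delay)  # per-worker spacing, to avoid being blocked

        with ThreadPoolExecutor(max_workers = thread) as executor:
            list(executor.map(fetch, pages))  # drain to wait for completion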
