From f9f1ebda416c07b4cae7a54e5e8b34628110fd1f Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Sun, 16 Oct 2022 22:03:30 +0800 Subject: [PATCH] update: crawler script --- src/crawler/m.wxsy.net/check.sh | 8 ++++---- src/crawler/m.wxsy.net/crawler.sh | 6 +++--- src/crawler/m.wxsy.net/extract.py | 6 +++--- src/crawler/m.wxsy.net/fetch.py | 2 +- src/crawler/m.wxsy.net/release.py | 2 +- src/crawler/wxsy.net/release.py | 2 +- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/crawler/m.wxsy.net/check.sh b/src/crawler/m.wxsy.net/check.sh index d25a5a5..3126d26 100755 --- a/src/crawler/m.wxsy.net/check.sh +++ b/src/crawler/m.wxsy.net/check.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash -cd `dirname $0` +cd "$(dirname "$0")" -diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(cat ./data/catalog.json | jq .) -diff <(cd ./data/json/ && sha1sum * | sort -u) <(cat ./archive/json.sha1sum | sort -u) -diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(cat ./data/xxrs.json | jq .) +diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json) +diff <(cd ./data/json/ && sha1sum -- * | sort -u) <(sort -u ./archive/json.sha1sum) +diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json) diff --git a/src/crawler/m.wxsy.net/crawler.sh b/src/crawler/m.wxsy.net/crawler.sh index a7710b1..a21e566 100755 --- a/src/crawler/m.wxsy.net/crawler.sh +++ b/src/crawler/m.wxsy.net/crawler.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -cd `dirname $0` +cd "$(dirname "$0")" mkdir -p ./data/html/ mkdir -p ./data/json/ -[ -z ${DELAY} ] && DELAY=1 -[ -z ${THREAD} ] && THREAD=1 +[ -z "${DELAY}" ] && DELAY=1 +[ -z "${THREAD}" ] && THREAD=1 python3 catalog.py "${PROXY}" > ./data/catalog.json python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY} diff --git a/src/crawler/m.wxsy.net/extract.py b/src/crawler/m.wxsy.net/extract.py index 86d7a76..9917a00 100644 --- a/src/crawler/m.wxsy.net/extract.py +++ b/src/crawler/m.wxsy.net/extract.py @@ -36,9 +36,9 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content return info -def combinePage(id: str) -> dict: # combine sub pages - page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % id)).read()) - page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % id)).read()) +def combinePage(pageId: str) -> dict: # combine sub pages + page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % pageId)).read()) + page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % pageId)).read()) # page info check if not page_1['index'] == '[1/2页]' or not page_2['index'] == '[2/2页]': diff --git a/src/crawler/m.wxsy.net/fetch.py b/src/crawler/m.wxsy.net/fetch.py index 0d59507..0ca815a 100644 --- a/src/crawler/m.wxsy.net/fetch.py +++ b/src/crawler/m.wxsy.net/fetch.py @@ -18,7 +18,7 @@ from utils import htmlFetch def loadChapter(): catalog = json.loads(open(sys.argv[1]).read()) # load catalog for _, chapterId in catalog.items(): # traverse all chapters - for subPage in [1, 2]: # two sub pages in one chapter + for subPage in [1, 2]: # two sub-pages in one chapter yield { 'url': 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (chapterId, subPage), 'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)), diff --git a/src/crawler/m.wxsy.net/release.py b/src/crawler/m.wxsy.net/release.py index 4224667..a01b291 100644 --- a/src/crawler/m.wxsy.net/release.py +++ b/src/crawler/m.wxsy.net/release.py @@ -28,7 +28,7 @@ def listDiff(list_1: list, list_2: list) -> bool: # compare two lists if len(list_1) != len(list_2): # with different length diffFlag = True logger.error('List with different length') - for i in range(0, len(list_1)): # check every items + for i in range(0, len(list_1)): # check every item if list_1[i] == list_2[i]: continue diffFlag = True # found different item diff --git a/src/crawler/wxsy.net/release.py b/src/crawler/wxsy.net/release.py index 100ec7a..d16967e 100644 --- a/src/crawler/wxsy.net/release.py +++ b/src/crawler/wxsy.net/release.py @@ -28,7 +28,7 @@ def listDiff(list_1: list, list_2: list) -> bool: # compare two lists if len(list_1) != len(list_2): # with different length diffFlag = True logger.error('List with different length') - for i in range(0, len(list_1)): # check every items + for i in range(0, len(list_1)): # check every item if list_1[i] == list_2[i]: continue diffFlag = True # found different item