update: crawler script

master
Dnomd343 · 2 years ago
commit f9f1ebda41
6 changed files with 13 additions and 13 deletions:

  src/crawler/m.wxsy.net/check.sh    (8)
  src/crawler/m.wxsy.net/crawler.sh  (6)
  src/crawler/m.wxsy.net/extract.py  (6)
  src/crawler/m.wxsy.net/fetch.py    (2)
  src/crawler/m.wxsy.net/release.py  (2)
  src/crawler/wxsy.net/release.py    (2)

src/crawler/m.wxsy.net/check.sh  (+4 -4)

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
-diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(cat ./data/catalog.json | jq .)
-diff <(cd ./data/json/ && sha1sum * | sort -u) <(cat ./archive/json.sha1sum | sort -u)
-diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(cat ./data/xxrs.json | jq .)
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(cd ./data/json/ && sha1sum -- * | sort -u) <(sort -u ./archive/json.sha1sum)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
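
Review note: these changes fix two shell-quoting hazards and drop needless 'cat' pipelines ('jq . FILE' and 'sort -u FILE' read the file directly, with identical output). An unquoted 'dirname $0' word-splits on paths containing spaces, and the bare '*' lets a filename starting with '-' be read as an option; '--' ends option parsing. A minimal sketch of both failure modes, using a hypothetical path and filename for illustration only:

  #!/usr/bin/env bash
  # Suppose the script lives under '/tmp/my dir/' (hypothetical path).
  cd `dirname $0`          # expands unquoted to: cd /tmp/my dir  -> "too many arguments"
  cd "$(dirname "$0")"     # quoted command substitution survives the space

  # Suppose the directory contains a file named '-r' (hypothetical name).
  sha1sum *                # '-r' is parsed as an option and rejected, not hashed
  sha1sum -- *             # '--' ends option parsing, so '-r' is treated as a filename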

src/crawler/m.wxsy.net/crawler.sh  (+3 -3)

@@ -1,11 +1,11 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
 mkdir -p ./data/html/
 mkdir -p ./data/json/
-[ -z ${DELAY} ] && DELAY=1
-[ -z ${THREAD} ] && THREAD=1
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
 python3 catalog.py "${PROXY}" > ./data/catalog.json
 python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
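
Review note: quoting the test operand is the real fix here. With DELAY unset, '[ -z ${DELAY} ]' collapses to '[ -z ]', a one-argument test that is true only by accident (it checks that its sole argument, the literal string '-z', is non-empty); with a value containing whitespace it errors out. A small sketch of the three cases:

  DELAY='1 2'                      # hypothetical value containing a space
  [ -z ${DELAY} ] && DELAY=1       # error: [: too many arguments
  unset DELAY
  [ -z ${DELAY} ] && DELAY=1       # works, but only via the [ -z ] accident
  [ -z "${DELAY}" ] && DELAY=1     # quoted: always exactly one operand, as intended

An equivalent one-liner would be DELAY="${DELAY:-1}", which sidesteps the test entirely; note also that the unquoted ${THREAD} ${DELAY} on the final line would still split, though the defaults assigned here are single words.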

src/crawler/m.wxsy.net/extract.py  (+3 -3)

@@ -36,9 +36,9 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     return info


-def combinePage(id: str) -> dict:  # combine sub pages
-    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % id)).read())
-    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % id)).read())
+def combinePage(pageId: str) -> dict:  # combine sub pages
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % pageId)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % pageId)).read())

     # page info check
     if not page_1['index'] == '[1/2页]' or not page_2['index'] == '[2/2页]':
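
Review note: renaming the parameter from 'id' to 'pageId' stops it shadowing Python's built-in id(). Behavior is unchanged, but the shadowed name trips linters and would silently mask any later call to the builtin inside the function.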

src/crawler/m.wxsy.net/fetch.py  (+1 -1)

@@ -18,7 +18,7 @@ from utils import htmlFetch
 def loadChapter():
     catalog = json.loads(open(sys.argv[1]).read())  # load catalog
     for _, chapterId in catalog.items():  # traverse all chapters
-        for subPage in [1, 2]:  # two sub pages in one chapter
+        for subPage in [1, 2]:  # two sub-pages in one chapter
             yield {
                 'url': 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (chapterId, subPage),
                 'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),

src/crawler/m.wxsy.net/release.py  (+1 -1)

@@ -28,7 +28,7 @@ def listDiff(list_1: list, list_2: list) -> bool: # compare two lists
     if len(list_1) != len(list_2):  # with different length
         diffFlag = True
         logger.error('List with different length')
-    for i in range(0, len(list_1)):  # check every items
+    for i in range(0, len(list_1)):  # check every item
         if list_1[i] == list_2[i]:
             continue
         diffFlag = True  # found different item

src/crawler/wxsy.net/release.py  (+1 -1)

@@ -28,7 +28,7 @@ def listDiff(list_1: list, list_2: list) -> bool: # compare two lists
     if len(list_1) != len(list_2):  # with different length
         diffFlag = True
         logger.error('List with different length')
-    for i in range(0, len(list_1)):  # check every items
+    for i in range(0, len(list_1)):  # check every item
         if list_1[i] == list_2[i]:
             continue
         diffFlag = True  # found different item
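
Review note: this is the same one-word comment fix as in the m.wxsy.net copy above; the two crawler variants evidently carry identical listDiff helpers, so the change lands in both.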
