update: crawler script

master
Dnomd343 · 2 years ago
commit f9f1ebda41
6 changed files with 13 additions and 13 deletions:

  src/crawler/m.wxsy.net/check.sh    (8)
  src/crawler/m.wxsy.net/crawler.sh  (6)
  src/crawler/m.wxsy.net/extract.py  (6)
  src/crawler/m.wxsy.net/fetch.py    (2)
  src/crawler/m.wxsy.net/release.py  (2)
  src/crawler/wxsy.net/release.py    (2)

src/crawler/m.wxsy.net/check.sh  (+4 -4)

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
-diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(cat ./data/catalog.json | jq .)
-diff <(cd ./data/json/ && sha1sum * | sort -u) <(cat ./archive/json.sha1sum | sort -u)
-diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(cat ./data/xxrs.json | jq .)
+diff <(xz -cdk ./archive/catalog.json.xz | jq .) <(jq . ./data/catalog.json)
+diff <(cd ./data/json/ && sha1sum -- * | sort -u) <(sort -u ./archive/json.sha1sum)
+diff <(xz -cdk ./archive/xxrs.json.xz | jq .) <(jq . ./data/xxrs.json)
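
Review note: these changes fix two shell-quoting hazards and drop needless 'cat' pipelines ('jq . FILE' and 'sort -u FILE' read the file directly, with identical output). An unquoted 'dirname $0' word-splits on paths containing spaces, and the bare '*' lets a filename starting with '-' be read as an option; '--' ends option parsing. A minimal sketch of both failure modes, using a hypothetical path and filename for illustration only:

  #!/usr/bin/env bash
  # Suppose the script lives under '/tmp/my dir/' (hypothetical path).
  cd `dirname $0`          # expands unquoted to: cd /tmp/my dir  -> "too many arguments"
  cd "$(dirname "$0")"     # quoted command substitution survives the space

  # Suppose the directory contains a file named '-r' (hypothetical name).
  sha1sum *                # '-r' is parsed as an option and rejected, not hashed
  sha1sum -- *             # '--' ends option parsing, so '-r' is treated as a filename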

src/crawler/m.wxsy.net/crawler.sh  (+3 -3)

@@ -1,11 +1,11 @@
 #!/usr/bin/env bash
-cd `dirname $0`
+cd "$(dirname "$0")"
 mkdir -p ./data/html/
 mkdir -p ./data/json/
-[ -z ${DELAY} ] && DELAY=1
-[ -z ${THREAD} ] && THREAD=1
+[ -z "${DELAY}" ] && DELAY=1
+[ -z "${THREAD}" ] && THREAD=1
 python3 catalog.py "${PROXY}" > ./data/catalog.json
 python3 fetch.py ./data/catalog.json ./data/html/ "${PROXY}" ${THREAD} ${DELAY}
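
Review note: quoting the test operand is the real fix here. With DELAY unset, '[ -z ${DELAY} ]' collapses to '[ -z ]', a one-argument test that is true only by accident (it checks that its sole argument, the literal string '-z', is non-empty); with a value containing whitespace it errors out. A small sketch of the three cases:

  DELAY='1 2'                      # hypothetical value containing a space
  [ -z ${DELAY} ] && DELAY=1       # error: [: too many arguments
  unset DELAY
  [ -z ${DELAY} ] && DELAY=1       # works, but only via the [ -z ] accident
  [ -z "${DELAY}" ] && DELAY=1     # quoted: always exactly one operand, as intended

An equivalent one-liner would be DELAY="${DELAY:-1}", which sidesteps the test entirely; note also that the unquoted ${THREAD} ${DELAY} on the final line would still split, though the defaults assigned here are single words.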

src/crawler/m.wxsy.net/extract.py  (+3 -3)

@@ -36,9 +36,9 @@ def splitHtml(rawHtml: str) -> dict: # extract from raw html content
     return info


-def combinePage(id: str) -> dict:  # combine sub pages
-    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % id)).read())
-    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % id)).read())
+def combinePage(pageId: str) -> dict:  # combine sub pages
+    page_1 = splitHtml(open(os.path.join(sys.argv[2], '%s-1.html' % pageId)).read())
+    page_2 = splitHtml(open(os.path.join(sys.argv[2], '%s-2.html' % pageId)).read())

     # page info check
     if not page_1['index'] == '[1/2页]' or not page_2['index'] == '[2/2页]':
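
Review note: renaming the parameter from 'id' to 'pageId' stops it shadowing Python's built-in id(). Behavior is unchanged, but the shadowed name trips linters and would silently mask any later call to the builtin inside the function.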

src/crawler/m.wxsy.net/fetch.py  (+1 -1)

@@ -18,7 +18,7 @@ from utils import htmlFetch
 def loadChapter():
     catalog = json.loads(open(sys.argv[1]).read())  # load catalog
     for _, chapterId in catalog.items():  # traverse all chapters
-        for subPage in [1, 2]:  # two sub pages in one chapter
+        for subPage in [1, 2]:  # two sub-pages in one chapter
             yield {
                 'url': 'https://m.wxsy.net/novel/57104/read_%s/%d.html' % (chapterId, subPage),
                 'file': os.path.join(sys.argv[2], '%s-%d.html' % (chapterId, subPage)),

src/crawler/m.wxsy.net/release.py  (+1 -1)

@@ -28,7 +28,7 @@ def listDiff(list_1: list, list_2: list) -> bool: # compare two lists
     if len(list_1) != len(list_2):  # with different length
         diffFlag = True
         logger.error('List with different length')
-    for i in range(0, len(list_1)):  # check every items
+    for i in range(0, len(list_1)):  # check every item
         if list_1[i] == list_2[i]:
             continue
         diffFlag = True  # found different item

src/crawler/wxsy.net/release.py  (+1 -1)

@@ -28,7 +28,7 @@ def listDiff(list_1: list, list_2: list) -> bool: # compare two lists
     if len(list_1) != len(list_2):  # with different length
         diffFlag = True
         logger.error('List with different length')
-    for i in range(0, len(list_1)):  # check every items
+    for i in range(0, len(list_1)):  # check every item
         if list_1[i] == list_2[i]:
             continue
         diffFlag = True  # found different item
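
Review note: this is the same one-word comment fix as in the m.wxsy.net copy above; the two crawler variants evidently carry identical listDiff helpers, so the change lands in both.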
