From eff4a278dea6067df8d1e84a7e659c1d2560c904 Mon Sep 17 00:00:00 2001 From: Dnomd343 Date: Tue, 6 Dec 2022 15:20:13 +0800 Subject: [PATCH] docs: update README.md --- README.md | 57 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 4a0795b..e0b8181 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,11 @@ s3_fixed(sample_3) end + subgraph release + sa{{sample_a}} + sb{{sample_b}} + end + s1a_raw -- replenish --> s1_combine s1b_raw --> s1_combine s2a_raw -- replenish --> s2_combine @@ -32,6 +37,11 @@ s1_combine -- fix --> s1_fixed s2_combine -- fix --> s2_fixed s3_combine -- fix --> s3_fixed + + s1_fixed --> sa + s2_fixed -- replenish --> sa + s2_fixed -. restore .-> sb + s3_fixed -- replenish --> sb end subgraph crawler @@ -44,16 +54,9 @@ source_7([zhihu.com]) --> s3_raw end - subgraph release - sa{{sample_a}} - sb{{sample_b}} + subgraph rc rc-1(rc-1) - s1_fixed --> sa - s2_fixed -- replenish --> sa - s2_fixed -. restore .-> sb - s3_fixed -- replenish --> sb - sa --> rc-1 sb -- fix --> rc-1 end @@ -61,47 +64,51 @@ ## 数据爬虫来源 -+ [`108shu.com`](./src/crawler/108shu.com) :[`http://www.108shu.com/book/54247/`](http://www.108shu.com/book/54247/) ++ [`108shu.com`](./src/crawler/108shu.com) :[http://www.108shu.com/book/54247/](http://www.108shu.com/book/54247/) -+ [`aidusk.com`](./src/crawler/aidusk.com) :[`http://www.aidusk.com/t/134659/`](http://www.aidusk.com/t/134659/) ++ [`aidusk.com`](./src/crawler/aidusk.com) :[http://www.aidusk.com/t/134659/](http://www.aidusk.com/t/134659/) -+ [`ixsw.la`](./src/crawler/ixsw.la) :[`https://www.ixsw.la/ks82668/`](https://www.ixsw.la/ks82668/) ++ [`ixsw.la`](./src/crawler/ixsw.la) :[https://www.ixsw.la/ks82668/](https://www.ixsw.la/ks82668/) -+ [`m.wxsy.net`](./src/crawler/m.wxsy.net) :[`https://m.wxsy.net/novel/57104/`](https://m.wxsy.net/novel/57104/) ++ [`m.wxsy.net`](./src/crawler/m.wxsy.net) :[https://m.wxsy.net/novel/57104/](https://m.wxsy.net/novel/57104/) -+ [`wxsy.net`](./src/crawler/wxsy.net) :[`https://www.wxsy.net/novel/57104/`](https://www.wxsy.net/novel/57104/) ++ [`wxsy.net`](./src/crawler/wxsy.net) :[https://www.wxsy.net/novel/57104/](https://www.wxsy.net/novel/57104/) -+ [`xswang.com`](./src/crawler/xswang.com) :[`https://www.xswang.com/book/56718/`](https://www.xswang.com/book/56718/) ++ [`xswang.com`](./src/crawler/xswang.com) :[https://www.xswang.com/book/56718/](https://www.xswang.com/book/56718/) -+ [`zhihu.com`](./src/crawler/zhihu.com) :[`https://www.zhihu.com/column/c_1553471910075449344`](https://www.zhihu.com/column/c_1553471910075449344) ++ [`zhihu.com`](./src/crawler/zhihu.com) :[https://www.zhihu.com/column/c_1553471910075449344](https://www.zhihu.com/column/c_1553471910075449344) ## 爬虫样本分析 原始爬虫得到5份三组不同 `raw` 数据: -+ sample_1-a ++ `sample_1-a` -+ sample_1-b ++ `sample_1-b` -+ sample_2-a ++ `sample_2-a` -+ sample_2-b ++ `sample_2-b` -+ sample_3 ++ `sample_3` 经过简单合并后可得到三份初始 `combine` 样本: -+ sample_1 ++ `sample_1` -+ sample_2 ++ `sample_2` -+ sample_3 ++ `sample_3` 三份样本进行对照合并,修复各类语法词汇错误、违禁屏蔽词等,得到最终的三组 `fixed` 样本,而后对其合并,获得两组 `release` 样本: -+ sample_a ++ `sample_a` -+ sample_b ++ `sample_b` 两组样本只有微小的分隔区别,经过修复合并后得到 `RC` 样本。 + +## 数据发布 + ++ `RC-1` :初始合并版本