Browse Source

feat: fully automatic data acquisition

master
Dnomd343 3 years ago
parent
commit
19fbc19d63
  1. 5
      build.sh
  2. 2
      html/tlds/biz.html
  3. 2
      html/tlds/compare.html
  4. 12
      html/tlds/eu.html
  5. 2
      html/tlds/kiwi.html
  6. 7
      html/tlds/lb.html
  7. 28
      html/tlds/london.html
  8. 2
      html/tlds/select.html
  9. 12
      html/tlds/xn--e1a4c.html
  10. 12
      html/tlds/xn--qxa6a.html
  11. 2
      release/all-data.json
  12. BIN
      release/iana-data.db
  13. 3
      src/analyse.php
  14. 45
      src/load.php
  15. 55
      src/main.php

5
build.sh

@ -0,0 +1,5 @@
# Rebuild the ./html snapshot: run the PHP scraper from ./src, then replace
# ./html with the ./temp tree.
#
# Stop at the first failing command, so a failed `cd` or a non-zero exit from
# the scraper never reaches the destructive `rm -rf` below.
set -e

cd ./src
php main.php
cd ../

# Only discard the previous snapshot once a replacement directory exists;
# otherwise `mv` would fail after ./html had already been deleted.
if [ ! -d ./temp ]; then
    echo 'error -> ./temp was not produced, keeping the existing ./html' >&2
    exit 1
fi

rm -rf ./html
mv ./temp ./html

2
html/tlds/biz.html

@ -56,7 +56,7 @@
<h2>Sponsoring Organisation</h2>
<b>Registry Services, LLC</b><br/>
14455 North Hayden Rd.<br>Scottsdale, AZ 85260<br/>
2155 E. GoDaddy Way<br>Tempe, AZ 85284<br/>
United States<br/>
<h2>Administrative Contact</h2>

2
html/tlds/compare.html

@ -56,7 +56,7 @@
<h2>Sponsoring Organisation</h2>
<b>Registry Services, LLC</b><br/>
21575 Ridgetop Circle<br>Sterling, VA 20166<br/>
2155 E. GoDaddy Way<br>Tempe, AZ 85284<br/>
United States<br/>
<h2>Administrative Contact</h2>

12
html/tlds/eu.html

@ -97,11 +97,6 @@
<td>194.146.106.90<br/>2001:67c:1010:23:0:0:0:53<br/></td>
</tr>
<tr>
<td>nl.dns.eu</td>
<td>91.200.16.100<br/></td>
</tr>
<tr>
<td>x.dns.eu</td>
<td>185.151.141.1<br/>2a02:568:fe00:0:0:0:0:6575<br/></td>
@ -112,6 +107,11 @@
<td>194.0.25.28<br/>2001:678:20:0:0:0:0:28<br/></td>
</tr>
<tr>
<td>be.dns.eu</td>
<td>149.38.1.26<br/>2001:978:2:1:0:0:93:2<br/></td>
</tr>
</tbody>
</table>
</div>
@ -140,7 +140,7 @@
<p><i>
Record last updated 2020-01-15.
Record last updated 2021-08-10.
Registration date 2005-04-28.
</i></p>

2
html/tlds/kiwi.html

@ -122,7 +122,7 @@
<p><i>
Record last updated 2021-08-10.
Record last updated 2021-08-13.
Registration date 2013-11-25.
</i></p>

7
html/tlds/lb.html

@ -117,6 +117,11 @@
<td>41.87.127.253<br/>2c0f:feb0:0:0:0:0:0:4<br/></td>
</tr>
<tr>
<td>ns-jp.lbdr.org.lb</td>
<td>203.178.141.64<br/>2001:200:0:2:0:0:53:1<br/></td>
</tr>
</tbody>
</table>
</div>
@ -138,7 +143,7 @@
<p><i>
Record last updated 2021-05-21.
Record last updated 2021-08-10.
Registration date 1993-08-25.
</i></p>

28
html/tlds/london.html

@ -84,11 +84,6 @@
<thead><tr><th>Host Name</th><th>IP Address(es)</th></thead>
<tbody>
<tr>
<td>dns2.nic.london</td>
<td>103.49.81.28<br/>2401:fd80:401:0:0:0:0:28<br/></td>
</tr>
<tr>
<td>dns1.nic.london</td>
<td>213.248.217.28<br/>2a01:618:401:0:0:0:0:28<br/></td>
@ -105,23 +100,28 @@
</tr>
<tr>
<td>dnsb.nic.london</td>
<td>156.154.101.3<br/></td>
<td>dnsc.nic.london</td>
<td>156.154.102.3<br/></td>
</tr>
<tr>
<td>a.nic.london</td>
<td>194.169.218.141<br/>2001:67c:13cc:0:0:0:1:141<br/></td>
</tr>
<tr>
<td>dns4.nic.london</td>
<td>43.230.49.28<br/>2401:fd80:405:0:0:0:0:28<br/></td>
<td>c.nic.london</td>
<td>212.18.248.141<br/>2a04:2b00:13ee:0:0:0:0:141<br/></td>
</tr>
<tr>
<td>dnsc.nic.london</td>
<td>156.154.102.3<br/></td>
<td>b.nic.london</td>
<td>185.24.64.141<br/>2a04:2b00:13cc:0:0:0:1:141<br/></td>
</tr>
<tr>
<td>dnsd.nic.london</td>
<td>156.154.103.3<br/></td>
<td>d.nic.london</td>
<td>212.18.249.141<br/>2a04:2b00:13ff:0:0:0:0:14<br/></td>
</tr>
</tbody>
@ -152,7 +152,7 @@
<p><i>
Record last updated 2019-08-22.
Record last updated 2021-08-10.
Registration date 2014-02-13.
</i></p>

2
html/tlds/select.html

@ -56,7 +56,7 @@
<h2>Sponsoring Organisation</h2>
<b>Registry Services, LLC</b><br/>
21575 Ridgetop Circle<br>Sterling, VA 20166<br/>
2155 E. GoDaddy Way<br>Tempe, AZ 85284<br/>
United States<br/>
<h2>Administrative Contact</h2>

12
html/tlds/xn--e1a4c.html

@ -98,11 +98,6 @@
<td>194.146.106.90<br/>2001:67c:1010:23:0:0:0:53<br/></td>
</tr>
<tr>
<td>nl.dns.eu</td>
<td>91.200.16.100<br/></td>
</tr>
<tr>
<td>x.dns.eu</td>
<td>185.151.141.1<br/>2a02:568:fe00:0:0:0:0:6575<br/></td>
@ -113,6 +108,11 @@
<td>194.0.25.28<br/>2001:678:20:0:0:0:0:28<br/></td>
</tr>
<tr>
<td>be.dns.eu</td>
<td>149.38.1.26<br/>2001:978:2:1:0:0:93:2<br/></td>
</tr>
</tbody>
</table>
</div>
@ -141,7 +141,7 @@
<p><i>
Record last updated 2020-01-15.
Record last updated 2021-08-10.
Registration date 2015-12-08.
</i></p>

12
html/tlds/xn--qxa6a.html

@ -88,11 +88,6 @@
<thead><tr><th>Host Name</th><th>IP Address(es)</th></thead>
<tbody>
<tr>
<td>nl.dns.eu</td>
<td>91.200.16.100<br/></td>
</tr>
<tr>
<td>si.dns.eu</td>
<td>193.2.221.60<br/>2001:1470:8000:100:0:0:0:1<br/></td>
@ -113,6 +108,11 @@
<td>194.146.106.90<br/>2001:67c:1010:23:0:0:0:53<br/></td>
</tr>
<tr>
<td>be.dns.eu</td>
<td>149.38.1.26<br/>2001:978:2:1:0:0:93:2<br/></td>
</tr>
</tbody>
</table>
</div>
@ -141,7 +141,7 @@
<p><i>
Record last updated 2020-01-15.
Record last updated 2021-08-10.
Registration date 2019-07-18.
</i></p>

2
release/all-data.json

File diff suppressed because one or more lines are too long

BIN
release/iana-data.db

Binary file not shown.

3
src/analyse.php

@ -97,9 +97,8 @@ function isVoice($str) {
return false;
}
function splitHtml($htmlFile) {
function splitHtml($html) {
// Get core part
$html = file_get_contents($htmlFile);
$html = explode('main_right">', $html)[1];
$html = explode('<div id="sidebar_left', $html)[0];
$html = explode('<script>', $html)[0];

45
src/load.php

@ -1,8 +1,48 @@
<?php
$ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67';
/**
 * SQLite3 database handle that is opened on construction and closed again
 * when the object is destroyed.
 */
class tldDB extends SQLite3
{
    /**
     * Open the database file.
     *
     * @param string $filename Path handed straight to SQLite3::open().
     */
    public function __construct($filename)
    {
        $this->open($filename);
    }

    /**
     * Close the connection once the object goes away.
     */
    public function __destruct()
    {
        $this->close();
    }
}
/**
 * Write $data to $filename, creating the file or truncating an existing one.
 *
 * @param string $filename Path of the file to write.
 * @param string $data     Content to store.
 * @return void
 * @throws RuntimeException When the file cannot be opened or the data cannot
 *                          be written completely.
 */
function writeFile($filename, $data) {
    $payload = (string) $data;
    $file = fopen($filename, 'w');
    if ($file === false) {
        // Fail loudly instead of passing `false` on to fwrite()/fclose().
        throw new RuntimeException('error -> fail to open file: ' . $filename);
    }
    try {
        $written = fwrite($file, $payload);
    } finally {
        // Release the handle even when the write itself blows up.
        fclose($file);
    }
    if ($written === false || $written !== strlen($payload)) {
        throw new RuntimeException('error -> fail to write file: ' . $filename);
    }
}
/**
 * Fetch $url over HTTP and return the response body.
 *
 * The request is sent with the User-Agent string held in the global $ua.
 *
 * @param string $url Address to download.
 * @return string|false Response body, or false when the handle cannot be
 *                      created, the transfer fails or times out, or the
 *                      server answers with an HTTP status >= 400.
 */
function curl($url) {
    global $ua;
    $curl = curl_init();
    if ($curl === false) {
        return false;
    }
    curl_setopt_array($curl, [
        CURLOPT_URL => $url,
        CURLOPT_RETURNTRANSFER => true,
        // 20s to establish the connection ...
        CURLOPT_CONNECTTIMEOUT => 20,
        // ... and an overall cap, so a stalled transfer cannot hang the run.
        CURLOPT_TIMEOUT => 60,
        // Treat HTTP error responses as failures rather than returning the
        // error page as if it were the requested document.
        CURLOPT_FAILONERROR => true,
        CURLOPT_USERAGENT => $ua,
    ]);
    $content = curl_exec($curl);
    curl_close($curl);
    return $content;
}
/**
 * Download $url and store the response body in $filename.
 *
 * @param string $url      Address to fetch via curl().
 * @param string $filename Destination path handed to writeFile().
 * @return bool true once the body has been handed to writeFile(); false when
 *              curl() returned a falsy result, in which case nothing is written.
 */
function loadHtmlFile($url, $filename) {
    $body = curl($url);
    if ($body) {
        writeFile($filename, $body);
        return true;
    }
    return false;
}
function getTldsInfo($tldList, $htmlDir) { // 抓取各个TLD数据
foreach ($tldList as $tld) {
$html = splitHtml($htmlDir . substr($tld, 1 - strlen($tld)) . '.html');
$html_content = file_get_contents($htmlDir . substr($tld, 1 - strlen($tld)) . '.html');
$html = splitHtml($html_content);
unset($html['report']);
if (getHtmlTitle($html['title']) !== $tld) {
die('error analyse -> title');
@ -23,8 +63,7 @@ function getTldsInfo($tldList, $htmlDir) { // 抓取各个TLD数据
return $data;
}
function getIanaTlds($htmlFile) { // 获取IANA上所有TLD
$html = file_get_contents($htmlFile);
function getIanaTlds($html) { // 获取IANA上所有TLD
$html = explode('tbody>', $html)[1];
$html = explode('</tr>', $html);
unset($html[count($html) - 1]);

55
src/main.php

@ -4,21 +4,48 @@ require_once './load.php';
require_once './analyse.php';
require_once './punycode.php';
$html_path = '../html/';
$temp_path = '../temp/';
$release_path = '../release/';
shell_exec('mkdir -p ' . $release_path);
shell_exec('mkdir -p ' . $temp_path . 'tlds/');
// Get IANA main page
echo 'Connect to IANA...';
if (!loadHtmlFile('https://www.iana.org/domains/root/db', $temp_path . 'main.html')) {
die('error -> fail to load IANA main page');
}
echo "\033[32mOK\033[0m" . PHP_EOL;
// Get TLD list from IANA website
// main.html -> https://www.iana.org/domains/root/db
$tlds = getIanaTlds($html_path . 'main.html');
$html_content = file_get_contents($temp_path . 'main.html');
$tlds = getIanaTlds($html_content);
writeFile($release_path . 'tld-list.txt', implode(PHP_EOL, $tlds) . PHP_EOL);
echo "Found \033[33m" . count($tlds) . "\033[0m TLDs." . PHP_EOL;
// Fetch all tld's html file
foreach ($tlds as $index => $tld) {
$tld = substr($tld, 1 - strlen($tld));
$url = 'https://www.iana.org/domains/root/db/' . $tld . '.html';
echo "\033[36m" . ($index + 1) . '/' . count($tlds) . "\033[0m -> \033[35m." . $tld . "\033[0m";
if (!loadHtmlFile($url, $temp_path . 'tlds/' . $tld . '.html')) {
die('error -> fail to load page');
}
echo PHP_EOL;
}
// Analyse all TLDs from html files
$data = getTldsInfo($tlds, $html_path . 'tlds/');
echo 'Analyse all pages...';
$data = getTldsInfo($tlds, $temp_path . 'tlds/');
echo "\033[32mOK\033[0m" . PHP_EOL;
// Output data by json format
echo 'Save as JSON format...';
writeFile($release_path . 'all-data.json', json_encode($data));
echo "\033[32mOK\033[0m" . PHP_EOL;
// Output whois server list by csv format
echo 'Dump the whois server list...';
$whoisStr = '';
foreach ($data as $index => $row) {
if ($row['whois'] !== '') {
@ -26,8 +53,10 @@ foreach ($data as $index => $row) {
}
}
writeFile($release_path . 'whois-server.csv', $whoisStr);
echo "\033[32mOK\033[0m" . PHP_EOL;
// Output into sqlite3 database
echo 'Output into sqlite3 database...';
$init_sql =<<<EOF
CREATE TABLE data (
tld TEXT NOT NULL,
@ -61,20 +90,10 @@ foreach ($data as $tld => $info) {
}
$db->exec($sql . ');');
}
$db->exec('VACUUM;');
echo "\033[32mOK\033[0m" . PHP_EOL;
/**
 * Write $data to $filename, creating the file or truncating an existing one.
 *
 * @param string $filename Path of the file to write.
 * @param string $data     Content to store.
 * @return void
 * @throws RuntimeException When the file cannot be opened or the data cannot
 *                          be written completely.
 */
function writeFile($filename, $data) {
    $payload = (string) $data;
    $file = fopen($filename, 'w');
    if ($file === false) {
        // Fail loudly instead of passing `false` on to fwrite()/fclose().
        throw new RuntimeException('error -> fail to open file: ' . $filename);
    }
    try {
        $written = fwrite($file, $payload);
    } finally {
        // Release the handle even when the write itself blows up.
        fclose($file);
    }
    if ($written === false || $written !== strlen($payload)) {
        throw new RuntimeException('error -> fail to write file: ' . $filename);
    }
}
/**
 * SQLite3 database handle that is opened on construction and closed again
 * when the object is destroyed.
 */
class tldDB extends SQLite3
{
    /**
     * Open the database file.
     *
     * @param string $filename Path handed straight to SQLite3::open().
     */
    public function __construct($filename)
    {
        $this->open($filename);
    }

    /**
     * Close the connection once the object goes away.
     */
    public function __destruct()
    {
        $this->close();
    }
}
// All done
echo "\033[32mdone\033[0m" . PHP_EOL;
?>

Loading…
Cancel
Save