Browse Source

fix: filter and repair of some records

master
Dnomd343 3 years ago
parent
commit
cb93679b3c
  1. 43
      src/analyse.php
  2. 2
      src/load.php
  3. 70
      src/main.php
  4. 1256
      whois-server.txt

43
src/analyse.php

@ -225,6 +225,9 @@ function splitHtml($htmlFile) {
} }
foreach ($result as &$row) { foreach ($result as &$row) {
$row = trim($row); $row = trim($row);
$row = str_replace('&#39;', '\'', $row);
$row = str_replace('&amp;', '&', $row);
$row = str_replace('&quot;', '"', $row);
} }
return $result; return $result;
} }
@ -260,7 +263,7 @@ function getHtmlType($str) { // 提取TLD类型
case '(Restricted generic top-level domain)': case '(Restricted generic top-level domain)':
return 'Restricted TLD'; return 'Restricted TLD';
case '(Test top-level domain)': case '(Test top-level domain)':
return 'TLD for test'; return 'Test TLD';
default: default:
die('error analyse -> type'); die('error analyse -> type');
} }
@ -269,8 +272,8 @@ function getHtmlType($str) { // 提取TLD类型
function getHtmlManager($str) { // 提取TLD所有者信息 function getHtmlManager($str) { // 提取TLD所有者信息
if ($str == '') { if ($str == '') {
return array( return array(
'manager' => '', 'name' => array(),
'manager_info' => '' 'addr' => array()
); );
} }
$temp = explode('</b><br/>', $str); $temp = explode('</b><br/>', $str);
@ -278,17 +281,19 @@ function getHtmlManager($str) { // 提取TLD所有者信息
die('error analyse -> manager'); die('error analyse -> manager');
} }
$manager = trim($temp[0]); $manager = trim($temp[0]);
preg_match('/\\\u[0-9a-f]{4}/', $manager, $match);
$manager = substr($manager, 3 - strlen($manager)); $manager = substr($manager, 3 - strlen($manager));
if ($manager === 'Not assigned') { if ($manager === 'Not assigned') {
return array( return array(
'manager' => '', 'name' => array(),
'manager_info' => '' 'addr' => array()
); );
} }
$manager = explode('<br>', $manager);
if ($temp[1] == '') { if ($temp[1] == '') {
return array( return array(
'manager' => $manager, 'name' => $manager,
'manager_info' => '' 'addr' => array()
); );
} }
$temp = str_replace('<br>', '<br/>', trim($temp[1])); $temp = str_replace('<br>', '<br/>', trim($temp[1]));
@ -303,8 +308,8 @@ function getHtmlManager($str) { // 提取TLD所有者信息
die('error analyse -> manager'); die('error analyse -> manager');
} }
return array( return array(
'manager' => $manager, 'name' => $manager,
'manager_addr' => $manager_addr 'addr' => $manager_addr
); );
} }
@ -370,7 +375,7 @@ function getHtmlContact($str) { // 提取联系人信息
$flag = false; $flag = false;
foreach ($temp as $line) { foreach ($temp as $line) {
if (!$flag) { if (!$flag) {
$result['org'] = $line; $result['org'] = preg_replace('/[\s]+/', ' ', $line);
$flag = true; $flag = true;
continue; continue;
} }
@ -398,15 +403,21 @@ function getHtmlContact($str) { // 提取联系人信息
if ($result['email'] != '' && substr($result['email'], -5) !== '<br/>') { if ($result['email'] != '' && substr($result['email'], -5) !== '<br/>') {
die('error analyse -> contact'); die('error analyse -> contact');
} }
$result['email'] = substr($result['email'], 0, strlen($result['email']) - 5); if ($result['email'] !== '') {
$result['email'] = substr($result['email'], 0, strlen($result['email']) - 5);
}
if ($result['voice'] != '' && substr($result['voice'], -5) !== '<br/>') { if ($result['voice'] != '' && substr($result['voice'], -5) !== '<br/>') {
die('error analyse -> contact'); die('error analyse -> contact');
} }
$result['voice'] = substr($result['voice'], 0, strlen($result['voice']) - 5); if ($result['voice'] !== '') {
$result['voice'] = substr($result['voice'], 0, strlen($result['voice']) - 5);
}
if ($result['fax'] != '' && substr($result['fax'], -5) !== '<br/>') { if ($result['fax'] != '' && substr($result['fax'], -5) !== '<br/>') {
die('error analyse -> contact'); die('error analyse -> contact');
} }
$result['fax'] = substr($result['fax'], 0, strlen($result['fax']) - 5); if ($result['fax'] !== '') {
$result['fax'] = substr($result['fax'], 0, strlen($result['fax']) - 5);
}
if ($result['fax'] === 'n/a' || $result['fax'] === 'NA' || $result['fax'] === 'N/A' || $result['fax'] === '-') { if ($result['fax'] === 'n/a' || $result['fax'] === 'NA' || $result['fax'] === 'N/A' || $result['fax'] === '-') {
$result['fax'] = ''; $result['fax'] = '';
} }
@ -476,12 +487,6 @@ function getHtmlNS($str) { // 提取TLD名称服务器
} }
function getHtmlInfo($str) { // 获取官网/Whois服务器信息 function getHtmlInfo($str) { // 获取官网/Whois服务器信息
// if ($str == '') {
// return array(
// 'website' => '',
// 'whois' => ''
// );
// }
preg_match_all('/<p>[\s\S]+?<\/p>/', $str, $match); preg_match_all('/<p>[\s\S]+?<\/p>/', $str, $match);
if (count($match) !== 1) { if (count($match) !== 1) {
die('error analyse -> info'); die('error analyse -> info');

2
src/load.php

@ -8,7 +8,7 @@ function getTldsInfo($tldList, $htmlDir) { // 抓取各个TLD数据
die('error analyse -> title'); die('error analyse -> title');
} }
$info['type'] = getHtmlType($html['type']); $info['type'] = getHtmlType($html['type']);
$info += getHtmlManager($html['manager']); $info['manager'] = getHtmlManager($html['manager']);
$info['admin_contact'] = getHtmlContact($html['admin']); $info['admin_contact'] = getHtmlContact($html['admin']);
$info['tech_contact'] = getHtmlContact($html['tech']); $info['tech_contact'] = getHtmlContact($html['tech']);
$info['nameserver'] = getHtmlNS($html['ns']); $info['nameserver'] = getHtmlNS($html['ns']);

70
src/main.php

@ -4,30 +4,60 @@ require_once './load.php';
require_once './analyse.php'; require_once './analyse.php';
require_once './punycode.php'; require_once './punycode.php';
// function writeFile($filename, $data) { $html_path = '../html/';
// $file = fopen($filename, 'w'); $release_path = '../release/';
// fwrite($file, $data);
// fclose($file);
// }
// $data = getIanaTlds('../html/main.html'); // https://www.iana.org/domains/root/db
// // writeFile('tlds.txt', implode(PHP_EOL, $data) . PHP_EOL);
// $urls = '';
// foreach ($data as $tld) {
// $urls .= 'https://www.iana.org/domains/root/db/';
// $urls .= substr($tld, 1, strlen($tld) - 1);
// $urls .= '.html' . PHP_EOL;
// }
// writeFile('urls.txt', $urls);
$tlds = getIanaTlds('../html/main.html');
$data = getTldsInfo($tlds, '../html/tlds/');
// echo count($data);
// Get TLD list from IANA website
// main.html -> https://www.iana.org/domains/root/db
$tlds = getIanaTlds($html_path . 'main.html');
writeFile($release_path . 'tld-list.txt', implode(PHP_EOL, $tlds) . PHP_EOL);
// Analyse all TLDs from html files
$data = getTldsInfo($tlds, $html_path . 'tlds/');
// $str = $data['.to']['admin_contact']['org'];
// echo preg_replace('/[\s]+/', ' ', $str);
// echo $str;
// exit;
// var_dump($data['.com']);
// exit;
foreach ($data as $index => $row) {
// if ($row['admin_contact']['voice'] === false) {
// echo $index . ' -> ' . implode(' | ', $row['manager']['addr']) . PHP_EOL;
// if (count($row['nameserver']) === 0) { continue; }
// echo $row['website'] . PHP_EOL;
// echo $row['whois'] . PHP_EOL;
// echo $row['last_updated'] . PHP_EOL;
// echo $row['regist_date'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['name'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['org'] . PHP_EOL;
// echo $index . ' -> ' . implode(' | ', $row['tech_contact']['addr']) . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['email'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['voice'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['fax'] . PHP_EOL;
// }
}
// return;
// Output data by json format
writeFile($release_path . 'all-data.json', json_encode($data));
// Output whois server list by csv format
$whoisStr = '';
foreach ($data as $index => $row) { foreach ($data as $index => $row) {
if ($row['whois'] !== '') { if ($row['whois'] !== '') {
echo $index . ' -> ' . $row['whois'] . PHP_EOL; $whoisStr .= $index . ',' . $row['whois'] . PHP_EOL;
} }
} }
writeFile('../release/whois-server.csv', $whoisStr);
// Write $data to $filename, replacing any existing content.
//   $filename: path of the file to create or truncate.
//   $data:     string content to write.
// Aborts the script (same die() convention as the analyse helpers) when the
// file cannot be opened or the content is not written completely, instead of
// passing a failed handle on to fwrite()/fclose().
function writeFile($filename, $data) {
    $file = fopen($filename, 'w');
    if ($file === false) { // e.g. missing directory or no write permission
        die('error write -> ' . $filename);
    }
    $written = fwrite($file, $data);
    fclose($file); // always release the handle before deciding on failure
    if ($written === false || $written !== strlen($data)) {
        die('error write -> ' . $filename);
    }
}
?> ?>

1256
whois-server.txt

File diff suppressed because it is too large
Loading…
Cancel
Save