Browse Source

fix: filter and repair of some records

master
Dnomd343 3 years ago
parent
commit
cb93679b3c
  1. 43
      src/analyse.php
  2. 2
      src/load.php
  3. 70
      src/main.php
  4. 1256
      whois-server.txt

43
src/analyse.php

@ -225,6 +225,9 @@ function splitHtml($htmlFile) {
}
foreach ($result as &$row) {
$row = trim($row);
$row = str_replace(''', '\'', $row);
$row = str_replace('&', '&', $row);
$row = str_replace('"', '"', $row);
}
return $result;
}
@ -260,7 +263,7 @@ function getHtmlType($str) { // 提取TLD类型
case '(Restricted generic top-level domain)':
return 'Restricted TLD';
case '(Test top-level domain)':
return 'TLD for test';
return 'Test TLD';
default:
die('error analyse -> type');
}
@ -269,8 +272,8 @@ function getHtmlType($str) { // 提取TLD类型
function getHtmlManager($str) { // 提取TLD所有者信息
if ($str == '') {
return array(
'manager' => '',
'manager_info' => ''
'name' => array(),
'addr' => array()
);
}
$temp = explode('</b><br/>', $str);
@ -278,17 +281,19 @@ function getHtmlManager($str) { // 提取TLD所有者信息
die('error analyse -> manager');
}
$manager = trim($temp[0]);
preg_match('/\\\u[0-9a-f]{4}/', $manager, $match);
$manager = substr($manager, 3 - strlen($manager));
if ($manager === 'Not assigned') {
return array(
'manager' => '',
'manager_info' => ''
'name' => array(),
'addr' => array()
);
}
$manager = explode('<br>', $manager);
if ($temp[1] == '') {
return array(
'manager' => $manager,
'manager_info' => ''
'name' => $manager,
'addr' => array()
);
}
$temp = str_replace('<br>', '<br/>', trim($temp[1]));
@ -303,8 +308,8 @@ function getHtmlManager($str) { // 提取TLD所有者信息
die('error analyse -> manager');
}
return array(
'manager' => $manager,
'manager_addr' => $manager_addr
'name' => $manager,
'addr' => $manager_addr
);
}
@ -370,7 +375,7 @@ function getHtmlContact($str) { // 提取联系人信息
$flag = false;
foreach ($temp as $line) {
if (!$flag) {
$result['org'] = $line;
$result['org'] = preg_replace('/[\s]+/', ' ', $line);
$flag = true;
continue;
}
@ -398,15 +403,21 @@ function getHtmlContact($str) { // 提取联系人信息
if ($result['email'] != '' && substr($result['email'], -5) !== '<br/>') {
die('error analyse -> contact');
}
$result['email'] = substr($result['email'], 0, strlen($result['email']) - 5);
if ($result['email'] !== '') {
$result['email'] = substr($result['email'], 0, strlen($result['email']) - 5);
}
if ($result['voice'] != '' && substr($result['voice'], -5) !== '<br/>') {
die('error analyse -> contact');
}
$result['voice'] = substr($result['voice'], 0, strlen($result['voice']) - 5);
if ($result['voice'] !== '') {
$result['voice'] = substr($result['voice'], 0, strlen($result['voice']) - 5);
}
if ($result['fax'] != '' && substr($result['fax'], -5) !== '<br/>') {
die('error analyse -> contact');
}
$result['fax'] = substr($result['fax'], 0, strlen($result['fax']) - 5);
if ($result['fax'] !== '') {
$result['fax'] = substr($result['fax'], 0, strlen($result['fax']) - 5);
}
if ($result['fax'] === 'n/a' || $result['fax'] === 'NA' || $result['fax'] === 'N/A' || $result['fax'] === '-') {
$result['fax'] = '';
}
@ -476,12 +487,6 @@ function getHtmlNS($str) { // 提取TLD名称服务器
}
function getHtmlInfo($str) { // 获取官网/Whois服务器信息
// if ($str == '') {
// return array(
// 'website' => '',
// 'whois' => ''
// );
// }
preg_match_all('/<p>[\s\S]+?<\/p>/', $str, $match);
if (count($match) !== 1) {
die('error analyse -> info');

2
src/load.php

@ -8,7 +8,7 @@ function getTldsInfo($tldList, $htmlDir) { // 抓取各个TLD数据
die('error analyse -> title');
}
$info['type'] = getHtmlType($html['type']);
$info += getHtmlManager($html['manager']);
$info['manager'] = getHtmlManager($html['manager']);
$info['admin_contact'] = getHtmlContact($html['admin']);
$info['tech_contact'] = getHtmlContact($html['tech']);
$info['nameserver'] = getHtmlNS($html['ns']);

70
src/main.php

@ -4,30 +4,60 @@ require_once './load.php';
require_once './analyse.php';
require_once './punycode.php';
// function writeFile($filename, $data) {
// $file = fopen($filename, 'w');
// fwrite($file, $data);
// fclose($file);
// }
// $data = getIanaTlds('../html/main.html'); // https://www.iana.org/domains/root/db
// // writeFile('tlds.txt', implode(PHP_EOL, $data) . PHP_EOL);
// $urls = '';
// foreach ($data as $tld) {
// $urls .= 'https://www.iana.org/domains/root/db/';
// $urls .= substr($tld, 1, strlen($tld) - 1);
// $urls .= '.html' . PHP_EOL;
// }
// writeFile('urls.txt', $urls);
$tlds = getIanaTlds('../html/main.html');
$data = getTldsInfo($tlds, '../html/tlds/');
// echo count($data);
$html_path = '../html/';
$release_path = '../release/';
// Get TLD list from IANA website
// main.html -> https://www.iana.org/domains/root/db
$tlds = getIanaTlds($html_path . 'main.html');
writeFile($release_path . 'tld-list.txt', implode(PHP_EOL, $tlds) . PHP_EOL);
// Analyse all TLDs from html files
$data = getTldsInfo($tlds, $html_path . 'tlds/');
// $str = $data['.to']['admin_contact']['org'];
// echo preg_replace('/[\s]+/', ' ', $str);
// echo $str;
// exit;
// var_dump($data['.com']);
// exit;
foreach ($data as $index => $row) {
// if ($row['admin_contact']['voice'] === false) {
// echo $index . ' -> ' . implode(' | ', $row['manager']['addr']) . PHP_EOL;
// if (count($row['nameserver']) === 0) { continue; }
// echo $row['website'] . PHP_EOL;
// echo $row['whois'] . PHP_EOL;
// echo $row['last_updated'] . PHP_EOL;
// echo $row['regist_date'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['name'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['org'] . PHP_EOL;
// echo $index . ' -> ' . implode(' | ', $row['tech_contact']['addr']) . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['email'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['voice'] . PHP_EOL;
// echo $index . ' -> ' . $row['tech_contact']['fax'] . PHP_EOL;
// }
}
// return;
// Output data by json format
writeFile($release_path . 'all-data.json', json_encode($data));
// Output whois server list by csv format
$whoisStr = '';
foreach ($data as $index => $row) {
if ($row['whois'] !== '') {
echo $index . ' -> ' . $row['whois'] . PHP_EOL;
$whoisStr .= $index . ',' . $row['whois'] . PHP_EOL;
}
}
writeFile('../release/whois-server.csv', $whoisStr);
/**
 * Write a string to a file, creating it or truncating any existing content.
 *
 * @param string $filename Path of the file to write (opened in 'w' mode).
 * @param string $data     Content to write.
 * @return void
 * @throws \RuntimeException If the file cannot be opened or the data is not fully written.
 */
function writeFile($filename, $data) {
    $handle = fopen($filename, 'w');
    if ($handle === false) {
        // Fail with a clear message instead of passing `false` on to fwrite()/fclose().
        throw new \RuntimeException('error write -> cannot open ' . $filename);
    }
    try {
        $written = fwrite($handle, $data);
    } finally {
        // Always release the handle, even if fwrite() raised.
        fclose($handle);
    }
    // fwrite() returns false on failure and may report fewer bytes than requested.
    if ($written === false || $written !== strlen($data)) {
        throw new \RuntimeException('error write -> incomplete write to ' . $filename);
    }
}
?>

1256
whois-server.txt

File diff suppressed because it is too large
Loading…
Cancel
Save