-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_pattern.php
50 lines (35 loc) · 1.36 KB
/
build_pattern.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
<?php
include 'vendor/autoload.php';
use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;
// Page that the pattern tables are generated from.
$url = 'http://wwwap.hi.u-tokyo.ac.jp/ships/itaiji_list.jsp';

// Fetch the page and select every row of the table carrying the ITAIJI class.
$client = new Client();
$crawler = $client->request('GET', $url);
$rows = $crawler->filter('table.ITAIJI tr');

// Map of search string => replacement string for every entry found.
$patterns = array();
// Subset of $patterns whose search string fails the isISO2022JPSafe() check.
$iso2022Patterns = array();

// Turns every Unicode space separator (category Zs: no-break space,
// ideographic space, ...) into an ASCII space and trims the result.
// The Zs property is used instead of literal non-ASCII space characters,
// which cannot be told apart from a plain space when reading the source.
$normalizeSpaces = function ($text) {
    $normalized = preg_replace('/\p{Zs}/u', ' ', $text);

    // preg_replace() returns null on failure (e.g. malformed UTF-8);
    // fall back to the untouched text rather than losing the cell content.
    return trim($normalized === null ? $text : $normalized);
};

$rows->each(function (Crawler $row) use (&$patterns, &$iso2022Patterns, $normalizeSpaces) {
    $cells = $row->filter('td');

    // Skip headers and any other row that lacks the cells read below
    // (indexes 1 and 2); calling text() on a missing cell would throw.
    if ($cells->count() < 3) {
        return;
    }

    // Cell 1 holds the replacement, cell 2 a whitespace-separated list of
    // strings that should be replaced by it.
    $replace = $normalizeSpaces($cells->eq(1)->text());
    $searchs = preg_split('/\s+/u', $normalizeSpaces($cells->eq(2)->text()), -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure; there is nothing to register then.
    if ($searchs === false) {
        return;
    }

    // PREG_SPLIT_NO_EMPTY already removed empty tokens, so no truthiness
    // test is needed here (one would also wrongly discard the string '0').
    foreach ($searchs as $search) {
        $patterns[$search] = $replace;
        if (!isISO2022JPSafe($search)) {
            $iso2022Patterns[$search] = $replace;
        }
    }
});
/**
 * Tells whether a UTF-8 string survives a round trip through ISO-2022-JP.
 *
 * The text is converted from UTF-8 to ISO-2022-JP and back again; it is
 * considered safe only when the result is identical to the input.
 *
 * @param string $text UTF-8 text to check.
 *
 * @return bool True when the round-tripped text is identical to $text.
 */
function isISO2022JPSafe($text)
{
    $roundTripped = mb_convert_encoding(
        mb_convert_encoding($text, 'ISO-2022-JP', 'UTF-8'),
        'UTF-8',
        'ISO-2022-JP'
    );

    return $text === $roundTripped;
}
// Refuse to overwrite the generated files when nothing was collected
// (for instance because the page was unreachable or its markup changed);
// otherwise existing pattern files would be replaced by an empty array.
if (count($patterns) === 0) {
    throw new RuntimeException('No patterns were extracted; refusing to overwrite the generated pattern files.');
}

// Each generated file is a PHP script that returns the corresponding array.
$outputs = array(
    __DIR__.'/src/pattern.php' => $patterns,
    __DIR__.'/src/iso_2022_jp_pattern.php' => $iso2022Patterns,
);

foreach ($outputs as $path => $data) {
    $code = "<?php\nreturn " . var_export($data, true) . ";\n";

    // file_put_contents() returns false on failure; surface it instead of
    // finishing silently with a missing or stale file.
    if (file_put_contents($path, $code) === false) {
        throw new RuntimeException('Failed to write ' . $path);
    }
}