#!/usr/bin/perl
#
use strict;
use warnings;
use utf8;
use Encode;
use FileHandle;
use Text::CSV_XS;
use PerlIO::encoding;
use Encode qw(:fallbacks);
use HTML::Entities;
# getInflections(\$text, \@out)
#
# Collects the inflected forms listed after the 【変化】 marker of an
# explanation string.
#
#   $arg  - reference to the explanation string (it is not modified)
#   $infs - reference to the array receiving the forms; it is always
#           overwritten and is left empty when no marker/delimiter is found
#
# The list runs from the marker up to the first "、", newline or carriage
# return.  《...》 annotations are removed, the remainder is split on " | ",
# and every form that still contains whitespace is discarded.
sub getInflections(**)
{
    my ($arg, $infs) = @_;

    @{$infs} = ();

    # Stop at the FIRST delimiter.  A trailing ".*$" in this pattern would
    # force the lazy capture to stretch past "、" (or make the match fail)
    # whenever the text behind the delimiter spans more than one line.
    return unless ${$arg} =~ /【変化】(.*?)(?:、|\n|\r)/;
    my $forms = $1;

    $forms =~ s/《.+?》//g;
    # NOTE(review): all 《...》 groups were removed by the statement above, so
    # this substitution looks unreachable -- confirm the intended order.
    $forms =~ s/ +(\| +)*?《.+?》/ \| /g;

    # Excludes multi-word patterns such as "more xxxx" / "most xxxx".
    @{$infs} = grep { !/\s/ } split(/ \| /, $forms);
    return;
}
# Substitute(\$text)
#
# Normalises one explanation string in place.
#
#   $_[0] - reference to the string to rewrite
#
# Steps, in order:
#   * drop a 【@】 field up to the next 【 on the same line
#   * drop a 【変化】 field up to its first "、" or newline
#   * drop the 【大学入試】 tag
#   * turn " | " separators into "/"
#   * map 。、【】《》 to . , [ ] < > and ASCII [ ] to ( )
#   * replace "◆" plus following spaces by a single space
#   * remove the "-" in front of digits that directly precede "]"
#   * replace newlines and carriage returns by spaces
sub Substitute(*)
{
    my $arg = $_[0];

    ${$arg} =~ s/【@】.*?【/【/g;
    ${$arg} =~ s/【変化】.*?(?:、|\n)//g;
    ${$arg} =~ s/【大学入試】//g;
    ${$arg} =~ s/ \| /\//g;
    # The remaining pairs in this table (% _ ( ) = ^) map to themselves.
    ${$arg} =~ tr/。、【】《》[]%_()=^/.,[]<>()%_()=^/;
    ${$arg} =~ s/◆ */ /g;
    # Runs after tr///, so "]" here also covers what used to be "】".
    ${$arg} =~ s/-([0-9]+)\]/$1\]/g;
    ${$arg} =~ s/\n/ /g;
    ${$arg} =~ s/\r/ /g;
    return;
}
# printElements(\@array)
#
# Debug helper: runs elements 0 and 1 of the referenced array through
# Substitute() (on a copy, the array itself is untouched) and prints each as
# "[index] text" on STDOUT, encoded as Shift_JIS.
sub printElements(*)
{
    my ($array) = @_;

    # Only the first two elements are shown, not the whole array.
    foreach my $idx (0 .. 1) {
        my $text = $array->[$idx];
        Substitute(\$text);
        print encode('shift_jis', "[$idx] $text\n");
    }
}
# getStringOfLevels(\@array)
#
# Returns one "level[N]:value\n" line per element of the referenced array,
# concatenated in index order; an empty array yields the empty string.
sub getStringOfLevels(*)
{
    my ($levels) = @_;
    return join('', map { "level[$_]:" . $levels->[$_] . "\n" } 0 .. $#{$levels});
}
# Maps a linked word to its link id; filled by the main script.
my %linkedHash;

# replaceLink($word)
#
# Returns the replacement text for a cross reference to $word:
#   * word known in %linkedHash -> "ZXZXZXZ<id>ZXZXZXZ<word>ZXZXZXZ\n",
#     the placeholder form that the main loop later matches and hands to
#     restoreLink()
#   * word unknown              -> "[参照]<word>\n"
sub replaceLink($)
{
    my ($word) = @_;
    my $anchor = $linkedHash{$word};

    return "[参照]$word\n" unless defined $anchor;
    return "ZXZXZXZ${anchor}ZXZXZXZ${word}ZXZXZXZ\n";
}
# restoreLink($id, $word)
#
# Returns "[参照]<word>\n" for a placeholder produced by replaceLink().
# The link id is accepted as first argument but does not appear in the
# result.
sub restoreLink($$)
{
    my (undef, $word) = @_;
    return "[参照]" . $word . "\n";
}
# =======
# M A I N
# =======
#
# Command line: <input CSV> <dictionary name> [<encoding of the CSV>]
die "please specify the name of file for input.\n" if ($#ARGV < 0);
die "please specify the name of your Dictionary.\n" if ($#ARGV < 1);
my $charCodeOfCSV = 'cp932';                  # used unless a third argument is given
$charCodeOfCSV = $ARGV[2] if $#ARGV >= 2;
my $unsafe_chars = '<>&"';                    # characters replaced by HTML entities
my $fileName = $ARGV[0];
my $dictionaryName = decode('shift_jis', encode_entities($ARGV[1], $unsafe_chars));

# Options embedded in the dictionary name:
#   WIK:<n>  headwords with more than <n> words are skipped (default 1)
#   ITAI     multi-word headwords get ' name="idiom" ' as index name
my ($wordsInKey) = ($dictionaryName =~ /WIK:(\d+)/);
my ($idiomsToAnotherIndex) = ($dictionaryName =~ /ITAI/ ? 1 : undef);
$wordsInKey = 1 if !defined($wordsInKey);

# Optional list of linked words, one per line.  Every word receives the next
# id of the sequence "LK0000000", "LK0000001", ... (string increment).
my $fileOfLinkedWords = "linked.txt";
my $hasLinkedWords = 0;
my $linkedId = "LK0000000";
if (-f $fileOfLinkedWords) {
    # Test the result of open() itself: in "open ..., $file || (...)" the ||
    # binds to the (always true) file name, so a failed open goes unnoticed.
    if (open my $fhLink, "<:encoding(UTF-8)", $fileOfLinkedWords) {
        $hasLinkedWords = 1;
        print STDERR encode('shift_jis', "$fileOfLinkedWords has be opened!!\n");
        while (my $word = <$fhLink>) {
            chomp $word;
            # print STDERR encode('shift_jis', "$word: $linkedId\n");
            $linkedHash{$word} = $linkedId;
            $linkedId++;
        }
        close $fhLink;
    }
    else {
        # Linking stays disabled ($hasLinkedWords == 0); conversion continues.
        warn "cannot open $fileOfLinkedWords: $!\n";
    }
}
# Build the first page of the output and print it on STDOUT as UTF-8.
# The here-document introducer must be "<<EOF;" so that the text up to the
# "EOF" line becomes the (interpolated) value of $htmlFirstPage.
# NOTE(review): the page body contains plain text only; if markup is expected
# around these lines it has to be re-added -- confirm against the intended
# output format.
my $htmlFirstPage = <<EOF;
$dictionaryName
$dictionaryName
TOP PAGE
検索 |
*/?
EOF
print encode('utf-8', $htmlFirstPage);
# Open the input CSV with the requested encoding.  "or" is required here:
# with "||" the die would be attached to $fileName instead of to open().
open(my $fh, "<:encoding($charCodeOfCSV)", $fileName)
    or die "error, Cannot open $fileName: $!\n";
my $csv = Text::CSV_XS->new({ binary => 1, eol => $/ });   # binary => 1 is needed to read Japanese text
# The file is read one row at a time.
my $i = 0;                    # counts the rows that were turned into entries
my $maxLengthOfEntry = 0;
my @levels = (0) x 11;        # eleven counters (indices 0..10), all zero
my $indexName = "";
# Main conversion loop: reads the CSV row by row and prints one entry per
# accepted row on STDOUT (UTF-8).
#   column 0     headword
#   columns 1,2  explanation text (joined with a newline)
#   column 3     only inspected to recognise a header row
LOOP_OF_GETLINE:
while (my $row = $csv->getline($fh)) {
    my @fields = @$row;
    # last if ($i > 4000);

    # Skip a header row whose fourth column is literally "level".
    next if ($i == 0 && $fields[3] eq 'level');

    if (defined $wordsInKey) {
        # Skip headwords consisting of more than $wordsInKey words.
        my @words = split(' ', $fields[0]);
        next LOOP_OF_GETLINE if ($#words >= $wordsInKey);
        if ($idiomsToAnotherIndex) {
            $indexName = ($#words > 0 ? ' name="idiom" ' : "");
        }
    }
    $i++;

    my $targetMark = "";
    if ($hasLinkedWords) {
        # Named differently from the file-level $linkedId counter on purpose.
        my $anchorId = $linkedHash{$fields[0]};
        if (defined $anchorId) {
            # NOTE(review): only a newline is emitted for a link target;
            # confirm whether an anchor carrying the id is expected here.
            $targetMark = "\n";
        }
        # Replace "<→word>" cross references by placeholders / plain text.
        $fields[1] =~ s/<→(.*?)>/replaceLink($1)/ge;
    }

    my ($escapedKey, $key, $explain, $entry);
    $escapedKey = $key = encode_entities($fields[0], $unsafe_chars);
    # Backslash- and quote-escaped copy of the key.
    # NOTE(review): $escapedKey is not referenced again in this loop.
    $escapedKey =~ s/\\/\\\\/g;
    $escapedKey =~ s/'/\\'/g;

    my (@inflections, $inflectionTags);
    $explain = $fields[1] . "\n" . $fields[2];
    getInflections(\$explain, \@inflections);
    $inflectionTags = "";
    if ($#inflections >= 0) {
        # NOTE(review): each inflection contributes just a newline; the form
        # itself ($inf) is not written -- confirm the intended tag text.
        foreach my $inf (@inflections) {
            $inflectionTags .= "\n";
        }
        $inflectionTags = "\n\n$inflectionTags\n";
    }
    Substitute(\$explain);
    # my $len = length($explain);
    # $maxLengthOfEntry = $len if ($len > $maxLengthOfEntry);

    # Progress indicator every 2000 entries.
    print STDERR "[$i]\r" unless ($i % 2000);
    print encode('utf-8', $targetMark);

    # Void context: encode_entities() rewrites $explain in place.
    encode_entities($explain, $unsafe_chars);
    # Turn the placeholders left by replaceLink() into their final text.
    $explain =~ s/ZXZXZXZ(.*?)ZXZXZXZ(.*?)ZXZXZXZ/restoreLink($1, $2)/ge;

    # The introducer must be "<<EOF;" so that the lines up to "EOF" form the
    # interpolated entry text.
    $entry = <<EOF;
•$key$inflectionTags
$explain
EOF
    print encode('utf-8', $entry);
}
# print STDERR "[$i:$maxLengthOfEntry] ", getStringOfLevels(\@levels), "\r";
$fh->close();
# Trailer of the output.  The here-document needs its "<<EOF;" introducer to
# parse; its body is empty, so nothing is written.
# NOTE(review): if the output needs closing text after the entries, add it
# between the two lines below.
print <<EOF;
EOF
# Finish the "[N]\r" progress line on STDERR.
print STDERR "\n";