#!/usr/bin/perl
#
# Converts a CSV word list into dictionary entry text written to STDOUT
# as UTF-8.
#
# Usage: <script> <csv-file> <dictionary-name> [<csv-encoding>]
#
#   <csv-file>         CSV input; columns 0, 1 and 2 are used as headword and
#                      two explanation parts, column 3 only to detect a
#                      header row ('level').
#   <dictionary-name>  Title text.  If it contains "WIK:<n>", headwords with
#                      more than <n> space-separated words are skipped
#                      (default 1).  If it contains "ITAI", multi-word
#                      headwords get a separate index attribute string.
#   <csv-encoding>     Encoding of the CSV file (default cp932).
#
# If a file named linked.txt exists in the current directory, each of its
# lines is treated as a link target headword and assigned a sequential id.
#
# NOTE(review): several string literals and here-documents below look as if
# markup was stripped from them in the copy this was restored from -- values
# such as $indexName, $escapedKey, the per-entry link id and the loop
# variable over inflections are computed but never emitted.  Confirm against
# the original script before relying on the output format.
#
use strict;
use warnings;
use utf8;
use Encode;
use FileHandle;
use Text::CSV_XS;
use PerlIO::encoding;
use Encode qw(:fallbacks);
use HTML::Entities;

# Extracts the inflected forms listed after the marker 【変化】.
#
#   $arg   reference to the explanation string (not modified)
#   $infs  reference to an array that receives the inflected forms; it is
#          emptied when no marker is found
#
# The list runs from the marker up to the first '、', line feed or carriage
# return.  The former trailing '.*$' was removed from the pattern: without /s
# it made the match fail (or capture past the first '、') whenever further
# line breaks followed, and it contributed nothing to the capture.
# NOTE(review): the line-feed alternative may originally have been a markup
# line-break token -- confirm.
sub getInflections {
    my ($arg, $infs) = @_;

    @{$infs} = ();
    if (${$arg} =~ /【変化】(.*?)(?:、|\n|\r)/) {
        my $list = $1;
        $list =~ s/《.+?》//g;
        $list =~ s/ +(\| +)*?《.+?》/ \| /g;
        # Forms containing whitespace are dropped; this excludes patterns
        # such as "more xxxx" / "most xxxx".
        @{$infs} = grep { !/\s/ } split(' \| ', $list);
        # print STDERR encode('shift_jis', "inflections: " . join('//', @{$infs}) . "\n");
    }
    return;
}

# Normalises an explanation string in place.
#
#   $arg  reference to the string to rewrite
#
# Removes the 【@】 and 【変化】 sections and the 【大学入試】 label, turns
# ' | ' into '/', maps Japanese punctuation and brackets to ASCII, and
# replaces line breaks with spaces.
sub Substitute {
    my ($arg) = @_;

    ${$arg} =~ s/【@】.*?【/【/g;
    # ${$arg} =~ s/【変化】《.+?》.*?(、|\n)//g;
    ${$arg} =~ s/【変化】.*?(、|\n)//g;
    ${$arg} =~ s/【大学入試】//g;
    ${$arg} =~ s/ \| /\//g;
    # ${$arg} =~ s/【(.+?)】/[__START_BOLD__$1__END_BOLD__]/g;
    # ${$arg} =~ s/《(.+?)》/[__START_BOLD__$1__END_BOLD__]/g;
    # NOTE(review): the trailing '%_()=^' pairs map characters to themselves
    # as written; presumably the search side held full-width variants --
    # verify against the original.
    ${$arg} =~ tr/。、【】《》[]%_()=^/.,[]<>()%_()=^/;
    ${$arg} =~ s/◆ */ /g;
    ${$arg} =~ s/-([0-9]+)\]/$1\]/g;
    ${$arg} =~ s/\n/ /g;
    ${$arg} =~ s/\r/ /g;
    return;
}

# Debug helper: prints the first two elements of the referenced array to
# STDOUT (Shift_JIS) after passing a copy of each through Substitute().
sub printElements {
    my ($array) = @_;

    for my $i (0 .. 1) {
        my $str = $array->[$i];
        Substitute(\$str);
        print encode('shift_jis', "[$i] $str\n");
    }
    return;
}

# Debug helper: returns one "level[<index>]:<value>" line per element of the
# referenced array, concatenated into a single string.
sub getStringOfLevels {
    my ($array) = @_;

    my $retStr = "";
    for my $x (0 .. $#{$array}) {
        $retStr .= ("level[$x]:" . $array->[$x] . "\n");
    }
    return $retStr;
}

# Maps a link target headword to the id assigned while reading linked.txt.
my %linkedHash;

# Replacement callback for '<→target>' references.  For a known target it
# returns a placeholder carrying the id and the target text; the placeholder
# is inserted before the explanation is entity-encoded and is expanded by
# restoreLink() afterwards.  Unknown targets become a plain "[参照]" note.
sub replaceLink {
    my ($idx) = @_;

    my $id = $linkedHash{$idx};
    if (defined $id) {
        # print STDERR encode('shift_jis', "** $idx, $id\n");
        return "ZXZXZXZ${id}ZXZXZXZ${idx}ZXZXZXZ\n";
    }
    return "[参照]$idx\n";
}

# Expands a placeholder produced by replaceLink().
# NOTE(review): $id is not used in the returned text as written; presumably
# link markup referring to it was lost -- confirm.
sub restoreLink {
    my ($id, $idx) = @_;

    return "[参照]${idx}\n";
}

# =======
# M A I N
# =======
#
die "please specify the name of file for input.\n" if ($#ARGV < 0);
die "please specify the name of your Dictionary.\n" if ($#ARGV < 1);

my $charCodeOfCSV = 'cp932';
$charCodeOfCSV = $ARGV[2] if $#ARGV >= 2;

my $unsafe_chars = '<>&"';
my $fileName = $ARGV[0];
my $dictionaryName = decode('shift_jis', encode_entities($ARGV[1], $unsafe_chars));
my ($wordsInKey) = ($dictionaryName =~ /WIK:(\d+)/);
my ($idiomsToAnotherIndex) = ($dictionaryName =~ /ITAI/ ? 1 : undef);
$wordsInKey = 1 if !defined($wordsInKey);

my $fileOfLinkedWords = "linked.txt";
my $hasLinkedWords = 0;
my $linkedId = "LK0000000";
if (-f $fileOfLinkedWords) {
    # The open result is tested directly: with 'open ..., $name || ...' the
    # '||' applied to the file name, so a failed open went unnoticed.
    if (open my $fhLink, "<:encoding(UTF-8)", $fileOfLinkedWords) {
        $hasLinkedWords = 1;
        print STDERR encode('shift_jis', "$fileOfLinkedWords has be opened!!\n");
        while (my $line = <$fhLink>) {
            chomp $line;
            # print STDERR encode('shift_jis', "$line: $linkedId\n");
            $linkedHash{$line} = $linkedId;
            $linkedId++;    # string increment: LK0000000, LK0000001, ...
        }
        close $fhLink;
    }
    else {
        # Links are optional: report the problem and continue without them.
        warn "cannot open $fileOfLinkedWords: $!\n";
    }
}

# NOTE(review): only the text of the front page is visible; any surrounding
# markup must be restored from the original script.
my $htmlFirstPage = <<"EOF";
$dictionaryName

$dictionaryName

 TOP PAGE 


検索 | */?

EOF
print encode('utf-8', $htmlFirstPage);

open my $fh, "<:encoding($charCodeOfCSV)", $fileName
    or die "error, Cannot open $fileName: $!\n";

# binary must be 1 when reading Japanese text.
my $csv = Text::CSV_XS->new({ binary => 1, eol => $/ });

# Read one row at a time.
my $i = 0;
my $maxLengthOfEntry = 0;
my @levels = (0) x 11;
my $indexName = "";

LOOP_OF_GETLINE:
while (my $row = $csv->getline($fh)) {
    my @fields = @$row;
    # last if ($i > 4000);

    # Skip the header row.
    next if ($i == 0 && defined $fields[3] && $fields[3] eq 'level');

    if (defined $wordsInKey) {
        my @words = split(' ', $fields[0]);
        # Skip headwords made of more than $wordsInKey words.
        next LOOP_OF_GETLINE if ($#words >= $wordsInKey);
        if ($idiomsToAnotherIndex) {
            $indexName = ($#words > 0 ? ' name="idiom" ' : "");
        }
    }
    $i++;

    my $targetMark = "";
    if ($hasLinkedWords) {
        my $anchorId = $linkedHash{$fields[0]};
        if (defined $anchorId) {
            # NOTE(review): presumably an anchor carrying $anchorId was
            # emitted here -- confirm.
            $targetMark = "\n";
        }
        $fields[1] =~ s/<→(.*?)>/replaceLink($1)/ge;
    }

    my ($escapedKey, $key, $explain, $entry);
    $escapedKey = $key = encode_entities($fields[0], $unsafe_chars);
    $escapedKey =~ s/\\/\\\\/g;
    $escapedKey =~ s/'/\\'/g;

    my (@inflections, $inflectionTags);
    # NOTE(review): the visible source joined the two parts with a line break
    # and then replaced every newline with a line break, which is a no-op as
    # written; if a markup line break was intended, restore it here.
    $explain = $fields[1] . "\n" . $fields[2];

    getInflections(\$explain, \@inflections);
    $inflectionTags = "";
    if ($#inflections >= 0) {
        foreach my $inf (@inflections) {
            # NOTE(review): $inf is not emitted as written; presumably one
            # index tag per inflection was generated here -- confirm.
            $inflectionTags .= "\n";
        }
        $inflectionTags = "\n\n$inflectionTags\n";
    }

    Substitute(\$explain);
    # my $len = length($explain);
    # $maxLengthOfEntry = $len if ($len > $maxLengthOfEntry);
    print STDERR "[$i]\r" unless ($i % 2000);    # progress indicator
    print encode('utf-8', $targetMark);

    # Called in void context, encode_entities() modifies $explain in place.
    encode_entities($explain, $unsafe_chars);
    $explain =~ s/ZXZXZXZ(.*?)ZXZXZXZ(.*?)ZXZXZXZ/restoreLink($1, $2)/ge;
    # $pieceOfExplain =~ s/__START_BOLD__//g;
    # $pieceOfExplain =~ s/__END_BOLD__/<\/b>/g;

    $entry = <<"EOF";

$key$inflectionTags

$explain
EOF
    print encode('utf-8', $entry);
}

# getline() also returns false on a parse error; report it instead of
# silently producing truncated output.
$csv->eof or $csv->error_diag();

# print STDERR "[$i:$maxLengthOfEntry] ", getStringOfLevels(\@levels), "\r";
close $fh;

# NOTE(review): the closing here-document is empty in the visible source;
# presumably it held the closing markup of the document -- confirm.
print <<"EOF";
EOF
print STDERR "\n";