# -*- Perl -*-
# $Id: migemo.pl,v 1.2 2001/03/14 02:05:53 satoru Exp $
#
# Copyright (C) 2000 Satoru Takabayashi All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either versions 2, or (at your option)
#  any later version.
# 
#  This program is distributed in the hope that it will be useful
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA
#
#  This file must be encoded in EUC-JP encoding

use strict;
use Lingua::Romkan;
use Getopt::Long;
use FileHandle;
use Search::Dict;

$| = '1';  # turn buffring off.

my $ASCII = '[\x00-\x7f]';
my $CHAR  = '(?:[\x00-\x7f]|(?:\x8f[\xa1-\xfe]|[\x8e\xa1-\xfe])[\xa1-\xfe])';
my $EUC   = '(?:(?:\x8f[\xa1-\xfe]|[\x8e\xa1-\xfe])[\xa1-\xfe])';
my $KANJI = '(?:[\xb0-\xfe][\xa1-\xfe]|\xa1\xb9)';

my %hanzen    = init_hanzen();
my $hanzenpat = init_hanzenpat();

my %cache = ();

my $dictfile;
my $dictfh;
my $insert_string;
my $opt_optimize;
my $opt_nocache;
main();

sub main {
    parse_options();
    open_dictionary();
    init_cache();

    while (<>) {
	chomp $_;
	my $roma  = $_;
	if (!$opt_nocache && $cache{$roma}) {
	    print $cache{$roma}, "\n";
	} else {
	    my $pat = genpat($roma);
	    $cache{$roma} = $pat;
	    print $pat, "\n";
	}
	print "ok\n";
    }
    exit 0;
}

sub open_dictionary {
    $dictfh = new FileHandle;
    $dictfh->open($dictfile) || die "$dictfile: $!";
}

sub parse_options {
    my $additional_inc;

    Getopt::Long::Configure('bundling');
    GetOptions('i|insert-string=s' => \$insert_string,
	       'o|optimize'        => \$opt_optimize,
	       'n|nocache'         => \$opt_nocache,
	       'I|INC=s'           => \$additional_inc,
	       );
    exit 1 if (@ARGV == 0);
    push @INC, $additional_inc if defined $additional_inc;
    $dictfile = $ARGV[0];
    shift @ARGV;
}

sub genpat {
    my ($roma) = @_;
    my @cand  = ();

    push @cand, hanzen($roma);  # should be before escaping.
    $roma = quotemeta($roma);
    $roma =~ /^([-a-z\']*)/;
    my $head = $1;
    push @cand, $roma;

    if ($head ne "") {
	my $hepburn = romrom($roma);  # normalize into Hepburn.
	my @words = extract_words($roma);         # for entries such as `file'
	if ($hepburn ne $roma) {
	    push @words, extract_words($hepburn); # for entries such as `kanji'
	}
	my @hiras = extract_kanas($hepburn);
	my @katas = map { hirakata($_) } @hiras;

	push @cand, $roma, @words, @hiras, @katas;
    }

    @cand = process($roma, @cand);

    my $pat = join "\\|", map { emacsize($_) } @cand;

    return $pat;
}

# Absorb regular expression incompatibility.
sub emacsize {
    my ($pat) = @_;

    $pat =~ s/\\\(/(/g;
    $pat =~ s/\\\)/)/g;
    $pat =~ s/\\\|/|/g;
    $pat =~ s/\\\</</g;
    $pat =~ s/\\\>/>/g;
    $pat =~ s/\\\=/=/g;
    $pat =~ s/\\\'/'/g;
    $pat =~ s/\\\`/`/g;
    $pat =~ s/\\\{/{/g;

    return $pat;
}


# FIXME: More clear name is needed.
sub process {
    my ($roma, @cand) = @_;

    # Remove duplicates.
    my %seen = ();
    @cand = grep { ! $seen{$_}++ } @cand;

    if (defined $opt_optimize) {
	@cand = optimize(@cand);
    }

    if (defined $insert_string) {
	@cand = insert_string(@cand);
    }

    return @cand;
}

sub optimize {
    my @cand = @_;

    # Remove entries having same a prefix to shorten the resulting pattern.
    # e.g. ( ư ܸ ư ư) => (ư )
    my $prefixpat = undef;
    @cand = grep {
	if (defined $prefixpat && /^$prefixpat/) {
	    0;  # Exclude it.
	} else {
	    # Apply quotemeta to escape perl regex's meta chars.
	    $prefixpat = quotemeta($_);
	    1;  # Include it.
	}
    } sort @cand;  # Must do sorting.

    # Gather one-length entries.
    # e.g. (s      )
    my @onelens = grep { /^$CHAR$/ } @cand;
    # Exclude one-length entries.
    @cand = grep { ! /^$CHAR$/ } @cand;

    # Sort in short order.
    @cand = map  { $_->[0] } 
            sort { $a->[1] <=> $b->[1] || $a->[0] cmp $b->[0] } 
            map  { [ $_, length($_) ] } @cand;

    # Make them a character class.
    if (@onelens > 0) {
	my $charclass = join '', ("[", @onelens, "]");
	unshift @cand, $charclass;  # Add to the head.
    }

    return @cand;
}

sub insert_string {
    my @cand = @_;

    # Insert $insert_string into each 2 byte character 
    # except character in a character class.
    # e.g. $insert_string = "." and $_ = "abc" => "a.b.c",
    # e.g. $insert_string = "." and $_ = "[def]"  =>  "[def]"
    @cand = map { s/($EUC)(?!$)/$1$insert_string/g unless /^\[/; $_; } @cand;
}

# `do'   => ()
# `d'    => (     )
# `nod'  => (Τ Τ Τ Τ Τ Τ)
# `sh'   => (    )
# `don'  => (ɤ ɤ ɤ ɤ ɤ ɤ ɤ)  # special case 1
# `nodd' => (Τ)                                # special case 2
# `doc'  => (ɤ ɤ)                           # special case 3
# `dox'  => (ɤ ɤ ɤ ɤ)                 # special case 4
sub extract_kanas {
    my ($roma) = @_;
    my $kana = romkan($roma);

    $kana =~ /(.*)($CHAR)$/;
    my $head = $1;
    my $last = $2;

    my @cand = ();
    if (isconsonant($last)) {
	if ($head =~ /^(.*)($CHAR)$/ && isconsonant($2)) {
	    my $head2 = $1;
	    my $beforelast = $2;

	    if ($last eq $beforelast) { # special case 2
		push @cand, $head2 . "";
	    } else {
		push @cand, map { $head2 . romkan($_) } 
                     consonant2moras($beforelast . $last);
	    }
	} else {
	    my @moras = consonant2moras($last);
	    push @moras, "xtsu";
	    if ($last eq "c") {      # special case 3
		push @moras, "chi";
	    } elsif ($last eq "x") { # special case 4
		push @moras, "xya", "xyu", "xyo", "xwa";
	    } 
	    push @cand, map { $head . romkan($_) } @moras;
	}
    } elsif ($last eq "") { # speacial case 1
	push @cand, $kana;
	push @cand, map { $head . romkan($_) } consonant2moras("n"), ""; 
    } else {
	push @cand, $kana;
    }
    return sort @cand;
}

# `meshi' => ( ӹ   ݤ ܲ γ  Ӳ)
sub extract_words {
    my ($roma) = @_;
    my @words = ();

    my @lines = lookup($roma);

    # Handle special cases.
    if ($roma =~ /^(.*)(.)$/) {
	my $head = $1;
	my $last = $2;
	push @lines, lookup($head . "j")  if $last eq "z";  
	push @lines, lookup($head . "ch") if $last eq "t";
	if ($last eq "y" && $head =~ /^(.*)t$/) { # ty
	    push @lines, lookup($1 . "ch") ;
	}
    }


    for my $line (@lines) {
	$line =~ s/^([^\t]+)\t//;
	my $head = $1;
	for my $cand (split " ", $line) {
	    # Exclude ASCII entries if its head is ASCII.
	    # e.g. "foo\tbar baz ա" => (ա)
  	    unless ($head =~ /^$ASCII/ && $cand =~ /^$ASCII/) {
  		push @words, $cand;
 	    }
	}
    }

    # Remove duplicates.
    my %seen = ();
    @words = grep { ! $seen{$_}++ } @words;

    return @words;
}

sub lookup {
    my ($roma) = @_;
    my @results = ();

    if (look($dictfh, $roma) == -1) { # Search failed.
	@results = ();
    } else {
	# Gather all entries having the prefix $roma.
	while (defined(my $line = <$dictfh>)) {
	    chomp $line;
	    if ($line =~ /^$roma/) {
		push @results, $line;
	    } else {
		last;
	    }
	}
    }
    return @results;
}

# (   ﳣ   ) => (     )
sub extract_initials {
    my (@words) = @_;

    my %seen = ();
    my @initials = sort grep { ! $seen{$_}++ } 
    map {/^($CHAR)/; $1} @words;

    return @initials;
}

sub hanzen {
    my ($str) =@_;
    $str =~ s/($hanzenpat)/$hanzen{$1}/g;
    return $str;
}

sub init_cache {
    if (!$opt_nocache) {
	require "migemo-cache.pl";
	%cache = migemo_cache::load();
    }
}

sub init_hanzen {
    my %hanzen = 
	(
	 " " => "",
	 "!" => "",
	 '"' => "[ȡ]",
	 "#" => "[]",
	 "\$" => "",
	 "%" => "",
	 "&" => "",
	 "'" => "",
	 "(" => "",
	 ")" => "",
	 "*" => "[ߢ]",
	 "+" => "",
	 "," => "[]",
	 "-" => "",
	 "." => "[]",
	 "/" => "[]",
	 "0" => "",
	 "1" => "",
	 "2" => "",
	 "3" => "",
	 "4" => "",
	 "5" => "",
	 "6" => "",
	 "7" => "",
	 "8" => "",
	 "9" => "",
	 ":" => "",
	 ";" => "",
	 "<" => "[ҡ]",
	 "=" => "",
	 ">" => "[ӡ]",
	 "?" => "",
	 '@' => "",
	 "A" => "",
	 "B" => "",
	 "C" => "",
	 "D" => "",
	 "E" => "",
	 "F" => "",
	 "G" => "",
	 "H" => "",
	 "I" => "",
	 "J" => "",
	 "K" => "",
	 "L" => "",
	 "M" => "",
	 "N" => "",
	 "O" => "",
	 "P" => "",
	 "Q" => "",
	 "R" => "",
	 "S" => "",
	 "T" => "",
	 "U" => "",
	 "V" => "",
	 "W" => "",
	 "X" => "",
	 "Y" => "",
	 "Z" => "",
	 "[" => "[Ρ֡ءڡ]",
	 "\\" => "[]",
	 "]" => "[ϡס١ۡ]",
	 "^" => "",
	 "_" => "",
	 "`" => "",
	 "a" => "",
	 "b" => "",
	 "c" => "",
	 "d" => "",
	 "e" => "",
	 "f" => "",
	 "g" => "",
	 "h" => "",
	 "i" => "",
	 "j" => "",
	 "k" => "",
	 "l" => "",
	 "m" => "",
	 "n" => "",
	 "o" => "",
	 "p" => "",
	 "q" => "",
	 "r" => "",
	 "s" => "",
	 "t" => "",
	 "u" => "",
	 "v" => "",
	 "w" => "",
	 "x" => "",
	 "y" => "",
	 "z" => "",
	 "{" => "",
	 "|" => "",
	 "}" => "",
	 "~" => "",
	 );
    return %hanzen;
}

sub init_hanzenpat {
    my $hanzenpat = join '|', map { quotemeta } keys %hanzen;
    return $hanzenpat;
}
