#
# Romkan.pm - a Romaji <-> Kana conversion module.
#
# Copyright (C) 2000 Satoru Takabayashi <satoru-t@is.aist-nara.ac.jp>
#     All rights reserved.
#     This is free software with ABSOLUTELY NO WARRANTY.
#
# You can redistribute it and/or modify it under the terms of 
# the GNU General Public License version 2.
#

package Lingua::Romkan;
require 5.005;
require Exporter;

$VERSION = "0.16";
@ISA = qw(Exporter);
@EXPORT = qw(isconsonant isvowel consonant2moras get_hepburntab get_kunreitab
             romkan kanrom romrom hirakata katahira);

# Global variables.
my $consonants  = "ckgszjtdhfpbmyrwxn";
my $vowels      = "aeiou";

my $conpat      = "[$consonants]";
my $vowpat      = "[$vowels]";
my $kunreitab   = get_kunreitab();
my $hepburntab  = get_hepburntab();

my $rompat;
my $kanpat;
my $kunpat;
my $heppat;

my %romrom;
my %romkan;
my %kanrom;

my $CHAR  = '(?:[\x00-\x7f]|(?:\x8f[\xa1-\xfe]|[\x8e\xa1-\xfe])[\xa1-\xfe])';

init_romrom();
init_romkan();
init_kanrom();

sub isconsonant {
    my ($char) = @_;

    if ($char =~ /^$conpat$/) {
	1;
    } else {
	0;
    }
}

sub isvowel {
    my ($char) = @_;

    if ($char =~ /^$vowpat$/) {
	1;
    } else {
	0;
    }
}

# `z' => (za ze zi zo zu)
sub consonant2moras {
    my ($consonant) = @_;

    my @results = ();
    for my $roma (keys %romkan) {
	if ($roma =~ /^$consonant.$/) {
	    push @results, $roma;
	}
    }
    return @results;
}

# FIXME: ad hod solution
# tanni   => tan'i
# kannji  => kanji
# hannnou => han'nou
# hannnya => han'nya
sub normalize_double_n {
    my ($str) = @_;

    $str =~ s/nn/n'/g;
    $str =~ s/n\'(?=[^aiueoyn]|$)/n/g; #'

    return $str;
}

# Romaji -> Romaji
# Normalize into Hepburn sequences.
# e.g. kannji -> kanzi, tiezo -> chiezo
sub romrom {
    my ($str) = @_;
    $str = normalize_double_n($str);
    $str =~ s/\G($heppat*?)($kunpat)/$1$romrom{$2}/go;
    return $str;
}

# Romaji -> Kana
# It can handle both Hepburn and Kunrei sequences.
sub romkan {
    my ($str) = @_;
    $str = normalize_double_n($str);
    $str =~ s/\G($CHAR*?)($rompat)/$1$romkan{$2}/go;
    return $str;
}

# Kana -> Romaji.  
# Return Hepburn sequences.
sub kanrom {
    my ($str) = @_;
    $str =~ s/\G($CHAR*?)($kanpat)/$1$kanrom{$2}/go;
    $str =~ s/n'(?=[^aeiuoyn]|$)/n/g; #'
    return $str;
}

# Hiragana -> Katakana
sub hirakata {
    my ($str) = @_;
    $str =~ s/\xa4(.)/\xa5$1/g;
    return $str;
}

# Katakana -> Hiragana
sub katahira {
    my ($str) = @_;
    $str =~ s/\xa5(.)/\xa4$1/g;
    return $str;
}

sub init_romrom {
    my $i = 0;
    # Extract odd entries.
    my @kunrei  = grep { $i++ % 2 } split /\s+/, $kunreitab;
    $i = 0;
    my @hepburn = grep { $i++ % 2 } split /\s+/, $hepburntab;

    # FIXME: ad hoc solution. special case.
    push @kunrei,  "sy";
    push @hepburn, "sh";

    # Sort in long order so that a longer Romaji sequence precedes.
    $kunpat = join '|', sort { length($b) <=> length($a) } @kunrei;
    $heppat = join '|', sort { length($b) <=> length($a) } @hepburn;

    # Both arrays must be same size.
    die "$#kunrei and $#hepburn differ!\n" if @kunrei != @hepburn;
    
    while (@kunrei) {
	my $kunrei  = shift @kunrei;
	my $hepburn = shift @hepburn;
	# Both entries must be parallel.
#	print "!! $kunrei $hepburn\n" if $kunrei ne $hepburn;
	$romrom{$kunrei} = $hepburn;
    }

}

sub init_kanrom {
    %kanrom = split /\s+/, $hepburntab;

    $kanpat = join '|', sort { length($b) <=> length($a) || 
			       length($kanrom{$a}) <=> length($kanrom{$b}) }
                        keys %kanrom;
}

sub init_romkan {
    my @kanrom = split /\s+/, $hepburntab . $kunreitab;

    while (@kanrom) {
	my $kan = shift @kanrom;
	my $rom = shift @kanrom;
	$romkan{$rom} = $kan;
    }

    # Sort in long order so that a longer Romaji sequence precedes.
    $rompat = join '|', sort {length($b) <=> length($a)} keys %romkan;
}

# This table is imported from KAKASI <http://kakasi.namazu.org/> and modified.
sub get_kunreitab {
    my $kunreitab = <<"EOT";
	xa		a		xi		i		xu
	u		vu		va		vi 		ve
	vo		xe		e		xo		o 

	ka		ga		ki		kya		kyu 
	kyo		gi		gya		gyu		gyo 
	ku		gu		ke		ge		ko
	go 

	sa		za		si		sya		syu 
	syo		zi		zya		zyu		zyo 
	su		zu		se		ze		so
	zo 

	ta		da		ti		tya		tyu 
	tyo		di	¤	dya	¤	dyu	¤	dyo 

	xtu 
ä	vvu	ä	vva	ä	vvi 
ä	vve	ä	vvo 
ä	kka	ä	gga	ä	kki	ä	kkya 
ä	kkyu	ä	kkyo	ä	ggi	ä	ggya 
ä	ggyu	ä	ggyo	ä	kku	ä	ggu 
ä	kke	ä	gge	ä	kko	ä	ggo	ä	ssa 
ä	zza	ä	ssi	ä	ssya 
ä	ssyu	ä	ssho 
ä	zzi	ä	zzya	ä	zzyu	ä	zzyo 
ä	ssu	ä	zzu	ä	sse	ä	zze	ä	sso 
ä	zzo	ä	tta	ä	dda	ä	tti 
ä	ttya	ä	ttyu	ä	ttyo	ä	ddi 
ä¤	ddya	ä¤	ddyu	ä¤	ddyo	ä	ttu 
ä	ddu	ä	tte	ä	dde	ä	tto	ä	ddo 
ä	hha	ä	bba	ä	ppa	ä	hhi 
äҤ	hhya	äҤ	hhyu	äҤ	hhyo	ä	bbi 
äӤ	bbya	äӤ	bbyu	äӤ	bbyo	ä	ppi 
äԤ	ppya	äԤ	ppyu	äԤ	ppyo	ä	hhu 
äդ	ffa	äդ	ffi	äդ	ffe	äդ	ffo 
ä	bbu	ä	ppu	ä	hhe	ä	bbe	ä    ppe
ä	hho	ä	bbo	ä	ppo	ä	yya	ä	yyu 
ä	yyo	ä	rra	ä	rri	ä	rrya 
ä	rryu	ä	rryo	ä	rru	ä	rre 
ä	rro 

	tu		du		te		de		to
	do 

	na		ni	ˤ	nya	ˤ	nyu	ˤ	nyo 
	nu		ne		no 

	ha		ba		pa		hi	Ҥ	hya 
Ҥ	hyu	Ҥ	hyo		bi	Ӥ	bya	Ӥ	byu 
Ӥ	byo		pi	Ԥ	pya	Ԥ	pyu	Ԥ	pyo 
	hu	դ	fa	դ	fi	դ	fe	դ	fo 
	bu		pu		he		be		pe
	ho		bo		po 

	ma		mi	ߤ	mya	ߤ	myu	ߤ	myo 
	mu		me		mo 

	xya		ya		xyu		yu		xyo
	yo

	ra		ri		rya		ryu		ryo 
	ru		re		ro 

	xwa		wa		wi		we
	wo		n 

     n'
Ǥ   dyi
     -
    tye
ä	ttye
EOT
#'
}

# This table is imported from KAKASI <http://kakasi.namazu.org/> and modified.
sub get_hepburntab {
    my $hepburntab = <<"EOT";
	xa		a		xi		i		xu
	u		vu		va		vi		ve
	vo		xe		e		xo		o
	

	ka		ga		ki		kya		kyu
	kyo		gi		gya		gyu		gyo
	ku		gu		ke		ge		ko
	go	

	sa		za		shi		sha		shu
	sho		ji		ja		ju		jo
	su		zu		se		ze		so
	zo	

	ta		da		chi		cha		chu
	cho		di	¤	dya	¤	dyu	¤	dyo

	xtsu	
ä	vvu	ä	vva	ä	vvi	
ä	vve	ä	vvo	
ä	kka	ä	gga	ä	kki	ä	kkya	
ä	kkyu	ä	kkyo	ä	ggi	ä	ggya	
ä	ggyu	ä	ggyo	ä	kku	ä	ggu	
ä	kke	ä	gge	ä	kko	ä	ggo	ä	ssa
ä	zza	ä	sshi	ä	ssha	
ä	sshu	ä	ssho	
ä	jji	ä	jja	ä	jju	ä	jjo	
ä	ssu	ä	zzu	ä	sse	ä	zze	ä	sso
ä	zzo	ä	tta	ä	dda	ä	cchi	
ä	ccha	ä	cchu	ä	ccho	ä	ddi	
ä¤	ddya	ä¤	ddyu	ä¤	ddyo	ä	ttsu	
ä	ddu	ä	tte	ä	dde	ä	tto	ä	ddo
ä	hha	ä	bba	ä	ppa	ä	hhi	
äҤ	hhya	äҤ	hhyu	äҤ	hhyo	ä	bbi	
äӤ	bbya	äӤ	bbyu	äӤ	bbyo	ä	ppi	
äԤ	ppya	äԤ	ppyu	äԤ	ppyo	ä	ffu	
äդ	ffa	äդ	ffi	äդ	ffe	äդ	ffo	
ä	bbu	ä	ppu	ä	hhe	ä	bbe	ä	ppe
ä	hho	ä	bbo	ä	ppo	ä	yya	ä	yyu
ä	yyo	ä	rra	ä	rri	ä	rrya	
ä	rryu	ä	rryo	ä	rru	ä	rre	
ä	rro	

	tsu		du		te		de		to
	do	

	na		ni	ˤ	nya	ˤ	nyu	ˤ	nyo
	nu		ne		no	

	ha		ba		pa		hi	Ҥ	hya
Ҥ	hyu	Ҥ	hyo		bi	Ӥ	bya	Ӥ	byu
Ӥ	byo		pi	Ԥ	pya	Ԥ	pyu	Ԥ	pyo
	fu	դ	fa	դ	fi	դ	fe	դ	fo
	bu		pu		he		be		pe
	ho		bo		po	

	ma		mi	ߤ	mya	ߤ	myu	ߤ	myo
	mu		me		mo

	xya		ya		xyu		yu		xyo
	yo	

	ra		ri		rya		ryu		ryo
	ru		re		ro	

	xwa		wa		wi		we
	wo		n	

     n'
Ǥ   dyi
     -
    che
ä	cche
EOT
#'
}

1;

