#!/usr/bin/perl

# convert internal Yiddish into transcription
# i2y.pl [-full]

# takes input from STDIN, directs output to STDOUT.  
# if -full is specified, then remaining nikud is replaced with Roman vowels, and
# remaining Hebrew consonants are replaced with Roman consonants.
# Author: Raphael Finkel, 1998, 2011, 2015
# See the COPYRIGHT file enclosed with this distribution.

# read in the Hebrew respelling list.

use Encode;

# constants
$Home = "/homes/raphael/cs/links/y";

# variables
my %Respell; # $Respell[internal form] = romanized form

# open (HEBLIST, "$Home/y2i.pl < $Home/respell.data|");
open (HEBLIST, "$Home/respell.data");
$toss = <HEBLIST>;
while (defined ($line = <HEBLIST>)) { # one line
	chomp $line;
	$line =~ s/\|//g; 
	$line =~ s/\s*$//;
	# print STDERR "line: $line\n";
	$line =~ /(\S+)\s+(.+)/;
	# print STDERR "when I see [$2] I understand it to mean [$1]\n";
	$Respell{$2} = $1;
} # one line of Hebrew respelling

# build the reverse-spell list: biggest first, short entries removed.
foreach $code (keys %Respell) {
	if (length($code) < 3) {next;}
	push @spellList, $code;
}
@spellList = sort {length($b) <=> length($a)} @spellList;

sub reverse {  # used only for English inclusions marked \english{whatever}
	local($orig) = @_;
	if ($orig =~ /^(.)$/) {
		return $orig;
	} elsif ($orig =~ /^(.)(.)$/) {
		return "$2$1";
	} else {
		return ($orig =~ s/(.)(.*)(.)/"$3&reverse($2)$1"/e);
	}
}

$SIG{ALRM} = sub {exit(1);};
alarm(10);
$| = 1; # autoflush output
binmode STDIN, ":raw";
binmode STDOUT, ":utf8";
while (defined ($line = <STDIN>)) { # one line
	# print "Working on $line\n";
	$line = decode('utf8', $line); # errors become �.
	$line =~ s/\r//; # remove any \r characters; \n is enough.
	if ($line =~ /\\relax \{\\roman (.*)\}/) { # English insert
		$rev = reverse($1);
		print "\\english{$rev}$'";
		next;
	}
	if ($line =~ /\\spell\s+(\S+)\s+(\S+)/) { # explicit respelling
		$Respell{$2} = $1;
		push @spellList, $2; # would be better to put at front, but expensive
	}
	if ($line =~ /\\/) { # Tex
		print $line;
		next;
	}
	$fullLine = $line;
	$line = "";
	while (1) { # look at one word at a time
		$fullLine =~ s/(\s*)([-\w#*]*)([^-\w#*]*)(.*)/$4/;
		$initialSpace = $1;
		$word = $2;
		$separator = $3;
		print $initialSpace;
		if ($word eq "" && $separator =~  /^\s*$/) {last;}
		if ($word eq "") {print $separator; next;}
		$word =~ s/([MNFCX])$/lc $1/eg;  # convert final forms to regular
		if (defined($Respell{$word})) {
			# print "was $word, is now $Respell{$word}\n";
			$word = $Respell{$word};
			# print STDERR "\trespelled to $word\n";
		} elsif ($word =~ /[QBHWTK]/) { # need to try respelling this word 
			# print "[$word->] ";
			foreach $code (@spellList) {
				if ($word =~ /$code/) {
					$word =~ s/(.*)$code(.*)/$1|$Respell{$code}|$2/;
					$word =~ s/^\|//;
					$word =~ s/\|$//;
					# print "[->$word] ";
					last;
				}
			}
		} # need to try respelling word
		$line .= $word . $separator;
		# print "Line is now $line\n";
	} # look at one word at a time
	$line .= $separator;
	$line =~ s/S/sh/g;
	$line =~ s/I/ay/g;	
	$line =~ s/A/ey/g;	
	$line =~ s/O/oy/g;	
	$line =~ s/(.)((h|k)eyt)([ns]?)\b/$1$3ayt$4/g; # most heyt => hayt
	$line =~ s/ts(\b|[^h])/t|s$1/s; # prophylactic | between t and s
	$line =~ s/wy/oy/g;	
	$line =~ s/(^|[^a-zA-Z])\#([woey]|ay|ey)/$1$2/g; # remove shtumer alef	
	$line =~ s/\b(far|wr)#/$1/g; # remove known shtumer alef	
	$line =~ s/([FMNCX])($|\s|\b)/"\l$1$2"/ge;
	$line =~ s/j/i/g;	
	$line =~ s/c/ts/g;
	$line =~ s/w/u/g;
	$line =~ s/x/kh/g;
	$line =~ s/zsh/zh/g;
	$line =~ s/([^aeiou]|\b)y((?=[^aeiou]|\b))/$1i$2/g; # most y should be i
	$line =~ s/(.)((h|k)eyt)([ns]?)\b/$1$3ayt$4/g; # most heyt => hayt
	$line =~ s//^a/g; # pasekh
	$line =~ s//^o/g; # komets
	$line =~ s//^e/g; # segol
	$line =~ s//^i/g; # khirik
	$line =~ s//^U/g; # kubuts
	$line =~ s//^:/g; # shva
	$line =~ s//^A/g; # tseyre
	$line =~ s//^O/g; # kholem
	$line =~ s//^./g; 
	$line =~ s//^;/g; 
	$line =~ s//^,/g; 
	$line =~ s//^=/g; 
	$line =~ s//---/g; # quotation dash
	$line =~ s//^^a/g; # khataf patakh
	$line =~ s//^^o/g; # HEBREW POINT HATAF QAMATS
	$line =~ s//^^e/g; # HEBREW POINT HATAF SEGOL
	if (defined($ARGV[0]) and $ARGV[0] eq '-full') {
		$line =~ s/[aeiou]\^/\^/g; # vowel already there
		$line =~ s/\^\^a/a/g;
		$line =~ s/\^\^o/o/g;
		$line =~ s/\^\^e/e/g;
		$line =~ s/\^U/u/g;
		$line =~ s/\^://g;
		$line =~ s/\^A/ey/g;
		$line =~ s/\^O/o/g;
		$line =~ s/\^ii/i/g;
		$line =~ s/\^//g;
		$line =~ s/H/kh/g;
		$line =~ s/T/s/g;
		$line =~ s/X/kh/g;
		$line =~ s/K/k/g;
		$line =~ s/Q/k/g;
		$line =~ s/W/t/g;
		$line =~ s///g; # extraneous rofe
		$line =~ s/ #/ a/g; # avimelekh
		$line =~ s/#//g;
		$line =~ s/(?<=\w)B/v/g;
		$line =~ s/(?<=\w)D/d/g;
		$line =~ s/(?<=\w)R/r/g;
		$line =~ s/\|//g;
		$line =~ s/_//g;
	}
	if ($line eq "\n") {
		print "\r\n"; # so Mac will see a paragraph break.
	} else {
		print $line;
	}
} # one line;
print "\n";
