Note that this program makes direct use of accented characters in the CSX encoding; unless your Web browser uses a font based on this encoding (which is vanishingly unlikely) these characters will not appear correctly. The simplest expedient is to save this file to your own disk and view it there with software that does accept a CSX-based font.
#!/usr/bin/perl
#-*-Perl-*-
#------------------------------------------------------------------#
# Help text, printed when -h is given or the argument count is wrong.
$description =
"Syntax: mconv [options] file
Mconv converts \"file\", which should be an uncompressed Tokunaga
Mahàbhàrata file, to CSX format. As part of this process it inserts
corrections generated by mkmconv.
-h option prints this help.
-s option runs in \"stupid\" mode, i.e. deliberately does not make
any of the corrections suggested by mkmconv.
";
#------------------------------------------------------------------#
# Option parsing.  Getopt::Std is used instead of the Perl 4 library
# getopts.pl, which is no longer shipped with Perl (removed from core in
# 5.16).  getopts() sets the same package variables, $opt_h and $opt_s.
use Getopt::Std;
getopts('hs');
# Exactly one file argument is required; anything else gets the help text.
if ($opt_h || $#ARGV != 0) {
    print $description;
    exit 1;
}
$filename = $ARGV[0];
@problems = ();    # suspicious lines, collected by the main loop for the log
# Timestamp for the generated-file headers.  The command name is spelled in
# lower case so that it is also found on case-sensitive file systems.  The
# newline that ends the command's output is kept: it terminates the date
# line in the header printed below.
$date = `date`;
# From here on everything printed to STDOUT goes through external filters.
if ($opt_s) {
    # "Stupid" mode: only mconv.cleanup, no mconv.spacing stage
    # (presumably the stage that applies the mkmconv corrections -- confirm).
    open(STDOUT, "|mconv.cleanup")
        || die "mconv: cannot start mconv.cleanup: $!\n";
}
else {
    # The following line (commented out) runs the DANGEROUS unbatched option.
    # Check output very carefully if you use it.
    # open(STDOUT, "|perl -sp mconv.spacing -unbatched |mconv.spacing |mconv.cleanup");
    # The standard safer option follows:
    open(STDOUT, "|mconv.spacing |mconv.cleanup")
        || die "mconv: cannot start mconv.spacing |mconv.cleanup: $!\n";
}
# Header lines at the top of the generated output.
print "% This file generated by\n";
print "% mconv $filename\n";
print "% $date\%\n";
# Main conversion loop.  Each input line is read into $_, rewritten in place
# by a long chain of substitutions, and printed.  Later rules depend on the
# output of earlier ones (temporary markers such as E, O, N and ':' are
# introduced and removed again), so the order of the statements matters.
while (<>) {
    # Correct layout and transliteration
    # Lines starting with '%' or with white space (including empty lines)
    # are skipped.
    next if (/^%/ || /^\s/);
    study;
    s/\015//g; # Get rid of DOS CR char
    # chomp rather than chop: a final line without a newline must not lose
    # its last real character.
    chomp;
    s/^0//; # Drop one leading zero
    # A trailing "{...}" annotation (a speaker line, judging by the variable
    # name -- confirm) is cut off, together with the white space before it,
    # and printed on a line of its own after the first 8 characters of what
    # remains.  The text is removed by the match itself rather than being
    # re-interpolated into a second pattern, where its braces and any other
    # metacharacters would be interpreted as regex syntax.
    if (s/\s*(\{.*\})$//) {
        $uvaaca = $1;
        print substr($_, 0, 8) . " $uvaaca\n";
    }
    # Squeeze runs of '.', delete the first '/', cut everything from the next
    # '/' to the end of the line, then turn the remaining '.' into spaces.
    tr/././s;
    s/\///;
    s/\/.*$//;
    s/\./ /g;
    s/ $//;
    # Replace the ASCII digraphs and capital letters of the input
    # transliteration by single accented characters.
    s/aa/à/g;
    s/ii/ã/g;
    s/uu/å/g;
    s/R/ç/g;
    s/çç/é/g;
    s/T/ñ/g;
    s/D/ó/g;
    s/N/õ/g;
    s/z/÷/g;
    s/S/ù/g;
    # Temporary 1-letter versions of ai and au
    s/ai/E/g;
    s/au/O/g;
    # Correct visarga, anusvàra and ï, ¤
    s/([aàiãuåçeoEO])h([ ;kp÷ùs]|$)/$1þ$2/g;
    s/m( |;)([^aàiãuåçeoEO])/ü$1$2/g;
    s/sam([^ ;aàiãuåçeoEO])/saü$1/g;
    s/saürà([jñó])/samrà$1/g;
    s/saüya([¤ïkg])/samya$1/g;
    s/( |;)pum([ ;]?)([^aàiãuåçeoEO])/$1puü$2$3/g;
    s/( |;)puübh/$1pumbh/g;
    s/([aàiãuåçéeoEO])m([kgcjñóõtdm÷ùsh])/$1ü$2/g;
    s/÷aükh/÷aïkh/g;
    s/( |;)saüga /$1saïga /g;
    s/ml( |$)/§l$1/g;
    s/n([cj])/¤$1/g;
    s/([cj])n/$1¤/g;
    s/n([kg])/ï$1/g;
    s/( |;)(a?)pratyan( |;|$)/$1$2pratyaï$3/g;
    s/( |;)(a?)pratyann( |;)/$1$2pratyaïï$3/g;
    s/( |;)(a?)pràn( |;|$)/$1$2pràï$3/g;
    s/( |;)(a?)paràn( |;)(man|mukh)/$1$2paràï$3$4/g;
    # Correct anomalies and common typos that are best dealt with before sandhi
    # The first rule splits a trailing ci[tdncjl]/cana off a word beginning
    # with k; the second rejoins "kठcana", which that rule also splits.
    s/(k[^ ;]+)(ci[tdncjl]|cana)($|[ ;])/$1 $2$3/g;
    s/([ ;])kठcana/$1kà¤cana/g;
    s/([krçé])s/$1ù/g;
    s/([ ;])kçùar/$1kçsar/g;
    s/([ ;])bçù/$1bçs/g;
    s/([ ;])viùvakùen/$1viùvaksen/g;
    s/([ ;])nçùiüh/$1nçsiüh/g;
    s/ùt/ùñ/g;
    # 'N' is free at this point (every input N became õ above) and serves as
    # a temporary marker: the first rule turns certain n into N so that the
    # n -> õ rule after it cannot touch them; N is turned back into n below.
    s/(dur|ni[rù]|par[iyà]|pr[aàoeEO]|antar)([kgïpbmyrvh]*)([aàiãuåeçéoEO]*)([aàiãuåeçéoEOkgïpbmyrvh]*)n/$1$2$3$4N/g;
    s/([rçéù])([aàiãuåeçéoEOkgïpbmyrvh]*)n([aàiãuåçéeoEOnmyv])/$1$2õ$3/g;
    s/õn([^ ;])/õõ$1/g;
    s/õn/nn/g;
    s/N/n/g;
    # Word-specific corrections.
    s/nirvinn/nirviõõ/g;
    s/prayàn([aàeo])/prayàõ$1/g;
    s/õçt/nçt/g;
    s/õart/nart/g;
    s/õabh/nabh/g;
    s/õand/nand/g;
    s/õaùñ/naùñ/g;
    s/pràpõ/pràpn/g;
    s/ghõ/ghn/g;
    s/([iãuåçeoEO])s([uv])($|[ ;])/$1ù$2$3/g;
    s/([eo])s([aàiãuåçeoEO])/$1ù$2/g;
    s/isy/iùy/g;
    s/maõãù/manãù/g;
    s/av([ ;])/àv$1/g;
    s/tisth/tiùñh/g;
    s/nisth/niùñh/g;
    s/nEsth/nEùñh/g;
    # NOTE(review): unlike its neighbours this rule has no /g, so only the
    # first occurrence in a line is changed -- confirm that is intended.
    s/ùaõ([ ;])/ùañ$1/;
    s/õ([ ;])/n$1/g;
    s/rådh/råóh/g;
    s/cest/ceùñ/g;
    s/pattr/patr/g;
    s/([aàiãuåçéeoEO])ç([ùõ])/$1r$2/g;
    s/([^ ;])ayàmàs/$1ayàm às/g;
    s/ar([ ;])([kp÷ùs])/aþ$1$2/g;
    s/(tira|pura|nama)([sþ]) (kç|kar|kur[uv])/$1s$3/g;
    s/preùp/preps/g;
    # Correct vowel sandhi
    # Two vowels separated by a single space are merged or adjusted.
    s/[aà] [aà]/à/g;
    s/[iã] [iã]/ã/g;
    s/[uå] [uå]/å/g;
    s/[aà] [iã]/e/g;
    s/[aà] [uå]/o/g;
    s/[aà] ç/ar/g;
    s/[aà] [eE]/E/g;
    s/[aà] [oO]/O/g;
    s/[iã]( [aàuåçeoEO])/y$1/g;
    s/[uå]( [aàiãçeoEO])/v$1/g;
    s/ç ([aàiãuåeoEO])/r$1/g;
    s/([eo]) a/$1 '/g;
    s/[eo]( [àiãuåçeoEO])/a$1/g;
    s/E( [aàiãuåçeoEO])/à$1/g;
    s/O( [aàiãuåçeoEO])/àv$1/g;
    # Deal with ";"
    # A ';' directly between two vowels becomes the temporary marker ':';
    # every remaining ';' gets a following space.
    s/([aàiãuåçeoEO]);([aàiãuåçeoEO])/$1:$2/g;
    s/;/; /g;
    # Count the vowel characters in the line.  A line with more than 18 of
    # them and no ';' at all is suspicious.  If it contains ':' markers, each
    # is turned into '; ' and the line is logged only if that produced more
    # than one ';'.  If it has no ':' either, the line is logged as it is.
    $count = (tr/aàiãuåçéëeoEO//);
    if ($count > 18) {
        if (!/; /) {
            if (/:/) {
                s/:/; /g;
                $count = (tr/;//);
                if ($count > 1) {
                    push(@problems, $_);
                }
            }
            else {
                push(@problems, $_);
            }
        }
    }
    # NOTE(review): no /g here, so only the first remaining ':' becomes a
    # space -- confirm that a line cannot hold more than one at this point.
    s/:/ /;
    # Correct consonant sandhi
    # "(;? )" in these patterns is the word break: an optional ';' followed
    # by a space.  It is captured and put back unchanged.
    s/s(;? |$)/þ$1/g;
    s/ (và|tva|ru|sru|mu)c(;? |$)/ $1k$2/g;
    s/ (ru|çtvi|vaõi|bhiùa|sra|år)j(;? |$)/ $1k$2/g;
    s/ (di|dç|spç)÷(;? |$)/ $1k$2/g;
    s/ (samrà|parivrà)j(;? |$)/ $1ñ$2/g;
    s/ vi÷(;? |$)/ viñ$1/g;
    s/[cj](;? )/t$1/g;
    s/([kgñótdpb])\1(;? )/$1$2/g;
    s/k(;? )([gjódbyrlvhaàiãuåçeoEO])/g$1$2/g;
    s/g(;? )([kcñtp÷ùs])/k$1$2/g;
    s/[kg](;? )([nm])/ï$1$2/g;
    s/ñ(;? )([gjódbyrlvhaàiãuåçeoEO])/ó$1$2/g;
    s/ó(;? )([kcñtp÷ùs])/ñ$1$2/g;
    s/[ñó](;? )([nm])/õ$1$2/g;
    s/t(;? )([gjódbyrlvhaàiãuåçeoEO])/d$1$2/g;
    s/(d|dh)(;? )([kcñtp÷ùs])/t$2$3/g;
    s/([td]|dh)(;? )([nm])/n$2$3/g;
    s/p(;? )([gjódbyrlvhaàiãuåçeoEO])/b$1$2/g;
    s/(b|bh)(;? )([kcñtp÷ùs])/p$2$3/g;
    s/([pb]|bh)(;? )([nm])/m$2$3/g;
    s/([gódb])(;? )h/$1$2$1h/g;
    s/n(;? )c/ü÷$1c/g;
    s/n(;? )ñ/üù$1ñ/g;
    s/n(;? )t/üs$1t/g;
    s/n(;? )([j÷])/¤$1$2/g;
    s/n(;? )ó/õ$1ó/g;
    s/n(;? )l/§l$1l/g;
    s/t(;? )c/c$1c/g;
    s/t(;? )÷/c$1ch/g;
    s/t(;? )ñ/ñ$1ñ/g;
    s/d(;? )j/j$1j/g;
    s/d(;? )ó/ó$1ó/g;
    s/d(;? )l/l$1l/g;
    s/þ(;? )c/÷$1c/g;
    s/þ(;? )ñ/ù$1ñ/g;
    s/þ(;? )t/s$1t/g;
    s/àþ(;? )([^kp÷ùs])/à$1$2/g;
    s/aþ(;? )([àiãuåçeoEO])/a$1$2/g;
    s/aþ(;? )([gjódnbmyrlvh])/o$1$2/g;
    s/aþ(;? )a/o$1'/g;
    s/þ(;? )([aàiãuåçeoEOgjódnbmyrlvh])/r$1$2/g;
    s/ar(;? )r/à$1r/g;
    s/ir(;? )r/ã$1r/g;
    s/ur(;? )r/å$1r/g;
    s/r(;? )r/$1r/g;
    # Emit the converted line; the newline removed above is added back.
    print;
    print "\n";
}
# If the main loop flagged any lines, write them to "<input file>.log" and
# tell the user on STDERR.  The notice is printed only once the log file has
# actually been opened, so it can no longer announce a file that could not
# be created.
if (@problems) {
    my $logname = "$filename.log";
    open(my $log, '>', $logname)
        || die "mconv: cannot write $logname: $!\n";
    print STDERR "\nThe file $logname contains important information.\n\n";
    print $log "This file generated by\n";
    print $log "mconv $filename\n";
    print $log "$date\n";
    print $log "Mconv regards the following line(s) as problematical:\n";
    print $log "they contain either too many semicolons or too few.\n";
    print $log "(Note that the lines are quoted from an early stage\n";
    print $log "of mconv's processing, so there may be minor textual\n";
    print $log "divergences from the final output.)\n\n";
    foreach my $problem (@problems) {
        # The lines were saved while ai/au were still held as the temporary
        # single letters E/O; expand them again for the reader.
        (my $line = $problem) =~ s/E/ai/g;
        $line =~ s/O/au/g;
        print $log "$line\n";
    }
    close($log) || warn "mconv: error closing $logname: $!\n";
    # The queue ends up empty, as it did when the lines were shifted off it.
    @problems = ();
}
John Smith can be contacted as jds10@cam.ac.uk