The mconv program


Mconv does the basic task of converting a Tokunaga Mahabharata file. It standardises the layout, converts Tokunaga's transliteration scheme to CSX, resolves as many as possible of the ambiguities introduced by his "short cuts" (such as "m" standing for both the labial nasal and anusvara), corrects common errors, normalises vowel sandhi, attempts to rationalise the use of the semicolon, and imposes correct consonant sandhi. Finally, the text is optionally passed through mconv.spacing, which seeks to mend as many of the broken compounds as possible. (Even more finally, it is passed through mconv.cleanup, which does what its name suggests.)

Note that this program makes direct use of accented characters in the CSX encoding; unless your Web browser uses a font based on this encoding (which is vanishingly unlikely) these characters will not appear correctly. The simplest expedient is to save this file to your own disk and view it there with software that does accept a CSX-based font.



#!/usr/bin/perl
#-*-Perl-*-

#------------------------------------------------------------------#
# Usage text, printed when -h is given or the command line is malformed.
# It is kept in plain ASCII so that it displays correctly whatever font or
# encoding the terminal uses.  The here-document is single-quoted: nothing
# in it is interpolated, and the double quotes need no escaping.
$description = <<'END_USAGE';
Syntax: mconv [options] file

Mconv converts "file", which should be an uncompressed Tokunaga
Mahabharata file, to CSX format. As part of this process it inserts
corrections generated by mkmconv.

-h option prints this help.
-s option runs in "stupid" mode, i.e. deliberately does not make
   any of the corrections suggested by mkmconv.
END_USAGE
#------------------------------------------------------------------#

# Parse the command line.  Getopt::Std (a core module) replaces the Perl 4
# library getopts.pl, which is no longer distributed with Perl.  getopts()
# sets the package variables $opt_h and $opt_s just as Getopts() did.
# The option spec is 'hs': two flags, neither taking an argument.
use Getopt::Std;

# Print the usage text and stop when help is requested, when an unknown
# option was given (getopts() returns false), or when there is not exactly
# one file argument left.
if (!getopts('hs') || $opt_h || @ARGV != 1) {
    print $description;
    exit 1;
}

# Name of the input file; also used for the header and for "<file>.log".
$filename = $ARGV[0];

# Lines whose semicolon placement looks wrong; filled by the main loop.
@problems = ();

# Timestamp for the output header and the log file.  Perl's own localtime
# is used instead of an external command, so this works on any platform.
# The trailing newline is deliberate: the header line printed below relies
# on it, as the output of a date command would have supplied one.
$date = scalar(localtime) . "\n";

# Everything printed to STDOUT from here on is fed to the post-processing
# programs.  Stop at once if the pipe cannot be started, rather than
# silently losing the output.
if ($opt_s) {
    # "Stupid" mode: skip mconv.spacing and only run the cleanup pass.
    open(STDOUT, "|mconv.cleanup")
        || die "mconv: cannot start mconv.cleanup: $!\n";
}
else {

#   The following line (commented out) runs the DANGEROUS unbatched option.
#   Check output very carefully if you use it.
#   open(STDOUT, "|perl -sp mconv.spacing -unbatched |mconv.spacing |mconv.cleanup");

#   The standard safer option follows:
    open(STDOUT, "|mconv.spacing |mconv.cleanup")
        || die "mconv: cannot start mconv.spacing |mconv.cleanup: $!\n";
}

# Header: "%" comment lines recording how and when the file was generated.
print "% This file generated by\n";
print "% mconv $filename\n";
print "% $date\%\n";
# Main conversion loop.  Reads the input one line at a time through <>,
# rewrites $_ in place with a long sequence of substitutions, and prints
# the result followed by a newline.  The order of the statements matters
# throughout: later rules rely on the effects of earlier ones (for example
# the temporary letters E, O and N introduced below).  Lines whose ";"
# placement looks suspect are pushed onto @problems.
#
# NOTE(review): many patterns and replacements below are empty or look
# truncated (e.g. s/aa//g, s///g, s/t//g).  The accompanying description
# says the program uses accented CSX characters directly, so these
# characters have presumably been lost from this copy.  As written, such
# rules delete text or are no-ops instead of converting it -- TODO restore
# the rules from an intact copy before relying on the output.
while (<>) {

    # Correct layout and transliteration

    # Skip lines that begin with "%" or with whitespace.
    next if (/^%/|/^\s/);
    study;
    s/\015//g;					# Get rid of DOS CR char
    # Remove the last character of the line (chop does so unconditionally;
    # presumably this is the newline -- a final line without one would lose
    # a real character).
    chop;
    s/^0//;					# Get rid of one leading zero
    # If the line ends in a "{...}" group (with any whitespace before it),
    # cut that group off and print it straight away on its own line, after
    # the first 8 characters of what is left of the line.
    # NOTE(review): $1 is interpolated into s/$1// as a pattern, not as
    # literal text, so regex metacharacters in the group are interpreted.
    if (/(\s*({.*})$)/) {
	$uvaaca = $2;
	s/$1//;
	print substr($_, 0, 8) . "  $uvaaca\n";
    }
    # Squeeze runs of "." into one, delete the first "/", delete everything
    # from the next "/" to the end of the line, turn every "." into a space
    # and remove one trailing space.
    tr/././s;
    s/\///;
    s/\/.*$//;
    s/\./ /g;
    s/ $//;
    # Transliteration rules.  NOTE(review): every replacement here is empty
    # in this copy, so as written these rules delete their targets.  The
    # rule s///g has an empty pattern as well; in Perl an empty pattern
    # reuses the last successfully matched pattern.  See the note at the top
    # of the loop.
    s/aa//g;
    s/ii//g;
    s/uu//g;
    s/R//g;
    s///g;
    s/T//g;
    s/D//g;
    s/N//g;
    s/z//g;
    s/S//g;

    # Temporary 1-letter versions of ai and au
    # (E and O are not converted back inside this loop; the vowel classes
    # [aiueoEO] used below depend on them.)

    s/ai/E/g;
    s/au/O/g;

    # Correct visarga, anusvara and nasals
    # (the end of the original heading is garbled in this copy)

    s/([aiueoEO])h([ ;kps]|$)/$1$2/g;
    s/m( |;)([^aiueoEO])/$1$2/g;
    s/sam([^ ;aiueoEO])/sa$1/g;
    s/sar([j])/samr$1/g;
    s/saya([kg])/samya$1/g;
    s/( |;)pum([ ;]?)([^aiueoEO])/$1pu$2$3/g;
    s/( |;)pubh/$1pumbh/g;
    s/([aiueoEO])m([kgcjtdmsh])/$1$2/g;
    s/akh/akh/g;
    s/( |;)saga /$1saga /g;
    s/ml( |$)/l$1/g;
    s/n([cj])/$1/g;
    s/([cj])n/$1/g;
    s/n([kg])/$1/g;
    s/( |;)(a?)pratyan( |;|$)/$1$2pratya$3/g;
    s/( |;)(a?)pratyann( |;)/$1$2pratya$3/g;
    s/( |;)(a?)prn( |;|$)/$1$2pr$3/g;
    s/( |;)(a?)parn( |;)(man|mukh)/$1$2par$3$4/g;

    # Correct anomalies and common typos that are best dealt with before sandhi
    # In these rules " " and ";" act as word separators, which is why so
    # many patterns are anchored on [ ;] or on the end of the line.

    s/(k[^ ;]+)(ci[tdncjl]|cana)($|[ ;])/$1 $2$3/g;
    s/([ ;])k cana/$1kcana/g;
    s/([kr])s/$1/g;
    s/([ ;])kar/$1ksar/g;
    s/([ ;])b/$1bs/g;
    s/([ ;])vivaken/$1vivaksen/g;
    s/([ ;])nih/$1nsih/g;
    s/t//g;
    # The next rule rewrites an "n" that follows one of the listed prefixes
    # as a temporary capital N, which the rules on plain "n" that follow do
    # not match; s/N/n/g further down turns it back into "n".
    s/(dur|ni[r]|par[iy]|pr[aoeEO]|antar)([kgpbmyrvh]*)([aiueoEO]*)([aiueoEOkgpbmyrvh]*)n/$1$2$3$4N/g;
    s/([r])([aiueoEOkgpbmyrvh]*)n([aiueoEOnmyv])/$1$2$3/g;
    s/n([^ ;])/$1/g;
    s/n/nn/g;
    s/N/n/g;
    s/nirvinn/nirvi/g;
    s/prayn([aeo])/pray$1/g;
    s/t/nt/g;
    s/art/nart/g;
    s/abh/nabh/g;
    s/and/nand/g;
    s/a/na/g;
    s/prp/prpn/g;
    s/gh/ghn/g;
    s/([iueoEO])s([uv])($|[ ;])/$1$2$3/g;
    s/([eo])s([aiueoEO])/$1$2/g;
    s/isy/iy/g;
    s/ma/man/g;
    s/av([ ;])/v$1/g;
    s/tisth/tih/g;
    s/nisth/nih/g;
    s/nEsth/nEh/g;
    # NOTE(review): unlike its neighbours, the next rule has no /g flag and
    # replaces its match with itself as written here -- confirm against an
    # intact copy.
    s/a([ ;])/a$1/;
    s/([ ;])/n$1/g;
    s/rdh/rh/g;
    s/cest/ce/g;
    s/pattr/patr/g;
    s/([aiueoEO])([])/$1r$2/g;
    s/([^ ;])ayms/$1aym s/g;
    s/ar([ ;])([kps])/a$1$2/g;
    s/(tira|pura|nama)([s]) (k|kar|kur[uv])/$1s$3/g;
    s/prep/preps/g;

    # Correct vowel sandhi
    # These rules act on a word-final vowel, a single space and a
    # word-initial vowel.

    s/[a] [a]//g;
    s/[i] [i]//g;
    s/[u] [u]//g;
    s/[a] [i]/e/g;
    s/[a] [u]/o/g;
    s/[a] /ar/g;
    s/[a] [eE]/E/g;
    s/[a] [oO]/O/g;
    s/[i]( [aueoEO])/y$1/g;
    s/[u]( [aieoEO])/v$1/g;
    s/ ([aiueoEO])/r$1/g;
    s/([eo]) a/$1 '/g;
    s/[eo]( [iueoEO])/a$1/g;
    s/E( [aiueoEO])/$1/g;
    s/O( [aiueoEO])/v$1/g;

    # Deal with ";"
    # A ";" directly between two vowels is first marked as ":"; every
    # remaining ";" gets a space after it.  Then, for lines with more than
    # 18 vowel letters that contain no "; ":
    #   - if there are ":" marks, all of them become "; ", and the line is
    #     recorded in @problems when that yields more than one ";";
    #   - otherwise the line is recorded in @problems as it stands.
    # Finally the first ":" still present (if any) becomes a space.
    # NOTE(review): that last substitution has no /g, so only one ":" is
    # replaced -- confirm that this is intended.

    s/([aiueoEO]);([aiueoEO])/$1:$2/g;
    s/;/; /g;
    $count = (tr/aiueoEO//);
    if ($count > 18) {
	if (!/; /) {
	    if (/:/) {
		s/:/; /g;
		$count = (tr/;//);
		if ($count > 1) {
		    push(@problems, $_);
		}
	    }
	    else {
		push(@problems, $_);
	    }
	}
    }
    s/:/ /;

    # Correct consonant sandhi
    # In these rules the group (;? ) -- an optional ";" followed by a
    # space -- is the boundary between two words, and (;? |$) also accepts
    # the end of the line.  The boundary is copied unchanged into the
    # replacement; only the sounds on either side of it are altered.

    s/s(;? |$)/$1/g;
    s/ (v|tva|ru|sru|mu)c(;? |$)/ $1k$2/g;
    s/ (ru|tvi|vai|bhia|sra|r)j(;? |$)/ $1k$2/g;
    s/ (di|d|sp)(;? |$)/ $1k$2/g;
    s/ (samr|parivr)j(;? |$)/ $1$2/g;
    s/ vi(;? |$)/ vi$1/g;
    s/[cj](;? )/t$1/g;
    s/([kgtdpb])\1(;? )/$1$2/g;
    s/k(;? )([gjdbyrlvhaiueoEO])/g$1$2/g;
    s/g(;? )([kctps])/k$1$2/g;
    s/[kg](;? )([nm])/$1$2/g;
    s/(;? )([gjdbyrlvhaiueoEO])/$1$2/g;
    s/(;? )([kctps])/$1$2/g;
    s/[](;? )([nm])/$1$2/g;
    s/t(;? )([gjdbyrlvhaiueoEO])/d$1$2/g;
    s/(d|dh)(;? )([kctps])/t$2$3/g;
    s/([td]|dh)(;? )([nm])/n$2$3/g;
    s/p(;? )([gjdbyrlvhaiueoEO])/b$1$2/g;
    s/(b|bh)(;? )([kctps])/p$2$3/g;
    s/([pb]|bh)(;? )([nm])/m$2$3/g;
    s/([gdb])(;? )h/$1$2$1h/g;
    s/n(;? )c/$1c/g;
    s/n(;? )/$1/g;
    s/n(;? )t/s$1t/g;
    s/n(;? )([j])/$1$2/g;
    s/n(;? )/$1/g;
    s/n(;? )l/l$1l/g;
    s/t(;? )c/c$1c/g;
    s/t(;? )/c$1ch/g;
    s/t(;? )/$1/g;
    s/d(;? )j/j$1j/g;
    s/d(;? )/$1/g;
    s/d(;? )l/l$1l/g;
    s/(;? )c/$1c/g;
    s/(;? )/$1/g;
    s/(;? )t/s$1t/g;
    s/(;? )([^kps])/$1$2/g;
    s/a(;? )([iueoEO])/a$1$2/g;
    s/a(;? )([gjdnbmyrlvh])/o$1$2/g;
    s/a(;? )a/o$1'/g;
    s/(;? )([aiueoEOgjdnbmyrlvh])/r$1$2/g;
    s/ar(;? )r/$1r/g;
    s/ir(;? )r/$1r/g;
    s/ur(;? )r/$1r/g;
    s/r(;? )r/$1r/g;

    # Emit the converted line; the newline removed by chop is put back.
    print;
    print "\n";
}

# If the main loop recorded any problematic lines, write them to
# "<input file>.log" and tell the user on STDERR that the log exists.
if (@problems) {
    my $logname = "$filename.log";

    # Three-argument open: the file name is never parsed for mode
    # characters.  Stop with a clear message if the log cannot be created,
    # so that the problem report is not lost silently.
    open(my $log, '>', $logname)
        or die "mconv: cannot create $logname: $!\n";

    print $log "This file generated by\n";
    print $log "mconv $filename\n";
    print $log "$date\n";
    print $log "Mconv regards the following line(s) as problematical:\n";
    print $log "they contain either too many semicolons or too few.\n";
    print $log "(Note that the lines are quoted from an early stage\n";
    print $log "of mconv's processing, so there may be minor textual\n";
    print $log "divergences from the final output.)\n\n";

    # Drain @problems in order.  The lines were captured while the
    # temporary one-letter forms E and O were in use, so expand these back
    # to "ai" and "au" before writing them out.
    while (@problems) {
        my $line = shift(@problems);
        $line =~ s/E/ai/g;
        $line =~ s/O/au/g;
        print $log "$line\n";
    }

    # Buffered write errors only surface when the file is closed.
    close($log) or die "mconv: error writing $logname: $!\n";

    # Announce the log only once it has really been written.
    print STDERR "\nThe file $logname contains important information.\n\n";
}


Back to home page