The mconv program


Mconv does the basic task of converting a Tokunaga Mahabharata file. It standardises the layout, converts Tokunaga's transliteration scheme to CSX, resolves as many as possible of the ambiguities introduced by his "short cuts" (such as "m" standing for both the labial nasal and anusvara), corrects common errors, normalises vowel sandhi, attempts to rationalise the use of the semicolon, and imposes correct consonant sandhi. Finally, the text is optionally passed through mconv.spacing, which seeks to mend as many of the broken compounds as possible. (Even more finally, it is passed through mconv.cleanup, which does what its name suggests.)

Note that the original program makes direct use of accented characters in the CSX encoding; in the listing below these have been converted to their Unicode equivalents to ensure readability on modern computers. As a result the listing no longer contains the single-byte CSX characters that the program actually works with, so downloading it and attempting to run it as it stands is unlikely to work.



#!/usr/bin/perl
#-*-Perl-*-

#------------------------------------------------------------------#
# Usage text; printed when -h is given or when the number of file
# arguments is not exactly one.
$description =
"Syntax: mconv [options] file

Mconv converts \"file\", which should be an uncompressed Tokunaga
Mahābhārata file, to CSX format. As part of this process it inserts
corrections generated by mkmconv.

-h option prints this help.
-s option runs in \"stupid\" mode, i.e. deliberately does not make
   any of the corrections suggested by mkmconv.
";
#------------------------------------------------------------------#

# Parse the command-line switches.  Getopt::Std is used instead of the
# Perl 4 library getopts.pl, which is no longer distributed with perl.
# getopts() sets the package globals $opt_h and $opt_s and removes the
# switches it recognises from @ARGV, so the rest of the script is
# unaffected by the change.
use Getopt::Std;
getopts(':hs');

# Exactly one argument (the input file) must remain after the switches.
if ($opt_h || @ARGV != 1) {
    print $description;
    exit 1;
}

# Name of the input file; it is quoted in the output header and is the
# stem of the name of the log file written at the end of the run.
$filename = $ARGV[0];

# Lines whose semicolon placement could not be settled are collected
# here by the main loop and written to the log file afterwards.
@problems = ();

# Time stamp for the output header and the log.  The command name is
# spelled in lower case: "Date" only resolves on case-insensitive
# filesystems.  The output of date(1) keeps its trailing newline, which
# the print statements that use $date rely on.
$date = `date`;

# Everything printed to STDOUT from here on is piped through the
# post-processing program(s).
if ($opt_s) {
    # -s: bypass mconv.spacing and go straight to mconv.cleanup.
    open(STDOUT, "|mconv.cleanup")
        or die "mconv: cannot start mconv.cleanup: $!\n";
}
else {

#   The following line (commented out) runs the DANGEROUS unbatched option.
#   Check output very carefully if you use it.
#   open(STDOUT, "|perl -sp mconv.spacing -unbatched |mconv.spacing |mconv.cleanup");

#   The standard safer option follows:
    open(STDOUT, "|mconv.spacing |mconv.cleanup")
        or die "mconv: cannot start mconv.spacing |mconv.cleanup: $!\n";
}

# Header of the generated file: three "%" lines, followed by a bare
# "%" line (because $date ends in a newline).
print "% This file generated by\n";
print "% mconv $filename\n";
print "% $date\%\n";
# Main conversion loop: each input line is held in $_, rewritten in
# place by a long series of substitutions, and printed.  The
# substitutions depend on one another, so their order matters.
while (<>) {

    # Correct layout and transliteration

    # Lines that begin with "%" or with whitespace are dropped.
    next if (/^%/ || /^\s/);
    study;
    s/\015//g;                  # Get rid of DOS CR char
    chomp;                      # Strip the newline only (chop would eat
                                # a real character on an unterminated
                                # last line)
    s/^0//;                     # Get rid of a leading zero (one only)

    # A trailing "{...}" tag is cut off the line and printed on a line
    # of its own, after the first 8 characters of what remains.  The
    # tag is removed by a direct substitution, so its text is never
    # re-interpreted as a regular expression.
    if (s/\s*(\{.*\})$//) {
        $uvaaca = $1;
        print substr($_, 0, 8) . "  $uvaaca\n";
    }

    tr/././s;                   # Squeeze runs of dots to a single dot
    s/\///;                     # Delete the first slash ...
    s/\/.*$//;                  # ... and everything from the next one on
    s/\./ /g;                   # Dots become spaces
    s/ $//;                     # No trailing space
    s/aa/ā/g;
    s/ii/ī/g;
    s/uu/ū/g;
    s/R/ṛ/g;
    s/ṛṛ/ṝ/g;
    s/T/ṭ/g;
    s/D/ḍ/g;
    s/N/ṇ/g;
    s/z/ś/g;
    s/S/ṣ/g;

    # Temporary 1-letter versions of ai and au
    # (They are not turned back into ai/au anywhere in this loop;
    # presumably a later stage of the pipeline does that -- TODO confirm.)

    s/ai/E/g;
    s/au/O/g;

    # Correct visarga, anusvāra and ṅ, ñ

    s/([aāiīuūṛeoEO])h([ ;kpśṣs]|$)/$1ḥ$2/g;
    s/m( |;)([^aāiīuūṛeoEO])/ṃ$1$2/g;
    s/sam([^ ;aāiīuūṛeoEO])/saṃ$1/g;
    s/saṃrā([jṭḍ])/samrā$1/g;
    s/saṃya([ñṅkg])/samya$1/g;
    s/( |;)pum([ ;]?)([^aāiīuūṛeoEO])/$1puṃ$2$3/g;
    s/( |;)puṃbh/$1pumbh/g;
    s/([aāiīuūṛṝeoEO])m([kgcjṭḍṇtdmśṣsh])/$1ṃ$2/g;
    s/śaṃkh/śaṅkh/g;
    s/( |;)saṃga /$1saṅga /g;
    s/ml( |$)/ṁl$1/g;
    s/n([cj])/ñ$1/g;
    s/([cj])n/$1ñ/g;
    s/n([kg])/ṅ$1/g;
    s/( |;)(a?)pratyan( |;|$)/$1$2pratyaṅ$3/g;
    s/( |;)(a?)pratyann( |;)/$1$2pratyaṅṅ$3/g;
    s/( |;)(a?)prān( |;|$)/$1$2prāṅ$3/g;
    s/( |;)(a?)parān( |;)(man|mukh)/$1$2parāṅ$3$4/g;

    # Correct anomalies and common typos that are best dealt with before sandhi

    s/(k[^ ;]+)(ci[tdncjl]|cana)($|[ ;])/$1 $2$3/g;
    s/([ ;])kāñ cana/$1kāñcana/g;
    s/([krṛṝ])s/$1ṣ/g;
    s/([ ;])kṛṣar/$1kṛsar/g;
    s/([ ;])bṛṣ/$1bṛs/g;
    s/([ ;])viṣvakṣen/$1viṣvaksen/g;
    s/([ ;])nṛṣiṃh/$1nṛsiṃh/g;
    s/ṣt/ṣṭ/g;

    # "N" is free at this point (every N of the input has already become
    # ṇ) and serves as a temporary marker: an n marked N by the first
    # substitution is not touched by the n -> ṇ substitution that
    # follows, and is turned back into n three lines later.
    s/(dur|ni[rṣ]|par[iyā]|pr[aāoeEO]|antar)([kgṅpbmyrvh]*)([aāiīuūeṛṝoEO]*)([aāiīuūeṛṝoEOkgṅpbmyrvh]*)n/$1$2$3$4N/g;
    s/([rṛṝṣ])([aāiīuūeṛṝoEOkgṅpbmyrvh]*)n([aāiīuūṛṝeoEOnmyv])/$1$2ṇ$3/g;
    s/ṇn([^ ;])/ṇṇ$1/g;
    s/ṇn/nn/g;
    s/N/n/g;
    s/nirvinn/nirviṇṇ/g;
    s/prayān([aāeo])/prayāṇ$1/g;
    s/ṇṛt/nṛt/g;
    s/ṇart/nart/g;
    s/ṇabh/nabh/g;
    s/ṇand/nand/g;
    s/ṇaṣṭ/naṣṭ/g;
    s/prāpṇ/prāpn/g;
    s/ghṇ/ghn/g;
    s/([iīuūṛeoEO])s([uv])($|[ ;])/$1ṣ$2$3/g;
    s/([eo])s([aāiīuūṛeoEO])/$1ṣ$2/g;
    s/isy/iṣy/g;
    s/maṇīṣ/manīṣ/g;
    s/av([ ;])/āv$1/g;
    s/tisth/tiṣṭh/g;
    s/nisth/niṣṭh/g;
    s/nEsth/nEṣṭh/g;
    # NOTE(review): no /g here, so only the first occurrence in a line
    # is changed -- confirm that this is intended.
    s/ṣaṇ([ ;])/ṣaṭ$1/;
    s/ṇ([ ;])/n$1/g;
    s/rūdh/rūḍh/g;
    s/cest/ceṣṭ/g;
    s/pattr/patr/g;
    s/([aāiīuūṛṝeoEO])ṛ([ṣṇ])/$1r$2/g;
    s/([^ ;])ayāmās/$1ayām ās/g;
    s/ar([ ;])([kpśṣs])/aḥ$1$2/g;
    s/(tira|pura|nama)([sḥ]) (kṛ|kar|kur[uv])/$1s$3/g;
    s/preṣp/preps/g;

    # Correct vowel sandhi
    # (The first eight substitutions fuse two vowels separated by a
    # single space into one vowel, removing the space.)

    s/[aā] [aā]/ā/g;
    s/[iī] [iī]/ī/g;
    s/[uū] [uū]/ū/g;
    s/[aā] [iī]/e/g;
    s/[aā] [uū]/o/g;
    s/[aā] ṛ/ar/g;
    s/[aā] [eE]/E/g;
    s/[aā] [oO]/O/g;
    s/[iī]( [aāuūṛeoEO])/y$1/g;
    s/[uū]( [aāiīṛeoEO])/v$1/g;
    s/ṛ ([aāiīuūeoEO])/r$1/g;
    s/([eo]) a/$1 '/g;
    s/[eo]( [āiīuūṛeoEO])/a$1/g;
    s/E( [aāiīuūṛeoEO])/ā$1/g;
    s/O( [aāiīuūṛeoEO])/āv$1/g;

    # Deal with ";"

    # A semicolon standing directly between two vowels is parked as ":";
    # every other semicolon gets a space after it.
    s/([aāiīuūṛeoEO]);([aāiīuūṛeoEO])/$1:$2/g;
    s/;/; /g;

    # Lines with more than 18 vowels that contain no "; " are examined.
    # If such a line has parked colons, they are promoted to "; " and
    # the line is recorded as a problem when that yields more than one
    # semicolon.  If it has no colons either, it is recorded as a
    # problem because it has no semicolon at all.
    $count = (tr/aāiīuūṛṝḷeoEO//);
    if ($count > 18) {
        if (!/; /) {
            if (/:/) {
                s/:/; /g;
                $count = (tr/;//);
                if ($count > 1) {
                    push(@problems, $_);
                }
            }
            else {
                push(@problems, $_);
            }
        }
    }
    # A colon that is still parked becomes a space.
    # NOTE(review): no /g here, so only the first colon in a line is
    # replaced -- confirm that this is intended.
    s/:/ /;

    # Correct consonant sandhi
    # (In these patterns "(;? )" is a word boundary: an optional
    # semicolon followed by a space.)

    s/s(;? |$)/ḥ$1/g;
    s/ (vā|tva|ru|sru|mu)c(;? |$)/ $1k$2/g;
    s/ (ru|ṛtvi|vaṇi|bhiṣa|sra|ūr)j(;? |$)/ $1k$2/g;
    s/ (di|dṛ|spṛ)ś(;? |$)/ $1k$2/g;
    s/ (samrā|parivrā)j(;? |$)/ $1ṭ$2/g;
    s/ viś(;? |$)/ viṭ$1/g;
    s/[cj](;? )/t$1/g;
    s/([kgṭḍtdpb])\1(;? )/$1$2/g;
    s/k(;? )([gjḍdbyrlvhaāiīuūṛeoEO])/g$1$2/g;
    s/g(;? )([kcṭtpśṣs])/k$1$2/g;
    s/[kg](;? )([nm])/ṅ$1$2/g;
    s/ṭ(;? )([gjḍdbyrlvhaāiīuūṛeoEO])/ḍ$1$2/g;
    s/ḍ(;? )([kcṭtpśṣs])/ṭ$1$2/g;
    s/[ṭḍ](;? )([nm])/ṇ$1$2/g;
    s/t(;? )([gjḍdbyrlvhaāiīuūṛeoEO])/d$1$2/g;
    s/(d|dh)(;? )([kcṭtpśṣs])/t$2$3/g;
    s/([td]|dh)(;? )([nm])/n$2$3/g;
    s/p(;? )([gjḍdbyrlvhaāiīuūṛeoEO])/b$1$2/g;
    s/(b|bh)(;? )([kcṭtpśṣs])/p$2$3/g;
    s/([pb]|bh)(;? )([nm])/m$2$3/g;
    s/([gḍdb])(;? )h/$1$2$1h/g;
    s/n(;? )c/ṃś$1c/g;
    s/n(;? )ṭ/ṃṣ$1ṭ/g;
    s/n(;? )t/ṃs$1t/g;
    s/n(;? )([jś])/ñ$1$2/g;
    s/n(;? )ḍ/ṇ$1ḍ/g;
    s/n(;? )l/ṁl$1l/g;
    s/t(;? )c/c$1c/g;
    s/t(;? )ś/c$1ch/g;
    s/t(;? )ṭ/ṭ$1ṭ/g;
    s/d(;? )j/j$1j/g;
    s/d(;? )ḍ/ḍ$1ḍ/g;
    s/d(;? )l/l$1l/g;
    s/ḥ(;? )c/ś$1c/g;
    s/ḥ(;? )ṭ/ṣ$1ṭ/g;
    s/ḥ(;? )t/s$1t/g;
    s/āḥ(;? )([^kpśṣs])/ā$1$2/g;
    s/aḥ(;? )([āiīuūṛeoEO])/a$1$2/g;
    s/aḥ(;? )([gjḍdnbmyrlvh])/o$1$2/g;
    s/aḥ(;? )a/o$1'/g;
    s/ḥ(;? )([aāiīuūṛeoEOgjḍdnbmyrlvh])/r$1$2/g;
    s/ar(;? )r/ā$1r/g;
    s/ir(;? )r/ī$1r/g;
    s/ur(;? )r/ū$1r/g;
    s/r(;? )r/$1r/g;

    # Emit the converted line (its newline was stripped above).
    print;
    print "\n";
}

# If the main loop recorded any problematic lines, write them to
# "<input file>.log" and tell the user on STDERR that the log exists.
if (@problems) {
    # Three-argument open: the file name is never parsed for mode
    # characters, and a failure to create the log is reported instead
    # of being silently ignored.
    open(my $log, '>', "$filename.log")
        or die "mconv: cannot write $filename.log: $!\n";

    # Announce the log only once it is known that it can be written.
    print STDERR "\nThe file $filename.log contains important information.\n\n";

    print $log "This file generated by\n";
    print $log "mconv $filename\n";
    print $log "$date\n";
    print $log "Mconv regards the following line(s) as problematical:\n";
    print $log "they contain either too many semicolons or too few.\n";
    print $log "(Note that the lines are quoted from an early stage\n";
    print $log "of mconv's processing, so there may be minor textual\n";
    print $log "divergences from the final output.)\n\n";

    foreach my $problem (@problems) {
        # Work on a copy, and spell the temporary one-letter E and O
        # as ai and au again for the reader.
        my $line = $problem;
        $line =~ s/E/ai/g;
        $line =~ s/O/au/g;
        print $log "$line\n";
    }

    # All problems have been reported; leave the list empty.
    @problems = ();

    # Buffered write errors only surface when the handle is closed.
    close($log)
        or die "mconv: error writing $filename.log: $!\n";
}


Back to home page