The mkmconv program


Mkmconv compares two versions of the same piece of Mahabharata text: the first should be a simple CSX transliteration of a Tokunaga passage, and will thus contain "false spaces" (and other errors); the second should be the same passage corrected by hand. Mkmconv isolates all cases where a false space has been eliminated, and then runs edmconv to establish how each such case should be treated.

Note that the original program makes direct use of accented characters in the CSX encoding; in the listing below these have been converted to their Unicode equivalents so that they display legibly on modern computers. As a result the listing is intended for reading: downloading it and attempting to run it as it stands is unlikely to work.



#!/usr/bin/perl
#-*-Perl-*-

#-------------------------------------------------------------------#
# Usage text. The script prints it when -h is given or when the number
# of file arguments is not exactly two. A non-interpolating heredoc is
# used so the text appears exactly as it will be printed.
$description = <<'END_OF_DESCRIPTION';
Syntax: mkmconv [options] file1 file2

Mkmconv compares file1, which should be a Tokunaga Mahābhārata file
converted by mconv to CSX format, with file2, which should be a
more correct version of the same text, and attempts to generate a
script that can be used to correct further Tokunaga texts.

-h option prints this help.
-l option specifies the limit on the length of the discrepancy which
          mkmconv will tolerate between file1 and file2. The default
          is 10 words. If the text contains very long compounds it
          may be necessary to specify a higher number.
END_OF_DESCRIPTION
#-------------------------------------------------------------------#

# Command-line handling.
#
# Getopt::Std replaces the Perl 4 library getopts.pl, which is no longer
# shipped with Perl (removed from core in 5.16). Called without a hash
# reference, getopts() sets the same globals the old Getopts() did:
# $opt_h (flag) and $opt_l (value).
use Getopt::Std;

# Show the usage text for -h, for an unrecognised option, or when the
# number of remaining arguments is not exactly two (file1 and file2).
if (!getopts('l:h') || $opt_h || @ARGV != 2) {
    print $description;
    exit 1;
}

# $safetylimit is the largest number of alignment steps tolerated for a
# single discrepancy. As before, an absent (or zero) -l selects the
# default of 10.
if ($opt_l) {
    # A non-numeric limit would silently compare as 0 later on, so
    # reject it here instead.
    if ($opt_l !~ /^\d+$/) {
        print STDERR "mkmconv: the -l option requires a whole number\n";
        exit 1;
    }
    $safetylimit = $opt_l;
}
else {
    $safetylimit = 10;
}

# Turn each of the two input texts into a word list, "<name>.wordlist",
# by passing it through prepare(). The word lists are what diff compares.
#
# Every open is checked: previously a missing or unreadable input file
# was silently treated as empty, producing meaningless output. Three-
# argument open keeps unusual file names from being read as open modes.
$file1 = shift;
$file2 = shift;

open(FILE1, '<', $file1)
    or die "mkmconv: cannot read $file1: $!\n";
$outfile1 = $file1 . ".wordlist";
open(OUT_FILE1, '>', $outfile1)
    or die "mkmconv: cannot write $outfile1: $!\n";
# The handles are passed by name, exactly as before.
&prepare(FILE1, OUT_FILE1);
close FILE1;
# Buffered write errors (e.g. a full disk) only surface at close.
close(OUT_FILE1)
    or die "mkmconv: error writing $outfile1: $!\n";

open(FILE2, '<', $file2)
    or die "mkmconv: cannot read $file2: $!\n";
$outfile2 = $file2 . ".wordlist";
open(OUT_FILE2, '>', $outfile2)
    or die "mkmconv: cannot write $outfile2: $!\n";
&prepare(FILE2, OUT_FILE2);
close FILE2;
close(OUT_FILE2)
    or die "mkmconv: error writing $outfile2: $!\n";

# Compare the two word lists side by side, keeping only the differing
# lines, and save the result in "$file1.diff".
system("diff -y --suppress-common-lines $outfile1 $outfile2 >$file1.diff");
# diff exits with 0 (identical) or 1 (differences found); any other
# outcome means the comparison itself failed.
if ($? == -1 || ($? & 127) || ($? >> 8) > 1) {
    die "mkmconv: diff of $outfile1 and $outfile2 failed\n";
}

open(DIFF_FILE, '<', "$file1.diff")
    or die "mkmconv: cannot read $file1.diff: $!\n";
open(CORRECTION_FILE, '>', "$file1.corrections")
    or die "mkmconv: cannot write $file1.corrections: $!\n";
open(DISCR_FILE, '>', "$file1.discrepancies")
    or die "mkmconv: cannot write $file1.discrepancies: $!\n";

# Column headings for the discrepancies file, in the same layout as the
# discrepancy lines written later.
# NOTE(review): this statement was damaged in the published listing: the
# text between "print DISCR_FILE <" and the '>    ", "/corrected-form/"'
# tail is missing. The left-hand label and the absence of any
# introductory text are a reconstruction -- restore the original wording
# if a clean copy of the program is available.
printf DISCR_FILE "%-35s%s%s\n", "/Tokunaga-form/", ">    ", "/corrected-form/";

# Split every diff line into its left-hand word (file1), the one-character
# gutter marker, and its right-hand word (file2). Non-empty words are
# queued separately in @from and @to, so the two lists need not have the
# same length.
@from = ();
@to = ();
# The listing showed "while ()"; the read from DIFF_FILE is restored here
# (DIFF_FILE is the handle opened above and closed after the loop).
while (<DIFF_FILE>) {
    if (! /^(\S*)\s*(\S)\s*(\S*)$/) {
        print DISCR_FILE "# Malformed diff line: $_";
        next;
    }
    $from = $1;
    push(@from, $from) if $from ne "";
    $to = $3;
    push(@to, $to) if $to ne "";
}
close DIFF_FILE;
# Align one discrepancy and record it in the discrepancies file.
#
# Called with the pair of words at which the two lists stop agreeing.
# Further words are pulled from the global queues @from and @to,
# always on the side that currently has fewer vowels, until both sides
# contain the same number of vowels or the side that needs more words
# has run out. The two aligned stretches are then written to DISCR_FILE
# as "/from/   >    /to/".
#
# Returns the number of alignment steps taken; the caller compares this
# with $safetylimit.
#
# NOTE(review): the vowel list below is shown as Unicode characters;
# tr/// only counts them one-per-letter if the program text and the data
# are handled in a matching character encoding -- confirm before use.
sub report_discrepancy {
    my ($from, $to) = @_;
    my ($tmpfrom, $tmpto) = ("", "");
    my $steps = 0;

    while (1) {
        $from = "" unless defined $from;
        $to   = "" unless defined $to;

        # Write the diphthongs "ai"/"au" as the single letters "I"/"U"
        # so that each is counted as one vowel, not two.
        $from =~ s/a([iu])/\u$1/g;
        $tmpfrom .= $from;
        my $cntfrom = ($tmpfrom =~ tr/aāiīuūṛṝḷeoIU//);
        $to =~ s/a([iu])/\u$1/g;
        $tmpto .= $to;
        my $cntto = ($tmpto =~ tr/aāiīuūṛṝḷeoIU//);
        $steps++;

        last if $cntfrom == $cntto;

        if ($cntfrom < $cntto) {
            # The file1 side is shorter: take its next word. Stop if none
            # is left -- the old code kept looping here forever.
            last unless @from;
            $tmpfrom .= " ";
            $to = "";
            $from = shift(@from);
        }
        else {
            last unless @to;
            $tmpto .= " ";
            $from = "";
            $to = shift(@to);
        }
    }

    # Turn "I"/"U" back into "ai"/"au" for display.
    $tmpfrom =~ s/([IU])/a\l$1/g;
    $tmpto   =~ s/([IU])/a\l$1/g;
    printf DISCR_FILE "%-35s%s%s\n", "/$tmpfrom/", ">    ", "/$tmpto/";
    return $steps;
}

# Walk through the queued word pairs.
#
# - If the file2 word does not begin with the file1 word, the pair is a
#   genuine discrepancy and is reported.
# - Otherwise file1 words are joined (with spaces) until, with the spaces
#   removed, they equal the file2 word: that is a "false space", and a
#   substitution command is written to the corrections file. If the
#   joined words stop being a prefix of the file2 word, or file1 runs
#   out of words, the stretch is reported as a discrepancy instead.
#
# A while loop is used (not do ... until) so that two identical inputs,
# which give empty queues, produce no output; the old loop ran once on
# the empty queues and emitted the harmful correction "s/ //g;".
# Prefixes are tested with index() so that punctuation in a word is not
# interpreted as regular-expression syntax.
$safetycheck = 0;
while (@from || @to) {
    $from = shift(@from);
    $to = shift(@to);
    $from = "" unless defined $from;
    $to = "" unless defined $to;

    if (index($to, $from) != 0) {
        $safetycheck = report_discrepancy($from, $to);
    }
    else {
        $done = 0;
        until ($done) {
            if (!@from) {
                # No file1 words left to join: report what we have
                # rather than looping forever.
                $safetycheck = report_discrepancy($from, $to);
                last;
            }
            $from .= " " . shift(@from);
            ($stripfrom = $from) =~ s/ //g;
            if ($stripfrom eq $to) {
                print CORRECTION_FILE "s/$from/$to/g;\n";
                $done = 1;
            }
            elsif (index($to, $stripfrom) != 0) {
                $safetycheck = report_discrepancy($from, $to);
                $done = 1;
            }
        }
    }

    # A very long alignment usually means the two files have drifted
    # apart; stop and let the user inspect the discrepancies file.
    if ($safetycheck > $safetylimit) {
        print "\nAn abnormally long \"discrepancy\" occurred.\n";
        print "Check $file1.discrepancies";
        print " and edit files as necessary.\n\n";
        exit 1;
    }
}
close CORRECTION_FILE;
close DISCR_FILE;

# Sort the corrections file in place, dropping duplicate commands. The
# list form of system() runs sort directly, without a shell, so unusual
# characters in the file name cannot be misread.
system('sort', '-u', '-o', "$file1.corrections", "$file1.corrections") == 0
    or warn "mkmconv: could not sort $file1.corrections\n";

# Timestamp for the file headers. This used to run the external command
# "Date", which does not exist under that name on case-sensitive
# systems; Perl's own localtime needs no external program. The trailing
# newline matches what the command output supplied, so the header
# layout (a blank line after the date) is unchanged.
$date = localtime() . "\n";

# Start the compounds, stems and words files, each with a three-line
# comment header recording how it was generated.
foreach $header (
    [ "compounds", "These compound corrections generated by" ],
    [ "stems",     "These stem corrections generated by" ],
    [ "words",     "This list generated by" ],
) {
    ($suffix, $intro) = @$header;
    open(HEADER_FILE, '>', "$file1.$suffix")
        or die "mkmconv: cannot write $file1.$suffix: $!\n";
    print HEADER_FILE "\# $intro\n";
    print HEADER_FILE "\# \"mkmconv $file1 $file2\"\n";
    print HEADER_FILE "\# $date\n";
    close(HEADER_FILE)
        or die "mkmconv: error writing $file1.$suffix: $!\n";
}

# NOTE(review): the published listing is damaged at this point. Everything
# between "print <" and the ") {" of the read loop in prepare() is
# missing: the closing message, the header of prepare(), and possibly
# further statements (the introduction to this page says the program
# goes on to run edmconv). The message below is a placeholder and the
# subroutine header is reconstructed from the surviving body and from
# the calls "&prepare(FILE, OUT_FILE)" -- restore the original text if a
# clean copy of the program is available.
print "\nmkmconv: see $file1.corrections and $file1.discrepancies\n\n";

# prepare(INFILE, OUTFILE)
#
# Copies the text read from the handle INFILE to the handle OUTFILE as a
# word list, one word per line. Either handle may be given by name or as
# a reference.
#
# A line is skipped when it contains a brace or backslash, contains none
# of the characters in the class ['A-zÇ-Ḥ], starts with digits followed
# by two spaces, or starts with "%". On the remaining lines a leading
# number (optionally followed by one capital letter) is removed, runs of
# spaces are squeezed to one, semicolons are deleted, and each remaining
# space becomes a line break.
sub prepare {
    my ($infile, $outfile) = @_;
    while (<$infile>) {
        next if (/[{}\\]/) || !(/['A-zÇ-Ḥ]/) || (/^[0-9]+  /) || (/^%/);
        s/^([0-9]+[A-Z]?\s*)([^\s].*)$/$2/;
        tr/ / /s;
        s/;//g;
        s/ /\n/g;
        print($outfile $_);
    }
}


Back to home page