The mkmconv program


Mkmconv compares two versions of the same piece of Mahabharata text: the first should be a simple CSX transliteration of a Tokunaga passage, and will thus contain "false spaces" (and other errors); the second should be the same passage corrected by hand. Mkmconv isolates all cases where a false space has been eliminated, and then runs edmconv to establish how each such case should be treated.

Note that this program makes direct use of accented characters in the CSX encoding; unless your Web browser uses a font based on this encoding (which is vanishingly unlikely) these characters will not appear correctly. The simplest expedient is to save this file to your own disk and view it there with software that does accept a CSX-based font.



#!/usr/bin/perl
#-*-Perl-*-

#-------------------------------------------------------------------#
$description =
"Syntax: mkmconv [options] file1 file2

Mkmconv compares file1, which should be a Tokunaga Mahabharata file
converted by mconv to CSX format, with file2, which should be a
more correct version of the same text, and attempts to generate a
script that can be used to correct further Tokunaga texts.

-h option prints this help.
-l option specifies the limit on the length of the discrepancy which
          mkmconv will tolerate between file1 and file2. The default
          is 10 words. If the text contains very long compounds it
          may be necessary to specify a higher number.
";
#-------------------------------------------------------------------#

# Command-line handling.
#
# Getopt::Std replaces the Perl 4 library getopts.pl, which is no longer
# shipped with Perl (removed from core in 5.16).  getopts() sets the
# package globals $opt_l and $opt_h exactly as Getopts() used to, so the
# rest of the script is unaffected.
use Getopt::Std;

# Show the usage text when help is requested, when an unknown option is
# given, or when the two file arguments are not both present.
my $options_ok = getopts('l:h');
if (!$options_ok || $opt_h || @ARGV != 2) {
    print $description;
    exit 1;
}

# The limit is later compared numerically against a step counter, so a
# non-numeric value would silently behave like 0; reject it up front.
if (defined $opt_l && $opt_l !~ /^[0-9]+$/) {
    print "mkmconv: the -l limit must be a whole number, not \"$opt_l\"\n\n";
    print $description;
    exit 1;
}

# $safetylimit: longest discrepancy tolerated before the run is aborted.
# As before, an absent (or zero) -l falls back to the default of 10.
$safetylimit = $opt_l ? $opt_l : 10;

# $file1 is the uncorrected text, $file2 the hand-corrected one.
$file1 = shift;
$file2 = shift;

# Reduce each input text to a "<name>.wordlist" file via prepare(), so
# that the two texts can be compared with diff.
#
# The handles are passed to prepare() by name, as the original bareword
# arguments were.  Every open is checked: a missing or unreadable input
# used to produce an empty word list without any error message.
open(FILE1, '<', $file1)
    or die "mkmconv: cannot read $file1: $!\n";
$outfile1 = $file1 . ".wordlist";
open(OUT_FILE1, '>', $outfile1)
    or die "mkmconv: cannot write $outfile1: $!\n";
prepare('FILE1', 'OUT_FILE1');
close FILE1;
# Buffered write errors only surface when the handle is closed.
close(OUT_FILE1)
    or die "mkmconv: error writing $outfile1: $!\n";

open(FILE2, '<', $file2)
    or die "mkmconv: cannot read $file2: $!\n";
$outfile2 = $file2 . ".wordlist";
open(OUT_FILE2, '>', $outfile2)
    or die "mkmconv: cannot write $outfile2: $!\n";
prepare('FILE2', 'OUT_FILE2');
close FILE2;
close(OUT_FILE2)
    or die "mkmconv: error writing $outfile2: $!\n";

# Compare the two word lists side by side, keeping only the differing
# lines, and save the result as "$file1.diff".
#
# diff is started through a list-form pipe rather than a shell command
# line, so file names containing spaces or shell metacharacters are
# passed through intact.
open(DIFF_OUT, '>', "$file1.diff")
    or die "mkmconv: cannot write $file1.diff: $!\n";
open(DIFF_PIPE, '-|', 'diff', '-y', '--suppress-common-lines',
     $outfile1, $outfile2)
    or die "mkmconv: cannot run diff: $!\n";
while (defined(my $diff_line = <DIFF_PIPE>)) {
    print DIFF_OUT $diff_line;
}
close DIFF_PIPE;
# diff exits with 1 when the inputs differ (the normal case here);
# a status above 1, or death by signal, means the comparison failed.
if (($? & 127) || ($? >> 8) > 1) {
    die "mkmconv: diff failed on $outfile1 and $outfile2\n";
}
close(DIFF_OUT)
    or die "mkmconv: error writing $file1.diff: $!\n";

# DIFF_FILE is read back by the code that follows; CORRECTION_FILE and
# DISCR_FILE receive the generated corrections and the unresolved
# discrepancies respectively.
open(DIFF_FILE, '<', "$file1.diff")
    or die "mkmconv: cannot read $file1.diff: $!\n";
open(CORRECTION_FILE, '>', "$file1.corrections")
    or die "mkmconv: cannot write $file1.corrections: $!\n";
open(DISCR_FILE, '>', "$file1.discrepancies")
    or die "mkmconv: cannot write $file1.discrepancies: $!\n";

# NOTE(review): this statement was damaged when the script was extracted
# from a web page -- everything between a "<" and the ">" of the column
# separator was lost, apparently including a here-document.  Only the
# separator and the "/corrected-form/" label survive.  The format string
# is the one used for the discrepancy lines later in the script; the
# first column label is a reconstruction.  Confirm against a pristine
# copy and restore any lost header text.
printf DISCR_FILE "%-35s%s%s\n", "/original-form/", ">    ", "/corrected-form/";

# Read the side-by-side diff and collect the differing words of each
# text into two parallel queues: @from (left column, uncorrected text)
# and @to (right column, corrected text).
@from = ();
@to = ();
# The loop must read from DIFF_FILE: with an empty condition, "while ()"
# never terminates and never reads a line.
while (<DIFF_FILE>) {
    # Expected shape: optional left word, one non-blank marker
    # character, optional right word.
    if (! /^(\S*)\s*(\S)\s*(\S*)$/) {
        print DISCR_FILE "# Malformed diff line: $_";
        next;
    }
    # A side with no word on this line contributes nothing to its queue.
    $from = $1;
    push(@from, $from) if $from ne "";
    $to = $3;
    push(@to, $to) if $to ne "";
}
close DIFF_FILE;
# Pair up the differing words of the two texts.
#
# Each pass takes the next word from each queue.  If the corrected word
# begins with the uncorrected one, further uncorrected words are joined
# on (separated by spaces) until the joined text, spaces removed, equals
# the corrected word; that is a "false space" and a substitution for it
# is written to CORRECTION_FILE (the words are written unescaped, as
# before).  Anything else is a genuine discrepancy: resync_discrepancy()
# gathers words until both sides cover the same number of vowels, and
# the pair is reported in DISCR_FILE.
#
# Differences from the earlier version of this loop:
#   * nothing is done when there are no differences at all (the loop
#     used to run once on empty queues and emit "s/ //g;");
#   * running out of uncorrected words while still extending a match is
#     reported as a discrepancy instead of looping forever;
#   * the prefix test is a literal comparison, so characters special in
#     regular expressions inside a word cannot change its meaning.
while (@from || @to) {
    $from = shift(@from);
    $to   = shift(@to);
    # One queue may run dry before the other.
    $from = "" unless defined $from;
    $to   = "" unless defined $to;
    $safetycheck = 0;

    my $needs_resync = 0;
    if (index($to, $from) != 0) {
        # The corrected word does not even start with the uncorrected one.
        $needs_resync = 1;
    }
    else {
        while (1) {
            if (!@from) {
                # Nothing left to join on: cannot be a false space.
                $needs_resync = 1;
                last;
            }
            $from .= " " . shift(@from);
            (my $joined = $from) =~ s/ //g;
            if ($joined eq $to) {
                print CORRECTION_FILE "s/$from/$to/g;\n";
                last;
            }
            if (index($to, $joined) != 0) {
                $needs_resync = 1;
                last;
            }
        }
    }

    if ($needs_resync) {
        ($from, $to, $safetycheck) = resync_discrepancy($from, $to);
        printf DISCR_FILE "%-35s%s%s\n", "/$from/", ">    ", "/$to/";
    }

    if ($safetycheck > $safetylimit) {
        print "\nAn abnormally long \"discrepancy\" occurred.\n";
        print "Check $file1.discrepancies";
        print " and edit files as necessary.\n\n";
        exit 1;
    }
}

# resync_discrepancy($from_word, $to_word)
#
# Starting from one word (or joined group of words) on each side, keep
# taking words from the global queues @from and @to -- always for the
# side that currently has fewer vowels -- until both sides contain the
# same number of vowels or the side that needs another word has none
# left.  While counting, "ai" and "au" are folded to the single letters
# "I" and "U" so that each counts as one vowel; they are expanded again
# before returning.
#
# Returns a three-element list: the accumulated uncorrected text, the
# accumulated corrected text (words separated by single spaces), and the
# number of steps taken, which the caller compares with $safetylimit.
#
# Stopping when the needed queue is empty fixes two faults of the
# earlier inline loops: they could spin forever once one queue was
# exhausted, and they could drop the last word taken from a queue.
sub resync_discrepancy {
    my ($from_word, $to_word) = @_;
    my ($from_text, $to_text) = ("", "");
    my $steps = 0;

    while (1) {
        $steps++;
        for ($from_word, $to_word) {
            s/a([iu])/\u$1/g;
        }
        $from_text .= $from_word;
        $to_text   .= $to_word;
        my $from_vowels = ($from_text =~ tr/aiueoIU//);
        my $to_vowels   = ($to_text   =~ tr/aiueoIU//);
        last if $from_vowels == $to_vowels;

        if ($from_vowels < $to_vowels) {
            last if !@from;
            $from_text .= " ";
            $to_word   = "";
            $from_word = shift(@from);
        }
        else {
            last if !@to;
            $to_text   .= " ";
            $from_word = "";
            $to_word   = shift(@to);
        }
    }

    # Undo the diphthong folding for display.
    for ($from_text, $to_text) {
        s/([IU])/a\l$1/g;
    }
    return ($from_text, $to_text, $steps);
}
# Finish the two report files.  Write errors on a buffered handle only
# show up at close, so the corrections file is checked.
close(CORRECTION_FILE)
    or die "mkmconv: error writing $file1.corrections: $!\n";
close DISCR_FILE;

# Sort the corrections in place and drop duplicates.  The list form of
# system() bypasses the shell, so unusual file names are safe.
system('sort', '-u', '-o', "$file1.corrections", "$file1.corrections") == 0
    or warn "mkmconv: could not sort $file1.corrections\n";

# Timestamp for the file headers.  This used to run an external command
# spelled "Date", which does not exist on case-sensitive systems; Perl's
# own localtime needs no external program.
$date = localtime;

# Create the three companion files, each containing only a short
# comment header recording how and when it was generated.  The trailing
# blank line matches what the newline-terminated output of date(1)
# used to produce.
for my $stub (
    [ 'compounds', 'These compound corrections generated by' ],
    [ 'stems',     'These stem corrections generated by' ],
    [ 'words',     'This list generated by' ],
) {
    my ($suffix, $title) = @$stub;
    open(HEADER_FILE, '>', "$file1.$suffix")
        or die "mkmconv: cannot write $file1.$suffix: $!\n";
    print HEADER_FILE "# $title\n";
    print HEADER_FILE "# \"mkmconv $file1 $file2\"\n";
    print HEADER_FILE "# $date\n\n";
    close(HEADER_FILE)
        or die "mkmconv: error writing $file1.$suffix: $!\n";
}

# NOTE(review): this part of the script was damaged when it was
# extracted from a web page.  A "print <<..." here-document, the header
# of the prepare subroutine and the opening of its read loop were all
# lost (everything between a "<" and the next ">").  The subroutine
# below is reconstructed from its two call sites, prepare(FILE1,
# OUT_FILE1) and prepare(FILE2, OUT_FILE2), and from the surviving loop
# body.  The text of the lost here-document, and any statements that
# stood next to it, cannot be recovered from here -- restore them from
# a pristine copy.

# prepare($infile, $outfile)
#
# Copy the text read from $infile to $outfile as a word list, one word
# per line.  Both arguments are filehandles, given either as handle
# names (as the callers in this script do) or as handle references.
#
# Lines are skipped entirely when they contain a brace or backslash,
# contain none of the characters in ['A-z-], start with digits followed
# by two spaces, or start with "%".  From the remaining lines a leading
# run of digits (optionally followed by one capital letter, and any
# whitespace) is removed -- presumably a verse reference; runs of spaces
# are squeezed to one, semicolons are deleted, and each space becomes a
# line break.
sub prepare {
    my ($infile, $outfile) = @_;

    while (defined(my $line = <$infile>)) {
        next if $line =~ /[{}\\]/;
        # NOTE(review): in ASCII the range A-z also spans the punctuation
        # between "Z" and "a"; kept as found.
        next if $line !~ /['A-z-]/;
        next if $line =~ /^[0-9]+  /;
        next if $line =~ /^%/;

        $line =~ s/^([0-9]+[A-Z]?\s*)([^\s].*)$/$2/;
        $line =~ tr/ / /s;
        $line =~ s/;//g;
        $line =~ s/ /\n/g;
        print {$outfile} $line;
    }
    return;
}


Back to home page