Note that this program makes direct use of accented characters in the CSX encoding; unless your Web browser uses a font based on this encoding (which is vanishingly unlikely) these characters will not appear correctly. The simplest expedient is to save this file to your own disk and view it there with software that does accept a CSX-based font.
#!/usr/bin/perl
#-*-Perl-*-
#-------------------------------------------------------------------#
# Usage text.  It is printed (and the program exits with status 1) for -h,
# for an unrecognised option, for a non-numeric -l value, or when the
# number of file arguments is not exactly two.
$description =
"Syntax: mkmconv [options] file1 file2
Mkmconv compares file1, which should be a Tokunaga Mahàbhàrata file
converted by mconv to CSX format, with file2, which should be a
more correct version of the same text, and attempts to generate a
script that can be used to correct further Tokunaga texts.
-h option prints this help.
-l option specifies the limit on the length of the discrepancy which
mkmconv will tolerate between file1 and file2. The default
is 10 words. If the text contains very long compounds it
may be necessary to specify a higher number.
";
#-------------------------------------------------------------------#
# Command-line handling.
#
# Getopt::Std is used instead of the Perl 4 library getopts.pl, which is
# no longer distributed with current Perl releases.  Called without a hash
# reference, getopts() still stores the switches in $opt_l and $opt_h, so
# the rest of the script is unaffected.
use Getopt::Std;

$options_ok = getopts('l:h');
if (!$options_ok || $opt_h || @ARGV != 2) {
    print $description;
    exit 1;
}

# -l is later compared numerically against an iteration counter, so reject
# anything that is not a plain non-negative integer.
if (defined($opt_l) && $opt_l !~ /^[0-9]+$/) {
    print "mkmconv: the -l option needs a whole number, not \"$opt_l\"\n\n";
    print $description;
    exit 1;
}

# A missing (or zero) -l falls back to the documented default of 10.
$safetylimit = $opt_l ? $opt_l : 10;

# The two remaining arguments are the input files, in order.
$file1 = shift;
$file2 = shift;
# Turn each input file into "<name>.wordlist" by passing it through
# prepare(), which receives the NAMES of the input and output handles.
#
# Every open is checked: previously a missing or unreadable input file
# silently produced an empty word list and a meaningless comparison.
# Three-argument open keeps odd characters in a file name from being
# interpreted as an open mode.
open(FILE1, '<', $file1)
    or die "mkmconv: cannot read $file1: $!\n";
$outfile1 = $file1 . ".wordlist";
open(OUT_FILE1, '>', $outfile1)
    or die "mkmconv: cannot write $outfile1: $!\n";
&prepare(FILE1, OUT_FILE1);
close FILE1;
# Buffered write errors (e.g. disk full) only surface at close time.
close(OUT_FILE1)
    or die "mkmconv: error while writing $outfile1: $!\n";

open(FILE2, '<', $file2)
    or die "mkmconv: cannot read $file2: $!\n";
$outfile2 = $file2 . ".wordlist";
open(OUT_FILE2, '>', $outfile2)
    or die "mkmconv: cannot write $outfile2: $!\n";
&prepare(FILE2, OUT_FILE2);
close FILE2;
close(OUT_FILE2)
    or die "mkmconv: error while writing $outfile2: $!\n";
# Compare the two word lists side by side, keeping only the lines that
# differ, and save the result as "$file1.diff".
#
# diff exits with 0 when the inputs are identical and 1 when they differ;
# both are normal here.  Anything else (status 2 or more, death by signal,
# or failure to start the command at all) means the comparison is unusable.
# NOTE(review): the file names are interpolated into a shell command line
# unquoted, so names containing spaces or shell metacharacters will break it.
$diffstatus = system("diff -y --suppress-common-lines $outfile1 $outfile2 >$file1.diff");
if ($diffstatus == -1 || ($diffstatus & 127) || ($diffstatus >> 8) > 1) {
    die "mkmconv: diff failed while comparing $outfile1 and $outfile2\n";
}

open(DIFF_FILE, '<', "$file1.diff")
    or die "mkmconv: cannot read $file1.diff: $!\n";
open(CORRECTION_FILE, '>', "$file1.corrections")
    or die "mkmconv: cannot write $file1.corrections: $!\n";
open(DISCR_FILE, '>', "$file1.discrepancies")
    or die "mkmconv: cannot write $file1.discrepancies: $!\n";

# Column heading for the discrepancy report, in the same layout as the
# entries written to DISCR_FILE further down.
# NOTE(review): the start of this statement was lost when the script was
# extracted from a web page; only the trailing "> ", "/corrected-form/"
# survived.  The first label ("/original-form/") and the absence of any
# further heading text are reconstructions -- confirm against a clean copy.
printf DISCR_FILE "%-35s%s%s\n", "/original-form/", "> ", "/corrected-form/";

# @from collects the differing words of file1, @to those of file2, each in
# file order.
@from = ();
@to = ();

# Each diff line is expected to hold an optional left-hand word, a
# one-character marker, and an optional right-hand word.
# (The "<DIFF_FILE>" read had been stripped from the loop condition,
# leaving "while ()", which loops forever without reading anything.)
while (<DIFF_FILE>) {
    if (! /^([^\s]*)\s*([^\s])\s*([^\s]*)$/) {
        print DISCR_FILE "# Malformed diff line: $_";
        next;
    }
    $from = $1;
    push(@from, $from) if $from ne "";
    $to = $3;
    push(@to, $to) if $to ne "";
}
close DIFF_FILE;
# align_discrepancy(FROM, TO)
#
# Records one genuine discrepancy in DISCR_FILE.  Starting from the two
# strings given, it keeps pulling further words from the global lists @from
# and @to -- always onto whichever side currently has fewer vowels -- until
# both sides contain the same number of vowels.  "ai" and "au" are folded
# to "I" and "U" while counting so that each diphthong counts once, and are
# unfolded again before the line is written.
#
# The loop also stops when
#   * the side that needs another word has none left, or
#   * more than $safetylimit rounds have been made,
# so it always terminates.  (Previously it could spin forever once one list
# ran dry, and the safety limit was only examined after the loop ended.)
#
# The accented letters in the tr/// lists are single-byte CSX characters;
# this file must stay in that encoding for the counts to be right.
#
# Returns the number of rounds taken; the caller compares it with
# $safetylimit.
sub align_discrepancy {
    my ($from, $to) = @_;
    my ($tmpfrom, $tmpto) = ("", "");
    my ($cntfrom, $cntto);
    my $steps = 0;

    while (1) {
        $from =~ s/a([iu])/\u$1/g;
        $tmpfrom .= $from;
        $cntfrom = ($tmpfrom =~ tr/aàiãuåçéëeoIU//);
        $to =~ s/a([iu])/\u$1/g;
        $tmpto .= $to;
        $cntto = ($tmpto =~ tr/aàiãuåçéëeoIU//);
        $steps++;

        last if $cntfrom == $cntto;        # both sides balanced
        last if $steps > $safetylimit;     # abnormally long: give up here

        if ($cntfrom < $cntto) {
            last unless @from;             # nothing left to balance with
            $tmpfrom .= " ";
            $to = "";
            $from = shift(@from);
        }
        else {
            last unless @to;
            $tmpto .= " ";
            $from = "";
            $to = shift(@to);
        }
    }

    # Restore the diphthongs that were folded for counting.
    ($from = $tmpfrom) =~ s/([IU])/a\l$1/g;
    ($to = $tmpto) =~ s/([IU])/a\l$1/g;
    printf DISCR_FILE "%-35s%s%s\n", "/$from/", "> ", "/$to/";
    return $steps;
}

# Walk the two lists of differing words in parallel.
#
# When the file1 word is a prefix of the file2 word, the usual cause is a
# compound that file1 splits into several words: further file1 words are
# joined on until, with the spaces removed, they equal the file2 word, and
# a substitution command is written to CORRECTION_FILE.  Anything else is
# handed to align_discrepancy() and ends up in DISCR_FILE.
#
# A plain "while" is used so that nothing at all is written when there are
# no differences (the former do/until ran once on empty lists and emitted a
# bogus "s/ //g;" correction).
while (@from || @to) {
    my $from = shift(@from);
    my $to = shift(@to);
    $from = "" unless defined $from;       # file1 list ran out first
    $to = "" unless defined $to;           # file2 list ran out first
    my $steps = 0;

    # index() is a literal prefix test; interpolating the word into a
    # regex would misbehave on words containing regex metacharacters.
    if ($from eq "" || index($to, $from) != 0) {
        $steps = align_discrepancy($from, $to);
    }
    else {
        my $joined = $from;
        while (1) {
            if (!@from) {
                # No more words to join: report what we have rather than
                # appending empty strings forever.
                $steps = align_discrepancy($joined, $to);
                last;
            }
            $joined .= " " . shift(@from);
            (my $stripped = $joined) =~ s/ //g;
            if ($stripped eq $to) {
                print CORRECTION_FILE "s/$joined/$to/g;\n";
                last;
            }
            if (index($to, $stripped) != 0) {
                $steps = align_discrepancy($joined, $to);
                last;
            }
        }
    }

    if ($steps > $safetylimit) {
        print "\nAn abnormally long \"discrepancy\" occurred.\n";
        print "Check $file1.discrepancies";
        print " and edit files as necessary.\n\n";
        exit 1;
    }
}
# Both report files are complete; close them (checking for buffered write
# errors) before the corrections file is sorted in place.
close(CORRECTION_FILE)
    or die "mkmconv: error while writing $file1.corrections: $!\n";
close(DISCR_FILE)
    or die "mkmconv: error while writing $file1.discrepancies: $!\n";

# Sort the corrections and drop duplicates.  The list form of system()
# bypasses the shell, so unusual characters in the file name are harmless.
# An unsorted corrections file is still usable, hence a warning only.
if (system('sort', '-u', '-o', "$file1.corrections", "$file1.corrections") != 0) {
    warn "mkmconv: could not sort $file1.corrections; it is left unsorted\n";
}

# Time stamp for the generated headers.  Perl's own localtime is used
# instead of running an external "Date" command, which does not exist under
# that capitalisation on case-sensitive file systems.  The trailing newline
# mirrors command output, so each header is still followed by a blank line.
$date = scalar(localtime) . "\n";

# Create the three companion files; each receives only a three-line header
# naming the command that produced it.
foreach $headerspec (
    [ "$file1.compounds", "These compound corrections generated by" ],
    [ "$file1.stems",     "These stem corrections generated by" ],
    [ "$file1.words",     "This list generated by" ],
) {
    ($headerfile, $headline) = @$headerspec;
    open(HEADER_FILE, '>', $headerfile)
        or die "mkmconv: cannot write $headerfile: $!\n";
    print HEADER_FILE "\# $headline\n";
    print HEADER_FILE "\# \"mkmconv $file1 $file2\"\n";
    print HEADER_FILE "\# $date\n";
    close(HEADER_FILE)
        or die "mkmconv: error while writing $headerfile: $!\n";
}
# Closing message.
# NOTE(review): the original text printed here (a here-document) was lost
# when the script was extracted from a web page, together with the header
# of prepare() below.  This replacement merely lists the files the script
# creates -- restore the original wording from a clean copy if available.
print "\nmkmconv has written the following files:\n";
print "  $file1.corrections\n";
print "  $file1.discrepancies\n";
print "  $file1.compounds\n";
print "  $file1.stems\n";
print "  $file1.words\n\n";

# prepare(INFILE, OUTFILE)
#
# Copies the text read from the handle INFILE to the handle OUTFILE as a
# list with one word per line.  Either argument may be a handle or (as the
# callers in this script do) the name of a handle in package main.
#
# Lines are skipped entirely when they contain a brace or backslash,
# contain none of the characters in the class ['A-z€-ÿ], start with
# digits followed by a space, or start with "%".  On the remaining lines a
# leading reference (digits, optional capital letter, spaces) is removed,
# runs of spaces are squeezed to one, semicolons are deleted, and every
# space becomes a newline.
#
# NOTE(review): "sub prepare {", the argument unpacking and the
# "while (<$infile>)" line are reconstructed from the call sites and from
# the body's use of $outfile and $_; the body itself is unchanged.
sub prepare {
    my ($infile, $outfile) = @_;
    local $_;                      # do not clobber the caller's $_
    while (<$infile>) {
        next if (/[{}\\]/) || !(/['A-z€-ÿ]/) || (/^[0-9]+ /) || (/^%/);
        s/^([0-9]+[A-Z]?\s*)([^\s].*)$/$2/;
        tr/ / /s;
        s/;//g;
        s/ /\n/g;
        print($outfile $_);
    }
}
John Smith can be contacted at jds10@cam.ac.uk