The edmconv program

Edmconv is run as the final stage of mkmconv, but can also be invoked separately (allowing the user -- i.e. me -- to break a lengthy run into several sessions). Edmconv examines each "false space" identified by mkmconv; if it is not covered by existing rules, the program asks the user to categorise the leftmost form as (a) a stem, after which it is always correct to remove a space (e.g. ratna), or (b) a word, where it will only be safe to remove the space in the specific case under consideration (e.g. bhiima paraakramaH). On the basis of the answers it receives it attempts to build scripts of stem corrections and compound corrections to add to mconv.spacing. Compound corrections are generalised as far as possible to avoid specificities of case-inflexion and word-final sandhi (the rule generated will be bhiima paraakram -> bhiimaparaakram).

Note that the original program makes direct use of accented characters in the CSX encoding; here it has been converted to use Unicode equivalents to ensure readability on modern computers. This means that downloading and attempting to run the program as it stands is unlikely to work.


#!/usr/bin/perl
#-*-Perl-*-

#------------------------------------------------------------------#
$description =
"Syntax: edmconv [options] file

Edmconv allows the user to continue editing the correction files
generated by mkmconv. \"File\" should be the name of the Tokunaga
Mahābhārata file in CSX format which mconv compared with another,
more correct file.

-n option asks no questions but does a \"next\" on every form which
   is not already known to it.
-h option prints this help.
";
#------------------------------------------------------------------#

require 'getopts.pl';
Getopts(':nh');
if ($opt_h || $#ARGV != 0) {
    print $description;
        exit 1;
}

$file = shift;
$aborted = 0;
@stems = "";
open(STEM_FILE, "mconv.spacing");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([][)('A-zÇ-Ḥ]*) \//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@stems, "=$word=");	# "=" to anchor boundaries
    }
}
close STEM_FILE;
open(STEM_FILE, "$file.stems");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([][)('A-zÇ-Ḥ]*) \//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@stems, "=$word=");
    }
}
close STEM_FILE;

@words = "";
open(WORD_FILE, "mconv.words");
while () {
    chop;
    next if (/^\#/ || /^\s*$/);
    push(@words, "=$_=");
}
close WORD_FILE;
open(WORD_FILE, "$file.words");
while () {
    chop;
    next if (/^\#/ || /^\s*$/);
    push(@words, "=$_=");
}
close WORD_FILE;

@compounds = "";
open(COMPOUND_FILE, "mconv.spacing");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([^ \/]+ [^ \/]+)\//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@compounds, "=$word=");
    }
}
close COMPOUND_FILE;
open(COMPOUND_FILE, "$file.compounds");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([^ \/]+ [^ \/]+)\//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@compounds, "=$word=");
    }
}
close COMPOUND_FILE;

open(CORRECTION_FILE, "$file.corrections");
open(NEW_CORRECTION_FILE, ">$file.corrections.new");
open(COMPOUND_FILE, ">>$file.compounds.new");
open(STEM_FILE, ">>$file.stems.new");
open(WORD_FILE, ">>$file.words.new");
while () {
    if (/^\#/) {
	print NEW_CORRECTION_FILE $_;
	next;
    }
    print "\nProposed correction: $_";
    while (/([^ \/]+) /g) {
	$word = $1;
	($gword = $word) =~ s/^[a']/([a'])/;
	($ggword = $gword) =~ s/([iī])$/([$1y])/;
	$ggword =~ s/([uū])$/([$1v])/;
	($grepword = $ggword) =~ s/([][)(])/\\$1/g;
	if (grep(/=$grepword=/, @stems)) {
	    print NEW_CORRECTION_FILE "\# > Stems       $ggword\n";
	    print "\"$word\" is already recognised as a stem.\n";
	    next;
	}
	elsif (grep(/=$grepword=/, @words)) {
	    print "\"$word\" is already recognised as a word.\n";
	    if (/($word)( [^ \/]+)[\/ ].*$word.*\/g;/) {
		$left = $gword . $2;
		($right = $left) =~ s/ /./g;
		$right =~ s/^\(\[.*\]\)/\$2/;
		$cpd = "s/(\\.|\\s)$left\/\$1$right\/g;";
	    }
	    print NEW_CORRECTION_FILE "\# > Compounds   $cpd\n";
	    $strip = &strip_case_endings($cpd);
	    if ($strip =~ /\\s\)([^ \/]+ [^ \/]+)\//) {
		$compound = $1;
		($grepcompound = $compound ) =~ s/([][)(])/\\$1/g;
		unless (grep(/=$grepcompound=/, @compounds)) {
		    push(@compounds, "=$compound=");
		    $strip =~ s/ai/E/g;
		    $strip =~ s/au/O/g;
		    print COMPOUND_FILE "$strip\n";
		}
	    }
	    next;
	}
	if ($opt_n) {
	    print NEW_CORRECTION_FILE "$_";
	    next;
	}
	else {
	    do {
		print "*** Is \"$word\" a (S)tem or a (W)ord,\n";
		print "*** or would you rather (C)omment it out,\n";
		print "*** move to the (N)ext case or (Q)uit? ";
		$c = ;
		print "\n";
		if ($c =~ /^[sS]/) {
		    print NEW_CORRECTION_FILE "\# > Stems       $ggword\n";
		    push(@stems, "=$ggword=");
		    $ggword =~ s/ai/E/g;
		    $ggword =~ s/au/O/g;
		    print STEM_FILE "s/(\\.|\\s)$ggword /\$1";
		    $ggword =~ s/\(\[.*\]\)/\$2/g;
		    $ggword =~ s/(\$2.*)\$2/$1\$3/g;
		    print STEM_FILE "$ggword./g;\n";
		}
		elsif ($c =~ /^[wW]/) {
		    if (/($word)( [^ \/]+)[\/ ].*$word.*\/g;/) {
			$left = $gword . $2;
			($right = $left) =~ s/ /./g;
			$right =~ s/^\(\[.*\]\)/\$2/;
			$cpd = "s/(\\.|\\s)$left\/\$1$right\/g;";
		    }
		    print NEW_CORRECTION_FILE "\# > Compounds   $cpd\n";
		    push(@words, "=$ggword=");
		    print WORD_FILE "$ggword\n";
		    $strip = &strip_case_endings($cpd);
		    if ($strip =~ /\\s\)([^ \/]+ [^ \/]+)\//) {
			$compound = $1;
			($grepcompound = $compound ) =~ s/([][)(])/\\$1/g;
			unless (grep(/=$grepcompound=/, @compounds)) {
			    push(@compounds, "=$compound=");
			    $strip =~ s/ai/E/g;
			    $strip =~ s/au/O/g;
			    print COMPOUND_FILE "$strip\n";
			}
		    }
		}
		elsif ($c =~ /^[cC]/) {
		    print NEW_CORRECTION_FILE "\# Commented out $_";
		}
		elsif ($c =~ /^[nN]/) {
		    print NEW_CORRECTION_FILE "$_";
		    next;
		}
		elsif ($c =~ /^[qQ]/) {
		    print NEW_CORRECTION_FILE "$_";
		    while () {
			print NEW_CORRECTION_FILE "$_";
		    }
		    $aborted = 1;
		}
	    }
	    until ($c =~ /^[nNqQcCwWsS]/);
	}    
    }
}
print COMPOUND_FILE "\n";
print STEM_FILE "\n";
print WORD_FILE "\n";

close CORRECTION_FILE;
close NEW_CORRECTION_FILE;
close COMPOUND_FILE;
close STEM_FILE;
close WORD_FILE;

do {
    print "\nSave changes? (Y/N) ";
    $c = ;
    print "\n";
    if ($c =~ /^[yY]/) {
	rename("$file.corrections.new", "$file.corrections");
	system "cat $file.compounds.new >>$file.compounds";
	system "cat $file.stems.new >>$file.stems";
	system "cat $file.words.new >>$file.words";
	unlink ("$file.compounds.new", "$file.stems.new", "$file.words.new");
    }
    elsif ($c =~ /^[nN]/) {
	$aborted = 1;
	unlink ("$file.corrections.new", "$file.compounds.new", "$file.stems.new", "$file.words.new");
    }
}
until ($c =~ /^[yYnN]/);

if ($aborted) {
    print "Use \"edmconv $file\" to resume editing. Do not use \"mkmconv\"\n";
    print "unless you want to scrap what you have done and start again.\n\n";
}
else {
    do {
	print "\nAdd these corrections to the master files? (Y/N) ";
	$c = ;
	print "\n\n";
	if ($c =~ /^[yY]/) {
	    system "cat $file.stems >>mconv.spacing";
	    system "cat $file.compounds >>mconv.spacing";
	    open(SPACING_FILE, ">>mconv.spacing");
	    print SPACING_FILE "\n#\ns/\\.//g unless \$unbatched;";
	    print SPACING_FILE " #          END OF BATCH\n#\n\n\n";
	    close SPACING_FILE;
	    system "cat $file.words >>mconv.words";
	}
    }
    until ($c =~ /^[yYnN]/);
}

sub strip_case_endings {
    # Generalise compound corrections by removing inflexional endings
    # where this will not lead to ambiguity. Watch out for cases such
    # as the apparent instrumental (actually vocative) bhīmasena.

    local ($line) = $_[0];
    return $line if $line =~ s/[aiu][ṃm]\//\//g;
    return $line if $line =~ s/([^s])ena\//$1\//g;
    return $line if $line =~ s/eṇa\//\//g;
    return $line if $line =~ s/[iu][ṇn]ā\//\//g;
    return $line if $line =~ s/([^dm])ayā\//$1\//g;
    return $line if $line =~ s/([^km])āya\//$1\//g;
    return $line if $line =~ s/āyai\//\//g;
    return $line if $line =~ s/a[yv]e\//\//g;
    return $line if $line =~ s/[yv]ai\//\//g;
    return $line if $line =~ s/([^p])ā[tdcj]\//$1\//g;
    return $line if $line =~ s/āyā[ḥśṣs]\//\//g;
    return $line if $line =~ s/[eo][ḥśṣsr]\//\//g;
    return $line if $line =~ s/[iu][ṇn](a[ḥśṣs]|o)\//\//g;
    return $line if $line =~ s/asya\//\//g;
    return $line if $line =~ s/au\//\//g;
    return $line if $line =~ s/āyā[ṃm]\//\//g;
    return $line if $line =~ s/yā[ṃm]\//\//g;
    return $line if $line =~ s/[aāiīuūṛo]bhyā[ṃm]\//\//g;
    return $line if $line =~ s/[āīūṝ][ḥśṣsrn]\//\//g;
    return $line if $line =~ s/[āīūṝ][ṇn]i\//\//g;
    return $line if $line =~ s/ai[ḥśṣsr]\//\//g;
    return $line if $line =~ s/[āiīuūṛo]bhi[ḥśṣsr]\//\//g;
    return $line if $line =~ s/[āiīuūṛeo]bhy(a([ḥśṣs]?)|o)\//\//g;
    return $line if $line =~ s/[āiīūṝ][ṇn]ā[ṃm]\//\//g;
    return $line if $line =~ s/ā[ṃm]\//\//g;
    return $line if $line =~ s/ās[uv]\//\//g;
    return $line if $line =~ s/[eiīuūṛ]ṣ[uv]\//\//g;
    return $line if $line =~ s/[aāiīuūeo]\//\//g;
    return $line if $line =~ s/a[ḥśṣs]\//\//g;
    return $line if $line =~ s/[iu][ḥśṣsr]\//\//g;
    return $line;
}





Back to home page