The edmconv program


Edmconv is run as the final stage of mkmconv, but can also be invoked separately (allowing the user -- i.e. me -- to break a lengthy run into several sessions). Edmconv examines each "false space" identified by mkmconv; if it is not covered by existing rules, the program asks the user to categorise the leftmost form as (a) a stem, after which it is always correct to remove a space (e.g. ratna), or (b) a word, where it will only be safe to remove the space in the specific case under consideration (e.g. bhiima paraakramaH). On the basis of the answers it receives it attempts to build scripts of stem corrections and compound corrections to add to mconv.spacing. Compound corrections are generalised as far as possible to avoid specificities of case-inflexion and word-final sandhi (the rule generated will be bhiima paraakram -> bhiimaparaakram).

Note that this program makes direct use of accented characters in the CSX encoding; unless your Web browser uses a font based on this encoding (which is vanishingly unlikely) these characters will not appear correctly. The simplest expedient is to save this file to your own disk and view it there with software that does accept a CSX-based font.



#!/usr/bin/perl
#-*-Perl-*-

#------------------------------------------------------------------#
$description =
"Syntax: edmconv [options] file

Edmconv allows the user to continue editing the correction files
generated by mkmconv. \"File\" should be the name of the Tokunaga
Mahbhrata file in CSX format which mconv compared with another,
more correct file.

-n option asks no questions but does a \"next\" on every form which
   is not already known to it.
-h option prints this help.
";
#------------------------------------------------------------------#

require 'getopts.pl';
Getopts(':nh');
if ($opt_h || $#ARGV != 0) {
    print $description;
        exit 1;
}

$file = shift;
$aborted = 0;
@stems = "";
open(STEM_FILE, "mconv.spacing");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([][)('A-z-]*) \//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@stems, "=$word=");	# "=" to anchor boundaries
    }
}
close STEM_FILE;
open(STEM_FILE, "$file.stems");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([][)('A-z-]*) \//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@stems, "=$word=");
    }
}
close STEM_FILE;

@words = "";
open(WORD_FILE, "mconv.words");
while () {
    chop;
    next if (/^\#/ || /^\s*$/);
    push(@words, "=$_=");
}
close WORD_FILE;
open(WORD_FILE, "$file.words");
while () {
    chop;
    next if (/^\#/ || /^\s*$/);
    push(@words, "=$_=");
}
close WORD_FILE;

@compounds = "";
open(COMPOUND_FILE, "mconv.spacing");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([^ \/]+ [^ \/]+)\//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@compounds, "=$word=");
    }
}
close COMPOUND_FILE;
open(COMPOUND_FILE, "$file.compounds");
while () {
    next if (/^\#/ || /^\s*$/);
    if (/\\s\)([^ \/]+ [^ \/]+)\//) {
	$word = $1;
	$word =~ s/E/ai/g;
	$word =~ s/O/au/g;
	push(@compounds, "=$word=");
    }
}
close COMPOUND_FILE;

open(CORRECTION_FILE, "$file.corrections");
open(NEW_CORRECTION_FILE, ">$file.corrections.new");
open(COMPOUND_FILE, ">>$file.compounds.new");
open(STEM_FILE, ">>$file.stems.new");
open(WORD_FILE, ">>$file.words.new");
while () {
    if (/^\#/) {
	print NEW_CORRECTION_FILE $_;
	next;
    }
    print "\nProposed correction: $_";
    while (/([^ \/]+) /g) {
	$word = $1;
	($gword = $word) =~ s/^[a']/([a'])/;
	($ggword = $gword) =~ s/([i])$/([$1y])/;
	$ggword =~ s/([u])$/([$1v])/;
	($grepword = $ggword) =~ s/([][)(])/\\$1/g;
	if (grep(/=$grepword=/, @stems)) {
	    print NEW_CORRECTION_FILE "\# > Stems       $ggword\n";
	    print "\"$word\" is already recognised as a stem.\n";
	    next;
	}
	elsif (grep(/=$grepword=/, @words)) {
	    print "\"$word\" is already recognised as a word.\n";
	    if (/($word)( [^ \/]+)[\/ ].*$word.*\/g;/) {
		$left = $gword . $2;
		($right = $left) =~ s/ /./g;
		$right =~ s/^\(\[.*\]\)/\$2/;
		$cpd = "s/(\\.|\\s)$left\/\$1$right\/g;";
	    }
	    print NEW_CORRECTION_FILE "\# > Compounds   $cpd\n";
	    $strip = &strip_case_endings($cpd);
	    if ($strip =~ /\\s\)([^ \/]+ [^ \/]+)\//) {
		$compound = $1;
		($grepcompound = $compound ) =~ s/([][)(])/\\$1/g;
		unless (grep(/=$grepcompound=/, @compounds)) {
		    push(@compounds, "=$compound=");
		    $strip =~ s/ai/E/g;
		    $strip =~ s/au/O/g;
		    print COMPOUND_FILE "$strip\n";
		}
	    }
	    next;
	}
	if ($opt_n) {
	    print NEW_CORRECTION_FILE "$_";
	    next;
	}
	else {
	    do {
		print "*** Is \"$word\" a (S)tem or a (W)ord,\n";
		print "*** or would you rather (C)omment it out,\n";
		print "*** move to the (N)ext case or (Q)uit? ";
		$c = ;
		print "\n";
		if ($c =~ /^[sS]/) {
		    print NEW_CORRECTION_FILE "\# > Stems       $ggword\n";
		    push(@stems, "=$ggword=");
		    $ggword =~ s/ai/E/g;
		    $ggword =~ s/au/O/g;
		    print STEM_FILE "s/(\\.|\\s)$ggword /\$1";
		    $ggword =~ s/\(\[.*\]\)/\$2/g;
		    $ggword =~ s/(\$2.*)\$2/$1\$3/g;
		    print STEM_FILE "$ggword./g;\n";
		}
		elsif ($c =~ /^[wW]/) {
		    if (/($word)( [^ \/]+)[\/ ].*$word.*\/g;/) {
			$left = $gword . $2;
			($right = $left) =~ s/ /./g;
			$right =~ s/^\(\[.*\]\)/\$2/;
			$cpd = "s/(\\.|\\s)$left\/\$1$right\/g;";
		    }
		    print NEW_CORRECTION_FILE "\# > Compounds   $cpd\n";
		    push(@words, "=$ggword=");
		    print WORD_FILE "$ggword\n";
		    $strip = &strip_case_endings($cpd);
		    if ($strip =~ /\\s\)([^ \/]+ [^ \/]+)\//) {
			$compound = $1;
			($grepcompound = $compound ) =~ s/([][)(])/\\$1/g;
			unless (grep(/=$grepcompound=/, @compounds)) {
			    push(@compounds, "=$compound=");
			    $strip =~ s/ai/E/g;
			    $strip =~ s/au/O/g;
			    print COMPOUND_FILE "$strip\n";
			}
		    }
		}
		elsif ($c =~ /^[cC]/) {
		    print NEW_CORRECTION_FILE "\# Commented out $_";
		}
		elsif ($c =~ /^[nN]/) {
		    print NEW_CORRECTION_FILE "$_";
		    next;
		}
		elsif ($c =~ /^[qQ]/) {
		    print NEW_CORRECTION_FILE "$_";
		    while () {
			print NEW_CORRECTION_FILE "$_";
		    }
		    $aborted = 1;
		}
	    }
	    until ($c =~ /^[nNqQcCwWsS]/);
	}    
    }
}
print COMPOUND_FILE "\n";
print STEM_FILE "\n";
print WORD_FILE "\n";

close CORRECTION_FILE;
close NEW_CORRECTION_FILE;
close COMPOUND_FILE;
close STEM_FILE;
close WORD_FILE;

do {
    print "\nSave changes? (Y/N) ";
    $c = ;
    print "\n";
    if ($c =~ /^[yY]/) {
	rename("$file.corrections.new", "$file.corrections");
	system "cat $file.compounds.new >>$file.compounds";
	system "cat $file.stems.new >>$file.stems";
	system "cat $file.words.new >>$file.words";
	unlink ("$file.compounds.new", "$file.stems.new", "$file.words.new");
    }
    elsif ($c =~ /^[nN]/) {
	$aborted = 1;
	unlink ("$file.corrections.new", "$file.compounds.new", "$file.stems.new", "$file.words.new");
    }
}
until ($c =~ /^[yYnN]/);

if ($aborted) {
    print "Use \"edmconv $file\" to resume editing. Do not use \"mkmconv\"\n";
    print "unless you want to scrap what you have done and start again.\n\n";
}
else {
    do {
	print "\nAdd these corrections to the master files? (Y/N) ";
	$c = ;
	print "\n\n";
	if ($c =~ /^[yY]/) {
	    system "cat $file.stems >>mconv.spacing";
	    system "cat $file.compounds >>mconv.spacing";
	    open(SPACING_FILE, ">>mconv.spacing");
	    print SPACING_FILE "\n#\ns/\\.//g unless \$unbatched;";
	    print SPACING_FILE " #          END OF BATCH\n#\n\n\n";
	    close SPACING_FILE;
	    system "cat $file.words >>mconv.words";
	}
    }
    until ($c =~ /^[yYnN]/);
}

sub strip_case_endings {
    # Generalise compound corrections by removing inflexional endings
    # where this will not lead to ambiguity. Watch out for cases such
    # as the apparent instrumental (actually vocative) bhmasena.

    local ($line) = $_[0];
    return $line if $line =~ s/[aiu][m]\//\//g;
    return $line if $line =~ s/([^s])ena\//$1\//g;
    return $line if $line =~ s/ea\//\//g;
    return $line if $line =~ s/[iu][n]\//\//g;
    return $line if $line =~ s/([^dm])ay\//$1\//g;
    return $line if $line =~ s/([^km])ya\//$1\//g;
    return $line if $line =~ s/yai\//\//g;
    return $line if $line =~ s/a[yv]e\//\//g;
    return $line if $line =~ s/[yv]ai\//\//g;
    return $line if $line =~ s/([^p])[tdcj]\//$1\//g;
    return $line if $line =~ s/y[s]\//\//g;
    return $line if $line =~ s/[eo][sr]\//\//g;
    return $line if $line =~ s/[iu][n](a[s]|o)\//\//g;
    return $line if $line =~ s/asya\//\//g;
    return $line if $line =~ s/au\//\//g;
    return $line if $line =~ s/y[m]\//\//g;
    return $line if $line =~ s/y[m]\//\//g;
    return $line if $line =~ s/[aiuo]bhy[m]\//\//g;
    return $line if $line =~ s/[][srn]\//\//g;
    return $line if $line =~ s/[][n]i\//\//g;
    return $line if $line =~ s/ai[sr]\//\//g;
    return $line if $line =~ s/[iuo]bhi[sr]\//\//g;
    return $line if $line =~ s/[iueo]bhy(a([s]?)|o)\//\//g;
    return $line if $line =~ s/[i][n][m]\//\//g;
    return $line if $line =~ s/[m]\//\//g;
    return $line if $line =~ s/s[uv]\//\//g;
    return $line if $line =~ s/[eiu][uv]\//\//g;
    return $line if $line =~ s/[aiueo]\//\//g;
    return $line if $line =~ s/a[s]\//\//g;
    return $line if $line =~ s/[iu][sr]\//\//g;
    return $line;
}


Back to home page