#! /usr/bin/perl

# Fix ICL1900 style program names that ocropus is reluctant to recognise.

use strict;
use warnings;

# Join program names that were split into two adjacent words.
#
# Takes one line of input containing entries of the form
#
#	(word 2502 3975 2643 4032 "#XJ") (word 2649 3975 2751 4032 "EC")
#
# and returns the line with every qualifying pair replaced by a single
# word entry, e.g. (word 2502 3975 2751 4032 "#XJEC").  The four numbers
# are used as x1 y1 x2 y2 of the word's box.  Each replacement is logged
# to STDERR as "old => new".
#
# A pair qualifies when:
#   * the first word is '#' followed by 1-3 characters (a letter, then
#     letters/digits);
#   * the first word plus the leading letters/digits of the second word
#     total exactly 5 characters, like #XFAT;
#   * the horizontal gap between the words is no more than the average
#     character width of the first word;
#   * the second word does not start beyond the first word's y2.
sub _join_split_program_names {
	my ($line) = @_;

	# Scalar-context m//g resumes from pos($line) on every iteration, so
	# each candidate pair is examined once, left to right.
	while ($line =~
		m/(\(word\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s
			"(\#[A-Z][A-Z0-9]{0,2})"\)\s
			\(word\s(\d+)\s(\d+)\s(\d+)\s(\d+)\s
			"(([A-Z0-9]+)[^"]*)"\))/xg)
	{
		# $bs is the whole second string, $bp only its leading
		# letters/digits (trailing junk such as punctuation is kept in
		# the output but not counted towards the name length).
		my ($match, $ax1, $ay1, $ax2, $ay2, $ap,
		    $bx1, $by1, $bx2, $by2, $bs, $bp) =
			($1, $2, $3, $4, $5, $6, $7, $8, $9, ${10}, ${11}, ${12});
		my $start = $-[1];

		# Must have a program name like #XFAT.
		# Second word must be on same line, after first word,
		# but less than one char of space between them.
		#
		# On rejection simply carry on: pos($line) is already at the
		# end of this match, so nothing before it is rescanned.
		next if length($ap) + length($bp) != 5;
		next if ($bx1 - $ax2) > (($ax2 - $ax1) / length $ap);
		next if $by1 > $ay2;

		# Grow the first word's box so that it covers both words.
		$ax2 = $bx2;
		$ay1 = $by1 if $by1 < $ay1;
		$ay2 = $by2 if $by2 > $ay2;

		my $new = "(word $ax1 $ay1 $ax2 $ay2 \"$ap$bs\")";

		print STDERR $match, " => ", $new, "\n";

		# Splice the replacement in at the match position.  Modifying
		# the string resets pos(), so put it back just after the new
		# text to continue scanning from there.
		substr($line, $start, length $match, $new);
		pos($line) = $start + length $new;
	}

	return $line;
}

# Filter: read lines from the files named on the command line (or STDIN)
# and write each one, fixed up, to STDOUT.
while (my $line = <>) {
	print _join_split_program_names($line);
}
