#!/usr/bin/perl
#
# htmldiff - present a diff marked version of two html documents
#
# Copyright (c) 1998, 1999 MACS, Inc. All rights reserved.
#
# See http://www.themacs.com for more information.
#
# usage: htmldiff [[-c] [-o] oldversion newversion [output]]
#
# -c - disable metahtml comment processing
# -o - disable outputting of old text
# oldversion - the previous version of the document
# newversion - the newer version of the document
# output - a filename to place the output in. If omitted, the output goes to
#          standard output.
#
# if invoked with no options or arguments, operates as a CGI script. It then
# takes the following parameters:
#
# oldfile - the URL of the original file
# newfile - the URL of the new file
# mhtml - a flag to indicate whether it should be aware of MetaHTML comments.
#
# requires GNU diff utility
# also requires the perl modules Getopt::Std
#
# NOTE: The markup created by htmldiff may not validate against the HTML 4.0
# DTD. This is because the algorithm is realtively simple, and there are
# places in the markup content model where the span element is not allowed.
# Htmldiff is NOT aware of these places.
#
# $Source: /sources/public/2002/ws/desc/tools/htmldiff,v $
# $Revision: 1.1 $
#
# $Log: htmldiff,v $
# Revision 1.1  2004/03/26 13:44:58  hugo
# htmldiff tool
#
# Revision 1.1  2000/07/12 12:20:04  ahby
# Updated to remove empty spans - this fixes validation problems under
# strict.
#
# Revision 1.11  1999/12/08 19:46:45  ahby
# Fixed validation errors introduced by placing markup where it didn't
# belong.
#
# Revision 1.10  1999/10/18 13:42:58  ahby
# Added -o to the usage message.
#
# Revision 1.9  1999/05/04 12:29:11  ahby
# Added an option to turn off the display of old text.
#
# Revision 1.8  1999/04/09 14:37:27  ahby
# Fixed a perl syntax error.
#
# Revision 1.7  1999/04/09 14:35:49  ahby
# Added reference to MACS homepage.
#
# Revision 1.6  1999/04/09 14:35:09  ahby
# Added comment about validity of generated markup.
#
# Revision 1.5  1999/02/22 22:17:54  ahby
# Changed to use stylesheets.
# Changed to rely upon span.
# Changed to work around content model problems.
#
# Revision 1.4  1999/02/08 02:32:22  ahby
# Added a copyright statement.
#
# Revision 1.3  1999/02/08 02:30:40  ahby
# Added header processing.
#
# Revision 1.2  1998/12/10 17:31:31  ahby
# Fixed to escape less-thans in change blocks and to not permit change
# markup within specific elements (like TITLE).
#
# Revision 1.1  1998/11/26 00:09:22  ahby
# Initial revision
#
#

use Getopt::Std;

sub usage {
	print STDERR "htmldiff [-c] [-o] oldversion newversion [output]\n";
	exit;
}


# markit - diff-mark the streams
#
# markit(file1, file2)
#
# markit relies upon GNUdiff to mark up the text.
#
# The markup is encoded using special control sequences:
#
#   a block wrapped in control-a is deleted text
#   a block wrapped in control-b is old text
#   a block wrapped in control-c is new text
#
# The main processing loop attempts to wrap the text blocks in appropriate
# SPANs based upon the type of text that it is.
#
# When the loop encounters a < in the text, it stops the span. Then it outputs
# the element that is defined, then it restarts the span.

sub markit {
	my $retval = "";
	my($file1) = shift;
	my($file2) = shift;
	my $old="<span class=\\\"diff-old-a\\\">deleted text: </span>%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'";
	my $new="%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
	my $unchanged="%=";
	my $changed="%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
	if ($opt_o) {
		$old = "";
		$changed = "%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
	}
#	my $old="%c'\002'<font color=\\\"purple\\\" size=\\\"-2\\\">deleted text:</font><s>%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'</s>%c'\012'%c'\002'";
#	my $new="%c'\002'<font color=\\\"purple\\\"><u>%c'\012'%c'\002'%>%c'\002'</u></font>%c'\002'%c'\012'";
#	my $unchanged="%=";
#	my $changed="%c'\002'<s>%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'</s><font color=\\\"purple\\\"><u>%c'\002'%c'\012'%>%c'\012'%c'\002'</u></font>%c'\002'%c'\012'";

	my @span;
	$span[0]="</span>";
	$span[1]="<span class=\"diff-old\">";
	$span[2]="<span class=\"diff-old\">";
	$span[3]="<span class=\"diff-new\">";

	open(FILE, qq(diff -d --old-group-format="$old" --new-group-format="$new" --changed-group-format="$changed" --unchanged-group-format="$unchanged" $file1 $file2 |)) || die("Diff failed: $!");
#	system (qq(diff --old-group-format="$old" --new-group-format="$new" --changed-group-format="$changed" --unchanged-group-format="$unchanged" $file1 $file2 > /tmp/output));

	my $state = 0;
	my $inblock = 0;
	my $temp = "";
	while (<FILE>) {
		if ($state == 0) {
			if (m/^\001/) {
				$state = 1;
				s/^/$span[1]/;
			} elsif (m/^\002/) {
				$state = 2;
				s/^/$span[2]/;
			} elsif (m/^\003/) {
				$state = 3;
				s/^/$span[3]/;
			}
		} else {
			if (($state == 1) || ($state ==2)) {
				s/\</&lt;/g;
			}
			if (m/^[\001|\002|\003]/) {
				s/$/\<\/span>/;
				$state = 0;
			} elsif (m/^\</) {
				s/^/$span[0]/;
				s/$/$span[$state]/;
			}
		}

		s/\001//g;
		s/\002//g;
		s/\003//g;
		$retval .= $_;
	}
	close FILE;
	$retval =~ s/$span[1]\n*$span[0]//g;
	$retval =~ s/$span[2]\n*$span[0]//g;
	$retval =~ s/$span[3]\n*$span[0]//g;
	$retval =~ s/$span[1]\n*$//g;
	$retval =~ s/$span[2]\n*$//g;
	$retval =~ s/$span[3]\n*$//g;
	return $retval;
}

sub splitit {
	my $filename = shift;
	my $headertmp = shift;
	my $inheader=0;
	my $preformatted=0;
	my $inelement=0;
	my $retval = "";
	my $styles = "<style type='text/css'>
.diff-old-a {
  font-size: smaller;
  color: red;
}

.diff-old {
  color: red;
  text-decoration: line-through;
}

.diff-new {
	color: green;
	text-decoration: underline;
}
</style>
";
	
	if ($stripheader) {
		open(HEADER, ">$headertmp");
	}

	my $incomment = 0;
	my $inhead = 1;
	open(FILE, $filename) || die("File $filename cannot be opened: $!");
	while (<FILE>) {
		if ($inhead == 1) {
			if (m/\<\/head/i) {
				print HEADER $styles;
			}
			if (m/\<body/i) {
				$inhead = 0;
				print HEADER;
				close HEADER;
			} else {
				print HEADER;
			}
		} else {
			if ($incomment) {
				if (m;-->;) {
					$incomment = 0;
					s/.*-->//;
				} else {
					next;
				}
			}
			if (m;<!-- ;) {
				$incomment = 1;
				s/<!-- .*//;
			}
			if (m/\<pre/i) {
				$preformatted = 1;
			}
			if (m/\<\/pre\>/i) {
				$preformatted = 0;
			}
			if ($preformatted) {
				$retval .= $_;
			} elsif ($mhtmlcomments && /^;;;/) {
				$retval .= $_;
			} else {
				my @list = split(' ');
				foreach $element (@list) {
					if ($element =~ m/\<H[1-6]/i) {
#						$inheader = 1;
					}
					if ($inheader == 0) {
						$element =~ s/</\n</g;
						$element =~ s/^\n//;
						$element =~ s/>/>\n/g;
						$element =~ s/\n$//;
						$element =~ s/>\n([.,:!]+)/>\1/g;
					}
					if ($element =~ m/\<\/H[1-6]\>/i) {
						$inheader = 0;
					}
					$retval .= "$element";
					$inelement += ($element =~ s/</&lt;/g);
					$inelement -= ($element =~ s/>/&gt;/g);
					if ($inelement < 0) {
						$inelement = 0;
					}
					if (($inelement == 0) && ($inheader == 0)) {
						$retval .= "\n";
					} else {
						$retval .= " ";
					}
				}
			undef @list;
			}
		}
	}
	$retval .= "\n";
	close FILE;
	return $retval;
}

$mhtmlcomments = 1;

sub cli {
	getopts("co") || usage();

	if ($opt_c) {$mhtmlcomments = 0;}

	if (@ARGV < 2) { usage(); }

	$file1 = $ARGV[0];
	$file2 = $ARGV[1];
	$file3 = $ARGV[2];

	$tmp = splitit($file1, $headertmp1);
	open (FILE, ">$tmp1");
	print FILE $tmp;
	close FILE;

	$tmp = splitit($file2, $headertmp2);
	open (FILE, ">$tmp2");
	print FILE $tmp;
	close FILE;

	$output = "";

	if ($stripheader) {
		open(FILE, $headertmp2);
		while (<FILE>) {
			$output .= $_;
		}
		close(FILE);
	}

	$output .= markit($tmp1, $tmp2);

	if ($file3) {
		open(FILE, ">$file3");
		print FILE $output;
		close FILE;
	} else {
		print $output;
	}
}

sub cgi {
#	use LWP::UserAgent;
#	use CGI;

	my $query = new CGI;
	my $url1 = $query->param("oldfile");
	my $url2 = $query->param("newfile");
	my $mhtml = $query->param("mhtml");

	my $file1 = "/tmp/htdcgi1.$$";
	my $file2 = "/tmp/htdcgi2.$$";

	my $ua = new LWP::UserAgent;
	$ua->agent("MACS, Inc. HTMLdiff/0.9 " . $ua->agent);

	# Create a request

	my $req1 = new HTTP::Request GET => $url1;

	my $res1 = $ua->request($req1, $file1);
	if ($res1->is_error) {
		print $res1->error_as_HTML();
		print "<p>The URL $url1 could not be found.  Please check it and try again.</p>";
		return;
	}

	my $req2 = new HTTP::Request GET => $url2;

	my $res2 = $ua->request($req2, $file2);
	if ($res2->is_error) {
		print $res2->error_as_HTML();
		print "<p>The URL $url2 could not be found.  Please check it and try again.</p>";
		return;
	}

	$split1 = splitit($file1, $headertmp1);
	open (FILE, ">$tmp1");
	print FILE $split1;
	close FILE;

	$split2 = splitit($file2, $headertmp2);
	open (FILE, ">$tmp2");
	print FILE $split2;
	close FILE;

	$output = "";

	if ($stripheader) {
		open(FILE, $headertmp2);
		while (<FILE>) {
			$output .= $_;
		}
		close(FILE);
	}

	$output .= markit($tmp1, $tmp2);

	my $base=$res2->base;

	if ($base !~ /\/$/) {
		$base =~ s/[^\/]*$//;
	}

	if ( $output !~ /<base/i ) {
		$output =~ s/<head>/<head>\n<base href="$base">/i ||
	  	$output =~ s/<html>/<html>\n<base href="$base">/i ;
	}

	print $query->header(-type=>'text/html',-nph=>1);
	print $output;

	unlink $file1;
	unlink $file2;

}

$tmp1="/tmp/htdtmp1.$$";
$headertmp1="/tmp/htdhtmp1.$$";
$tmp2="/tmp/htdtmp2.$$";
$headertmp2="/tmp/htdhtmp2.$$";
$stripheader = 1;

if (@ARGV == 0) {
	cgi();		# if no arguments, we must be operating as a cgi script
} else {
	cli();		# if there are arguments, then we are operating as a CLI
}

unlink $tmp1;
unlink $headertmp1;
unlink $tmp2;
unlink $headertmp2;
