/*
 * $Id: HtmlParser.java,v 1.1.1.1 2002/09/30 15:08:52 smartine Exp $
 * Copyright (C) 1999-2000 David Brownell
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package xml.vendor;

import xml.*;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Locale;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTMLEditorKit.ParserCallback;
import javax.swing.text.html.HTML.Tag;
import javax.swing.text.html.parser.ParserDelegator;

import org.xml.sax.*;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.LocatorImpl;




// $Id: HtmlParser.java,v 1.1.1.1 2002/09/30 15:08:52 smartine Exp $

/**
 * <p> This is a wrapper around the <b>javax.swing.text.html.parser.*</b> HTML
 * parser, implementing the <b>SAX2</b> interfaces.
 * On valid HTML, much invalid or malformed HTML, and compatible XHTML,
 * it produces a stream of SAX parsing events corresponding to the parse
 * of the corresponding (well formed and often more valid) XHTML document.
 * Element and attribute names are uniformly presented in lower case, and
 * belonging to the XHTML namespace.  </p>
 *
 * <p> Only one type of lexical event is reported:  comments are visible.
 * This is generally used with HTML to access inlined CSS comments which are
 * protected against browsers old enough that they don't understand what the
 * "style" tag means.  Expansions of built-in entities (such as "&amp;nbsp;")
 * or character references are accordingly not visible.</p>
 *
 * <p> This parser does not support dynamic modification of the input stream
 * to the parser, needed to fully support <em>&lt;script&gt;</em> tags which
 * use the DOM to splice new page content into documents as they load. </p>
 *
 * <p> Current (Swing 1.1.1) HTML parsing issues <em>without included
 * workarounds</em> include:</p> <ul>
 *
 *	<li> Not all HTML 4.0 features are recognized or supported; for
 *	example, not all of the table model is parsed. </li>
 *
 *	<li> Some XHTML constructs are handled poorly; for example, XML
 *	declarations at the beginning of a document cause multiple "html"
 *	elements to be reported, processing instructions are treated like
 *	regular text except that their leading '&lt;' is removed, and
 *	attributes with colonized names such as <em>myNS:label</em> are
 *	treated as two separate attributes.  (Only the last of these is not
 *	noted as a compatibility issue in the XHTML specification.) </li>
 *
 *	<li> The content of a <em>&lt;noframes&gt;</em> element is treated as
 *	text, and elements within it aren't parsed.</li>
 *
 *	<li> HTML <em>&lt;link&gt;</em> elements are handled wrong; in some
 *	cases their attributes are coupled to a subsequent element.</li>
 *
 *	<li> (Swing 1.1, fixed in 1.1.1) HTML <em>&lt;style&gt;</em> elements
 *      are handled wrong; their
 *	contents aren't reported when wrapped in comments, and as with the
 *	<em>link</em> elements, their attributes may be shifted. </li>
 *
 *	<li> Some elements (often non-HTML names) may be given an "endtag"
 *	attribute (value true) that is not found in the input document.
 *	This is often coupled with the element being split into two, separated
 *	by the content which should be inside that element.</li>
 *
 *	<li> An issue with non-ASCII characters is that even for a JDK
 *	that handles an encoding, the standard IANA character set identifiers
 *	may not be recognized by the JDK.  Such bugs are slowly getting
 *	fixed; as a rule, use at least JDK 1.1.7 to ensure you have access
 *	to encoding names such as "ASCII" and "US-ASCII" (for the widely
 *	used seven bit standard character set and encoding).  </li>
 *
 *	<li> Error handling is a sore spot with HTML.  This parser doesn't do
 *	a very consistent job of categorizing errors; most are reported as
 *	warnings.  Diagnostics aren't very useful.  Swing's HTML parser does
 *	not accept as much malformed HTML as many widely used web browsers,
 *	and may not interpret such broken HTML as expected.  </li>
 *
 *	</ul>
 *
 * <P> This assigns all elements and attributes, except those known to be
 * in the XML namespace, to the XHTML namespace.  It also reports a default
 * prefix mapping for that namespace.  The overall result is intended to be
 * that this produces XHTML which is as valid as the input HTML, given an
 * appropriate doctype declaration.  Achievement of that goal may be limited
 * by problems in the Swing HTML parser, as noted above.</P>
 *
 * <P> This driver adds ignorable newlines at various locations where they
 * won't be confused with HTML content.  These may of course be ignored.  If
 * they are not ignored, they make the output of this parser be more easily
 * printed, since otherwise HTML files of all sizes will appear without line
 * breaks of any kind, and viewing the output of this parser will cause
 * trouble for most text editors.</P>
 *
 * <p> There are also various undocumented, or poorly documented, behaviors
 * of the Swing parser.  It adds an illegal <em>&lt;__EndOfLineTag__&gt;</em>
 * element after the root element, for example.  These are ignored as well as
 * possible, given the all but complete lack of specification for the Swing
 * parser callbacks.</p>
 *
 * @author David Brownell
 * @version $Date: 2002/09/30 15:08:52 $
 */
public final class HtmlParser implements XMLReader
{
    // Stuff used internally to route events correctly
    private final Adapter		adapter = new Adapter ();
    private final DefaultHandler	defaultHandler = new DefaultHandler ();

    // SAX handlers; never null, "unset" is represented by defaultHandler
    private LexicalHandler	lexicalHandler = defaultHandler;
    private ContentHandler	contentHandler = defaultHandler;
    private ErrorHandler	errHandler = defaultHandler;
    private DTDHandler		dtdHandler = defaultHandler;
    private EntityResolver	resolver = defaultHandler;

    // debugging
    private static final boolean	debug = false;
    private static final boolean	debugErrs = false;


    // report using the least restrictive DTD
    private static final String	xhtmlPublic
	    = "-//W3C//DTD XHTML 1.0 Transitional//EN";
    private static final String	xhtmlSystem
	    = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";

    // thankfully, just one namespace (plus the reserved "xml" one)
    private static final String	xhtmlNS
	    = "http://www.w3.org/1999/xhtml";
    private static final String	xmlNS
	    = "http://www.w3.org/XML/1998/namespace";

    // encoding assumed for byte input when nothing better is known
    private static final String	defaultEncoding = "8859_1";


    /**
     * Constructs a new HTML parser.
     */
    public HtmlParser () {}


    /**
     * <b>SAX2</b>: Returns the object used to receive callbacks for XML
     * errors of all levels (fatal, nonfatal, warning).
     *
     * @return the current error handler; never null
     */
    public ErrorHandler getErrorHandler ()
    {
	return errHandler;
    }

    /**
     * <b>SAX1</b>: Provides an object which receives callbacks for HTML errors
     * of all levels (fatal, nonfatal, warning).
     *
     * <p> Note that this parser does not provide a consistent categorization
     * of errors according to the categories defined in the SAX API.  Most
     * problems are reported at the "warning" level, and even those few
     * validity related errors reported at the "nonfatal" level may not be
     * viewed as issues in all HTML environments.  No errors are reported as
     * "fatal".
     *
     * <p> Throwing an exception from an error handler may not work well.
     *
     * @param handler the new handler, or null to restore the default one
     */
    public void setErrorHandler (ErrorHandler handler)
    {
	errHandler = (handler != null) ? handler : defaultHandler;
    }

    /**
     * <b>SAX2</b>: Returns the object used to process declarations related
     * to notations and unparsed entities.
     *
     * @return the current DTD handler; never null
     */
    public DTDHandler getDTDHandler ()
    {
	return dtdHandler;
    }

    /**
     * <b>SAX1</b>: Provides an object which may be used to intercept
     * declarations related to notations and unparsed entities.
     *
     * <p><em>Not used by this parser.</em>
     *
     * @param handler the new handler, or null to restore the default one
     */
    public void setDTDHandler (DTDHandler handler)
    {
	dtdHandler = (handler != null) ? handler : defaultHandler;
    }

    /**
     * <b>SAX2</b>: Returns the object used when resolving external
     * entities during parsing (both general and parameter entities).
     *
     * @return the current entity resolver; never null
     */
    public EntityResolver getEntityResolver ()
    {
	return resolver;
    }

    /**
     * <b>SAX1</b>: Provides an object which may be used when resolving external
     * entities during parsing (both general and parameter entities).
     *
     * <p><em>Not used by this parser.</em>
     *
     * @param resolver the new resolver, or null to restore the default one
     */
    public void setEntityResolver (EntityResolver resolver)
    {
	this.resolver = (resolver != null) ? resolver : defaultHandler;
    }

    /**
     * <b>SAX2</b>: Returns the object used to report the logical
     * content of an XML document.
     *
     * @return the current content handler; never null
     */
    public ContentHandler getContentHandler ()
    {
	return contentHandler;
    }

    /**
     * <b>SAX2</b>: Assigns the object used to report the logical
     * content of an XML document.
     *
     * @param handler the new handler, or null to restore the default one
     */
    public void setContentHandler (ContentHandler handler)
    {
	contentHandler = (handler != null) ? handler : defaultHandler;
    }

    /**
     * <b>SAX1</b>: Identifies the locale which the parser should use for the
     * diagnostics it provides.
     *
     * <p><em>Not used by this parser.</em>
     *
     * @param locale ignored
     * @exception SAXException as defined in the specification for
     *	<em>org.xml.sax.Parser.setLocale()</em>; never thrown by this
     *	implementation
     */
    public void setLocale (Locale locale)
    throws SAXException
    {
	// ignoring this entirely for now 
    }

    /**
     * <b>SAX1</b>:  parse the HTML text in the given input source.
     *
     * <p> The character stream of the input source is preferred, then its
     * byte stream (decoded using the input source's encoding, or ISO 8859-1
     * when none is given), and finally the resource named by its system ID.
     * The stream being parsed is closed before this method returns, whether
     * or not parsing succeeded.
     *
     * <p> Events are reported in this order:  the document locator and
     * <em>startDocument</em>, a synthesized XHTML doctype, an identifying
     * comment, the prefix mapping for the XHTML namespace, the events
     * derived from the Swing parser callbacks, and <em>endDocument</em>.
     * If a handler throws a <em>SAXException</em> while content is being
     * reported, no further content is reported and that exception is
     * thrown from this method once the Swing parser has returned.
     *
     * @param input identifies the text to be parsed
     * @exception SAXException as defined in the specification for
     *	<em>org.xml.sax.Parser.parse()</em>
     * @exception IOException as defined in the specification for
     *	<em>org.xml.sax.Parser.parse()</em>
     * @exception IllegalArgumentException if the input source provides
     *	neither a stream nor a system ID
     */
    public void parse (InputSource input) throws SAXException, IOException
    {
	Reader		reader = openReader (input);

	try {
	    // an earlier parse may have been abandoned part way through;
	    // don't let its state silence or corrupt this one
	    adapter.reset ();

	    // here we expect Swing never reads external entities!!
	    LocatorImpl	locator = new LocatorImpl ();

	    if (input.getSystemId () != null)
		locator.setSystemId (input.getSystemId ());
	    if (input.getPublicId () != null)
		locator.setPublicId (input.getPublicId ());
	    locator.setLineNumber (-1);
	    locator.setColumnNumber (-1);

	    contentHandler.setDocumentLocator (locator);
	    contentHandler.startDocument ();

	    // all valid XHTML must have a DTD with no internal subset
	    // so we report that's what we saw (but no declaration events)
	    lexicalHandler.startDTD ("html", xhtmlPublic, xhtmlSystem);
	    lexicalHandler.endDTD ();

	    // for when someone looks
	    char banner [] = " HTML-to-XHTML conversion ($Date: 2002/09/30 15:08:52 $)".toCharArray ();
	    lexicalHandler.comment (banner, 0, banner.length);

	    try {
		contentHandler.startPrefixMapping ("", xhtmlNS);
		new ParserDelegator ().parse (reader, adapter, true);

		// The Swing parser can catch (and merely print) runtime
		// exceptions thrown from its callbacks, in which case the
		// Wrapper never gets here; the adapter remembers the
		// handler's exception so it can still be delivered.
		adapter.rethrowFailure ();

		contentHandler.endPrefixMapping ("");
	    } catch (Wrapper x) {
		throw x.x;
	    } finally {
		contentHandler.endDocument ();
	    }
	} finally {
	    closeQuietly (reader);
	}
    }

    /**
     * Picks the character stream to parse, following the usual SAX
     * preference order:  character stream, byte stream, system ID.
     *
     * @param input the input source handed to <em>parse()</em>
     * @return a reader over the document text; never null
     * @exception IOException if the resource can't be opened, or its
     *	encoding is not supported
     * @exception IllegalArgumentException if the input source provides
     *	neither a stream nor a system ID
     */
    private static Reader openReader (InputSource input) throws IOException
    {
	Reader		reader = input.getCharacterStream ();

	if (reader != null)
	    return reader;

	InputStream	in = input.getByteStream ();
	String		encoding;

	if (in != null) {
	    encoding = input.getEncoding ();

	} else if (input.getSystemId () != null) {
	    URLConnection	conn;
	    String		contentType;

	    conn = new URL (input.getSystemId ()).openConnection ();
	    conn.connect ();
	    in = conn.getInputStream ();

	    // Not every protocol handler reports a content type, and it is
	    // presumed (Resolver isn't visible here) that no charset may be
	    // found in one; either way fall through to the default.
	    contentType = conn.getContentType ();
	    encoding = (contentType == null)
		    ? null
		    : Resolver.getEncoding (contentType);

	} else
	    throw new IllegalArgumentException ("InputSource");

	if (encoding == null)
	    encoding = defaultEncoding;

	try {
	    return new InputStreamReader (in, encoding);
	} catch (IOException e) {
	    // unsupported encoding:  don't leak the underlying stream
	    try {
		in.close ();
	    } catch (IOException ignored) {
		// the encoding problem is the one worth reporting
	    }
	    throw e;
	}
    }

    /**
     * Closes the reader as end-of-parse cleanup.  Failures are ignored
     * since by now parsing has either completed or is already failing
     * with a more interesting exception.
     */
    private static void closeQuietly (Reader reader)
    {
	try {
	    reader.close ();
	} catch (IOException ignored) {
	    // best effort only
	}
    }


    /**
     * <b>SAX1</b>:  Parse the HTML text at the given input URI.
     *
     * @param uri system ID of the document to be parsed
     * @exception SAXException as for <em>parse(InputSource)</em>
     * @exception IOException as for <em>parse(InputSource)</em>
     */
    public void parse (String uri) throws SAXException, IOException
    {
	parse (new InputSource (uri));
    }

    private static final String FEATURES = "http://xml.org/sax/features/";
    private static final String HANDLERS = "http://xml.org/sax/properties/";

    /**
     * <b>SAX2</b>: Tells whether this parser supports the specified feature.
     * Only <em>namespaces</em> is reported as true; <em>validation</em>,
     * <em>external-general-entities</em>,
     * <em>external-parameter-entities</em>, <em>string-interning</em> and
     * <em>namespace-prefixes</em> are reported as false.
     *
     * @param featureId URI identifying the feature
     * @return the (fixed) state of the feature
     * @exception SAXNotRecognizedException for any other feature ID
     */
    public boolean getFeature (String featureId)
    throws SAXNotRecognizedException, SAXNotSupportedException
    {
	// none of that XML stuff!
	if ((FEATURES + "validation").equals (featureId)
		|| (FEATURES + "external-general-entities").equals (featureId)
		|| (FEATURES + "external-parameter-entities").equals (featureId)
		|| (FEATURES + "string-interning").equals (featureId)
		|| (FEATURES + "namespace-prefixes").equals (featureId)
		)
	    return false;

	if ((FEATURES + "namespaces").equals (featureId))
	    return true;

	throw new SAXNotRecognizedException (featureId);
    }

    /**
     * <b>SAX2</b>:  Returns the specified property.  At this time only
     * lexical handlers are supported.
     *
     * @param propertyId URI identifying the property
     * @return the current lexical handler
     * @exception SAXNotRecognizedException for any other property ID
     */
    public Object getProperty (String propertyId)
    throws SAXNotRecognizedException, SAXNotSupportedException
    {
	if ((HANDLERS + "lexical-handler").equals (propertyId))
	    return lexicalHandler;

	// unknown properties
	throw new SAXNotRecognizedException (propertyId);
    }

    /**
     * <b>SAX2</b>:  Sets the state of features supported in this parser.
     * As of this writing, no feature's state may be changed from its
     * default value.
     *
     * @param featureId URI identifying the feature
     * @param state the requested state
     * @exception SAXNotRecognizedException if the feature is unknown
     * @exception SAXNotSupportedException if the requested state differs
     *	from the one reported by <em>getFeature()</em>
     */
    public void setFeature (String featureId, boolean state)
    throws SAXNotRecognizedException, SAXNotSupportedException
    {
	// getFeature() rejects unknown IDs for us
	if (getFeature (featureId) != state)
	    throw new SAXNotSupportedException (featureId);
    }

    /**
     * <b>SAX2</b>:  Assigns the specified property.  At this time only
     * lexical handlers are supported, and these must not be changed to
     * values of the wrong type.  Like SAX1 handlers, these may be changed
     * at any time.
     *
     * @param propertyId URI identifying the property
     * @param property the new value; for the lexical handler this must
     *	implement <em>LexicalHandler</em> (so null is rejected)
     * @exception SAXNotRecognizedException if the property is unknown
     * @exception SAXNotSupportedException if the value has the wrong type
     */
    public void setProperty (String propertyId, Object property)
    throws SAXNotRecognizedException, SAXNotSupportedException
    {
	if (!(HANDLERS + "lexical-handler").equals (propertyId))
	    throw new SAXNotRecognizedException (propertyId);

	if (!(property instanceof LexicalHandler))
	    throw new SAXNotSupportedException (propertyId);
	lexicalHandler = (LexicalHandler) property;
    }


    static final private char newline [] = { '\n' };

    //
    // Adapter between Swing parser's events, and SAX ones.
    //
    // Once a SAX handler has thrown an exception the adapter enters
    // "recovering" mode:  the exception is remembered, and every later
    // callback from the Swing parser is ignored, so that nothing more is
    // reported to the application while Swing unwinds.
    // 
    class Adapter extends ParserCallback
    {
	// scratch attribute list; empty between callbacks
	private AttributesImpl		atts = new AttributesImpl ();
	// true once a handler has failed during the current parse
	private boolean			recovering;
	// first exception thrown by a handler during the current parse
	private SAXException		failure;
    
	// public void flush () { }

	// Forgets whatever a previous (possibly abandoned) parse left behind.
	void reset ()
	{
	    atts.clear ();
	    recovering = false;
	    failure = null;
	}

	// Throws the first handler exception of this parse, if there was one.
	void rethrowFailure () throws SAXException
	{
	    if (failure != null) {
		SAXException	e = failure;

		failure = null;
		throw e;
	    }
	}

	// Records a handler failure and builds the unchecked exception
	// used to get out of the Swing callback (which can't throw
	// SAXException itself).
	private Wrapper abort (SAXException e)
	{
	    recovering = true;
	    if (failure == null)
		failure = e;
	    return new Wrapper (e);
	}

	// Reports text content as SAX character data.
	public void handleText (char data [], int pos)
	{
	    if (debug)
		System.err.println ("handleText");
	    if (recovering)
		return;

	    try {
		contentHandler.characters (data, 0, data.length);
	    } catch (SAXException e) {
		throw abort (e);
	    }
	}

	// Reports comments through the lexical handler.
	public void handleComment (char data [], int pos)
	{
	    if (debug)
		System.err.println ("handleComment");
	    if (recovering)
		return;

	    try {
		// "pos" signifies ... ?
		lexicalHandler.comment (data, 0, data.length);
	    } catch (SAXException e) {
		throw abort (e);
	    }
	}

	// Reports the start of an element which has content, followed by
	// an ignorable newline for a few "container" elements.
	public void handleStartTag (
	    Tag				tag,
	    MutableAttributeSet 	attributes,
	    int				pos
	) {
	    String			name = tag.toString ();

	    if (debug)
		System.err.println ("handleStartTag " + name);
	    if (recovering)
		return;
	    if ("__EndOfLineTag__".equals (name))
	        return;
		
	    try {
		convertAttributes (attributes);
		contentHandler.startElement (xhtmlNS, name, name, atts);
		if (addSpaceInside (name))
		    contentHandler.ignorableWhitespace (newline, 0, 1);
	    } catch (SAXException e) {
		throw abort (e);
	    } finally {
		// keep the "atts is clear on entry" invariant even when
		// a handler throws
		atts.clear ();
	    }
	}

	// Reports the end of an element which has content, followed by
	// an ignorable newline after block-like elements.
	public void handleEndTag (Tag tag, int pos)
	{
	    String			name = tag.toString ();

	    if (debug)
		System.err.println ("handleEndTag " + name);
	    if (recovering)
		return;
	    if ("__EndOfLineTag__".equals (name))
	        return;
		
	    try {
		contentHandler.endElement (xhtmlNS, name, name);

		// NOTE:  the XHTML prefix mapping is ended by parse(), after
		// the Swing parser returns, not when "html" ends.  (There
		// used to be a test here for that, but it compared a String
		// against the Tag object and so never matched.)
		if (addSpaceAfter (name))
		    contentHandler.ignorableWhitespace (newline, 0, 1);
	    } catch (SAXException e) {
		throw abort (e);
	    }
	}

	// Workaround a bug in the Swing parser, where some elements
	// are treated as two simple elements separated by unrelated
	// content instead of one element containing that content
	private boolean isSwingBrokenTag (String name)
	{
	    return "div".equals (name)
		    || "span".equals (name)
		    || "abbr".equals (name);
	}

	// Reports an empty element as a start/end pair; except that for
	// the tags Swing is known to split in two, only the half indicated
	// by the bogus "endtag" attribute is reported.
	public void handleSimpleTag (
	    Tag				tag,
	    MutableAttributeSet 	attributes,
	    int				pos
	) {
	    String			name = tag.toString ();

	    if (debug)
		System.err.println ("handleSimpleTag " + name);
	    if (recovering)
		return;
		
	    // Swing 1.1.1 adds at least this bogus tag
	    // (after the <html>...</html> element)
	    if ("__EndOfLineTag__".equals (name))
	        return;

	    try {
		boolean		isStart = true;
		boolean		isEnd = true;

		convertAttributes (attributes);
		if (isSwingBrokenTag (name)) {
		    // this callback is really just half of an element
		    isEnd = "true".equals (atts.getValue ("endtag"));
		    isStart = !isEnd;
		}

		if (isStart)
		    contentHandler.startElement (xhtmlNS, name, name, atts);
		if (isEnd) {
		    contentHandler.endElement (xhtmlNS, name, name);
		    if (addSpaceAfter (name))
			contentHandler.ignorableWhitespace (newline, 0, 1);
		}
	    } catch (SAXException e) {
		throw abort (e);
	    } finally {
		atts.clear ();
	    }
	}

	// True for elements which get a newline right after their start tag.
	private boolean addSpaceInside (String tag)
	{
	    return "body".equals (tag)
		    || "head".equals (tag)
		    || "center".equals (tag)
		    ;
	}

	// True for elements which get a newline right after their end tag.
	private boolean addSpaceAfter (String tag)
	{
	    return "p".equals (tag)
		    || "li".equals (tag)
		    || "table".equals (tag)
		    || "tr".equals (tag)
		    || "td".equals (tag)
		    || "h1".equals (tag)
		    || "h2".equals (tag)
		    || "h3".equals (tag)
		    || "h4".equals (tag)
		    || "h5".equals (tag)
		    || "h6".equals (tag)
		    || "dt".equals (tag)
		    || "dd".equals (tag)
		    || "center".equals (tag)
		    || "map".equals (tag)
		    || "area".equals (tag)
		    || "tbody".equals (tag)
		    || "title".equals (tag)
		    || "meta".equals (tag)
		    || "link".equals (tag)
		    || "style".equals (tag)
		    ;
	}

	//
	// This gets called lots with invalid HTML (or even just HTML that
	// the Swing parser was surprised by) ... can we provide better
	// diagnostics?  HotJava can, with more or less the same parser,
	// so it should be doable.
	//
	// Everything that's reported at all is reported as a warning.
	//
	public void handleError (String diagnostic, int pos)
	{
	    if (debug || debugErrs)
		System.err.println ("handleError: " + diagnostic);

	    // like the other callbacks, stay quiet once a handler has
	    // failed; whatever Swing says after that is about our own
	    // exception, not about the document
	    if (recovering)
		return;

	    //
	    // Categorization of these errors is not trustworthy;
	    // we'll leave interpretations of validity to a real
	    // XHTML validator.
	    //
	    // NOTE:  one common cause of this appears to be colonized
	    // attribute names such as the fairly basic "xml:lang";
	    // such names are removed from the output of the Swing
	    // parser, and we have to restore them later.
	    //
	    if (diagnostic.startsWith ("invalid."))
		return;

	    try {
		errHandler.warning (new SAXParseException (diagnostic,
		    null, null,
		    -1, -1
		    ));
	    } catch (SAXException e) {
		throw abort (e);
	    }
	}

	// Fills 'atts' with the SAX view of the Swing attribute set.
	// Callers are responsible for clearing 'atts' when done with it.
	private void convertAttributes (MutableAttributeSet attributes)
	{
	    // invariant:  'atts' is clear on entry!!

	    for (Enumeration e = attributes.getAttributeNames ();
		    e.hasMoreElements ();
		    ) {
		Object	key = e.nextElement ();
		String	name = key.toString ();
		String	value;

		//
		// Swing morphs "xml:space", "xml:lang" etc, into
		// bizarrely ordered attribute pairs.  Patch this as
		// best we can, assuming the input was once valid.
		//
		if ("xml".equals (name))
		    continue;

		// prune 'xmlns*' to make SAX2 happy (sigh)
		if (name.startsWith ("xmlns"))
		    continue;

		value = attributes.getAttribute (key).toString ();

		// "space" isn't an HTML attribute, so it must have
		// started out as "xml:space"
		if ("space".equals (name)) {
		    atts.addAttribute (
			xmlNS,
			name, "xml:" + name,
			"CDATA", value);
		    continue;
		}

		// "element scope"
		atts.addAttribute (
		    "",
		    "", name,
		    "CDATA", value);

		// "lang" is reported both ways, since it could have been
		// either the HTML attribute or the tail of "xml:lang"
		if ("lang".equals (name))
		    atts.addAttribute (
			xmlNS,
			"lang", "xml:lang",
			"CDATA", value);
	    }
	}
    }

    //
    // Carries a SAXException out of a Swing parser callback, none of
    // which are allowed to throw checked exceptions.
    //
    static class Wrapper extends RuntimeException {
	final SAXException	x;

	Wrapper (SAXException x)
	{
	    super (x.getMessage ());
	    this.x = x;
	}
    }
}
