/*
 * $Id: DomBuilder.java,v 1.1.1.1 2002/09/30 15:08:51 smartine Exp $
 * Copyright (C) 1999-2000 David Brownell
 * 
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

package xml;

import java.io.IOException;

import org.xml.sax.*;
import org.xml.sax.helpers.XMLReaderFactory;
import org.xml.sax.ext.*;

import org.w3c.dom.*;

import xml.pipeline.*;


// $Id: DomBuilder.java,v 1.1.1.1 2002/09/30 15:08:51 smartine Exp $

/**
 * Builds a DOM Level 1 (or Level 2) Document object from the output of a
 * SAX2 parser, using a defaulted or specified DOM
 * implementation and parser.  For example, a validating XML parser can be
 * connected with an XML DOM; or an HTML parser could be connected to an
 * HTML (or non-HTML) DOM.  By default, if the DOM supports Level 2 calls,
 * namespace rules are used when building the DOM document; this will cause
 * some legal XML 1.0 documents to be rejected.  If standard XML behavior is
 * desired, use <b><code>setUsingNamespaces(false)</code></b>.
 *
 * <p> Note that some of the information exposed through DOM is only of
 * exotic interest, and does not truly relate to the core semantic model
 * of XML as consisting of elements, attributes, text, and processing
 * instructions.  By default, this builder only exposes core node types,
 * and will not create any "extra" nodes, such as those for comments or
 * ignorable whitespace. This behavior may be changed by using the
 * <b><code>setSavingExtraNodes(true)</code></b> method.  The most useful
 * of such nodes are probably comments, which are used in some legacy
 * environments, such as HTML/XHTML, to wrap content such as inlined CSS
 * style directives or scripting code.  However, some applications may
 * also want to know when text was represented using CDATA delimiters.</p>
 *
 * <p> As a rule, if you ignore "extra" nodes and all of the incomplete DOM
 * DTD functionality, and provide a fully featured SAX2 parser, the main
 * portability issue
 * your code may have is that some nonvalidating parsers will report
 * ignorable whitespace characters as normal character data, so that your
 * application can't as readily ignore it.</p>
 *
 * <h3> Selection of DOM and SAX2 Implementations </h3>
 *
 * <p> The DOM implementation class used is specified either as a parameter
 * to a construction method, or is derived from the value of the
 * <b><code>xml.DomBuilder.Document</code></b> system property.
 * That class must provide a default constructor, which creates an object
 * conforming to the DOM Level 1 core "Document" API.</p>
 *
 * <p> The SAX2 parser used is either provided as a constructor parameter,
 * or is identified from "org.xml.sax.driver" system property.  The parser
 * used should be a SAX2 XMLReader supporting the "lexical-handler" feature;
 * see below for details of how the SAX1 API does not provide information
 * needed to support DOM correctly (it's more than just hiding data for
 * "extra" nodes).  (There are fallbacks in case the system property is
 * not defined.) </p>
 *
 * <p> By providing a parser directly, the caller can ensure that it has been
 * properly configured.  For example, it might be set up to validate.  The
 * caller may set up the ErrorHandler, Locale, and EntityResolver.  This
 * builder will assign the SAX2 Content and Lexical handlers of the parser.
 * Other handlers, except for the error handler and the entity resolver, are
 * reserved for future use by this builder. </p>
 *
 * <h3> DOM Functionality Restrictions </h3>
 *
 * <p> Because of missing functionality in the DOM APIs, the
 * following DTD-related functionality can't be supported by software, such
 * as this builder, which does not rely on nonstandard extensions to the
 * DOM APIs. <ul>
 *
 *	<li> Creation of DOM <b>DocumentType</b> nodes, or their associated
 *	<b>Notation</b> and <b>Entity</b> nodes. No APIs exist in DOM Level 1
 *	to create such nodes.  DOM Level 2 only provides a broken way to create
 *	DocumentType nodes, and no way at all to create the other two types of
 *	associated nodes.</li>
 *
 *	<li> Creation of DOM <b>EntityReference</b> nodes.  The related
 *	<b>Entity</b> nodes (and their optional contents) can't be created,
 *	and in any case the children can't be set to be "read only".</li>
 *
 *	<li> The <b>Attr.isSpecified</b> flag can't be assigned through the
 *	DOM API. </li>
 *
 *	<li> Resetting values of element attributes which have defaults,
 *	when those attributes are deleted.  No API exists in DOM to tell
 *	the DOM which attributes have defaults, and what they are. </li>
 *
 *	</ul></p>
 *
 * <p> Some other functionality is not available through SAX, even using the
 * SAX2 parser APIs, and so can't be provided here.   Some other functionality
 * (some affecting completeness of DOM data models) is only available in those
 * SAX2 APIs, and so can't be provided when a SAX1 parser is in use.  <ul>
 *
 *	<li> DTDs can't be fully recreated, even using a Level 2 DOM which
 *	permits both the internal and external subset to be represented.
 *	The internal subset isn't usefully reported.  </li>
 *
 *	<li> There is no way to detect which attribute values were specified
 *	in the document, versus being defaulted through a DTD.  (As noted
 *	above, there is a parallel issue with the DOM API:  if it were known,
 *	there is no way to tell it to DOM.  DOM, however, expects to be able
 *	to report this information.) </li>
 *
 *	<li> Entity references which are found within attributes aren't
 *	reported as such through SAX2.  This means that "Attr" nodes will
 *	not have such references as children.
 *	Since this feature is of negligible value in DOM, and is
 *	also a feature with a substantial cost, this is seldom missed.</li>
 *
 *	<li> <em>Unless using a SAX2 parser which reports them,</em>
 *	comments will not appear in the resulting document. </li>
 *
 *	<li> <em>Unless using a SAX2 parser which reports them,</em> CDATA
 *	delimiters for text will not cause CDATASection nodes to appear. </li>
 *
 *	<li><em>Unless using a SAX2 parser which reports DTD boundaries</em>,
 *	processing instructions (and perhaps comments, see above) found in
 *	a document's DTD will inappropriately be reported as being part of
 *	the document.</li>
 *
 *	</ul></p>
 *
 * <p> At this time, XML 1.0 conformance is assumed unless a Level 2
 * implementation
 * of DOM is used, in which case the more restrictive XML namespace rules are
 * used when elements and processing instructions are processed.  This means
 * that the use of colons in element and attribute names is more restrictive
 * than in the XML 1.0 specification, and is completely forbidden in the names
 * of processing instruction targets.  Also, only attributes of types CDATA,
 * NMTOKEN, and NMTOKENS may contain colons; and name prefixes must be
 * declared.  The
 * namespace restrictions on entity and notation names (they must not contain
 * colons) are not currently enforced.</p>
 *
 * @see xml.pipeline.DomConsumer
 *
 * @author David Brownell
 * @version $Date: 2002/09/30 15:08:51 $
 */
public final class DomBuilder
{
    /** System property consulted for the DOM Document implementation class. */
    private static final String DOCUMENT_PROPERTY = "xml.DomBuilder.Document";

    /** Document class used when the system property is unset or unreadable. */
    private static final String DEFAULT_DOCUMENT_CLASS = "xml.dom.DomDocument";

    /** SAX2 driver tried when the XMLReaderFactory default cannot be created. */
    private static final String FALLBACK_PARSER_CLASS = "xml.aelfred2.SAXDriver";

    // Handed to the event producer on every parse; the finished document
    // is read back from it afterwards.
    private final DomConsumer consumer;

    // SAX2 parser driving the consumer; either supplied by the caller or
    // obtained from createDefaultParser ().
    private final XMLReader parser;


    /**
     * Returns the name of the DOM Document class to instantiate by default:
     * the value of the <code>xml.DomBuilder.Document</code> system property,
     * or the built-in default when that property is not set or may not be
     * read.
     */
    private static String getDocClassName ()
    {
        // default: any conformant DOM implementation (L2 better than L1)
        try {
            return System.getProperty (DOCUMENT_PROPERTY,
                    DEFAULT_DOCUMENT_CLASS);
        } catch (SecurityException ignored) {
            // Not permitted to read system properties; the built-in
            // default is the documented fallback.
            return DEFAULT_DOCUMENT_CLASS;
        }
    }

    /**
     * Creates the default SAX2 parser.  The XMLReaderFactory default is
     * tried first; if that fails for any reason, the fallback driver class
     * is requested by name instead.
     *
     * @exception SAXException if the fallback driver can't be created
     *	either; the failure of the first attempt is not reported.
     */
    private static XMLReader createDefaultParser ()
    throws SAXException
    {
        try {
            return XMLReaderFactory.createXMLReader ();
        } catch (Exception ignored) {
            //
            // XXX would rather not have a backup config _here_ ...
            // but they need to go somewhere!
            //
            // Deliberately ignored: any failure of the default lookup
            // simply means the fallback driver is tried.
            return XMLReaderFactory.createXMLReader (FALLBACK_PARSER_CLASS);
        }
    }

    /**
     * Returns an empty DOM document of the system default implementation.
     * Uses the system property to determine that default, and has a built
     * in fallback.
     *
     * @exception SAXException Normally wraps another exception reflecting
     *	a problem uncovered when trying to instantiate the document.
     */
    public static Document createEmptyDocument ()
    throws SAXException
    {
        return createEmptyDocument (getDocClassName ());
    }

    /**
     * Returns an empty DOM document of the specified implementation.
     * The class is loaded by name and instantiated through its default
     * constructor; the result must be a DOM <code>Document</code>.
     *
     * @param name Fully qualified name of the document class.
     *
     * @exception SAXException Wraps any exception raised while loading,
     *	instantiating, or casting the document object.
     */
    public static Document createEmptyDocument (String name)
    throws SAXException
    {
        try {
            Class	documentClass = Class.forName (name);
            Object	temp = documentClass.newInstance ();

            // a class that isn't a Document fails here and is reported
            // through the same SAXException as any other problem
            return (Document) temp;
        } catch (Exception e) {
            throw new SAXException (
                "can't create document of class " + name,
                e);
        }
    }

    /**
     * Convenience routine which parses the specified (XML) document into a
     * DOM document tree.  The URI is wrapped in an <code>InputSource</code>
     * and passed to {@link #createDocument(InputSource)}.
     *
     * @param uri Identifies the resource to be parsed.
     *
     * @exception SAXException Normally wraps another exception reflecting
     *	a problem uncovered when trying to instantiate the document object
     *  or a parser.
     * @exception IOException when there is a problem reading a part of
     *	the specified document
     */
    public static Document createDocument (String uri)
    throws SAXException, IOException
    {
        return createDocument (new InputSource (uri));
    }


    /**
     * Convenience routine which parses the specified (XML) document into a
     * DOM document tree.  A fresh event producer is created for the input
     * and a fresh consumer collects the result; which parser and DOM
     * implementation are used is whatever those two classes select by
     * default.
     *
     * @param input Provided to the SAX parser as input.
     *
     * @exception SAXException Normally wraps another exception reflecting
     *	a problem uncovered when trying to instantiate the document object
     *	or a parser.
     * @exception IOException when there is a problem reading a part of
     *	the specified document
     */
    public static Document createDocument (InputSource input)
    throws SAXException, IOException
    {
        EventProducer	producer = new EventProducer (input);
        DomConsumer	consumer = new DomConsumer ();

        producer.produce (consumer);
        return consumer.getDocument ();
    }


    /**
     * Constructs a builder using the default DOM document class and the
     * default SAX parser.
     *
     * @exception SAXException Normally wraps another exception reflecting
     *	a problem uncovered when trying to instantiate a SAX2 parser or
     *	DOM document object.
     */
    public DomBuilder () throws SAXException
    {
        this (getDocClassName ());
    }


    /**
     * Constructs a builder using the specified DOM document class and the
     * default SAX2 parser.  The parser is created before the document
     * consumer, so a parser problem is reported first.
     *
     * @param DOMDocumentClassName The name of the class implementing the
     *   kind of DOM document to be returned.
     *
     * @exception SAXException Normally wraps another exception reflecting
     *	a problem uncovered when trying to instantiate a SAX2 parser or
     *	DOM document object.
     */
    public DomBuilder (String DOMDocumentClassName)
    throws SAXException
    {
        // the argument is evaluated before the delegated constructor body
        // runs, which keeps the parser-then-consumer creation order
        this (DOMDocumentClassName, createDefaultParser ());
    }


    /**
     * Constructs a builder using the default DOM document class and the
     * specified SAX2 parser.
     *
     * @param parser The SAX parser to be used; it may be partially
     *	configured, as described above.
     *
     * @exception SAXException Normally wraps another exception reflecting
     *	a problem uncovered when trying to instantiate a document object.
     */
    public DomBuilder (XMLReader parser)
    throws SAXException
    {
        this (getDocClassName (), parser);
    }


    /**
     * Constructs a builder using the specified DOM document class and the
     * specified SAX parser.  All other constructors end up here.
     *
     * @param DOMDocumentClassName The name of the class implementing the
     *   kind of DOM document to be returned.
     * @param parser The SAX parser to be used; it may be partially
     *	configured, as described above.  It is stored as given, without
     *	any validation.
     *
     * @exception SAXException Normally wraps another exception reflecting
     *	a problem uncovered when trying to instantiate a document object.
     */
    public DomBuilder (String DOMDocumentClassName, XMLReader parser)
    throws SAXException
    {
        consumer = new DomConsumer (DOMDocumentClassName);
        this.parser = parser;
    }


    /**
     * Returns true if the builder is saving "extra" nodes, and false (the
     * default) otherwise.  "Extra" nodes are defined to be ignorable
     * whitespace, comments, and the use of CDATA nodes instead of normal
     * text nodes.  (Entity Reference nodes are also "extra", but can't be
     * exposed in any case since the DOM doesn't expose an API that permits
     * them to be constructed.)  The setting is held by the underlying
     * consumer.
     *
     * @see #setSavingExtraNodes
     */
    public boolean isSavingExtraNodes ()
    {
        return consumer.isSavingExtraNodes ();
    }


    /**
     * Controls whether the builder will save "extra" nodes.  The flag is
     * forwarded to the underlying consumer.
     *
     * @see #isSavingExtraNodes
     * @param flag True iff extra nodes should be saved; false otherwise.
     */
    public void setSavingExtraNodes (boolean flag)
    {
        consumer.setSavingExtraNodes (flag);
    }


    /**
     * Returns true (the default for L2 DOM implementations) if the
     * builder is using an "XML + Namespaces" style DOM construction,
     * which will cause fatal errors on some legal XML 1.0 documents.
     * The setting is held by the underlying consumer.
     *
     * @see #setUsingNamespaces
     */
    public boolean isUsingNamespaces ()
    {
        return consumer.isUsingNamespaces ();
    }


    /**
     * Controls whether the builder uses an "XML + Namespaces" style
     * DOM construction.  The flag is forwarded to the underlying consumer.
     *
     * @see #isUsingNamespaces
     * @param flag True iff namespaces should be enforced; else false.
     */
    public void setUsingNamespaces (boolean flag)
    {
        consumer.setUsingNamespaces (flag);
    }


    /**
     * Parses the specified document, returning its contents as a DOM
     * document.  This uses the SAX parser and DOM implementation which
     * were specified in the constructor to this builder.
     *
     * @param uri Identifies the resource to be parsed.
     * @exception SAXException As reported by the parser (in which case it is
     *   often a SAXParseException) or this builder.
     * @exception IOException As reported by the parser
     * @exception DOMException As reported by the DOM; always indicates
     *   a bug in either the DOM or the SAX parser
     */
    public Document parse (String uri)
    throws SAXException, IOException, DOMException
    {
        return parse (new InputSource (uri));
    }


    /**
     * Parses the document provided, returning its contents as a DOM
     * document.  This uses the SAX parser and DOM implementation which
     * were specified in the constructor to this builder.  The whole
     * operation holds this builder's monitor, so overlapping calls on one
     * builder run one after another.
     *
     * @param input Provided to the SAX parser as input.
     * @exception SAXException As reported by the parser (in which case it is
     *   often a SAXParseException) or this builder.
     * @exception IOException As reported by the parser
     * @exception DOMException As reported by the DOM; always indicates
     *   a bug in either the DOM or the SAX parser
     */
    public Document parse (InputSource input)
    throws SAXException, IOException, DOMException
    {
        // synchronizing, just in case caller violates mt-sanity rules
        synchronized (this) {
            // could save producer and call
            // producer.produce (input) on 2nd-Nth calls
            EventProducer	producer = new EventProducer (parser, input);

            producer.produce (consumer);
            return consumer.getDocument ();
        }
    }
}