File:  [Public] / XML / parser.c
Revision 1.13: download - view: text, annotated - select for diffs
Sat May 9 07:16:54 1998 UTC (26 years ago) by veillard
Branches: MAIN
CVS tags: HEAD
Removed two errors, Daniel.

/*
 * parser.c : an XML 1.0 non-verifying parser
 */

#include <config.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <malloc.h>
#include <sys/stat.h>
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "parser.h"
#include "tree.h"

/*
 * A few macros needed to help building the parser.
 */

#ifdef UNICODE
/*
 * UNICODE version of the macros. Still incomplete: several classes only
 * cover a subset of the characters they are meant to describe.
 *
 * All of these macros evaluate their argument more than once, so never
 * pass an expression with side effects.
 */

/* Any character allowed in a document: tab, LF, CR, or >= 0x20 except
 * 0xFFFE and 0xFFFF. */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || \
     (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF)))

/*
 * Advance p over white space: space, tab, LF, CR and 0x3000.
 * Expands to a single statement; the call site supplies the ';'.
 */
#define SKIP_BLANKS(p) \
    do { \
        while ((*(p) == 0x20) || (*(p) == 0x09) || (*(p) == 0x0a) || \
               (*(p) == 0x0d) || (*(p) == 0x3000)) (p)++; \
    } while (0)

/*
 * Base characters, incomplete (8-bit range only).
 * NOTE(review): an always-false test (c >= 0xaa && c <= 0x5b) used to sit
 * in this list; it never matched and has been dropped. Whether 0xaa was
 * meant to be accepted should be confirmed against the specification.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
      ((c) == 0xba))

/* Digits, incomplete: ASCII '0'..'9' only. */
#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

/* Combining characters, not implemented yet: never matches. */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) \
    ((((c) >= 0x200c) && ((c) <= 0x200f)) || \
     (((c) >= 0x202a) && ((c) <= 0x202e)) || \
     (((c) >= 0x206a) && ((c) <= 0x206f)) || \
      ((c) == 0xfeff))

#define IS_EXTENDER(c) \
    (((c) == 0xb7) || ((c) == 0x2d0) || ((c) == 0x2d1) || \
     ((c) == 0x387) || ((c) == 0x640) || ((c) == 0xe46) || \
     ((c) == 0xec6) || ((c) == 0x3005) || \
     (((c) >= 0x3031) && ((c) <= 0x3035)) || \
     (((c) >= 0x309b) && ((c) <= 0x309e)) || \
     (((c) >= 0x30fc) && ((c) <= 0x30fe)) || \
     (((c) >= 0xff70) && ((c) <= 0xff9e)) || \
      ((c) == 0xff9f))

#define IS_IDEOGRAPHIC(c) \
    ((((c) >= 0x4e00) && ((c) <= 0x9fa5)) || \
     (((c) >= 0xf900) && ((c) <= 0xfa2d)) || \
     (((c) >= 0x3021) && ((c) <= 0x3029)) || \
      ((c) == 0x3007))

#define IS_LETTER(c) (IS_BASECHAR(c) || IS_IDEOGRAPHIC(c))

/* White space: space, tab, LF and CR. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#else
/*
 * 8bits / ASCII version of the macros.
 *
 * All of these macros evaluate their argument more than once, so never
 * pass an expression with side effects.
 */

/* Any character allowed in a document: tab, LF, CR, or >= 0x20. */
#define IS_CHAR(c) \
    (((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d) || ((c) >= 0x20))

/*
 * Base characters.
 * NOTE(review): an always-false test (c >= 0xaa && c <= 0x5b) used to sit
 * in this list; it never matched and has been dropped. Whether 0xaa was
 * meant to be accepted should be confirmed against the specification.
 */
#define IS_BASECHAR(c) \
    ((((c) >= 0x41) && ((c) <= 0x5a)) || \
     (((c) >= 0x61) && ((c) <= 0x7a)) || \
     (((c) >= 0xc0) && ((c) <= 0xd6)) || \
     (((c) >= 0xd8) && ((c) <= 0xf6)) || \
     (((c) >= 0xf8) && ((c) <= 0xff)) || \
      ((c) == 0xba))

#define IS_DIGIT(c) (((c) >= 0x30) && ((c) <= 0x39))

#define IS_LETTER(c) IS_BASECHAR(c)

/* No combining or ignorable characters in the 8-bit build. */
#define IS_COMBINING(c) 0

#define IS_IGNORABLE(c) 0

#define IS_EXTENDER(c) ((c) == 0xb7)

/* White space: space, tab, LF and CR. */
#define IS_BLANK(c) \
    (((c) == 0x20) || ((c) == 0x09) || ((c) == 0x0a) || ((c) == 0x0d))
#endif


/*
 * Cursor-moving helpers. Each one expands to a single statement (the call
 * site supplies the ';') and evaluates its argument several times, so the
 * argument must be a plain pointer variable.
 */

/* Step over one end of line: CR, LF, CR LF or LF CR. */
#define SKIP_EOL(p) \
    do { \
        if (*(p) == 0x0d) { \
            (p)++; \
            if (*(p) == 0x0a) (p)++; \
        } else if (*(p) == 0x0a) { \
            (p)++; \
            if (*(p) == 0x0d) (p)++; \
        } \
    } while (0)

/*
 * Advance p over blanks. The UNICODE configuration already provides its
 * own SKIP_BLANKS above, so only define it when none exists yet.
 */
#ifndef SKIP_BLANKS
#define SKIP_BLANKS(p) \
    do { \
        while (IS_BLANK(*(p))) (p)++; \
    } while (0)
#endif

/* Advance p to the next '>' or to the end of the character data. */
#define MOVETO_ENDTAG(p) \
    do { \
        while (IS_CHAR(*(p)) && (*(p) != '>')) (p)++; \
    } while (0)

/* Advance p to the next '<' or to the end of the character data. */
#define MOVETO_STARTTAG(p) \
    do { \
        while (IS_CHAR(*(p)) && (*(p) != '<')) (p)++; \
    } while (0)

/*
 * Forward declaration needed for recursive behaviour: xmlParseContent()
 * calls xmlParseElement() before the latter is defined in this file.
 */
xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc);

/*
 * xmlHandleData : application-specific hook applied to each piece of
 *    text read by the parser.
 *
 * The policy implemented here (the WebDav one) drops any text made only
 * of blanks: such a buffer is freed and NULL is returned. A NULL input
 * yields NULL. Any other buffer is handed back untouched.
 */

CHAR *xmlHandleData(CHAR *in) {
    CHAR *scan;

    if (in == NULL)
        return(NULL);

    /* Walk the text until a non-blank shows up or the data ends. */
    for (scan = in; IS_CHAR(*scan); scan++) {
        if (!IS_BLANK(*scan))
            return(in);
    }

    /* Nothing but blanks (or nothing at all): discard the buffer. */
    free(in);
    return(NULL);
}

/*
 * xmlStrndup : a strndup for arrays of CHAR's
 *
 * Copies the first len elements of cur into a freshly malloc'ed,
 * zero-terminated array that the caller must free().
 *
 * Returns NULL if cur is NULL, if len is negative, or if the allocation
 * fails (a message is printed on stderr in the last case).
 */

CHAR *xmlStrndup(const CHAR *cur, int len) {
    CHAR *ret;
    size_t size;

    if ((cur == NULL) || (len < 0)) return(NULL);

    size = ((size_t) len + 1) * sizeof(CHAR);
    ret = malloc(size);
    if (ret == NULL) {
        /* size is a size_t: cast it to match the %lu conversion. */
        fprintf(stderr, "malloc of %lu byte failed\n", (unsigned long) size);
        return(NULL);
    }
    memcpy(ret, cur, (size_t) len * sizeof(CHAR));
    ret[len] = 0;
    return(ret);
}

/*
 * xmlStrdup : a strdup for CHAR's
 *
 * The copy stops at the first element that is not an XML character
 * (IS_CHAR), which includes the terminating zero. Returns a malloc'ed
 * array to be freed by the caller, or NULL if cur is NULL or the
 * allocation fails.
 */

CHAR *xmlStrdup(const CHAR *cur) {
    const CHAR *p = cur;

    if (cur == NULL) return(NULL);

    while (IS_CHAR(*p)) p++;
    return(xmlStrndup(cur, p - cur));
}

/*
 * xmlParseName : parse an XML name.
 *
 * Name ::= (Letter | '_') (NameChar)*
 *
 * On success *p is moved just after the name and a malloc'ed copy of the
 * name is returned. If *p does not start a name, NULL is returned and *p
 * is left unchanged.
 */

CHAR *xmlParseName(CHAR **p) {
    CHAR *first = *p;
    CHAR *end;

    if ((*first != '_') && !IS_LETTER(*first))
        return(NULL);

    /* Consume every following name character. */
    for (end = first + 1; ; end++) {
        if (IS_LETTER(*end) || IS_DIGIT(*end))
            continue;
        if ((*end == '.') || (*end == '-') || (*end == '_') || (*end == ':'))
            continue;
        if (IS_COMBINING(*end) || IS_IGNORABLE(*end) || IS_EXTENDER(*end))
            continue;
        break;
    }

    *p = end;
    return(xmlStrndup(first, end - first));
}

/*
 * Parse and return a string between quotes or doublequotes.
 *
 * If *p points at ' or ", the text up to the matching delimiter is
 * returned as a malloc'ed copy and *p is moved after the closing
 * delimiter. If the string is not closed, a message is printed, NULL is
 * returned and *p is left where the scan stopped. If *p does not point
 * at a quote, NULL is returned and *p is unchanged.
 */
CHAR *xmlParseQuotedString(CHAR **p) {
    CHAR *cur = *p;
    CHAR *body;
    CHAR *ret = NULL;
    CHAR quote = *cur;

    if ((quote == '"') || (quote == '\'')) {
        body = ++cur;
        while (IS_CHAR(*cur) && (*cur != quote))
            cur++;

        if (*cur == quote) {
            ret = xmlStrndup(body, cur - body);
            cur++;
        } else if (quote == '"') {
            fprintf(stderr, "String not closed \"%.50s\n", body);
        } else {
            fprintf(stderr, "String not closed '%.50s\n", body);
        }
    }

    *p = cur;
    return(ret);
}

/*
 * Skip an XML (SGML) comment <!-- .... -->
 *
 * If *p does not point at "<!--" nothing happens. Otherwise *p is moved
 * just after the first "-->" found. If the comment is not terminated, a
 * message is printed and *p is left just after the opening "<!--".
 */
void xmlParserSkipComment(CHAR **p) {
    CHAR *cur = *p, *start;

    /*
     * An extra check may avoid errors and isn't that costly !
     */
    if ((cur[0] != '<') || (cur[1] != '!') ||
        (cur[2] != '-') || (cur[3] != '-')) return;

    cur += 4;
    start = cur;

    /*
     * Look for "-->". Each position is checked with IS_CHAR before it is
     * used, and the && chain only looks at cur[1] / cur[2] once the
     * previous character was a '-', so the scan never reads past the
     * terminating zero.
     */
    while (IS_CHAR(*cur)) {
        if ((cur[0] == '-') && (cur[1] == '-') && (cur[2] == '>')) {
            *p = cur + 3;
            return;
        }
        cur++;
    }

    fprintf(stderr, "Comment not terminated <!--%.50s\n", start);
    *p = start;
}

/*
 * xmlParseNamespace: parse specific '<?namespace ...' constructs.
 *
 * On entry *p points at the word "namespace" (the "<?" has already been
 * consumed by the caller). The 'href' and 'AS' attributes are collected;
 * if an href was found, xmlNewDtd(doc, href, AS) is called. On return *p
 * is just after the closing '>', or at the end of the data if there is
 * none.
 */

void xmlParseNamespace(CHAR **p, xmlDocPtr doc) {
    CHAR *cur = *p;
    CHAR *href = NULL;
    CHAR *AS = NULL;
    int garbage = 0;    /* set while inside a run of unexpected characters */

    /*
     * We know that 'namespace' is here.
     */
    cur += 9;
    SKIP_BLANKS(cur);

    while (IS_CHAR(*cur) && (*cur != '>')) {
        /*
         * We can have 'href' or 'AS' attributes.
         */
        if ((cur[0] == 'h') && (cur[1] == 'r') && (cur[2] == 'e') &&
            (cur[3] == 'f')) {
            garbage = 0;
            cur += 4;
            SKIP_BLANKS(cur);

            if (*cur != '=') continue;
            cur++;
            SKIP_BLANKS(cur);

            /* A repeated attribute replaces the previous value. */
            if (href != NULL) free(href);
            href = xmlParseQuotedString(&cur);
            SKIP_BLANKS(cur);
        } else if ((cur[0] == 'A') && (cur[1] == 'S')) {
            garbage = 0;
            cur += 2;
            SKIP_BLANKS(cur);

            if (*cur != '=') continue;
            cur++;
            SKIP_BLANKS(cur);

            if (AS != NULL) free(AS);
            AS = xmlParseQuotedString(&cur);
            SKIP_BLANKS(cur);
        } else if ((cur[0] == '?') && (cur[1] == '>')) {
            /* Skip the '?'; the '>' ends the loop. */
            garbage = 0;
            cur ++;
        } else {
            /*
             * Found garbage when parsing the namespace: print the header
             * once per run, then each offending character.
             */
            if (!garbage) fprintf(stderr,
                  "\nxmlParseNamespace found garbage: ");
            garbage = 1;
            fprintf(stderr, "%c", *cur);
            cur++;
        }
    }

    /*
     * The loop stops either on '>' or at the end of the data; only step
     * over a real '>' so the cursor never moves past the terminator.
     */
    if (*cur == '>') cur++;

    /*
     * Register the DTD. The strings are freed right after, so xmlNewDtd
     * is presumably expected to keep its own copies - TODO confirm.
     */
    if (href != NULL)
        xmlNewDtd(doc, href, AS);

    if (AS != NULL) free(AS);
    if (href != NULL) free(href);

    *p = cur;
}

/*
 * xmlParsePI: parse an XML Processing Instruction.
 *
 * If *p points at "<?", the instruction is consumed: '<?namespace ...'
 * is handed to xmlParseNamespace(), anything else is reported and
 * skipped up to the first '>'. Otherwise *p is left unchanged.
 */

void xmlParsePI(CHAR **p, xmlDocPtr doc) {
    CHAR *cur = *p;

    if ((cur[0] == '<') && (cur[1] == '?')) {
        /*
         * this is a Processing Instruction.
         */
        cur += 2;

        /*
         * Special for WebDav, support for the Processing Instruction
         * '<?namespace ...' construct in the header of the XML document.
         */
        if ((cur[0] == 'n') && (cur[1] == 'a') &&
            (cur[2] == 'm') && (cur[3] == 'e') &&
            (cur[4] == 's') && (cur[5] == 'p') &&
            (cur[6] == 'a') && (cur[7] == 'c') &&
            (cur[8] == 'e')) {
            xmlParseNamespace(&cur, doc);
        } else {
            /*
             * Unknown PI, ignore it ! Use a precision (%.30s) so that at
             * most 30 characters are shown rather than the whole rest of
             * the document.
             */
            fprintf(stderr, "xmlParsePI : skipping unknown PI %.30s\n", cur);
            MOVETO_ENDTAG(cur);
            /* Only step over a real '>', never past the end of data. */
            if (*cur == '>') cur++;
        }
    }
    *p = cur;
}

/*
 * xmlParseAttribute: parse one attribute of a start tag and attach it
 * to node through xmlNewProp().
 *
 * Attribute ::= Name Eq AttValue
 *
 * The parser is lax: blanks around '=' are accepted and so are
 * attributes without a value (the value passed on is then NULL).
 * If *p does not start a name nothing happens and *p is unchanged.
 *
 * NOTE(review): the name and value buffers are not freed here; whether
 * xmlNewProp() takes ownership or copies them needs confirming.
 */

void xmlParseAttribute(CHAR **p, xmlNodePtr node) {
    CHAR *start = *p;
    CHAR *cur;
    CHAR *name;
    CHAR *value = NULL;

    if ((*start != '_') && !IS_LETTER(*start))
        return;

    /* Scan the attribute name. */
    for (cur = start + 1; ; cur++) {
        if (IS_LETTER(*cur) || IS_DIGIT(*cur))
            continue;
        if ((*cur == '.') || (*cur == '-') || (*cur == '_') || (*cur == ':'))
            continue;
        if (IS_COMBINING(*cur) || IS_IGNORABLE(*cur) || IS_EXTENDER(*cur))
            continue;
        break;
    }
    name = xmlStrndup(start, cur - start);

    /* Optional "= 'value'" part. */
    SKIP_BLANKS(cur);
    if (*cur == '=') {
        cur++;
        SKIP_BLANKS(cur);
        if ((*cur == '\'') || (*cur == '"')) {
            value = xmlParseQuotedString(&cur);
        } else {
            fprintf(stderr, "Quotes were expected for attribute value %.20s\n",
                    start);
        }
    }

    /* Add the attribute to the node. */
    if (name != NULL)
        xmlNewProp(node, name, value);

    *p = cur;
}

/*
 * xmlParseStartTag: parse a start of tag.
 *
 * In theory one should just parse a Name, but the namespace support
 * needed for WebDav allows the element name to carry a namespace prefix:
 *
 * QName ::= (NSPart ':')? LocalPart
 * NSPart ::= Name
 * LocalPart ::= Name
 * STag ::= '<' QName (S Attribute)* S? '>'
 *
 * instead of :
 *
 * STag ::= '<' Name (S Attribute)* S? '>'
 *
 * Returns the node built by xmlNewNode() with its attributes parsed, and
 * leaves *p on the closing '>' or "/>" (or at the end of the data).
 * Returns NULL if *p does not point at '<' followed by a name start (*p
 * unchanged), or if a prefix is not followed by a local name (*p moved
 * to the offending character).
 */

xmlNodePtr xmlParseStartTag(CHAR **p, xmlDocPtr doc) {
    CHAR *cur = *p;
    CHAR *mark;             /* start of the name part being scanned */
    CHAR *prefix;
    CHAR *name;
    xmlDtdPtr dtd = NULL;
    xmlNodePtr ret;

    if (*cur != '<')
        return(NULL);
    cur++;

    if ((*cur != '_') && !IS_LETTER(*cur))
        return(NULL);

    /* First name part: ':' is excluded so that a prefix can be split off. */
    mark = cur;
    cur++;
    while (IS_LETTER(*cur) || IS_DIGIT(*cur) ||
           (*cur == '.') || (*cur == '-') || (*cur == '_') ||
           IS_COMBINING(*cur) || IS_IGNORABLE(*cur) ||
           IS_EXTENDER(*cur))
        cur++;

    if (*cur != ':') {
        name = xmlStrndup(mark, cur - mark);
    } else {
        prefix = xmlStrndup(mark, cur - mark);

        cur++; /* skip the colon */
        if ((*cur != '_') && !IS_LETTER(*cur)) {
            fprintf(stderr,
               "Start tag : no element name after namespace identifier %.20s\n",
                    mark);
            free(prefix);
            *p = cur;
            return(NULL);
        }

        /* Local part: further ':' characters are accepted here. */
        mark = cur;
        cur++;
        while (IS_LETTER(*cur) || IS_DIGIT(*cur) ||
               (*cur == '.') || (*cur == '-') || (*cur == '_') ||
               (*cur == ':') ||
               IS_COMBINING(*cur) || IS_IGNORABLE(*cur) ||
               IS_EXTENDER(*cur))
            cur++;
        name = xmlStrndup(mark, cur - mark);

        /* Search the DTD associated to the prefix. */
        dtd = xmlSearchDtd(doc, prefix);
        if (dtd == NULL)
            fprintf(stderr, "Start tag : Couldn't find namespace %s\n", prefix);
        free(prefix);
    }

    ret = xmlNewNode(dtd, name, NULL);

    /*
     * Now parse the attributes: (S Attribute)* S?
     * The loop ends on '>', on "/>" or at the end of the data.
     */
    SKIP_BLANKS(cur);
    while (IS_CHAR(*cur) && (*cur != '>') &&
           !((cur[0] == '/') && (cur[1] == '>'))) {
        if (IS_LETTER(*cur) || (*cur == '_')) {
            xmlParseAttribute(&cur, ret);
        } else {
            /* Unexpected character, silently skipped. We should warn !!! */
            cur++;
        }
        SKIP_BLANKS(cur);
    }

    *p = cur;
    return(ret);
}

/*
 * xmlParseEndTag: parse an end of tag, note that the '</' part has
 * already been read.
 *
 * In theory one should just parse a Name, but the namespace support
 * needed for WebDav allows the element name to carry a namespace prefix:
 *
 * QName ::= (NSPart ':')? LocalPart
 * NSPart ::= Name
 * LocalPart ::= Name
 * ETag ::= '</' QName S? '>'
 *
 * instead of :
 *
 * ETag ::= '</' Name S? '>'
 *
 * Results are passed back through dtdPtr (the DTD found for the prefix,
 * or NULL) and tagPtr (a malloc'ed copy of the local name, or NULL).
 * Both stay NULL if no name could be parsed.
 */

void xmlParseEndTag(CHAR **p, xmlDocPtr doc, xmlDtdPtr *dtdPtr, CHAR **tagPtr) {
    CHAR *cur = *p;
    CHAR *mark;             /* start of the name part being scanned */
    CHAR *prefix;
    CHAR *name;
    xmlDtdPtr dtd = NULL;

    *dtdPtr = NULL;
    *tagPtr = NULL;

    if ((*cur != '_') && !IS_LETTER(*cur))
        return;

    /* First name part: ':' is excluded so that a prefix can be split off. */
    mark = cur;
    cur++;
    while (IS_LETTER(*cur) || IS_DIGIT(*cur) ||
           (*cur == '.') || (*cur == '-') || (*cur == '_') ||
           IS_COMBINING(*cur) || IS_IGNORABLE(*cur) ||
           IS_EXTENDER(*cur))
        cur++;

    if (*cur != ':') {
        name = xmlStrndup(mark, cur - mark);
    } else {
        prefix = xmlStrndup(mark, cur - mark);

        cur++; /* skip the colon */
        if ((*cur != '_') && !IS_LETTER(*cur)) {
            fprintf(stderr,
                "End tag : no element name after namespace identifier %.20s\n",
                    mark);
            free(prefix);
            *p = cur;
            return;
        }

        /* Local part: further ':' characters are accepted here. */
        mark = cur;
        cur++;
        while (IS_LETTER(*cur) || IS_DIGIT(*cur) ||
               (*cur == '.') || (*cur == '-') || (*cur == '_') ||
               (*cur == ':') ||
               IS_COMBINING(*cur) || IS_IGNORABLE(*cur) ||
               IS_EXTENDER(*cur))
            cur++;
        name = xmlStrndup(mark, cur - mark);

        /* Search the DTD associated to the prefix. */
        dtd = xmlSearchDtd(doc, prefix);
        if (dtd == NULL)
            fprintf(stderr, "End tag : Couldn't find namespace %s\n", prefix);
        free(prefix);
    }

    *dtdPtr = dtd;
    *tagPtr = name;

    /*
     * We should definitely be at the ending "S? '>'" part.
     * When the '>' is not there the cursor is deliberately left in place:
     * skipping to the next '>' (MOVETO_ENDTAG) is probably overkill,
     * especially when the '>' is just missing.
     */
    SKIP_BLANKS(cur);
    if (*cur == '>')
        cur++;
    else
        fprintf(stderr, "End tag : expected '>', got %.20s\n", cur);

    *p = cur;
}

/*
 * xmlParseCDSect: escaped pure raw content.
 *
 * On entry *p points just after "<![CDATA[". On success a malloc'ed copy
 * of the text before the closing "]]>" is returned (the terminator is
 * not part of it) and *p is moved just after the "]]>".
 * If the section is not terminated, a message is printed, NULL is
 * returned and *p is left unchanged.
 */
CHAR *xmlParseCDSect(CHAR **p) {
    CHAR *base = *p;
    CHAR *cur = base;
    CHAR *ret;

    /*
     * Look for "]]>". The && chain only looks at cur[1] / cur[2] once the
     * previous character was a ']', so the scan never reads past the
     * terminating zero.
     */
    while (IS_CHAR(*cur)) {
        if ((cur[0] == ']') && (cur[1] == ']') && (cur[2] == '>')) {
            ret = xmlStrndup(base, cur - base);
            *p = cur + 3;
            return(ret);
        }
        cur++;
    }

    fprintf(stderr, "CData section not finished : %.20s\n", base);
    return(NULL);
}

/*
 * xmlParseContent: parse one item of content, a content being
 * (element | PCData | Reference | CDSect | PI | Comment)
 *
 * element : starts by '<'
 * PCData : any CHAR but '&' or '<'
 * Reference : starts by '&' (not expanded yet, handled as text)
 * CDSect : starts by '<![CDATA['
 * PI : starts by '<?'
 * Comment : starts by '<!--'
 *
 * node is the element being filled. Returns a new child node to add to
 * it (a sub-element or a text node), or NULL when nothing has to be
 * added: PI, comment, blank text, error, or text stored directly as the
 * content of a still childless node.
 *
 * Unless the data is exhausted, *p always moves forward, so that a
 * caller looping over the content cannot get stuck.
 */

xmlNodePtr xmlParseContent(CHAR **p, xmlDocPtr doc, xmlNodePtr node) {
    CHAR *cur = *p, *q, *data = NULL;
    xmlNodePtr ret = NULL;

    /*
     * First case : a Processing Instruction.
     */
    if ((cur[0] == '<') && (cur[1] == '?')) {
        xmlParsePI(&cur, doc);
    }
    /*
     * Second case : a comment, skipped up to and including "-->".
     */
    else if ((cur[0] == '<') && (cur[1] == '!') &&
             (cur[2] == '-') && (cur[3] == '-')) {
        cur += 4;
        q = cur;
        while (IS_CHAR(*cur) &&
               !((cur[0] == '-') && (cur[1] == '-') && (cur[2] == '>')))
            cur++;
        if (IS_CHAR(*cur))
            cur += 3;
        else
            fprintf(stderr, "Comment not terminated <!--%.50s\n", q);
    }
    /*
     * Third case : a CDSection
     */
    else if ((cur[0] == '<') && (cur[1] == '!') && (cur[2] == '[') &&
             (cur[3] == 'C') && (cur[4] == 'D') && (cur[5] == 'A') &&
             (cur[6] == 'T') && (cur[7] == 'A') && (cur[8] == '[')) {
        cur += 9;
        data = xmlParseCDSect(&cur);
    }
    /*
     * Fourth case :  a sub-element.
     */
    else if (cur[0] == '<') {
        q = cur;
        ret = xmlParseElement(&cur, doc);
        /*
         * If no element could be parsed and nothing was consumed, step
         * over the stray '<' so the next call sees different input.
         * An end tag "</" is left alone for the caller to handle.
         */
        if ((ret == NULL) && (cur == q) && (cur[1] != '/')) {
            fprintf(stderr, "Unexpected '<' in content : %.50s\n", q);
            cur++;
        }
    }
    /*
     * Last case, text. Note that References are handled directly.
     */
    else {
        q = cur;
        while (IS_CHAR(*cur) && (*cur != '<')) cur++;

        if (!IS_CHAR(*cur)) {
            fprintf(stderr, "Truncated content : %.50s\n", q);
            *p = cur;
            return(NULL);
        }
        data = xmlStrndup(q, cur - q);
        /* Should apply the &...; reduction !!!! */
    }

    /*
     * Handle the data if any. If there is no child
     * add it as content, otherwise create a new node of type text.
     *
     * NOTE(review): data is not freed here; whether xmlNodeSetContent()
     * and xmlNewText() take ownership or copy it needs confirming.
     */
    if (data != NULL)
        data = xmlHandleData(data);
    if (data != NULL) {
        if (node->childs == NULL)
            xmlNodeSetContent(node, data);
        else {
            ret = xmlNewText(data);
        }
    }

    *p = cur;
    return(ret);
}

/*
 * xmlParseElement: parse an XML element
 *
 * On entry *p points at the '<' of the start tag. Returns the node built
 * for the element with its children attached, or NULL if no start tag
 * could be parsed. On errors inside the element (missing '>',
 * premature end of data) the partially built node is returned.
 * *p is moved to where parsing stopped.
 */

xmlNodePtr xmlParseElement(CHAR **p, xmlDocPtr doc) {
    CHAR *cur = *p;
    xmlNodePtr ret, child;
    CHAR *openTag = *p;     /* start tag position, for error messages */
    CHAR *closeTag;         /* end tag position, for error messages */

    ret = xmlParseStartTag(&cur, doc);
    if (ret == NULL) {
        *p = cur;
        return(NULL);
    }

    /*
     * Check for an Empty Element.
     */
    if ((cur[0] == '/') && (cur[1] == '>')) {
        cur += 2;
        *p = cur;
        return(ret);
    }
    if (cur[0] == '>') cur++;
    else {
        fprintf(stderr, "Couldn't find end of Start Tag %.30s\n", *p);
        *p = cur;
        return(ret);
    }

    /*
     * Parse the content of the element:
     * (element | PCData | Reference | CDSect | PI | Comment) *
     *
     * element : starts by '<'
     * PCData : any CHAR but '&' or '<'
     * Reference : starts by '&'
     * CDSect : starts by '<![CDATA['
     * PI : starts by '<?'
     *
     * The loop stops upon detection of an end of tag '</'
     */
    while ((IS_CHAR(cur[0])) && ((cur[0] != '<') || (cur[1] != '/'))) {
        child = xmlParseContent(&cur, doc, ret);
        if (child != NULL)
            xmlAddChild(ret, child);
    }
    if (!IS_CHAR(cur[0])) {
        fprintf(stderr, "Premature end of data in tag %.30s\n", *p);
        *p = cur;
        return(ret);
    }

    /*
     * parse the end of tag : '</' has been detected.
     */
    closeTag = cur;
    cur += 2;
    if (*cur == '>') cur++; /* simplified closing </> */
    else {
        CHAR *endTag = NULL;
        xmlDtdPtr endDtd = NULL;

        xmlParseEndTag(&cur, doc, &endDtd, &endTag);

        /*
         * Check that the Name in the ETag is the same as in the STag.
         */
        if (endDtd != ret->dtd) {
            fprintf(stderr, "Start and End tags don't use the same DTD:\n");
            fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
        }
        if (endTag == NULL) {
            /* xmlParseEndTag() found no name: nothing to compare with. */
            fprintf(stderr, "End tag has no valid name:\n");
            fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
        } else {
            if (strcmp(ret->name, endTag)) {
                fprintf(stderr,
                        "Start and End tags don't use the same name:\n");
                fprintf(stderr, "\t%.30s\n\t%.30s\n", openTag, closeTag);
            }
            /* The name was allocated for us by xmlParseEndTag(). */
            free(endTag);
        }
    }

    *p = cur;
    return(ret);
}

/*
 * xmlParseXMLDecl: parse an XML declaration header
 *
 * On entry *p points at "<?XML". A document is created with xmlNewDoc()
 * using the quoted value following 'version=' if there is one, and
 * XML_DEFAULT_VERSION otherwise. The rest of the declaration is skipped:
 * *p ends up just after the first '>', or at the end of the data if
 * there is none.
 */

xmlDocPtr xmlParseXMLDecl(CHAR **p) {
    CHAR *cur = *p;
    CHAR *version;
    xmlDocPtr ret;

    /*
     * We know that '<?XML' is here.
     */
    cur += 5;

    /*
     * Parse the version info
     */
    SKIP_BLANKS(cur);

    /*
     * We should have 'version=' here ! No blank is accepted around '='.
     */
    if ((cur[0] == 'v') && (cur[1] == 'e') && (cur[2] == 'r') &&
        (cur[3] == 's') && (cur[4] == 'i') && (cur[5] == 'o') &&
        (cur[6] == 'n') && (cur[7] == '=')) {
        cur += 8;
        version = xmlParseQuotedString(&cur);
        if (version == NULL)
            ret = xmlNewDoc(XML_DEFAULT_VERSION);
        else {
            ret = xmlNewDoc(version);
            free(version);
        }
    } else {
        ret = xmlNewDoc(XML_DEFAULT_VERSION);
    }

    /*
     * We should check for encoding !!!!
     */

    /*
     * We should check for Required Markup Declaration !!!!
     */
    MOVETO_ENDTAG(cur);
    /* Only step over a real '>', never past the end of the data. */
    if (*cur == '>') cur++;

    *p = cur;
    return(ret);
}

/*
 * xmlParseMisc: parse an XML Misc optional field.
 * (Comment | PI | S)*
 *
 * Processing instructions go through xmlParsePI(), comments through
 * xmlParserSkipComment() and blanks are skipped. *p is left on the first
 * character that starts none of these.
 */

void xmlParseMisc(CHAR **p, xmlDocPtr doc) {
    CHAR *cur = *p;

    /*
     * A comment needs the full "<!--" prefix: accepting less would enter
     * the loop with input that xmlParserSkipComment() refuses to consume,
     * and the loop would never end.
     */
    while (((cur[0] == '<') && (cur[1] == '?')) ||
           ((cur[0] == '<') && (cur[1] == '!') &&
            (cur[2] == '-') && (cur[3] == '-')) ||
           IS_BLANK(*cur)) {
        if ((cur[0] == '<') && (cur[1] == '?')) {
            xmlParsePI(&cur, doc);
        } else if (IS_BLANK(*cur)) {
            cur++;
        } else
            xmlParserSkipComment(&cur);
    }

    *p = cur;
}

/*
 * xmlParseDoc : parse an XML document and build a tree.
 *
 * cur is the zero-terminated document held in memory. Returns the
 * document, whose root is the result of parsing the first element, or
 * NULL if cur is NULL or no document could be created.
 */

xmlDocPtr xmlParseDoc(CHAR *cur) {
    xmlDocPtr ret;

    if (cur == NULL) return(NULL);

    /*
     * Skip the blanks which may precede the first '<'
     */
    SKIP_BLANKS(cur);

    /*
     * Check for the XMLDecl in the Prolog.
     */
    if ((cur[0] == '<') && (cur[1] == '?') &&
        (cur[2] == 'X') && (cur[3] == 'M') &&
        (cur[4] == 'L')) {
        ret = xmlParseXMLDecl(&cur);
        SKIP_BLANKS(cur);
    } else {
        ret = xmlNewDoc(XML_DEFAULT_VERSION);
    }

    /* Without a document there is nothing to attach the tree to. */
    if (ret == NULL) return(NULL);

    /*
     * The Misc part of the Prolog
     * (Comment | PI | S) *
     */
    xmlParseMisc(&cur, ret);

    /*
     * Time to start parsing
     */
    ret->root = xmlParseElement(&cur, ret);

    return(ret);
}

/*
 * xmlParseFile : parse an XML file and build a tree.
 *
 * The whole file is loaded in memory, zero-terminated and handed to
 * xmlParseDoc(). Returns the resulting document, or NULL if filename is
 * NULL, or if the file cannot be examined, opened or read, or if the
 * buffer cannot be allocated. The buffer and the file descriptor are
 * released on every path.
 */

xmlDocPtr xmlParseFile(const char *filename) {
    xmlDocPtr ret;
    int input;
    int res;
    struct stat buf;
    char *buffer;
    size_t size;
    size_t total = 0;   /* number of bytes actually read so far */

    if (filename == NULL) return(NULL);

    res = stat(filename, &buf);
    if (res < 0) return(NULL);
    if (buf.st_size < 0) return(NULL);
    size = (size_t) buf.st_size;

    buffer = malloc(size + 100);
    if (buffer == NULL) {
        perror("malloc");
        return(NULL);
    }

    /* Clear the whole allocation, not just sizeof(pointer) bytes. */
    memset(buffer, 0, size + 100);
    input = open (filename, O_RDONLY);
    if (input < 0) {
        fprintf (stderr, "Cannot read file %s :\n", filename);
        perror ("open failed");
        free(buffer);
        return(NULL);
    }

    /*
     * read() may return less than requested: keep reading until the
     * expected size is reached or the end of the file is hit.
     */
    while (total < size) {
        res = read(input, buffer + total, size - total);
        if (res < 0) {
            fprintf (stderr, "Cannot read file %s :\n", filename);
            perror ("read failed");
            close(input);
            free(buffer);
            return(NULL);
        }
        if (res == 0) break;
        total += (size_t) res;
    }
    close(input);

    /* Terminate right after the data actually read. */
    buffer[total] = '\0';
    ret = xmlParseDoc(buffer);
    free(buffer);
    return(ret);
}

Webmaster