File:  [Public] / XML / parser.h
Revision 1.72: download - view: text, annotated - select for diffs
Tue Mar 14 16:46:30 2000 UTC (24 years, 2 months ago) by daniel
Branches: MAIN
CVS tags: HEAD
Fixing more entities stuff, daniel.

/*
 * parser.h : Interfaces, constants and types related to the XML parser.
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */

#ifndef __XML_PARSER_H__
#define __XML_PARSER_H__

#include "tree.h"
#include "valid.h"
#include "xmlIO.h"
#include "entities.h"


#ifdef __cplusplus
extern "C" {
#endif

/*
 * Constants.
 */
#define XML_DEFAULT_VERSION	"1.0"

/**
 * an xmlParserInput is an input flow for the XML processor.
 * Each entity parsed is associated an xmlParserInput (except the
 * few predefined ones). This is the case both for internal entities
 * - in which case the flow is already completely in memory - or
 * external entities - in which case we use the buf structure for
 * progressive reading and I18N conversions to the internal UTF-8 format.
 */

typedef void (* xmlParserInputDeallocate)(xmlChar *);
typedef struct _xmlParserInput xmlParserInput;
typedef xmlParserInput *xmlParserInputPtr;
struct _xmlParserInput {
    /* Input buffer */
    xmlParserInputBufferPtr buf;      /* UTF-8 encoded buffer */

    const char *filename;             /* The file analyzed, if any */
    const char *directory;            /* the directory/base of teh file */
    const xmlChar *base;              /* Base of the array to parse */
    const xmlChar *cur;               /* Current char being parsed */
    int length;                       /* length if known */
    int line;                         /* Current line */
    int col;                          /* Current column */
    int consumed;                     /* How many xmlChars already consumed */
    xmlParserInputDeallocate free;    /* function to deallocate the base */
    const xmlChar *encoding;          /* the encoding string for entity */
    const xmlChar *version;           /* the version string for entity */
    int standalone;                   /* Was that entity marked standalone */
};

/**
 * the parser can be asked to collect Node informations, i.e. at what
 * place in the file they were detected. 
 * NOTE: This is off by default and not very well tested.
 */
typedef struct _xmlParserNodeInfo xmlParserNodeInfo;
typedef xmlParserNodeInfo *xmlParserNodeInfoPtr;

struct _xmlParserNodeInfo {
  const struct _xmlNode* node;
  /* Position & line # that text that created the node begins & ends on */
  unsigned long begin_pos;
  unsigned long begin_line;
  unsigned long end_pos;
  unsigned long end_line;
};

typedef struct _xmlParserNodeInfoSeq xmlParserNodeInfoSeq;
typedef xmlParserNodeInfoSeq *xmlParserNodeInfoSeqPtr;
struct _xmlParserNodeInfoSeq {
  unsigned long maximum;
  unsigned long length;
  xmlParserNodeInfo* buffer;
};

/**
 * The parser is now working also as a state based parser
 * The recursive one use the stagte info for entities processing
 */
typedef enum {
    XML_PARSER_EOF = -1,	/* nothing is to be parsed */
    XML_PARSER_START = 0,	/* nothing has been parsed */
    XML_PARSER_MISC,		/* Misc* before int subset */
    XML_PARSER_PI,		/* Whithin a processing instruction */
    XML_PARSER_DTD,		/* within some DTD content */
    XML_PARSER_PROLOG,		/* Misc* after internal subset */
    XML_PARSER_COMMENT,		/* within a comment */
    XML_PARSER_START_TAG,	/* within a start tag */
    XML_PARSER_CONTENT,		/* within the content */
    XML_PARSER_CDATA_SECTION,	/* within a CDATA section */
    XML_PARSER_END_TAG,		/* within a closing tag */
    XML_PARSER_ENTITY_DECL,	/* within an entity declaration */
    XML_PARSER_ENTITY_VALUE,	/* within an entity value in a decl */
    XML_PARSER_ATTRIBUTE_VALUE,	/* within an attribute value */
    XML_PARSER_SYSTEM_LITERAL,	/* within a SYSTEM value */
    XML_PARSER_EPILOG 		/* the Misc* after the last end tag */
} xmlParserInputState;

/**
 * The parser context.
 * NOTE This doesn't completely defines the parser state, the (current ?)
 *      design of the parser uses recursive function calls since this allow
 *      and easy mapping from the production rules of the specification
 *      to the actual code. The drawback is that the actual function call
 *      also reflect the parser state. However most of the parsing routines
 *      takes as the only argument the parser context pointer, so migrating
 *      to a state based parser for progressive parsing shouldn't be too hard.
 */
typedef struct _xmlParserCtxt xmlParserCtxt;
typedef xmlParserCtxt *xmlParserCtxtPtr;
struct _xmlParserCtxt {
    struct _xmlSAXHandler *sax;       /* The SAX handler */
    void            *userData;        /* the document being built */
    xmlDocPtr           myDoc;        /* the document being built */
    int            wellFormed;        /* is the document well formed */
    int       replaceEntities;        /* shall we replace entities ? */
    const xmlChar       *version;        /* the XML version string */
    const xmlChar      *encoding;        /* encoding, if any */
    int            standalone;        /* standalone document */
    int                  html;        /* are we parsing an HTML document */

    /* Input stream stack */
    xmlParserInputPtr  input;         /* Current input stream */
    int                inputNr;       /* Number of current input streams */
    int                inputMax;      /* Max number of input streams */
    xmlParserInputPtr *inputTab;      /* stack of inputs */

    /* Node analysis stack only used for DOM building */
    xmlNodePtr         node;          /* Current parsed Node */
    int                nodeNr;        /* Depth of the parsing stack */
    int                nodeMax;       /* Max depth of the parsing stack */
    xmlNodePtr        *nodeTab;       /* array of nodes */

    int record_info;                  /* Whether node info should be kept */
    xmlParserNodeInfoSeq node_seq;    /* info about each node parsed */

    int errNo;                        /* error code */

    int     hasExternalSubset;        /* reference and external subset */
    int             hasPErefs;        /* the internal subset has PE refs */
    int              external;        /* are we parsing an external entity */

    int                 valid;        /* is the document valid */
    int              validate;        /* shall we try to validate ? */
    xmlValidCtxt        vctxt;        /* The validity context */

    xmlParserInputState instate;      /* current type of input */
    int                 token;        /* next char look-ahead */    

    char           *directory;        /* the data directory */

    /* Node name stack */
    xmlChar           *name;          /* Current parsed Node */
    int                nameNr;        /* Depth of the parsing stack */
    int                nameMax;       /* Max depth of the parsing stack */
    xmlChar *         *nameTab;       /* array of nodes */

    long               nbChars;       /* number of xmlChar processed */
    long            checkIndex;       /* used by progressive parsing lookup */
    int             keepBlanks;       /* ugly but ... */
    int             disableSAX;       /* SAX callbacks are disabled */
    int               inSubset;       /* Parsing is in int 1/ext 2 subset */
    xmlChar *          intSubName;    /* name of subset */
    xmlChar *          extSubURI;     /* URI of external subset */
    xmlChar *          extSubSystem;  /* SYSTEM ID of external subset */

    /* xml:space values */
    int *              space;         /* Should the parser preserve spaces */
    int                spaceNr;       /* Depth of the parsing stack */
    int                spaceMax;      /* Max depth of the parsing stack */
    int *              spaceTab;      /* array of space infos */

    int                depth;         /* to prevent entity substitution loops */
    xmlParserInputPtr  entity;      /* used to check entities boundaries */
};

/**
 * a SAX Locator.
 */
typedef struct _xmlSAXLocator xmlSAXLocator;
typedef xmlSAXLocator *xmlSAXLocatorPtr;
struct _xmlSAXLocator {
    const xmlChar *(*getPublicId)(void *ctx);
    const xmlChar *(*getSystemId)(void *ctx);
    int (*getLineNumber)(void *ctx);
    int (*getColumnNumber)(void *ctx);
};

/**
 * a SAX handler is bunch of callbacks called by the parser when processing
 * of the input generate data or structure informations.
 */

typedef xmlParserInputPtr (*resolveEntitySAXFunc) (void *ctx,
			    const xmlChar *publicId, const xmlChar *systemId);
typedef void (*internalSubsetSAXFunc) (void *ctx, const xmlChar *name,
                            const xmlChar *ExternalID, const xmlChar *SystemID);
typedef void (*externalSubsetSAXFunc) (void *ctx, const xmlChar *name,
                            const xmlChar *ExternalID, const xmlChar *SystemID);
typedef xmlEntityPtr (*getEntitySAXFunc) (void *ctx,
                            const xmlChar *name);
typedef xmlEntityPtr (*getParameterEntitySAXFunc) (void *ctx,
                            const xmlChar *name);
typedef void (*entityDeclSAXFunc) (void *ctx,
                            const xmlChar *name, int type, const xmlChar *publicId,
			    const xmlChar *systemId, xmlChar *content);
typedef void (*notationDeclSAXFunc)(void *ctx, const xmlChar *name,
			    const xmlChar *publicId, const xmlChar *systemId);
typedef void (*attributeDeclSAXFunc)(void *ctx, const xmlChar *elem,
                            const xmlChar *name, int type, int def,
			    const xmlChar *defaultValue, xmlEnumerationPtr tree);
typedef void (*elementDeclSAXFunc)(void *ctx, const xmlChar *name,
			    int type, xmlElementContentPtr content);
typedef void (*unparsedEntityDeclSAXFunc)(void *ctx,
                            const xmlChar *name, const xmlChar *publicId,
			    const xmlChar *systemId, const xmlChar *notationName);
typedef void (*setDocumentLocatorSAXFunc) (void *ctx,
                            xmlSAXLocatorPtr loc);
typedef void (*startDocumentSAXFunc) (void *ctx);
typedef void (*endDocumentSAXFunc) (void *ctx);
typedef void (*startElementSAXFunc) (void *ctx, const xmlChar *name,
                            const xmlChar **atts);
typedef void (*endElementSAXFunc) (void *ctx, const xmlChar *name);
typedef void (*attributeSAXFunc) (void *ctx, const xmlChar *name,
                                  const xmlChar *value);
typedef void (*referenceSAXFunc) (void *ctx, const xmlChar *name);
typedef void (*charactersSAXFunc) (void *ctx, const xmlChar *ch,
		            int len);
typedef void (*ignorableWhitespaceSAXFunc) (void *ctx,
			    const xmlChar *ch, int len);
typedef void (*processingInstructionSAXFunc) (void *ctx,
                            const xmlChar *target, const xmlChar *data);
typedef void (*commentSAXFunc) (void *ctx, const xmlChar *value);
typedef void (*cdataBlockSAXFunc) (void *ctx, const xmlChar *value, int len);
typedef void (*warningSAXFunc) (void *ctx, const char *msg, ...);
typedef void (*errorSAXFunc) (void *ctx, const char *msg, ...);
typedef void (*fatalErrorSAXFunc) (void *ctx, const char *msg, ...);
typedef int (*isStandaloneSAXFunc) (void *ctx);
typedef int (*hasInternalSubsetSAXFunc) (void *ctx);
typedef int (*hasExternalSubsetSAXFunc) (void *ctx);

typedef struct _xmlSAXHandler xmlSAXHandler;
typedef xmlSAXHandler *xmlSAXHandlerPtr;
struct _xmlSAXHandler {
    internalSubsetSAXFunc internalSubset;
    isStandaloneSAXFunc isStandalone;
    hasInternalSubsetSAXFunc hasInternalSubset;
    hasExternalSubsetSAXFunc hasExternalSubset;
    resolveEntitySAXFunc resolveEntity;
    getEntitySAXFunc getEntity;
    entityDeclSAXFunc entityDecl;
    notationDeclSAXFunc notationDecl;
    attributeDeclSAXFunc attributeDecl;
    elementDeclSAXFunc elementDecl;
    unparsedEntityDeclSAXFunc unparsedEntityDecl;
    setDocumentLocatorSAXFunc setDocumentLocator;
    startDocumentSAXFunc startDocument;
    endDocumentSAXFunc endDocument;
    startElementSAXFunc startElement;
    endElementSAXFunc endElement;
    referenceSAXFunc reference;
    charactersSAXFunc characters;
    ignorableWhitespaceSAXFunc ignorableWhitespace;
    processingInstructionSAXFunc processingInstruction;
    commentSAXFunc comment;
    warningSAXFunc warning;
    errorSAXFunc error;
    fatalErrorSAXFunc fatalError;
    getParameterEntitySAXFunc getParameterEntity;
    cdataBlockSAXFunc cdataBlock;
    externalSubsetSAXFunc externalSubset;
};

/**
 * External entity loaders types
 */
typedef xmlParserInputPtr (*xmlExternalEntityLoader)(const char *URL,
						     const char *ID,
						     xmlParserCtxtPtr context);

/**
 * Global variables: just the default SAX interface tables and XML
 * version infos.
 */
extern const char *xmlParserVersion;

extern xmlSAXLocator xmlDefaultSAXLocator;
extern xmlSAXHandler xmlDefaultSAXHandler;
extern xmlSAXHandler htmlDefaultSAXHandler;

/**
 * entity substitution default behaviour.
 */

extern int xmlSubstituteEntitiesDefaultValue;
extern int xmlGetWarningsDefaultValue;


/**
 * Cleanup
 */
void		xmlCleanupParser	(void);

/**
 * Input functions
 */
int		xmlParserInputRead	(xmlParserInputPtr in,
					 int len);
int		xmlParserInputGrow	(xmlParserInputPtr in,
					 int len);

/**
 * xmlChar handling
 */
xmlChar *	xmlStrdup		(const xmlChar *cur);
xmlChar *	xmlStrndup		(const xmlChar *cur,
					 int len);
xmlChar *	xmlStrsub		(const xmlChar *str,
					 int start,
					 int len);
const xmlChar *	xmlStrchr		(const xmlChar *str,
					 xmlChar val);
const xmlChar *	xmlStrstr		(const xmlChar *str,
					 xmlChar *val);
int		xmlStrcmp		(const xmlChar *str1,
					 const xmlChar *str2);
int		xmlStrncmp		(const xmlChar *str1,
					 const xmlChar *str2,
					 int len);
int		xmlStrlen		(const xmlChar *str);
xmlChar *	xmlStrcat		(xmlChar *cur,
					 const xmlChar *add);
xmlChar *	xmlStrncat		(xmlChar *cur,
					 const xmlChar *add,
					 int len);

/**
 * Basic parsing Interfaces
 */
xmlDocPtr	xmlParseDoc		(xmlChar *cur);
xmlDocPtr	xmlParseMemory		(char *buffer,
					 int size);
xmlDocPtr	xmlParseFile		(const char *filename);
int		xmlSubstituteEntitiesDefault(int val);
int		xmlKeepBlanksDefault	(int val);

/**
 * Recovery mode 
 */
xmlDocPtr	xmlRecoverDoc		(xmlChar *cur);
xmlDocPtr	xmlRecoverMemory	(char *buffer,
					 int size);
xmlDocPtr	xmlRecoverFile		(const char *filename);

/**
 * Less common routines and SAX interfaces
 */
int		xmlParseDocument	(xmlParserCtxtPtr ctxt);
xmlDocPtr	xmlSAXParseDoc		(xmlSAXHandlerPtr sax,
					 xmlChar *cur,
					 int recovery);
int		xmlSAXUserParseFile	(xmlSAXHandlerPtr sax,
					 void *user_data,
					 const char *filename);
int		xmlSAXUserParseMemory	(xmlSAXHandlerPtr sax,
					 void *user_data,
					 char *buffer,
					 int size);
xmlDocPtr	xmlSAXParseMemory	(xmlSAXHandlerPtr sax,
					 char *buffer,
                                   	 int size,
					 int recovery);
xmlDocPtr	xmlSAXParseFile		(xmlSAXHandlerPtr sax,
					 const char *filename,
					 int recovery);
xmlDtdPtr	xmlParseDTD		(const xmlChar *ExternalID,
					 const xmlChar *SystemID);
xmlDtdPtr	xmlSAXParseDTD		(xmlSAXHandlerPtr sax,
					 const xmlChar *ExternalID,
					 const xmlChar *SystemID);
int		xmlParseBalancedChunkMemory(xmlDocPtr doc,
					 xmlSAXHandlerPtr sax,
					 void *user_data,
					 int depth,
					 const xmlChar *string,
					 xmlNodePtr *list);
int		xmlParseExternalEntity	(xmlDocPtr doc,
					 xmlSAXHandlerPtr sax,
					 void *user_data,
					 int depth,
					 const xmlChar *URL,
					 const xmlChar *ID,
					 xmlNodePtr *list);

/**
 * SAX initialization routines
 */
void		xmlDefaultSAXHandlerInit(void);
void		htmlDefaultSAXHandlerInit(void);

/**
 * Parser contexts handling.
 */
void		xmlInitParserCtxt	(xmlParserCtxtPtr ctxt);
void		xmlClearParserCtxt	(xmlParserCtxtPtr ctxt);
void		xmlFreeParserCtxt	(xmlParserCtxtPtr ctxt);
void		xmlSetupParserForBuffer	(xmlParserCtxtPtr ctxt,
					 const xmlChar* buffer,
					 const char* filename);
xmlParserCtxtPtr xmlCreateDocParserCtxt	(xmlChar *cur);

/**
 * Interfaces for the Push mode
 */
xmlParserCtxtPtr xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax,
					 void *user_data,
					 const char *chunk,
					 int size,
					 const char *filename);
int		 xmlParseChunk		(xmlParserCtxtPtr ctxt,
					 const char *chunk,
					 int size,
					 int terminate);

/**
 * Node infos
 */
const xmlParserNodeInfo*
		xmlParserFindNodeInfo	(const xmlParserCtxt* ctxt,
                                               const xmlNode* node);
void		xmlInitNodeInfoSeq	(xmlParserNodeInfoSeqPtr seq);
void		xmlClearNodeInfoSeq	(xmlParserNodeInfoSeqPtr seq);
unsigned long xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeq* seq,
                                         const xmlNode* node);
void		xmlParserAddNodeInfo	(xmlParserCtxtPtr ctxt,
					 const xmlParserNodeInfo* info);

/*
 * External entities handling actually implemented in xmlIO
 */

void		xmlSetExternalEntityLoader(xmlExternalEntityLoader f);
xmlExternalEntityLoader
		xmlGetExternalEntityLoader(void);
xmlParserInputPtr
		xmlLoadExternalEntity	(const char *URL,
					 const char *ID,
					 xmlParserCtxtPtr context);

#ifdef __cplusplus
}
#endif

#endif /* __XML_PARSER_H__ */


Webmaster