version 1.30, 1999/12/28 15:31:13
|
version 1.31, 1999/12/29 12:47:07
|
Line 41
|
Line 41
|
#include "valid.h" |
#include "valid.h" |
#include "parserInternals.h" |
#include "parserInternals.h" |
#include "xmlIO.h" |
#include "xmlIO.h" |
|
#include "xml-error.h" |
|
|
#define HTML_MAX_NAMELEN 1000 |
#define HTML_MAX_NAMELEN 1000 |
#define INPUT_CHUNK 50 |
#define INPUT_CHUNK 50 |
|
#define HTML_PARSER_BIG_BUFFER_SIZE 1024 |
|
#define HTML_PARSER_BUFFER_SIZE 100 |
|
|
/* #define DEBUG */ |
/* #define DEBUG */ |
|
/* #define DEBUG_PUSH */ |
|
|
/************************************************************************ |
/************************************************************************ |
* * |
* * |
Line 145 PUSH_AND_POP(extern, xmlChar*, name)
|
Line 149 PUSH_AND_POP(extern, xmlChar*, name)
|
xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \ |
xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \ |
}} |
}} |
|
|
/**************************************** |
|
#define NEXT ((*ctxt->input->cur) ? \ |
|
(((*(ctxt->input->cur) == '\n') ? \ |
|
(ctxt->input->line++, ctxt->input->col = 1) : \ |
|
(ctxt->input->col++)), \ |
|
(ctxt->input->cur++), \ |
|
((*ctxt->input->cur) ? \ |
|
(xmlParserInputGrow(ctxt->input, 100), \ |
|
ctxt->input->cur): \ |
|
(ctxt->input->cur))) : \ |
|
((xmlParserInputGrow(ctxt->input, 100) > 0) ? \ |
|
ctxt->input->cur: \ |
|
(xmlPopInput(ctxt), ctxt->input->cur))) |
|
****************************************/ |
|
#else |
#else |
#endif |
#endif |
|
|
Line 926 htmlDecodeEntities(htmlParserCtxtPtr ctx
|
Line 916 htmlDecodeEntities(htmlParserCtxtPtr ctx
|
/* |
/* |
* allocate a translation buffer. |
* allocate a translation buffer. |
*/ |
*/ |
buffer_size = 1000; |
buffer_size = HTML_PARSER_BIG_BUFFER_SIZE; |
buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); |
buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); |
if (buffer == NULL) { |
if (buffer == NULL) { |
perror("htmlDecodeEntities: malloc failed"); |
perror("htmlDecodeEntities: malloc failed"); |
Line 1128 htmlSwitchEncoding(htmlParserCtxtPtr ctx
|
Line 1118 htmlSwitchEncoding(htmlParserCtxtPtr ctx
|
} |
} |
} |
} |
|
|
|
/************************************************************************ |
|
* * |
|
* Commodity functions to handle streams * |
|
* * |
|
************************************************************************/ |
|
|
|
/** |
|
* htmlFreeInputStream: |
|
* @input: an htmlParserInputPtr |
|
* |
|
* Free up an input stream. |
|
*/ |
|
void |
|
htmlFreeInputStream(htmlParserInputPtr input) { |
|
if (input == NULL) return; |
|
|
|
if (input->filename != NULL) xmlFree((char *) input->filename); |
|
if (input->directory != NULL) xmlFree((char *) input->directory); |
|
if ((input->free != NULL) && (input->base != NULL)) |
|
input->free((xmlChar *) input->base); |
|
if (input->buf != NULL) |
|
xmlFreeParserInputBuffer(input->buf); |
|
memset(input, -1, sizeof(htmlParserInput)); |
|
xmlFree(input); |
|
} |
|
|
|
/** |
|
* htmlNewInputStream: |
|
* @ctxt: an HTML parser context |
|
* |
|
* Create a new input stream structure |
|
* Returns the new input stream or NULL |
|
*/ |
|
htmlParserInputPtr |
|
htmlNewInputStream(htmlParserCtxtPtr ctxt) { |
|
htmlParserInputPtr input; |
|
|
|
input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); |
|
if (input == NULL) { |
|
ctxt->errNo = XML_ERR_NO_MEMORY; |
|
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
|
ctxt->sax->error(ctxt->userData, |
|
"malloc: couldn't allocate a new input stream\n"); |
|
ctxt->errNo = XML_ERR_NO_MEMORY; |
|
return(NULL); |
|
} |
|
input->filename = NULL; |
|
input->directory = NULL; |
|
input->base = NULL; |
|
input->cur = NULL; |
|
input->buf = NULL; |
|
input->line = 1; |
|
input->col = 1; |
|
input->buf = NULL; |
|
input->free = NULL; |
|
input->consumed = 0; |
|
input->length = 0; |
|
return(input); |
|
} |
|
|
|
|
/************************************************************************ |
/************************************************************************ |
* * |
* * |
Line 1268 xmlChar *
|
Line 1318 xmlChar *
|
htmlParseHTMLName(htmlParserCtxtPtr ctxt) { |
htmlParseHTMLName(htmlParserCtxtPtr ctxt) { |
xmlChar *ret = NULL; |
xmlChar *ret = NULL; |
int i = 0; |
int i = 0; |
xmlChar loc[100]; |
xmlChar loc[HTML_PARSER_BUFFER_SIZE]; |
|
|
if (!IS_LETTER(CUR) && (CUR != '_') && |
if (!IS_LETTER(CUR) && (CUR != '_') && |
(CUR != ':')) return(NULL); |
(CUR != ':')) return(NULL); |
|
|
while ((i < 100) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) { |
while ((i < HTML_PARSER_BUFFER_SIZE) && |
|
((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) { |
if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; |
if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; |
else loc[i] = CUR; |
else loc[i] = CUR; |
i++; |
i++; |
Line 1615 void
|
Line 1666 void
|
htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) { |
htmlParseCharData(htmlParserCtxtPtr ctxt, int cdata) { |
xmlChar *buf = NULL; |
xmlChar *buf = NULL; |
int len = 0; |
int len = 0; |
int size = 100; |
int size = HTML_PARSER_BUFFER_SIZE; |
xmlChar q; |
xmlChar q; |
|
|
buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar)); |
buf = (xmlChar *) xmlMalloc(size * sizeof(xmlChar)); |
Line 1742 htmlParseExternalID(htmlParserCtxtPtr ct
|
Line 1793 htmlParseExternalID(htmlParserCtxtPtr ct
|
/** |
/** |
* htmlParseComment: |
* htmlParseComment: |
* @ctxt: an HTML parser context |
* @ctxt: an HTML parser context |
* @create: should we create a node, or just skip the content |
|
* |
* |
* Parse an XML (SGML) comment <!-- .... --> |
* Parse an XML (SGML) comment <!-- .... --> |
* |
* |
* [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' |
* [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' |
*/ |
*/ |
void |
void |
htmlParseComment(htmlParserCtxtPtr ctxt, int create) { |
htmlParseComment(htmlParserCtxtPtr ctxt) { |
xmlChar *buf = NULL; |
xmlChar *buf = NULL; |
int len = 0; |
int len = 0; |
int size = 100; |
int size = HTML_PARSER_BUFFER_SIZE; |
register xmlChar s, r, q; |
register xmlChar s, r, q; |
|
|
/* |
/* |
Line 1793 htmlParseComment(htmlParserCtxtPtr ctxt,
|
Line 1843 htmlParseComment(htmlParserCtxtPtr ctxt,
|
ctxt->wellFormed = 0; |
ctxt->wellFormed = 0; |
} else { |
} else { |
NEXT; |
NEXT; |
if (create) { |
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) { |
if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL)) { |
ctxt->sax->comment(ctxt->userData, buf); |
ctxt->sax->comment(ctxt->userData, buf); |
|
} |
|
} |
} |
} |
} |
xmlFree(buf); |
xmlFree(buf); |
Line 1935 htmlParseDocTypeDecl(htmlParserCtxtPtr c
|
Line 1983 htmlParseDocTypeDecl(htmlParserCtxtPtr c
|
/* |
/* |
* Create the document accordingly to the DOCTYPE |
* Create the document accordingly to the DOCTYPE |
*/ |
*/ |
|
if (ctxt->myDoc != NULL) |
|
xmlFreeDoc(ctxt->myDoc); |
|
|
ctxt->myDoc = htmlNewDoc(URI, ExternalID); |
ctxt->myDoc = htmlNewDoc(URI, ExternalID); |
|
|
/* |
/* |
Line 1968 htmlParseDocTypeDecl(htmlParserCtxtPtr c
|
Line 2019 htmlParseDocTypeDecl(htmlParserCtxtPtr c
|
|
|
xmlChar * |
xmlChar * |
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { |
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { |
xmlChar *name, *val; |
xmlChar *name, *val = NULL; |
|
|
*value = NULL; |
*value = NULL; |
name = htmlParseName(ctxt); |
name = htmlParseName(ctxt); |
Line 1990 htmlParseAttribute(htmlParserCtxtPtr ctx
|
Line 2041 htmlParseAttribute(htmlParserCtxtPtr ctx
|
} else { |
} else { |
/* TODO : some attribute must have values, some may not */ |
/* TODO : some attribute must have values, some may not */ |
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
ctxt->sax->error(ctxt->userData, |
ctxt->sax->warning(ctxt->userData, |
"Specification mandate value for attribute %s\n", name); |
"No value for attribute %s\n", name); |
ctxt->wellFormed = 0; |
|
return(NULL); |
|
} |
} |
|
|
*value = val; |
*value = val; |
Line 2060 htmlParseStartTag(htmlParserCtxtPtr ctxt
|
Line 2109 htmlParseStartTag(htmlParserCtxtPtr ctxt
|
|
|
GROW; |
GROW; |
attname = htmlParseAttribute(ctxt, &attvalue); |
attname = htmlParseAttribute(ctxt, &attvalue); |
if ((attname != NULL) && (attvalue != NULL)) { |
if (attname != NULL) { |
/* |
/* |
* Well formedness requires at most one declaration of an attribute |
* Well formedness requires at most one declaration of an attribute |
*/ |
*/ |
Line 2072 htmlParseStartTag(htmlParserCtxtPtr ctxt
|
Line 2121 htmlParseStartTag(htmlParserCtxtPtr ctxt
|
attname); |
attname); |
ctxt->wellFormed = 0; |
ctxt->wellFormed = 0; |
xmlFree(attname); |
xmlFree(attname); |
xmlFree(attvalue); |
if (attvalue != NULL) |
|
xmlFree(attvalue); |
goto failed; |
goto failed; |
} |
} |
} |
} |
Line 2127 failed:
|
Line 2177 failed:
|
ctxt->sax->startElement(ctxt->userData, name, atts); |
ctxt->sax->startElement(ctxt->userData, name, atts); |
|
|
if (atts != NULL) { |
if (atts != NULL) { |
for (i = 0;i < nbatts;i++) xmlFree((xmlChar *) atts[i]); |
for (i = 0;i < nbatts;i++) { |
|
if (atts[i] != NULL) |
|
xmlFree((xmlChar *) atts[i]); |
|
} |
xmlFree(atts); |
xmlFree(atts); |
} |
} |
if (name != NULL) xmlFree(name); |
if (name != NULL) xmlFree(name); |
Line 2330 htmlParseContent(htmlParserCtxtPtr ctxt)
|
Line 2383 htmlParseContent(htmlParserCtxtPtr ctxt)
|
*/ |
*/ |
if ((CUR == '<') && (NXT(1) == '!') && |
if ((CUR == '<') && (NXT(1) == '!') && |
(NXT(2) == '-') && (NXT(3) == '-')) { |
(NXT(2) == '-') && (NXT(3) == '-')) { |
htmlParseComment(ctxt, 1); |
htmlParseComment(ctxt); |
} |
} |
|
|
/* |
/* |
Line 2384 htmlParseContent(htmlParserCtxtPtr ctxt)
|
Line 2437 htmlParseContent(htmlParserCtxtPtr ctxt)
|
void |
void |
htmlParseElement(htmlParserCtxtPtr ctxt) { |
htmlParseElement(htmlParserCtxtPtr ctxt) { |
const xmlChar *openTag = CUR_PTR; |
const xmlChar *openTag = CUR_PTR; |
xmlChar *oldname; |
|
xmlChar *name; |
xmlChar *name; |
xmlChar *currentNode = NULL; |
xmlChar *currentNode = NULL; |
htmlElemDescPtr info; |
htmlElemDescPtr info; |
htmlParserNodeInfo node_info; |
htmlParserNodeInfo node_info; |
|
xmlChar *oldname; |
int depth = ctxt->nameNr; |
int depth = ctxt->nameNr; |
|
|
/* Capture start position */ |
/* Capture start position */ |
Line 2585 htmlParseDocument(htmlParserCtxtPtr ctxt
|
Line 2638 htmlParseDocument(htmlParserCtxtPtr ctxt
|
*/ |
*/ |
while ((CUR == '<') && (NXT(1) == '!') && |
while ((CUR == '<') && (NXT(1) == '!') && |
(NXT(2) == '-') && (NXT(3) == '-')) { |
(NXT(2) == '-') && (NXT(3) == '-')) { |
ctxt->myDoc = htmlNewDoc(NULL, NULL); |
if (ctxt->myDoc == NULL) |
htmlParseComment(ctxt, 1); |
ctxt->myDoc = htmlNewDoc(NULL, NULL); |
|
htmlParseComment(ctxt); |
SKIP_BLANKS; |
SKIP_BLANKS; |
} |
} |
|
|
Line 2721 htmlFreeParserCtxt(htmlParserCtxtPtr ctx
|
Line 2775 htmlFreeParserCtxt(htmlParserCtxtPtr ctx
|
xmlFree(oldname); |
xmlFree(oldname); |
} |
} |
if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab); |
if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab); |
|
if (ctxt->directory != NULL) xmlFree(ctxt->directory); |
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab); |
if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab); |
if (ctxt->version != NULL) xmlFree((char *) ctxt->version); |
if (ctxt->version != NULL) xmlFree((char *) ctxt->version); |
if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler)) |
if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler)) |
Line 2766 htmlCreateDocParserCtxt(xmlChar *cur, co
|
Line 2821 htmlCreateDocParserCtxt(xmlChar *cur, co
|
return(ctxt); |
return(ctxt); |
} |
} |
|
|
/******************************************************************************** |
/************************************************************************ |
* * |
* * |
* User entry points * |
* Progressive parsing interfaces * |
* * |
* * |
********************************************************************************/ |
************************************************************************/ |
|
|
|
/** |
|
* htmlParseLookupSequence: |
|
* @ctxt: an HTML parser context |
|
* @first: the first char to lookup |
|
* @next: the next char to lookup or zero |
|
* @third: the next char to lookup or zero |
|
* |
|
* Try to find if a sequence (first, next, third) or just (first next) or |
|
* (first) is available in the input stream. |
|
* This function has a side effect of (possibly) incrementing ctxt->checkIndex |
|
* to avoid rescanning sequences of bytes, it DOES change the state of the |
|
* parser, do not use liberally. |
|
* This is basically similar to xmlParseLookupSequence() |
|
* |
|
* Returns the index to the current parsing point if the full sequence |
|
* is available, -1 otherwise. |
|
*/ |
|
int |
|
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, |
|
xmlChar next, xmlChar third) { |
|
int base, len; |
|
htmlParserInputPtr in; |
|
const xmlChar *buf; |
|
|
|
in = ctxt->input; |
|
if (in == NULL) return(-1); |
|
base = in->cur - in->base; |
|
if (base < 0) return(-1); |
|
if (ctxt->checkIndex > base) |
|
base = ctxt->checkIndex; |
|
if (in->buf == NULL) { |
|
buf = in->base; |
|
len = in->length; |
|
} else { |
|
buf = in->buf->buffer->content; |
|
len = in->buf->buffer->use; |
|
} |
|
/* take into account the sequence length */ |
|
if (third) len -= 2; |
|
else if (next) len --; |
|
for (;base < len;base++) { |
|
if (buf[base] == first) { |
|
if (third != 0) { |
|
if ((buf[base + 1] != next) || |
|
(buf[base + 2] != third)) continue; |
|
} else if (next != 0) { |
|
if (buf[base + 1] != next) continue; |
|
} |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
if (next == 0) |
|
fprintf(stderr, "HPP: lookup '%c' found at %d\n", |
|
first, base); |
|
else if (third == 0) |
|
fprintf(stderr, "HPP: lookup '%c%c' found at %d\n", |
|
first, next, base); |
|
else |
|
fprintf(stderr, "HPP: lookup '%c%c%c' found at %d\n", |
|
first, next, third, base); |
|
#endif |
|
return(base - (in->cur - in->base)); |
|
} |
|
} |
|
ctxt->checkIndex = base; |
|
#ifdef DEBUG_PUSH |
|
if (next == 0) |
|
fprintf(stderr, "HPP: lookup '%c' failed\n", first); |
|
else if (third == 0) |
|
fprintf(stderr, "HPP: lookup '%c%c' failed\n", first, next); |
|
else |
|
fprintf(stderr, "HPP: lookup '%c%c%c' failed\n", first, next, third); |
|
#endif |
|
return(-1); |
|
} |
|
|
|
/** |
|
* htmlParseTry: |
|
* @ctxt: an HTML parser context |
|
* |
|
* Try to progress on parsing |
|
* |
|
* Returns zero if no parsing was possible |
|
*/ |
|
int |
|
htmlParseTry(htmlParserCtxtPtr ctxt) { |
|
int ret = 0; |
|
htmlParserInputPtr in; |
|
int avail; |
|
xmlChar cur, next; |
|
|
|
#ifdef DEBUG_PUSH |
|
switch (ctxt->instate) { |
|
case XML_PARSER_EOF: |
|
fprintf(stderr, "HPP: try EOF\n"); break; |
|
case XML_PARSER_START: |
|
fprintf(stderr, "HPP: try START\n"); break; |
|
case XML_PARSER_MISC: |
|
fprintf(stderr, "HPP: try MISC\n");break; |
|
case XML_PARSER_COMMENT: |
|
fprintf(stderr, "HPP: try COMMENT\n");break; |
|
case XML_PARSER_PROLOG: |
|
fprintf(stderr, "HPP: try PROLOG\n");break; |
|
case XML_PARSER_START_TAG: |
|
fprintf(stderr, "HPP: try START_TAG\n");break; |
|
case XML_PARSER_CONTENT: |
|
fprintf(stderr, "HPP: try CONTENT\n");break; |
|
case XML_PARSER_CDATA_SECTION: |
|
fprintf(stderr, "HPP: try CDATA_SECTION\n");break; |
|
case XML_PARSER_END_TAG: |
|
fprintf(stderr, "HPP: try END_TAG\n");break; |
|
case XML_PARSER_ENTITY_DECL: |
|
fprintf(stderr, "HPP: try ENTITY_DECL\n");break; |
|
case XML_PARSER_ENTITY_VALUE: |
|
fprintf(stderr, "HPP: try ENTITY_VALUE\n");break; |
|
case XML_PARSER_ATTRIBUTE_VALUE: |
|
fprintf(stderr, "HPP: try ATTRIBUTE_VALUE\n");break; |
|
case XML_PARSER_DTD: |
|
fprintf(stderr, "HPP: try DTD\n");break; |
|
case XML_PARSER_EPILOG: |
|
fprintf(stderr, "HPP: try EPILOG\n");break; |
|
case XML_PARSER_PI: |
|
fprintf(stderr, "HPP: try PI\n");break; |
|
} |
|
#endif |
|
|
|
while (1) { |
|
|
|
in = ctxt->input; |
|
if (in == NULL) break; |
|
if (in->buf == NULL) |
|
avail = in->length - (in->cur - in->base); |
|
else |
|
avail = in->buf->buffer->use - (in->cur - in->base); |
|
if (avail < 1) |
|
goto done; |
|
switch (ctxt->instate) { |
|
case XML_PARSER_EOF: |
|
/* |
|
* Document parsing is done ! |
|
*/ |
|
goto done; |
|
case XML_PARSER_START: |
|
/* |
|
* Very first chars read from the document flow. |
|
*/ |
|
cur = in->cur[0]; |
|
if (IS_BLANK(cur)) { |
|
SKIP_BLANKS; |
|
if (in->buf == NULL) |
|
avail = in->length - (in->cur - in->base); |
|
else |
|
avail = in->buf->buffer->use - (in->cur - in->base); |
|
} |
|
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) |
|
ctxt->sax->setDocumentLocator(ctxt->userData, |
|
&xmlDefaultSAXLocator); |
|
cur = in->cur[0]; |
|
next = in->cur[1]; |
|
if ((cur == '<') && (next == '!') && |
|
(UPP(2) == 'D') && (UPP(3) == 'O') && |
|
(UPP(4) == 'C') && (UPP(5) == 'T') && |
|
(UPP(6) == 'Y') && (UPP(7) == 'P') && |
|
(UPP(8) == 'E')) { |
|
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0) |
|
goto done; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing internal subset\n"); |
|
#endif |
|
htmlParseDocTypeDecl(ctxt); |
|
ctxt->instate = XML_PARSER_PROLOG; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering PROLOG\n"); |
|
#endif |
|
} else { |
|
ctxt->myDoc = htmlNewDoc(NULL, NULL); |
|
ctxt->instate = XML_PARSER_MISC; |
|
} |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering MISC\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_MISC: |
|
SKIP_BLANKS; |
|
if (in->buf == NULL) |
|
avail = in->length - (in->cur - in->base); |
|
else |
|
avail = in->buf->buffer->use - (in->cur - in->base); |
|
if (avail < 2) |
|
goto done; |
|
cur = in->cur[0]; |
|
next = in->cur[1]; |
|
if ((cur == '<') && (next == '!') && |
|
(in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0) |
|
goto done; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing Comment\n"); |
|
#endif |
|
htmlParseComment(ctxt); |
|
ctxt->instate = XML_PARSER_MISC; |
|
} else if ((cur == '<') && (next == '!') && |
|
(UPP(2) == 'D') && (UPP(3) == 'O') && |
|
(UPP(4) == 'C') && (UPP(5) == 'T') && |
|
(UPP(6) == 'Y') && (UPP(7) == 'P') && |
|
(UPP(8) == 'E')) { |
|
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0) |
|
goto done; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing internal subset\n"); |
|
#endif |
|
htmlParseDocTypeDecl(ctxt); |
|
ctxt->instate = XML_PARSER_PROLOG; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering PROLOG\n"); |
|
#endif |
|
} else if ((cur == '<') && (next == '!') && |
|
(avail < 9)) { |
|
goto done; |
|
} else { |
|
ctxt->instate = XML_PARSER_START_TAG; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering START_TAG\n"); |
|
#endif |
|
} |
|
break; |
|
case XML_PARSER_PROLOG: |
|
SKIP_BLANKS; |
|
if (in->buf == NULL) |
|
avail = in->length - (in->cur - in->base); |
|
else |
|
avail = in->buf->buffer->use - (in->cur - in->base); |
|
if (avail < 2) |
|
goto done; |
|
cur = in->cur[0]; |
|
next = in->cur[1]; |
|
if ((cur == '<') && (next == '!') && |
|
(in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0) |
|
goto done; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing Comment\n"); |
|
#endif |
|
htmlParseComment(ctxt); |
|
ctxt->instate = XML_PARSER_PROLOG; |
|
} else if ((cur == '<') && (next == '!') && |
|
(avail < 4)) { |
|
goto done; |
|
} else { |
|
ctxt->instate = XML_PARSER_START_TAG; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering START_TAG\n"); |
|
#endif |
|
} |
|
break; |
|
case XML_PARSER_EPILOG: |
|
SKIP_BLANKS; |
|
if (in->buf == NULL) |
|
avail = in->length - (in->cur - in->base); |
|
else |
|
avail = in->buf->buffer->use - (in->cur - in->base); |
|
if (avail < 2) |
|
goto done; |
|
cur = in->cur[0]; |
|
next = in->cur[1]; |
|
if ((cur == '<') && (next == '!') && |
|
(in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0) |
|
goto done; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing Comment\n"); |
|
#endif |
|
htmlParseComment(ctxt); |
|
ctxt->instate = XML_PARSER_EPILOG; |
|
} else if ((cur == '<') && (next == '!') && |
|
(avail < 4)) { |
|
goto done; |
|
} else { |
|
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
|
ctxt->sax->error(ctxt->userData, |
|
"Extra content at the end of the document\n"); |
|
ctxt->wellFormed = 0; |
|
ctxt->errNo = XML_ERR_DOCUMENT_END; |
|
ctxt->instate = XML_PARSER_EOF; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering EOF\n"); |
|
#endif |
|
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) |
|
ctxt->sax->endDocument(ctxt->userData); |
|
goto done; |
|
} |
|
break; |
|
case XML_PARSER_START_TAG: { |
|
xmlChar *name, *oldname; |
|
int depth = ctxt->nameNr; |
|
htmlElemDescPtr info; |
|
|
|
if (avail < 2) |
|
goto done; |
|
cur = in->cur[0]; |
|
if (cur != '<') { |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
} |
|
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0) |
|
goto done; |
|
|
|
oldname = xmlStrdup(ctxt->name); |
|
htmlParseStartTag(ctxt); |
|
name = ctxt->name; |
|
#ifdef DEBUG |
|
if (oldname == NULL) |
|
fprintf(stderr, "Start of element %s\n", name); |
|
else if (name == NULL) |
|
fprintf(stderr, "Start of element failed, was %s\n", |
|
oldname); |
|
else |
|
fprintf(stderr, "Start of element %s, was %s\n", |
|
name, oldname); |
|
#endif |
|
if (((depth == ctxt->nameNr) && |
|
(!xmlStrcmp(oldname, ctxt->name))) || |
|
(name == NULL)) { |
|
if (CUR == '>') |
|
NEXT; |
|
if (oldname != NULL) |
|
xmlFree(oldname); |
|
break; |
|
} |
|
if (oldname != NULL) |
|
xmlFree(oldname); |
|
|
|
/* |
|
* Lookup the info for that element. |
|
*/ |
|
info = htmlTagLookup(name); |
|
if (info == NULL) { |
|
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
|
ctxt->sax->error(ctxt->userData, "Tag %s invalid\n", |
|
name); |
|
ctxt->wellFormed = 0; |
|
} else if (info->depr) { |
|
/*************************** |
|
if ((ctxt->sax != NULL) && (ctxt->sax->warning != NULL)) |
|
ctxt->sax->warning(ctxt->userData, |
|
"Tag %s is deprecated\n", |
|
name); |
|
***************************/ |
|
} |
|
|
|
/* |
|
* Check for an Empty Element labelled the XML/SGML way |
|
*/ |
|
if ((CUR == '/') && (NXT(1) == '>')) { |
|
SKIP(2); |
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
ctxt->sax->endElement(ctxt->userData, name); |
|
oldname = htmlnamePop(ctxt); |
|
#ifdef DEBUG |
|
fprintf(stderr,"End of tag the XML way: popping out %s\n", |
|
oldname); |
|
#endif |
|
if (oldname != NULL) |
|
xmlFree(oldname); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
} |
|
|
|
if (CUR == '>') { |
|
NEXT; |
|
} else { |
|
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
|
ctxt->sax->error(ctxt->userData, |
|
"Couldn't find end of Start Tag %s\n", |
|
name); |
|
ctxt->wellFormed = 0; |
|
|
|
/* |
|
* end of parsing of this node. |
|
*/ |
|
if (!xmlStrcmp(name, ctxt->name)) { |
|
nodePop(ctxt); |
|
oldname = htmlnamePop(ctxt); |
|
#ifdef DEBUG |
|
fprintf(stderr, |
|
"End of start tag problem: popping out %s\n", oldname); |
|
#endif |
|
if (oldname != NULL) |
|
xmlFree(oldname); |
|
} |
|
|
|
ctxt->instate = XML_PARSER_CONTENT; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
} |
|
|
|
/* |
|
* Check for an Empty Element from DTD definition |
|
*/ |
|
if ((info != NULL) && (info->empty)) { |
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
ctxt->sax->endElement(ctxt->userData, name); |
|
oldname = htmlnamePop(ctxt); |
|
#ifdef DEBUG |
|
fprintf(stderr,"End of empty tag %s : popping out %s\n", name, oldname); |
|
#endif |
|
if (oldname != NULL) |
|
xmlFree(oldname); |
|
} |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
} |
|
case XML_PARSER_CONTENT: |
|
/* |
|
* Handle preparsed entities and charRef |
|
*/ |
|
if (ctxt->token != 0) { |
|
xmlChar cur[2] = { 0 , 0 } ; |
|
|
|
cur[0] = (xmlChar) ctxt->token; |
|
if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) |
|
ctxt->sax->characters(ctxt->userData, cur, 1); |
|
ctxt->token = 0; |
|
ctxt->checkIndex = 0; |
|
} |
|
if (avail < 2) |
|
goto done; |
|
cur = in->cur[0]; |
|
next = in->cur[1]; |
|
if ((cur == '<') && (next == '!') && |
|
(in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
if (htmlParseLookupSequence(ctxt, '-', '-', '>') < 0) |
|
goto done; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing Comment\n"); |
|
#endif |
|
htmlParseComment(ctxt); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
} else if ((cur == '<') && (next == '!') && (avail < 4)) { |
|
goto done; |
|
} else if ((cur == '<') && (next == '/')) { |
|
ctxt->instate = XML_PARSER_END_TAG; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering END_TAG\n"); |
|
#endif |
|
break; |
|
} else if (cur == '<') { |
|
ctxt->instate = XML_PARSER_START_TAG; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering START_TAG\n"); |
|
#endif |
|
break; |
|
} else if (cur == '&') { |
|
if (htmlParseLookupSequence(ctxt, ';', 0, 0) < 0) |
|
goto done; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing Reference\n"); |
|
#endif |
|
/* TODO: check generation of subtrees if noent !!! */ |
|
htmlParseReference(ctxt); |
|
} else { |
|
/* TODO Avoid the extra copy, handle directly !!!!!! */ |
|
/* |
|
* Goal of the following test is : |
|
* - minimize calls to the SAX 'character' callback |
|
* when they are mergeable |
|
*/ |
|
if ((ctxt->inputNr == 1) && |
|
(avail < HTML_PARSER_BIG_BUFFER_SIZE)) { |
|
if (htmlParseLookupSequence(ctxt, '<', 0, 0) < 0) |
|
goto done; |
|
} |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: Parsing char data\n"); |
|
#endif |
|
htmlParseCharData(ctxt, 0); |
|
} |
|
break; |
|
case XML_PARSER_END_TAG: |
|
if (avail < 2) |
|
goto done; |
|
if (htmlParseLookupSequence(ctxt, '>', 0, 0) < 0) |
|
goto done; |
|
htmlParseEndTag(ctxt); |
|
if (ctxt->nameNr == 0) { |
|
ctxt->instate = XML_PARSER_EPILOG; |
|
} else { |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
} |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_CDATA_SECTION: |
|
fprintf(stderr, "HPP: internal error, state == CDATA\n"); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_DTD: |
|
fprintf(stderr, "HPP: internal error, state == DTD\n"); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_COMMENT: |
|
fprintf(stderr, "HPP: internal error, state == COMMENT\n"); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_PI: |
|
fprintf(stderr, "HPP: internal error, state == PI\n"); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_ENTITY_DECL: |
|
fprintf(stderr, "HPP: internal error, state == ENTITY_DECL\n"); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering CONTENT\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_ENTITY_VALUE: |
|
fprintf(stderr, "HPP: internal error, state == ENTITY_VALUE\n"); |
|
ctxt->instate = XML_PARSER_CONTENT; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering DTD\n"); |
|
#endif |
|
break; |
|
case XML_PARSER_ATTRIBUTE_VALUE: |
|
fprintf(stderr, "HPP: internal error, state == ATTRIBUTE_VALUE\n"); |
|
ctxt->instate = XML_PARSER_START_TAG; |
|
ctxt->checkIndex = 0; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: entering START_TAG\n"); |
|
#endif |
|
break; |
|
} |
|
} |
|
done: |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: done %d\n", ret); |
|
#endif |
|
return(ret); |
|
} |
|
|
|
/** |
|
* htmlParseChunk: |
|
* @ctxt: an XML parser context |
|
* @chunk: an char array |
|
* @size: the size in byte of the chunk |
|
* @terminate: last chunk indicator |
|
* |
|
* Parse a Chunk of memory |
|
* |
|
* Returns zero if no error, the xmlParserErrors otherwise. |
|
*/ |
|
int |
|
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, |
|
int terminate) { |
|
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && |
|
(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { |
|
int base = ctxt->input->base - ctxt->input->buf->buffer->content; |
|
int cur = ctxt->input->cur - ctxt->input->base; |
|
|
|
xmlParserInputBufferPush(ctxt->input->buf, size, chunk); |
|
ctxt->input->base = ctxt->input->buf->buffer->content + base; |
|
ctxt->input->cur = ctxt->input->base + cur; |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: pushed %d\n", size); |
|
#endif |
|
|
|
htmlParseTry(ctxt); |
|
} else if (ctxt->instate != XML_PARSER_EOF) |
|
htmlParseTry(ctxt); |
|
if (terminate) { |
|
if ((ctxt->instate != XML_PARSER_EOF) && |
|
(ctxt->instate != XML_PARSER_EPILOG) && |
|
(ctxt->instate != XML_PARSER_MISC)) { |
|
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
|
ctxt->sax->error(ctxt->userData, |
|
"Extra content at the end of the document\n"); |
|
ctxt->wellFormed = 0; |
|
ctxt->errNo = XML_ERR_DOCUMENT_END; |
|
} |
|
if (ctxt->instate != XML_PARSER_EOF) { |
|
if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) |
|
ctxt->sax->endDocument(ctxt->userData); |
|
} |
|
ctxt->instate = XML_PARSER_EOF; |
|
} |
|
return((xmlParserErrors) ctxt->errNo); |
|
} |
|
|
|
/************************************************************************ |
|
* * |
|
* User entry points * |
|
* * |
|
************************************************************************/ |
|
|
|
/** |
|
* htmlCreatePushParserCtxt : |
|
* @sax: a SAX handler |
|
* @user_data: The user data returned on SAX callbacks |
|
* @chunk: a pointer to an array of chars |
|
* @size: number of chars in the array |
|
* @filename: an optional file name or URI |
|
* @enc: an optional encoding |
|
* |
|
* Create a parser context for using the HTML parser in push mode |
|
* To allow content encoding detection, @size should be >= 4 |
|
* The value of @filename is used for fetching external entities |
|
* and error/warning reports. |
|
* |
|
* Returns the new parser context or NULL |
|
*/ |
|
htmlParserCtxtPtr |
|
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, |
|
const char *chunk, int size, const char *filename, |
|
xmlCharEncoding enc) { |
|
htmlParserCtxtPtr ctxt; |
|
htmlParserInputPtr inputStream; |
|
xmlParserInputBufferPtr buf; |
|
|
|
buf = xmlAllocParserInputBuffer(enc); |
|
if (buf == NULL) return(NULL); |
|
|
|
ctxt = (htmlParserCtxtPtr) xmlMalloc(sizeof(htmlParserCtxt)); |
|
if (ctxt == NULL) { |
|
xmlFree(buf); |
|
return(NULL); |
|
} |
|
memset(ctxt, 0, sizeof(htmlParserCtxt)); |
|
htmlInitParserCtxt(ctxt); |
|
if (sax != NULL) { |
|
if (ctxt->sax != &htmlDefaultSAXHandler) |
|
xmlFree(ctxt->sax); |
|
ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); |
|
if (ctxt->sax == NULL) { |
|
xmlFree(buf); |
|
xmlFree(ctxt); |
|
return(NULL); |
|
} |
|
memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); |
|
if (user_data != NULL) |
|
ctxt->userData = user_data; |
|
} |
|
if (filename == NULL) { |
|
ctxt->directory = NULL; |
|
} else { |
|
ctxt->directory = xmlParserGetDirectory(filename); |
|
} |
|
|
|
inputStream = htmlNewInputStream(ctxt); |
|
if (inputStream == NULL) { |
|
xmlFreeParserCtxt(ctxt); |
|
return(NULL); |
|
} |
|
|
|
if (filename == NULL) |
|
inputStream->filename = NULL; |
|
else |
|
inputStream->filename = xmlMemStrdup(filename); |
|
inputStream->buf = buf; |
|
inputStream->base = inputStream->buf->buffer->content; |
|
inputStream->cur = inputStream->buf->buffer->content; |
|
|
|
inputPush(ctxt, inputStream); |
|
|
|
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && |
|
(ctxt->input->buf != NULL)) { |
|
xmlParserInputBufferPush(ctxt->input->buf, size, chunk); |
|
#ifdef DEBUG_PUSH |
|
fprintf(stderr, "HPP: pushed %d\n", size); |
|
#endif |
|
} |
|
|
|
return(ctxt); |
|
} |
|
|
/** |
/** |
* htmlSAXParseDoc : |
* htmlSAXParseDoc : |