version 1.60, 2000/08/22 22:20:34
|
version 1.61, 2000/08/27 22:38:17
|
Line 607 htmlInitAutoClose(void) {
|
Line 607 htmlInitAutoClose(void) {
|
*/ |
*/ |
htmlElemDescPtr |
htmlElemDescPtr |
htmlTagLookup(const xmlChar *tag) { |
htmlTagLookup(const xmlChar *tag) { |
int i = 0; |
int i; |
|
|
for (i = 0; i < (sizeof(html40ElementTable) / |
for (i = 0; i < (sizeof(html40ElementTable) / |
sizeof(html40ElementTable[0]));i++) { |
sizeof(html40ElementTable[0]));i++) { |
Line 911 htmlCheckParagraph(htmlParserCtxtPtr ctx
|
Line 911 htmlCheckParagraph(htmlParserCtxtPtr ctx
|
|
|
htmlEntityDesc html40EntitiesTable[] = { |
htmlEntityDesc html40EntitiesTable[] = { |
/* |
/* |
* the 4 absolute ones, |
* the 4 absolute ones, plus apostrophe. |
*/ |
*/ |
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, |
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, |
{ 38, "amp", "ampersand, U+0026 ISOnum" }, |
{ 38, "amp", "ampersand, U+0026 ISOnum" }, |
|
{ 39, "apos", "single quote" }, |
{ 60, "lt", "less-than sign, U+003C ISOnum" }, |
{ 60, "lt", "less-than sign, U+003C ISOnum" }, |
{ 62, "gt", "greater-than sign, U+003E ISOnum" }, |
{ 62, "gt", "greater-than sign, U+003E ISOnum" }, |
|
|
Line 922 htmlEntityDesc html40EntitiesTable[] =
|
Line 923 htmlEntityDesc html40EntitiesTable[] =
|
* A bunch still in the 128-255 range |
* A bunch still in the 128-255 range |
* Replacing them depend really on the charset used. |
* Replacing them depend really on the charset used. |
*/ |
*/ |
{ 39, "apos", "single quote" }, |
|
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, |
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, |
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, |
{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, |
{ 162, "cent", "cent sign, U+00A2 ISOnum" }, |
{ 162, "cent", "cent sign, U+00A2 ISOnum" }, |
Line 1020 htmlEntityDesc html40EntitiesTable[] =
|
Line 1020 htmlEntityDesc html40EntitiesTable[] =
|
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, |
{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, |
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, |
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, |
|
|
|
{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, |
|
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, |
|
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, |
|
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, |
|
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, |
|
|
/* |
/* |
* Anything below should really be kept as entities references |
* Anything below should really be kept as entities references |
*/ |
*/ |
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, |
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, |
|
|
|
{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, |
|
{ 732, "tilde","small tilde, U+02DC ISOdia" }, |
|
|
{ 913, "Alpha","greek capital letter alpha, U+0391" }, |
{ 913, "Alpha","greek capital letter alpha, U+0391" }, |
{ 914, "Beta", "greek capital letter beta, U+0392" }, |
{ 914, "Beta", "greek capital letter beta, U+0392" }, |
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, |
{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, |
Line 1079 htmlEntityDesc html40EntitiesTable[] =
|
Line 1088 htmlEntityDesc html40EntitiesTable[] =
|
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, |
{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, |
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, |
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, |
|
|
|
{ 8194, "ensp", "en space, U+2002 ISOpub" }, |
|
{ 8195, "emsp", "em space, U+2003 ISOpub" }, |
|
{ 8201, "thinsp","thin space, U+2009 ISOpub" }, |
|
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, |
|
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, |
|
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, |
|
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, |
|
{ 8211, "ndash","en dash, U+2013 ISOpub" }, |
|
{ 8212, "mdash","em dash, U+2014 ISOpub" }, |
|
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, |
|
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, |
|
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, |
|
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, |
|
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, |
|
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, |
|
{ 8224, "dagger","dagger, U+2020 ISOpub" }, |
|
{ 8225, "Dagger","double dagger, U+2021 ISOpub" }, |
|
|
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, |
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, |
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, |
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, |
|
|
|
{ 8240, "permil","per mille sign, U+2030 ISOtech" }, |
|
|
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, |
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, |
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, |
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, |
|
|
|
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, |
|
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, |
|
|
{ 8254, "oline","overline = spacing overscore, U+203E NEW" }, |
{ 8254, "oline","overline = spacing overscore, U+203E NEW" }, |
{ 8260, "frasl","fraction slash, U+2044 NEW" }, |
{ 8260, "frasl","fraction slash, U+2044 NEW" }, |
|
|
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, |
{ 8364, "euro", "euro sign, U+20AC NEW" }, |
|
|
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, |
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, |
|
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, |
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, |
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, |
{ 8482, "trade","trade mark sign, U+2122 ISOnum" }, |
{ 8482, "trade","trade mark sign, U+2122 ISOnum" }, |
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, |
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, |
Line 1103 htmlEntityDesc html40EntitiesTable[] =
|
Line 1139 htmlEntityDesc html40EntitiesTable[] =
|
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, |
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, |
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, |
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, |
|
|
|
|
{ 8704, "forall","for all, U+2200 ISOtech" }, |
{ 8704, "forall","for all, U+2200 ISOtech" }, |
{ 8706, "part", "partial differential, U+2202 ISOtech" }, |
{ 8706, "part", "partial differential, U+2202 ISOtech" }, |
{ 8707, "exist","there exists, U+2203 ISOtech" }, |
{ 8707, "exist","there exists, U+2203 ISOtech" }, |
Line 1155 htmlEntityDesc html40EntitiesTable[] =
|
Line 1190 htmlEntityDesc html40EntitiesTable[] =
|
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, |
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, |
{ 9830, "diams","black diamond suit, U+2666 ISOpub" }, |
{ 9830, "diams","black diamond suit, U+2666 ISOpub" }, |
|
|
{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, |
|
{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, |
|
{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, |
|
{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, |
|
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, |
|
{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, |
|
{ 732, "tilde","small tilde, U+02DC ISOdia" }, |
|
|
|
{ 8194, "ensp", "en space, U+2002 ISOpub" }, |
|
{ 8195, "emsp", "em space, U+2003 ISOpub" }, |
|
{ 8201, "thinsp","thin space, U+2009 ISOpub" }, |
|
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, |
|
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, |
|
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, |
|
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, |
|
{ 8211, "ndash","en dash, U+2013 ISOpub" }, |
|
{ 8212, "mdash","em dash, U+2014 ISOpub" }, |
|
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, |
|
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, |
|
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, |
|
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, |
|
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, |
|
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, |
|
{ 8224, "dagger","dagger, U+2020 ISOpub" }, |
|
{ 8225, "Dagger","double dagger, U+2021 ISOpub" }, |
|
{ 8240, "permil","per mille sign, U+2030 ISOtech" }, |
|
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, |
|
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, |
|
{ 8364, "euro", "euro sign, U+20AC NEW" } |
|
}; |
}; |
|
|
/************************************************************************ |
/************************************************************************ |
Line 1231 htmlEntityLookup(const xmlChar *name) {
|
Line 1237 htmlEntityLookup(const xmlChar *name) {
|
} |
} |
|
|
/** |
/** |
|
* htmlEntityValueLookup: |
|
* @value: the entity's unicode value |
|
* |
|
* Lookup the given entity in EntitiesTable |
|
* |
|
* TODO: the linear scan is really ugly, an hash table is really needed. |
|
* |
|
* Returns the associated htmlEntityDescPtr if found, NULL otherwise. |
|
*/ |
|
htmlEntityDescPtr |
|
htmlEntityValueLookup(int value) { |
|
int i; |
|
#ifdef DEBUG |
|
int lv = 0; |
|
#endif |
|
|
|
for (i = 0;i < (sizeof(html40EntitiesTable)/ |
|
sizeof(html40EntitiesTable[0]));i++) { |
|
if (html40EntitiesTable[i].value >= value) { |
|
if (html40EntitiesTable[i].value > value) |
|
break; |
|
#ifdef DEBUG |
|
fprintf(stderr,"Found entity %s\n", html40EntitiesTable[i].name); |
|
#endif |
|
return(&html40EntitiesTable[i]); |
|
} |
|
#ifdef DEBUG |
|
if (lv > html40EntitiesTable[i].value) { |
|
fprintf(stderr, "html40EntitiesTable[] is not sorted (%d > %d)!\n", |
|
lv, html40EntitiesTable[i].value); |
|
} |
|
lv = html40EntitiesTable[i].value; |
|
#endif |
|
} |
|
return(NULL); |
|
} |
|
|
|
/** |
* UTF8ToHtml: |
* UTF8ToHtml: |
* @out: a pointer to an array of bytes to store the result |
* @out: a pointer to an array of bytes to store the result |
* @outlen: the length of @out |
* @outlen: the length of @out |
Line 1301 UTF8ToHtml(unsigned char* out, int *outl
|
Line 1345 UTF8ToHtml(unsigned char* out, int *outl
|
break; |
break; |
*out++ = c; |
*out++ = c; |
} else { |
} else { |
int i, j, len; |
int len; |
|
htmlEntityDescPtr ent; |
|
|
/* |
/* |
* Try to lookup a predefined HTML entity for it |
* Try to lookup a predefined HTML entity for it |
*/ |
*/ |
|
|
for (i = 0;i < (sizeof(html40EntitiesTable)/ |
ent = htmlEntityValueLookup(c); |
sizeof(html40EntitiesTable[0]));i++) { |
if (ent == NULL) { |
if (html40EntitiesTable[i].value == c) { |
/* no chance for this in Ascii */ |
#ifdef DEBUG |
*outlen = out - outstart; |
fprintf(stderr,"Found entity %s\n", |
*inlen = processed - instart; |
html40EntitiesTable[i].name); |
return(-2); |
#endif |
|
goto found_ent; |
|
} |
|
if (html40EntitiesTable[i].value > c) |
|
break; |
|
} |
} |
|
len = strlen(ent->name); |
/* no chance for this in Ascii */ |
if (out + 2 + len > outend) |
*outlen = out - outstart; |
|
*inlen = processed - instart; |
|
return(-2); |
|
found_ent: |
|
len = strlen(html40EntitiesTable[i].name); |
|
if (out + 2 + len >= outend) |
|
break; |
break; |
*out++ = '&'; |
*out++ = '&'; |
for (j = 0;j < len;j++) |
memcpy(out, ent->name, len); |
*out++ = html40EntitiesTable[i].name[j]; |
out += len; |
*out++ = ';'; |
*out++ = ';'; |
} |
} |
processed = in; |
processed = in; |