Diff for /XML/HTMLparser.c between versions 1.60 and 1.61

version 1.60, 2000/08/22 22:20:34 version 1.61, 2000/08/27 22:38:17
Line 607  htmlInitAutoClose(void) { Line 607  htmlInitAutoClose(void) {
  */   */
 htmlElemDescPtr  htmlElemDescPtr
 htmlTagLookup(const xmlChar *tag) {  htmlTagLookup(const xmlChar *tag) {
     int i = 0;      int i;
   
     for (i = 0; i < (sizeof(html40ElementTable) /      for (i = 0; i < (sizeof(html40ElementTable) /
                      sizeof(html40ElementTable[0]));i++) {                       sizeof(html40ElementTable[0]));i++) {
Line 911  htmlCheckParagraph(htmlParserCtxtPtr ctx Line 911  htmlCheckParagraph(htmlParserCtxtPtr ctx
   
 htmlEntityDesc  html40EntitiesTable[] = {  htmlEntityDesc  html40EntitiesTable[] = {
 /*  /*
  * the 4 absolute ones,   * the 4 absolute ones, plus apostrophe.
  */   */
 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },  { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
 { 38,   "amp",  "ampersand, U+0026 ISOnum" },  { 38,   "amp",  "ampersand, U+0026 ISOnum" },
   { 39,   "apos", "single quote" },
 { 60,   "lt",   "less-than sign, U+003C ISOnum" },  { 60,   "lt",   "less-than sign, U+003C ISOnum" },
 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },  { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
   
Line 922  htmlEntityDesc  html40EntitiesTable[] = Line 923  htmlEntityDesc  html40EntitiesTable[] =
  * A bunch still in the 128-255 range   * A bunch still in the 128-255 range
  * Replacing them depend really on the charset used.   * Replacing them depend really on the charset used.
  */   */
 { 39,   "apos", "single quote" },  
 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },  { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },  { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
 { 162,  "cent", "cent sign, U+00A2 ISOnum" },  { 162,  "cent", "cent sign, U+00A2 ISOnum" },
Line 1020  htmlEntityDesc  html40EntitiesTable[] = Line 1020  htmlEntityDesc  html40EntitiesTable[] =
 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },  { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },  { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
   
   { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
   { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
   { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
   { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
   { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
   
 /*  /*
  * Anything below should really be kept as entities references   * Anything below should really be kept as entities references
  */   */
 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },  { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
   
   { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
   { 732,  "tilde","small tilde, U+02DC ISOdia" },
   
 { 913,  "Alpha","greek capital letter alpha, U+0391" },  { 913,  "Alpha","greek capital letter alpha, U+0391" },
 { 914,  "Beta", "greek capital letter beta, U+0392" },  { 914,  "Beta", "greek capital letter beta, U+0392" },
 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },  { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
Line 1079  htmlEntityDesc  html40EntitiesTable[] = Line 1088  htmlEntityDesc  html40EntitiesTable[] =
 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },  { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },  { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
   
   { 8194, "ensp", "en space, U+2002 ISOpub" },
   { 8195, "emsp", "em space, U+2003 ISOpub" },
   { 8201, "thinsp","thin space, U+2009 ISOpub" },
   { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
   { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
   { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
   { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
   { 8211, "ndash","en dash, U+2013 ISOpub" },
   { 8212, "mdash","em dash, U+2014 ISOpub" },
   { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
   { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
   { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
   { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
   { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
   { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
   { 8224, "dagger","dagger, U+2020 ISOpub" },
   { 8225, "Dagger","double dagger, U+2021 ISOpub" },
   
 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },  { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },  { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
   
   { 8240, "permil","per mille sign, U+2030 ISOtech" },
   
 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },  { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },  { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
   
   { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
   { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
   
 { 8254, "oline","overline = spacing overscore, U+203E NEW" },  { 8254, "oline","overline = spacing overscore, U+203E NEW" },
 { 8260, "frasl","fraction slash, U+2044 NEW" },  { 8260, "frasl","fraction slash, U+2044 NEW" },
   
 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },  { 8364, "euro", "euro sign, U+20AC NEW" },
   
 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },  { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
   { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },  { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
 { 8482, "trade","trade mark sign, U+2122 ISOnum" },  { 8482, "trade","trade mark sign, U+2122 ISOnum" },
 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },  { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
Line 1103  htmlEntityDesc  html40EntitiesTable[] = Line 1139  htmlEntityDesc  html40EntitiesTable[] =
 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },  { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },  { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
   
   
 { 8704, "forall","for all, U+2200 ISOtech" },  { 8704, "forall","for all, U+2200 ISOtech" },
 { 8706, "part", "partial differential, U+2202 ISOtech" },  { 8706, "part", "partial differential, U+2202 ISOtech" },
 { 8707, "exist","there exists, U+2203 ISOtech" },  { 8707, "exist","there exists, U+2203 ISOtech" },
Line 1155  htmlEntityDesc  html40EntitiesTable[] = Line 1190  htmlEntityDesc  html40EntitiesTable[] =
 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },  { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
 { 9830, "diams","black diamond suit, U+2666 ISOpub" },  { 9830, "diams","black diamond suit, U+2666 ISOpub" },
   
 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },  
 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },  
 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },  
 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },  
 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },  
 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },  
 { 732,  "tilde","small tilde, U+02DC ISOdia" },  
   
 { 8194, "ensp", "en space, U+2002 ISOpub" },  
 { 8195, "emsp", "em space, U+2003 ISOpub" },  
 { 8201, "thinsp","thin space, U+2009 ISOpub" },  
 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },  
 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },  
 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },  
 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },  
 { 8211, "ndash","en dash, U+2013 ISOpub" },  
 { 8212, "mdash","em dash, U+2014 ISOpub" },  
 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },  
 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },  
 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },  
 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },  
 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },  
 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },  
 { 8224, "dagger","dagger, U+2020 ISOpub" },  
 { 8225, "Dagger","double dagger, U+2021 ISOpub" },  
 { 8240, "permil","per mille sign, U+2030 ISOtech" },  
 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },  
 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },  
 { 8364, "euro", "euro sign, U+20AC NEW" }  
 };  };
   
 /************************************************************************  /************************************************************************
Line 1231  htmlEntityLookup(const xmlChar *name) { Line 1237  htmlEntityLookup(const xmlChar *name) {
 }  }
   
 /**  /**
    * htmlEntityValueLookup:
    * @value: the entity's unicode value
    *
    * Lookup the given entity in EntitiesTable
    *
    * TODO: the linear scan is really ugly, a hash table is really needed.
    *
    * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
    */
   htmlEntityDescPtr
   htmlEntityValueLookup(int value) {
       int i;
   #ifdef DEBUG
       int lv = 0;
   #endif
   
       for (i = 0;i < (sizeof(html40EntitiesTable)/
                       sizeof(html40EntitiesTable[0]));i++) {
           if (html40EntitiesTable[i].value >= value) {
               if (html40EntitiesTable[i].value > value)
                   break;
   #ifdef DEBUG
               fprintf(stderr,"Found entity %s\n", html40EntitiesTable[i].name);
   #endif
               return(&html40EntitiesTable[i]);
           }
   #ifdef DEBUG
           if (lv > html40EntitiesTable[i].value) {
               fprintf(stderr, "html40EntitiesTable[] is not sorted (%d > %d)!\n",
                       lv, html40EntitiesTable[i].value);
           }
           lv = html40EntitiesTable[i].value;
   #endif
       }
       return(NULL);
   }
   
   /**
  * UTF8ToHtml:   * UTF8ToHtml:
  * @out:  a pointer to an array of bytes to store the result   * @out:  a pointer to an array of bytes to store the result
  * @outlen:  the length of @out   * @outlen:  the length of @out
Line 1301  UTF8ToHtml(unsigned char* out, int *outl Line 1345  UTF8ToHtml(unsigned char* out, int *outl
                 break;                  break;
             *out++ = c;              *out++ = c;
         } else {          } else {
             int i, j, len;              int len;
               htmlEntityDescPtr ent;
   
             /*              /*
              * Try to lookup a predefined HTML entity for it               * Try to lookup a predefined HTML entity for it
              */               */
   
             for (i = 0;i < (sizeof(html40EntitiesTable)/              ent = htmlEntityValueLookup(c);
                             sizeof(html40EntitiesTable[0]));i++) {              if (ent == NULL) {
                 if (html40EntitiesTable[i].value == c) {                  /* no chance for this in Ascii */
 #ifdef DEBUG                  *outlen = out - outstart;
                     fprintf(stderr,"Found entity %s\n",                   *inlen = processed - instart;
                             html40EntitiesTable[i].name);                  return(-2);
 #endif  
                     goto found_ent;  
                 }  
                 if (html40EntitiesTable[i].value > c)  
                     break;  
             }              }
               len = strlen(ent->name);
             /* no chance for this in Ascii */              if (out + 2 + len > outend)
             *outlen = out - outstart;  
             *inlen = processed - instart;  
             return(-2);  
 found_ent:  
             len = strlen(html40EntitiesTable[i].name);  
             if (out + 2 + len >= outend)  
                 break;                  break;
             *out++ = '&';              *out++ = '&';
             for (j = 0;j < len;j++)              memcpy(out, ent->name, len);
                 *out++ = html40EntitiesTable[i].name[j];              out += len;
             *out++ = ';';              *out++ = ';';
         }          }
         processed = in;          processed = in;

Removed from v.1.60  
changed lines
  Added in v.1.61


Webmaster