Annotation of libwww/Library/src/HTMLGen.c, revision 2.25

2.25    ! frystyk     1: /*                                                                   HTMLGen.c
        !             2: **     HTML GENERATOR
        !             3: **
        !             4: **     (c) COPYRIGHT CERN 1994.
        !             5: **     Please first read the full copyright statement in the file COPYRIGH.
2.1       timbl       6: **
                      7: **     This version of the HTML object sends HTML markup to the output stream.
                      8: **
                      9: ** Bugs:       Line wrapping is not done at all.
                     10: **             All data handled as PCDATA.
                     11: **             Should convert old XMP, LISTING and PLAINTEXT to PRE.
                     12: **
                     13: **     It is not obvious to me right now whether the HEAD should be generated
2.7       timbl      14: **     from the incoming data or the anchor.  Currently it is from the former
2.17      timbl      15: **     which is cleanest. TBL
2.22      duns       16: **
                     17: ** HISTORY:
                     18: **      8 Jul 94  FM   Insulate free() from _free structure element.
                     19: **
2.1       timbl      20: */
                     21: 
2.12      timbl      22: #include "HTMLPDTD.h"
2.1       timbl      23: #include "HTStream.h"
                     24: #include "SGML.h"
                     25: #include "HTFormat.h"
2.23      frystyk    26: #include "HTMLGen.h"                                    /* Implemented here */
                     27: 
                     28: #define BUFFER_SIZE    80      /* Line buffer attempts to make neat breaks */
2.1       timbl      29: 
2.3       timbl      30: #define PUTC(c) (*me->targetClass.put_character)(me->target, c)
2.7       timbl      31: /* #define PUTS(s) (*me->targetClass.put_string)(me->target, s) */
2.4       timbl      32: #define PUTB(s,l) (*me->targetClass.put_block)(me->target, s, l)
2.1       timbl      33: 
                     34: /*             HTML Object
                     35: **             -----------
                     36: */
                     37: 
                     38: struct _HTStream {
                     39:        CONST HTStreamClass *           isa;    
                     40:        HTStream *                      target;
                     41:        HTStreamClass                   targetClass;    /* COPY for speed */
                     42: };
                     43: 
2.21      timbl      44: #define MAX_CLEANNESS 10
2.1       timbl      45: struct _HTStructured {
                     46:        CONST HTStructuredClass *       isa;
                     47:        HTStream *                      target;
                     48:        HTStreamClass                   targetClass;    /* COPY for speed */
2.12      timbl      49:        CONST SGML_dtd *                dtd;
2.17      timbl      50:        BOOL                            seven_bit;      /* restrict output*/
2.7       timbl      51:        
2.14      frystyk    52:        char                            buffer[BUFFER_SIZE+1];
2.7       timbl      53:        char *                          write_pointer;
2.21      timbl      54:        char *                          line_break [MAX_CLEANNESS+1];
2.7       timbl      55:        int                             cleanness;
2.21      timbl      56:        BOOL                            overflowed;
                     57:        BOOL                            delete_line_break_char
                     58:                                                [MAX_CLEANNESS+1];
2.14      frystyk    59:        char                            preformatted;
2.1       timbl      60: };
                     61: 
2.17      timbl      62: /*                     OUTPUT FUNCTIONS
                     63: **
                     64: **     These functions output the finished SGML stream, doing the
                     65: **     line wrap
                     66: */
                     67: 
2.7       timbl      68: /*     Flush Buffer
                     69: **     ------------
                     70: */
2.21      timbl      71: 
                     72: PRIVATE void flush_breaks ARGS1(HTStructured *, me)
                     73: {
                     74:     int i;
                     75:     for (i=0; i<= MAX_CLEANNESS; i++) {
                     76:         me->line_break[i] = NULL;
                     77:     }
                     78: }
                     79: 
                     80: 
2.7       timbl      81: PRIVATE void HTMLGen_flush ARGS1(HTStructured *, me)
                     82: {
                     83:     (*me->targetClass.put_block)(me->target, 
                     84:                                me->buffer,
                     85:                                me->write_pointer - me->buffer);
                     86:     me->write_pointer = me->buffer;
2.21      timbl      87:     flush_breaks(me);
2.7       timbl      88:     me->cleanness = 0;
2.21      timbl      89: }
                     90: 
                     91: 
                     92: /*     Weighted optional line break
                     93: **
                     94: **     We keep track of all the breaks for when we chop the line
                     95: */
                     96: 
                     97: PRIVATE void allow_break ARGS3(HTStructured *, me, int, new_cleanness,
                     98:                BOOL, dlbc)
                     99: {
                    100:     me->line_break[new_cleanness] = 
                    101:                         dlbc ? me->write_pointer - 1 /* Point to space */
                    102:                              : me->write_pointer ;   /* point to gap */
                    103:     me->delete_line_break_char[new_cleanness] = dlbc;
                    104:     if (new_cleanness >= me->cleanness)
                    105:        me->cleanness = new_cleanness;
2.7       timbl     106: }
                    107: 
                    108: 
2.1       timbl     109: /*     Character handling
                    110: **     ------------------
2.8       timbl     111: **
                    112: **     The tricky bits are the line break handling.  This attempts
                    113: **     to synchrononise line breaks on sentence or phrase ends. This
                    114: **     is important if one stores SGML files in a line-oriented code
                    115: **     repository, so that if a small change is made, line ends don't
                    116: **     shift in a ripple-through to apparently change a large part of the
                    117: **     file. We give extra "cleanness" to spaces appearing directly
                    118: **     after periods (full stops), [semi]colons and commas.
                    119: **        This should make the source files easier to read and modify
2.17      timbl     120: **     by hand, too, though this is not a primary design consideration. TBL
2.1       timbl     121: */
2.21      timbl     122: PRIVATE char delims[] = ",;:.";                /* @@ english bias */
2.17      timbl     123: PRIVATE void HTMLGen_output_character ARGS2(HTStructured *, me, char, c)
2.1       timbl     124: {
2.7       timbl     125: 
                    126:     *me->write_pointer++ = c;
                    127:     
2.21      timbl     128:     if (c=='\n') {             /* Newlines */
                    129:         if (me->preformatted) {
                    130:            HTMLGen_flush(me);
                    131:            return;
                    132:        } else {
                    133:            me->write_pointer[-1] = c = ' ';    /* Treat same as space */
                    134:        }
2.7       timbl     135:     }
                    136:     
2.21      timbl     137:     /* Figure our whether we can break at this point
                    138:     */
2.7       timbl     139:     if ((!me->preformatted  && c==' ')) {
2.8       timbl     140:         int new_cleanness = 1;
                    141:        if (me->write_pointer > (me->buffer + 1)) {
2.9       luotonen  142:            char * p;
2.11      timbl     143:            p = strchr(delims, me->write_pointer[-2]);
2.21      timbl     144:            if (p) new_cleanness = p - delims + 4;
2.8       timbl     145:        }
2.21      timbl     146:        allow_break(me, new_cleanness, YES);
2.7       timbl     147:     }
                    148:     
2.21      timbl     149:     /* Flush buffer out when full, or whenever the line is over
                    150:        the nominal maximum and we can break at all
                    151:     */
                    152:     if (me->write_pointer >= me->buffer + BUFFER_SIZE-1
                    153:         ||  (me->overflowed && me->cleanness)) {
                    154:        if (me->cleanness) {
                    155:            char line_break_char = me->line_break[me->cleanness][0];
                    156:            char * saved = me->line_break[me->cleanness];
2.8       timbl     157:            
2.21      timbl     158:            if (me->delete_line_break_char[me->cleanness]) saved++; 
                    159:            me->line_break[me->cleanness][0] = '\n';
2.7       timbl     160:            (*me->targetClass.put_block)(me->target,
                    161:                                        me->buffer,
2.21      timbl     162:                                        me->line_break[me->cleanness] - me->buffer + 1);
                    163:            me->line_break[me->cleanness][0] = line_break_char;
2.7       timbl     164:            {  /* move next line in */
2.8       timbl     165:                char * p=saved;
                    166:                char *q;
                    167:                for(q=me->buffer; p < me->write_pointer; )
2.7       timbl     168:                        *q++ = *p++;
                    169:            }
                    170:            me->cleanness = 0;
2.21      timbl     171:            /* Now we have to check whether ther are any perfectly good breaks
                    172:            ** which weren't good enough for the last line but may be
                    173:            **  good enough for the next
                    174:            */
                    175:            {
                    176:                int i;
                    177:                for(i=0; i <= MAX_CLEANNESS; i++) {
                    178:                    if (me->line_break[i] > saved) {
                    179:                        me->line_break[i] = me->line_break[i] -
                    180:                                                (saved-me->buffer);
                    181:                        me->cleanness = i;
                    182:                    } else {
                    183:                        me->line_break[i] = NULL;
                    184:                    }
                    185:                }
                    186:            }
                    187: 
2.8       timbl     188:            me->write_pointer = me->write_pointer - (saved-me->buffer);
2.21      timbl     189:            me->overflowed = NO;
                    190:        } else {   /* No break- just output with no newline */
2.7       timbl     191:            (*me->targetClass.put_block)(me->target,
2.14      frystyk   192:                                         me->buffer,
2.15      luotonen  193:                                         me->write_pointer - me->buffer);
2.8       timbl     194:            me->write_pointer = me->buffer;
2.21      timbl     195:            flush_breaks(me);
                    196:            me->overflowed = YES;
2.7       timbl     197:        }
                    198:     }
2.1       timbl     199: }
                    200: 
                    201: 
                    202: 
                    203: /*     String handling
                    204: **     ---------------
                    205: */
2.17      timbl     206: PRIVATE void HTMLGen_output_string ARGS2(HTStructured *, me, CONST char*, s)
                    207: {
2.24      frystyk   208:     while (*s) HTMLGen_output_character(me, *s++);
2.17      timbl     209: }
                    210: 
                    211: 
                    212: /*                     INPUT FUNCTIONS
                    213: **
                    214: **     These take data from the structured stream.  In the input
                    215: **     stream, entities are in raw form.  The seven_bit flag controls
                    216: **     whether the ISO Latin-1 characters are represented in SGML entity
                    217: **     form.  This is only recommended for viewing on older non-latin-1
                    218: **     capable equipment, or for mailing for example. 
                    219: **
                    220: ** Bug: assumes local encoding is ISO!
                    221: */     
                    222: PRIVATE void HTMLGen_put_character ARGS2(HTStructured *, me, char, c)
                    223: {
                    224:     if (c=='&') HTMLGen_output_string(me, "&amp;");
                    225:     else if (c=='<') HTMLGen_output_string(me, "&lt;");
                    226:     else if (me->seven_bit && ((unsigned char)c > 127)) {
                    227:         char temp[8];
                    228:        sprintf(temp, "&%d;", c);
                    229:        HTMLGen_output_string(me, temp);
                    230:     }
                    231:     else HTMLGen_output_character(me, c);
                    232: }
                    233: 
2.3       timbl     234: PRIVATE void HTMLGen_put_string ARGS2(HTStructured *, me, CONST char*, s)
2.1       timbl     235: {
2.24      frystyk   236:     while (*s) HTMLGen_put_character(me, *s++);
2.1       timbl     237: }
                    238: 
2.3       timbl     239: PRIVATE void HTMLGen_write ARGS3(HTStructured *, me, CONST char*, s, int, l)
2.1       timbl     240: {
2.24      frystyk   241:     while (l-- > 0) HTMLGen_put_character(me, *s++);
2.1       timbl     242: }
                    243: 
                    244: 
                    245: /*     Start Element
                    246: **     -------------
2.7       timbl     247: **
                    248: **     Within the opening tag, there may be spaces
                    249: **     and the line may be broken at these spaces.
2.1       timbl     250: */
                    251: PRIVATE void HTMLGen_start_element ARGS4(
2.3       timbl     252:        HTStructured *,         me,
2.2       timbl     253:        int,                    element_number,
                    254:        CONST BOOL*,            present,
                    255:        CONST char **,          value)
2.1       timbl     256: {
                    257:     int i;
2.12      timbl     258:     HTTag * tag = &me->dtd->tags[element_number];
2.1       timbl     259: 
2.20      timbl     260:     /* Control line breaks allowed within tag! */
                    261:     int was_preformatted = me->preformatted;   /* save state */
                    262:     me->preformatted = 1;      /* Can break between attributes */
                    263: 
2.17      timbl     264:     HTMLGen_output_character(me, '<');
                    265:     HTMLGen_output_string(me, tag->name);
2.1       timbl     266:     if (present) for (i=0; i< tag->number_of_attributes; i++) {
                    267:         if (present[i]) {
2.17      timbl     268:            HTMLGen_output_character(me, ' ');
2.21      timbl     269:            allow_break(me, 1, YES);
2.17      timbl     270:            HTMLGen_output_string(me, tag->attributes[i].name);
2.1       timbl     271:            if (value[i]) {
2.17      timbl     272:                HTMLGen_output_string(me, "=\"");
                    273:                HTMLGen_output_string(me, value[i]);
                    274:                HTMLGen_output_character(me, '"');
2.1       timbl     275:            }
                    276:        }
                    277:     }
2.20      timbl     278:     me->preformatted = was_preformatted;       /* Restore state */
                    279: 
2.14      frystyk   280:     /* Nested PRE is no more a problem! */
                    281:     if (element_number == HTML_PRE)
                    282:        me->preformatted++;
2.19      timbl     283: 
                    284:     HTMLGen_output_character(me, '>');
2.7       timbl     285:     
2.20      timbl     286:     /* Here is a funny one.  In PRE, newlines are significant, except of
                    287:     course for one after the <PRE> which is ignored. This means that
                    288:     we MUST put in a dummy one after the <PRE> to protect any real newline
                    289:     within the pre section.
                    290:     
                    291:     However, *within* a PRE section, although we can break after
                    292:     (for example) emphasis start tags, it will probably confuse some
                    293:     parsers so we won't.*/
                    294:     
                    295:     if (element_number == HTML_PRE) {
                    296:         HTMLGen_output_character(me, '\n');
                    297:     } else  if (!me->preformatted && 
                    298:         tag->contents != SGML_EMPTY) {  /* can break after element start */ 
2.21      timbl     299:        allow_break(me, 3, NO);
2.8       timbl     300:     }
2.1       timbl     301: }
                    302: 
                    303: 
2.17      timbl     304: /*     End Element
                    305: **     -----------
2.1       timbl     306: **
2.16      timbl     307: **      The rules for insertring CR LF into SGML are weird, strict, and
                    308: **     nonintitive.
2.20      timbl     309: **     See comment also about PRE above.
2.1       timbl     310: */
2.3       timbl     311: PRIVATE void HTMLGen_end_element ARGS2(HTStructured *, me,
2.24      frystyk   312:                                      int , element_number)
2.1       timbl     313: {
2.20      timbl     314:     if (element_number == HTML_PRE) {
                    315:         HTMLGen_output_character(me, '\n');
                    316:     } else  if (!me->preformatted) { /* can break before element end */ 
2.21      timbl     317:        allow_break(me, 1, NO);
2.8       timbl     318:     }
2.17      timbl     319:     HTMLGen_output_string(me, "</");
                    320:     HTMLGen_output_string(me, me->dtd->tags[element_number].name);
                    321:     HTMLGen_output_character(me, '>');    /* NO break after. TBL 940501 */
2.14      frystyk   322:     if (element_number == HTML_PRE && me->preformatted)
                    323:        me->preformatted--;
2.1       timbl     324: }
                    325: 
                    326: 
2.17      timbl     327: /*     Expanding entities
                    328: **     ------------------
2.1       timbl     329: **
                    330: */
                    331: 
2.3       timbl     332: PRIVATE void HTMLGen_put_entity ARGS2(HTStructured *, me, int, entity_number)
2.1       timbl     333: {
2.17      timbl     334:     HTMLGen_output_character(me, '&');
                    335:     HTMLGen_output_string(me, me->dtd->entity_names[entity_number]);
                    336:     HTMLGen_output_character(me, ';');
2.1       timbl     337: }
                    338: 
                    339: 
                    340: 
2.17      timbl     341: /*     Free an object
                    342: **     --------------
2.1       timbl     343: **
                    344: */
2.24      frystyk   345: PRIVATE int HTMLGen_free ARGS1(HTStructured *, me)
2.1       timbl     346: {
2.21      timbl     347:     HTMLGen_flush(me);
2.7       timbl     348:     (*me->targetClass.put_character)(me->target, '\n');
2.22      duns      349:     (*me->targetClass._free)(me->target);      /* ripple through */
2.3       timbl     350:     free(me);
2.24      frystyk   351:     return 0;
2.1       timbl     352: }
                    353: 
                    354: 
2.24      frystyk   355: PRIVATE int PlainToHTML_free ARGS1(HTStructured *, me)
2.7       timbl     356: {
                    357:     HTMLGen_end_element(me, HTML_PRE);
                    358:     HTMLGen_end_element(me, HTML_BODY);
                    359:     HTMLGen_end_element(me, HTML_HTML);
                    360:     HTMLGen_free(me);
2.24      frystyk   361:     return 0;
2.7       timbl     362: }
                    363: 
                    364: 
2.1       timbl     365: 
2.24      frystyk   366: PRIVATE int HTMLGen_abort ARGS2(HTStructured *, me, HTError, e)
2.1       timbl     367: {
2.6       timbl     368:     HTMLGen_free(me);
2.24      frystyk   369:     return EOF;
2.1       timbl     370: }
                    371: 
                    372: 
2.24      frystyk   373: PRIVATE int PlainToHTML_abort ARGS2(HTStructured *, me, HTError, e)
2.1       timbl     374: {
2.7       timbl     375:     PlainToHTML_free(me);
2.24      frystyk   376:     return EOF;
2.1       timbl     377: }
                    378: 
                    379: 
                    380: 
                    381: /*     Structured Object Class
                    382: **     -----------------------
                    383: */
2.5       timbl     384: PRIVATE CONST HTStructuredClass HTMLGeneration = /* As opposed to print etc */
2.1       timbl     385: {              
                    386:        "text/html",
                    387:        HTMLGen_free,
2.6       timbl     388:        HTMLGen_abort,
2.1       timbl     389:        HTMLGen_put_character,  HTMLGen_put_string, HTMLGen_write,
2.13      frystyk   390:        HTMLGen_start_element,  HTMLGen_end_element,
2.1       timbl     391:        HTMLGen_put_entity
                    392: }; 
                    393: 
                    394: 
                    395: /*     Subclass-specific Methods
                    396: **     -------------------------
                    397: */
                    398: 
                    399: PUBLIC HTStructured * HTMLGenerator ARGS1(HTStream *, output)
                    400: {
2.18      luotonen  401:     HTStructured* me = (HTStructured*)calloc(1,sizeof(*me));
2.3       timbl     402:     if (me == NULL) outofmem(__FILE__, "HTMLGenerator");
                    403:     me->isa = &HTMLGeneration;       
2.12      timbl     404:     me->dtd = &HTMLP_dtd;
2.1       timbl     405: 
2.3       timbl     406:     me->target = output;
                    407:     me->targetClass = *me->target->isa; /* Copy pointers to routines for speed*/
2.7       timbl     408:     
                    409:     me->write_pointer = me->buffer;
2.21      timbl     410:     flush_breaks(me);
2.3       timbl     411:     return me;
2.1       timbl     412: }
                    413: 
                    414: /*     Stream Object Class
                    415: **     -------------------
                    416: **
2.2       timbl     417: **     This object just converts a plain text stream into HTML
2.12      timbl     418: **     It is officially a structured stream but only the stream bits exist.
2.2       timbl     419: **     This is just the easiest way of typecasting all the routines.
2.1       timbl     420: */
2.2       timbl     421: PRIVATE CONST HTStructuredClass PlainToHTMLConversion =
2.1       timbl     422: {              
                    423:        "plaintexttoHTML",
2.13      frystyk   424:        PlainToHTML_free,       /* HTMLGen_free,  Henrik 03/03-94 */
2.6       timbl     425:        PlainToHTML_abort,      
2.1       timbl     426:        HTMLGen_put_character,
                    427:        HTMLGen_put_string,
                    428:        HTMLGen_write,
2.2       timbl     429:        NULL,           /* Structured stuff */
                    430:        NULL,
                    431:        NULL
2.1       timbl     432: }; 
                    433: 
                    434: 
                    435: /*     HTConverter from plain text to HTML Stream
                    436: **     ------------------------------------------
2.13      frystyk   437: **
                    438: ** Changed by henrik 03/03-94, so no more core dumps etc. (I hope!!!)
2.1       timbl     439: */
                    440: 
2.12      timbl     441: PUBLIC HTStream* HTPlainToHTML ARGS5(
                    442:        HTRequest *,            request,
                    443:        void *,                 param,
                    444:        HTFormat,               input_format,
                    445:        HTFormat,               output_format,
                    446:        HTStream *,             output_stream)
2.1       timbl     447: {
2.13      frystyk   448:     BOOL present[MAX_ATTRIBUTES];      /* Flags: attribute is present? */
                    449:     CONST char *value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
2.18      luotonen  450:     HTStructured* me = (HTStructured*)calloc(1,sizeof(*me));
2.3       timbl     451:     if (me == NULL) outofmem(__FILE__, "PlainToHTML");
2.13      frystyk   452:     
                    453:     memset(present, '\0', MAX_ATTRIBUTES);
                    454:     memset(value, '\0', MAX_ATTRIBUTES*sizeof(char *));
                    455:     
                    456:     me->isa = (HTStructuredClass*) &PlainToHTMLConversion;
2.12      timbl     457:     me->dtd = &HTMLP_dtd;
                    458:     me->target = output_stream;
2.13      frystyk   459:     me->targetClass = *me->target->isa;/* Copy pointers to routines for speed*/
                    460:     me->write_pointer = me->buffer;
2.21      timbl     461:     flush_breaks(me);
2.13      frystyk   462:     
                    463:     HTMLGen_start_element(me, HTML_HTML, present, value);
                    464:     HTMLGen_start_element(me, HTML_BODY, present, value);
                    465:     HTMLGen_start_element(me, HTML_PRE, present, value);
                    466: 
2.7       timbl     467:     return (HTStream*) me;
2.1       timbl     468: }
2.13      frystyk   469: 
                    470: 
2.17      timbl     471: /*     A safe version for making 7-bit restricted HTML
                    472: **     Beware that thsi makes it horrible for the Scandinavians
                    473: **     to actually read it.
                    474: */
                    475: 
                    476: PUBLIC HTStream* HTPlainTo7BitHTML ARGS5(
                    477:        HTRequest *,            request,
                    478:        void *,                 param,
                    479:        HTFormat,               input_format,
                    480:        HTFormat,               output_format,
                    481:        HTStream *,             output_stream)
2.13      frystyk   482: 
2.17      timbl     483: {
                    484:     HTStream* me = HTPlainToHTML(request,param,input_format,
                    485:                output_format, output_stream);
                    486:     ((HTStructured*)me)->seven_bit = YES;
                    487:     return me;
                    488: }
2.1       timbl     489: 

Webmaster