Annotation of libwww/Library/src/HTMLGen.c, revision 2.43

2.25      frystyk     1: /*                                                                   HTMLGen.c
                      2: **     HTML GENERATOR
                      3: **
2.29      frystyk     4: **     (c) COPYRIGHT MIT 1995.
2.25      frystyk     5: **     Please first read the full copyright statement in the file COPYRIGH.
2.43    ! frystyk     6: **     @(#) $Id: HTMLGen.c,v 2.42 1996/04/12 17:47:48 frystyk Exp $
2.1       timbl       7: **
                      8: **     This version of the HTML object sends HTML markup to the output stream.
                      9: **
                     10: ** Bugs:       Line wrapping is not done at all.
                     11: **             All data handled as PCDATA.
                     12: **             Should convert old XMP, LISTING and PLAINTEXT to PRE.
                     13: **
                     14: **     It is not obvious to me right now whether the HEAD should be generated
2.7       timbl      15: **     from the incoming data or the anchor.  Currently it is from the former
2.17      timbl      16: **     which is cleanest. TBL
2.22      duns       17: **
                     18: ** HISTORY:
                     19: **      8 Jul 94  FM   Insulate free() from _free structure element.
                     20: **
2.1       timbl      21: */
                     22: 
2.27      frystyk    23: /* Library include files */
2.43    ! frystyk    24: #include "wwwsys.h"
2.27      frystyk    25: #include "HTUtils.h"
2.12      timbl      26: #include "HTMLPDTD.h"
2.31      frystyk    27: #include "HTStruct.h"
2.1       timbl      28: #include "HTFormat.h"
2.23      frystyk    29: #include "HTMLGen.h"                                    /* Implemented here */
                     30: 
                     31: #define BUFFER_SIZE    80      /* Line buffer attempts to make neat breaks */
2.31      frystyk    32: #define MAX_CLEANNESS  10
2.1       timbl      33: 
2.31      frystyk    34: #define PUT_CHAR(c)    (*me->target->isa->put_character)(me->target, c)
                     35: #define PUT_STR(s)     (*me->target->isa->put_string)(me->target, s)
                     36: #define PUT_BLOCK(s,l) (*me->target->isa->put_block)(me->target, s, l)
2.1       timbl      37: 
2.31      frystyk    38: /* HTML Generator Object */
2.1       timbl      39: struct _HTStream {
2.41      frystyk    40:     const HTStreamClass *      isa;
2.31      frystyk    41:     HTStream *                 target;
2.1       timbl      42: };
                     43: 
                     44: struct _HTStructured {
2.41      frystyk    45:     const HTStructuredClass *  isa;
2.31      frystyk    46:     HTStream *                         target;
2.41      frystyk    47:     const SGML_dtd *           dtd;
2.31      frystyk    48:     BOOL                       seven_bit;                /* restrict output */
2.7       timbl      49:        
2.31      frystyk    50:     char                       buffer[BUFFER_SIZE+1];
                     51:     char *                     write_pointer;
                     52:     char *                     line_break [MAX_CLEANNESS+1];
                     53:     int                                cleanness;
                     54:     BOOL                       overflowed;
                     55:     BOOL                       delete_line_break_char[MAX_CLEANNESS+1];
                     56:     char                       preformatted;
2.1       timbl      57: };
                     58: 
2.17      timbl      59: /*                     OUTPUT FUNCTIONS
                     60: **
                     61: **     These function output the finished SGML stream doing the
                     62: **     line wrap
                     63: */
                     64: 
2.7       timbl      65: /*     Flush Buffer
                     66: **     ------------
                     67: */
2.21      timbl      68: 
2.37      frystyk    69: PRIVATE void flush_breaks (HTStructured * me)
2.21      timbl      70: {
                     71:     int i;
                     72:     for (i=0; i<= MAX_CLEANNESS; i++) {
                     73:         me->line_break[i] = NULL;
                     74:     }
                     75: }
                     76: 
                     77: 
2.37      frystyk    78: PRIVATE int HTMLGen_flush (HTStructured * me)
2.7       timbl      79: {
2.31      frystyk    80:     PUT_BLOCK(me->buffer, me->write_pointer - me->buffer);
2.7       timbl      81:     me->write_pointer = me->buffer;
2.21      timbl      82:     flush_breaks(me);
2.7       timbl      83:     me->cleanness = 0;
2.28      frystyk    84:     return HT_OK;
2.21      timbl      85: }
                     86: 
                     87: 
                     88: /*     Weighted optional line break
                     89: **
                     90: **     We keep track of all the breaks for when we chop the line
                     91: */
                     92: 
2.37      frystyk    93: PRIVATE void allow_break (HTStructured * me, int new_cleanness, BOOL dlbc)
2.21      timbl      94: {
                     95:     me->line_break[new_cleanness] = 
                     96:                         dlbc ? me->write_pointer - 1 /* Point to space */
                     97:                              : me->write_pointer ;   /* point to gap */
                     98:     me->delete_line_break_char[new_cleanness] = dlbc;
                     99:     if (new_cleanness >= me->cleanness)
                    100:        me->cleanness = new_cleanness;
2.7       timbl     101: }
                    102: 
                    103: 
2.1       timbl     104: /*     Character handling
                    105: **     ------------------
2.8       timbl     106: **
                    107: **     The tricky bits are the line break handling.  This attempts
                    108: **     to synchronise line breaks on sentence or phrase ends. This
                    109: **     is important if one stores SGML files in a line-oriented code
                    110: **     repository, so that if a small change is made, line ends don't
                    111: **     shift in a ripple-through to apparently change a large part of the
                    112: **     file. We give extra "cleanness" to spaces appearing directly
                    113: **     after periods (full stops), [semi]colons and commas.
                    114: **        This should make the source files easier to read and modify
2.17      timbl     115: **     by hand, too, though this is not a primary design consideration. TBL
2.1       timbl     116: */
2.21      timbl     117: PRIVATE char delims[] = ",;:.";                /* @@ english bias */
2.37      frystyk   118: PRIVATE int HTMLGen_output_character (HTStructured * me, char c)
2.1       timbl     119: {
2.7       timbl     120: 
                    121:     *me->write_pointer++ = c;
                    122:     
2.21      timbl     123:     if (c=='\n') {             /* Newlines */
                    124:         if (me->preformatted) {
                    125:            HTMLGen_flush(me);
2.28      frystyk   126:            return HT_OK;
2.21      timbl     127:        } else {
                    128:            me->write_pointer[-1] = c = ' ';    /* Treat same as space */
                    129:        }
2.7       timbl     130:     }
                    131:     
2.21      timbl     132:     /* Figure our whether we can break at this point
                    133:     */
2.7       timbl     134:     if ((!me->preformatted  && c==' ')) {
2.8       timbl     135:         int new_cleanness = 1;
                    136:        if (me->write_pointer > (me->buffer + 1)) {
2.9       luotonen  137:            char * p;
2.11      timbl     138:            p = strchr(delims, me->write_pointer[-2]);
2.21      timbl     139:            if (p) new_cleanness = p - delims + 4;
2.8       timbl     140:        }
2.21      timbl     141:        allow_break(me, new_cleanness, YES);
2.7       timbl     142:     }
                    143:     
2.21      timbl     144:     /* Flush buffer out when full, or whenever the line is over
                    145:        the nominal maximum and we can break at all
                    146:     */
                    147:     if (me->write_pointer >= me->buffer + BUFFER_SIZE-1
                    148:         ||  (me->overflowed && me->cleanness)) {
                    149:        if (me->cleanness) {
                    150:            char line_break_char = me->line_break[me->cleanness][0];
                    151:            char * saved = me->line_break[me->cleanness];
2.8       timbl     152:            
2.21      timbl     153:            if (me->delete_line_break_char[me->cleanness]) saved++; 
                    154:            me->line_break[me->cleanness][0] = '\n';
2.31      frystyk   155:            PUT_BLOCK(me->buffer, me->line_break[me->cleanness]-me->buffer+1);
2.21      timbl     156:            me->line_break[me->cleanness][0] = line_break_char;
2.7       timbl     157:            {  /* move next line in */
2.8       timbl     158:                char * p=saved;
                    159:                char *q;
                    160:                for(q=me->buffer; p < me->write_pointer; )
2.7       timbl     161:                        *q++ = *p++;
                    162:            }
                    163:            me->cleanness = 0;
2.21      timbl     164:            /* Now we have to check whether ther are any perfectly good breaks
                    165:            ** which weren't good enough for the last line but may be
                    166:            **  good enough for the next
                    167:            */
                    168:            {
                    169:                int i;
                    170:                for(i=0; i <= MAX_CLEANNESS; i++) {
                    171:                    if (me->line_break[i] > saved) {
                    172:                        me->line_break[i] = me->line_break[i] -
                    173:                                                (saved-me->buffer);
                    174:                        me->cleanness = i;
                    175:                    } else {
                    176:                        me->line_break[i] = NULL;
                    177:                    }
                    178:                }
                    179:            }
                    180: 
2.8       timbl     181:            me->write_pointer = me->write_pointer - (saved-me->buffer);
2.21      timbl     182:            me->overflowed = NO;
                    183:        } else {   /* No break- just output with no newline */
2.31      frystyk   184:            PUT_BLOCK(me->buffer, me->write_pointer - me->buffer);
2.8       timbl     185:            me->write_pointer = me->buffer;
2.21      timbl     186:            flush_breaks(me);
                    187:            me->overflowed = YES;
2.7       timbl     188:        }
                    189:     }
2.28      frystyk   190:     return HT_OK;
2.1       timbl     191: }
                    192: 
                    193: 
                    194: /*     String handling
                    195: **     ---------------
                    196: */
2.41      frystyk   197: PRIVATE int HTMLGen_output_string (HTStructured * me, const char* s)
2.17      timbl     198: {
2.28      frystyk   199:     while (*s)
                    200:        HTMLGen_output_character(me, *s++);
                    201:     return HT_OK;
2.17      timbl     202: }
                    203: 
                    204: 
                    205: /*                     INPUT FUNCTIONS
                    206: **
                    207: **     These take data from the structured stream.  In the input
                    208: **     stream, entities are in raw form.  The seven_bit flag controls
                    209: **     whether the ISO Latin-1 characters are represented in SGML entity
                    210: **     form.  This is only recommended for viewing on older non-latin-1
                    211: **     capable equipment, or for mailing for example. 
                    212: **
                    213: ** Bug: assumes local encoding is ISO!
                    214: */     
2.37      frystyk   215: PRIVATE int HTMLGen_put_character (HTStructured * me, char c)
2.17      timbl     216: {
                    217:     if (c=='&') HTMLGen_output_string(me, "&amp;");
                    218:     else if (c=='<') HTMLGen_output_string(me, "&lt;");
                    219:     else if (me->seven_bit && ((unsigned char)c > 127)) {
                    220:         char temp[8];
                    221:        sprintf(temp, "&%d;", c);
                    222:        HTMLGen_output_string(me, temp);
2.28      frystyk   223:     } else
                    224:        HTMLGen_output_character(me, c);
                    225:     return HT_OK;
2.17      timbl     226: }
                    227: 
2.41      frystyk   228: PRIVATE int HTMLGen_put_string (HTStructured * me, const char* s)
2.1       timbl     229: {
2.28      frystyk   230:     while (*s)
                    231:        HTMLGen_put_character(me, *s++);
                    232:     return HT_OK;
2.1       timbl     233: }
                    234: 
2.41      frystyk   235: PRIVATE int HTMLGen_write (HTStructured * me, const char* b, int l)
2.1       timbl     236: {
2.28      frystyk   237:     while (l-- > 0)
                    238:        HTMLGen_put_character(me, *b++);
                    239:     return HT_OK;
2.1       timbl     240: }
                    241: 
                    242: 
                    243: /*     Start Element
                    244: **     -------------
2.7       timbl     245: **
                    246: **     Within the opening tag, there may be spaces
                    247: **     and the line may be broken at these spaces.
2.1       timbl     248: */
2.37      frystyk   249: PRIVATE void HTMLGen_start_element (
                    250:        HTStructured *  me,
                    251:        int                     element_number,
2.41      frystyk   252:        const BOOL*             present,
                    253:        const char **           value)
2.1       timbl     254: {
                    255:     int i;
2.12      timbl     256:     HTTag * tag = &me->dtd->tags[element_number];
2.1       timbl     257: 
2.20      timbl     258:     /* Control line breaks allowed within tag! */
                    259:     int was_preformatted = me->preformatted;   /* save state */
                    260:     me->preformatted = 1;      /* Can break between attributes */
                    261: 
2.17      timbl     262:     HTMLGen_output_character(me, '<');
                    263:     HTMLGen_output_string(me, tag->name);
2.1       timbl     264:     if (present) for (i=0; i< tag->number_of_attributes; i++) {
                    265:         if (present[i]) {
2.17      timbl     266:            HTMLGen_output_character(me, ' ');
2.21      timbl     267:            allow_break(me, 1, YES);
2.17      timbl     268:            HTMLGen_output_string(me, tag->attributes[i].name);
2.1       timbl     269:            if (value[i]) {
2.17      timbl     270:                HTMLGen_output_string(me, "=\"");
                    271:                HTMLGen_output_string(me, value[i]);
                    272:                HTMLGen_output_character(me, '"');
2.1       timbl     273:            }
                    274:        }
                    275:     }
2.20      timbl     276:     me->preformatted = was_preformatted;       /* Restore state */
                    277: 
2.14      frystyk   278:     /* Nested PRE is no more a problem! */
                    279:     if (element_number == HTML_PRE)
                    280:        me->preformatted++;
2.19      timbl     281: 
                    282:     HTMLGen_output_character(me, '>');
2.7       timbl     283:     
2.20      timbl     284:     /* Here is a funny one.  In PRE, newlines are significant, except of
                    285:     course for one after the <PRE> which is ignored. This means that
                    286:     we MUST put in a dummy one after the <PRE> to protect any real newline
                    287:     within the pre section.
                    288:     
                    289:     However, *within* a PRE section, although we can break after
                    290:     (for example) emphasis start tags, it will probably confuse some
                    291:     parsers so we won't.*/
                    292:     
                    293:     if (element_number == HTML_PRE) {
                    294:         HTMLGen_output_character(me, '\n');
                    295:     } else  if (!me->preformatted && 
                    296:         tag->contents != SGML_EMPTY) {  /* can break after element start */ 
2.21      timbl     297:        allow_break(me, 3, NO);
2.8       timbl     298:     }
2.1       timbl     299: }
                    300: 
                    301: 
2.17      timbl     302: /*     End Element
                    303: **     -----------
2.1       timbl     304: **
2.16      timbl     305: **      The rules for inserting CR LF into SGML are weird, strict, and
                    306: **     nonintuitive.
2.20      timbl     307: **     See comment also about PRE above.
2.1       timbl     308: */
2.37      frystyk   309: PRIVATE void HTMLGen_end_element (HTStructured * me, int element_number)
2.1       timbl     310: {
2.20      timbl     311:     if (element_number == HTML_PRE) {
                    312:         HTMLGen_output_character(me, '\n');
                    313:     } else  if (!me->preformatted) { /* can break before element end */ 
2.21      timbl     314:        allow_break(me, 1, NO);
2.8       timbl     315:     }
2.17      timbl     316:     HTMLGen_output_string(me, "</");
                    317:     HTMLGen_output_string(me, me->dtd->tags[element_number].name);
                    318:     HTMLGen_output_character(me, '>');    /* NO break after. TBL 940501 */
2.14      frystyk   319:     if (element_number == HTML_PRE && me->preformatted)
                    320:        me->preformatted--;
2.1       timbl     321: }
                    322: 
                    323: 
2.17      timbl     324: /*     Expanding entities
                    325: **     ------------------
2.1       timbl     326: **
                    327: */
                    328: 
2.37      frystyk   329: PRIVATE void HTMLGen_put_entity (HTStructured * me, int entity_number)
2.1       timbl     330: {
2.17      timbl     331:     HTMLGen_output_character(me, '&');
                    332:     HTMLGen_output_string(me, me->dtd->entity_names[entity_number]);
                    333:     HTMLGen_output_character(me, ';');
2.1       timbl     334: }
                    335: 
2.17      timbl     336: /*     Free an object
                    337: **     --------------
2.1       timbl     338: **
                    339: */
2.37      frystyk   340: PRIVATE int HTMLGen_free (HTStructured * me)
2.1       timbl     341: {
2.21      timbl     342:     HTMLGen_flush(me);
2.31      frystyk   343:     PUT_CHAR('\n');
                    344:     (*me->target->isa->_free)(me->target);
2.39      frystyk   345:     HT_FREE(me);
2.28      frystyk   346:     return HT_OK;
2.1       timbl     347: }
                    348: 
                    349: 
2.37      frystyk   350: PRIVATE int PlainToHTML_free (HTStructured * me)
2.7       timbl     351: {
                    352:     HTMLGen_end_element(me, HTML_PRE);
                    353:     HTMLGen_end_element(me, HTML_BODY);
                    354:     HTMLGen_end_element(me, HTML_HTML);
                    355:     HTMLGen_free(me);
2.28      frystyk   356:     return HT_OK;
2.7       timbl     357: }
                    358: 
                    359: 
2.1       timbl     360: 
2.37      frystyk   361: PRIVATE int HTMLGen_abort (HTStructured * me, HTList * e)
2.1       timbl     362: {
2.6       timbl     363:     HTMLGen_free(me);
2.28      frystyk   364:     return HT_ERROR;
2.1       timbl     365: }
                    366: 
                    367: 
2.37      frystyk   368: PRIVATE int PlainToHTML_abort (HTStructured * me, HTList * e)
2.1       timbl     369: {
2.7       timbl     370:     PlainToHTML_free(me);
2.28      frystyk   371:     return HT_ERROR;
2.1       timbl     372: }
                    373: 
                    374: 
                    375: 
                    376: /*     Structured Object Class
                    377: **     -----------------------
                    378: */
2.41      frystyk   379: PRIVATE const HTStructuredClass HTMLGeneration = /* As opposed to print etc */
2.1       timbl     380: {              
                    381:        "text/html",
2.28      frystyk   382:        HTMLGen_flush,
2.1       timbl     383:        HTMLGen_free,
2.6       timbl     384:        HTMLGen_abort,
2.1       timbl     385:        HTMLGen_put_character,  HTMLGen_put_string, HTMLGen_write,
2.13      frystyk   386:        HTMLGen_start_element,  HTMLGen_end_element,
2.1       timbl     387:        HTMLGen_put_entity
                    388: }; 
                    389: 
                    390: 
                    391: /*     Subclass-specific Methods
                    392: **     -------------------------
                    393: */
2.37      frystyk   394: PUBLIC HTStructured* HTMLGenerator (HTRequest *        request,
                    395:                                    void *      param,
                    396:                                    HTFormat    input_format,
                    397:                                    HTFormat    output_format,
                    398:                                    HTStream *  output_stream)
2.1       timbl     399: {
2.39      frystyk   400:     HTStructured* me;
                    401:     if ((me = (HTStructured  *) HT_CALLOC(1, sizeof(HTStructured))) == NULL)
                    402:         HT_OUTOFMEM("HTMLGenerator");
2.3       timbl     403:     me->isa = &HTMLGeneration;       
2.12      timbl     404:     me->dtd = &HTMLP_dtd;
2.31      frystyk   405:     if ((me->target = HTStreamStack(WWW_HTML, output_format, output_stream,
                    406:                                    request, YES)) == NULL) {
                    407:        if (STREAM_TRACE)
2.40      eric      408:            HTTrace("HTMLGen..... Can't convert to media type\n");
2.39      frystyk   409:        HT_FREE(me);
2.38      frystyk   410:        me->target = HTErrorStream();
2.31      frystyk   411:     }
2.7       timbl     412:     me->write_pointer = me->buffer;
2.21      timbl     413:     flush_breaks(me);
2.3       timbl     414:     return me;
2.1       timbl     415: }
                    416: 
                    417: /*     Stream Object Class
                    418: **     -------------------
                    419: **
2.2       timbl     420: **     This object just converts a plain text stream into HTML
2.12      timbl     421: **     It is officially a structured stream but only the stream bits exist.
2.2       timbl     422: **     This is just the easiest way of typecasting all the routines.
2.1       timbl     423: */
2.41      frystyk   424: PRIVATE const HTStructuredClass PlainToHTMLConversion =
2.1       timbl     425: {              
                    426:        "plaintexttoHTML",
2.28      frystyk   427:        HTMLGen_flush,
2.13      frystyk   428:        PlainToHTML_free,       /* HTMLGen_free,  Henrik 03/03-94 */
2.6       timbl     429:        PlainToHTML_abort,      
2.1       timbl     430:        HTMLGen_put_character,
                    431:        HTMLGen_put_string,
                    432:        HTMLGen_write,
2.2       timbl     433:        NULL,           /* Structured stuff */
                    434:        NULL,
                    435:        NULL
2.1       timbl     436: }; 
                    437: 
                    438: 
                    439: /*     HTConverter from plain text to HTML Stream
                    440: **     ------------------------------------------
2.13      frystyk   441: **
                    442: ** Changed by henrik 03/03-94, so no more core dumps etc. (I hope!!!)
2.1       timbl     443: */
                    444: 
2.37      frystyk   445: PUBLIC HTStream* HTPlainToHTML (HTRequest *    request,
                    446:                                void *          param,
                    447:                                HTFormat        input_format,
                    448:                                HTFormat        output_format,
                    449:                                HTStream *      output_stream)
2.1       timbl     450: {
2.13      frystyk   451:     BOOL present[MAX_ATTRIBUTES];      /* Flags: attribute is present? */
2.41      frystyk   452:     const char *value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
2.39      frystyk   453:     HTStructured* me;
                    454:     if ((me = (HTStructured *) HT_CALLOC(1,sizeof(*me))) == NULL)
                    455:         HT_OUTOFMEM("PlainToHTML");
2.13      frystyk   456:     
2.32      frystyk   457:     memset((void *) present, '\0', MAX_ATTRIBUTES);
                    458:     memset((void *) value, '\0', MAX_ATTRIBUTES*sizeof(char *));
2.13      frystyk   459:     
                    460:     me->isa = (HTStructuredClass*) &PlainToHTMLConversion;
2.12      timbl     461:     me->dtd = &HTMLP_dtd;
                    462:     me->target = output_stream;
2.13      frystyk   463:     me->write_pointer = me->buffer;
2.21      timbl     464:     flush_breaks(me);
2.13      frystyk   465:     
2.33      frystyk   466:     if (me->target) {
                    467:        HTMLGen_start_element(me, HTML_HTML, present, value);
                    468:        HTMLGen_start_element(me, HTML_BODY, present, value);
                    469:        HTMLGen_start_element(me, HTML_PRE, present, value);
                    470:     }
2.7       timbl     471:     return (HTStream*) me;
2.1       timbl     472: }
2.13      frystyk   473: 
                    474: 
2.17      timbl     475: /*     A safe version for making 7-bit restricted HTML
                    476: **     Beware that this makes it horrible for the Scandinavians
                    477: **     to actually read it.
2.30      frystyk   478: **     ehh - not horrible - THIS REALLY PISSES THEM OFF - Henrik ;-)
2.17      timbl     479: */
                    480: 
2.37      frystyk   481: PUBLIC HTStream* HTPlainTo7BitHTML (HTRequest *        request,
                    482:                                    void *      param,
                    483:                                    HTFormat    input_format,
                    484:                                    HTFormat    output_format,
                    485:                                    HTStream *  output_stream)
2.17      timbl     486: {
                    487:     HTStream* me = HTPlainToHTML(request,param,input_format,
                    488:                output_format, output_stream);
                    489:     ((HTStructured*)me)->seven_bit = YES;
                    490:     return me;
                    491: }
2.1       timbl     492: 

Webmaster