File:  [Public] / libwww / Library / src / HTMLGen.c
Revision 2.43: download - view: text, annotated - select for diffs
Mon May 4 19:36:56 1998 UTC (26 years, 1 month ago) by frystyk
Branches: MAIN
CVS tags: Release-5-2, Release-5-1m, HEAD
version 5.1m

/*								      HTMLGen.c
**	HTML GENERATOR
**
**	(c) COPYRIGHT MIT 1995.
**	Please first read the full copyright statement in the file COPYRIGH.
**	@(#) $Id: HTMLGen.c,v 2.43 1998/05/04 19:36:56 frystyk Exp $
**
**	This version of the HTML object sends HTML markup to the output stream.
**
** Bugs:	Line wrapping is not done at all.
**		All data handled as PCDATA.
**		Should convert old XMP, LISTING and PLAINTEXT to PRE.
**
**	It is not obvious to me right now whether the HEAD should be generated
**	from the incoming data or the anchor.  Currently it is from the former
**	which is cleanest. TBL
**
** HISTORY:
**	 8 Jul 94  FM	Insulate free() from _free structure element.
**
*/

/* Library include files */
#include "wwwsys.h"
#include "HTUtils.h"
#include "HTMLPDTD.h"
#include "HTStruct.h"
#include "HTFormat.h"
#include "HTMLGen.h"				         /* Implemented here */

/* Size of the line buffer in which output is collected before it is
** handed to the target stream (the buffer itself has one extra byte) */
#define BUFFER_SIZE	80	/* Line buffer attempts to make neat breaks */
/* Highest "cleanness" level of a line break candidate; the per-level
** tables in the generator object have MAX_CLEANNESS+1 slots */
#define MAX_CLEANNESS	10

/* Shorthands for calling the methods of the target stream.
** They expand to expressions using a variable called `me', which must
** therefore be in scope wherever one of these macros is used. */
#define PUT_CHAR(c)	(*me->target->isa->put_character)(me->target, c)
#define PUT_STR(s)	(*me->target->isa->put_string)(me->target, s)
#define PUT_BLOCK(s,l)	(*me->target->isa->put_block)(me->target, s, l)

/* HTML Generator Object */
/* Local definition of the generic stream object. In this file only the
** `isa' member is dereferenced (by the PUT_* macros and HTMLGen_free) in
** order to call the methods of the target stream.
** NOTE(review): presumably every stream implementation starts with these
** members - confirm against the stream interface declaration. */
struct _HTStream {
    const HTStreamClass *	isa;	/* method table of the stream */
    HTStream *			target;	/* not used in this file */
};

/* The HTML generator object. Objects of this type are also handed out
** cast to HTStream * (see HTPlainToHTML), so the order of the leading
** members must not be changed. */
struct _HTStructured {
    const HTStructuredClass *	isa;		/* method table of this object */
    HTStream * 			target;		/* receives the generated markup */
    const SGML_dtd *		dtd;		/* source of tag/entity names */
    BOOL			seven_bit;		  /* restrict output */
	
    char			buffer[BUFFER_SIZE+1];	/* pending output line */
    char *			write_pointer;	/* next free position in buffer */
    /* Per cleanness level: position in buffer of the most recently
    ** recorded break candidate, or NULL if there is none */
    char *			line_break [MAX_CLEANNESS+1];
    int				cleanness;	/* level of the break that will be
						   used next; 0 = no break known */
    BOOL			overflowed;	/* buffer was output without a line
						   break; break at next chance */
    /* Per cleanness level: YES if the break replaces the character it
    ** points to (a space), NO if it sits in a gap between characters */
    BOOL			delete_line_break_char[MAX_CLEANNESS+1];
    char			preformatted;	/* non-zero: no breaks at spaces.
						   Counts nested PRE elements and
						   is forced to 1 while a start
						   tag is being written */
};

/*			OUTPUT FUNCTIONS
**
**	These functions output the finished SGML stream, doing the
**	line wrapping on the way
*/

/*	Flush Buffer
**	------------
*/

/*	Forget all break candidates
**
**	Sets every slot of the line_break table (levels 0..MAX_CLEANNESS)
**	to NULL. The `cleanness' member is NOT touched; callers reset it
**	themselves where needed.
*/
PRIVATE void flush_breaks (HTStructured * me)
{
    char ** slot = me->line_break;
    char ** const past_last = me->line_break + MAX_CLEANNESS + 1;

    while (slot < past_last) {
	*slot++ = NULL;
    }
}


/*	Write out whatever is in the line buffer
**
**	The buffered characters are passed to the target stream as one block
**	(without adding a newline), after which the buffer is empty and no
**	break candidates remain. The `overflowed' flag is left as it is.
**
**	Returns HT_OK always; the result of the target's put_block method
**	is not looked at.
*/
PRIVATE int HTMLGen_flush (HTStructured * me)
{
    char * const line_start = me->buffer;

    PUT_BLOCK(line_start, me->write_pointer - line_start);

    /* Buffer is empty again: nothing left to break */
    me->cleanness = 0;
    flush_breaks(me);
    me->write_pointer = line_start;

    return HT_OK;
}


/*	Weighted optional line break
**
**	We keep track of all the breaks for when we chop the line.
**
**	me		the generator object
**	new_cleanness	level of the break (0..MAX_CLEANNESS); a higher
**			level is preferred when the line is chopped
**	dlbc		"delete line break char": YES if the break replaces
**			the character just written (a space), NO if the
**			break sits in the gap at the current write position
**
**	The position is stored in line_break[new_cleanness] and the current
**	`cleanness' is raised to new_cleanness if that is at least as high.
**
**	A request with dlbc set while the buffer is empty is ignored: the
**	character that should be replaced has already left the buffer (this
**	happens when the caller wrote the space and that very character
**	caused the buffer to be flushed). Recording it anyway would store a
**	pointer to the byte in front of the buffer, which the line chopping
**	code later writes through.
*/

PRIVATE void allow_break (HTStructured * me, int new_cleanness, BOOL dlbc)
{
    if (dlbc && me->write_pointer <= me->buffer)
	return;				/* No character left to replace */

    me->line_break[new_cleanness] = 
			 dlbc ? me->write_pointer - 1 /* Point to space */
			      : me->write_pointer ;   /* point to gap */
    me->delete_line_break_char[new_cleanness] = dlbc;
    if (new_cleanness >= me->cleanness)
	me->cleanness = new_cleanness;
}


/*	Character handling
**	------------------
**
**	The tricky bits are the line break handling.  This attempts
**	to synchronise line breaks on sentence or phrase ends. This
**	is important if one stores SGML files in a line-oriented code
**	repository, so that if a small change is made, line ends don't
**	shift in a ripple-through to apparently change a large part of the
**	file. We give extra "cleanness" to spaces appearing directly
**	after periods (full stops), [semi]colons and commas.
**	   This should make the source files easier to read and modify
**	by hand, too, though this is not a primary design consideration. TBL
**
**	What the function does with the character `c':
**	 1. It is appended to the line buffer.
**	 2. A newline flushes the buffer and returns when in preformatted
**	    mode; otherwise it is turned into a space.
**	 3. Outside preformatted mode a space is recorded as a break
**	    candidate: level 1 normally, or 4, 5, 6, 7 if the character
**	    in front of it is ',', ';', ':' or '.' respectively.
**	 4. If the buffer is (nearly) full, or an earlier overflow is still
**	    pending and a break is now known, the line is chopped at the
**	    break of the current cleanness and the rest of the buffer is
**	    moved to the front. Without any known break the whole buffer
**	    is written out without a newline and `overflowed' is set.
**
**	Returns HT_OK always; results of the target's put_block method are
**	not looked at.
*/
PRIVATE char delims[] = ",;:.";		/* @@ english bias */
PRIVATE int HTMLGen_output_character (HTStructured * me, char c)
{

    /* Store first - the checks below look at the buffer contents */
    *me->write_pointer++ = c;
    
    if (c=='\n') {		/* Newlines */
        if (me->preformatted) {
	    /* The newline is significant: it goes out with the line */
	    HTMLGen_flush(me);
	    return HT_OK;
	} else {
	    me->write_pointer[-1] = c = ' ';	/* Treat same as space */
	}
    }
    
    /* Figure out whether we can break at this point
    */
    if ((!me->preformatted  && c==' ')) {
        int new_cleanness = 1;
	/* Only look at the preceding character if there is one */
	if (me->write_pointer > (me->buffer + 1)) {
	    char * p;
	    p = strchr(delims, me->write_pointer[-2]);
	    /* Position in delims decides the level: ',' 4 ... '.' 7.
	    ** NOTE(review): a preceding NUL character would match the
	    ** terminator of delims and give level 8 - confirm harmless */
	    if (p) new_cleanness = p - delims + 4;
	}
	allow_break(me, new_cleanness, YES);
    }
    
    /* Flush buffer out when full, or whenever the line is over
       the nominal maximum and we can break at all
    */
    if (me->write_pointer >= me->buffer + BUFFER_SIZE-1
        ||  (me->overflowed && me->cleanness)) {
    	if (me->cleanness) {
	    /* Remember the character at the break so that it can be put
	    ** back after it has been used to carry the newline */
	    char line_break_char = me->line_break[me->cleanness][0];
	    /* `saved' is the first character of the next line: the break
	    ** position itself, or the one after it when the break
	    ** character (a space) is to be dropped */
	    char * saved = me->line_break[me->cleanness];
	    
	    if (me->delete_line_break_char[me->cleanness]) saved++; 
	    /* Output everything up to the break with a newline in the
	    ** place of the break character, then restore the buffer */
	    me->line_break[me->cleanness][0] = '\n';
	    PUT_BLOCK(me->buffer, me->line_break[me->cleanness]-me->buffer+1);
	    me->line_break[me->cleanness][0] = line_break_char;
	    {  /* move next line in */
	    	char * p=saved;
		char *q;
		for(q=me->buffer; p < me->write_pointer; )
			*q++ = *p++;
	    }
	    me->cleanness = 0;
	    /* Now we have to check whether there are any perfectly good breaks
	    ** which weren't good enough for the last line but may be
	    **  good enough for the next
	    */
	    {
	        int i;
		/* Breaks behind `saved' are shifted along with the text,
		** all others are dropped. As the levels are visited in
		** ascending order, cleanness ends up as the highest
		** level that survived (or stays 0) */
		for(i=0; i <= MAX_CLEANNESS; i++) {
		    if (me->line_break[i] > saved) {
		        me->line_break[i] = me->line_break[i] -
						(saved-me->buffer);
			me->cleanness = i;
		    } else {
		        me->line_break[i] = NULL;
		    }
		}
	    }

	    /* The write position moves by the same distance as the text */
	    me->write_pointer = me->write_pointer - (saved-me->buffer);
	    me->overflowed = NO;
	} else {   /* No break- just output with no newline */
	    PUT_BLOCK(me->buffer, me->write_pointer - me->buffer);
	    me->write_pointer = me->buffer;
	    flush_breaks(me);
	    /* Makes the next recorded break take effect immediately */
	    me->overflowed = YES;
	}
    }
    return HT_OK;
}


/*	String handling
**	---------------
**
**	Passes every character of the NUL-terminated string `s', in order,
**	to HTMLGen_output_character. No escaping is done here.
**	Returns HT_OK always.
*/
PRIVATE int HTMLGen_output_string (HTStructured * me, const char* s)
{
    const char * cursor;

    for (cursor = s; *cursor != '\0'; cursor++) {
	HTMLGen_output_character(me, *cursor);
    }
    return HT_OK;
}


/*			INPUT FUNCTIONS
**
**	These take data from the structured stream.  In the input
**	stream, entities are in raw form.  The seven_bit flag controls
**	whether the ISO Latin-1 characters are represented in SGML entity
**	form.  This is only recommended for viewing on older non-latin-1
**	capable equipment, or for mailing for example. 
**
** Bug: assumes local encoding is ISO!
*/	
/*	Put one character of document text
**
**	'&' and '<' are written as "&amp;" and "&lt;". If the seven_bit flag
**	is set, a character above 127 is written as a decimal numeric
**	character reference ("&#233;"); every other character is written
**	unchanged. Returns HT_OK always.
**
**	The character is converted to unsigned char before it is formatted:
**	where plain char is signed the value would otherwise be printed as a
**	negative number. The '#' is what marks the reference as numeric.
*/
PRIVATE int HTMLGen_put_character (HTStructured * me, char c)
{
    if (c=='&') HTMLGen_output_string(me, "&amp;");
    else if (c=='<') HTMLGen_output_string(me, "&lt;");
    else if (me->seven_bit && ((unsigned char)c > 127)) {
        /* Longest possible result is "&#255;": 6 characters plus NUL */
        char temp[8];
	sprintf(temp, "&#%d;", (int) (unsigned char) c);
	HTMLGen_output_string(me, temp);
    } else
	HTMLGen_output_character(me, c);
    return HT_OK;
}

/*	Put a NUL-terminated string of document text
**
**	Every character goes through HTMLGen_put_character, so the same
**	escaping applies. Returns HT_OK always.
*/
PRIVATE int HTMLGen_put_string (HTStructured * me, const char* s)
{
    const char * cursor;

    for (cursor = s; *cursor != '\0'; cursor++) {
	HTMLGen_put_character(me, *cursor);
    }
    return HT_OK;
}

/*	Put a block of document text
**
**	The first `l' characters at `b' go through HTMLGen_put_character;
**	nothing is written if `l' is zero or negative. Returns HT_OK always.
*/
PRIVATE int HTMLGen_write (HTStructured * me, const char* b, int l)
{
    int done;

    for (done = 0; done < l; done++) {
	HTMLGen_put_character(me, b[done]);
    }
    return HT_OK;
}


/*	Start Element
**	-------------
**
**	Writes the start tag of element `element_number' of the DTD:
**	"<NAME", then for every attribute i with present[i] set a space,
**	the attribute name and - if value[i] is not NULL - ="value", and
**	finally ">". `present' may be NULL, in which case no attributes are
**	written. Attribute values are written as they are (no escaping).
**
**	Within the opening tag, there may be spaces
**	and the line may be broken at these spaces.
*/
PRIVATE void HTMLGen_start_element (
	HTStructured * 	me,
	int			element_number,
	const BOOL*	 	present,
	const char **		value)
{
    HTTag * tag = &me->dtd->tags[element_number];
    const int opens_pre = (element_number == HTML_PRE);
    int outer_preformatted = me->preformatted;		/* to be restored */
    int attr;

    /* While the tag is written, spaces must not be taken as breaks by
    ** the character output; the breaks between attributes are recorded
    ** explicitly below */
    me->preformatted = 1;

    HTMLGen_output_character(me, '<');
    HTMLGen_output_string(me, tag->name);
    if (present) {
	for (attr = 0; attr < tag->number_of_attributes; attr++) {
	    if (!present[attr])
		continue;
	    HTMLGen_output_character(me, ' ');
	    allow_break(me, 1, YES);
	    HTMLGen_output_string(me, tag->attributes[attr].name);
	    if (value[attr]) {
		HTMLGen_output_string(me, "=\"");
		HTMLGen_output_string(me, value[attr]);
		HTMLGen_output_character(me, '"');
	    }
	}
    }
    me->preformatted = outer_preformatted;

    /* PRE elements are counted, so nesting them is not a problem */
    if (opens_pre)
	me->preformatted++;

    HTMLGen_output_character(me, '>');

    /* A newline directly after <PRE> is ignored by parsers, so a dummy
    ** one is written here in order to protect a real newline at the
    ** start of the preformatted text.
    **
    ** Other elements that can have content allow a break after the start
    ** tag - but not inside a PRE section, as that would probably confuse
    ** some parsers */
    if (opens_pre) {
	HTMLGen_output_character(me, '\n');
    } else if (!me->preformatted && tag->contents != SGML_EMPTY) {
	allow_break(me, 3, NO);
    }
}


/*	End Element
**	-----------
**
**	Writes the end tag "</NAME>" of element `element_number'.
**
**	The rules for inserting CR LF into SGML are weird, strict, and
**	nonintuitive: a break is allowed in front of the end tag (unless we
**	are in a preformatted section) but not after it. The end tag of PRE
**	is preceded by a newline and takes one level off the PRE count.
*/
PRIVATE void HTMLGen_end_element (HTStructured * me, int element_number)
{
    const int closes_pre = (element_number == HTML_PRE);

    if (closes_pre) {
	HTMLGen_output_character(me, '\n');
    } else if (!me->preformatted) {
	allow_break(me, 1, NO);		/* can break before element end */
    }

    HTMLGen_output_string(me, "</");
    HTMLGen_output_string(me, me->dtd->tags[element_number].name);
    HTMLGen_output_character(me, '>');	   /* NO break after. TBL 940501 */

    /* Never count below zero, even if there was no matching <PRE> */
    if (closes_pre && me->preformatted) {
	me->preformatted--;
    }
}


/*	Expanding entities
**	------------------
**
**	Writes entity `entity_number' of the DTD as a reference by name,
**	i.e. '&', the name found in the DTD's entity_names table and ';'.
**	The number is used as index without any range check.
*/

PRIVATE void HTMLGen_put_entity (HTStructured * me, int entity_number)
{
    const char * entity_name = me->dtd->entity_names[entity_number];

    HTMLGen_output_character(me, '&');
    HTMLGen_output_string(me, entity_name);
    HTMLGen_output_character(me, ';');
}

/*	Free an object
**	--------------
**
**	Writes out the line buffer, sends a final newline to the target
**	stream, frees the target stream through its _free method and then
**	releases the generator object itself. `me' must not be used after
**	this call. Returns HT_OK always.
*/
PRIVATE int HTMLGen_free (HTStructured * me)
{
    HTStream * sink;

    HTMLGen_flush(me);

    sink = me->target;
    (*sink->isa->put_character)(sink, '\n');
    (*sink->isa->_free)(sink);

    HT_FREE(me);
    return HT_OK;
}


/*	Free a plain text to HTML converter
**
**	Writes the end tags of the PRE, BODY and HTML elements (in that
**	order) and then frees the object with HTMLGen_free.
**	Returns HT_OK always.
*/
PRIVATE int PlainToHTML_free (HTStructured * me)
{
    /* Innermost element first */
    static const int closing_order[] = { HTML_PRE, HTML_BODY, HTML_HTML };
    int n;

    for (n = 0; n < (int) (sizeof(closing_order)/sizeof(closing_order[0])); n++) {
	HTMLGen_end_element(me, closing_order[n]);
    }
    HTMLGen_free(me);
    return HT_OK;
}



/*	Abort an object
**
**	Does the same as HTMLGen_free (buffered output is still written and
**	the target is freed, not aborted) but reports HT_ERROR.
**	The error list `e' is not used.
*/
PRIVATE int HTMLGen_abort (HTStructured * me, HTList * e)
{
    (void) HTMLGen_free(me);
    return HT_ERROR;
}


/*	Abort a plain text to HTML converter
**
**	Does the same as PlainToHTML_free (the end tags are still written)
**	but reports HT_ERROR. The error list `e' is not used.
*/
PRIVATE int PlainToHTML_abort (HTStructured * me, HTList * e)
{
    (void) PlainToHTML_free(me);
    return HT_ERROR;
}



/*	Structured Object Class
**	-----------------------
**
**	Method table of the HTML generator, installed by HTMLGenerator.
**	The initializer is positional, so the order of the entries must
**	follow the member order of HTStructuredClass.
**	NOTE(review): the role given for each entry below is taken from the
**	function names - confirm against the HTStructuredClass declaration.
*/
PRIVATE const HTStructuredClass HTMLGeneration = /* As opposed to print etc */
{		
	"text/html",					/* name */
	HTMLGen_flush,					/* flush */
	HTMLGen_free,					/* free */
	HTMLGen_abort,					/* abort */
	/* character data: single character, string, block */
	HTMLGen_put_character, 	HTMLGen_put_string, HTMLGen_write,
	/* markup: start tag, end tag */
	HTMLGen_start_element,	HTMLGen_end_element,
	HTMLGen_put_entity				/* entity reference */
}; 


/*	Subclass-specific Methods
**	-------------------------
**
**	Create an HTML generator
**
**	The new object writes its markup to a stream obtained from
**	HTStreamStack, converting from WWW_HTML to `output_format' in front
**	of `output_stream'. If no such stream can be set up, the generator is
**	still created but connected to the stream returned by
**	HTErrorStream().
**
**	`param' and `input_format' are not used.
**
**	Returns the new generator object. Running out of memory is handed
**	to HT_OUTOFMEM.
**
**	Note that the object must NOT be released when the stream stack
**	cannot be built: it is needed right afterwards to hold the error
**	stream, it is initialized further and it is returned to the caller.
*/
PUBLIC HTStructured* HTMLGenerator (HTRequest *	request,
				    void *	param,
				    HTFormat	input_format,
				    HTFormat	output_format,
				    HTStream *	output_stream)
{
    HTStructured* me;
    if ((me = (HTStructured  *) HT_CALLOC(1, sizeof(HTStructured))) == NULL)
        HT_OUTOFMEM("HTMLGenerator");
    me->isa = &HTMLGeneration;       
    me->dtd = &HTMLP_dtd;
    if ((me->target = HTStreamStack(WWW_HTML, output_format, output_stream,
				    request, YES)) == NULL) {
	if (STREAM_TRACE)
	    HTTrace("HTMLGen..... Can't convert to media type\n");
	me->target = HTErrorStream();	/* keep `me', only swap the target */
    }

    /* Start with an empty line buffer and no break candidates */
    me->write_pointer = me->buffer;
    flush_breaks(me);
    return me;
}

/*	Stream Object Class
**	-------------------
**
**	This object just converts a plain text stream into HTML
**	It is officially a structured stream but only the stream bits exist.
**	This is just the easiest way of typecasting all the routines.
**
**	The last three entries are NULL: objects of this class are handed
**	out as HTStream * (see HTPlainToHTML) and must not be used through
**	the structured methods. The initializer is positional and has the
**	same entry order as the one of HTMLGeneration.
*/
PRIVATE const HTStructuredClass PlainToHTMLConversion =
{		
	"plaintexttoHTML",
	HTMLGen_flush,
	/* free and abort also write the end tags of PRE, BODY and HTML */
	PlainToHTML_free,	/* HTMLGen_free,  Henrik 03/03-94 */
	PlainToHTML_abort,	
	HTMLGen_put_character,
	HTMLGen_put_string,
	HTMLGen_write,
	NULL,		/* Structured stuff */
	NULL,
	NULL
}; 


/*	HTConverter from plain text to HTML Stream
**	------------------------------------------
**
**	Creates a stream which wraps the plain text written to it into
**	<HTML><BODY><PRE> ... </PRE></BODY></HTML>, escaping the text on the
**	way, and sends the result to `output_stream'. The three start tags
**	(without attributes) are written right away, provided that
**	`output_stream' is not NULL; the end tags are written when the stream
**	is freed or aborted.
**
**	`request', `param', `input_format' and `output_format' are not used.
**
**	Returns the new object cast to HTStream *. Running out of memory is
**	handed to HT_OUTOFMEM.
**
**	NOTE(review): with a NULL `output_stream' the object is returned
**	without a target, so any later write would dereference NULL -
**	presumably callers always pass a stream; confirm.
**
** Changed by henrik 03/03-94, so no more core dumps etc. (I hope!!!)
*/

PUBLIC HTStream* HTPlainToHTML (HTRequest *	request,
				void *		param,
				HTFormat	input_format,
				HTFormat	output_format,
				HTStream *	output_stream)
{
    BOOL present[MAX_ATTRIBUTES];	/* Flags: attribute is present? */
    const char *value[MAX_ATTRIBUTES];	/* Attribute values, none here */
    HTStructured* me;
    int i;
    if ((me = (HTStructured *) HT_CALLOC(1,sizeof(*me))) == NULL)
        HT_OUTOFMEM("PlainToHTML");
    
    /* No attributes at all. The arrays are cleared element by element so
    ** that the result does not depend on the size of BOOL or on how a
    ** null pointer is represented */
    for (i = 0; i < MAX_ATTRIBUTES; i++) {
	present[i] = NO;
	value[i] = NULL;
    }
    
    me->isa = &PlainToHTMLConversion;
    me->dtd = &HTMLP_dtd;
    me->target = output_stream;
    me->write_pointer = me->buffer;
    flush_breaks(me);
    
    if (me->target) {
	HTMLGen_start_element(me, HTML_HTML, present, value);
	HTMLGen_start_element(me, HTML_BODY, present, value);
	HTMLGen_start_element(me, HTML_PRE, present, value);
    }
    return (HTStream*) me;
}


/* 	A safe version for making 7-bit restricted HTML
**
**	Same as HTPlainToHTML (all arguments are passed on unchanged), but
**	with the seven_bit flag of the new object set, so that characters
**	above 127 are written as references instead of raw bytes.
**
**	Beware that this makes the generated source rather hard to read for
**	anybody whose language needs those characters, e.g. the
**	Scandinavians - and they really do not like that. (Henrik)
*/

PUBLIC HTStream* HTPlainTo7BitHTML (HTRequest *	request,
				    void *	param,
				    HTFormat	input_format,
				    HTFormat	output_format,
				    HTStream *	output_stream)
{
    HTStructured * converter = (HTStructured *)
	HTPlainToHTML(request, param, input_format,
		      output_format, output_stream);

    converter->seven_bit = YES;
    return (HTStream *) converter;
}


Webmaster