File:  [Public] / libwww / Library / src / SGML.c
Revision 1.41: download - view: text, annotated - select for diffs
Tue Jul 2 22:55:21 1996 UTC (27 years, 10 months ago) by frystyk
Branches: MAIN
CVS tags: Release-5-1, Release-5-0a, Release-5-0, Release-4-1b5, Release-4-1b4, Release-4-1b3, PIPELINE1, HEAD
4.1b2

/*									 SGML.c
**	GENERAL SGML PARSER CODE
**
**	(c) COPYRIGHT MIT 1995.
**	Please first read the full copyright statement in the file COPYRIGH.
**	@(#) $Id: SGML.c,v 1.41 1996/07/02 22:55:21 frystyk Exp $
**
**	This module implements an HTStream object. To parse an
**	SGML file, create this object which is a parser. The object
**	is (currently) created by being passed a DTD structure,
**	and a target HTStructured object at which to throw the parsed stuff.
**	
**	 6 Feb 93  	Binary searches used. Interface modified.
**	 8 Jul 94  FM	Insulate free() from _free structure element.
*/

/* Library include files */
#include "sysdep.h"
#include "HTUtils.h"
#include "HTString.h"
#include "HTChunk.h"
#include "SGML.h"

/* Sentinel kept in current_attribute_number while no known attribute
** is waiting for a value. */
#define INVALID (-1)

/*	The State (context) of the parser
**
**	This is passed with each call to make the parser reentrant
**
*/



	
/*		Element Stack
**		-------------
**	This allows us to return down the stack reselecting styles.
**	As we return, attribute values will be garbage in general.
**
**	The stack is a singly linked list whose head is the innermost open
**	element; start_element() pushes a node and end_element() pops them.
*/
typedef struct _HTElement HTElement;
struct _HTElement {
	HTElement *	next;	/* Previously nested element or 0 */
	HTTag*		tag;	/* The tag at this level  */
};


/*	Parser States
**	-------------
**	Values of the state variable that drives the switch in SGML_write():
**
**	  S_text         ordinary character data
**	  S_literal      inside an SGML_LITERAL element, watching for its end tag
**	  S_tag          reading an element name after '<'
**	  S_tag_gap      inside a start tag, expecting an attribute name or '>'
**	  S_attr         reading an attribute name
**	  S_attr_gap     after an attribute name, expecting '=', a name or '>'
**	  S_equals       after '=', expecting a value
**	  S_value        reading an unquoted attribute value
**	  S_after_open   directly after a start tag
**	  S_nl           a newline in text is being held back
**	  S_nl_tago      a held-back newline was followed by '<'
**	  S_ero          after '&'
**	  S_cro          after "&#", reading a character number
**	  S_esc, S_dollar, S_paren, S_nonascii_text
**	                 escape sequence tracking, compiled only with ISO_2022_JP
**	  S_squoted      inside a single-quoted attribute value
**	  S_dquoted      inside a double-quoted attribute value
**	  S_end          reading an element name after "</"
**	  S_entity       reading an entity name
**	  S_junk_tag     discarding input up to the next '>'
*/
typedef enum _sgml_state {
    S_text, S_literal, S_tag, S_tag_gap, 
    S_attr, S_attr_gap, S_equals, S_value, S_after_open,
    S_nl, S_nl_tago,
    S_ero, S_cro,
#ifdef ISO_2022_JP
    S_esc, S_dollar, S_paren, S_nonascii_text,
#endif
    S_squoted, S_dquoted, S_end, S_entity, S_junk_tag
} sgml_state;


/*	Internal Context Data Structure
**	-------------------------------
*/
/* Per-parser state; one instance is allocated by SGML_new() and released
** by SGML_free() or SGML_abort(). */
struct _HTStream {

    const HTStreamClass *	isa;		/* inherited from HTStream */
    
    const SGML_dtd 		*dtd;		/* tag table and entity names */
    HTStructuredClass	*actions;	/* target class  */
    HTStructured	*target;	/* target object */

    HTTag 		*current_tag;	/* tag most recently looked up */
    int 		current_attribute_number; /* attribute awaiting a value, or INVALID */
    HTChunk		*string;	/* accumulates the name or value being read */
    HTElement		*element_stack;	/* open elements, innermost first */
    sgml_state		state;		/* current state of SGML_write() */
    BOOL present[MAX_ATTRIBUTES];	/* Flags: attribute is present? */
    char * value[MAX_ATTRIBUTES];	/* malloc'd strings or NULL if none */
} ;


/* Pass one character to the target's put_character method.  The expansion
** refers to a variable named `context`, which must be in scope at the
** point of use. */
#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))


/*	Find Attribute Number
**	---------------------
**	Binary search of the attribute table of `tag' for the name `s',
**	compared with strcasecomp().  The search presumes the table is
**	ordered by name, as any binary search must.
**
**	Returns the index of the matching attribute, or -1 if there is none.
*/

PRIVATE int SGMLFindAttribute  (HTTag* tag, const char * s)
{
    attr* table = tag->attributes;
    int lo = 0;
    int hi = tag->number_of_attributes;

    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        int cmp = strcasecomp(table[mid].name, s);

        if (cmp == 0)
            return mid;                 /* exact hit */
        if (cmp < 0)
            lo = mid + 1;               /* wanted name sorts after table[mid] */
        else
            hi = mid;                   /* wanted name sorts before table[mid] */
    }

    return -1;
}


/*	Handle Attribute
**	----------------
**	Called with the name `s' of an attribute read from a start tag.
**
**	A name known to the current tag is flagged as present, any value
**	still stored in its slot is released, and it becomes the attribute
**	that the next value will be assigned to.  An unknown name is traced
**	and leaves no attribute pending.
*/
/* PUBLIC const char * SGML_default = "";   ?? */

PRIVATE void handle_attribute_name (HTStream * context, const char * s)
{
    int slot = SGMLFindAttribute(context->current_tag, s);

    if (slot < 0) {
        if (SGML_TRACE)
            HTTrace("SGML Parser. Unknown attribute %s for tag %s\n",
                    s, context->current_tag->name);
        context->current_attribute_number = INVALID;
        return;
    }

    context->current_attribute_number = slot;
    context->present[slot] = YES;
    if (context->value[slot]) {
        HT_FREE(context->value[slot]);
        context->value[slot] = NULL;
    }
}


/*	Handle attribute value
**	----------------------
**	Stores a copy of `s' as the value of the pending attribute.  When no
**	attribute is pending the value is traced and dropped.  Either way no
**	attribute is pending afterwards, so one name receives at most one
**	value.
*/
PRIVATE void handle_attribute_value (HTStream * context, const char * s)
{
    int slot = context->current_attribute_number;

    if (slot == INVALID) {
        if (SGML_TRACE) HTTrace("SGML Parser. Attribute value %s ignored\n", s);
    } else {
        StrAllocCopy(context->value[slot], s);
    }

    context->current_attribute_number = INVALID;
}


/*	Handle entity
**	-------------
**
** On entry,
**	context->string	holds the zero terminated entity name
**	term		is the character that ended the name
**
**	The name is looked up (case sensitively) in the DTD's entity name
**	table by binary search.  A known entity is reported to the target
**	through put_entity and `term' is not output.  An unknown entity is
**	output as text: '&', the name, then `term'.
** Bugs:
**	If the entity name is unknown, the terminator is treated as
**	a printable non-special character in all cases, even if it is '<'
*/
PRIVATE void handle_entity (HTStream * context, char term)
{
    const char ** names = context->dtd->entity_names;
    const char *name = context->string->data;
    const char *p;
    int lo = 0;
    int hi = context->dtd->number_of_entities;

    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        int cmp = strcmp(names[mid], name);

        if (cmp == 0) {
            (*context->actions->put_entity)(context->target, mid);
            return;
        }
        if (cmp < 0)
            lo = mid + 1;
        else
            hi = mid;
    }

    /* Not in the table: show the reference as it was written */
    if (SGML_TRACE)
        HTTrace("SGML Parser. Unknown entity %s\n", name);
    PUTC('&');
    p = name;
    while (*p) {
        PUTC(*p);
        p++;
    }
    PUTC(term);
}

/*
**	Helper function to check if the tag is on the stack
**
**	Walks the list starting at `stack' and returns YES as soon as a
**	node refers to `tag', NO if the end of the list is reached first.
*/
PRIVATE BOOL lookup_element_stack (HTElement* stack, HTTag *tag)
{
    HTElement *node = stack;

    while (node != NULL && node->tag != tag)
        node = node->next;

    return (node != NULL) ? YES : NO;
}

/*	End element
**	-----------
**	Processes an end tag for `old_tag'.
**
**	An end tag for an element declared SGML_EMPTY is illegal and ignored.
**	Otherwise elements are popped from the stack, and reported to the
**	target through end_element, until `old_tag' itself has been popped.
**	Popping more than one element is the error path: it is done only
**	when `old_tag' is open further down the stack, in which case the end
**	tags of the elements above it are assumed.  If `old_tag' is not open
**	at all, the end tag is ignored and the stack is left as it was.
*/
PRIVATE void end_element (HTStream * context, HTTag * old_tag)
{
    HTElement * top;

    if (SGML_TRACE) HTTrace("SGML Parser. End   </%s>\n", old_tag->name);
    if (old_tag->contents == SGML_EMPTY) {
        if (SGML_TRACE) HTTrace("SGML Parser. Illegal end tag </%s> found.\n",
                old_tag->name);
        return;
    }

    while ((top = context->element_stack) != NULL) {
        HTTag * open_tag = top->tag;
        BOOL matches = (open_tag == old_tag) ? YES : NO;

        if (!matches) {                 /* Mismatch: syntax error */
            /*
            ** Patch from Maciej Puzio, puzio@laser.mimuw.edu.pl
            ** See explanation in ../User/Patch/lib_4.0_1.fix
            */
            if (top->next == NULL || !lookup_element_stack(top, old_tag)) {
                /* Last level, or old_tag is not open anywhere */
                if (SGML_TRACE) HTTrace(
                    "SGML Parser. Found </%s> when expecting </%s>. </%s> Ignored.\n",
                    old_tag->name, open_tag->name, old_tag->name);
                return;
            }
            if (SGML_TRACE) HTTrace(
                "SGML Parser. Found </%s> when expecting </%s>. </%s> assumed.\n",
                old_tag->name, open_tag->name, open_tag->name);
        }

        /* Pop the top element and tell the target it has ended */
        context->element_stack = top->next;
        HT_FREE(top);
        (*context->actions->end_element)(context->target,
                open_tag - context->dtd->tags);
        if (matches) return;            /* Correct sequence */
    }

    if (SGML_TRACE) HTTrace(
        "SGML Parser. Extra end tag </%s> found and ignored.\n", old_tag->name);
}


/*	Start an element
**	----------------
**	Reports the current tag to the target through start_element,
**	passing the tag's index in the DTD together with the attribute
**	presence flags and values.  Unless the element is declared
**	SGML_EMPTY it is then pushed onto the element stack.
*/
PRIVATE void start_element (HTStream * context)
{
    HTTag * tag = context->current_tag;
    HTElement * node;

    if (SGML_TRACE) HTTrace("SGML Parser. Start <%s>\n", tag->name);
    (*context->actions->start_element)(
        context->target,
        tag - context->dtd->tags,
        context->present,
        (const char**) context->value);  /* coerce type for think c */

    if (tag->contents == SGML_EMPTY)
        return;                         /* empty elements are never stacked */

    if ((node = (HTElement  *) HT_MALLOC(sizeof(HTElement))) == NULL)
        HT_OUTOFMEM("start_element");
    node->next = context->element_stack;
    node->tag = tag;
    context->element_stack = node;
}


/*		Find Tag in DTD tag list
**		------------------------
**
** On entry,
**	dtd	points to dtd structure including valid tag list
**	string	points to name of tag in question
**
** On exit,
**	returns:
**		NULL		tag not found
**		else		address of tag structure in dtd
**
**	The tag list is searched by bisection with a case insensitive
**	comparison, so it is presumed to be ordered by name.
*/
PRIVATE HTTag * SGMLFindTag (const SGML_dtd* dtd, const char * string)
{
    int lo = 0;
    int hi = dtd->number_of_tags;

    while (lo < hi) {
        int mid = lo + (hi - lo) / 2;
        int cmp = strcasecomp(dtd->tags[mid].name, string);

        if (cmp == 0)
            return &dtd->tags[mid];
        if (cmp < 0)
            lo = mid + 1;
        else
            hi = mid;
    }
    return NULL;
}

/*________________________________________________________________________
**			Public Methods
*/


/*	Flush
**	-----
**	Discards every element still on the stack, tracing each one as a
**	non-matched tag, and then flushes the target.  Returns whatever the
**	target's flush method returns.
**
**	Could check that we are back to bottom of stack! @@
*/
PRIVATE int SGML_flush  (HTStream * context)
{
    HTElement *node;

    while ((node = context->element_stack) != NULL) {
        if (SGML_TRACE)
            HTTrace("SGML........ Non-matched tag found: <%s>\n",
                    node->tag->name);
        context->element_stack = node->next;
        HT_FREE(node);
    }
    return (*context->actions->flush)(context->target);
}

/*	Free
**	----
**	Discards any elements still open, frees the target and then releases
**	the parser itself: its chunk, the stored attribute values and the
**	context structure.
**
**	If the target's _free method returns anything but HT_OK, that status
**	is returned at once and the parser's own storage is NOT released.
*/
PRIVATE int SGML_free  (HTStream * context)
{
    HTElement *node;
    int status;
    int i;

    /* Make sure, that all tags are gone */
    while ((node = context->element_stack) != NULL) {
        if (SGML_TRACE)
            HTTrace("SGML........ Non-matched tag found: <%s>\n",
                    node->tag->name);
        context->element_stack = node->next;
        HT_FREE(node);
    }

    status = (*context->actions->_free)(context->target);
    if (status != HT_OK)
        return status;

    HTChunk_delete(context->string);
    for (i = 0; i < MAX_ATTRIBUTES; i++) {
        if (context->value[i])
            HT_FREE(context->value[i]);
    }
    HT_FREE(context);
    return HT_OK;
}

/*	Abort
**	-----
**	Discards any elements still open, passes the abort (with `e') on to
**	the target, and releases the parser's chunk, stored attribute values
**	and context structure.  Always returns HT_ERROR.
*/
PRIVATE int SGML_abort  (HTStream * context, HTList * e)
{
    HTElement *node;
    int i;

    /* Make sure, that all tags are gone */
    while ((node = context->element_stack) != NULL) {
        if (SGML_TRACE)
            HTTrace("SGML........ Non-matched tag found: <%s>\n",
                    node->tag->name);
        context->element_stack = node->next;
        HT_FREE(node);
    }

    (*context->actions->abort)(context->target, e);

    HTChunk_delete(context->string);
    for (i = 0; i < MAX_ATTRIBUTES; i++) {
        if (context->value[i])
            HT_FREE(context->value[i]);
    }
    HT_FREE(context);
    return HT_ERROR;
}

/*	Write a block of data
**	---------------------
**	Runs the `l' bytes starting at `b' through the parser state machine,
**	one character at a time.  All parser state is kept in `context', so
**	a construct may be split across successive calls.
**
**	Character data goes to the target through PUTC; recognised tags and
**	entities are reported through start_element(), end_element() and
**	handle_entity().  Always returns HT_OK.
**
**	Robustness:
**	  - characters are converted to unsigned char before being handed
**	    to isalnum(), which is undefined for negative arguments
**	  - an empty end tag "</>" met while no element is open is reported
**	    as an unknown end tag instead of dereferencing an empty stack
**	  - in literal mode a NUL input byte never counts as a match, so the
**	    element name is never indexed beyond its terminator
*/
PRIVATE int SGML_write (HTStream * context, const char * b, int l)
{
    const SGML_dtd *dtd = context->dtd;
    HTChunk *string = context->string;

    while (l-- > 0) {
        char c = *b++;
        switch (context->state) {

        /* Strip a newline that directly follows the start tag of a
        ** nonempty element.  NOTE: the state is not changed when a newline
        ** is dropped, so every newline in an uninterrupted run is dropped,
        ** not just the first one. */
        case S_after_open:
            if (c=='\n' && (context->current_tag->contents != SGML_EMPTY)) {
                break;
            }
            context->state = S_text;
            goto normal_text;

        case S_text:
        normal_text:

#ifdef ISO_2022_JP
            if (c=='\033') {
                context->state = S_esc;
                PUTC(c);
                break;
            }
#endif /* ISO_2022_JP */
            /* '&' opens a reference only outside all elements or inside
            ** an element with mixed or RCDATA content */
            if (c=='&' && (!context->element_stack || (
                         context->element_stack->tag  &&
                         ( context->element_stack->tag->contents == SGML_MIXED
                           || context->element_stack->tag->contents ==
                                                         SGML_RCDATA)
                        ))) {
                string->size = 0;
                context->state = S_ero;

            } else if (c=='<') {
                string->size = 0;
                context->state = (context->element_stack &&
                    context->element_stack->tag  &&
                    context->element_stack->tag->contents == SGML_LITERAL) ?
                                S_literal : S_tag;
            } else if (c=='\n') {   /* Newline - ignore if before tag end! */
                context->state = S_nl;
            } else PUTC(c);
            break;

        case S_nl:
            if (c=='<') {
                string->size = 0;
                context->state = (context->element_stack &&
                    context->element_stack->tag  &&
                    context->element_stack->tag->contents == SGML_LITERAL) ?
                                S_literal : S_nl_tago;
            } else {
                PUTC('\n');         /* the held-back newline was real text */
                context->state = S_text;
                goto normal_text;
            }
            break;

        case S_nl_tago:             /* Had newline and tag opener */
            if (c != '/') {
                PUTC('\n');         /* Only ignore newline before </ */
            }
            context->state = S_tag;
            goto handle_S_tag;

#ifdef ISO_2022_JP
        case S_esc:
            if (c=='$') {
                context->state = S_dollar;
            } else if (c=='(') {
                context->state = S_paren;
            } else {
                context->state = S_text;
            }
            PUTC(c);
            break;
        case S_dollar:
            if (c=='@' || c=='B') {
                context->state = S_nonascii_text;
            } else {
                context->state = S_text;
            }
            PUTC(c);
            break;
        case S_paren:
            if (c=='B' || c=='J') {
                context->state = S_text;
            } else {
                context->state = S_text;
            }
            PUTC(c);
            break;
        case S_nonascii_text:
            if (c=='\033') {
                context->state = S_esc;
                PUTC(c);
            } else {
                PUTC(c);
            }
            break;
#endif /* ISO_2022_JP */

        /*  In literal mode, waits only for specific end tag!
        **  Only for compatibility with old servers.
        **
        **  `string' holds what followed '<'.  Its first character must be
        **  '/', the following ones must spell the open element's name.
        **  A NUL byte is always a mismatch: it would otherwise compare
        **  equal to the name's terminator and let the next character
        **  index past the end of the name.
        */
        case S_literal :
            HTChunk_putc(string, c);
            if ( c == '\0' || TOUPPER(c) != ((string->size ==1) ? '/'
                    : context->element_stack->tag->name[string->size-2])) {
                int i;

                /*  If complete match, end literal */
                if ((c=='>') &&
                    (!context->element_stack->tag->name[string->size-2])) {
                    end_element(context, context->element_stack->tag);
                    string->size = 0;
                    context->current_attribute_number = INVALID;
                    context->state = S_text;
                    break;
                }           /* If Mismatch: recover string. */
                PUTC( '<');
                for (i=0; i<string->size; i++)  /* recover */
                    PUTC(string->data[i]);
                context->state = S_text;
            }
            break;

        /*  Character reference or Entity
        */
        case S_ero:
            if (c=='#') {
                context->state = S_cro;  /*   &# is Char Ref Open */
                break;
            }
            context->state = S_entity;
            /* Fall through! */

        /*  Handle Entities
        */
        case S_entity:
            if (isalnum((unsigned char) c))
                HTChunk_putc(string, c);
            else {
                HTChunk_terminate(string);
                handle_entity(context, c);
                context->state = S_text;
            }
            break;

        /*  Character reference
        */
        case S_cro:
            if (isalnum((unsigned char) c))
                HTChunk_putc(string, c);    /* accumulate a character NUMBER */
            else {
                int value;
                HTChunk_terminate(string);
                if (sscanf(string->data, "%d", &value)==1)
                    PUTC((char) value);
                context->state = S_text;    /* the terminator is dropped */
            }
            break;

        /*  Tag
        */
        case S_tag:                         /* new tag */
        handle_S_tag:

            if (isalnum((unsigned char) c))
                HTChunk_putc(string, c);
            else {                          /* End of tag name */
                HTTag * t;
                if (c=='/') {
                    if (SGML_TRACE) if (string->size!=0)
                        HTTrace("SGML Parser.  `<%s/' found!\n", string->data);
                    context->state = S_end;
                    break;
                }
                HTChunk_terminate(string) ;

                t = SGMLFindTag(dtd, string->data);
                if (!t) {
                    if(SGML_TRACE) HTTrace("SGML Parser. *** Unknown element %s\n",
                            string->data);
                    context->state = (c=='>') ? S_text : S_junk_tag;
                    break;
                }
                context->current_tag = t;

                /*  Clear out attributes
                */
                {
                    int i;
                    for (i=0; i< context->current_tag->number_of_attributes; i++)
                        context->present[i] = NO;
                }
                string->size = 0;
                context->current_attribute_number = INVALID;

                if (c=='>') {
                    if (context->current_tag->name) start_element(context);
                    context->state = S_after_open;
                } else {
                    context->state = S_tag_gap;
                }
            }
            break;

        case S_tag_gap:             /* Expecting attribute or > */
            if (WHITE(c)) break;    /* Gap between attributes */
            if (c=='>') {           /* End of tag */
                if (context->current_tag->name) start_element(context);
                context->state = S_after_open;
                break;
            }
            HTChunk_putc(string, c);
            context->state = S_attr;        /* Get attribute */
            break;

        case S_attr:                /* accumulating attribute name */
            if (WHITE(c) || (c=='>') || (c=='=')) {     /* End of word */
                HTChunk_terminate(string) ;
                handle_attribute_name(context, string->data);
                string->size = 0;
                if (c=='>') {       /* End of tag */
                    if (context->current_tag->name) start_element(context);
                    context->state = S_after_open;
                    break;
                }
                context->state = (c=='=' ?  S_equals: S_attr_gap);
            } else {
                HTChunk_putc(string, c);
            }
            break;

        case S_attr_gap:            /* Expecting attribute or = or > */
            if (WHITE(c)) break;    /* Gap after attribute */
            if (c=='>') {           /* End of tag */
                if (context->current_tag->name) start_element(context);
                context->state = S_after_open;
                break;
            } else if (c=='=') {
                context->state = S_equals;
                break;
            }
            HTChunk_putc(string, c);
            context->state = S_attr;        /* Get next attribute */
            break;

        case S_equals:              /* After attr = */
            if (WHITE(c)) break;    /* Before attribute value */
            if (c=='>') {           /* End of tag */
                if (SGML_TRACE) HTTrace("SGML Parser. found = but no value\n");
                if (context->current_tag->name) start_element(context);
                context->state = S_after_open;
                break;

            } else if (c=='\'') {
                context->state = S_squoted;
                break;

            } else if (c=='"') {
                context->state = S_dquoted;
                break;
            }
            HTChunk_putc(string, c);
            context->state = S_value;
            break;

        case S_value:
            if (WHITE(c) || (c=='>')) {     /* End of word */
                HTChunk_terminate(string) ;
                handle_attribute_value(context, string->data);
                string->size = 0;
                if (c=='>') {       /* End of tag */
                    if (context->current_tag->name) start_element(context);
                    context->state = S_after_open;
                    break;
                }
                else context->state = S_tag_gap;
            } else {
                HTChunk_putc(string, c);
            }
            break;

        case S_squoted:             /* Quoted attribute value */
            if (c=='\'') {          /* End of attribute value */
                HTChunk_terminate(string) ;
                handle_attribute_value(context, string->data);
                string->size = 0;
                context->state = S_tag_gap;
            } else {
                HTChunk_putc(string, c);
            }
            break;

        case S_dquoted:             /* Quoted attribute value */
            if (c=='"') {           /* End of attribute value */
                HTChunk_terminate(string) ;
                handle_attribute_value(context, string->data);
                string->size = 0;
                context->state = S_tag_gap;
            } else {
                HTChunk_putc(string, c);
            }
            break;

        case S_end:                                 /* </ */
            if (isalnum((unsigned char) c))
                HTChunk_putc(string, c);
            else {                          /* End of end tag name */
                HTTag * t;
                HTChunk_terminate(string) ;
                if (!*string->data) {       /* Empty end tag */
                    /* Closes the innermost open element; with nothing
                    ** open there is nothing it can refer to */
                    t = context->element_stack ?
                            context->element_stack->tag : NULL;
                } else {
                    t = SGMLFindTag(dtd, string->data);
                }
                if (!t) {
                    if(SGML_TRACE) HTTrace(
                        "Unknown end tag </%s>\n", string->data);
                } else {
                    context->current_tag = t;
                    end_element( context, context->current_tag);
                }

                string->size = 0;
                context->current_attribute_number = INVALID;
                if (c!='>') {
                    if (SGML_TRACE && !WHITE(c))
                        HTTrace("SGML Parser.  `</%s%c' found!\n",
                            string->data, c);
                    context->state = S_junk_tag;
                } else {
                    context->state = S_text;
                }
            }
            break;

        case S_junk_tag:            /* Skip everything up to the next '>' */
            if (c=='>') {
                context->state = S_text;
            }
            break;

        } /* switch on context->state */
    }
    return HT_OK;
}


/* Parse the zero terminated string `s' by handing all of it (without the
** terminator) to SGML_write(); returns SGML_write()'s result. */
PRIVATE int SGML_string (HTStream * context, const char* s)
{
    int length = (int) strlen(s);

    return SGML_write(context, s, length);
}


/* Parse the single character `c' by handing it to SGML_write() as a block
** of length one; returns SGML_write()'s result. */
PRIVATE int SGML_character (HTStream * context, char c)
{
    const char *one = &c;

    return SGML_write(context, one, 1);
}

/*_______________________________________________________________________
*/

/*	Structured Object Class
**	-----------------------
**	Method table of the parser stream; SGML_new() installs its address
**	as the `isa' member of every parser it creates.  The initialisers are
**	positional, so their order has to follow the member order of
**	HTStreamClass (declared outside this file).
*/
PRIVATE const HTStreamClass SGMLParser = 
{		
    "SGMLParser",
    SGML_flush,
    SGML_free,
    SGML_abort,
    SGML_character, 
    SGML_string,
    SGML_write,
}; 

/*	Create SGML Engine
**	------------------
**
** On entry,
**	dtd		represents the DTD the input is parsed against
**	target		is the structured object that receives the parsed
**			data; its class (target->isa) supplies the routines
**			the parser calls.
**
** On exit,
**	returns a new parser stream in state S_text with an empty element
**	stack, no current tag, no pending attribute and no attribute values.
**	Allocation failure is reported through HT_OUTOFMEM.
*/
PUBLIC HTStream * SGML_new (const SGML_dtd * dtd, HTStructured * target)
{
    int i;
    HTStream* context;
    if ((context = (HTStream  *) HT_CALLOC(1, sizeof(HTStream))) == NULL)
        HT_OUTOFMEM("SGML_new");

    context->isa = &SGMLParser;
    context->string = HTChunk_new(128);	/* Grow by this much */
    context->dtd = dtd;
    context->target = target;
    context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
    					/* Ugh: no OO */
    context->state = S_text;
    context->element_stack = 0;			/* empty */
    context->current_tag = NULL;

    /* Zero is a valid attribute index, so "nothing pending" has to be
    ** stated explicitly rather than left to the zeroed allocation */
    context->current_attribute_number = INVALID;
    for(i=0; i<MAX_ATTRIBUTES; i++) {
        context->present[i] = NO;
        context->value[i] = 0;
    }

    return context;
}

Webmaster