/* SGML.c
** GENERAL SGML PARSER CODE
**
** (c) COPYRIGHT MIT 1995.
** Please first read the full copyright statement in the file COPYRIGH.
**
** This module implements an HTStream object. To parse an
** SGML file, create this object which is a parser. The object
** is (currently) created by being passed a DTD structure,
** and a target HTStructured oject at which to throw the parsed stuff.
**
** 6 Feb 93 Binary seraches used. Intreface modified.
** 8 Jul 94 FM Insulate free() from _free structure element.
*/
/* Library include files */
#include "tcp.h"
#include "HTUtils.h"
#include "HTString.h"
#include "HTChunk.h"
#include "SGML.h"
#define INVALID (-1)
/* The State (context) of the parser
**
** This is passed with each call to make the parser reentrant
**
*/
/* Element Stack
** -------------
** This allows us to return down the stack reselcting styles.
** As we return, attribute values will be garbage in general.
*/
typedef struct _HTElement HTElement;
struct _HTElement {
HTElement * next; /* Previously nested element or 0 */
HTTag* tag; /* The tag at this level */
};
typedef enum _sgml_state {
S_text, S_literal, S_tag, S_tag_gap,
S_attr, S_attr_gap, S_equals, S_value, S_after_open,
S_nl, S_nl_tago,
S_ero, S_cro,
#ifdef ISO_2022_JP
S_esc, S_dollar, S_paren, S_nonascii_text,
#endif
S_squoted, S_dquoted, S_end, S_entity, S_junk_tag
} sgml_state;
/* Internal Context Data Structure
** -------------------------------
*/
struct _HTStream {
CONST HTStreamClass * isa; /* inherited from HTStream */
CONST SGML_dtd *dtd;
HTStructuredClass *actions; /* target class */
HTStructured *target; /* target object */
HTTag *current_tag;
int current_attribute_number;
HTChunk *string;
HTElement *element_stack;
sgml_state state;
#ifdef CALLERDATA
void * callerData;
#endif
BOOL present[MAX_ATTRIBUTES]; /* Flags: attribute is present? */
char * value[MAX_ATTRIBUTES]; /* malloc'd strings or NULL if none */
} ;
#define PUTC(ch) ((*context->actions->put_character)(context->target, ch))
/* Find Attribute Number
** ---------------------
*/
PUBLIC int SGMLFindAttribute (HTTag* tag, CONST char * s)
{
attr* attributes = tag->attributes;
int high, low, i, diff; /* Binary search for attribute name */
for(low=0, high=tag->number_of_attributes;
high > low ;
diff < 0 ? (low = i+1) : (high = i) ) {
i = (low + (high-low)/2);
diff = strcasecomp(attributes[i].name, s);
if (diff==0) return i; /* success: found it */
} /* for */
return -1;
}
/* Handle Attribute
** ----------------
*/
/* PUBLIC CONST char * SGML_default = ""; ?? */
PRIVATE void handle_attribute_name (HTStream * context, CONST char * s)
{
HTTag * tag = context->current_tag;
int i = SGMLFindAttribute(tag, s);
if (i>=0) {
context->current_attribute_number = i;
context->present[i] = YES;
if (context->value[i]) {
free(context->value[i]);
context->value[i] = NULL;
}
return;
} /* if */
if (SGML_TRACE)
TTYPrint(TDEST, "SGML: Unknown attribute %s for tag %s\n",
s, context->current_tag->name);
context->current_attribute_number = INVALID; /* Invalid */
}
/* Handle attribute value
** ----------------------
*/
PRIVATE void handle_attribute_value (HTStream * context, CONST char * s)
{
if (context->current_attribute_number != INVALID) {
StrAllocCopy(context->value[context->current_attribute_number], s);
} else {
if (SGML_TRACE) TTYPrint(TDEST, "SGML: Attribute value %s ignored\n", s);
}
context->current_attribute_number = INVALID; /* can't have two assignments! */
}
/* Handle entity
** -------------
**
** On entry,
** s contains the entity name zero terminated
** Bugs:
** If the entity name is unknown, the terminator is treated as
** a printable non-special character in all cases, even if it is '<'
*/
PRIVATE void handle_entity (HTStream * context, char term)
{
CONST char ** entities = context->dtd->entity_names;
CONST char *s = context->string->data;
int high, low, i, diff;
for(low=0, high = context->dtd->number_of_entities;
high > low ;
diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
i = (low + (high-low)/2);
diff = strcmp(entities[i], s); /* Csse sensitive! */
if (diff==0) { /* success: found it */
(*context->actions->put_entity)(context->target, i);
return;
}
}
/* If entity string not found, display as text */
if (SGML_TRACE)
TTYPrint(TDEST, "SGML: Unknown entity %s\n", s);
PUTC('&');
{
CONST char *p;
for (p=s; *p; p++) {
PUTC(*p);
}
}
PUTC(term);
}
/* End element
** -----------
*/
PRIVATE void end_element (HTStream * context, HTTag * old_tag)
{
if (SGML_TRACE) TTYPrint(TDEST, "SGML: End </%s>\n", old_tag->name);
if (old_tag->contents == SGML_EMPTY) {
if (SGML_TRACE) TTYPrint(TDEST,"SGML: Illegal end tag </%s> found.\n",
old_tag->name);
return;
}
while (context->element_stack) {/* Loop is error path only */
HTElement * N = context->element_stack;
HTTag * t = N->tag;
if (old_tag != t) { /* Mismatch: syntax error */
if (context->element_stack->next) { /* This is not the last level */
if (SGML_TRACE) TTYPrint(TDEST,
"SGML: Found </%s> when expecting </%s>. </%s> assumed.\n",
old_tag->name, t->name, t->name);
} else { /* last level */
if (SGML_TRACE) TTYPrint(TDEST,
"SGML: Found </%s> when expecting </%s>. </%s> Ignored.\n",
old_tag->name, t->name, old_tag->name);
return; /* Ignore */
}
}
context->element_stack = N->next; /* Remove from stack */
free(N);
(*context->actions->end_element)(context->target,
t - context->dtd->tags);
if (old_tag == t) return; /* Correct sequence */
/* Syntax error path only */
}
if (SGML_TRACE) TTYPrint(TDEST,
"SGML: Extra end tag </%s> found and ignored.\n", old_tag->name);
}
/* Start an element
** ----------------
*/
PRIVATE void start_element (HTStream * context)
{
HTTag * new_tag = context->current_tag;
if (SGML_TRACE) TTYPrint(TDEST, "SGML: Start <%s>\n", new_tag->name);
(*context->actions->start_element)(
context->target,
new_tag - context->dtd->tags,
context->present,
(CONST char**) context->value); /* coerce type for think c */
if (new_tag->contents != SGML_EMPTY) { /* i.e. tag not empty */
HTElement * N = (HTElement *)malloc(sizeof(HTElement));
if (N == NULL) outofmem(__FILE__, "start_element");
N->next = context->element_stack;
N->tag = new_tag;
context->element_stack = N;
}
}
/* Find Tag in DTD tag list
** ------------------------
**
** On entry,
** dtd points to dtd structire including valid tag list
** string points to name of tag in question
**
** On exit,
** returns:
** NULL tag not found
** else address of tag structure in dtd
*/
PUBLIC HTTag * SGMLFindTag (CONST SGML_dtd* dtd, CONST char * string)
{
int high, low, i, diff;
for(low=0, high=dtd->number_of_tags;
high > low ;
diff < 0 ? (low = i+1) : (high = i)) { /* Binary serach */
i = (low + (high-low)/2);
diff = strcasecomp(dtd->tags[i].name, string); /* Case insensitive */
if (diff==0) { /* success: found it */
return &dtd->tags[i];
}
}
return NULL;
}
/*________________________________________________________________________
** Public Methods
*/
/* Could check that we are back to bottom of stack! @@ */
PUBLIC int SGML_flush (HTStream * context)
{
while (context->element_stack) {
HTElement *ptr = context->element_stack;
if (SGML_TRACE)
TTYPrint(TDEST, "SGML........ Non-matched tag found: <%s>\n",
context->element_stack->tag->name);
context->element_stack = ptr->next;
free(ptr);
}
return (*context->actions->flush)(context->target);
}
PUBLIC int SGML_free (HTStream * context)
{
int status;
int cnt;
while (context->element_stack) { /* Make sure, that all tags are gone */
HTElement *ptr = context->element_stack;
if (SGML_TRACE)
TTYPrint(TDEST, "SGML........ Non-matched tag found: <%s>\n",
context->element_stack->tag->name);
context->element_stack = ptr->next;
free(ptr);
}
if ((status = (*context->actions->_free)(context->target)) != HT_OK)
return status;
HTChunkFree(context->string);
for(cnt=0; cnt<MAX_ATTRIBUTES; cnt++) /* Leak fix Henrik 18/02-94 */
if(context->value[cnt])
free(context->value[cnt]);
free(context);
return HT_OK;
}
PUBLIC int SGML_abort (HTStream * context, HTList * e)
{
int cnt;
while (context->element_stack) { /* Make sure, that all tags are gone */
HTElement *ptr = context->element_stack;
if (SGML_TRACE)
TTYPrint(TDEST, "SGML........ Non-matched tag found: <%s>\n",
context->element_stack->tag->name);
context->element_stack = ptr->next;
free(ptr);
}
(*context->actions->abort)(context->target, e);
HTChunkFree(context->string);
for(cnt=0; cnt<MAX_ATTRIBUTES; cnt++) /* Leak fix Henrik 18/02-94 */
if(context->value[cnt])
free(context->value[cnt]);
free(context);
return HT_ERROR;
}
/* Read and write user callback handle
** -----------------------------------
**
** The callbacks from the SGML parser have an SGML context parameter.
** These calls allow the caller to associate his own context with a
** particular SGML context.
*/
#ifdef CALLERDATA
PUBLIC void* SGML_callerData (HTStream * context)
{
return context->callerData;
}
PUBLIC void SGML_setCallerData (HTStream * context, void* data)
{
context->callerData = data;
}
#endif
PUBLIC int SGML_character (HTStream * context, char c)
{
CONST SGML_dtd *dtd = context->dtd;
HTChunk *string = context->string;
switch(context->state) {
case S_after_open: /* Strip one trainling newline
only after opening nonempty element. - SGML:Ugh! */
if (c=='\n' && (context->current_tag->contents != SGML_EMPTY)) {
break;
}
context->state = S_text;
goto normal_text;
/* (***falls through***) */
case S_text:
normal_text:
#ifdef ISO_2022_JP
if (c=='\033') {
context->state = S_esc;
PUTC(c);
break;
}
#endif /* ISO_2022_JP */
if (c=='&' && (!context->element_stack || (
context->element_stack->tag &&
( context->element_stack->tag->contents == SGML_MIXED
|| context->element_stack->tag->contents ==
SGML_RCDATA)
))) {
string->size = 0;
context->state = S_ero;
} else if (c=='<') {
string->size = 0;
context->state = (context->element_stack &&
context->element_stack->tag &&
context->element_stack->tag->contents == SGML_LITERAL) ?
S_literal : S_tag;
} else if (c=='\n') { /* Newline - ignore if before tag end! */
context->state = S_nl;
} else PUTC(c);
break;
case S_nl:
if (c=='<') {
string->size = 0;
context->state = (context->element_stack &&
context->element_stack->tag &&
context->element_stack->tag->contents == SGML_LITERAL) ?
S_literal : S_nl_tago;
} else {
PUTC('\n');
context->state = S_text;
goto normal_text;
}
break;
case S_nl_tago: /* Had newline and tag opener */
if (c != '/') {
PUTC('\n'); /* Only ignore newline before </ */
}
context->state = S_tag;
goto handle_S_tag;
#ifdef ISO_2022_JP
case S_esc:
if (c=='$') {
context->state = S_dollar;
} else if (c=='(') {
context->state = S_paren;
} else {
context->state = S_text;
}
PUTC(c);
break;
case S_dollar:
if (c=='@' || c=='B') {
context->state = S_nonascii_text;
} else {
context->state = S_text;
}
PUTC(c);
break;
case S_paren:
if (c=='B' || c=='J') {
context->state = S_text;
} else {
context->state = S_text;
}
PUTC(c);
break;
case S_nonascii_text:
if (c=='\033') {
context->state = S_esc;
PUTC(c);
} else {
PUTC(c);
}
break;
#endif /* ISO_2022_JP */
/* In literal mode, waits only for specific end tag!
** Only foir compatibility with old servers.
*/
case S_literal :
HTChunkPutc(string, c);
if ( TOUPPER(c) != ((string->size ==1) ? '/'
: context->element_stack->tag->name[string->size-2])) {
int i;
/* If complete match, end literal */
if ((c=='>') && (!context->element_stack->tag->name[string->size-2])) {
end_element(context, context->element_stack->tag);
string->size = 0;
context->current_attribute_number = INVALID;
context->state = S_text;
break;
} /* If Mismatch: recover string. */
PUTC( '<');
for (i=0; i<string->size; i++) /* recover */
PUTC(
string->data[i]);
context->state = S_text;
}
break;
/* Character reference or Entity
*/
case S_ero:
if (c=='#') {
context->state = S_cro; /* &# is Char Ref Open */
break;
}
context->state = S_entity; /* Fall through! */
/* Handle Entities
*/
case S_entity:
if (isalnum(c))
HTChunkPutc(string, c);
else {
HTChunkTerminate(string);
handle_entity(context, c);
context->state = S_text;
}
break;
/* Character reference
*/
case S_cro:
if (isalnum(c))
HTChunkPutc(string, c); /* accumulate a character NUMBER */
else {
int value;
HTChunkTerminate(string);
if (sscanf(string->data, "%d", &value)==1)
PUTC((char) value);
context->state = S_text;
}
break;
/* Tag
*/
case S_tag: /* new tag */
handle_S_tag:
if (isalnum(c))
HTChunkPutc(string, c);
else { /* End of tag name */
HTTag * t;
if (c=='/') {
if (SGML_TRACE) if (string->size!=0)
TTYPrint(TDEST,"SGML: `<%s/' found!\n", string->data);
context->state = S_end;
break;
}
HTChunkTerminate(string) ;
t = SGMLFindTag(dtd, string->data);
if (!t) {
if(SGML_TRACE) TTYPrint(TDEST, "SGML: *** Unknown element %s\n",
string->data);
context->state = (c=='>') ? S_text : S_junk_tag;
break;
}
context->current_tag = t;
/* Clear out attributes
*/
{
int i;
for (i=0; i< context->current_tag->number_of_attributes; i++)
context->present[i] = NO;
}
string->size = 0;
context->current_attribute_number = INVALID;
if (c=='>') {
if (context->current_tag->name) start_element(context);
context->state = S_after_open;
} else {
context->state = S_tag_gap;
}
}
break;
case S_tag_gap: /* Expecting attribute or > */
if (WHITE(c)) break; /* Gap between attributes */
if (c=='>') { /* End of tag */
if (context->current_tag->name) start_element(context);
context->state = S_after_open;
break;
}
HTChunkPutc(string, c);
context->state = S_attr; /* Get attribute */
break;
/* accumulating value */
case S_attr:
if (WHITE(c) || (c=='>') || (c=='=')) { /* End of word */
HTChunkTerminate(string) ;
handle_attribute_name(context, string->data);
string->size = 0;
if (c=='>') { /* End of tag */
if (context->current_tag->name) start_element(context);
context->state = S_after_open;
break;
}
context->state = (c=='=' ? S_equals: S_attr_gap);
} else {
HTChunkPutc(string, c);
}
break;
case S_attr_gap: /* Expecting attribute or = or > */
if (WHITE(c)) break; /* Gap after attribute */
if (c=='>') { /* End of tag */
if (context->current_tag->name) start_element(context);
context->state = S_after_open;
break;
} else if (c=='=') {
context->state = S_equals;
break;
}
HTChunkPutc(string, c);
context->state = S_attr; /* Get next attribute */
break;
case S_equals: /* After attr = */
if (WHITE(c)) break; /* Before attribute value */
if (c=='>') { /* End of tag */
if (SGML_TRACE) TTYPrint(TDEST, "SGML: found = but no value\n");
if (context->current_tag->name) start_element(context);
context->state = S_after_open;
break;
} else if (c=='\'') {
context->state = S_squoted;
break;
} else if (c=='"') {
context->state = S_dquoted;
break;
}
HTChunkPutc(string, c);
context->state = S_value;
break;
case S_value:
if (WHITE(c) || (c=='>')) { /* End of word */
HTChunkTerminate(string) ;
handle_attribute_value(context, string->data);
string->size = 0;
if (c=='>') { /* End of tag */
if (context->current_tag->name) start_element(context);
context->state = S_after_open;
break;
}
else context->state = S_tag_gap;
} else {
HTChunkPutc(string, c);
}
break;
case S_squoted: /* Quoted attribute value */
if (c=='\'') { /* End of attribute value */
HTChunkTerminate(string) ;
handle_attribute_value(context, string->data);
string->size = 0;
context->state = S_tag_gap;
} else {
HTChunkPutc(string, c);
}
break;
case S_dquoted: /* Quoted attribute value */
if (c=='"') { /* End of attribute value */
HTChunkTerminate(string) ;
handle_attribute_value(context, string->data);
string->size = 0;
context->state = S_tag_gap;
} else {
HTChunkPutc(string, c);
}
break;
case S_end: /* </ */
if (isalnum(c))
HTChunkPutc(string, c);
else { /* End of end tag name */
HTTag * t;
HTChunkTerminate(string) ;
if (!*string->data) { /* Empty end tag */
t = context->element_stack->tag;
} else {
t = SGMLFindTag(dtd, string->data);
}
if (!t) {
if(SGML_TRACE) TTYPrint(TDEST,
"Unknown end tag </%s>\n", string->data);
} else {
context->current_tag = t;
end_element( context, context->current_tag);
}
string->size = 0;
context->current_attribute_number = INVALID;
if (c!='>') {
if (SGML_TRACE && !WHITE(c))
TTYPrint(TDEST,"SGML: `</%s%c' found!\n",
string->data, c);
context->state = S_junk_tag;
} else {
context->state = S_text;
}
}
break;
case S_junk_tag:
if (c=='>') {
context->state = S_text;
}
} /* switch on context->state */
return HT_OK;
}
PUBLIC int SGML_string (HTStream * context, CONST char* s)
{
while (*s)
SGML_character(context, *s++);
return HT_OK;
}
PUBLIC int SGML_write (HTStream * context, CONST char* b, int l)
{
while (l-- > 0)
SGML_character(context, *b++);
return HT_OK;
}
/*_______________________________________________________________________
*/
/* Structured Object Class
** -----------------------
*/
PRIVATE CONST HTStreamClass SGMLParser =
{
"SGMLParser",
SGML_flush,
SGML_free,
SGML_abort,
SGML_character,
SGML_string,
SGML_write,
};
/* Create SGML Engine
** ------------------
**
** On entry,
** dtd represents the DTD, along with
** actions is the sink for the data as a set of routines.
**
*/
PUBLIC HTStream * SGML_new (CONST SGML_dtd * dtd, HTStructured * target)
{
int i;
HTStream* context = (HTStream *) malloc(sizeof(*context));
if (!context) outofmem(__FILE__, "SGML_begin");
context->isa = &SGMLParser;
context->string = HTChunkCreate(128); /* Grow by this much */
context->dtd = dtd;
context->target = target;
context->actions = (HTStructuredClass*)(((HTStream*)target)->isa);
/* Ugh: no OO */
context->state = S_text;
context->element_stack = 0; /* empty */
#ifdef CALLERDATA
context->callerData = (void*) callerData;
#endif
for(i=0; i<MAX_ATTRIBUTES; i++) context->value[i] = 0;
return context;
}
Webmaster