File:  [Public] / libwww / Library / src / HTTeXGen.c
Revision 2.20: download - view: text, annotated - select for diffs
Fri Feb 9 19:27:01 1996 UTC (28 years, 4 months ago) by frystyk
Branches: MAIN
CVS tags: v4/0D, HEAD
post HT_Mem

/*								     HTTeXGen.c
**	HTML -> LaTeX CONVERTER
**
**	(c) COPYRIGHT MIT 1995.
**	Please first read the full copyright statement in the file COPYRIGH.
**
** 	This version of the HTML object sends LaTeX to the output stream.
** 	No attributes are considered in the translation!
** 	The module uses simple 1:1 table-conversions, but this COULD be
**	expanded to a stack-machine. This would then be in start_element and
**	end_element...
**						Henrik 07/03-94
**
** HISTORY:
**	 8 Jul 94  FM	Insulate free() from _free structure element.
**
*/

#define BUFFER_SIZE	80	/* Line buffer attempts to make neat breaks */
#define WORD_DELIMITERS ",;:[]()"

/* Library include files */
#include "tcp.h"
#include "HTUtils.h"
#include "HTTeXGen.h"
#include "HTMLPDTD.h"
#include "HTStruct.h"
#include "HTFormat.h"

/*		HTML Object
**		-----------
*/

struct _HTStream {
	CONST HTStreamClass *		isa;	
	HTStream * 			target;
	HTStreamClass			targetClass;	   /* COPY for speed */
};

struct _HTStructured {
	CONST HTStructuredClass *	isa;
	HTStream * 			target;
	HTStreamClass			targetClass;	   /* COPY for speed */
	CONST SGML_dtd *		dtd;
	
	char				buffer[2*BUFFER_SIZE];   /* See note */
	char *				write_pointer;
	char *				line_break;
	BOOL				sensitive;          /* Can we put \n */
	BOOL				preformatted;     /* Is it verbatim? */
	BOOL				markup;     /* If doing LaTeX markup */
	BOOL				startup;      /* To skip MIME header */
};

/* The buffer has to be bigger than 80 as latex markup might make the line
   longer before we get to flush it. */

PRIVATE char *TeX_names[HTMLP_ELEMENTS][2] = {
    { "",       	""		},	/* HTML_A		*/
    { "",		""		},	/* HTML_ABBREV		*/
    { "\n\\begin{abstract}\n","\n\\end{abstract}\n"},  /* HTML_ABSTRACT	*/
    { "",		""		},	/* HTML_ACRONYM		*/
    { "",		""		},	/* HTML_ADDED		*/
    { "{\\it ",		"}"		},	/* HTML_ADDRESS		*/
    { "",		""		},	/* HTML_ARG		*/
    { "{\\bf ",		"}"		},	/* HTML_B		*/
    { "",		""		},	/* HTML_BASE		*/
    { "{\\sf ",		"}"		},	/* HTML_BLOCKQUOTE 	*/
    { "", 		""		},	/* HTML_BODY     	*/
    { "",		""		},	/* HTML_BOX		*/
    { "",		""		},	/* HTML_BR		*/
    { "",		""		},	/* HTML_BYLINE		*/
    { "",		""		},	/* HTML_CAPTION		*/
    { "",		""		},	/* HTML_CHANGED		*/
    { "\\cite{",      	"}"		},	/* HTML_CITE		*/
    { "",		""		},	/* HTML_CMD		*/
    { "{\\tt ",		"}"		},	/* HTML_CODE		*/
    { "\n\\typeout{",	"}\n"		},	/* HTML_COMMENT		*/
    { "]",		""		},	/* HTML_DD		*/
    { "",		""		},	/* HTML_DFN		*/
    { "",		""		},	/* HTML_DIR		*/
    { "\n\\begin{description}","\n\\end{description}\n"}, /* HTML_DL 	*/
    { "\n\\item[",	""		},	/* HTML_DT		*/
    { "{\\em ",		"}"		},	/* HTML_EM		*/
    { "",	""			},	/* HTML_FIG		*/
    { "\n\\footnote{",	"}\n"	       	},	/* HTML_FOOTNOTE	*/
    { "",	"" 			},	/* HTML_FORM		*/
    { "\n\\chapter{",	"}\n"		},	/* HTML_H1		*/
    { "\n\\section{",	"}\n"		},	/* HTML_H2		*/
    { "\n\\subsection{","}\n"  		},	/* HTML_H3		*/
    { "\n\\subsubsection{","}\n"	},	/* HTML_H4		*/
    { "\n\\paragraph{",	"}\n"		},	/* HTML_H5		*/
    { "\n\\subparagraph{","}\n"		},	/* HTML_H6		*/
    { "",		"\n"		},	/* HTML_H7		*/
    { "",		""		},	/* HTML_HEAD		*/
    { "",		""		},	/* HTML_HR		*/
    { "",		""		}, 	/* HTML_HTML		*/
    { "",		""		},    	/* HTML_HTMLPLUS	*/
    { "{\\it ",		"}"		},	/* HTML_I		*/
    { "",		""		},	/* HTML_IMAGE		*/
    { "_FIGUR_",	""		},	/* HTML_IMG		*/
    { "",		""		},	/* HTML_INPUT		*/
    { "",		""		},	/* HTML_ISINDEX		*/
    { "{\\tt ",		"}"		},	/* HTML_KBD		*/
    { "",		""		},	/* HTML_L		*/
    { "\n\\item ",      "" 	        },	/* HTML_LI		*/
    { "",		""		},	/* HTML_LINK		*/
    { "",		""		},	/* HTML_LISTING		*/
    { "",		""		},	/* HTML_LIT		*/
    { "",		""		},	/* HTML_MARGIN		*/
    { "",		""		},	/* HTML_MATH		*/
    { "",		""		},	/* HTML_MENU		*/
    { "",		""		},	/* HTML_NEXTID		*/
    { "",		""		},	/* HTML_NOTE		*/
    { "\n\\begin{enumerate}\n","\n\\end{enumerate}\n"}, /* HTML_OL     	*/
    { "",		""		},	/* HTML_OPTION		*/
    { "",		""		},	/* HTML_OVER		*/
    { "\n\n",		""		},	/* HTML_P		*/
    { "",		""		},	/* HTML_PERSON		*/
    { "",		""		},	/* HTML_PLAINTEXT	*/
    { "\n\\begin{verbatim}"," \\end{verbatim}\n"},  /* HTML_PRE        */
    { "",		""		},	/* HTML_Q		*/
    { "\\begin{quote}",	"\\end{quote}"}, 	/* HTML_QUOTE		*/
    { "",		""		},	/* HTML_RENDER		*/
    { "",		""		},	/* HTML_REMOVED		*/
    { "",		""		},	/* HTML_S		*/
    { "",		""		},	/* HTML_SAMP		*/
    { "", 		""		},	/* HTML_SELECT		*/
    { "{\\bf ",		"}"		},	/* HTML_STRONG		*/
    { "", 		""		},	/* HTML_SUB		*/
    { "",		""		},	/* HTML_SUP		*/
    { "",		""		},	/* HTML_TAB		*/
    { "",		""		},	/* HTML_TABLE		*/
    { "",		""		},	/* HTML_TD		*/
    { "",		""		},	/* HTML_TEXTAREA	*/
    { "",		""		},	/* HTML_TH		*/
    { "\n\\title{",	"}\n\\author{}\n\\maketitle\n"},  /* HTML_TITLE */
    { "",		""		},	/* HTML_TR		*/
    { "",		""		},	/* HTML_TT		*/
    { "",		""		},	/* HTML_U		*/
    { "\n\\begin{itemize}","\n\\end{itemize}\n"},   /* HTML_UL		*/
    { "",		""		},	/* HTML_VAR		*/
    { "{\\sf ",		"}"		}	/* HTML_XMP		*/
};

PRIVATE char *TeX_entities[HTML_ENTITIES] = {
    "\\AE ",		/*"AElig",	 capital AE diphthong (ligature) */ 
    "\\\'{A}",		/*"Aacute",	 capital A, acute accent */ 
    "\\^{A}",		/*"Acirc",	 capital A, circumflex accent */ 
    "\\`{A}",		/*"Agrave",	 capital A, grave accent */ 
    "\\AA",   	 	/*"Aring",	 capital A, ring */ 
    "\\~{A}", 		/*"Atilde",	 capital A, tilde */ 
    "\\\"{A}",		/*"Auml",	 capital A, dieresis or umlaut mark */ 
    "\\c{C}",		/*"Ccedil",	 capital C, cedilla */ 
    "\\OE ",		/*"ETH",	 capital Eth, Icelandic */ 
    "\\\'{E}",		/*"Eacute",	 capital E, acute accent */ 
    "\\^{E}",		/*"Ecirc",	 capital E, circumflex accent */ 
    "\\`{E}",		/*"Egrave",	 capital E, grave accent */ 
    "\\\"{E}",		/*"Euml",	 capital E, dieresis or umlaut mark */ 
    "\\\'{I}",		/*"Iacute",	 capital I, acute accent */ 
    "\\^{I}",		/*"Icirc",	 capital I, circumflex accent */ 
    "\\`{I}",		/*"Igrave",	 capital I, grave accent */ 
    "\\\"{I}",		/*"Iuml",	 capital I, dieresis or umlaut mark */ 
    "\\~{N}",		/*"Ntilde",	 capital N, tilde */ 
    "\\\'{O}",		/*"Oacute",	 capital O, acute accent */ 
    "\\^{O}",		/*"Ocirc",	 capital O, circumflex accent */ 
    "\\`{O}",		/*"Ograve",	 capital O, grave accent */ 
    "\\O ",            	/*"Oslash",	 capital O, slash */ 
    "\\~{O}",		/*"Otilde",	 capital O, tilde */ 
    "\\\"{O}",        	/*"Ouml",	 capital O, dieresis or umlaut mark */ 
    " ",      	/*"THORN",	 capital THORN, Icelandic */ 
    "\\\'{U}",		/*"Uacute",	 capital U, acute accent */ 
    "\\^{U}",		/*"Ucirc",	 capital U, circumflex accent */ 
    "\\`{U}",		/*"Ugrave",	 capital U, grave accent */ 
    "\\\"{U}",		/*"Uuml",	 capital U, dieresis or umlaut mark */ 
    "\\\'{Y}",		/*"Yacute",	 capital Y, acute accent */ 
    "\\\'{a}",		/*"aacute",	 small a, acute accent */ 
    "\\^{a}",		/*"acirc",	 small a, circumflex accent */ 
    "\\ae ",		/*"aelig",	 small ae diphthong (ligature) */ 
    "\\`{a}",		/*"agrave",	 small a, grave accent */ 
    "&",            	/*"amp",	 ampersand */ 
    "\\aa ",		/*"aring",	 small a, ring */ 
    "\\~{a}",		/*"atilde",	 small a, tilde */ 
    "\\\"{a}",		/*"auml",	 small a, dieresis or umlaut mark */ 
    "\\c{c}",		/*"ccedil",	 small c, cedilla */ 
    "\\\'{e}",		/*"eacute",	 small e, acute accent */ 
    "\\^{c}",		/*"ecirc",	 small e, circumflex accent */ 
    "\\`{c}",		/*"egrave",	 small e, grave accent */ 
    "\\oe ",		/*"eth",	 small eth, Icelandic */ 
    "\\\"{e}",		/*"euml",	 small e, dieresis or umlaut mark */ 
    ">",		/*"gt",	 greater than */ 
    "\\\'{\\i}",      	/*"iacute",	 small i, acute accent */ 
    "\\^{\\i}",       	/*"icirc",	 small i, circumflex accent */ 
    "\\`{\\i}",       	/*"igrave",	 small i, grave accent */ 
    "\\\"{\\i}",     	/*"iuml",	 small i, dieresis or umlaut mark */ 
    "<",		/*"lt",	 less than */ 
    "\\~{n}",		/*"ntilde",	 small n, tilde */ 
    "\\\'{o}",		/*"oacute",	 small o, acute accent */ 
    "\\~{o}",		/*"ocirc",	 small o, circumflex accent */ 
    "\\`{o}",		/*"ograve",	 small o, grave accent */ 
    "\\o ",            	/*"oslash",	 small o, slash */ 
    "\\~{o}",         	/*"otilde",	 small o, tilde */ 
    "\\\"{o}",	   	/*"ouml",	 small o, dieresis or umlaut mark */ 
    "\"",	   	/*"quot",	 double quote sign - June 1994 */ 
    "\\ss ",		/*"szlig",	 small sharp s, German (sz ligature)*/ 
    " ",       	/*"thorn",	 small thorn, Icelandic */ 
    "\\\'{u}",        	/*"uacute",	 small u, acute accent */ 
    "\\^{u}",        	/*"ucirc",	 small u, circumflex accent */ 
    "\\`{u}",         	/*"ugrave",	 small u, grave accent */ 
    "\\\"{u}",        	/*"uuml",	 small u, dieresis or umlaut mark */ 
    "\\\'{y}",		/*"yacute",	 small y, acute accent */ 
    "\\\"{y}"	    	/*"yuml",	 small y, dieresis or umlaut mark */ 
};


/*	Flush Buffer
**	------------
*/
PRIVATE int HTTeXGen_flush (HTStructured * me)
{
    int status;
    if ((status =
	 (*me->targetClass.put_block)(me->target, me->buffer,
				      me->write_pointer-me->buffer)) != HT_OK)
	return status;
    me->write_pointer = me->buffer;
    me->line_break = me->buffer;
    return (*me->targetClass.flush)(me->target);
}


/*	Character handling
**	------------------
**
*/
PRIVATE int HTTeXGen_put_character (HTStructured * me, char c)
{
    if (!me->startup)		                      /* To skip MIME header */
	return HT_OK;
    if (c=='\n') {
	if (me->markup || me->preformatted) {     /* Put out as is and flush */
	    *me->write_pointer++ = c;
	    HTTeXGen_flush(me);
	    return HT_OK;
	} else if (me->sensitive || *(me->write_pointer-1)==' ') {
	    return HT_OK;
        } else
	    *me->write_pointer++ = ' ';		      /* Try to pretty print */
    } else if (me->markup || me->preformatted) {
	*me->write_pointer++ = c;
    } else if (c==' ' || c=='\t') {	              /* Skip space and tabs */
	if (*(me->write_pointer-1) != ' ')
	    *me->write_pointer++ = ' ';
	else
	    return HT_OK;
    } else {
	if (c=='$' || c=='&' || c=='%' || c=='#' ||         /* Special chars */
	    c=='{' || c=='}' || c=='_') {
	    *me->write_pointer++ = '\\';
	    *me->write_pointer++ = c;
	} else if (c=='\\') {			            /* Special names */
	    char *temp = "$\\backslash$";
	    strcpy(me->write_pointer, temp);
	    me->write_pointer += strlen(temp);
	} else if (c=='^') {
	    char *temp = "$\\hat{ }$";
	    strcpy(me->write_pointer, temp);
	    me->write_pointer += strlen(temp);	    
	} else if (c=='~') {
	    char *temp = "$\\tilde{ }$";
	    strcpy(me->write_pointer, temp);
	    me->write_pointer += strlen(temp);
	} else if (c=='|' || c=='<' || c=='>') {       	        /* Math mode */
	    *me->write_pointer++ = '$';
	    *me->write_pointer++ = c;
	    *me->write_pointer++ = '$';
	} else
	    *me->write_pointer++ = c;	      	        /* Char seems normal */
    }

    if (c==' ')						   /* Find delimiter */
	me->line_break = me->write_pointer;
    else if (strchr(WORD_DELIMITERS, c))
	me->line_break = me->write_pointer-1;

    /* Flush buffer out when full */
    if (me->write_pointer >= me->buffer+BUFFER_SIZE-3) {
#ifdef OLD_CODE
	if (me->markup || me->preformatted) {
#endif /* OLD_CODE */
	if (me->preformatted) {
	    *me->write_pointer = '\n';
	    (*me->targetClass.put_block)(me->target,
					 me->buffer,
					 me->write_pointer-me->buffer+1);
	    me->write_pointer = me->buffer;
	} else {		       		          /* Use break-point */
	    char line_break_char = *me->line_break;
	    char *saved = me->line_break;
	    *me->line_break = '\n';
	    (*me->targetClass.put_block)(me->target,
					 me->buffer,
					 me->line_break-me->buffer+1);
	    *me->line_break = line_break_char;
	    {                                           /* move next line in */
		char *p = saved;
		char *q;
		for(q=me->buffer; p<me->write_pointer; )
		    *q++ = *p++;
	    }
	    me->write_pointer = me->buffer + (me->write_pointer-saved);
	}	    
	me->line_break = me->buffer;
    }
    return HT_OK;
}



/*	String handling
**	---------------
*/
PRIVATE int HTTeXGen_put_string (HTStructured * me, CONST char* s)
{
    while (*s)
	HTTeXGen_put_character(me, *s++);
    return HT_OK;
}


PRIVATE int HTTeXGen_write (HTStructured * me, CONST char* b, int l)
{
    while (l-- > 0)
	HTTeXGen_put_character(me, *b++);
    return HT_OK;
}


/*	Start Element
**	-------------
**
**     	No attributes are put to the output		Henrik 07/03-94
**	Does no assumptions of WHAT element is started...
*/
PRIVATE void HTTeXGen_start_element (HTStructured * 	me,
				     int		element_number,
				     CONST BOOL *	present,
				     CONST char **	value)
{
    me->startup = YES;			        /* Now, let's get down to it */
    if (me->preformatted == YES) {	       /* Don't start markup in here */
	if (WWWTRACE)
	    TTYPrint(TDEST, "LaTeX....... No Markup in verbatim mode\n");
	return;
    }
    if (element_number == HTML_PRE)
	me->preformatted = YES;
    if (element_number == HTML_CITE ||	              /* No \n here, please! */
	element_number == HTML_DT ||
	element_number == HTML_H1 ||
	element_number == HTML_H2 ||
	element_number == HTML_H3 ||
	element_number == HTML_H4 ||
	element_number == HTML_H5 ||
	element_number == HTML_H6 ||
	element_number == HTML_H7 ||
	element_number == HTML_TITLE)
	me->sensitive = YES;
    else if (element_number == HTML_DD)         /* Only way to turn <DT> off */
	me->sensitive = NO;
    me->markup = element_number == HTML_A ? NO : YES;
    HTTeXGen_put_string(me, *TeX_names[element_number]);
    me->markup = NO;
}


/*		End Element
**		-----------
**
**	Ends an markup element			Henrik 07/03-94
**	Does no assumptions of WHAT element is ended...
*/
PRIVATE void HTTeXGen_end_element (HTStructured * me, int element_number)
{
    if (me->preformatted && element_number != HTML_PRE) {
	if (WWWTRACE)
	    TTYPrint(TDEST, "LaTeX....... No markup in verbatim mode\n");
	return;
    }
    me->preformatted = NO;
    me->markup = YES;
    HTTeXGen_put_string(me, *(TeX_names[element_number]+1));
    me->markup = NO;
    if (element_number == HTML_CITE ||	
	element_number == HTML_DL ||
	element_number == HTML_H1 ||
	element_number == HTML_H2 ||
	element_number == HTML_H3 ||
	element_number == HTML_H4 ||
	element_number == HTML_H5 ||
	element_number == HTML_H6 ||
	element_number == HTML_H7 ||
	element_number == HTML_TITLE)
	me->sensitive = NO;
}


/*		Expanding entities
**		------------------
**
*/
PRIVATE void HTTeXGen_put_entity (HTStructured * me, int entity_number)
{
    BOOL mark = me->markup;
    if (*TeX_entities[entity_number] != '&' && /* Theese are converted later */
	*TeX_entities[entity_number] != '<' &&
	*TeX_entities[entity_number] != '>')
	me->markup = YES;
    HTTeXGen_put_string(me, TeX_entities[entity_number]);
    me->markup = mark;
}



/*	Free an HTML object
**	-------------------
**
*/
PRIVATE int HTTeXGen_free (HTStructured * me)
{
    HTTeXGen_flush(me);
    (*me->targetClass.put_string)(me->target, "\n\\end{document}\n");
    HTTeXGen_flush(me);
    (*me->targetClass._free)(me->target);	/* ripple through */
    HT_FREE(me);
    return HT_OK;
}


PRIVATE int HTTeXGen_abort (HTStructured * me, HTList * e)
{
    HTTeXGen_free(me);
    return HT_ERROR;
}


/*	Structured Object Class
**	-----------------------
*/
PRIVATE CONST HTStructuredClass HTTeXGeneration = /* As opposed to print etc */
{		
	"HTMLToTeX",
	HTTeXGen_flush,
	HTTeXGen_free,
	HTTeXGen_abort,
	HTTeXGen_put_character,        	HTTeXGen_put_string,	HTTeXGen_write,
	HTTeXGen_start_element, 	HTTeXGen_end_element,
	HTTeXGen_put_entity
}; 


/*	HTConverter from HTML to TeX Stream
**	------------------------------------------
**
*/
PUBLIC HTStream* HTMLToTeX (HTRequest *	request,
			    void *	param,
			    HTFormat	input_format,
			    HTFormat	output_format,
			    HTStream *	output_stream)
{
    HTStructured* me;
    if ((me = (HTStructured *) HT_CALLOC(1, sizeof(*me))) == NULL)
        HT_OUTOFMEM("HTMLToTeX");

    me->isa = (HTStructuredClass*) &HTTeXGeneration;
    me->dtd = &HTMLP_dtd;
    me->target = output_stream;
    me->targetClass = *me->target->isa;/* Copy pointers to routines for speed*/
    me->write_pointer = me->buffer;
    me->line_break = 	me->buffer;
    (*me->targetClass.put_string)(me->target,
        "\\documentstyle[11pt]{report}\n\\begin{document}\n");
    return SGML_new(&HTMLP_dtd, me);
}

Webmaster