Annotation of libwww/Library/src/HTTeXGen.c, revision 2.6
2.1 frystyk 1: /* Simple LaTeX Generator that converts in a 1:1 manner from HTML to LaTeX
2: ** =======================================================================
3: **
4: ** This version of the HTML object sends LaTeX to the output stream.
5: ** No attributes are considered in the translation!
6: ** The module uses simple 1:1 table-conversions, but this COULD be
7: ** expanded to a stack-machine. This would then be in start_element and
8: ** end_element...
9: ** Henrik 07/03-94
10: */
11:
12: #define BUFFER_SIZE 80 /* Line buffer attempts to make neat breaks */
13: #define WORD_DELIMITERS ",/;:\"[]()"
14:
15: /* Implements: */
16: #include "HTTeXGen.h"
17: #include <stdio.h>
18: #include "HTMLPDTD.h"
19: #include "HTStream.h"
20: #include "SGML.h"
21: #include "HTFormat.h"
22: #include "tcp.h"
23:
24:
25: /* HTML Object
26: ** -----------
27: */
28:
29: struct _HTStream {
30: CONST HTStreamClass * isa;
31: HTStream * target;
32: HTStreamClass targetClass; /* COPY for speed */
33: };
34:
35: struct _HTStructured {
36: CONST HTStructuredClass * isa;
37: HTStream * target;
38: HTStreamClass targetClass; /* COPY for speed */
39: CONST SGML_dtd * dtd;
40:
2.5 frystyk 41: char buffer[2*BUFFER_SIZE]; /* See note */
2.1 frystyk 42: char * write_pointer;
43: char * line_break;
44: BOOL sensitive; /* Can we put \n */
45: BOOL preformatted; /* Is it verbatim? */
46: BOOL markup; /* If doing LaTeX markup */
47: BOOL startup; /* To skip MIME header */
48: };
2.5 frystyk 49:
50: /* The buffer has to be bigger than 80 as latex markup might make the line
51: longer before we get to flush it. */
2.1 frystyk 52:
2.4 frystyk 53: PRIVATE char *TeX_names[HTMLP_ELEMENTS][2] = {
2.1 frystyk 54: { "", "" }, /* HTML_A */
55: { "", "" }, /* HTML_ABBREV */
56: { "\n\\begin{abstract}\n","\n\\end{abstract}\n"}, /* HTML_ABSTRACT */
57: { "", "" }, /* HTML_ACRONYM */
58: { "", "" }, /* HTML_ADDED */
59: { "{\\it ", "}" }, /* HTML_ADDRESS */
60: { "", "" }, /* HTML_ARG */
61: { "{\\bf ", "}" }, /* HTML_B */
62: { "", "" }, /* HTML_BASE */
63: { "{\\sf ", "}" }, /* HTML_BLOCKQUOTE */
64: { "", "" }, /* HTML_BODY */
65: { "", "" }, /* HTML_BOX */
66: { "", "" }, /* HTML_BR */
67: { "", "" }, /* HTML_BYLINE */
68: { "", "" }, /* HTML_CAPTION */
69: { "", "" }, /* HTML_CHANGED */
70: { "\\cite{", "}" }, /* HTML_CITE */
71: { "", "" }, /* HTML_CMD */
72: { "{\\tt ", "}" }, /* HTML_CODE */
73: { "\n\\typeout{", "}\n" }, /* HTML_COMMENT */
74: { "]", "" }, /* HTML_DD */
75: { "", "" }, /* HTML_DFN */
76: { "", "" }, /* HTML_DIR */
77: { "\n\\begin{description}","\n\\end{description}\n"}, /* HTML_DL */
78: { "\n\\item[", "" }, /* HTML_DT */
79: { "{\\em ", "}" }, /* HTML_EM */
80: { "", "" }, /* HTML_FIG */
81: { "\n\\footnote{", "}\n" }, /* HTML_FOOTNOTE */
82: { "", "" }, /* HTML_FORM */
83: { "\n\\chapter{", "}\n" }, /* HTML_H1 */
84: { "\n\\section{", "}\n" }, /* HTML_H2 */
85: { "\n\\subsection{","}\n" }, /* HTML_H3 */
86: { "\n\\subsubsection{","}\n" }, /* HTML_H4 */
87: { "\n\\paragraph{", "}\n" }, /* HTML_H5 */
88: { "\n\\subparagraph{","}\n" }, /* HTML_H6 */
89: { "", "\n" }, /* HTML_H7 */
90: { "", "" }, /* HTML_HEAD */
91: { "", "" }, /* HTML_HR */
92: { "", "" }, /* HTML_HTML */
2.3 duns 93: { "", "" }, /* HTML_HTMLPLUS */
2.1 frystyk 94: { "{\\it ", "}" }, /* HTML_I */
95: { "", "" }, /* HTML_IMAGE */
96: { "", "" }, /* HTML_IMG */
97: { "", "" }, /* HTML_INPUT */
98: { "", "" }, /* HTML_ISINDEX */
99: { "{\\tt ", "}" }, /* HTML_KBD */
100: { "", "" }, /* HTML_L */
101: { "\n\\item ", "" }, /* HTML_LI */
102: { "", "" }, /* HTML_LINK */
103: { "", "" }, /* HTML_LISTING */
104: { "", "" }, /* HTML_LIT */
105: { "", "" }, /* HTML_MARGIN */
106: { "", "" }, /* HTML_MATH */
107: { "", "" }, /* HTML_MENU */
108: { "", "" }, /* HTML_NEXTID */
109: { "", "" }, /* HTML_NOTE */
110: { "\n\\begin{enumerate}\n","\n\\end{enumerate}\n"}, /* HTML_OL */
111: { "", "" }, /* HTML_OPTION */
112: { "", "" }, /* HTML_OVER */
113: { "\n\n", "" }, /* HTML_P */
114: { "", "" }, /* HTML_PERSON */
115: { "", "" }, /* HTML_PLAINTEXT */
116: { "\n\\begin{verbatim}"," \\end{verbatim}\n"}, /* HTML_PRE */
117: { "", "" }, /* HTML_Q */
118: { "\\begin{quote}", "\\end{quote}"}, /* HTML_QUOTE */
119: { "", "" }, /* HTML_RENDER */
120: { "", "" }, /* HTML_REMOVED */
121: { "", "" }, /* HTML_S */
122: { "", "" }, /* HTML_SAMP */
123: { "", "" }, /* HTML_SELECT */
124: { "{\\bf ", "}" }, /* HTML_STRONG */
125: { "", "" }, /* HTML_SUB */
126: { "", "" }, /* HTML_SUP */
127: { "", "" }, /* HTML_TAB */
128: { "", "" }, /* HTML_TABLE */
129: { "", "" }, /* HTML_TD */
130: { "", "" }, /* HTML_TEXTAREA */
131: { "", "" }, /* HTML_TH */
132: { "\n\\title{", "}\n\\author{}\n\\maketitle\n"}, /* HTML_TITLE */
133: { "", "" }, /* HTML_TR */
134: { "", "" }, /* HTML_TT */
135: { "", "" }, /* HTML_U */
136: { "\n\\begin{itemize}","\n\\end{itemize}\n"}, /* HTML_UL */
137: { "", "" }, /* HTML_VAR */
138: { "{\\sf ", "}" } /* HTML_XMP */
139: };
140:
141: PRIVATE char *TeX_entities[] = {
142: "\\AE ", /*"AElig", capital AE diphthong (ligature) */
2.2 frystyk 143: "\\\'{A}", /*"Aacute", capital A, acute accent */
144: "\\^{A}", /*"Acirc", capital A, circumflex accent */
145: "\\`{A}", /*"Agrave", capital A, grave accent */
146: "\\AA", /*"Aring", capital A, ring */
147: "\\~{A}", /*"Atilde", capital A, tilde */
148: "\\\"{A}", /*"Auml", capital A, dieresis or umlaut mark */
149: "\\c{C}", /*"Ccedil", capital C, cedilla */
150: "\\OE ", /*"ETH", capital Eth, Icelandic */
151: "\\\'{E}", /*"Eacute", capital E, acute accent */
152: "\\^{E}", /*"Ecirc", capital E, circumflex accent */
153: "\\`{E}", /*"Egrave", capital E, grave accent */
154: "\\\"{E}", /*"Euml", capital E, dieresis or umlaut mark */
155: "\\\'{I}", /*"Iacute", capital I, acute accent */
156: "\\^{I}", /*"Icirc", capital I, circumflex accent */
157: "\\`{I}", /*"Igrave", capital I, grave accent */
158: "\\\"{I}", /*"Iuml", capital I, dieresis or umlaut mark */
159: "\\~{N}", /*"Ntilde", capital N, tilde */
160: "\\\'{O}", /*"Oacute", capital O, acute accent */
161: "\\^{O}", /*"Ocirc", capital O, circumflex accent */
162: "\\`{O}", /*"Ograve", capital O, grave accent */
2.1 frystyk 163: "\\O ", /*"Oslash", capital O, slash */
2.2 frystyk 164: "\\~{O}", /*"Otilde", capital O, tilde */
165: "\\\"{O}", /*"Ouml", capital O, dieresis or umlaut mark */
166: " ", /*"THORN", capital THORN, Icelandic */
167: "\\\'{U}", /*"Uacute", capital U, acute accent */
168: "\\^{U}", /*"Ucirc", capital U, circumflex accent */
169: "\\`{U}", /*"Ugrave", capital U, grave accent */
170: "\\\"{U}", /*"Uuml", capital U, dieresis or umlaut mark */
171: "\\\'{Y}", /*"Yacute", capital Y, acute accent */
172: "\\\'{a}", /*"aacute", small a, acute accent */
173: "\\^{a}", /*"acirc", small a, circumflex accent */
2.1 frystyk 174: "\\ae ", /*"aelig", small ae diphthong (ligature) */
2.2 frystyk 175: "\\`{a}", /*"agrave", small a, grave accent */
2.1 frystyk 176: "&", /*"amp", ampersand */
177: "\\aa ", /*"aring", small a, ring */
2.2 frystyk 178: "\\~{a}", /*"atilde", small a, tilde */
179: "\\\"{a}", /*"auml", small a, dieresis or umlaut mark */
180: "\\c{c}", /*"ccedil", small c, cedilla */
181: "\\\'{e}", /*"eacute", small e, acute accent */
182: "\\^{c}", /*"ecirc", small e, circumflex accent */
183: "\\`{c}", /*"egrave", small e, grave accent */
184: "\\oe ", /*"eth", small eth, Icelandic */
185: "\\\"{e}", /*"euml", small e, dieresis or umlaut mark */
2.1 frystyk 186: ">", /*"gt", greater than */
2.2 frystyk 187: "\\\'{\\i}", /*"iacute", small i, acute accent */
188: "\\^{\\i}", /*"icirc", small i, circumflex accent */
189: "\\`{\\i}", /*"igrave", small i, grave accent */
190: "\\\"{\\i}", /*"iuml", small i, dieresis or umlaut mark */
2.1 frystyk 191: "<", /*"lt", less than */
2.2 frystyk 192: "\\~{n}", /*"ntilde", small n, tilde */
193: "\\\'{o}", /*"oacute", small o, acute accent */
194: "\\~{o}", /*"ocirc", small o, circumflex accent */
195: "\\`{o}", /*"ograve", small o, grave accent */
2.1 frystyk 196: "\\o ", /*"oslash", small o, slash */
2.2 frystyk 197: "\\~{o}", /*"otilde", small o, tilde */
198: "\\\"{o}", /*"ouml", small o, dieresis or umlaut mark */
2.1 frystyk 199: "\\ss ", /*"szlig", small sharp s, German (sz ligature)*/
2.2 frystyk 200: " ", /*"thorn", small thorn, Icelandic */
201: "\\\'{u}", /*"uacute", small u, acute accent */
202: "\\^{u}", /*"ucirc", small u, circumflex accent */
203: "\\`{u}", /*"ugrave", small u, grave accent */
204: "\\\"{u}", /*"uuml", small u, dieresis or umlaut mark */
205: "\\\'{y}", /*"yacute", small y, acute accent */
206: "\\\"{y}" /*"yuml", small y, dieresis or umlaut mark */
2.1 frystyk 207: };
208:
209:
210: /* Flush Buffer
211: ** ------------
212: */
213: PRIVATE void HTTeXGen_flush ARGS1(HTStructured *, me)
214: {
215: (*me->targetClass.put_block)(me->target,
216: me->buffer,
217: me->write_pointer - me->buffer);
218: me->write_pointer = me->buffer;
219: me->line_break = me->buffer;
220: }
221:
222:
223: /* Character handling
224: ** ------------------
225: **
226: */
227: PRIVATE void HTTeXGen_put_character ARGS2(HTStructured *, me, char, c)
228: {
229: if (!me->startup) /* To skip MIME header */
230: return;
231: if (c=='\n') {
232: if (me->markup || me->preformatted) { /* Put out as is and flush */
233: *me->write_pointer++ = c;
234: HTTeXGen_flush(me);
235: return;
236: } else if (me->sensitive || *(me->write_pointer-1)==' ') {
237: return;
238: } else
239: *me->write_pointer++ = ' '; /* Try to pretty print */
240: } else if (me->markup || me->preformatted) {
241: *me->write_pointer++ = c;
242: } else if (c==' ' || c=='\t') { /* Skip space and tabs */
243: if (*(me->write_pointer-1) != ' ')
244: *me->write_pointer++ = ' ';
245: else
246: return;
247: } else {
248: if (c=='$' || c=='&' || c=='%' || c=='#' || /* Special chars */
249: c=='{' || c=='}' || c=='_') {
250: *me->write_pointer++ = '\\';
251: *me->write_pointer++ = c;
252: } else if (c=='\\') { /* Special names */
253: char *temp = "$\\backslash$";
254: strcpy(me->write_pointer, temp);
255: me->write_pointer += strlen(temp);
256: } else if (c=='^') {
257: char *temp = "$\\hat{ }$";
258: strcpy(me->write_pointer, temp);
259: me->write_pointer += strlen(temp);
260: } else if (c=='~') {
261: char *temp = "$\\tilde{ }$";
262: strcpy(me->write_pointer, temp);
263: me->write_pointer += strlen(temp);
264: } else if (c=='|' || c=='<' || c=='>') { /* Math mode */
265: *me->write_pointer++ = '$';
266: *me->write_pointer++ = c;
267: *me->write_pointer++ = '$';
268: } else
269: *me->write_pointer++ = c; /* Char seems normal */
270: }
271:
2.6 ! frystyk 272: if (c==' ') /* Find delimiter */
2.1 frystyk 273: me->line_break = me->write_pointer;
274: else if (strchr(WORD_DELIMITERS, c))
275: me->line_break = me->write_pointer-1;
276:
277: /* Flush buffer out when full */
278: if (me->write_pointer >= me->buffer+BUFFER_SIZE-3) {
279: if (me->markup || me->preformatted) {
280: *me->write_pointer = '\n';
281: (*me->targetClass.put_block)(me->target,
282: me->buffer,
283: me->write_pointer-me->buffer+1);
284: me->write_pointer = me->buffer;
285: } else { /* Use break-point */
286: char line_break_char = *me->line_break;
287: char *saved = me->line_break;
288: *me->line_break = '\n';
289: (*me->targetClass.put_block)(me->target,
290: me->buffer,
291: me->line_break-me->buffer+1);
292: *me->line_break = line_break_char;
293: { /* move next line in */
294: char *p = saved;
295: char *q;
296: for(q=me->buffer; p<me->write_pointer; )
297: *q++ = *p++;
298: }
299: me->write_pointer = me->buffer + (me->write_pointer-saved);
300: }
301: me->line_break = me->buffer;
302: }
303: }
304:
305:
306:
307: /* String handling
308: ** ---------------
309: */
310: PRIVATE void HTTeXGen_put_string ARGS2(HTStructured *, me, CONST char*, s)
311: {
312: CONST char * p;
313: for (p=s; *p; p++)
314: HTTeXGen_put_character(me, *p);
315: }
316:
317:
318: PRIVATE void HTTeXGen_write ARGS3(HTStructured *, me, CONST char*, s, int, l)
319: {
320: CONST char * p;
321: for(p=s; p<s+l; p++)
322: HTTeXGen_put_character(me, *p);
323: }
324:
325:
326: /* Start Element
327: ** -------------
328: **
329: ** No attributes are put to the output Henrik 07/03-94
330: ** Does no assumptions of WHAT element is started...
331: */
332: PRIVATE void HTTeXGen_start_element ARGS4(
333: HTStructured *, me,
334: int, element_number,
335: CONST BOOL*, present,
336: CONST char **, value)
337: {
338: me->startup = YES; /* Now, let's get down to it */
2.6 ! frystyk 339: if (me->preformatted == YES) { /* Don't start markup in here */
! 340: if (TRACE)
! 341: fprintf(stderr, "LaTeX....... No Markup in verbatim mode\n");
2.1 frystyk 342: return;
2.6 ! frystyk 343: }
2.1 frystyk 344: if (element_number == HTML_PRE)
345: me->preformatted = YES;
346: if (element_number == HTML_CITE || /* No \n here, please! */
347: element_number == HTML_DT ||
348: element_number == HTML_H1 ||
349: element_number == HTML_H2 ||
350: element_number == HTML_H3 ||
351: element_number == HTML_H4 ||
352: element_number == HTML_H5 ||
353: element_number == HTML_H6 ||
354: element_number == HTML_H7 ||
355: element_number == HTML_TITLE)
356: me->sensitive = YES;
357: else if (element_number == HTML_DD) /* Only way to turn <DT> off */
358: me->sensitive = NO;
359: me->markup = element_number == HTML_A ? NO : YES;
360: HTTeXGen_put_string(me, *TeX_names[element_number]);
361: me->markup = NO;
362: }
363:
364:
365: /* End Element
366: ** -----------
367: **
368: ** Ends an markup element Henrik 07/03-94
369: ** Does no assumptions of WHAT element is ended...
370: */
371: PRIVATE void HTTeXGen_end_element ARGS2(HTStructured *, me,
372: int , element_number)
373: {
2.6 ! frystyk 374: if (me->preformatted && element_number != HTML_PRE) {
! 375: if (TRACE)
! 376: fprintf(stderr, "LaTeX....... No markup in verbatim mode\n");
! 377: return;
! 378: }
2.1 frystyk 379: me->preformatted = NO;
380: me->markup = YES;
381: HTTeXGen_put_string(me, *(TeX_names[element_number]+1));
382: me->markup = NO;
383: if (element_number == HTML_CITE ||
384: element_number == HTML_DL ||
385: element_number == HTML_H1 ||
386: element_number == HTML_H2 ||
387: element_number == HTML_H3 ||
388: element_number == HTML_H4 ||
389: element_number == HTML_H5 ||
390: element_number == HTML_H6 ||
391: element_number == HTML_H7 ||
392: element_number == HTML_TITLE)
393: me->sensitive = NO;
394: }
395:
396:
397: /* Expanding entities
398: ** ------------------
399: **
400: */
401: PRIVATE void HTTeXGen_put_entity ARGS2(HTStructured *, me, int, entity_number)
402: {
403: BOOL mark = me->markup;
404: if (*TeX_entities[entity_number] != '&' && /* Theese are converted later */
405: *TeX_entities[entity_number] != '<' &&
406: *TeX_entities[entity_number] != '>')
407: me->markup = YES;
408: HTTeXGen_put_string(me, TeX_entities[entity_number]);
409: me->markup = mark;
410: }
411:
412:
413:
414: /* Free an HTML object
415: ** -------------------
416: **
417: */
418: PRIVATE void HTTeXGen_free ARGS1(HTStructured *, me)
419: {
420: HTTeXGen_flush(me);
421: (*me->targetClass.put_string)(me->target, "\n\\end{document}\n");
422: HTTeXGen_flush(me);
423: (*me->targetClass.free)(me->target); /* ripple through */
424: free(me);
425: }
426:
427:
428: PRIVATE void HTTeXGen_abort ARGS2(HTStructured *, me, HTError, e)
429: {
430: HTTeXGen_free(me);
431: }
432:
433:
434: /* Structured Object Class
435: ** -----------------------
436: */
437: PRIVATE CONST HTStructuredClass HTTeXGeneration = /* As opposed to print etc */
438: {
439: "HTMLToTeX",
440: HTTeXGen_free,
441: HTTeXGen_abort,
442: HTTeXGen_put_character, HTTeXGen_put_string, HTTeXGen_write,
443: HTTeXGen_start_element, HTTeXGen_end_element,
444: HTTeXGen_put_entity
445: };
446:
447:
448: /* HTConverter from HTML to TeX Stream
449: ** ------------------------------------------
450: **
451: */
452: PUBLIC HTStream* HTMLToTeX ARGS5(
453: HTRequest *, request,
454: void *, param,
455: HTFormat, input_format,
456: HTFormat, output_format,
457: HTStream *, output_stream)
458: {
459: HTStructured* me = (HTStructured*) calloc(1, sizeof(*me));
460: if (me == NULL) outofmem(__FILE__, "HTMLToTeX");
461:
462: me->isa = (HTStructuredClass*) &HTTeXGeneration;
463: me->dtd = &HTMLP_dtd;
464: me->target = output_stream;
465: me->targetClass = *me->target->isa;/* Copy pointers to routines for speed*/
466: me->write_pointer = me->buffer;
467: me->line_break = me->buffer;
468: (*me->targetClass.put_string)(me->target,
469: "\\documentstyle[11pt]{report}\n\\begin{document}\n");
470: return SGML_new(&HTMLP_dtd, me);
471: }
472:
473:
474: /* END OF FILE HTTeXGen.c */
475:
476:
477:
478:
479:
480:
481:
482:
Webmaster