Annotation of libwww/Library/src/HTFormat.html, revision 2.30

2.10      timbl       1: <HTML>
                      2: <HEAD>
2.1       timbl       3: <TITLE>HTFormat: The format manager in the WWW Library</TITLE>
2.29      howcome     4: <!-- Changed by: , 12-Oct-1994 -->
2.15      timbl       5: <NEXTID N="z18">
2.10      timbl       6: </HEAD>
2.1       timbl       7: <BODY>
                      8: <H1>Manage different document formats</H1>Here we describe the functions of
2.27      frystyk     9: the HTFormat module which handles conversion between different data
                     10: representations.  (In MIME parlance, a representation is known as a content-
                     11: type. In WWW  the term "format" is often used as it is shorter).<P>
                     12: This module is implemented by <A NAME="z0" HREF="HTFormat.c">HTFormat.c</A>.
                     13: 
                     14: The module is a part of the <A NAME="z10" HREF="Overview.html">WWW library</A>.
                     15: 
2.1       timbl      16: <H2>Preamble</H2>
                     17: <PRE>#ifndef HTFORMAT_H
                     18: #define HTFORMAT_H
                     19: 
                     20: #include "HTUtils.h"
                     21: #include <A
2.10      timbl      22: NAME="z7" HREF="HTStream.html">"HTStream.h"</A>
2.1       timbl      23: #include "HTAtom.h"
2.2       timbl      24: #include "HTList.h"
2.1       timbl      25: 
                     26: #ifdef SHORT_NAMES
                     27: #define HTOutputSource HTOuSour
                     28: #define HTOutputBinary HTOuBina
                     29: #endif
                     30: 
2.18      luotonen   31: 
                     32: typedef struct _HTContentDescription {
                     33:     char *     filename;
                     34:     HTAtom *   content_type;
                     35:     HTAtom *   content_language;
                     36:     HTAtom *   content_encoding;
                     37:     int                content_length;
                     38:     float      quality;
                     39: } HTContentDescription;
                     40: 
                     41: PUBLIC void HTAcceptEncoding PARAMS((HTList *  list,
                     42:                                     char *     enc,
                     43:                                     float      quality));
                     44: 
                     45: PUBLIC void HTAcceptLanguage PARAMS((HTList *  list,
                     46:                                     char *     lang,
                     47:                                     float      quality));
                     48: 
                     49: PUBLIC BOOL HTRank PARAMS((HTList * possibilities,
                     50:                           HTList * accepted_content_types,
                     51:                           HTList * accepted_content_languages,
                     52:                           HTList * accepted_content_encodings));
                     53: 
                     54: 
2.1       timbl      55: </PRE>
2.17      luotonen   56: <H2>HT<A
                     57: NAME="z15"> Input Socket: Buffering for network
                     58: in</A></H2>These routines provide simple character
                     59: input from sockets. These are used
                     60: for parsing input in arbitrary IP
                     61: protocols (Gopher, NNTP, FTP).
                     62: <PRE>#define INPUT_BUFFER_SIZE 4096            /* Tradeoff speed vs memory*/
                     63: typedef struct _socket_buffer {
2.25      luotonen   64:        char input_buffer[INPUT_BUFFER_SIZE];
2.17      luotonen   65:        char * input_pointer;
                     66:        char * input_limit;
                     67:        int input_file_number;
2.25      luotonen   68:        BOOL    s_do_buffering;
                     69:        char *  s_buffer;
                     70:        int     s_buffer_size;
                     71:        char *  s_buffer_cur;
2.17      luotonen   72: } HTInputSocket;
                     73: 
                     74: </PRE>
                     75: <H3>Create input buffer and set file
                     76: number</H3>
                     77: <PRE>extern HTInputSocket* HTInputSocket_new PARAMS((int file_number));
                     78: 
                     79: </PRE>
                     80: <H3>Get next character from buffer</H3>
2.24      frystyk    81: <PRE>extern int HTInputSocket_getCharacter PARAMS((HTInputSocket* isoc));
2.17      luotonen   82: 
                     83: </PRE>
2.22      timbl      84: <H3>Read block from input socket</H3>Read *len characters and return a
                     85: buffer (don't free) containing *len
                     86: characters ( *len may have changed).
                     87: Buffer is not NULL-terminated.
2.17      luotonen   88: <PRE>extern char * HTInputSocket_getBlock PARAMS((HTInputSocket * isoc,
                     89:                                                  int *           len));
                     90: 
                     91: </PRE>
                     92: <H3>Free input socket buffer</H3>
                     93: <PRE>extern void HTInputSocket_free PARAMS((HTInputSocket * isoc));
                     94: 
2.22      timbl      95: 
2.17      luotonen   96: PUBLIC char * HTInputSocket_getLine PARAMS((HTInputSocket * isoc));
                     97: PUBLIC char * HTInputSocket_getUnfoldedLine PARAMS((HTInputSocket * isoc));
                     98: PUBLIC char * HTInputSocket_getStatusLine PARAMS((HTInputSocket * isoc));
                     99: PUBLIC BOOL   HTInputSocket_seemsBinary PARAMS((HTInputSocket * isoc));
                    100: 
                    101: </PRE>
2.25      luotonen  102: 
                    103: 
                    104: <H3>Security Buffering</H3>
                    105: 
                    106: When it's necessary to get e.g. the header section, or part of it,
                    107: exactly as it came from the client to calculate the message digest,
                    108: these functions turn buffering on and off.  All the material returned
                    109: by <CODE>HTInputSocket_getStatusLine()</CODE>,
                    110: <CODE>HTInputSocket_getUnfoldedLine()</CODE> and
                    111: <CODE>HTInputSocket_getLine()</CODE> gets buffered after a call to
                    112: <CODE>HTInputSocket_startBuffering()</CODE> until either
                    113: <CODE>HTInputSocket_stopBuffering()</CODE> is called, or an empty line is
                    114: returned by any of these functions (end of body section).
                    115: <CODE>HTInputSocket_getBuffer()</CODE> returns the number of
                    116: characters buffered, and sets the given buffer pointer to point to
                    117: the internal buffer.  This buffer exists until the <CODE>HTInputSocket</CODE>
                    118: object is freed.
                    119: <PRE>
                    120: 
                    121: PUBLIC void HTInputSocket_startBuffering PARAMS((HTInputSocket * isoc));
                    122: PUBLIC void HTInputSocket_stopBuffering PARAMS((HTInputSocket * isoc));
                    123: PUBLIC int HTInputSocket_getBuffer PARAMS((HTInputSocket * isoc,
                    124:                                           char ** buffer_ptr));
                    125: </PRE>
                    126: 
2.1       timbl     127: <H2>The HTFormat type</H2>We use the HTAtom object for holding
                    128: representations. This allows faster
                    129: manipulation (comparison and copying)
2.14      timbl     130: than if we stayed with strings.<P>
                    131: The following have to be defined
                    132: in advance of the other include files
                    133: because of circular references.
2.1       timbl     134: <PRE>typedef HTAtom * HTFormat;
2.13      timbl     135: 
2.14      timbl     136: #include <A
                    137: NAME="z14" HREF="HTAccess.html">"HTAccess.h"</A>   /* Required for HTRequest definition */
                    138:                
2.28      frystyk   139: </PRE>
                    140: 
                    141: These macros (which used to be constants) define some basic internally
                    142: referenced representations.
                    143: 
                    144: <H3>Internal ones</H3>
                    145: 
                    146: The www/xxx ones are of course not MIME standard.<P>
                    147: 
                    148: star/star is an output format which leaves the input untouched. It is
                    149: useful for diagnostics, and for users who want to see the original,
                    150: whatever it is.
                    151: 
                    152: <PRE>
                    153: #define WWW_SOURCE     HTAtom_for("*/*")      /* Whatever it was originally */
                    154: </PRE>
                    155: 
                    156: www/present represents the user's perception of the document.  If you
                    157: convert to www/present, you present the material to the user.
2.10      timbl     158: 
2.28      frystyk   159: <PRE>
                    160: #define WWW_PRESENT    HTAtom_for("www/present")   /* The user's perception */
                    161: </PRE>
                    162: 
                    163: The www/mime format means a MIME message or a plain text message
                    164: with no MIME header. This is what is returned by an HTTP server.
                    165: 
                    166: <PRE>
                    167: #define WWW_MIME       HTAtom_for("www/mime")             /* A MIME message */
                    168: </PRE>
                    169: 
                    170: www/print is like www/present except it represents a printed copy.
                    171: 
                    172: <PRE>
                    173: #define WWW_PRINT      HTAtom_for("www/print")            /* A printed copy */
                    174: </PRE>
2.13      timbl     175: 
2.28      frystyk   176: www/unknown is a really unknown type.  Some default action is
                    177: appropriate.
2.13      timbl     178: 
2.28      frystyk   179: <PRE>
                    180: #define WWW_UNKNOWN     HTAtom_for("www/unknown")
2.13      timbl     181: </PRE>
2.28      frystyk   182: 
                    183: 
                    184: <H3>MIME ones (a few)</H3>
                    185: 
                    186: These are regular MIME types.  HTML is assumed to be added by the W3
                    187: code. application/octet-stream was mistakenly application/binary in
2.11      timbl     188: earlier libwww versions (pre 2.11).
2.28      frystyk   189: 
                    190: <PRE>
                    191: #define WWW_PLAINTEXT  HTAtom_for("text/plain")
2.1       timbl     192: #define WWW_POSTSCRIPT         HTAtom_for("application/postscript")
                    193: #define WWW_RICHTEXT   HTAtom_for("application/rtf")
2.10      timbl     194: #define WWW_AUDIO       HTAtom_for("audio/basic")
2.1       timbl     195: #define WWW_HTML       HTAtom_for("text/html")
2.11      timbl     196: #define WWW_BINARY     HTAtom_for("application/octet-stream")
2.26      frystyk   197: #define WWW_VIDEO      HTAtom_for("video/mpeg")
2.28      frystyk   198: </PRE>
                    199: 
                    200: Extra types used in the library
                    201: 
                    202: <PRE>
                    203: #define WWW_NEWSLIST   HTAtom_for("text/newslist")
                    204: </PRE>
2.7       timbl     205: 
2.28      frystyk   206: We must include the following file after defining HTFormat, to which
2.10      timbl     207: it makes reference.
2.28      frystyk   208: 
2.10      timbl     209: <H2>The HTEncoding type</H2>
                    210: <PRE>typedef HTAtom* HTEncoding;
                    211: 
                    212: </PRE>The following are values for the
                    213: MIME content transfer encodings:
                    214: <PRE>#define WWW_ENC_7BIT              HTAtom_for("7bit")
                    215: #define WWW_ENC_8BIT           HTAtom_for("8bit")
                    216: #define WWW_ENC_BINARY         HTAtom_for("binary")
                    217: 
                    218: </PRE>We also add
                    219: <PRE>#define WWW_ENC_COMPRESS  HTAtom_for("compress")
                    220: 
                    221: #include "HTAnchor.h"
2.1       timbl     222: 
                    223: </PRE>
                    224: 
2.28      frystyk   225: <H2>The HTPresentation and HTConverter types</H2>
                    226: 
                    227: This HTPresentation structure represents a possible conversion
                    228: algorithm from one format to another.  It includes a pointer to a
                    229: conversion routine.  The conversion routine returns a stream to which
                    230: data should be fed.  See also <A NAME="z5"
                    231: HREF="#z3">HTStreamStack</A> which scans the list of registered
                    232: converters and calls one. See the <A NAME="z6"
                    233: HREF="HTInit.html">initialisation module</A> for a list of conversion
                    234: routines.
                    235: 
                    236: <PRE>
                    237: typedef struct _HTPresentation HTPresentation;
                    238: 
                    239: typedef HTStream * <A NAME="z12">HTConverter</A> PARAMS((
2.13      timbl     240:        HTRequest *             request,
                    241:        void *                  param,
                    242:        HTFormat                input_format,
                    243:        HTFormat                output_format,
                    244:        HTStream *              output_stream));
2.1       timbl     245:        
                    246: struct _HTPresentation {
2.13      timbl     247:        HTAtom* rep;            /* representation name atomized */
2.1       timbl     248:        HTAtom* rep_out;        /* resulting representation */
2.2       timbl     249:        HTConverter *converter; /* The routine to gen the stream stack */
2.1       timbl     250:        char *  command;        /* MIME-format string */
2.29      howcome   251:        char *  test_command;   /* MIME-format string */
2.1       timbl     252:        float   quality;        /* Between 0 (bad) and 1 (good) */
                    253:        float   secs;
                    254:        float   secs_per_byte;
                    255: };
2.28      frystyk   256: </PRE>
                    257: 
                    258: A global list of converters is kept by this module.  It is also
                    259: scanned by modules which want to know the set of formats supported,
                    260: for example.  Note there is also an additional list associated with
                    261: each <A NAME="z16" HREF="HTAccess.html#z5">request</A>.
2.1       timbl     262: 
2.15      timbl     263: <PRE>extern HTList * <A
                    264: NAME="z17">HTConversions</A> ;
2.1       timbl     265: 
2.12      timbl     266: 
2.1       timbl     267: </PRE>
                    268: <H2>HTSetPresentation: Register a system
                    269: command to present a format</H2>
2.8       timbl     270: <H3>On entry,</H3>
2.1       timbl     271: <DL>
                    272: <DT>rep
                    273: <DD> is the MIME - style format name
                    274: <DT>command
                    275: <DD> is the MAILCAP - style command
                    276: template
                    277: <DT>quality
                    278: <DD> A degradation factor 0..1
                    279: <DT>maxbytes
                    280: <DD> A limit on the length acceptable
                    281: as input (0 infinite)
                    282: <DT>maxsecs
                    283: <DD> A limit on the time user
                    284: will wait (0 for infinity)
                    285: </DL>
                    286: 
                    287: <PRE>extern void HTSetPresentation PARAMS((
2.13      timbl     288:        HTList *        conversions,
                    289:        CONST char *    representation,
                    290:        CONST char *    command,
2.30    ! frystyk   291:        CONST char *    test_command,
2.13      timbl     292:        float           quality,
                    293:        float           secs, 
                    294:        float           secs_per_byte
2.1       timbl     295: ));
                    296: 
                    297: 
                    298: </PRE>
                    299: <H2>HTSetConversion:   Register a conversion
                    300: routine</H2>
2.8       timbl     301: <H3>On entry,</H3>
2.1       timbl     302: <DL>
                    303: <DT>rep_in
                    304: <DD> is the content-type input
                    305: <DT>rep_out
                    306: <DD> is the resulting content-type
                    307: <DT>converter
                    308: <DD> is the routine to make
                    309: the stream to do it
                    310: </DL>
                    311: 
                    312: <PRE>
                    313: extern void HTSetConversion PARAMS((
2.13      timbl     314:        HTList *        conversions,
2.1       timbl     315:        CONST char *    rep_in,
                    316:        CONST char *    rep_out,
2.2       timbl     317:        HTConverter *   converter,
2.1       timbl     318:        float           quality,
                    319:        float           secs, 
                    320:        float           secs_per_byte
                    321: ));
                    322: 
                    323: 
                    324: </PRE>
                    325: <H2><A
2.10      timbl     326: NAME="z3">HTStreamStack:   Create a stack of
2.1       timbl     327: streams</A></H2>This is the routine which actually
                    328: sets up the conversion. It currently
                    329: checks only for direct conversions,
2.8       timbl     330: but multi-stage conversions are foreseen.
2.2       timbl     331: It takes a stream into which the
2.1       timbl     332: output should be sent in the final
                    333: format, builds the conversion stack,
                    334: and returns a stream into which the
                    335: data in the input format should be
                    336: fed.  The anchor is passed because
                    337: hypertext objects load information
                    338: into the anchor object which represents
2.23      luotonen  339: them. <P>
                    340: If <CODE>guess</CODE> is true and input format is
                    341: <CODE>www/unknown</CODE>, try to guess the format
                    342: by looking at the first few bytes of the stream. <P>
2.1       timbl     343: <PRE>extern HTStream * HTStreamStack PARAMS((
                    344:        HTFormat                format_in,
2.23      luotonen  345:        HTRequest *             request,
                    346:        BOOL                    guess));
2.1       timbl     347: 
                    348: </PRE>
                    349: <H2>HTStackValue: Find the cost of a
                    350: filter stack</H2>Must return the cost of the same
                    351: stack which HTStreamStack would set
                    352: up.
2.8       timbl     353: <H3>On entry,</H3>
2.1       timbl     354: <DL>
                    355: <DT>format_in
                    356: <DD> The format of the data to
                    357: be converted
                    358: <DT>format_out
                    359: <DD> The format required
                    360: <DT>initial_value
                    361: <DD> The intrinsic "value"
                    362: of the data before conversion on
                    363: a scale from 0 to 1
                    364: <DT>length
                    365: <DD> The number of bytes expected
                    366: in the input format
                    367: </DL>
                    368: 
                    369: <PRE>extern float HTStackValue PARAMS((
2.13      timbl     370:        HTList *                conversions,
2.1       timbl     371:        HTFormat                format_in,
2.13      timbl     372:        HTFormat                format_out,
2.1       timbl     373:        float                   initial_value,
                    374:        long int                length));
                    375: 
                    376: #define NO_VALUE_FOUND -1e20           /* returned if none found */
                    377: 
                    378: </PRE>
                    379: <H2><A
2.10      timbl     380: NAME="z1">HTCopy:  Copy a socket to a stream</A></H2>This is used by the protocol engines
2.6       secret    381: to send data down a stream, typically
2.22      timbl     382: one which has been generated by HTStreamStack.
                    383: Returns the number of bytes transferred.
2.19      luotonen  384: <PRE>extern int HTCopy PARAMS((
2.1       timbl     385:        int                     file_number,
                    386:        HTStream*               sink));
                    387: 
                    388:        
2.6       secret    389: </PRE>
                    390: <H2><A
2.10      timbl     391: NAME="c6">HTFileCopy:  Copy a file to a stream</A></H2>This is used by the protocol engines
2.6       secret    392: to send data down a stream, typically
2.7       timbl     393: one which has been generated by HTStreamStack.
                    394: It is currently called by <A
2.12      timbl     395: NAME="z9" HREF="#c7">HTParseFile</A>
2.6       secret    396: <PRE>extern void HTFileCopy PARAMS((
                    397:        FILE*                   fp,
                    398:        HTStream*               sink));
                    399: 
                    400:        
2.7       timbl     401: </PRE>
                    402: <H2><A
2.10      timbl     403: NAME="c2">HTCopyNoCR: Copy a socket to a stream,
2.7       timbl     404: stripping CR characters.</A></H2>It is slower than <A
2.12      timbl     405: NAME="z2" HREF="#z1">HTCopy</A> .
2.1       timbl     406: <PRE>
                    407: extern void HTCopyNoCR PARAMS((
                    408:        int                     file_number,
                    409:        HTStream*               sink));
                    410: 
2.16      luotonen  411: 
                    412: </PRE>
2.1       timbl     413: <H2>HTParseSocket: Parse a socket given
                    414: its format</H2>This routine is called by protocol
                    415: modules to load an object.  Uses <A
2.12      timbl     416: NAME="z4" HREF="#z3">
2.1       timbl     417: HTStreamStack</A> and the copy routines
                    418: above.  Returns HT_LOADED if successful,
                    419: &lt;0 if not.
                    420: <PRE>extern int HTParseSocket PARAMS((
                    421:        HTFormat        format_in,
                    422:        int             file_number,
2.13      timbl     423:        HTRequest *     request));
2.6       secret    424: 
                    425: </PRE>
                    426: <H2><A
2.10      timbl     427: NAME="c1">HTParseFile: Parse a File through
2.7       timbl     428: a file pointer</A></H2>This routine is called by protocol
                    429: modules to load an object. Uses <A
2.12      timbl     430: NAME="z4" HREF="#z3"> HTStreamStack</A>
2.7       timbl     431: and <A
2.12      timbl     432: NAME="c7" HREF="#c6">HTFileCopy</A> .  Returns HT_LOADED
2.7       timbl     433: if successful, &lt;0 if not.
2.6       secret    434: <PRE>extern int HTParseFile PARAMS((
                    435:        HTFormat        format_in,
                    436:        FILE            *fp,
2.13      timbl     437:        HTRequest *     request));
2.8       timbl     438: 
                    439: </PRE>
2.11      timbl     440: <H2><A
                    441: NAME="z11">HTNetToText: Convert Net ASCII to
                    442: local representation</A></H2>This is a filter stream suitable
                    443: for taking text from a socket and
                    444: passing it into a stream which expects
                    445: text in the local C representation.
                    446: It does ASCII and newline conversion.
                    447: As usual, pass its output stream
                    448: to it when creating it.
                    449: <PRE>extern HTStream *  HTNetToText PARAMS ((HTStream * sink));
                    450: 
                    451: </PRE>
2.8       timbl     452: <H2>HTFormatInit: Set up default presentations
                    453: and conversions</H2>These are defined in HTInit.c or
                    454: HTSInit.c if these have been replaced.
                    455: If you don't call this routine, and
                    456: you don't define any presentations,
                    457: then this routine will automatically
                    458: be called the first time a conversion
                    459: is needed. However, if you explicitly
                    460: add some conversions (eg using HTLoadRules)
                    461: then you may want also to explicitly
                    462: call this to get the defaults as
                    463: well.
2.20      frystyk   464: <PRE>
                    465: extern void HTFormatInit PARAMS((HTList * conversions));
2.22      timbl     466: 
2.21      frystyk   467: </PRE>
2.22      timbl     468: <H2>HTFormatInitNIM: Set up default presentations
                    469: and conversions</H2>This is a slightly different version
                    470: of HTFormatInit, but without any
                    471: conversions that might use third
                    472: party programs. This is intended
                    473: for Non Interactive Mode.
                    474: <PRE>extern void HTFormatInitNIM PARAMS((HTList * conversions));
2.21      frystyk   475: 
2.1       timbl     476: </PRE>
2.22      timbl     477: <H2>HTFormatDelete: Remove presentations
                    478: and conversions</H2>Deletes the list from HTFormatInit
                    479: or HTFormatInitNIM
                    480: <PRE>extern void HTFormatDelete PARAMS((HTList * conversions));
2.21      frystyk   481: 
                    482: </PRE>
                    483: 
2.1       timbl     484: <H2>Epilogue</H2>
                    485: <PRE>extern BOOL HTOutputSource;       /* Flag: shortcut parser */
                    486: #endif
                    487: 
2.22      timbl     488: </PRE>end</BODY>
2.10      timbl     489: </HTML>

Webmaster