Annotation of libwww/Library/src/HTFormat.html, revision 2.26

2.10      timbl       1: <HTML>
                      2: <HEAD>
2.1       timbl       3: <TITLE>HTFormat: The format manager in the WWW Library</TITLE>
2.15      timbl       4: <NEXTID N="z18">
2.10      timbl       5: </HEAD>
2.1       timbl       6: <BODY>
                      7: <H1>Manage different document formats</H1>Here we describe the functions of
                      8: the HTFormat module which handles
                      9: conversion between different data
                     10: representations.  (In MIME parlance,
                     11: a representation is known as a content-type.
2.2       timbl      12: In WWW  the term "format" is often
2.1       timbl      13: used as it is shorter).<P>
                     14: This module is implemented by <A
2.10      timbl      15: NAME="z0" HREF="HTFormat.c">HTFormat.c</A>
2.7       timbl      16: . This hypertext document is used
                     17: to generate the <A
2.11      timbl      18: NAME="z8" HREF="HTFormat.h">HTFormat.h</A> include
2.9       timbl      19: file.  Part of the <A
2.10      timbl      20: NAME="z10" HREF="Overview.html">WWW library</A> .
2.1       timbl      21: <H2>Preamble</H2>
                     22: <PRE>#ifndef HTFORMAT_H
                     23: #define HTFORMAT_H
                     24: 
                     25: #include "HTUtils.h"
                     26: #include <A
2.10      timbl      27: NAME="z7" HREF="HTStream.html">"HTStream.h"</A>
2.1       timbl      28: #include "HTAtom.h"
2.2       timbl      29: #include "HTList.h"
2.1       timbl      30: 
                     31: #ifdef SHORT_NAMES
                     32: #define HTOutputSource HTOuSour
                     33: #define HTOutputBinary HTOuBina
                     34: #endif
                     35: 
2.18      luotonen   36: 
                     37: typedef struct _HTContentDescription {
                     38:     char *     filename;
                     39:     HTAtom *   content_type;
                     40:     HTAtom *   content_language;
                     41:     HTAtom *   content_encoding;
                     42:     int                content_length;
                     43:     float      quality;
                     44: } HTContentDescription;
                     45: 
                     46: PUBLIC void HTAcceptEncoding PARAMS((HTList *  list,
                     47:                                     char *     enc,
                     48:                                     float      quality));
                     49: 
                     50: PUBLIC void HTAcceptLanguage PARAMS((HTList *  list,
                     51:                                     char *     lang,
                     52:                                     float      quality));
                     53: 
                     54: PUBLIC BOOL HTRank PARAMS((HTList * possibilities,
                     55:                           HTList * accepted_content_types,
                     56:                           HTList * accepted_content_languages,
                     57:                           HTList * accepted_content_encodings));
                     58: 
                     59: 
2.1       timbl      60: </PRE>
2.17      luotonen   61: <H2>HT<A
                     62: NAME="z15"> Input Socket: Buffering for network
                     63: in</A></H2>These routines provide simple character
                     64: input from sockets. These are used
                     65: for parsing input in arbitrary IP
                     66: protocols (Gopher, NNTP, FTP).
                     67: <PRE>#define INPUT_BUFFER_SIZE 4096            /* Tradeoff speed vs memory*/
                     68: typedef struct _socket_buffer {
2.25      luotonen   69:        char input_buffer[INPUT_BUFFER_SIZE];
2.17      luotonen   70:        char * input_pointer;
                     71:        char * input_limit;
                     72:        int input_file_number;
2.25      luotonen   73:        BOOL    s_do_buffering;
                     74:        char *  s_buffer;
                     75:        int     s_buffer_size;
                     76:        char *  s_buffer_cur;
2.17      luotonen   77: } HTInputSocket;
                     78: 
                     79: </PRE>
                     80: <H3>Create input buffer and set file
                     81: number</H3>
                     82: <PRE>extern HTInputSocket* HTInputSocket_new PARAMS((int file_number));
                     83: 
                     84: </PRE>
                     85: <H3>Get next character from buffer</H3>
2.24      frystyk    86: <PRE>extern int HTInputSocket_getCharacter PARAMS((HTInputSocket* isoc));
2.17      luotonen   87: 
                     88: </PRE>
2.22      timbl      89: <H3>Read block from input socket</H3>Read *len characters and return a
                     90: buffer (don't free) containing *len
                     91: characters ( *len may have changed).
                     92: Buffer is not NULL-terminated.
2.17      luotonen   93: <PRE>extern char * HTInputSocket_getBlock PARAMS((HTInputSocket * isoc,
                     94:                                                  int *           len));
                     95: 
                     96: </PRE>
                     97: <H3>Free input socket buffer</H3>
                     98: <PRE>extern void HTInputSocket_free PARAMS((HTInputSocket * isoc));
                     99: 
2.22      timbl     100: 
2.17      luotonen  101: PUBLIC char * HTInputSocket_getLine PARAMS((HTInputSocket * isoc));
                    102: PUBLIC char * HTInputSocket_getUnfoldedLine PARAMS((HTInputSocket * isoc));
                    103: PUBLIC char * HTInputSocket_getStatusLine PARAMS((HTInputSocket * isoc));
                    104: PUBLIC BOOL   HTInputSocket_seemsBinary PARAMS((HTInputSocket * isoc));
                    105: 
                    106: </PRE>
2.25      luotonen  107: 
                    108: 
                    109: <H3>Security Buffering</H3>
                    110: 
                    111: When it's necessary to get e.g. the header section, or part of it,
                    112: exactly as it came from the client to calculate the message digest,
                    113: these functions turn buffering on and off.  All the material returned
                    114: by <CODE>HTInputSocket_getStatusLine()</CODE>,
                    115: <CODE>HTInputSocket_getUnfoldedLine()</CODE> and
                    116: <CODE>HTInputSocket_getLine()</CODE> gets buffered after a call to
                    117: <CODE>HTInputSocket_startBuffering()</CODE> until either
                    118: <CODE>HTInputSocket_stopBuffering()</CODE> is called, or an empty line is
                    119: returned by any of these functions (end of header section).
                    120: <CODE>HTInputSocket_getBuffer()</CODE> returns the number of
                    121: characters buffered, and sets the given buffer pointer to point to
                    122: the internal buffer.  This buffer exists until the <CODE>HTInputSocket</CODE>
                    123: object is freed.
                    124: <PRE>
                    125: 
                    126: PUBLIC void HTInputSocket_startBuffering PARAMS((HTInputSocket * isoc));
                    127: PUBLIC void HTInputSocket_stopBuffering PARAMS((HTInputSocket * isoc));
                    128: PUBLIC int HTInputSocket_getBuffer PARAMS((HTInputSocket * isoc,
                    129:                                           char ** buffer_ptr));
                    130: </PRE>
                    131: 
2.1       timbl     132: <H2>The HTFormat type</H2>We use the HTAtom object for holding
                    133: representations. This allows faster
                    134: manipulation (comparison and copying)
2.14      timbl     135: than if we stayed with strings.<P>
                    136: The following have to be defined
                    137: in advance of the other include files
                    138: because of circular references.
2.1       timbl     139: <PRE>typedef HTAtom * HTFormat;
2.13      timbl     140: 
2.14      timbl     141: #include <A
                    142: NAME="z14" HREF="HTAccess.html">"HTAccess.h"</A>   /* Required for HTRequest definition */
                    143:                
2.1       timbl     144: </PRE>These macros (which used to be constants)
                    145: define some basic internally referenced
2.13      timbl     146: representations. 
                    147: <H3>Internal ones</H3>The www/xxx ones are of course not
                    148: MIME standard.<P>
2.20      frystyk   149: star/star  is an output format which
2.1       timbl     150: leaves the input untouched. It is
                    151: useful for diagnostics, and for users
                    152: who want to see the original, whatever
                    153: it is.
2.13      timbl     154: <H3></H3>
2.20      frystyk   155: <PRE>#define WWW_SOURCE HTAtom_for("*/*")      /* Whatever it was originally*/
2.1       timbl     156: 
                    157: </PRE>www/present represents the user's
                    158: perception of the document.  If you
                    159: convert to www/present, you present
                    160: the material to the user. 
                    161: <PRE>#define WWW_PRESENT HTAtom_for("www/present")     /* The user's perception */
                    162: 
                    163: </PRE>The message/rfc822 format means a
                    164: MIME message or a plain text message
                    165: with no MIME header. This is what
                    166: is returned by an HTTP server.
                    167: <PRE>#define WWW_MIME HTAtom_for("www/mime")           /* A MIME message */
2.10      timbl     168: 
2.1       timbl     169: </PRE>www/print is like www/present except
                    170: it represents a printed copy.
                    171: <PRE>#define WWW_PRINT HTAtom_for("www/print") /* A printed copy */
                    172: 
2.10      timbl     173: </PRE>www/unknown is a really unknown type.
2.11      timbl     174: Some default action is appropriate.
2.10      timbl     175: <PRE>#define WWW_UNKNOWN     HTAtom_for("www/unknown")
                    176: 
2.13      timbl     177: 
                    178: 
                    179: </PRE>
                    180: <H3>MIME ones (a few)</H3>These are regular MIME types.  HTML
2.11      timbl     181: is assumed to be added by the W3
                    182: code. application/octet-stream was
                    183: mistakenly application/binary in
                    184: earlier libwww versions (pre 2.11).
2.10      timbl     185: <PRE>#define WWW_PLAINTEXT     HTAtom_for("text/plain")
2.1       timbl     186: #define WWW_POSTSCRIPT         HTAtom_for("application/postscript")
                    187: #define WWW_RICHTEXT   HTAtom_for("application/rtf")
2.10      timbl     188: #define WWW_AUDIO       HTAtom_for("audio/basic")
2.1       timbl     189: #define WWW_HTML       HTAtom_for("text/html")
2.11      timbl     190: #define WWW_BINARY     HTAtom_for("application/octet-stream")
2.26    ! frystyk   191: #define WWW_VIDEO      HTAtom_for("video/mpeg")
2.7       timbl     192: 
2.1       timbl     193: </PRE>We must include the following file
                    194: after defining HTFormat, to which
2.10      timbl     195: it makes reference.
                    196: <H2>The HTEncoding type</H2>
                    197: <PRE>typedef HTAtom* HTEncoding;
                    198: 
                    199: </PRE>The following are values for the
                    200: MIME types:
                    201: <PRE>#define WWW_ENC_7BIT              HTAtom_for("7bit")
                    202: #define WWW_ENC_8BIT           HTAtom_for("8bit")
                    203: #define WWW_ENC_BINARY         HTAtom_for("binary")
                    204: 
                    205: </PRE>We also add
                    206: <PRE>#define WWW_ENC_COMPRESS  HTAtom_for("compress")
                    207: 
                    208: #include "HTAnchor.h"
2.1       timbl     209: 
                    210: </PRE>
                    211: <H2>The HTPresentation and HTConverter
                    212: types</H2>This HTPresentation structure represents
                    213: a possible conversion algorithm from
                    214: one format to another.  It includes
                    215: a pointer to a conversion routine.
                    216: The conversion routine returns a
                    217: stream to which data should be fed.
                    218: See also <A
2.12      timbl     219: NAME="z5" HREF="#z3">HTStreamStack</A> which scans
2.1       timbl     220: the list of registered converters
                    221: and calls one. See the <A
2.10      timbl     222: NAME="z6" HREF="HTInit.html">initialisation
2.1       timbl     223: module</A> for a list of conversion routines.
                    224: <PRE>typedef struct _HTPresentation HTPresentation;
                    225: 
2.13      timbl     226: typedef HTStream * <A
                    227: NAME="z12">HTConverter</A> PARAMS((
                    228:        HTRequest *             request,
                    229:        void *                  param,
                    230:        HTFormat                input_format,
                    231:        HTFormat                output_format,
                    232:        HTStream *              output_stream));
2.1       timbl     233:        
                    234: struct _HTPresentation {
2.13      timbl     235:        HTAtom* rep;            /* representation name atomized */
2.1       timbl     236:        HTAtom* rep_out;        /* resulting representation */
2.2       timbl     237:        HTConverter *converter; /* The routine to gen the stream stack */
2.1       timbl     238:        char *  command;        /* MIME-format string */
                    239:        float   quality;        /* Between 0 (bad) and 1 (good) */
                    240:        float   secs;
                    241:        float   secs_per_byte;
                    242: };
                    243: 
2.15      timbl     244: </PRE>A global list of converters is kept
2.1       timbl     245: by this module.  It is also scanned
                    246: by modules which want to know the
                    247: set of formats supported, for example.
2.22      timbl     248: Note there is also an additional
2.15      timbl     249: list associated with each <A
2.22      timbl     250: NAME="z16" HREF="HTAccess.html#z5">request</A>
                    251: .
2.15      timbl     252: <PRE>extern HTList * <A
                    253: NAME="z17">HTConversions</A> ;
2.1       timbl     254: 
2.12      timbl     255: 
2.1       timbl     256: </PRE>
                    257: <H2>HTSetPresentation: Register a system
                    258: command to present a format</H2>
2.8       timbl     259: <H3>On entry,</H3>
2.1       timbl     260: <DL>
                    261: <DT>rep
                    262: <DD> is the MIME - style format name
                    263: <DT>command
                    264: <DD> is the MAILCAP - style command
                    265: template
                    266: <DT>quality
                    267: <DD> A degradation factor 0..1
                    268: <DT>maxbytes
                    269: <DD> A limit on the length acceptable
                    270: as input (0 infinite)
                    271: <DT>maxsecs
                    272: <DD> A limit on the time user
                    273: will wait (0 for infinity)
                    274: </DL>
                    275: 
                    276: <PRE>extern void HTSetPresentation PARAMS((
2.13      timbl     277:        HTList *        conversions,
                    278:        CONST char *    representation,
                    279:        CONST char *    command,
                    280:        float           quality,
                    281:        float           secs, 
                    282:        float           secs_per_byte
2.1       timbl     283: ));
                    284: 
                    285: 
                    286: </PRE>
                    287: <H2>HTSetConversion:   Register a conversion
                    288: routine</H2>
2.8       timbl     289: <H3>On entry,</H3>
2.1       timbl     290: <DL>
                    291: <DT>rep_in
                    292: <DD> is the content-type input
                    293: <DT>rep_out
                    294: <DD> is the resulting content-type
                    295: <DT>converter
                    296: <DD> is the routine to make
                    297: the stream to do it
                    298: </DL>
                    299: 
                    300: <PRE>
                    301: extern void HTSetConversion PARAMS((
2.13      timbl     302:        HTList *        conversions,
2.1       timbl     303:        CONST char *    rep_in,
                    304:        CONST char *    rep_out,
2.2       timbl     305:        HTConverter *   converter,
2.1       timbl     306:        float           quality,
                    307:        float           secs, 
                    308:        float           secs_per_byte
                    309: ));
                    310: 
                    311: 
                    312: </PRE>
                    313: <H2><A
2.10      timbl     314: NAME="z3">HTStreamStack:   Create a stack of
2.1       timbl     315: streams</A></H2>This is the routine which actually
                    316: sets up the conversion. It currently
                    317: checks only for direct conversions,
2.8       timbl     318: but multi-stage conversions are foreseen.
2.2       timbl     319: It takes a stream into which the
2.1       timbl     320: output should be sent in the final
                    321: format, builds the conversion stack,
                    322: and returns a stream into which the
                    323: data in the input format should be
                    324: fed.  The anchor is passed because
                    325: hypertext objects load information
                    326: into the anchor object which represents
2.23      luotonen  327: them. <P>
                    328: If <CODE>guess</CODE> is true and input format is
                    329: <CODE>www/unknown</CODE>, try to guess the format
                    330: by looking at the first few bytes of the stream. <P>
2.1       timbl     331: <PRE>extern HTStream * HTStreamStack PARAMS((
                    332:        HTFormat                format_in,
2.23      luotonen  333:        HTRequest *             request,
                    334:        BOOL                    guess));
2.1       timbl     335: 
                    336: </PRE>
                    337: <H2>HTStackValue: Find the cost of a
                    338: filter stack</H2>Must return the cost of the same
                    339: stack which HTStreamStack would set
                    340: up.
2.8       timbl     341: <H3>On entry,</H3>
2.1       timbl     342: <DL>
                    343: <DT>format_in
                    344: <DD> The format of the data to
                    345: be converted
                    346: <DT>format_out
                    347: <DD> The format required
                    348: <DT>initial_value
                    349: <DD> The intrinsic "value"
                    350: of the data before conversion on
                    351: a scale from 0 to 1
                    352: <DT>length
                    353: <DD> The number of bytes expected
                    354: in the input format
                    355: </DL>
                    356: 
                    357: <PRE>extern float HTStackValue PARAMS((
2.13      timbl     358:        HTList *                conversions,
2.1       timbl     359:        HTFormat                format_in,
2.13      timbl     360:        HTFormat                format_out,
2.1       timbl     361:        float                   initial_value,
                    362:        long int                length));
                    363: 
                    364: #define NO_VALUE_FOUND -1e20           /* returned if none found */
                    365: 
                    366: </PRE>
                    367: <H2><A
2.10      timbl     368: NAME="z1">HTCopy:  Copy a socket to a stream</A></H2>This is used by the protocol engines
2.6       secret    369: to send data down a stream, typically
2.22      timbl     370: one which has been generated by HTStreamStack.
                    371: Returns the number of bytes transferred.
2.19      luotonen  372: <PRE>extern int HTCopy PARAMS((
2.1       timbl     373:        int                     file_number,
                    374:        HTStream*               sink));
                    375: 
                    376:        
2.6       secret    377: </PRE>
                    378: <H2><A
2.10      timbl     379: NAME="c6">HTFileCopy:  Copy a file to a stream</A></H2>This is used by the protocol engines
2.6       secret    380: to send data down a stream, typically
2.7       timbl     381: one which has been generated by HTStreamStack.
                    382: It is currently called by <A
2.12      timbl     383: NAME="z9" HREF="#c7">HTParseFile</A>
2.6       secret    384: <PRE>extern void HTFileCopy PARAMS((
                    385:        FILE*                   fp,
                    386:        HTStream*               sink));
                    387: 
                    388:        
2.7       timbl     389: </PRE>
                    390: <H2><A
2.10      timbl     391: NAME="c2">HTCopyNoCR: Copy a socket to a stream,
2.7       timbl     392: stripping CR characters.</A></H2>It is slower than <A
2.12      timbl     393: NAME="z2" HREF="#z1">HTCopy</A> .
2.1       timbl     394: <PRE>
                    395: extern void HTCopyNoCR PARAMS((
                    396:        int                     file_number,
                    397:        HTStream*               sink));
                    398: 
2.16      luotonen  399: 
                    400: </PRE>
2.1       timbl     401: <H2>HTParseSocket: Parse a socket given
                    402: its format</H2>This routine is called by protocol
                    403: modules to load an object.  It uses<A
2.12      timbl     404: NAME="z4" HREF="#z3">
2.1       timbl     405: HTStreamStack</A> and the copy routines
                    406: above.  Returns HT_LOADED if successful,
                    407: &lt;0 if not.
                    408: <PRE>extern int HTParseSocket PARAMS((
                    409:        HTFormat        format_in,
                    410:        int             file_number,
2.13      timbl     411:        HTRequest *     request));
2.6       secret    412: 
                    413: </PRE>
                    414: <H2><A
2.10      timbl     415: NAME="c1">HTParseFile: Parse a File through
2.7       timbl     416: a file pointer</A></H2>This routine is called by protocol
                    417: modules to load an object. It uses<A
2.12      timbl     418: NAME="z4" HREF="#z3"> HTStreamStack</A>
2.7       timbl     419: and <A
2.12      timbl     420: NAME="c7" HREF="#c6">HTFileCopy</A> .  Returns HT_LOADED
2.7       timbl     421: if successful, &lt;0 if not.
2.6       secret    422: <PRE>extern int HTParseFile PARAMS((
                    423:        HTFormat        format_in,
                    424:        FILE            *fp,
2.13      timbl     425:        HTRequest *     request));
2.8       timbl     426: 
                    427: </PRE>
2.11      timbl     428: <H2><A
                    429: NAME="z11">HTNetToText: Convert Net ASCII to
                    430: local representation</A></H2>This is a filter stream suitable
                    431: for taking text from a socket and
                    432: passing it into a stream which expects
                    433: text in the local C representation.
                    434: It does ASCII and newline conversion.
                    435: As usual, pass its output stream
                    436: to it when creating it.
                    437: <PRE>extern HTStream *  HTNetToText PARAMS ((HTStream * sink));
                    438: 
                    439: </PRE>
2.8       timbl     440: <H2>HTFormatInit: Set up default presentations
                    441: and conversions</H2>These are defined in HTInit.c or
                    442: HTSInit.c if these have been replaced.
                    443: If you don't call this routine, and
                    444: you don't define any presentations,
                    445: then this routine will automatically
                    446: be called the first time a conversion
                    447: is needed. However, if you explicitly
                    448: add some conversions (eg using HTLoadRules)
                    449: then you may want also to explicitly
                    450: call this to get the defaults as
                    451: well.
2.20      frystyk   452: <PRE>
                    453: extern void HTFormatInit PARAMS((HTList * conversions));
2.22      timbl     454: 
2.21      frystyk   455: </PRE>
2.22      timbl     456: <H2>HTFormatInitNIM: Set up default presentations
                    457: and conversions</H2>This is a slightly different version
                    458: of HTFormatInit, but without any
                    459: conversions that might use third
                    460: party programs. This is intended
                    461: for Non Interactive Mode.
                    462: <PRE>extern void HTFormatInitNIM PARAMS((HTList * conversions));
2.21      frystyk   463: 
2.1       timbl     464: </PRE>
2.22      timbl     465: <H2>HTFormatDelete: Remove presentations
                    466: and conversions</H2>Deletes the list from HTFormatInit
                    467: or HTFormatInitNIM
                    468: <PRE>extern void HTFormatDelete PARAMS((HTList * conversions));
2.21      frystyk   469: 
                    470: </PRE>
                    471: 
2.1       timbl     472: <H2>Epilogue</H2>
                    473: <PRE>extern BOOL HTOutputSource;       /* Flag: shortcut parser */
                    474: #endif
                    475: 
2.22      timbl     476: </PRE>end</BODY>
2.10      timbl     477: </HTML>

Webmaster