Annotation of libwww/Library/src/HTTP.c, revision 1.13

1.1       timbl       1: /*     HyperText Transfer Protocol     - Client implementation         HTTP.c
                      2: **     ==========================
1.2       timbl       3: **
                      4: ** Bugs:
                      5: **     Not implemented:
                      6: **             Forward
                      7: **             Redirection
                      8: **             Error handling
1.1       timbl       9: */
                     10: 
                     11: /*     Module parameters:
                     12: **     -----------------
                     13: **
                     14: **  These may be undefined and redefined by syspec.h
                     15: */
1.2       timbl      16: 
1.12      timbl      17: /*     MOSAIC_HACK2 is a kludge to guess the file type of a transferred
                     18: **     file from the URL.  It is STRICTLY illegal to do this!
                     19: */
                     20: 
1.2       timbl      21: /* Implements:
                     22: */
                     23: #include "HTTP.h"
                     24: 
                     25: #define HTTP_VERSION   "HTTP/1.0"      /* Appended to the GET request line when extensions are on */
                     26: #define HTTP2                          /* Version is greater than 0.9: enables sending HTTP_VERSION */
                     27: 
                     28: #define INIT_LINE_SIZE         1024    /* Initial size in bytes of the reply buffers */
                     29: #define LINE_EXTEND_THRESH     256     /* Buffers are doubled when fewer than this many bytes are free before a read */
                     30: #define VERSION_LENGTH                 20      /* Maximum length of the protocol version word parsed from the reply */
                     31: 
                     32: /* Uses:
                     33: */
1.1       timbl      34: #include "HTParse.h"
                     35: #include "HTUtils.h"
                     36: #include "tcp.h"
                     37: #include "HTTCP.h"
                     38: #include "HTFormat.h"
1.2       timbl      39: #include <ctype.h>
                     40: #include "HTAlert.h"
                     41: #include "HTMIME.h"
1.5       timbl      42: #include "HTML.h"              /* SCW */
                     43: #include "HTInit.h"            /* SCW */
1.1       timbl      44: 
1.2       timbl      45: struct _HTStream {
                     46:        HTStreamClass * isa;            /* Class pointer: the only field used here (isa->put_block, isa->free) */
                     47: };
                     48: 
                     49: 
1.6       timbl      50: extern char * HTAppName;       /* Application name, defined outside this file; sent in User-Agent ("unknown" if null) */
                     51: extern char * HTAppVersion;    /* Application version, defined outside this file; sent in User-Agent ("0.0" if null) */
                     52: 
1.1       timbl      53: /*             Load Document from HTTP Server                  HTLoadHTTP()
                     54: **             ==============================
                     55: **
                     56: **     Given a hypertext address, this routine loads a document.
                     57: **
                     58: **
                     59: ** On entry,
                     60: **     arg     is the hypertext reference of the article to be loaded.
                     61: **     gate    is nill if no gateway, else the gateway address.
                     62: **
                     63: ** On exit,
                     64: **     returns >=0     If no error, a good socket number
                     65: **             <0      Error.
                     66: **
                     67: **     The socket must be closed by the caller after the document has been
                     68: **     read.
                     69: **
                     70: */
1.2       timbl      71: PUBLIC int HTLoadHTTP ARGS4 (
                     72:        CONST char *,           arg,
                     73: /*     CONST char *,           gate, */
                     74:        HTParentAnchor *,       anAnchor,
                     75:        HTFormat,               format_out,
                     76:        HTStream*,              sink)
1.1       timbl      77: {
                     78:     int s;                             /* Socket number for returned data */
                     79:     char *command;                     /* The whole command */
1.3       timbl      80:     char * eol = 0;                    /* End of line if found */
1.7       timbl      81:     char * start_of_data;              /* Start of body of reply */
1.11      timbl      82:     int length;                                /* Number of valid bytes in buffer */
1.1       timbl      83:     int status;                                /* tcp return */
1.10      timbl      84:     char crlf[3];                      /* A CR LF equivalent string */
1.3       timbl      85:     HTStream * target = NULL;          /* Unconverted data */
                     86:     HTFormat format_in;                        /* Format arriving in the message */
                     87:     
1.2       timbl      88:     CONST char* gate = 0;              /* disable this feature */
1.1       timbl      89:     SockA soc_address;                 /* Binary network address */
                     90:     SockA * sin = &soc_address;
1.2       timbl      91:     BOOL had_header = NO;              /* Have we had at least one header? */
1.11      timbl      92:     char * text_buffer = NULL;
                     93:     char * binary_buffer = NULL;
1.2       timbl      94:     BOOL extensions = YES;             /* Assume good HTTP server */
1.1       timbl      95:     if (!arg) return -3;               /* Bad if no name sepcified     */
                     96:     if (!*arg) return -2;              /* Bad if name had zero length  */
                     97: 
                     98: /*  Set up defaults:
                     99: */
                    100: #ifdef DECNET
1.2       timbl     101:     sin->sdn_family = AF_DECnet;           /* Family = DECnet, host order */
                    102:     sin->sdn_objnum = DNP_OBJ;          /* Default: http object number */
1.1       timbl     103: #else  /* Internet */
1.2       timbl     104:     sin->sin_family = AF_INET;     /* Family = internet, host order */
                    105:     sin->sin_port = htons(TCP_PORT);    /* Default: http port    */
1.1       timbl     106: #endif
                    107: 
1.10      timbl     108:     sprintf(crlf, "%c%c", CR, LF);     /* To be corect on Mac, VM, etc */
                    109:     
1.1       timbl     110:     if (TRACE) {
                    111:         if (gate) fprintf(stderr,
                    112:                "HTTPAccess: Using gateway %s for %s\n", gate, arg);
                    113:         else fprintf(stderr, "HTTPAccess: Direct access for %s\n", arg);
                    114:     }
                    115:     
                    116: /* Get node name and optional port number:
                    117: */
                    118:     {
                    119:        char *p1 = HTParse(gate ? gate : arg, "", PARSE_HOST);
                    120:        int status = HTParseInet(sin, p1);  /* TBL 920622 */
                    121:         free(p1);
                    122:        if (status) return status;   /* No such host for example */
                    123:     }
                    124:     
1.2       timbl     125: retry:
1.1       timbl     126:    
1.10      timbl     127: /*     Now, let's get a socket set up from the server for the data:
1.1       timbl     128: */      
                    129: #ifdef DECNET
                    130:     s = socket(AF_DECnet, SOCK_STREAM, 0);
                    131: #else
                    132:     s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
                    133: #endif
                    134:     status = connect(s, (struct sockaddr*)&soc_address, sizeof(soc_address));
                    135:     if (status < 0) {
                    136:            if (TRACE) fprintf(stderr, 
                    137:              "HTTP: Unable to connect to remote host for `%s' (errno = %d).\n", arg, errno);
                    138:            /* free(command);   BUG OUT TBL 921121 */
                    139:            return HTInetStatus("connect");
                    140:       }
                    141:     
                    142:     if (TRACE) fprintf(stderr, "HTTP connected, socket %d\n", s);
                    143: 
                    144: /*     Ask that node for the document,
                    145: **     omitting the host name & anchor if not gatewayed.
                    146: */        
                    147:     if (gate) {
1.2       timbl     148:         command = malloc(4 + strlen(arg)+ 2 + 31);
1.1       timbl     149:         if (command == NULL) outofmem(__FILE__, "HTLoadHTTP");
                    150:         strcpy(command, "GET ");
                    151:        strcat(command, arg);
                    152:     } else { /* not gatewayed */
                    153:        char * p1 = HTParse(arg, "", PARSE_PATH|PARSE_PUNCTUATION);
1.2       timbl     154:         command = malloc(4 + strlen(p1)+ 2 + 31);
1.1       timbl     155:         if (command == NULL) outofmem(__FILE__, "HTLoadHTTP");
                    156:         strcpy(command, "GET ");
                    157:        strcat(command, p1);
                    158:        free(p1);
                    159:     }
1.2       timbl     160: #ifdef HTTP2
                    161:     if (extensions) {
                    162:         strcat(command, " ");
                    163:         strcat(command, HTTP_VERSION);
                    164:     }
                    165: #endif
1.10      timbl     166: 
                    167:     strcat(command, crlf);     /* CR LF, as in rfc 977 */
1.1       timbl     168: 
1.2       timbl     169:     if (extensions) {
                    170: 
                    171:        int n;
                    172:        int i;
                    173:         HTAtom * present = WWW_PRESENT;
                    174:        char line[256];    /*@@@@ */
                    175: 
                    176:        if (!HTPresentations) HTFormatInit();
                    177:        n = HTList_count(HTPresentations);
                    178: 
                    179:        for(i=0; i<n; i++) {
                    180:            HTPresentation * pres = HTList_objectAt(HTPresentations, i);
                    181:            if (pres->rep_out == present) {
                    182:              if (pres->quality != 1.0) {
1.3       timbl     183:                  sprintf(line, "Accept: %s q=%.3f%c%c",
                    184:                         HTAtom_name(pres->rep), pres->quality, CR, LF);
1.2       timbl     185:              } else {
1.3       timbl     186:                  sprintf(line, "Accept: %s%c%c",
                    187:                         HTAtom_name(pres->rep), CR, LF);
1.2       timbl     188:              }
                    189:              StrAllocCat(command, line);
                    190: 
                    191:            }
                    192:        }
1.6       timbl     193:        
                    194:        sprintf(line, "User-Agent:  %s/%s  libwww/%s%c%c",
                    195:                HTAppName ? HTAppName : "unknown",
                    196:                HTAppVersion ? HTAppVersion : "0.0",
                    197:                HTLibraryVersion, CR, LF);
                    198:              StrAllocCat(command, line);
1.2       timbl     199:     }
1.3       timbl     200:        
1.10      timbl     201:     StrAllocCat(command, crlf);        /* Blank line means "end" */
                    202: 
                    203:     if (TRACE) fprintf(stderr, "HTTP Tx: %s\n", command);
                    204: 
                    205: /*     Translate into ASCII if necessary
                    206: */
1.4       timbl     207: #ifdef NOT_ASCII
1.1       timbl     208:     {
                    209:        char * p;
                    210:        for(p = command; *p; p++) {
                    211:            *p = TOASCII(*p);
                    212:        }
1.4       timbl     213:     }
1.3       timbl     214: #endif
1.1       timbl     215: 
                    216:     status = NETWRITE(s, command, (int)strlen(command));
                    217:     free(command);
                    218:     if (status<0) {
                    219:        if (TRACE) fprintf(stderr, "HTTPAccess: Unable to send command.\n");
                    220:            return HTInetStatus("send");
                    221:     }
                    222: 
1.2       timbl     223: 
1.7       timbl     224: /*     Read the first line of the response
                    225: **     -----------------------------------
1.11      timbl     226: **
                    227: **     HTTP0 servers must return ASCII style text, though it can in
                    228: **     principle be just text without any markup at all.
                    229: **     Full HTTP servers must return a response
                    230: **     line and RFC822 style header.  The response must therefore in
                    231: **     either case have a CRLF somewhere soon.
                    232: **
                    233: **     This is the theory.  In practice, there are (1993) unfortunately
                    234: **     many binary documents just served up with HTTP0.9.  This
                    235: **     means we have to preserve the binary buffer (on the assumption that
                    236: **     conversion from ASCII may lose information) in case it turns
                    237: **     out that we want the binary original.
1.2       timbl     238: */
1.3       timbl     239: 
1.2       timbl     240:     {
                    241:     
                    242:     /* Get numeric status etc */
                    243: 
                    244:        BOOL end_of_file = NO;
                    245:        HTAtom * encoding = HTAtom_for("7bit");
                    246:        int buffer_length = INIT_LINE_SIZE;     /* Why not? */
                    247:        
1.11      timbl     248:        binary_buffer = (char *) malloc(buffer_length * sizeof(char));
                    249:        if (!binary_buffer) outofmem(__FILE__, "HTLoadHTTP");
                    250:        text_buffer = (char *) malloc(buffer_length * sizeof(char));
                    251:        if (!text_buffer) outofmem(__FILE__, "HTLoadHTTP");
                    252:        length = 0;
1.2       timbl     253:        
1.7       timbl     254:        do {    /* Loop to read in the first line */
1.2       timbl     255:            
                    256:           /* Extend line buffer if necessary for those crazy WAIS URLs ;-) */
                    257:           
                    258:            if (buffer_length - length < LINE_EXTEND_THRESH) {
                    259:                buffer_length = buffer_length + buffer_length;
1.11      timbl     260:                binary_buffer = (char *) realloc(
                    261:                        binary_buffer, buffer_length * sizeof(char));
                    262:                if (!binary_buffer) outofmem(__FILE__, "HTLoadHTTP");
                    263:                text_buffer = (char *) realloc(
                    264:                        text_buffer, buffer_length * sizeof(char));
                    265:                if (!text_buffer) outofmem(__FILE__, "HTLoadHTTP");
1.2       timbl     266:            }
1.11      timbl     267:            status = NETREAD(s, binary_buffer + length,
1.2       timbl     268:                                buffer_length - length -1);
                    269:            if (status < 0) {
                    270:                HTAlert("Unexpected network read error on response");
1.9       timbl     271:                NETCLOSE(s);
1.2       timbl     272:                return status;
                    273:            }
1.10      timbl     274: 
                    275:            if (TRACE) fprintf(stderr, "HTTP: read returned %d bytes.\n",
                    276:                 status);
                    277: 
1.2       timbl     278:            if (status == 0) {
                    279:                end_of_file = YES;
                    280:                break;
                    281:            }
1.11      timbl     282:            binary_buffer[length+status] = 0;
                    283: 
                    284: 
                    285: /*     Make an ASCII *copy* of the buffer
                    286: */
1.2       timbl     287: #ifdef NOT_ASCII
1.10      timbl     288:            if (TRACE) fprintf(stderr, "Local codes CR=%d, LF=%d\n", CR, LF);
1.11      timbl     289: #endif
1.2       timbl     290:            {
                    291:                char * p;
1.11      timbl     292:                char * q;
                    293:                for(p = binary_buffer+length, q=text_buffer+length;
                    294:                        *p; p++, q++) {
                    295:                    *q = FROMASCII(*p);
                    296:                }
                    297: 
                    298:                *q++ = 0;
                    299:            }
                    300: 
                    301: /* Kludge to trap binary responses from illegal HTTP0.9 servers.
                    302: ** First time we have enough, look at the stub in ASCII
                    303: ** and get out of here if it doesn't look right.
                    304: **
                    305: ** We also check for characters above 128 in the first few bytes, and
                    306: ** if we find them we forget the html default.
                    307: **
                    308: ** Bugs: A HTTP0.9 server returning a document starting "HTTP/"
                    309: **     will be taken as a HTTP 1.0 server.  Failure.
                    310: **     An HTTP 0.9 server returning a binary document with
                    311: **     characters < 128 will be read as ASCII.
                    312: */
                    313: #define STUB_LENGTH 20
                    314:            if (length < STUB_LENGTH && length+status >= STUB_LENGTH) {
                    315:                if(strncmp("HTTP/", text_buffer, 5)!=0) {
                    316:                    char *p;
                    317:                    start_of_data = text_buffer; /* reparse whole reply */
                    318:                    for(p=binary_buffer; p <binary_buffer+STUB_LENGTH;p++) {
1.13    ! duns      319:                        if (((int)*p)&128) {
1.11      timbl     320:                            format_in = HTAtom_for("www/unknown");
1.13    ! duns      321:                            length = length + status;
        !           322:                            goto copy; /* out of while loop */
1.11      timbl     323:                        }
                    324:                    }
1.2       timbl     325:                }
                    326:            }
1.11      timbl     327: /* end kludge */
                    328: 
                    329:            
                    330:            eol = strchr(text_buffer + length, 10);         
                    331:            if (eol) {
                    332:                *eol = 0;               /* Terminate the line */
                    333:                if (eol[-1] = CR) eol[-1] = 0;  /* Chop trailing CR */
                    334:             }
1.2       timbl     335: 
                    336:            length = length + status;
                    337: 
1.7       timbl     338:        } while (!eol && !end_of_file);         /* No LF */         
                    339:                
                    340:     } /* Scope of loop variables */
1.2       timbl     341: 
1.7       timbl     342:     
                    343: /*     We now have a terminated unfolded line. Parse it.
                    344: **     -------------------------------------------------
1.2       timbl     345: */
                    346: 
1.11      timbl     347:     if (TRACE)fprintf(stderr, "HTTP: Rx: %.70s\n", text_buffer);
1.7       timbl     348: 
                    349:     {
                    350:        int fields;
                    351:        char server_version [VERSION_LENGTH+1];
                    352:        int server_status;
                    353: 
1.2       timbl     354: 
                    355: /* Kludge to work with old buggy servers.  They can't handle the third word
                    356: ** so we try again without it.
                    357: */
1.7       timbl     358:        if (extensions &&
1.11      timbl     359:                0==strcmp(text_buffer,          /* Old buggy server? */
1.7       timbl     360:                "Document address invalid or access not authorised")) {
                    361:            extensions = NO;
1.11      timbl     362:            if (binary_buffer) free(binary_buffer);
                    363:            if (text_buffer) free(text_buffer);
1.7       timbl     364:            if (TRACE) fprintf(stderr,
                    365:                "HTTP: close socket %d to retry with HTTP0\n", s);
                    366:            NETCLOSE(s);
                    367:            goto retry;         /* @@@@@@@@@@ */
                    368:        }
1.11      timbl     369: /* end kludge */
1.2       timbl     370: 
1.11      timbl     371:        fields = sscanf(text_buffer, "%20s%d",
1.7       timbl     372:            server_version,
                    373:            &server_status);
                    374: 
1.11      timbl     375:        if (fields < 2 || 
                    376:               strncmp(server_version, "HTTP/", 5)!=0) { /* HTTP0 reply */
1.7       timbl     377:            format_in = WWW_HTML;
1.11      timbl     378:            start_of_data = text_buffer;        /* reread whole reply */
1.9       timbl     379:            if (eol) *eol = '\n';               /* Reconstitute buffer */
1.2       timbl     380:            
1.11      timbl     381:        } else {                                /* Full HTTP reply */
1.7       timbl     382:        
                    383:        /*      Decode full HTTP response */
                    384:        
1.3       timbl     385:            format_in = HTAtom_for("www/mime");
1.11      timbl     386:            start_of_data = eol ? eol + 1 : text_buffer + length;
1.3       timbl     387: 
1.2       timbl     388:            switch (server_status / 100) {
                    389:            
1.3       timbl     390:            default:            /* bad number */
                    391:                HTAlert("Unknown status reply from server!");
                    392:                break;
                    393:                
1.2       timbl     394:            case 3:             /* Various forms of redirection */
1.7       timbl     395:                HTAlert(
1.3       timbl     396:        "Redirection response from server is not handled by this client");
                    397:                break;
                    398:                
1.2       timbl     399:            case 4:             /* "I think I goofed" */
                    400:            case 5:             /* I think you goofed */
1.6       timbl     401:                {
                    402:                    char *p1 = HTParse(gate ? gate : arg, "", PARSE_HOST);
                    403:                    char * message = (char*)malloc(
1.11      timbl     404:                        strlen(text_buffer)+strlen(p1) + 100);
1.6       timbl     405:                    if (!message) outofmem(__FILE__, "HTTP 5xx status");
                    406:                    sprintf(message,
1.11      timbl     407:                    "HTTP server at %s replies:\n%s", p1, text_buffer);
1.8       timbl     408:                    status = HTLoadError(sink, server_status, message);
1.6       timbl     409:                    free(message);
                    410:                    free(p1);
                    411:                    goto clean_up;
                    412:                }
1.3       timbl     413:                break;
1.2       timbl     414:                
                    415:            case 2:             /* Good: Got MIME object */
                    416:                break;
                    417: 
1.7       timbl     418:            } /* switch on response code */
                    419:        
                    420:        }               /* Full HTTP reply */
                    421:        
                    422:     } /* scope of fields */
1.2       timbl     423: 
1.3       timbl     424: /*     Set up the stream stack to handle the body of the message
                    425: */
                    426: 
1.13    ! duns      427: copy:
        !           428: 
1.3       timbl     429:     target = HTStreamStack(format_in,
                    430:                        format_out,
                    431:                        sink , anAnchor);
                    432: 
                    433:     if (!target) {
                    434:        char buffer[1024];      /* @@@@@@@@ */
1.11      timbl     435:        if (binary_buffer) free(binary_buffer);
                    436:        if (text_buffer) free(text_buffer);
1.3       timbl     437:        sprintf(buffer, "Sorry, no known way of converting %s to %s.",
                    438:                HTAtom_name(format_in), HTAtom_name(format_out));
                    439:        fprintf(stderr, "HTTP: %s", buffer);
1.6       timbl     440:        status = HTLoadError(sink, 501, buffer);
                    441:        goto clean_up;
1.3       timbl     442:     }
                    443: 
                    444:     
1.11      timbl     445: /*     Push the data down the stream
1.3       timbl     446: **     We have to remember the end of the first buffer we just read
1.2       timbl     447: */
1.11      timbl     448:     if (format_in == WWW_HTML) {
                    449:         target = HTNetToText(target);  /* Pipe through CR stripper */
                    450:     }
                    451:     
                    452:     (*target->isa->put_block)(target,
                    453:                binary_buffer + (start_of_data - text_buffer),
                    454:                length - (start_of_data - text_buffer));
                    455:     HTCopy(s, target);
1.3       timbl     456:        
                    457:     (*target->isa->free)(target);
1.8       timbl     458:     status = HT_LOADED;
1.2       timbl     459: 
                    460: /*     Clean up
1.1       timbl     461: */
1.3       timbl     462:     
1.6       timbl     463: clean_up: 
1.11      timbl     464:     if (binary_buffer) free(binary_buffer);
                    465:     if (text_buffer) free(text_buffer);
1.3       timbl     466: 
1.1       timbl     467:     if (TRACE) fprintf(stderr, "HTTP: close socket %d.\n", s);
1.6       timbl     468:     (void) NETCLOSE(s);
1.1       timbl     469: 
1.8       timbl     470:     return status;                     /* Good return */
1.3       timbl     471: 
1.1       timbl     472: }
1.7       timbl     473: 
1.1       timbl     474: 
                    475: /*     Protocol descriptor
                    475a: **     Pairs the access scheme name "http" with the HTLoadHTTP loader.
                    475b: **     NOTE(review): the third field is 0; its meaning is set by the
                    475c: **     HTProtocol declaration, which is not visible here -- confirm.
                    476: */
                    477: 
1.13    ! duns      478: GLOBALDEF PUBLIC HTProtocol HTTP = { "http", HTLoadHTTP, 0 };

Webmaster