Annotation of libwww/Library/src/HTParse.c, revision 2.42
2.41 frystyk 1: /* HTParse.c
2: ** URI MANAGEMENT
3: **
4: ** (c) COPYRIGHT CERN 1994.
5: ** Please first read the full copyright statement in the file COPYRIGH.
2.26 frystyk 6: **
7: ** history:
8: ** May 12 94 TAB added as legal char in HTCleanTelnetString
9: **
1.1 timbl 10: */
2.31 frystyk 11:
2.38 frystyk 12: /* Platform dependent stuff */
1.1 timbl 13: #include "HTUtils.h"
2.38 frystyk 14: #include "tcp.h"
15:
16: /* Library Includes */
17: #include "HTParse.h"
2.31 frystyk 18: #include "HTTCP.h"
2.6 timbl 19:
/* The pieces a URI is cut into.  A member that is zero means "that piece
** was not present". */
struct struct_parts {
    char *access;		/* Now known as "scheme" */
    char *host;
    char *absolute;
    char *relative;
    /* there is no `search' member: it is treated as part of the path */
    char *anchor;
};
28:
29: /* Strip white space off a string
30: ** ------------------------------
31: **
32: ** On exit,
33: ** Return value points to first non-white character, or to 0 if none.
34: ** All trailing white space is OVERWRITTEN with zero.
35: */
36:
2.13 luotonen 37: PUBLIC char * HTStrip ARGS1(char *, s)
1.1 timbl 38: {
39: #define SPACE(c) ((c==' ')||(c=='\t')||(c=='\n'))
40: char * p=s;
2.13 luotonen 41: if (!s) return NULL; /* Doesn't dump core if NULL */
42: for(p=s;*p;p++); /* Find end of string */
1.1 timbl 43: for(p--;p>=s;p--) {
44: if(SPACE(*p)) *p=0; /* Zap trailing blanks */
45: else break;
46: }
47: while(SPACE(*s))s++; /* Strip leading blanks */
48: return s;
49: }
50:
51:
52: /* Scan a filename for its consituents
53: ** -----------------------------------
54: **
55: ** On entry,
56: ** name points to a document name which may be incomplete.
57: ** On exit,
58: ** absolute or relative may be nonzero (but not both).
59: ** host, anchor and access may be nonzero if they were specified.
60: ** Any which are nonzero point to zero terminated strings.
61: */
2.32 frystyk 62: PRIVATE void scan ARGS2(char *, name, struct struct_parts *, parts)
1.1 timbl 63: {
64: char * after_access;
65: char * p;
66: int length = strlen(name);
67:
68: parts->access = 0;
69: parts->host = 0;
70: parts->absolute = 0;
71: parts->relative = 0;
72: parts->anchor = 0;
73:
74: after_access = name;
75: for(p=name; *p; p++) {
76: if (*p==':') {
77: *p = 0;
2.20 timbl 78: parts->access = after_access; /* Scheme has been specified */
2.37 howcome 79:
2.42 ! howcome 80: /* after_access = p;*/
! 81: /* while (*after_access == 0)*/ /* HWL 15/10/94: weird bug on hp */
! 82: /* after_access++;*/ /* after_access = p + 1 */
! 83:
1.1 timbl 84: after_access = p+1;
2.37 howcome 85:
2.22 luotonen 86: if (0==strcasecomp("URL", parts->access)) {
2.20 timbl 87: parts->access = NULL; /* Ignore IETF's URL: pre-prefix */
88: } else break;
1.1 timbl 89: }
2.20 timbl 90: if (*p=='/') break; /* Access has not been specified */
1.1 timbl 91: if (*p=='#') break;
92: }
93:
94: for(p=name+length-1; p>=name; p--) {
95: if (*p =='#') {
96: parts->anchor=p+1;
97: *p=0; /* terminate the rest */
98: }
99: }
100: p = after_access;
101: if (*p=='/'){
102: if (p[1]=='/') {
103: parts->host = p+2; /* host has been specified */
104: *p=0; /* Terminate access */
105: p=strchr(parts->host,'/'); /* look for end of host name if any */
106: if(p) {
107: *p=0; /* Terminate host */
108: parts->absolute = p+1; /* Root has been found */
109: }
110: } else {
111: parts->absolute = p+1; /* Root found but no host */
112: }
113: } else {
114: parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
115: }
116:
2.16 timbl 117: #ifdef OLD_CODE
1.1 timbl 118: /* Access specified but no host: the anchor was not really one
2.16 timbl 119: e.g. news:j462#36487@foo.bar -- JFG 10/jul/92, from bug report */
120: /* This kludge doesn't work for example when coming across
121: file:/usr/local/www/fred#123
122: which loses its anchor. Correct approach in news is to
123: escape weird characters not allowed in URL. TBL 21/dec/93
124: */
1.1 timbl 125: if (parts->access && ! parts->host && parts->anchor) {
126: *(parts->anchor - 1) = '#'; /* Restore the '#' in the address */
127: parts->anchor = 0;
128: }
2.16 timbl 129: #endif
1.1 timbl 130:
131: #ifdef NOT_DEFINED /* search is just treated as part of path */
132: {
133: char *p = relative ? relative : absolute;
134: if (p) {
135: char * q = strchr(p, '?'); /* Any search string? */
136: if (q) {
137: *q = 0; /* If so, chop that off. */
138: parts->search = q+1;
139: }
140: }
141: }
142: #endif
143: } /*scan */
144:
145:
146: /* Parse a Name relative to another name
147: ** -------------------------------------
148: **
149: ** This returns those parts of a name which are given (and requested)
150: ** substituting bits from the related name where necessary.
151: **
152: ** On entry,
153: ** aName A filename given
2.33 howcome 154: ** relatedName A name relative to which aName is to be parsed. Give
155: ** it an empty string if aName is absolute.
1.1 timbl 156: ** wanted A mask for the bits which are wanted.
157: **
158: ** On exit,
159: ** returns A pointer to a malloc'd string which MUST BE FREED
160: */
2.32 frystyk 161: char * HTParse ARGS3(CONST char *, aName, CONST char *, relatedName,
162: int, wanted)
1.1 timbl 163: {
164: char * result = 0;
165: char * return_value = 0;
166: int len;
167: char * name = 0;
168: char * rel = 0;
169: char * p;
2.12 timbl 170: char * access;
1.1 timbl 171: struct struct_parts given, related;
2.33 howcome 172:
173: if (!relatedName) /* HWL 23/8/94: dont dump due to NULL */
174: relatedName = "";
1.1 timbl 175:
176: /* Make working copies of input strings to cut up:
177: */
178: len = strlen(aName)+strlen(relatedName)+10;
179: result=(char *)malloc(len); /* Lots of space: more than enough */
180: if (result == NULL) outofmem(__FILE__, "HTParse");
181:
182: StrAllocCopy(name, aName);
183: StrAllocCopy(rel, relatedName);
184:
185: scan(name, &given);
186: scan(rel, &related);
187: result[0]=0; /* Clear string */
2.12 timbl 188: access = given.access ? given.access : related.access;
1.1 timbl 189: if (wanted & PARSE_ACCESS)
2.12 timbl 190: if (access) {
191: strcat(result, access);
1.1 timbl 192: if(wanted & PARSE_PUNCTUATION) strcat(result, ":");
193: }
194:
195: if (given.access && related.access) /* If different, inherit nothing. */
196: if (strcmp(given.access, related.access)!=0) {
197: related.host=0;
198: related.absolute=0;
199: related.relative=0;
200: related.anchor=0;
201: }
202:
203: if (wanted & PARSE_HOST)
204: if(given.host || related.host) {
205: if(wanted & PARSE_PUNCTUATION) strcat(result, "//");
206: strcat(result, given.host ? given.host : related.host);
207: }
208:
209: if (given.host && related.host) /* If different hosts, inherit no path. */
210: if (strcmp(given.host, related.host)!=0) {
211: related.absolute=0;
212: related.relative=0;
213: related.anchor=0;
214: }
215:
216: if (wanted & PARSE_PATH) {
217: if(given.absolute) { /* All is given */
218: if(wanted & PARSE_PUNCTUATION) strcat(result, "/");
219: strcat(result, given.absolute);
220: } else if(related.absolute) { /* Adopt path not name */
221: strcat(result, "/");
222: strcat(result, related.absolute);
223: if (given.relative) {
224: p = strchr(result, '?'); /* Search part? */
225: if (!p) p=result+strlen(result)-1;
226: for (; *p!='/'; p--); /* last / */
227: p[1]=0; /* Remove filename */
228: strcat(result, given.relative); /* Add given one */
2.31 frystyk 229: result = HTSimplify (result);
1.1 timbl 230: }
231: } else if(given.relative) {
232: strcat(result, given.relative); /* what we've got */
233: } else if(related.relative) {
234: strcat(result, related.relative);
235: } else { /* No inheritance */
236: strcat(result, "/");
237: }
238: }
239:
240: if (wanted & PARSE_ANCHOR)
241: if(given.anchor || related.anchor) {
242: if(wanted & PARSE_PUNCTUATION) strcat(result, "#");
243: strcat(result, given.anchor ? given.anchor : related.anchor);
244: }
245: free(rel);
246: free(name);
247:
248: StrAllocCopy(return_value, result);
249: free(result);
250: return return_value; /* exactly the right length */
251: }
252:
2.11 timbl 253:
2.21 frystyk 254: #if 0 /* NOT USED FOR THE MOMENT */
2.15 luotonen 255: /*
256: ** As strcpy() but guaranteed to work correctly
257: ** with overlapping parameters. AL 7 Feb 1994
258: */
259: PRIVATE void ari_strcpy ARGS2(char *, to,
260: char *, from)
261: {
262: char * tmp;
263:
264: if (!to || !from) return;
265:
266: tmp = (char*)malloc(strlen(from)+1);
267: if (!tmp) outofmem(__FILE__, "my_strcpy");
268:
269: strcpy(tmp, from);
270: strcpy(to, tmp);
271: free(tmp);
272: }
2.21 frystyk 273: #endif
274:
2.20 timbl 275:
276: /* Simplify a URI
277: // --------------
278: // A URI is allowed to contain the sequence xxx/../ which may be
1.1 timbl 279: // replaced by "" , and the sequence "/./" which may be replaced by "/".
2.20 timbl 280: // Simplification helps us recognize duplicate URIs.
1.1 timbl 281: //
282: // Thus, /etc/junk/../fred becomes /etc/fred
283: // /etc/junk/./fred becomes /etc/junk/fred
2.11 timbl 284: //
285: // but we should NOT change
286: // http://fred.xxx.edu/../..
287: //
288: // or ../../albert.html
2.26 frystyk 289: //
290: // In the same manner, the following prefixes are preserved:
291: //
292: // ./<etc>
293: // //<etc>
294: //
295: // In order to avoid empty URLs the following URLs become:
296: //
297: // /fred/.. becomes /fred/..
298: // /fred/././.. becomes /fred/..
2.27 frystyk 299: // /fred/.././junk/.././ becomes /fred/..
2.26 frystyk 300: //
2.30 frystyk 301: // If more than one set of `://' is found (several proxies in cascade) then
302: // only the part after the last `://' is simplified.
1.1 timbl 303: */
2.31 frystyk 304: PUBLIC char *HTSimplify ARGS1(char *, filename)
1.1 timbl 305: {
/* Simplifies `filename' in place and returns the buffer holding the result.
** HTCanon() is handed &filename and may substitute a different buffer, so
** callers must carry on with the return value, not with their old pointer. */
2.31 frystyk 306: char *path;
307: char *p;
2.19 frystyk 308:
/* A NULL argument is reported (when tracing) and handed straight back */
2.31 frystyk 309: if (!filename) {
310: if (URI_TRACE)
311: fprintf(stderr, "HTSimplify.. Bad argument\n");
312: return filename;
313: }
314: if (URI_TRACE)
2.27 frystyk 315: fprintf(stderr, "HTSimplify.. `%s\' ", filename);
316:
/* Decide where the part to be simplified starts: after the LAST "://"
** (the host following it is canonicalized by HTCanon(), which returns the
** start of the path), else after the first ":/", else at the very start. */
2.31 frystyk 317: if ((path = strstr(filename, "://")) != NULL) { /* Find host name */
2.30 frystyk 318: char *newptr;
2.31 frystyk 319: path += 3;
320: while ((newptr = strstr(path, "://")) != NULL)
321: path = newptr+3;
322: path = HTCanon(&filename, path); /* We have a host name */
323: } else if ((path = strstr(filename, ":/")) != NULL) {
324: path += 2;
2.27 frystyk 325: } else
2.31 frystyk 326: path = filename;
/* For a leading "//" step over the first slash so that it lies outside the
** region worked on below. */
327: if (*path == '/' && *(path+1)=='/') { /* Some URLs start //<foo> */
328: path += 1;
/* news: gets no path simplification at all.  Everything from the '@' (or,
** without one, everything after "news:") up to the end of the string is
** lower-cased and the function returns. */
2.34 frystyk 329: } else if (!strncmp(path, "news:", 5)) {
330: char *ptr = strchr(path+5, '@');
331: if (!ptr) ptr = path+5;
332: while (*ptr) { /* Make group or host lower case */
333: *ptr = TOLOWER(*ptr);
334: ptr++;
2.31 frystyk 335: }
336: if (URI_TRACE)
337: fprintf(stderr, "into\n............ `%s'\n", filename);
338: return filename; /* Doesn't need to do any more */
339: }
/* This is an assignment, not a comparison; every branch above leaves `path'
** non-NULL. */
340: if ((p = path)) {
341: int segments = 0;
342:
/* Pass 1: a position counts as the start of a `real' segment when it is the
** first character of the path or a '/', and what follows is neither empty
** (another '/' or end of string) nor "." nor "..". */
343: /* Parse string first time to find number of `real' tokens */
344: while (*p) {
345: if (*p=='/' || p==path) {
346: if (!((*(p+1)=='/' || !*(p+1)) ||
347: (*(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) ||
348: (*(p+1)=='.' && *(p+2)=='.' &&(*(p+3)=='/' || !*(p+3)))))
349: segments++;
350: }
351: p++;
352: }
2.19 frystyk 353:
/* Pass 2: edit in place.  Each removal copies the tail of the string,
** terminating zero included, down over the characters being dropped. */
2.31 frystyk 354: /* Parse string second time to simplify */
355: p = path;
356: while(*p) {
357: if (*p=='/') {
/* "/." followed by '/' or end of string, but never at the very start of the
** path: drop the two characters.  The p-- cancels the p++ at the bottom of
** the loop so that the same position is examined again. */
358: if (p>path && *(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) {
359: char *orig=p, *dest=p+2;
360: while ((*orig++ = *dest++)); /* Remove a slash and a dot */
361: p--;
/* "/.." followed by '/' or end of string, and more than one real segment
** is still left (so the result cannot become empty). */
362: } else if (segments>1 && *(p+1)=='.' && *(p+2)=='.' &&
363: (*(p+3)=='/' || !*(p+3))) {
364: char *q = p;
365: while (q>path && *--q!='/'); /* prev slash */
/* Only remove when the preceding segment is not itself ".." or ".".  If q
** stopped at the start of the path without finding a slash, the slash
** after the ".." is removed as well. */
366: if (strncmp(q, "/../", 4) && strncmp(q, "/./", 3) &&
367: strncmp(q, "./", 2)) {
368: char *orig=q, *dest=p+3;
369: if (*q!='/') dest++;
370: while ((*orig++ = *dest++)); /* Remove /xxx/.. */
371: segments--;
/* NOTE(review): when q == path this momentarily points one before `path';
** the p++ at the bottom of the loop brings it back.  Confirm this is
** acceptable if `path' can be the start of the buffer. */
372: p = q-1; /* Start again with prev slash */
373: } else
374: p++;
375: } else if (*(p+1)=='/') {
376: while (*(p+1)=='/') {
377: char *orig=p, *dest=p+1;
378: while ((*orig++ = *dest++)); /* Remove multiple /'s */
2.19 frystyk 379: }
380: }
381: }
2.31 frystyk 382: p++;
383: } /* end while (*p) */
2.19 frystyk 384: }
2.31 frystyk 385: if (URI_TRACE)
2.27 frystyk 386: fprintf(stderr, "into\n............ `%s'\n", filename);
2.31 frystyk 387: return filename;
2.19 frystyk 388: }
2.31 frystyk 389:
2.19 frystyk 390: #ifdef OLD_CODE
2.17 frystyk 391: char * p = filename;
1.1 timbl 392: char * q;
2.17 frystyk 393:
394: if (p) {
395: while (*p && (*p == '/' || *p == '.')) /* Pass starting / or .'s */
396: p++;
397: while(*p) {
398: if (*p=='/') {
1.1 timbl 399: if ((p[1]=='.') && (p[2]=='.') && (p[3]=='/' || !p[3] )) {
2.11 timbl 400: for (q=p-1; (q>=filename) && (*q!='/'); q--); /* prev slash */
401: if (q[0]=='/' && 0!=strncmp(q, "/../", 4)
402: &&!(q-1>filename && q[-1]=='/')) {
2.15 luotonen 403: ari_strcpy(q, p+3); /* Remove /xxx/.. */
1.1 timbl 404: if (!*filename) strcpy(filename, "/");
405: p = q-1; /* Start again with prev slash */
2.11 timbl 406: } else { /* xxx/.. leave it! */
2.9 timbl 407: #ifdef BUG_CODE
2.15 luotonen 408: ari_strcpy(filename, p[3] ? p+4 : p+3); /* rm xxx/../ */
1.1 timbl 409: p = filename; /* Start again */
2.9 timbl 410: #endif
1.1 timbl 411: }
412: } else if ((p[1]=='.') && (p[2]=='/' || !p[2])) {
2.15 luotonen 413: ari_strcpy(p, p+2); /* Remove a slash and a dot */
2.13 luotonen 414: } else if (p[-1] != ':') {
415: while (p[1] == '/') {
2.15 luotonen 416: ari_strcpy(p, p+1); /* Remove multiple slashes */
2.13 luotonen 417: }
1.1 timbl 418: }
2.17 frystyk 419: }
420: p++;
421: } /* end while (*p) */
422: } /* end if (p) */
1.1 timbl 423: }
2.19 frystyk 424: #endif /* OLD_CODE */
1.1 timbl 425:
426:
427: /* Make Relative Name
428: ** ------------------
429: **
430: ** This function creates and returns a string which gives an expression of
431: ** one address as related to another. Where there is no relation, an absolute
432: ** address is retured.
433: **
434: ** On entry,
435: ** Both names must be absolute, fully qualified names of nodes
436: ** (no anchor bits)
437: **
438: ** On exit,
439: ** The return result points to a newly allocated name which, if
440: ** parsed by HTParse relative to relatedName, will yield aName.
441: ** The caller is responsible for freeing the resulting name later.
442: **
443: */
2.32 frystyk 444: char * HTRelative ARGS2(CONST char *, aName, CONST char *, relatedName)
1.1 timbl 445: {
446: char * result = 0;
447: CONST char *p = aName;
448: CONST char *q = relatedName;
449: CONST char * after_access = 0;
450: CONST char * path = 0;
451: CONST char * last_slash = 0;
452: int slashes = 0;
453:
454: for(;*p; p++, q++) { /* Find extent of match */
455: if (*p!=*q) break;
456: if (*p==':') after_access = p+1;
457: if (*p=='/') {
458: last_slash = p;
459: slashes++;
460: if (slashes==3) path=p;
461: }
462: }
463:
464: /* q, p point to the first non-matching character or zero */
465:
466: if (!after_access) { /* Different access */
467: StrAllocCopy(result, aName);
468: } else if (slashes<3){ /* Different nodes */
469: StrAllocCopy(result, after_access);
2.29 frystyk 470: #if 0 /* Henrik */
1.1 timbl 471: } else if (slashes==3){ /* Same node, different path */
472: StrAllocCopy(result, path);
2.21 frystyk 473: #endif
1.1 timbl 474: } else { /* Some path in common */
475: int levels= 0;
476: for(; *q && (*q!='#'); q++) if (*q=='/') levels++;
477: result = (char *)malloc(3*levels + strlen(last_slash) + 1);
478: if (result == NULL) outofmem(__FILE__, "HTRelative");
479: result[0]=0;
480: for(;levels; levels--)strcat(result, "../");
481: strcat(result, last_slash+1);
482: }
2.31 frystyk 483: if (URI_TRACE) fprintf(stderr,
2.21 frystyk 484: "HTRelative.. `%s' expressed relative to `%s' is `%s'\n",
485: aName, relatedName, result);
1.1 timbl 486: return result;
487: }
2.1 timbl 488:
489:
2.31 frystyk 490: /* HTCanon
491: **
492: ** Canonicalizes the URL in the following manner starting from the host
493: ** pointer:
494: **
495: ** 1) The host name is converted to lowercase
496: ** 2) Expands the host name of the URL from a local name to a full
497: ** domain name. A host name is started by `://'.
2.38 frystyk 498: ** 3) Chop off port if `:80' (http), `:70' (gopher), or `:21' (ftp)
2.31 frystyk 499: **
500: ** Return: OK The position of the current path part of the URL
501: */
502: PUBLIC char *HTCanon ARGS2 (char **, filename, char *, host)
503: {
2.32 frystyk 504: char *newname = NULL;
2.31 frystyk 505: char *port;
506: char *strptr;
507: char *path;
2.36 frystyk 508: char *access = host-3;
2.31 frystyk 509:
2.36 frystyk 510: while (access>*filename && *(access-1)!='/') /* Find access method */
511: access--;
2.31 frystyk 512: if ((path = strchr(host, '/')) == NULL) /* Find path */
513: path = host + strlen(host);
514: if ((strptr = strchr(host, '@')) != NULL && strptr<path) /* UserId */
515: host = strptr;
2.39 frystyk 516: if ((port = strchr(host, ':')) != NULL && port>path) /* Port number */
517: port = NULL;
2.31 frystyk 518:
519: strptr = host; /* Convert to lower-case */
520: while (strptr<path) {
521: *strptr = TOLOWER(*strptr);
522: strptr++;
523: }
524:
525: /* Does the URL contain a full domain name? This also works for a
526: numerical host name. The domain name is already made lower-case
527: and without a trailing dot. */
2.35 frystyk 528: if (((strptr = strchr(host, '.')) == NULL || strptr >= path) &&
529: strncasecomp(host, "localhost", 9)) {
2.31 frystyk 530: CONST char *domain = HTGetDomainName();
531: if (domain) {
2.32 frystyk 532: if ((newname = (char *) calloc(1, strlen(*filename) +
2.31 frystyk 533: strlen(domain)+2)) == NULL)
534: outofmem(__FILE__, "HTCanon");
535: if (port)
2.32 frystyk 536: strncpy(newname, *filename, (int) (port-*filename));
2.31 frystyk 537: else
2.32 frystyk 538: strncpy(newname, *filename, (int) (path-*filename));
539: strcat(newname, ".");
540: strcat(newname, domain);
2.31 frystyk 541: }
542: } else { /* Look for a trailing dot */
543: char *dot = port ? port : path;
544: if (dot > *filename && *--dot=='.') {
545: char *orig=dot, *dest=dot+1;
546: while((*orig++ = *dest++));
547: if (port) port--;
548: path--;
549: }
550: }
2.36 frystyk 551: /* Chop off port if `:80' (http), `:70' (gopher), or `:21' (ftp) */
552: if (port) {
553: if ((!strncmp(access, "http", 4) &&
554: (*(port+1)=='8'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) ||
555: (!strncmp(access, "gopher", 6) &&
556: (*(port+1)=='7'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) ||
557: (!strncmp(access, "ftp", 3) &&
558: (*(port+1)=='2'&&*(port+2)=='1'&&(*(port+3)=='/'||!*(port+3))))) {
559: if (!newname) {
560: char *orig=port, *dest=port+3;
561: while((*orig++ = *dest++));
562: }
563: } else if (newname)
564: strncat(newname, port, (int) (path-port));
565: }
566:
2.32 frystyk 567: if (newname) {
568: char *newpath = newname+strlen(newname);
569: strcat(newname, path);
2.31 frystyk 570: path = newpath;
571: free(*filename); /* Free old copy */
2.32 frystyk 572: *filename = newname;
2.31 frystyk 573: }
574: return path;
575: }
2.1 timbl 576:
577:
2.24 luotonen 578: /* HTCleanTelnetString()
579: * Make sure that the given string doesn't contain characters that
580: * could cause security holes, such as newlines in ftp, gopher,
581: * news or telnet URLs; more specifically: allows everything between
2.26 frystyk 582: * ASCII 20-7E, and also A0-FE, inclusive. Also TAB ('\t') allowed!
2.24 luotonen 583: *
584: * On entry,
585: * str the string that is *modified* if necessary. The
586: * string will be truncated at the first illegal
587: * character that is encountered.
588: * On exit,
589: * returns YES, if the string was modified.
590: * NO, otherwise.
591: */
592: PUBLIC BOOL HTCleanTelnetString ARGS1(char *, str)
593: {
594: char * cur = str;
595:
596: if (!str) return NO;
597:
598: while (*cur) {
599: int a = TOASCII(*cur);
2.26 frystyk 600: if (a != 0x9 && (a < 0x20 || (a > 0x7E && a < 0xA0) || a > 0xFE)) {
2.31 frystyk 601: if (URI_TRACE)
602: fprintf(stderr, "Illegal..... character in URL: \"%s\"\n",str);
2.24 luotonen 603: *cur = 0;
2.31 frystyk 604: if (URI_TRACE)
605: fprintf(stderr, "Truncated... \"%s\"\n",str);
2.24 luotonen 606: return YES;
607: }
608: cur++;
609: }
610: return NO;
611: }
612:
Webmaster