Annotation of libwww/Library/src/HTParse.c, revision 2.31
1.1 timbl 1: /* Parse HyperText Document Address HTParse.c
2: ** ================================
2.26 frystyk 3: **
4: ** history:
5: ** May 12 94 TAB added as legal char in HTCleanTelnetString
6: **
1.1 timbl 7: */
2.31 ! frystyk 8:
1.1 timbl 9: #include "HTUtils.h"
2.31 ! frystyk 10: #include "HTTCP.h"
1.1 timbl 11: #include "HTParse.h"
12:
2.6 timbl 13: #define HEX_ESCAPE '%'
14:
/* The pieces of an address as cut up by scan(). Every member is either
** zero or points to a zero-terminated string.
*/
struct struct_parts {
    char * access;      /* Now known as "scheme" */
    char * host;        /* Text following "//", up to the next '/' */
    char * absolute;    /* Path following a leading '/' */
    char * relative;    /* Path that does not start with '/' */
/*  char * search;         no - treated as part of path */
    char * anchor;      /* Text following '#' */
};
23:
24: /* Strip white space off a string
25: ** ------------------------------
26: **
27: ** On exit,
28: ** Return value points to first non-white character, or to 0 if none.
29: ** All trailing white space is OVERWRITTEN with zero.
30: */
31:
2.13 luotonen 32: PUBLIC char * HTStrip ARGS1(char *, s)
1.1 timbl 33: {
34: #define SPACE(c) ((c==' ')||(c=='\t')||(c=='\n'))
35: char * p=s;
2.13 luotonen 36: if (!s) return NULL; /* Doesn't dump core if NULL */
37: for(p=s;*p;p++); /* Find end of string */
1.1 timbl 38: for(p--;p>=s;p--) {
39: if(SPACE(*p)) *p=0; /* Zap trailing blanks */
40: else break;
41: }
42: while(SPACE(*s))s++; /* Strip leading blanks */
43: return s;
44: }
45:
46:
47: /* Scan a filename for its consituents
48: ** -----------------------------------
49: **
50: ** On entry,
51: ** name points to a document name which may be incomplete.
52: ** On exit,
53: ** absolute or relative may be nonzero (but not both).
54: ** host, anchor and access may be nonzero if they were specified.
55: ** Any which are nonzero point to zero terminated strings.
56: */
57: #ifdef __STDC__
58: PRIVATE void scan(char * name, struct struct_parts *parts)
59: #else
60: PRIVATE void scan(name, parts)
61: char * name;
62: struct struct_parts *parts;
63: #endif
64: {
65: char * after_access;
66: char * p;
67: int length = strlen(name);
68:
69: parts->access = 0;
70: parts->host = 0;
71: parts->absolute = 0;
72: parts->relative = 0;
73: parts->anchor = 0;
74:
75: after_access = name;
76: for(p=name; *p; p++) {
77: if (*p==':') {
78: *p = 0;
2.20 timbl 79: parts->access = after_access; /* Scheme has been specified */
1.1 timbl 80: after_access = p+1;
2.22 luotonen 81: if (0==strcasecomp("URL", parts->access)) {
2.20 timbl 82: parts->access = NULL; /* Ignore IETF's URL: pre-prefix */
83: } else break;
1.1 timbl 84: }
2.20 timbl 85: if (*p=='/') break; /* Access has not been specified */
1.1 timbl 86: if (*p=='#') break;
87: }
88:
89: for(p=name+length-1; p>=name; p--) {
90: if (*p =='#') {
91: parts->anchor=p+1;
92: *p=0; /* terminate the rest */
93: }
94: }
95: p = after_access;
96: if (*p=='/'){
97: if (p[1]=='/') {
98: parts->host = p+2; /* host has been specified */
99: *p=0; /* Terminate access */
100: p=strchr(parts->host,'/'); /* look for end of host name if any */
101: if(p) {
102: *p=0; /* Terminate host */
103: parts->absolute = p+1; /* Root has been found */
104: }
105: } else {
106: parts->absolute = p+1; /* Root found but no host */
107: }
108: } else {
109: parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
110: }
111:
2.16 timbl 112: #ifdef OLD_CODE
1.1 timbl 113: /* Access specified but no host: the anchor was not really one
2.16 timbl 114: e.g. news:j462#36487@foo.bar -- JFG 10/jul/92, from bug report */
115: /* This kludge doesn't work for example when coming across
116: file:/usr/local/www/fred#123
117: which loses its anchor. Correct approach in news is to
118: escape weird characters not allowed in URL. TBL 21/dec/93
119: */
1.1 timbl 120: if (parts->access && ! parts->host && parts->anchor) {
121: *(parts->anchor - 1) = '#'; /* Restore the '#' in the address */
122: parts->anchor = 0;
123: }
2.16 timbl 124: #endif
1.1 timbl 125:
126: #ifdef NOT_DEFINED /* search is just treated as part of path */
127: {
128: char *p = relative ? relative : absolute;
129: if (p) {
130: char * q = strchr(p, '?'); /* Any search string? */
131: if (q) {
132: *q = 0; /* If so, chop that off. */
133: parts->search = q+1;
134: }
135: }
136: }
137: #endif
138: } /*scan */
139:
140:
141: /* Parse a Name relative to another name
142: ** -------------------------------------
143: **
144: ** This returns those parts of a name which are given (and requested)
145: ** substituting bits from the related name where necessary.
146: **
147: ** On entry,
148: ** aName A filename given
149: ** relatedName A name relative to which aName is to be parsed
150: ** wanted A mask for the bits which are wanted.
151: **
152: ** On exit,
153: ** returns A pointer to a malloc'd string which MUST BE FREED
154: */
155: #ifdef __STDC__
156: char * HTParse(const char * aName, const char * relatedName, int wanted)
157: #else
158: char * HTParse(aName, relatedName, wanted)
159: char * aName;
160: char * relatedName;
161: int wanted;
162: #endif
163:
164: {
165: char * result = 0;
166: char * return_value = 0;
167: int len;
168: char * name = 0;
169: char * rel = 0;
170: char * p;
2.12 timbl 171: char * access;
1.1 timbl 172: struct struct_parts given, related;
173:
174: /* Make working copies of input strings to cut up:
175: */
176: len = strlen(aName)+strlen(relatedName)+10;
177: result=(char *)malloc(len); /* Lots of space: more than enough */
178: if (result == NULL) outofmem(__FILE__, "HTParse");
179:
180: StrAllocCopy(name, aName);
181: StrAllocCopy(rel, relatedName);
182:
183: scan(name, &given);
184: scan(rel, &related);
185: result[0]=0; /* Clear string */
2.12 timbl 186: access = given.access ? given.access : related.access;
1.1 timbl 187: if (wanted & PARSE_ACCESS)
2.12 timbl 188: if (access) {
189: strcat(result, access);
1.1 timbl 190: if(wanted & PARSE_PUNCTUATION) strcat(result, ":");
191: }
192:
193: if (given.access && related.access) /* If different, inherit nothing. */
194: if (strcmp(given.access, related.access)!=0) {
195: related.host=0;
196: related.absolute=0;
197: related.relative=0;
198: related.anchor=0;
199: }
200:
201: if (wanted & PARSE_HOST)
202: if(given.host || related.host) {
203: if(wanted & PARSE_PUNCTUATION) strcat(result, "//");
204: strcat(result, given.host ? given.host : related.host);
2.31 ! frystyk 205: #if 0
! 206: /* This is now done in HTCanon */
2.12 timbl 207: #define CLEAN_URLS
2.31 ! frystyk 208: #endif
2.12 timbl 209: #ifdef CLEAN_URLS
210: /* Ignore default port numbers, and trailing dots on FQDNs
211: which will only cause identical adreesses to look different */
212: {
2.31 ! frystyk 213: char *tail = result + strlen(result);
! 214: char *p = strchr(tail, ':');
2.12 timbl 215: if (p && access) { /* Port specified */
216: if ( ( strcmp(access, "http") == 0
217: && strcmp(p, ":80") == 0 )
218: ||
219: ( strcmp(access, "gopher") == 0
220: && strcmp(p, ":70") == 0 )
221: )
222: *p = (char)0; /* It is the default: ignore it */
223: }
224: if (!p) p = tail + strlen(tail); /* After hostname */
2.21 frystyk 225: if (*p) { /* Henrik 17/04-94 */
226: p--; /* End of hostname */
227: if (*p == '.') *p = (char)0; /* chop final . */
228: }
2.12 timbl 229: }
230: #endif
1.1 timbl 231: }
232:
233: if (given.host && related.host) /* If different hosts, inherit no path. */
234: if (strcmp(given.host, related.host)!=0) {
235: related.absolute=0;
236: related.relative=0;
237: related.anchor=0;
238: }
239:
240: if (wanted & PARSE_PATH) {
241: if(given.absolute) { /* All is given */
242: if(wanted & PARSE_PUNCTUATION) strcat(result, "/");
243: strcat(result, given.absolute);
244: } else if(related.absolute) { /* Adopt path not name */
245: strcat(result, "/");
246: strcat(result, related.absolute);
247: if (given.relative) {
248: p = strchr(result, '?'); /* Search part? */
249: if (!p) p=result+strlen(result)-1;
250: for (; *p!='/'; p--); /* last / */
251: p[1]=0; /* Remove filename */
252: strcat(result, given.relative); /* Add given one */
2.31 ! frystyk 253: result = HTSimplify (result);
1.1 timbl 254: }
255: } else if(given.relative) {
256: strcat(result, given.relative); /* what we've got */
257: } else if(related.relative) {
258: strcat(result, related.relative);
259: } else { /* No inheritance */
260: strcat(result, "/");
261: }
262: }
263:
264: if (wanted & PARSE_ANCHOR)
265: if(given.anchor || related.anchor) {
266: if(wanted & PARSE_PUNCTUATION) strcat(result, "#");
267: strcat(result, given.anchor ? given.anchor : related.anchor);
268: }
269: free(rel);
270: free(name);
271:
272: StrAllocCopy(return_value, result);
273: free(result);
274: return return_value; /* exactly the right length */
275: }
276:
2.11 timbl 277:
#if 0  /* NOT USED FOR THE MOMENT */
/*
**	Overlap-safe stand-in for strcpy(): the source is first duplicated
**	into a scratch buffer and the destination is filled from that copy,
**	so `to' and `from' may share storage.  Does nothing if either
**	argument is NULL.  AL 7 Feb 1994
*/
PRIVATE void ari_strcpy ARGS2(char *, to,
			      char *, from)
{
    char * scratch;

    if (to && from) {
	scratch = (char*)malloc(strlen(from)+1);
	if (!scratch) outofmem(__FILE__, "my_strcpy");

	strcpy(scratch, from);
	strcpy(to, scratch);
	free(scratch);
    }
}
#endif
298:
2.20 timbl 299:
300: /* Simplify a URI
301: // --------------
302: // A URI is allowed to contain the seqeunce xxx/../ which may be
1.1 timbl 303: // replaced by "" , and the seqeunce "/./" which may be replaced by "/".
2.20 timbl 304: // Simplification helps us recognize duplicate URIs.
1.1 timbl 305: //
306: // Thus, /etc/junk/../fred becomes /etc/fred
307: // /etc/junk/./fred becomes /etc/junk/fred
2.11 timbl 308: //
309: // but we should NOT change
310: // http://fred.xxx.edu/../..
311: //
312: // or ../../albert.html
2.26 frystyk 313: //
314: // In the same manner, the following prefixed are preserved:
315: //
316: // ./<etc>
317: // //<etc>
318: //
319: // In order to avoid empty URLs the following URLs become:
320: //
321: // /fred/.. becomes /fred/..
322: // /fred/././.. becomes /fred/..
2.27 frystyk 323: // /fred/.././junk/.././ becomes /fred/..
2.26 frystyk 324: //
2.30 frystyk 325: // If more than one set of `://' is found (several proxies in cascade) then
326: // only the part after the last `://' is simplified.
1.1 timbl 327: */
2.31 ! frystyk 328: PUBLIC char *HTSimplify ARGS1(char *, filename)
1.1 timbl 329: {
2.31 ! frystyk 330: char *path;
! 331: char *p;
2.19 frystyk 332:
2.31 ! frystyk 333: if (!filename) {
! 334: if (URI_TRACE)
! 335: fprintf(stderr, "HTSimplify.. Bad argument\n");
! 336: return filename;
! 337: }
! 338: if (URI_TRACE)
2.27 frystyk 339: fprintf(stderr, "HTSimplify.. `%s\' ", filename);
340:
2.31 ! frystyk 341: if ((path = strstr(filename, "://")) != NULL) { /* Find host name */
2.30 frystyk 342: char *newptr;
2.31 ! frystyk 343: path += 3;
! 344: while ((newptr = strstr(path, "://")) != NULL)
! 345: path = newptr+3;
! 346: path = HTCanon(&filename, path); /* We have a host name */
! 347: } else if ((path = strstr(filename, ":/")) != NULL) {
! 348: path += 2;
2.27 frystyk 349: } else
2.31 ! frystyk 350: path = filename;
! 351: if (*path == '/' && *(path+1)=='/') { /* Some URLs start //<foo> */
! 352: path += 1;
! 353: } else if (!strncmp(path, "news:", 5)) { /* Make group lower case */
! 354: char *group = path+5;
! 355: while (*group && *group!='@' && *group!='/') {
! 356: *group = TOLOWER(*group);
! 357: group++;
! 358: }
! 359: if (URI_TRACE)
! 360: fprintf(stderr, "into\n............ `%s'\n", filename);
! 361: return filename; /* Doesn't need to do any more */
! 362: }
! 363: if ((p = path)) {
! 364: int segments = 0;
! 365:
! 366: /* Parse string first time to find number of `real' tokens */
! 367: while (*p) {
! 368: if (*p=='/' || p==path) {
! 369: if (!((*(p+1)=='/' || !*(p+1)) ||
! 370: (*(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) ||
! 371: (*(p+1)=='.' && *(p+2)=='.' &&(*(p+3)=='/' || !*(p+3)))))
! 372: segments++;
! 373: }
! 374: p++;
! 375: }
2.19 frystyk 376:
2.31 ! frystyk 377: /* Parse string second time to simplify */
! 378: p = path;
! 379: while(*p) {
! 380: if (*p=='/') {
! 381: if (p>path && *(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) {
! 382: char *orig=p, *dest=p+2;
! 383: while ((*orig++ = *dest++)); /* Remove a slash and a dot */
! 384: p--;
! 385: } else if (segments>1 && *(p+1)=='.' && *(p+2)=='.' &&
! 386: (*(p+3)=='/' || !*(p+3))) {
! 387: char *q = p;
! 388: while (q>path && *--q!='/'); /* prev slash */
! 389: if (strncmp(q, "/../", 4) && strncmp(q, "/./", 3) &&
! 390: strncmp(q, "./", 2)) {
! 391: char *orig=q, *dest=p+3;
! 392: if (*q!='/') dest++;
! 393: while ((*orig++ = *dest++)); /* Remove /xxx/.. */
! 394: segments--;
! 395: p = q-1; /* Start again with prev slash */
! 396: } else
! 397: p++;
! 398: } else if (*(p+1)=='/') {
! 399: while (*(p+1)=='/') {
! 400: char *orig=p, *dest=p+1;
! 401: while ((*orig++ = *dest++)); /* Remove multiple /'s */
2.19 frystyk 402: }
403: }
404: }
2.31 ! frystyk 405: p++;
! 406: } /* end while (*p) */
2.19 frystyk 407: }
2.31 ! frystyk 408: if (URI_TRACE)
2.27 frystyk 409: fprintf(stderr, "into\n............ `%s'\n", filename);
2.31 ! frystyk 410: return filename;
2.19 frystyk 411: }
2.31 ! frystyk 412:
2.19 frystyk 413: #ifdef OLD_CODE
2.17 frystyk 414: char * p = filename;
1.1 timbl 415: char * q;
2.17 frystyk 416:
417: if (p) {
418: while (*p && (*p == '/' || *p == '.')) /* Pass starting / or .'s */
419: p++;
420: while(*p) {
421: if (*p=='/') {
1.1 timbl 422: if ((p[1]=='.') && (p[2]=='.') && (p[3]=='/' || !p[3] )) {
2.11 timbl 423: for (q=p-1; (q>=filename) && (*q!='/'); q--); /* prev slash */
424: if (q[0]=='/' && 0!=strncmp(q, "/../", 4)
425: &&!(q-1>filename && q[-1]=='/')) {
2.15 luotonen 426: ari_strcpy(q, p+3); /* Remove /xxx/.. */
1.1 timbl 427: if (!*filename) strcpy(filename, "/");
428: p = q-1; /* Start again with prev slash */
2.11 timbl 429: } else { /* xxx/.. leave it! */
2.9 timbl 430: #ifdef BUG_CODE
2.15 luotonen 431: ari_strcpy(filename, p[3] ? p+4 : p+3); /* rm xxx/../ */
1.1 timbl 432: p = filename; /* Start again */
2.9 timbl 433: #endif
1.1 timbl 434: }
435: } else if ((p[1]=='.') && (p[2]=='/' || !p[2])) {
2.15 luotonen 436: ari_strcpy(p, p+2); /* Remove a slash and a dot */
2.13 luotonen 437: } else if (p[-1] != ':') {
438: while (p[1] == '/') {
2.15 luotonen 439: ari_strcpy(p, p+1); /* Remove multiple slashes */
2.13 luotonen 440: }
1.1 timbl 441: }
2.17 frystyk 442: }
443: p++;
444: } /* end while (*p) */
445: } /* end if (p) */
1.1 timbl 446: }
2.19 frystyk 447: #endif /* OLD_CODE */
1.1 timbl 448:
449:
450: /* Make Relative Name
451: ** ------------------
452: **
453: ** This function creates and returns a string which gives an expression of
454: ** one address as related to another. Where there is no relation, an absolute
455: ** address is retured.
456: **
457: ** On entry,
458: ** Both names must be absolute, fully qualified names of nodes
459: ** (no anchor bits)
460: **
461: ** On exit,
462: ** The return result points to a newly allocated name which, if
463: ** parsed by HTParse relative to relatedName, will yield aName.
464: ** The caller is responsible for freeing the resulting name later.
465: **
466: */
467: #ifdef __STDC__
468: char * HTRelative(const char * aName, const char *relatedName)
469: #else
470: char * HTRelative(aName, relatedName)
471: char * aName;
472: char * relatedName;
473: #endif
474: {
475: char * result = 0;
476: CONST char *p = aName;
477: CONST char *q = relatedName;
478: CONST char * after_access = 0;
479: CONST char * path = 0;
480: CONST char * last_slash = 0;
481: int slashes = 0;
482:
483: for(;*p; p++, q++) { /* Find extent of match */
484: if (*p!=*q) break;
485: if (*p==':') after_access = p+1;
486: if (*p=='/') {
487: last_slash = p;
488: slashes++;
489: if (slashes==3) path=p;
490: }
491: }
492:
493: /* q, p point to the first non-matching character or zero */
494:
495: if (!after_access) { /* Different access */
496: StrAllocCopy(result, aName);
497: } else if (slashes<3){ /* Different nodes */
498: StrAllocCopy(result, after_access);
2.29 frystyk 499: #if 0 /* Henrik */
1.1 timbl 500: } else if (slashes==3){ /* Same node, different path */
501: StrAllocCopy(result, path);
2.21 frystyk 502: #endif
1.1 timbl 503: } else { /* Some path in common */
504: int levels= 0;
505: for(; *q && (*q!='#'); q++) if (*q=='/') levels++;
506: result = (char *)malloc(3*levels + strlen(last_slash) + 1);
507: if (result == NULL) outofmem(__FILE__, "HTRelative");
508: result[0]=0;
509: for(;levels; levels--)strcat(result, "../");
510: strcat(result, last_slash+1);
511: }
2.31 ! frystyk 512: if (URI_TRACE) fprintf(stderr,
2.21 frystyk 513: "HTRelative.. `%s' expressed relative to `%s' is `%s'\n",
514: aName, relatedName, result);
1.1 timbl 515: return result;
516: }
2.1 timbl 517:
518:
2.31 ! frystyk 519: /* HTCanon
! 520: **
! 521: ** Canonicalizes the URL in the following manner starting from the host
! 522: ** pointer:
! 523: **
! 524: ** 1) The host name is converted to lowercase
! 525: ** 2) Expands the host name of the URL from a local name to a full
! 526: ** domain name. A host name is started by `://'.
! 527: ** 3) The default port indication :80, :70, and :21 for are stripped
! 528: **
! 529: ** Return: OK The position of the current path part of the URL
! 530: */
! 531: PUBLIC char *HTCanon ARGS2 (char **, filename, char *, host)
! 532: {
! 533: char *new = NULL;
! 534: char *port;
! 535: char *strptr;
! 536: char *path;
! 537:
! 538: if ((path = strchr(host, '/')) == NULL) /* Find path */
! 539: path = host + strlen(host);
! 540: if ((strptr = strchr(host, '@')) != NULL && strptr<path) /* UserId */
! 541: host = strptr;
! 542: port = strchr(host, ':'); /* Port number */
! 543:
! 544: strptr = host; /* Convert to lower-case */
! 545: while (strptr<path) {
! 546: *strptr = TOLOWER(*strptr);
! 547: strptr++;
! 548: }
! 549:
! 550: /* Does the URL contain a full domain name? This also works for a
! 551: numerical host name. The domain name is already made lower-case
! 552: and without a trailing dot. */
! 553: if ((strptr = strchr(host, '.')) == NULL || strptr >= path) {
! 554: CONST char *domain = HTGetDomainName();
! 555: if (domain) {
! 556: if ((new = (char *) calloc(1, strlen(*filename) +
! 557: strlen(domain)+2)) == NULL)
! 558: outofmem(__FILE__, "HTCanon");
! 559: if (port)
! 560: strncpy(new, *filename, (int) (port-*filename));
! 561: else
! 562: strncpy(new, *filename, (int) (path-*filename));
! 563: strcat(new, ".");
! 564: strcat(new, domain);
! 565: }
! 566: } else { /* Look for a trailing dot */
! 567: char *dot = port ? port : path;
! 568: if (dot > *filename && *--dot=='.') {
! 569: char *orig=dot, *dest=dot+1;
! 570: while((*orig++ = *dest++));
! 571: if (port) port--;
! 572: path--;
! 573: }
! 574: }
! 575:
! 576: /* Chop off port if `:80' (http), `:70' (gopher), or `:21' (ftp) */
! 577: if (port) {
! 578: if ((*(port+1)=='8' && *(port+2)=='0' &&
! 579: (*(port+3)=='/' || !*(port+3))) ||
! 580: (*(port+1)=='2' && *(port+2)=='1' &&
! 581: (*(port+3)=='/' || !*(port+3))) ||
! 582: (*(port+1)=='7' && *(port+2)=='0' &&
! 583: (*(port+3)=='/' || !*(port+3)))) {
! 584: if (!new) {
! 585: char *orig=port, *dest=port+3;
! 586: while((*orig++ = *dest++));
! 587: }
! 588: } else if (new)
! 589: strncat(new, port, (int) (path-port));
! 590: }
! 591: if (new) {
! 592: char *newpath = new+strlen(new);
! 593: strcat(new, path);
! 594: path = newpath;
! 595: free(*filename); /* Free old copy */
! 596: *filename = new;
! 597: }
! 598: return path;
! 599: }
! 600:
! 601:
2.6 timbl 602: /* Escape undesirable characters using % HTEscape()
603: ** -------------------------------------
604: **
605: ** This function takes a pointer to a string in which
606: ** some characters may be unacceptable unescaped.
607: ** It returns a string which has these characters
608: ** represented by a '%' character followed by two hex digits.
609: **
2.20 timbl 610: ** In the tradition of being conservative in what you do and liberal
611: ** in what you accept, we encode some characters which in fact are
612: ** allowed in URLs unencoded -- so DON'T use the table below for
613: ** parsing!
614: **
2.6 timbl 615: ** Unlike HTUnEscape(), this routine returns a malloced string.
2.20 timbl 616: **
2.6 timbl 617: */
618:
/* Not BOTH static AND const at the same time in gcc :-(, Henrik 18/03-94
**  code gen error in gcc when making random access to
**  static const table(!!)  */
/* PRIVATE CONST unsigned char isAcceptable[96] = */

/* Table of the 96 characters 0x20 (space) to 0x7F (DEL), one entry each,
** indexed by character code minus 32.  An entry is a set of bits; a
** character may be left unescaped when its entry has a bit in common with
** the caller's mask.  So 7 is acceptable under every mask, 6 ('+') under
** bits 1 and 2 only, 4 ('/') under bit 2 only, and 0 is always escaped.
*/
PRIVATE unsigned char isAcceptable[96] =

/* Overencodes */
/*	Bit 0		xalpha		-- see HTFile.h
**	Bit 1		xpalpha		-- as xalpha but with plus.
**	Bit 2 ...	path		-- as xpalpha but with /
*/
    /*   0 1 2 3 4 5 6 7 8 9 A B C D E F */
    {    0,0,0,0,0,0,0,0,0,0,7,6,0,7,7,4,	/* 2x   !"#$%&'()*+,-./	 */
         7,7,7,7,7,7,7,7,7,7,0,0,0,0,0,0,	/* 3x  0123456789:;<=>?	 */
	 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 4x  @ABCDEFGHIJKLMNO  */
	 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,7,	/* 5X  PQRSTUVWXYZ[\]^_	 */
	 0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,	/* 6x  `abcdefghijklmno	 */
	 7,7,7,7,7,7,7,7,7,7,7,0,0,0,0,0 };	/* 7X  pqrstuvwxyz{|}~	DEL */

/* Digits used to write out the two hex characters of an escape */
PRIVATE char *hex = "0123456789ABCDEF";
639:
2.8 timbl 640: PUBLIC char * HTEscape ARGS2 (CONST char *, str,
2.6 timbl 641: unsigned char, mask)
642: {
643: #define ACCEPTABLE(a) ( a>=32 && a<128 && ((isAcceptable[a-32]) & mask))
644: CONST char * p;
645: char * q;
646: char * result;
647: int unacceptable = 0;
648: for(p=str; *p; p++)
649: if (!ACCEPTABLE((unsigned char)TOASCII(*p)))
650: unacceptable++;
651: result = (char *) malloc(p-str + unacceptable+ unacceptable + 1);
652: if (result == NULL) outofmem(__FILE__, "HTEscape");
653: for(q=result, p=str; *p; p++) {
654: unsigned char a = TOASCII(*p);
655: if (!ACCEPTABLE(a)) {
656: *q++ = HEX_ESCAPE; /* Means hex commming */
657: *q++ = hex[a >> 4];
658: *q++ = hex[a & 15];
659: }
660: else *q++ = *p;
661: }
662: *q++ = 0; /* Terminate */
663: return result;
664: }
665:
666:
2.1 timbl 667: /* Decode %xx escaped characters HTUnEscape()
668: ** -----------------------------
669: **
670: ** This function takes a pointer to a string in which some
671: ** characters may have been encoded in %xy form, where xy is
672: ** the acsii hex code for character 16x+y.
673: ** The string is converted in place, as it will never grow.
674: */
675:
676: PRIVATE char from_hex ARGS1(char, c)
677: {
2.6 timbl 678: return c >= '0' && c <= '9' ? c - '0'
679: : c >= 'A' && c <= 'F'? c - 'A' + 10
680: : c - 'a' + 10; /* accept small letters just in case */
2.1 timbl 681: }
682:
683: PUBLIC char * HTUnEscape ARGS1( char *, str)
684: {
685: char * p = str;
686: char * q = str;
2.25 frystyk 687:
688: if (!str) { /* Just for safety ;-) */
2.31 ! frystyk 689: if (URI_TRACE)
2.25 frystyk 690: fprintf(stderr, "HTUnEscape.. Called with NULL argument.\n");
691: return "";
692: }
2.1 timbl 693: while(*p) {
2.6 timbl 694: if (*p == HEX_ESCAPE) {
2.1 timbl 695: p++;
696: if (*p) *q = from_hex(*p++) * 16;
697: if (*p) *q = FROMASCII(*q + from_hex(*p++));
698: q++;
699: } else {
700: *q++ = *p++;
701: }
702: }
703:
704: *q++ = 0;
705: return str;
706:
707: } /* HTUnEscape */
708:
709:
2.24 luotonen 710: /* HTCleanTelnetString()
711: * Make sure that the given string doesn't contain characters that
712: * could cause security holes, such as newlines in ftp, gopher,
713: * news or telnet URLs; more specifically: allows everything between
2.26 frystyk 714: * ASCII 20-7E, and also A0-FE, inclusive. Also TAB ('\t') allowed!
2.24 luotonen 715: *
716: * On entry,
717: * str the string that is *modified* if necessary. The
718: * string will be truncated at the first illegal
719: * character that is encountered.
720: * On exit,
721: * returns YES, if the string was modified.
722: * NO, otherwise.
723: */
724: PUBLIC BOOL HTCleanTelnetString ARGS1(char *, str)
725: {
726: char * cur = str;
727:
728: if (!str) return NO;
729:
730: while (*cur) {
731: int a = TOASCII(*cur);
2.26 frystyk 732: if (a != 0x9 && (a < 0x20 || (a > 0x7E && a < 0xA0) || a > 0xFE)) {
2.31 ! frystyk 733: if (URI_TRACE)
! 734: fprintf(stderr, "Illegal..... character in URL: \"%s\"\n",str);
2.24 luotonen 735: *cur = 0;
2.31 ! frystyk 736: if (URI_TRACE)
! 737: fprintf(stderr, "Truncated... \"%s\"\n",str);
2.24 luotonen 738: return YES;
739: }
740: cur++;
741: }
742: return NO;
743: }
744:
Webmaster