libwww/Library/src/HTParse.c - view

File: [Public] / libwww / Library / src / HTParse.c
Revision 2.60: download - view: text, annotated - select for diffs
Mon Nov 13 00:14:17 1995 UTC (28 years, 6 months ago) by frystyk
Branches: MAIN
CVS tags: v4/0pre6, v4/0C, v4/0B, v4/0, HEAD

Better News support

/*                                                                   HTParse.c
**      URI MANAGEMENT
**
**      (c) COPYRIGHT MIT 1995.
**      Please first read the full copyright statement in the file COPYRIGH.
**
** history:
**      May 12 94       TAB added as legal char in HTCleanTelnetString
**
*/

/* Library include files */
#include "tcp.h"
#include "HTUtils.h"
#include "HTParse.h"
#include "HTString.h"
#include "HTTCP.h"

/*
**  The pieces of a name as located by scan(). Every member that is
**  non-zero points into the buffer that was handed to scan().
*/
struct struct_parts {
    char * access;              /* Now known as "scheme" */
    char * host;
    char * absolute;
    char * relative;
/*  char * search;              no - treated as part of path */
    char * anchor;
};

/*      Scan a filename for its constituents
**      ------------------------------------
**
** On entry,
**      name    points to a document name which may be incomplete. The
**              buffer is cut up in place: separators are overwritten
**              with zero bytes.
**      parts   receives the result; all members are reset first.
** On exit,
**      absolute or relative may be nonzero (but not both).
**      host, anchor and access may be nonzero if they were specified.
**      Any which are nonzero point to zero terminated strings.
**
**      A leading "URL:" pre-prefix (any case) is skipped.
*/
PRIVATE void scan (char * name, struct struct_parts * parts)
{
    char * after_access = name;
    char * p;
    /* Remember the end now: the scheme scan below writes a zero byte */
    char * name_end = name + strlen(name);

    parts->access = 0;
    parts->host = 0;
    parts->absolute = 0;
    parts->relative = 0;
    parts->anchor = 0;

    for (p = name; *p; p++) {
        /*
        ** A scheme ends at the first ':' and cannot contain any of these,
        ** so a ':' found after one of them (e.g. inside a query string)
        ** is not a scheme separator.
        */
        if (*p == '/' || *p == '#' || *p == '?')
            break;                      /* Access has not been specified */

        if (*p == ':') {
            *p = 0;
            parts->access = after_access; /* Scheme has been specified */

/* The combination of gcc, the "-O" flag and the HP platform is
   unhealthy. The following three lines is a quick & dirty fix, but is
   not recommended. Rather, turn off "-O". */

/*          after_access = p;*/
/*          while (*after_access == 0)*/
/*              after_access++;*/

            after_access = p + 1;

            if (0 == strcasecomp("URL", parts->access)) {
                parts->access = NULL;   /* Ignore IETF's URL: pre-prefix */
            } else
                break;
        }
    }

    /*
    ** Walk backwards over the whole name. Every '#' is replaced by a zero
    ** byte, so the anchor ends up being the text between the first '#' and
    ** the second one (or the end of the name). The pointer is decremented
    ** before it is used so that it never moves in front of the buffer.
    */
    for (p = name_end; p > name; ) {
        --p;
        if (*p == '#') {
            parts->anchor = p + 1;
            *p = 0;                     /* terminate the rest */
        }
    }

    p = after_access;
    if (*p == '/') {
        if (p[1] == '/') {
            parts->host = p + 2;        /* host has been specified */
            *p = 0;                     /* Terminate access */
            p = strchr(parts->host, '/'); /* look for end of host name if any */
            if (p) {
                *p = 0;                 /* Terminate host */
                parts->absolute = p + 1; /* Root has been found */
            }
        } else {
            parts->absolute = p + 1;    /* Root found but no host */
        }
    } else {
        parts->relative = (*after_access) ? after_access : 0; /* zero for "" */
    }
}


/*      Parse a Name relative to another name
**      -------------------------------------
**
**      This returns those parts of a name which are given (and requested)
**      substituting bits from the related name where necessary.
**
** On entry,
**      aName           A filename given
**      relatedName     A name relative to which aName is to be parsed. Give
**                      it an empty string if aName is absolute.
**      wanted          A mask for the bits which are wanted.
** ** On exit, ** returns A pointer to a malloc'd string which MUST BE FREED */ PUBLIC char * HTParse (CONST char *aName, CONST char *relatedName, int wanted) { char * result = 0; char * return_value = 0; int len; char * name = 0; char * rel = 0; char * p; char * access; struct struct_parts given, related; if (!relatedName) /* HWL 23/8/94: dont dump due to NULL */ relatedName = ""; /* Make working copies of input strings to cut up: */ len = strlen(aName)+strlen(relatedName)+10; result=(char *)malloc(len); /* Lots of space: more than enough */ if (result == NULL) outofmem(__FILE__, "HTParse"); StrAllocCopy(name, aName); StrAllocCopy(rel, relatedName); scan(name, &given); scan(rel, &related); result[0]=0; /* Clear string */ access = given.access ? given.access : related.access; if (wanted & PARSE_ACCESS) if (access) { strcat(result, access); if(wanted & PARSE_PUNCTUATION) strcat(result, ":"); } if (given.access && related.access) /* If different, inherit nothing. */ if (strcmp(given.access, related.access)!=0) { related.host=0; related.absolute=0; related.relative=0; related.anchor=0; } if (wanted & PARSE_HOST) if(given.host || related.host) { if(wanted & PARSE_PUNCTUATION) strcat(result, "//"); strcat(result, given.host ? given.host : related.host); } if (given.host && related.host) /* If different hosts, inherit no path. */ if (strcmp(given.host, related.host)!=0) { related.absolute=0; related.relative=0; related.anchor=0; } if (wanted & PARSE_PATH) { if(given.absolute) { /* All is given */ if(wanted & PARSE_PUNCTUATION) strcat(result, "/"); strcat(result, given.absolute); } else if(related.absolute) { /* Adopt path not name */ strcat(result, "/"); strcat(result, related.absolute); if (given.relative) { p = strchr(result, '?'); /* Search part? 
*/ if (!p) p=result+strlen(result)-1; for (; *p!='/'; p--); /* last / */ p[1]=0; /* Remove filename */ strcat(result, given.relative); /* Add given one */ #if 0 result = HTSimplify (&result); #endif } } else if(given.relative) { strcat(result, given.relative); /* what we've got */ } else if(related.relative) { strcat(result, related.relative); } else { /* No inheritance */ strcat(result, "/"); } } if (wanted & PARSE_ANCHOR) if(given.anchor || related.anchor) { if(given.absolute && given.anchor) { /*Fixes for relURLs...*/ if(wanted & PARSE_PUNCTUATION) strcat(result, "#"); strcat(result, given.anchor); } else if (!(given.absolute) && !(given.anchor)) { strcat(result, ""); } else { if(wanted & PARSE_PUNCTUATION) strcat(result, "#"); strcat(result, given.anchor ? given.anchor : related.anchor); } } free(rel); free(name); StrAllocCopy(return_value, result); free(result); return return_value; /* exactly the right length */ } /* HTCanon ** ** Canonicalizes the URL in the following manner starting from the host ** pointer: ** ** 1) The host name is converted to lowercase ** 2) Expands the host name of the URL from a local name to a full ** domain name. A host name is started by `://'. ** 3) Chop off port if `:80' (http), `:70' (gopher), or `:21' (ftp) ** ** Return: OK The position of the current path part of the URL ** which might be the old one or a new one. */ PRIVATE char *HTCanon (char ** filename, char * host) { char *newname = NULL; char *port; char *strptr; char *path; char *access = host-3; while (access>*filename && *(access-1)!='/') /* Find access method */ access--; if ((path = strchr(host, '/')) == NULL) /* Find path */ path = host + strlen(host); if ((strptr = strchr(host, '@')) != NULL && strptr<path) /* UserId */ host = strptr; if ((port = strchr(host, ':')) != NULL && port>path) /* Port number */ port = NULL; strptr = host; /* Convert to lower-case */ while (strptr<path) { *strptr = TOLOWER(*strptr); strptr++; } /* Does the URL contain a full domain name? 
This also works for a numerical host name. The domain name is already made lower-case and without a trailing dot. */ if (((strptr = strchr(host, '.')) == NULL || strptr >= path) && strncasecomp(host, "localhost", 9)) { CONST char *domain = HTGetDomainName(); if (domain && *domain) { if ((newname = (char *) calloc(1, strlen(*filename) + strlen(domain)+2)) == NULL) outofmem(__FILE__, "HTCanon"); if (port) strncpy(newname, *filename, (int) (port-*filename)); else strncpy(newname, *filename, (int) (path-*filename)); strcat(newname, "."); strcat(newname, domain); } } else { /* Look for a trailing dot */ char *dot = port ? port : path; if (dot > *filename && *--dot=='.') { char *orig=dot, *dest=dot+1; while((*orig++ = *dest++)); if (port) port--; path--; } } /* Chop off port if `:', `:80' (http), `:70' (gopher), or `:21' (ftp) */ if (port) { if (!*(port+1) || *(port+1)=='/') { if (!newname) { char *orig=port, *dest=port+1; while((*orig++ = *dest++)); } } else if ((!strncmp(access, "http", 4) && (*(port+1)=='8'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) || (!strncmp(access, "gopher", 6) && (*(port+1)=='7'&&*(port+2)=='0'&&(*(port+3)=='/'||!*(port+3)))) || (!strncmp(access, "ftp", 3) && (*(port+1)=='2'&&*(port+2)=='1'&&(*(port+3)=='/'||!*(port+3))))) { if (!newname) { char *orig=port, *dest=port+3; while((*orig++ = *dest++)); path -= 3; /* Update path position, Henry Minsky */ } } else if (newname) strncat(newname, port, (int) (path-port)); } if (newname) { char *newpath = newname+strlen(newname); strcat(newname, path); path = newpath; free(*filename); /* Free old copy */ *filename = newname; } return path; } /* Simplify a URI // -------------- // A URI is allowed to contain the seqeunce xxx/../ which may be // replaced by "" , and the seqeunce "/./" which may be replaced by "/". // Simplification helps us recognize duplicate URIs. 
// // Thus, /etc/junk/../fred becomes /etc/fred // /etc/junk/./fred becomes /etc/junk/fred // // but we should NOT change // http://fred.xxx.edu/../.. // // or ../../albert.html // // In order to avoid empty URLs the following URLs become: // // /fred/.. becomes /fred/.. // /fred/././.. becomes /fred/.. // /fred/.././junk/.././ becomes /fred/.. // // If more than one set of `://' is found (several proxies in cascade) then // only the part after the last `://' is simplified. // // Returns: A string which might be the old one or a new one. */ PUBLIC char *HTSimplify (char ** url) { char *path; char *p; if (!url || !*url) { if (URI_TRACE) TTYPrint(TDEST, "HTSimplify.. Nothing done\n"); return *url; } if (URI_TRACE) TTYPrint(TDEST, "HTSimplify.. `%s\' ", *url); /* Find any scheme name */ if ((path = strstr(*url, "://")) != NULL) { /* Find host name */ char *newptr; char *access = *url; while (access<path && (*access=TOLOWER(*access))) access++; path += 3; while ((newptr = strstr(path, "://")) != NULL) /* For proxies */ path = newptr+3; path = HTCanon(url, path); /* We have a host name */ } else if ((path = strstr(*url, ":/")) != NULL) { path += 2; } else path = *url; if (*path == '/' && *(path+1)=='/') { /* Some URLs start //<foo> */ path += 1; } else if (!strncmp(path, "news:", 5)) { char *ptr = strchr(path+5, '@'); if (!ptr) ptr = path+5; while (*ptr) { /* Make group or host lower case */ *ptr = TOLOWER(*ptr); ptr++; } if (URI_TRACE) TTYPrint(TDEST, "into\n............ `%s'\n", *url); return *url; /* Doesn't need to do any more */ } if ((p = path)) { char *end; if (!((end = strchr(path, ';')) || (end = strchr(path, '?')) || (end = strchr(path, '#')))) end = path+strlen(path); /* Parse string second time to simplify */ p = path; while(p<end) { if (*p=='/') { if (p>*url && *(p+1)=='.' && (*(p+2)=='/' || !*(p+2))) { char *orig = p+1; char *dest = (*(p+2)!='/') ? p+2 : p+3; while ((*orig++ = *dest++)); /* Remove a slash and a dot */ end = orig-1; } else if (*(p+1)=='.' 
&& *(p+2)=='.' && (*(p+3)=='/' || !*(p+3))) { char *q = p; while (q>path && *--q!='/'); /* prev slash */ if (strncmp(q, "/../", 4)) { char *orig = q+1; char *dest = (*(p+3)!='/') ? p+3 : p+4; while ((*orig++ = *dest++)); /* Remove /xxx/.. */ end = orig-1; p = q; /* Start again with prev slash */ } else p++; } else if (*(p+1)=='/') { while (*(p+1)=='/') { char *orig=p, *dest=p+1; while ((*orig++ = *dest++)); /* Remove multiple /'s */ end = orig-1; } } else p++; } else p++; } } if (URI_TRACE) TTYPrint(TDEST, "into\n............ `%s'\n", *url); return *url; } #ifdef OLD_CODE char * p = filename; char * q; if (p) { while (*p && (*p == '/' || *p == '.')) /* Pass starting / or .'s */ p++; while(*p) { if (*p=='/') { if ((p[1]=='.') && (p[2]=='.') && (p[3]=='/' || !p[3] )) { for (q=p-1; (q>=filename) && (*q!='/'); q--); /* prev slash */ if (q[0]=='/' && 0!=strncmp(q, "/../", 4) &&!(q-1>filename && q[-1]=='/')) { ari_strcpy(q, p+3); /* Remove /xxx/.. */ if (!*filename) strcpy(filename, "/"); p = q-1; /* Start again with prev slash */ } else { /* xxx/.. leave it! */ #ifdef BUG_CODE ari_strcpy(filename, p[3] ? p+4 : p+3); /* rm xxx/../ */ p = filename; /* Start again */ #endif } } else if ((p[1]=='.') && (p[2]=='/' || !p[2])) { ari_strcpy(p, p+2); /* Remove a slash and a dot */ } else if (p[-1] != ':') { while (p[1] == '/') { ari_strcpy(p, p+1); /* Remove multiple slashes */ } } } p++; } /* end while (*p) */ } /* end if (p) */ } #endif /* OLD_CODE */ /* Make Relative Name ** ------------------ ** ** This function creates and returns a string which gives an expression of ** one address as related to another. Where there is no relation, an absolute ** address is retured. ** ** On entry, ** Both names must be absolute, fully qualified names of nodes ** (no anchor bits) ** ** On exit, ** The return result points to a newly allocated name which, if ** parsed by HTParse relative to relatedName, will yield aName. ** The caller is responsible for freeing the resulting name later. 
** */ PUBLIC char * HTRelative (CONST char * aName, CONST char * relatedName) { char * result = 0; CONST char *p = aName; CONST char *q = relatedName; CONST char * after_access = 0; CONST char * path = 0; CONST char * last_slash = 0; int slashes = 0; for(;*p; p++, q++) { /* Find extent of match */ if (*p!=*q) break; if (*p==':') after_access = p+1; if (*p=='/') { last_slash = p; slashes++; if (slashes==3) path=p; } } /* q, p point to the first non-matching character or zero */ if (!after_access) { /* Different access */ StrAllocCopy(result, aName); } else if (slashes<3){ /* Different nodes */ StrAllocCopy(result, after_access); } else { /* Some path in common */ int levels= 0; for(; *q && (*q!='#'); q++) if (*q=='/') levels++; result = (char *)malloc(3*levels + strlen(last_slash) + 1); if (result == NULL) outofmem(__FILE__, "HTRelative"); result[0]=0; for(;levels; levels--)strcat(result, "../"); strcat(result, last_slash+1); } if (URI_TRACE) TTYPrint(TDEST, "HTRelative.. `%s' expressed relative to `%s' is `%s'\n", aName, relatedName, result); return result; } /* HTCleanTelnetString() * Make sure that the given string doesn't contain characters that * could cause security holes, such as newlines in ftp, gopher, * news or telnet URLs; more specifically: allows everything between * ASCII 20-7E, and also A0-FE, inclusive. Also TAB ('\t') allowed! * * On entry, * str the string that is *modified* if necessary. The * string will be truncated at the first illegal * character that is encountered. * On exit, * returns YES, if the string was modified. * NO, otherwise. */ PUBLIC BOOL HTCleanTelnetString (char * str) { char * cur = str; if (!str) return NO; while (*cur) { int a = TOASCII((unsigned char) *cur); if (a != 0x9 && (a < 0x20 || (a > 0x7E && a < 0xA0) || a > 0xFE)) { if (URI_TRACE) TTYPrint(TDEST, "Illegal..... character in URL: \"%s\"\n",str); *cur = 0; if (URI_TRACE) TTYPrint(TDEST, "Truncated... \"%s\"\n",str); return YES; } cur++; } return NO; }