/* * * (c) COPYRIGHT MIT and INRIA, 1996. * Please first read the full copyright statement in file COPYRIGHT. * */ /* * AHTURLTools.c: contains all the functions for testing, manipulating, * and normalizing URLs. * * Authors: J. Kahan, I. Vatton * */ /* Amaya includes */ #define THOT_EXPORT extern #include "amaya.h" #include "init_f.h" #include "AHTURLTools_f.h" /*---------------------------------------------------------------------- ExplodeURL ----------------------------------------------------------------------*/ #ifdef __STDC__ void ExplodeURL (char *url, char **proto, char **host, char **dir, char **file) #else void ExplodeURL (url, proto, host, dir, file) char *url; char **proto; char **host; char **dir; char **file; #endif { char *curr, *temp; if ((url == NULL) || (proto == NULL) || (host == NULL) || (dir == NULL) || (file == NULL)) return; /* initialize every pointer */ *proto = *host = *dir = *file = NULL; /* skip any leading space */ while ((*url == SPACE) || (*url == TAB)) url++; curr = url; if (*curr == 0) goto finished; /* go to the end of the URL */ while ((*curr != 0) && (*curr != SPACE) && (*curr != '\b') && (*curr != '\r') && (*curr != EOL)) curr++; /* mark the end of the chain */ *curr = EOS; curr--; if (curr <= url) goto finished; /* search the next DIR_SEP indicating the beginning of the file name */ do curr--; while ((curr >= url) && (*curr != DIR_SEP)); if (curr < url) goto finished; *file = curr + 1; /* mark the end of the dir */ *curr = EOS; curr--; if (curr < url) goto finished; /* search for the "/" indicating the host name start */ while ((curr > url) && ((*curr != DIR_SEP) || (*(curr + 1) != DIR_SEP))) curr--; /* if we found it, separate the host name from the directory */ if ((*curr == DIR_SEP) && (*(curr + 1) == DIR_SEP)) { *host = temp = curr + 2; while ((*temp != 0) && (*temp != DIR_SEP)) temp++; if (*temp == DIR_SEP) { *temp = EOS; *dir = temp + 1; } } else *dir = curr; if (curr <= url) goto finished; /* mark the end of the proto */ *curr = EOS; curr--; if (curr < url) goto finished; if (*curr == ':') { *curr = EOS; curr--; } else goto finished; if (curr < url) goto finished; while ((curr > url) && (isalpha (*curr))) curr--; *proto = curr; finished:; #ifdef AMAYA_DEBUG fprintf (stderr, "ExplodeURL(%s)\n\t", url); if (*proto) fprintf (stderr, "proto : %s, ", *proto); if (*host) fprintf (stderr, "host : %s, ", *host); if (*dir) fprintf (stderr, "dir : %s, ", *dir); if (*file) fprintf (stderr, "file : %s ", *file); fprintf (stderr, "\n"); #endif } /*---------------------------------------------------------------------- IsHTMLName returns TRUE if path points to an HTML resource. ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsHTMLName (char *path) #else /* __STDC__ */ boolean IsHTMLName (path) char *path; #endif /* __STDC__ */ { char temppath[MAX_LENGTH]; char suffix[MAX_LENGTH]; char nsuffix[MAX_LENGTH]; int i; if (!path) return (FALSE); strcpy (temppath, path); ExtractSuffix (temppath, suffix); /* Normalize the suffix */ i = 0; while (suffix[i] != EOS) { nsuffix[i] = TOLOWER (suffix[i]); i++; } nsuffix[i] = EOS; if ((strcmp (nsuffix, "html")) && (strcmp (nsuffix, "htm")) && (strcmp (nsuffix, "shtml"))) return (FALSE); else if ((!strcmp (nsuffix, "gz")) || (!strcmp (nsuffix, "z"))) { /* take in account compressed files */ ExtractSuffix (temppath, suffix); /* Normalize the suffix */ i = 0; while (suffix[i] != EOS) { nsuffix[i] = TOLOWER (suffix[i]); i++; } nsuffix[i] = EOS; if ((strcmp (nsuffix, "html")) && (strcmp (nsuffix, "htm")) && (strcmp (nsuffix, "shtml"))) return (FALSE); else return (TRUE); } else return (TRUE); } /*---------------------------------------------------------------------- IsImageName returns TRUE if path points to an image resource. ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsImageName (char *path) #else /* __STDC__ */ boolean IsImageName (path) char *path; #endif /* __STDC__ */ { char temppath[MAX_LENGTH]; char suffix[MAX_LENGTH]; char nsuffix[MAX_LENGTH]; int i; if (!path) return (FALSE); strcpy (temppath, path); ExtractSuffix (temppath, suffix); /* Normalize the suffix */ i = 0; while (suffix[i] != EOS) { nsuffix[i] = TOLOWER (suffix[i]); i++; } nsuffix[i] = EOS; if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) && (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) && (strcmp (nsuffix, "png")) && (strcmp (nsuffix, "au"))) return (FALSE); return (TRUE); } /*---------------------------------------------------------------------- IsTextName ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsTextName (char *path) #else /* __STDC__ */ boolean IsTextName (path) char *path; #endif /* __STDC__ */ { char temppath[MAX_LENGTH]; char suffix[MAX_LENGTH]; char nsuffix[MAX_LENGTH]; int i; if (!path) return (FALSE); strcpy (temppath, path); ExtractSuffix (temppath, suffix); /* Normalize the suffix */ i = 0; while (suffix[i] != EOS) { nsuffix[i] = TOLOWER (suffix[i]); i++; } nsuffix[i] = EOS; if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) && (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) && (strcmp (nsuffix, "pdf")) && (strcmp (nsuffix, "png")) && (strcmp (nsuffix, "tgz")) && (strcmp (nsuffix, "xpg")) && (strcmp (nsuffix, "xpd")) && (strcmp (nsuffix, "ps")) && (strcmp (nsuffix, "au"))) return (TRUE); else if ((!strcmp (nsuffix, "gz")) || (!strcmp (nsuffix, "z"))) { /* take in account compressed files */ ExtractSuffix (temppath, suffix); /* Normalize the suffix */ i = 0; while (suffix[i] != EOS) { nsuffix[i] = TOLOWER (suffix[i]); i++; } nsuffix[i] = EOS; if ((!strcmp (nsuffix, "html")) || (!strcmp (nsuffix, "htm")) || (!strcmp (nsuffix, "shtml"))) return (TRUE); else return (FALSE); } else return (FALSE); } /*---------------------------------------------------------------------- IsHTTPPath returns TRUE if path is in fact an http URL. ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsHTTPPath (char *path) #else /* __STDC__ */ boolean IsHTTPPath (path) char *path; #endif /* __STDC__ */ { if (!path) return FALSE; if (strncmp (path, "http:", 5) != 0) return FALSE; return TRUE; } /*---------------------------------------------------------------------- IsWithParameters returns TRUE if url has a concatenated query string. ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsWithParameters (char *url) #else /* __STDC__ */ boolean IsWithParameters (url) char *url; #endif /* __STDC__ */ { int i; if ((!url) || (url[0] == EOS)) return FALSE; i = strlen (url) - 1; while (i > 0 && url[i--] != '?') if (i < 0) return FALSE; /* There is a parameter */ return TRUE; } /*---------------------------------------------------------------------- IsW3Path returns TRUE if path is in fact a URL. ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsW3Path (char *path) #else /* __STDC__ */ boolean IsW3Path (path) char *path; #endif /* __STDC__ */ { if ((strncmp (path, "http:", 5)) && (strncmp (path, "ftp:", 4)) && (strncmp (path, "telnet:", 7)) && (strncmp (path, "wais:", 5)) && (strncmp (path, "news:", 5)) && (strncmp (path, "gopher:", 7)) && (strncmp (path, "mailto:", 7)) && (strncmp (path, "archie:", 7))) return FALSE; return TRUE; } /*---------------------------------------------------------------------- IsValidProtocol returns true if the url protocol is supported by Amaya. ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsValidProtocol (char *url) #else /* __STDC__ */ boolean IsValidProtocol (url) char *url; #endif /* __STDC__ */ { if (!strncmp (url, "http:", 5) /***|| !strncmp (path, "ftp:", 4) || !strncmp (path, "news:", 5)***/ ) return (TRUE); else return (FALSE); } /*---------------------------------------------------------------------- NormalizeURL normalizes orgName according to a base associated with doc, and following the standard URL format rules. The function returns the new complete and normalized URL or file name path (newName) and the name of the document (docName). N.B. If the function can't find out what's the docName, it assigns the name "noname.html". ----------------------------------------------------------------------*/ #ifdef __STDC__ void NormalizeURL (char *orgName, Document doc, char *newName, char *docName) #else /* __STDC__ */ void NormalizeURL (orgName, doc, newName, docName) char *orgName; Document doc; char *newName; char *docName; #endif /* __STDC__ */ { char basename[MAX_LENGTH]; char tempname[MAX_LENGTH]; int i; char *ptr; char *basename_ptr; int basename_flag; Element el; ElementType elType; AttributeType attrType; Attribute attrHREF; int length; /* Fix up orgName, by erasing leading and trailing white space */ if (!newName || !docName) return; ptr = orgName; while (*ptr == ' ' && *ptr++ != EOS) ; strcpy (tempname, ptr); ptr = strchr (tempname, ' '); if (ptr) *ptr = EOS; if (IsW3Path (tempname) || doc == 0) /* the name is complete */ strcpy (newName, tempname); else { /* take into account the BASE element. */ length = MAX_LENGTH; /* get the root element */ el = TtaGetMainRoot (doc); /* search the BASE element */ elType.ElSSchema = TtaGetDocumentSSchema (doc); elType.ElTypeNum = HTML_EL_BASE; el = TtaSearchTypedElement (elType, SearchInTree, el); if (el) { /* ** The document has a BASE element ** Get the HREF attribute of the BASE Element */ attrType.AttrSSchema = elType.ElSSchema; attrType.AttrTypeNum = HTML_ATTR_HREF_; attrHREF = TtaGetAttribute (el, attrType); if (attrHREF) { /* Use the base path of the document */ TtaGiveTextAttributeValue (attrHREF, basename, &length); /* base and orgName have to be separated by a DIR_SEP */ if (basename[strlen (basename) - 1] != DIR_SEP) { if (IsHTMLName (basename)) { /* remove the document name from basename */ length = strlen (basename) - 1; while (basename[length] != DIR_SEP) basename[length--] = EOS; } else if (tempname[0] != DIR_SEP) strcat (basename, DIR_STR); } } else basename[0] = EOS; } else basename[0] = EOS; if (basename[0] == EOS) { /* there is no BASE element in that document. */ if (DocumentURLs[(int) doc]) { basename_ptr = HTParse (DocumentURLs[(int) doc], "", PARSE_ALL); basename_flag = TRUE; } else { basename_ptr = ""; basename_flag = FALSE; } } else { basename_ptr = HTParse (basename, "", PARSE_ALL); basename_flag = TRUE; } if (tempname[0] == '/') ptr = HTParse (tempname, basename_ptr, PARSE_ACCESS | PARSE_PUNCTUATION | PARSE_HOST); else ptr = HTParse (tempname, basename_ptr, PARSE_ALL); if (basename_flag) HT_FREE (basename_ptr); if (ptr) { ptr = HTSimplify (&ptr); strcpy (newName, ptr); HT_FREE (ptr); } else newName[0] = EOS; } i = strlen (newName) - 1; if (i > 0) { /* search now the document name */ ptr = strrchr (newName, DIR_SEP); if (ptr) ptr++; if (ptr && *ptr != EOS) strcpy (docName, ptr); else /* the docname was not comprised inside the URL, so let's */ /* assign a "noname.html" name :) */ strcpy (docName, "noname.html"); /* remove DIR_SEP at the end of complete path */ if (newName[i] == DIR_SEP) newName[i] = EOS; } } /*---------------------------------------------------------------------- IsSameHost ----------------------------------------------------------------------*/ #ifdef __STDC__ boolean IsSameHost (char *url1, char *url2) #else /* __STDC__ */ boolean IsSameHost (url1, url2) char *path; #endif /* __STDC__ */ { char *basename_ptr1, *basename_ptr2; boolean result; basename_ptr1 = HTParse (url1, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION); basename_ptr2 = HTParse (url2, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION); if (strcmp (basename_ptr1, basename_ptr2)) result = FALSE; else result = TRUE; HT_FREE (basename_ptr1); HT_FREE (basename_ptr2); return (result); } /*---------------------------------------------------------------------- AHTMakeRelativeURL converts url into a relative url to base_url. If succesful, returns the new URL, otherwise, it returns NULL. The caller has to free the new URL. ----------------------------------------------------------------------*/ #ifdef __STDC__ char *AHTMakeRelativeName (char *url, char *base_url) #else /* __STDC__ */ char *AHTMakeRelativeName (url, base_url) char url; char base_url; #endif /* __STDC__ */ { char *base_ptr, *url_ptr; char *result; /* verify if we are in the same host */ base_ptr = HTParse (base_url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION); url_ptr = HTParse (url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION); if (!strcmp (base_ptr, url_ptr)) { HT_FREE (base_ptr); HT_FREE (url_ptr); /* Normalize the URLs */ base_ptr = HTParse (base_url, "", PARSE_ALL); url_ptr = HTParse (url, "", PARSE_ALL); /* Use libwww to make relative name */ result = HTRelative (url_ptr, base_ptr); HT_FREE (base_ptr); HT_FREE (url_ptr); } else result = (char *) NULL; return (result); }