Annotation of Amaya/amaya/AHTURLTools.c, revision 1.15
1.7 cvs 1: /*
2: *
3: * (c) COPYRIGHT MIT and INRIA, 1996.
4: * Please first read the full copyright statement in file COPYRIGHT.
5: *
6: */
1.9 cvs 7:
1.10 cvs 8: /*
9: * AHTURLTools.c: contains all the functions for testing, manipulating,
10: * and normalizing URLs.
11: *
12: * Authors: J. Kahan, I. Vatton
13: *
14: */
1.7 cvs 15:
1.8 cvs 16: /* Amaya includes */
1.15 ! cvs 17: #define THOT_EXPORT extern
1.3 cvs 18: #include "amaya.h"
19:
1.8 cvs 20:
21: #include "init_f.h"
22: #include "AHTURLTools_f.h"
23:
24: /*----------------------------------------------------------------------
1.11 cvs 25: ExplodeURL
   Splits url, in place, into its protocol, host, directory and file
   components.
   The url buffer is modified: EOS characters are written over the
   separators, and *proto, *host, *dir and *file are set to point inside
   that buffer. A component that is not found is left NULL.
   If any of the five arguments is NULL the function returns at once
   without touching anything.
   When AMAYA_DEBUG is defined, the result is also printed on stderr.
1.8 cvs 26: ----------------------------------------------------------------------*/
27:
28: #ifdef __STDC__
29: void ExplodeURL (char *url, char **proto, char **host, char **dir, char **file)
30: #else
31: void ExplodeURL (url, proto, host, dir, file)
32: char *url;
33: char **proto;
34: char **host;
35: char **dir;
36: char **file;
37:
38: #endif
39: {
1.9 cvs 40: char *curr, *temp;
1.8 cvs 41:
/* refuse to work with a missing input or a missing output slot */
42: if ((url == NULL) || (proto == NULL) || (host == NULL) ||
43: (dir == NULL) || (file == NULL))
44: return;
45:
46: /* initialize every pointer */
47: *proto = *host = *dir = *file = NULL;
48:
49: /* skip any leading space */
50: while ((*url == SPACE) || (*url == TAB))
51: url++;
1.9 cvs 52: curr = url;
/* nothing left after the leading blanks: every output stays NULL */
53: if (*curr == 0)
1.8 cvs 54: goto finished;
55:
56: /* go to the end of the URL */
/* the URL ends at the first EOS, SPACE, backspace, carriage return or EOL */
/* NOTE(review): '\b' is backspace; TAB may have been intended, as in the */
/* leading-space skip above -- confirm */
1.9 cvs 57: while ((*curr != 0) && (*curr != SPACE) && (*curr != '\b') &&
58: (*curr != '\r') && (*curr != EOL))
59: curr++;
1.8 cvs 60:
61: /* mark the end of the chain */
/* after the decrement curr points to the last character of the URL; */
/* a URL made of a single character is not split any further */
1.9 cvs 62: *curr = EOS;
63: curr--;
64: if (curr <= url)
1.8 cvs 65: goto finished;
66:
67: /* search the next DIR_SEP indicating the beginning of the file name */
/* the scan goes backwards and starts one character before the last one; */
/* it stops on a DIR_SEP or once curr has moved before the start of url */
68: do
1.11 cvs 69: curr--;
1.9 cvs 70: while ((curr >= url) && (*curr != DIR_SEP));
1.11 cvs 71:
/* no DIR_SEP found: every output stays NULL */
1.9 cvs 72: if (curr < url)
1.8 cvs 73: goto finished;
1.9 cvs 74: *file = curr + 1;
1.8 cvs 75:
76: /* mark the end of the dir */
1.9 cvs 77: *curr = EOS;
78: curr--;
79: if (curr < url)
1.8 cvs 80: goto finished;
81:
82: /* search for the "/" indicating the host name start */
/* walk backwards until curr is on the first of two consecutive DIR_SEP, */
/* or until the start of the buffer is reached */
1.9 cvs 83: while ((curr > url) && ((*curr != DIR_SEP) || (*(curr + 1) != DIR_SEP)))
84: curr--;
1.8 cvs 85:
86: /* if we found it, separate the host name from the directory */
1.9 cvs 87: if ((*curr == DIR_SEP) && (*(curr + 1) == DIR_SEP))
1.8 cvs 88: {
/* the host starts right after the two DIR_SEP and is cut at the next */
/* DIR_SEP, where the directory begins; without such a DIR_SEP *dir */
/* stays NULL */
1.9 cvs 89: *host = temp = curr + 2;
1.8 cvs 90: while ((*temp != 0) && (*temp != DIR_SEP))
91: temp++;
92: if (*temp == DIR_SEP)
93: {
94: *temp = EOS;
95: *dir = temp + 1;
96: }
97: }
98: else
/* no double DIR_SEP: the loop above stopped at the start of the buffer, */
/* so everything before the file name is taken as the directory */
1.11 cvs 99: *dir = curr;
100:
/* nothing precedes curr, so there is no room for a protocol */
1.9 cvs 101: if (curr <= url)
1.8 cvs 102: goto finished;
103:
104: /* mark the end of the proto */
/* here curr is on the first DIR_SEP of the pair; it is overwritten */
1.9 cvs 105: *curr = EOS;
106: curr--;
107: if (curr < url)
1.8 cvs 108: goto finished;
109:
/* a protocol is only recognised when a ':' immediately precedes the */
/* double DIR_SEP; otherwise *proto stays NULL */
1.9 cvs 110: if (*curr == ':')
1.8 cvs 111: {
1.9 cvs 112: *curr = EOS;
113: curr--;
1.8 cvs 114: }
115: else
116: goto finished;
1.11 cvs 117:
1.9 cvs 118: if (curr < url)
1.8 cvs 119: goto finished;
/* walk backwards over alphabetic characters; *proto is set to where the */
/* walk stops: the start of the buffer or the first non-alphabetic */
/* character met */
/* NOTE(review): in the second case *proto points at the non-alphabetic */
/* character itself -- confirm whether curr + 1 was intended */
1.9 cvs 120: while ((curr > url) && (isalpha (*curr)))
121: curr--;
122: *proto = curr;
1.8 cvs 123:
124: finished:;
125:
/* debug trace of the components that were found */
126: #ifdef AMAYA_DEBUG
127: fprintf (stderr, "ExplodeURL(%s)\n\t", url);
128: if (*proto)
129: fprintf (stderr, "proto : %s, ", *proto);
130: if (*host)
131: fprintf (stderr, "host : %s, ", *host);
132: if (*dir)
133: fprintf (stderr, "dir : %s, ", *dir);
134: if (*file)
135: fprintf (stderr, "file : %s ", *file);
136: fprintf (stderr, "\n");
137: #endif
138:
139: }
1.3 cvs 140:
1.4 cvs 141: /*----------------------------------------------------------------------
1.9 cvs 142: IsHTMLName
143: returns TRUE if path points to an HTML resource.
1.4 cvs 144: ----------------------------------------------------------------------*/
1.3 cvs 145: #ifdef __STDC__
146: boolean IsHTMLName (char *path)
147: #else /* __STDC__ */
148: boolean IsHTMLName (path)
149: char *path;
150: #endif /* __STDC__ */
151: {
1.5 cvs 152: char temppath[MAX_LENGTH];
153: char suffix[MAX_LENGTH];
154: char nsuffix[MAX_LENGTH];
155: int i;
156:
157: if (!path)
1.13 cvs 158: return (FALSE);
1.5 cvs 159:
160: strcpy (temppath, path);
161: ExtractSuffix (temppath, suffix);
162:
163: /* Normalize the suffix */
164: i = 0;
165: while (suffix[i] != EOS)
1.13 cvs 166: {
167: nsuffix[i] = TOLOWER (suffix[i]);
168: i++;
169: }
1.5 cvs 170: nsuffix[i] = EOS;
171: if ((strcmp (nsuffix, "html")) &&
172: (strcmp (nsuffix, "htm")) &&
173: (strcmp (nsuffix, "shtml")))
1.13 cvs 174: return (FALSE);
175: else if ((!strcmp (nsuffix, "gz")) ||
176: (!strcmp (nsuffix, "Z")))
177: {
178: /* take in account compressed files */
179: ExtractSuffix (temppath, suffix);
180: /* Normalize the suffix */
181: i = 0;
182: while (suffix[i] != EOS)
183: {
184: nsuffix[i] = TOLOWER (suffix[i]);
185: i++;
186: }
187: nsuffix[i] = EOS;
188: if ((strcmp (nsuffix, "html")) &&
189: (strcmp (nsuffix, "htm")) &&
190: (strcmp (nsuffix, "shtml")))
191: return (FALSE);
192: else
193: return (TRUE);
194: }
195: else
196: return (TRUE);
1.3 cvs 197: }
198:
1.4 cvs 199: /*----------------------------------------------------------------------
1.9 cvs 200: IsImageName
201: returns TRUE if path points to an image resource.
1.4 cvs 202: ----------------------------------------------------------------------*/
1.3 cvs 203: #ifdef __STDC__
204: boolean IsImageName (char *path)
205: #else /* __STDC__ */
206: boolean IsImageName (path)
207: char *path;
208: #endif /* __STDC__ */
209: {
1.5 cvs 210: char temppath[MAX_LENGTH];
211: char suffix[MAX_LENGTH];
212: char nsuffix[MAX_LENGTH];
213: int i;
214:
215: if (!path)
1.13 cvs 216: return (FALSE);
1.5 cvs 217:
218: strcpy (temppath, path);
219: ExtractSuffix (temppath, suffix);
220:
221: /* Normalize the suffix */
222: i = 0;
223: while (suffix[i] != EOS)
1.13 cvs 224: {
225: nsuffix[i] = TOLOWER (suffix[i]);
226: i++;
227: }
1.5 cvs 228: nsuffix[i] = EOS;
229: if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) &&
230: (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) &&
231: (strcmp (nsuffix, "png")) && (strcmp (nsuffix, "au")))
1.13 cvs 232: return (FALSE);
233: return (TRUE);
1.3 cvs 234: }
235:
1.4 cvs 236: /*----------------------------------------------------------------------
1.9 cvs 237: IsTextName
1.4 cvs 238: ----------------------------------------------------------------------*/
1.3 cvs 239: #ifdef __STDC__
240: boolean IsTextName (char *path)
241: #else /* __STDC__ */
242: boolean IsTextName (path)
243: char *path;
244:
245: #endif /* __STDC__ */
246: {
1.5 cvs 247: char temppath[MAX_LENGTH];
248: char suffix[MAX_LENGTH];
249: char nsuffix[MAX_LENGTH];
250: int i;
251:
252: if (!path)
1.13 cvs 253: return (FALSE);
1.5 cvs 254:
255: strcpy (temppath, path);
256: ExtractSuffix (temppath, suffix);
257:
258: /* Normalize the suffix */
259: i = 0;
260: while (suffix[i] != EOS)
261: {
262: nsuffix[i] = TOLOWER (suffix[i]);
263: i++;
264: }
265: nsuffix[i] = EOS;
266:
267: if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) &&
268: (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) &&
269: (strcmp (nsuffix, "pdf")) && (strcmp (nsuffix, "png")) &&
270: (strcmp (nsuffix, "tgz")) && (strcmp (nsuffix, "xpg")) &&
271: (strcmp (nsuffix, "xpd")) && (strcmp (nsuffix, "ps")) &&
272: (strcmp (nsuffix, "au")))
1.13 cvs 273: return (TRUE);
274: else if ((!strcmp (nsuffix, "gz")) || (!strcmp (nsuffix, "Z")))
275: {
276: /* take in account compressed files */
277: ExtractSuffix (temppath, suffix);
278: /* Normalize the suffix */
279: i = 0;
280: while (suffix[i] != EOS)
281: {
282: nsuffix[i] = TOLOWER (suffix[i]);
283: i++;
284: }
285: nsuffix[i] = EOS;
286: if ((!strcmp (nsuffix, "html")) ||
287: (!strcmp (nsuffix, "htm")) ||
288: (!strcmp (nsuffix, "shtml")))
289: return (TRUE);
290: else
291: return (FALSE);
292: }
293: else
294: return (FALSE);
1.3 cvs 295: }
296:
1.4 cvs 297: /*----------------------------------------------------------------------
1.9 cvs 298: IsHTTPPath
299: returns TRUE if path is in fact an http URL.
1.4 cvs 300: ----------------------------------------------------------------------*/
1.3 cvs 301: #ifdef __STDC__
302: boolean IsHTTPPath (char *path)
303: #else /* __STDC__ */
304: boolean IsHTTPPath (path)
305: char *path;
306: #endif /* __STDC__ */
307: {
1.5 cvs 308: if (!path)
309: return FALSE;
1.3 cvs 310:
1.5 cvs 311: if (strncmp (path, "http:", 5) != 0)
312: return FALSE;
313: return TRUE;
1.3 cvs 314: }
315:
1.4 cvs 316: /*----------------------------------------------------------------------
1.9 cvs 317: IsWithParameters
318: returns TRUE if url has a concatenated query string.
1.4 cvs 319: ----------------------------------------------------------------------*/
1.3 cvs 320: #ifdef __STDC__
1.9 cvs 321: boolean IsWithParameters (char *url)
1.3 cvs 322: #else /* __STDC__ */
1.9 cvs 323: boolean IsWithParameters (url)
324: char *url;
1.3 cvs 325: #endif /* __STDC__ */
326: {
1.5 cvs 327: int i;
1.3 cvs 328:
1.9 cvs 329: if ((!url) || (url[0] == EOS))
1.5 cvs 330: return FALSE;
1.3 cvs 331:
1.9 cvs 332: i = strlen (url) - 1;
333: while (i > 0 && url[i--] != '?')
1.5 cvs 334: if (i < 0)
335: return FALSE;
1.3 cvs 336:
1.5 cvs 337: /* There is a parameter */
338: return TRUE;
1.3 cvs 339: }
340:
1.4 cvs 341: /*----------------------------------------------------------------------
1.9 cvs 342: IsW3Path
343: returns TRUE if path is in fact a URL.
1.4 cvs 344: ----------------------------------------------------------------------*/
1.3 cvs 345: #ifdef __STDC__
346: boolean IsW3Path (char *path)
347: #else /* __STDC__ */
348: boolean IsW3Path (path)
349: char *path;
350: #endif /* __STDC__ */
351: {
1.5 cvs 352: if ((strncmp (path, "http:", 5)) && (strncmp (path, "ftp:", 4)) &&
353: (strncmp (path, "telnet:", 7)) && (strncmp (path, "wais:", 5)) &&
354: (strncmp (path, "news:", 5)) && (strncmp (path, "gopher:", 7)) &&
355: (strncmp (path, "mailto:", 7)) && (strncmp (path, "archie:", 7)))
356: return FALSE;
357: return TRUE;
1.3 cvs 358: }
359:
1.4 cvs 360: /*----------------------------------------------------------------------
1.9 cvs 361: IsValidProtocol
362: returns true if the url protocol is supported by Amaya.
1.4 cvs 363: ----------------------------------------------------------------------*/
1.3 cvs 364: #ifdef __STDC__
1.9 cvs 365: boolean IsValidProtocol (char *url)
1.3 cvs 366: #else /* __STDC__ */
1.9 cvs 367: boolean IsValidProtocol (url)
368: char *url;
1.3 cvs 369: #endif /* __STDC__ */
370: {
1.9 cvs 371: if (!strncmp (url, "http:", 5)
1.3 cvs 372: /***|| !strncmp (path, "ftp:", 4)
1.5 cvs 373: || !strncmp (path, "news:", 5)***/ )
1.8 cvs 374: return (TRUE);
1.5 cvs 375: else
1.8 cvs 376: return (FALSE);
1.3 cvs 377: }
378:
1.4 cvs 379: /*----------------------------------------------------------------------
1.9 cvs 380: NormalizeURL
381: normalizes orgName according to a base associated with doc, and
382: following the standard URL format rules.
383: The function returns the new complete and normalized URL
1.12 cvs 384: or file name path (newName) and the name of the document (docName).
1.9 cvs 385: N.B. If the function can't find out what's the docName, it assigns
386: the name "noname.html".
1.4 cvs 387: ----------------------------------------------------------------------*/
1.3 cvs 388: #ifdef __STDC__
389: void NormalizeURL (char *orgName, Document doc, char *newName, char *docName)
390: #else /* __STDC__ */
391: void NormalizeURL (orgName, doc, newName, docName)
392: char *orgName;
393: Document doc;
394: char *newName;
395: char *docName;
396: #endif /* __STDC__ */
397: {
1.5 cvs 398: char basename[MAX_LENGTH];
399: char tempname[MAX_LENGTH];
400: int i;
401: char *ptr;
402: char *basename_ptr;
403: int basename_flag;
404: Element el;
405: ElementType elType;
406: AttributeType attrType;
407: Attribute attrHREF;
408: int length;
409:
410: /* Fix up orgName, by erasing leading and trailing white space */
411: if (!newName || !docName)
412: return;
413: ptr = orgName;
414: while (*ptr == ' ' && *ptr++ != EOS) ;
415: strcpy (tempname, ptr);
416: ptr = strchr (tempname, ' ');
417: if (ptr)
418: *ptr = EOS;
419:
1.14 cvs 420: if (IsW3Path (tempname))
421: /* the name is complete */
422: strcpy (newName, tempname);
1.5 cvs 423: else
424: {
1.14 cvs 425: if (doc)
426: {
427: /* take into account the BASE element. */
428: length = MAX_LENGTH;
429: /* get the root element */
430: el = TtaGetMainRoot (doc);
431:
432: /* search the BASE element */
433: elType.ElSSchema = TtaGetDocumentSSchema (doc);
434: elType.ElTypeNum = HTML_EL_BASE;
435: el = TtaSearchTypedElement (elType, SearchInTree, el);
436: if (el)
437: {
438: /*
439: ** The document has a BASE element
440: ** Get the HREF attribute of the BASE Element
441: */
442: attrType.AttrSSchema = elType.ElSSchema;
443: attrType.AttrTypeNum = HTML_ATTR_HREF_;
444: attrHREF = TtaGetAttribute (el, attrType);
445: if (attrHREF)
446: {
447: /* Use the base path of the document */
448: TtaGiveTextAttributeValue (attrHREF, basename, &length);
449: /* base and orgName have to be separated by a DIR_SEP */
450: if (basename[strlen (basename) - 1] != DIR_SEP)
451: {
452: if (IsHTMLName (basename))
453: {
454: /* remove the document name from basename */
455: length = strlen (basename) - 1;
456: while (basename[length] != DIR_SEP)
457: basename[length--] = EOS;
458: }
459: else if (tempname[0] != DIR_SEP)
460: strcat (basename, DIR_STR);
461: }
462: }
463: else
464: basename[0] = EOS;
465: }
466: else
467: basename[0] = EOS;
468: }
469: else
470: basename[0] = EOS;
471:
472: if (basename[0] == EOS)
473: {
474: /* there is no BASE element in that document. */
475: if (DocumentURLs[(int) doc])
476: {
477: basename_ptr = HTParse (DocumentURLs[(int) doc], "", PARSE_ALL);
478: basename_flag = TRUE;
479: }
480: else
481: {
482: basename_ptr = "";
483: basename_flag = FALSE;
484: }
485: }
486: else
487: {
488: basename_ptr = HTParse (basename, "", PARSE_ALL);
489: basename_flag = TRUE;
490: }
491:
492: ptr = HTParse (tempname, basename_ptr, PARSE_ALL);
493: if (basename_flag)
494: HT_FREE (basename_ptr);
495: if (ptr)
496: {
497: ptr = HTSimplify (&ptr);
498: strcpy (newName, ptr);
499: HT_FREE (ptr);
500: }
501: else
502: newName[0] = EOS;
1.5 cvs 503: }
504:
505: i = strlen (newName) - 1;
506: if (i > 0)
507: {
1.14 cvs 508: /* search now the document name */
509: ptr = strrchr (newName, DIR_SEP);
510: if (ptr)
511: ptr++;
512: if (ptr && *ptr != EOS)
513: strcpy (docName, ptr);
514: else
515: /* the docname was not comprised inside the URL, so let's */
516: /* assign a "noname.html" name :) */
517: strcpy (docName, "noname.html");
518:
519: /* remove DIR_SEP at the end of complete path */
1.5 cvs 520: if (newName[i] == DIR_SEP)
521: newName[i] = EOS;
522: }
1.3 cvs 523: }
524:
1.4 cvs 525: /*----------------------------------------------------------------------
1.9 cvs 526: IsSameHost
1.4 cvs 527: ----------------------------------------------------------------------*/
1.3 cvs 528: #ifdef __STDC__
529: boolean IsSameHost (char *url1, char *url2)
530: #else /* __STDC__ */
531: boolean IsSameHost (url1, url2)
532: char *path;
533: #endif /* __STDC__ */
534: {
1.5 cvs 535: char *basename_ptr1, *basename_ptr2;
536: boolean result;
1.3 cvs 537:
1.5 cvs 538: basename_ptr1 = HTParse (url1, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
539: basename_ptr2 = HTParse (url2, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
1.3 cvs 540:
1.5 cvs 541: if (strcmp (basename_ptr1, basename_ptr2))
1.8 cvs 542: result = FALSE;
1.5 cvs 543: else
1.8 cvs 544: result = TRUE;
1.3 cvs 545:
1.5 cvs 546: HT_FREE (basename_ptr1);
547: HT_FREE (basename_ptr2);
1.3 cvs 548:
1.5 cvs 549: return (result);
1.3 cvs 550: }
551:
552:
1.4 cvs 553: /*----------------------------------------------------------------------
1.9 cvs 554: AHTMakeRelativeURL
555: converts url into a relative url to base_url.
556: If succesful, returns the new URL, otherwise, it returns NULL.
557: The caller has to free the new URL.
1.4 cvs 558: ----------------------------------------------------------------------*/
1.3 cvs 559: #ifdef __STDC__
1.5 cvs 560: char *AHTMakeRelativeName (char *url, char *base_url)
1.3 cvs 561: #else /* __STDC__ */
1.5 cvs 562: char *AHTMakeRelativeName (url, base_url)
563: char url;
564: char base_url;
565:
1.3 cvs 566: #endif /* __STDC__ */
567: {
1.5 cvs 568: char *base_ptr, *url_ptr;
569: char *result;
570:
571: /* verify if we are in the same host */
1.3 cvs 572:
1.5 cvs 573: base_ptr = HTParse (base_url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
574: url_ptr = HTParse (url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
1.3 cvs 575:
1.5 cvs 576: if (!strcmp (base_ptr, url_ptr))
577: {
578: HT_FREE (base_ptr);
579: HT_FREE (url_ptr);
1.3 cvs 580:
1.5 cvs 581: /* Normalize the URLs */
1.3 cvs 582:
1.5 cvs 583: base_ptr = HTParse (base_url, "", PARSE_ALL);
584: url_ptr = HTParse (url, "", PARSE_ALL);
1.3 cvs 585:
1.5 cvs 586: /* Use libwww to make relative name */
1.3 cvs 587:
1.5 cvs 588: result = HTRelative (url_ptr, base_ptr);
589: HT_FREE (base_ptr);
590: HT_FREE (url_ptr);
591: }
592: else
593: result = (char *) NULL;
1.3 cvs 594:
1.5 cvs 595: return (result);
1.3 cvs 596: }
1.9 cvs 597:
598:
599:
Webmaster