Annotation of Amaya/amaya/AHTURLTools.c, revision 1.10
1.7 cvs 1: /*
2: *
3: * (c) COPYRIGHT MIT and INRIA, 1996.
4: * Please first read the full copyright statement in file COPYRIGHT.
5: *
6: */
1.9 cvs 7:
1.10 ! cvs 8: /*
! 9: * AHTURLTools.c: contains all the functions for testing, manipulating,
! 10: * and normalizing URLs.
! 11: *
! 12: * Authors: J. Kahan, I. Vatton
! 13: *
! 14: */
1.7 cvs 15:
1.8 cvs 16: /* Amaya includes */
17: #define EXPORT extern
1.3 cvs 18: #include "amaya.h"
19:
1.8 cvs 20:
21: #include "init_f.h"
22: #include "AHTURLTools_f.h"
23:
24: /*----------------------------------------------------------------------
1.9 cvs 25: ExplodeURL
1.8 cvs 26: ----------------------------------------------------------------------*/
27:
28: #ifdef __STDC__
29: void ExplodeURL (char *url, char **proto, char **host, char **dir, char **file)
30: #else
31: void ExplodeURL (url, proto, host, dir, file)
32: char *url;
33: char **proto;
34: char **host;
35: char **dir;
36: char **file;
37:
38: #endif
39: {
1.9 cvs 40: char *curr, *temp;
1.8 cvs 41:
42: if ((url == NULL) || (proto == NULL) || (host == NULL) ||
43: (dir == NULL) || (file == NULL))
44: return;
45:
46: /* initialize every pointer */
47: *proto = *host = *dir = *file = NULL;
48:
49: /* skip any leading space */
50: while ((*url == SPACE) || (*url == TAB))
51: url++;
1.9 cvs 52: curr = url;
53: if (*curr == 0)
1.8 cvs 54: goto finished;
55:
56: /* go to the end of the URL */
1.9 cvs 57: while ((*curr != 0) && (*curr != SPACE) && (*curr != '\b') &&
58: (*curr != '\r') && (*curr != EOL))
59: curr++;
1.8 cvs 60:
61: /* mark the end of the chain */
1.9 cvs 62: *curr = EOS;
63: curr--;
64: if (curr <= url)
1.8 cvs 65: goto finished;
66:
67: /* search the next DIR_SEP indicating the beginning of the file name */
68: do
69: {
1.9 cvs 70: curr--;
1.8 cvs 71: }
1.9 cvs 72: while ((curr >= url) && (*curr != DIR_SEP));
73: if (curr < url)
1.8 cvs 74: goto finished;
1.9 cvs 75: *file = curr + 1;
1.8 cvs 76:
77: /* mark the end of the dir */
1.9 cvs 78: *curr = EOS;
79: curr--;
80: if (curr < url)
1.8 cvs 81: goto finished;
82:
83: /* search for the "/" indicating the host name start */
1.9 cvs 84: while ((curr > url) && ((*curr != DIR_SEP) || (*(curr + 1) != DIR_SEP)))
85: curr--;
1.8 cvs 86:
87: /* if we found it, separate the host name from the directory */
1.9 cvs 88: if ((*curr == DIR_SEP) && (*(curr + 1) == DIR_SEP))
1.8 cvs 89: {
1.9 cvs 90: *host = temp = curr + 2;
1.8 cvs 91: while ((*temp != 0) && (*temp != DIR_SEP))
92: temp++;
93: if (*temp == DIR_SEP)
94: {
95: *temp = EOS;
96: *dir = temp + 1;
97: }
98: }
99: else
100: {
1.9 cvs 101: *dir = curr;
1.8 cvs 102: }
1.9 cvs 103: if (curr <= url)
1.8 cvs 104: goto finished;
105:
106: /* mark the end of the proto */
1.9 cvs 107: *curr = EOS;
108: curr--;
109: if (curr < url)
1.8 cvs 110: goto finished;
111:
1.9 cvs 112: if (*curr == ':')
1.8 cvs 113: {
1.9 cvs 114: *curr = EOS;
115: curr--;
1.8 cvs 116: }
117: else
118: goto finished;
1.9 cvs 119: if (curr < url)
1.8 cvs 120: goto finished;
1.9 cvs 121: while ((curr > url) && (isalpha (*curr)))
122: curr--;
123: *proto = curr;
1.8 cvs 124:
125: finished:;
126:
127: #ifdef AMAYA_DEBUG
128: fprintf (stderr, "ExplodeURL(%s)\n\t", url);
129: if (*proto)
130: fprintf (stderr, "proto : %s, ", *proto);
131: if (*host)
132: fprintf (stderr, "host : %s, ", *host);
133: if (*dir)
134: fprintf (stderr, "dir : %s, ", *dir);
135: if (*file)
136: fprintf (stderr, "file : %s ", *file);
137: fprintf (stderr, "\n");
138: #endif
139:
140: }
1.3 cvs 141:
1.4 cvs 142: /*----------------------------------------------------------------------
1.9 cvs 143: IsHTMLName
144: returns TRUE if path points to an HTML resource.
1.4 cvs 145: ----------------------------------------------------------------------*/
1.3 cvs 146:
147: #ifdef __STDC__
148: boolean IsHTMLName (char *path)
149: #else /* __STDC__ */
150: boolean IsHTMLName (path)
151: char *path;
152:
153: #endif /* __STDC__ */
154: {
1.5 cvs 155: char temppath[MAX_LENGTH];
156: char suffix[MAX_LENGTH];
157: char nsuffix[MAX_LENGTH];
158: int i;
159:
160: if (!path)
161: return FALSE;
162:
163: strcpy (temppath, path);
164: ExtractSuffix (temppath, suffix);
165:
166: /* Normalize the suffix */
167: i = 0;
168: while (suffix[i] != EOS)
169: nsuffix[i] = TOLOWER (suffix[i++]);
170: nsuffix[i] = EOS;
171: if ((strcmp (nsuffix, "html")) &&
172: (strcmp (nsuffix, "htm")) &&
173: (strcmp (nsuffix, "shtml")))
174: return FALSE;
175: return TRUE;
1.3 cvs 176: }
177:
1.4 cvs 178: /*----------------------------------------------------------------------
1.9 cvs 179: IsImageName
180: returns TRUE if path points to an image resource.
1.4 cvs 181: ----------------------------------------------------------------------*/
1.3 cvs 182:
183: #ifdef __STDC__
184: boolean IsImageName (char *path)
185: #else /* __STDC__ */
186: boolean IsImageName (path)
187: char *path;
188:
189: #endif /* __STDC__ */
190: {
1.5 cvs 191: char temppath[MAX_LENGTH];
192: char suffix[MAX_LENGTH];
193: char nsuffix[MAX_LENGTH];
194: int i;
195:
196: if (!path)
197: return FALSE;
198:
199: strcpy (temppath, path);
200: ExtractSuffix (temppath, suffix);
201:
202: /* Normalize the suffix */
203: i = 0;
204: while (suffix[i] != EOS)
205: nsuffix[i] = TOLOWER (suffix[i++]);
206: nsuffix[i] = EOS;
207: if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) &&
208: (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) &&
209: (strcmp (nsuffix, "png")) && (strcmp (nsuffix, "au")))
210: return FALSE;
211: return TRUE;
1.3 cvs 212: }
213:
1.4 cvs 214: /*----------------------------------------------------------------------
1.9 cvs 215: IsTextName
1.4 cvs 216: ----------------------------------------------------------------------*/
1.3 cvs 217:
218: #ifdef __STDC__
219: boolean IsTextName (char *path)
220: #else /* __STDC__ */
221: boolean IsTextName (path)
222: char *path;
223:
224: #endif /* __STDC__ */
225: {
1.5 cvs 226: char temppath[MAX_LENGTH];
227: char suffix[MAX_LENGTH];
228: char nsuffix[MAX_LENGTH];
229: int i;
230:
231: if (!path)
232: return FALSE;
233:
234: strcpy (temppath, path);
235: ExtractSuffix (temppath, suffix);
236:
237: /* Normalize the suffix */
238: i = 0;
239: while (suffix[i] != EOS)
240: {
241: nsuffix[i] = TOLOWER (suffix[i]);
242: i++;
243: }
244: nsuffix[i] = EOS;
245:
246: if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) &&
247: (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) &&
248: (strcmp (nsuffix, "pdf")) && (strcmp (nsuffix, "png")) &&
249: (strcmp (nsuffix, "Z")) && (strcmp (nsuffix, "gz")) &&
250: (strcmp (nsuffix, "tgz")) && (strcmp (nsuffix, "xpg")) &&
251: (strcmp (nsuffix, "xpd")) && (strcmp (nsuffix, "ps")) &&
252: (strcmp (nsuffix, "au")))
253: return TRUE;
254: return FALSE;
1.3 cvs 255: }
256:
1.4 cvs 257: /*----------------------------------------------------------------------
1.9 cvs 258: IsHTTPPath
259: returns TRUE if path is in fact an http URL.
1.4 cvs 260: ----------------------------------------------------------------------*/
1.3 cvs 261:
262: #ifdef __STDC__
263: boolean IsHTTPPath (char *path)
264: #else /* __STDC__ */
265: boolean IsHTTPPath (path)
266: char *path;
267:
268: #endif /* __STDC__ */
269: {
1.5 cvs 270: if (!path)
271: return FALSE;
1.3 cvs 272:
1.5 cvs 273: if (strncmp (path, "http:", 5) != 0)
274: return FALSE;
275: return TRUE;
1.3 cvs 276: }
277:
1.4 cvs 278: /*----------------------------------------------------------------------
1.9 cvs 279: IsWithParameters
280: returns TRUE if url has a concatenated query string.
1.4 cvs 281: ----------------------------------------------------------------------*/
1.3 cvs 282:
283: #ifdef __STDC__
1.9 cvs 284: boolean IsWithParameters (char *url)
1.3 cvs 285: #else /* __STDC__ */
1.9 cvs 286: boolean IsWithParameters (url)
287: char *url;
1.3 cvs 288:
289: #endif /* __STDC__ */
290: {
1.5 cvs 291: int i;
1.3 cvs 292:
1.9 cvs 293: if ((!url) || (url[0] == EOS))
1.5 cvs 294: return FALSE;
1.3 cvs 295:
1.9 cvs 296: i = strlen (url) - 1;
297: while (i > 0 && url[i--] != '?')
1.5 cvs 298: if (i < 0)
299: return FALSE;
1.3 cvs 300:
1.5 cvs 301: /* There is a parameter */
302: return TRUE;
1.3 cvs 303: }
304:
1.4 cvs 305: /*----------------------------------------------------------------------
1.9 cvs 306: IsW3Path
307: returns TRUE if path is in fact a URL.
1.4 cvs 308: ----------------------------------------------------------------------*/
1.3 cvs 309:
310: #ifdef __STDC__
311: boolean IsW3Path (char *path)
312: #else /* __STDC__ */
313: boolean IsW3Path (path)
314: char *path;
315:
316: #endif /* __STDC__ */
317: {
1.5 cvs 318: if ((strncmp (path, "http:", 5)) && (strncmp (path, "ftp:", 4)) &&
319: (strncmp (path, "telnet:", 7)) && (strncmp (path, "wais:", 5)) &&
320: (strncmp (path, "news:", 5)) && (strncmp (path, "gopher:", 7)) &&
321: (strncmp (path, "mailto:", 7)) && (strncmp (path, "archie:", 7)))
322: return FALSE;
323: return TRUE;
1.3 cvs 324: }
325:
1.4 cvs 326: /*----------------------------------------------------------------------
1.9 cvs 327: IsValidProtocol
328: returns true if the url protocol is supported by Amaya.
1.4 cvs 329: ----------------------------------------------------------------------*/
1.3 cvs 330:
331: #ifdef __STDC__
1.9 cvs 332: boolean IsValidProtocol (char *url)
1.3 cvs 333: #else /* __STDC__ */
1.9 cvs 334: boolean IsValidProtocol (url)
335: char *url;
1.3 cvs 336:
337: #endif /* __STDC__ */
338: {
1.9 cvs 339: if (!strncmp (url, "http:", 5)
1.3 cvs 340: /***|| !strncmp (path, "ftp:", 4)
1.5 cvs 341: || !strncmp (path, "news:", 5)***/ )
1.8 cvs 342: return (TRUE);
1.5 cvs 343: else
1.8 cvs 344: return (FALSE);
1.3 cvs 345: }
346:
1.4 cvs 347: /*----------------------------------------------------------------------
1.9 cvs 348: IsValidNormalizeURL
349: says which URLs may be normalized
1.4 cvs 350: ----------------------------------------------------------------------*/
1.3 cvs 351:
352: #ifdef __STDC__
353: boolean IsValidNormalizeURL (char *path)
354: #else /* __STDC__ */
355: boolean IsValidNormalizeURL (path)
356: char *path;
357:
358: #endif /* __STDC__ */
359: {
1.5 cvs 360: if (strchr (path, ':') && !strncmp (path, "http:", 5))
1.8 cvs 361: return (TRUE);
1.5 cvs 362: else
1.8 cvs 363: return (FALSE);
1.3 cvs 364: }
365:
366:
1.4 cvs 367: /*----------------------------------------------------------------------
1.9 cvs 368: NormalizeURL
369: normalizes orgName according to a base associated with doc, and
370: following the standard URL format rules.
371: The function returns the new complete and normalized URL
372: or file name path (newName) and the name of the document (docName).
373: N.B. If the function can't find out what's the docName, it assigns
374: the name "noname.html".
1.4 cvs 375: ----------------------------------------------------------------------*/
1.3 cvs 376:
377: #ifdef __STDC__
378: void NormalizeURL (char *orgName, Document doc, char *newName, char *docName)
379: #else /* __STDC__ */
380: void NormalizeURL (orgName, doc, newName, docName)
381: char *orgName;
382: Document doc;
383: char *newName;
384: char *docName;
385:
386: #endif /* __STDC__ */
387: {
1.5 cvs 388: char basename[MAX_LENGTH];
389: char tempname[MAX_LENGTH];
390: int i;
391: char *ptr;
392: char *basename_ptr;
393: int basename_flag;
394: Element el;
395: ElementType elType;
396: AttributeType attrType;
397: Attribute attrHREF;
398: int length;
399:
400: /* Fix up orgName, by erasing leading and trailing white space */
401: if (!newName || !docName)
402: return;
403: ptr = orgName;
404: while (*ptr == ' ' && *ptr++ != EOS) ;
405: strcpy (tempname, ptr);
406: ptr = strchr (tempname, ' ');
407: if (ptr)
408: *ptr = EOS;
409:
410: /*
411: ** the following block to take into account the BASE element.
412: ** This is not very optimized, as this procedure is repeated for
413: ** each element which is retrieved. A better way would be to
414: ** move this higher up in the function call hierarchy.
415: */
416: if (IsValidNormalizeURL (tempname) && doc)
417: {
418: length = MAX_LENGTH;
419: /* get the root element */
420: el = TtaGetMainRoot (doc);
421:
422: /* search the BASE element */
423: elType.ElSSchema = TtaGetDocumentSSchema (doc);
424: elType.ElTypeNum = HTML_EL_BASE;
425: el = TtaSearchTypedElement (elType, SearchInTree, el);
426: if (el)
427: {
428: /*
429: ** The document has a BASE element
430: ** Get the HREF attribute of the BASE Element
1.3 cvs 431: */
1.5 cvs 432: attrType.AttrSSchema = elType.ElSSchema;
433: attrType.AttrTypeNum = HTML_ATTR_HREF_;
434: attrHREF = TtaGetAttribute (el, attrType);
435: if (attrHREF)
436: {
437: /*
438: ** Use the base path of the document
439: ** To do: verify length of the buffer
440: ** length > TtaGetTextAttributeLength (attrHREF) + strlen (orgName)
441: */
442: TtaGiveTextAttributeValue (attrHREF, basename, &length);
443:
444: /*
445: ** base and orgName have to be separated by a DIR_SEP
446: */
447: if (basename[strlen (basename) - 1] != DIR_SEP && tempname[0] != DIR_SEP)
448: strcat (basename, DIR_STR);
449: }
450: }
451: else
452: basename[0] = EOS;
453: }
454: else
455: basename[0] = EOS;
456:
457: if (basename[0] == EOS)
458: {
459: /*
460: ** There is no BASE element in that document.
461: ** A temporary fix as TtaExtractName does not tolerate a name
462: ** ending in /. Here, we reinsert the slash, in order to
463: ** parse the name in the following two lines. A bit
464: ** redundant and has to be reviewed.
465: */
466: if (DocumentURLs[(int) doc])
467: {
468: basename_ptr = HTParse (DocumentURLs[(int) doc], "", PARSE_ALL);
469: basename_flag = TRUE;
470: }
471: else
472: {
473: basename_ptr = "";
474: basename_flag = FALSE;
475: }
476: }
477: else
478: {
479: basename_ptr = HTParse (basename, "", PARSE_ALL);
480: basename_flag = TRUE;
481: } /* if-else tempname */
482:
483: ptr = HTParse (tempname, basename_ptr, PARSE_ALL);
484: if (basename_flag)
485: HT_FREE (basename_ptr);
486: if (ptr)
487: {
488: ptr = HTSimplify (&ptr);
489: strcpy (newName, ptr);
490: HT_FREE (ptr);
491: }
492: else
493: newName[0] = EOS;
494:
495: i = strlen (newName) - 1;
496: if (i > 0)
497: {
498: /*
499: ** A temporary fix for an interfacing problem:
500: ** TtaExtractName does not tolerate url's finished on DIR_SEP
501: */
502: ptr = strrchr (newName, DIR_SEP);
503: if (ptr)
504: ptr++;
505: if (ptr && *ptr != EOS)
506: strcpy (docName, ptr);
507: else
508: /*
509: ** The docname was not comprised inside the URL, so let's
510: ** assign a "noname.html" name :)
511: */
512: strcpy (docName, "noname.html");
513:
514: /*
515: ** A temporary fix for an interfacing problem:
516: ** TtaExtractName does not tolerate url's finished on DIR_SEP
517: */
518: if (newName[i] == DIR_SEP)
519: newName[i] = EOS;
520: }
1.3 cvs 521: }
522:
1.4 cvs 523: /*----------------------------------------------------------------------
1.9 cvs 524: IsSameHost
1.4 cvs 525: ----------------------------------------------------------------------*/
1.3 cvs 526:
527: #ifdef __STDC__
528: boolean IsSameHost (char *url1, char *url2)
529: #else /* __STDC__ */
530: boolean IsSameHost (url1, url2)
531: char *path;
532:
533: #endif /* __STDC__ */
534: {
1.5 cvs 535: char *basename_ptr1, *basename_ptr2;
536: boolean result;
1.3 cvs 537:
1.5 cvs 538: basename_ptr1 = HTParse (url1, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
539: basename_ptr2 = HTParse (url2, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
1.3 cvs 540:
1.5 cvs 541: if (strcmp (basename_ptr1, basename_ptr2))
1.8 cvs 542: result = FALSE;
1.5 cvs 543: else
1.8 cvs 544: result = TRUE;
1.3 cvs 545:
1.5 cvs 546: HT_FREE (basename_ptr1);
547: HT_FREE (basename_ptr2);
1.3 cvs 548:
1.5 cvs 549: return (result);
1.3 cvs 550: }
551:
552:
1.4 cvs 553: /*----------------------------------------------------------------------
1.9 cvs 554: AHTMakeRelativeURL
555: converts url into a relative url to base_url.
556: If succesful, returns the new URL, otherwise, it returns NULL.
557: The caller has to free the new URL.
1.4 cvs 558: ----------------------------------------------------------------------*/
1.3 cvs 559:
560: #ifdef __STDC__
1.5 cvs 561: char *AHTMakeRelativeName (char *url, char *base_url)
1.3 cvs 562: #else /* __STDC__ */
1.5 cvs 563: char *AHTMakeRelativeName (url, base_url)
564: char url;
565: char base_url;
566:
1.3 cvs 567: #endif /* __STDC__ */
568: {
1.5 cvs 569: char *base_ptr, *url_ptr;
570: char *result;
571:
572: /* verify if we are in the same host */
1.3 cvs 573:
1.5 cvs 574: base_ptr = HTParse (base_url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
575: url_ptr = HTParse (url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
1.3 cvs 576:
1.5 cvs 577: if (!strcmp (base_ptr, url_ptr))
578: {
579: HT_FREE (base_ptr);
580: HT_FREE (url_ptr);
1.3 cvs 581:
1.5 cvs 582: /* Normalize the URLs */
1.3 cvs 583:
1.5 cvs 584: base_ptr = HTParse (base_url, "", PARSE_ALL);
585: url_ptr = HTParse (url, "", PARSE_ALL);
1.3 cvs 586:
1.5 cvs 587: /* Use libwww to make relative name */
1.3 cvs 588:
1.5 cvs 589: result = HTRelative (url_ptr, base_ptr);
590: HT_FREE (base_ptr);
591: HT_FREE (url_ptr);
592: }
593: else
594: result = (char *) NULL;
1.3 cvs 595:
1.5 cvs 596: return (result);
1.3 cvs 597: }
1.9 cvs 598:
599:
600:
Webmaster