Annotation of Amaya/amaya/AHTURLTools.c, revision 1.12

1.7       cvs         1: /*
                      2:  *
                      3:  *  (c) COPYRIGHT MIT and INRIA, 1996.
                      4:  *  Please first read the full copyright statement in file COPYRIGHT.
                      5:  *
                      6:  */
1.9       cvs         7: 
1.10      cvs         8: /*
                      9:  * AHTURLTools.c: contains all the functions for testing, manipulating,
                     10:  * and normalizing URLs.
                     11:  *
                     12:  * Authors: J. Kahan, I. Vatton
                     13:  *
                     14:  */
1.7       cvs        15:  
1.8       cvs        16: /* Amaya includes  */
                     17: #define EXPORT extern
1.3       cvs        18: #include "amaya.h"
                     19: 
1.8       cvs        20: 
                     21: #include "init_f.h"
                     22: #include "AHTURLTools_f.h"
                     23: 
                     24: /*----------------------------------------------------------------------
1.11      cvs        25:   ExplodeURL 
1.8       cvs        26:   ----------------------------------------------------------------------*/
                     27: 
                     28: #ifdef __STDC__
                     29: void                ExplodeURL (char *url, char **proto, char **host, char **dir, char **file)
                     30: #else
                     31: void                ExplodeURL (url, proto, host, dir, file)
                     32: char               *url;
                     33: char              **proto;
                     34: char              **host;
                     35: char              **dir;
                     36: char              **file;
                     37: 
                     38: #endif
                     39: {
1.9       cvs        40:    char               *curr, *temp;
1.8       cvs        41: 
                     42:    if ((url == NULL) || (proto == NULL) || (host == NULL) ||
                     43:        (dir == NULL) || (file == NULL))
                     44:       return;
                     45: 
                     46:    /* initialize every pointer */
                     47:    *proto = *host = *dir = *file = NULL;
                     48: 
                     49:    /* skip any leading space */
                     50:    while ((*url == SPACE) || (*url == TAB))
                     51:       url++;
1.9       cvs        52:    curr = url;
                     53:    if (*curr == 0)
1.8       cvs        54:       goto finished;
                     55: 
                     56:    /* go to the end of the URL */
1.9       cvs        57:    while ((*curr != 0) && (*curr != SPACE) && (*curr != '\b') &&
                     58:          (*curr != '\r') && (*curr != EOL))
                     59:       curr++;
1.8       cvs        60: 
                     61:    /* mark the end of the chain */
1.9       cvs        62:    *curr = EOS;
                     63:    curr--;
                     64:    if (curr <= url)
1.8       cvs        65:       goto finished;
                     66: 
                     67:    /* search the next DIR_SEP indicating the beginning of the file name */
                     68:    do
1.11      cvs        69:      curr--;
1.9       cvs        70:    while ((curr >= url) && (*curr != DIR_SEP));
1.11      cvs        71: 
1.9       cvs        72:    if (curr < url)
1.8       cvs        73:       goto finished;
1.9       cvs        74:    *file = curr + 1;
1.8       cvs        75: 
                     76:    /* mark the end of the dir */
1.9       cvs        77:    *curr = EOS;
                     78:    curr--;
                     79:    if (curr < url)
1.8       cvs        80:       goto finished;
                     81: 
                     82:    /* search for the "/" indicating the host name start */
1.9       cvs        83:    while ((curr > url) && ((*curr != DIR_SEP) || (*(curr + 1) != DIR_SEP)))
                     84:       curr--;
1.8       cvs        85: 
                     86:    /* if we found it, separate the host name from the directory */
1.9       cvs        87:    if ((*curr == DIR_SEP) && (*(curr + 1) == DIR_SEP))
1.8       cvs        88:      {
1.9       cvs        89:        *host = temp = curr + 2;
1.8       cvs        90:        while ((*temp != 0) && (*temp != DIR_SEP))
                     91:           temp++;
                     92:        if (*temp == DIR_SEP)
                     93:          {
                     94:             *temp = EOS;
                     95:             *dir = temp + 1;
                     96:          }
                     97:      }
                     98:    else
1.11      cvs        99:      *dir = curr;
                    100: 
1.9       cvs       101:    if (curr <= url)
1.8       cvs       102:       goto finished;
                    103: 
                    104:    /* mark the end of the proto */
1.9       cvs       105:    *curr = EOS;
                    106:    curr--;
                    107:    if (curr < url)
1.8       cvs       108:       goto finished;
                    109: 
1.9       cvs       110:    if (*curr == ':')
1.8       cvs       111:      {
1.9       cvs       112:        *curr = EOS;
                    113:        curr--;
1.8       cvs       114:      }
                    115:    else
                    116:       goto finished;
1.11      cvs       117: 
1.9       cvs       118:    if (curr < url)
1.8       cvs       119:       goto finished;
1.9       cvs       120:    while ((curr > url) && (isalpha (*curr)))
                    121:       curr--;
                    122:    *proto = curr;
1.8       cvs       123: 
                    124:  finished:;
                    125: 
                    126: #ifdef AMAYA_DEBUG
                    127:    fprintf (stderr, "ExplodeURL(%s)\n\t", url);
                    128:    if (*proto)
                    129:       fprintf (stderr, "proto : %s, ", *proto);
                    130:    if (*host)
                    131:       fprintf (stderr, "host : %s, ", *host);
                    132:    if (*dir)
                    133:       fprintf (stderr, "dir : %s, ", *dir);
                    134:    if (*file)
                    135:       fprintf (stderr, "file : %s ", *file);
                    136:    fprintf (stderr, "\n");
                    137: #endif
                    138: 
                    139: }
1.3       cvs       140: 
1.4       cvs       141: /*----------------------------------------------------------------------
1.9       cvs       142:   IsHTMLName                                                         
                    143:   returns TRUE if path points to an HTML resource.
1.4       cvs       144:   ----------------------------------------------------------------------*/
1.3       cvs       145: #ifdef __STDC__
                    146: boolean             IsHTMLName (char *path)
                    147: #else  /* __STDC__ */
                    148: boolean             IsHTMLName (path)
                    149: char               *path;
                    150: #endif /* __STDC__ */
                    151: {
1.5       cvs       152:    char                temppath[MAX_LENGTH];
                    153:    char                suffix[MAX_LENGTH];
                    154:    char                nsuffix[MAX_LENGTH];
                    155:    int                 i;
                    156: 
                    157:    if (!path)
                    158:       return FALSE;
                    159: 
                    160:    strcpy (temppath, path);
                    161:    ExtractSuffix (temppath, suffix);
                    162: 
                    163:    /* Normalize the suffix */
                    164:    i = 0;
                    165:    while (suffix[i] != EOS)
                    166:       nsuffix[i] = TOLOWER (suffix[i++]);
                    167:    nsuffix[i] = EOS;
                    168:    if ((strcmp (nsuffix, "html")) &&
                    169:        (strcmp (nsuffix, "htm")) &&
                    170:        (strcmp (nsuffix, "shtml")))
                    171:       return FALSE;
                    172:    return TRUE;
1.3       cvs       173: }
                    174: 
1.4       cvs       175: /*----------------------------------------------------------------------
1.9       cvs       176:   IsImageName                                
                    177:   returns TRUE if path points to an image resource.
1.4       cvs       178:   ----------------------------------------------------------------------*/
1.3       cvs       179: #ifdef __STDC__
                    180: boolean             IsImageName (char *path)
                    181: #else  /* __STDC__ */
                    182: boolean             IsImageName (path)
                    183: char               *path;
                    184: #endif /* __STDC__ */
                    185: {
1.5       cvs       186:    char                temppath[MAX_LENGTH];
                    187:    char                suffix[MAX_LENGTH];
                    188:    char                nsuffix[MAX_LENGTH];
                    189:    int                 i;
                    190: 
                    191:    if (!path)
                    192:       return FALSE;
                    193: 
                    194:    strcpy (temppath, path);
                    195:    ExtractSuffix (temppath, suffix);
                    196: 
                    197:    /* Normalize the suffix */
                    198:    i = 0;
                    199:    while (suffix[i] != EOS)
                    200:       nsuffix[i] = TOLOWER (suffix[i++]);
                    201:    nsuffix[i] = EOS;
                    202:    if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) &&
                    203:        (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) &&
                    204:        (strcmp (nsuffix, "png")) && (strcmp (nsuffix, "au")))
                    205:       return FALSE;
                    206:    return TRUE;
1.3       cvs       207: }
                    208: 
1.4       cvs       209: /*----------------------------------------------------------------------
1.9       cvs       210:   IsTextName                                                         
1.4       cvs       211:   ----------------------------------------------------------------------*/
1.3       cvs       212: #ifdef __STDC__
                    213: boolean             IsTextName (char *path)
                    214: #else  /* __STDC__ */
                    215: boolean             IsTextName (path)
                    216: char               *path;
                    217: 
                    218: #endif /* __STDC__ */
                    219: {
1.5       cvs       220:    char                temppath[MAX_LENGTH];
                    221:    char                suffix[MAX_LENGTH];
                    222:    char                nsuffix[MAX_LENGTH];
                    223:    int                 i;
                    224: 
                    225:    if (!path)
                    226:       return FALSE;
                    227: 
                    228:    strcpy (temppath, path);
                    229:    ExtractSuffix (temppath, suffix);
                    230: 
                    231:    /* Normalize the suffix */
                    232:    i = 0;
                    233:    while (suffix[i] != EOS)
                    234:      {
                    235:        nsuffix[i] = TOLOWER (suffix[i]);
                    236:        i++;
                    237:      }
                    238:    nsuffix[i] = EOS;
                    239: 
                    240:    if ((strcmp (nsuffix, "gif")) && (strcmp (nsuffix, "xbm")) &&
                    241:        (strcmp (nsuffix, "xpm")) && (strcmp (nsuffix, "jpg")) &&
                    242:        (strcmp (nsuffix, "pdf")) && (strcmp (nsuffix, "png")) &&
                    243:        (strcmp (nsuffix, "Z")) && (strcmp (nsuffix, "gz")) &&
                    244:        (strcmp (nsuffix, "tgz")) && (strcmp (nsuffix, "xpg")) &&
                    245:        (strcmp (nsuffix, "xpd")) && (strcmp (nsuffix, "ps")) &&
                    246:        (strcmp (nsuffix, "au")))
                    247:       return TRUE;
                    248:    return FALSE;
1.3       cvs       249: }
                    250: 
1.4       cvs       251: /*----------------------------------------------------------------------
1.9       cvs       252:   IsHTTPPath                                     
                    253:   returns TRUE if path is in fact an http URL.
1.4       cvs       254:   ----------------------------------------------------------------------*/
1.3       cvs       255: #ifdef __STDC__
                    256: boolean             IsHTTPPath (char *path)
                    257: #else  /* __STDC__ */
                    258: boolean             IsHTTPPath (path)
                    259: char               *path;
                    260: #endif /* __STDC__ */
                    261: {
1.5       cvs       262:    if (!path)
                    263:       return FALSE;
1.3       cvs       264: 
1.5       cvs       265:    if (strncmp (path, "http:", 5) != 0)
                    266:       return FALSE;
                    267:    return TRUE;
1.3       cvs       268: }
                    269: 
1.4       cvs       270: /*----------------------------------------------------------------------
1.9       cvs       271:   IsWithParameters                           
                    272:   returns TRUE if url has a concatenated query string.
1.4       cvs       273:   ----------------------------------------------------------------------*/
1.3       cvs       274: #ifdef __STDC__
1.9       cvs       275: boolean             IsWithParameters (char *url)
1.3       cvs       276: #else  /* __STDC__ */
1.9       cvs       277: boolean             IsWithParameters (url)
                    278: char               *url;
1.3       cvs       279: #endif /* __STDC__ */
                    280: {
1.5       cvs       281:    int                 i;
1.3       cvs       282: 
1.9       cvs       283:    if ((!url) || (url[0] == EOS))
1.5       cvs       284:       return FALSE;
1.3       cvs       285: 
1.9       cvs       286:    i = strlen (url) - 1;
                    287:    while (i > 0 && url[i--] != '?')
1.5       cvs       288:       if (i < 0)
                    289:         return FALSE;
1.3       cvs       290: 
1.5       cvs       291:    /* There is a parameter */
                    292:    return TRUE;
1.3       cvs       293: }
                    294: 
1.4       cvs       295: /*----------------------------------------------------------------------
1.9       cvs       296:   IsW3Path                                           
                    297:   returns TRUE if path is in fact a URL.
1.4       cvs       298:   ----------------------------------------------------------------------*/
1.3       cvs       299: #ifdef __STDC__
                    300: boolean             IsW3Path (char *path)
                    301: #else  /* __STDC__ */
                    302: boolean             IsW3Path (path)
                    303: char               *path;
                    304: #endif /* __STDC__ */
                    305: {
1.5       cvs       306:    if ((strncmp (path, "http:", 5)) && (strncmp (path, "ftp:", 4)) &&
                    307:        (strncmp (path, "telnet:", 7)) && (strncmp (path, "wais:", 5)) &&
                    308:        (strncmp (path, "news:", 5)) && (strncmp (path, "gopher:", 7)) &&
                    309:        (strncmp (path, "mailto:", 7)) && (strncmp (path, "archie:", 7)))
                    310:       return FALSE;
                    311:    return TRUE;
1.3       cvs       312: }
                    313: 
1.4       cvs       314: /*----------------------------------------------------------------------
1.9       cvs       315:   IsValidProtocol                                                    
                    316:   returns true if the url protocol is supported by Amaya.
1.4       cvs       317:   ----------------------------------------------------------------------*/
1.3       cvs       318: #ifdef __STDC__
1.9       cvs       319: boolean             IsValidProtocol (char *url)
1.3       cvs       320: #else  /* __STDC__ */
1.9       cvs       321: boolean             IsValidProtocol (url)
                    322: char               *url;
1.3       cvs       323: #endif /* __STDC__ */
                    324: {
1.9       cvs       325:    if (!strncmp (url, "http:", 5)
1.3       cvs       326:       /***|| !strncmp (path, "ftp:", 4)
1.5       cvs       327:       || !strncmp (path, "news:", 5)***/ )
1.8       cvs       328:       return (TRUE);
1.5       cvs       329:    else
1.8       cvs       330:       return (FALSE);
1.3       cvs       331: }
                    332: 
1.4       cvs       333: /*----------------------------------------------------------------------
1.9       cvs       334:    NormalizeURL
                    335:    normalizes orgName according to a base associated with doc, and
                    336:    following the standard URL format rules.
                    337:    The function returns the new complete and normalized URL 
1.12    ! cvs       338:    or file name path (newName) and the name of the document (docName).        
1.9       cvs       339:    N.B. If the function can't find out what's the docName, it assigns
                    340:    the name "noname.html".
1.4       cvs       341:   ----------------------------------------------------------------------*/
1.3       cvs       342: #ifdef __STDC__
                    343: void                NormalizeURL (char *orgName, Document doc, char *newName, char *docName)
                    344: #else  /* __STDC__ */
                    345: void                NormalizeURL (orgName, doc, newName, docName)
                    346: char               *orgName;
                    347: Document            doc;
                    348: char               *newName;
                    349: char               *docName;
                    350: #endif /* __STDC__ */
                    351: {
1.5       cvs       352:    char                basename[MAX_LENGTH];
                    353:    char                tempname[MAX_LENGTH];
                    354:    int                 i;
                    355:    char               *ptr;
                    356:    char               *basename_ptr;
                    357:    int                 basename_flag;
                    358:    Element             el;
                    359:    ElementType         elType;
                    360:    AttributeType       attrType;
                    361:    Attribute           attrHREF;
                    362:    int                 length;
                    363: 
                    364:    /* Fix up orgName, by erasing leading and trailing white space */
                    365:    if (!newName || !docName)
                    366:       return;
                    367:    ptr = orgName;
                    368:    while (*ptr == ' ' && *ptr++ != EOS) ;
                    369:    strcpy (tempname, ptr);
                    370:    ptr = strchr (tempname, ' ');
                    371:    if (ptr)
                    372:       *ptr = EOS;
                    373: 
                    374:    /* 
                    375:       ** the following block to take into account the BASE element.
                    376:       ** This is not very optimized, as this procedure is repeated for
                    377:       ** each element which is retrieved. A better way would be to
                    378:       ** move this higher up in the function call hierarchy.
                    379:     */
1.12    ! cvs       380:    if (!IsW3Path (tempname) && doc)
1.5       cvs       381:      {
                    382:        length = MAX_LENGTH;
                    383:        /* get the root element    */
                    384:        el = TtaGetMainRoot (doc);
                    385: 
                    386:        /* search the BASE element */
                    387:        elType.ElSSchema = TtaGetDocumentSSchema (doc);
                    388:        elType.ElTypeNum = HTML_EL_BASE;
                    389:        el = TtaSearchTypedElement (elType, SearchInTree, el);
                    390:        if (el)
                    391:          {
                    392:             /* 
                    393:                ** The document has a BASE element 
                    394:                ** Get the HREF attribute of the BASE Element 
1.3       cvs       395:              */
1.5       cvs       396:             attrType.AttrSSchema = elType.ElSSchema;
                    397:             attrType.AttrTypeNum = HTML_ATTR_HREF_;
                    398:             attrHREF = TtaGetAttribute (el, attrType);
                    399:             if (attrHREF)
                    400:               {
                    401:                  /* 
                    402:                     ** Use the base path of the document 
                    403:                     ** To do: verify length of the buffer
                    404:                     ** length > TtaGetTextAttributeLength (attrHREF) + strlen (orgName) 
                    405:                   */
                    406:                  TtaGiveTextAttributeValue (attrHREF, basename, &length);
                    407: 
                    408:                  /* 
                    409:                     ** base and orgName have to be separated by a DIR_SEP 
                    410:                   */
                    411:                  if (basename[strlen (basename) - 1] != DIR_SEP && tempname[0] != DIR_SEP)
                    412:                     strcat (basename, DIR_STR);
                    413:               }
1.11      cvs       414:             else
                    415:               basename[0] = EOS;
1.5       cvs       416:          }
                    417:        else
                    418:           basename[0] = EOS;
                    419:      }
                    420:    else
                    421:       basename[0] = EOS;
                    422: 
                    423:    if (basename[0] == EOS)
                    424:      {
                    425:        /* 
                    426:           ** There is no BASE element in that document.
                    427:           ** A temporary fix as TtaExtractName does not tolerate a name
                    428:           ** ending in /. Here, we reinsert the slash, in order to
                    429:           ** parse the name in the following two lines. A bit
                    430:           ** redundant and has to be reviewed.  
                    431:         */
                    432:        if (DocumentURLs[(int) doc])
                    433:          {
                    434:             basename_ptr = HTParse (DocumentURLs[(int) doc], "", PARSE_ALL);
                    435:             basename_flag = TRUE;
                    436:          }
                    437:        else
                    438:          {
                    439:             basename_ptr = "";
                    440:             basename_flag = FALSE;
                    441:          }
                    442:      }
                    443:    else
                    444:      {
                    445:        basename_ptr = HTParse (basename, "", PARSE_ALL);
                    446:        basename_flag = TRUE;
                    447:      }                         /* if-else tempname */
                    448: 
                    449:    ptr = HTParse (tempname, basename_ptr, PARSE_ALL);
                    450:    if (basename_flag)
                    451:       HT_FREE (basename_ptr);
                    452:    if (ptr)
                    453:      {
                    454:        ptr = HTSimplify (&ptr);
                    455:        strcpy (newName, ptr);
                    456:        HT_FREE (ptr);
                    457:      }
                    458:    else
                    459:       newName[0] = EOS;
                    460: 
                    461:    i = strlen (newName) - 1;
                    462:    if (i > 0)
                    463:      {
                    464:        /* 
                    465:           ** A temporary fix for an interfacing problem:
                    466:           ** TtaExtractName does not tolerate url's finished on DIR_SEP
                    467:         */
                    468:        ptr = strrchr (newName, DIR_SEP);
                    469:        if (ptr)
                    470:           ptr++;
                    471:        if (ptr && *ptr != EOS)
                    472:           strcpy (docName, ptr);
                    473:        else
                    474:           /*
                    475:              ** The docname was not comprised inside the URL, so let's 
                    476:              ** assign a "noname.html" name :)
                    477:            */
                    478:           strcpy (docName, "noname.html");
                    479: 
                    480:        /* 
                    481:           ** A temporary fix for an interfacing problem:
                    482:           ** TtaExtractName does not tolerate url's finished on DIR_SEP
                    483:         */
                    484:        if (newName[i] == DIR_SEP)
                    485:           newName[i] = EOS;
                    486:      }
1.3       cvs       487: }
                    488: 
1.4       cvs       489: /*----------------------------------------------------------------------
1.9       cvs       490:   IsSameHost                                                         
1.4       cvs       491:   ----------------------------------------------------------------------*/
1.3       cvs       492: #ifdef __STDC__
                    493: boolean             IsSameHost (char *url1, char *url2)
                    494: #else  /* __STDC__ */
                    495: boolean             IsSameHost (url1, url2)
                    496: char               *path;
                    497: #endif /* __STDC__ */
                    498: {
1.5       cvs       499:    char               *basename_ptr1, *basename_ptr2;
                    500:    boolean             result;
1.3       cvs       501: 
1.5       cvs       502:    basename_ptr1 = HTParse (url1, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
                    503:    basename_ptr2 = HTParse (url2, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
1.3       cvs       504: 
1.5       cvs       505:    if (strcmp (basename_ptr1, basename_ptr2))
1.8       cvs       506:       result = FALSE;
1.5       cvs       507:    else
1.8       cvs       508:       result = TRUE;
1.3       cvs       509: 
1.5       cvs       510:    HT_FREE (basename_ptr1);
                    511:    HT_FREE (basename_ptr2);
1.3       cvs       512: 
1.5       cvs       513:    return (result);
1.3       cvs       514: }
                    515: 
                    516: 
1.4       cvs       517: /*----------------------------------------------------------------------
1.9       cvs       518:   AHTMakeRelativeURL                                                
                    519:   converts url into a relative url to base_url.
                    520:   If succesful, returns the new URL, otherwise, it returns NULL.
                    521:   The caller has to free the new URL.
1.4       cvs       522:   ----------------------------------------------------------------------*/
1.3       cvs       523: #ifdef __STDC__
1.5       cvs       524: char               *AHTMakeRelativeName (char *url, char *base_url)
1.3       cvs       525: #else  /* __STDC__ */
1.5       cvs       526: char               *AHTMakeRelativeName (url, base_url)
                    527: char                url;
                    528: char                base_url;
                    529: 
1.3       cvs       530: #endif /* __STDC__ */
                    531: {
1.5       cvs       532:    char               *base_ptr, *url_ptr;
                    533:    char               *result;
                    534: 
                    535:    /* verify if we are in the same host */
1.3       cvs       536: 
1.5       cvs       537:    base_ptr = HTParse (base_url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
                    538:    url_ptr = HTParse (url, "", PARSE_ACCESS | PARSE_HOST | PARSE_PUNCTUATION);
1.3       cvs       539: 
1.5       cvs       540:    if (!strcmp (base_ptr, url_ptr))
                    541:      {
                    542:        HT_FREE (base_ptr);
                    543:        HT_FREE (url_ptr);
1.3       cvs       544: 
1.5       cvs       545:        /* Normalize the URLs */
1.3       cvs       546: 
1.5       cvs       547:        base_ptr = HTParse (base_url, "", PARSE_ALL);
                    548:        url_ptr = HTParse (url, "", PARSE_ALL);
1.3       cvs       549: 
1.5       cvs       550:        /* Use libwww to make relative name */
1.3       cvs       551: 
1.5       cvs       552:        result = HTRelative (url_ptr, base_ptr);
                    553:        HT_FREE (base_ptr);
                    554:        HT_FREE (url_ptr);
                    555:      }
                    556:    else
                    557:       result = (char *) NULL;
1.3       cvs       558: 
1.5       cvs       559:    return (result);
1.3       cvs       560: }
1.9       cvs       561: 
                    562: 
                    563: 

Webmaster