File:  [Public] / libwww / Robot / src / HTRobot.c
Revision 1.80: download - view: text, annotated - select for diffs
Wed Jan 6 15:38:48 1999 UTC (25 years, 5 months ago) by frystyk
Branches: MAIN
CVS tags: HEAD
A completely rewritten HText interface that makes it a lot easier to use the libwww HTML parser and also doesn't throw away tags or entities that are not defined in the libwww HTML DTD. The new interface is based on registering callbacks so that the application doesn't have to provide the old HText functions that it didn't use. The HTML parser itself has also improved as it now knows about several HTML 4.0 tags. Finally, there are three new sample applications called showtags, showtext, and showlinks which illustrate how to use the new HText interface. Currently the line mode browser hasn't been updated so it doesn't compile for the moment.

/*
**	@(#) $Id: HTRobot.c,v 1.80 1999/01/06 15:38:48 frystyk Exp $
**	
**	W3C Webbot can be found at "http://www.w3.org/Robot/"
**	
**	Copyright  1995-1998 World Wide Web Consortium, (Massachusetts
**	Institute of Technology, Institut National de Recherche en
**	Informatique et en Automatique, Keio University). All Rights
**	Reserved. This program is distributed under the W3C's Software
**	Intellectual Property License. This program is distributed in the hope
**	that it will be useful, but WITHOUT ANY WARRANTY; without even the
**	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
**	PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
**	details.
**
**  Authors:
**	HFN		Henrik Frystyk Nielsen, (frystyk@w3.org)
**	BR		Bob Racko
**	JP		John Punin
**
**  History:
**	Dec 04 95	First version
**	Oct 1998	Split into separate files
*/

#include "HTRobMan.h"
#include "HTQueue.h"
#include "HTAncMan.h"

/*
**  Verbosity tests: each one is true when a robot object is given and the
**  corresponding "quiet" bit is NOT set in its flags.
*/
#define SHOW_QUIET(mr)		((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr)	((mr) && !((mr)->flags & MR_REAL_QUIET))

/*
**  Table of error messages. In this file it is indexed with
**  HTError_index() in order to look up the code of a request error.
*/
PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};

/*
**  Some sorting algorithms (forward declarations of the comparison
**  functions defined further down in this file)
*/
PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;

/*
**  The callbacks that we need from the libwww HTML parser. They are
**  registered in Robot_registerHTMLParser().
*/
PRIVATE HText_new	RHText_new;
PRIVATE HText_delete	RHText_delete;
PRIVATE HText_foundLink	RHText_foundLink;

/* ------------------------------------------------------------------------- */

/*	Standard (non-error) Output
**	---------------------------
**	printf-style formatted output onto stdout. The value returned is the
**	one handed back by vfprintf(): the number of characters written, or
**	a negative value if an output error occurred.
*/
PUBLIC int OutputData(const char  * fmt, ...)
{
    va_list ap;
    int written;

    va_start(ap, fmt);
    written = vfprintf(stdout, fmt, ap);
    va_end(ap);

    return written;
}

/* ------------------------------------------------------------------------- */

/*	Create a "HyperDoc" object
**	--------------------------
**	A HyperDoc object records that we have started checking an anchor and
**	at which depth of the search it was found. The new object starts with
**	one hit, a code of -1 and the next running index of the robot. It is
**	bound to the anchor as its document and appended to the robot's
**	hyperdoc list (which is created on demand).
**	Returns the new object.
*/
PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
{
    HyperDoc * doc = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc));
    if (doc == NULL)
	HT_OUTOFMEM("HyperDoc_new");

    /* Initial book keeping */
    doc->depth = depth;
    doc->hits = 1;
    doc->code = -1;
    doc->index = ++mr->cindex;

    /* Tie the HyperDoc and the anchor to each other */
    doc->anchor = anchor;
    HTAnchor_setDocument(anchor, (void *) doc);

    /* Remember the object so that it can be freed when the robot goes away */
    if (mr->hyperdoc == NULL)
	mr->hyperdoc = HTList_new();
    HTList_addObject(mr->hyperdoc, (void *) doc);

    return doc;
}

/*	Delete a "HyperDoc" object
**	--------------------------
**	Frees the object. Returns YES if an object was given, NO if `hd' is
**	NULL.
*/
PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
{
    if (hd == NULL) return NO;

    HT_FREE(hd);
    return YES;
}

/*
**  Sort the anchor array with HitSort and write one line per anchor,
**  "<hits> <uri>", to the robot's hit file. Anchors without an address or
**  without a HyperDoc are skipped.
**  Returns NO if `mr' or `array' is missing, otherwise YES (also when the
**  log file could not be opened).
*/
PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
{
    HTLog * hitlog;

    if (!mr || !array) return NO;

    hitlog = HTLog_open(mr->hitfile, YES, YES);
    if (hitlog) {
	void ** cursor = NULL;
	HTParentAnchor * cur;

	HTArray_sort(array, HitSort);
	for (cur = (HTParentAnchor *) HTArray_firstObject(array, cursor);
	     cur != NULL;
	     cur = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
	    char * address = HTAnchor_address((HTAnchor *) cur);
	    HyperDoc * doc = (HyperDoc *) HTAnchor_document(cur);
	    if (address && doc)
		HTLog_addText(hitlog, "%8d %s\n", doc->hits, address);
	    HT_FREE(address);
	}
    }
    HTLog_close(hitlog);
    return YES;
}

/*
**  Comparison function for sorting an array of HTParentAnchor pointers by
**  descending hit count. `a' and `b' point at array elements. Anchors that
**  have no HyperDoc attached sort after those that have one; two anchors
**  without a HyperDoc compare equal.
**
**  The result is built from comparisons rather than subtractions: the
**  difference of two unrelated (or NULL) pointers is undefined, and a
**  difference of counters may overflow.
*/
PRIVATE int HitSort (const void * a, const void * b)
{
    HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
    HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
    if (aa && bb) return (bb->hits > aa->hits) - (bb->hits < aa->hits);
    return (bb != NULL) - (aa != NULL);
}

/*
**  Sort the anchor array and log link relations
*/
PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
{
    if (mr && array) {
        HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
	void ** data = NULL;
	HTParentAnchor * anchor = NULL;
	anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
	while (anchor) {

	    /*
	    **  If we have a specific link relation to look for then do this.
	    **  Otherwise look for all link relations.
	    */
	    if (mr->relation) {
		HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
		if (link) {
		    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
		    char * src_uri = HTAnchor_address((HTAnchor *) anchor);
		    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
		    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
			if (mr->sqllog) {
			    HTSQLLog_addLinkRelationship (mr->sqllog,
							  src_uri, dest_uri,
							  HTAtom_name(mr->relation),
							  NULL);
			}
#endif
			if (log) {
			    HTFormat format = HTAnchor_format(dest);
			    HTLog_addText(log, "%s %s %s --> %s\n",
					  HTAtom_name(mr->relation),
					  format != WWW_UNKNOWN ?
					  HTAtom_name(format) : "<unknown>",
					  src_uri, dest_uri);
			}

			/* Cleanup */
			HT_FREE(src_uri);
			HT_FREE(dest_uri);
		    }
		}
	    } else {
		HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
		HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
		char * src_uri = HTAnchor_address((HTAnchor *) anchor);
		HTLinkType linktype;

		/* First look in the main link */
		if (link && (linktype = HTLink_type(link))) {		    
		    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
		    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
		    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
			if (mr->sqllog) {
			    HTSQLLog_addLinkRelationship (mr->sqllog,
							  src_uri, dest_uri,
							  HTAtom_name(linktype),
							  NULL);
			}
#endif
			if (log) {
			    HTFormat format = HTAnchor_format(dest);
			    HTLog_addText(log, "%s %s %s --> %s\n",
					  HTAtom_name(linktype),
					  format != WWW_UNKNOWN ?
					  HTAtom_name(format) : "<unknown>",
					  src_uri, dest_uri);
			}
		    }
		    HT_FREE(dest_uri);
		}

		/* and then in any sublinks */
		if (sublinks) {
		    HTLink * pres;
		    while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
			if ((linktype = HTLink_type(pres))) {
			    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
			    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
			    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
				if (mr->sqllog) {
				    HTSQLLog_addLinkRelationship (mr->sqllog,
								  src_uri, dest_uri,
								  HTAtom_name(linktype),
								  NULL);
				}
#endif
				if (log) {
				    HTFormat format = HTAnchor_format(dest);
				    HTLog_addText(log, "%s %s %s --> %s\n",
						  HTAtom_name(linktype),
						  format != WWW_UNKNOWN ?
						  HTAtom_name(format) : "<unknown>",
						  src_uri, dest_uri);
				}
				HT_FREE(dest_uri);
			    }
			}
		    }
		}

		/* Cleanup */
		HT_FREE(src_uri);
	    }
	    anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
	}
        if (log) HTLog_close(log);
        return YES;
    }
    return NO;
}

/*
**  Sort the anchor array with LastModifiedSort and write one line per
**  anchor, "<date> <uri>", to the robot's last-modified file. Only anchors
**  with an address and a last modified time greater than zero are logged.
**  Returns NO if `mr' or `array' is missing, otherwise YES (also when the
**  log file could not be opened).
*/
PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
{
    HTLog * lmlog;

    if (!mr || !array) return NO;

    lmlog = HTLog_open(mr->lmfile, YES, YES);
    if (lmlog) {
	void ** cursor = NULL;
	HTParentAnchor * cur;

	HTArray_sort(array, LastModifiedSort);
	for (cur = (HTParentAnchor *) HTArray_firstObject(array, cursor);
	     cur != NULL;
	     cur = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
	    char * address = HTAnchor_address((HTAnchor *) cur);
	    time_t modified = HTAnchor_lastModified(cur);
	    if (address && modified > 0)
		HTLog_addText(lmlog, "%s %s\n",
			      HTDateTimeStr(&modified, NO), address);
	    HT_FREE(address);
	}
    }
    HTLog_close(lmlog);
    return YES;
}

/*
**  Comparison function for sorting an array of HTParentAnchor pointers by
**  descending last modified time (most recent first). `a' and `b' point at
**  array elements.
**
**  The sign is derived from comparisons: converting the difference of two
**  time_t values to int can truncate it and thereby flip the sign.
*/
PRIVATE int LastModifiedSort (const void * a, const void * b)
{
    time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
    time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
    return (bb > aa) - (bb < aa);
}

/*
**  Sort the anchor array with TitleSort and write one line per anchor,
**  "<charset> `<title>' <uri>", to the robot's title file. A missing
**  charset or title is written as "<none>"; anchors without an address are
**  skipped.
**  Returns NO if `mr' or `array' is missing, otherwise YES (also when the
**  log file could not be opened).
*/
PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
{
    HTLog * titlelog;

    if (!mr || !array) return NO;

    titlelog = HTLog_open(mr->titlefile, YES, YES);
    if (titlelog) {
	void ** cursor = NULL;
	HTParentAnchor * cur;

	HTArray_sort(array, TitleSort);
	for (cur = (HTParentAnchor *) HTArray_firstObject(array, cursor);
	     cur != NULL;
	     cur = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
	    char * address = HTAnchor_address((HTAnchor *) cur);
	    const char * title = HTAnchor_title(cur);
	    HTCharset charset = HTAnchor_charset(cur);
	    if (address) {
		HTLog_addText(titlelog, "%s `%s\' %s\n",
			      charset ? HTAtom_name(charset) : "<none>",
			      title ? title : "<none>",
			      address);
	    }
	    HT_FREE(address);
	}
    }
    HTLog_close(titlelog);
    return YES;
}

/*
**  Comparison function for sorting an array of HTParentAnchor pointers by
**  title. A missing title is treated as the empty string. Note that the
**  title of `b' is passed as the first argument of strcasecomp() and the
**  title of `a' as the second.
*/
PRIVATE int TitleSort (const void * a, const void * b)
{
    const char * title_a = HTAnchor_title(*(HTParentAnchor **) a);
    const char * title_b = HTAnchor_title(*(HTParentAnchor **) b);

    if (!title_a) title_a = "";
    if (!title_b) title_b = "";
    return strcasecomp(title_b, title_a);
}

/*
**  Calculate the distribution of media types over the anchors in the
**  array. The result is a new list of MetaDist objects, one per distinct
**  format, each carrying the number of anchors that have this format.
**  Anchors with no format or with format WWW_UNKNOWN are not counted.
**  The list is re-sorted with FormatSort every time an entry is added.
**  Returns NULL if no array is given. The same mechanism can be used for
**  other characteristics with relatively few outcomes.
*/
PRIVATE HTList * mediatype_distribution (HTArray * array)
{
    HTList * dist;
    void ** cursor = NULL;
    HTParentAnchor * anchor;

    if (!array) return NULL;

    dist = HTList_new();
    for (anchor = (HTParentAnchor *) HTArray_firstObject(array, cursor);
	 anchor != NULL;
	 anchor = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
	HTFormat format = HTAnchor_format(anchor);
	HTList * walk = dist;
	MetaDist * entry;

	if (!format || format == WWW_UNKNOWN) continue;

	/* Look for an existing counter for this format */
	while ((entry = (MetaDist *) HTList_nextObject(walk)) != NULL) {
	    if (entry->name == format) break;
	}

	if (entry) {
	    entry->hits++;
	} else {

	    /* First time we see this format: start a new counter */
	    if ((entry = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
		HT_OUTOFMEM("mediatype_distribution");
	    entry->name = format;
	    entry->hits = 1;
	    HTList_addObject(dist, entry);
	    HTList_insertionSort(dist, FormatSort);
	}
    }
    return dist;
}

/*
**  Calculate the distribution of charsets over the anchors in the array.
**  The result is a new list of MetaDist objects, one per distinct charset,
**  each carrying the number of anchors that have this charset. Anchors
**  without a charset are not counted. The list is re-sorted with
**  FormatSort every time an entry is added.
**  Returns NULL if no array is given. The same mechanism can be used for
**  other characteristics with relatively few outcomes.
*/
PRIVATE HTList * charset_distribution (HTArray * array)
{
    HTList * dist;
    void ** cursor = NULL;
    HTParentAnchor * anchor;

    if (!array) return NULL;

    dist = HTList_new();
    for (anchor = (HTParentAnchor *) HTArray_firstObject(array, cursor);
	 anchor != NULL;
	 anchor = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
	HTCharset charset = HTAnchor_charset(anchor);
	HTList * walk = dist;
	MetaDist * entry;

	if (!charset) continue;

	/* Look for an existing counter for this charset */
	while ((entry = (MetaDist *) HTList_nextObject(walk)) != NULL) {
	    if (entry->name == charset) break;
	}

	if (entry) {
	    entry->hits++;
	} else {

	    /* First time we see this charset: start a new counter */
	    if ((entry = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
		HT_OUTOFMEM("charset_distribution");
	    entry->name = charset;
	    entry->hits = 1;
	    HTList_addObject(dist, entry);
	    HTList_insertionSort(dist, FormatSort);
	}
    }
    return dist;
}

/*
**  Comparison function for MetaDist objects: compares the atom names with
**  strcmp(), passing the name of `b' first and the name of `a' second.
*/
PRIVATE int FormatSort (const void * a, const void * b)
{
    const char * name_a = HTAtom_name(((MetaDist *) a)->name);
    const char * name_b = HTAtom_name(((MetaDist *) b)->name);
    return strcmp(name_b, name_a);
}

/*
**  Write a distribution (a list of MetaDist objects) to the log file with
**  the given name, one line per entry: "<hits> <name>". Entries without a
**  name are skipped.
**  Returns YES if the log file was opened and the distribution written,
**  NO if an argument is missing or the log file could not be opened.
**  (This function used to return NO on every path, success included.)
*/
PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
{
    HTLog * log;
    HTList * cur = distribution;
    MetaDist * pres;

    if (!logfile || !distribution) return NO;
    if ((log = HTLog_open(logfile, YES, YES)) == NULL) return NO;

    while ((pres = (MetaDist *) HTList_nextObject(cur))) {
	if (pres->name) {
	    HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
	}
    }
    HTLog_close(log);
    return YES;
}

/*
**  Free every MetaDist object in the list and then the list itself.
**  Returns YES if a list was given, otherwise NO.
*/
PRIVATE BOOL delete_meta_distribution (HTList * distribution)
{
    HTList * walk = distribution;
    MetaDist * entry;

    if (distribution == NULL) return NO;

    while ((entry = (MetaDist *) HTList_nextObject(walk)) != NULL) {
	HT_FREE(entry);
    }
    HTList_delete(distribution);
    return YES;
}


/*	Statistics
**	----------
**	Calculates a bunch of statistics for the anchors traversed:
**	  - if a start time was recorded, a summary of the documents and
**	    bytes accessed and the resulting rates is traced;
**	  - if more than one document was accessed, every distribution for
**	    which the robot has a file name is written to that file.
**	Returns NO if no robot is given, otherwise YES.
*/
PRIVATE BOOL calculate_statistics (Robot * mr)
{
    long total_docs;

    /* The robot must be checked before any of its counters are read */
    if (!mr) return NO;
    total_docs = mr->get_docs + mr->head_docs + mr->other_docs;

    /* Calculate efficiency */
    if (mr->time > 0) {
	ms_t t = HTGetTimeInMillis() - mr->time;
	if (t > 0) {
	    double loadfactor = (mr->get_bytes / (t * 0.001));
	    double reqprsec = (total_docs / (t * 0.001));
	    double secs = t / 1000.0;
            char bytes[50];
	    if (SHOW_REAL_QUIET(mr))
		HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
			total_docs, secs, reqprsec);

            HTNumToStr(mr->get_bytes, bytes, 50);
	    if (SHOW_REAL_QUIET(mr))
		HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
			mr->get_docs, bytes, loadfactor);

            HTNumToStr(mr->head_bytes, bytes, 50);
	    if (SHOW_REAL_QUIET(mr))
		HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
			mr->head_docs, bytes);
	}
    }

    /* Create an array of existing anchors */
    if (total_docs > 1) {
	HTArray * array = HTAnchor_getArray(total_docs);
        if (array) {

	    /* Distributions */
	    if (mr->flags & MR_DISTRIBUTIONS) {
		if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
	    }

            /* Sort after hit counts */
            if (mr->hitfile) {
		if (SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged hit count distribution in file `%s\'\n",
			    mr->hitfile);
		calculate_hits(mr, array);
	    }

            /* Sort after link relations */
#ifdef HT_MYSQL
            if (mr->relfile || mr->sqllog) {
#else
            if (mr->relfile) {
#endif
		if (mr->relfile && SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged link relationship distribution in file `%s\'\n",
			    mr->relfile);
		calculate_linkRelations(mr, array);
	    }

            /* Sort after modified date */
            if (mr->lmfile) {
		if (SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged last modified distribution in file `%s\'\n",
			    mr->lmfile);
		calculate_lm(mr, array);
	    }

            /* Sort after title */
            if (mr->titlefile) {
		if (SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged title distribution in file `%s\'\n",
			    mr->titlefile);
		calculate_title(mr, array);
	    }

            /* Find mediatype distribution */
	    if (mr->mtfile) {
		HTList * mtdist = mediatype_distribution(array);
		if (mtdist) {
		    if (SHOW_REAL_QUIET(mr))
			HTTrace("\tLogged media type distribution in file `%s\'\n",
				mr->mtfile);
		    log_meta_distribution(mr->mtfile, mtdist);
		    delete_meta_distribution(mtdist);
		}
	    }

            /* Find charset distribution */
	    if (mr->charsetfile) {
		HTList * charsetdist = charset_distribution(array);
		if (charsetdist) {
		    if (SHOW_REAL_QUIET(mr))
			HTTrace("\tLogged charset distribution in file `%s\'\n",
				mr->charsetfile);
		    log_meta_distribution(mr->charsetfile, charsetdist);
		    delete_meta_distribution(charsetdist);
		}
	    }

            /* Add as many other stats here as you like */
	    /* ... */
	    
	    /* Delete the array */
            HTArray_delete(array);
        }
    }
    return YES;
}

/*
**  Returns the parent anchor of the first object that the iteration over
**  the `sources' list of the anchor yields, or NULL if the iteration
**  yields nothing.
*/
PRIVATE HTParentAnchor *
get_last_parent(HTParentAnchor *anchor)
{
  HTList *cursor = anchor->sources;
  HTAnchor *source = (HTAnchor *) HTList_nextObject(cursor);

  if (source == NULL)
    return NULL;
  return HTAnchor_parent(source);
}

/*
**  Copy the error state of a request into a HyperDoc: every error on the
**  error list of the request is looked up in the HTErrors table and its
**  code stored in hd->code, so the error visited last is the one that
**  remains. Without any errors hd->code is left untouched.
*/
PRIVATE void
set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
{
  HTList * errors = HTRequest_error(request);
  HTError * err;

  for (err = (HTError *) HTList_nextObject(errors);
       err != NULL;
       err = (HTError *) HTList_nextObject(errors))
    hd->code = HTErrors[HTError_index(err)].code;
}


/*
**  Returns 1 if the string contains at least one space character (' '),
**  otherwise 0. The string must not be NULL.
*/
PRIVATE int
test_for_blank_spaces(char *uri)
{
  const char *p = uri;

  while (*p != '\0')
    {
      if (*p++ == ' ')
	return 1;
    }
  return 0;
}


/*	Create a Command Line Object
**	----------------------------
**	Allocates a zeroed Robot object and gives it its initial state: empty
**	hyperdoc, htext and finger lists, an empty queue, the default timeout,
**	the current directory URL, the default output and an unlimited
**	document count (ndoc is -1).
**	Returns the new object.
*/
PUBLIC Robot * Robot_new (void)
{
    Robot * robot = (Robot *) HT_CALLOC(1, sizeof(Robot));
    if (robot == NULL)
	HT_OUTOFMEM("Robot_new");

    /* Containers */
    robot->hyperdoc = HTList_new();
    robot->htext = HTList_new();

    /* Defaults */
    robot->timer = DEFAULT_TIMEOUT*MILLIES;
    robot->waits = 0;
    robot->cwd = HTGetCurrentDirectoryURL();
    robot->output = OUTPUT;
    robot->cnt = 0;
    robot->ndoc = -1;
    robot->fingers = HTList_new();

    /* Queue of documents waiting to be requested */
    robot->queue = HTQueue_new();
    robot->cq = 0;
    robot->furl = NULL;

    return robot;
}

/*	Delete a Command Line Object
**	----------------------------
**	Writes the statistics, frees the HyperDoc and HText objects, closes
**	all log files (tracing the number of entries in each one), closes the
**	output if it is not STDOUT and finally frees the robot itself.
**	Returns YES if a robot was given, otherwise NO.
*/
PRIVATE BOOL Robot_delete (Robot * mr)
{
    if (!mr) return NO;

    HTList_delete(mr->fingers);

    /*
    **  Calculate statistics. This happens before the HyperDoc objects are
    **  freed as the hit counts are read from them.
    */
    calculate_statistics(mr);

    if (mr->hyperdoc) {
	HTList * walk = mr->hyperdoc;
	HyperDoc * doc;
	while ((doc = (HyperDoc *) HTList_nextObject(walk)) != NULL)
	    HyperDoc_delete(doc);
	HTList_delete(mr->hyperdoc);
    }
    if (mr->htext) {
	HTList * walk = mr->htext;
	HText * text;
	while ((text = (HText *) HTList_nextObject(walk)) != NULL)
	    RHText_delete(text);
	HTList_delete(mr->htext);
    }

    /* Close all the log files */
    if (mr->flags & MR_LOGGING) {
	if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
    }

    if (mr->log) {
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("\tLogged %5d entries in general log file `%s\'\n",
		    HTLog_accessCount(mr->log), mr->logfile);
	HTLog_close(mr->log);
    }
    if (mr->ref) {
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
		    HTLog_accessCount(mr->ref), mr->reffile);
	HTLog_close(mr->ref);
    }
    if (mr->reject) {
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
		    HTLog_accessCount(mr->reject), mr->rejectfile);
	HTLog_close(mr->reject);
    }
    if (mr->notfound) {
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
		    HTLog_accessCount(mr->notfound), mr->notfoundfile);
	HTLog_close(mr->notfound);
    }
    if (mr->conneg) {
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
		    HTLog_accessCount(mr->conneg), mr->connegfile);
	HTLog_close(mr->conneg);
    }
    if (mr->noalttag) {
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
		    HTLog_accessCount(mr->noalttag), mr->noalttagfile);
	HTLog_close(mr->noalttag);
    }

    /* The output is only ours to close when it is not the standard output */
    if (mr->output && mr->output != STDOUT) fclose(mr->output);

    if (mr->flags & MR_TIME) {
	time_t now = time(NULL);
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&now, YES));
    }

    /* Per depth counters and the first URL */
    if (mr->cdepth) {
	HT_FREE(mr->cdepth);
    }
    if (mr->furl) {
	HT_FREE(mr->furl);
    }

#ifdef HT_POSIX_REGEX
    /* Each compiled expression is released and then its container freed */
    if (mr->include) {
	regfree(mr->include);
	HT_FREE(mr->include);
    }
    if (mr->exclude) {
	regfree(mr->exclude);
	HT_FREE(mr->exclude);
    }
    if (mr->exc_robot) {
	regfree(mr->exc_robot);
	HT_FREE(mr->exc_robot);
    }
    if (mr->check) {
	regfree(mr->check);
	HT_FREE(mr->check);
    }
#endif

#ifdef HT_MYSQL
    if (mr->sqllog) {
	HTSQLLog_close(mr->sqllog);
	mr->sqllog = NULL;
    }
#endif

    HT_FREE(mr->cwd);
    HT_FREE(mr->prefix);
    HT_FREE(mr->img_prefix);
    HT_FREE(mr);
    return YES;
}

/*
**  Create a new finger object together with a new request object.
**  The finger is added to the finger list of the robot and installed as
**  the context of the request. The request is customized from the robot
**  flags (preemptive mode, cache validation), gets a Host header and the
**  given method. The robot's count of outstanding requests is increased
**  by one. Returns the new finger.
*/
PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
{
    HTRequest * req = HTRequest_new();
    Finger * finger = (Finger *) HT_CALLOC(1, sizeof(Finger));
    if (finger == NULL)
	HT_OUTOFMEM("Finger_new");

    finger->robot = robot;
    finger->request = req;
    finger->dest = dest;
    HTList_addObject(robot->fingers, (void *) finger);

    /* The finger can later be found again through the request */
    HTRequest_setContext (req, finger);

    /* Customize the request according to the robot flags */
    if (robot->flags & MR_PREEMPTIVE) {
	HTRequest_setPreemptive(req, YES);
    }
    if (robot->flags & MR_VALIDATE) {
	HTRequest_setReloadMode(req, HT_CACHE_VALIDATE);
    }
    if (robot->flags & MR_END_VALIDATE) {
	HTRequest_setReloadMode(req, HT_CACHE_END_VALIDATE);
    }

    /* We wanna make sure that we are sending a Host header (default) */
    HTRequest_addRqHd(req, HT_C_HOST);

    HTRequest_setMethod(req, method);

    /* One more outstanding request */
    robot->cnt++;
    return finger;
}

/*
**  Delete a finger object: it is removed from the finger list of its
**  robot, the robot's count of outstanding requests is decreased by one
**  and the request (if any) is deleted. If exactly one request remains
**  after this one then the request is flushed before it is deleted.
**  Always returns YES. `me' must not be NULL.
*/
PRIVATE int Finger_delete (Finger * me)
{
    Robot * owner = me->robot;

    HTList_removeObject(owner->fingers, (void *) me);
    owner->cnt--;

    if (me->request) {

	/* If we are down at one request then flush the output buffer */
	if (owner->cnt == 1)
	    HTRequest_forceFlush(me->request);

	HTRequest_delete(me->request);
    }

    /* And free myself */
    HT_FREE(me);
    return YES;
}

/*
**  Delete the robot and the libwww profile and then terminate the program.
**  The exit code is `status'; on VMS a status of 0 is replaced by 1.
**  This function does not return.
*/
PUBLIC void Cleanup (Robot * me, int status)
{
    int exit_code = status;

    Robot_delete(me);
    HTProfile_delete();
#ifdef HT_MEMLOG
    HTMemLog_close();
#endif

#ifdef VMS
    if (exit_code == 0) exit_code = 1;
#endif
    exit(exit_code);
}

#ifdef CATCH_SIG
#include <signal.h>
/*								    SetSignal
**  This function sets up signal handlers. This might not be necessary to
**  call if the application has its own handlers (lossage on SVR4).
**  The only signal handled is SIGPIPE, which is set to be ignored.
*/
PUBLIC void SetSignal (void)
{
    /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
    ** when attempting to connect to a remote host where you normally should
    ** get `connection refused' back
    */
    if (signal(SIGPIPE, SIG_IGN) != SIG_ERR) {
	if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    } else {
	if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */

#ifdef HT_POSIX_REGEX
/*
**  Returns the message for a regcomp() error code in a newly allocated
**  string which the caller must free. regerror() is called twice: first
**  with an empty buffer in order to find the size needed and then to
**  generate the message.
*/
PRIVATE char * get_regerror (int errcode, regex_t * compiled)
{
    size_t needed = regerror (errcode, compiled, NULL, 0);
    char * msg = (char *) HT_MALLOC(needed+1);
    if (msg == NULL)
	HT_OUTOFMEM("get_regerror");

    (void) regerror (errcode, compiled, msg, needed);
    return msg;
}

/*
**  Compile a regular expression with the given regcomp() flags.
**  Returns the newly allocated compiled expression, or NULL if the string
**  is NULL or empty. If the expression does not compile then the error is
**  traced and the robot is shut down through Cleanup() with status -1.
*/
PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
{
    regex_t * compiled;
    int rc;

    if (!regex_str || !*regex_str) return NULL;

    if ((compiled = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
	HT_OUTOFMEM("get_regtype");

    rc = regcomp(compiled, regex_str, cflags);
    if (rc != 0) {
	char * err_msg = get_regerror(rc, compiled);
	if (SHOW_REAL_QUIET(mr))
	    HTTrace("Regular expression error: %s\n", err_msg);
	HT_FREE(err_msg);
	Cleanup(mr, -1);
    }
    return compiled;
}
#endif

/*	VersionInfo
**	-----------
**	Prints the version banner through OutputData(): the version of the
**	application (APP_VERSION), the version of the libwww library in use,
**	the command that gives help (COMMAND_LINE) and where to find further
**	information and send feedback.
*/
PUBLIC void VersionInfo (void)
{
    OutputData("\nW3C OpenSource Software");
    OutputData("\n-----------------------\n\n");
    OutputData("\tWebbot version %s\n", APP_VERSION);
    OutputData("\tusing the W3C libwww library version %s.\n\n",HTLib_version());
    OutputData("\tSee \"%s\" for help\n", COMMAND_LINE);
    OutputData("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
    OutputData("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");
    OutputData("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
    OutputData("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
}

/*	terminate_handler
**	-----------------
**	This function is registered to handle the result of the request.
**	It logs the variants of a negotiated resource (if the robot has a
**	content negotiation log), adds the request to the GET, HEAD or
**	"other" counters and - unless the robot runs with MR_BFS - deletes
**	the finger. If no more requests are pending then the program is
**	terminated. Returns HT_OK.
*/
PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
			       void * param, int status) 
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    HTMethod method;

    if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));

#ifdef HT_MYSQL
    if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
#endif

    /* Check if negotiated resource and whether we should log that*/
    if (mr->conneg) {
	HTAssocList * variants = HTResponse_variant(response);
	if (variants) {
	    int listed = 0;
	    HTChunk * line = HTChunk_new(128);
	    char * address = HTAnchor_address((HTAnchor *) finger->dest);
	    HTAssoc * pair;

	    /* The line is "<uri>\t(<name>[=<value>], ...)" */
	    HTChunk_puts(line, address);
	    while ((pair = (HTAssoc *) HTAssocList_nextObject(variants))) {
		char * value = HTAssoc_value(pair);

		/* Open the list in front of the first pair, separate the rest */
		if (listed++ == 0)
		    HTChunk_puts(line, "\t(");
		else
		    HTChunk_puts(line, ", ");

		/* Output the name */
		HTChunk_puts(line, HTAssoc_name(pair));

		/* Only output the value if not empty string */
		if (value && *value) {
		    HTChunk_puts(line, "=");
		    HTChunk_puts(line, value);
		}
	    }
	    if (listed > 0) HTChunk_puts(line, ")");
	    HTLog_addLine(mr->conneg, HTChunk_data(line));
	    HTChunk_delete(line);
	    HT_FREE(address);
	}
    }

    /* Count the amount of body data that we have read */
    method = HTRequest_method(request);
    if (method == METHOD_GET) {
	int length = HTAnchor_length(HTRequest_anchor(request));
	if (length > 0) mr->get_bytes += length;
	mr->get_docs++;
    } else if (method == METHOD_HEAD) {
	int length = HTAnchor_length(HTRequest_anchor(request));
	if (length > 0) mr->head_bytes += length;
	mr->head_docs++;
    } else {
	mr->other_docs++;
    }

    if ((mr->flags & MR_BFS) == 0) {

	/* Delete this thread */
	Finger_delete(finger);

	/* Should we stop? */
	if (mr->cnt <= 0) {
	    if (SHOW_QUIET(mr)) HTTrace("             Everything is finished...\n");
	    Cleanup(mr, 0);			/* No way back from here */
	}
    }

    if (SHOW_QUIET(mr)) HTTrace("             %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
    return HT_OK;

}
/*	my_terminate_handler
**	--------------------
**	Request termination handler used together with the robot queue.
**	If the destination has a HyperDoc then the error state of the request
**	is recorded in it, and if the request was a HEAD on a document found
**	at a depth less than the depth limit of the robot then the document
**	is put on the queue again, this time for a GET. The finger is deleted
**	and - unless the robot is preemptive - the queue is served.
**	Returns HT_OK.
*/
PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,
			       void * param, int status) 
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    HyperDoc * hd = HTAnchor_document(finger->dest);

    if (hd) {
	set_error_state_hyperdoc(hd, request);

	/* A successful look at the header is followed up by a GET */
	if (HTRequest_method(request) == METHOD_HEAD && hd->depth < mr->depth) {
	    hd->method = METHOD_GET;
	    HTQueue_append(mr->queue, (void *) hd);
	    (mr->cq)++;
	}
    }

    Finger_delete(finger);

    if ((mr->flags & MR_PREEMPTIVE) == 0)
	Serving_queue(mr);

    return HT_OK;
}


/*	Serving_queue
**	-------------
**	Empties the robot queue: every HyperDoc is taken off the queue and a
**	new finger is created which loads the anchor of the HyperDoc using the
**	method stored in it. The loop ends when the queue is empty or when the
**	head of the queue is not an object.
**	Afterwards the program is terminated through Cleanup() if no requests
**	are outstanding or if the robot is preemptive.
*/
PUBLIC void Serving_queue(Robot *mr)
{
  while (!HTQueue_isEmpty(mr->queue))
    {
      HyperDoc *next = (HyperDoc *) HTQueue_headOfQueue(mr->queue);
      char *address;
      Finger *worker;
      HTRequest *req;

      if (!next)
	break;

      address = HTAnchor_address((HTAnchor *) next->anchor);
      HTQueue_dequeue(mr->queue);
      (mr->cq)--;

      worker = Finger_new(mr, next->anchor, next->method);
      req = worker->request;

      if (SHOW_QUIET(mr)) HTTrace("Request from QUEUE  %s\n", address);
      HT_FREE(address);
      if (SHOW_QUIET(mr)) HTTrace("%d elements in queue \n", mr->cq);

      HTRequest_setParent(req, get_last_parent(next->anchor));

      /*
      ** @@@ Waiting mr->waits between the requests is disabled: it should
      ** be done using a timer and not sleep! @@@
      */

      if (HTLoadAnchor((HTAnchor *) next->anchor, req) != YES)
	{
	  if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
	  Finger_delete(worker);
	}
    }

  if (SHOW_QUIET(mr)) HTTrace("Queue size: %d \n", mr->cq);

  /* The queue has been served: stop if nothing is pending or if preemptive */
  if (mr->cnt <= 0 || (mr->flags & MR_PREEMPTIVE))
    {
      if (mr->cnt > 0)
	{
	  if (SHOW_QUIET(mr)) HTTrace("%d requests were not served\n", mr->cnt);
	}

      if (SHOW_QUIET(mr)) HTTrace("             Everything is finished...\n");
      Cleanup(mr, 0);			/* No way back from here */
    }
}

/* ------------------------------------------------------------------------- */
/*				HTEXT INTERFACE				     */
/* ------------------------------------------------------------------------- */

/*
**  Register the robot's HText callbacks with the libwww HTML parser: the
**  create/delete pair and the link callback. Always returns YES.
*/
PUBLIC BOOL Robot_registerHTMLParser (void)
{
    /* Creation and deletion of the HText objects */
    (void) HText_registerCDCallback(RHText_new, RHText_delete);

    /* Links found in the documents */
    (void) HText_registerLinkCallback(RHText_foundLink);

    return YES;
}

/*
**  Create the HText object for a request. The object is bound to the
**  request and allows links to be followed unless - and only if the
**  MR_NOMETATAGS flag is not set - the robots information of the anchor
**  contains the token "nofollow" (compared without regard to case).
**  The object is added to the htext list of the robot (created on demand)
**  and returned.
*/
PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor,
			    HTStream * stream)
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    HText * me = (HText *) HT_CALLOC(1, sizeof(HText));
    if (me == NULL)
	HT_OUTOFMEM("HText_new2");

    /* Bind the HText object together with the Request Object */
    me->request = request;
    me->follow = YES;

    /* Check to see if we have any meta tags */
    if ((mr->flags & MR_NOMETATAGS) == 0) {
	char * robots = HTAnchor_robots(anchor);
	if (robots != NULL) {
	    char * copy = NULL;
	    char * cursor = NULL;
	    char * token = NULL;

	    /* The fields are parsed in a private copy of the string */
	    StrAllocCopy(copy, robots);
	    cursor = copy;
	    while (me->follow && (token = HTNextField(&cursor)) != NULL) {
		if (strcasecomp(token, "nofollow") == 0)
		    me->follow = NO;
	    }
	    HT_FREE(copy);
	}
    }

    /* Add this HText object to our list */
    if (mr->htext == NULL)
	mr->htext = HTList_new();
    HTList_addObject(mr->htext, (void *) me);
    return me;
}

/*
**  Delete a HText object. Returns YES if an object was given and freed,
**  otherwise NO. (The function used to fall off its end without returning
**  a value although it is declared to return BOOL.)
*/
PRIVATE BOOL RHText_delete (HText * me) {
    if (me) {
	HT_FREE (me);
	return YES;
    }
    return NO;
}

/*
**	Handle a hyperlink found in the document bound to `text'.
**
**	The link destination is resolved to its parent anchor and address.
**	If a HyperDoc is already attached to the destination, only its hit
**	counter is bumped (and, with HT_MYSQL, the relationship is logged).
**	Otherwise a new HyperDoc is created at depth = (depth of the
**	referring document) + 1 and the link is filtered through the
**	document's `follow' flag, the robot prefix, the optional regular
**	expressions, the blank-space test and the remaining document count.
**	A link that passes is either enqueued (MR_BFS) or loaded right away
**	through a new Finger; a link that fails is written to the reject log
**	when one is configured.
**
**	Nothing is done if `text' or `anchor' is NULL, or if no address can
**	be produced for the destination.
*/
PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor)
{
    if (text && anchor) {
	Finger * finger = (Finger *) HTRequest_context(text->request);
	Robot * mr = finger->robot;
	HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
	HTParentAnchor * dest_parent = HTAnchor_parent(dest);
	char * uri = HTAnchor_address((HTAnchor *) dest_parent);
	HyperDoc * hd = HTAnchor_document(dest_parent);
	HTParentAnchor * referer = HTRequest_anchor(text->request);
	/* Start from the per-document follow flag kept in the HText object */
	BOOL match = text->follow;
	BOOL check = NO;

	/* These are new variables */
	HyperDoc * nhd = NULL;
	BOOL follow = YES;

	/* These three variables were moved */
	/*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
	/* Same anchor as `referer': the document this link was found in */
	HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
	HyperDoc * last_doc = HTAnchor_document(last_anchor);
	/* One level below the referring document, or 0 if it has no HyperDoc */
	int depth = last_doc ? last_doc->depth+1 : 0;

	if (!uri) return;
	/* uri is non-NULL from here on, so the "NULL\n" alternative is never used */
	if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");

	/* Destination already has a HyperDoc: count the hit and stop here */
        if (hd) {
	    if (SHOW_QUIET(mr)) HTTrace("............ Already checked\n");
            hd->hits++;
#ifdef HT_MYSQL
	    if (mr->sqllog) {
		char * ref_addr = HTAnchor_address((HTAnchor *) referer);
		if (ref_addr) {
		    HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
						 "referer", NULL);
		    HT_FREE(ref_addr);
		}
	    }
#endif
	    HT_FREE(uri);
	    return;
	}

	/* Check for prefix match */
	if (match && mr->prefix) {
	    match = HTStrMatch(mr->prefix, uri) ? YES : NO;
	}

#ifdef HT_POSIX_REGEX
	/*
	**  Check for any regular expression. The include may override
	**  the prefix matching
	**
	**  regexec() returns 0 on a match, hence the inverted-looking
	**  ternaries: `include' and `check' want a match, `exc_robot' and
	**  `exclude' want no match.
	**
	**  NOTE(review): the include test assigns `match' unconditionally,
	**  so it also overrides a NO that came from text->follow - confirm
	**  that this is intended.
	*/
	if (mr->include) {
	    match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
	}
	if (match && mr->exc_robot) {
	    match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
	}
	if (match && mr->exclude) {
	    match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
	}
	if (match && mr->check) {
	    check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
	}
#endif
	/* Do not follow if the blank-space test fires or no documents are left */
	if(uri && test_for_blank_spaces(uri))
	  follow = NO;
	else if (mr->ndoc == 0) /* Number of Documents is reached */
	  follow = NO;

	/*
	**  Create the HyperDoc for this destination and count it at its
	**  depth. `hd' is always NULL here since the non-NULL case
	**  returned above. This happens even when the link is rejected
	**  below.
	*/
	if(!hd && dest_parent)
	  {
	    nhd = HyperDoc_new(mr, dest_parent, depth);
	    mr->cdepth[depth]++;
	  }

	/* Follow the link only if link checking is on and all filters passed */
        if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {
	    if (mr->flags & MR_BFS) {
		/* Breadth first: queue the new HyperDoc instead of loading now */
		nhd->method = METHOD_HEAD;
		HTQueue_enqueue(mr->queue, (void *) nhd);
		(mr->cq)++;
		if(mr->ndoc > 0) mr->ndoc--;
	    } else {
		/* Otherwise start the request for the destination right away */
		Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
		HTRequest * newreq = newfinger->request;
		HTRequest_setParent(newreq, referer);
		/* Use HEAD when the check regex matched or the depth limit is reached */
		if (check || depth >= mr->depth) {
		    if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
		    HTRequest_setMethod(newreq, METHOD_HEAD);
		} else {
		    if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
		}
		/* Drop the finger again if the load could not be started */
		if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
		    if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
		    Finger_delete(newfinger);
		}
	    }

	} else {
	    if (SHOW_QUIET(mr)) HTTrace("............ does not fulfill constraints\n");
	    /* Both preprocessor branches below open the same block */
#ifdef HT_MYSQL
	    if (mr->reject || mr->sqllog) {
#else	
	    if (mr->reject) {
#endif
		if (referer) {
		    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
		    if (mr->reject && ref_addr)
			HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
		    if (mr->sqllog && mr->sqlexternals && ref_addr)
			HTSQLLog_addLinkRelationship(mr->sqllog,
						     ref_addr, uri,
						     "referer", NULL);
#endif

		    HT_FREE(ref_addr);
		}
	    }
	}
	/* uri was allocated by HTAnchor_address() above */
	HT_FREE(uri);
    }
}

/*
**	Handle an image reference found in the document bound to `text'.
**
**	Does nothing unless the robot has the MR_IMG flag set. If a HyperDoc
**	is already attached to the image destination, only its hit counter
**	is bumped (and, with HT_MYSQL, the relationship is logged).
**	Otherwise the address is filtered through the image prefix and the
**	optional regular expressions. An image that passes gets a new
**	Finger and HyperDoc and is requested with GET when MR_SAVE is set
**	and with HEAD otherwise; an image that fails is written to the
**	reject log when one is configured.
**
**	`alt' is only used for the missing-ALT report and the SQL log.
**	`align' and `isMap' are not used by this function.
*/
PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor,
				const char *alt, const char * align, BOOL isMap)
{
    if (text && anchor) {
	Finger * finger = (Finger *) HTRequest_context(text->request);
	Robot * mr = finger->robot;

	if (mr->flags & MR_IMG) {
	    HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
	    HTParentAnchor * dest_parent = HTAnchor_parent(dest);
	    char * uri = HTAnchor_address((HTAnchor *) dest_parent);
	    HyperDoc * hd = HTAnchor_document(dest_parent);
	    HTParentAnchor * referer = HTRequest_anchor(text->request);
	    BOOL match = YES;

	    if (!uri) return;
	    /* Destination already has a HyperDoc: count the hit and stop here */
	    if (hd) {
		if (SHOW_QUIET(mr)) HTTrace("............ Already checked\n");
		hd->hits++;
#ifdef HT_MYSQL
		if (mr->sqllog) {
		    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
		    if (ref_addr) {
			HTSQLLog_addLinkRelationship(mr->sqllog,
						     ref_addr, uri,
						     "image", alt);
			HT_FREE(ref_addr);
		    }
		}
#endif
		HT_FREE(uri);
		return;
	    }

	    /* Check for prefix match */
	    if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;

#ifdef HT_POSIX_REGEX
	/*
	**  Check for any regular expression. The include may override
	**  the prefix matching
	**
	**  regexec() returns 0 on a match: `include' wants a match,
	**  `exc_robot' and `exclude' want no match. This section is
	**  indented one level less than its surroundings but is still
	**  inside the MR_IMG branch.
	*/
	if (mr->include) {
	    match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
	}
	if (match && mr->exc_robot) {
	    match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
	}
	if (match && mr->exclude) {
	    match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
	}
#endif
	    /* Request the image if it passed the filters and has a destination */
	    if (match && dest) {
		Finger * newfinger = Finger_new(mr, dest_parent,
						mr->flags & MR_SAVE ?
						METHOD_GET : METHOD_HEAD);
		HTRequest * newreq = newfinger->request;
		/* Attach a HyperDoc so a later reference takes the "already checked" path */
		HyperDoc_new(mr, dest_parent, 1);
		HTRequest_setParent(newreq, referer);

		/* Check whether we should report missing ALT tags */
		if (mr->noalttag && (alt==NULL || *alt=='\0')) {
		    if (referer) {
			char * ref_addr = HTAnchor_address((HTAnchor *) referer);
			if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
			HT_FREE(ref_addr);
		    }
		}
		
		if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
		/* Note: loads `dest' itself, not `dest_parent'; drop the finger on failure */
		if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
		    if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
		    Finger_delete(newfinger);
		}
	    } else {
		if (SHOW_QUIET(mr)) HTTrace("............ does not fulfill constraints\n");
		/* Both preprocessor branches below open the same block */
#ifdef HT_MYSQL
		if (mr->reject || mr->sqllog) {
#else	
		if (mr->reject) {
#endif
		    if (referer) {
			char * ref_addr = HTAnchor_address((HTAnchor *) referer);
			if (mr->reject && ref_addr)
			    HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
			if (mr->sqllog && mr->sqlexternals && ref_addr)
			    HTSQLLog_addLinkRelationship(mr->sqllog,
							 ref_addr, uri,
							 "image", alt);
#endif

			HT_FREE(ref_addr);
		    }
		}
	    }
	    /* uri was allocated by HTAnchor_address() above */
	    HT_FREE(uri);
	}
    }
}

/*
**	Link callback: route a link reported by the parser to the image
**	handler or the anchor handler.
**
**	IMG SRC and BODY BACKGROUND links are treated as images; every
**	other element/attribute combination is treated as an ordinary
**	anchor. Nothing is done if `text' or `anchor' is NULL.
**
**	`present' and `value' are not used here.
**	NOTE(review): the image handler is always called with a NULL alt
**	text, so its missing-ALT report fires for every image it requests
**	when mr->noalttag is set - confirm whether that is intended.
*/
PRIVATE void RHText_foundLink (HText * text,
			       int element_number, int attribute_number,
			       HTChildAnchor * anchor,
			       const BOOL * present, const char ** value)
{
    Finger * finger;
    Robot * mr;
    int is_image = 0;

    if (!text || !anchor) return;

    finger = (Finger *) HTRequest_context(text->request);
    mr = finger->robot;

    if (SHOW_QUIET(mr)) {
	HTTrace("Robot....... Received element %d, attribute %d with anchor %p\n",
		element_number, attribute_number, anchor);
    }

    /* Decide which handler the link belongs to */
    if (element_number==HTML_IMG && attribute_number==HTML_IMG_SRC)
	is_image = 1;
    else if (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND)
	is_image = 1;

    if (is_image) {
	RHText_foundImage(text, anchor, NULL, NULL, NO);
	return;
    }
    RHText_foundAnchor(text, anchor);
}

/*
**	Fetch the document at `uri' with a preemptive GET request, asking
**	for the unconverted source (WWW_SOURCE), and return its contents as
**	a C string.
**
**	Returns NULL if `uri' is NULL or empty, if no anchor can be found
**	for it, or if the load produces no chunk. The temporary request
**	object is always deleted before returning.
**
**	The result is whatever HTChunk_toCString() hands back; presumably
**	the caller owns it and must free it - confirm against HTChunk.h.
*/
PUBLIC char * get_robots_txt(char * uri)
{
  char *str = NULL;
  HTChunk * chunk = NULL;
  HTParentAnchor *anchor = NULL;
  HTRequest *request = NULL;

  /* Nothing to fetch without an address */
  if (!uri || !*uri) return NULL;

  anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
  if (!anchor) return NULL;

  request = HTRequest_new();
  if (!request) return NULL;

  HTRequest_setOutputFormat(request, WWW_SOURCE);
  HTRequest_setPreemptive(request, YES);
  HTRequest_setMethod(request, METHOD_GET);

  /* Only convert when the load produced a chunk */
  chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
  if (chunk) str = HTChunk_toCString(chunk);

  HTRequest_delete(request);
  return str;
}


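/* Webmaster -- web page footer text, apparently left over from the CVS web view; not part of the C source */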