File:  [Public] / libwww / Robot / src / HTRobot.c
Revision 1.75: download - view: text, annotated - select for diffs
Mon Oct 26 22:45:34 1998 UTC (25 years, 7 months ago) by frystyk
Branches: MAIN
CVS tags: HEAD
New version of libwww webbot hacked up by John Punin: Contains a full robots.txt parser and a breadth-first search algorithm

/*
**	@(#) $Id: HTRobot.c,v 1.75 1998/10/26 22:45:34 frystyk Exp $
**	
**	W3C Webbot can be found at "http://www.w3.org/Robot/"
**	
**	Copyright  1995-1998 World Wide Web Consortium, (Massachusetts
**	Institute of Technology, Institut National de Recherche en
**	Informatique et en Automatique, Keio University). All Rights
**	Reserved. This program is distributed under the W3C's Software
**	Intellectual Property License. This program is distributed in the hope
**	that it will be useful, but WITHOUT ANY WARRANTY; without even the
**	implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
**	PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
**	details.
**
**  Authors:
**	HFN		Henrik Frystyk Nielsen, (frystyk@w3.org)
**	BR		Bob Racko
**	JP		John Punin
**
**  History:
**	Dec 04 95	First version
**	Oct 1998	Split into separate files
*/

#include "HTRobMan.h"
#include "HTQueue.h"
#include "HTAncMan.h"

/* True when a robot object exists and its MR_QUIET flag is not set */
#define SHOW_QUIET(mr)		((mr) && !((mr)->flags & MR_QUIET))
/* True when a robot object exists and its MR_REAL_QUIET flag is not set */
#define SHOW_REAL_QUIET(mr)	((mr) && !((mr)->flags & MR_REAL_QUIET))

/* Error message table; set_error_state_hyperdoc() reads the `code' member */
PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};

/*
**  Forward declarations of the comparison callbacks that are passed to
**  HTArray_sort() and HTList_insertionSort() further down in this file
*/
PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;

/* Exported global objects, all starting out as NULL */
PUBLIC HText * HTMainText = NULL;
PUBLIC HTParentAnchor * HTMainAnchor = NULL;
PUBLIC HTStyleSheet * styleSheet = NULL;

/* ------------------------------------------------------------------------- */

/*	Standard (non-error) Output
**	---------------------------
**	printf-style wrapper that formats its arguments onto stdout and
**	hands back the value returned by vfprintf()
*/
PUBLIC int OutputData(const char  * fmt, ...)
{
    va_list ap;
    int written;

    va_start(ap, fmt);
    written = vfprintf(stdout, fmt, ap);
    va_end(ap);

    return written;
}

/* ------------------------------------------------------------------------- */

/*	Create a "HyperDoc" object
**	--------------------------
**	Allocates a HyperDoc for `anchor', records the search depth, gives it
**	the next index from the robot's running counter, binds it to the
**	anchor and registers it in the robot's hyperdoc list (creating the
**	list if it does not exist yet). The new object starts with one hit
**	and a code of -1.
*/
PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
{
    HyperDoc * doc = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc));
    if (doc == NULL)
        HT_OUTOFMEM("HyperDoc_new");

    doc->depth = depth;
    doc->hits = 1;
    doc->code = -1;
    doc->index = ++mr->cindex;

    /* Tie the anchor and the HyperDoc to each other */
    doc->anchor = anchor;
    HTAnchor_setDocument(anchor, (void *) doc);

    /* Remember the object so that it can be released later */
    if (mr->hyperdoc == NULL)
        mr->hyperdoc = HTList_new();
    HTList_addObject(mr->hyperdoc, (void *) doc);

    return doc;
}

/*	Delete a "HyperDoc" object
**	--------------------------
**	Frees the object. Returns YES if something was freed, NO when called
**	with NULL.
*/
PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
{
    if (!hd)
        return NO;

    HT_FREE (hd);
    return YES;
}

/*
**  Sort the anchor array with HitSort and write one "hits uri" line per
**  anchor that has both an address and a HyperDoc to the hit log file.
**  Returns NO only when `mr' or `array' is missing.
*/
PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
{
    HTLog * hitlog;

    if (!mr || !array)
        return NO;

    hitlog = HTLog_open(mr->hitfile, YES, YES);
    if (hitlog) {
        void ** cursor = NULL;
        HTParentAnchor * cur;

        HTArray_sort(array, HitSort);
        for (cur = (HTParentAnchor *) HTArray_firstObject(array, cursor);
             cur != NULL;
             cur = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
            char * address = HTAnchor_address((HTAnchor *) cur);
            HyperDoc * doc = (HyperDoc *) HTAnchor_document(cur);

            if (address && doc)
                HTLog_addText(hitlog, "%8d %s\n", doc->hits, address);
            HT_FREE(address);
        }
    }
    HTLog_close(hitlog);
    return YES;
}

PRIVATE int HitSort (const void * a, const void * b)
{
    HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
    HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
    if (aa && bb) return (bb->hits - aa->hits);
    return bb - aa;
}

/*
**  Sort the anchor array and log link relations
*/
PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
{
    if (mr && array) {
        HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
	void ** data = NULL;
	HTParentAnchor * anchor = NULL;
	anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
	while (anchor) {

	    /*
	    **  If we have a specific link relation to look for then do this.
	    **  Otherwise look for all link relations.
	    */
	    if (mr->relation) {
		HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
		if (link) {
		    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
		    char * src_uri = HTAnchor_address((HTAnchor *) anchor);
		    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
		    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
			if (mr->sqllog) {
			    HTSQLLog_addLinkRelationship (mr->sqllog,
							  src_uri, dest_uri,
							  HTAtom_name(mr->relation),
							  NULL);
			}
#endif
			if (log) {
			    HTFormat format = HTAnchor_format(dest);
			    HTLog_addText(log, "%s %s %s --> %s\n",
					  HTAtom_name(mr->relation),
					  format != WWW_UNKNOWN ?
					  HTAtom_name(format) : "<unknown>",
					  src_uri, dest_uri);
			}

			/* Cleanup */
			HT_FREE(src_uri);
			HT_FREE(dest_uri);
		    }
		}
	    } else {
		HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
		HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
		char * src_uri = HTAnchor_address((HTAnchor *) anchor);
		HTLinkType linktype;

		/* First look in the main link */
		if (link && (linktype = HTLink_type(link))) {		    
		    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
		    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
		    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
			if (mr->sqllog) {
			    HTSQLLog_addLinkRelationship (mr->sqllog,
							  src_uri, dest_uri,
							  HTAtom_name(linktype),
							  NULL);
			}
#endif
			if (log) {
			    HTFormat format = HTAnchor_format(dest);
			    HTLog_addText(log, "%s %s %s --> %s\n",
					  HTAtom_name(linktype),
					  format != WWW_UNKNOWN ?
					  HTAtom_name(format) : "<unknown>",
					  src_uri, dest_uri);
			}
		    }
		    HT_FREE(dest_uri);
		}

		/* and then in any sublinks */
		if (sublinks) {
		    HTLink * pres;
		    while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
			if ((linktype = HTLink_type(pres))) {
			    HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
			    char * dest_uri = HTAnchor_address((HTAnchor *) dest);
			    if (src_uri && dest_uri) {
#ifdef HT_MYSQL
				if (mr->sqllog) {
				    HTSQLLog_addLinkRelationship (mr->sqllog,
								  src_uri, dest_uri,
								  HTAtom_name(linktype),
								  NULL);
				}
#endif
				if (log) {
				    HTFormat format = HTAnchor_format(dest);
				    HTLog_addText(log, "%s %s %s --> %s\n",
						  HTAtom_name(linktype),
						  format != WWW_UNKNOWN ?
						  HTAtom_name(format) : "<unknown>",
						  src_uri, dest_uri);
				}
				HT_FREE(dest_uri);
			    }
			}
		    }
		}

		/* Cleanup */
		HT_FREE(src_uri);
	    }
	    anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
	}
        if (log) HTLog_close(log);
        return YES;
    }
    return NO;
}

/*
**  Sort the anchor array with LastModifiedSort and write a "date uri"
**  line to the last-modified log file for every anchor that has an
**  address and a last-modified time greater than zero.
**  Returns NO only when `mr' or `array' is missing.
*/
PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
{
    HTLog * lmlog;

    if (!mr || !array)
        return NO;

    lmlog = HTLog_open(mr->lmfile, YES, YES);
    if (lmlog) {
        void ** cursor = NULL;
        HTParentAnchor * cur;

        HTArray_sort(array, LastModifiedSort);
        for (cur = (HTParentAnchor *) HTArray_firstObject(array, cursor);
             cur != NULL;
             cur = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
            char * address = HTAnchor_address((HTAnchor *) cur);
            time_t modified = HTAnchor_lastModified(cur);

            if (address && modified > 0)
                HTLog_addText(lmlog, "%s %s\n",
                              HTDateTimeStr(&modified, NO), address);
            HT_FREE(address);
        }
    }
    HTLog_close(lmlog);
    return YES;
}

/*
**  Comparison callback ordering anchors by last-modified time, most
**  recent first. `a' and `b' point at HTParentAnchor pointers.
**
**  The times are compared rather than subtracted: a time_t difference
**  does not necessarily fit an int, and a truncated difference can end up
**  with the wrong sign.
*/
PRIVATE int LastModifiedSort (const void * a, const void * b)
{
    time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
    time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);

    if (bb > aa) return 1;
    if (bb < aa) return -1;
    return 0;
}

/*
**  Sort the anchor array with TitleSort and write a "charset `title' uri"
**  line to the title log file for every anchor that has an address.
**  Missing charsets and titles are written as "<none>".
**  Returns NO only when `mr' or `array' is missing.
*/
PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
{
    HTLog * titlelog;

    if (!mr || !array)
        return NO;

    titlelog = HTLog_open(mr->titlefile, YES, YES);
    if (titlelog) {
        void ** cursor = NULL;
        HTParentAnchor * cur;

        HTArray_sort(array, TitleSort);
        for (cur = (HTParentAnchor *) HTArray_firstObject(array, cursor);
             cur != NULL;
             cur = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
            char * address = HTAnchor_address((HTAnchor *) cur);
            const char * heading = HTAnchor_title(cur);
            HTCharset cs = HTAnchor_charset(cur);

            if (address)
                HTLog_addText(titlelog, "%s `%s\' %s\n",
                              cs ? HTAtom_name(cs) : "<none>",
                              heading ? heading : "<none>",
                              address);
            HT_FREE(address);
        }
    }
    HTLog_close(titlelog);
    return YES;
}

/*
**  Comparison callback ordering anchors by title, case-insensitively and
**  in reverse (b is compared against a). A missing title is treated as
**  the empty string.
*/
PRIVATE int TitleSort (const void * a, const void * b)
{
    const char * left = HTAnchor_title(*(HTParentAnchor **) a);
    const char * right = HTAnchor_title(*(HTParentAnchor **) b);

    if (!left) left = "";
    if (!right) right = "";
    return strcasecomp(right, left);
}

/*
**  Count how often each media type occurs among the anchors in `array'.
**  Returns a new list of MetaDist entries (one per distinct format, kept
**  ordered by FormatSort) or NULL when no array is given. Anchors with
**  no format or with WWW_UNKNOWN are ignored. The same mechanism can be
**  used for other characteristics with relatively few outcomes.
*/
PRIVATE HTList * mediatype_distribution (HTArray * array)
{
    HTList * dist;
    void ** cursor = NULL;
    HTParentAnchor * anchor;

    if (!array)
        return NULL;

    dist = HTList_new();
    for (anchor = (HTParentAnchor *) HTArray_firstObject(array, cursor);
         anchor != NULL;
         anchor = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
        HTFormat format = HTAnchor_format(anchor);
        HTList * walk = dist;
        MetaDist * entry;

        if (!format || format == WWW_UNKNOWN)
            continue;

        /* Search for an entry that already counts this format */
        while ((entry = (MetaDist *) HTList_nextObject(walk)) != NULL) {
            if (entry->name == format)
                break;
        }

        if (entry) {
            entry->hits++;
        } else {
            /* First occurrence: create the entry and keep the list sorted */
            entry = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist));
            if (entry == NULL)
                HT_OUTOFMEM("mediatype_distribution");
            entry->name = format;
            entry->hits = 1;
            HTList_addObject(dist, entry);
            HTList_insertionSort(dist, FormatSort);
        }
    }
    return dist;
}

/*
**  Count how often each charset occurs among the anchors in `array'.
**  Returns a new list of MetaDist entries (one per distinct charset, kept
**  ordered by FormatSort) or NULL when no array is given. Anchors
**  without a charset are ignored. The same mechanism can be used for
**  other characteristics with relatively few outcomes.
*/
PRIVATE HTList * charset_distribution (HTArray * array)
{
    HTList * dist;
    void ** cursor = NULL;
    HTParentAnchor * anchor;

    if (!array)
        return NULL;

    dist = HTList_new();
    for (anchor = (HTParentAnchor *) HTArray_firstObject(array, cursor);
         anchor != NULL;
         anchor = (HTParentAnchor *) HTArray_nextObject(array, cursor)) {
        HTCharset charset = HTAnchor_charset(anchor);
        HTList * walk = dist;
        MetaDist * entry;

        if (!charset)
            continue;

        /* Search for an entry that already counts this charset */
        while ((entry = (MetaDist *) HTList_nextObject(walk)) != NULL) {
            if (entry->name == charset)
                break;
        }

        if (entry) {
            entry->hits++;
        } else {
            /* First occurrence: create the entry and keep the list sorted */
            entry = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist));
            if (entry == NULL)
                HT_OUTOFMEM("charset_distribution");
            entry->name = charset;
            entry->hits = 1;
            HTList_addObject(dist, entry);
            HTList_insertionSort(dist, FormatSort);
        }
    }
    return dist;
}

/*
**  Comparison callback for MetaDist entries: compares the atom names with
**  strcmp(), b against a.
*/
PRIVATE int FormatSort (const void * a, const void * b)
{
    const char * name_a = HTAtom_name(((MetaDist *) a)->name);
    const char * name_b = HTAtom_name(((MetaDist *) b)->name);

    return strcmp(name_b, name_a);
}

/*
**  Write a distribution (a list of MetaDist entries) to `logfile', one
**  "hits name" line per entry that has a name.
**  Returns YES when the log file was opened and written, NO when an
**  argument is missing or the log could not be opened. (This function
**  used to return NO unconditionally, even after logging successfully.)
*/
PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
{
    HTLog * log;
    HTList * cur = distribution;
    MetaDist * pres;

    if (!logfile || !distribution) return NO;
    if ((log = HTLog_open(logfile, YES, YES)) == NULL) return NO;

    while ((pres = (MetaDist *) HTList_nextObject(cur))) {
	if (pres->name) {
	    HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
	}
    }
    HTLog_close(log);
    return YES;
}

/*
**  Free every MetaDist entry of a distribution and then the list itself.
**  Returns NO when called without a list, YES otherwise.
*/
PRIVATE BOOL delete_meta_distribution (HTList * distribution)
{
    HTList * walk = distribution;
    MetaDist * entry;

    if (!distribution)
        return NO;

    while ((entry = (MetaDist *) HTList_nextObject(walk)) != NULL) {
        HT_FREE(entry);
    }
    HTList_delete(distribution);
    return YES;
}


/*	Statistics
**	----------
**	Calculates a bunch of statistics for the anchors traversed: a timing
**	summary on the trace output (unless MR_REAL_QUIET is set) and, for
**	every distribution file configured in the robot, the corresponding
**	log. Returns NO when no robot is given, YES otherwise.
*/
PRIVATE BOOL calculate_statistics (Robot * mr)
{
    long total_docs;

    /* Check the robot before touching any of its counters */
    if (!mr) return NO;
    total_docs = mr->get_docs + mr->head_docs + mr->other_docs;

    /* Calculate efficiency */
    if (mr->time > 0) {
	ms_t t = HTGetTimeInMillis() - mr->time;
	if (t > 0) {
	    double loadfactor = (mr->get_bytes / (t * 0.001));
	    double reqprsec = (total_docs / (t * 0.001));
	    double secs = t / 1000.0;
            char bytes[50];
	    if (SHOW_REAL_QUIET(mr))
		HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
			total_docs, secs, reqprsec);

            HTNumToStr(mr->get_bytes, bytes, 50);
	    if (SHOW_REAL_QUIET(mr))
		HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
			mr->get_docs, bytes, loadfactor);

            HTNumToStr(mr->head_bytes, bytes, 50);
	    if (SHOW_REAL_QUIET(mr))
		HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
			mr->head_docs, bytes);
	}
    }

    /* Create an array of existing anchors */
    if (total_docs > 1) {
	HTArray * array = HTAnchor_getArray(total_docs);
        if (array) {

	    /* Distributions */
	    if (mr->flags & MR_DISTRIBUTIONS) {
		if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
	    }

            /* Sort after hit counts */
            if (mr->hitfile) {
		if (SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged hit count distribution in file `%s\'\n",
			    mr->hitfile);
		calculate_hits(mr, array);
	    }

            /*
	    **  Link relations. With HT_MYSQL the SQL log alone is enough
	    **  to trigger the calculation. The opening brace lives outside
	    **  the #ifdef so that braces balance in both configurations.
	    */
#ifdef HT_MYSQL
            if (mr->relfile || mr->sqllog)
#else
            if (mr->relfile)
#endif
	    {
		if (mr->relfile && SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged link relationship distribution in file `%s\'\n",
			    mr->relfile);
		calculate_linkRelations(mr, array);
	    }

            /* Sort after modified date */
            if (mr->lmfile) {
		if (SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged last modified distribution in file `%s\'\n",
			    mr->lmfile);
		calculate_lm(mr, array);
	    }

            /* Sort after title */
            if (mr->titlefile) {
		if (SHOW_REAL_QUIET(mr))
		    HTTrace("\tLogged title distribution in file `%s\'\n",
			    mr->titlefile);
		calculate_title(mr, array);
	    }

            /* Find mediatype distribution */
	    if (mr->mtfile) {
		HTList * mtdist = mediatype_distribution(array);
		if (mtdist) {
		    if (SHOW_REAL_QUIET(mr))
			HTTrace("\tLogged media type distribution in file `%s\'\n",
				mr->mtfile);
		    log_meta_distribution(mr->mtfile, mtdist);
		    delete_meta_distribution(mtdist);
		}
	    }

            /* Find charset distribution */
	    if (mr->charsetfile) {
		HTList * charsetdist = charset_distribution(array);
		if (charsetdist) {
		    if (SHOW_REAL_QUIET(mr))
			HTTrace("\tLogged charset distribution in file `%s\'\n",
				mr->charsetfile);
		    log_meta_distribution(mr->charsetfile, charsetdist);
		    delete_meta_distribution(charsetdist);
		}
	    }

            /* Add as many other stats here as you like */
	    /* ... */

	    /* Delete the array */
            HTArray_delete(array);
        }
    }
    return YES;
}

/*
**  Return the parent anchor of the first entry in the anchor's `sources'
**  list, or NULL when that list has no entries.
*/
PRIVATE HTParentAnchor *
get_last_parent(HTParentAnchor *anchor)
{
  HTList *walk = anchor->sources;
  HTAnchor *first = (HTAnchor *) HTList_nextObject(walk);

  if (first == NULL)
    return NULL;
  return HTAnchor_parent(first);
}

/*
**  Copy the error code of the request's errors into the HyperDoc. Every
**  error on the request's error list overwrites hd->code in turn, so the
**  code of the entry visited last is the one that remains. hd->code is
**  left untouched when the list has no entries.
*/
PRIVATE void
set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
{
  HTList * errors = HTRequest_error(request);
  HTError * err;

  for (err = (HTError *) HTList_nextObject(errors);
       err != NULL;
       err = (HTError *) HTList_nextObject(errors))
    {
      hd->code = HTErrors[HTError_index(err)].code;
    }
}


/*
**  Return 1 if the string contains at least one space character (' '),
**  0 otherwise. `uri' must not be NULL.
*/
PRIVATE int
test_for_blank_spaces(char *uri)
{
  const char *p = uri;

  while (*p != '\0')
    {
      if (*p++ == ' ')
	return 1;
    }
  return 0;
}


/*	Create a Command Line Object
**	----------------------------
**	Allocates a zeroed Robot and sets up its lists, request queue,
**	timer, output stream and current directory URL. The document limit
**	(ndoc) starts at -1.
*/
PUBLIC Robot * Robot_new (void)
{
    Robot * robot = (Robot *) HT_CALLOC(1, sizeof(Robot));
    if (robot == NULL)
        HT_OUTOFMEM("Robot_new");

    /* Bookkeeping lists */
    robot->hyperdoc = HTList_new();
    robot->htext = HTList_new();

    /* Defaults */
    robot->timer = DEFAULT_TIMEOUT*MILLIES;
    robot->waits = 0;
    robot->cwd = HTGetCurrentDirectoryURL();
    robot->output = OUTPUT;
    robot->cnt = 0;
    robot->ndoc = -1;
    robot->fingers = HTList_new();

    /* Queue of documents still to be requested, with its element count */
    robot->queue = HTQueue_new();
    robot->cq = 0;
    robot->furl = NULL;

    return robot;
}

/*	Delete a Command Line Object
**	----------------------------
**	Tears a robot down: deletes the finger list, calculates the final
**	statistics, releases all HyperDoc and HText objects, reports on and
**	closes the log files, closes the output stream unless it is STDOUT
**	and finally frees the robot itself.
**	Returns YES if a robot was passed in, NO otherwise.
*/
PRIVATE BOOL Robot_delete (Robot * mr)
{
    if (!mr)
        return NO;

    HTList_delete(mr->fingers);

    /* The statistics need the HyperDoc objects, so do them first */
    calculate_statistics(mr);

    if (mr->hyperdoc) {
        HTList * walk = mr->hyperdoc;
        HyperDoc * doc;
        while ((doc = (HyperDoc *) HTList_nextObject(walk)) != NULL)
            HyperDoc_delete(doc);
        HTList_delete(mr->hyperdoc);
    }
    if (mr->htext) {
        HTList * walk = mr->htext;
        HText * text;
        while ((text = (HText *) HTList_nextObject(walk)) != NULL)
            HText_free(text);
        HTList_delete(mr->htext);
    }

    /* Report on and close each of the raw log files */
    if ((mr->flags & MR_LOGGING) && SHOW_REAL_QUIET(mr))
        HTTrace("\nRaw Log files:\n");

    if (mr->log) {
        if (SHOW_REAL_QUIET(mr))
            HTTrace("\tLogged %5d entries in general log file `%s\'\n",
                    HTLog_accessCount(mr->log), mr->logfile);
        HTLog_close(mr->log);
    }
    if (mr->ref) {
        if (SHOW_REAL_QUIET(mr))
            HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
                    HTLog_accessCount(mr->ref), mr->reffile);
        HTLog_close(mr->ref);
    }
    if (mr->reject) {
        if (SHOW_REAL_QUIET(mr))
            HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
                    HTLog_accessCount(mr->reject), mr->rejectfile);
        HTLog_close(mr->reject);
    }
    if (mr->notfound) {
        if (SHOW_REAL_QUIET(mr))
            HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
                    HTLog_accessCount(mr->notfound), mr->notfoundfile);
        HTLog_close(mr->notfound);
    }
    if (mr->conneg) {
        if (SHOW_REAL_QUIET(mr))
            HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
                    HTLog_accessCount(mr->conneg), mr->connegfile);
        HTLog_close(mr->conneg);
    }
    if (mr->noalttag) {
        if (SHOW_REAL_QUIET(mr))
            HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
                    HTLog_accessCount(mr->noalttag), mr->noalttagfile);
        HTLog_close(mr->noalttag);
    }

    /* Never close the standard output stream */
    if (mr->output && mr->output != STDOUT)
        fclose(mr->output);

    if (mr->flags & MR_TIME) {
        time_t now = time(NULL);
        if (SHOW_REAL_QUIET(mr))
            HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&now, YES));
    }

    /* Per-depth counters and the furl string */
    if (mr->cdepth) {
        HT_FREE(mr->cdepth);
    }
    if (mr->furl) {
        HT_FREE(mr->furl);
    }

#ifdef HT_POSIX_REGEX
    /* Compiled regular expressions */
    if (mr->include) {
        regfree(mr->include);
        HT_FREE(mr->include);
    }
    if (mr->exclude) {
        regfree(mr->exclude);
        HT_FREE(mr->exclude);
    }
    if (mr->exc_robot) {
        regfree(mr->exc_robot);
        HT_FREE(mr->exc_robot);
    }
    if (mr->check) {
        regfree(mr->check);
        HT_FREE(mr->check);
    }
#endif

#ifdef HT_MYSQL
    if (mr->sqllog) {
        HTSQLLog_close(mr->sqllog);
        mr->sqllog = NULL;
    }
#endif

    HT_FREE(mr->cwd);
    HT_FREE(mr->prefix);
    HT_FREE(mr->img_prefix);
    HT_FREE(mr);
    return YES;
}

/*
**  Create a new finger object bound to `robot' and `dest' together with
**  a fresh request using `method'. The finger is added to the robot's
**  finger list, installed as the request context, and the robot's count
**  of outstanding requests is incremented.
*/
PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
{
    HTRequest * req = HTRequest_new();
    Finger * finger = (Finger *) HT_CALLOC(1, sizeof(Finger));
    if (finger == NULL)
        HT_OUTOFMEM("Finger_new");

    finger->robot = robot;
    finger->request = req;
    finger->dest = dest;
    HTList_addObject(robot->fingers, (void *) finger);

    /* The request carries the finger as its context */
    HTRequest_setContext (req, finger);

    /* Customize the request according to the robot flags */
    if (robot->flags & MR_PREEMPTIVE) {
        HTRequest_setPreemptive(req, YES);
    }
    if (robot->flags & MR_VALIDATE) {
        HTRequest_setReloadMode(req, HT_CACHE_VALIDATE);
    }
    if (robot->flags & MR_END_VALIDATE) {
        HTRequest_setReloadMode(req, HT_CACHE_END_VALIDATE);
    }

    /* We wanna make sure that we are sending a Host header (default) */
    HTRequest_addRqHd(req, HT_C_HOST);

    HTRequest_setMethod(req, method);
    robot->cnt++;
    return finger;
}

/*
**  Remove a finger from its robot, decrement the robot's count of
**  outstanding requests, delete the finger's request (if any) and free
**  the finger. Always returns YES.
*/
PRIVATE int Finger_delete (Finger * me)
{
    Robot * robot = me->robot;
    HTRequest * req = me->request;

    HTList_removeObject(robot->fingers, (void *) me);
    robot->cnt--;

    if (req) {
        /* If we are down at one request then flush the output buffer */
        if (robot->cnt == 1)
            HTRequest_forceFlush(req);
        HTRequest_delete(req);
    }

    HT_FREE(me);
    return YES;
}

/*
**  Delete the robot and the libwww profile (and close the memory log when
**  compiled with HT_MEMLOG), then exit the program. This function does
**  not return. A zero `status' becomes exit code 1 on VMS and 0
**  elsewhere; any other status is passed through.
*/
PUBLIC void Cleanup (Robot * me, int status)
{
#ifdef VMS
    int exit_code = status ? status : 1;
#else
    int exit_code = status ? status : 0;
#endif

    Robot_delete(me);
    HTProfile_delete();
#ifdef HT_MEMLOG
    HTMemLog_close();
#endif

    exit(exit_code);
}

#ifdef CATCH_SIG
#include <signal.h>
/*								    SetSignal
**  This function sets up signal handlers. This might not be necessary to
**  call if the application has its own handlers (lossage on SVR4)
*/
PUBLIC void SetSignal (void)
{
    /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
    ** when attempting to connect to a remote host where you normally should
    ** get `connection refused' back
    */
    int failed = (signal(SIGPIPE, SIG_IGN) == SIG_ERR);

    if (PROT_TRACE) {
	if (failed)
	    HTTrace("HTSignal.... Can't catch SIGPIPE\n");
	else
	    HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif

}
#endif /* CATCH_SIG */

#ifdef HT_POSIX_REGEX
/*
**  Return a newly allocated string holding the regerror() message for
**  `errcode'. The caller frees the result.
*/
PRIVATE char * get_regerror (int errcode, regex_t * compiled)
{
    /* A first call with no buffer reports the size that is needed */
    size_t needed = regerror (errcode, compiled, NULL, 0);
    char * msg = (char *) HT_MALLOC(needed+1);

    if (msg == NULL)
        HT_OUTOFMEM("get_regerror");
    (void) regerror (errcode, compiled, msg, needed);
    return msg;
}

/*
**  Compile `regex_str' with the given regcomp() flags and return the
**  newly allocated regex_t. Returns NULL for a NULL or empty expression.
**  If compilation fails the error is traced (unless MR_REAL_QUIET is set)
**  and Cleanup() is called with status -1.
*/
PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
{
    regex_t * compiled;
    int rc;

    if (!regex_str || !*regex_str)
        return NULL;

    compiled = (regex_t *) HT_CALLOC(1, sizeof(regex_t));
    if (compiled == NULL)
        HT_OUTOFMEM("get_regtype");

    rc = regcomp(compiled, regex_str, cflags);
    if (rc != 0) {
        char * reason = get_regerror(rc, compiled);
        if (SHOW_REAL_QUIET(mr))
            HTTrace("Regular expression error: %s\n", reason);
        HT_FREE(reason);
        Cleanup(mr, -1);
    }
    return compiled;
}
#endif

/*
**  Print the version banner (webbot version, libwww version and pointers
**  to further information) through OutputData().
*/
PUBLIC void VersionInfo (void)
{
    /* Banner */
    (void) OutputData("\nW3C OpenSource Software");
    (void) OutputData("\n-----------------------\n\n");

    /* Versions of the application and of the library it runs on */
    (void) OutputData("\tWebbot version %s\n", APP_VERSION);
    (void) OutputData("\tusing the W3C libwww library version %s.\n\n",
                      HTLib_version());

    /* Where to find help and where to send feedback */
    (void) OutputData("\tSee \"%s\" for help\n", COMMAND_LINE);
    (void) OutputData("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
    (void) OutputData("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");
    (void) OutputData("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
    (void) OutputData("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
}

/*	terminate_handler
**	-----------------
**	This function is registered to handle the result of the request.
**	It traces the finished request, logs it to the SQL log (HT_MYSQL
**	only), logs content negotiation variants when a conneg log is open
**	and updates the robot's GET/HEAD/other document and byte counters.
**	`param' is not used. Always returns HT_OK.
*/
PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
			       void * param, int status) 
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    HTMethod method = HTRequest_method(request);
    if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));

#ifdef HT_MYSQL
    if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
#endif

    /* Check if negotiated resource and whether we should log that*/
    if (mr->conneg) {
	HTAssocList * cur = HTResponse_variant(response);
	if (cur) {
	    BOOL first = YES;
	    HTChunk * buffer = HTChunk_new(128);
	    char * uri = HTAnchor_address((HTAnchor *) finger->dest);
	    HTAssoc * pres;

	    /* The address may be unavailable; don't hand NULL to the chunk */
	    if (uri) HTChunk_puts(buffer, uri);
	    while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
		char * value = HTAssoc_value(pres);
		if (first) {
		    HTChunk_puts(buffer, "\t(");
		    first = NO;
		} else
		    HTChunk_puts(buffer, ", ");

		/* Output the name */
		HTChunk_puts(buffer, HTAssoc_name(pres));

		/* Only output the value if not empty string */
		if (value && *value) {
		    HTChunk_puts(buffer, "=");
		    HTChunk_puts(buffer, value);
		}
	    }
	    if (!first) HTChunk_puts(buffer, ")");
	    HTLog_addLine(mr->conneg, HTChunk_data(buffer));
	    HTChunk_delete(buffer);
	    HT_FREE(uri);
	}
    }

    /*
    **  Count the amount of body data that we have read. The length is
    **  kept in a long so that large documents are not truncated before
    **  they are added to the byte counters.
    */
    if (method == METHOD_GET) {
	long length = HTAnchor_length(HTRequest_anchor(request));
	if (length > 0) mr->get_bytes += length;
	mr->get_docs++;
    } else if (method == METHOD_HEAD) {
	long length = HTAnchor_length(HTRequest_anchor(request));
	if (length > 0) mr->head_bytes += length;
	mr->head_docs++;
    } else {
	mr->other_docs++;
    }

    if (SHOW_QUIET(mr)) HTTrace("             %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
    return HT_OK;

}
/*	my_terminate_handler
**	--------------------
**	Request termination handler driving the queue. The error state of
**	the request is copied into the destination's HyperDoc. If the
**	request was a HEAD and the document's depth is below the robot's
**	depth limit, the document is put on the queue again with method GET.
**	The finger is then deleted and, unless the robot runs in preemptive
**	mode, the queue is served. `response', `param' and `status' are not
**	used. Always returns HT_OK.
*/
PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,
			       void * param, int status) 
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    HyperDoc * hd = HTAnchor_document(finger->dest);

    if (hd) {
        int depth = hd->depth;

        set_error_state_hyperdoc(hd, request);

        if (HTRequest_method(request) == METHOD_HEAD && depth < mr->depth) {
            hd->method = METHOD_GET;
            HTQueue_append(mr->queue, (void *) hd);
            mr->cq++;
        }
    }

    Finger_delete(finger);

    if ((mr->flags & MR_PREEMPTIVE) == 0)
        Serving_queue(mr);

    return HT_OK;
}


/*	Serving_queue
**	-------------
**	Take HyperDoc objects off the robot's queue one at a time and start
**	a request for each, until the queue is empty or yields no object.
**	Afterwards, if no requests are outstanding (or the robot runs in
**	preemptive mode), Cleanup() is called and the program exits.
*/
PUBLIC void Serving_queue(Robot *mr)
{
    BOOL drained = NO;

    while (!drained) {
        HyperDoc *next = NULL;

        if (!HTQueue_isEmpty(mr->queue))
            next = (HyperDoc *) HTQueue_headOfQueue(mr->queue);

        if (next == NULL) {
            drained = YES;
        } else {
            char *address = HTAnchor_address((HTAnchor *) next->anchor);
            Finger *worker;
            HTRequest *req;

            HTQueue_dequeue(mr->queue);
            mr->cq--;

            worker = Finger_new(mr, next->anchor, next->method);
            req = worker->request;

            if (SHOW_QUIET(mr)) HTTrace("Request from QUEUE  %s\n", address);
            HT_FREE(address);
            if (SHOW_QUIET(mr)) HTTrace("%d elements in queue \n", mr->cq);

            HTRequest_setParent(req, get_last_parent(next->anchor));

            /* Optional pause between requests */
            if (mr->waits)
                sleep(mr->waits);

            if (HTLoadAnchor((HTAnchor *) next->anchor, req) != YES) {
                if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
                Finger_delete(worker);
            }
        }
    }

    if (SHOW_QUIET(mr)) HTTrace("Queue size: %d \n", mr->cq);

    if (mr->cnt <= 0 || (drained && (mr->flags & MR_PREEMPTIVE))) {
        if (mr->cnt > 0) {
            if (SHOW_QUIET(mr)) HTTrace("%d requests were not served\n", mr->cnt);
        }

        if (SHOW_QUIET(mr)) HTTrace("             Everything is finished...\n");
        Cleanup(mr, 0);			/* No way back from here */
    }
}

/* ------------------------------------------------------------------------- */
/*				HTEXT INTERFACE				     */
/* ------------------------------------------------------------------------- */

/*
**  Create the HText object for a document being parsed. The object is
**  bound to the request and marked as "follow links" unless the anchor's
**  robots information contains a "nofollow" token (compared without
**  regard to case). The object is added to the robot's htext list,
**  which is created on demand. `stream' is not used.
*/
PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
			   HTStream * stream)
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    char * robots;
    HText * text = (HText *) HT_CALLOC(1, sizeof(HText));

    if (text == NULL)
        HT_OUTOFMEM("HText_new2");

    /* Bind the HText object together with the Request Object */
    text->request = request;
    text->follow = YES;

    /* Look through the robots tokens, working on a private copy */
    robots = HTAnchor_robots(anchor);
    if (robots != NULL) {
        char * copy = NULL;
        char * cursor;
        char * field;

        StrAllocCopy(copy, robots);
        cursor = copy;
        for (field = HTNextField(&cursor);
             field != NULL;
             field = HTNextField(&cursor)) {
            if (strcasecomp(field, "nofollow") == 0) {
                text->follow = NO;
                break;
            }
        }
        HT_FREE(copy);
    }

    /* Add this HText object to our list */
    if (mr->htext == NULL)
        mr->htext = HTList_new();
    HTList_addObject(mr->htext, (void *) text);
    return text;
}

/* Free an HText object; calling it with NULL does nothing */
PUBLIC void HText_free (HText * me) {
    if (me != NULL) {
        HT_FREE (me);
    }
}

/*
**  Called for an anchor found in the document belonging to `text'.
**  If the link destination already has a HyperDoc its hit count is
**  increased and nothing else happens. Otherwise a HyperDoc is created
**  for the destination and, if the MR_LINK flag is set and the URI passes
**  the prefix/regex constraints, contains no blanks and the document
**  limit is not exhausted, it is put on the robot's queue with method
**  HEAD. Destinations that are not queued are written to the reject log
**  (and SQL log) when those are enabled.
*/
PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
{
    if (text && anchor) {
	Finger * finger = (Finger *) HTRequest_context(text->request);
	Robot * mr = finger->robot;
	HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
	HTParentAnchor * dest_parent = HTAnchor_parent(dest);
	char * uri = HTAnchor_address((HTAnchor *) dest_parent);
	HyperDoc * hd = HTAnchor_document(dest_parent);
	HTParentAnchor * referer = HTRequest_anchor(text->request);
	/* Start from the document's own follow setting (see HText_new2) */
	BOOL match = text->follow;
	/* Only assigned inside the HT_POSIX_REGEX section; not read in this function */
	BOOL check = NO;

	/* HyperDoc created for a new destination, and the follow decision */
	HyperDoc * nhd = NULL;
	BOOL follow = YES;

	/*
	**  The depth of the destination is one more than the depth of the
	**  document being parsed, or 0 if that document has no HyperDoc
	*/
	/*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
	HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
	HyperDoc * last_doc = HTAnchor_document(last_anchor);
	int depth = last_doc ? last_doc->depth+1 : 0;

	/* Without an address there is nothing to check or log */
	if (!uri) return;
	if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");

	/* Destination seen before: just count the hit and stop */
        if (hd) {
	    if (SHOW_QUIET(mr)) HTTrace("............ Already checked\n");
            hd->hits++;
#ifdef HT_MYSQL
	    if (mr->sqllog) {
		char * ref_addr = HTAnchor_address((HTAnchor *) referer);
		if (ref_addr) {
		    HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
						 "referer", NULL);
		    HT_FREE(ref_addr);
		}
	    }
#endif
	    HT_FREE(uri);
	    return;
	}

	/* Check for prefix match */
	if (match && mr->prefix) {
	    match = HTStrMatch(mr->prefix, uri) ? YES : NO;
	}

#ifdef HT_POSIX_REGEX
	/*
	**  Check for any regular expression. The include may override
	**  the prefix matching. regexec() returns 0 on a match, so a hit
	**  on include sets match while a hit on exc_robot or exclude
	**  clears it.
	*/
	if (mr->include) {
	    match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
	}
	if (match && mr->exc_robot) {
	    match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
	}
	if (match && mr->exclude) {
	    match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
	}
	if (match && mr->check) {
	    check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
	}
#endif
	/* Do not follow URIs containing blanks, nor anything once ndoc is 0 */
	if(uri && test_for_blank_spaces(uri))
	  follow = NO;
	else if (mr->ndoc == 0) /* Number of Documents is reached */
	  follow = NO;

	/*
	**  Create the HyperDoc for the new destination (whether or not it
	**  will be followed) and count it at its depth.
	**  NOTE(review): cdepth is indexed with depth without a bounds or
	**  NULL check here - confirm its allocation elsewhere covers this.
	*/
	if(!hd && dest_parent)
	  {
	    nhd = HyperDoc_new(mr, dest_parent, depth);
	    mr->cdepth[depth]++;
	  }

	/* Queue a HEAD request if MR_LINK is set and all constraints hold */
        if (mr->flags & MR_LINK && match && dest_parent
	    && follow && !hd ) {

          nhd->method = METHOD_HEAD;
	  HTQueue_enqueue(mr->queue, (void *)nhd); (mr->cq)++;
	  /* A positive ndoc is counted down per queued document */
	  if(mr->ndoc > 0)      mr->ndoc--;

	} else {
	    if (SHOW_QUIET(mr)) HTTrace("............ does not fulfill constraints\n");
	    /* The opening brace of this `if' comes from either #ifdef branch */
#ifdef HT_MYSQL
	    if (mr->reject || mr->sqllog) {
#else	
	    if (mr->reject) {
#endif
		if (referer) {
		    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
		    if (mr->reject && ref_addr)
			HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
		    if (mr->sqllog && mr->sqlexternals && ref_addr)
			HTSQLLog_addLinkRelationship(mr->sqllog,
						     ref_addr, uri,
						     "referer", NULL);
#endif

		    HT_FREE(ref_addr);
		}
	    }
	}
	HT_FREE(uri);
    }
}

/*	Handle an image reference
**	-------------------------
**	Does nothing unless both `text' and `anchor' are given and the robot
**	has the MR_IMG flag set. Otherwise the destination of `anchor' is
**	resolved and:
**	  - if it already has a HyperDoc, its hit count is bumped;
**	  - else, if it passes the optional image prefix filter, a new finger
**	    and HyperDoc are created and the image is loaded (GET when MR_SAVE
**	    is set, HEAD otherwise), after optionally logging a missing or
**	    empty ALT text to the `noalttag' log;
**	  - else the link is written to the reject log, if one is set.
**	The `align' and `isMap' arguments are not used.
*/
PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
			       const char *alt, const char * align, BOOL isMap)
{
    Finger * cur_finger;
    Robot * robot;
    HTAnchor * target;
    HTParentAnchor * target_parent;
    HTParentAnchor * referer;
    HyperDoc * known_doc;
    char * uri;
    BOOL accepted = YES;

    if (!text || !anchor) return;

    cur_finger = (Finger *) HTRequest_context(text->request);
    robot = cur_finger->robot;
    if (!(robot->flags & MR_IMG)) return;

    target = HTAnchor_followMainLink((HTAnchor *) anchor);
    target_parent = HTAnchor_parent(target);
    uri = HTAnchor_address((HTAnchor *) target_parent);
    known_doc = HTAnchor_document(target_parent);
    referer = HTRequest_anchor(text->request);

    /* No address for the destination: nothing to check */
    if (!uri) return;

    /* The destination already has a hyperdoc: just count the hit */
    if (known_doc) {
	if (SHOW_QUIET(robot)) HTTrace("............ Already checked\n");
	known_doc->hits++;
#ifdef HT_MYSQL
	if (robot->sqllog) {
	    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
	    if (ref_addr) {
		HTSQLLog_addLinkRelationship(robot->sqllog,
					     ref_addr, uri,
					     "image", alt);
		HT_FREE(ref_addr);
	    }
	}
#endif
	HT_FREE(uri);
	return;
    }

    /* Apply the image prefix filter, if one is configured */
    if (robot->img_prefix)
	accepted = HTStrMatch(robot->img_prefix, uri) ? YES : NO;

    if (!accepted || !target) {

	/* Rejected: record the link in whichever reject logs are active */
	BOOL log_it = robot->reject ? YES : NO;
#ifdef HT_MYSQL
	if (robot->sqllog) log_it = YES;
#endif
	if (SHOW_QUIET(robot)) HTTrace("............ does not fulfill constraints\n");
	if (log_it && referer) {
	    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
	    if (robot->reject && ref_addr)
		HTLog_addText(robot->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
	    if (robot->sqllog && robot->sqlexternals && ref_addr)
		HTSQLLog_addLinkRelationship(robot->sqllog,
					     ref_addr, uri,
					     "image", alt);
#endif
	    HT_FREE(ref_addr);
	}

    } else {

	/* Accepted: set up a new finger and hyperdoc and issue the request */
	Finger * img_finger = Finger_new(robot, target_parent,
					 robot->flags & MR_SAVE ?
					 METHOD_GET : METHOD_HEAD);
	HTRequest * img_request = img_finger->request;
	HyperDoc_new(robot, target_parent, 1);
	HTRequest_setParent(img_request, referer);

	/* Report a missing or empty ALT text when that log is enabled */
	if (robot->noalttag && referer && (alt == NULL || *alt == '\0')) {
	    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
	    if (ref_addr)
		HTLog_addText(robot->noalttag, "%s --> %s\n", ref_addr, uri);
	    HT_FREE(ref_addr);
	}

	if (SHOW_QUIET(robot)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
	if (HTLoadAnchor((HTAnchor *) target, img_request) != YES) {
	    if (SHOW_QUIET(robot)) HTTrace("Robot....... Image not tested!\n");
	    Finger_delete(img_finger);
	}
    }

    HT_FREE(uri);
}

/*	Handle a LINK element
**	---------------------
**	Traces the anchor (unless the robot is quiet) and then hands it to
**	HText_beginAnchor(). Nothing happens when `text' or `anchor' is
**	missing. The `present' and `value' arguments are not used.
*/
PUBLIC void HText_appendLink (HText * text, HTChildAnchor * anchor,
			      const BOOL * present, const char ** value)
{
    Finger * cur_finger;
    Robot * robot;

    if (!text || !anchor) return;

    cur_finger = (Finger *) HTRequest_context(text->request);
    robot = cur_finger->robot;
    if (SHOW_QUIET(robot)) {
	HTTrace("Robot....... Received Link element with anchor %p\n", anchor);
    }
    HText_beginAnchor(text, anchor);
}

/*	Handle a generic HTML element
**	-----------------------------
**	Looks at selected elements for further links to check:
**	  - FRAME: the SRC attribute is handed to HText_beginAnchor()
**	  - BODY:  the BACKGROUND attribute is handed to HText_appendImage()
**	All other elements are ignored.
**
**	`present' and `value' are indexed by the attribute numbers of the
**	element. An anchor is only looked up when the attribute is present
**	and has a value: without an address there is no link to follow.
**	Nothing happens when `text' or its request is missing.
*/
PUBLIC void HText_appendObject (HText * text, int element_number,
	                        const BOOL * present, const char ** value)
{
    /* Here we can look for frames, link tags, meta tags etc. */
    if (text && text->request) {
	Finger * finger = (Finger *) HTRequest_context(text->request);
	Robot * mr = finger->robot;

	if (SHOW_QUIET(mr))
	    HTTrace("Robot....... HText Object %p called with HTML element number %d\n",
		    text, element_number);

	/* Without attribute arrays there is no address to look at */
	if (!present || !value) return;

	switch (element_number) {

	case HTML_FRAME:
	    if (present[HTML_FRAME_SRC] && value[HTML_FRAME_SRC]) {
		HTChildAnchor * source = HTAnchor_findChildAndLink(
		    HTRequest_anchor(text->request),		/* Parent */
		    NULL,					/* Tag */
		    value[HTML_FRAME_SRC],			/* Address */
		    NULL);					/* Rels */
		HText_beginAnchor(text, source);
	    }
	    break;

	case HTML_BODY:
	    if (present[HTML_BODY_BACKGROUND] && value[HTML_BODY_BACKGROUND]) {
		HTChildAnchor * source = HTAnchor_findChildAndLink(
		    HTRequest_anchor(text->request),		/* Parent */
		    NULL,					/* Tag */
		    value[HTML_BODY_BACKGROUND],		/* Address */
		    NULL);					/* Rels */
		HText_appendImage(text, source, NULL, NULL, NO);
	    }
	    break;

	default:
	    break;
	}
    }
}

/*
**  HText callbacks with empty bodies: no action is taken for these events
**  in this file.
**  NOTE(review): presumably these are defined only because the HTML parser
**  expects the complete HText interface to exist - confirm.
*/
PUBLIC void HText_endAnchor (HText * text) {}
PUBLIC void HText_appendText (HText * text, const char * str) {}
PUBLIC void HText_appendCharacter (HText * text, char ch) {}
PUBLIC void HText_endAppend (HText * text) {}
PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
PUBLIC void HText_beginAppend (HText * text) {}
PUBLIC void HText_appendParagraph (HText * text) {}


/*	Fetch a document as a string
**	----------------------------
**	Issues a preemptive GET request for `uri' with WWW_SOURCE as output
**	format, collects the response in a chunk and returns the chunk
**	contents as a C string. The request object is always deleted before
**	returning.
**
**	Returns NULL when `uri' is NULL, when no anchor or request could be
**	obtained, or when the load did not produce a chunk.
**	NOTE(review): the returned string presumably belongs to the caller,
**	who should release it with HT_FREE - confirm against HTChunk.
*/
PUBLIC char *
get_robots_txt(char *uri)
{
  char *str = NULL;
  HTChunk * chunk = NULL;
  HTParentAnchor *anchor = NULL;
  HTRequest *request = NULL;

  /* Never hand a NULL address to the anchor lookup */
  if (!uri) return NULL;

  anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
  if (!anchor) return NULL;

  request = HTRequest_new();
  if (!request) return NULL;
  HTRequest_setOutputFormat(request, WWW_SOURCE);
  HTRequest_setPreemptive(request, YES);
  HTRequest_setMethod(request, METHOD_GET);

  /* Only convert the chunk if the load actually produced one */
  chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
  if (chunk) str = HTChunk_toCString(chunk);

  HTRequest_delete(request);
  return str;
}


Webmaster