/*
** @(#) $Id: HTRobot.c,v 1.82 1999/02/22 22:10:12 frystyk Exp $
**
** W3C Webbot can be found at "http://www.w3.org/Robot/"
**
** Copyright 1995-1998 World Wide Web Consortium, (Massachusetts
** Institute of Technology, Institut National de Recherche en
** Informatique et en Automatique, Keio University). All Rights
** Reserved. This program is distributed under the W3C's Software
** Intellectual Property License. This program is distributed in the hope
** that it will be useful, but WITHOUT ANY WARRANTY; without even the
** implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
** PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
** details.
**
** Authors:
** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
** BR Bob Racko
** JP John Punin
**
** History:
** Dec 04 95 First version
** Oct 1998 Split into separate files
*/
#include "HTRobMan.h"
#include "HTQueue.h"
#include "HTAncMan.h"
/* Non-zero when a robot is given and its MR_QUIET flag is not set */
#define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
/* Non-zero when a robot is given and its MR_REAL_QUIET flag is not set */
#define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
/* Error message table; set_error_state_hyperdoc() reads the `code' field */
PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};
/*
** Comparison callbacks used when sorting the anchor array and the
** distribution lists (defined further down in this file)
*/
PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
/*
** The callbacks that we need from the libwww HTML parser
*/
PRIVATE HText_new RHText_new;
PRIVATE HText_delete RHText_delete;
PRIVATE HText_foundLink RHText_foundLink;
/* ------------------------------------------------------------------------- */
/*	Create a "HyperDoc" object
**	--------------------------
**	A HyperDoc records the search depth, hit count, status code and a
**	running index for one anchor. The new object is attached to the
**	anchor as its document and appended to the robot's hyperdoc list
**	(the list is created on demand).
**
**	mr	robot owning the hyperdoc list and the index counter
**	anchor	anchor the HyperDoc is bound to
**	depth	depth in the search at which the anchor was found
**
**	Returns the new HyperDoc.
*/
PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
{
    HyperDoc * doc = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc));
    if (doc == NULL)
        HT_OUTOFMEM("HyperDoc_new");

    /* Initial bookkeeping: first hit, no status code yet */
    doc->depth = depth;
    doc->hits = 1;
    doc->code = -1;

    /* Hand out the next running index */
    mr->cindex += 1;
    doc->index = mr->cindex;

    /* Tie the HyperDoc and the anchor to each other */
    doc->anchor = anchor;
    HTAnchor_setDocument(anchor, (void *) doc);

    /* Remember the object so that it can be released later on */
    if (mr->hyperdoc == NULL)
        mr->hyperdoc = HTList_new();
    HTList_addObject(mr->hyperdoc, (void *) doc);

    return doc;
}
/*	Delete a "HyperDoc" object
**	--------------------------
**	Frees the object. Returns YES if something was freed and NO when
**	called with NULL.
*/
PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
{
    if (!hd)
        return NO;
    HT_FREE(hd);
    return YES;
}
/*
**	Sort the anchor array with HitSort and write one line per anchor
**	("<hits> <uri>") to the robot's hit file. Anchors without an address
**	or without a HyperDoc are skipped.
**
**	Returns NO if either argument is NULL and YES otherwise, also when
**	the log file could not be opened.
*/
PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
{
    HTLog * hitlog;

    if (!mr || !array)
        return NO;

    hitlog = HTLog_open(mr->hitfile, YES, YES);
    if (hitlog != NULL) {
        void ** pos = NULL;
        HTParentAnchor * cur;

        HTArray_sort(array, HitSort);
        for (cur = (HTParentAnchor *) HTArray_firstObject(array, pos);
             cur != NULL;
             cur = (HTParentAnchor *) HTArray_nextObject(array, pos)) {
            char * address = HTAnchor_address((HTAnchor *) cur);
            HyperDoc * doc = (HyperDoc *) HTAnchor_document(cur);
            if (address && doc)
                HTLog_addText(hitlog, "%8d %s\n", doc->hits, address);
            HT_FREE(address);
        }
    }
    HTLog_close(hitlog);
    return YES;
}
PRIVATE int HitSort (const void * a, const void * b)
{
HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
if (aa && bb) return (bb->hits - aa->hits);
return bb - aa;
}
/*
** Sort the anchor array and log link relations
*/
PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
{
if (mr && array) {
HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
void ** data = NULL;
HTParentAnchor * anchor = NULL;
anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
while (anchor) {
/*
** If we have a specific link relation to look for then do this.
** Otherwise look for all link relations.
*/
if (mr->relation) {
HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
if (link) {
HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
char * src_uri = HTAnchor_address((HTAnchor *) anchor);
char * dest_uri = HTAnchor_address((HTAnchor *) dest);
if (src_uri && dest_uri) {
#ifdef HT_MYSQL
if (mr->sqllog) {
HTSQLLog_addLinkRelationship (mr->sqllog,
src_uri, dest_uri,
HTAtom_name(mr->relation),
NULL);
}
#endif
if (log) {
HTFormat format = HTAnchor_format(dest);
HTLog_addText(log, "%s %s %s --> %s\n",
HTAtom_name(mr->relation),
format != WWW_UNKNOWN ?
HTAtom_name(format) : "<unknown>",
src_uri, dest_uri);
}
/* Cleanup */
HT_FREE(src_uri);
HT_FREE(dest_uri);
}
}
} else {
HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
char * src_uri = HTAnchor_address((HTAnchor *) anchor);
HTLinkType linktype;
/* First look in the main link */
if (link && (linktype = HTLink_type(link))) {
HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
char * dest_uri = HTAnchor_address((HTAnchor *) dest);
if (src_uri && dest_uri) {
#ifdef HT_MYSQL
if (mr->sqllog) {
HTSQLLog_addLinkRelationship (mr->sqllog,
src_uri, dest_uri,
HTAtom_name(linktype),
NULL);
}
#endif
if (log) {
HTFormat format = HTAnchor_format(dest);
HTLog_addText(log, "%s %s %s --> %s\n",
HTAtom_name(linktype),
format != WWW_UNKNOWN ?
HTAtom_name(format) : "<unknown>",
src_uri, dest_uri);
}
}
HT_FREE(dest_uri);
}
/* and then in any sublinks */
if (sublinks) {
HTLink * pres;
while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
if ((linktype = HTLink_type(pres))) {
HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
char * dest_uri = HTAnchor_address((HTAnchor *) dest);
if (src_uri && dest_uri) {
#ifdef HT_MYSQL
if (mr->sqllog) {
HTSQLLog_addLinkRelationship (mr->sqllog,
src_uri, dest_uri,
HTAtom_name(linktype),
NULL);
}
#endif
if (log) {
HTFormat format = HTAnchor_format(dest);
HTLog_addText(log, "%s %s %s --> %s\n",
HTAtom_name(linktype),
format != WWW_UNKNOWN ?
HTAtom_name(format) : "<unknown>",
src_uri, dest_uri);
}
HT_FREE(dest_uri);
}
}
}
}
/* Cleanup */
HT_FREE(src_uri);
}
anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
}
if (log) HTLog_close(log);
return YES;
}
return NO;
}
/*
**	Sort the anchor array with LastModifiedSort and write one line per
**	anchor ("<date> <uri>") to the robot's last-modified file. Anchors
**	without an address or with a last-modified value that is not
**	positive are skipped.
**
**	Returns NO if either argument is NULL and YES otherwise, also when
**	the log file could not be opened.
*/
PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
{
    HTLog * lmlog;

    if (!mr || !array)
        return NO;

    lmlog = HTLog_open(mr->lmfile, YES, YES);
    if (lmlog != NULL) {
        void ** pos = NULL;
        HTParentAnchor * cur;

        HTArray_sort(array, LastModifiedSort);
        for (cur = (HTParentAnchor *) HTArray_firstObject(array, pos);
             cur != NULL;
             cur = (HTParentAnchor *) HTArray_nextObject(array, pos)) {
            char * address = HTAnchor_address((HTAnchor *) cur);
            time_t modified = HTAnchor_lastModified(cur);
            if (address && modified > 0)
                HTLog_addText(lmlog, "%s %s\n", HTDateTimeStr(&modified, NO), address);
            HT_FREE(address);
        }
    }
    HTLog_close(lmlog);
    return YES;
}
PRIVATE int LastModifiedSort (const void * a, const void * b)
{
time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
return bb - aa;
}
/*
**	Sort the anchor array with TitleSort and write one line per anchor
**	("<charset> `<title>' <uri>") to the robot's title file. A missing
**	charset or title is written as "<none>"; anchors without an address
**	are skipped.
**
**	Returns NO if either argument is NULL and YES otherwise, also when
**	the log file could not be opened.
*/
PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
{
    HTLog * titlelog;

    if (!mr || !array)
        return NO;

    titlelog = HTLog_open(mr->titlefile, YES, YES);
    if (titlelog != NULL) {
        void ** pos = NULL;
        HTParentAnchor * cur;

        HTArray_sort(array, TitleSort);
        for (cur = (HTParentAnchor *) HTArray_firstObject(array, pos);
             cur != NULL;
             cur = (HTParentAnchor *) HTArray_nextObject(array, pos)) {
            char * address = HTAnchor_address((HTAnchor *) cur);
            const char * heading = HTAnchor_title(cur);
            HTCharset cset = HTAnchor_charset(cur);
            if (address)
                HTLog_addText(titlelog, "%s `%s\' %s\n",
                              cset ? HTAtom_name(cset) : "<none>",
                              heading ? heading : "<none>",
                              address);
            HT_FREE(address);
        }
    }
    HTLog_close(titlelog);
    return YES;
}
/*
**	Comparison callback ordering anchors by title, case-insensitively and
**	with the operands swapped (b compared against a). A missing title is
**	treated as the empty string. `a' and `b' each point to an
**	HTParentAnchor pointer.
*/
PRIVATE int TitleSort (const void * a, const void * b)
{
    const char * left = HTAnchor_title(*(HTParentAnchor **) a);
    const char * right = HTAnchor_title(*(HTParentAnchor **) b);

    if (!left) left = "";
    if (!right) right = "";
    return strcasecomp(right, left);
}
/*
**	Calculate the distribution of media types over the anchor array.
**	The same mechanism can be used for other characteristics with
**	relatively few outcomes.
**
**	Returns a new list of MetaDist entries, one per media type seen, each
**	carrying the number of anchors having that type; the list is
**	re-sorted with FormatSort whenever an entry is added. Anchors whose
**	format is unset or WWW_UNKNOWN are not counted. Returns NULL when no
**	array is given. Release the result with delete_meta_distribution().
*/
PRIVATE HTList * mediatype_distribution (HTArray * array)
{
    HTList * dist;
    void ** pos = NULL;
    HTParentAnchor * cur;

    if (!array)
        return NULL;

    dist = HTList_new();
    for (cur = (HTParentAnchor *) HTArray_firstObject(array, pos);
         cur != NULL;
         cur = (HTParentAnchor *) HTArray_nextObject(array, pos)) {
        HTFormat format = HTAnchor_format(cur);
        HTList * walker = dist;
        MetaDist * entry;

        if (!format || format == WWW_UNKNOWN)
            continue;

        /* Look for an existing entry for this media type */
        while ((entry = (MetaDist *) HTList_nextObject(walker)) != NULL) {
            if (entry->name == format)
                break;
        }

        if (entry != NULL) {
            entry->hits++;
        } else {
            /* First occurrence: add a new entry and keep the list sorted */
            if ((entry = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
                HT_OUTOFMEM("mediatype_distribution");
            entry->name = format;
            entry->hits = 1;
            HTList_addObject(dist, entry);
            HTList_insertionSort(dist, FormatSort);
        }
    }
    return dist;
}
/*
**	Calculate the distribution of charsets over the anchor array.
**	The same mechanism can be used for other characteristics with
**	relatively few outcomes.
**
**	Returns a new list of MetaDist entries, one per charset seen, each
**	carrying the number of anchors having that charset; the list is
**	re-sorted with FormatSort whenever an entry is added. Anchors without
**	a charset are not counted. Returns NULL when no array is given.
**	Release the result with delete_meta_distribution().
*/
PRIVATE HTList * charset_distribution (HTArray * array)
{
    HTList * dist;
    void ** pos = NULL;
    HTParentAnchor * cur;

    if (!array)
        return NULL;

    dist = HTList_new();
    for (cur = (HTParentAnchor *) HTArray_firstObject(array, pos);
         cur != NULL;
         cur = (HTParentAnchor *) HTArray_nextObject(array, pos)) {
        HTCharset charset = HTAnchor_charset(cur);
        HTList * walker = dist;
        MetaDist * entry;

        if (!charset)
            continue;

        /* Look for an existing entry for this charset */
        while ((entry = (MetaDist *) HTList_nextObject(walker)) != NULL) {
            if (entry->name == charset)
                break;
        }

        if (entry != NULL) {
            entry->hits++;
        } else {
            /* First occurrence: add a new entry and keep the list sorted */
            if ((entry = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
                HT_OUTOFMEM("charset_distribution");
            entry->name = charset;
            entry->hits = 1;
            HTList_addObject(dist, entry);
            HTList_insertionSort(dist, FormatSort);
        }
    }
    return dist;
}
/*
**	Comparison callback for MetaDist entries: compares the atom names of
**	the two entries with strcmp(), operands swapped (b against a).
**	`a' and `b' are the MetaDist objects themselves.
*/
PRIVATE int FormatSort (const void * a, const void * b)
{
    const MetaDist * first = (const MetaDist *) a;
    const MetaDist * second = (const MetaDist *) b;
    const char * first_name = HTAtom_name(first->name);
    const char * second_name = HTAtom_name(second->name);

    return strcmp(second_name, first_name);
}
/*
**	Write a distribution list to `logfile', one line per named entry
**	("<hits> <name>").
**
**	Returns YES when the log file was opened and the entries were
**	written, NO when an argument is missing or the file could not be
**	opened.
*/
PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
{
    if (logfile && distribution) {
        HTLog * log = HTLog_open(logfile, YES, YES);
        if (log) {
            HTList * cur = distribution;
            MetaDist * pres;
            while ((pres = (MetaDist *) HTList_nextObject(cur))) {
                if (pres->name) {
                    HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
                }
            }
            HTLog_close(log);
            return YES;
        }
    }
    return NO;
}
/*
**	Free every MetaDist entry of a distribution list and then the list
**	itself. Returns NO when called with NULL and YES otherwise.
*/
PRIVATE BOOL delete_meta_distribution (HTList * distribution)
{
    HTList * walker = distribution;
    MetaDist * entry;

    if (!distribution)
        return NO;

    while ((entry = (MetaDist *) HTList_nextObject(walker)) != NULL) {
        HT_FREE(entry);
    }
    HTList_delete(distribution);
    return YES;
}
/*	Statistics
**	----------
**	Calculates a bunch of statistics for the anchors traversed.
**
**	First prints the overall throughput (documents, seconds, requests per
**	second, bytes) if the robot recorded a start time. Then, if more than
**	one document was accessed, fetches the array of existing anchors and
**	writes every distribution for which the robot has a file configured:
**	hit counts, link relations, last modified dates, titles, media types
**	and charsets.
**
**	Returns NO when called without a robot and YES otherwise.
*/
PRIVATE BOOL calculate_statistics (Robot * mr)
{
    long total_docs;

    /* Must be checked before any field of the robot is read */
    if (!mr) return NO;
    total_docs = mr->get_docs + mr->head_docs + mr->other_docs;

    /* Calculate efficiency */
    if (mr->time > 0) {
        ms_t t = HTGetTimeInMillis() - mr->time;
        if (t > 0) {
            double loadfactor = (mr->get_bytes / (t * 0.001));
            double reqprsec = (total_docs / (t * 0.001));
            double secs = t / 1000.0;
            char bytes[50];
            if (SHOW_REAL_QUIET(mr))
                HTPrint("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
                        total_docs, secs, reqprsec);

            HTNumToStr(mr->get_bytes, bytes, 50);
            if (SHOW_REAL_QUIET(mr))
                HTPrint("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
                        mr->get_docs, bytes, loadfactor);

            HTNumToStr(mr->head_bytes, bytes, 50);
            if (SHOW_REAL_QUIET(mr))
                HTPrint("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
                        mr->head_docs, bytes);
        }
    }

    /* Create an array of existing anchors */
    if (total_docs > 1) {
        HTArray * array = HTAnchor_getArray(total_docs);
        if (array) {
            /* Link relations go to a file and, with HT_MYSQL, to the SQL log */
            BOOL want_relations = mr->relfile ? YES : NO;
#ifdef HT_MYSQL
            if (mr->sqllog) want_relations = YES;
#endif

            /* Distributions */
            if (mr->flags & MR_DISTRIBUTIONS) {
                if (SHOW_REAL_QUIET(mr)) HTPrint("\nDistributions:\n");
            }

            /* Sort after hit counts */
            if (mr->hitfile) {
                if (SHOW_REAL_QUIET(mr))
                    HTPrint("\tLogged hit count distribution in file `%s\'\n",
                            mr->hitfile);
                calculate_hits(mr, array);
            }

            /* Sort after link relations */
            if (want_relations) {
                if (mr->relfile && SHOW_REAL_QUIET(mr))
                    HTPrint("\tLogged link relationship distribution in file `%s\'\n",
                            mr->relfile);
                calculate_linkRelations(mr, array);
            }

            /* Sort after modified date */
            if (mr->lmfile) {
                if (SHOW_REAL_QUIET(mr))
                    HTPrint("\tLogged last modified distribution in file `%s\'\n",
                            mr->lmfile);
                calculate_lm(mr, array);
            }

            /* Sort after title */
            if (mr->titlefile) {
                if (SHOW_REAL_QUIET(mr))
                    HTPrint("\tLogged title distribution in file `%s\'\n",
                            mr->titlefile);
                calculate_title(mr, array);
            }

            /* Find mediatype distribution */
            if (mr->mtfile) {
                HTList * mtdist = mediatype_distribution(array);
                if (mtdist) {
                    if (SHOW_REAL_QUIET(mr))
                        HTPrint("\tLogged media type distribution in file `%s\'\n",
                                mr->mtfile);
                    log_meta_distribution(mr->mtfile, mtdist);
                    delete_meta_distribution(mtdist);
                }
            }

            /* Find charset distribution */
            if (mr->charsetfile) {
                HTList * charsetdist = charset_distribution(array);
                if (charsetdist) {
                    if (SHOW_REAL_QUIET(mr))
                        HTPrint("\tLogged charset distribution in file `%s\'\n",
                                mr->charsetfile);
                    log_meta_distribution(mr->charsetfile, charsetdist);
                    delete_meta_distribution(charsetdist);
                }
            }

            /* Add as many other stats here as you like */
            /* ... */

            /* Delete the array */
            HTArray_delete(array);
        }
    }
    return YES;
}
/*
**	Return the parent anchor of the first object delivered by
**	HTList_nextObject() on the anchor's source list, or NULL when the
**	list yields nothing.
*/
PRIVATE HTParentAnchor *
get_last_parent(HTParentAnchor *anchor)
{
    HTList *walker = anchor->sources;
    HTAnchor *source = (HTAnchor *) HTList_nextObject(walker);

    if (source == NULL)
        return NULL;
    return HTAnchor_parent(source);
}
/*
**	Walk the error list of the request and store the code of each error
**	(as found in the HTErrors table) in hd->code. The code of the last
**	error visited is the one that remains; hd->code is left untouched
**	when the list is empty.
*/
PRIVATE void
set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
{
    HTList * walker = HTRequest_error(request);
    HTError * err;

    for (err = (HTError *) HTList_nextObject(walker);
         err != NULL;
         err = (HTError *) HTList_nextObject(walker)) {
        hd->code = HTErrors[HTError_index(err)].code;
    }
}
/*
**	Return 1 if the string contains at least one space character (' ')
**	and 0 otherwise. Only the plain space is looked for; other white
**	space such as tabs does not count.
*/
PRIVATE int
test_for_blank_spaces(char *uri)
{
    char *scan = uri;

    while (*scan != '\0') {
        if (*scan == ' ')
            return 1;
        scan++;
    }
    return 0;
}
/*	Create a Command Line Object
**	----------------------------
**	Allocates a zeroed Robot and sets up its defaults: empty hyperdoc,
**	htext and finger lists, an empty queue, the default timeout, the
**	current directory URL and the default output. The document limit
**	(ndoc) starts out as -1.
*/
PUBLIC Robot * Robot_new (void)
{
    Robot * robot = (Robot *) HT_CALLOC(1, sizeof(Robot));
    if (robot == NULL)
        HT_OUTOFMEM("Robot_new");

    /* Book-keeping lists */
    robot->hyperdoc = HTList_new();
    robot->htext = HTList_new();

    /* Timing, location and output defaults */
    robot->timer = DEFAULT_TIMEOUT*MILLIES;
    robot->waits = 0;
    robot->cwd = HTGetCurrentDirectoryURL();
    robot->output = OUTPUT;

    /* Request accounting */
    robot->cnt = 0;
    robot->ndoc = -1;
    robot->fingers = HTList_new();

    /* Queue of documents waiting to be requested */
    robot->queue = HTQueue_new();
    robot->cq = 0;
    robot->furl = NULL;

    return robot;
}
/* Delete a Command Line Object
** ----------------------------
*/
PRIVATE BOOL Robot_delete (Robot * mr)
{
if (mr) {
HTList_delete(mr->fingers);
/* Calculate statistics */
calculate_statistics(mr);
if (mr->hyperdoc) {
HTList * cur = mr->hyperdoc;
HyperDoc * pres;
while ((pres = (HyperDoc *) HTList_nextObject(cur)))
HyperDoc_delete(pres);
HTList_delete(mr->hyperdoc);
}
if (mr->htext) {
HTList * cur = mr->htext;
HText * pres;
while ((pres = (HText *) HTList_nextObject(cur)))
RHText_delete(pres);
HTList_delete(mr->htext);
}
/* Close all the log files */
if (mr->flags & MR_LOGGING) {
if (SHOW_REAL_QUIET(mr)) HTPrint("\nRaw Log files:\n");
}
if (mr->log) {
if (SHOW_REAL_QUIET(mr))
HTPrint("\tLogged %5d entries in general log file `%s\'\n",
HTLog_accessCount(mr->log), mr->logfile);
HTLog_close(mr->log);
}
if (mr->ref) {
if (SHOW_REAL_QUIET(mr))
HTPrint("\tLogged %5d entries in referer log file `%s\'\n",
HTLog_accessCount(mr->ref), mr->reffile);
HTLog_close(mr->ref);
}
if (mr->reject) {
if (SHOW_REAL_QUIET(mr))
HTPrint("\tLogged %5d entries in rejected log file `%s\'\n",
HTLog_accessCount(mr->reject), mr->rejectfile);
HTLog_close(mr->reject);
}
if (mr->notfound) {
if (SHOW_REAL_QUIET(mr))
HTPrint("\tLogged %5d entries in not found log file `%s\'\n",
HTLog_accessCount(mr->notfound), mr->notfoundfile);
HTLog_close(mr->notfound);
}
if (mr->conneg) {
if (SHOW_REAL_QUIET(mr))
HTPrint("\tLogged %5d entries in content negotiation log file `%s\'\n",
HTLog_accessCount(mr->conneg), mr->connegfile);
HTLog_close(mr->conneg);
}
if (mr->noalttag) {
if (SHOW_REAL_QUIET(mr))
HTPrint("\tLogged %5d entries in missing alt tag log file `%s\'\n",
HTLog_accessCount(mr->noalttag), mr->noalttagfile);
HTLog_close(mr->noalttag);
}
if (mr->output && mr->output != STDOUT) fclose(mr->output);
if (mr->flags & MR_TIME) {
time_t local = time(NULL);
if (SHOW_REAL_QUIET(mr))
HTPrint("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
}
/* This is new */
if(mr->cdepth)
HT_FREE(mr->cdepth);
if(mr->furl) HT_FREE(mr->furl);
#ifdef HT_POSIX_REGEX
if (mr->include) {
regfree(mr->include);
HT_FREE(mr->include);
}
if (mr->exclude) {
regfree(mr->exclude);
HT_FREE(mr->exclude);
}
if (mr->exc_robot) {
regfree(mr->exc_robot);
HT_FREE(mr->exc_robot);
}
if (mr->check) {
regfree(mr->check);
HT_FREE(mr->check);
}
#endif
#ifdef HT_MYSQL
if (mr->sqllog) {
HTSQLLog_close(mr->sqllog);
mr->sqllog = NULL;
}
#endif
if (mr->queue) HTQueue_delete(mr->queue);
HT_FREE(mr->cwd);
HT_FREE(mr->prefix);
HT_FREE(mr->img_prefix);
HT_FREE(mr);
return YES;
}
return NO;
}
/*
**	Create a new finger object for `dest' and initialize it with a fresh
**	request. The finger is added to the robot's finger list, becomes the
**	context of the request, and the robot's count of outstanding requests
**	is incremented. The request is set up according to the robot flags
**	(preemptive mode, cache validation), always sends a Host header and
**	uses the given method.
*/
PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
{
    HTRequest * req = HTRequest_new();
    Finger * finger = (Finger *) HT_CALLOC(1, sizeof(Finger));
    if (finger == NULL)
        HT_OUTOFMEM("Finger_new");

    finger->robot = robot;
    finger->request = req;
    finger->dest = dest;
    HTList_addObject(robot->fingers, (void *) finger);

    /* The request carries the finger as its context */
    HTRequest_setContext (req, finger);

    /* Customize the request from the robot flags */
    if (robot->flags & MR_PREEMPTIVE) {
        HTRequest_setPreemptive(req, YES);
    }
    if (robot->flags & MR_VALIDATE) {
        HTRequest_setReloadMode(req, HT_CACHE_VALIDATE);
    }
    if (robot->flags & MR_END_VALIDATE) {
        HTRequest_setReloadMode(req, HT_CACHE_END_VALIDATE);
    }

    /* We wanna make sure that we are sending a Host header (default) */
    HTRequest_addRqHd(req, HT_C_HOST);

    /* Set the method for this request */
    HTRequest_setMethod(req, method);

    robot->cnt += 1;
    return finger;
}
/*
**	Remove the finger from its robot's finger list, decrement the count
**	of outstanding requests, delete the finger's request (if any) and
**	free the finger. Always returns YES.
*/
PRIVATE int Finger_delete (Finger * me)
{
    Robot * owner = me->robot;

    HTList_removeObject(owner->fingers, (void *) me);
    owner->cnt--;

    if (me->request) {
        /* If we are down at one request then flush the output buffer */
        if (owner->cnt == 1)
            HTRequest_forceFlush(me->request);

        /* Delete the request */
        HTRequest_delete(me->request);
    }

    /* ... and free myself */
    HT_FREE(me);
    return YES;
}
/*
**	Cleanup and make sure we close all connections including the
**	persistent ones: deletes the libwww profile and the robot, closes the
**	memory log (HT_MEMLOG) and terminates the process. This function does
**	not return.
**
**	The process exits with `status'. On VMS a status of 0 is replaced by
**	1 (presumably the VMS success status -- confirm before changing).
*/
PUBLIC void Cleanup (Robot * me, int status)
{
    int exit_code = status;

    HTProfile_delete();
    Robot_delete(me);
#ifdef HT_MEMLOG
    HTMemLog_close();
#endif

#ifdef VMS
    if (exit_code == 0)
        exit_code = 1;
#endif
    exit(exit_code);
}
#ifdef HT_POSIX_REGEX
/*
**	Return a freshly allocated string holding the regerror() message for
**	`errcode' and the given compiled expression. regerror() is called
**	once to learn the required size and once to fill the buffer. The
**	caller frees the result with HT_FREE.
*/
PRIVATE char * get_regerror (int errcode, regex_t * compiled)
{
    size_t needed = regerror (errcode, compiled, NULL, 0);
    char * message = (char *) HT_MALLOC(needed+1);

    if (message == NULL)
        HT_OUTOFMEM("get_regerror");
    (void) regerror (errcode, compiled, message, needed);
    return message;
}
/*
**	Compile `regex_str' with regcomp() using `cflags' and return the
**	newly allocated regex_t. Returns NULL when the string is NULL or
**	empty. If compilation fails, the error message is printed (unless the
**	robot is in "real quiet" mode) and Cleanup(mr, -1) is called.
*/
PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
{
    regex_t * compiled;
    int rc;

    if (!regex_str || !*regex_str)
        return NULL;

    compiled = (regex_t *) HT_CALLOC(1, sizeof(regex_t));
    if (compiled == NULL)
        HT_OUTOFMEM("get_regtype");

    rc = regcomp(compiled, regex_str, cflags);
    if (rc != 0) {
        char * reason = get_regerror(rc, compiled);
        if (SHOW_REAL_QUIET(mr))
            HTPrint("Regular expression error: %s\n", reason);
        HT_FREE(reason);
        Cleanup(mr, -1);
    }
    return compiled;
}
#endif
/*
**	Print the version banner: application version, libwww version, and
**	pointers to the command line help, the user documentation and the
**	feedback mailing list.
*/
PUBLIC void VersionInfo (void)
{
    const char * library_version = HTLib_version();

    /* Heading */
    HTPrint("\nW3C OpenSource Software");
    HTPrint("\n-----------------------\n\n");

    /* Versions */
    HTPrint("\tWebbot version %s\n", APP_VERSION);
    HTPrint("\tusing the W3C libwww library version %s.\n\n",library_version);

    /* Where to find more information */
    HTPrint("\tSee \"%s\" for help\n", COMMAND_LINE);
    HTPrint("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
    HTPrint("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");

    /* Feedback */
    HTPrint("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
    HTPrint("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
}
/*	terminate_handler
**	-----------------
**	This function is registered to handle the result of the request.
**	It logs the request (SQL log and content negotiation log when these
**	are active), updates the GET/HEAD/other document and byte counters,
**	and, unless the robot runs in breadth-first mode (MR_BFS), deletes
**	the finger. If no more requests are pending then the program is
**	terminated through Cleanup().
**
**	The `param' argument is not used. Always returns HT_OK.
*/
PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
                              void * param, int status)
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    if (SHOW_QUIET(mr)) HTPrint("Robot....... done with %s\n", HTAnchor_physical(finger->dest));

#ifdef HT_MYSQL
    if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
#endif

    /* Check if negotiated resource and whether we should log that*/
    if (mr->conneg) {
        HTAssocList * cur = HTResponse_variant(response);
        if (cur) {
            BOOL first = YES;
            HTChunk * buffer = HTChunk_new(128);
            char * uri = HTAnchor_address((HTAnchor *) finger->dest);
            HTAssoc * pres;

            /* Builds "<uri>\t(name=value, name, ...)" */
            HTChunk_puts(buffer, uri);
            while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
                char * value = HTAssoc_value(pres);
                if (first) {
                    HTChunk_puts(buffer, "\t(");
                    first = NO;
                } else
                    HTChunk_puts(buffer, ", ");

                /* Output the name */
                HTChunk_puts(buffer, HTAssoc_name(pres));

                /* Only output the value if not empty string */
                if (value && *value) {
                    HTChunk_puts(buffer, "=");
                    HTChunk_puts(buffer, value);
                }
            }
            if (!first) HTChunk_puts(buffer, ")");
            HTLog_addLine(mr->conneg, HTChunk_data(buffer));
            HTChunk_delete(buffer);
            HT_FREE(uri);
        }
    }

    /*
    ** Count the amount of body data that we have read. The length is
    ** kept in a long so that large documents are not truncated before
    ** they are added to the byte counters.
    */
    if (HTRequest_method(request) == METHOD_GET) {
        long length = HTAnchor_length(HTRequest_anchor(request));
        if (length > 0) mr->get_bytes += length;
        mr->get_docs++;
    } else if (HTRequest_method(request) == METHOD_HEAD) {
        long length = HTAnchor_length(HTRequest_anchor(request));
        if (length > 0) mr->head_bytes += length;
        mr->head_docs++;
    } else {
        mr->other_docs++;
    }

    if (!(mr->flags & MR_BFS)) {

        /* Delete this thread */
        Finger_delete(finger);

        /* Should we stop? */
        if (mr->cnt <= 0) {
            if (SHOW_QUIET(mr)) HTPrint("             Everything is finished...\n");
            Cleanup(mr, 0);			/* No way back from here */
        }
    }

    if (SHOW_QUIET(mr)) HTPrint("             %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
    return HT_OK;
}
/*
**	Request termination callback that works on the robot's queue.
**
**	Stores the error state of the request in the destination's HyperDoc
**	(if it has one). When the request was a HEAD and the document lies
**	above the robot's maximum depth, the document is appended to the
**	queue again, this time to be fetched with GET. The finger is deleted
**	and, unless the robot runs in preemptive mode, the queue is served.
**
**	`response', `param' and `status' are not used. Always returns HT_OK.
*/
PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,
                                 void * param, int status)
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    HyperDoc * doc = HTAnchor_document(finger->dest);

    if (doc) {
        set_error_state_hyperdoc(doc, request);

        if (HTRequest_method(request) == METHOD_HEAD && doc->depth < mr->depth) {
            doc->method = METHOD_GET;
            HTQueue_append(mr->queue, (void *) doc);
            mr->cq++;
        }
    }

    Finger_delete(finger);

    if ((mr->flags & MR_PREEMPTIVE) == 0)
        Serving_queue(mr);

    return HT_OK;
}
/*
**	Drain the robot's queue: every queued HyperDoc is dequeued and a new
**	request is issued for its anchor with the method stored in the
**	HyperDoc. The loop stops when the queue is empty or its head is NULL.
**
**	Afterwards, if no requests are outstanding, or the robot runs in
**	preemptive mode, the program is terminated through Cleanup().
*/
PUBLIC void Serving_queue(Robot *mr)
{
    for (;;) {
        HyperDoc * next;
        Finger * worker;
        HTRequest * req;
        char * address;

        if (HTQueue_isEmpty(mr->queue))
            break;
        next = (HyperDoc *) HTQueue_headOfQueue(mr->queue);
        if (next == NULL)
            break;

        address = HTAnchor_address((HTAnchor *) next->anchor);
        HTQueue_dequeue(mr->queue);
        mr->cq--;

        worker = Finger_new(mr, next->anchor, next->method);
        req = worker->request;

        if (SHOW_QUIET(mr)) HTPrint("Request from QUEUE  %s\n", address);
        HT_FREE(address);
        if (SHOW_QUIET(mr)) HTPrint("%d elements in queue \n", mr->cq);

        HTRequest_setParent(req, get_last_parent(next->anchor));

        /* @@@ Should be done using a timer and not sleep! @@@ */
#if 0
        if(mr->waits)
            sleep(mr->waits);
#endif

        if (HTLoadAnchor((HTAnchor *) next->anchor, req) != YES) {
            if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
            Finger_delete(worker);
        }
    }

    if (SHOW_QUIET(mr)) HTPrint("Queue size: %d \n", mr->cq);

    /* Nothing left to wait for, or everything already ran preemptively */
    if (mr->cnt <= 0 || (mr->flags & MR_PREEMPTIVE)) {
        if (mr->cnt > 0 && SHOW_QUIET(mr))
            HTPrint("%d requests were not served\n", mr->cnt);
        if (SHOW_QUIET(mr)) HTPrint("             Everything is finished...\n");
        Cleanup(mr, 0);			/* No way back from here */
    }
}
/* ------------------------------------------------------------------------- */
/* HTEXT INTERFACE */
/* ------------------------------------------------------------------------- */
/*
**	Register the robot's HText callbacks with the libwww HTML parser:
**	the create/delete pair and the found-link callback. Always returns
**	YES.
*/
PUBLIC BOOL Robot_registerHTMLParser (void)
{
    /* Object creation and deletion */
    (void) HText_registerCDCallback(RHText_new, RHText_delete);

    /* Links found while parsing */
    (void) HText_registerLinkCallback(RHText_foundLink);

    return YES;
}
/*
**	HText creation callback. Allocates an HText bound to the request and
**	decides whether links in the document may be followed: following is
**	on by default and is switched off when the anchor's robots meta
**	information contains the token "nofollow" (compared without regard
**	to case), unless the robot has MR_NOMETATAGS set. The new object is
**	added to the robot's htext list. `stream' is not used.
*/
PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor,
                            HTStream * stream)
{
    Finger * finger = (Finger *) HTRequest_context(request);
    Robot * mr = finger->robot;
    HText * text = (HText *) HT_CALLOC(1, sizeof(HText));
    if (text == NULL)
        HT_OUTOFMEM("HText_new2");

    /* Bind the HText object together with the Request Object */
    text->request = request;
    text->follow = YES;

    /* Check to see if we have any meta tags */
    if (!(mr->flags & MR_NOMETATAGS)) {
        char * directives = HTAnchor_robots(anchor);
        if (directives != NULL) {
            char * copy = NULL;
            char * cursor;
            char * field;

            /* Tokenize a private copy of the directives */
            StrAllocCopy(copy, directives);
            cursor = copy;
            while ((field = HTNextField(&cursor)) != NULL) {
                if (strcasecomp(field, "nofollow") == 0) {
                    text->follow = NO;
                    break;
                }
            }
            HT_FREE(copy);
        }
    }

    /* Add this HText object to our list */
    if (mr->htext == NULL)
        mr->htext = HTList_new();
    HTList_addObject(mr->htext, (void *) text);
    return text;
}
/*
**	HText deletion callback. Frees the object; returns YES if something
**	was freed and NO when called with NULL.
*/
PRIVATE BOOL RHText_delete (HText * me) {
    if (!me)
        return NO;
    HT_FREE(me);
    return YES;
}
/*
** Handle a hyperlink found in the document being parsed.
**
** If the destination already has a HyperDoc, only its hit count is
** increased (and, with HT_MYSQL, the referer relation is logged).
** Otherwise a HyperDoc is created for it and the destination is checked
** against the robot's constraints (prefix, regular expressions, blank
** spaces in the URI, document limit). A destination that passes is either
** put on the queue with method HEAD (MR_BFS) or requested right away; one
** that does not pass is written to the reject log.
*/
PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor)
{
if (text && anchor) {
Finger * finger = (Finger *) HTRequest_context(text->request);
Robot * mr = finger->robot;
HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
HTParentAnchor * dest_parent = HTAnchor_parent(dest);
char * uri = HTAnchor_address((HTAnchor *) dest_parent);
HyperDoc * hd = HTAnchor_document(dest_parent);
HTParentAnchor * referer = HTRequest_anchor(text->request);
/* Start from the document's follow setting (see RHText_new) */
BOOL match = text->follow;
BOOL check = NO;
/* These are new variables */
HyperDoc * nhd = NULL;
BOOL follow = YES;
/* These three variables were moved */
/*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
/* NOTE(review): last_anchor is obtained by the same call as referer */
HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
HyperDoc * last_doc = HTAnchor_document(last_anchor);
/* The new document is one level below the document that links to it */
int depth = last_doc ? last_doc->depth+1 : 0;
if (!uri) return;
/* NOTE(review): uri cannot be NULL here, so the "NULL\n" fallback is unused */
if (SHOW_QUIET(mr)) HTPrint("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");
/* Already known destination: count the hit and stop here */
if (hd) {
if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
hd->hits++;
#ifdef HT_MYSQL
if (mr->sqllog) {
char * ref_addr = HTAnchor_address((HTAnchor *) referer);
if (ref_addr) {
HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
"referer", NULL);
HT_FREE(ref_addr);
}
}
#endif
HT_FREE(uri);
return;
}
/* Check for prefix match */
if (match && mr->prefix) {
match = HTStrMatch(mr->prefix, uri) ? YES : NO;
}
#ifdef HT_POSIX_REGEX
/*
** Check for any regular expression. The include may override
** the prefix matching
*/
/* regexec() returns 0 on a match, hence the inverted tests below */
if (mr->include) {
match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
}
if (match && mr->exc_robot) {
match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
}
if (match && mr->exclude) {
match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
}
/* A match on the check expression forces the HEAD method further down */
if (match && mr->check) {
check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
}
#endif
/* URIs containing a space are not followed */
if(uri && test_for_blank_spaces(uri))
follow = NO;
else if (mr->ndoc == 0) /* Number of Documents is reached */
follow = NO;
/* Create the hyperdoc for this document (hd is always NULL at this point) */
if(!hd && dest_parent)
{
nhd = HyperDoc_new(mr, dest_parent, depth);
/* NOTE(review): assumes mr->cdepth is allocated with more than `depth' entries -- TODO confirm with the code that sets it up */
mr->cdepth[depth]++;
}
/* Follow the link only if link checking is on and all constraints hold */
if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {
if (mr->flags & MR_BFS) {
/* Breadth first: queue the document for a HEAD request */
nhd->method = METHOD_HEAD;
HTQueue_enqueue(mr->queue, (void *) nhd);
(mr->cq)++;
if(mr->ndoc > 0) mr->ndoc--;
} else {
Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
HTRequest * newreq = newfinger->request;
HTRequest_setParent(newreq, referer);
/* At or beyond the maximum depth only a HEAD is issued */
if (check || depth >= mr->depth) {
if (SHOW_QUIET(mr)) HTPrint("loading at depth %d using HEAD\n", depth);
HTRequest_setMethod(newreq, METHOD_HEAD);
} else {
if (SHOW_QUIET(mr)) HTPrint("loading at depth %d\n", depth);
}
if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
Finger_delete(newfinger);
}
}
} else {
if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
/* Log the rejected link together with the document it was found in */
#ifdef HT_MYSQL
if (mr->reject || mr->sqllog) {
#else
if (mr->reject) {
#endif
if (referer) {
char * ref_addr = HTAnchor_address((HTAnchor *) referer);
if (mr->reject && ref_addr)
HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
if (mr->sqllog && mr->sqlexternals && ref_addr)
HTSQLLog_addLinkRelationship(mr->sqllog,
ref_addr, uri,
"referer", NULL);
#endif
HT_FREE(ref_addr);
}
}
}
HT_FREE(uri);
}
}
/*
** Handle an image reference found in the document being parsed. Nothing
** is done unless the robot has MR_IMG set.
**
** If the destination already has a HyperDoc, only its hit count is
** increased (and, with HT_MYSQL, the image relation is logged). Otherwise
** the destination is checked against the image prefix and the regular
** expressions. A destination that passes gets a HyperDoc (depth 1) and is
** requested with GET when MR_SAVE is set and HEAD otherwise; one that does
** not pass is written to the reject log.
**
** `alt' is the ALT text of the image, used for the missing-ALT log and
** the SQL log. `align' and `isMap' are not used.
*/
PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor,
const char *alt, const char * align, BOOL isMap)
{
if (text && anchor) {
Finger * finger = (Finger *) HTRequest_context(text->request);
Robot * mr = finger->robot;
if (mr->flags & MR_IMG) {
HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
HTParentAnchor * dest_parent = HTAnchor_parent(dest);
char * uri = HTAnchor_address((HTAnchor *) dest_parent);
HyperDoc * hd = HTAnchor_document(dest_parent);
HTParentAnchor * referer = HTRequest_anchor(text->request);
BOOL match = YES;
if (!uri) return;
/* Already known destination: count the hit and stop here */
if (hd) {
if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
hd->hits++;
#ifdef HT_MYSQL
if (mr->sqllog) {
char * ref_addr = HTAnchor_address((HTAnchor *) referer);
if (ref_addr) {
HTSQLLog_addLinkRelationship(mr->sqllog,
ref_addr, uri,
"image", alt);
HT_FREE(ref_addr);
}
}
#endif
HT_FREE(uri);
return;
}
/* Check for prefix match */
if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
#ifdef HT_POSIX_REGEX
/*
** Check for any regular expression. The include may override
** the prefix matching
*/
/* regexec() returns 0 on a match, hence the inverted tests below */
if (mr->include) {
match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
}
if (match && mr->exc_robot) {
match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
}
if (match && mr->exclude) {
match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
}
#endif
/* The image fulfills the constraints: create a HyperDoc and request it */
if (match && dest) {
Finger * newfinger = Finger_new(mr, dest_parent,
mr->flags & MR_SAVE ?
METHOD_GET : METHOD_HEAD);
HTRequest * newreq = newfinger->request;
HyperDoc_new(mr, dest_parent, 1);
HTRequest_setParent(newreq, referer);
/* Check whether we should report missing ALT tags */
if (mr->noalttag && (alt==NULL || *alt=='\0')) {
if (referer) {
char * ref_addr = HTAnchor_address((HTAnchor *) referer);
if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
HT_FREE(ref_addr);
}
}
if (SHOW_QUIET(mr)) HTPrint("Robot....... Checking Image `%s\'\n", uri);
if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
if (SHOW_QUIET(mr)) HTPrint("Robot....... Image not tested!\n");
Finger_delete(newfinger);
}
} else {
if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
/* Log the rejected image together with the document it was found in */
#ifdef HT_MYSQL
if (mr->reject || mr->sqllog) {
#else
if (mr->reject) {
#endif
if (referer) {
char * ref_addr = HTAnchor_address((HTAnchor *) referer);
if (mr->reject && ref_addr)
HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
if (mr->sqllog && mr->sqlexternals && ref_addr)
HTSQLLog_addLinkRelationship(mr->sqllog,
ref_addr, uri,
"image", alt);
#endif
HT_FREE(ref_addr);
}
}
}
HT_FREE(uri);
}
}
}
/*
**	Found-link callback for the HTML parser. An IMG SRC or a BODY
**	BACKGROUND attribute is treated as an image (handed on without ALT
**	text or alignment); everything else is treated as an ordinary
**	anchor. Nothing happens when `text' or `anchor' is NULL. `present'
**	and `value' are not used.
*/
PRIVATE void RHText_foundLink (HText * text,
                               int element_number, int attribute_number,
                               HTChildAnchor * anchor,
                               const BOOL * present, const char ** value)
{
    Finger * finger;
    Robot * mr;
    BOOL is_image;

    if (!text || !anchor)
        return;

    finger = (Finger *) HTRequest_context(text->request);
    mr = finger->robot;
    if (SHOW_QUIET(mr))
        HTPrint("Robot....... Received element %d, attribute %d with anchor %p\n",
                element_number, attribute_number, anchor);

    is_image = (element_number==HTML_IMG && attribute_number==HTML_IMG_SRC) ||
               (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND);

    if (is_image) {
        RHText_foundImage(text, anchor, NULL, NULL, NO);
    } else {
        RHText_foundAnchor(text, anchor);
    }
}
PUBLIC char * get_robots_txt(char * uri)
{
char *str = NULL;
HTChunk * chunk;
HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
HTRequest *request = HTRequest_new();
HTRequest_setOutputFormat(request, WWW_SOURCE);
HTRequest_setPreemptive(request, YES);
HTRequest_setMethod(request, METHOD_GET);
chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
str = HTChunk_toCString(chunk);
HTRequest_delete(request);
return str;
}
Webmaster