/*                                                                  HTRobot.c
**      W3C MINI ROBOT
**
**      (c) COPYRIGHT MIT 1995.
**      Please first read the full copyright statement in the file COPYRIGH.
**
**      This program illustrates how to traverse links using the Anchor object
**
**  Authors:
**      HFN             Henrik Frystyk Nielsen, (frystyk@w3.org)
**
**  History:
**      Dec 04 95       First version
*/

#include "WWWLib.h"                           /* Global Library Include file */
#include "WWWApp.h"                                     /* Application stuff */
#include "WWWTrans.h"
#include "WWWInit.h"
#include "WWWSQL.h"

#include "HText.h"

#include "HTRobot.h"                                     /* Implemented here */

/*
**  Regular expression support (regcomp/regexec/regfree are used further
**  down when HT_POSIX_REGEX is defined): include <rxposix.h> when the build
**  defines HAVE_RXPOSIX_H, otherwise <regex.h> when HAVE_REGEX_H is defined.
*/
#ifdef HT_POSIX_REGEX
#ifdef HAVE_RXPOSIX_H
#include <rxposix.h>
#else
#ifdef HAVE_REGEX_H
#include <regex.h>
#endif
#endif
#define W3C_REGEX_FLAGS         (REG_EXTENDED | REG_NEWLINE)
#endif

#ifndef W3C_VERSION
#define W3C_VERSION             "Unspecified"
#endif

#define APP_NAME                "W3CRobot"
#define APP_VERSION             W3C_VERSION
#define COMMAND_LINE            "http://www.w3.org/Robot/User/CommandLine"

/* Default file names used when an option is given without a file argument */
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"
#define DEFAULT_LOG_FILE        "log-clf.txt"
#define DEFAULT_HIT_FILE        "log-hit.txt"
#define DEFAULT_REL_FILE        "log-rel.txt"
#define DEFAULT_LM_FILE         "log-lastmodified.txt"
#define DEFAULT_TITLE_FILE      "log-title.txt"
#define DEFAULT_REFERER_FILE    "log-referer.txt"
#define DEFAULT_REJECT_FILE     "log-reject.txt"
#define DEFAULT_NOTFOUND_FILE   "log-notfound.txt"
#define DEFAULT_CONNEG_FILE     "log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE   "log-alt.txt"
#define DEFAULT_FORMAT_FILE     "log-format.txt"
#define DEFAULT_CHARSET_FILE    "log-charset.txt"
#define DEFAULT_MEMLOG          "robot.mem"
#define DEFAULT_PREFIX          ""
#define DEFAULT_IMG_PREFIX      ""
#define DEFAULT_DEPTH           0
#define DEFAULT_DELAY           50                      /* Write delay in ms */

#define DEFAULT_SQL_SERVER      "localhost"
#define DEFAULT_SQL_DB          "webbot"
#define DEFAULT_SQL_USER        "webbot"
#define DEFAULT_SQL_PW          ""

#if 0
#define HT_MEMLOG               /* Is expensive in performance! 
*/ #endif /* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */ #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET)) #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET)) #define MILLIES 1000 #define DEFAULT_TIMEOUT 20 /* timeout in secs */ #if defined(__svr4__) #define CATCH_SIG #endif typedef enum _MRFlags { MR_IMG = 0x1, MR_LINK = 0x2, MR_PREEMPTIVE = 0x4, MR_TIME = 0x8, MR_SAVE = 0x10, MR_QUIET = 0x20, MR_REAL_QUIET = 0x40, MR_VALIDATE = 0x80, MR_END_VALIDATE = 0x100, MR_KEEP_META = 0x200, MR_LOGGING = 0x400, MR_DISTRIBUTIONS = 0x800 } MRFlags; typedef struct _Robot { int depth; /* How deep is our tree */ int cnt; /* Count of requests */ HTList * hyperdoc; /* List of our HyperDoc Objects */ HTList * htext; /* List of our HText Objects */ HTList * fingers; int timer; char * cwd; /* Current dir URL */ char * rules; char * prefix; char * img_prefix; char * logfile; /* clf log */ HTLog * log; char * reffile; /* referer log */ HTLog * ref; char * rejectfile; /* unchecked links */ HTLog * reject; char * notfoundfile; /* links that returned 404 */ HTLog * notfound; char * connegfile; /* links that were conneg'ed */ HTLog * conneg; char * noalttagfile; /* images without alt tags*/ HTLog * noalttag; char * hitfile; /* links sorted after hit counts */ char * relfile; /* link sorted after relationships */ HTLinkType relation; /* Specific relation to look for */ char * titlefile; /* links with titles */ char * mtfile; /* media types encountered */ char * charsetfile; /* charsets encountered */ char * lmfile; /* sortef after last modified dates */ char * outputfile; FILE * output; MRFlags flags; long get_bytes; /* Total number of bytes processed using GET*/ long get_docs; /* Total number of documents using GET */ long head_bytes; /* bytes processed bytes processed using HEAD */ long head_docs; /* Total number of documents using HEAD*/ long other_docs; ms_t time; /* Time of run */ #ifdef HT_POSIX_REGEX regex_t * include; regex_t * exclude; regex_t * 
check; #endif #ifdef HT_MYSQL HTSQLLog * sqllog; char * sqlserver; char * sqldb; char * sqluser; char * sqlpw; char * sqlrelative; BOOL sqlexternals; int sqlflags; #endif } Robot; typedef struct _Finger { Robot * robot; HTRequest * request; HTParentAnchor * dest; } Finger; typedef enum _LoadState { L_INVALID = -2, L_LOADING = -1, L_SUCCESS = 0, L_ERROR } LoadState; /* ** The HyperDoc object is bound to the anchor and contains information about ** where we are in the search for recursive searches */ typedef struct _HyperDoc { HTParentAnchor * anchor; LoadState state; int depth; int hits; } HyperDoc; /* ** This is the HText object that is created every time we start parsing an ** HTML object */ struct _HText { HTRequest * request; BOOL follow; }; /* ** A structure for calculating metadata distributions */ typedef struct _MetaDist { HTAtom * name; int hits; } MetaDist; /* ** Some sorting algorithms */ PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort; PUBLIC HText * HTMainText = NULL; PUBLIC HTParentAnchor * HTMainAnchor = NULL; PUBLIC HTStyleSheet * styleSheet = NULL; /* ------------------------------------------------------------------------- */ /* Standard (non-error) Output ** --------------------------- */ PUBLIC int OutputData(const char * fmt, ...) 
{ int ret; va_list pArgs; va_start(pArgs, fmt); ret = vfprintf(stdout, fmt, pArgs); va_end(pArgs); return ret; } /* ------------------------------------------------------------------------- */ /* Create a "HyperDoc" object ** -------------------------- ** A HyperDoc object contains information about whether we have already ** started checking the anchor and the depth in our search */ PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth) { HyperDoc * hd; if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL) HT_OUTOFMEM("HyperDoc_new"); hd->state = L_INVALID; hd->depth = depth; hd->hits = 1; /* Bind the HyperDoc object together with the Anchor Object */ hd->anchor = anchor; HTAnchor_setDocument(anchor, (void *) hd); /* Add this HyperDoc object to our list */ if (!mr->hyperdoc) mr->hyperdoc = HTList_new(); HTList_addObject(mr->hyperdoc, (void *) hd); return hd; } /* Delete a "HyperDoc" object ** -------------------------- */ PRIVATE BOOL HyperDoc_delete (HyperDoc * hd) { if (hd) { HT_FREE (hd); return YES; } return NO; } /* ** Sort the anchor array and log reference count */ PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array) { if (mr && array) { HTLog * log = HTLog_open(mr->hitfile, YES, YES); if (log) { void ** data = NULL; HTParentAnchor * anchor = NULL; HTArray_sort(array, HitSort); anchor = (HTParentAnchor *) HTArray_firstObject(array, data); while (anchor) { char * uri = HTAnchor_address((HTAnchor *) anchor); HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor); if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri); HT_FREE(uri); anchor = (HTParentAnchor *) HTArray_nextObject(array, data); } } HTLog_close(log); return YES; } return NO; } PRIVATE int HitSort (const void * a, const void * b) { HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a); HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b); if (aa && bb) return (bb->hits - aa->hits); return bb - aa; } /* ** Sort the anchor array and log link 
relations */ PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array) { if (mr && array) { HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL; void ** data = NULL; HTParentAnchor * anchor = NULL; anchor = (HTParentAnchor *) HTArray_firstObject(array, data); while (anchor) { /* ** If we have a specific link relation to look for then do this. ** Otherwise look for all link relations. */ if (mr->relation) { HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation); if (link) { HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link)); char * src_uri = HTAnchor_address((HTAnchor *) anchor); char * dest_uri = HTAnchor_address((HTAnchor *) dest); if (src_uri && dest_uri) { #ifdef HT_MYSQL if (mr->sqllog) { HTSQLLog_addLinkRelationship (mr->sqllog, src_uri, dest_uri, HTAtom_name(mr->relation), NULL); } #endif if (log) { HTFormat format = HTAnchor_format(dest); HTLog_addText(log, "%s %s %s --> %s\n", HTAtom_name(mr->relation), format != WWW_UNKNOWN ? HTAtom_name(format) : "", src_uri, dest_uri); } /* Cleanup */ HT_FREE(src_uri); HT_FREE(dest_uri); } } } else { HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor); HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor); char * src_uri = HTAnchor_address((HTAnchor *) anchor); HTLinkType linktype; /* First look in the main link */ if (link && (linktype = HTLink_type(link))) { HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link)); char * dest_uri = HTAnchor_address((HTAnchor *) dest); if (src_uri && dest_uri) { #ifdef HT_MYSQL if (mr->sqllog) { HTSQLLog_addLinkRelationship (mr->sqllog, src_uri, dest_uri, HTAtom_name(linktype), NULL); } #endif if (log) { HTFormat format = HTAnchor_format(dest); HTLog_addText(log, "%s %s %s --> %s\n", HTAtom_name(linktype), format != WWW_UNKNOWN ? 
HTAtom_name(format) : "", src_uri, dest_uri); } } HT_FREE(dest_uri); } /* and then in any sublinks */ if (sublinks) { HTLink * pres; while ((pres = (HTLink *) HTList_nextObject(sublinks))) { if ((linktype = HTLink_type(pres))) { HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres)); char * dest_uri = HTAnchor_address((HTAnchor *) dest); if (src_uri && dest_uri) { #ifdef HT_MYSQL if (mr->sqllog) { HTSQLLog_addLinkRelationship (mr->sqllog, src_uri, dest_uri, HTAtom_name(linktype), NULL); } #endif if (log) { HTFormat format = HTAnchor_format(dest); HTLog_addText(log, "%s %s %s --> %s\n", HTAtom_name(linktype), format != WWW_UNKNOWN ? HTAtom_name(format) : "", src_uri, dest_uri); } HT_FREE(dest_uri); } } } } /* Cleanup */ HT_FREE(src_uri); } anchor = (HTParentAnchor *) HTArray_nextObject(array, data); } if (log) HTLog_close(log); return YES; } return NO; } /* ** Sort the anchor array and log last modified date */ PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array) { if (mr && array) { HTLog * log = HTLog_open(mr->lmfile, YES, YES); if (log) { void ** data = NULL; HTParentAnchor * anchor = NULL; HTArray_sort(array, LastModifiedSort); anchor = (HTParentAnchor *) HTArray_firstObject(array, data); while (anchor) { char * uri = HTAnchor_address((HTAnchor *) anchor); time_t lm = HTAnchor_lastModified(anchor); if (uri && lm > 0) HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri); HT_FREE(uri); anchor = (HTParentAnchor *) HTArray_nextObject(array, data); } } HTLog_close(log); return YES; } return NO; } PRIVATE int LastModifiedSort (const void * a, const void * b) { time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a); time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b); return bb - aa; } /* ** Sort the anchor array and log the document title */ PRIVATE BOOL calculate_title (Robot * mr, HTArray * array) { if (mr && array) { HTLog * log = HTLog_open(mr->titlefile, YES, YES); if (log) { void ** data = NULL; HTParentAnchor * anchor = NULL; 
HTArray_sort(array, TitleSort); anchor = (HTParentAnchor *) HTArray_firstObject(array, data); while (anchor) { char * uri = HTAnchor_address((HTAnchor *) anchor); const char * title = HTAnchor_title(anchor); HTCharset charset = HTAnchor_charset(anchor); if (uri) HTLog_addText(log, "%s `%s\' %s\n", charset ? HTAtom_name(charset) : "", title ? title : "", uri); HT_FREE(uri); anchor = (HTParentAnchor *) HTArray_nextObject(array, data); } } HTLog_close(log); return YES; } return NO; } PRIVATE int TitleSort (const void * a, const void * b) { const char * aa = HTAnchor_title(*(HTParentAnchor **) a); const char * bb = HTAnchor_title(*(HTParentAnchor **) b); return strcasecomp(bb?bb:"", aa?aa:""); } /* ** Calculate distributions for media types. The same mechanism ** can be used for other characteristics with relatively ** few outcomes. */ PRIVATE HTList * mediatype_distribution (HTArray * array) { if (array) { HTList * mt = HTList_new(); MetaDist * pres = NULL; void ** data = NULL; HTParentAnchor * anchor = NULL; anchor = (HTParentAnchor *) HTArray_firstObject(array, data); while (anchor) { HTFormat format = HTAnchor_format(anchor); if (format && format != WWW_UNKNOWN) { HTList * cur = mt; /* If found then increase counter */ while ((pres = (MetaDist *) HTList_nextObject(cur))) { if (pres->name == format) { pres->hits++; break; } } /* If not found then add new format to list */ if (!pres) { if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL) HT_OUTOFMEM("mediatype_distribution"); pres->name = format; pres->hits = 1; HTList_addObject(mt, pres); HTList_insertionSort(mt, FormatSort); } } /* Find next anchor in array */ anchor = (HTParentAnchor *) HTArray_nextObject(array, data); } return mt; } return NULL; } /* ** Calculate distributions for charsets. The same mechanism ** can be used for other characteristics with relatively ** few outcomes. 
*/ PRIVATE HTList * charset_distribution (HTArray * array) { if (array) { HTList * cs = HTList_new(); MetaDist * pres = NULL; void ** data = NULL; HTParentAnchor * anchor = NULL; anchor = (HTParentAnchor *) HTArray_firstObject(array, data); while (anchor) { HTCharset charset = HTAnchor_charset(anchor); if (charset) { HTList * cur = cs; /* If found then increase counter */ while ((pres = (MetaDist *) HTList_nextObject(cur))) { if (pres->name == charset) { pres->hits++; break; } } /* If not found then add new format to list */ if (!pres) { if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL) HT_OUTOFMEM("charset_distribution"); pres->name = charset; pres->hits = 1; HTList_addObject(cs, pres); HTList_insertionSort(cs, FormatSort); } } /* Find next anchor in array */ anchor = (HTParentAnchor *) HTArray_nextObject(array, data); } return cs; } return NULL; } PRIVATE int FormatSort (const void * a, const void * b) { MetaDist * aa = (MetaDist *) a; MetaDist * bb = (MetaDist *) b; return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name)); } PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution) { if (logfile && distribution) { HTLog * log = HTLog_open(logfile, YES, YES); if (log) { HTList * cur = distribution; MetaDist * pres; while ((pres = (MetaDist *) HTList_nextObject(cur))) { if (pres->name) { HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name)); } } HTLog_close(log); } } return NO; } PRIVATE BOOL delete_meta_distribution (HTList * distribution) { if (distribution) { HTList * cur = distribution; MetaDist * pres; while ((pres = (MetaDist *) HTList_nextObject(cur))) HT_FREE(pres); HTList_delete(distribution); return YES; } return NO; } /* Statistics ** ---------- ** Calculates a bunch of statistics for the anchors traversed */ PRIVATE BOOL calculate_statistics (Robot * mr) { long total_docs = mr->get_docs + mr->head_docs + mr->other_docs; if (!mr) return NO; /* Calculate efficiency */ if (mr->time > 0) { ms_t t = 
HTGetTimeInMillis() - mr->time; if (t > 0) { double loadfactor = (mr->get_bytes / (t * 0.001)); double reqprsec = (total_docs / (t * 0.001)); double secs = t / 1000.0; char bytes[50]; if (SHOW_REAL_QUIET(mr)) HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n", total_docs, secs, reqprsec); HTNumToStr(mr->get_bytes, bytes, 50); if (SHOW_REAL_QUIET(mr)) HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n", mr->get_docs, bytes, loadfactor); HTNumToStr(mr->head_bytes, bytes, 50); if (SHOW_REAL_QUIET(mr)) HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n", mr->head_docs, bytes); } } /* Create an array of existing anchors */ if (total_docs > 1) { HTArray * array = HTAnchor_getArray(total_docs); if (array) { /* Distributions */ if (mr->flags & MR_DISTRIBUTIONS) { if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n"); } /* Sort after hit counts */ if (mr->hitfile) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged hit count distribution in file `%s\'\n", mr->hitfile); calculate_hits(mr, array); } /* Sort after link relations */ #ifdef HT_MYSQL if (mr->relfile || mr->sqllog) { #else if (mr->relfile) { #endif if (mr->relfile && SHOW_REAL_QUIET(mr)) HTTrace("\tLogged link relationship distribution in file `%s\'\n", mr->relfile); calculate_linkRelations(mr, array); } /* Sort after modified date */ if (mr->lmfile) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged last modified distribution in file `%s\'\n", mr->lmfile); calculate_lm(mr, array); } /* Sort after title */ if (mr->titlefile) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged title distribution in file `%s\'\n", mr->titlefile); calculate_title(mr, array); } /* Find mediatype distribution */ if (mr->mtfile) { HTList * mtdist = mediatype_distribution(array); if (mtdist) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged media type distribution in file `%s\'\n", mr->mtfile); log_meta_distribution(mr->mtfile, mtdist); 
delete_meta_distribution(mtdist); } } /* Find charset distribution */ if (mr->charsetfile) { HTList * charsetdist = charset_distribution(array); if (charsetdist) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged charset distribution in file `%s\'\n", mr->charsetfile); log_meta_distribution(mr->charsetfile, charsetdist); delete_meta_distribution(charsetdist); } } /* Add as may other stats here as you like */ /* ... */ /* Delete the array */ HTArray_delete(array); } } return YES; } /* Create a Command Line Object ** ---------------------------- */ PRIVATE Robot * Robot_new (void) { Robot * me; if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL) HT_OUTOFMEM("Robot_new"); me->hyperdoc = HTList_new(); me->htext = HTList_new(); me->timer = DEFAULT_TIMEOUT*MILLIES; me->cwd = HTGetCurrentDirectoryURL(); me->output = OUTPUT; me->cnt = 0; me->fingers = HTList_new(); return me; } /* Delete a Command Line Object ** ---------------------------- */ PRIVATE BOOL Robot_delete (Robot * mr) { if (mr) { HTList_delete(mr->fingers); /* Calculate statistics */ calculate_statistics(mr); if (mr->hyperdoc) { HTList * cur = mr->hyperdoc; HyperDoc * pres; while ((pres = (HyperDoc *) HTList_nextObject(cur))) HyperDoc_delete(pres); HTList_delete(mr->hyperdoc); } if (mr->htext) { HTList * cur = mr->htext; HText * pres; while ((pres = (HText *) HTList_nextObject(cur))) HText_free(pres); HTList_delete(mr->htext); } /* Close all the log files */ if (mr->flags & MR_LOGGING) { if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n"); } if (mr->log) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged %5d entries in general log file `%s\'\n", HTLog_accessCount(mr->log), mr->logfile); HTLog_close(mr->log); } if (mr->ref) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged %5d entries in referer log file `%s\'\n", HTLog_accessCount(mr->ref), mr->reffile); HTLog_close(mr->ref); } if (mr->reject) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged %5d entries in rejected log file `%s\'\n", HTLog_accessCount(mr->reject), 
mr->rejectfile); HTLog_close(mr->reject); } if (mr->notfound) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged %5d entries in not found log file `%s\'\n", HTLog_accessCount(mr->notfound), mr->notfoundfile); HTLog_close(mr->notfound); } if (mr->conneg) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n", HTLog_accessCount(mr->conneg), mr->connegfile); HTLog_close(mr->conneg); } if (mr->noalttag) { if (SHOW_REAL_QUIET(mr)) HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n", HTLog_accessCount(mr->noalttag), mr->noalttagfile); HTLog_close(mr->noalttag); } if (mr->output && mr->output != STDOUT) fclose(mr->output); if (mr->flags & MR_TIME) { time_t local = time(NULL); if (SHOW_REAL_QUIET(mr)) HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES)); } #ifdef HT_POSIX_REGEX if (mr->include) { regfree(mr->include); HT_FREE(mr->include); } if (mr->exclude) { regfree(mr->exclude); HT_FREE(mr->exclude); } if (mr->check) { regfree(mr->check); HT_FREE(mr->check); } #endif #ifdef HT_MYSQL if (mr->sqllog) { HTSQLLog_close(mr->sqllog); mr->sqllog = NULL; } #endif HT_FREE(mr->cwd); HT_FREE(mr->prefix); HT_FREE(mr->img_prefix); HT_FREE(mr); return YES; } return NO; } /* ** This function creates a new finger object and initializes it with a new request */ PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method) { Finger * me; HTRequest * request = HTRequest_new(); if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL) HT_OUTOFMEM("Finger_new"); me->robot = robot; me->request = request; me->dest = dest; HTList_addObject(robot->fingers, (void *)me); /* Set the context for this request */ HTRequest_setContext (request, me); /* Check the various flags to customize the request */ if (robot->flags & MR_PREEMPTIVE) HTRequest_setPreemptive(request, YES); if (robot->flags & MR_VALIDATE) HTRequest_setReloadMode(request, HT_CACHE_VALIDATE); if (robot->flags & MR_END_VALIDATE) 
HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE); /* We wanna make sure that we are sending a Host header (default) */ HTRequest_addRqHd(request, HT_C_HOST); /* Set the method for this request */ HTRequest_setMethod(request, method); robot->cnt++; return me; } PRIVATE int Finger_delete (Finger * me) { HTList_removeObject(me->robot->fingers, (void *)me); me->robot->cnt--; /* ** If we are down at one request then flush the output buffer */ if (me->request) { if (me->robot->cnt == 1) HTRequest_forceFlush(me->request); HTRequest_delete(me->request); } /* ** Delete the request and free myself */ HT_FREE(me); return YES; } /* ** Cleanup and make sure we close all connections including the persistent ** ones */ PRIVATE void Cleanup (Robot * me, int status) { Robot_delete(me); HTProfile_delete(); #ifdef HT_MEMLOG HTMemLog_close(); #endif #ifdef VMS exit(status ? status : 1); #else exit(status ? status : 0); #endif } #ifdef CATCH_SIG #include /* SetSignal ** This function sets up signal handlers. This might not be necessary to ** call if the application has its own handlers (lossage on SVR4) */ PRIVATE void SetSignal (void) { /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal ** when attemting to connect to a remote host where you normally should ** get `connection refused' back */ if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n"); } else { if (PROT_TRACE) HTTrace("HTSignal.... 
Ignoring SIGPIPE\n"); } #ifdef HT_MEMLOG HTMemLog_flush(); #endif } #endif /* CATCH_SIG */ #ifdef HT_POSIX_REGEX PRIVATE char * get_regerror (int errcode, regex_t * compiled) { size_t length = regerror (errcode, compiled, NULL, 0); char * str = NULL; if ((str = (char *) HT_MALLOC(length+1)) == NULL) HT_OUTOFMEM("get_regerror"); (void) regerror (errcode, compiled, str, length); return str; } PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags) { regex_t * regex = NULL; if (regex_str && *regex_str) { int status; if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL) HT_OUTOFMEM("get_regtype"); if ((status = regcomp(regex, regex_str, cflags))) { char * err_msg = get_regerror(status, regex); if (SHOW_REAL_QUIET(mr)) HTTrace("Regular expression error: %s\n", err_msg); HT_FREE(err_msg); Cleanup(mr, -1); } } return regex; } #endif PRIVATE void VersionInfo (void) { OutputData("W3C Sample Software\n\n"); OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION); OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version()); OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE); OutputData("Please send feedback to \n"); } /* terminate_handler ** ----------------- ** This function is registered to handle the result of the request. ** If no more requests are pending then terminate program */ PRIVATE int terminate_handler (HTRequest * request, HTResponse * response, void * param, int status) { Finger * finger = (Finger *) HTRequest_context(request); Robot * mr = finger->robot; if (SHOW_QUIET(mr)) HTTrace("Robot....... 
done with %s\n", HTAnchor_physical(finger->dest)); #ifdef HT_MYSQL if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status); #endif /* Check if negotiated resource and whether we should log that*/ if (mr->conneg) { HTAssocList * cur = HTResponse_variant(response); if (cur) { BOOL first = YES; HTChunk * buffer = HTChunk_new(128); char * uri = HTAnchor_address((HTAnchor *) finger->dest); HTAssoc * pres; HTChunk_puts(buffer, uri); while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) { char * value = HTAssoc_value(pres); if (first) { HTChunk_puts(buffer, "\t("); first = NO; } else HTChunk_puts(buffer, ", "); /* Output the name */ HTChunk_puts(buffer, HTAssoc_name(pres)); /* Only output the value if not empty string */ if (value && *value) { HTChunk_puts(buffer, "="); HTChunk_puts(buffer, value); } } if (!first) HTChunk_puts(buffer, ")"); HTLog_addLine(mr->conneg, HTChunk_data(buffer)); HTChunk_delete(buffer); HT_FREE(uri); } } /* Count the amount of body data that we have read */ if (HTRequest_method(request) == METHOD_GET) { int length = HTAnchor_length(HTRequest_anchor(request)); if (length > 0) mr->get_bytes += length; mr->get_docs++; } else if (HTRequest_method(request) == METHOD_HEAD) { int length = HTAnchor_length(HTRequest_anchor(request)); if (length > 0) mr->head_bytes += length; mr->head_docs++; } else { mr->other_docs++; } /* Cleanup the anchor so that we don't drown in metainformation */ if (!(mr->flags & MR_KEEP_META)) HTAnchor_clearHeader(HTRequest_anchor(request)); /* Delete this thread */ Finger_delete(finger); /* Should we stop? */ if (mr->cnt <= 0) { if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n"); Cleanup(mr, 0); /* No way back from here */ } if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? 
"" : "s"); return HT_OK; } /* ------------------------------------------------------------------------- */ /* HTEXT INTERFACE */ /* ------------------------------------------------------------------------- */ PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor, HTStream * stream) { HText * me; Finger * finger = (Finger *) HTRequest_context(request); Robot * mr = finger->robot; char * robots = NULL; if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL) HT_OUTOFMEM("HText_new2"); /* Bind the HText object together with the Request Object */ me->request = request; me->follow = YES; /* Check to see if we have any meta tags */ if ((robots = HTAnchor_robots(anchor)) != NULL) { char * strval = NULL; char * ptr = NULL; char * token = NULL; StrAllocCopy(strval, robots); ptr = strval; while ((token = HTNextField(&ptr)) != NULL) { if (!strcasecomp(token, "nofollow")) { me->follow = NO; break; } } HT_FREE(strval); } /* Add this HyperDoc object to our list */ if (!mr->htext) mr->htext = HTList_new(); HTList_addObject(mr->htext, (void *) me); return me; } PUBLIC void HText_free (HText * me) { if (me) HT_FREE (me); } PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor) { if (text && anchor) { Finger * finger = (Finger *) HTRequest_context(text->request); Robot * mr = finger->robot; HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor); HTParentAnchor * dest_parent = HTAnchor_parent(dest); char * uri = HTAnchor_address((HTAnchor *) dest_parent); HyperDoc * hd = HTAnchor_document(dest_parent); HTParentAnchor * referer = HTRequest_anchor(text->request); BOOL match = text->follow; BOOL check = NO; if (!uri) return; if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? 
uri : "NULL\n"); if (hd) { if (SHOW_QUIET(mr)) HTTrace("Already checked\n"); hd->hits++; #ifdef HT_MYSQL if (mr->sqllog) { char * ref_addr = HTAnchor_address((HTAnchor *) referer); if (ref_addr) { HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri, "referer", NULL); HT_FREE(ref_addr); } } #endif HT_FREE(uri); return; } /* Check for prefix match */ if (match && mr->prefix) { match = HTStrMatch(mr->prefix, uri) ? YES : NO; } #ifdef HT_POSIX_REGEX /* ** Check for any regular expression. The include may override ** the prefix matching */ if (mr->include) { match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES; } if (match && mr->exclude) { match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO; } if (match && mr->check) { check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES; } #endif /* Test whether we already have a hyperdoc for this document */ if (mr->flags & MR_LINK && match && dest_parent) { HTParentAnchor * last_anchor = HTRequest_parent(text->request); HyperDoc * last_doc = HTAnchor_document(last_anchor); int depth = last_doc ? 
last_doc->depth+1 : 0;
            /*
            ** NOTE(review): the code from here down to the first closing
            ** brace in column 0 is the tail of a definition that starts
            ** before this chunk. Only what is visible is described.
            **
            ** A new Finger (and with it a new request) is created for the
            ** destination, a HyperDoc is registered for it at `depth', and
            ** the referring anchor is recorded as the request's parent.
            */
            Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
            HTRequest * newreq = newfinger->request;
            HyperDoc_new(mr, dest_parent, depth);
            HTRequest_setParent(newreq, referer);

            /* Downgrade to HEAD if `check' is set or the depth limit is hit */
            if (check || depth >= mr->depth) {
                if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
                HTRequest_setMethod(newreq, METHOD_HEAD);
            } else {
                if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
            }

            /* If the load could not be started, drop the finger again */
            if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
                if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
                Finger_delete(newfinger);
            }
        } else {
            if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");

            /* Record the rejected link in the reject log and/or SQL log */
#ifdef HT_MYSQL
            if (mr->reject || mr->sqllog) {
#else
            if (mr->reject) {
#endif
                if (referer) {
                    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
                    if (mr->reject && ref_addr)
                        HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
                    /* Only logged to SQL when external links were asked for */
                    if (mr->sqllog && mr->sqlexternals && ref_addr)
                        HTSQLLog_addLinkRelationship(mr->sqllog,
                                                     ref_addr, uri,
                                                     "referer", NULL);
#endif
                    HT_FREE(ref_addr);
                }
            }
        }
        HT_FREE(uri);
    }
}

/*
**  HText_appendImage
**  -----------------
**  Image callback. Presumably invoked by the HTML parser for inlined
**  images (TODO confirm); it is also called from HText_appendObject for a
**  BODY background. Nothing happens unless the robot has MR_IMG set.
**
**  text    HText object; its request supplies the Finger/Robot context and
**          the referring anchor
**  anchor  child anchor whose main link points at the image
**  alt     ALT text, may be NULL or empty
**  align   not used here
**  isMap   not used here
**
**  If the destination already has a HyperDoc its hit count is bumped and
**  the function returns. Otherwise, if the URI matches mr->img_prefix (or
**  no image prefix is set), a new request is issued: GET when MR_SAVE is
**  set, HEAD otherwise. Images that do not match are written to the reject
**  log when one is open.
**
**  NOTE(review): `finger' is dereferenced without a NULL check; this
**  assumes every request reaching this callback carries a Finger context.
*/
PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
                               const char *alt, const char * align, BOOL isMap)
{
    if (text && anchor) {
        Finger * finger = (Finger *) HTRequest_context(text->request);
        Robot * mr = finger->robot;
        if (mr->flags & MR_IMG) {
            HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
            HTParentAnchor * dest_parent = HTAnchor_parent(dest);
            char * uri = HTAnchor_address((HTAnchor *) dest_parent);
            HyperDoc * hd = HTAnchor_document(dest_parent);
            HTParentAnchor * referer = HTRequest_anchor(text->request);
            BOOL match = YES;

            /* No address for the destination: nothing to check */
            if (!uri) return;

            /* The destination already has a HyperDoc: just count the hit */
            if (hd) {
                if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
                hd->hits++;
#ifdef HT_MYSQL
                if (mr->sqllog) {
                    char * ref_addr = HTAnchor_address((HTAnchor *) referer);
                    if (ref_addr) {
                        HTSQLLog_addLinkRelationship(mr->sqllog,
                                                     ref_addr, uri,
                                                     "image", alt);
                        HT_FREE(ref_addr);
                    }
                }
#endif
                HT_FREE(uri);
                return;
            }

            /* Check for prefix match */
            if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;

            /* Prefix matched and there is a destination: start a request */
            if (match && dest) {
                Finger * newfinger = Finger_new(mr, dest_parent,
                                                mr->flags & MR_SAVE ?
                                                METHOD_GET : METHOD_HEAD);
                HTRequest * newreq = newfinger->request;
                /* Third argument is 1 for every image (presumably its depth) */
                HyperDoc_new(mr, dest_parent, 1);
                HTRequest_setParent(newreq, referer);

                /* Check whether we should report missing ALT tags */
                if (mr->noalttag && (alt==NULL || *alt=='\0')) {
                    if (referer) {
                        char * ref_addr = HTAnchor_address((HTAnchor *) referer);
                        if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
                        HT_FREE(ref_addr);
                    }
                }

                if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
                /* If the load could not be started, drop the finger again */
                if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
                    if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
                    Finger_delete(newfinger);
                }
            } else {
                if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");

                /* Record the rejected image in the reject and/or SQL log */
#ifdef HT_MYSQL
                if (mr->reject || mr->sqllog) {
#else
                if (mr->reject) {
#endif
                    if (referer) {
                        char * ref_addr = HTAnchor_address((HTAnchor *) referer);
                        if (mr->reject && ref_addr)
                            HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
#ifdef HT_MYSQL
                        if (mr->sqllog && mr->sqlexternals && ref_addr)
                            HTSQLLog_addLinkRelationship(mr->sqllog,
                                                         ref_addr, uri,
                                                         "image", alt);
#endif
                        HT_FREE(ref_addr);
                    }
                }
            }
            HT_FREE(uri);
        }
    }
}

/*
**  HText_appendLink
**  ----------------
**  Traces the anchor it was given (unless the robot is quiet) and hands it
**  to HText_beginAnchor. `present' and `value' are not used here.
*/
PUBLIC void HText_appendLink (HText * text, HTChildAnchor * anchor,
                              const BOOL * present, const char ** value)
{
    if (text && anchor) {
        Finger * finger = (Finger *) HTRequest_context(text->request);
        Robot * mr = finger->robot;
        if (SHOW_QUIET(mr))
            HTTrace("Robot....... Received Link element with anchor %p\n", anchor);
        HText_beginAnchor(text, anchor);
    }
}

/*
**  HText_appendObject
**  ------------------
**  Handles selected HTML elements identified by `element_number':
**
**  HTML_FRAME  a child anchor is created from the SRC attribute (if
**              present) and passed to HText_beginAnchor
**  HTML_BODY   a child anchor is created from the BACKGROUND attribute (if
**              present) and passed to HText_appendImage
**
**  All other elements are ignored.
**
**  NOTE(review): `present' and `value' are indexed without a NULL check;
**  this assumes the caller always supplies both arrays.
*/
PUBLIC void HText_appendObject (HText * text, int element_number,
                                const BOOL * present, const char ** value)
{
    /* Here we can look for frames, link tags, meta tags etc. */
    if (text && text->request) {
        Finger * finger = (Finger *) HTRequest_context(text->request);
        Robot * mr = finger->robot;
        if (SHOW_QUIET(mr))
            HTTrace("Robot....... HText Object %p called with HTML element number %d\n",
                    text, element_number);
        switch (element_number) {
        case HTML_FRAME:
            {
                HTChildAnchor * source = HTAnchor_findChildAndLink(
                    HTRequest_anchor(text->request),            /* Parent */
                    NULL,                                       /* Tag */
                    present[HTML_FRAME_SRC] ? value[HTML_FRAME_SRC] : NULL, /* Address */
                    NULL);                                      /* Rels */
                HText_beginAnchor(text, source);
            }
            break;

        case HTML_BODY:
            {
                HTChildAnchor * source = HTAnchor_findChildAndLink(
                    HTRequest_anchor(text->request),            /* Parent */
                    NULL,                                       /* Tag */
                    present[HTML_BODY_BACKGROUND] ? value[HTML_BODY_BACKGROUND] : NULL, /* Address */
                    NULL);                                      /* Rels */
                /*
                ** NOTE(review): alt is passed as NULL, so a background
                ** image is reported to the missing-ALT log whenever that
                ** log is active.
                */
                HText_appendImage(text, source, NULL, NULL, NO);
            }
            break;

        default:
            break;
        }
    }
}

/*
**  The remaining HText entry points are intentionally empty: the bodies
**  below do nothing with the text content or styling they are given.
*/
PUBLIC void HText_endAnchor (HText * text) {}
PUBLIC void HText_appendText (HText * text, const char * str) {}
PUBLIC void HText_appendCharacter (HText * text, char ch) {}
PUBLIC void HText_endAppend (HText * text) {}
PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
PUBLIC void HText_beginAppend (HText * text) {}
PUBLIC void HText_appendParagraph (HText * text) {}

/*
**  Trace callback installed with HTTrace_setCallback in main(): formats
**  the message to stderr and returns the result of vfprintf.
*/
PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
{
    return (vfprintf(stderr, fmt, pArgs));
}

/* ------------------------------------------------------------------------- */
/*                                MAIN PROGRAM                               */
/* ------------------------------------------------------------------------- */

/*
**  Program entry point. In order:
**
**  1. initialises the library with a robot profile and creates the Robot
**  2. scans the command line for options, a start URL and keywords
**  3. validates the combination of options
**  4. opens the requested rule, output, cache, SQL and log facilities
**  5. issues the first request (a search if keywords were given) and
**     enters the event loop
**
**  Returns 0 when the end of the function is reached.
**
**  NOTE(review): the code after each error-path Cleanup() call keeps using
**  state that would be invalid on that path (e.g. startAnchor when no URL
**  was given), so Cleanup() presumably does not return - TODO confirm.
*/
int main (int argc, char ** argv)
{
    int         status = 0;
    int         arg;
    BOOL        cache = NO;                     /* Use persistent cache */
    BOOL        flush = NO;                     /* flush the persistent cache */
    char *      cache_root = NULL;
    HTChunk *   keywords = NULL;                /* From command line */
    int         keycnt = 0;                     /* 0 until the start URL is seen */
    Robot *     mr = NULL;
    Finger *    finger = NULL;
    HTParentAnchor * startAnchor = NULL;

    /* Starts Mac GUSI socket library */
#ifdef GUSI
    GUSISetup(GUSIwithSIOUXSockets);
    GUSISetup(GUSIwithInternetSockets);
#endif

#ifdef __MWERKS__ /* STR */
    InitGraf((Ptr) &qd.thePort);
    InitFonts();
    InitWindows();
    InitMenus();
    TEInit();
    InitDialogs(nil);
    InitCursor();
    SIOUXSettings.asktosaveonclose = false;
    argc=ccommand(&argv);
#endif /* __MWERKS__ */

#ifdef HT_MEMLOG
    HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
#endif

    /* Initiate W3C Reference Library with a robot profile */
    HTProfile_newRobot(APP_NAME, APP_VERSION);
    HTTrace_setCallback(RobotTrace);

    /* Add the default HTML parser to the set of converters */
    {
        HTList * converters = HTFormat_conversion();
        HTMLInit(converters);
    }

    /* Build a new robot object */
    mr = Robot_new();

    /* Scan command line for parameters */
    /*
    ** NOTE(review): the statement below is damaged. Text is missing
    ** between `arg' and `logfile': presumably the rest of the loop header,
    ** the test for a leading dash on argv[arg], and the option test(s)
    ** that precede the CLF log file assignment. The closing braces and the
    ** "no leading dash" branch further down still match that structure.
    ** Restore the missing text from the upstream source; as written this
    ** does not compile.
    */
    for (arg=1; arglogfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_LOG_FILE;
                mr->flags |= MR_LOGGING;

            /*
            ** Most file options below share one pattern: the next argument
            ** is consumed as the file name unless it is missing or starts
            ** with a dash, in which case the compiled-in default is used.
            */
            } else if (!strncmp(argv[arg], "-ref", 4)) {
                /* referer log file */
                mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_REFERER_FILE;
                mr->flags |= MR_LOGGING;

            } else if (!strncmp(argv[arg], "-404", 4)) {
                /* Not found error log file */
                mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_NOTFOUND_FILE;
                mr->flags |= MR_LOGGING;

            } else if (!strncmp(argv[arg], "-rej", 4)) {
                /* reject log file */
                mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_REJECT_FILE;
                mr->flags |= MR_LOGGING;

            } else if (!strncmp(argv[arg], "-alt", 4)) {
                /* no alt tags log file */
                mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_NOALTTAG_FILE;
                mr->flags |= MR_LOGGING;

            } else if (!strncmp(argv[arg], "-neg", 4)) {
                /* negotiated resource log file */
                mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_CONNEG_FILE;
                mr->flags |= MR_LOGGING;

            } else if (!strcmp(argv[arg], "-hit")) {
                /* hit file log */
                mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_HIT_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            } else if (!strcmp(argv[arg], "-rellog")) {
                /* link relations file log */
                mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_REL_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            } else if (!strcmp(argv[arg], "-relation")) {
                /*
                ** Specific link relation to look for (only used if -rellog
                ** is also given)
                */
                mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
                    (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
                mr->flags |= MR_DISTRIBUTIONS;

            } else if (!strcmp(argv[arg], "-lm")) {
                /* last modified log file */
                mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_LM_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            } else if (!strcmp(argv[arg], "-title")) {
                /* title log file */
                mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_TITLE_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            } else if (!strncmp(argv[arg], "-for", 4)) {
                /* mediatype distribution log file */
                mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_FORMAT_FILE;
                mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);

            } else if (!strncmp(argv[arg], "-char", 5)) {
                /* charset distribution log file */
                mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_CHARSET_FILE;
                mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);

            } else if (!strcmp(argv[arg], "-r")) {
                /* rule file */
                mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_RULE_FILE;

            } else if (!strcmp(argv[arg], "-o")) {
                /* output filename */
                mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_OUTPUT_FILE;

            } else if (!strcmp(argv[arg], "-prefix")) {
                /*
                ** URI prefix. A non-empty prefix that does not start with
                ** `*' is stored with a `*' appended; anything else leaves
                ** mr->prefix untouched.
                */
                char * prefix = NULL;
                prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_PREFIX;
                if (*prefix && *prefix != '*') {
                    StrAllocCopy(mr->prefix, prefix);
                    StrAllocCat(mr->prefix, "*");
                }

            } else if (!strcmp(argv[arg], "-timeout")) {
                /*
                ** timeout -- Change the default request timeout (given in
                ** seconds, stored in milliseconds).
                ** NOTE(review): values of 1 or less are ignored; confirm
                ** that rejecting exactly 1 is intended.
                */
                int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : DEFAULT_TIMEOUT;
                if (timeout > 1) mr->timer = timeout*MILLIES;

            } else if (!strcmp(argv[arg], "-nopipe")) {
                /* Force no pipelined requests */
                HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);

            } else if (!strcmp(argv[arg], "-cache")) {
                /* Start the persistent cache */
                cache = YES;

            } else if (!strcmp(argv[arg], "-cacheroot")) {
                /* Determine the cache root */
                cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : NULL;

            } else if (!strcmp(argv[arg], "-delay")) {
                /* Stream write flush delay in ms */
                int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : DEFAULT_DELAY;
                HTHost_setDefaultWriteDelay(delay);

            } else if (!strcmp(argv[arg], "-flush")) {
                /* Persistent cache flush */
                flush = YES;

            } else if (!strcmp(argv[arg], "-validate")) {
                /* Do a cache validation */
                mr->flags |= MR_VALIDATE;

            } else if (!strcmp(argv[arg], "-endvalidate")) {
                /* Do an end-to-end cache-validation */
                mr->flags |= MR_END_VALIDATE;

            } else if (!strcmp(argv[arg], "-single")) {
                /* preemptive or non-preemptive access */
                mr->flags |= MR_PREEMPTIVE;

            } else if (!strcmp(argv[arg], "-img")) {
                /* test inlined images */
                mr->flags |= MR_IMG;

            } else if (!strcmp(argv[arg], "-saveimg")) {
                /* load inlined images */
                mr->flags |= (MR_IMG | MR_SAVE);

            } else if (!strcmp(argv[arg], "-imgprefix")) {
                /* URI prefix for inlined images; same rules as -prefix */
                char * prefix = NULL;
                prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_IMG_PREFIX;
                if (*prefix && *prefix!='*') {
                    StrAllocCopy(mr->img_prefix, prefix);
                    StrAllocCat(mr->img_prefix, "*");
                }

            } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
                /* load anchors, optionally followed by the search depth */
                mr->flags |= MR_LINK;
                mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : DEFAULT_DEPTH;

            } else if (!strcmp(argv[arg], "-ss")) {
                /* Output start and end time */
                mr->flags |= MR_TIME;

            } else if (!strcmp(argv[arg], "-version")) {
                /* print version and exit */
                VersionInfo();
                Cleanup(mr, 0);

            } else if (!strcmp(argv[arg], "-q")) {
                /* run in quiet mode */
                mr->flags |= MR_QUIET;

            } else if (!strcmp(argv[arg], "-Q")) {
                /* run in really quiet mode */
                mr->flags |= MR_REAL_QUIET;

#ifdef WWWTRACE
            } else if (!strncmp(argv[arg], "-v", 2)) {
                /*
                ** trace flags: whatever follows "-v" in the same argument
                ** is passed on as the trace mask. Tested after -version
                ** and -validate, which would otherwise match here too.
                */
                HTSetTraceMessageMask(argv[arg]+2);
#endif

#ifdef HT_POSIX_REGEX
            /*
            ** If we can link against a POSIX regex library. Each of these
            ** options is silently ignored when no pattern follows it.
            */
            } else if (!strncmp(argv[arg], "-inc", 4)) {
                if (arg+1 < argc && *argv[arg+1] != '-') {
                    mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
                }
            } else if (!strncmp(argv[arg], "-exc", 4)) {
                if (arg+1 < argc && *argv[arg+1] != '-') {
                    mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
                }
            } else if (!strncmp(argv[arg], "-check", 6)) {
                if (arg+1 < argc && *argv[arg+1] != '-') {
                    mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
                }
#endif

#ifdef HT_MYSQL
            /*
            ** If we can link against a MYSQL database library. These
            ** options are matched on a prefix only (see the length passed
            ** to strncmp), so abbreviated spellings are accepted.
            */
            } else if (!strncmp(argv[arg], "-sqldb", 5)) {
                mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_DB;

            } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;

            } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;

            } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;

            } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;

            } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
                mr->sqlexternals = YES;

            } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
                mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_PW;

            } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
                mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : NULL;

            } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
                mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_SERVER;

            } else if (!strncmp(argv[arg], "-sqluser", 5)) {
                mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_USER;
#endif

            } else {
                /* Unknown option: report it and carry on */
                if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
            }
        } else {        /* If no leading `-' then check for URL or keywords */
            if (!keycnt) {
                /* First such argument: the start URL, resolved against cwd */
                char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
                startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
                HyperDoc_new(mr, startAnchor, 0);
                keycnt = 1;
                HT_FREE(ref);
            } else {            /* Check for successive keyword arguments */
                char *escaped = HTEscape(argv[arg], URL_XALPHAS);
                /*
                ** keycnt is 1 for the first keyword, which creates the
                ** chunk; later keywords are separated by a space.
                */
                if (keycnt++ <= 1)
                    keywords = HTChunk_new(128);
                else
                    HTChunk_putc(keywords, ' ');
                HTChunk_puts(keywords, HTStrip(escaped));
                HT_FREE(escaped);
            }
        }
    }

#ifdef CATCH_SIG
    SetSignal();
#endif

    /* A start URL is mandatory */
    if (!keycnt) {
        if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
        Cleanup(mr, -1);
    }

    /*
    ** A non-default depth is only accepted together with a URI prefix.
    ** NOTE(review): the HTTrace call passes mr->depth, but the format
    ** string has no conversion for it.
    */
    if (mr->depth != DEFAULT_DEPTH &&
        (mr->prefix == NULL || *mr->prefix == '*')) {
        if (SHOW_REAL_QUIET(mr))
            HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
                    mr->depth);
        Cleanup(mr, -1);
    }

    /* Print the start-up banner when timing output (-ss) was requested */
    if (mr->flags & MR_TIME) {
        if (SHOW_REAL_QUIET(mr)) {
            time_t local = time(NULL);
            HTTrace("Welcome to the W3C mini Robot version %s - started on %s\n",
                    APP_VERSION, HTDateTimeStr(&local, YES));
        }
    }

    /* Rule file specified? A failure to load it is reported, not fatal */
    if (mr->rules) {
        char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
        if (!HTLoadRulesAutomatically(rules))
            if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
        HT_FREE(rules);
    }

    /* Output file specified? Falls back to OUTPUT if it cannot be opened */
    if (mr->outputfile) {
        if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
            if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
            mr->output = OUTPUT;
        }
    }

    /* Should we use persistent cache? */
    if (cache) {
        HTCacheInit(cache_root, 20);
        HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
        HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
                       HT_NOT_MODIFIED, HT_FILTER_MIDDLE);

        /* Should we start by flushing? */
        if (flush) HTCache_flushAll();
    }

    /*
    ** SQL Log specified? Only opened when a server was named; user,
    ** password and database fall back to the compiled-in defaults.
    */
#ifdef HT_MYSQL
    if (mr->sqlserver) {
        if ((mr->sqllog =
             HTSQLLog_open(mr->sqlserver,
                           mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
                           mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW,
                           mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
                           mr->sqlflags)) != NULL) {
            if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
        }
    }
#endif

    /* CLF Log file specified? */
    if (mr->logfile) {
        mr->log = HTLog_open(mr->logfile, YES, YES);
        if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
    }

    /* Referer Log file specified? */
    if (mr->reffile) {
        mr->ref = HTLog_open(mr->reffile, YES, YES);
        if (mr->ref)
            HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
    }

    /*
    ** Not found error log specified? This log is registered with
    ** HTRefererFilter as well, restricted to status -404.
    */
    if (mr->notfoundfile) {
        mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
        if (mr->notfound)
            HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
    }

    /* Negotiated resource log specified? */
    if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);

    /* No alt tags log file specified? */
    if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);

    /* Reject Log file specified? */
    if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);

    /* Register our own terminate filter */
    HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);

    /* Setting event timeout */
    HTHost_setEventTimeout(mr->timer);

    /* Remember when the run started */
    mr->time = HTGetTimeInMillis();

    /* Start the request */
    finger = Finger_new(mr, startAnchor, METHOD_GET);

    /*
    ** Make sure that the first request is flushed immediately and not
    ** buffered in the output buffer
    */
    HTRequest_setFlush(finger->request, YES);

    /*
    ** Check whether we should do some kind of cache validation on
    ** the load. If both flags are set, the end-to-end mode is applied
    ** last.
    */
    if (mr->flags & MR_VALIDATE)
        HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
    if (mr->flags & MR_END_VALIDATE)
        HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);

    /*
    ** Now do the load
    */
    if (mr->flags & MR_PREEMPTIVE)
        HTRequest_setPreemptive(finger->request, YES);

    if (keywords)                                                  /* Search */
        status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
    else
        status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);

    /* The keyword chunk is deleted once the request has been issued */
    if (keywords) HTChunk_delete(keywords);
    if (status != YES) {
        if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
        Cleanup(mr, -1);
    }

    /* Go into the event loop... */
    HTEventList_loop(finger->request);

    /* Only gets here if event loop fails */
    Cleanup(mr, 0);
    return 0;
}