Annotation of libwww/Robot/src/HTRobot.c, revision 1.60
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
/* Regular expression support is only compiled in when HT_POSIX_REGEX is set */
#ifdef HT_POSIX_REGEX
#include "rxposix.h"
/* Compile flags for regcomp(); presumably handed to get_regtype() - confirm */
#define W3C_REGEX_FLAGS		(REG_EXTENDED | REG_NEWLINE)
#endif

/* Fall back to a placeholder if the build did not supply a version string */
#ifndef W3C_VERSION
#define W3C_VERSION		"Unspecified"
#endif

/* Name and version handed to HTProfile_newRobot() and shown by VersionInfo() */
#define APP_NAME		"W3CRobot"
#define APP_VERSION		W3C_VERSION

/* Default file names used when an option is given without an explicit name */
#define DEFAULT_OUTPUT_FILE	"robot.out"
#define DEFAULT_RULE_FILE	"robot.conf"
#define DEFAULT_LOG_FILE	"log-clf.txt"
#define DEFAULT_HIT_FILE	"log-hit.txt"
#define DEFAULT_REFERER_FILE	"log-referer.txt"
#define DEFAULT_REJECT_FILE	"log-reject.txt"
#define DEFAULT_NOTFOUND_FILE	"log-notfound.txt"
#define DEFAULT_CONNEG_FILE	"log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE	"log-alt.txt"
#define DEFAULT_FORMAT_FILE	"log-format.txt"
#define DEFAULT_CHARSET_FILE	"log-charset.txt"
#define DEFAULT_MEMLOG		"robot.mem"
#define DEFAULT_PREFIX		""
#define DEFAULT_IMG_PREFIX	""
#define DEFAULT_DEPTH		0
#define DEFAULT_DELAY		50			/* Write delay in ms */

/* Memory logging is disabled by default; change to "#if 1" to enable it */
#if 0
#define HT_MEMLOG		/* May be expensive in performance! */
#endif

/*
** SHOW_MSG expands to an expression that reads a variable named `mr', so
** it can only be used where a `Robot * mr' is in scope.
*/
/* #define SHOW_MSG		(WWWTRACE || HTAlert_interactive()) */
#define SHOW_MSG		(!(mr->flags & MR_QUIET))

#define DEFAULT_TIMEOUT		10000		       /* timeout in millis */

/* Only SVR4 builds install the SIGPIPE handling in SetSignal() */
#if defined(__svr4__)
#define CATCH_SIG
#endif
66:
/*
** Option bits kept in Robot.flags. Every constant owns exactly one bit so
** that any combination can be OR'ed together and tested with `&'.
*/
typedef enum _MRFlags {
    MR_IMG		= 1 << 0,	/* Check images (HText_appendImage) */
    MR_LINK		= 1 << 1,	/* Follow links (HText_beginAnchor) */
    MR_PREEMPTIVE	= 1 << 2,	/* Requests are made preemptive */
    MR_TIME		= 1 << 3,	/* Trace the termination time */
    MR_SAVE		= 1 << 4,	/* Use GET rather than HEAD on images */
    MR_QUIET		= 1 << 5,	/* Turns off SHOW_MSG tracing */
    MR_VALIDATE		= 1 << 6,	/* Reload mode HT_CACHE_VALIDATE */
    MR_END_VALIDATE	= 1 << 7,	/* Reload mode HT_CACHE_END_VALIDATE */
    MR_KEEP_META	= 1 << 8	/* Don't clear anchor headers when done */
} MRFlags;
78:
/*
** The Robot object carries the configuration and the running state of one
** robot run: traversal limits, bookkeeping lists, log destinations,
** option flags and the counters used for the final statistics.
*/
typedef struct _Robot {
    int			depth;			     /* How deep is our tree */
    int			cnt;		/* Count of outstanding requests */
    HTList *		hyperdoc;	     /* List of our HyperDoc Objects */
    HTList *		htext;			/* List of our HText Objects */
    HTList *		fingers;	       /* List of our Finger Objects */

    int 		timer;	       /* Initialised to DEFAULT_TIMEOUT (ms) */
    char *		cwd;				  /* Current dir URL */
    char *		rules;	  /* NOTE(review): presumably rule file - confirm */
    char *		prefix;		  /* URI prefix that links must match */
    char *		img_prefix;	 /* URI prefix that images must match */

    char *		logfile;			       /* clf log */
    HTLog *             log;
    char *		reffile;			   /* referer log */
    HTLog *             ref;
    char *		rejectfile;			/* unchecked links */
    HTLog *	        reject;
    char *		notfoundfile;		/* links that returned 404 */
    HTLog *	        notfound;
    char *		connegfile;	       /* links that were conneg'ed */
    HTLog *	        conneg;
    char *		noalttagfile;		/* images without alt tags */
    HTLog *	        noalttag;

    char *		hitfile;	 /* links sorted after hit counts */
    char *		mtfile;			/* media types encountered */
    char *		charsetfile;		   /* charsets encountered */

    char *		outputfile;
    FILE *	        output;

    MRFlags		flags;		       /* Bitwise OR of MRFlags bits */

    long		get_bytes;	/* Total number of bytes processed using GET */
    long                get_docs;	/* Total number of documents using GET */

    long		head_bytes;	/* Total number of bytes processed using HEAD */
    long                head_docs;	/* Total number of documents using HEAD */

    long		other_docs;	/* Documents using any other method */

    ms_t		time;		/* Time of run */

#ifdef HT_POSIX_REGEX
    /* Compiled URI filters applied in HText_beginAnchor() */
    regex_t *		include;	  /* URI must match to be followed */
    regex_t *		exclude;     /* URI must NOT match to be followed */
    regex_t *		check;		/* Matching URIs are HEAD-checked */
#endif

} Robot;
1.34 eric 131:
/*
** A Finger ties one outstanding request to the robot that issued it and to
** the anchor being requested. It is installed as the request context in
** Finger_new().
*/
typedef struct _Finger {
    Robot * robot;			/* Owning robot */
    HTRequest * request;		/* Request created for this finger */
    HTParentAnchor * dest;		/* Anchor that is being requested */
} Finger;
137:
/*
** Load state recorded in a HyperDoc. New HyperDoc objects start out as
** L_INVALID (see HyperDoc_new).
*/
typedef enum _LoadState {
    L_INVALID	= -2,
    L_LOADING	= -1,
    L_SUCCESS 	= 0,
    L_ERROR				/* Implicitly 1 */
} LoadState;
144:
/*
**	The HyperDoc object is bound to the anchor and contains information about
**	where we are in the search for recursive searches
*/
typedef struct _HyperDoc {
    HTParentAnchor * 	anchor;		/* Anchor this object is bound to */
    LoadState		state;		/* Starts as L_INVALID */
    int			depth;		/* Depth at which the link was found */
    int                 hits;		/* Starts at 1, +1 for each further link */
} HyperDoc;
155:
/*
** This is the HText object that is created every time we start parsing a 
** HTML object. It only remembers the request being parsed so that the
** HText callbacks can get back to the Finger (and thereby the Robot).
*/
struct _HText {
    HTRequest *		request;	/* Request that produced this text */
};
1.1 frystyk 163:
/*
**	A structure for calculating metadata distributions: one entry per
**	distinct value (media type or charset atom) with its occurrence count
*/
typedef struct _MetaDist {
    HTAtom *		name;		/* The value being counted */
    int			hits;		/* Number of anchors with this value */
} MetaDist;
171:
/*
**	Some sorting algorithms. Forward declarations of the comparison
**	functions that are defined further down in this file.
*/
PRIVATE HTComparer HitSort, FormatSort;

/*
**	Global variables. They are not referenced in the part of the file
**	reviewed here. NOTE(review): presumably expected by the HText/HTML
**	interface of the library - confirm before removing.
*/
PUBLIC HText *			HTMainText = NULL;
PUBLIC HTParentAnchor *		HTMainAnchor = NULL;
PUBLIC HTStyleSheet *		styleSheet = NULL;
180:
181: /* ------------------------------------------------------------------------- */
182:
1.13 eric 183: /* Standard (non-error) Output
184: ** ---------------------------
185: */
186: PUBLIC int OutputData(const char * fmt, ...)
187: {
188: int ret;
189: va_list pArgs;
190: va_start(pArgs, fmt);
191: ret = vfprintf(stdout, fmt, pArgs);
192: va_end(pArgs);
193: return ret;
194: }
195:
196: /* ------------------------------------------------------------------------- */
197:
1.2 frystyk 198: /* Create a "HyperDoc" object
199: ** --------------------------
200: ** A HyperDoc object contains information about whether we have already
201: ** started checking the anchor and the depth in our search
202: */
203: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
204: {
205: HyperDoc * hd;
1.14 frystyk 206: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
207: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 208: hd->state = L_INVALID;
209: hd->depth = depth;
1.55 frystyk 210: hd->hits = 1;
1.2 frystyk 211:
212: /* Bind the HyperDoc object together with the Anchor Object */
213: hd->anchor = anchor;
214: HTAnchor_setDocument(anchor, (void *) hd);
215:
216: /* Add this HyperDoc object to our list */
217: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
218: HTList_addObject(mr->hyperdoc, (void *) hd);
219: return hd;
220: }
221:
222: /* Delete a "HyperDoc" object
223: ** --------------------------
224: */
225: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
226: {
227: if (hd) {
1.11 frystyk 228: HT_FREE (hd);
1.2 frystyk 229: return YES;
230: }
231: return NO;
232: }
233:
1.55 frystyk 234: /*
235: ** Sort the anchor array and log reference count
236: */
237: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
238: {
239: if (mr && array) {
240: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
241: if (log) {
242: void ** data = NULL;
243: HTParentAnchor * anchor = NULL;
244: HTArray_sort(array, HitSort);
245: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
246: while (anchor) {
247: char * str = NULL;
248: char * uri = HTAnchor_address((HTAnchor *) anchor);
249: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
250: if (uri && hd) {
251: if ((str = (char *) HT_MALLOC(strlen(uri) + 50)) == NULL)
252: HT_OUTOFMEM("calculate_hits");
1.58 frystyk 253: sprintf(str, "%8d %s", hd->hits, uri);
1.55 frystyk 254: HTLog_addLine(log, str);
255: HT_FREE(str);
256: }
257: HT_FREE(uri);
258: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
259: }
260: }
261: HTLog_close(log);
262: return YES;
263: }
264: return NO;
265: }
266:
267: PRIVATE int HitSort (const void * a, const void * b)
268: {
269: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
270: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
271: if (aa && bb) return (bb->hits - aa->hits);
272: return bb - aa;
273: }
274:
1.58 frystyk 275: /*
276: ** Calculate distributions for media types. The same mechanism
277: ** can be used for other characteristics with relatively
278: ** few outcomes.
279: */
280: PRIVATE HTList * mediatype_distribution (HTArray * array)
281: {
282: if (array) {
283: HTList * mt = HTList_new();
284: MetaDist * pres = NULL;
285: void ** data = NULL;
286: HTParentAnchor * anchor = NULL;
287: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
288: while (anchor) {
289: HTFormat format = HTAnchor_format(anchor);
290: if (format && format != WWW_UNKNOWN) {
291: HTList * cur = mt;
292:
293: /* If found then increase counter */
294: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
295: if (pres->name == format) {
296: pres->hits++;
297: break;
298: }
299: }
300:
301: /* If not found then add new format to list */
302: if (!pres) {
303: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
304: HT_OUTOFMEM("mediatype_distribution");
305: pres->name = format;
306: pres->hits = 1;
307: HTList_addObject(mt, pres);
308: HTList_insertionSort(mt, FormatSort);
309: }
310: }
311:
312: /* Find next anchor in array */
313: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
314: }
315: return mt;
316: }
317: return NULL;
318: }
319:
1.60 ! frystyk 320: /*
! 321: ** Calculate distributions for charsets. The same mechanism
! 322: ** can be used for other characteristics with relatively
! 323: ** few outcomes.
! 324: */
! 325: PRIVATE HTList * charset_distribution (HTArray * array)
! 326: {
! 327: if (array) {
! 328: HTList * cs = HTList_new();
! 329: MetaDist * pres = NULL;
! 330: void ** data = NULL;
! 331: HTParentAnchor * anchor = NULL;
! 332: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
! 333: while (anchor) {
! 334: HTCharset charset = HTAnchor_charset(anchor);
! 335: if (charset) {
! 336: HTList * cur = cs;
! 337:
! 338: /* If found then increase counter */
! 339: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
! 340: if (pres->name == charset) {
! 341: pres->hits++;
! 342: break;
! 343: }
! 344: }
! 345:
! 346: /* If not found then add new format to list */
! 347: if (!pres) {
! 348: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
! 349: HT_OUTOFMEM("charset_distribution");
! 350: pres->name = charset;
! 351: pres->hits = 1;
! 352: HTList_addObject(cs, pres);
! 353: HTList_insertionSort(cs, FormatSort);
! 354: }
! 355: }
! 356:
! 357: /* Find next anchor in array */
! 358: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
! 359: }
! 360: return cs;
! 361: }
! 362: return NULL;
! 363: }
! 364:
1.58 frystyk 365: PRIVATE int FormatSort (const void * a, const void * b)
366: {
367: MetaDist * aa = (MetaDist *) a;
368: MetaDist * bb = (MetaDist *) b;
369: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
370: }
371:
372: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
373: {
374: if (logfile && distribution) {
375: HTLog * log = HTLog_open(logfile, YES, YES);
376: if (log) {
377: HTList * cur = distribution;
378: MetaDist * pres;
379: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
380: if (pres->name) {
1.60 ! frystyk 381: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 382: }
383: }
384: HTLog_close(log);
385: }
386: }
387: return NO;
388: }
389:
390: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
391: {
392: if (distribution) {
393: HTList * cur = distribution;
394: MetaDist * pres;
395: while ((pres = (MetaDist *) HTList_nextObject(cur)))
396: HT_FREE(pres);
397: HTList_delete(distribution);
398: return YES;
399: }
400: return NO;
401: }
402:
403:
1.55 frystyk 404: /* Statistics
405: ** ----------
406: ** Calculates a bunch of statistics for the anchors traversed
407: */
408: PRIVATE BOOL calculate_statistics (Robot * mr)
409: {
1.59 frystyk 410: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 411: if (!mr) return NO;
412:
413: /* Calculate efficiency */
1.59 frystyk 414: if (mr->time > 0) {
1.56 frystyk 415: ms_t t = HTGetTimeInMillis() - mr->time;
416: if (t > 0) {
1.60 ! frystyk 417: double loadfactor = (mr->get_bytes / (t * 0.001));
! 418: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 419: double secs = t / 1000.0;
1.55 frystyk 420: char bytes[50];
1.60 ! frystyk 421: HTTrace("Accessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
! 422: total_docs, secs, reqprsec);
1.59 frystyk 423:
424: HTNumToStr(mr->get_bytes, bytes, 50);
425: HTTrace("Did a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
426: mr->get_docs, bytes, loadfactor);
427:
428: HTNumToStr(mr->head_bytes, bytes, 50);
429: HTTrace("Did a HEAD on %ld document(s) with a total of %s bytes\n",
430: mr->head_docs, bytes);
1.55 frystyk 431: }
432: }
433:
434: /* Create an array of existing anchors */
1.59 frystyk 435: if (total_docs > 1) {
436: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 437: if (array) {
438:
439: /* Sort after hit counts */
440: if (mr->hitfile) calculate_hits(mr, array);
441:
1.58 frystyk 442: /* Find mediatype distribution */
443: if (mr->mtfile) {
444: HTList * mtdist = mediatype_distribution(array);
445: if (mtdist) {
446: log_meta_distribution(mr->mtfile, mtdist);
447: delete_meta_distribution(mtdist);
448: }
449: }
1.55 frystyk 450:
1.60 ! frystyk 451: /* Find charset distribution */
! 452: if (mr->charsetfile) {
! 453: HTList * charsetdist = charset_distribution(array);
! 454: if (charsetdist) {
! 455: log_meta_distribution(mr->charsetfile, charsetdist);
! 456: delete_meta_distribution(charsetdist);
! 457: }
! 458: }
! 459:
1.55 frystyk 460: /* Add as may other stats here as you like */
1.60 ! frystyk 461: /* ... */
1.58 frystyk 462:
463: /* Delete the array */
1.55 frystyk 464: HTArray_delete(array);
465: }
466: }
467: return YES;
468: }
469:
1.1 frystyk 470: /* Create a Command Line Object
471: ** ----------------------------
472: */
473: PRIVATE Robot * Robot_new (void)
474: {
475: Robot * me;
1.41 frystyk 476: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 477: HT_OUTOFMEM("Robot_new");
1.2 frystyk 478: me->hyperdoc = HTList_new();
1.4 frystyk 479: me->htext = HTList_new();
1.40 frystyk 480: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 481: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 482: me->output = OUTPUT;
1.35 eric 483: me->cnt = 0;
1.34 eric 484: me->fingers = HTList_new();
1.1 frystyk 485: return me;
486: }
487:
488: /* Delete a Command Line Object
489: ** ----------------------------
490: */
491: PRIVATE BOOL Robot_delete (Robot * me)
492: {
493: if (me) {
1.34 eric 494: HTList_delete(me->fingers);
1.55 frystyk 495:
496: /* Calculate statistics */
497: calculate_statistics(me);
498:
499: if (me->hyperdoc) {
1.2 frystyk 500: HTList * cur = me->hyperdoc;
501: HyperDoc * pres;
502: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
503: HyperDoc_delete(pres);
504: HTList_delete(me->hyperdoc);
505: }
1.4 frystyk 506: if (me->htext) {
507: HTList * cur = me->htext;
508: HText * pres;
509: while ((pres = (HText *) HTList_nextObject(cur)))
510: HText_free(pres);
511: HTList_delete(me->htext);
512: }
1.55 frystyk 513: if (me->log) HTLog_close(me->log);
1.57 frystyk 514: if (me->ref) HTLog_close(me->ref);
1.58 frystyk 515: if (me->reject) HTLog_close(me->reject);
516: if (me->notfound) HTLog_close(me->notfound);
517: if (me->conneg) HTLog_close(me->conneg);
1.60 ! frystyk 518: if (me->noalttag) HTLog_close(me->noalttag);
1.1 frystyk 519: if (me->output && me->output != STDOUT) fclose(me->output);
1.12 frystyk 520: if (me->flags & MR_TIME) {
521: time_t local = time(NULL);
1.13 eric 522: HTTrace("Robot terminated %s\n",HTDateTimeStr(&local,YES));
1.12 frystyk 523: }
1.55 frystyk 524:
1.58 frystyk 525: #ifdef HT_POSIX_REGEX
526: if (me->include) {
527: regfree(me->include);
528: HT_FREE(me->include);
529: }
530: if (me->exclude) {
531: regfree(me->exclude);
532: HT_FREE(me->exclude);
533: }
534: if (me->check) {
535: regfree(me->check);
536: HT_FREE(me->check);
537: }
538: #endif
539:
1.11 frystyk 540: HT_FREE(me->cwd);
1.55 frystyk 541: HT_FREE(me->prefix);
1.59 frystyk 542: HT_FREE(me->img_prefix);
1.11 frystyk 543: HT_FREE(me);
1.1 frystyk 544: return YES;
545: }
546: return NO;
547: }
548:
1.2 frystyk 549: /*
1.34 eric 550: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 551: */
1.34 eric 552: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 553: {
1.34 eric 554: Finger * me;
555: HTRequest * request = HTRequest_new();
556: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
557: HT_OUTOFMEM("Finger_new");
558: me->robot = robot;
559: me->request = request;
560: me->dest = dest;
561: HTList_addObject(robot->fingers, (void *)me);
562:
1.48 frystyk 563: /* Set the context for this request */
1.34 eric 564: HTRequest_setContext (request, me);
1.48 frystyk 565:
566: /* Check the various flags to customize the request */
567: if (robot->flags & MR_PREEMPTIVE)
568: HTRequest_setPreemptive(request, YES);
569: if (robot->flags & MR_VALIDATE)
570: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
571: if (robot->flags & MR_END_VALIDATE)
572: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
573:
574: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 575: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 576:
577: /* Set the method for this request */
1.34 eric 578: HTRequest_setMethod(request, method);
579: robot->cnt++;
580: return me;
1.2 frystyk 581: }
582:
1.34 eric 583: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 584: {
1.34 eric 585: HTList_removeObject(me->robot->fingers, (void *)me);
586: me->robot->cnt--;
1.37 frystyk 587:
588: /*
589: ** If we are down at one request then flush the output buffer
590: */
591: if (me->request) {
592: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 593: HTRequest_delete(me->request);
1.37 frystyk 594: }
595:
596: /*
597: ** Delete the request and free myself
598: */
1.34 eric 599: HT_FREE(me);
600: return YES;
1.2 frystyk 601: }
602:
603: /*
604: ** Cleanup and make sure we close all connections including the persistent
605: ** ones
606: */
1.1 frystyk 607: PRIVATE void Cleanup (Robot * me, int status)
608: {
609: Robot_delete(me);
1.29 eric 610: HTProfile_delete();
1.50 frystyk 611: #ifdef HT_MEMLOG
1.39 eric 612: HTMemLog_close();
1.47 frystyk 613: #endif
614:
1.1 frystyk 615: #ifdef VMS
616: exit(status ? status : 1);
617: #else
618: exit(status ? status : 0);
619: #endif
620: }
621:
622: #ifdef CATCH_SIG
623: #include <signal.h>
624: /* SetSignal
625: ** This function sets up signal handlers. This might not be necessary to
626: ** call if the application has its own handlers (lossage on SVR4)
627: */
628: PRIVATE void SetSignal (void)
629: {
630: /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
631: ** when attemting to connect to a remote host where you normally should
632: ** get `connection refused' back
633: */
634: if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
1.13 eric 635: if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
1.1 frystyk 636: } else {
1.13 eric 637: if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
1.1 frystyk 638: }
1.47 frystyk 639:
1.50 frystyk 640: #ifdef HT_MEMLOG
1.44 eric 641: HTMemLog_flush();
1.47 frystyk 642: #endif
643:
1.1 frystyk 644: }
645: #endif /* CATCH_SIG */
646:
1.58 frystyk 647: #ifdef HT_POSIX_REGEX
648: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
649: {
650: size_t length = regerror (errcode, compiled, NULL, 0);
651: char * str = NULL;
652: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
653: HT_OUTOFMEM("get_regerror");
654: (void) regerror (errcode, compiled, str, length);
655: return str;
656: }
657:
1.60 ! frystyk 658: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 659: {
660: regex_t * regex = NULL;
661: if (regex_str && *regex_str) {
662: int status;
663: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
664: HT_OUTOFMEM("get_regtype");
1.60 ! frystyk 665: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 666: char * err_msg = get_regerror(status, regex);
667: HTTrace("Regular expression error: %s\n", err_msg);
668: HT_FREE(err_msg);
669: Cleanup(mr, -1);
670: }
671: }
672: return regex;
673: }
674: #endif
675:
1.1 frystyk 676: PRIVATE void VersionInfo (void)
677: {
1.13 eric 678: OutputData("\n\nW3C Reference Software\n\n");
679: OutputData("\tW3C Mini Robot (%s) version %s.\n",
1.1 frystyk 680: APP_NAME, APP_VERSION);
1.13 eric 681: OutputData("\tW3C Reference Library version %s.\n\n",HTLib_version());
682: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 683: }
684:
685: /* terminate_handler
686: ** -----------------
1.2 frystyk 687: ** This function is registered to handle the result of the request.
688: ** If no more requests are pending then terminate program
1.1 frystyk 689: */
1.32 frystyk 690: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
691: void * param, int status)
1.1 frystyk 692: {
1.34 eric 693: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 694: Robot * mr = finger->robot;
1.34 eric 695: if (SHOW_MSG) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 696:
1.58 frystyk 697: /* Check if negotiated resource and whether we should log that*/
698: if (mr->conneg) {
699: HTAssocList * cur = HTResponse_variant(response);
700: if (cur) {
701: BOOL first = YES;
702: HTChunk * buffer = HTChunk_new(128);
703: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
704: HTAssoc * pres;
1.60 ! frystyk 705: HTChunk_puts(buffer, uri);
1.58 frystyk 706: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
707: char * value = HTAssoc_value(pres);
708: if (first) {
1.60 ! frystyk 709: HTChunk_puts(buffer, "\t(");
1.58 frystyk 710: first = NO;
711: } else
712: HTChunk_puts(buffer, ", ");
713:
714: /* Output the name */
715: HTChunk_puts(buffer, HTAssoc_name(pres));
716:
717: /* Only output the value if not empty string */
1.60 ! frystyk 718: if (value && *value) {
1.58 frystyk 719: HTChunk_puts(buffer, "=");
720: HTChunk_puts(buffer, value);
721: }
722: }
1.60 ! frystyk 723: if (!first) HTChunk_puts(buffer, ")");
! 724: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 725: HTChunk_delete(buffer);
726: HT_FREE(uri);
727: }
728: }
729:
1.55 frystyk 730: /* Count the amount of body data that we have read */
1.59 frystyk 731: if (HTRequest_method(request) == METHOD_GET) {
732: int length = HTAnchor_length(HTRequest_anchor(request));
733: if (length > 0) mr->get_bytes += length;
734: mr->get_docs++;
735: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 736: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 737: if (length > 0) mr->head_bytes += length;
738: mr->head_docs++;
739: } else {
740: mr->other_docs++;
1.55 frystyk 741: }
742:
1.58 frystyk 743: /* Cleanup the anchor so that we don't drown in metainformation */
744: if (!(mr->flags & MR_KEEP_META))
745: HTAnchor_clearHeader(HTRequest_anchor(request));
746:
1.55 frystyk 747: /* Delete this thread */
1.34 eric 748: Finger_delete(finger);
1.55 frystyk 749:
750: /* Should we stop? */
1.46 eric 751: if (mr->cnt <= 0) {
1.34 eric 752: if (SHOW_MSG) HTTrace(" Everything is finished...\n");
1.46 eric 753: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 754: }
1.46 eric 755: if (SHOW_MSG) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 756: return HT_OK;
757: }
758:
759: /* ------------------------------------------------------------------------- */
760: /* HTEXT INTERFACE */
761: /* ------------------------------------------------------------------------- */
762:
763: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
764: HTStream * stream)
765: {
766: HText * me;
1.34 eric 767: Finger * finger = (Finger *) HTRequest_context(request);
768: Robot * mr = finger->robot;
1.14 frystyk 769: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
770: HT_OUTOFMEM("HText_new2");
1.4 frystyk 771:
772: /* Bind the HText object together with the Request Object */
1.1 frystyk 773: me->request = request;
1.4 frystyk 774:
775: /* Add this HyperDoc object to our list */
776: if (!mr->htext) mr->htext = HTList_new();
777: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 778: return me;
779: }
780:
1.4 frystyk 781: PUBLIC void HText_free (HText * me) {
1.11 frystyk 782: if (me) HT_FREE (me);
1.4 frystyk 783: }
784:
1.1 frystyk 785: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
786: {
787: if (text && anchor) {
1.34 eric 788: Finger * finger = (Finger *) HTRequest_context(text->request);
789: Robot * mr = finger->robot;
1.1 frystyk 790: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
791: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 792: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 793: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 ! frystyk 794: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.58 frystyk 795: BOOL match = YES;
796: BOOL check = NO;
1.1 frystyk 797:
1.55 frystyk 798: if (!uri) return;
799: if (SHOW_MSG) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
800:
801: if (hd) {
802: if (SHOW_MSG) HTTrace("Already checked\n");
803: hd->hits++;
1.58 frystyk 804: HT_FREE(uri);
805: return;
806: }
807:
808: /* Check for prefix match */
809: if (mr->prefix) match = HTStrMatch(mr->prefix, uri) ? YES : NO;
810:
811: #ifdef HT_POSIX_REGEX
812: /* Check for any regular expression */
813: if (match && mr->include) {
814: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
815: }
816: if (match && mr->exclude) {
817: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
818: }
819: if (match && mr->check) {
820: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
821: }
822: #endif
823:
824: /* Test whether we already have a hyperdoc for this document */
825: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 ! frystyk 826: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
! 827: HyperDoc * last_doc = HTAnchor_document(last_anchor);
! 828: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 829: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
830: HTRequest * newreq = newfinger->request;
1.2 frystyk 831: HyperDoc_new(mr, dest_parent, depth);
1.60 ! frystyk 832: HTRequest_setParent(newreq, referer);
1.58 frystyk 833: if (check || depth >= mr->depth) {
834: if (SHOW_MSG) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 835: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 836: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 837: } else {
1.13 eric 838: if (SHOW_MSG) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 839: }
840: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.13 eric 841: if (SHOW_MSG) HTTrace("not tested!\n");
1.34 eric 842: Finger_delete(newfinger);
1.2 frystyk 843: }
1.7 frystyk 844: } else {
1.55 frystyk 845: if (SHOW_MSG) HTTrace("does not fulfill constraints\n");
1.60 ! frystyk 846: if (mr->reject) {
! 847: if (referer) {
! 848: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
! 849: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
! 850: HT_FREE(ref_addr);
! 851: }
! 852: }
1.2 frystyk 853: }
1.11 frystyk 854: HT_FREE(uri);
1.2 frystyk 855: }
856: }
857:
858: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 859: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 860: {
861: if (text && anchor) {
1.34 eric 862: Finger * finger = (Finger *) HTRequest_context(text->request);
863: Robot * mr = finger->robot;
1.59 frystyk 864: if (mr->flags & MR_IMG) {
1.60 ! frystyk 865: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
! 866: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
! 867: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
! 868: HyperDoc * hd = HTAnchor_document(dest_parent);
! 869: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 870: BOOL match = YES;
871:
872: if (hd) {
873: if (SHOW_MSG) HTTrace("Already checked\n");
874: hd->hits++;
1.11 frystyk 875: HT_FREE(uri);
1.59 frystyk 876: return;
1.2 frystyk 877: }
1.59 frystyk 878:
879: /* Check for prefix match */
880: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
881:
882: /* Test whether we already have a hyperdoc for this document */
883: if (match && dest) {
1.60 ! frystyk 884: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 885: mr->flags & MR_SAVE ?
886: METHOD_GET : METHOD_HEAD);
887: HTRequest * newreq = newfinger->request;
1.60 ! frystyk 888: HyperDoc_new(mr, dest_parent, 1);
! 889: HTRequest_setParent(newreq, referer);
! 890:
! 891: /* Check whether we should report missing ALT tags */
! 892: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
! 893: if (referer) {
! 894: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
! 895: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
! 896: HT_FREE(ref_addr);
! 897: }
! 898: }
! 899:
1.59 frystyk 900: if (SHOW_MSG) HTTrace("Robot....... Checking Image `%s\'\n", uri);
901: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
902: if (SHOW_MSG) HTTrace("Robot....... Image not tested!\n");
903: Finger_delete(newfinger);
904: }
905: } else {
906: if (SHOW_MSG) HTTrace("does not fulfill constraints\n");
1.60 ! frystyk 907: if (mr->reject) {
! 908: if (referer) {
! 909: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
! 910: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
! 911: HT_FREE(ref_addr);
! 912: }
! 913: }
1.1 frystyk 914: }
1.59 frystyk 915: HT_FREE(uri);
1.1 frystyk 916: }
917: }
918: }
919:
920: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 921: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 922: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
923: PUBLIC void HText_endAppend (HText * text) {}
924: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
925: PUBLIC void HText_beginAppend (HText * text) {}
926: PUBLIC void HText_appendParagraph (HText * text) {}
927:
1.48 frystyk 928: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
929: {
930: return (vfprintf(stderr, fmt, pArgs));
931: }
932:
1.1 frystyk 933: /* ------------------------------------------------------------------------- */
934: /* MAIN PROGRAM */
935: /* ------------------------------------------------------------------------- */
936:
/*
**	Program entry point. In order, it:
**	  1. runs the optional platform setup (GUSI, __MWERKS__) and, when
**	     HT_MEMLOG is defined, opens the memory log,
**	  2. initializes the library with a robot profile, installs RobotTrace
**	     as the trace callback and registers the HTML parser as a converter,
**	  3. creates the Robot object and fills it in from the command line,
**	  4. loads the rule file, opens the output file, sets up the persistent
**	     cache and opens the log files that were requested,
**	  5. issues the first request (a search when keywords were given,
**	     otherwise a plain load) and enters the event loop.
**
**	Arguments with a leading dash are options. Most file options take an
**	optional value and fall back to a DEFAULT_* name when the next argument
**	is missing or itself starts with a dash. The first argument without a
**	leading dash is the start URI; any further ones are search keywords.
**
**	Returns 0.
**	NOTE(review): the error paths call Cleanup() and then fall through to
**	code that uses startAnchor and finger, so Cleanup() presumably ends the
**	process -- confirm against its definition, which is outside this block.
*/
937: int main (int argc, char ** argv)
938: {
1.48 frystyk 939: int status = 0;
1.1 frystyk 940: int arg;
1.48 frystyk 941: BOOL cache = NO; /* Use persistent cache */
942: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 943: char * cache_root = NULL;
1.1 frystyk 944: HTChunk * keywords = NULL; /* From command line */
945: int keycnt = 0;
1.12 frystyk 946: Robot * mr = NULL;
1.43 frystyk 947: Finger * finger = NULL;
948: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 949:
950: /* Starts Mac GUSI socket library */
951: #ifdef GUSI
952: GUSISetup(GUSIwithSIOUXSockets);
953: GUSISetup(GUSIwithInternetSockets);
954: #endif
955:
956: #ifdef __MWERKS__ /* STR */
957: InitGraf((Ptr) &qd.thePort);
958: InitFonts();
959: InitWindows();
960: InitMenus(); TEInit();
961: InitDialogs(nil);
962: InitCursor();
963: SIOUXSettings.asktosaveonclose = false;
964: argc=ccommand(&argv);
1.50 frystyk 965: #endif /* __MWERKS__ */
1.1 frystyk 966:
1.50 frystyk 967: #ifdef HT_MEMLOG
1.51 frystyk 968: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 969: #endif
1.46 eric 970:
1.27 frystyk 971: /* Initiate W3C Reference Library with a robot profile */
972: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 973: HTTrace_setCallback(RobotTrace);
1.27 frystyk 974:
975: /* Add the default HTML parser to the set of converters */
976: {
977: HTList * converters = HTFormat_conversion();
978: HTMLInit(converters);
979: }
1.1 frystyk 980:
1.12 frystyk 981: /* Build a new robot object */
982: mr = Robot_new();
983:
1.1 frystyk 984: /* Scan command Line for parameters */
985: for (arg=1; arg<argc; arg++) {
986: if (*argv[arg] == '-') {
987:
988: /* non-interactive */
1.17 frystyk 989: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 990: HTAlert_setInteractive(NO);
991:
1.55 frystyk 992: /* log file */
1.1 frystyk 993: } else if (!strcmp(argv[arg], "-l")) {
994: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
995: argv[++arg] : DEFAULT_LOG_FILE;
996:
1.55 frystyk 997: /* hit file */
998: } else if (!strcmp(argv[arg], "-hit")) {
999: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1000: argv[++arg] : DEFAULT_HIT_FILE;
1001:
1.57 frystyk 1002: /* referer file */
/* NOTE: this and the other strncmp() tests below compare a prefix only, */
/* so any longer spelling that starts the same way selects the same branch */
1.58 frystyk 1003: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1004: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1005: argv[++arg] : DEFAULT_REFERER_FILE;
1006:
1.58 frystyk 1007: /* Not found error log file */
1008: } else if (!strncmp(argv[arg], "-404", 4)) {
1009: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1010: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1011:
1012: /* reject log file */
1013: } else if (!strncmp(argv[arg], "-rej", 4)) {
1014: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1015: argv[++arg] : DEFAULT_REJECT_FILE;
1016:
1017: /* negotiated resource log file */
1018: } else if (!strncmp(argv[arg], "-neg", 4)) {
1019: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1020: argv[++arg] : DEFAULT_CONNEG_FILE;
1021:
1022: /* mediatype distribution log file */
/* Only the file name and MR_KEEP_META are recorded; mr->mtfile is not opened in this function */
1023: } else if (!strncmp(argv[arg], "-for", 4)) {
1024: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1025: argv[++arg] : DEFAULT_FORMAT_FILE;
1026: mr->flags |= MR_KEEP_META;
1027:
1.60 ! frystyk 1028: /* charset distribution log file */
/* Only the file name and MR_KEEP_META are recorded; mr->charsetfile is not opened in this function */
! 1029: } else if (!strncmp(argv[arg], "-char", 5)) {
! 1030: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1031: argv[++arg] : DEFAULT_CHARSET_FILE;
! 1032: mr->flags |= MR_KEEP_META;
! 1033:
! 1034: /* no alt tags log file */
! 1035: } else if (!strncmp(argv[arg], "-alt", 4)) {
! 1036: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1037: argv[++arg] : DEFAULT_NOALTTAG_FILE;
! 1038:
1.55 frystyk 1039: /* rule file */
1.1 frystyk 1040: } else if (!strcmp(argv[arg], "-r")) {
1041: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1042: argv[++arg] : DEFAULT_RULE_FILE;
1043:
1044: /* output filename */
1045: } else if (!strcmp(argv[arg], "-o")) {
1046: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1047: argv[++arg] : DEFAULT_OUTPUT_FILE;
1048:
1.55 frystyk 1049: /* URI prefix */
/* A non-empty prefix is copied into mr->prefix with a trailing "*" appended */
1050: } else if (!strcmp(argv[arg], "-prefix")) {
1051: char * prefix = NULL;
1052: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1053: argv[++arg] : DEFAULT_PREFIX;
1054: if (*prefix) {
1055: StrAllocCopy(mr->prefix, prefix);
1056: StrAllocCat(mr->prefix, "*");
1057: }
1058:
1.1 frystyk 1059: /* timeout -- Change the default request timeout */
/* Values that atoi() turns into zero or less leave mr->timer untouched */
1060: } else if (!strcmp(argv[arg], "-timeout")) {
1061: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1062: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1063: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1064:
1.54 frystyk 1065: /* Force no pipelined requests */
1066: } else if (!strcmp(argv[arg], "-nopipe")) {
1067: HTTP_setConnectionMode(HTTP_NO_PIPELINING);
1068:
1.48 frystyk 1069: /* Start the persistent cache */
1070: } else if (!strcmp(argv[arg], "-cache")) {
1071: cache = YES;
1072:
1.54 frystyk 1073: /* Determine the cache root */
1074: } else if (!strcmp(argv[arg], "-cacheroot")) {
1075: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1076: argv[++arg] : NULL;
1.51 frystyk 1077:
1.52 frystyk 1078: /* Stream write flush delay in ms */
1079: } else if (!strcmp(argv[arg], "-delay")) {
1080: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1081: atoi(argv[++arg]) : DEFAULT_DELAY;
1082: HTHost_setDefaultWriteDelay(delay);
1083:
1.48 frystyk 1084: /* Persistent cache flush */
1085: } else if (!strcmp(argv[arg], "-flush")) {
1086: flush = YES;
1087:
1088: /* Do a cache validation */
1089: } else if (!strcmp(argv[arg], "-validate")) {
1090: mr->flags |= MR_VALIDATE;
1091:
1092: /* Do an end-to-end cache-validation */
1093: } else if (!strcmp(argv[arg], "-endvalidate")) {
1094: mr->flags |= MR_END_VALIDATE;
1095:
1.7 frystyk 1096: /* preemptive or non-preemptive access */
1.1 frystyk 1097: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1098: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1099:
1100: /* test inlined images */
1101: } else if (!strcmp(argv[arg], "-img")) {
1102: mr->flags |= MR_IMG;
1.45 frystyk 1103:
1104: /* load inlined images */
1105: } else if (!strcmp(argv[arg], "-saveimg")) {
1106: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1107:
1108: /* URI prefix for inlined images */
/* Same handling as -prefix, but the result is stored in mr->img_prefix */
1109: } else if (!strcmp(argv[arg], "-imgprefix")) {
1110: char * prefix = NULL;
1111: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1112: argv[++arg] : DEFAULT_IMG_PREFIX;
1113: if (*prefix) {
1114: StrAllocCopy(mr->img_prefix, prefix);
1115: StrAllocCat(mr->img_prefix, "*");
1116: }
1.2 frystyk 1117:
1118: /* load anchors */
/* -link and -depth are synonyms: both set MR_LINK and read an optional depth */
1.58 frystyk 1119: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1120: mr->flags |= MR_LINK;
1.7 frystyk 1121: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1122: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1123:
1.12 frystyk 1124: /* Output start and end time */
1125: } else if (!strcmp(argv[arg], "-ss")) {
1126: time_t local = time(NULL);
1.13 eric 1127: HTTrace("Robot started on %s\n",
1.12 frystyk 1128: HTDateTimeStr(&local, YES));
1129: mr->flags |= MR_TIME;
1130:
1.1 frystyk 1131: /* print version and exit */
1132: } else if (!strcmp(argv[arg], "-version")) {
1133: VersionInfo();
1134: Cleanup(mr, 0);
1.46 eric 1135:
1136: /* run in quiet mode */
1137: } else if (!strcmp(argv[arg], "-q")) {
1138: mr->flags |= MR_QUIET;
1.1 frystyk 1139:
1140: #ifdef WWWTRACE
1141: /* trace flags */
/* Prefix match on -v: the exact tests for -validate and -version come */
/* earlier in this chain, so those options never reach this branch. The */
/* text following -v is what gets passed on as the trace mask. */
1142: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1143: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1144: #endif
1145:
1.58 frystyk 1146: #ifdef HT_POSIX_REGEX
1147:
1148: /* If we can link against a POSIX regex library */
/* Unlike the file options, these three are ignored when no value follows */
1149: } else if (!strncmp(argv[arg], "-inc", 4)) {
1150: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 ! frystyk 1151: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1152: }
1153: } else if (!strncmp(argv[arg], "-exc", 4)) {
1154: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 ! frystyk 1155: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1156: }
1157: } else if (!strncmp(argv[arg], "-check", 6)) {
1158: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 ! frystyk 1159: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1160: }
1161: #endif
1162:
1.1 frystyk 1163: } else {
1.13 eric 1164: if (SHOW_MSG) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1165: }
1.17 frystyk 1166: } else { /* If no leading `-' then check for URL or keywords */
/* First such argument: the start address. It is parsed with mr->cwd as */
/* the second HTParse() argument (presumably the base to resolve against) */
1.1 frystyk 1167: if (!keycnt) {
1168: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1169: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1170: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1171: keycnt = 1;
1.11 frystyk 1172: HT_FREE(ref);
1.1 frystyk 1173: } else { /* Check for successive keyword arguments */
1174: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
/* keycnt is 1 for the first keyword (allocate the chunk) and larger for */
/* later ones (separate them from the previous keyword with a space) */
1175: if (keycnt++ <= 1)
1.5 frystyk 1176: keywords = HTChunk_new(128);
1.1 frystyk 1177: else
1.5 frystyk 1178: HTChunk_putc(keywords, ' ');
1179: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1180: HT_FREE(escaped);
1.1 frystyk 1181: }
1182: }
1183: }
1184:
1185: #ifdef CATCH_SIG
1186: SetSignal();
1187: #endif
1188:
/* keycnt is still 0 when no start address was given on the command line */
1189: if (!keycnt) {
1.13 eric 1190: if (SHOW_MSG) HTTrace("Please specify URL to check.\n");
1.1 frystyk 1191: Cleanup(mr, -1);
1192: }
1193:
1.23 manoli 1194: /* Testing that HTTrace is working */
1.47 frystyk 1195: if (SHOW_MSG) HTTrace ("Welcome to the W3C mini Robot\n");
1.23 manoli 1196:
1.1 frystyk 1197: /* Rule file specified? */
1198: if (mr->rules) {
1199: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1200: if (!HTLoadRules(rules))
1.13 eric 1201: if (SHOW_MSG) HTTrace("Can't access rules\n");
1.11 frystyk 1202: HT_FREE(rules);
1.1 frystyk 1203: }
1204:
1205: /* Output file specified? */
/* If the file cannot be opened, output falls back to OUTPUT */
1206: if (mr->outputfile) {
1207: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.13 eric 1208: if (SHOW_MSG) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1209: mr->output = OUTPUT;
1210: }
1211: }
1212:
1.48 frystyk 1213: /* Should we use persistent cache? */
/* -flush and -cacheroot only take effect here, i.e. together with -cache */
1214: if (cache) {
1.54 frystyk 1215: HTCacheInit(cache_root, 20);
1.49 frystyk 1216: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1217: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1218: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1219:
1220: /* Should we start by flushing? */
1221: if (flush) HTCache_flushAll();
1222: }
1223:
1.58 frystyk 1224: /* CLF Log file specified? */
1.55 frystyk 1225: if (mr->logfile) {
1226: mr->log = HTLog_open(mr->logfile, YES, YES);
1227: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1228: }
1229:
1.58 frystyk 1230: /* Referer Log file specified? */
1.57 frystyk 1231: if (mr->reffile) {
1232: mr->ref = HTLog_open(mr->reffile, YES, YES);
1233: if (mr->ref)
1234: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1235: }
1.1 frystyk 1236:
1.58 frystyk 1237: /* Not found error log specified? */
/* Same filter function as the referer log, but registered with status */
/* -404 instead of HT_ALL and with the notfound log as its parameter */
1238: if (mr->notfoundfile) {
1239: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1240: if (mr->notfound)
1241: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1242: }
1243:
1244: /* Negotiated resource log specified? */
1245: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 ! frystyk 1246:
! 1247: /* No alt tags log file specified? */
! 1248: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1249:
1250: /* Reject Log file specified? */
1251: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1252:
1253: /* Register our own terminate filter */
1.32 frystyk 1254: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1255:
1256: /* Setting event timeout */
1257: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1258:
1.56 frystyk 1259: mr->time = HTGetTimeInMillis();
1.37 frystyk 1260:
1.34 eric 1261: /* Start the request */
1262: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1263:
1264: /*
1265: ** Make sure that the first request is flushed immediately and not
1266: ** buffered in the output buffer
1267: */
1268: HTRequest_setFlush(finger->request, YES);
1269:
1270: /*
1.48 frystyk 1271: ** Check whether we should do some kind of cache validation on
1272: ** the load
1273: */
/* When both flags are set, HT_CACHE_END_VALIDATE is the mode set last */
1274: if (mr->flags & MR_VALIDATE)
1275: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1276: if (mr->flags & MR_END_VALIDATE)
1277: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1278:
1279: /*
1.43 frystyk 1280: ** Now do the load
1281: */
1.34 eric 1282: if (mr->flags & MR_PREEMPTIVE)
1283: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1284:
1285: if (keywords) /* Search */
1.34 eric 1286: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1287: else
1.34 eric 1288: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1289:
/* The keyword chunk is released once the request has been issued */
1.5 frystyk 1290: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1291: if (status != YES) {
1.13 eric 1292: if (SHOW_MSG) HTTrace("Can't access resource\n");
1.1 frystyk 1293: Cleanup(mr, -1);
1294: }
1295:
1296: /* Go into the event loop... */
1.34 eric 1297: HTEventList_loop(finger->request);
1.1 frystyk 1298:
1299: /* Only gets here if event loop fails */
1300: Cleanup(mr, 0);
1301: return 0;
1302: }
Webmaster