Annotation of libwww/Robot/src/HTRobot.c, revision 1.55
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
/* Version string; only defined here when the build did not supply one */
#ifndef W3C_VERSION
#define W3C_VERSION             "Unspecified"
#endif

/* Name and version handed to HTProfile_newRobot() and shown by -version */
#define APP_NAME                "W3CRobot"
#define APP_VERSION             W3C_VERSION

/* Values used when a command line option is given without an argument */
#define DEFAULT_OUTPUT_FILE     "robot.out"     /* -o */
#define DEFAULT_RULE_FILE       "robot.conf"    /* -r */
#define DEFAULT_LOG_FILE        "robot.log"     /* -l */
#define DEFAULT_HIT_FILE        "robot.hit"     /* -hit */
#define DEFAULT_MEMLOG          "robot.mem"     /* Used when HT_MEMLOG is on */
#define DEFAULT_PREFIX          ""              /* -prefix */
#define DEFAULT_DEPTH           0               /* -link */
#define DEFAULT_DELAY           50              /* Write delay in ms */

/* Flip to 1 to turn on memory logging */
#if 0
#define HT_MEMLOG               /* May be expensive in performance! */
#endif

/*
**  Whether informational messages are printed. The macro expands to an
**  expression using a variable called `mr', so a `Robot * mr' must be in
**  scope wherever it is used.
*/
/* #define SHOW_MSG             (WWWTRACE || HTAlert_interactive()) */
#define SHOW_MSG                (!(mr->flags & MR_QUIET))

#define DEFAULT_TIMEOUT         10000           /* timeout in millis */

/* Install the SIGPIPE handling in SetSignal() on SVR4 systems */
#ifdef __svr4__
#define CATCH_SIG
#endif
53:
/*
**  Option bits stored in Robot.flags. Each bit is switched on by the
**  command line option named in its comment.
*/
typedef enum _MRFlags {
    MR_IMG              = 0x01,         /* -img, -saveimg */
    MR_LINK             = 0x02,         /* -link */
    MR_PREEMPTIVE       = 0x04,         /* -single */
    MR_TIME             = 0x08,         /* -ss */
    MR_SAVE             = 0x10,         /* -saveimg */
    MR_QUIET            = 0x20,         /* -q */
    MR_VALIDATE         = 0x40,         /* -validate */
    MR_END_VALIDATE     = 0x80          /* -endvalidate */
} MRFlags;
64:
65: typedef struct _Robot {
1.2 frystyk 66: int depth; /* How deep is our tree */
1.30 frystyk 67: int cnt; /* Count of requests */
1.2 frystyk 68: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 69: HTList * htext; /* List of our HText Objects */
1.34 eric 70: HTList * fingers;
1.40 frystyk 71: int timer;
1.1 frystyk 72: char * cwd; /* Current dir URL */
73: char * rules;
1.55 ! frystyk 74: char * prefix;
1.1 frystyk 75: char * logfile;
1.55 ! frystyk 76: HTLog * log;
1.1 frystyk 77: char * outputfile;
78: FILE * output;
1.55 ! frystyk 79: char * hitfile;
1.1 frystyk 80: MRFlags flags;
1.55 ! frystyk 81:
! 82: long total_bytes; /* Total number of bytes processed */
! 83: long total_docs; /* Total number of documents processed */
! 84: time_t time; /* Time of run */
1.1 frystyk 85: } Robot;
1.34 eric 86:
87: typedef struct _Finger {
88: Robot * robot;
89: HTRequest * request;
90: HTParentAnchor * dest;
91: } Finger;
92:
/* Load state recorded in a HyperDoc object */
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   = 0,
    L_ERROR     = 1
} LoadState;
99:
100: /*
101: ** The HyperDoc object is bound to the anchor and contains information about
102: ** where we are in the search for recursive searches
103: */
104: typedef struct _HyperDoc {
105: HTParentAnchor * anchor;
106: LoadState state;
107: int depth;
1.55 ! frystyk 108: int hits;
1.1 frystyk 109: } HyperDoc;
110:
111: /*
112: ** This is the HText object that is created every time we start parsing a
113: ** HTML object
114: */
1.4 frystyk 115: struct _HText {
1.1 frystyk 116: HTRequest * request;
1.4 frystyk 117: };
1.1 frystyk 118:
119: PUBLIC HText * HTMainText = NULL;
120: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
121: PUBLIC HTStyleSheet * styleSheet = NULL;
122:
1.55 ! frystyk 123: PRIVATE HTComparer HitSort;
! 124:
1.1 frystyk 125: /* ------------------------------------------------------------------------- */
126:
1.13 eric 127: /* Standard (non-error) Output
128: ** ---------------------------
129: */
130: PUBLIC int OutputData(const char * fmt, ...)
131: {
132: int ret;
133: va_list pArgs;
134: va_start(pArgs, fmt);
135: ret = vfprintf(stdout, fmt, pArgs);
136: va_end(pArgs);
137: return ret;
138: }
139:
140: /* ------------------------------------------------------------------------- */
141:
1.2 frystyk 142: /* Create a "HyperDoc" object
143: ** --------------------------
144: ** A HyperDoc object contains information about whether we have already
145: ** started checking the anchor and the depth in our search
146: */
147: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
148: {
149: HyperDoc * hd;
1.14 frystyk 150: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
151: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 152: hd->state = L_INVALID;
153: hd->depth = depth;
1.55 ! frystyk 154: hd->hits = 1;
1.2 frystyk 155:
156: /* Bind the HyperDoc object together with the Anchor Object */
157: hd->anchor = anchor;
158: HTAnchor_setDocument(anchor, (void *) hd);
159:
160: /* Add this HyperDoc object to our list */
161: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
162: HTList_addObject(mr->hyperdoc, (void *) hd);
163: return hd;
164: }
165:
166: /* Delete a "HyperDoc" object
167: ** --------------------------
168: */
169: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
170: {
171: if (hd) {
1.11 frystyk 172: HT_FREE (hd);
1.2 frystyk 173: return YES;
174: }
175: return NO;
176: }
177:
1.55 ! frystyk 178: /*
! 179: ** Sort the anchor array and log reference count
! 180: */
! 181: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
! 182: {
! 183: if (mr && array) {
! 184: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
! 185: if (log) {
! 186: void ** data = NULL;
! 187: HTParentAnchor * anchor = NULL;
! 188: HTArray_sort(array, HitSort);
! 189: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
! 190: while (anchor) {
! 191: char * str = NULL;
! 192: char * uri = HTAnchor_address((HTAnchor *) anchor);
! 193: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
! 194: if (uri && hd) {
! 195: if ((str = (char *) HT_MALLOC(strlen(uri) + 50)) == NULL)
! 196: HT_OUTOFMEM("calculate_hits");
! 197: sprintf(str, "%8d %s\n", hd->hits, uri);
! 198: HTLog_addLine(log, str);
! 199: HT_FREE(str);
! 200: }
! 201: HT_FREE(uri);
! 202: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
! 203: }
! 204: }
! 205: HTLog_close(log);
! 206: return YES;
! 207: }
! 208: return NO;
! 209: }
! 210:
! 211: PRIVATE int HitSort (const void * a, const void * b)
! 212: {
! 213: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
! 214: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
! 215: if (aa && bb) return (bb->hits - aa->hits);
! 216: return bb - aa;
! 217: }
! 218:
! 219: /* Statistics
! 220: ** ----------
! 221: ** Calculates a bunch of statistics for the anchors traversed
! 222: */
! 223: PRIVATE BOOL calculate_statistics (Robot * mr)
! 224: {
! 225: if (!mr) return NO;
! 226:
! 227: /* Calculate efficiency */
! 228: {
! 229: time_t t = time(NULL) - mr->time;
! 230: if (t > 0.0) {
! 231: double loadfactor = mr->total_bytes / t;
! 232: char bytes[50];
! 233: HTNumToStr(mr->total_bytes, bytes, 50);
! 234: HTTrace("Downloaded %s bytes in %ld document bodies in %ld seconds (%2.1f bytes/sec)\n",
! 235: bytes, mr->total_docs, t, loadfactor);
! 236: }
! 237: }
! 238:
! 239: /* Create an array of existing anchors */
! 240: if (mr->total_docs > 1) {
! 241: HTArray * array = HTAnchor_getArray(mr->total_docs);
! 242: if (array) {
! 243:
! 244: /* Sort after hit counts */
! 245: if (mr->hitfile) calculate_hits(mr, array);
! 246:
! 247:
! 248: /* Add as may other stats here as you like */
! 249:
! 250: HTArray_delete(array);
! 251: }
! 252: }
! 253: return YES;
! 254: }
! 255:
1.1 frystyk 256: /* Create a Command Line Object
257: ** ----------------------------
258: */
259: PRIVATE Robot * Robot_new (void)
260: {
261: Robot * me;
1.41 frystyk 262: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 263: HT_OUTOFMEM("Robot_new");
1.2 frystyk 264: me->hyperdoc = HTList_new();
1.4 frystyk 265: me->htext = HTList_new();
1.40 frystyk 266: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 267: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 268: me->output = OUTPUT;
1.35 eric 269: me->cnt = 0;
1.34 eric 270: me->fingers = HTList_new();
1.1 frystyk 271: return me;
272: }
273:
274: /* Delete a Command Line Object
275: ** ----------------------------
276: */
277: PRIVATE BOOL Robot_delete (Robot * me)
278: {
279: if (me) {
1.34 eric 280: HTList_delete(me->fingers);
1.55 ! frystyk 281:
! 282: /* Calculate statistics */
! 283: calculate_statistics(me);
! 284:
! 285: if (me->hyperdoc) {
1.2 frystyk 286: HTList * cur = me->hyperdoc;
287: HyperDoc * pres;
288: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
289: HyperDoc_delete(pres);
290: HTList_delete(me->hyperdoc);
291: }
1.4 frystyk 292: if (me->htext) {
293: HTList * cur = me->htext;
294: HText * pres;
295: while ((pres = (HText *) HTList_nextObject(cur)))
296: HText_free(pres);
297: HTList_delete(me->htext);
298: }
1.55 ! frystyk 299: if (me->log) HTLog_close(me->log);
1.1 frystyk 300: if (me->output && me->output != STDOUT) fclose(me->output);
1.12 frystyk 301: if (me->flags & MR_TIME) {
302: time_t local = time(NULL);
1.13 eric 303: HTTrace("Robot terminated %s\n",HTDateTimeStr(&local,YES));
1.12 frystyk 304: }
1.55 ! frystyk 305:
1.11 frystyk 306: HT_FREE(me->cwd);
1.55 ! frystyk 307: HT_FREE(me->prefix);
1.11 frystyk 308: HT_FREE(me);
1.1 frystyk 309: return YES;
310: }
311: return NO;
312: }
313:
1.2 frystyk 314: /*
1.34 eric 315: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 316: */
1.34 eric 317: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 318: {
1.34 eric 319: Finger * me;
320: HTRequest * request = HTRequest_new();
321: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
322: HT_OUTOFMEM("Finger_new");
323: me->robot = robot;
324: me->request = request;
325: me->dest = dest;
326: HTList_addObject(robot->fingers, (void *)me);
327:
1.48 frystyk 328: /* Set the context for this request */
1.34 eric 329: HTRequest_setContext (request, me);
1.48 frystyk 330:
331: /* Check the various flags to customize the request */
332: if (robot->flags & MR_PREEMPTIVE)
333: HTRequest_setPreemptive(request, YES);
334: if (robot->flags & MR_VALIDATE)
335: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
336: if (robot->flags & MR_END_VALIDATE)
337: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
338:
339: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 340: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 341:
342: /* Set the method for this request */
1.34 eric 343: HTRequest_setMethod(request, method);
344: robot->cnt++;
345: return me;
1.2 frystyk 346: }
347:
1.34 eric 348: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 349: {
1.34 eric 350: HTList_removeObject(me->robot->fingers, (void *)me);
351: me->robot->cnt--;
1.37 frystyk 352:
353: /*
354: ** If we are down at one request then flush the output buffer
355: */
356: if (me->request) {
357: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 358: HTRequest_delete(me->request);
1.37 frystyk 359: }
360:
361: /*
362: ** Delete the request and free myself
363: */
1.34 eric 364: HT_FREE(me);
365: return YES;
1.2 frystyk 366: }
367:
368: /*
369: ** Cleanup and make sure we close all connections including the persistent
370: ** ones
371: */
1.1 frystyk 372: PRIVATE void Cleanup (Robot * me, int status)
373: {
374: Robot_delete(me);
1.29 eric 375: HTProfile_delete();
1.50 frystyk 376: #ifdef HT_MEMLOG
1.39 eric 377: HTMemLog_close();
1.47 frystyk 378: #endif
379:
1.1 frystyk 380: #ifdef VMS
381: exit(status ? status : 1);
382: #else
383: exit(status ? status : 0);
384: #endif
385: }
386:
#ifdef CATCH_SIG
#include <signal.h>
/*      SetSignal
**      ---------
**      Sets up signal handling. This might not be necessary to call if the
**      application has its own handlers (lossage on SVR4).
*/
PRIVATE void SetSignal (void)
{
    /*
    **  On some systems (SYSV) SIGPIPE has to be dealt with when attempting
    **  to connect to a remote host where you normally should get
    **  `connection refused' back. We ask for it to be ignored.
    */
    BOOL failed = (signal(SIGPIPE, SIG_IGN) == SIG_ERR) ? YES : NO;

    if (failed) {
        if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    } else {
        if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
411:
412: PRIVATE void VersionInfo (void)
413: {
1.13 eric 414: OutputData("\n\nW3C Reference Software\n\n");
415: OutputData("\tW3C Mini Robot (%s) version %s.\n",
1.1 frystyk 416: APP_NAME, APP_VERSION);
1.13 eric 417: OutputData("\tW3C Reference Library version %s.\n\n",HTLib_version());
418: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 419: }
420:
421: /* terminate_handler
422: ** -----------------
1.2 frystyk 423: ** This function is registered to handle the result of the request.
424: ** If no more requests are pending then terminate program
1.1 frystyk 425: */
1.32 frystyk 426: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
427: void * param, int status)
1.1 frystyk 428: {
1.34 eric 429: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 430: Robot * mr = finger->robot;
1.34 eric 431: if (SHOW_MSG) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 ! frystyk 432:
! 433: /* Count the amount of body data that we have read */
! 434: if (status == HT_LOADED && HTRequest_method(request) == METHOD_GET) {
! 435: mr->total_bytes += HTAnchor_length(HTRequest_anchor(request));
! 436: }
! 437:
! 438: /* Count the number of documents that we have processed */
! 439: mr->total_docs++;
! 440:
! 441: /* Delete this thread */
1.34 eric 442: Finger_delete(finger);
1.55 ! frystyk 443:
! 444: /* Should we stop? */
1.46 eric 445: if (mr->cnt <= 0) {
1.34 eric 446: if (SHOW_MSG) HTTrace(" Everything is finished...\n");
1.46 eric 447: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 448: }
1.46 eric 449: if (SHOW_MSG) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 450: return HT_OK;
451: }
452:
453: /* ------------------------------------------------------------------------- */
454: /* HTEXT INTERFACE */
455: /* ------------------------------------------------------------------------- */
456:
457: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
458: HTStream * stream)
459: {
460: HText * me;
1.34 eric 461: Finger * finger = (Finger *) HTRequest_context(request);
462: Robot * mr = finger->robot;
1.14 frystyk 463: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
464: HT_OUTOFMEM("HText_new2");
1.4 frystyk 465:
466: /* Bind the HText object together with the Request Object */
1.1 frystyk 467: me->request = request;
1.4 frystyk 468:
469: /* Add this HyperDoc object to our list */
470: if (!mr->htext) mr->htext = HTList_new();
471: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 472: return me;
473: }
474:
1.4 frystyk 475: PUBLIC void HText_free (HText * me) {
1.11 frystyk 476: if (me) HT_FREE (me);
1.4 frystyk 477: }
478:
1.1 frystyk 479: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
480: {
481: if (text && anchor) {
1.34 eric 482: Finger * finger = (Finger *) HTRequest_context(text->request);
483: Robot * mr = finger->robot;
1.1 frystyk 484: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
485: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 486: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 487: HyperDoc * hd = HTAnchor_document(dest_parent);
1.55 ! frystyk 488: BOOL prefix_match = YES;
1.1 frystyk 489:
1.55 ! frystyk 490: if (!uri) return;
! 491: if (SHOW_MSG) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
! 492:
! 493: /* Check for prefix match */
! 494: if (mr->prefix) prefix_match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1.7 frystyk 495:
1.2 frystyk 496: /* Test whether we already have a hyperdoc for this document */
1.55 ! frystyk 497: if (hd) {
! 498: if (SHOW_MSG) HTTrace("Already checked\n");
! 499: hd->hits++;
! 500: } else if (mr->flags & MR_LINK && prefix_match && dest_parent) {
1.1 frystyk 501: HTParentAnchor * parent = HTRequest_parent(text->request);
502: HyperDoc * last = HTAnchor_document(parent);
503: int depth = last ? last->depth+1 : 0;
1.34 eric 504: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
505: HTRequest * newreq = newfinger->request;
1.2 frystyk 506: HyperDoc_new(mr, dest_parent, depth);
1.7 frystyk 507: HTRequest_setParent(newreq, HTRequest_anchor(text->request));
508: if (depth >= mr->depth) {
509: if (SHOW_MSG)
1.13 eric 510: HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 511: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 512: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 513: } else {
1.13 eric 514: if (SHOW_MSG) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 515: }
516: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.13 eric 517: if (SHOW_MSG) HTTrace("not tested!\n");
1.34 eric 518: Finger_delete(newfinger);
1.2 frystyk 519: }
1.7 frystyk 520: } else {
1.55 ! frystyk 521: if (SHOW_MSG) HTTrace("does not fulfill constraints\n");
1.2 frystyk 522: }
1.11 frystyk 523: HT_FREE(uri);
1.2 frystyk 524: }
525: }
526:
527: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 528: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 529: {
530: if (text && anchor) {
1.34 eric 531: Finger * finger = (Finger *) HTRequest_context(text->request);
532: Robot * mr = finger->robot;
1.2 frystyk 533: HTParentAnchor * dest = (HTParentAnchor *)
534: HTAnchor_followMainLink((HTAnchor *) anchor);
535: HyperDoc * hd = HTAnchor_document(dest);
1.1 frystyk 536:
1.2 frystyk 537: /* Test whether we already have a hyperdoc for this document */
538: if (mr->flags & MR_IMG && dest && !hd) {
539: HTParentAnchor * parent = HTRequest_parent(text->request);
540: HyperDoc * last = HTAnchor_document(parent);
541: int depth = last ? last->depth+1 : 0;
1.45 frystyk 542: Finger * newfinger = Finger_new(mr, dest,
543: mr->flags & MR_SAVE ?
544: METHOD_GET : METHOD_HEAD);
1.34 eric 545: HTRequest * newreq = newfinger->request;
1.2 frystyk 546: HyperDoc_new(mr, dest, depth);
547: if (SHOW_MSG) {
548: char * uri = HTAnchor_address((HTAnchor *) dest);
1.13 eric 549: HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.11 frystyk 550: HT_FREE(uri);
1.2 frystyk 551: }
552: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
553: if (SHOW_MSG)
1.13 eric 554: HTTrace("Robot....... Image not tested!\n");
1.34 eric 555: Finger_delete(newfinger);
1.1 frystyk 556: }
557: }
558: }
559: }
560:
561: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 562: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 563: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
564: PUBLIC void HText_endAppend (HText * text) {}
565: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
566: PUBLIC void HText_beginAppend (HText * text) {}
567: PUBLIC void HText_appendParagraph (HText * text) {}
568:
1.48 frystyk 569: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
570: {
571: return (vfprintf(stderr, fmt, pArgs));
572: }
573:
1.1 frystyk 574: /* ------------------------------------------------------------------------- */
575: /* MAIN PROGRAM */
576: /* ------------------------------------------------------------------------- */
577:
578: int main (int argc, char ** argv)
579: {
1.48 frystyk 580: int status = 0;
1.1 frystyk 581: int arg;
1.48 frystyk 582: BOOL cache = NO; /* Use persistent cache */
583: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 584: char * cache_root = NULL;
1.1 frystyk 585: HTChunk * keywords = NULL; /* From command line */
586: int keycnt = 0;
1.12 frystyk 587: Robot * mr = NULL;
1.43 frystyk 588: Finger * finger = NULL;
589: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 590:
591: /* Starts Mac GUSI socket library */
592: #ifdef GUSI
593: GUSISetup(GUSIwithSIOUXSockets);
594: GUSISetup(GUSIwithInternetSockets);
595: #endif
596:
597: #ifdef __MWERKS__ /* STR */
598: InitGraf((Ptr) &qd.thePort);
599: InitFonts();
600: InitWindows();
601: InitMenus(); TEInit();
602: InitDialogs(nil);
603: InitCursor();
604: SIOUXSettings.asktosaveonclose = false;
605: argc=ccommand(&argv);
1.50 frystyk 606: #endif /* __MWERKS__ */
1.1 frystyk 607:
1.50 frystyk 608: #ifdef HT_MEMLOG
1.51 frystyk 609: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 610: #endif
1.46 eric 611:
1.27 frystyk 612: /* Initiate W3C Reference Library with a robot profile */
613: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 614: HTTrace_setCallback(RobotTrace);
1.27 frystyk 615:
616: /* Add the default HTML parser to the set of converters */
617: {
618: HTList * converters = HTFormat_conversion();
619: HTMLInit(converters);
620: }
1.1 frystyk 621:
1.12 frystyk 622: /* Build a new robot object */
623: mr = Robot_new();
624:
1.1 frystyk 625: /* Scan command Line for parameters */
626: for (arg=1; arg<argc; arg++) {
627: if (*argv[arg] == '-') {
628:
629: /* non-interactive */
1.17 frystyk 630: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 631: HTAlert_setInteractive(NO);
632:
1.55 ! frystyk 633: /* log file */
1.1 frystyk 634: } else if (!strcmp(argv[arg], "-l")) {
635: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
636: argv[++arg] : DEFAULT_LOG_FILE;
637:
1.55 ! frystyk 638: /* hit file */
! 639: } else if (!strcmp(argv[arg], "-hit")) {
! 640: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 641: argv[++arg] : DEFAULT_HIT_FILE;
! 642:
! 643: /* rule file */
1.1 frystyk 644: } else if (!strcmp(argv[arg], "-r")) {
645: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
646: argv[++arg] : DEFAULT_RULE_FILE;
647:
648: /* output filename */
649: } else if (!strcmp(argv[arg], "-o")) {
650: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
651: argv[++arg] : DEFAULT_OUTPUT_FILE;
652:
1.55 ! frystyk 653: /* URI prefix */
! 654: } else if (!strcmp(argv[arg], "-prefix")) {
! 655: char * prefix = NULL;
! 656: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
! 657: argv[++arg] : DEFAULT_PREFIX;
! 658: if (*prefix) {
! 659: StrAllocCopy(mr->prefix, prefix);
! 660: StrAllocCat(mr->prefix, "*");
! 661: }
! 662:
1.1 frystyk 663: /* timeout -- Change the default request timeout */
664: } else if (!strcmp(argv[arg], "-timeout")) {
665: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
666: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 667: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 668:
1.54 frystyk 669: /* Force no pipelined requests */
670: } else if (!strcmp(argv[arg], "-nopipe")) {
671: HTTP_setConnectionMode(HTTP_NO_PIPELINING);
672:
1.48 frystyk 673: /* Start the persistent cache */
674: } else if (!strcmp(argv[arg], "-cache")) {
675: cache = YES;
676:
1.54 frystyk 677: /* Determine the cache root */
678: } else if (!strcmp(argv[arg], "-cacheroot")) {
679: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
680: argv[++arg] : NULL;
1.51 frystyk 681:
1.52 frystyk 682: /* Stream write flush delay in ms */
683: } else if (!strcmp(argv[arg], "-delay")) {
684: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
685: atoi(argv[++arg]) : DEFAULT_DELAY;
686: HTHost_setDefaultWriteDelay(delay);
687:
1.48 frystyk 688: /* Persistent cache flush */
689: } else if (!strcmp(argv[arg], "-flush")) {
690: flush = YES;
691:
692: /* Do a cache validation */
693: } else if (!strcmp(argv[arg], "-validate")) {
694: mr->flags |= MR_VALIDATE;
695:
696: /* Do an end-to-end cache-validation */
697: } else if (!strcmp(argv[arg], "-endvalidate")) {
698: mr->flags |= MR_END_VALIDATE;
699:
1.7 frystyk 700: /* preemptive or non-preemptive access */
1.1 frystyk 701: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 702: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 703:
704: /* test inlined images */
705: } else if (!strcmp(argv[arg], "-img")) {
706: mr->flags |= MR_IMG;
1.45 frystyk 707:
708: /* load inlined images */
709: } else if (!strcmp(argv[arg], "-saveimg")) {
710: mr->flags |= (MR_IMG | MR_SAVE);
1.2 frystyk 711:
712: /* load anchors */
713: } else if (!strcmp(argv[arg], "-link")) {
714: mr->flags |= MR_LINK;
1.7 frystyk 715: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
716: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 717:
1.12 frystyk 718: /* Output start and end time */
719: } else if (!strcmp(argv[arg], "-ss")) {
720: time_t local = time(NULL);
1.13 eric 721: HTTrace("Robot started on %s\n",
1.12 frystyk 722: HTDateTimeStr(&local, YES));
723: mr->flags |= MR_TIME;
724:
1.1 frystyk 725: /* print version and exit */
726: } else if (!strcmp(argv[arg], "-version")) {
727: VersionInfo();
728: Cleanup(mr, 0);
1.46 eric 729:
730: /* run in quiet mode */
731: } else if (!strcmp(argv[arg], "-q")) {
732: mr->flags |= MR_QUIET;
1.1 frystyk 733:
734: #ifdef WWWTRACE
735: /* trace flags */
736: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 737: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 738: #endif
739:
740: } else {
1.13 eric 741: if (SHOW_MSG) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 742: }
1.17 frystyk 743: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 744: if (!keycnt) {
745: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.34 eric 746: startAnchor = (HTParentAnchor *) HTAnchor_findAddress(ref);
747: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 748: keycnt = 1;
1.11 frystyk 749: HT_FREE(ref);
1.1 frystyk 750: } else { /* Check for successive keyword arguments */
751: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
752: if (keycnt++ <= 1)
1.5 frystyk 753: keywords = HTChunk_new(128);
1.1 frystyk 754: else
1.5 frystyk 755: HTChunk_putc(keywords, ' ');
756: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 757: HT_FREE(escaped);
1.1 frystyk 758: }
759: }
760: }
761:
762: #ifdef CATCH_SIG
763: SetSignal();
764: #endif
765:
766: if (!keycnt) {
1.13 eric 767: if (SHOW_MSG) HTTrace("Please specify URL to check.\n");
1.1 frystyk 768: Cleanup(mr, -1);
769: }
770:
1.23 manoli 771: /* Testing that HTTrace is working */
1.47 frystyk 772: if (SHOW_MSG) HTTrace ("Welcome to the W3C mini Robot\n");
1.23 manoli 773:
1.1 frystyk 774: /* Rule file specified? */
775: if (mr->rules) {
776: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 777: if (!HTLoadRules(rules))
1.13 eric 778: if (SHOW_MSG) HTTrace("Can't access rules\n");
1.11 frystyk 779: HT_FREE(rules);
1.1 frystyk 780: }
781:
782: /* Output file specified? */
783: if (mr->outputfile) {
784: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.13 eric 785: if (SHOW_MSG) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 786: mr->output = OUTPUT;
787: }
788: }
789:
1.48 frystyk 790: /* Should we use persistent cache? */
791: if (cache) {
1.54 frystyk 792: HTCacheInit(cache_root, 20);
1.49 frystyk 793: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
794: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
795: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 796:
797: /* Should we start by flushing? */
798: if (flush) HTCache_flushAll();
799: }
800:
1.1 frystyk 801: /* Log file specifed? */
1.55 ! frystyk 802: if (mr->logfile) {
! 803: mr->log = HTLog_open(mr->logfile, YES, YES);
! 804: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
! 805: }
1.1 frystyk 806:
1.27 frystyk 807: /* Register our own someterminater filter */
1.32 frystyk 808: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 809:
810: /* Setting event timeout */
811: HTHost_setEventTimeout(mr->timer);
1.55 ! frystyk 812:
! 813: mr->time = time(NULL);
1.37 frystyk 814:
1.34 eric 815: /* Start the request */
816: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 817:
818: /*
819: ** Make sure that the first request is flushed immediately and not
820: ** buffered in the output buffer
821: */
822: HTRequest_setFlush(finger->request, YES);
823:
824: /*
1.48 frystyk 825: ** Check whether we should do some kind of cache validation on
826: ** the load
827: */
828: if (mr->flags & MR_VALIDATE)
829: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
830: if (mr->flags & MR_END_VALIDATE)
831: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
832:
833: /*
1.43 frystyk 834: ** Now do the load
835: */
1.34 eric 836: if (mr->flags & MR_PREEMPTIVE)
837: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 838:
839: if (keywords) /* Search */
1.34 eric 840: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 841: else
1.34 eric 842: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 843:
1.5 frystyk 844: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 845: if (status != YES) {
1.13 eric 846: if (SHOW_MSG) HTTrace("Can't access resource\n");
1.1 frystyk 847: Cleanup(mr, -1);
848: }
849:
850: /* Go into the event loop... */
1.34 eric 851: HTEventList_loop(finger->request);
1.1 frystyk 852:
853: /* Only gets here if event loop fails */
854: Cleanup(mr, 0);
855: return 0;
856: }
Webmaster