Annotation of libwww/Robot/src/HTRobot.c, revision 1.56
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
/*
** Compile-time configuration for the robot: application identity, the
** defaults used when a command line option is given without a value, and
** a few feature switches.
*/
1.14 frystyk 25: #ifndef W3C_VERSION
1.33 eric 26: #define W3C_VERSION "Unspecified" /* Fallback when not defined by the build */
1.1 frystyk 27: #endif
28:
29: #define APP_NAME "W3CRobot"
1.14 frystyk 30: #define APP_VERSION W3C_VERSION
1.1 frystyk 31:
32: #define DEFAULT_OUTPUT_FILE "robot.out" /* -o without a file name */
33: #define DEFAULT_RULE_FILE "robot.conf" /* -r without a file name */
34: #define DEFAULT_LOG_FILE "robot.log" /* -l without a file name */
1.55 frystyk 35: #define DEFAULT_HIT_FILE "robot.hit" /* -hit without a file name */
1.51 frystyk 36: #define DEFAULT_MEMLOG "robot.mem" /* Passed to HTMemLog_open when HT_MEMLOG is on */
1.55 frystyk 37: #define DEFAULT_PREFIX "" /* Empty: -prefix without a value sets no prefix */
1.7 frystyk 38: #define DEFAULT_DEPTH 0 /* -link without a depth */
1.53 frystyk 39: #define DEFAULT_DELAY 50 /* Write delay in ms */
1.1 frystyk 40:
1.51 frystyk 41: #if 0
1.53 frystyk 42: #define HT_MEMLOG /* May be expensive in performance! */
1.51 frystyk 43: #endif
44:
1.46 eric 45: /* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */
46: #define SHOW_MSG (!(mr->flags & MR_QUIET)) /* Needs a Robot * named `mr' in scope */
1.1 frystyk 47:
1.40 frystyk 48: #define DEFAULT_TIMEOUT 10000 /* timeout in millis */
1.1 frystyk 49:
/* CATCH_SIG enables SetSignal() below and its call from main() */
50: #if defined(__svr4__)
51: #define CATCH_SIG
52: #endif
53:
/*
**  Option bits for a robot run. They are OR'ed together in Robot.flags
**  while the command line is parsed.
*/
typedef enum _MRFlags {
    MR_IMG          = 1 << 0,   /* -img, -saveimg: check inlined images */
    MR_LINK         = 1 << 1,   /* -link: follow anchors */
    MR_PREEMPTIVE   = 1 << 2,   /* -single: requests are made preemptive */
    MR_TIME         = 1 << 3,   /* -ss: print start and end time */
    MR_SAVE         = 1 << 4,   /* -saveimg: GET images instead of HEAD */
    MR_QUIET        = 1 << 5,   /* -q: silences everything behind SHOW_MSG */
    MR_VALIDATE     = 1 << 6,   /* -validate: cache validation */
    MR_END_VALIDATE = 1 << 7    /* -endvalidate: end-to-end cache validation */
} MRFlags;
64:
65: typedef struct _Robot {
1.2 frystyk 66: int depth; /* How deep is our tree */
1.30 frystyk 67: int cnt; /* Count of requests */
1.2 frystyk 68: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 69: HTList * htext; /* List of our HText Objects */
1.34 eric 70: HTList * fingers;
1.40 frystyk 71: int timer;
1.1 frystyk 72: char * cwd; /* Current dir URL */
73: char * rules;
1.55 frystyk 74: char * prefix;
1.1 frystyk 75: char * logfile;
1.55 frystyk 76: HTLog * log;
1.1 frystyk 77: char * outputfile;
78: FILE * output;
1.55 frystyk 79: char * hitfile;
1.1 frystyk 80: MRFlags flags;
1.55 frystyk 81:
82: long total_bytes; /* Total number of bytes processed */
83: long total_docs; /* Total number of documents processed */
1.56 ! frystyk 84: ms_t time; /* Time of run */
1.1 frystyk 85: } Robot;
1.34 eric 86:
87: typedef struct _Finger {
88: Robot * robot;
89: HTRequest * request;
90: HTParentAnchor * dest;
91: } Finger;
92:
/*
**  Load state kept in a HyperDoc object. A new HyperDoc starts out as
**  L_INVALID.
*/
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   = 0,
    L_ERROR     = 1
} LoadState;
99:
100: /*
101: ** The HyperDoc object is bound to the anchor and contains information about
102: ** where we are in the search for recursive searches
103: */
104: typedef struct _HyperDoc {
105: HTParentAnchor * anchor;
106: LoadState state;
107: int depth;
1.55 frystyk 108: int hits;
1.1 frystyk 109: } HyperDoc;
110:
111: /*
112: ** This is the HText object that is created every time we start parsing a
113: ** HTML object
114: */
1.4 frystyk 115: struct _HText {
1.1 frystyk 116: HTRequest * request;
1.4 frystyk 117: };
1.1 frystyk 118:
119: PUBLIC HText * HTMainText = NULL;
120: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
121: PUBLIC HTStyleSheet * styleSheet = NULL;
122:
1.55 frystyk 123: PRIVATE HTComparer HitSort;
124:
1.1 frystyk 125: /* ------------------------------------------------------------------------- */
126:
1.13 eric 127: /* Standard (non-error) Output
128: ** ---------------------------
129: */
130: PUBLIC int OutputData(const char * fmt, ...)
131: {
132: int ret;
133: va_list pArgs;
134: va_start(pArgs, fmt);
135: ret = vfprintf(stdout, fmt, pArgs);
136: va_end(pArgs);
137: return ret;
138: }
139:
140: /* ------------------------------------------------------------------------- */
141:
1.2 frystyk 142: /* Create a "HyperDoc" object
143: ** --------------------------
144: ** A HyperDoc object contains information about whether we have already
145: ** started checking the anchor and the depth in our search
146: */
147: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
148: {
149: HyperDoc * hd;
1.14 frystyk 150: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
151: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 152: hd->state = L_INVALID;
153: hd->depth = depth;
1.55 frystyk 154: hd->hits = 1;
1.2 frystyk 155:
156: /* Bind the HyperDoc object together with the Anchor Object */
157: hd->anchor = anchor;
158: HTAnchor_setDocument(anchor, (void *) hd);
159:
160: /* Add this HyperDoc object to our list */
161: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
162: HTList_addObject(mr->hyperdoc, (void *) hd);
163: return hd;
164: }
165:
166: /* Delete a "HyperDoc" object
167: ** --------------------------
168: */
169: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
170: {
171: if (hd) {
1.11 frystyk 172: HT_FREE (hd);
1.2 frystyk 173: return YES;
174: }
175: return NO;
176: }
177:
1.55 frystyk 178: /*
179: ** Sort the anchor array and log reference count
180: */
181: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
182: {
183: if (mr && array) {
184: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
185: if (log) {
186: void ** data = NULL;
187: HTParentAnchor * anchor = NULL;
188: HTArray_sort(array, HitSort);
189: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
190: while (anchor) {
191: char * str = NULL;
192: char * uri = HTAnchor_address((HTAnchor *) anchor);
193: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
194: if (uri && hd) {
195: if ((str = (char *) HT_MALLOC(strlen(uri) + 50)) == NULL)
196: HT_OUTOFMEM("calculate_hits");
197: sprintf(str, "%8d %s\n", hd->hits, uri);
198: HTLog_addLine(log, str);
199: HT_FREE(str);
200: }
201: HT_FREE(uri);
202: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
203: }
204: }
205: HTLog_close(log);
206: return YES;
207: }
208: return NO;
209: }
210:
211: PRIVATE int HitSort (const void * a, const void * b)
212: {
213: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
214: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
215: if (aa && bb) return (bb->hits - aa->hits);
216: return bb - aa;
217: }
218:
219: /* Statistics
220: ** ----------
221: ** Calculates a bunch of statistics for the anchors traversed
222: */
223: PRIVATE BOOL calculate_statistics (Robot * mr)
224: {
225: if (!mr) return NO;
226:
227: /* Calculate efficiency */
228: {
1.56 ! frystyk 229: ms_t t = HTGetTimeInMillis() - mr->time;
! 230: if (t > 0) {
! 231: double loadfactor = 1000 * (mr->total_bytes / t);
! 232: double secs = t / 1000.0;
1.55 frystyk 233: char bytes[50];
234: HTNumToStr(mr->total_bytes, bytes, 50);
1.56 ! frystyk 235: HTTrace("Downloaded %s bytes in %ld document bodies in %.2f seconds (%2.1f bytes/sec)\n",
! 236: bytes, mr->total_docs, secs, loadfactor);
1.55 frystyk 237: }
238: }
239:
240: /* Create an array of existing anchors */
241: if (mr->total_docs > 1) {
242: HTArray * array = HTAnchor_getArray(mr->total_docs);
243: if (array) {
244:
245: /* Sort after hit counts */
246: if (mr->hitfile) calculate_hits(mr, array);
247:
248:
249: /* Add as may other stats here as you like */
250:
251: HTArray_delete(array);
252: }
253: }
254: return YES;
255: }
256:
1.1 frystyk 257: /* Create a Command Line Object
258: ** ----------------------------
259: */
260: PRIVATE Robot * Robot_new (void)
261: {
262: Robot * me;
1.41 frystyk 263: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 264: HT_OUTOFMEM("Robot_new");
1.2 frystyk 265: me->hyperdoc = HTList_new();
1.4 frystyk 266: me->htext = HTList_new();
1.40 frystyk 267: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 268: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 269: me->output = OUTPUT;
1.35 eric 270: me->cnt = 0;
1.34 eric 271: me->fingers = HTList_new();
1.1 frystyk 272: return me;
273: }
274:
275: /* Delete a Command Line Object
276: ** ----------------------------
277: */
278: PRIVATE BOOL Robot_delete (Robot * me)
279: {
280: if (me) {
1.34 eric 281: HTList_delete(me->fingers);
1.55 frystyk 282:
283: /* Calculate statistics */
284: calculate_statistics(me);
285:
286: if (me->hyperdoc) {
1.2 frystyk 287: HTList * cur = me->hyperdoc;
288: HyperDoc * pres;
289: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
290: HyperDoc_delete(pres);
291: HTList_delete(me->hyperdoc);
292: }
1.4 frystyk 293: if (me->htext) {
294: HTList * cur = me->htext;
295: HText * pres;
296: while ((pres = (HText *) HTList_nextObject(cur)))
297: HText_free(pres);
298: HTList_delete(me->htext);
299: }
1.55 frystyk 300: if (me->log) HTLog_close(me->log);
1.1 frystyk 301: if (me->output && me->output != STDOUT) fclose(me->output);
1.12 frystyk 302: if (me->flags & MR_TIME) {
303: time_t local = time(NULL);
1.13 eric 304: HTTrace("Robot terminated %s\n",HTDateTimeStr(&local,YES));
1.12 frystyk 305: }
1.55 frystyk 306:
1.11 frystyk 307: HT_FREE(me->cwd);
1.55 frystyk 308: HT_FREE(me->prefix);
1.11 frystyk 309: HT_FREE(me);
1.1 frystyk 310: return YES;
311: }
312: return NO;
313: }
314:
1.2 frystyk 315: /*
1.34 eric 316: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 317: */
1.34 eric 318: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 319: {
1.34 eric 320: Finger * me;
321: HTRequest * request = HTRequest_new();
322: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
323: HT_OUTOFMEM("Finger_new");
324: me->robot = robot;
325: me->request = request;
326: me->dest = dest;
327: HTList_addObject(robot->fingers, (void *)me);
328:
1.48 frystyk 329: /* Set the context for this request */
1.34 eric 330: HTRequest_setContext (request, me);
1.48 frystyk 331:
332: /* Check the various flags to customize the request */
333: if (robot->flags & MR_PREEMPTIVE)
334: HTRequest_setPreemptive(request, YES);
335: if (robot->flags & MR_VALIDATE)
336: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
337: if (robot->flags & MR_END_VALIDATE)
338: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
339:
340: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 341: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 342:
343: /* Set the method for this request */
1.34 eric 344: HTRequest_setMethod(request, method);
345: robot->cnt++;
346: return me;
1.2 frystyk 347: }
348:
1.34 eric 349: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 350: {
1.34 eric 351: HTList_removeObject(me->robot->fingers, (void *)me);
352: me->robot->cnt--;
1.37 frystyk 353:
354: /*
355: ** If we are down at one request then flush the output buffer
356: */
357: if (me->request) {
358: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 359: HTRequest_delete(me->request);
1.37 frystyk 360: }
361:
362: /*
363: ** Delete the request and free myself
364: */
1.34 eric 365: HT_FREE(me);
366: return YES;
1.2 frystyk 367: }
368:
369: /*
370: ** Cleanup and make sure we close all connections including the persistent
371: ** ones
372: */
1.1 frystyk 373: PRIVATE void Cleanup (Robot * me, int status)
374: {
375: Robot_delete(me);
1.29 eric 376: HTProfile_delete();
1.50 frystyk 377: #ifdef HT_MEMLOG
1.39 eric 378: HTMemLog_close();
1.47 frystyk 379: #endif
380:
1.1 frystyk 381: #ifdef VMS
382: exit(status ? status : 1);
383: #else
384: exit(status ? status : 0);
385: #endif
386: }
387:
#ifdef CATCH_SIG
#include <signal.h>
/*	SetSignal
**	This function sets up signal handlers. This might not be necessary to
**	call if the application has its own handlers (lossage on SVR4)
*/
PRIVATE void SetSignal (void)
{
    /*
    ** On some systems (SYSV) it is necessary to ignore the SIGPIPE signal
    ** when attempting to connect to a remote host where you normally
    ** should get `connection refused' back
    */
    BOOL failed = (signal(SIGPIPE, SIG_IGN) == SIG_ERR) ? YES : NO;

    if (PROT_TRACE) {
        if (failed)
            HTTrace("HTSignal.... Can't catch SIGPIPE\n");
        else
            HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
412:
413: PRIVATE void VersionInfo (void)
414: {
1.13 eric 415: OutputData("\n\nW3C Reference Software\n\n");
416: OutputData("\tW3C Mini Robot (%s) version %s.\n",
1.1 frystyk 417: APP_NAME, APP_VERSION);
1.13 eric 418: OutputData("\tW3C Reference Library version %s.\n\n",HTLib_version());
419: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 420: }
421:
422: /* terminate_handler
423: ** -----------------
1.2 frystyk 424: ** This function is registered to handle the result of the request.
425: ** If no more requests are pending then terminate program
1.1 frystyk 426: */
1.32 frystyk 427: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
428: void * param, int status)
1.1 frystyk 429: {
1.34 eric 430: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 431: Robot * mr = finger->robot;
1.34 eric 432: if (SHOW_MSG) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 433:
434: /* Count the amount of body data that we have read */
435: if (status == HT_LOADED && HTRequest_method(request) == METHOD_GET) {
1.56 ! frystyk 436: int length = HTAnchor_length(HTRequest_anchor(request));
! 437: if (length > 0) mr->total_bytes += length;
1.55 frystyk 438: }
439:
440: /* Count the number of documents that we have processed */
441: mr->total_docs++;
442:
443: /* Delete this thread */
1.34 eric 444: Finger_delete(finger);
1.55 frystyk 445:
446: /* Should we stop? */
1.46 eric 447: if (mr->cnt <= 0) {
1.34 eric 448: if (SHOW_MSG) HTTrace(" Everything is finished...\n");
1.46 eric 449: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 450: }
1.46 eric 451: if (SHOW_MSG) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 452: return HT_OK;
453: }
454:
455: /* ------------------------------------------------------------------------- */
456: /* HTEXT INTERFACE */
457: /* ------------------------------------------------------------------------- */
458:
459: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
460: HTStream * stream)
461: {
462: HText * me;
1.34 eric 463: Finger * finger = (Finger *) HTRequest_context(request);
464: Robot * mr = finger->robot;
1.14 frystyk 465: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
466: HT_OUTOFMEM("HText_new2");
1.4 frystyk 467:
468: /* Bind the HText object together with the Request Object */
1.1 frystyk 469: me->request = request;
1.4 frystyk 470:
471: /* Add this HyperDoc object to our list */
472: if (!mr->htext) mr->htext = HTList_new();
473: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 474: return me;
475: }
476:
1.4 frystyk 477: PUBLIC void HText_free (HText * me) {
1.11 frystyk 478: if (me) HT_FREE (me);
1.4 frystyk 479: }
480:
1.1 frystyk 481: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
482: {
483: if (text && anchor) {
1.34 eric 484: Finger * finger = (Finger *) HTRequest_context(text->request);
485: Robot * mr = finger->robot;
1.1 frystyk 486: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
487: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 488: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 489: HyperDoc * hd = HTAnchor_document(dest_parent);
1.55 frystyk 490: BOOL prefix_match = YES;
1.1 frystyk 491:
1.55 frystyk 492: if (!uri) return;
493: if (SHOW_MSG) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
494:
495: /* Check for prefix match */
496: if (mr->prefix) prefix_match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1.7 frystyk 497:
1.2 frystyk 498: /* Test whether we already have a hyperdoc for this document */
1.55 frystyk 499: if (hd) {
500: if (SHOW_MSG) HTTrace("Already checked\n");
501: hd->hits++;
502: } else if (mr->flags & MR_LINK && prefix_match && dest_parent) {
1.1 frystyk 503: HTParentAnchor * parent = HTRequest_parent(text->request);
504: HyperDoc * last = HTAnchor_document(parent);
505: int depth = last ? last->depth+1 : 0;
1.34 eric 506: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
507: HTRequest * newreq = newfinger->request;
1.2 frystyk 508: HyperDoc_new(mr, dest_parent, depth);
1.7 frystyk 509: HTRequest_setParent(newreq, HTRequest_anchor(text->request));
510: if (depth >= mr->depth) {
511: if (SHOW_MSG)
1.13 eric 512: HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 513: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 514: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 515: } else {
1.13 eric 516: if (SHOW_MSG) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 517: }
518: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.13 eric 519: if (SHOW_MSG) HTTrace("not tested!\n");
1.34 eric 520: Finger_delete(newfinger);
1.2 frystyk 521: }
1.7 frystyk 522: } else {
1.55 frystyk 523: if (SHOW_MSG) HTTrace("does not fulfill constraints\n");
1.2 frystyk 524: }
1.11 frystyk 525: HT_FREE(uri);
1.2 frystyk 526: }
527: }
528:
529: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 530: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 531: {
532: if (text && anchor) {
1.34 eric 533: Finger * finger = (Finger *) HTRequest_context(text->request);
534: Robot * mr = finger->robot;
1.2 frystyk 535: HTParentAnchor * dest = (HTParentAnchor *)
536: HTAnchor_followMainLink((HTAnchor *) anchor);
537: HyperDoc * hd = HTAnchor_document(dest);
1.1 frystyk 538:
1.2 frystyk 539: /* Test whether we already have a hyperdoc for this document */
540: if (mr->flags & MR_IMG && dest && !hd) {
541: HTParentAnchor * parent = HTRequest_parent(text->request);
542: HyperDoc * last = HTAnchor_document(parent);
543: int depth = last ? last->depth+1 : 0;
1.45 frystyk 544: Finger * newfinger = Finger_new(mr, dest,
545: mr->flags & MR_SAVE ?
546: METHOD_GET : METHOD_HEAD);
1.34 eric 547: HTRequest * newreq = newfinger->request;
1.2 frystyk 548: HyperDoc_new(mr, dest, depth);
549: if (SHOW_MSG) {
550: char * uri = HTAnchor_address((HTAnchor *) dest);
1.13 eric 551: HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.11 frystyk 552: HT_FREE(uri);
1.2 frystyk 553: }
554: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
555: if (SHOW_MSG)
1.13 eric 556: HTTrace("Robot....... Image not tested!\n");
1.34 eric 557: Finger_delete(newfinger);
1.1 frystyk 558: }
559: }
560: }
561: }
562:
563: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 564: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 565: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
566: PUBLIC void HText_endAppend (HText * text) {}
567: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
568: PUBLIC void HText_beginAppend (HText * text) {}
569: PUBLIC void HText_appendParagraph (HText * text) {}
570:
1.48 frystyk 571: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
572: {
573: return (vfprintf(stderr, fmt, pArgs));
574: }
575:
1.1 frystyk 576: /* ------------------------------------------------------------------------- */
577: /* MAIN PROGRAM */
578: /* ------------------------------------------------------------------------- */
579:
/*
** main
** ----
** Entry point of the robot. In order, it:
**   - initializes the library with a robot profile, routes trace output
**     through RobotTrace and registers the HTML parser;
**   - creates the Robot object and scans the command line. Arguments with
**     a leading `-' are options; the first other argument is the start URL
**     and any following ones are collected as search keywords;
**   - sets up the optional rule file, output file, persistent cache and
**     log file, and registers terminate_handler as an after filter;
**   - issues the first request (a search if keywords were given) and
**     enters the event loop.
** The program normally ends in Cleanup(), which is called from
** terminate_handler when no requests are outstanding, and which exits.
*/
580: int main (int argc, char ** argv)
581: {
1.48 frystyk 582: int status = 0;
1.1 frystyk 583: int arg;
1.48 frystyk 584: BOOL cache = NO; /* Use persistent cache */
585: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 586: char * cache_root = NULL;
1.1 frystyk 587: HTChunk * keywords = NULL; /* From command line */
588: int keycnt = 0;
1.12 frystyk 589: Robot * mr = NULL;
1.43 frystyk 590: Finger * finger = NULL;
591: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 592:
593: /* Starts Mac GUSI socket library */
594: #ifdef GUSI
595: GUSISetup(GUSIwithSIOUXSockets);
596: GUSISetup(GUSIwithInternetSockets);
597: #endif
598:
599: #ifdef __MWERKS__ /* STR */
600: InitGraf((Ptr) &qd.thePort);
601: InitFonts();
602: InitWindows();
603: InitMenus(); TEInit();
604: InitDialogs(nil);
605: InitCursor();
606: SIOUXSettings.asktosaveonclose = false;
607: argc=ccommand(&argv);
1.50 frystyk 608: #endif /* __MWERKS__ */
1.1 frystyk 609:
1.50 frystyk 610: #ifdef HT_MEMLOG
1.51 frystyk 611: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 612: #endif
1.46 eric 613:
1.27 frystyk 614: /* Initiate W3C Reference Library with a robot profile */
615: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 616: HTTrace_setCallback(RobotTrace);
1.27 frystyk 617:
618: /* Add the default HTML parser to the set of converters */
619: {
620: HTList * converters = HTFormat_conversion();
621: HTMLInit(converters);
622: }
1.1 frystyk 623:
1.12 frystyk 624: /* Build a new robot object */
625: mr = Robot_new();
626:
1.1 frystyk 627: /* Scan command Line for parameters */
628: for (arg=1; arg<argc; arg++) {
629: if (*argv[arg] == '-') {
630:
631: /* non-interactive */
1.17 frystyk 632: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 633: HTAlert_setInteractive(NO);
634:
1.55 frystyk 635: /* log file */
1.1 frystyk 636: } else if (!strcmp(argv[arg], "-l")) {
637: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
638: argv[++arg] : DEFAULT_LOG_FILE;
639:
1.55 frystyk 640: /* hit file */
641: } else if (!strcmp(argv[arg], "-hit")) {
642: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
643: argv[++arg] : DEFAULT_HIT_FILE;
644:
645: /* rule file */
1.1 frystyk 646: } else if (!strcmp(argv[arg], "-r")) {
647: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
648: argv[++arg] : DEFAULT_RULE_FILE;
649:
650: /* output filename */
651: } else if (!strcmp(argv[arg], "-o")) {
652: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
653: argv[++arg] : DEFAULT_OUTPUT_FILE;
654:
1.55 frystyk 655: /* URI prefix */
656: } else if (!strcmp(argv[arg], "-prefix")) {
657: char * prefix = NULL;
658: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
659: argv[++arg] : DEFAULT_PREFIX;
660: if (*prefix) {
661: StrAllocCopy(mr->prefix, prefix);
662: StrAllocCat(mr->prefix, "*");
663: }
664:
1.1 frystyk 665: /* timeout -- Change the default request timeout */
666: } else if (!strcmp(argv[arg], "-timeout")) {
667: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
668: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 669: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 670:
1.54 frystyk 671: /* Force no pipelined requests */
672: } else if (!strcmp(argv[arg], "-nopipe")) {
673: HTTP_setConnectionMode(HTTP_NO_PIPELINING);
674:
1.48 frystyk 675: /* Start the persistent cache */
676: } else if (!strcmp(argv[arg], "-cache")) {
677: cache = YES;
678:
1.54 frystyk 679: /* Determine the cache root */
680: } else if (!strcmp(argv[arg], "-cacheroot")) {
681: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
682: argv[++arg] : NULL;
1.51 frystyk 683:
1.52 frystyk 684: /* Stream write flush delay in ms */
685: } else if (!strcmp(argv[arg], "-delay")) {
686: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
687: atoi(argv[++arg]) : DEFAULT_DELAY;
688: HTHost_setDefaultWriteDelay(delay);
689:
1.48 frystyk 690: /* Persistent cache flush */
691: } else if (!strcmp(argv[arg], "-flush")) {
692: flush = YES;
693:
694: /* Do a cache validation */
695: } else if (!strcmp(argv[arg], "-validate")) {
696: mr->flags |= MR_VALIDATE;
697:
698: /* Do an end-to-end cache-validation */
699: } else if (!strcmp(argv[arg], "-endvalidate")) {
700: mr->flags |= MR_END_VALIDATE;
701:
1.7 frystyk 702: /* preemptive or non-preemptive access */
1.1 frystyk 703: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 704: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 705:
706: /* test inlined images */
707: } else if (!strcmp(argv[arg], "-img")) {
708: mr->flags |= MR_IMG;
1.45 frystyk 709:
710: /* load inlined images */
711: } else if (!strcmp(argv[arg], "-saveimg")) {
712: mr->flags |= (MR_IMG | MR_SAVE);
1.2 frystyk 713:
714: /* load anchors */
715: } else if (!strcmp(argv[arg], "-link")) {
716: mr->flags |= MR_LINK;
1.7 frystyk 717: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
718: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 719:
1.12 frystyk 720: /* Output start and end time */
721: } else if (!strcmp(argv[arg], "-ss")) {
722: time_t local = time(NULL);
1.13 eric 723: HTTrace("Robot started on %s\n",
1.12 frystyk 724: HTDateTimeStr(&local, YES));
725: mr->flags |= MR_TIME;
726:
1.1 frystyk 727: /* print version and exit */
728: } else if (!strcmp(argv[arg], "-version")) {
729: VersionInfo();
730: Cleanup(mr, 0); /* Cleanup() exits the program */
1.46 eric 731:
732: /* run in quiet mode */
733: } else if (!strcmp(argv[arg], "-q")) {
734: mr->flags |= MR_QUIET;
1.1 frystyk 735:
736: #ifdef WWWTRACE
737: /* trace flags */
738: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 739: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 740: #endif
741:
742: } else {
1.13 eric 743: if (SHOW_MSG) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 744: }
1.17 frystyk 745: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 746: if (!keycnt) {
747: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 ! frystyk 748: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 749: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 750: keycnt = 1;
1.11 frystyk 751: HT_FREE(ref);
1.1 frystyk 752: } else { /* Check for successive keyword arguments */
753: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
754: if (keycnt++ <= 1) /* First keyword: keycnt is still 1 from the URL */
1.5 frystyk 755: keywords = HTChunk_new(128);
1.1 frystyk 756: else
1.5 frystyk 757: HTChunk_putc(keywords, ' ');
758: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 759: HT_FREE(escaped);
1.1 frystyk 760: }
761: }
762: }
763:
764: #ifdef CATCH_SIG
765: SetSignal();
766: #endif
767:
768: if (!keycnt) {
1.13 eric 769: if (SHOW_MSG) HTTrace("Please specify URL to check.\n");
1.1 frystyk 770: Cleanup(mr, -1);
771: }
772:
1.23 manoli 773: /* Testing that HTTrace is working */
1.47 frystyk 774: if (SHOW_MSG) HTTrace ("Welcome to the W3C mini Robot\n");
1.23 manoli 775:
1.1 frystyk 776: /* Rule file specified? */
777: if (mr->rules) {
778: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 779: if (!HTLoadRules(rules))
1.13 eric 780: if (SHOW_MSG) HTTrace("Can't access rules\n");
1.11 frystyk 781: HT_FREE(rules);
1.1 frystyk 782: }
783:
784: /* Output file specified? */
785: if (mr->outputfile) {
786: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.13 eric 787: if (SHOW_MSG) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 788: mr->output = OUTPUT;
789: }
790: }
791:
1.48 frystyk 792: /* Should we use persistent cache? */
793: if (cache) {
1.54 frystyk 794: HTCacheInit(cache_root, 20);
1.49 frystyk 795: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
796: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
797: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 798:
799: /* Should we start by flushing? */
800: if (flush) HTCache_flushAll();
801: }
802:
1.1 frystyk 803: /* Log file specified? */
1.55 frystyk 804: if (mr->logfile) {
805: mr->log = HTLog_open(mr->logfile, YES, YES);
806: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
807: }
1.1 frystyk 808:
1.27 frystyk 809: /* Register our own terminate filter */
1.32 frystyk 810: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 811:
812: /* Setting event timeout */
813: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 814:
/* Start of the run; calculate_statistics measures the elapsed time from here */
1.56 ! frystyk 815: mr->time = HTGetTimeInMillis();
1.37 frystyk 816:
1.34 eric 817: /* Start the request */
818: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 819:
820: /*
821: ** Make sure that the first request is flushed immediately and not
822: ** buffered in the output buffer
823: */
824: HTRequest_setFlush(finger->request, YES);
825:
826: /*
1.48 frystyk 827: ** Check whether we should do some kind of cache validation on
828: ** the load
** (Finger_new has already applied the same flags to the request)
829: */
830: if (mr->flags & MR_VALIDATE)
831: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
832: if (mr->flags & MR_END_VALIDATE)
833: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
834:
835: /*
1.43 frystyk 836: ** Now do the load
837: */
1.34 eric 838: if (mr->flags & MR_PREEMPTIVE)
839: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 840:
841: if (keywords) /* Search */
1.34 eric 842: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 843: else
1.34 eric 844: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 845:
1.5 frystyk 846: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 847: if (status != YES) {
1.13 eric 848: if (SHOW_MSG) HTTrace("Can't access resource\n");
1.1 frystyk 849: Cleanup(mr, -1);
850: }
851:
852: /* Go into the event loop... */
1.34 eric 853: HTEventList_loop(finger->request);
1.1 frystyk 854:
855: /* Only gets here if event loop fails */
856: Cleanup(mr, 0);
857: return 0;
858: }
Webmaster