Annotation of libwww/Robot/src/HTRobot.c, revision 1.57
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
/* Fall back to a placeholder when the build does not supply a version */
#if !defined(W3C_VERSION)
#define W3C_VERSION             "Unspecified"
#endif

/* Application identity handed to the library profile and VersionInfo() */
#define APP_NAME                "W3CRobot"
#define APP_VERSION             W3C_VERSION

/* File names used when an option is given without an explicit argument */
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"
#define DEFAULT_LOG_FILE        "robot.log"
#define DEFAULT_HIT_FILE        "robot.hit"
#define DEFAULT_REFERER_FILE    "robot.ref"
#define DEFAULT_MEMLOG          "robot.mem"

/* Other option defaults */
#define DEFAULT_PREFIX          ""
#define DEFAULT_DEPTH           0
#define DEFAULT_DELAY           50          /* Write delay in ms */
#define DEFAULT_TIMEOUT         10000       /* timeout in millis */

/* Memory logging is compiled out; change to "#if 1" to enable it */
#if 0
#define HT_MEMLOG               /* May be expensive in performance! */
#endif

/*
**  Messages are shown unless the robot runs in quiet mode. The macro expects
**  a "Robot * mr" to be in scope where it is used.
**  Earlier alternative: (WWWTRACE || HTAlert_interactive())
*/
#define SHOW_MSG                (!(mr->flags & MR_QUIET))

/* Install the SIGPIPE handling in SetSignal() on SVR4 systems only */
#ifdef __svr4__
#define CATCH_SIG
#endif
54:
/*
**  Option bits kept in Robot.flags. Every value occupies its own bit so the
**  flags can be combined with bitwise OR. The option named next to each flag
**  is the command line switch that sets it in main().
*/
typedef enum _MRFlags {
    MR_IMG          = 1 << 0,       /* -img: test inlined images */
    MR_LINK         = 1 << 1,       /* -link: load anchors */
    MR_PREEMPTIVE   = 1 << 2,       /* -single: preemptive access */
    MR_TIME         = 1 << 3,       /* -ss: output start and end time */
    MR_SAVE         = 1 << 4,       /* -saveimg: GET images instead of HEAD */
    MR_QUIET        = 1 << 5,       /* -q: run in quiet mode */
    MR_VALIDATE     = 1 << 6,       /* -validate: cache validation */
    MR_END_VALIDATE = 1 << 7        /* -endvalidate: end-to-end validation */
} MRFlags;
65:
66: typedef struct _Robot {
1.2 frystyk 67: int depth; /* How deep is our tree */
1.30 frystyk 68: int cnt; /* Count of requests */
1.2 frystyk 69: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 70: HTList * htext; /* List of our HText Objects */
1.34 eric 71: HTList * fingers;
1.40 frystyk 72: int timer;
1.1 frystyk 73: char * cwd; /* Current dir URL */
74: char * rules;
1.55 frystyk 75: char * prefix;
1.1 frystyk 76: char * logfile;
1.55 frystyk 77: HTLog * log;
1.57 ! frystyk 78: char * reffile;
! 79: HTLog * ref;
1.1 frystyk 80: char * outputfile;
81: FILE * output;
1.55 frystyk 82: char * hitfile;
1.1 frystyk 83: MRFlags flags;
1.55 frystyk 84:
85: long total_bytes; /* Total number of bytes processed */
86: long total_docs; /* Total number of documents processed */
1.56 frystyk 87: ms_t time; /* Time of run */
1.1 frystyk 88: } Robot;
1.34 eric 89:
90: typedef struct _Finger {
91: Robot * robot;
92: HTRequest * request;
93: HTParentAnchor * dest;
94: } Finger;
95:
/* Load status recorded in a HyperDoc object */
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   = 0,
    L_ERROR     = 1
} LoadState;
102:
103: /*
104: ** The HyperDoc object is bound to the anchor and contains information about
105: ** where we are in the search for recursive searches
106: */
107: typedef struct _HyperDoc {
108: HTParentAnchor * anchor;
109: LoadState state;
110: int depth;
1.55 frystyk 111: int hits;
1.1 frystyk 112: } HyperDoc;
113:
114: /*
115: ** This is the HText object that is created every time we start parsing a
116: ** HTML object
117: */
1.4 frystyk 118: struct _HText {
1.1 frystyk 119: HTRequest * request;
1.4 frystyk 120: };
1.1 frystyk 121:
122: PUBLIC HText * HTMainText = NULL;
123: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
124: PUBLIC HTStyleSheet * styleSheet = NULL;
125:
1.55 frystyk 126: PRIVATE HTComparer HitSort;
127:
1.1 frystyk 128: /* ------------------------------------------------------------------------- */
129:
1.13 eric 130: /* Standard (non-error) Output
131: ** ---------------------------
132: */
133: PUBLIC int OutputData(const char * fmt, ...)
134: {
135: int ret;
136: va_list pArgs;
137: va_start(pArgs, fmt);
138: ret = vfprintf(stdout, fmt, pArgs);
139: va_end(pArgs);
140: return ret;
141: }
142:
143: /* ------------------------------------------------------------------------- */
144:
1.2 frystyk 145: /* Create a "HyperDoc" object
146: ** --------------------------
147: ** A HyperDoc object contains information about whether we have already
148: ** started checking the anchor and the depth in our search
149: */
150: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
151: {
152: HyperDoc * hd;
1.14 frystyk 153: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
154: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 155: hd->state = L_INVALID;
156: hd->depth = depth;
1.55 frystyk 157: hd->hits = 1;
1.2 frystyk 158:
159: /* Bind the HyperDoc object together with the Anchor Object */
160: hd->anchor = anchor;
161: HTAnchor_setDocument(anchor, (void *) hd);
162:
163: /* Add this HyperDoc object to our list */
164: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
165: HTList_addObject(mr->hyperdoc, (void *) hd);
166: return hd;
167: }
168:
169: /* Delete a "HyperDoc" object
170: ** --------------------------
171: */
172: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
173: {
174: if (hd) {
1.11 frystyk 175: HT_FREE (hd);
1.2 frystyk 176: return YES;
177: }
178: return NO;
179: }
180:
1.55 frystyk 181: /*
182: ** Sort the anchor array and log reference count
183: */
184: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
185: {
186: if (mr && array) {
187: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
188: if (log) {
189: void ** data = NULL;
190: HTParentAnchor * anchor = NULL;
191: HTArray_sort(array, HitSort);
192: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
193: while (anchor) {
194: char * str = NULL;
195: char * uri = HTAnchor_address((HTAnchor *) anchor);
196: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
197: if (uri && hd) {
198: if ((str = (char *) HT_MALLOC(strlen(uri) + 50)) == NULL)
199: HT_OUTOFMEM("calculate_hits");
200: sprintf(str, "%8d %s\n", hd->hits, uri);
201: HTLog_addLine(log, str);
202: HT_FREE(str);
203: }
204: HT_FREE(uri);
205: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
206: }
207: }
208: HTLog_close(log);
209: return YES;
210: }
211: return NO;
212: }
213:
214: PRIVATE int HitSort (const void * a, const void * b)
215: {
216: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
217: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
218: if (aa && bb) return (bb->hits - aa->hits);
219: return bb - aa;
220: }
221:
222: /* Statistics
223: ** ----------
224: ** Calculates a bunch of statistics for the anchors traversed
225: */
226: PRIVATE BOOL calculate_statistics (Robot * mr)
227: {
228: if (!mr) return NO;
229:
230: /* Calculate efficiency */
231: {
1.56 frystyk 232: ms_t t = HTGetTimeInMillis() - mr->time;
233: if (t > 0) {
234: double loadfactor = 1000 * (mr->total_bytes / t);
235: double secs = t / 1000.0;
1.55 frystyk 236: char bytes[50];
237: HTNumToStr(mr->total_bytes, bytes, 50);
1.56 frystyk 238: HTTrace("Downloaded %s bytes in %ld document bodies in %.2f seconds (%2.1f bytes/sec)\n",
239: bytes, mr->total_docs, secs, loadfactor);
1.55 frystyk 240: }
241: }
242:
243: /* Create an array of existing anchors */
244: if (mr->total_docs > 1) {
245: HTArray * array = HTAnchor_getArray(mr->total_docs);
246: if (array) {
247:
248: /* Sort after hit counts */
249: if (mr->hitfile) calculate_hits(mr, array);
250:
251:
252: /* Add as may other stats here as you like */
253:
254: HTArray_delete(array);
255: }
256: }
257: return YES;
258: }
259:
1.1 frystyk 260: /* Create a Command Line Object
261: ** ----------------------------
262: */
263: PRIVATE Robot * Robot_new (void)
264: {
265: Robot * me;
1.41 frystyk 266: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 267: HT_OUTOFMEM("Robot_new");
1.2 frystyk 268: me->hyperdoc = HTList_new();
1.4 frystyk 269: me->htext = HTList_new();
1.40 frystyk 270: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 271: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 272: me->output = OUTPUT;
1.35 eric 273: me->cnt = 0;
1.34 eric 274: me->fingers = HTList_new();
1.1 frystyk 275: return me;
276: }
277:
278: /* Delete a Command Line Object
279: ** ----------------------------
280: */
281: PRIVATE BOOL Robot_delete (Robot * me)
282: {
283: if (me) {
1.34 eric 284: HTList_delete(me->fingers);
1.55 frystyk 285:
286: /* Calculate statistics */
287: calculate_statistics(me);
288:
289: if (me->hyperdoc) {
1.2 frystyk 290: HTList * cur = me->hyperdoc;
291: HyperDoc * pres;
292: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
293: HyperDoc_delete(pres);
294: HTList_delete(me->hyperdoc);
295: }
1.4 frystyk 296: if (me->htext) {
297: HTList * cur = me->htext;
298: HText * pres;
299: while ((pres = (HText *) HTList_nextObject(cur)))
300: HText_free(pres);
301: HTList_delete(me->htext);
302: }
1.55 frystyk 303: if (me->log) HTLog_close(me->log);
1.57 ! frystyk 304: if (me->ref) HTLog_close(me->ref);
1.1 frystyk 305: if (me->output && me->output != STDOUT) fclose(me->output);
1.12 frystyk 306: if (me->flags & MR_TIME) {
307: time_t local = time(NULL);
1.13 eric 308: HTTrace("Robot terminated %s\n",HTDateTimeStr(&local,YES));
1.12 frystyk 309: }
1.55 frystyk 310:
1.11 frystyk 311: HT_FREE(me->cwd);
1.55 frystyk 312: HT_FREE(me->prefix);
1.11 frystyk 313: HT_FREE(me);
1.1 frystyk 314: return YES;
315: }
316: return NO;
317: }
318:
1.2 frystyk 319: /*
1.34 eric 320: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 321: */
1.34 eric 322: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 323: {
1.34 eric 324: Finger * me;
325: HTRequest * request = HTRequest_new();
326: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
327: HT_OUTOFMEM("Finger_new");
328: me->robot = robot;
329: me->request = request;
330: me->dest = dest;
331: HTList_addObject(robot->fingers, (void *)me);
332:
1.48 frystyk 333: /* Set the context for this request */
1.34 eric 334: HTRequest_setContext (request, me);
1.48 frystyk 335:
336: /* Check the various flags to customize the request */
337: if (robot->flags & MR_PREEMPTIVE)
338: HTRequest_setPreemptive(request, YES);
339: if (robot->flags & MR_VALIDATE)
340: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
341: if (robot->flags & MR_END_VALIDATE)
342: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
343:
344: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 345: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 346:
347: /* Set the method for this request */
1.34 eric 348: HTRequest_setMethod(request, method);
349: robot->cnt++;
350: return me;
1.2 frystyk 351: }
352:
1.34 eric 353: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 354: {
1.34 eric 355: HTList_removeObject(me->robot->fingers, (void *)me);
356: me->robot->cnt--;
1.37 frystyk 357:
358: /*
359: ** If we are down at one request then flush the output buffer
360: */
361: if (me->request) {
362: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 363: HTRequest_delete(me->request);
1.37 frystyk 364: }
365:
366: /*
367: ** Delete the request and free myself
368: */
1.34 eric 369: HT_FREE(me);
370: return YES;
1.2 frystyk 371: }
372:
373: /*
374: ** Cleanup and make sure we close all connections including the persistent
375: ** ones
376: */
1.1 frystyk 377: PRIVATE void Cleanup (Robot * me, int status)
378: {
379: Robot_delete(me);
1.29 eric 380: HTProfile_delete();
1.50 frystyk 381: #ifdef HT_MEMLOG
1.39 eric 382: HTMemLog_close();
1.47 frystyk 383: #endif
384:
1.1 frystyk 385: #ifdef VMS
386: exit(status ? status : 1);
387: #else
388: exit(status ? status : 0);
389: #endif
390: }
391:
#ifdef CATCH_SIG
#include <signal.h>
/*      SetSignal
**      This function sets up signal handlers. This might not be necessary to
**      call if the application has its own handlers (lossage on SVR4)
*/
PRIVATE void SetSignal (void)
{
    /*
    **  On some systems (SYSV) SIGPIPE must be ignored when connecting to a
    **  remote host where one would normally get `connection refused' back
    */
    if (signal(SIGPIPE, SIG_IGN) != SIG_ERR) {
        if (PROT_TRACE)
            HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    } else {
        if (PROT_TRACE)
            HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
416:
417: PRIVATE void VersionInfo (void)
418: {
1.13 eric 419: OutputData("\n\nW3C Reference Software\n\n");
420: OutputData("\tW3C Mini Robot (%s) version %s.\n",
1.1 frystyk 421: APP_NAME, APP_VERSION);
1.13 eric 422: OutputData("\tW3C Reference Library version %s.\n\n",HTLib_version());
423: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 424: }
425:
426: /* terminate_handler
427: ** -----------------
1.2 frystyk 428: ** This function is registered to handle the result of the request.
429: ** If no more requests are pending then terminate program
1.1 frystyk 430: */
1.32 frystyk 431: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
432: void * param, int status)
1.1 frystyk 433: {
1.34 eric 434: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 435: Robot * mr = finger->robot;
1.34 eric 436: if (SHOW_MSG) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 437:
438: /* Count the amount of body data that we have read */
439: if (status == HT_LOADED && HTRequest_method(request) == METHOD_GET) {
1.56 frystyk 440: int length = HTAnchor_length(HTRequest_anchor(request));
441: if (length > 0) mr->total_bytes += length;
1.55 frystyk 442: }
443:
444: /* Count the number of documents that we have processed */
445: mr->total_docs++;
446:
447: /* Delete this thread */
1.34 eric 448: Finger_delete(finger);
1.55 frystyk 449:
450: /* Should we stop? */
1.46 eric 451: if (mr->cnt <= 0) {
1.34 eric 452: if (SHOW_MSG) HTTrace(" Everything is finished...\n");
1.46 eric 453: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 454: }
1.46 eric 455: if (SHOW_MSG) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 456: return HT_OK;
457: }
458:
459: /* ------------------------------------------------------------------------- */
460: /* HTEXT INTERFACE */
461: /* ------------------------------------------------------------------------- */
462:
463: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
464: HTStream * stream)
465: {
466: HText * me;
1.34 eric 467: Finger * finger = (Finger *) HTRequest_context(request);
468: Robot * mr = finger->robot;
1.14 frystyk 469: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
470: HT_OUTOFMEM("HText_new2");
1.4 frystyk 471:
472: /* Bind the HText object together with the Request Object */
1.1 frystyk 473: me->request = request;
1.4 frystyk 474:
475: /* Add this HyperDoc object to our list */
476: if (!mr->htext) mr->htext = HTList_new();
477: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 478: return me;
479: }
480:
1.4 frystyk 481: PUBLIC void HText_free (HText * me) {
1.11 frystyk 482: if (me) HT_FREE (me);
1.4 frystyk 483: }
484:
1.1 frystyk 485: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
486: {
487: if (text && anchor) {
1.34 eric 488: Finger * finger = (Finger *) HTRequest_context(text->request);
489: Robot * mr = finger->robot;
1.1 frystyk 490: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
491: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 492: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 493: HyperDoc * hd = HTAnchor_document(dest_parent);
1.55 frystyk 494: BOOL prefix_match = YES;
1.1 frystyk 495:
1.55 frystyk 496: if (!uri) return;
497: if (SHOW_MSG) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
498:
499: /* Check for prefix match */
500: if (mr->prefix) prefix_match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1.7 frystyk 501:
1.2 frystyk 502: /* Test whether we already have a hyperdoc for this document */
1.55 frystyk 503: if (hd) {
504: if (SHOW_MSG) HTTrace("Already checked\n");
505: hd->hits++;
506: } else if (mr->flags & MR_LINK && prefix_match && dest_parent) {
1.1 frystyk 507: HTParentAnchor * parent = HTRequest_parent(text->request);
508: HyperDoc * last = HTAnchor_document(parent);
509: int depth = last ? last->depth+1 : 0;
1.34 eric 510: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
511: HTRequest * newreq = newfinger->request;
1.2 frystyk 512: HyperDoc_new(mr, dest_parent, depth);
1.7 frystyk 513: HTRequest_setParent(newreq, HTRequest_anchor(text->request));
514: if (depth >= mr->depth) {
515: if (SHOW_MSG)
1.13 eric 516: HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 517: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 518: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 519: } else {
1.13 eric 520: if (SHOW_MSG) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 521: }
522: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.13 eric 523: if (SHOW_MSG) HTTrace("not tested!\n");
1.34 eric 524: Finger_delete(newfinger);
1.2 frystyk 525: }
1.7 frystyk 526: } else {
1.55 frystyk 527: if (SHOW_MSG) HTTrace("does not fulfill constraints\n");
1.2 frystyk 528: }
1.11 frystyk 529: HT_FREE(uri);
1.2 frystyk 530: }
531: }
532:
533: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 534: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 535: {
536: if (text && anchor) {
1.34 eric 537: Finger * finger = (Finger *) HTRequest_context(text->request);
538: Robot * mr = finger->robot;
1.2 frystyk 539: HTParentAnchor * dest = (HTParentAnchor *)
540: HTAnchor_followMainLink((HTAnchor *) anchor);
541: HyperDoc * hd = HTAnchor_document(dest);
1.1 frystyk 542:
1.2 frystyk 543: /* Test whether we already have a hyperdoc for this document */
544: if (mr->flags & MR_IMG && dest && !hd) {
545: HTParentAnchor * parent = HTRequest_parent(text->request);
546: HyperDoc * last = HTAnchor_document(parent);
547: int depth = last ? last->depth+1 : 0;
1.45 frystyk 548: Finger * newfinger = Finger_new(mr, dest,
549: mr->flags & MR_SAVE ?
550: METHOD_GET : METHOD_HEAD);
1.34 eric 551: HTRequest * newreq = newfinger->request;
1.2 frystyk 552: HyperDoc_new(mr, dest, depth);
553: if (SHOW_MSG) {
554: char * uri = HTAnchor_address((HTAnchor *) dest);
1.13 eric 555: HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.11 frystyk 556: HT_FREE(uri);
1.2 frystyk 557: }
558: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
559: if (SHOW_MSG)
1.13 eric 560: HTTrace("Robot....... Image not tested!\n");
1.34 eric 561: Finger_delete(newfinger);
1.1 frystyk 562: }
563: }
564: }
565: }
566:
567: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 568: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 569: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
570: PUBLIC void HText_endAppend (HText * text) {}
571: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
572: PUBLIC void HText_beginAppend (HText * text) {}
573: PUBLIC void HText_appendParagraph (HText * text) {}
574:
1.48 frystyk 575: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
576: {
577: return (vfprintf(stderr, fmt, pArgs));
578: }
579:
1.1 frystyk 580: /* ------------------------------------------------------------------------- */
581: /* MAIN PROGRAM */
582: /* ------------------------------------------------------------------------- */
583:
584: int main (int argc, char ** argv)
585: {
1.48 frystyk 586: int status = 0;
1.1 frystyk 587: int arg;
1.48 frystyk 588: BOOL cache = NO; /* Use persistent cache */
589: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 590: char * cache_root = NULL;
1.1 frystyk 591: HTChunk * keywords = NULL; /* From command line */
592: int keycnt = 0;
1.12 frystyk 593: Robot * mr = NULL;
1.43 frystyk 594: Finger * finger = NULL;
595: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 596:
597: /* Starts Mac GUSI socket library */
598: #ifdef GUSI
599: GUSISetup(GUSIwithSIOUXSockets);
600: GUSISetup(GUSIwithInternetSockets);
601: #endif
602:
603: #ifdef __MWERKS__ /* STR */
604: InitGraf((Ptr) &qd.thePort);
605: InitFonts();
606: InitWindows();
607: InitMenus(); TEInit();
608: InitDialogs(nil);
609: InitCursor();
610: SIOUXSettings.asktosaveonclose = false;
611: argc=ccommand(&argv);
1.50 frystyk 612: #endif /* __MWERKS__ */
1.1 frystyk 613:
1.50 frystyk 614: #ifdef HT_MEMLOG
1.51 frystyk 615: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 616: #endif
1.46 eric 617:
1.27 frystyk 618: /* Initiate W3C Reference Library with a robot profile */
619: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 620: HTTrace_setCallback(RobotTrace);
1.27 frystyk 621:
622: /* Add the default HTML parser to the set of converters */
623: {
624: HTList * converters = HTFormat_conversion();
625: HTMLInit(converters);
626: }
1.1 frystyk 627:
1.12 frystyk 628: /* Build a new robot object */
629: mr = Robot_new();
630:
1.1 frystyk 631: /* Scan command Line for parameters */
632: for (arg=1; arg<argc; arg++) {
633: if (*argv[arg] == '-') {
634:
635: /* non-interactive */
1.17 frystyk 636: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 637: HTAlert_setInteractive(NO);
638:
1.55 frystyk 639: /* log file */
1.1 frystyk 640: } else if (!strcmp(argv[arg], "-l")) {
641: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
642: argv[++arg] : DEFAULT_LOG_FILE;
643:
1.55 frystyk 644: /* hit file */
645: } else if (!strcmp(argv[arg], "-hit")) {
646: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
647: argv[++arg] : DEFAULT_HIT_FILE;
648:
1.57 ! frystyk 649: /* referer file */
! 650: } else if (!strcmp(argv[arg], "-referer")) {
! 651: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 652: argv[++arg] : DEFAULT_REFERER_FILE;
! 653:
1.55 frystyk 654: /* rule file */
1.1 frystyk 655: } else if (!strcmp(argv[arg], "-r")) {
656: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
657: argv[++arg] : DEFAULT_RULE_FILE;
658:
659: /* output filename */
660: } else if (!strcmp(argv[arg], "-o")) {
661: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
662: argv[++arg] : DEFAULT_OUTPUT_FILE;
663:
1.55 frystyk 664: /* URI prefix */
665: } else if (!strcmp(argv[arg], "-prefix")) {
666: char * prefix = NULL;
667: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
668: argv[++arg] : DEFAULT_PREFIX;
669: if (*prefix) {
670: StrAllocCopy(mr->prefix, prefix);
671: StrAllocCat(mr->prefix, "*");
672: }
673:
1.1 frystyk 674: /* timeout -- Change the default request timeout */
675: } else if (!strcmp(argv[arg], "-timeout")) {
676: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
677: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 678: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 679:
1.54 frystyk 680: /* Force no pipelined requests */
681: } else if (!strcmp(argv[arg], "-nopipe")) {
682: HTTP_setConnectionMode(HTTP_NO_PIPELINING);
683:
1.48 frystyk 684: /* Start the persistent cache */
685: } else if (!strcmp(argv[arg], "-cache")) {
686: cache = YES;
687:
1.54 frystyk 688: /* Determine the cache root */
689: } else if (!strcmp(argv[arg], "-cacheroot")) {
690: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
691: argv[++arg] : NULL;
1.51 frystyk 692:
1.52 frystyk 693: /* Stream write flush delay in ms */
694: } else if (!strcmp(argv[arg], "-delay")) {
695: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
696: atoi(argv[++arg]) : DEFAULT_DELAY;
697: HTHost_setDefaultWriteDelay(delay);
698:
1.48 frystyk 699: /* Persistent cache flush */
700: } else if (!strcmp(argv[arg], "-flush")) {
701: flush = YES;
702:
703: /* Do a cache validation */
704: } else if (!strcmp(argv[arg], "-validate")) {
705: mr->flags |= MR_VALIDATE;
706:
707: /* Do an end-to-end cache-validation */
708: } else if (!strcmp(argv[arg], "-endvalidate")) {
709: mr->flags |= MR_END_VALIDATE;
710:
1.7 frystyk 711: /* preemptive or non-preemptive access */
1.1 frystyk 712: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 713: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 714:
715: /* test inlined images */
716: } else if (!strcmp(argv[arg], "-img")) {
717: mr->flags |= MR_IMG;
1.45 frystyk 718:
719: /* load inlined images */
720: } else if (!strcmp(argv[arg], "-saveimg")) {
721: mr->flags |= (MR_IMG | MR_SAVE);
1.2 frystyk 722:
723: /* load anchors */
724: } else if (!strcmp(argv[arg], "-link")) {
725: mr->flags |= MR_LINK;
1.7 frystyk 726: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
727: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 728:
1.12 frystyk 729: /* Output start and end time */
730: } else if (!strcmp(argv[arg], "-ss")) {
731: time_t local = time(NULL);
1.13 eric 732: HTTrace("Robot started on %s\n",
1.12 frystyk 733: HTDateTimeStr(&local, YES));
734: mr->flags |= MR_TIME;
735:
1.1 frystyk 736: /* print version and exit */
737: } else if (!strcmp(argv[arg], "-version")) {
738: VersionInfo();
739: Cleanup(mr, 0);
1.46 eric 740:
741: /* run in quiet mode */
742: } else if (!strcmp(argv[arg], "-q")) {
743: mr->flags |= MR_QUIET;
1.1 frystyk 744:
745: #ifdef WWWTRACE
746: /* trace flags */
747: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 748: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 749: #endif
750:
751: } else {
1.13 eric 752: if (SHOW_MSG) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 753: }
1.17 frystyk 754: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 755: if (!keycnt) {
756: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 757: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 758: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 759: keycnt = 1;
1.11 frystyk 760: HT_FREE(ref);
1.1 frystyk 761: } else { /* Check for successive keyword arguments */
762: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
763: if (keycnt++ <= 1)
1.5 frystyk 764: keywords = HTChunk_new(128);
1.1 frystyk 765: else
1.5 frystyk 766: HTChunk_putc(keywords, ' ');
767: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 768: HT_FREE(escaped);
1.1 frystyk 769: }
770: }
771: }
772:
773: #ifdef CATCH_SIG
774: SetSignal();
775: #endif
776:
777: if (!keycnt) {
1.13 eric 778: if (SHOW_MSG) HTTrace("Please specify URL to check.\n");
1.1 frystyk 779: Cleanup(mr, -1);
780: }
781:
1.23 manoli 782: /* Testing that HTTrace is working */
1.47 frystyk 783: if (SHOW_MSG) HTTrace ("Welcome to the W3C mini Robot\n");
1.23 manoli 784:
1.1 frystyk 785: /* Rule file specified? */
786: if (mr->rules) {
787: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 788: if (!HTLoadRules(rules))
1.13 eric 789: if (SHOW_MSG) HTTrace("Can't access rules\n");
1.11 frystyk 790: HT_FREE(rules);
1.1 frystyk 791: }
792:
793: /* Output file specified? */
794: if (mr->outputfile) {
795: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.13 eric 796: if (SHOW_MSG) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 797: mr->output = OUTPUT;
798: }
799: }
800:
1.48 frystyk 801: /* Should we use persistent cache? */
802: if (cache) {
1.54 frystyk 803: HTCacheInit(cache_root, 20);
1.49 frystyk 804: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
805: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
806: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 807:
808: /* Should we start by flushing? */
809: if (flush) HTCache_flushAll();
810: }
811:
1.57 ! frystyk 812: /* CLF Log file specifed? */
1.55 frystyk 813: if (mr->logfile) {
814: mr->log = HTLog_open(mr->logfile, YES, YES);
815: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 ! frystyk 816: }
! 817:
! 818: /* Referer Log file specifed? */
! 819: if (mr->reffile) {
! 820: mr->ref = HTLog_open(mr->reffile, YES, YES);
! 821: if (mr->ref)
! 822: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 823: }
1.1 frystyk 824:
1.27 frystyk 825: /* Register our own someterminater filter */
1.32 frystyk 826: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 827:
828: /* Setting event timeout */
829: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 830:
1.56 frystyk 831: mr->time = HTGetTimeInMillis();
1.37 frystyk 832:
1.34 eric 833: /* Start the request */
834: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 835:
836: /*
837: ** Make sure that the first request is flushed immediately and not
838: ** buffered in the output buffer
839: */
840: HTRequest_setFlush(finger->request, YES);
841:
842: /*
1.48 frystyk 843: ** Check whether we should do some kind of cache validation on
844: ** the load
845: */
846: if (mr->flags & MR_VALIDATE)
847: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
848: if (mr->flags & MR_END_VALIDATE)
849: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
850:
851: /*
1.43 frystyk 852: ** Now do the load
853: */
1.34 eric 854: if (mr->flags & MR_PREEMPTIVE)
855: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 856:
857: if (keywords) /* Search */
1.34 eric 858: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 859: else
1.34 eric 860: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 861:
1.5 frystyk 862: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 863: if (status != YES) {
1.13 eric 864: if (SHOW_MSG) HTTrace("Can't access resource\n");
1.1 frystyk 865: Cleanup(mr, -1);
866: }
867:
868: /* Go into the event loop... */
1.34 eric 869: HTEventList_loop(finger->request);
1.1 frystyk 870:
871: /* Only gets here if event loop fails */
872: Cleanup(mr, 0);
873: return 0;
874: }
Webmaster