Annotation of libwww/Robot/src/HTRobot.c, revision 1.2
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
18:
19: #include "HTRobot.h" /* Implemented here */
20:
/* Version string; VR may already be defined before this point, otherwise
** fall back to a placeholder */
#ifndef VR
#define VR                  "unspecified"
#endif

/* Name and version handed to HTLibInit() and shown by VersionInfo() */
#define APP_NAME            "W3CRobot"
#define APP_VERSION         VR

/* Page loaded for the "-?" and "-help" command line options */
#define HELP    "http://www.w3.org/pub/WWW/Robot/User/CommandLine.html"

/* File names used when -o, -r or -l is given without an argument */
#define DEFAULT_OUTPUT_FILE "robot.out"
#define DEFAULT_RULE_FILE   "robot.conf"
#define DEFAULT_LOG_FILE    "robot.log"

/* True when diagnostics should be printed: tracing is on or we are
** running interactively */
#define SHOW_MSG            (WWWTRACE || HTAlert_interactive())

/* Default socket timeout, in seconds */
#define DEFAULT_TIMEOUT     10

/* Only SVR4 builds install the SIGPIPE handling in SetSignal() */
#if defined(__svr4__)
#define CATCH_SIG
#endif
42:
/*
**  Bit flags selecting what the robot does; they are OR'ed together in
**  Robot.flags by the command line parser.
*/
typedef enum _MRFlags {
    MR_IMG       = 1 << 0,      /* -img:    check inlined images */
    MR_LINK      = 1 << 1,      /* -link:   load anchors found in documents */
    MR_PREEMTIVE = 1 << 2       /* -single: new requests are made preemptive */
} MRFlags;
48:
49: typedef struct _Robot {
50: HTRequest * request;
51: HTParentAnchor * anchor;
1.2 ! frystyk 52: int depth; /* How deep is our tree */
! 53: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.1 frystyk 54: struct timeval * tv; /* Timeout on socket */
55: char * cwd; /* Current dir URL */
56: HTList * converters;
57: char * rules;
58: char * logfile;
59: char * outputfile;
60: FILE * output;
61: MRFlags flags;
62: } Robot;
63:
/*
**  Load state recorded in a HyperDoc object. A new HyperDoc starts out as
**  L_INVALID.
*/
typedef enum _LoadState {
    L_INVALID = -2,
    L_LOADING = -1,
    L_SUCCESS =  0,
    L_ERROR   =  1
} LoadState;
70:
71: /*
72: ** The HyperDoc object is bound to the anchor and contains information about
73: ** where we are in the search for recursive searches
74: */
75: typedef struct _HyperDoc {
76: HTParentAnchor * anchor;
77: LoadState state;
78: int depth;
79: } HyperDoc;
80:
81: /*
82: ** This is the HText object that is created every time we start parsing a
83: ** HTML object
84: */
85: typedef struct _HText {
86: HTRequest * request;
87: } HText;
88:
89: typedef struct _HTStyle HTStyle;
90: typedef struct _HTStyleSheet HTStyleSheet;
91:
92: PUBLIC HText * HTMainText = NULL;
93: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
94: PUBLIC HTStyleSheet * styleSheet = NULL;
95:
96: /* ------------------------------------------------------------------------- */
97:
1.2 ! frystyk 98: /* Create a "HyperDoc" object
! 99: ** --------------------------
! 100: ** A HyperDoc object contains information about whether we have already
! 101: ** started checking the anchor and the depth in our search
! 102: */
! 103: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
! 104: {
! 105: HyperDoc * hd;
! 106: if ((hd = (HyperDoc *) calloc(1, sizeof(HyperDoc))) == NULL)
! 107: outofmem(__FILE__, "HyperDoc_new");
! 108: hd->state = L_INVALID;
! 109: hd->depth = depth;
! 110:
! 111: /* Bind the HyperDoc object together with the Anchor Object */
! 112: hd->anchor = anchor;
! 113: HTAnchor_setDocument(anchor, (void *) hd);
! 114:
! 115: /* Add this HyperDoc object to our list */
! 116: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
! 117: HTList_addObject(mr->hyperdoc, (void *) hd);
! 118:
! 119: if (SHOW_MSG)
! 120: TTYPrint(TDEST, "HyperDoc.... %p bound to anchor %p with depth %d\n",
! 121: hd, anchor, depth);
! 122: return hd;
! 123: }
! 124:
! 125: /* Delete a "HyperDoc" object
! 126: ** --------------------------
! 127: */
! 128: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
! 129: {
! 130: if (hd) {
! 131: free (hd);
! 132: return YES;
! 133: }
! 134: return NO;
! 135: }
! 136:
1.1 frystyk 137: /* Create a Command Line Object
138: ** ----------------------------
139: */
140: PRIVATE Robot * Robot_new (void)
141: {
142: Robot * me;
143: if ((me = (Robot *) calloc(1, sizeof(Robot))) == NULL ||
144: (me->tv = (struct timeval*) calloc(1, sizeof(struct timeval))) == NULL)
145: outofmem(__FILE__, "Robot_new");
1.2 ! frystyk 146: me->hyperdoc = HTList_new();
1.1 frystyk 147: me->tv->tv_sec = DEFAULT_TIMEOUT;
148: me->cwd = HTFindRelatedName();
149: me->output = OUTPUT;
150:
151: /* Bind the Robot object together with the Request Object */
152: me->request = HTRequest_new();
153: HTRequest_setContext (me->request, me);
154: return me;
155: }
156:
157: /* Delete a Command Line Object
158: ** ----------------------------
159: */
160: PRIVATE BOOL Robot_delete (Robot * me)
161: {
162: if (me) {
1.2 ! frystyk 163: if (me->hyperdoc) {
! 164: HTList * cur = me->hyperdoc;
! 165: HyperDoc * pres;
! 166: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
! 167: HyperDoc_delete(pres);
! 168: HTList_delete(me->hyperdoc);
! 169: }
1.1 frystyk 170: if (me->logfile) HTLog_close();
171: if (me->output && me->output != STDOUT) fclose(me->output);
172: FREE(me->cwd);
173: free(me->tv);
174: free(me);
175: return YES;
176: }
177: return NO;
178: }
179:
1.2 ! frystyk 180: /*
! 181: ** This function creates a new request object and initializes it
! 182: */
! 183: PRIVATE HTRequest * Thread_new (Robot * mr, HTMethod method)
! 184: {
! 185: HTRequest * newreq = HTRequest_new();
! 186: HTRequest_setContext (newreq, mr);
! 187: if (mr->flags & MR_PREEMTIVE) HTRequest_setPreemtive(newreq, YES);
! 188: HTRequest_addRqHd(newreq, HT_HOST);
! 189: HTRequest_setMethod(newreq, method);
! 190: return newreq;
! 191: }
! 192:
! 193: PRIVATE BOOL Thread_delete (Robot * mr, HTRequest * request)
! 194: {
! 195: if (mr && request) {
! 196: HTRequest_delete(request);
! 197: return YES;
! 198: }
! 199: return NO;
! 200: }
! 201:
! 202: /*
! 203: ** Cleanup and make sure we close all connections including the persistent
! 204: ** ones
! 205: */
1.1 frystyk 206: PRIVATE void Cleanup (Robot * me, int status)
207: {
1.2 ! frystyk 208: HTNet_killAll();
1.1 frystyk 209: Robot_delete(me);
210: HTLibTerminate();
211: #ifdef VMS
212: exit(status ? status : 1);
213: #else
214: exit(status ? status : 0);
215: #endif
216: }
217:
#ifdef CATCH_SIG
#include <signal.h>
/*      SetSignal
**      This function sets up signal handlers. This might not be necessary
**      to call if the application has its own handlers (lossage on SVR4)
*/
PRIVATE void SetSignal (void)
{
    /* On some systems (SYSV) SIGPIPE has to be dealt with when attempting
    ** to connect to a remote host where you would normally just get
    ** `connection refused' back. We ask for the signal to be ignored.
    */
    BOOL failed = (signal(SIGPIPE, SIG_IGN) == SIG_ERR);

    if (failed) {
        if (PROT_TRACE) TTYPrint(TDEST, "HTSignal.... Can't catch SIGPIPE\n");
    } else {
        if (PROT_TRACE) TTYPrint(TDEST, "HTSignal.... Ignoring SIGPIPE\n");
    }
}
#endif /* CATCH_SIG */
237:
238: PRIVATE void VersionInfo (void)
239: {
240: TTYPrint(OUTPUT,"\n\nW3C Reference Software\n\n");
241: TTYPrint(OUTPUT,"\tW3C Mini Robot (%s) version %s.\n",
242: APP_NAME, APP_VERSION);
243: TTYPrint(OUTPUT,"\tW3C Reference Library version %s.\n\n",HTLib_version());
244: TTYPrint(OUTPUT,"Please send feedback to <libwww@w3.org>\n");
245: }
246:
247: /* terminate_handler
248: ** -----------------
1.2 ! frystyk 249: ** This function is registered to handle the result of the request.
! 250: ** If no more requests are pending then terminate program
1.1 frystyk 251: */
252: PRIVATE int terminate_handler (HTRequest * request, int status)
253: {
254: Robot * mr = (Robot *) HTRequest_context(request);
255: if (mr->logfile) HTLog_add(request, status);
1.2 ! frystyk 256: Thread_delete(mr, request);
! 257: if (HTNet_idle()) Cleanup(mr, 0);
1.1 frystyk 258: return HT_OK;
259: }
260:
261: /* timeout_handler
262: ** ---------------
263: ** This function is registered to handle timeout in select eventloop
264: */
265: PRIVATE int timeout_handler (HTRequest * request)
266: {
1.2 ! frystyk 267: Robot * mr = (Robot *) HTRequest_context(request);
! 268: if (SHOW_MSG) TTYPrint(TDEST, "Robot....... Request timeout...\n");
1.1 frystyk 269: HTRequest_kill(request);
1.2 ! frystyk 270: Thread_delete(mr, request);
1.1 frystyk 271: return 0;
272: }
273:
274: /* ------------------------------------------------------------------------- */
275: /* HTEXT INTERFACE */
276: /* ------------------------------------------------------------------------- */
277:
278: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
279: HTStream * stream)
280: {
281: HText * me;
282: if ((me = (HText *) calloc(1, sizeof(HText))) == NULL)
283: outofmem(__FILE__, "HText_new2");
284: me->request = request;
285: return me;
286: }
287:
288: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
289: {
290: if (text && anchor) {
1.2 ! frystyk 291: Robot * mr = (Robot *) HTRequest_context(text->request);
1.1 frystyk 292: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
293: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
294: HyperDoc * hd = HTAnchor_document(dest_parent);
295:
1.2 ! frystyk 296: /* Test whether we already have a hyperdoc for this document */
! 297: if (mr->flags & MR_LINK && dest_parent && !hd) {
1.1 frystyk 298: HTParentAnchor * parent = HTRequest_parent(text->request);
299: HyperDoc * last = HTAnchor_document(parent);
300: int depth = last ? last->depth+1 : 0;
1.2 ! frystyk 301: HTRequest * newreq = Thread_new(mr, METHOD_GET);
! 302: HyperDoc_new(mr, dest_parent, depth);
! 303: if (SHOW_MSG) {
! 304: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
! 305: TTYPrint(TDEST, "Robot....... Loading `%s\'\n", uri);
! 306: free(uri);
! 307: }
! 308: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
! 309: if (SHOW_MSG) TTYPrint(TDEST, "Robot...... URI Not tested!\n");
! 310: Thread_delete(mr, newreq);
! 311: }
! 312: }
! 313: }
! 314: }
! 315:
! 316: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
! 317: CONST char *alt, CONST char * align, BOOL isMap)
! 318: {
! 319: if (text && anchor) {
! 320: Robot * mr = (Robot *) HTRequest_context(text->request);
! 321: HTParentAnchor * dest = (HTParentAnchor *)
! 322: HTAnchor_followMainLink((HTAnchor *) anchor);
! 323: HyperDoc * hd = HTAnchor_document(dest);
1.1 frystyk 324:
1.2 ! frystyk 325: /* Test whether we already have a hyperdoc for this document */
! 326: if (mr->flags & MR_IMG && dest && !hd) {
! 327: HTParentAnchor * parent = HTRequest_parent(text->request);
! 328: HyperDoc * last = HTAnchor_document(parent);
! 329: int depth = last ? last->depth+1 : 0;
! 330: HTRequest * newreq = Thread_new(mr, METHOD_HEAD);
! 331: HyperDoc_new(mr, dest, depth);
! 332: if (SHOW_MSG) {
! 333: char * uri = HTAnchor_address((HTAnchor *) dest);
! 334: TTYPrint(TDEST, "Robot....... Checking Image `%s\'\n", uri);
! 335: free(uri);
! 336: }
! 337: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
! 338: if (SHOW_MSG)
! 339: TTYPrint(TDEST, "Robot....... Image not tested!\n");
! 340: Thread_delete(mr, newreq);
1.1 frystyk 341: }
342: }
343: }
344: }
345:
346: PUBLIC void HText_endAnchor (HText * text) {}
347: PUBLIC void HText_appendText (HText * text, CONST char * str) {}
348: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
349: PUBLIC void HText_endAppend (HText * text) {}
350: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
351: PUBLIC void HText_beginAppend (HText * text) {}
352: PUBLIC void HText_appendParagraph (HText * text) {}
1.2 ! frystyk 353: PUBLIC BOOL HText_delete (HText * me) { return YES; }
1.1 frystyk 354:
355: /* ------------------------------------------------------------------------- */
356: /* MAIN PROGRAM */
357: /* ------------------------------------------------------------------------- */
358:
359: int main (int argc, char ** argv)
360: {
361: int status = 0;
362: int arg;
363: HTChunk * keywords = NULL; /* From command line */
364: int keycnt = 0;
365: Robot * mr = Robot_new(); /* Create new Robot instance */
366:
367: /* Starts Mac GUSI socket library */
368: #ifdef GUSI
369: GUSISetup(GUSIwithSIOUXSockets);
370: GUSISetup(GUSIwithInternetSockets);
371: #endif
372:
373: #ifdef __MWERKS__ /* STR */
374: InitGraf((Ptr) &qd.thePort);
375: InitFonts();
376: InitWindows();
377: InitMenus(); TEInit();
378: InitDialogs(nil);
379: InitCursor();
380: SIOUXSettings.asktosaveonclose = false;
381: argc=ccommand(&argv);
382: #endif
383:
384: /* Initiate W3C Reference Library */
385: HTLibInit(APP_NAME, APP_VERSION);
386:
387: /* Initialize the protocol modules */
388: HTAccessInit();
389:
390: /* Initialize set of converters */
391: mr->converters = HTList_new();
392: HTConverterInit(mr->converters);
393: HTFormat_setConversion(mr->converters);
394:
395: /* Initialize bindings between file suffixes and media types */
396: HTFileInit();
397:
398: /* Get any proxy or gateway environment variables */
399: HTProxy_getEnvVar();
400:
401: /* Scan command Line for parameters */
402: for (arg=1; arg<argc; arg++) {
403: if (*argv[arg] == '-') {
404:
405: /* -? or -help: show the command line help page */
406: if (!strcmp(argv[arg],"-?") || !strcmp(argv[arg],"-help")) {
407: mr->anchor = (HTParentAnchor *) HTAnchor_findAddress(HELP);
408: keycnt = 1;
409:
410: /* non-interactive */
411: } else if (!strcmp(argv[arg], "-n")) {
412: HTAlert_setInteractive(NO);
413:
414: /* log file */
415: } else if (!strcmp(argv[arg], "-l")) {
416: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
417: argv[++arg] : DEFAULT_LOG_FILE;
418:
419: /* rule file */
420: } else if (!strcmp(argv[arg], "-r")) {
421: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
422: argv[++arg] : DEFAULT_RULE_FILE;
423:
424: /* output filename */
425: } else if (!strcmp(argv[arg], "-o")) {
426: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
427: argv[++arg] : DEFAULT_OUTPUT_FILE;
428:
429: /* timeout -- Change the default request timeout */
430: } else if (!strcmp(argv[arg], "-timeout")) {
431: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
432: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
433: if (timeout > 0) mr->tv->tv_sec = timeout;
434:
435: /* preemtive or non-preemtive access */
436: } else if (!strcmp(argv[arg], "-single")) {
437: HTRequest_setPreemtive(mr->request, YES);
1.2 ! frystyk 438: mr->flags |= MR_PREEMTIVE;
! 439:
! 440: /* test inlined images */
! 441: } else if (!strcmp(argv[arg], "-img")) {
! 442: mr->flags |= MR_IMG;
! 443:
! 444: /* load anchors */
! 445: } else if (!strcmp(argv[arg], "-link")) {
! 446: mr->flags |= MR_LINK;
! 447:
! 448: /* preemtive or non-preemtive access */
! 449: } else if (!strcmp(argv[arg], "-single")) {
! 450: HTRequest_setPreemtive(mr->request, YES);
! 451: mr->flags |= MR_PREEMTIVE;
1.1 frystyk 452:
453: /* print version and exit */
454: } else if (!strcmp(argv[arg], "-version")) {
455: VersionInfo();
456: Cleanup(mr, 0);
457:
458: #ifdef WWWTRACE
459: /* trace flags */
460: } else if (!strncmp(argv[arg], "-v", 2)) {
461: char *p = argv[arg]+2;
462: WWWTRACE = 0;
463: for(; *p; p++) {
464: switch (*p) {
465: case 'a': WWWTRACE |= SHOW_ANCHOR_TRACE; break;
466: case 'b': WWWTRACE |= SHOW_BIND_TRACE; break;
467: case 'c': WWWTRACE |= SHOW_CACHE_TRACE; break;
468: case 'g': WWWTRACE |= SHOW_SGML_TRACE; break;
469: case 'p': WWWTRACE |= SHOW_PROTOCOL_TRACE; break;
470: case 's': WWWTRACE |= SHOW_STREAM_TRACE; break;
471: case 't': WWWTRACE |= SHOW_THREAD_TRACE; break;
472: case 'u': WWWTRACE |= SHOW_URI_TRACE; break;
473: default:
474: if (SHOW_MSG)
475: TTYPrint(TDEST,"Bad parameter (%s) in -v option\n",
476: argv[arg]);
477: }
478: }
479: if (!WWWTRACE) WWWTRACE = SHOW_ALL_TRACE;
480: #endif
481:
482: } else {
483: if (SHOW_MSG) TTYPrint(TDEST,"Bad Argument (%s)\n", argv[arg]);
484: }
485: } else { /* If no leading `-' then check for URL or keywords */
486: if (!keycnt) {
487: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
488: mr->anchor = (HTParentAnchor *) HTAnchor_findAddress(ref);
489: keycnt = 1;
490: FREE(ref);
491: } else { /* Check for successive keyword arguments */
492: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
493: if (keycnt++ <= 1)
494: keywords = HTChunkCreate(128);
495: else
496: HTChunkPutc(keywords, ' ');
497: HTChunkPuts(keywords, HTStrip(escaped));
498: free(escaped);
499: }
500: }
501: }
502:
503: #ifdef CATCH_SIG
504: SetSignal();
505: #endif
506:
507: if (!keycnt) {
1.2 ! frystyk 508: if (SHOW_MSG) TTYPrint(TDEST, "Please specify URL to check.\n");
1.1 frystyk 509: Cleanup(mr, -1);
510: }
511:
512: /* Rule file specified? */
513: if (mr->rules) {
514: HTList * list = HTList_new();
515: HTRequest * rr = HTRequest_new();
516: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
517: HTParentAnchor * ra = (HTParentAnchor *) HTAnchor_findAddress(rules);
518: HTRequest_setPreemtive(rr, YES);
519: HTConversion_add(list, "application/x-www-rules", "*/*", HTRules,
520: 1.0, 0.0, 0.0);
521: HTRequest_setConversion(rr, list, YES);
522: if (HTLoadAnchor((HTAnchor *) ra, rr) != YES)
523: if (SHOW_MSG) TTYPrint(TDEST, "Can't access rules\n");
524: HTConversion_deleteAll(list);
525: HTRequest_delete(rr);
526: FREE(rules);
527: }
528:
529: /* Output file specified? */
530: if (mr->outputfile) {
531: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
532: if (SHOW_MSG) TTYPrint(TDEST, "Can't open `%s'\n", mr->outputfile);
533: mr->output = OUTPUT;
534: }
535: }
536:
537: /* Set up the output */
538: HTRequest_setOutputStream(mr->request, HTFWriter_new(mr->output, YES));
539:
540: /* Log file specifed? */
541: if (mr->logfile) HTLog_open(mr->logfile, YES, YES);
542:
543: /* Register our User Prompts etc in the Alert Manager */
544: if (HTAlert_interactive()) {
545: HTAlert_add(HTError_print, HT_A_MESSAGE);
546: HTAlert_add(HTConfirm, HT_A_CONFIRM);
547: HTAlert_add(HTPrompt, HT_A_PROMPT);
548: HTAlert_add(HTPromptPassword, HT_A_SECRET);
549: HTAlert_add(HTPromptUsernameAndPassword, HT_A_USER_PW);
550: }
551:
552: /* Register a call back function for the Net Manager */
553: HTNetCall_addAfter(terminate_handler, HT_ALL);
554:
555: /* Set timeout on sockets */
556: HTEvent_registerTimeout(mr->tv, mr->request, timeout_handler, NO);
557:
558: /* Start the request */
559: if (keywords) /* Search */
560: status = HTSearch(HTChunkData(keywords), mr->anchor, mr->request);
561: else
562: status = HTLoadAnchor((HTAnchor *) mr->anchor, mr->request);
563:
564: if (keywords) HTChunkFree(keywords);
565: if (status != YES) {
566: if (SHOW_MSG) TTYPrint(TDEST, "Can't access resource\n");
567: Cleanup(mr, -1);
568: }
569:
570: /* Go into the event loop... */
571: HTEvent_Loop(mr->request);
572:
573: /* Only gets here if event loop fails */
574: Cleanup(mr, 0);
575: return 0;
576: }
Webmaster