Annotation of libwww/Robot/src/HTRobot.c, revision 1.4
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.4 ! frystyk 18: #include "HText.h"
1.1 frystyk 19:
20: #include "HTRobot.h" /* Implemented here */
21:
/* Version string; may be supplied by the build, otherwise a placeholder */
#if !defined(VR)
#define VR                      "unspecified"
#endif

/* Application name and version handed to the library and shown to the user */
#define APP_NAME                "W3CRobot"
#define APP_VERSION             VR

/* Default page for "-help" command line option */
#define HELP    "http://www.w3.org/pub/WWW/Robot/User/CommandLine.html"

/* File names used when -o, -r or -l is given without an argument */
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"
#define DEFAULT_LOG_FILE        "robot.log"

/* Print messages when tracing is on or when running interactively */
#define SHOW_MSG                (WWWTRACE || HTAlert_interactive())

#define DEFAULT_TIMEOUT         10                     /* timeout in seconds */

/* Install our own signal handling on SVR4 platforms */
#ifdef __svr4__
#define CATCH_SIG
#endif
43:
/*
**  Option bits kept in the Robot object. Each one is switched on by a
**  command line option.
*/
typedef enum _MRFlags {
    MR_IMG       = 1 << 0,              /* "-img": test inlined images */
    MR_LINK      = 1 << 1,              /* "-link": load anchors */
    MR_PREEMTIVE = 1 << 2               /* "-single": preemptive requests */
} MRFlags;
49:
50: typedef struct _Robot {
51: HTRequest * request;
52: HTParentAnchor * anchor;
1.2 frystyk 53: int depth; /* How deep is our tree */
54: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 ! frystyk 55: HTList * htext; /* List of our HText Objects */
1.1 frystyk 56: struct timeval * tv; /* Timeout on socket */
57: char * cwd; /* Current dir URL */
58: HTList * converters;
59: char * rules;
60: char * logfile;
61: char * outputfile;
62: FILE * output;
63: MRFlags flags;
64: } Robot;
65:
/*
**  Load status stored in a HyperDoc object. A freshly created HyperDoc
**  starts out as L_INVALID.
*/
typedef enum _LoadState {
    L_INVALID = -2,
    L_LOADING = -1,
    L_SUCCESS =  0,
    L_ERROR   =  1
} LoadState;
72:
73: /*
74: ** The HyperDoc object is bound to the anchor and contains information about
75: ** where we are in the search for recursive searches
76: */
77: typedef struct _HyperDoc {
78: HTParentAnchor * anchor;
79: LoadState state;
80: int depth;
81: } HyperDoc;
82:
83: /*
84: ** This is the HText object that is created every time we start parsing a
85: ** HTML object
86: */
1.4 ! frystyk 87: struct _HText {
1.1 frystyk 88: HTRequest * request;
1.4 ! frystyk 89: };
1.1 frystyk 90:
91: PUBLIC HText * HTMainText = NULL;
92: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
93: PUBLIC HTStyleSheet * styleSheet = NULL;
94:
95: /* ------------------------------------------------------------------------- */
96:
1.2 frystyk 97: /* Create a "HyperDoc" object
98: ** --------------------------
99: ** A HyperDoc object contains information about whether we have already
100: ** started checking the anchor and the depth in our search
101: */
102: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
103: {
104: HyperDoc * hd;
105: if ((hd = (HyperDoc *) calloc(1, sizeof(HyperDoc))) == NULL)
106: outofmem(__FILE__, "HyperDoc_new");
107: hd->state = L_INVALID;
108: hd->depth = depth;
109:
110: /* Bind the HyperDoc object together with the Anchor Object */
111: hd->anchor = anchor;
112: HTAnchor_setDocument(anchor, (void *) hd);
113:
114: /* Add this HyperDoc object to our list */
115: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
116: HTList_addObject(mr->hyperdoc, (void *) hd);
117:
118: if (SHOW_MSG)
119: TTYPrint(TDEST, "HyperDoc.... %p bound to anchor %p with depth %d\n",
120: hd, anchor, depth);
121: return hd;
122: }
123:
124: /* Delete a "HyperDoc" object
125: ** --------------------------
126: */
127: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
128: {
129: if (hd) {
130: free (hd);
131: return YES;
132: }
133: return NO;
134: }
135:
1.1 frystyk 136: /* Create a Command Line Object
137: ** ----------------------------
138: */
139: PRIVATE Robot * Robot_new (void)
140: {
141: Robot * me;
142: if ((me = (Robot *) calloc(1, sizeof(Robot))) == NULL ||
143: (me->tv = (struct timeval*) calloc(1, sizeof(struct timeval))) == NULL)
144: outofmem(__FILE__, "Robot_new");
1.2 frystyk 145: me->hyperdoc = HTList_new();
1.4 ! frystyk 146: me->htext = HTList_new();
1.1 frystyk 147: me->tv->tv_sec = DEFAULT_TIMEOUT;
148: me->cwd = HTFindRelatedName();
149: me->output = OUTPUT;
150:
151: /* Bind the Robot object together with the Request Object */
152: me->request = HTRequest_new();
153: HTRequest_setContext (me->request, me);
154: return me;
155: }
156:
157: /* Delete a Command Line Object
158: ** ----------------------------
159: */
160: PRIVATE BOOL Robot_delete (Robot * me)
161: {
162: if (me) {
1.2 frystyk 163: if (me->hyperdoc) {
164: HTList * cur = me->hyperdoc;
165: HyperDoc * pres;
166: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
167: HyperDoc_delete(pres);
168: HTList_delete(me->hyperdoc);
169: }
1.4 ! frystyk 170: if (me->htext) {
! 171: HTList * cur = me->htext;
! 172: HText * pres;
! 173: while ((pres = (HText *) HTList_nextObject(cur)))
! 174: HText_free(pres);
! 175: HTList_delete(me->htext);
! 176: }
1.1 frystyk 177: if (me->logfile) HTLog_close();
178: if (me->output && me->output != STDOUT) fclose(me->output);
179: FREE(me->cwd);
180: free(me->tv);
181: free(me);
182: return YES;
183: }
184: return NO;
185: }
186:
1.2 frystyk 187: /*
188: ** This function creates a new request object and initializes it
189: */
190: PRIVATE HTRequest * Thread_new (Robot * mr, HTMethod method)
191: {
192: HTRequest * newreq = HTRequest_new();
193: HTRequest_setContext (newreq, mr);
194: if (mr->flags & MR_PREEMTIVE) HTRequest_setPreemtive(newreq, YES);
195: HTRequest_addRqHd(newreq, HT_HOST);
196: HTRequest_setMethod(newreq, method);
197: return newreq;
198: }
199:
200: PRIVATE BOOL Thread_delete (Robot * mr, HTRequest * request)
201: {
202: if (mr && request) {
203: HTRequest_delete(request);
204: return YES;
205: }
206: return NO;
207: }
208:
209: /*
210: ** Cleanup and make sure we close all connections including the persistent
211: ** ones
212: */
1.1 frystyk 213: PRIVATE void Cleanup (Robot * me, int status)
214: {
1.2 frystyk 215: HTNet_killAll();
1.1 frystyk 216: Robot_delete(me);
217: HTLibTerminate();
218: #ifdef VMS
219: exit(status ? status : 1);
220: #else
221: exit(status ? status : 0);
222: #endif
223: }
224:
225: #ifdef CATCH_SIG
226: #include <signal.h>
227: /* SetSignal
228: ** This function sets up signal handlers. This might not be necessary to
229: ** call if the application has its own handlers (lossage on SVR4)
230: */
231: PRIVATE void SetSignal (void)
232: {
233: /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
234: ** when attemting to connect to a remote host where you normally should
235: ** get `connection refused' back
236: */
237: if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
238: if (PROT_TRACE) TTYPrint(TDEST, "HTSignal.... Can't catch SIGPIPE\n");
239: } else {
240: if (PROT_TRACE) TTYPrint(TDEST, "HTSignal.... Ignoring SIGPIPE\n");
241: }
242: }
243: #endif /* CATCH_SIG */
244:
245: PRIVATE void VersionInfo (void)
246: {
247: TTYPrint(OUTPUT,"\n\nW3C Reference Software\n\n");
248: TTYPrint(OUTPUT,"\tW3C Mini Robot (%s) version %s.\n",
249: APP_NAME, APP_VERSION);
250: TTYPrint(OUTPUT,"\tW3C Reference Library version %s.\n\n",HTLib_version());
251: TTYPrint(OUTPUT,"Please send feedback to <libwww@w3.org>\n");
252: }
253:
254: /* terminate_handler
255: ** -----------------
1.2 frystyk 256: ** This function is registered to handle the result of the request.
257: ** If no more requests are pending then terminate program
1.1 frystyk 258: */
259: PRIVATE int terminate_handler (HTRequest * request, int status)
260: {
261: Robot * mr = (Robot *) HTRequest_context(request);
262: if (mr->logfile) HTLog_add(request, status);
1.2 frystyk 263: Thread_delete(mr, request);
1.3 frystyk 264: if (HTNet_isEmpty()) Cleanup(mr, 0);
1.1 frystyk 265: return HT_OK;
266: }
267:
268: /* timeout_handler
269: ** ---------------
270: ** This function is registered to handle timeout in select eventloop
271: */
272: PRIVATE int timeout_handler (HTRequest * request)
273: {
1.2 frystyk 274: Robot * mr = (Robot *) HTRequest_context(request);
275: if (SHOW_MSG) TTYPrint(TDEST, "Robot....... Request timeout...\n");
1.1 frystyk 276: HTRequest_kill(request);
1.2 frystyk 277: Thread_delete(mr, request);
1.4 ! frystyk 278: if (HTNet_isEmpty()) Cleanup(mr, -1);
! 279: return HT_OK;
1.1 frystyk 280: }
281:
282: /* ------------------------------------------------------------------------- */
283: /* HTEXT INTERFACE */
284: /* ------------------------------------------------------------------------- */
285:
286: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
287: HTStream * stream)
288: {
289: HText * me;
1.4 ! frystyk 290: Robot * mr = (Robot *) HTRequest_context(request);
1.1 frystyk 291: if ((me = (HText *) calloc(1, sizeof(HText))) == NULL)
292: outofmem(__FILE__, "HText_new2");
1.4 ! frystyk 293:
! 294: /* Bind the HText object together with the Request Object */
1.1 frystyk 295: me->request = request;
1.4 ! frystyk 296:
! 297: /* Add this HyperDoc object to our list */
! 298: if (!mr->htext) mr->htext = HTList_new();
! 299: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 300: return me;
301: }
302:
1.4 ! frystyk 303: PUBLIC void HText_free (HText * me) {
! 304: if (me) free (me);
! 305: }
! 306:
1.1 frystyk 307: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
308: {
309: if (text && anchor) {
1.2 frystyk 310: Robot * mr = (Robot *) HTRequest_context(text->request);
1.1 frystyk 311: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
312: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
313: HyperDoc * hd = HTAnchor_document(dest_parent);
314:
1.2 frystyk 315: /* Test whether we already have a hyperdoc for this document */
316: if (mr->flags & MR_LINK && dest_parent && !hd) {
1.1 frystyk 317: HTParentAnchor * parent = HTRequest_parent(text->request);
318: HyperDoc * last = HTAnchor_document(parent);
319: int depth = last ? last->depth+1 : 0;
1.2 frystyk 320: HTRequest * newreq = Thread_new(mr, METHOD_GET);
321: HyperDoc_new(mr, dest_parent, depth);
322: if (SHOW_MSG) {
323: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
324: TTYPrint(TDEST, "Robot....... Loading `%s\'\n", uri);
325: free(uri);
326: }
327: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
328: if (SHOW_MSG) TTYPrint(TDEST, "Robot...... URI Not tested!\n");
329: Thread_delete(mr, newreq);
330: }
331: }
332: }
333: }
334:
335: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
336: CONST char *alt, CONST char * align, BOOL isMap)
337: {
338: if (text && anchor) {
339: Robot * mr = (Robot *) HTRequest_context(text->request);
340: HTParentAnchor * dest = (HTParentAnchor *)
341: HTAnchor_followMainLink((HTAnchor *) anchor);
342: HyperDoc * hd = HTAnchor_document(dest);
1.1 frystyk 343:
1.2 frystyk 344: /* Test whether we already have a hyperdoc for this document */
345: if (mr->flags & MR_IMG && dest && !hd) {
346: HTParentAnchor * parent = HTRequest_parent(text->request);
347: HyperDoc * last = HTAnchor_document(parent);
348: int depth = last ? last->depth+1 : 0;
349: HTRequest * newreq = Thread_new(mr, METHOD_HEAD);
350: HyperDoc_new(mr, dest, depth);
351: if (SHOW_MSG) {
352: char * uri = HTAnchor_address((HTAnchor *) dest);
353: TTYPrint(TDEST, "Robot....... Checking Image `%s\'\n", uri);
354: free(uri);
355: }
356: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
357: if (SHOW_MSG)
358: TTYPrint(TDEST, "Robot....... Image not tested!\n");
359: Thread_delete(mr, newreq);
1.1 frystyk 360: }
361: }
362: }
363: }
364:
365: PUBLIC void HText_endAnchor (HText * text) {}
366: PUBLIC void HText_appendText (HText * text, CONST char * str) {}
367: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
368: PUBLIC void HText_endAppend (HText * text) {}
369: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
370: PUBLIC void HText_beginAppend (HText * text) {}
371: PUBLIC void HText_appendParagraph (HText * text) {}
372:
373: /* ------------------------------------------------------------------------- */
374: /* MAIN PROGRAM */
375: /* ------------------------------------------------------------------------- */
376:
377: int main (int argc, char ** argv)
378: {
379: int status = 0;
380: int arg;
381: HTChunk * keywords = NULL; /* From command line */
382: int keycnt = 0;
383: Robot * mr = Robot_new(); /* Create new Robot instance */
384:
385: /* Starts Mac GUSI socket library */
386: #ifdef GUSI
387: GUSISetup(GUSIwithSIOUXSockets);
388: GUSISetup(GUSIwithInternetSockets);
389: #endif
390:
391: #ifdef __MWERKS__ /* STR */
392: InitGraf((Ptr) &qd.thePort);
393: InitFonts();
394: InitWindows();
395: InitMenus(); TEInit();
396: InitDialogs(nil);
397: InitCursor();
398: SIOUXSettings.asktosaveonclose = false;
399: argc=ccommand(&argv);
400: #endif
401:
402: /* Initiate W3C Reference Library */
403: HTLibInit(APP_NAME, APP_VERSION);
404:
405: /* Initialize the protocol modules */
406: HTAccessInit();
407:
408: /* Initialize set of converters */
409: mr->converters = HTList_new();
410: HTConverterInit(mr->converters);
411: HTFormat_setConversion(mr->converters);
412:
413: /* Initialize bindings between file suffixes and media types */
414: HTFileInit();
415:
416: /* Get any proxy or gateway environment variables */
417: HTProxy_getEnvVar();
418:
419: /* Scan command Line for parameters */
420: for (arg=1; arg<argc; arg++) {
421: if (*argv[arg] == '-') {
422:
423: /* -? or -help: show the command line help page */
424: if (!strcmp(argv[arg],"-?") || !strcmp(argv[arg],"-help")) {
425: mr->anchor = (HTParentAnchor *) HTAnchor_findAddress(HELP);
426: keycnt = 1;
427:
428: /* non-interactive */
429: } else if (!strcmp(argv[arg], "-n")) {
430: HTAlert_setInteractive(NO);
431:
432: /* log file */
433: } else if (!strcmp(argv[arg], "-l")) {
434: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
435: argv[++arg] : DEFAULT_LOG_FILE;
436:
437: /* rule file */
438: } else if (!strcmp(argv[arg], "-r")) {
439: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
440: argv[++arg] : DEFAULT_RULE_FILE;
441:
442: /* output filename */
443: } else if (!strcmp(argv[arg], "-o")) {
444: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
445: argv[++arg] : DEFAULT_OUTPUT_FILE;
446:
447: /* timeout -- Change the default request timeout */
448: } else if (!strcmp(argv[arg], "-timeout")) {
449: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
450: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
451: if (timeout > 0) mr->tv->tv_sec = timeout;
452:
453: /* preemtive or non-preemtive access */
454: } else if (!strcmp(argv[arg], "-single")) {
455: HTRequest_setPreemtive(mr->request, YES);
1.2 frystyk 456: mr->flags |= MR_PREEMTIVE;
457:
458: /* test inlined images */
459: } else if (!strcmp(argv[arg], "-img")) {
460: mr->flags |= MR_IMG;
461:
462: /* load anchors */
463: } else if (!strcmp(argv[arg], "-link")) {
464: mr->flags |= MR_LINK;
465:
466: /* preemtive or non-preemtive access */
467: } else if (!strcmp(argv[arg], "-single")) {
468: HTRequest_setPreemtive(mr->request, YES);
469: mr->flags |= MR_PREEMTIVE;
1.1 frystyk 470:
471: /* print version and exit */
472: } else if (!strcmp(argv[arg], "-version")) {
473: VersionInfo();
474: Cleanup(mr, 0);
475:
476: #ifdef WWWTRACE
477: /* trace flags */
478: } else if (!strncmp(argv[arg], "-v", 2)) {
479: char *p = argv[arg]+2;
480: WWWTRACE = 0;
481: for(; *p; p++) {
482: switch (*p) {
483: case 'a': WWWTRACE |= SHOW_ANCHOR_TRACE; break;
484: case 'b': WWWTRACE |= SHOW_BIND_TRACE; break;
485: case 'c': WWWTRACE |= SHOW_CACHE_TRACE; break;
486: case 'g': WWWTRACE |= SHOW_SGML_TRACE; break;
487: case 'p': WWWTRACE |= SHOW_PROTOCOL_TRACE; break;
488: case 's': WWWTRACE |= SHOW_STREAM_TRACE; break;
489: case 't': WWWTRACE |= SHOW_THREAD_TRACE; break;
490: case 'u': WWWTRACE |= SHOW_URI_TRACE; break;
491: default:
492: if (SHOW_MSG)
493: TTYPrint(TDEST,"Bad parameter (%s) in -v option\n",
494: argv[arg]);
495: }
496: }
497: if (!WWWTRACE) WWWTRACE = SHOW_ALL_TRACE;
498: #endif
499:
500: } else {
501: if (SHOW_MSG) TTYPrint(TDEST,"Bad Argument (%s)\n", argv[arg]);
502: }
503: } else { /* If no leading `-' then check for URL or keywords */
504: if (!keycnt) {
505: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
506: mr->anchor = (HTParentAnchor *) HTAnchor_findAddress(ref);
507: keycnt = 1;
508: FREE(ref);
509: } else { /* Check for successive keyword arguments */
510: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
511: if (keycnt++ <= 1)
512: keywords = HTChunkCreate(128);
513: else
514: HTChunkPutc(keywords, ' ');
515: HTChunkPuts(keywords, HTStrip(escaped));
516: free(escaped);
517: }
518: }
519: }
520:
521: #ifdef CATCH_SIG
522: SetSignal();
523: #endif
524:
525: if (!keycnt) {
1.2 frystyk 526: if (SHOW_MSG) TTYPrint(TDEST, "Please specify URL to check.\n");
1.1 frystyk 527: Cleanup(mr, -1);
528: }
529:
530: /* Rule file specified? */
531: if (mr->rules) {
532: HTList * list = HTList_new();
533: HTRequest * rr = HTRequest_new();
534: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
535: HTParentAnchor * ra = (HTParentAnchor *) HTAnchor_findAddress(rules);
536: HTRequest_setPreemtive(rr, YES);
537: HTConversion_add(list, "application/x-www-rules", "*/*", HTRules,
538: 1.0, 0.0, 0.0);
539: HTRequest_setConversion(rr, list, YES);
540: if (HTLoadAnchor((HTAnchor *) ra, rr) != YES)
541: if (SHOW_MSG) TTYPrint(TDEST, "Can't access rules\n");
542: HTConversion_deleteAll(list);
543: HTRequest_delete(rr);
544: FREE(rules);
545: }
546:
547: /* Output file specified? */
548: if (mr->outputfile) {
549: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
550: if (SHOW_MSG) TTYPrint(TDEST, "Can't open `%s'\n", mr->outputfile);
551: mr->output = OUTPUT;
552: }
553: }
554:
555: /* Set up the output */
556: HTRequest_setOutputStream(mr->request, HTFWriter_new(mr->output, YES));
557:
558: /* Log file specifed? */
559: if (mr->logfile) HTLog_open(mr->logfile, YES, YES);
560:
561: /* Register our User Prompts etc in the Alert Manager */
562: if (HTAlert_interactive()) {
563: HTAlert_add(HTError_print, HT_A_MESSAGE);
564: HTAlert_add(HTConfirm, HT_A_CONFIRM);
565: HTAlert_add(HTPrompt, HT_A_PROMPT);
566: HTAlert_add(HTPromptPassword, HT_A_SECRET);
567: HTAlert_add(HTPromptUsernameAndPassword, HT_A_USER_PW);
568: }
569:
570: /* Register a call back function for the Net Manager */
571: HTNetCall_addAfter(terminate_handler, HT_ALL);
572:
573: /* Set timeout on sockets */
574: HTEvent_registerTimeout(mr->tv, mr->request, timeout_handler, NO);
575:
576: /* Start the request */
577: if (keywords) /* Search */
578: status = HTSearch(HTChunkData(keywords), mr->anchor, mr->request);
579: else
580: status = HTLoadAnchor((HTAnchor *) mr->anchor, mr->request);
581:
582: if (keywords) HTChunkFree(keywords);
583: if (status != YES) {
584: if (SHOW_MSG) TTYPrint(TDEST, "Can't access resource\n");
585: Cleanup(mr, -1);
586: }
587:
588: /* Go into the event loop... */
589: HTEvent_Loop(mr->request);
590:
591: /* Only gets here if event loop fails */
592: Cleanup(mr, 0);
593: return 0;
594: }
Webmaster