Annotation of libwww/Robot/src/HTRobot.c, revision 1.1
1.1 ! frystyk 1: /* HTRobot.c
! 2: ** W3C MINI ROBOT
! 3: **
! 4: ** (c) COPYRIGHT MIT 1995.
! 5: ** Please first read the full copyright statement in the file COPYRIGH.
! 6: **
! 7: ** This program illustrates how to traverse links using the Anchor object
! 8: **
! 9: ** Authors:
! 10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
! 11: **
! 12: ** History:
! 13: ** Dec 04 95 First version
! 14: */
! 15:
! 16: #include "WWWLib.h" /* Global Library Include file */
! 17: #include "WWWApp.h" /* Application stuff */
! 18:
! 19: #include "HTRobot.h" /* Implemented here */
! 20:
/* Version string of the robot; VR may be predefined by the build */
#ifndef VR
#define VR "unspecified"
#endif

/* Application name and version passed to HTLibInit() and VersionInfo() */
#define APP_NAME                "W3CRobot"
#define APP_VERSION             VR

/* Page loaded for the "-?" and "-help" command line options */
#define HELP    "http://www.w3.org/pub/WWW/Robot/User/CommandLine.html"

/* File names used when -o, -r or -l is given without a value */
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"
#define DEFAULT_LOG_FILE        "robot.log"

/* Messages are printed when tracing is on or when running interactively */
#define SHOW_MSG                (WWWTRACE || HTAlert_interactive())

/* Request timeout in seconds unless changed with -timeout */
#define DEFAULT_TIMEOUT         10

/* On SVR4 the SetSignal() function is compiled in and called from main() */
#ifdef __svr4__
#define CATCH_SIG
#endif
! 42:
/* Option bits stored in the `flags' field of the Robot object */
typedef enum _MRFlags {
    MR_FILTER   = 1 << 0,
    MR_COUNT    = 1 << 1        /* Print content length of loaded documents */
} MRFlags;
! 47:
! 48: typedef struct _Robot {
! 49: HTRequest * request;
! 50: HTParentAnchor * anchor;
! 51: struct timeval * tv; /* Timeout on socket */
! 52: char * cwd; /* Current dir URL */
! 53: HTList * converters;
! 54: char * rules;
! 55: char * logfile;
! 56: char * outputfile;
! 57: FILE * output;
! 58: MRFlags flags;
! 59: } Robot;
! 60:
/* Load status kept in a HyperDoc; new HyperDocs start out as L_INVALID */
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   =  0,
    L_ERROR     =  1
} LoadState;
! 67:
! 68: /*
! 69: ** The HyperDoc object is bound to the anchor and contains information about
! 70: ** where we are in the search for recursive searches
! 71: */
! 72: typedef struct _HyperDoc {
! 73: HTParentAnchor * anchor;
! 74: LoadState state;
! 75: int depth;
! 76: } HyperDoc;
! 77:
! 78: /*
! 79: ** This is the HText object that is created every time we start parsing a
! 80: ** HTML object
! 81: */
! 82: typedef struct _HText {
! 83: HTRequest * request;
! 84: } HText;
! 85:
! 86: typedef struct _HTStyle HTStyle;
! 87: typedef struct _HTStyleSheet HTStyleSheet;
! 88:
! 89: PUBLIC HText * HTMainText = NULL;
! 90: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
! 91: PUBLIC HTStyleSheet * styleSheet = NULL;
! 92:
! 93: /* ------------------------------------------------------------------------- */
! 94:
! 95: /* Create a Command Line Object
! 96: ** ----------------------------
! 97: */
! 98: PRIVATE Robot * Robot_new (void)
! 99: {
! 100: Robot * me;
! 101: if ((me = (Robot *) calloc(1, sizeof(Robot))) == NULL ||
! 102: (me->tv = (struct timeval*) calloc(1, sizeof(struct timeval))) == NULL)
! 103: outofmem(__FILE__, "Robot_new");
! 104: me->tv->tv_sec = DEFAULT_TIMEOUT;
! 105: me->cwd = HTFindRelatedName();
! 106: me->output = OUTPUT;
! 107:
! 108: /* Bind the Robot object together with the Request Object */
! 109: me->request = HTRequest_new();
! 110: HTRequest_setContext (me->request, me);
! 111: return me;
! 112: }
! 113:
! 114: /* Delete a Command Line Object
! 115: ** ----------------------------
! 116: */
! 117: PRIVATE BOOL Robot_delete (Robot * me)
! 118: {
! 119: if (me) {
! 120: HTRequest_delete(me->request);
! 121: if (me->logfile) HTLog_close();
! 122: if (me->output && me->output != STDOUT) fclose(me->output);
! 123: FREE(me->cwd);
! 124: free(me->tv);
! 125: free(me);
! 126: return YES;
! 127: }
! 128: return NO;
! 129: }
! 130:
! 131: PRIVATE void Cleanup (Robot * me, int status)
! 132: {
! 133: Robot_delete(me);
! 134: HTLibTerminate();
! 135: #ifdef VMS
! 136: exit(status ? status : 1);
! 137: #else
! 138: exit(status ? status : 0);
! 139: #endif
! 140: }
! 141:
#ifdef CATCH_SIG
#include <signal.h>
/*      SetSignal
**      ---------
**      Sets up the signal handling of the robot. It might not be necessary
**      to call this if the application has its own handlers (lossage on
**      SVR4).
*/
PRIVATE void SetSignal (void)
{
    /*
    **  On some systems (SYSV) SIGPIPE must be dealt with when attempting
    **  to connect to a remote host where you normally should get
    **  `connection refused' back. Here the signal is simply ignored.
    */
    int ignored = (signal(SIGPIPE, SIG_IGN) != SIG_ERR);

    if (PROT_TRACE) {
        if (ignored)
            TTYPrint(TDEST, "HTSignal.... Ignoring SIGPIPE\n");
        else
            TTYPrint(TDEST, "HTSignal.... Can't catch SIGPIPE\n");
    }
}
#endif /* CATCH_SIG */
! 161:
! 162: PRIVATE void VersionInfo (void)
! 163: {
! 164: TTYPrint(OUTPUT,"\n\nW3C Reference Software\n\n");
! 165: TTYPrint(OUTPUT,"\tW3C Mini Robot (%s) version %s.\n",
! 166: APP_NAME, APP_VERSION);
! 167: TTYPrint(OUTPUT,"\tW3C Reference Library version %s.\n\n",HTLib_version());
! 168: TTYPrint(OUTPUT,"Please send feedback to <libwww@w3.org>\n");
! 169: }
! 170:
! 171: /* terminate_handler
! 172: ** -----------------
! 173: ** This function is registered to handle the result of the request
! 174: */
! 175: PRIVATE int terminate_handler (HTRequest * request, int status)
! 176: {
! 177: Robot * mr = (Robot *) HTRequest_context(request);
! 178: if (status == HT_LOADED) {
! 179: if (mr->flags & MR_COUNT) {
! 180: TTYPrint(OUTPUT, "Content Length found to be %ld\n",
! 181: HTAnchor_length(mr->anchor));
! 182: }
! 183: }
! 184: if (mr->logfile) HTLog_add(request, status);
! 185: return HT_OK;
! 186: }
! 187:
! 188: /* timeout_handler
! 189: ** ---------------
! 190: ** This function is registered to handle timeout in select eventloop
! 191: */
! 192: PRIVATE int timeout_handler (HTRequest * request)
! 193: {
! 194: if (SHOW_MSG) TTYPrint(TDEST, "Request timeout...\n");
! 195: HTRequest_kill(request);
! 196: return 0;
! 197: }
! 198:
! 199: /* ------------------------------------------------------------------------- */
! 200: /* HTEXT INTERFACE */
! 201: /* ------------------------------------------------------------------------- */
! 202:
! 203: /* Create a "HyperDoc" object
! 204: ** --------------------------
! 205: ** A HyperDoc object contains information about whether we have already
! 206: ** started checking the anchor and the depth in our search
! 207: */
! 208: PRIVATE HyperDoc * HyperDoc_new (HTParentAnchor * anchor, int depth)
! 209: {
! 210: HyperDoc * hd;
! 211: if ((hd = (HyperDoc *) calloc(1, sizeof(HyperDoc))) == NULL)
! 212: outofmem(__FILE__, "HyperDoc_new");
! 213: hd->state = L_INVALID;
! 214: hd->depth = depth;
! 215:
! 216: /* Bind the HyperDoc object together with the Anchor Object */
! 217: hd->anchor = anchor;
! 218: HTAnchor_setDocument(anchor, (void *) hd);
! 219:
! 220: if (SHOW_MSG)
! 221: TTYPrint(TDEST, "HyperDoc.... %p bound to anchor %p with depth %d\n",
! 222: hd, anchor, depth);
! 223: return hd;
! 224: }
! 225:
! 226: /* Delete a "HyperDoc" object
! 227: ** --------------------------
! 228: */
! 229: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
! 230: {
! 231: if (hd) {
! 232: free (hd);
! 233: return YES;
! 234: }
! 235: return NO;
! 236: }
! 237:
! 238: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
! 239: HTStream * stream)
! 240: {
! 241: HText * me;
! 242: if ((me = (HText *) calloc(1, sizeof(HText))) == NULL)
! 243: outofmem(__FILE__, "HText_new2");
! 244: me->request = request;
! 245: return me;
! 246: }
! 247:
! 248: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
! 249: {
! 250: if (text && anchor) {
! 251: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
! 252: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
! 253: HyperDoc * hd = HTAnchor_document(dest_parent);
! 254:
! 255: /* Test whether we have already a hyperdoc for this document */
! 256: if (!hd) {
! 257: HTParentAnchor * parent = HTRequest_parent(text->request);
! 258: HyperDoc * last = HTAnchor_document(parent);
! 259: int depth = last ? last->depth+1 : 0;
! 260: HyperDoc_new(dest_parent, depth);
! 261:
! 262: /* Create a new request object */
! 263: if (dest_parent) {
! 264: Robot * mr = (Robot *) HTRequest_context(text->request);
! 265: HTRequest * newreq = HTRequest_new();
! 266: HTRequest_setContext (newreq, mr);
! 267: if (SHOW_MSG) {
! 268: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
! 269: TTYPrint(TDEST, "Robot....... Loading `%s\'\n", uri);
! 270: free(uri);
! 271: }
! 272: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES)
! 273: if (SHOW_MSG) TTYPrint(TDEST, "URI Not tested!\n");
! 274: }
! 275: }
! 276: }
! 277: }
! 278:
! 279: PUBLIC void HText_endAnchor (HText * text) {}
! 280: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
! 281: CONST char * alt, CONST char * alignment,
! 282: BOOL isMap) {}
! 283: PUBLIC void HText_appendText (HText * text, CONST char * str) {}
! 284: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
! 285: PUBLIC void HText_endAppend (HText * text) {}
! 286: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
! 287: PUBLIC void HText_beginAppend (HText * text) {}
! 288: PUBLIC void HText_appendParagraph (HText * text) {}
! 289: PUBLIC BOOL HText_delete (HText * me)
! 290: {
! 291: return YES;
! 292: }
! 293:
! 294: /* ------------------------------------------------------------------------- */
! 295: /* MAIN PROGRAM */
! 296: /* ------------------------------------------------------------------------- */
! 297:
! 298: int main (int argc, char ** argv)
! 299: {
! 300: int status = 0;
! 301: int arg;
! 302: HTChunk * keywords = NULL; /* From command line */
! 303: int keycnt = 0;
! 304: Robot * mr = Robot_new(); /* Create new Robot instance */
! 305:
! 306: /* Starts Mac GUSI socket library */
! 307: #ifdef GUSI
! 308: GUSISetup(GUSIwithSIOUXSockets);
! 309: GUSISetup(GUSIwithInternetSockets);
! 310: #endif
! 311:
! 312: #ifdef __MWERKS__ /* STR */
! 313: InitGraf((Ptr) &qd.thePort);
! 314: InitFonts();
! 315: InitWindows();
! 316: InitMenus(); TEInit();
! 317: InitDialogs(nil);
! 318: InitCursor();
! 319: SIOUXSettings.asktosaveonclose = false;
! 320: argc=ccommand(&argv);
! 321: #endif
! 322:
! 323: /* Initiate W3C Reference Library */
! 324: HTLibInit(APP_NAME, APP_VERSION);
! 325:
! 326: /* Initialize the protocol modules */
! 327: HTAccessInit();
! 328:
! 329: /* Initialize set of converters */
! 330: mr->converters = HTList_new();
! 331: HTConverterInit(mr->converters);
! 332: HTFormat_setConversion(mr->converters);
! 333:
! 334: /* Initialize bindings between file suffixes and media types */
! 335: HTFileInit();
! 336:
! 337: /* Get any proxy or gateway environment variables */
! 338: HTProxy_getEnvVar();
! 339:
! 340: /* Scan command Line for parameters */
! 341: for (arg=1; arg<argc; arg++) {
! 342: if (*argv[arg] == '-') {
! 343:
! 344: /* -? or -help: show the command line help page */
! 345: if (!strcmp(argv[arg],"-?") || !strcmp(argv[arg],"-help")) {
! 346: mr->anchor = (HTParentAnchor *) HTAnchor_findAddress(HELP);
! 347: keycnt = 1;
! 348:
! 349: /* non-interactive */
! 350: } else if (!strcmp(argv[arg], "-n")) {
! 351: HTAlert_setInteractive(NO);
! 352:
! 353: /* log file */
! 354: } else if (!strcmp(argv[arg], "-l")) {
! 355: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 356: argv[++arg] : DEFAULT_LOG_FILE;
! 357:
! 358: /* rule file */
! 359: } else if (!strcmp(argv[arg], "-r")) {
! 360: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
! 361: argv[++arg] : DEFAULT_RULE_FILE;
! 362:
! 363: /* output filename */
! 364: } else if (!strcmp(argv[arg], "-o")) {
! 365: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 366: argv[++arg] : DEFAULT_OUTPUT_FILE;
! 367:
! 368: /* timeout -- Change the default request timeout */
! 369: } else if (!strcmp(argv[arg], "-timeout")) {
! 370: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
! 371: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
! 372: if (timeout > 0) mr->tv->tv_sec = timeout;
! 373:
! 374: /* preemtive or non-preemtive access */
! 375: } else if (!strcmp(argv[arg], "-single")) {
! 376: HTRequest_setPreemtive(mr->request, YES);
! 377:
! 378: /* print version and exit */
! 379: } else if (!strcmp(argv[arg], "-version")) {
! 380: VersionInfo();
! 381: Cleanup(mr, 0);
! 382:
! 383: #ifdef WWWTRACE
! 384: /* trace flags */
! 385: } else if (!strncmp(argv[arg], "-v", 2)) {
! 386: char *p = argv[arg]+2;
! 387: WWWTRACE = 0;
! 388: for(; *p; p++) {
! 389: switch (*p) {
! 390: case 'a': WWWTRACE |= SHOW_ANCHOR_TRACE; break;
! 391: case 'b': WWWTRACE |= SHOW_BIND_TRACE; break;
! 392: case 'c': WWWTRACE |= SHOW_CACHE_TRACE; break;
! 393: case 'g': WWWTRACE |= SHOW_SGML_TRACE; break;
! 394: case 'p': WWWTRACE |= SHOW_PROTOCOL_TRACE; break;
! 395: case 's': WWWTRACE |= SHOW_STREAM_TRACE; break;
! 396: case 't': WWWTRACE |= SHOW_THREAD_TRACE; break;
! 397: case 'u': WWWTRACE |= SHOW_URI_TRACE; break;
! 398: default:
! 399: if (SHOW_MSG)
! 400: TTYPrint(TDEST,"Bad parameter (%s) in -v option\n",
! 401: argv[arg]);
! 402: }
! 403: }
! 404: if (!WWWTRACE) WWWTRACE = SHOW_ALL_TRACE;
! 405: #endif
! 406:
! 407: } else {
! 408: if (SHOW_MSG) TTYPrint(TDEST,"Bad Argument (%s)\n", argv[arg]);
! 409: }
! 410: } else { /* If no leading `-' then check for URL or keywords */
! 411: if (!keycnt) {
! 412: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
! 413: mr->anchor = (HTParentAnchor *) HTAnchor_findAddress(ref);
! 414: keycnt = 1;
! 415: FREE(ref);
! 416: } else { /* Check for successive keyword arguments */
! 417: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
! 418: if (keycnt++ <= 1)
! 419: keywords = HTChunkCreate(128);
! 420: else
! 421: HTChunkPutc(keywords, ' ');
! 422: HTChunkPuts(keywords, HTStrip(escaped));
! 423: free(escaped);
! 424: }
! 425: }
! 426: }
! 427:
! 428: #ifdef CATCH_SIG
! 429: SetSignal();
! 430: #endif
! 431:
! 432: if (!keycnt) {
! 433: if (SHOW_MSG) TTYPrint(TDEST, "No URL specified\n");
! 434: Cleanup(mr, -1);
! 435: }
! 436:
! 437: /* Rule file specified? */
! 438: if (mr->rules) {
! 439: HTList * list = HTList_new();
! 440: HTRequest * rr = HTRequest_new();
! 441: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
! 442: HTParentAnchor * ra = (HTParentAnchor *) HTAnchor_findAddress(rules);
! 443: HTRequest_setPreemtive(rr, YES);
! 444: HTConversion_add(list, "application/x-www-rules", "*/*", HTRules,
! 445: 1.0, 0.0, 0.0);
! 446: HTRequest_setConversion(rr, list, YES);
! 447: if (HTLoadAnchor((HTAnchor *) ra, rr) != YES)
! 448: if (SHOW_MSG) TTYPrint(TDEST, "Can't access rules\n");
! 449: HTConversion_deleteAll(list);
! 450: HTRequest_delete(rr);
! 451: FREE(rules);
! 452: }
! 453:
! 454: /* Output file specified? */
! 455: if (mr->outputfile) {
! 456: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
! 457: if (SHOW_MSG) TTYPrint(TDEST, "Can't open `%s'\n", mr->outputfile);
! 458: mr->output = OUTPUT;
! 459: }
! 460: }
! 461:
! 462: /* Set up the output */
! 463: HTRequest_setOutputStream(mr->request, HTFWriter_new(mr->output, YES));
! 464:
! 465: /* Log file specifed? */
! 466: if (mr->logfile) HTLog_open(mr->logfile, YES, YES);
! 467:
! 468: /* Register our User Prompts etc in the Alert Manager */
! 469: if (HTAlert_interactive()) {
! 470: HTAlert_add(HTError_print, HT_A_MESSAGE);
! 471: HTAlert_add(HTConfirm, HT_A_CONFIRM);
! 472: HTAlert_add(HTPrompt, HT_A_PROMPT);
! 473: HTAlert_add(HTPromptPassword, HT_A_SECRET);
! 474: HTAlert_add(HTPromptUsernameAndPassword, HT_A_USER_PW);
! 475: }
! 476:
! 477: /* Register a call back function for the Net Manager */
! 478: HTNetCall_addAfter(terminate_handler, HT_ALL);
! 479:
! 480: /* Set timeout on sockets */
! 481: HTEvent_registerTimeout(mr->tv, mr->request, timeout_handler, NO);
! 482:
! 483: /* Start the request */
! 484: if (keywords) /* Search */
! 485: status = HTSearch(HTChunkData(keywords), mr->anchor, mr->request);
! 486: else
! 487: status = HTLoadAnchor((HTAnchor *) mr->anchor, mr->request);
! 488:
! 489: if (keywords) HTChunkFree(keywords);
! 490: if (status != YES) {
! 491: if (SHOW_MSG) TTYPrint(TDEST, "Can't access resource\n");
! 492: Cleanup(mr, -1);
! 493: }
! 494:
! 495: /* Go into the event loop... */
! 496: HTEvent_Loop(mr->request);
! 497:
! 498: /* Only gets here if event loop fails */
! 499: Cleanup(mr, 0);
! 500: return 0;
! 501: }
Webmaster