Annotation of libwww/Robot/src/HTRobot.c, revision 1.63
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
1.58 frystyk 25: #ifdef HT_POSIX_REGEX
1.62 frystyk 26: #ifdef HAVE_REGEX_H
27: #include <regex.h>
28: #else
29: #ifdef HAVE_RXPOSIX_H
30: #include <rxposix.h>
31: #endif
32: #endif
1.60 frystyk 33: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 34: #endif
35:
/* Fall back to a placeholder when the build does not provide a version */
#ifndef W3C_VERSION
#define W3C_VERSION             "Unspecified"
#endif

/* Application identity, printed by the version banner */
#define APP_NAME                "W3CRobot"
#define APP_VERSION             W3C_VERSION
#define COMMAND_LINE            "http://www.w3.org/Robot/User/CommandLine"

/* Default file names for output, rules and the various logs */
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"
#define DEFAULT_LOG_FILE        "log-clf.txt"
#define DEFAULT_HIT_FILE        "log-hit.txt"
#define DEFAULT_LM_FILE         "log-lastmodified.txt"
#define DEFAULT_TITLE_FILE      "log-title.txt"
#define DEFAULT_REFERER_FILE    "log-referer.txt"
#define DEFAULT_REJECT_FILE     "log-reject.txt"
#define DEFAULT_NOTFOUND_FILE   "log-notfound.txt"
#define DEFAULT_CONNEG_FILE     "log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE   "log-alt.txt"
#define DEFAULT_FORMAT_FILE     "log-format.txt"
#define DEFAULT_CHARSET_FILE    "log-charset.txt"
#define DEFAULT_MEMLOG          "robot.mem"

/* Default traversal parameters */
#define DEFAULT_PREFIX          ""
#define DEFAULT_IMG_PREFIX      ""
#define DEFAULT_DEPTH           0
#define DEFAULT_DELAY           50      /* Write delay in ms */

/* Memory logging is compiled out; change to "#if 1" to enable it */
#if 0
#define HT_MEMLOG               /* May be expensive in performance! */
#endif

/* #define SHOW_MSG             (WWWTRACE || HTAlert_interactive()) */

/*
** Both macros are true when `mr' is non-NULL and the corresponding
** quiet flag is NOT set, i.e. when a message should be printed.
*/
#define SHOW_QUIET(mr)          ((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr)     ((mr) && !((mr)->flags & MR_REAL_QUIET))

#define DEFAULT_TIMEOUT         10000   /* timeout in millis */

/* Only install the signal handler on SVR4 systems */
#ifdef __svr4__
#define CATCH_SIG
#endif
76:
/*
** Bit flags stored in Robot.flags. Each constant occupies its own bit so
** that they can be OR'ed together.
*/
typedef enum _MRFlags {
    MR_IMG              = 1 << 0,   /* NOTE(review): presumably "check images" - not used in this part of the file */
    MR_LINK             = 1 << 1,   /* Follow the links found in documents */
    MR_PREEMPTIVE       = 1 << 2,   /* Issue requests in preemptive mode */
    MR_TIME             = 1 << 3,   /* Print the termination time */
    MR_SAVE             = 1 << 4,   /* NOTE(review): presumably "save output" - not used in this part of the file */
    MR_QUIET            = 1 << 5,   /* Suppress progress messages */
    MR_REAL_QUIET       = 1 << 6,   /* Suppress summary messages as well */
    MR_VALIDATE         = 1 << 7,   /* Use cache validation on requests */
    MR_END_VALIDATE     = 1 << 8,   /* Use end-to-end cache validation */
    MR_KEEP_META        = 1 << 9,   /* Keep anchor metainformation after a request */
    MR_LOGGING          = 1 << 10,  /* Announce the raw log files on exit */
    MR_DISTRIBUTIONS    = 1 << 11   /* Announce the distribution logs on exit */
} MRFlags;
91:
92: typedef struct _Robot {
1.2 frystyk 93: int depth; /* How deep is our tree */
1.30 frystyk 94: int cnt; /* Count of requests */
1.2 frystyk 95: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 96: HTList * htext; /* List of our HText Objects */
1.34 eric 97: HTList * fingers;
1.59 frystyk 98:
1.40 frystyk 99: int timer;
1.1 frystyk 100: char * cwd; /* Current dir URL */
101: char * rules;
1.55 frystyk 102: char * prefix;
1.59 frystyk 103: char * img_prefix;
104:
1.60 frystyk 105: char * logfile; /* clf log */
1.55 frystyk 106: HTLog * log;
1.60 frystyk 107: char * reffile; /* referer log */
1.57 frystyk 108: HTLog * ref;
1.60 frystyk 109: char * rejectfile; /* unchecked links */
1.58 frystyk 110: HTLog * reject;
1.60 frystyk 111: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 112: HTLog * notfound;
1.60 frystyk 113: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 114: HTLog * conneg;
1.60 frystyk 115: char * noalttagfile; /* images without alt tags*/
116: HTLog * noalttag;
117:
118: char * hitfile; /* links sorted after hit counts */
1.63 ! frystyk 119: char * titlefile; /* links with titles */
1.60 frystyk 120: char * mtfile; /* media types encountered */
121: char * charsetfile; /* charsets encountered */
1.63 ! frystyk 122: char * lmfile; /* sortef after last modified dates */
1.60 frystyk 123:
124: char * outputfile;
1.1 frystyk 125: FILE * output;
1.59 frystyk 126:
1.1 frystyk 127: MRFlags flags;
1.55 frystyk 128:
1.59 frystyk 129: long get_bytes; /* Total number of bytes processed using GET*/
130: long get_docs; /* Total number of documents using GET */
131:
132: long head_bytes; /* bytes processed bytes processed using HEAD */
133: long head_docs; /* Total number of documents using HEAD*/
134:
135: long other_docs;
136:
1.56 frystyk 137: ms_t time; /* Time of run */
1.58 frystyk 138:
139: #ifdef HT_POSIX_REGEX
140: regex_t * include;
141: regex_t * exclude;
142: regex_t * check;
143: #endif
144:
1.1 frystyk 145: } Robot;
1.34 eric 146:
147: typedef struct _Finger {
148: Robot * robot;
149: HTRequest * request;
150: HTParentAnchor * dest;
151: } Finger;
152:
/* Load state recorded in a HyperDoc object */
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   =  0,
    L_ERROR     =  1
} LoadState;
159:
160: /*
161: ** The HyperDoc object is bound to the anchor and contains information about
162: ** where we are in the search for recursive searches
163: */
164: typedef struct _HyperDoc {
165: HTParentAnchor * anchor;
166: LoadState state;
167: int depth;
1.55 frystyk 168: int hits;
1.1 frystyk 169: } HyperDoc;
170:
171: /*
172: ** This is the HText object that is created every time we start parsing a
173: ** HTML object
174: */
1.4 frystyk 175: struct _HText {
1.1 frystyk 176: HTRequest * request;
1.4 frystyk 177: };
1.1 frystyk 178:
1.58 frystyk 179: /*
180: ** A structure for calculating metadata distributions
181: */
182: typedef struct _MetaDist {
183: HTAtom * name;
184: int hits;
185: } MetaDist;
186:
187: /*
188: ** Some sorting algorithms
189: */
1.63 ! frystyk 190: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 191:
1.1 frystyk 192: PUBLIC HText * HTMainText = NULL;
193: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
194: PUBLIC HTStyleSheet * styleSheet = NULL;
195:
196: /* ------------------------------------------------------------------------- */
197:
/*      Standard (non-error) Output
**      ---------------------------
**      printf-style output that always goes to stdout.
**      Returns the value reported by vfprintf().
*/
PUBLIC int OutputData(const char * fmt, ...)
{
    va_list args;
    int written;

    va_start(args, fmt);
    written = vfprintf(stdout, fmt, args);
    va_end(args);

    return written;
}
210:
211: /* ------------------------------------------------------------------------- */
212:
1.2 frystyk 213: /* Create a "HyperDoc" object
214: ** --------------------------
215: ** A HyperDoc object contains information about whether we have already
216: ** started checking the anchor and the depth in our search
217: */
218: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
219: {
220: HyperDoc * hd;
1.14 frystyk 221: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
222: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 223: hd->state = L_INVALID;
224: hd->depth = depth;
1.55 frystyk 225: hd->hits = 1;
1.2 frystyk 226:
227: /* Bind the HyperDoc object together with the Anchor Object */
228: hd->anchor = anchor;
229: HTAnchor_setDocument(anchor, (void *) hd);
230:
231: /* Add this HyperDoc object to our list */
232: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
233: HTList_addObject(mr->hyperdoc, (void *) hd);
234: return hd;
235: }
236:
237: /* Delete a "HyperDoc" object
238: ** --------------------------
239: */
240: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
241: {
242: if (hd) {
1.11 frystyk 243: HT_FREE (hd);
1.2 frystyk 244: return YES;
245: }
246: return NO;
247: }
248:
1.55 frystyk 249: /*
250: ** Sort the anchor array and log reference count
251: */
252: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
253: {
254: if (mr && array) {
255: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
256: if (log) {
257: void ** data = NULL;
258: HTParentAnchor * anchor = NULL;
259: HTArray_sort(array, HitSort);
260: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
261: while (anchor) {
262: char * uri = HTAnchor_address((HTAnchor *) anchor);
263: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 ! frystyk 264: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 265: HT_FREE(uri);
266: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
267: }
268: }
269: HTLog_close(log);
270: return YES;
271: }
272: return NO;
273: }
274:
275: PRIVATE int HitSort (const void * a, const void * b)
276: {
277: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
278: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
279: if (aa && bb) return (bb->hits - aa->hits);
280: return bb - aa;
281: }
282:
1.58 frystyk 283: /*
1.63 ! frystyk 284: ** Sort the anchor array and log last modified date
! 285: */
! 286: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
! 287: {
! 288: if (mr && array) {
! 289: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
! 290: if (log) {
! 291: void ** data = NULL;
! 292: HTParentAnchor * anchor = NULL;
! 293: HTArray_sort(array, LastModifiedSort);
! 294: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
! 295: while (anchor) {
! 296: char * uri = HTAnchor_address((HTAnchor *) anchor);
! 297: time_t lm = HTAnchor_lastModified(anchor);
! 298: if (uri && lm > 0)
! 299: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
! 300: HT_FREE(uri);
! 301: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
! 302: }
! 303: }
! 304: HTLog_close(log);
! 305: return YES;
! 306: }
! 307: return NO;
! 308: }
! 309:
! 310: PRIVATE int LastModifiedSort (const void * a, const void * b)
! 311: {
! 312: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
! 313: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
! 314: return bb - aa;
! 315: }
! 316:
! 317: /*
! 318: ** Sort the anchor array and log the document title
! 319: */
! 320: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
! 321: {
! 322: if (mr && array) {
! 323: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
! 324: if (log) {
! 325: void ** data = NULL;
! 326: HTParentAnchor * anchor = NULL;
! 327: HTArray_sort(array, TitleSort);
! 328: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
! 329: while (anchor) {
! 330: char * uri = HTAnchor_address((HTAnchor *) anchor);
! 331: const char * title = HTAnchor_title(anchor);
! 332: HTCharset charset = HTAnchor_charset(anchor);
! 333: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
! 334: charset ? HTAtom_name(charset) : "<none>",
! 335: title ? title : "<none>",
! 336: uri);
! 337: HT_FREE(uri);
! 338: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
! 339: }
! 340: }
! 341: HTLog_close(log);
! 342: return YES;
! 343: }
! 344: return NO;
! 345: }
! 346:
! 347: PRIVATE int TitleSort (const void * a, const void * b)
! 348: {
! 349: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
! 350: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
! 351: return strcasecomp(bb?bb:"", aa?aa:"");
! 352: }
! 353:
! 354: /*
1.58 frystyk 355: ** Calculate distributions for media types. The same mechanism
356: ** can be used for other characteristics with relatively
357: ** few outcomes.
358: */
359: PRIVATE HTList * mediatype_distribution (HTArray * array)
360: {
361: if (array) {
362: HTList * mt = HTList_new();
363: MetaDist * pres = NULL;
364: void ** data = NULL;
365: HTParentAnchor * anchor = NULL;
366: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
367: while (anchor) {
368: HTFormat format = HTAnchor_format(anchor);
369: if (format && format != WWW_UNKNOWN) {
370: HTList * cur = mt;
371:
372: /* If found then increase counter */
373: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
374: if (pres->name == format) {
375: pres->hits++;
376: break;
377: }
378: }
379:
380: /* If not found then add new format to list */
381: if (!pres) {
382: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
383: HT_OUTOFMEM("mediatype_distribution");
384: pres->name = format;
385: pres->hits = 1;
386: HTList_addObject(mt, pres);
387: HTList_insertionSort(mt, FormatSort);
388: }
389: }
390:
391: /* Find next anchor in array */
392: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
393: }
394: return mt;
395: }
396: return NULL;
397: }
398:
1.60 frystyk 399: /*
400: ** Calculate distributions for charsets. The same mechanism
401: ** can be used for other characteristics with relatively
402: ** few outcomes.
403: */
404: PRIVATE HTList * charset_distribution (HTArray * array)
405: {
406: if (array) {
407: HTList * cs = HTList_new();
408: MetaDist * pres = NULL;
409: void ** data = NULL;
410: HTParentAnchor * anchor = NULL;
411: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
412: while (anchor) {
413: HTCharset charset = HTAnchor_charset(anchor);
414: if (charset) {
415: HTList * cur = cs;
416:
417: /* If found then increase counter */
418: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
419: if (pres->name == charset) {
420: pres->hits++;
421: break;
422: }
423: }
424:
425: /* If not found then add new format to list */
426: if (!pres) {
427: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
428: HT_OUTOFMEM("charset_distribution");
429: pres->name = charset;
430: pres->hits = 1;
431: HTList_addObject(cs, pres);
432: HTList_insertionSort(cs, FormatSort);
433: }
434: }
435:
436: /* Find next anchor in array */
437: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
438: }
439: return cs;
440: }
441: return NULL;
442: }
443:
1.58 frystyk 444: PRIVATE int FormatSort (const void * a, const void * b)
445: {
446: MetaDist * aa = (MetaDist *) a;
447: MetaDist * bb = (MetaDist *) b;
448: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
449: }
450:
451: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
452: {
453: if (logfile && distribution) {
454: HTLog * log = HTLog_open(logfile, YES, YES);
455: if (log) {
456: HTList * cur = distribution;
457: MetaDist * pres;
458: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
459: if (pres->name) {
1.60 frystyk 460: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 461: }
462: }
463: HTLog_close(log);
464: }
465: }
466: return NO;
467: }
468:
469: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
470: {
471: if (distribution) {
472: HTList * cur = distribution;
473: MetaDist * pres;
474: while ((pres = (MetaDist *) HTList_nextObject(cur)))
475: HT_FREE(pres);
476: HTList_delete(distribution);
477: return YES;
478: }
479: return NO;
480: }
481:
482:
1.55 frystyk 483: /* Statistics
484: ** ----------
485: ** Calculates a bunch of statistics for the anchors traversed
486: */
487: PRIVATE BOOL calculate_statistics (Robot * mr)
488: {
1.59 frystyk 489: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 490: if (!mr) return NO;
491:
492: /* Calculate efficiency */
1.59 frystyk 493: if (mr->time > 0) {
1.56 frystyk 494: ms_t t = HTGetTimeInMillis() - mr->time;
495: if (t > 0) {
1.60 frystyk 496: double loadfactor = (mr->get_bytes / (t * 0.001));
497: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 498: double secs = t / 1000.0;
1.55 frystyk 499: char bytes[50];
1.62 frystyk 500: if (SHOW_REAL_QUIET(mr))
501: HTTrace("Accessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
502: total_docs, secs, reqprsec);
1.59 frystyk 503:
504: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 505: if (SHOW_REAL_QUIET(mr))
506: HTTrace("Did a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
507: mr->get_docs, bytes, loadfactor);
1.59 frystyk 508:
509: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 510: if (SHOW_REAL_QUIET(mr))
511: HTTrace("Did a HEAD on %ld document(s) with a total of %s bytes\n",
512: mr->head_docs, bytes);
1.55 frystyk 513: }
514: }
515:
516: /* Create an array of existing anchors */
1.59 frystyk 517: if (total_docs > 1) {
518: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 519: if (array) {
520:
1.63 ! frystyk 521: /* Distributions */
! 522: if (mr->flags & MR_DISTRIBUTIONS) {
! 523: if (SHOW_REAL_QUIET(mr)) HTTrace("Distributions:\n");
! 524: }
! 525:
1.55 frystyk 526: /* Sort after hit counts */
1.63 ! frystyk 527: if (mr->hitfile) {
! 528: if (SHOW_REAL_QUIET(mr))
! 529: HTTrace("Logged hit count distribution in file `%s\'\n",
! 530: mr->hitfile);
! 531: calculate_hits(mr, array);
! 532: }
! 533:
! 534: /* Sort after modified date */
! 535: if (mr->lmfile) {
! 536: if (SHOW_REAL_QUIET(mr))
! 537: HTTrace("Logged last modified distribution in file `%s\'\n",
! 538: mr->lmfile);
! 539: calculate_lm(mr, array);
! 540: }
! 541:
! 542: /* Sort after title */
! 543: if (mr->titlefile) {
! 544: if (SHOW_REAL_QUIET(mr))
! 545: HTTrace("Logged title distribution in file `%s\'\n",
! 546: mr->titlefile);
! 547: calculate_title(mr, array);
! 548: }
1.55 frystyk 549:
1.58 frystyk 550: /* Find mediatype distribution */
551: if (mr->mtfile) {
552: HTList * mtdist = mediatype_distribution(array);
553: if (mtdist) {
1.63 ! frystyk 554: if (SHOW_REAL_QUIET(mr))
! 555: HTTrace("Logged media type distribution in file `%s\'\n",
! 556: mr->mtfile);
1.58 frystyk 557: log_meta_distribution(mr->mtfile, mtdist);
558: delete_meta_distribution(mtdist);
559: }
560: }
1.55 frystyk 561:
1.60 frystyk 562: /* Find charset distribution */
563: if (mr->charsetfile) {
564: HTList * charsetdist = charset_distribution(array);
565: if (charsetdist) {
1.63 ! frystyk 566: if (SHOW_REAL_QUIET(mr))
! 567: HTTrace("Logged charset distribution in file `%s\'\n",
! 568: mr->charsetfile);
1.60 frystyk 569: log_meta_distribution(mr->charsetfile, charsetdist);
570: delete_meta_distribution(charsetdist);
571: }
572: }
573:
1.55 frystyk 574: /* Add as may other stats here as you like */
1.60 frystyk 575: /* ... */
1.58 frystyk 576:
577: /* Delete the array */
1.55 frystyk 578: HTArray_delete(array);
579: }
580: }
581: return YES;
582: }
583:
1.1 frystyk 584: /* Create a Command Line Object
585: ** ----------------------------
586: */
587: PRIVATE Robot * Robot_new (void)
588: {
589: Robot * me;
1.41 frystyk 590: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 591: HT_OUTOFMEM("Robot_new");
1.2 frystyk 592: me->hyperdoc = HTList_new();
1.4 frystyk 593: me->htext = HTList_new();
1.40 frystyk 594: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 595: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 596: me->output = OUTPUT;
1.35 eric 597: me->cnt = 0;
1.34 eric 598: me->fingers = HTList_new();
1.1 frystyk 599: return me;
600: }
601:
602: /* Delete a Command Line Object
603: ** ----------------------------
604: */
1.62 frystyk 605: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 606: {
1.62 frystyk 607: if (mr) {
608: HTList_delete(mr->fingers);
1.55 frystyk 609:
610: /* Calculate statistics */
1.62 frystyk 611: calculate_statistics(mr);
1.55 frystyk 612:
1.62 frystyk 613: if (mr->hyperdoc) {
614: HTList * cur = mr->hyperdoc;
1.2 frystyk 615: HyperDoc * pres;
616: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
617: HyperDoc_delete(pres);
1.62 frystyk 618: HTList_delete(mr->hyperdoc);
1.2 frystyk 619: }
1.62 frystyk 620: if (mr->htext) {
621: HTList * cur = mr->htext;
1.4 frystyk 622: HText * pres;
623: while ((pres = (HText *) HTList_nextObject(cur)))
624: HText_free(pres);
1.62 frystyk 625: HTList_delete(mr->htext);
1.4 frystyk 626: }
1.62 frystyk 627:
628: /* Close all the log files */
1.63 ! frystyk 629: if (mr->flags & MR_LOGGING) {
! 630: if (SHOW_REAL_QUIET(mr)) HTTrace("Raw Log files:\n");
! 631: }
! 632:
1.62 frystyk 633: if (mr->log) {
634: if (SHOW_REAL_QUIET(mr))
635: HTTrace("Logged %5d entries in general log file `%s\'\n",
636: HTLog_accessCount(mr->log), mr->logfile);
637: HTLog_close(mr->log);
638: }
639: if (mr->ref) {
640: if (SHOW_REAL_QUIET(mr))
641: HTTrace("Logged %5d entries in referer log file `%s\'\n",
642: HTLog_accessCount(mr->ref), mr->reffile);
643: HTLog_close(mr->ref);
644: }
645: if (mr->reject) {
646: if (SHOW_REAL_QUIET(mr))
647: HTTrace("Logged %5d entries in rejected log file `%s\'\n",
648: HTLog_accessCount(mr->reject), mr->rejectfile);
649: HTLog_close(mr->reject);
650: }
651: if (mr->notfound) {
652: if (SHOW_REAL_QUIET(mr))
653: HTTrace("Logged %5d entries in not found log file `%s\'\n",
654: HTLog_accessCount(mr->notfound), mr->notfoundfile);
655: HTLog_close(mr->notfound);
656: }
657: if (mr->conneg) {
658: if (SHOW_REAL_QUIET(mr))
659: HTTrace("Logged %5d entries in content negotiation log file `%s\'\n",
660: HTLog_accessCount(mr->conneg), mr->connegfile);
661: HTLog_close(mr->conneg);
662: }
663: if (mr->noalttag) {
664: if (SHOW_REAL_QUIET(mr))
665: HTTrace("Logged %5d entries in missing alt tag log file `%s\'\n",
666: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
667: HTLog_close(mr->noalttag);
668: }
669:
670: if (mr->output && mr->output != STDOUT) fclose(mr->output);
671:
672: if (mr->flags & MR_TIME) {
1.12 frystyk 673: time_t local = time(NULL);
1.62 frystyk 674: if (SHOW_REAL_QUIET(mr))
675: HTTrace("Robot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 676: }
1.55 frystyk 677:
1.58 frystyk 678: #ifdef HT_POSIX_REGEX
1.62 frystyk 679: if (mr->include) {
680: regfree(mr->include);
681: HT_FREE(mr->include);
682: }
683: if (mr->exclude) {
684: regfree(mr->exclude);
685: HT_FREE(mr->exclude);
686: }
687: if (mr->check) {
688: regfree(mr->check);
689: HT_FREE(mr->check);
1.58 frystyk 690: }
691: #endif
692:
1.62 frystyk 693: HT_FREE(mr->cwd);
694: HT_FREE(mr->prefix);
695: HT_FREE(mr->img_prefix);
696: HT_FREE(mr);
1.1 frystyk 697: return YES;
698: }
699: return NO;
700: }
701:
1.2 frystyk 702: /*
1.34 eric 703: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 704: */
1.34 eric 705: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 706: {
1.34 eric 707: Finger * me;
708: HTRequest * request = HTRequest_new();
709: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
710: HT_OUTOFMEM("Finger_new");
711: me->robot = robot;
712: me->request = request;
713: me->dest = dest;
714: HTList_addObject(robot->fingers, (void *)me);
715:
1.48 frystyk 716: /* Set the context for this request */
1.34 eric 717: HTRequest_setContext (request, me);
1.48 frystyk 718:
719: /* Check the various flags to customize the request */
720: if (robot->flags & MR_PREEMPTIVE)
721: HTRequest_setPreemptive(request, YES);
722: if (robot->flags & MR_VALIDATE)
723: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
724: if (robot->flags & MR_END_VALIDATE)
725: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
726:
727: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 728: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 729:
730: /* Set the method for this request */
1.34 eric 731: HTRequest_setMethod(request, method);
732: robot->cnt++;
733: return me;
1.2 frystyk 734: }
735:
1.34 eric 736: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 737: {
1.34 eric 738: HTList_removeObject(me->robot->fingers, (void *)me);
739: me->robot->cnt--;
1.37 frystyk 740:
741: /*
742: ** If we are down at one request then flush the output buffer
743: */
744: if (me->request) {
745: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 746: HTRequest_delete(me->request);
1.37 frystyk 747: }
748:
749: /*
750: ** Delete the request and free myself
751: */
1.34 eric 752: HT_FREE(me);
753: return YES;
1.2 frystyk 754: }
755:
756: /*
757: ** Cleanup and make sure we close all connections including the persistent
758: ** ones
759: */
1.1 frystyk 760: PRIVATE void Cleanup (Robot * me, int status)
761: {
762: Robot_delete(me);
1.29 eric 763: HTProfile_delete();
1.50 frystyk 764: #ifdef HT_MEMLOG
1.39 eric 765: HTMemLog_close();
1.47 frystyk 766: #endif
767:
1.1 frystyk 768: #ifdef VMS
769: exit(status ? status : 1);
770: #else
771: exit(status ? status : 0);
772: #endif
773: }
774:
#ifdef CATCH_SIG
#include <signal.h>
/*      SetSignal
**      This function sets up signal handlers. This might not be necessary
**      to call if the application has its own handlers (lossage on SVR4).
*/
PRIVATE void SetSignal (void)
{
    /*
    ** On some systems (SYSV) it is necessary to ignore the SIGPIPE signal
    ** when attempting to connect to a remote host where you normally
    ** should get `connection refused' back
    */
    BOOL failed = (signal(SIGPIPE, SIG_IGN) == SIG_ERR);

    if (failed) {
        if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    } else {
        if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
799:
1.58 frystyk 800: #ifdef HT_POSIX_REGEX
801: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
802: {
803: size_t length = regerror (errcode, compiled, NULL, 0);
804: char * str = NULL;
805: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
806: HT_OUTOFMEM("get_regerror");
807: (void) regerror (errcode, compiled, str, length);
808: return str;
809: }
810:
1.60 frystyk 811: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 812: {
813: regex_t * regex = NULL;
814: if (regex_str && *regex_str) {
815: int status;
816: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
817: HT_OUTOFMEM("get_regtype");
1.60 frystyk 818: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 819: char * err_msg = get_regerror(status, regex);
1.62 frystyk 820: if (SHOW_REAL_QUIET(mr))
821: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 822: HT_FREE(err_msg);
823: Cleanup(mr, -1);
824: }
825: }
826: return regex;
827: }
828: #endif
829:
1.1 frystyk 830: PRIVATE void VersionInfo (void)
831: {
1.62 frystyk 832: OutputData("W3C Sample Software\n\n");
833: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
834: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
835: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 836: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 837: }
838:
839: /* terminate_handler
840: ** -----------------
1.2 frystyk 841: ** This function is registered to handle the result of the request.
842: ** If no more requests are pending then terminate program
1.1 frystyk 843: */
1.32 frystyk 844: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
845: void * param, int status)
1.1 frystyk 846: {
1.34 eric 847: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 848: Robot * mr = finger->robot;
1.62 frystyk 849: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 850:
1.58 frystyk 851: /* Check if negotiated resource and whether we should log that*/
852: if (mr->conneg) {
853: HTAssocList * cur = HTResponse_variant(response);
854: if (cur) {
855: BOOL first = YES;
856: HTChunk * buffer = HTChunk_new(128);
857: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
858: HTAssoc * pres;
1.60 frystyk 859: HTChunk_puts(buffer, uri);
1.58 frystyk 860: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
861: char * value = HTAssoc_value(pres);
862: if (first) {
1.60 frystyk 863: HTChunk_puts(buffer, "\t(");
1.58 frystyk 864: first = NO;
865: } else
866: HTChunk_puts(buffer, ", ");
867:
868: /* Output the name */
869: HTChunk_puts(buffer, HTAssoc_name(pres));
870:
871: /* Only output the value if not empty string */
1.60 frystyk 872: if (value && *value) {
1.58 frystyk 873: HTChunk_puts(buffer, "=");
874: HTChunk_puts(buffer, value);
875: }
876: }
1.60 frystyk 877: if (!first) HTChunk_puts(buffer, ")");
878: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 879: HTChunk_delete(buffer);
880: HT_FREE(uri);
881: }
882: }
883:
1.55 frystyk 884: /* Count the amount of body data that we have read */
1.59 frystyk 885: if (HTRequest_method(request) == METHOD_GET) {
886: int length = HTAnchor_length(HTRequest_anchor(request));
887: if (length > 0) mr->get_bytes += length;
888: mr->get_docs++;
889: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 890: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 891: if (length > 0) mr->head_bytes += length;
892: mr->head_docs++;
893: } else {
894: mr->other_docs++;
1.55 frystyk 895: }
896:
1.58 frystyk 897: /* Cleanup the anchor so that we don't drown in metainformation */
898: if (!(mr->flags & MR_KEEP_META))
899: HTAnchor_clearHeader(HTRequest_anchor(request));
900:
1.55 frystyk 901: /* Delete this thread */
1.34 eric 902: Finger_delete(finger);
1.55 frystyk 903:
904: /* Should we stop? */
1.46 eric 905: if (mr->cnt <= 0) {
1.62 frystyk 906: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 907: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 908: }
1.62 frystyk 909: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 910: return HT_OK;
911: }
912:
913: /* ------------------------------------------------------------------------- */
914: /* HTEXT INTERFACE */
915: /* ------------------------------------------------------------------------- */
916:
917: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
918: HTStream * stream)
919: {
920: HText * me;
1.34 eric 921: Finger * finger = (Finger *) HTRequest_context(request);
922: Robot * mr = finger->robot;
1.14 frystyk 923: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
924: HT_OUTOFMEM("HText_new2");
1.4 frystyk 925:
926: /* Bind the HText object together with the Request Object */
1.1 frystyk 927: me->request = request;
1.4 frystyk 928:
929: /* Add this HyperDoc object to our list */
930: if (!mr->htext) mr->htext = HTList_new();
931: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 932: return me;
933: }
934:
1.4 frystyk 935: PUBLIC void HText_free (HText * me) {
1.11 frystyk 936: if (me) HT_FREE (me);
1.4 frystyk 937: }
938:
1.1 frystyk 939: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
940: {
941: if (text && anchor) {
1.34 eric 942: Finger * finger = (Finger *) HTRequest_context(text->request);
943: Robot * mr = finger->robot;
1.1 frystyk 944: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
945: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 946: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 947: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 948: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.58 frystyk 949: BOOL match = YES;
950: BOOL check = NO;
1.1 frystyk 951:
1.55 frystyk 952: if (!uri) return;
1.62 frystyk 953: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 954:
955: if (hd) {
1.62 frystyk 956: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 957: hd->hits++;
1.58 frystyk 958: HT_FREE(uri);
959: return;
960: }
961:
962: /* Check for prefix match */
963: if (mr->prefix) match = HTStrMatch(mr->prefix, uri) ? YES : NO;
964:
965: #ifdef HT_POSIX_REGEX
966: /* Check for any regular expression */
967: if (match && mr->include) {
968: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
969: }
970: if (match && mr->exclude) {
971: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
972: }
973: if (match && mr->check) {
974: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
975: }
976: #endif
977:
978: /* Test whether we already have a hyperdoc for this document */
979: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 980: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
981: HyperDoc * last_doc = HTAnchor_document(last_anchor);
982: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 983: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
984: HTRequest * newreq = newfinger->request;
1.2 frystyk 985: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 986: HTRequest_setParent(newreq, referer);
1.58 frystyk 987: if (check || depth >= mr->depth) {
1.62 frystyk 988: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 989: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 990: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 991: } else {
1.62 frystyk 992: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 993: }
994: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 995: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 996: Finger_delete(newfinger);
1.2 frystyk 997: }
1.7 frystyk 998: } else {
1.62 frystyk 999: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 1000: if (mr->reject) {
1001: if (referer) {
1002: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1003: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1004: HT_FREE(ref_addr);
1005: }
1006: }
1.2 frystyk 1007: }
1.11 frystyk 1008: HT_FREE(uri);
1.2 frystyk 1009: }
1010: }
1011:
1012: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1013: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1014: {
1015: if (text && anchor) {
1.34 eric 1016: Finger * finger = (Finger *) HTRequest_context(text->request);
1017: Robot * mr = finger->robot;
1.59 frystyk 1018: if (mr->flags & MR_IMG) {
1.60 frystyk 1019: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1020: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1021: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1022: HyperDoc * hd = HTAnchor_document(dest_parent);
1023: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1024: BOOL match = YES;
1025:
1026: if (hd) {
1.62 frystyk 1027: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1028: hd->hits++;
1.11 frystyk 1029: HT_FREE(uri);
1.59 frystyk 1030: return;
1.2 frystyk 1031: }
1.59 frystyk 1032:
1033: /* Check for prefix match */
1034: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1035:
1036: /* Test whether we already have a hyperdoc for this document */
1037: if (match && dest) {
1.60 frystyk 1038: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1039: mr->flags & MR_SAVE ?
1040: METHOD_GET : METHOD_HEAD);
1041: HTRequest * newreq = newfinger->request;
1.60 frystyk 1042: HyperDoc_new(mr, dest_parent, 1);
1043: HTRequest_setParent(newreq, referer);
1044:
1045: /* Check whether we should report missing ALT tags */
1046: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1047: if (referer) {
1048: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1049: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1050: HT_FREE(ref_addr);
1051: }
1052: }
1053:
1.62 frystyk 1054: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1055: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1056: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1057: Finger_delete(newfinger);
1058: }
1059: } else {
1.62 frystyk 1060: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 1061: if (mr->reject) {
1062: if (referer) {
1063: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1064: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1065: HT_FREE(ref_addr);
1066: }
1067: }
1.1 frystyk 1068: }
1.59 frystyk 1069: HT_FREE(uri);
1.1 frystyk 1070: }
1071: }
1072: }
1073:
1074: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1075: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1076: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1077: PUBLIC void HText_endAppend (HText * text) {}
1078: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1079: PUBLIC void HText_beginAppend (HText * text) {}
1080: PUBLIC void HText_appendParagraph (HText * text) {}
1081:
1.48 frystyk 1082: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1083: {
1084: return (vfprintf(stderr, fmt, pArgs));
1085: }
1086:
1.1 frystyk 1087: /* ------------------------------------------------------------------------- */
1088: /* MAIN PROGRAM */
1089: /* ------------------------------------------------------------------------- */
1090:
1091: int main (int argc, char ** argv)
1092: {
1.48 frystyk 1093: int status = 0;
1.1 frystyk 1094: int arg;
1.48 frystyk 1095: BOOL cache = NO; /* Use persistent cache */
1096: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1097: char * cache_root = NULL;
1.1 frystyk 1098: HTChunk * keywords = NULL; /* From command line */
1099: int keycnt = 0;
1.12 frystyk 1100: Robot * mr = NULL;
1.43 frystyk 1101: Finger * finger = NULL;
1102: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1103:
1104: /* Starts Mac GUSI socket library */
1105: #ifdef GUSI
1106: GUSISetup(GUSIwithSIOUXSockets);
1107: GUSISetup(GUSIwithInternetSockets);
1108: #endif
1109:
1110: #ifdef __MWERKS__ /* STR */
1111: InitGraf((Ptr) &qd.thePort);
1112: InitFonts();
1113: InitWindows();
1114: InitMenus(); TEInit();
1115: InitDialogs(nil);
1116: InitCursor();
1117: SIOUXSettings.asktosaveonclose = false;
1118: argc=ccommand(&argv);
1.50 frystyk 1119: #endif /* __MWERKS__ */
1.1 frystyk 1120:
1.50 frystyk 1121: #ifdef HT_MEMLOG
1.51 frystyk 1122: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1123: #endif
1.46 eric 1124:
1.27 frystyk 1125: /* Initiate W3C Reference Library with a robot profile */
1126: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1127: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1128:
1129: /* Add the default HTML parser to the set of converters */
1130: {
1131: HTList * converters = HTFormat_conversion();
1132: HTMLInit(converters);
1133: }
1.1 frystyk 1134:
1.12 frystyk 1135: /* Build a new robot object */
1136: mr = Robot_new();
1137:
1.1 frystyk 1138: /* Scan command Line for parameters */
1139: for (arg=1; arg<argc; arg++) {
1140: if (*argv[arg] == '-') {
1141:
1142: /* non-interactive */
1.17 frystyk 1143: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1144: HTAlert_setInteractive(NO);
1145:
1.62 frystyk 1146: /* help */
1147: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1148: VersionInfo();
1149: Cleanup(mr, 0);
1150:
1.63 ! frystyk 1151: /* clf log file */
1.1 frystyk 1152: } else if (!strcmp(argv[arg], "-l")) {
1153: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1154: argv[++arg] : DEFAULT_LOG_FILE;
1.63 ! frystyk 1155: mr->flags |= MR_LOGGING;
1.1 frystyk 1156:
1.63 ! frystyk 1157: /* referer log file */
1.58 frystyk 1158: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1159: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1160: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 ! frystyk 1161: mr->flags |= MR_LOGGING;
1.57 frystyk 1162:
1.58 frystyk 1163: /* Not found error log file */
1164: } else if (!strncmp(argv[arg], "-404", 4)) {
1165: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1166: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 ! frystyk 1167: mr->flags |= MR_LOGGING;
1.58 frystyk 1168:
1169: /* reject log file */
1170: } else if (!strncmp(argv[arg], "-rej", 4)) {
1171: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1172: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 ! frystyk 1173: mr->flags |= MR_LOGGING;
1.58 frystyk 1174:
1.63 ! frystyk 1175: /* no alt tags log file */
! 1176: } else if (!strncmp(argv[arg], "-alt", 4)) {
! 1177: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1178: argv[++arg] : DEFAULT_NOALTTAG_FILE;
! 1179: mr->flags |= MR_LOGGING;
! 1180:
! 1181: /* negotiated resource log file */
1.58 frystyk 1182: } else if (!strncmp(argv[arg], "-neg", 4)) {
1183: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1184: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 ! frystyk 1185: mr->flags |= MR_LOGGING;
! 1186:
! 1187: /* hit file log */
! 1188: } else if (!strcmp(argv[arg], "-hit")) {
! 1189: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1190: argv[++arg] : DEFAULT_HIT_FILE;
! 1191: mr->flags |= MR_DISTRIBUTIONS;
! 1192:
! 1193: /* last modified log file */
! 1194: } else if (!strcmp(argv[arg], "-lm")) {
! 1195: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1196: argv[++arg] : DEFAULT_LM_FILE;
! 1197: mr->flags |= MR_DISTRIBUTIONS;
! 1198:
! 1199: /* title log file */
! 1200: } else if (!strcmp(argv[arg], "-title")) {
! 1201: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1202: argv[++arg] : DEFAULT_TITLE_FILE;
! 1203: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1204:
1205: /* mediatype distribution log file */
1206: } else if (!strncmp(argv[arg], "-for", 4)) {
1207: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1208: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 ! frystyk 1209: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1210:
1.60 frystyk 1211: /* charset distribution log file */
1212: } else if (!strncmp(argv[arg], "-char", 5)) {
1213: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1214: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 ! frystyk 1215: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1216:
1.55 frystyk 1217: /* rule file */
1.1 frystyk 1218: } else if (!strcmp(argv[arg], "-r")) {
1219: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1220: argv[++arg] : DEFAULT_RULE_FILE;
1221:
1222: /* output filename */
1223: } else if (!strcmp(argv[arg], "-o")) {
1224: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1225: argv[++arg] : DEFAULT_OUTPUT_FILE;
1226:
1.55 frystyk 1227: /* URI prefix */
1228: } else if (!strcmp(argv[arg], "-prefix")) {
1229: char * prefix = NULL;
1230: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1231: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1232: if (*prefix && *prefix != '*') {
1.55 frystyk 1233: StrAllocCopy(mr->prefix, prefix);
1234: StrAllocCat(mr->prefix, "*");
1235: }
1236:
1.1 frystyk 1237: /* timeout -- Change the default request timeout */
1238: } else if (!strcmp(argv[arg], "-timeout")) {
1239: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1240: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1241: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1242:
1.54 frystyk 1243: /* Force no pipelined requests */
1244: } else if (!strcmp(argv[arg], "-nopipe")) {
1245: HTTP_setConnectionMode(HTTP_NO_PIPELINING);
1246:
1.48 frystyk 1247: /* Start the persistent cache */
1248: } else if (!strcmp(argv[arg], "-cache")) {
1249: cache = YES;
1250:
1.54 frystyk 1251: /* Determine the cache root */
1252: } else if (!strcmp(argv[arg], "-cacheroot")) {
1253: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1254: argv[++arg] : NULL;
1.51 frystyk 1255:
1.52 frystyk 1256: /* Stream write flush delay in ms */
1257: } else if (!strcmp(argv[arg], "-delay")) {
1258: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1259: atoi(argv[++arg]) : DEFAULT_DELAY;
1260: HTHost_setDefaultWriteDelay(delay);
1261:
1.48 frystyk 1262: /* Persistent cache flush */
1263: } else if (!strcmp(argv[arg], "-flush")) {
1264: flush = YES;
1265:
1266: /* Do a cache validation */
1267: } else if (!strcmp(argv[arg], "-validate")) {
1268: mr->flags |= MR_VALIDATE;
1269:
1270: /* Do an end-to-end cache-validation */
1271: } else if (!strcmp(argv[arg], "-endvalidate")) {
1272: mr->flags |= MR_END_VALIDATE;
1273:
1.7 frystyk 1274: /* preemptive or non-preemptive access */
1.1 frystyk 1275: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1276: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1277:
1278: /* test inlined images */
1279: } else if (!strcmp(argv[arg], "-img")) {
1280: mr->flags |= MR_IMG;
1.45 frystyk 1281:
1282: /* load inlined images */
1283: } else if (!strcmp(argv[arg], "-saveimg")) {
1284: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1285:
1286: /* URI prefix for inlined images */
1287: } else if (!strcmp(argv[arg], "-imgprefix")) {
1288: char * prefix = NULL;
1289: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1290: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1291: if (*prefix && *prefix!='*') {
1.59 frystyk 1292: StrAllocCopy(mr->img_prefix, prefix);
1293: StrAllocCat(mr->img_prefix, "*");
1294: }
1.2 frystyk 1295:
1296: /* load anchors */
1.58 frystyk 1297: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1298: mr->flags |= MR_LINK;
1.7 frystyk 1299: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1300: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1301:
1.12 frystyk 1302: /* Output start and end time */
1303: } else if (!strcmp(argv[arg], "-ss")) {
1304: mr->flags |= MR_TIME;
1305:
1.1 frystyk 1306: /* print version and exit */
1307: } else if (!strcmp(argv[arg], "-version")) {
1308: VersionInfo();
1309: Cleanup(mr, 0);
1.46 eric 1310:
1311: /* run in quiet mode */
1312: } else if (!strcmp(argv[arg], "-q")) {
1313: mr->flags |= MR_QUIET;
1.1 frystyk 1314:
1.62 frystyk 1315: /* run in really quiet mode */
1316: } else if (!strcmp(argv[arg], "-Q")) {
1317: mr->flags |= MR_REAL_QUIET;
1318:
1.1 frystyk 1319: #ifdef WWWTRACE
1320: /* trace flags */
1321: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1322: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1323: #endif
1324:
1.58 frystyk 1325: #ifdef HT_POSIX_REGEX
1326:
1327: /* If we can link against a POSIX regex library */
1328: } else if (!strncmp(argv[arg], "-inc", 4)) {
1329: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1330: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1331: }
1332: } else if (!strncmp(argv[arg], "-exc", 4)) {
1333: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1334: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1335: }
1336: } else if (!strncmp(argv[arg], "-check", 6)) {
1337: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1338: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1339: }
1340: #endif
1341:
1.1 frystyk 1342: } else {
1.62 frystyk 1343: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1344: }
1.17 frystyk 1345: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1346: if (!keycnt) {
1347: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1348: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1349: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1350: keycnt = 1;
1.11 frystyk 1351: HT_FREE(ref);
1.1 frystyk 1352: } else { /* Check for successive keyword arguments */
1353: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1354: if (keycnt++ <= 1)
1.5 frystyk 1355: keywords = HTChunk_new(128);
1.1 frystyk 1356: else
1.5 frystyk 1357: HTChunk_putc(keywords, ' ');
1358: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1359: HT_FREE(escaped);
1.1 frystyk 1360: }
1361: }
1362: }
1363:
1364: #ifdef CATCH_SIG
1365: SetSignal();
1366: #endif
1367:
1368: if (!keycnt) {
1.62 frystyk 1369: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1370: Cleanup(mr, -1);
1371: }
1372:
1373: if (mr->depth != DEFAULT_DEPTH &&
1374: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1375: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1376: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1377: mr->depth);
1.1 frystyk 1378: Cleanup(mr, -1);
1379: }
1380:
1.23 manoli 1381: /* Testing that HTTrace is working */
1.62 frystyk 1382: if (mr->flags & MR_TIME) {
1383: if (SHOW_REAL_QUIET(mr)) {
1384: time_t local = time(NULL);
1385: HTTrace("Welcome to the W3C mini Robot - started on %s\n",
1386: HTDateTimeStr(&local, YES));
1387: }
1388: }
1.23 manoli 1389:
1.1 frystyk 1390: /* Rule file specified? */
1391: if (mr->rules) {
1392: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1393: if (!HTLoadRules(rules))
1.62 frystyk 1394: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1395: HT_FREE(rules);
1.1 frystyk 1396: }
1397:
1398: /* Output file specified? */
1399: if (mr->outputfile) {
1400: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1401: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1402: mr->output = OUTPUT;
1403: }
1404: }
1405:
1.48 frystyk 1406: /* Should we use persistent cache? */
1407: if (cache) {
1.54 frystyk 1408: HTCacheInit(cache_root, 20);
1.49 frystyk 1409: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1410: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1411: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1412:
1413: /* Should we start by flushing? */
1414: if (flush) HTCache_flushAll();
1415: }
1416:
1.58 frystyk 1417: /* CLF Log file specified? */
1.55 frystyk 1418: if (mr->logfile) {
1419: mr->log = HTLog_open(mr->logfile, YES, YES);
1420: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1421: }
1422:
1.58 frystyk 1423: /* Referer Log file specified? */
1.57 frystyk 1424: if (mr->reffile) {
1425: mr->ref = HTLog_open(mr->reffile, YES, YES);
1426: if (mr->ref)
1427: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1428: }
1.1 frystyk 1429:
1.58 frystyk 1430: /* Not found error log specified? */
1431: if (mr->notfoundfile) {
1432: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1433: if (mr->notfound)
1434: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1435: }
1436:
1437: /* Negotiated resource log specified? */
1438: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1439:
1440: /* No alt tags log file specified? */
1441: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1442:
1443: /* Reject Log file specified? */
1444: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1445:
1446: /* Register our own terminate filter */
1.32 frystyk 1447: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1448:
1449: /* Setting event timeout */
1450: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1451:
1.56 frystyk 1452: mr->time = HTGetTimeInMillis();
1.37 frystyk 1453:
1.34 eric 1454: /* Start the request */
1455: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1456:
1457: /*
1458: ** Make sure that the first request is flushed immediately and not
1459: ** buffered in the output buffer
1460: */
1461: HTRequest_setFlush(finger->request, YES);
1462:
1463: /*
1.48 frystyk 1464: ** Check whether we should do some kind of cache validation on
1465: ** the load
1466: */
1467: if (mr->flags & MR_VALIDATE)
1468: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1469: if (mr->flags & MR_END_VALIDATE)
1470: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1471:
1472: /*
1.43 frystyk 1473: ** Now do the load
1474: */
1.34 eric 1475: if (mr->flags & MR_PREEMPTIVE)
1476: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1477:
1478: if (keywords) /* Search */
1.34 eric 1479: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1480: else
1.34 eric 1481: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1482:
1.5 frystyk 1483: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1484: if (status != YES) {
1.62 frystyk 1485: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1486: Cleanup(mr, -1);
1487: }
1488:
1489: /* Go into the event loop... */
1.34 eric 1490: HTEventList_loop(finger->request);
1.1 frystyk 1491:
1492: /* Only gets here if event loop fails */
1493: Cleanup(mr, 0);
1494: return 0;
1495: }
Webmaster