Annotation of libwww/Robot/src/HTRobot.c, revision 1.64
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
1.58 frystyk 25: #ifdef HT_POSIX_REGEX
1.64 ! frystyk 26: #ifdef HAVE_RXPOSIX_H
! 27: #include <rxposix.h>
! 28: #else
1.62 frystyk 29: #ifdef HAVE_REGEX_H
30: #include <regex.h>
31: #endif
32: #endif
1.60 frystyk 33: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 34: #endif
35:
/*
**  Application identity. W3C_VERSION may be supplied by the build; when it
**  is not, a placeholder string is used.
*/
#ifndef W3C_VERSION
#define W3C_VERSION		"Unspecified"
#endif

#define APP_NAME		"W3CRobot"
#define APP_VERSION		W3C_VERSION
#define COMMAND_LINE		"http://www.w3.org/Robot/User/CommandLine"

/*
**  Default file names for the output, rule, and log files
*/
#define DEFAULT_OUTPUT_FILE	"robot.out"
#define DEFAULT_RULE_FILE	"robot.conf"
#define DEFAULT_LOG_FILE	"log-clf.txt"
#define DEFAULT_HIT_FILE	"log-hit.txt"
#define DEFAULT_REL_FILE	"log-rel.txt"
#define DEFAULT_LM_FILE		"log-lastmodified.txt"
#define DEFAULT_TITLE_FILE	"log-title.txt"
#define DEFAULT_REFERER_FILE	"log-referer.txt"
#define DEFAULT_REJECT_FILE	"log-reject.txt"
#define DEFAULT_NOTFOUND_FILE	"log-notfound.txt"
#define DEFAULT_CONNEG_FILE	"log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE	"log-alt.txt"
#define DEFAULT_FORMAT_FILE	"log-format.txt"
#define DEFAULT_CHARSET_FILE	"log-charset.txt"
#define DEFAULT_MEMLOG		"robot.mem"

/*
**  Other defaults
*/
#define DEFAULT_PREFIX		""
#define DEFAULT_IMG_PREFIX	""
#define DEFAULT_DEPTH		0
#define DEFAULT_DELAY		50		/* Write delay in ms */

/* Memory logging is compiled out; flip the #if to enable it */
#if 0
#define HT_MEMLOG		/* May be expensive in performance! */
#endif

/*
**  Trace gating: both macros are true only for a non-NULL robot whose
**  corresponding "quiet" flag is NOT set.
*/
/* #define SHOW_MSG		(WWWTRACE || HTAlert_interactive()) */
#define SHOW_QUIET(mr)		((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr)	((mr) && !((mr)->flags & MR_REAL_QUIET))

#define DEFAULT_TIMEOUT		10000		/* timeout in millis */

/* Signal handling (see SetSignal) is only compiled in on SVR4 */
#if defined(__svr4__)
#define CATCH_SIG
#endif
77:
/*
**  Bit flags stored in Robot.flags. Each constant occupies its own bit so
**  they can be OR'ed together. MR_IMG, MR_LINK, and MR_SAVE are not
**  referenced in this part of the file.
*/
typedef enum _MRFlags {
    MR_IMG		= 1 << 0,
    MR_LINK		= 1 << 1,
    MR_PREEMPTIVE	= 1 << 2,	/* Requests are made preemptive */
    MR_TIME		= 1 << 3,	/* Trace the termination time */
    MR_SAVE		= 1 << 4,
    MR_QUIET		= 1 << 5,	/* Tested by SHOW_QUIET() */
    MR_REAL_QUIET	= 1 << 6,	/* Tested by SHOW_REAL_QUIET() */
    MR_VALIDATE		= 1 << 7,	/* Reload mode HT_CACHE_VALIDATE */
    MR_END_VALIDATE	= 1 << 8,	/* Reload mode HT_CACHE_END_VALIDATE */
    MR_KEEP_META	= 1 << 9,	/* Don't clear anchor headers */
    MR_LOGGING		= 1 << 10,	/* Trace the raw log file heading */
    MR_DISTRIBUTIONS	= 1 << 11	/* Trace the distributions heading */
} MRFlags;
92:
93: typedef struct _Robot {
1.2 frystyk 94: int depth; /* How deep is our tree */
1.30 frystyk 95: int cnt; /* Count of requests */
1.2 frystyk 96: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 97: HTList * htext; /* List of our HText Objects */
1.34 eric 98: HTList * fingers;
1.59 frystyk 99:
1.40 frystyk 100: int timer;
1.1 frystyk 101: char * cwd; /* Current dir URL */
102: char * rules;
1.55 frystyk 103: char * prefix;
1.59 frystyk 104: char * img_prefix;
105:
1.60 frystyk 106: char * logfile; /* clf log */
1.55 frystyk 107: HTLog * log;
1.60 frystyk 108: char * reffile; /* referer log */
1.57 frystyk 109: HTLog * ref;
1.60 frystyk 110: char * rejectfile; /* unchecked links */
1.58 frystyk 111: HTLog * reject;
1.60 frystyk 112: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 113: HTLog * notfound;
1.60 frystyk 114: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 115: HTLog * conneg;
1.60 frystyk 116: char * noalttagfile; /* images without alt tags*/
117: HTLog * noalttag;
118:
119: char * hitfile; /* links sorted after hit counts */
1.64 ! frystyk 120: char * relfile; /* link sorted after relationships */
! 121: HTLinkType relation; /* Specific relation to look for */
1.63 frystyk 122: char * titlefile; /* links with titles */
1.60 frystyk 123: char * mtfile; /* media types encountered */
124: char * charsetfile; /* charsets encountered */
1.63 frystyk 125: char * lmfile; /* sortef after last modified dates */
1.60 frystyk 126:
127: char * outputfile;
1.1 frystyk 128: FILE * output;
1.59 frystyk 129:
1.1 frystyk 130: MRFlags flags;
1.55 frystyk 131:
1.59 frystyk 132: long get_bytes; /* Total number of bytes processed using GET*/
133: long get_docs; /* Total number of documents using GET */
134:
135: long head_bytes; /* bytes processed bytes processed using HEAD */
136: long head_docs; /* Total number of documents using HEAD*/
137:
138: long other_docs;
139:
1.56 frystyk 140: ms_t time; /* Time of run */
1.58 frystyk 141:
142: #ifdef HT_POSIX_REGEX
143: regex_t * include;
144: regex_t * exclude;
145: regex_t * check;
146: #endif
147:
1.1 frystyk 148: } Robot;
1.34 eric 149:
150: typedef struct _Finger {
151: Robot * robot;
152: HTRequest * request;
153: HTParentAnchor * dest;
154: } Finger;
155:
/*
**  Load state recorded in a HyperDoc. New HyperDoc objects start out as
**  L_INVALID.
*/
typedef enum _LoadState {
    L_INVALID	= -2,
    L_LOADING	= -1,
    L_SUCCESS	=  0,
    L_ERROR	=  1
} LoadState;
162:
163: /*
164: ** The HyperDoc object is bound to the anchor and contains information about
165: ** where we are in the search for recursive searches
166: */
167: typedef struct _HyperDoc {
168: HTParentAnchor * anchor;
169: LoadState state;
170: int depth;
1.55 frystyk 171: int hits;
1.1 frystyk 172: } HyperDoc;
173:
174: /*
175: ** This is the HText object that is created every time we start parsing a
176: ** HTML object
177: */
1.4 frystyk 178: struct _HText {
1.1 frystyk 179: HTRequest * request;
1.4 frystyk 180: };
1.1 frystyk 181:
1.58 frystyk 182: /*
183: ** A structure for calculating metadata distributions
184: */
185: typedef struct _MetaDist {
186: HTAtom * name;
187: int hits;
188: } MetaDist;
189:
190: /*
191: ** Some sorting algorithms
192: */
1.63 frystyk 193: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 194:
1.1 frystyk 195: PUBLIC HText * HTMainText = NULL;
196: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
197: PUBLIC HTStyleSheet * styleSheet = NULL;
198:
199: /* ------------------------------------------------------------------------- */
200:
1.13 eric 201: /* Standard (non-error) Output
202: ** ---------------------------
203: */
204: PUBLIC int OutputData(const char * fmt, ...)
205: {
206: int ret;
207: va_list pArgs;
208: va_start(pArgs, fmt);
209: ret = vfprintf(stdout, fmt, pArgs);
210: va_end(pArgs);
211: return ret;
212: }
213:
214: /* ------------------------------------------------------------------------- */
215:
1.2 frystyk 216: /* Create a "HyperDoc" object
217: ** --------------------------
218: ** A HyperDoc object contains information about whether we have already
219: ** started checking the anchor and the depth in our search
220: */
221: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
222: {
223: HyperDoc * hd;
1.14 frystyk 224: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
225: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 226: hd->state = L_INVALID;
227: hd->depth = depth;
1.55 frystyk 228: hd->hits = 1;
1.2 frystyk 229:
230: /* Bind the HyperDoc object together with the Anchor Object */
231: hd->anchor = anchor;
232: HTAnchor_setDocument(anchor, (void *) hd);
233:
234: /* Add this HyperDoc object to our list */
235: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
236: HTList_addObject(mr->hyperdoc, (void *) hd);
237: return hd;
238: }
239:
240: /* Delete a "HyperDoc" object
241: ** --------------------------
242: */
243: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
244: {
245: if (hd) {
1.11 frystyk 246: HT_FREE (hd);
1.2 frystyk 247: return YES;
248: }
249: return NO;
250: }
251:
1.55 frystyk 252: /*
253: ** Sort the anchor array and log reference count
254: */
255: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
256: {
257: if (mr && array) {
258: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
259: if (log) {
260: void ** data = NULL;
261: HTParentAnchor * anchor = NULL;
262: HTArray_sort(array, HitSort);
263: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
264: while (anchor) {
265: char * uri = HTAnchor_address((HTAnchor *) anchor);
266: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 267: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 268: HT_FREE(uri);
269: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
270: }
271: }
272: HTLog_close(log);
273: return YES;
274: }
275: return NO;
276: }
277:
278: PRIVATE int HitSort (const void * a, const void * b)
279: {
280: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
281: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
282: if (aa && bb) return (bb->hits - aa->hits);
283: return bb - aa;
284: }
285:
1.58 frystyk 286: /*
1.64 ! frystyk 287: ** Sort the anchor array and log link relations
! 288: */
! 289: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
! 290: {
! 291: if (mr && array) {
! 292: HTLog * log = HTLog_open(mr->relfile, YES, YES);
! 293: if (log) {
! 294: void ** data = NULL;
! 295: HTParentAnchor * anchor = NULL;
! 296: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
! 297: while (anchor) {
! 298: char * uri = HTAnchor_address((HTAnchor *) anchor);
! 299: if (uri) {
! 300: /*
! 301: ** If we have a specific relation to look for then use that.
! 302: */
! 303: if (mr->relation) {
! 304: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor,
! 305: mr->relation);
! 306: if (link) {
! 307: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
! 308: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
! 309: HTFormat format = HTAnchor_format(dest);
! 310: if (dest_uri) {
! 311: HTLog_addText(log, "%s %s %s --> %s\n",
! 312: HTAtom_name(mr->relation),
! 313: format != WWW_UNKNOWN ?
! 314: HTAtom_name(format) : "<unknown>",
! 315: uri, dest_uri);
! 316: HT_FREE(dest_uri);
! 317: }
! 318: }
! 319: }
! 320: HT_FREE(uri);
! 321: }
! 322: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
! 323: }
! 324: }
! 325: HTLog_close(log);
! 326: return YES;
! 327: }
! 328: return NO;
! 329: }
! 330:
! 331: /*
1.63 frystyk 332: ** Sort the anchor array and log last modified date
333: */
334: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
335: {
336: if (mr && array) {
337: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
338: if (log) {
339: void ** data = NULL;
340: HTParentAnchor * anchor = NULL;
341: HTArray_sort(array, LastModifiedSort);
342: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
343: while (anchor) {
344: char * uri = HTAnchor_address((HTAnchor *) anchor);
345: time_t lm = HTAnchor_lastModified(anchor);
346: if (uri && lm > 0)
347: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
348: HT_FREE(uri);
349: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
350: }
351: }
352: HTLog_close(log);
353: return YES;
354: }
355: return NO;
356: }
357:
358: PRIVATE int LastModifiedSort (const void * a, const void * b)
359: {
360: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
361: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
362: return bb - aa;
363: }
364:
365: /*
366: ** Sort the anchor array and log the document title
367: */
368: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
369: {
370: if (mr && array) {
371: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
372: if (log) {
373: void ** data = NULL;
374: HTParentAnchor * anchor = NULL;
375: HTArray_sort(array, TitleSort);
376: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
377: while (anchor) {
378: char * uri = HTAnchor_address((HTAnchor *) anchor);
379: const char * title = HTAnchor_title(anchor);
380: HTCharset charset = HTAnchor_charset(anchor);
381: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
382: charset ? HTAtom_name(charset) : "<none>",
383: title ? title : "<none>",
384: uri);
385: HT_FREE(uri);
386: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
387: }
388: }
389: HTLog_close(log);
390: return YES;
391: }
392: return NO;
393: }
394:
395: PRIVATE int TitleSort (const void * a, const void * b)
396: {
397: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
398: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
399: return strcasecomp(bb?bb:"", aa?aa:"");
400: }
401:
402: /*
1.58 frystyk 403: ** Calculate distributions for media types. The same mechanism
404: ** can be used for other characteristics with relatively
405: ** few outcomes.
406: */
407: PRIVATE HTList * mediatype_distribution (HTArray * array)
408: {
409: if (array) {
410: HTList * mt = HTList_new();
411: MetaDist * pres = NULL;
412: void ** data = NULL;
413: HTParentAnchor * anchor = NULL;
414: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
415: while (anchor) {
416: HTFormat format = HTAnchor_format(anchor);
417: if (format && format != WWW_UNKNOWN) {
418: HTList * cur = mt;
419:
420: /* If found then increase counter */
421: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
422: if (pres->name == format) {
423: pres->hits++;
424: break;
425: }
426: }
427:
428: /* If not found then add new format to list */
429: if (!pres) {
430: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
431: HT_OUTOFMEM("mediatype_distribution");
432: pres->name = format;
433: pres->hits = 1;
434: HTList_addObject(mt, pres);
435: HTList_insertionSort(mt, FormatSort);
436: }
437: }
438:
439: /* Find next anchor in array */
440: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
441: }
442: return mt;
443: }
444: return NULL;
445: }
446:
1.60 frystyk 447: /*
448: ** Calculate distributions for charsets. The same mechanism
449: ** can be used for other characteristics with relatively
450: ** few outcomes.
451: */
452: PRIVATE HTList * charset_distribution (HTArray * array)
453: {
454: if (array) {
455: HTList * cs = HTList_new();
456: MetaDist * pres = NULL;
457: void ** data = NULL;
458: HTParentAnchor * anchor = NULL;
459: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
460: while (anchor) {
461: HTCharset charset = HTAnchor_charset(anchor);
462: if (charset) {
463: HTList * cur = cs;
464:
465: /* If found then increase counter */
466: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
467: if (pres->name == charset) {
468: pres->hits++;
469: break;
470: }
471: }
472:
473: /* If not found then add new format to list */
474: if (!pres) {
475: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
476: HT_OUTOFMEM("charset_distribution");
477: pres->name = charset;
478: pres->hits = 1;
479: HTList_addObject(cs, pres);
480: HTList_insertionSort(cs, FormatSort);
481: }
482: }
483:
484: /* Find next anchor in array */
485: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
486: }
487: return cs;
488: }
489: return NULL;
490: }
491:
1.58 frystyk 492: PRIVATE int FormatSort (const void * a, const void * b)
493: {
494: MetaDist * aa = (MetaDist *) a;
495: MetaDist * bb = (MetaDist *) b;
496: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
497: }
498:
499: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
500: {
501: if (logfile && distribution) {
502: HTLog * log = HTLog_open(logfile, YES, YES);
503: if (log) {
504: HTList * cur = distribution;
505: MetaDist * pres;
506: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
507: if (pres->name) {
1.60 frystyk 508: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 509: }
510: }
511: HTLog_close(log);
512: }
513: }
514: return NO;
515: }
516:
517: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
518: {
519: if (distribution) {
520: HTList * cur = distribution;
521: MetaDist * pres;
522: while ((pres = (MetaDist *) HTList_nextObject(cur)))
523: HT_FREE(pres);
524: HTList_delete(distribution);
525: return YES;
526: }
527: return NO;
528: }
529:
530:
1.55 frystyk 531: /* Statistics
532: ** ----------
533: ** Calculates a bunch of statistics for the anchors traversed
534: */
535: PRIVATE BOOL calculate_statistics (Robot * mr)
536: {
1.59 frystyk 537: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 538: if (!mr) return NO;
539:
540: /* Calculate efficiency */
1.59 frystyk 541: if (mr->time > 0) {
1.56 frystyk 542: ms_t t = HTGetTimeInMillis() - mr->time;
543: if (t > 0) {
1.60 frystyk 544: double loadfactor = (mr->get_bytes / (t * 0.001));
545: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 546: double secs = t / 1000.0;
1.55 frystyk 547: char bytes[50];
1.62 frystyk 548: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 549: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 550: total_docs, secs, reqprsec);
1.59 frystyk 551:
552: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 553: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 554: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 555: mr->get_docs, bytes, loadfactor);
1.59 frystyk 556:
557: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 558: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 559: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 560: mr->head_docs, bytes);
1.55 frystyk 561: }
562: }
563:
564: /* Create an array of existing anchors */
1.59 frystyk 565: if (total_docs > 1) {
566: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 567: if (array) {
568:
1.63 frystyk 569: /* Distributions */
570: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 ! frystyk 571: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 572: }
573:
1.55 frystyk 574: /* Sort after hit counts */
1.63 frystyk 575: if (mr->hitfile) {
576: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 577: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 578: mr->hitfile);
579: calculate_hits(mr, array);
580: }
581:
1.64 ! frystyk 582: /* Sort after link relations */
! 583: if (mr->relfile) {
! 584: if (SHOW_REAL_QUIET(mr))
! 585: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
! 586: mr->relfile);
! 587: calculate_linkRelations(mr, array);
! 588: }
! 589:
1.63 frystyk 590: /* Sort after modified date */
591: if (mr->lmfile) {
592: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 593: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 594: mr->lmfile);
595: calculate_lm(mr, array);
596: }
597:
598: /* Sort after title */
599: if (mr->titlefile) {
600: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 601: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 602: mr->titlefile);
603: calculate_title(mr, array);
604: }
1.55 frystyk 605:
1.58 frystyk 606: /* Find mediatype distribution */
607: if (mr->mtfile) {
608: HTList * mtdist = mediatype_distribution(array);
609: if (mtdist) {
1.63 frystyk 610: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 611: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 612: mr->mtfile);
1.58 frystyk 613: log_meta_distribution(mr->mtfile, mtdist);
614: delete_meta_distribution(mtdist);
615: }
616: }
1.55 frystyk 617:
1.60 frystyk 618: /* Find charset distribution */
619: if (mr->charsetfile) {
620: HTList * charsetdist = charset_distribution(array);
621: if (charsetdist) {
1.63 frystyk 622: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 623: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 624: mr->charsetfile);
1.60 frystyk 625: log_meta_distribution(mr->charsetfile, charsetdist);
626: delete_meta_distribution(charsetdist);
627: }
628: }
629:
1.55 frystyk 630: /* Add as may other stats here as you like */
1.60 frystyk 631: /* ... */
1.58 frystyk 632:
633: /* Delete the array */
1.55 frystyk 634: HTArray_delete(array);
635: }
636: }
637: return YES;
638: }
639:
1.1 frystyk 640: /* Create a Command Line Object
641: ** ----------------------------
642: */
643: PRIVATE Robot * Robot_new (void)
644: {
645: Robot * me;
1.41 frystyk 646: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 647: HT_OUTOFMEM("Robot_new");
1.2 frystyk 648: me->hyperdoc = HTList_new();
1.4 frystyk 649: me->htext = HTList_new();
1.40 frystyk 650: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 651: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 652: me->output = OUTPUT;
1.35 eric 653: me->cnt = 0;
1.34 eric 654: me->fingers = HTList_new();
1.1 frystyk 655: return me;
656: }
657:
658: /* Delete a Command Line Object
659: ** ----------------------------
660: */
1.62 frystyk 661: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 662: {
1.62 frystyk 663: if (mr) {
664: HTList_delete(mr->fingers);
1.55 frystyk 665:
666: /* Calculate statistics */
1.62 frystyk 667: calculate_statistics(mr);
1.55 frystyk 668:
1.62 frystyk 669: if (mr->hyperdoc) {
670: HTList * cur = mr->hyperdoc;
1.2 frystyk 671: HyperDoc * pres;
672: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
673: HyperDoc_delete(pres);
1.62 frystyk 674: HTList_delete(mr->hyperdoc);
1.2 frystyk 675: }
1.62 frystyk 676: if (mr->htext) {
677: HTList * cur = mr->htext;
1.4 frystyk 678: HText * pres;
679: while ((pres = (HText *) HTList_nextObject(cur)))
680: HText_free(pres);
1.62 frystyk 681: HTList_delete(mr->htext);
1.4 frystyk 682: }
1.62 frystyk 683:
684: /* Close all the log files */
1.63 frystyk 685: if (mr->flags & MR_LOGGING) {
1.64 ! frystyk 686: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 687: }
688:
1.62 frystyk 689: if (mr->log) {
690: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 691: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 692: HTLog_accessCount(mr->log), mr->logfile);
693: HTLog_close(mr->log);
694: }
695: if (mr->ref) {
696: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 697: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 698: HTLog_accessCount(mr->ref), mr->reffile);
699: HTLog_close(mr->ref);
700: }
701: if (mr->reject) {
702: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 703: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 704: HTLog_accessCount(mr->reject), mr->rejectfile);
705: HTLog_close(mr->reject);
706: }
707: if (mr->notfound) {
708: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 709: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 710: HTLog_accessCount(mr->notfound), mr->notfoundfile);
711: HTLog_close(mr->notfound);
712: }
713: if (mr->conneg) {
714: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 715: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 716: HTLog_accessCount(mr->conneg), mr->connegfile);
717: HTLog_close(mr->conneg);
718: }
719: if (mr->noalttag) {
720: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 721: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 722: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
723: HTLog_close(mr->noalttag);
724: }
725:
726: if (mr->output && mr->output != STDOUT) fclose(mr->output);
727:
728: if (mr->flags & MR_TIME) {
1.12 frystyk 729: time_t local = time(NULL);
1.62 frystyk 730: if (SHOW_REAL_QUIET(mr))
1.64 ! frystyk 731: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 732: }
1.55 frystyk 733:
1.58 frystyk 734: #ifdef HT_POSIX_REGEX
1.62 frystyk 735: if (mr->include) {
736: regfree(mr->include);
737: HT_FREE(mr->include);
738: }
739: if (mr->exclude) {
740: regfree(mr->exclude);
741: HT_FREE(mr->exclude);
742: }
743: if (mr->check) {
744: regfree(mr->check);
745: HT_FREE(mr->check);
1.58 frystyk 746: }
747: #endif
748:
1.62 frystyk 749: HT_FREE(mr->cwd);
750: HT_FREE(mr->prefix);
751: HT_FREE(mr->img_prefix);
752: HT_FREE(mr);
1.1 frystyk 753: return YES;
754: }
755: return NO;
756: }
757:
1.2 frystyk 758: /*
1.34 eric 759: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 760: */
1.34 eric 761: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 762: {
1.34 eric 763: Finger * me;
764: HTRequest * request = HTRequest_new();
765: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
766: HT_OUTOFMEM("Finger_new");
767: me->robot = robot;
768: me->request = request;
769: me->dest = dest;
770: HTList_addObject(robot->fingers, (void *)me);
771:
1.48 frystyk 772: /* Set the context for this request */
1.34 eric 773: HTRequest_setContext (request, me);
1.48 frystyk 774:
775: /* Check the various flags to customize the request */
776: if (robot->flags & MR_PREEMPTIVE)
777: HTRequest_setPreemptive(request, YES);
778: if (robot->flags & MR_VALIDATE)
779: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
780: if (robot->flags & MR_END_VALIDATE)
781: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
782:
783: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 784: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 785:
786: /* Set the method for this request */
1.34 eric 787: HTRequest_setMethod(request, method);
788: robot->cnt++;
789: return me;
1.2 frystyk 790: }
791:
1.34 eric 792: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 793: {
1.34 eric 794: HTList_removeObject(me->robot->fingers, (void *)me);
795: me->robot->cnt--;
1.37 frystyk 796:
797: /*
798: ** If we are down at one request then flush the output buffer
799: */
800: if (me->request) {
801: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 802: HTRequest_delete(me->request);
1.37 frystyk 803: }
804:
805: /*
806: ** Delete the request and free myself
807: */
1.34 eric 808: HT_FREE(me);
809: return YES;
1.2 frystyk 810: }
811:
812: /*
813: ** Cleanup and make sure we close all connections including the persistent
814: ** ones
815: */
1.1 frystyk 816: PRIVATE void Cleanup (Robot * me, int status)
817: {
818: Robot_delete(me);
1.29 eric 819: HTProfile_delete();
1.50 frystyk 820: #ifdef HT_MEMLOG
1.39 eric 821: HTMemLog_close();
1.47 frystyk 822: #endif
823:
1.1 frystyk 824: #ifdef VMS
825: exit(status ? status : 1);
826: #else
827: exit(status ? status : 0);
828: #endif
829: }
830:
831: #ifdef CATCH_SIG
832: #include <signal.h>
833: /* SetSignal
834: ** This function sets up signal handlers. This might not be necessary to
835: ** call if the application has its own handlers (lossage on SVR4)
836: */
837: PRIVATE void SetSignal (void)
838: {
839: /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
840: ** when attemting to connect to a remote host where you normally should
841: ** get `connection refused' back
842: */
843: if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
1.13 eric 844: if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
1.1 frystyk 845: } else {
1.13 eric 846: if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
1.1 frystyk 847: }
1.47 frystyk 848:
1.50 frystyk 849: #ifdef HT_MEMLOG
1.44 eric 850: HTMemLog_flush();
1.47 frystyk 851: #endif
852:
1.1 frystyk 853: }
854: #endif /* CATCH_SIG */
855:
1.58 frystyk 856: #ifdef HT_POSIX_REGEX
857: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
858: {
859: size_t length = regerror (errcode, compiled, NULL, 0);
860: char * str = NULL;
861: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
862: HT_OUTOFMEM("get_regerror");
863: (void) regerror (errcode, compiled, str, length);
864: return str;
865: }
866:
1.60 frystyk 867: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 868: {
869: regex_t * regex = NULL;
870: if (regex_str && *regex_str) {
871: int status;
872: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
873: HT_OUTOFMEM("get_regtype");
1.60 frystyk 874: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 875: char * err_msg = get_regerror(status, regex);
1.62 frystyk 876: if (SHOW_REAL_QUIET(mr))
877: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 878: HT_FREE(err_msg);
879: Cleanup(mr, -1);
880: }
881: }
882: return regex;
883: }
884: #endif
885:
1.1 frystyk 886: PRIVATE void VersionInfo (void)
887: {
1.62 frystyk 888: OutputData("W3C Sample Software\n\n");
889: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
890: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
891: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 892: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 893: }
894:
895: /* terminate_handler
896: ** -----------------
1.2 frystyk 897: ** This function is registered to handle the result of the request.
898: ** If no more requests are pending then terminate program
1.1 frystyk 899: */
1.32 frystyk 900: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
901: void * param, int status)
1.1 frystyk 902: {
1.34 eric 903: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 904: Robot * mr = finger->robot;
1.62 frystyk 905: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 906:
1.58 frystyk 907: /* Check if negotiated resource and whether we should log that*/
908: if (mr->conneg) {
909: HTAssocList * cur = HTResponse_variant(response);
910: if (cur) {
911: BOOL first = YES;
912: HTChunk * buffer = HTChunk_new(128);
913: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
914: HTAssoc * pres;
1.60 frystyk 915: HTChunk_puts(buffer, uri);
1.58 frystyk 916: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
917: char * value = HTAssoc_value(pres);
918: if (first) {
1.60 frystyk 919: HTChunk_puts(buffer, "\t(");
1.58 frystyk 920: first = NO;
921: } else
922: HTChunk_puts(buffer, ", ");
923:
924: /* Output the name */
925: HTChunk_puts(buffer, HTAssoc_name(pres));
926:
927: /* Only output the value if not empty string */
1.60 frystyk 928: if (value && *value) {
1.58 frystyk 929: HTChunk_puts(buffer, "=");
930: HTChunk_puts(buffer, value);
931: }
932: }
1.60 frystyk 933: if (!first) HTChunk_puts(buffer, ")");
934: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 935: HTChunk_delete(buffer);
936: HT_FREE(uri);
937: }
938: }
939:
1.55 frystyk 940: /* Count the amount of body data that we have read */
1.59 frystyk 941: if (HTRequest_method(request) == METHOD_GET) {
942: int length = HTAnchor_length(HTRequest_anchor(request));
943: if (length > 0) mr->get_bytes += length;
944: mr->get_docs++;
945: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 946: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 947: if (length > 0) mr->head_bytes += length;
948: mr->head_docs++;
949: } else {
950: mr->other_docs++;
1.55 frystyk 951: }
952:
1.58 frystyk 953: /* Cleanup the anchor so that we don't drown in metainformation */
954: if (!(mr->flags & MR_KEEP_META))
955: HTAnchor_clearHeader(HTRequest_anchor(request));
956:
1.55 frystyk 957: /* Delete this thread */
1.34 eric 958: Finger_delete(finger);
1.55 frystyk 959:
960: /* Should we stop? */
1.46 eric 961: if (mr->cnt <= 0) {
1.62 frystyk 962: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 963: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 964: }
1.62 frystyk 965: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 966: return HT_OK;
967: }
968:
969: /* ------------------------------------------------------------------------- */
970: /* HTEXT INTERFACE */
971: /* ------------------------------------------------------------------------- */
972:
973: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
974: HTStream * stream)
975: {
976: HText * me;
1.34 eric 977: Finger * finger = (Finger *) HTRequest_context(request);
978: Robot * mr = finger->robot;
1.14 frystyk 979: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
980: HT_OUTOFMEM("HText_new2");
1.4 frystyk 981:
982: /* Bind the HText object together with the Request Object */
1.1 frystyk 983: me->request = request;
1.4 frystyk 984:
985: /* Add this HyperDoc object to our list */
986: if (!mr->htext) mr->htext = HTList_new();
987: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 988: return me;
989: }
990:
1.4 frystyk 991: PUBLIC void HText_free (HText * me) {
1.11 frystyk 992: if (me) HT_FREE (me);
1.4 frystyk 993: }
994:
1.1 frystyk 995: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
996: {
997: if (text && anchor) {
1.34 eric 998: Finger * finger = (Finger *) HTRequest_context(text->request);
999: Robot * mr = finger->robot;
1.1 frystyk 1000: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1001: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1002: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1003: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1004: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.58 frystyk 1005: BOOL match = YES;
1006: BOOL check = NO;
1.1 frystyk 1007:
1.55 frystyk 1008: if (!uri) return;
1.62 frystyk 1009: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 1010:
1011: if (hd) {
1.62 frystyk 1012: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 1013: hd->hits++;
1.58 frystyk 1014: HT_FREE(uri);
1015: return;
1016: }
1017:
1018: /* Check for prefix match */
1019: if (mr->prefix) match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1020:
1021: #ifdef HT_POSIX_REGEX
1022: /* Check for any regular expression */
1023: if (match && mr->include) {
1024: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1025: }
1026: if (match && mr->exclude) {
1027: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1028: }
1029: if (match && mr->check) {
1030: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1031: }
1032: #endif
1033:
1034: /* Test whether we already have a hyperdoc for this document */
1035: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 1036: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
1037: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1038: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 1039: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1040: HTRequest * newreq = newfinger->request;
1.2 frystyk 1041: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 1042: HTRequest_setParent(newreq, referer);
1.58 frystyk 1043: if (check || depth >= mr->depth) {
1.62 frystyk 1044: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 1045: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 1046: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 1047: } else {
1.62 frystyk 1048: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 1049: }
1050: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 1051: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 1052: Finger_delete(newfinger);
1.2 frystyk 1053: }
1.7 frystyk 1054: } else {
1.62 frystyk 1055: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 1056: if (mr->reject) {
1057: if (referer) {
1058: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1059: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1060: HT_FREE(ref_addr);
1061: }
1062: }
1.2 frystyk 1063: }
1.11 frystyk 1064: HT_FREE(uri);
1.2 frystyk 1065: }
1066: }
1067:
1068: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1069: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1070: {
1071: if (text && anchor) {
1.34 eric 1072: Finger * finger = (Finger *) HTRequest_context(text->request);
1073: Robot * mr = finger->robot;
1.59 frystyk 1074: if (mr->flags & MR_IMG) {
1.60 frystyk 1075: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1076: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1077: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1078: HyperDoc * hd = HTAnchor_document(dest_parent);
1079: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1080: BOOL match = YES;
1081:
1082: if (hd) {
1.62 frystyk 1083: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1084: hd->hits++;
1.11 frystyk 1085: HT_FREE(uri);
1.59 frystyk 1086: return;
1.2 frystyk 1087: }
1.59 frystyk 1088:
1089: /* Check for prefix match */
1090: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1091:
1092: /* Test whether we already have a hyperdoc for this document */
1093: if (match && dest) {
1.60 frystyk 1094: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1095: mr->flags & MR_SAVE ?
1096: METHOD_GET : METHOD_HEAD);
1097: HTRequest * newreq = newfinger->request;
1.60 frystyk 1098: HyperDoc_new(mr, dest_parent, 1);
1099: HTRequest_setParent(newreq, referer);
1100:
1101: /* Check whether we should report missing ALT tags */
1102: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1103: if (referer) {
1104: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1105: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1106: HT_FREE(ref_addr);
1107: }
1108: }
1109:
1.62 frystyk 1110: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1111: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1112: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1113: Finger_delete(newfinger);
1114: }
1115: } else {
1.62 frystyk 1116: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 1117: if (mr->reject) {
1118: if (referer) {
1119: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1120: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1121: HT_FREE(ref_addr);
1122: }
1123: }
1.1 frystyk 1124: }
1.59 frystyk 1125: HT_FREE(uri);
1.1 frystyk 1126: }
1127: }
1128: }
1129:
1130: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1131: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1132: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1133: PUBLIC void HText_endAppend (HText * text) {}
1134: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1135: PUBLIC void HText_beginAppend (HText * text) {}
1136: PUBLIC void HText_appendParagraph (HText * text) {}
1137:
1.48 frystyk 1138: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1139: {
1140: return (vfprintf(stderr, fmt, pArgs));
1141: }
1142:
1.1 frystyk 1143: /* ------------------------------------------------------------------------- */
1144: /* MAIN PROGRAM */
1145: /* ------------------------------------------------------------------------- */
1146:
1147: int main (int argc, char ** argv)
1148: {
1.48 frystyk 1149: int status = 0;
1.1 frystyk 1150: int arg;
1.48 frystyk 1151: BOOL cache = NO; /* Use persistent cache */
1152: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1153: char * cache_root = NULL;
1.1 frystyk 1154: HTChunk * keywords = NULL; /* From command line */
1155: int keycnt = 0;
1.12 frystyk 1156: Robot * mr = NULL;
1.43 frystyk 1157: Finger * finger = NULL;
1158: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1159:
1160: /* Starts Mac GUSI socket library */
1161: #ifdef GUSI
1162: GUSISetup(GUSIwithSIOUXSockets);
1163: GUSISetup(GUSIwithInternetSockets);
1164: #endif
1165:
1166: #ifdef __MWERKS__ /* STR */
1167: InitGraf((Ptr) &qd.thePort);
1168: InitFonts();
1169: InitWindows();
1170: InitMenus(); TEInit();
1171: InitDialogs(nil);
1172: InitCursor();
1173: SIOUXSettings.asktosaveonclose = false;
1174: argc=ccommand(&argv);
1.50 frystyk 1175: #endif /* __MWERKS__ */
1.1 frystyk 1176:
1.50 frystyk 1177: #ifdef HT_MEMLOG
1.51 frystyk 1178: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1179: #endif
1.46 eric 1180:
1.27 frystyk 1181: /* Initiate W3C Reference Library with a robot profile */
1182: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1183: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1184:
1185: /* Add the default HTML parser to the set of converters */
1186: {
1187: HTList * converters = HTFormat_conversion();
1188: HTMLInit(converters);
1189: }
1.1 frystyk 1190:
1.12 frystyk 1191: /* Build a new robot object */
1192: mr = Robot_new();
1193:
1.1 frystyk 1194: /* Scan command Line for parameters */
1195: for (arg=1; arg<argc; arg++) {
1196: if (*argv[arg] == '-') {
1197:
1198: /* non-interactive */
1.17 frystyk 1199: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1200: HTAlert_setInteractive(NO);
1201:
1.62 frystyk 1202: /* help */
1203: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1204: VersionInfo();
1205: Cleanup(mr, 0);
1206:
1.63 frystyk 1207: /* clf log file */
1.1 frystyk 1208: } else if (!strcmp(argv[arg], "-l")) {
1209: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1210: argv[++arg] : DEFAULT_LOG_FILE;
1.63 frystyk 1211: mr->flags |= MR_LOGGING;
1.1 frystyk 1212:
1.63 frystyk 1213: /* referer log file */
1.58 frystyk 1214: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1215: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1216: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 frystyk 1217: mr->flags |= MR_LOGGING;
1.57 frystyk 1218:
1.58 frystyk 1219: /* Not found error log file */
1220: } else if (!strncmp(argv[arg], "-404", 4)) {
1221: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1222: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 frystyk 1223: mr->flags |= MR_LOGGING;
1.58 frystyk 1224:
1225: /* reject log file */
1226: } else if (!strncmp(argv[arg], "-rej", 4)) {
1227: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1228: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 frystyk 1229: mr->flags |= MR_LOGGING;
1.58 frystyk 1230:
1.63 frystyk 1231: /* no alt tags log file */
1232: } else if (!strncmp(argv[arg], "-alt", 4)) {
1233: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1234: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1235: mr->flags |= MR_LOGGING;
1236:
1237: /* negotiated resource log file */
1.58 frystyk 1238: } else if (!strncmp(argv[arg], "-neg", 4)) {
1239: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1240: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 frystyk 1241: mr->flags |= MR_LOGGING;
1242:
1243: /* hit file log */
1244: } else if (!strcmp(argv[arg], "-hit")) {
1245: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1246: argv[++arg] : DEFAULT_HIT_FILE;
1247: mr->flags |= MR_DISTRIBUTIONS;
1248:
1.64 ! frystyk 1249: /* link relations file log */
! 1250: } else if (!strcmp(argv[arg], "-rellog")) {
! 1251: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1252: argv[++arg] : DEFAULT_REL_FILE;
! 1253: mr->flags |= MR_DISTRIBUTIONS;
! 1254:
! 1255: /* Specific link relation to look for (only used i also -rellog) */
! 1256: } else if (!strcmp(argv[arg], "-relation")) {
! 1257: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1258: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
! 1259: mr->flags |= MR_DISTRIBUTIONS;
! 1260:
1.63 frystyk 1261: /* last modified log file */
1262: } else if (!strcmp(argv[arg], "-lm")) {
1263: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1264: argv[++arg] : DEFAULT_LM_FILE;
1265: mr->flags |= MR_DISTRIBUTIONS;
1266:
1267: /* title log file */
1268: } else if (!strcmp(argv[arg], "-title")) {
1269: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
1270: argv[++arg] : DEFAULT_TITLE_FILE;
1271: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1272:
1273: /* mediatype distribution log file */
1274: } else if (!strncmp(argv[arg], "-for", 4)) {
1275: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1276: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 frystyk 1277: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1278:
1.60 frystyk 1279: /* charset distribution log file */
1280: } else if (!strncmp(argv[arg], "-char", 5)) {
1281: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1282: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 frystyk 1283: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1284:
1.55 frystyk 1285: /* rule file */
1.1 frystyk 1286: } else if (!strcmp(argv[arg], "-r")) {
1287: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1288: argv[++arg] : DEFAULT_RULE_FILE;
1289:
1290: /* output filename */
1291: } else if (!strcmp(argv[arg], "-o")) {
1292: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1293: argv[++arg] : DEFAULT_OUTPUT_FILE;
1294:
1.55 frystyk 1295: /* URI prefix */
1296: } else if (!strcmp(argv[arg], "-prefix")) {
1297: char * prefix = NULL;
1298: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1299: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1300: if (*prefix && *prefix != '*') {
1.55 frystyk 1301: StrAllocCopy(mr->prefix, prefix);
1302: StrAllocCat(mr->prefix, "*");
1303: }
1304:
1.1 frystyk 1305: /* timeout -- Change the default request timeout */
1306: } else if (!strcmp(argv[arg], "-timeout")) {
1307: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1308: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1309: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1310:
1.54 frystyk 1311: /* Force no pipelined requests */
1312: } else if (!strcmp(argv[arg], "-nopipe")) {
1.64 ! frystyk 1313: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
1.54 frystyk 1314:
1.48 frystyk 1315: /* Start the persistent cache */
1316: } else if (!strcmp(argv[arg], "-cache")) {
1317: cache = YES;
1318:
1.54 frystyk 1319: /* Determine the cache root */
1320: } else if (!strcmp(argv[arg], "-cacheroot")) {
1321: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1322: argv[++arg] : NULL;
1.51 frystyk 1323:
1.52 frystyk 1324: /* Stream write flush delay in ms */
1325: } else if (!strcmp(argv[arg], "-delay")) {
1326: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1327: atoi(argv[++arg]) : DEFAULT_DELAY;
1328: HTHost_setDefaultWriteDelay(delay);
1329:
1.48 frystyk 1330: /* Persistent cache flush */
1331: } else if (!strcmp(argv[arg], "-flush")) {
1332: flush = YES;
1333:
1334: /* Do a cache validation */
1335: } else if (!strcmp(argv[arg], "-validate")) {
1336: mr->flags |= MR_VALIDATE;
1337:
1338: /* Do an end-to-end cache-validation */
1339: } else if (!strcmp(argv[arg], "-endvalidate")) {
1340: mr->flags |= MR_END_VALIDATE;
1341:
1.7 frystyk 1342: /* preemptive or non-preemptive access */
1.1 frystyk 1343: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1344: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1345:
1346: /* test inlined images */
1347: } else if (!strcmp(argv[arg], "-img")) {
1348: mr->flags |= MR_IMG;
1.45 frystyk 1349:
1350: /* load inlined images */
1351: } else if (!strcmp(argv[arg], "-saveimg")) {
1352: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1353:
1354: /* URI prefix for inlined images */
1355: } else if (!strcmp(argv[arg], "-imgprefix")) {
1356: char * prefix = NULL;
1357: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1358: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1359: if (*prefix && *prefix!='*') {
1.59 frystyk 1360: StrAllocCopy(mr->img_prefix, prefix);
1361: StrAllocCat(mr->img_prefix, "*");
1362: }
1.2 frystyk 1363:
1364: /* load anchors */
1.58 frystyk 1365: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1366: mr->flags |= MR_LINK;
1.7 frystyk 1367: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1368: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1369:
1.12 frystyk 1370: /* Output start and end time */
1371: } else if (!strcmp(argv[arg], "-ss")) {
1372: mr->flags |= MR_TIME;
1373:
1.1 frystyk 1374: /* print version and exit */
1375: } else if (!strcmp(argv[arg], "-version")) {
1376: VersionInfo();
1377: Cleanup(mr, 0);
1.46 eric 1378:
1379: /* run in quiet mode */
1380: } else if (!strcmp(argv[arg], "-q")) {
1381: mr->flags |= MR_QUIET;
1.1 frystyk 1382:
1.62 frystyk 1383: /* run in really quiet mode */
1384: } else if (!strcmp(argv[arg], "-Q")) {
1385: mr->flags |= MR_REAL_QUIET;
1386:
1.1 frystyk 1387: #ifdef WWWTRACE
1388: /* trace flags */
1389: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1390: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1391: #endif
1392:
1.58 frystyk 1393: #ifdef HT_POSIX_REGEX
1394:
1395: /* If we can link against a POSIX regex library */
1396: } else if (!strncmp(argv[arg], "-inc", 4)) {
1397: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1398: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1399: }
1400: } else if (!strncmp(argv[arg], "-exc", 4)) {
1401: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1402: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1403: }
1404: } else if (!strncmp(argv[arg], "-check", 6)) {
1405: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1406: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1407: }
1408: #endif
1409:
1.1 frystyk 1410: } else {
1.62 frystyk 1411: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1412: }
1.17 frystyk 1413: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1414: if (!keycnt) {
1415: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1416: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1417: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1418: keycnt = 1;
1.11 frystyk 1419: HT_FREE(ref);
1.1 frystyk 1420: } else { /* Check for successive keyword arguments */
1421: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1422: if (keycnt++ <= 1)
1.5 frystyk 1423: keywords = HTChunk_new(128);
1.1 frystyk 1424: else
1.5 frystyk 1425: HTChunk_putc(keywords, ' ');
1426: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1427: HT_FREE(escaped);
1.1 frystyk 1428: }
1429: }
1430: }
1431:
1432: #ifdef CATCH_SIG
1433: SetSignal();
1434: #endif
1435:
1436: if (!keycnt) {
1.62 frystyk 1437: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1438: Cleanup(mr, -1);
1439: }
1440:
1441: if (mr->depth != DEFAULT_DEPTH &&
1442: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1443: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1444: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1445: mr->depth);
1.1 frystyk 1446: Cleanup(mr, -1);
1447: }
1448:
1.23 manoli 1449: /* Testing that HTTrace is working */
1.62 frystyk 1450: if (mr->flags & MR_TIME) {
1451: if (SHOW_REAL_QUIET(mr)) {
1452: time_t local = time(NULL);
1453: HTTrace("Welcome to the W3C mini Robot - started on %s\n",
1454: HTDateTimeStr(&local, YES));
1455: }
1456: }
1.23 manoli 1457:
1.1 frystyk 1458: /* Rule file specified? */
1459: if (mr->rules) {
1460: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1461: if (!HTLoadRules(rules))
1.62 frystyk 1462: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1463: HT_FREE(rules);
1.1 frystyk 1464: }
1465:
1466: /* Output file specified? */
1467: if (mr->outputfile) {
1468: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1469: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1470: mr->output = OUTPUT;
1471: }
1472: }
1473:
1.48 frystyk 1474: /* Should we use persistent cache? */
1475: if (cache) {
1.54 frystyk 1476: HTCacheInit(cache_root, 20);
1.49 frystyk 1477: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1478: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1479: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1480:
1481: /* Should we start by flushing? */
1482: if (flush) HTCache_flushAll();
1483: }
1484:
1.58 frystyk 1485: /* CLF Log file specified? */
1.55 frystyk 1486: if (mr->logfile) {
1487: mr->log = HTLog_open(mr->logfile, YES, YES);
1488: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1489: }
1490:
1.58 frystyk 1491: /* Referer Log file specified? */
1.57 frystyk 1492: if (mr->reffile) {
1493: mr->ref = HTLog_open(mr->reffile, YES, YES);
1494: if (mr->ref)
1495: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1496: }
1.1 frystyk 1497:
1.58 frystyk 1498: /* Not found error log specified? */
1499: if (mr->notfoundfile) {
1500: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1501: if (mr->notfound)
1502: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1503: }
1504:
1505: /* Negotiated resource log specified? */
1506: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1507:
1508: /* No alt tags log file specified? */
1509: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1510:
1511: /* Reject Log file specified? */
1512: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1513:
1514: /* Register our own terminate filter */
1.32 frystyk 1515: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1516:
1517: /* Setting event timeout */
1518: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1519:
1.56 frystyk 1520: mr->time = HTGetTimeInMillis();
1.37 frystyk 1521:
1.34 eric 1522: /* Start the request */
1523: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1524:
1525: /*
1526: ** Make sure that the first request is flushed immediately and not
1527: ** buffered in the output buffer
1528: */
1529: HTRequest_setFlush(finger->request, YES);
1530:
1531: /*
1.48 frystyk 1532: ** Check whether we should do some kind of cache validation on
1533: ** the load
1534: */
1535: if (mr->flags & MR_VALIDATE)
1536: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1537: if (mr->flags & MR_END_VALIDATE)
1538: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1539:
1540: /*
1.43 frystyk 1541: ** Now do the load
1542: */
1.34 eric 1543: if (mr->flags & MR_PREEMPTIVE)
1544: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1545:
1546: if (keywords) /* Search */
1.34 eric 1547: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1548: else
1.34 eric 1549: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1550:
1.5 frystyk 1551: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1552: if (status != YES) {
1.62 frystyk 1553: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1554: Cleanup(mr, -1);
1555: }
1556:
1557: /* Go into the event loop... */
1.34 eric 1558: HTEventList_loop(finger->request);
1.1 frystyk 1559:
1560: /* Only gets here if event loop fails */
1561: Cleanup(mr, 0);
1562: return 0;
1563: }
Webmaster