Annotation of libwww/Robot/src/HTRobot.c, revision 1.65
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
1.58 frystyk 25: #ifdef HT_POSIX_REGEX
1.64 frystyk 26: #ifdef HAVE_RXPOSIX_H
27: #include <rxposix.h>
28: #else
1.62 frystyk 29: #ifdef HAVE_REGEX_H
30: #include <regex.h>
31: #endif
32: #endif
1.60 frystyk 33: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 34: #endif
35:
1.14 frystyk 36: #ifndef W3C_VERSION /* Fallback used when no version string is defined before this point */
1.33 eric 37: #define W3C_VERSION "Unspecified"
1.1 frystyk 38: #endif
39:
40: #define APP_NAME "W3CRobot" /* Printed by VersionInfo() */
1.14 frystyk 41: #define APP_VERSION W3C_VERSION /* Printed by VersionInfo() */
1.62 frystyk 42: #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine" /* Help URL printed by VersionInfo() */
1.1 frystyk 43:
44: #define DEFAULT_OUTPUT_FILE "robot.out" /* Default file names; NOTE(review): their use is not visible in this part of the file */
45: #define DEFAULT_RULE_FILE "robot.conf"
1.58 frystyk 46: #define DEFAULT_LOG_FILE "log-clf.txt"
47: #define DEFAULT_HIT_FILE "log-hit.txt"
1.64 frystyk 48: #define DEFAULT_REL_FILE "log-rel.txt"
1.63 frystyk 49: #define DEFAULT_LM_FILE "log-lastmodified.txt"
50: #define DEFAULT_TITLE_FILE "log-title.txt"
1.58 frystyk 51: #define DEFAULT_REFERER_FILE "log-referer.txt"
52: #define DEFAULT_REJECT_FILE "log-reject.txt"
53: #define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
54: #define DEFAULT_CONNEG_FILE "log-conneg.txt"
1.60 frystyk 55: #define DEFAULT_NOALTTAG_FILE "log-alt.txt"
1.58 frystyk 56: #define DEFAULT_FORMAT_FILE "log-format.txt"
1.60 frystyk 57: #define DEFAULT_CHARSET_FILE "log-charset.txt"
1.51 frystyk 58: #define DEFAULT_MEMLOG "robot.mem"
1.55 frystyk 59: #define DEFAULT_PREFIX ""
1.59 frystyk 60: #define DEFAULT_IMG_PREFIX ""
1.7 frystyk 61: #define DEFAULT_DEPTH 0
1.53 frystyk 62: #define DEFAULT_DELAY 50 /* Write delay in ms */
1.1 frystyk 63:
1.51 frystyk 64: #if 0
1.65 ! frystyk 65: #define HT_MEMLOG /* Is expensive in performance! Disabled here; when defined it enables the HTMemLog_close()/HTMemLog_flush() calls below */
1.51 frystyk 66: #endif
67:
1.46 eric 68: /* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */
1.62 frystyk 69: #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET)) /* True when mr is non-NULL and MR_QUIET is not set */
70: #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET)) /* True when mr is non-NULL and MR_REAL_QUIET is not set */
1.1 frystyk 71:
1.40 frystyk 72: #define DEFAULT_TIMEOUT 10000 /* timeout in millis; initial value of Robot.timer */
1.1 frystyk 73:
74: #if defined(__svr4__)
75: #define CATCH_SIG /* Compiles in SetSignal() */
76: #endif
77:
78: typedef enum _MRFlags { /* Bit flags kept in Robot.flags; tested with '&' */
1.45 frystyk 79: MR_IMG = 0x1, /* NOTE(review): not referenced in the visible part of the file */
80: MR_LINK = 0x2, /* NOTE(review): not referenced in the visible part of the file */
81: MR_PREEMPTIVE = 0x4, /* Finger_new() makes each request preemptive */
82: MR_TIME = 0x8, /* Robot_delete() prints the termination time */
1.46 eric 83: MR_SAVE = 0x10, /* NOTE(review): not referenced in the visible part of the file */
1.48 frystyk 84: MR_QUIET = 0x20, /* Turns off messages guarded by SHOW_QUIET() */
1.62 frystyk 85: MR_REAL_QUIET = 0x40, /* Turns off messages guarded by SHOW_REAL_QUIET() */
86: MR_VALIDATE = 0x80, /* Finger_new() sets reload mode HT_CACHE_VALIDATE */
87: MR_END_VALIDATE = 0x100, /* Finger_new() sets reload mode HT_CACHE_END_VALIDATE */
1.63 frystyk 88: MR_KEEP_META = 0x200, /* terminate_handler() keeps the anchor header information */
89: MR_LOGGING = 0x400, /* Robot_delete() prints the "Raw Log files:" heading */
90: MR_DISTRIBUTIONS = 0x800 /* calculate_statistics() prints the "Distributions:" heading */
1.1 frystyk 91: } MRFlags;
92:
93: typedef struct _Robot { /* State of one robot run; created by Robot_new(), released by Robot_delete() */
1.2 frystyk 94: int depth; /* How deep is our tree */
1.30 frystyk 95: int cnt; /* Count of outstanding requests (Finger objects) */
1.2 frystyk 96: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 97: HTList * htext; /* List of our HText Objects */
1.34 eric 98: HTList * fingers; /* List of our Finger Objects */
1.59 frystyk 99:
1.40 frystyk 100: int timer; /* Initialised to DEFAULT_TIMEOUT (millis) */
1.65 ! frystyk 101: char * cwd; /* Current dir URL */
1.1 frystyk 102: char * rules;
1.55 frystyk 103: char * prefix;
1.59 frystyk 104: char * img_prefix;
105:
1.60 frystyk 106: char * logfile; /* clf log */
1.55 frystyk 107: HTLog * log;
1.60 frystyk 108: char * reffile; /* referer log */
1.57 frystyk 109: HTLog * ref;
1.60 frystyk 110: char * rejectfile; /* unchecked links */
1.58 frystyk 111: HTLog * reject;
1.60 frystyk 112: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 113: HTLog * notfound;
1.60 frystyk 114: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 115: HTLog * conneg;
1.60 frystyk 116: char * noalttagfile; /* images without alt tags */
117: HTLog * noalttag;
118:
119: char * hitfile; /* links sorted after hit counts */
1.64 frystyk 120: char * relfile; /* links sorted after relationships */
121: HTLinkType relation; /* Specific relation to look for */
1.63 frystyk 122: char * titlefile; /* links with titles */
1.60 frystyk 123: char * mtfile; /* media types encountered */
124: char * charsetfile; /* charsets encountered */
1.63 frystyk 125: char * lmfile; /* sorted after last modified dates */
1.60 frystyk 126:
127: char * outputfile;
1.1 frystyk 128: FILE * output;
1.59 frystyk 129:
1.1 frystyk 130: MRFlags flags; /* Combination of MR_* bits */
1.55 frystyk 131:
1.59 frystyk 132: long get_bytes; /* Total number of bytes processed using GET */
133: long get_docs; /* Total number of documents using GET */
134:
135: long head_bytes; /* Total number of bytes processed using HEAD */
136: long head_docs; /* Total number of documents using HEAD */
137:
138: long other_docs; /* Documents requested with any other method */
139:
1.56 frystyk 140: ms_t time; /* Start time of run in ms; calculate_statistics() subtracts it from the current time */
1.58 frystyk 141:
142: #ifdef HT_POSIX_REGEX
143: regex_t * include; /* Compiled expressions; freed with regfree() in Robot_delete() */
144: regex_t * exclude;
145: regex_t * check;
146: #endif
147:
1.1 frystyk 148: } Robot;
1.34 eric 149:
150: typedef struct _Finger { /* One outstanding request; stored as the request context by Finger_new() */
151: Robot * robot; /* The robot that owns this finger */
152: HTRequest * request; /* The request created for this finger */
153: HTParentAnchor * dest; /* The anchor being requested */
154: } Finger;
155:
1.1 frystyk 156: typedef enum _LoadState { /* Load status kept in HyperDoc.state */
157: L_INVALID = -2, /* Initial state set by HyperDoc_new() */
158: L_LOADING = -1, /* NOTE(review): presumably "request in progress" - not set in the visible part of the file */
159: L_SUCCESS = 0, /* NOTE(review): presumably "loaded OK" - not set in the visible part of the file */
160: L_ERROR /* NOTE(review): presumably "load failed" - not set in the visible part of the file */
161: } LoadState;
162:
163: /*
164: ** The HyperDoc object is bound to the anchor and contains information about
165: ** where we are in the search for recursive searches. It is attached to the
** anchor with HTAnchor_setDocument() and also kept in Robot.hyperdoc.
166: */
167: typedef struct _HyperDoc {
168: HTParentAnchor * anchor; /* The anchor this object is bound to */
169: LoadState state; /* Starts out as L_INVALID */
170: int depth; /* Search depth given to HyperDoc_new() */
1.55 frystyk 171: int hits; /* Hit count, starts at 1; logged by calculate_hits() */
1.1 frystyk 172: } HyperDoc;
173:
174: /*
1.65 ! frystyk 175: ** This is the HText object that is created every time we start parsing an
1.1 frystyk 176: ** HTML object. It is created by HText_new2() and kept in Robot.htext.
177: */
1.4 frystyk 178: struct _HText {
1.1 frystyk 179: HTRequest * request; /* The request that is producing this document */
1.65 ! frystyk 180: BOOL follow; /* YES by default; NO when the robots meta information contains "nofollow" */
1.4 frystyk 181: };
1.1 frystyk 182:
1.58 frystyk 183: /*
184: ** A structure for calculating metadata distributions: one entry per
** distinct atom (media type or charset) together with its counter.
185: */
186: typedef struct _MetaDist {
187: HTAtom * name; /* The media type or charset atom being counted */
188: int hits; /* Number of anchors seen with this atom */
189: } MetaDist;
190:
191: /*
192: ** Forward declarations of the comparison functions defined further down
** and handed to HTArray_sort() / HTList_insertionSort()
193: */
1.63 frystyk 194: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 195:
1.1 frystyk 196: PUBLIC HText * HTMainText = NULL; /* NOTE(review): not used in the visible part of the file */
197: PUBLIC HTParentAnchor * HTMainAnchor = NULL; /* NOTE(review): not used in the visible part of the file */
198: PUBLIC HTStyleSheet * styleSheet = NULL; /* NOTE(review): not used in the visible part of the file */
199:
200: /* ------------------------------------------------------------------------- */
201:
1.13 eric 202: /* Standard (non-error) Output
203: ** ---------------------------
204: */
205: PUBLIC int OutputData(const char * fmt, ...)
206: {
207: int ret;
208: va_list pArgs;
209: va_start(pArgs, fmt);
210: ret = vfprintf(stdout, fmt, pArgs);
211: va_end(pArgs);
212: return ret;
213: }
214:
215: /* ------------------------------------------------------------------------- */
216:
1.2 frystyk 217: /* Create a "HyperDoc" object
218: ** --------------------------
219: ** A HyperDoc object contains information about whether we have already
220: ** started checking the anchor and the depth in our search
221: */
222: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
223: {
224: HyperDoc * hd;
1.14 frystyk 225: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
226: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 227: hd->state = L_INVALID;
228: hd->depth = depth;
1.55 frystyk 229: hd->hits = 1;
1.2 frystyk 230:
231: /* Bind the HyperDoc object together with the Anchor Object */
232: hd->anchor = anchor;
233: HTAnchor_setDocument(anchor, (void *) hd);
234:
235: /* Add this HyperDoc object to our list */
236: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
237: HTList_addObject(mr->hyperdoc, (void *) hd);
238: return hd;
239: }
240:
241: /* Delete a "HyperDoc" object
242: ** --------------------------
243: */
244: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
245: {
246: if (hd) {
1.11 frystyk 247: HT_FREE (hd);
1.2 frystyk 248: return YES;
249: }
250: return NO;
251: }
252:
1.55 frystyk 253: /*
254: ** Sort the anchor array and log reference count
255: */
256: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
257: {
258: if (mr && array) {
259: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
260: if (log) {
261: void ** data = NULL;
262: HTParentAnchor * anchor = NULL;
263: HTArray_sort(array, HitSort);
264: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
265: while (anchor) {
266: char * uri = HTAnchor_address((HTAnchor *) anchor);
267: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 268: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 269: HT_FREE(uri);
270: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
271: }
272: }
273: HTLog_close(log);
274: return YES;
275: }
276: return NO;
277: }
278:
279: PRIVATE int HitSort (const void * a, const void * b)
280: {
281: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
282: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
283: if (aa && bb) return (bb->hits - aa->hits);
284: return bb - aa;
285: }
286:
1.58 frystyk 287: /*
1.64 frystyk 288: ** Sort the anchor array and log link relations
289: */
290: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
291: {
292: if (mr && array) {
293: HTLog * log = HTLog_open(mr->relfile, YES, YES);
294: if (log) {
295: void ** data = NULL;
296: HTParentAnchor * anchor = NULL;
297: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
298: while (anchor) {
299: char * uri = HTAnchor_address((HTAnchor *) anchor);
300: if (uri) {
301: /*
302: ** If we have a specific relation to look for then use that.
303: */
304: if (mr->relation) {
305: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor,
306: mr->relation);
307: if (link) {
308: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
309: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
310: HTFormat format = HTAnchor_format(dest);
311: if (dest_uri) {
312: HTLog_addText(log, "%s %s %s --> %s\n",
313: HTAtom_name(mr->relation),
314: format != WWW_UNKNOWN ?
315: HTAtom_name(format) : "<unknown>",
316: uri, dest_uri);
317: HT_FREE(dest_uri);
318: }
319: }
320: }
321: HT_FREE(uri);
322: }
323: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
324: }
325: }
326: HTLog_close(log);
327: return YES;
328: }
329: return NO;
330: }
331:
332: /*
1.63 frystyk 333: ** Sort the anchor array and log last modified date
334: */
335: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
336: {
337: if (mr && array) {
338: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
339: if (log) {
340: void ** data = NULL;
341: HTParentAnchor * anchor = NULL;
342: HTArray_sort(array, LastModifiedSort);
343: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
344: while (anchor) {
345: char * uri = HTAnchor_address((HTAnchor *) anchor);
346: time_t lm = HTAnchor_lastModified(anchor);
347: if (uri && lm > 0)
348: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
349: HT_FREE(uri);
350: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
351: }
352: }
353: HTLog_close(log);
354: return YES;
355: }
356: return NO;
357: }
358:
359: PRIVATE int LastModifiedSort (const void * a, const void * b)
360: {
361: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
362: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
363: return bb - aa;
364: }
365:
366: /*
367: ** Sort the anchor array and log the document title
368: */
369: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
370: {
371: if (mr && array) {
372: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
373: if (log) {
374: void ** data = NULL;
375: HTParentAnchor * anchor = NULL;
376: HTArray_sort(array, TitleSort);
377: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
378: while (anchor) {
379: char * uri = HTAnchor_address((HTAnchor *) anchor);
380: const char * title = HTAnchor_title(anchor);
381: HTCharset charset = HTAnchor_charset(anchor);
382: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
383: charset ? HTAtom_name(charset) : "<none>",
384: title ? title : "<none>",
385: uri);
386: HT_FREE(uri);
387: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
388: }
389: }
390: HTLog_close(log);
391: return YES;
392: }
393: return NO;
394: }
395:
396: PRIVATE int TitleSort (const void * a, const void * b)
397: {
398: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
399: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
400: return strcasecomp(bb?bb:"", aa?aa:"");
401: }
402:
403: /*
1.58 frystyk 404: ** Calculate distributions for media types. The same mechanism
405: ** can be used for other characteristics with relatively
406: ** few outcomes.
407: */
408: PRIVATE HTList * mediatype_distribution (HTArray * array)
409: {
410: if (array) {
411: HTList * mt = HTList_new();
412: MetaDist * pres = NULL;
413: void ** data = NULL;
414: HTParentAnchor * anchor = NULL;
415: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
416: while (anchor) {
417: HTFormat format = HTAnchor_format(anchor);
418: if (format && format != WWW_UNKNOWN) {
419: HTList * cur = mt;
420:
421: /* If found then increase counter */
422: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
423: if (pres->name == format) {
424: pres->hits++;
425: break;
426: }
427: }
428:
429: /* If not found then add new format to list */
430: if (!pres) {
431: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
432: HT_OUTOFMEM("mediatype_distribution");
433: pres->name = format;
434: pres->hits = 1;
435: HTList_addObject(mt, pres);
436: HTList_insertionSort(mt, FormatSort);
437: }
438: }
439:
440: /* Find next anchor in array */
441: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
442: }
443: return mt;
444: }
445: return NULL;
446: }
447:
1.60 frystyk 448: /*
449: ** Calculate distributions for charsets. The same mechanism
450: ** can be used for other characteristics with relatively
451: ** few outcomes.
452: */
453: PRIVATE HTList * charset_distribution (HTArray * array)
454: {
455: if (array) {
456: HTList * cs = HTList_new();
457: MetaDist * pres = NULL;
458: void ** data = NULL;
459: HTParentAnchor * anchor = NULL;
460: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
461: while (anchor) {
462: HTCharset charset = HTAnchor_charset(anchor);
463: if (charset) {
464: HTList * cur = cs;
465:
466: /* If found then increase counter */
467: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
468: if (pres->name == charset) {
469: pres->hits++;
470: break;
471: }
472: }
473:
474: /* If not found then add new format to list */
475: if (!pres) {
476: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
477: HT_OUTOFMEM("charset_distribution");
478: pres->name = charset;
479: pres->hits = 1;
480: HTList_addObject(cs, pres);
481: HTList_insertionSort(cs, FormatSort);
482: }
483: }
484:
485: /* Find next anchor in array */
486: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
487: }
488: return cs;
489: }
490: return NULL;
491: }
492:
1.58 frystyk 493: PRIVATE int FormatSort (const void * a, const void * b)
494: {
495: MetaDist * aa = (MetaDist *) a;
496: MetaDist * bb = (MetaDist *) b;
497: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
498: }
499:
500: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
501: {
502: if (logfile && distribution) {
503: HTLog * log = HTLog_open(logfile, YES, YES);
504: if (log) {
505: HTList * cur = distribution;
506: MetaDist * pres;
507: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
508: if (pres->name) {
1.60 frystyk 509: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 510: }
511: }
512: HTLog_close(log);
513: }
514: }
515: return NO;
516: }
517:
518: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
519: {
520: if (distribution) {
521: HTList * cur = distribution;
522: MetaDist * pres;
523: while ((pres = (MetaDist *) HTList_nextObject(cur)))
524: HT_FREE(pres);
525: HTList_delete(distribution);
526: return YES;
527: }
528: return NO;
529: }
530:
531:
1.55 frystyk 532: /* Statistics
533: ** ----------
534: ** Calculates a bunch of statistics for the anchors traversed
535: */
536: PRIVATE BOOL calculate_statistics (Robot * mr)
537: {
1.59 frystyk 538: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 539: if (!mr) return NO;
540:
541: /* Calculate efficiency */
1.59 frystyk 542: if (mr->time > 0) {
1.56 frystyk 543: ms_t t = HTGetTimeInMillis() - mr->time;
544: if (t > 0) {
1.60 frystyk 545: double loadfactor = (mr->get_bytes / (t * 0.001));
546: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 547: double secs = t / 1000.0;
1.55 frystyk 548: char bytes[50];
1.62 frystyk 549: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 550: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 551: total_docs, secs, reqprsec);
1.59 frystyk 552:
553: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 554: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 555: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 556: mr->get_docs, bytes, loadfactor);
1.59 frystyk 557:
558: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 559: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 560: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 561: mr->head_docs, bytes);
1.55 frystyk 562: }
563: }
564:
565: /* Create an array of existing anchors */
1.59 frystyk 566: if (total_docs > 1) {
567: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 568: if (array) {
569:
1.63 frystyk 570: /* Distributions */
571: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 572: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 573: }
574:
1.55 frystyk 575: /* Sort after hit counts */
1.63 frystyk 576: if (mr->hitfile) {
577: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 578: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 579: mr->hitfile);
580: calculate_hits(mr, array);
581: }
582:
1.64 frystyk 583: /* Sort after link relations */
584: if (mr->relfile) {
585: if (SHOW_REAL_QUIET(mr))
586: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
587: mr->relfile);
588: calculate_linkRelations(mr, array);
589: }
590:
1.63 frystyk 591: /* Sort after modified date */
592: if (mr->lmfile) {
593: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 594: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 595: mr->lmfile);
596: calculate_lm(mr, array);
597: }
598:
599: /* Sort after title */
600: if (mr->titlefile) {
601: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 602: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 603: mr->titlefile);
604: calculate_title(mr, array);
605: }
1.55 frystyk 606:
1.58 frystyk 607: /* Find mediatype distribution */
608: if (mr->mtfile) {
609: HTList * mtdist = mediatype_distribution(array);
610: if (mtdist) {
1.63 frystyk 611: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 612: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 613: mr->mtfile);
1.58 frystyk 614: log_meta_distribution(mr->mtfile, mtdist);
615: delete_meta_distribution(mtdist);
616: }
617: }
1.55 frystyk 618:
1.60 frystyk 619: /* Find charset distribution */
620: if (mr->charsetfile) {
621: HTList * charsetdist = charset_distribution(array);
622: if (charsetdist) {
1.63 frystyk 623: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 624: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 625: mr->charsetfile);
1.60 frystyk 626: log_meta_distribution(mr->charsetfile, charsetdist);
627: delete_meta_distribution(charsetdist);
628: }
629: }
630:
1.55 frystyk 631: /* Add as may other stats here as you like */
1.60 frystyk 632: /* ... */
1.58 frystyk 633:
634: /* Delete the array */
1.55 frystyk 635: HTArray_delete(array);
636: }
637: }
638: return YES;
639: }
640:
1.1 frystyk 641: /* Create a Command Line Object
642: ** ----------------------------
643: */
644: PRIVATE Robot * Robot_new (void)
645: {
646: Robot * me;
1.41 frystyk 647: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 648: HT_OUTOFMEM("Robot_new");
1.2 frystyk 649: me->hyperdoc = HTList_new();
1.4 frystyk 650: me->htext = HTList_new();
1.40 frystyk 651: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 652: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 653: me->output = OUTPUT;
1.35 eric 654: me->cnt = 0;
1.34 eric 655: me->fingers = HTList_new();
1.1 frystyk 656: return me;
657: }
658:
659: /* Delete a Command Line Object
660: ** ----------------------------
** Tears down a robot: deletes the finger list, computes and logs the
** statistics, frees all HyperDoc and HText objects, closes the log files
** (reporting the number of entries in each), closes the output file,
** optionally prints the termination time, frees the compiled regular
** expressions and finally the robot itself.
** Returns YES if a robot was deleted, NO if `mr' was NULL.
661: */
1.62 frystyk 662: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 663: {
1.62 frystyk 664: if (mr) {
665: HTList_delete(mr->fingers); /* NOTE(review): the Finger objects themselves are not freed here */
1.55 frystyk 666:
667: /* Calculate statistics - must happen before the HyperDoc objects are freed below */
1.62 frystyk 668: calculate_statistics(mr);
1.55 frystyk 669:
1.62 frystyk 670: if (mr->hyperdoc) {
671: HTList * cur = mr->hyperdoc;
1.2 frystyk 672: HyperDoc * pres;
673: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
674: HyperDoc_delete(pres);
1.62 frystyk 675: HTList_delete(mr->hyperdoc);
1.2 frystyk 676: }
1.62 frystyk 677: if (mr->htext) {
678: HTList * cur = mr->htext;
1.4 frystyk 679: HText * pres;
680: while ((pres = (HText *) HTList_nextObject(cur)))
681: HText_free(pres);
1.62 frystyk 682: HTList_delete(mr->htext);
1.4 frystyk 683: }
1.62 frystyk 684:
685: /* Close all the log files; each one reports its entry count first */
1.63 frystyk 686: if (mr->flags & MR_LOGGING) {
1.64 frystyk 687: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 688: }
689:
1.62 frystyk 690: if (mr->log) {
691: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 692: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 693: HTLog_accessCount(mr->log), mr->logfile);
694: HTLog_close(mr->log);
695: }
696: if (mr->ref) {
697: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 698: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 699: HTLog_accessCount(mr->ref), mr->reffile);
700: HTLog_close(mr->ref);
701: }
702: if (mr->reject) {
703: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 704: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 705: HTLog_accessCount(mr->reject), mr->rejectfile);
706: HTLog_close(mr->reject);
707: }
708: if (mr->notfound) {
709: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 710: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 711: HTLog_accessCount(mr->notfound), mr->notfoundfile);
712: HTLog_close(mr->notfound);
713: }
714: if (mr->conneg) {
715: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 716: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 717: HTLog_accessCount(mr->conneg), mr->connegfile);
718: HTLog_close(mr->conneg);
719: }
720: if (mr->noalttag) {
721: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 722: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 723: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
724: HTLog_close(mr->noalttag);
725: }
726:
727: if (mr->output && mr->output != STDOUT) fclose(mr->output); /* Only close an output stream that differs from STDOUT */
728:
729: if (mr->flags & MR_TIME) {
1.12 frystyk 730: time_t local = time(NULL);
1.62 frystyk 731: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 732: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 733: }
1.55 frystyk 734:
1.58 frystyk 735: #ifdef HT_POSIX_REGEX
1.62 frystyk 736: if (mr->include) { /* regfree() releases the compiled pattern, HT_FREE the regex_t itself */
737: regfree(mr->include);
738: HT_FREE(mr->include);
739: }
740: if (mr->exclude) {
741: regfree(mr->exclude);
742: HT_FREE(mr->exclude);
743: }
744: if (mr->check) {
745: regfree(mr->check);
746: HT_FREE(mr->check);
1.58 frystyk 747: }
748: #endif
749:
1.62 frystyk 750: HT_FREE(mr->cwd);
751: HT_FREE(mr->prefix);
752: HT_FREE(mr->img_prefix);
753: HT_FREE(mr);
1.1 frystyk 754: return YES;
755: }
756: return NO;
757: }
758:
1.2 frystyk 759: /*
1.34 eric 760: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 761: */
1.34 eric 762: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 763: {
1.34 eric 764: Finger * me;
765: HTRequest * request = HTRequest_new();
766: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
767: HT_OUTOFMEM("Finger_new");
768: me->robot = robot;
769: me->request = request;
770: me->dest = dest;
771: HTList_addObject(robot->fingers, (void *)me);
772:
1.48 frystyk 773: /* Set the context for this request */
1.34 eric 774: HTRequest_setContext (request, me);
1.48 frystyk 775:
776: /* Check the various flags to customize the request */
777: if (robot->flags & MR_PREEMPTIVE)
778: HTRequest_setPreemptive(request, YES);
779: if (robot->flags & MR_VALIDATE)
780: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
781: if (robot->flags & MR_END_VALIDATE)
782: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
783:
784: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 785: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 786:
787: /* Set the method for this request */
1.34 eric 788: HTRequest_setMethod(request, method);
789: robot->cnt++;
790: return me;
1.2 frystyk 791: }
792:
1.34 eric 793: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 794: {
1.34 eric 795: HTList_removeObject(me->robot->fingers, (void *)me);
796: me->robot->cnt--;
1.37 frystyk 797:
798: /*
799: ** If we are down at one request then flush the output buffer
800: */
801: if (me->request) {
802: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 803: HTRequest_delete(me->request);
1.37 frystyk 804: }
805:
806: /*
807: ** Delete the request and free myself
808: */
1.34 eric 809: HT_FREE(me);
810: return YES;
1.2 frystyk 811: }
812:
813: /*
814: ** Cleanup and make sure we close all connections including the persistent
815: ** ones
816: */
1.1 frystyk 817: PRIVATE void Cleanup (Robot * me, int status)
818: {
819: Robot_delete(me);
1.29 eric 820: HTProfile_delete();
1.50 frystyk 821: #ifdef HT_MEMLOG
1.39 eric 822: HTMemLog_close();
1.47 frystyk 823: #endif
824:
1.1 frystyk 825: #ifdef VMS
826: exit(status ? status : 1);
827: #else
828: exit(status ? status : 0);
829: #endif
830: }
831:
#ifdef CATCH_SIG
#include <signal.h>
/*	SetSignal
**	This function sets up signal handlers. This might not be necessary to
**	call if the application has its own handlers (lossage on SVR4)
**
**	On some systems (SYSV) it is necessary to ignore the SIGPIPE signal
**	when attempting to connect to a remote host where you normally should
**	get `connection refused' back. The outcome is reported through the
**	protocol trace, and the memory log is flushed when compiled in.
*/
PRIVATE void SetSignal (void)
{
    BOOL ignored = (signal(SIGPIPE, SIG_IGN) != SIG_ERR);

    if (ignored) {
        if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    } else {
        if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
856:
1.58 frystyk 857: #ifdef HT_POSIX_REGEX
858: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
859: {
860: size_t length = regerror (errcode, compiled, NULL, 0);
861: char * str = NULL;
862: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
863: HT_OUTOFMEM("get_regerror");
864: (void) regerror (errcode, compiled, str, length);
865: return str;
866: }
867:
1.60 frystyk 868: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 869: {
870: regex_t * regex = NULL;
871: if (regex_str && *regex_str) {
872: int status;
873: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
874: HT_OUTOFMEM("get_regtype");
1.60 frystyk 875: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 876: char * err_msg = get_regerror(status, regex);
1.62 frystyk 877: if (SHOW_REAL_QUIET(mr))
878: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 879: HT_FREE(err_msg);
880: Cleanup(mr, -1);
881: }
882: }
883: return regex;
884: }
885: #endif
886:
1.1 frystyk 887: PRIVATE void VersionInfo (void)
888: {
1.62 frystyk 889: OutputData("W3C Sample Software\n\n");
890: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
891: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
892: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 893: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 894: }
895:
896: /* terminate_handler
897: ** -----------------
1.2 frystyk 898: ** This function is registered to handle the result of the request.
899: ** If no more requests are pending then terminate program
1.1 frystyk 900: */
1.32 frystyk 901: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
902: void * param, int status)
1.1 frystyk 903: {
1.34 eric 904: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 905: Robot * mr = finger->robot;
1.62 frystyk 906: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 907:
1.58 frystyk 908: /* Check if negotiated resource and whether we should log that*/
909: if (mr->conneg) {
910: HTAssocList * cur = HTResponse_variant(response);
911: if (cur) {
912: BOOL first = YES;
913: HTChunk * buffer = HTChunk_new(128);
914: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
915: HTAssoc * pres;
1.60 frystyk 916: HTChunk_puts(buffer, uri);
1.58 frystyk 917: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
918: char * value = HTAssoc_value(pres);
919: if (first) {
1.60 frystyk 920: HTChunk_puts(buffer, "\t(");
1.58 frystyk 921: first = NO;
922: } else
923: HTChunk_puts(buffer, ", ");
924:
925: /* Output the name */
926: HTChunk_puts(buffer, HTAssoc_name(pres));
927:
928: /* Only output the value if not empty string */
1.60 frystyk 929: if (value && *value) {
1.58 frystyk 930: HTChunk_puts(buffer, "=");
931: HTChunk_puts(buffer, value);
932: }
933: }
1.60 frystyk 934: if (!first) HTChunk_puts(buffer, ")");
935: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 936: HTChunk_delete(buffer);
937: HT_FREE(uri);
938: }
939: }
940:
1.55 frystyk 941: /* Count the amount of body data that we have read */
1.59 frystyk 942: if (HTRequest_method(request) == METHOD_GET) {
943: int length = HTAnchor_length(HTRequest_anchor(request));
944: if (length > 0) mr->get_bytes += length;
945: mr->get_docs++;
946: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 947: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 948: if (length > 0) mr->head_bytes += length;
949: mr->head_docs++;
950: } else {
951: mr->other_docs++;
1.55 frystyk 952: }
953:
1.58 frystyk 954: /* Cleanup the anchor so that we don't drown in metainformation */
955: if (!(mr->flags & MR_KEEP_META))
956: HTAnchor_clearHeader(HTRequest_anchor(request));
957:
1.55 frystyk 958: /* Delete this thread */
1.34 eric 959: Finger_delete(finger);
1.55 frystyk 960:
961: /* Should we stop? */
1.46 eric 962: if (mr->cnt <= 0) {
1.62 frystyk 963: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 964: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 965: }
1.62 frystyk 966: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 967: return HT_OK;
968: }
969:
970: /* ------------------------------------------------------------------------- */
971: /* HTEXT INTERFACE */
972: /* ------------------------------------------------------------------------- */
973:
974: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
975: HTStream * stream)
976: {
977: HText * me;
1.34 eric 978: Finger * finger = (Finger *) HTRequest_context(request);
979: Robot * mr = finger->robot;
1.65 ! frystyk 980: char * robots = NULL;
! 981:
1.14 frystyk 982: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
983: HT_OUTOFMEM("HText_new2");
1.4 frystyk 984:
985: /* Bind the HText object together with the Request Object */
1.1 frystyk 986: me->request = request;
1.65 ! frystyk 987: me->follow = YES;
! 988:
! 989: /* Check to see if we have any meta tags */
! 990: if ((robots = HTAnchor_robots(anchor)) != NULL) {
! 991: char * strval = NULL;
! 992: char * ptr = NULL;
! 993: char * token = NULL;
! 994: StrAllocCopy(strval, robots);
! 995: ptr = strval;
! 996: while ((token = HTNextField(&ptr)) != NULL) {
! 997: if (!strcasecomp(token, "nofollow")) {
! 998: me->follow = NO;
! 999: break;
! 1000: }
! 1001: }
! 1002: HT_FREE(strval);
! 1003: }
1.4 frystyk 1004:
1005: /* Add this HyperDoc object to our list */
1006: if (!mr->htext) mr->htext = HTList_new();
1007: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1008: return me;
1009: }
1010:
1.4 frystyk 1011: PUBLIC void HText_free (HText * me) {
1.11 frystyk 1012: if (me) HT_FREE (me);
1.4 frystyk 1013: }
1014:
1.1 frystyk 1015: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
1016: {
1017: if (text && anchor) {
1.34 eric 1018: Finger * finger = (Finger *) HTRequest_context(text->request);
1019: Robot * mr = finger->robot;
1.1 frystyk 1020: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1021: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1022: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1023: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1024: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 ! frystyk 1025: BOOL match = text->follow;
1.58 frystyk 1026: BOOL check = NO;
1.1 frystyk 1027:
1.55 frystyk 1028: if (!uri) return;
1.62 frystyk 1029: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 1030:
1031: if (hd) {
1.62 frystyk 1032: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 1033: hd->hits++;
1.58 frystyk 1034: HT_FREE(uri);
1035: return;
1036: }
1037:
1038: /* Check for prefix match */
1.65 ! frystyk 1039: if (match && mr->prefix) {
! 1040: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
! 1041: }
1.58 frystyk 1042:
1043: #ifdef HT_POSIX_REGEX
1044: /* Check for any regular expression */
1045: if (match && mr->include) {
1046: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1047: }
1048: if (match && mr->exclude) {
1049: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1050: }
1051: if (match && mr->check) {
1052: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1053: }
1054: #endif
1055:
1056: /* Test whether we already have a hyperdoc for this document */
1057: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 1058: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
1059: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1060: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 1061: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1062: HTRequest * newreq = newfinger->request;
1.2 frystyk 1063: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 1064: HTRequest_setParent(newreq, referer);
1.58 frystyk 1065: if (check || depth >= mr->depth) {
1.62 frystyk 1066: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 1067: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 1068: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 1069: } else {
1.62 frystyk 1070: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 1071: }
1072: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 1073: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 1074: Finger_delete(newfinger);
1.2 frystyk 1075: }
1.7 frystyk 1076: } else {
1.62 frystyk 1077: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 1078: if (mr->reject) {
1079: if (referer) {
1080: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1081: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1082: HT_FREE(ref_addr);
1083: }
1084: }
1.2 frystyk 1085: }
1.11 frystyk 1086: HT_FREE(uri);
1.2 frystyk 1087: }
1088: }
1089:
1090: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1091: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1092: {
1093: if (text && anchor) {
1.34 eric 1094: Finger * finger = (Finger *) HTRequest_context(text->request);
1095: Robot * mr = finger->robot;
1.59 frystyk 1096: if (mr->flags & MR_IMG) {
1.60 frystyk 1097: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1098: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1099: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1100: HyperDoc * hd = HTAnchor_document(dest_parent);
1101: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1102: BOOL match = YES;
1103:
1104: if (hd) {
1.62 frystyk 1105: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1106: hd->hits++;
1.11 frystyk 1107: HT_FREE(uri);
1.59 frystyk 1108: return;
1.2 frystyk 1109: }
1.59 frystyk 1110:
1111: /* Check for prefix match */
1112: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1113:
1114: /* Test whether we already have a hyperdoc for this document */
1115: if (match && dest) {
1.60 frystyk 1116: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1117: mr->flags & MR_SAVE ?
1118: METHOD_GET : METHOD_HEAD);
1119: HTRequest * newreq = newfinger->request;
1.60 frystyk 1120: HyperDoc_new(mr, dest_parent, 1);
1121: HTRequest_setParent(newreq, referer);
1122:
1123: /* Check whether we should report missing ALT tags */
1124: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1125: if (referer) {
1126: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1127: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1128: HT_FREE(ref_addr);
1129: }
1130: }
1131:
1.62 frystyk 1132: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1133: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1134: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1135: Finger_delete(newfinger);
1136: }
1137: } else {
1.62 frystyk 1138: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 1139: if (mr->reject) {
1140: if (referer) {
1141: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1142: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1143: HT_FREE(ref_addr);
1144: }
1145: }
1.1 frystyk 1146: }
1.59 frystyk 1147: HT_FREE(uri);
1.1 frystyk 1148: }
1149: }
1150: }
1151:
1152: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1153: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1154: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1155: PUBLIC void HText_endAppend (HText * text) {}
1156: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1157: PUBLIC void HText_beginAppend (HText * text) {}
1158: PUBLIC void HText_appendParagraph (HText * text) {}
1159:
1.48 frystyk 1160: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1161: {
1162: return (vfprintf(stderr, fmt, pArgs));
1163: }
1164:
1.1 frystyk 1165: /* ------------------------------------------------------------------------- */
1166: /* MAIN PROGRAM */
1167: /* ------------------------------------------------------------------------- */
1168:
1169: int main (int argc, char ** argv)
1170: {
1.48 frystyk 1171: int status = 0;
1.1 frystyk 1172: int arg;
1.48 frystyk 1173: BOOL cache = NO; /* Use persistent cache */
1174: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1175: char * cache_root = NULL;
1.1 frystyk 1176: HTChunk * keywords = NULL; /* From command line */
1177: int keycnt = 0;
1.12 frystyk 1178: Robot * mr = NULL;
1.43 frystyk 1179: Finger * finger = NULL;
1180: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1181:
1182: /* Starts Mac GUSI socket library */
1183: #ifdef GUSI
1184: GUSISetup(GUSIwithSIOUXSockets);
1185: GUSISetup(GUSIwithInternetSockets);
1186: #endif
1187:
1188: #ifdef __MWERKS__ /* STR */
1189: InitGraf((Ptr) &qd.thePort);
1190: InitFonts();
1191: InitWindows();
1192: InitMenus(); TEInit();
1193: InitDialogs(nil);
1194: InitCursor();
1195: SIOUXSettings.asktosaveonclose = false;
1196: argc=ccommand(&argv);
1.50 frystyk 1197: #endif /* __MWERKS__ */
1.1 frystyk 1198:
1.50 frystyk 1199: #ifdef HT_MEMLOG
1.51 frystyk 1200: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1201: #endif
1.46 eric 1202:
1.27 frystyk 1203: /* Initiate W3C Reference Library with a robot profile */
1204: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1205: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1206:
1207: /* Add the default HTML parser to the set of converters */
1208: {
1209: HTList * converters = HTFormat_conversion();
1210: HTMLInit(converters);
1211: }
1.1 frystyk 1212:
1.12 frystyk 1213: /* Build a new robot object */
1214: mr = Robot_new();
1215:
1.1 frystyk 1216: /* Scan command Line for parameters */
1217: for (arg=1; arg<argc; arg++) {
1218: if (*argv[arg] == '-') {
1219:
1220: /* non-interactive */
1.17 frystyk 1221: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1222: HTAlert_setInteractive(NO);
1223:
1.62 frystyk 1224: /* help */
1225: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1226: VersionInfo();
1227: Cleanup(mr, 0);
1228:
1.63 frystyk 1229: /* clf log file */
1.1 frystyk 1230: } else if (!strcmp(argv[arg], "-l")) {
1231: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1232: argv[++arg] : DEFAULT_LOG_FILE;
1.63 frystyk 1233: mr->flags |= MR_LOGGING;
1.1 frystyk 1234:
1.63 frystyk 1235: /* referer log file */
1.58 frystyk 1236: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1237: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1238: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 frystyk 1239: mr->flags |= MR_LOGGING;
1.57 frystyk 1240:
1.58 frystyk 1241: /* Not found error log file */
1242: } else if (!strncmp(argv[arg], "-404", 4)) {
1243: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1244: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 frystyk 1245: mr->flags |= MR_LOGGING;
1.58 frystyk 1246:
1247: /* reject log file */
1248: } else if (!strncmp(argv[arg], "-rej", 4)) {
1249: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1250: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 frystyk 1251: mr->flags |= MR_LOGGING;
1.58 frystyk 1252:
1.63 frystyk 1253: /* no alt tags log file */
1254: } else if (!strncmp(argv[arg], "-alt", 4)) {
1255: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1256: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1257: mr->flags |= MR_LOGGING;
1258:
1259: /* negotiated resource log file */
1.58 frystyk 1260: } else if (!strncmp(argv[arg], "-neg", 4)) {
1261: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1262: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 frystyk 1263: mr->flags |= MR_LOGGING;
1264:
1265: /* hit file log */
1266: } else if (!strcmp(argv[arg], "-hit")) {
1267: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1268: argv[++arg] : DEFAULT_HIT_FILE;
1269: mr->flags |= MR_DISTRIBUTIONS;
1270:
1.64 frystyk 1271: /* link relations file log */
1272: } else if (!strcmp(argv[arg], "-rellog")) {
1273: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1274: argv[++arg] : DEFAULT_REL_FILE;
1275: mr->flags |= MR_DISTRIBUTIONS;
1276:
1277: /* Specific link relation to look for (only used i also -rellog) */
1278: } else if (!strcmp(argv[arg], "-relation")) {
1279: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
1280: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
1281: mr->flags |= MR_DISTRIBUTIONS;
1282:
1.63 frystyk 1283: /* last modified log file */
1284: } else if (!strcmp(argv[arg], "-lm")) {
1285: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1286: argv[++arg] : DEFAULT_LM_FILE;
1287: mr->flags |= MR_DISTRIBUTIONS;
1288:
1289: /* title log file */
1290: } else if (!strcmp(argv[arg], "-title")) {
1291: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
1292: argv[++arg] : DEFAULT_TITLE_FILE;
1293: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1294:
1295: /* mediatype distribution log file */
1296: } else if (!strncmp(argv[arg], "-for", 4)) {
1297: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1298: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 frystyk 1299: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1300:
1.60 frystyk 1301: /* charset distribution log file */
1302: } else if (!strncmp(argv[arg], "-char", 5)) {
1303: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1304: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 frystyk 1305: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1306:
1.55 frystyk 1307: /* rule file */
1.1 frystyk 1308: } else if (!strcmp(argv[arg], "-r")) {
1309: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1310: argv[++arg] : DEFAULT_RULE_FILE;
1311:
1312: /* output filename */
1313: } else if (!strcmp(argv[arg], "-o")) {
1314: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1315: argv[++arg] : DEFAULT_OUTPUT_FILE;
1316:
1.55 frystyk 1317: /* URI prefix */
1318: } else if (!strcmp(argv[arg], "-prefix")) {
1319: char * prefix = NULL;
1320: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1321: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1322: if (*prefix && *prefix != '*') {
1.55 frystyk 1323: StrAllocCopy(mr->prefix, prefix);
1324: StrAllocCat(mr->prefix, "*");
1325: }
1326:
1.1 frystyk 1327: /* timeout -- Change the default request timeout */
1328: } else if (!strcmp(argv[arg], "-timeout")) {
1329: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1330: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1331: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1332:
1.54 frystyk 1333: /* Force no pipelined requests */
1334: } else if (!strcmp(argv[arg], "-nopipe")) {
1.64 frystyk 1335: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
1.54 frystyk 1336:
1.48 frystyk 1337: /* Start the persistent cache */
1338: } else if (!strcmp(argv[arg], "-cache")) {
1339: cache = YES;
1340:
1.54 frystyk 1341: /* Determine the cache root */
1342: } else if (!strcmp(argv[arg], "-cacheroot")) {
1343: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1344: argv[++arg] : NULL;
1.51 frystyk 1345:
1.52 frystyk 1346: /* Stream write flush delay in ms */
1347: } else if (!strcmp(argv[arg], "-delay")) {
1348: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1349: atoi(argv[++arg]) : DEFAULT_DELAY;
1350: HTHost_setDefaultWriteDelay(delay);
1351:
1.48 frystyk 1352: /* Persistent cache flush */
1353: } else if (!strcmp(argv[arg], "-flush")) {
1354: flush = YES;
1355:
1356: /* Do a cache validation */
1357: } else if (!strcmp(argv[arg], "-validate")) {
1358: mr->flags |= MR_VALIDATE;
1359:
1360: /* Do an end-to-end cache-validation */
1361: } else if (!strcmp(argv[arg], "-endvalidate")) {
1362: mr->flags |= MR_END_VALIDATE;
1363:
1.7 frystyk 1364: /* preemptive or non-preemptive access */
1.1 frystyk 1365: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1366: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1367:
1368: /* test inlined images */
1369: } else if (!strcmp(argv[arg], "-img")) {
1370: mr->flags |= MR_IMG;
1.45 frystyk 1371:
1372: /* load inlined images */
1373: } else if (!strcmp(argv[arg], "-saveimg")) {
1374: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1375:
1376: /* URI prefix for inlined images */
1377: } else if (!strcmp(argv[arg], "-imgprefix")) {
1378: char * prefix = NULL;
1379: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1380: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1381: if (*prefix && *prefix!='*') {
1.59 frystyk 1382: StrAllocCopy(mr->img_prefix, prefix);
1383: StrAllocCat(mr->img_prefix, "*");
1384: }
1.2 frystyk 1385:
1386: /* load anchors */
1.58 frystyk 1387: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1388: mr->flags |= MR_LINK;
1.7 frystyk 1389: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1390: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1391:
1.12 frystyk 1392: /* Output start and end time */
1393: } else if (!strcmp(argv[arg], "-ss")) {
1394: mr->flags |= MR_TIME;
1395:
1.1 frystyk 1396: /* print version and exit */
1397: } else if (!strcmp(argv[arg], "-version")) {
1398: VersionInfo();
1399: Cleanup(mr, 0);
1.46 eric 1400:
1401: /* run in quiet mode */
1402: } else if (!strcmp(argv[arg], "-q")) {
1403: mr->flags |= MR_QUIET;
1.1 frystyk 1404:
1.62 frystyk 1405: /* run in really quiet mode */
1406: } else if (!strcmp(argv[arg], "-Q")) {
1407: mr->flags |= MR_REAL_QUIET;
1408:
1.1 frystyk 1409: #ifdef WWWTRACE
1410: /* trace flags */
1411: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1412: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1413: #endif
1414:
1.58 frystyk 1415: #ifdef HT_POSIX_REGEX
1416:
1417: /* If we can link against a POSIX regex library */
1418: } else if (!strncmp(argv[arg], "-inc", 4)) {
1419: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1420: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1421: }
1422: } else if (!strncmp(argv[arg], "-exc", 4)) {
1423: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1424: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1425: }
1426: } else if (!strncmp(argv[arg], "-check", 6)) {
1427: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1428: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1429: }
1430: #endif
1431:
1.1 frystyk 1432: } else {
1.62 frystyk 1433: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1434: }
1.17 frystyk 1435: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1436: if (!keycnt) {
1437: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1438: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1439: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1440: keycnt = 1;
1.11 frystyk 1441: HT_FREE(ref);
1.1 frystyk 1442: } else { /* Check for successive keyword arguments */
1443: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1444: if (keycnt++ <= 1)
1.5 frystyk 1445: keywords = HTChunk_new(128);
1.1 frystyk 1446: else
1.5 frystyk 1447: HTChunk_putc(keywords, ' ');
1448: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1449: HT_FREE(escaped);
1.1 frystyk 1450: }
1451: }
1452: }
1453:
1454: #ifdef CATCH_SIG
1455: SetSignal();
1456: #endif
1457:
1458: if (!keycnt) {
1.62 frystyk 1459: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1460: Cleanup(mr, -1);
1461: }
1462:
1463: if (mr->depth != DEFAULT_DEPTH &&
1464: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1465: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1466: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1467: mr->depth);
1.1 frystyk 1468: Cleanup(mr, -1);
1469: }
1470:
1.23 manoli 1471: /* Testing that HTTrace is working */
1.62 frystyk 1472: if (mr->flags & MR_TIME) {
1473: if (SHOW_REAL_QUIET(mr)) {
1474: time_t local = time(NULL);
1475: HTTrace("Welcome to the W3C mini Robot - started on %s\n",
1476: HTDateTimeStr(&local, YES));
1477: }
1478: }
1.23 manoli 1479:
1.1 frystyk 1480: /* Rule file specified? */
1481: if (mr->rules) {
1482: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1483: if (!HTLoadRules(rules))
1.62 frystyk 1484: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1485: HT_FREE(rules);
1.1 frystyk 1486: }
1487:
1488: /* Output file specified? */
1489: if (mr->outputfile) {
1490: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1491: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1492: mr->output = OUTPUT;
1493: }
1494: }
1495:
1.48 frystyk 1496: /* Should we use persistent cache? */
1497: if (cache) {
1.54 frystyk 1498: HTCacheInit(cache_root, 20);
1.49 frystyk 1499: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1500: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1501: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1502:
1503: /* Should we start by flushing? */
1504: if (flush) HTCache_flushAll();
1505: }
1506:
1.58 frystyk 1507: /* CLF Log file specified? */
1.55 frystyk 1508: if (mr->logfile) {
1509: mr->log = HTLog_open(mr->logfile, YES, YES);
1510: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1511: }
1512:
1.58 frystyk 1513: /* Referer Log file specified? */
1.57 frystyk 1514: if (mr->reffile) {
1515: mr->ref = HTLog_open(mr->reffile, YES, YES);
1516: if (mr->ref)
1517: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1518: }
1.1 frystyk 1519:
1.58 frystyk 1520: /* Not found error log specified? */
1521: if (mr->notfoundfile) {
1522: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1523: if (mr->notfound)
1524: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1525: }
1526:
1527: /* Negotiated resource log specified? */
1528: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1529:
1530: /* No alt tags log file specified? */
1531: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1532:
1533: /* Reject Log file specified? */
1534: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1535:
1536: /* Register our own terminate filter */
1.32 frystyk 1537: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1538:
1539: /* Setting event timeout */
1540: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1541:
1.56 frystyk 1542: mr->time = HTGetTimeInMillis();
1.37 frystyk 1543:
1.34 eric 1544: /* Start the request */
1545: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1546:
1547: /*
1548: ** Make sure that the first request is flushed immediately and not
1549: ** buffered in the output buffer
1550: */
1551: HTRequest_setFlush(finger->request, YES);
1552:
1553: /*
1.48 frystyk 1554: ** Check whether we should do some kind of cache validation on
1555: ** the load
1556: */
1557: if (mr->flags & MR_VALIDATE)
1558: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1559: if (mr->flags & MR_END_VALIDATE)
1560: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1561:
1562: /*
1.43 frystyk 1563: ** Now do the load
1564: */
1.34 eric 1565: if (mr->flags & MR_PREEMPTIVE)
1566: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1567:
1568: if (keywords) /* Search */
1.34 eric 1569: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1570: else
1.34 eric 1571: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1572:
1.5 frystyk 1573: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1574: if (status != YES) {
1.62 frystyk 1575: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1576: Cleanup(mr, -1);
1577: }
1578:
1579: /* Go into the event loop... */
1.34 eric 1580: HTEventList_loop(finger->request);
1.1 frystyk 1581:
1582: /* Only gets here if event loop fails */
1583: Cleanup(mr, 0);
1584: return 0;
1585: }
Webmaster