Annotation of libwww/Robot/src/HTRobot.c, revision 1.69
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
1.58 frystyk 25: #ifdef HT_POSIX_REGEX
1.64 frystyk 26: #ifdef HAVE_RXPOSIX_H
27: #include <rxposix.h>
28: #else
1.62 frystyk 29: #ifdef HAVE_REGEX_H
30: #include <regex.h>
31: #endif
32: #endif
1.60 frystyk 33: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 34: #endif
35:
/* Use a placeholder version string when the build does not provide one */
#ifndef W3C_VERSION
#define W3C_VERSION             "Unspecified"
#endif

/* Application identity, as printed by VersionInfo() */
#define APP_NAME                "W3CRobot"
#define APP_VERSION             W3C_VERSION
#define COMMAND_LINE            "http://www.w3.org/Robot/User/CommandLine"

/* Default names for the output, rule and log files */
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"
#define DEFAULT_LOG_FILE        "log-clf.txt"
#define DEFAULT_HIT_FILE        "log-hit.txt"
#define DEFAULT_REL_FILE        "log-rel.txt"
#define DEFAULT_LM_FILE         "log-lastmodified.txt"
#define DEFAULT_TITLE_FILE      "log-title.txt"
#define DEFAULT_REFERER_FILE    "log-referer.txt"
#define DEFAULT_REJECT_FILE     "log-reject.txt"
#define DEFAULT_NOTFOUND_FILE   "log-notfound.txt"
#define DEFAULT_CONNEG_FILE     "log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE   "log-alt.txt"
#define DEFAULT_FORMAT_FILE     "log-format.txt"
#define DEFAULT_CHARSET_FILE    "log-charset.txt"
#define DEFAULT_MEMLOG          "robot.mem"

/* Other defaults */
#define DEFAULT_PREFIX          ""
#define DEFAULT_IMG_PREFIX      ""
#define DEFAULT_DEPTH           0
#define DEFAULT_DELAY           50              /* Write delay in ms */

/* Defaults for the SQL log */
#define DEFAULT_SQL_SERVER      "localhost"
#define DEFAULT_SQL_DB          "webbot"
#define DEFAULT_SQL_USER        "webbot"
#define DEFAULT_SQL_PW          ""

/* Memory logging is compiled out; flip to "#if 1" to enable it */
#if 0
#define HT_MEMLOG               /* Is expensive in performance! */
#endif

/* #define SHOW_MSG             (WWWTRACE || HTAlert_interactive()) */

/* True when a robot exists and the corresponding "quiet" flag is NOT set */
#define SHOW_QUIET(mr)          ((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr)     ((mr) && !((mr)->flags & MR_REAL_QUIET))

#define DEFAULT_TIMEOUT         50000           /* timeout in millis */

/* SetSignal() is only compiled when CATCH_SIG is defined */
#if defined(__svr4__)
#define CATCH_SIG
#endif
82:
/*
**  Option bits stored in Robot.flags. Each constant occupies its own bit
**  so that several options can be OR'ed together.
*/
typedef enum _MRFlags {
    MR_IMG              = 1 << 0,
    MR_LINK             = 1 << 1,
    MR_PREEMPTIVE       = 1 << 2,
    MR_TIME             = 1 << 3,
    MR_SAVE             = 1 << 4,
    MR_QUIET            = 1 << 5,
    MR_REAL_QUIET       = 1 << 6,
    MR_VALIDATE         = 1 << 7,
    MR_END_VALIDATE     = 1 << 8,
    MR_KEEP_META        = 1 << 9,
    MR_LOGGING          = 1 << 10,
    MR_DISTRIBUTIONS    = 1 << 11
} MRFlags;
97:
98: typedef struct _Robot {
1.2 frystyk 99: int depth; /* How deep is our tree */
1.30 frystyk 100: int cnt; /* Count of requests */
1.2 frystyk 101: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 102: HTList * htext; /* List of our HText Objects */
1.34 eric 103: HTList * fingers;
1.59 frystyk 104:
1.40 frystyk 105: int timer;
1.65 frystyk 106: char * cwd; /* Current dir URL */
1.1 frystyk 107: char * rules;
1.55 frystyk 108: char * prefix;
1.59 frystyk 109: char * img_prefix;
110:
1.60 frystyk 111: char * logfile; /* clf log */
1.55 frystyk 112: HTLog * log;
1.60 frystyk 113: char * reffile; /* referer log */
1.57 frystyk 114: HTLog * ref;
1.60 frystyk 115: char * rejectfile; /* unchecked links */
1.58 frystyk 116: HTLog * reject;
1.60 frystyk 117: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 118: HTLog * notfound;
1.60 frystyk 119: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 120: HTLog * conneg;
1.60 frystyk 121: char * noalttagfile; /* images without alt tags*/
122: HTLog * noalttag;
123:
124: char * hitfile; /* links sorted after hit counts */
1.64 frystyk 125: char * relfile; /* link sorted after relationships */
126: HTLinkType relation; /* Specific relation to look for */
1.63 frystyk 127: char * titlefile; /* links with titles */
1.60 frystyk 128: char * mtfile; /* media types encountered */
129: char * charsetfile; /* charsets encountered */
1.63 frystyk 130: char * lmfile; /* sortef after last modified dates */
1.60 frystyk 131:
132: char * outputfile;
1.1 frystyk 133: FILE * output;
1.59 frystyk 134:
1.1 frystyk 135: MRFlags flags;
1.55 frystyk 136:
1.59 frystyk 137: long get_bytes; /* Total number of bytes processed using GET*/
138: long get_docs; /* Total number of documents using GET */
139:
140: long head_bytes; /* bytes processed bytes processed using HEAD */
141: long head_docs; /* Total number of documents using HEAD*/
142:
143: long other_docs;
144:
1.56 frystyk 145: ms_t time; /* Time of run */
1.58 frystyk 146:
147: #ifdef HT_POSIX_REGEX
148: regex_t * include;
149: regex_t * exclude;
150: regex_t * check;
151: #endif
152:
1.68 frystyk 153: #ifdef HT_MYSQL
154: HTSQLLog * sqllog;
155: char * sqlserver;
156: char * sqldb;
157: char * sqluser;
158: char * sqlpw;
159: char * sqlrelative;
160: BOOL sqlexternals;
161: int sqlflags;
162: #endif
163:
1.1 frystyk 164: } Robot;
1.34 eric 165:
166: typedef struct _Finger {
167: Robot * robot;
168: HTRequest * request;
169: HTParentAnchor * dest;
170: } Finger;
171:
/*
**  Load state recorded in a HyperDoc. A fresh HyperDoc starts out as
**  L_INVALID.
*/
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   =  0,
    L_ERROR     =  1
} LoadState;
178:
179: /*
180: ** The HyperDoc object is bound to the anchor and contains information about
181: ** where we are in the search for recursive searches
182: */
183: typedef struct _HyperDoc {
184: HTParentAnchor * anchor;
185: LoadState state;
186: int depth;
1.55 frystyk 187: int hits;
1.1 frystyk 188: } HyperDoc;
189:
190: /*
1.65 frystyk 191: ** This is the HText object that is created every time we start parsing an
1.1 frystyk 192: ** HTML object
193: */
1.4 frystyk 194: struct _HText {
1.1 frystyk 195: HTRequest * request;
1.65 frystyk 196: BOOL follow;
1.4 frystyk 197: };
1.1 frystyk 198:
1.58 frystyk 199: /*
200: ** A structure for calculating metadata distributions
201: */
202: typedef struct _MetaDist {
203: HTAtom * name;
204: int hits;
205: } MetaDist;
206:
207: /*
208: ** Some sorting algorithms
209: */
1.63 frystyk 210: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 211:
1.1 frystyk 212: PUBLIC HText * HTMainText = NULL;
213: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
214: PUBLIC HTStyleSheet * styleSheet = NULL;
215:
216: /* ------------------------------------------------------------------------- */
217:
1.13 eric 218: /* Standard (non-error) Output
219: ** ---------------------------
220: */
221: PUBLIC int OutputData(const char * fmt, ...)
222: {
223: int ret;
224: va_list pArgs;
225: va_start(pArgs, fmt);
226: ret = vfprintf(stdout, fmt, pArgs);
227: va_end(pArgs);
228: return ret;
229: }
230:
231: /* ------------------------------------------------------------------------- */
232:
1.2 frystyk 233: /* Create a "HyperDoc" object
234: ** --------------------------
235: ** A HyperDoc object contains information about whether we have already
236: ** started checking the anchor and the depth in our search
237: */
238: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
239: {
240: HyperDoc * hd;
1.14 frystyk 241: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
242: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 243: hd->state = L_INVALID;
244: hd->depth = depth;
1.55 frystyk 245: hd->hits = 1;
1.2 frystyk 246:
247: /* Bind the HyperDoc object together with the Anchor Object */
248: hd->anchor = anchor;
249: HTAnchor_setDocument(anchor, (void *) hd);
250:
251: /* Add this HyperDoc object to our list */
252: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
253: HTList_addObject(mr->hyperdoc, (void *) hd);
254: return hd;
255: }
256:
257: /* Delete a "HyperDoc" object
258: ** --------------------------
259: */
260: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
261: {
262: if (hd) {
1.11 frystyk 263: HT_FREE (hd);
1.2 frystyk 264: return YES;
265: }
266: return NO;
267: }
268:
1.55 frystyk 269: /*
270: ** Sort the anchor array and log reference count
271: */
272: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
273: {
274: if (mr && array) {
275: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
276: if (log) {
277: void ** data = NULL;
278: HTParentAnchor * anchor = NULL;
279: HTArray_sort(array, HitSort);
280: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
281: while (anchor) {
282: char * uri = HTAnchor_address((HTAnchor *) anchor);
283: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 284: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 285: HT_FREE(uri);
286: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
287: }
288: }
289: HTLog_close(log);
290: return YES;
291: }
292: return NO;
293: }
294:
295: PRIVATE int HitSort (const void * a, const void * b)
296: {
297: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
298: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
299: if (aa && bb) return (bb->hits - aa->hits);
300: return bb - aa;
301: }
302:
1.58 frystyk 303: /*
1.64 frystyk 304: ** Sort the anchor array and log link relations
305: */
306: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
307: {
308: if (mr && array) {
1.68 frystyk 309: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
310: void ** data = NULL;
311: HTParentAnchor * anchor = NULL;
312: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
313: while (anchor) {
314:
315: /*
316: ** If we have a specific link relation to look for then do this.
317: ** Otherwise look for all link relations.
318: */
319: if (mr->relation) {
320: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
321: if (link) {
322: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
323: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
324: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
325: if (src_uri && dest_uri) {
326: #ifdef HT_MYSQL
327: if (mr->sqllog) {
328: HTSQLLog_addLinkRelationship (mr->sqllog,
329: src_uri, dest_uri,
330: HTAtom_name(mr->relation),
331: NULL);
332: }
333: #endif
334: if (log) {
335: HTFormat format = HTAnchor_format(dest);
336: HTLog_addText(log, "%s %s %s --> %s\n",
337: HTAtom_name(mr->relation),
338: format != WWW_UNKNOWN ?
339: HTAtom_name(format) : "<unknown>",
340: src_uri, dest_uri);
341: }
342:
343: /* Cleanup */
344: HT_FREE(src_uri);
345: HT_FREE(dest_uri);
346: }
347: }
348: } else {
349: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
350: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
351: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
352: HTLinkType linktype;
353:
354: /* First look in the main link */
355: if (link && (linktype = HTLink_type(link))) {
356: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
357: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
358: if (src_uri && dest_uri) {
359: #ifdef HT_MYSQL
360: if (mr->sqllog) {
361: HTSQLLog_addLinkRelationship (mr->sqllog,
362: src_uri, dest_uri,
363: HTAtom_name(linktype),
364: NULL);
365: }
366: #endif
367: if (log) {
368: HTFormat format = HTAnchor_format(dest);
369: HTLog_addText(log, "%s %s %s --> %s\n",
370: HTAtom_name(linktype),
371: format != WWW_UNKNOWN ?
372: HTAtom_name(format) : "<unknown>",
373: src_uri, dest_uri);
374: }
375: }
376: HT_FREE(dest_uri);
377: }
378:
379: /* and then in any sublinks */
380: if (sublinks) {
381: HTLink * pres;
382: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
383: if ((linktype = HTLink_type(pres))) {
384: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 385: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 frystyk 386: if (src_uri && dest_uri) {
387: #ifdef HT_MYSQL
388: if (mr->sqllog) {
389: HTSQLLog_addLinkRelationship (mr->sqllog,
390: src_uri, dest_uri,
391: HTAtom_name(linktype),
392: NULL);
393: }
394: #endif
395: if (log) {
396: HTFormat format = HTAnchor_format(dest);
397: HTLog_addText(log, "%s %s %s --> %s\n",
398: HTAtom_name(linktype),
399: format != WWW_UNKNOWN ?
400: HTAtom_name(format) : "<unknown>",
401: src_uri, dest_uri);
402: }
1.64 frystyk 403: HT_FREE(dest_uri);
404: }
405: }
406: }
407: }
1.68 frystyk 408:
409: /* Cleanup */
410: HT_FREE(src_uri);
411: }
412: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 413: }
1.68 frystyk 414: if (log) HTLog_close(log);
1.64 frystyk 415: return YES;
416: }
417: return NO;
418: }
419:
420: /*
1.63 frystyk 421: ** Sort the anchor array and log last modified date
422: */
423: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
424: {
425: if (mr && array) {
426: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
427: if (log) {
428: void ** data = NULL;
429: HTParentAnchor * anchor = NULL;
430: HTArray_sort(array, LastModifiedSort);
431: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
432: while (anchor) {
433: char * uri = HTAnchor_address((HTAnchor *) anchor);
434: time_t lm = HTAnchor_lastModified(anchor);
435: if (uri && lm > 0)
436: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
437: HT_FREE(uri);
438: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
439: }
440: }
441: HTLog_close(log);
442: return YES;
443: }
444: return NO;
445: }
446:
447: PRIVATE int LastModifiedSort (const void * a, const void * b)
448: {
449: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
450: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
451: return bb - aa;
452: }
453:
454: /*
455: ** Sort the anchor array and log the document title
456: */
457: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
458: {
459: if (mr && array) {
460: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
461: if (log) {
462: void ** data = NULL;
463: HTParentAnchor * anchor = NULL;
464: HTArray_sort(array, TitleSort);
465: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
466: while (anchor) {
467: char * uri = HTAnchor_address((HTAnchor *) anchor);
468: const char * title = HTAnchor_title(anchor);
469: HTCharset charset = HTAnchor_charset(anchor);
470: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
471: charset ? HTAtom_name(charset) : "<none>",
472: title ? title : "<none>",
473: uri);
474: HT_FREE(uri);
475: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
476: }
477: }
478: HTLog_close(log);
479: return YES;
480: }
481: return NO;
482: }
483:
484: PRIVATE int TitleSort (const void * a, const void * b)
485: {
486: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
487: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
488: return strcasecomp(bb?bb:"", aa?aa:"");
489: }
490:
491: /*
1.58 frystyk 492: ** Calculate distributions for media types. The same mechanism
493: ** can be used for other characteristics with relatively
494: ** few outcomes.
495: */
496: PRIVATE HTList * mediatype_distribution (HTArray * array)
497: {
498: if (array) {
499: HTList * mt = HTList_new();
500: MetaDist * pres = NULL;
501: void ** data = NULL;
502: HTParentAnchor * anchor = NULL;
503: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
504: while (anchor) {
505: HTFormat format = HTAnchor_format(anchor);
506: if (format && format != WWW_UNKNOWN) {
507: HTList * cur = mt;
508:
509: /* If found then increase counter */
510: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
511: if (pres->name == format) {
512: pres->hits++;
513: break;
514: }
515: }
516:
517: /* If not found then add new format to list */
518: if (!pres) {
519: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
520: HT_OUTOFMEM("mediatype_distribution");
521: pres->name = format;
522: pres->hits = 1;
523: HTList_addObject(mt, pres);
524: HTList_insertionSort(mt, FormatSort);
525: }
526: }
527:
528: /* Find next anchor in array */
529: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
530: }
531: return mt;
532: }
533: return NULL;
534: }
535:
1.60 frystyk 536: /*
537: ** Calculate distributions for charsets. The same mechanism
538: ** can be used for other characteristics with relatively
539: ** few outcomes.
540: */
541: PRIVATE HTList * charset_distribution (HTArray * array)
542: {
543: if (array) {
544: HTList * cs = HTList_new();
545: MetaDist * pres = NULL;
546: void ** data = NULL;
547: HTParentAnchor * anchor = NULL;
548: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
549: while (anchor) {
550: HTCharset charset = HTAnchor_charset(anchor);
551: if (charset) {
552: HTList * cur = cs;
553:
554: /* If found then increase counter */
555: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
556: if (pres->name == charset) {
557: pres->hits++;
558: break;
559: }
560: }
561:
562: /* If not found then add new format to list */
563: if (!pres) {
564: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
565: HT_OUTOFMEM("charset_distribution");
566: pres->name = charset;
567: pres->hits = 1;
568: HTList_addObject(cs, pres);
569: HTList_insertionSort(cs, FormatSort);
570: }
571: }
572:
573: /* Find next anchor in array */
574: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
575: }
576: return cs;
577: }
578: return NULL;
579: }
580:
1.58 frystyk 581: PRIVATE int FormatSort (const void * a, const void * b)
582: {
583: MetaDist * aa = (MetaDist *) a;
584: MetaDist * bb = (MetaDist *) b;
585: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
586: }
587:
588: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
589: {
590: if (logfile && distribution) {
591: HTLog * log = HTLog_open(logfile, YES, YES);
592: if (log) {
593: HTList * cur = distribution;
594: MetaDist * pres;
595: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
596: if (pres->name) {
1.60 frystyk 597: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 598: }
599: }
600: HTLog_close(log);
601: }
602: }
603: return NO;
604: }
605:
606: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
607: {
608: if (distribution) {
609: HTList * cur = distribution;
610: MetaDist * pres;
611: while ((pres = (MetaDist *) HTList_nextObject(cur)))
612: HT_FREE(pres);
613: HTList_delete(distribution);
614: return YES;
615: }
616: return NO;
617: }
618:
619:
1.55 frystyk 620: /* Statistics
621: ** ----------
622: ** Calculates a bunch of statistics for the anchors traversed
623: */
624: PRIVATE BOOL calculate_statistics (Robot * mr)
625: {
1.59 frystyk 626: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 627: if (!mr) return NO;
628:
629: /* Calculate efficiency */
1.59 frystyk 630: if (mr->time > 0) {
1.56 frystyk 631: ms_t t = HTGetTimeInMillis() - mr->time;
632: if (t > 0) {
1.60 frystyk 633: double loadfactor = (mr->get_bytes / (t * 0.001));
634: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 635: double secs = t / 1000.0;
1.55 frystyk 636: char bytes[50];
1.62 frystyk 637: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 638: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 639: total_docs, secs, reqprsec);
1.59 frystyk 640:
641: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 642: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 643: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 644: mr->get_docs, bytes, loadfactor);
1.59 frystyk 645:
646: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 647: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 648: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 649: mr->head_docs, bytes);
1.55 frystyk 650: }
651: }
652:
653: /* Create an array of existing anchors */
1.59 frystyk 654: if (total_docs > 1) {
655: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 656: if (array) {
657:
1.63 frystyk 658: /* Distributions */
659: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 660: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 661: }
662:
1.55 frystyk 663: /* Sort after hit counts */
1.63 frystyk 664: if (mr->hitfile) {
665: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 666: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 667: mr->hitfile);
668: calculate_hits(mr, array);
669: }
670:
1.64 frystyk 671: /* Sort after link relations */
1.68 frystyk 672: #ifdef HT_MYSQL
673: if (mr->relfile || mr->sqllog) {
1.69 ! frystyk 674: #else
! 675: if (mr->relfile) {
! 676: #endif
1.68 frystyk 677: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.64 frystyk 678: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
679: mr->relfile);
680: calculate_linkRelations(mr, array);
681: }
682:
1.63 frystyk 683: /* Sort after modified date */
684: if (mr->lmfile) {
685: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 686: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 687: mr->lmfile);
688: calculate_lm(mr, array);
689: }
690:
691: /* Sort after title */
692: if (mr->titlefile) {
693: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 694: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 695: mr->titlefile);
696: calculate_title(mr, array);
697: }
1.55 frystyk 698:
1.58 frystyk 699: /* Find mediatype distribution */
700: if (mr->mtfile) {
701: HTList * mtdist = mediatype_distribution(array);
702: if (mtdist) {
1.63 frystyk 703: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 704: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 705: mr->mtfile);
1.58 frystyk 706: log_meta_distribution(mr->mtfile, mtdist);
707: delete_meta_distribution(mtdist);
708: }
709: }
1.55 frystyk 710:
1.60 frystyk 711: /* Find charset distribution */
712: if (mr->charsetfile) {
713: HTList * charsetdist = charset_distribution(array);
714: if (charsetdist) {
1.63 frystyk 715: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 716: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 717: mr->charsetfile);
1.60 frystyk 718: log_meta_distribution(mr->charsetfile, charsetdist);
719: delete_meta_distribution(charsetdist);
720: }
721: }
722:
1.55 frystyk 723: /* Add as may other stats here as you like */
1.60 frystyk 724: /* ... */
1.58 frystyk 725:
726: /* Delete the array */
1.55 frystyk 727: HTArray_delete(array);
728: }
729: }
730: return YES;
731: }
732:
1.1 frystyk 733: /* Create a Command Line Object
734: ** ----------------------------
735: */
736: PRIVATE Robot * Robot_new (void)
737: {
738: Robot * me;
1.41 frystyk 739: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 740: HT_OUTOFMEM("Robot_new");
1.2 frystyk 741: me->hyperdoc = HTList_new();
1.4 frystyk 742: me->htext = HTList_new();
1.40 frystyk 743: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 744: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 745: me->output = OUTPUT;
1.35 eric 746: me->cnt = 0;
1.34 eric 747: me->fingers = HTList_new();
1.1 frystyk 748: return me;
749: }
750:
751: /* Delete a Command Line Object
752: ** ----------------------------
753: */
1.62 frystyk 754: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 755: {
1.62 frystyk 756: if (mr) {
757: HTList_delete(mr->fingers);
1.55 frystyk 758:
759: /* Calculate statistics */
1.62 frystyk 760: calculate_statistics(mr);
1.55 frystyk 761:
1.62 frystyk 762: if (mr->hyperdoc) {
763: HTList * cur = mr->hyperdoc;
1.2 frystyk 764: HyperDoc * pres;
765: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
766: HyperDoc_delete(pres);
1.62 frystyk 767: HTList_delete(mr->hyperdoc);
1.2 frystyk 768: }
1.62 frystyk 769: if (mr->htext) {
770: HTList * cur = mr->htext;
1.4 frystyk 771: HText * pres;
772: while ((pres = (HText *) HTList_nextObject(cur)))
773: HText_free(pres);
1.62 frystyk 774: HTList_delete(mr->htext);
1.4 frystyk 775: }
1.62 frystyk 776:
777: /* Close all the log files */
1.63 frystyk 778: if (mr->flags & MR_LOGGING) {
1.64 frystyk 779: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 780: }
781:
1.62 frystyk 782: if (mr->log) {
783: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 784: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 785: HTLog_accessCount(mr->log), mr->logfile);
786: HTLog_close(mr->log);
787: }
788: if (mr->ref) {
789: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 790: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 791: HTLog_accessCount(mr->ref), mr->reffile);
792: HTLog_close(mr->ref);
793: }
794: if (mr->reject) {
795: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 796: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 797: HTLog_accessCount(mr->reject), mr->rejectfile);
798: HTLog_close(mr->reject);
799: }
800: if (mr->notfound) {
801: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 802: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 803: HTLog_accessCount(mr->notfound), mr->notfoundfile);
804: HTLog_close(mr->notfound);
805: }
806: if (mr->conneg) {
807: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 808: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 809: HTLog_accessCount(mr->conneg), mr->connegfile);
810: HTLog_close(mr->conneg);
811: }
812: if (mr->noalttag) {
813: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 814: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 815: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
816: HTLog_close(mr->noalttag);
817: }
818:
819: if (mr->output && mr->output != STDOUT) fclose(mr->output);
820:
821: if (mr->flags & MR_TIME) {
1.12 frystyk 822: time_t local = time(NULL);
1.62 frystyk 823: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 824: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 825: }
1.55 frystyk 826:
1.58 frystyk 827: #ifdef HT_POSIX_REGEX
1.62 frystyk 828: if (mr->include) {
829: regfree(mr->include);
830: HT_FREE(mr->include);
831: }
832: if (mr->exclude) {
833: regfree(mr->exclude);
834: HT_FREE(mr->exclude);
835: }
836: if (mr->check) {
837: regfree(mr->check);
838: HT_FREE(mr->check);
1.58 frystyk 839: }
840: #endif
841:
1.68 frystyk 842: #ifdef HT_MYSQL
843: if (mr->sqllog) {
844: HTSQLLog_close(mr->sqllog);
845: mr->sqllog = NULL;
846: }
847: #endif
848:
1.62 frystyk 849: HT_FREE(mr->cwd);
850: HT_FREE(mr->prefix);
851: HT_FREE(mr->img_prefix);
852: HT_FREE(mr);
1.1 frystyk 853: return YES;
854: }
855: return NO;
856: }
857:
1.2 frystyk 858: /*
1.34 eric 859: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 860: */
1.34 eric 861: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 862: {
1.34 eric 863: Finger * me;
864: HTRequest * request = HTRequest_new();
865: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
866: HT_OUTOFMEM("Finger_new");
867: me->robot = robot;
868: me->request = request;
869: me->dest = dest;
870: HTList_addObject(robot->fingers, (void *)me);
871:
1.48 frystyk 872: /* Set the context for this request */
1.34 eric 873: HTRequest_setContext (request, me);
1.48 frystyk 874:
875: /* Check the various flags to customize the request */
876: if (robot->flags & MR_PREEMPTIVE)
877: HTRequest_setPreemptive(request, YES);
878: if (robot->flags & MR_VALIDATE)
879: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
880: if (robot->flags & MR_END_VALIDATE)
881: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
882:
883: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 884: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 885:
886: /* Set the method for this request */
1.34 eric 887: HTRequest_setMethod(request, method);
888: robot->cnt++;
889: return me;
1.2 frystyk 890: }
891:
1.34 eric 892: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 893: {
1.34 eric 894: HTList_removeObject(me->robot->fingers, (void *)me);
895: me->robot->cnt--;
1.37 frystyk 896:
897: /*
898: ** If we are down at one request then flush the output buffer
899: */
900: if (me->request) {
901: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 902: HTRequest_delete(me->request);
1.37 frystyk 903: }
904:
905: /*
906: ** Delete the request and free myself
907: */
1.34 eric 908: HT_FREE(me);
909: return YES;
1.2 frystyk 910: }
911:
912: /*
913: ** Cleanup and make sure we close all connections including the persistent
914: ** ones
915: */
1.1 frystyk 916: PRIVATE void Cleanup (Robot * me, int status)
917: {
918: Robot_delete(me);
1.29 eric 919: HTProfile_delete();
1.50 frystyk 920: #ifdef HT_MEMLOG
1.39 eric 921: HTMemLog_close();
1.47 frystyk 922: #endif
923:
1.1 frystyk 924: #ifdef VMS
925: exit(status ? status : 1);
926: #else
927: exit(status ? status : 0);
928: #endif
929: }
930:
#ifdef CATCH_SIG
#include <signal.h>
/*	SetSignal
**	---------
**	This function sets up signal handlers. This might not be necessary to
**	call if the application has its own handlers (lossage on SVR4).
**
**	On some systems (SYSV) it is necessary to catch the SIGPIPE signal
**	when attempting to connect to a remote host where you normally should
**	get `connection refused' back. Here SIGPIPE is set to be ignored and
**	the outcome is traced.
*/
PRIVATE void SetSignal (void)
{
    if (signal(SIGPIPE, SIG_IGN) != SIG_ERR) {
	if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    } else {
	if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
1.58 frystyk 956: #ifdef HT_POSIX_REGEX
957: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
958: {
959: size_t length = regerror (errcode, compiled, NULL, 0);
960: char * str = NULL;
961: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
962: HT_OUTOFMEM("get_regerror");
963: (void) regerror (errcode, compiled, str, length);
964: return str;
965: }
966:
1.60 frystyk 967: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 968: {
969: regex_t * regex = NULL;
970: if (regex_str && *regex_str) {
971: int status;
972: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
973: HT_OUTOFMEM("get_regtype");
1.60 frystyk 974: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 975: char * err_msg = get_regerror(status, regex);
1.62 frystyk 976: if (SHOW_REAL_QUIET(mr))
977: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 978: HT_FREE(err_msg);
979: Cleanup(mr, -1);
980: }
981: }
982: return regex;
983: }
984: #endif
985:
1.1 frystyk 986: PRIVATE void VersionInfo (void)
987: {
1.62 frystyk 988: OutputData("W3C Sample Software\n\n");
989: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
990: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
991: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 992: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 993: }
994:
995: /* terminate_handler
996: ** -----------------
1.2 frystyk 997: ** This function is registered to handle the result of the request.
998: ** If no more requests are pending then terminate program
1.1 frystyk 999: */
1.32 frystyk 1000: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
1001: void * param, int status)
1.1 frystyk 1002: {
1.34 eric 1003: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 1004: Robot * mr = finger->robot;
1.62 frystyk 1005: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 1006:
1.68 frystyk 1007: #ifdef HT_MYSQL
1008: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
1009: #endif
1010:
1.58 frystyk 1011: /* Check if negotiated resource and whether we should log that*/
1012: if (mr->conneg) {
1013: HTAssocList * cur = HTResponse_variant(response);
1014: if (cur) {
1015: BOOL first = YES;
1016: HTChunk * buffer = HTChunk_new(128);
1017: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1018: HTAssoc * pres;
1.60 frystyk 1019: HTChunk_puts(buffer, uri);
1.58 frystyk 1020: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1021: char * value = HTAssoc_value(pres);
1022: if (first) {
1.60 frystyk 1023: HTChunk_puts(buffer, "\t(");
1.58 frystyk 1024: first = NO;
1025: } else
1026: HTChunk_puts(buffer, ", ");
1027:
1028: /* Output the name */
1029: HTChunk_puts(buffer, HTAssoc_name(pres));
1030:
1031: /* Only output the value if not empty string */
1.60 frystyk 1032: if (value && *value) {
1.58 frystyk 1033: HTChunk_puts(buffer, "=");
1034: HTChunk_puts(buffer, value);
1035: }
1036: }
1.60 frystyk 1037: if (!first) HTChunk_puts(buffer, ")");
1038: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 1039: HTChunk_delete(buffer);
1040: HT_FREE(uri);
1041: }
1042: }
1043:
1.55 frystyk 1044: /* Count the amount of body data that we have read */
1.59 frystyk 1045: if (HTRequest_method(request) == METHOD_GET) {
1046: int length = HTAnchor_length(HTRequest_anchor(request));
1047: if (length > 0) mr->get_bytes += length;
1048: mr->get_docs++;
1049: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 1050: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 1051: if (length > 0) mr->head_bytes += length;
1052: mr->head_docs++;
1053: } else {
1054: mr->other_docs++;
1.55 frystyk 1055: }
1056:
1.58 frystyk 1057: /* Cleanup the anchor so that we don't drown in metainformation */
1058: if (!(mr->flags & MR_KEEP_META))
1059: HTAnchor_clearHeader(HTRequest_anchor(request));
1060:
1.55 frystyk 1061: /* Delete this thread */
1.34 eric 1062: Finger_delete(finger);
1.55 frystyk 1063:
1064: /* Should we stop? */
1.46 eric 1065: if (mr->cnt <= 0) {
1.62 frystyk 1066: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 1067: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 1068: }
1.62 frystyk 1069: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 1070: return HT_OK;
1071: }
1072:
1073: /* ------------------------------------------------------------------------- */
1074: /* HTEXT INTERFACE */
1075: /* ------------------------------------------------------------------------- */
1076:
1077: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
1078: HTStream * stream)
1079: {
1080: HText * me;
1.34 eric 1081: Finger * finger = (Finger *) HTRequest_context(request);
1082: Robot * mr = finger->robot;
1.65 frystyk 1083: char * robots = NULL;
1084:
1.14 frystyk 1085: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1086: HT_OUTOFMEM("HText_new2");
1.4 frystyk 1087:
1088: /* Bind the HText object together with the Request Object */
1.1 frystyk 1089: me->request = request;
1.65 frystyk 1090: me->follow = YES;
1091:
1092: /* Check to see if we have any meta tags */
1093: if ((robots = HTAnchor_robots(anchor)) != NULL) {
1094: char * strval = NULL;
1095: char * ptr = NULL;
1096: char * token = NULL;
1097: StrAllocCopy(strval, robots);
1098: ptr = strval;
1099: while ((token = HTNextField(&ptr)) != NULL) {
1100: if (!strcasecomp(token, "nofollow")) {
1101: me->follow = NO;
1102: break;
1103: }
1104: }
1105: HT_FREE(strval);
1106: }
1.4 frystyk 1107:
1108: /* Add this HyperDoc object to our list */
1109: if (!mr->htext) mr->htext = HTList_new();
1110: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1111: return me;
1112: }
1113:
1.4 frystyk 1114: PUBLIC void HText_free (HText * me) {
1.11 frystyk 1115: if (me) HT_FREE (me);
1.4 frystyk 1116: }
1117:
1.1 frystyk 1118: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
1119: {
1120: if (text && anchor) {
1.34 eric 1121: Finger * finger = (Finger *) HTRequest_context(text->request);
1122: Robot * mr = finger->robot;
1.1 frystyk 1123: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1124: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1125: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1126: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1127: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1128: BOOL match = text->follow;
1.58 frystyk 1129: BOOL check = NO;
1.1 frystyk 1130:
1.55 frystyk 1131: if (!uri) return;
1.62 frystyk 1132: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 1133:
1134: if (hd) {
1.62 frystyk 1135: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 1136: hd->hits++;
1.68 frystyk 1137: #ifdef HT_MYSQL
1138: if (mr->sqllog) {
1139: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1140: if (ref_addr) {
1141: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1142: "referer", NULL);
1143: HT_FREE(ref_addr);
1144: }
1145: }
1146: #endif
1.58 frystyk 1147: HT_FREE(uri);
1148: return;
1149: }
1150:
1151: /* Check for prefix match */
1.65 frystyk 1152: if (match && mr->prefix) {
1153: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1154: }
1.58 frystyk 1155:
1156: #ifdef HT_POSIX_REGEX
1.69 ! frystyk 1157: /*
! 1158: ** Check for any regular expression. The include may override
! 1159: ** the prefix matching
! 1160: */
! 1161: if (mr->include) {
1.58 frystyk 1162: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1163: }
1164: if (match && mr->exclude) {
1165: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1166: }
1167: if (match && mr->check) {
1168: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1169: }
1170: #endif
1171:
1172: /* Test whether we already have a hyperdoc for this document */
1173: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 1174: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
1175: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1176: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 1177: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1178: HTRequest * newreq = newfinger->request;
1.2 frystyk 1179: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 1180: HTRequest_setParent(newreq, referer);
1.58 frystyk 1181: if (check || depth >= mr->depth) {
1.62 frystyk 1182: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 1183: HTRequest_setMethod(newreq, METHOD_HEAD);
1184: } else {
1.62 frystyk 1185: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 1186: }
1187: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 1188: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 1189: Finger_delete(newfinger);
1.2 frystyk 1190: }
1.7 frystyk 1191: } else {
1.62 frystyk 1192: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1193: #ifdef HT_MYSQL
1194: if (mr->reject || mr->sqllog) {
1195: #else
1.60 frystyk 1196: if (mr->reject) {
1.68 frystyk 1197: #endif
1.60 frystyk 1198: if (referer) {
1199: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1200: if (mr->reject && ref_addr)
1201: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1202: #ifdef HT_MYSQL
1203: if (mr->sqllog && mr->sqlexternals && ref_addr)
1204: HTSQLLog_addLinkRelationship(mr->sqllog,
1205: ref_addr, uri,
1206: "referer", NULL);
1207: #endif
1208:
1.60 frystyk 1209: HT_FREE(ref_addr);
1210: }
1211: }
1.2 frystyk 1212: }
1.11 frystyk 1213: HT_FREE(uri);
1.2 frystyk 1214: }
1215: }
1216:
1217: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1218: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1219: {
1220: if (text && anchor) {
1.34 eric 1221: Finger * finger = (Finger *) HTRequest_context(text->request);
1222: Robot * mr = finger->robot;
1.59 frystyk 1223: if (mr->flags & MR_IMG) {
1.60 frystyk 1224: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1225: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1226: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1227: HyperDoc * hd = HTAnchor_document(dest_parent);
1228: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1229: BOOL match = YES;
1230:
1231: if (hd) {
1.62 frystyk 1232: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1233: hd->hits++;
1.68 frystyk 1234: #ifdef HT_MYSQL
1235: if (mr->sqllog) {
1236: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1237: if (ref_addr) {
1238: HTSQLLog_addLinkRelationship(mr->sqllog,
1239: ref_addr, uri,
1240: "image", alt);
1241: HT_FREE(ref_addr);
1242: }
1243: }
1244: #endif
1.11 frystyk 1245: HT_FREE(uri);
1.59 frystyk 1246: return;
1.2 frystyk 1247: }
1.59 frystyk 1248:
1249: /* Check for prefix match */
1250: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1251:
1252: /* Test whether we already have a hyperdoc for this document */
1253: if (match && dest) {
1.60 frystyk 1254: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1255: mr->flags & MR_SAVE ?
1256: METHOD_GET : METHOD_HEAD);
1257: HTRequest * newreq = newfinger->request;
1.60 frystyk 1258: HyperDoc_new(mr, dest_parent, 1);
1259: HTRequest_setParent(newreq, referer);
1260:
1261: /* Check whether we should report missing ALT tags */
1262: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1263: if (referer) {
1264: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1265: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1266: HT_FREE(ref_addr);
1267: }
1268: }
1269:
1.62 frystyk 1270: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1271: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1272: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1273: Finger_delete(newfinger);
1274: }
1275: } else {
1.62 frystyk 1276: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1277: #ifdef HT_MYSQL
1278: if (mr->reject || mr->sqllog) {
1279: #else
1.60 frystyk 1280: if (mr->reject) {
1.68 frystyk 1281: #endif
1.60 frystyk 1282: if (referer) {
1283: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1284: if (mr->reject && ref_addr)
1285: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1286: #ifdef HT_MYSQL
1287: if (mr->sqllog && mr->sqlexternals && ref_addr)
1288: HTSQLLog_addLinkRelationship(mr->sqllog,
1289: ref_addr, uri,
1290: "image", alt);
1291: #endif
1292:
1.60 frystyk 1293: HT_FREE(ref_addr);
1294: }
1295: }
1.1 frystyk 1296: }
1.59 frystyk 1297: HT_FREE(uri);
1.1 frystyk 1298: }
1299: }
1300: }
1301:
1302: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1303: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1304: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1305: PUBLIC void HText_endAppend (HText * text) {}
1306: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1307: PUBLIC void HText_beginAppend (HText * text) {}
1308: PUBLIC void HText_appendParagraph (HText * text) {}
1309:
1.48 frystyk 1310: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1311: {
1312: return (vfprintf(stderr, fmt, pArgs));
1313: }
1314:
1.1 frystyk 1315: /* ------------------------------------------------------------------------- */
1316: /* MAIN PROGRAM */
1317: /* ------------------------------------------------------------------------- */
1318:
1319: int main (int argc, char ** argv)
1320: {
1.48 frystyk 1321: int status = 0;
1.1 frystyk 1322: int arg;
1.48 frystyk 1323: BOOL cache = NO; /* Use persistent cache */
1324: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1325: char * cache_root = NULL;
1.1 frystyk 1326: HTChunk * keywords = NULL; /* From command line */
1327: int keycnt = 0;
1.12 frystyk 1328: Robot * mr = NULL;
1.43 frystyk 1329: Finger * finger = NULL;
1330: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1331:
1332: /* Starts Mac GUSI socket library */
1333: #ifdef GUSI
1334: GUSISetup(GUSIwithSIOUXSockets);
1335: GUSISetup(GUSIwithInternetSockets);
1336: #endif
1337:
1338: #ifdef __MWERKS__ /* STR */
1339: InitGraf((Ptr) &qd.thePort);
1340: InitFonts();
1341: InitWindows();
1342: InitMenus(); TEInit();
1343: InitDialogs(nil);
1344: InitCursor();
1345: SIOUXSettings.asktosaveonclose = false;
1346: argc=ccommand(&argv);
1.50 frystyk 1347: #endif /* __MWERKS__ */
1.1 frystyk 1348:
1.50 frystyk 1349: #ifdef HT_MEMLOG
1.51 frystyk 1350: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1351: #endif
1.46 eric 1352:
1.27 frystyk 1353: /* Initiate W3C Reference Library with a robot profile */
1354: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1355: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1356:
1357: /* Add the default HTML parser to the set of converters */
1358: {
1359: HTList * converters = HTFormat_conversion();
1360: HTMLInit(converters);
1361: }
1.1 frystyk 1362:
1.12 frystyk 1363: /* Build a new robot object */
1364: mr = Robot_new();
1365:
1.1 frystyk 1366: /* Scan command Line for parameters */
1367: for (arg=1; arg<argc; arg++) {
1368: if (*argv[arg] == '-') {
1369:
1370: /* non-interactive */
1.17 frystyk 1371: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1372: HTAlert_setInteractive(NO);
1373:
1.62 frystyk 1374: /* help */
1375: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1376: VersionInfo();
1377: Cleanup(mr, 0);
1378:
1.63 frystyk 1379: /* clf log file */
1.1 frystyk 1380: } else if (!strcmp(argv[arg], "-l")) {
1381: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1382: argv[++arg] : DEFAULT_LOG_FILE;
1.63 frystyk 1383: mr->flags |= MR_LOGGING;
1.1 frystyk 1384:
1.63 frystyk 1385: /* referer log file */
1.58 frystyk 1386: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1387: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1388: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 frystyk 1389: mr->flags |= MR_LOGGING;
1.57 frystyk 1390:
1.58 frystyk 1391: /* Not found error log file */
1392: } else if (!strncmp(argv[arg], "-404", 4)) {
1393: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1394: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 frystyk 1395: mr->flags |= MR_LOGGING;
1.58 frystyk 1396:
1397: /* reject log file */
1398: } else if (!strncmp(argv[arg], "-rej", 4)) {
1399: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1400: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 frystyk 1401: mr->flags |= MR_LOGGING;
1.58 frystyk 1402:
1.63 frystyk 1403: /* no alt tags log file */
1404: } else if (!strncmp(argv[arg], "-alt", 4)) {
1405: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1406: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1407: mr->flags |= MR_LOGGING;
1408:
1409: /* negotiated resource log file */
1.58 frystyk 1410: } else if (!strncmp(argv[arg], "-neg", 4)) {
1411: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1412: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 frystyk 1413: mr->flags |= MR_LOGGING;
1414:
1415: /* hit file log */
1416: } else if (!strcmp(argv[arg], "-hit")) {
1417: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1418: argv[++arg] : DEFAULT_HIT_FILE;
1419: mr->flags |= MR_DISTRIBUTIONS;
1420:
1.64 frystyk 1421: /* link relations file log */
1422: } else if (!strcmp(argv[arg], "-rellog")) {
1423: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1424: argv[++arg] : DEFAULT_REL_FILE;
1425: mr->flags |= MR_DISTRIBUTIONS;
1426:
1427: /* Specific link relation to look for (only used i also -rellog) */
1428: } else if (!strcmp(argv[arg], "-relation")) {
1429: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
1430: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
1431: mr->flags |= MR_DISTRIBUTIONS;
1432:
1.63 frystyk 1433: /* last modified log file */
1434: } else if (!strcmp(argv[arg], "-lm")) {
1435: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1436: argv[++arg] : DEFAULT_LM_FILE;
1437: mr->flags |= MR_DISTRIBUTIONS;
1438:
1439: /* title log file */
1440: } else if (!strcmp(argv[arg], "-title")) {
1441: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
1442: argv[++arg] : DEFAULT_TITLE_FILE;
1443: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1444:
1445: /* mediatype distribution log file */
1446: } else if (!strncmp(argv[arg], "-for", 4)) {
1447: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1448: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 frystyk 1449: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1450:
1.60 frystyk 1451: /* charset distribution log file */
1452: } else if (!strncmp(argv[arg], "-char", 5)) {
1453: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1454: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 frystyk 1455: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1456:
1.55 frystyk 1457: /* rule file */
1.1 frystyk 1458: } else if (!strcmp(argv[arg], "-r")) {
1459: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1460: argv[++arg] : DEFAULT_RULE_FILE;
1461:
1462: /* output filename */
1463: } else if (!strcmp(argv[arg], "-o")) {
1464: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1465: argv[++arg] : DEFAULT_OUTPUT_FILE;
1466:
1.55 frystyk 1467: /* URI prefix */
1468: } else if (!strcmp(argv[arg], "-prefix")) {
1469: char * prefix = NULL;
1470: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1471: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1472: if (*prefix && *prefix != '*') {
1.55 frystyk 1473: StrAllocCopy(mr->prefix, prefix);
1474: StrAllocCat(mr->prefix, "*");
1475: }
1476:
1.1 frystyk 1477: /* timeout -- Change the default request timeout */
1478: } else if (!strcmp(argv[arg], "-timeout")) {
1479: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1480: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1481: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1482:
1.54 frystyk 1483: /* Force no pipelined requests */
1484: } else if (!strcmp(argv[arg], "-nopipe")) {
1.64 frystyk 1485: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
1.54 frystyk 1486:
1.48 frystyk 1487: /* Start the persistent cache */
1488: } else if (!strcmp(argv[arg], "-cache")) {
1489: cache = YES;
1490:
1.54 frystyk 1491: /* Determine the cache root */
1492: } else if (!strcmp(argv[arg], "-cacheroot")) {
1493: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1494: argv[++arg] : NULL;
1.51 frystyk 1495:
1.52 frystyk 1496: /* Stream write flush delay in ms */
1497: } else if (!strcmp(argv[arg], "-delay")) {
1498: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1499: atoi(argv[++arg]) : DEFAULT_DELAY;
1500: HTHost_setDefaultWriteDelay(delay);
1501:
1.48 frystyk 1502: /* Persistent cache flush */
1503: } else if (!strcmp(argv[arg], "-flush")) {
1504: flush = YES;
1505:
1506: /* Do a cache validation */
1507: } else if (!strcmp(argv[arg], "-validate")) {
1508: mr->flags |= MR_VALIDATE;
1509:
1510: /* Do an end-to-end cache-validation */
1511: } else if (!strcmp(argv[arg], "-endvalidate")) {
1512: mr->flags |= MR_END_VALIDATE;
1513:
1.7 frystyk 1514: /* preemptive or non-preemptive access */
1.1 frystyk 1515: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1516: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1517:
1518: /* test inlined images */
1519: } else if (!strcmp(argv[arg], "-img")) {
1520: mr->flags |= MR_IMG;
1.45 frystyk 1521:
1522: /* load inlined images */
1523: } else if (!strcmp(argv[arg], "-saveimg")) {
1524: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1525:
1526: /* URI prefix for inlined images */
1527: } else if (!strcmp(argv[arg], "-imgprefix")) {
1528: char * prefix = NULL;
1529: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1530: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1531: if (*prefix && *prefix!='*') {
1.59 frystyk 1532: StrAllocCopy(mr->img_prefix, prefix);
1533: StrAllocCat(mr->img_prefix, "*");
1534: }
1.2 frystyk 1535:
1536: /* load anchors */
1.58 frystyk 1537: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1538: mr->flags |= MR_LINK;
1.7 frystyk 1539: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1540: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1541:
1.12 frystyk 1542: /* Output start and end time */
1543: } else if (!strcmp(argv[arg], "-ss")) {
1544: mr->flags |= MR_TIME;
1545:
1.1 frystyk 1546: /* print version and exit */
1547: } else if (!strcmp(argv[arg], "-version")) {
1548: VersionInfo();
1549: Cleanup(mr, 0);
1.46 eric 1550:
1551: /* run in quiet mode */
1552: } else if (!strcmp(argv[arg], "-q")) {
1553: mr->flags |= MR_QUIET;
1.1 frystyk 1554:
1.62 frystyk 1555: /* run in really quiet mode */
1556: } else if (!strcmp(argv[arg], "-Q")) {
1557: mr->flags |= MR_REAL_QUIET;
1558:
1.1 frystyk 1559: #ifdef WWWTRACE
1560: /* trace flags */
1561: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1562: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1563: #endif
1564:
1.58 frystyk 1565: #ifdef HT_POSIX_REGEX
1566:
1567: /* If we can link against a POSIX regex library */
1568: } else if (!strncmp(argv[arg], "-inc", 4)) {
1569: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1570: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1571: }
1572: } else if (!strncmp(argv[arg], "-exc", 4)) {
1573: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1574: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1575: }
1576: } else if (!strncmp(argv[arg], "-check", 6)) {
1577: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1578: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1579: }
1580: #endif
1581:
1.68 frystyk 1582: #ifdef HT_MYSQL
1583: /* If we can link against a MYSQL database library */
1584: } else if (!strncmp(argv[arg], "-sqldb", 5)) {
1585: mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
1586: argv[++arg] : DEFAULT_SQL_DB;
1587:
1588: } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
1589: mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;
1590:
1591: } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
1592: mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;
1593:
1594: } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
1595: mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;
1596:
1597: } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
1598: mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;
1599:
1600: } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
1601: mr->sqlexternals = YES;
1602:
1603: } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
1604: mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
1605: argv[++arg] : DEFAULT_SQL_PW;
1606:
1607: } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
1608: mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
1609: argv[++arg] : NULL;
1610:
1611: } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
1612: mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
1613: argv[++arg] : DEFAULT_SQL_SERVER;
1614:
1615: } else if (!strncmp(argv[arg], "-sqluser", 5)) {
1616: mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
1617: argv[++arg] : DEFAULT_SQL_USER;
1618:
1619: #endif
1620:
1.1 frystyk 1621: } else {
1.62 frystyk 1622: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1623: }
1.17 frystyk 1624: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1625: if (!keycnt) {
1626: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1627: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1628: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1629: keycnt = 1;
1.11 frystyk 1630: HT_FREE(ref);
1.1 frystyk 1631: } else { /* Check for successive keyword arguments */
1632: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1633: if (keycnt++ <= 1)
1.5 frystyk 1634: keywords = HTChunk_new(128);
1.1 frystyk 1635: else
1.5 frystyk 1636: HTChunk_putc(keywords, ' ');
1637: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1638: HT_FREE(escaped);
1.1 frystyk 1639: }
1640: }
1641: }
1642:
1643: #ifdef CATCH_SIG
1644: SetSignal();
1645: #endif
1646:
1647: if (!keycnt) {
1.62 frystyk 1648: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1649: Cleanup(mr, -1);
1650: }
1651:
1652: if (mr->depth != DEFAULT_DEPTH &&
1653: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1654: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1655: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1656: mr->depth);
1.1 frystyk 1657: Cleanup(mr, -1);
1658: }
1659:
1.23 manoli 1660: /* Testing that HTTrace is working */
1.62 frystyk 1661: if (mr->flags & MR_TIME) {
1662: if (SHOW_REAL_QUIET(mr)) {
1663: time_t local = time(NULL);
1.67 frystyk 1664: HTTrace("Welcome to the W3C mini Robot version %s - started on %s\n",
1665: APP_VERSION, HTDateTimeStr(&local, YES));
1.62 frystyk 1666: }
1667: }
1.23 manoli 1668:
1.1 frystyk 1669: /* Rule file specified? */
1670: if (mr->rules) {
1671: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1672: if (!HTLoadRules(rules))
1.62 frystyk 1673: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1674: HT_FREE(rules);
1.1 frystyk 1675: }
1676:
1677: /* Output file specified? */
1678: if (mr->outputfile) {
1679: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1680: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1681: mr->output = OUTPUT;
1682: }
1683: }
1684:
1.48 frystyk 1685: /* Should we use persistent cache? */
1686: if (cache) {
1.54 frystyk 1687: HTCacheInit(cache_root, 20);
1.49 frystyk 1688: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1689: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1690: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1691:
1692: /* Should we start by flushing? */
1693: if (flush) HTCache_flushAll();
1694: }
1.68 frystyk 1695:
1696: /* SQL Log specified? */
1697: #ifdef HT_MYSQL
1698: if (mr->sqlserver) {
1699: if ((mr->sqllog =
1.69 ! frystyk 1700: HTSQLLog_open(mr->sqlserver,
! 1701: mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
! 1702: mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW,
! 1703: mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
! 1704: mr->sqlflags)) != NULL) {
1.68 frystyk 1705: if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
1706: }
1707: }
1708: #endif
1.48 frystyk 1709:
1.58 frystyk 1710: /* CLF Log file specified? */
1.55 frystyk 1711: if (mr->logfile) {
1712: mr->log = HTLog_open(mr->logfile, YES, YES);
1713: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1714: }
1715:
1.58 frystyk 1716: /* Referer Log file specified? */
1.57 frystyk 1717: if (mr->reffile) {
1718: mr->ref = HTLog_open(mr->reffile, YES, YES);
1719: if (mr->ref)
1720: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1721: }
1.1 frystyk 1722:
1.58 frystyk 1723: /* Not found error log specified? */
1724: if (mr->notfoundfile) {
1725: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1726: if (mr->notfound)
1727: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1728: }
1729:
1730: /* Negotiated resource log specified? */
1731: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1732:
1733: /* No alt tags log file specified? */
1734: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1735:
1736: /* Reject Log file specified? */
1737: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1738:
1739: /* Register our own terminate filter */
1.32 frystyk 1740: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1741:
1742: /* Setting event timeout */
1743: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1744:
1.56 frystyk 1745: mr->time = HTGetTimeInMillis();
1.37 frystyk 1746:
1.34 eric 1747: /* Start the request */
1748: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1749:
1750: /*
1751: ** Make sure that the first request is flushed immediately and not
1752: ** buffered in the output buffer
1753: */
1754: HTRequest_setFlush(finger->request, YES);
1755:
1756: /*
1.48 frystyk 1757: ** Check whether we should do some kind of cache validation on
1758: ** the load
1759: */
1760: if (mr->flags & MR_VALIDATE)
1761: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1762: if (mr->flags & MR_END_VALIDATE)
1763: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1764:
1765: /*
1.43 frystyk 1766: ** Now do the load
1767: */
1.34 eric 1768: if (mr->flags & MR_PREEMPTIVE)
1769: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1770:
1771: if (keywords) /* Search */
1.34 eric 1772: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1773: else
1.34 eric 1774: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1775:
1.5 frystyk 1776: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1777: if (status != YES) {
1.62 frystyk 1778: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1779: Cleanup(mr, -1);
1780: }
1781:
1782: /* Go into the event loop... */
1.34 eric 1783: HTEventList_loop(finger->request);
1.1 frystyk 1784:
1785: /* Only gets here if event loop fails */
1786: Cleanup(mr, 0);
1787: return 0;
1788: }
Webmaster