Annotation of libwww/Robot/src/HTRobot.c, revision 1.71
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.71 ! frystyk 20: #include "WWWSQL.h"
1.9 frystyk 21:
1.4 frystyk 22: #include "HText.h"
1.1 frystyk 23:
24: #include "HTRobot.h" /* Implemented here */
25:
1.58 frystyk 26: #ifdef HT_POSIX_REGEX
1.64 frystyk 27: #ifdef HAVE_RXPOSIX_H
28: #include <rxposix.h>
29: #else
1.62 frystyk 30: #ifdef HAVE_REGEX_H
31: #include <regex.h>
32: #endif
33: #endif
1.60 frystyk 34: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 35: #endif
36:
1.14 frystyk 37: #ifndef W3C_VERSION
1.33 eric 38: #define W3C_VERSION "Unspecified"
1.1 frystyk 39: #endif
40:
41: #define APP_NAME "W3CRobot"
1.14 frystyk 42: #define APP_VERSION W3C_VERSION
1.62 frystyk 43: #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine"
1.1 frystyk 44:
45: #define DEFAULT_OUTPUT_FILE "robot.out"
46: #define DEFAULT_RULE_FILE "robot.conf"
1.58 frystyk 47: #define DEFAULT_LOG_FILE "log-clf.txt"
48: #define DEFAULT_HIT_FILE "log-hit.txt"
1.64 frystyk 49: #define DEFAULT_REL_FILE "log-rel.txt"
1.63 frystyk 50: #define DEFAULT_LM_FILE "log-lastmodified.txt"
51: #define DEFAULT_TITLE_FILE "log-title.txt"
1.58 frystyk 52: #define DEFAULT_REFERER_FILE "log-referer.txt"
53: #define DEFAULT_REJECT_FILE "log-reject.txt"
54: #define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
55: #define DEFAULT_CONNEG_FILE "log-conneg.txt"
1.60 frystyk 56: #define DEFAULT_NOALTTAG_FILE "log-alt.txt"
1.58 frystyk 57: #define DEFAULT_FORMAT_FILE "log-format.txt"
1.60 frystyk 58: #define DEFAULT_CHARSET_FILE "log-charset.txt"
1.51 frystyk 59: #define DEFAULT_MEMLOG "robot.mem"
1.55 frystyk 60: #define DEFAULT_PREFIX ""
1.59 frystyk 61: #define DEFAULT_IMG_PREFIX ""
1.7 frystyk 62: #define DEFAULT_DEPTH 0
1.53 frystyk 63: #define DEFAULT_DELAY 50 /* Write delay in ms */
1.1 frystyk 64:
1.68 frystyk 65: #define DEFAULT_SQL_SERVER "localhost"
66: #define DEFAULT_SQL_DB "webbot"
67: #define DEFAULT_SQL_USER "webbot"
68: #define DEFAULT_SQL_PW ""
69:
1.51 frystyk 70: #if 0
1.65 frystyk 71: #define HT_MEMLOG /* Is expensive in performance! */
1.51 frystyk 72: #endif
73:
1.46 eric 74: /* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */
1.62 frystyk 75: #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
76: #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
1.1 frystyk 77:
1.66 frystyk 78: #define DEFAULT_TIMEOUT 50000 /* timeout in millis */
1.1 frystyk 79:
80: #if defined(__svr4__)
81: #define CATCH_SIG
82: #endif
83:
/*
**  Option bits stored in the "flags" field of the Robot object. Each
**  constant occupies its own bit so that they can be OR'ed together.
*/
typedef enum _MRFlags {
    MR_IMG            = 1 << 0,
    MR_LINK           = 1 << 1,
    MR_PREEMPTIVE     = 1 << 2,   /* Requests are marked preemptive */
    MR_TIME           = 1 << 3,   /* Print termination time on delete */
    MR_SAVE           = 1 << 4,
    MR_QUIET          = 1 << 5,   /* Tested by SHOW_QUIET() */
    MR_REAL_QUIET     = 1 << 6,   /* Tested by SHOW_REAL_QUIET() */
    MR_VALIDATE       = 1 << 7,   /* Reload mode HT_CACHE_VALIDATE */
    MR_END_VALIDATE   = 1 << 8,   /* Reload mode HT_CACHE_END_VALIDATE */
    MR_KEEP_META      = 1 << 9,
    MR_LOGGING        = 1 << 10,  /* Print the "Raw Log files" heading */
    MR_DISTRIBUTIONS  = 1 << 11   /* Print the "Distributions" heading */
} MRFlags;
98:
99: typedef struct _Robot {
1.2 frystyk 100: int depth; /* How deep is our tree */
1.30 frystyk 101: int cnt; /* Count of requests */
1.2 frystyk 102: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 103: HTList * htext; /* List of our HText Objects */
1.34 eric 104: HTList * fingers;
1.59 frystyk 105:
1.40 frystyk 106: int timer;
1.65 frystyk 107: char * cwd; /* Current dir URL */
1.1 frystyk 108: char * rules;
1.55 frystyk 109: char * prefix;
1.59 frystyk 110: char * img_prefix;
111:
1.60 frystyk 112: char * logfile; /* clf log */
1.55 frystyk 113: HTLog * log;
1.60 frystyk 114: char * reffile; /* referer log */
1.57 frystyk 115: HTLog * ref;
1.60 frystyk 116: char * rejectfile; /* unchecked links */
1.58 frystyk 117: HTLog * reject;
1.60 frystyk 118: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 119: HTLog * notfound;
1.60 frystyk 120: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 121: HTLog * conneg;
1.60 frystyk 122: char * noalttagfile; /* images without alt tags*/
123: HTLog * noalttag;
124:
125: char * hitfile; /* links sorted after hit counts */
1.64 frystyk 126: char * relfile; /* link sorted after relationships */
127: HTLinkType relation; /* Specific relation to look for */
1.63 frystyk 128: char * titlefile; /* links with titles */
1.60 frystyk 129: char * mtfile; /* media types encountered */
130: char * charsetfile; /* charsets encountered */
1.63 frystyk 131: char * lmfile; /* sortef after last modified dates */
1.60 frystyk 132:
133: char * outputfile;
1.1 frystyk 134: FILE * output;
1.59 frystyk 135:
1.1 frystyk 136: MRFlags flags;
1.55 frystyk 137:
1.59 frystyk 138: long get_bytes; /* Total number of bytes processed using GET*/
139: long get_docs; /* Total number of documents using GET */
140:
141: long head_bytes; /* bytes processed bytes processed using HEAD */
142: long head_docs; /* Total number of documents using HEAD*/
143:
144: long other_docs;
145:
1.56 frystyk 146: ms_t time; /* Time of run */
1.58 frystyk 147:
148: #ifdef HT_POSIX_REGEX
149: regex_t * include;
150: regex_t * exclude;
151: regex_t * check;
152: #endif
153:
1.68 frystyk 154: #ifdef HT_MYSQL
155: HTSQLLog * sqllog;
156: char * sqlserver;
157: char * sqldb;
158: char * sqluser;
159: char * sqlpw;
160: char * sqlrelative;
161: BOOL sqlexternals;
162: int sqlflags;
163: #endif
164:
1.1 frystyk 165: } Robot;
1.34 eric 166:
167: typedef struct _Finger {
168: Robot * robot;
169: HTRequest * request;
170: HTParentAnchor * dest;
171: } Finger;
172:
/*
**  Load state recorded in a HyperDoc object. A new HyperDoc starts out
**  as L_INVALID.
*/
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   =  0,
    L_ERROR     =  1
} LoadState;
179:
180: /*
181: ** The HyperDoc object is bound to the anchor and contains information about
182: ** where we are in the search for recursive searches
183: */
184: typedef struct _HyperDoc {
185: HTParentAnchor * anchor;
186: LoadState state;
187: int depth;
1.55 frystyk 188: int hits;
1.1 frystyk 189: } HyperDoc;
190:
191: /*
1.65 frystyk 192: ** This is the HText object that is created every time we start parsing an
1.1 frystyk 193: ** HTML object
194: */
1.4 frystyk 195: struct _HText {
1.1 frystyk 196: HTRequest * request;
1.65 frystyk 197: BOOL follow;
1.4 frystyk 198: };
1.1 frystyk 199:
1.58 frystyk 200: /*
201: ** A structure for calculating metadata distributions
202: */
203: typedef struct _MetaDist {
204: HTAtom * name;
205: int hits;
206: } MetaDist;
207:
208: /*
209: ** Some sorting algorithms
210: */
1.63 frystyk 211: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 212:
1.1 frystyk 213: PUBLIC HText * HTMainText = NULL;
214: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
215: PUBLIC HTStyleSheet * styleSheet = NULL;
216:
217: /* ------------------------------------------------------------------------- */
218:
1.13 eric 219: /* Standard (non-error) Output
220: ** ---------------------------
221: */
222: PUBLIC int OutputData(const char * fmt, ...)
223: {
224: int ret;
225: va_list pArgs;
226: va_start(pArgs, fmt);
227: ret = vfprintf(stdout, fmt, pArgs);
228: va_end(pArgs);
229: return ret;
230: }
231:
232: /* ------------------------------------------------------------------------- */
233:
1.2 frystyk 234: /* Create a "HyperDoc" object
235: ** --------------------------
236: ** A HyperDoc object contains information about whether we have already
237: ** started checking the anchor and the depth in our search
238: */
239: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
240: {
241: HyperDoc * hd;
1.14 frystyk 242: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
243: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 244: hd->state = L_INVALID;
245: hd->depth = depth;
1.55 frystyk 246: hd->hits = 1;
1.2 frystyk 247:
248: /* Bind the HyperDoc object together with the Anchor Object */
249: hd->anchor = anchor;
250: HTAnchor_setDocument(anchor, (void *) hd);
251:
252: /* Add this HyperDoc object to our list */
253: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
254: HTList_addObject(mr->hyperdoc, (void *) hd);
255: return hd;
256: }
257:
258: /* Delete a "HyperDoc" object
259: ** --------------------------
260: */
261: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
262: {
263: if (hd) {
1.11 frystyk 264: HT_FREE (hd);
1.2 frystyk 265: return YES;
266: }
267: return NO;
268: }
269:
1.55 frystyk 270: /*
271: ** Sort the anchor array and log reference count
272: */
273: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
274: {
275: if (mr && array) {
276: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
277: if (log) {
278: void ** data = NULL;
279: HTParentAnchor * anchor = NULL;
280: HTArray_sort(array, HitSort);
281: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
282: while (anchor) {
283: char * uri = HTAnchor_address((HTAnchor *) anchor);
284: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 285: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 286: HT_FREE(uri);
287: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
288: }
289: }
290: HTLog_close(log);
291: return YES;
292: }
293: return NO;
294: }
295:
296: PRIVATE int HitSort (const void * a, const void * b)
297: {
298: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
299: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
300: if (aa && bb) return (bb->hits - aa->hits);
301: return bb - aa;
302: }
303:
1.58 frystyk 304: /*
1.64 frystyk 305: ** Sort the anchor array and log link relations
306: */
307: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
308: {
309: if (mr && array) {
1.68 frystyk 310: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
311: void ** data = NULL;
312: HTParentAnchor * anchor = NULL;
313: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
314: while (anchor) {
315:
316: /*
317: ** If we have a specific link relation to look for then do this.
318: ** Otherwise look for all link relations.
319: */
320: if (mr->relation) {
321: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
322: if (link) {
323: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
324: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
325: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
326: if (src_uri && dest_uri) {
327: #ifdef HT_MYSQL
328: if (mr->sqllog) {
329: HTSQLLog_addLinkRelationship (mr->sqllog,
330: src_uri, dest_uri,
331: HTAtom_name(mr->relation),
332: NULL);
333: }
334: #endif
335: if (log) {
336: HTFormat format = HTAnchor_format(dest);
337: HTLog_addText(log, "%s %s %s --> %s\n",
338: HTAtom_name(mr->relation),
339: format != WWW_UNKNOWN ?
340: HTAtom_name(format) : "<unknown>",
341: src_uri, dest_uri);
342: }
343:
344: /* Cleanup */
345: HT_FREE(src_uri);
346: HT_FREE(dest_uri);
347: }
348: }
349: } else {
350: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
351: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
352: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
353: HTLinkType linktype;
354:
355: /* First look in the main link */
356: if (link && (linktype = HTLink_type(link))) {
357: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
358: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
359: if (src_uri && dest_uri) {
360: #ifdef HT_MYSQL
361: if (mr->sqllog) {
362: HTSQLLog_addLinkRelationship (mr->sqllog,
363: src_uri, dest_uri,
364: HTAtom_name(linktype),
365: NULL);
366: }
367: #endif
368: if (log) {
369: HTFormat format = HTAnchor_format(dest);
370: HTLog_addText(log, "%s %s %s --> %s\n",
371: HTAtom_name(linktype),
372: format != WWW_UNKNOWN ?
373: HTAtom_name(format) : "<unknown>",
374: src_uri, dest_uri);
375: }
376: }
377: HT_FREE(dest_uri);
378: }
379:
380: /* and then in any sublinks */
381: if (sublinks) {
382: HTLink * pres;
383: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
384: if ((linktype = HTLink_type(pres))) {
385: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 386: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 frystyk 387: if (src_uri && dest_uri) {
388: #ifdef HT_MYSQL
389: if (mr->sqllog) {
390: HTSQLLog_addLinkRelationship (mr->sqllog,
391: src_uri, dest_uri,
392: HTAtom_name(linktype),
393: NULL);
394: }
395: #endif
396: if (log) {
397: HTFormat format = HTAnchor_format(dest);
398: HTLog_addText(log, "%s %s %s --> %s\n",
399: HTAtom_name(linktype),
400: format != WWW_UNKNOWN ?
401: HTAtom_name(format) : "<unknown>",
402: src_uri, dest_uri);
403: }
1.64 frystyk 404: HT_FREE(dest_uri);
405: }
406: }
407: }
408: }
1.68 frystyk 409:
410: /* Cleanup */
411: HT_FREE(src_uri);
412: }
413: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 414: }
1.68 frystyk 415: if (log) HTLog_close(log);
1.64 frystyk 416: return YES;
417: }
418: return NO;
419: }
420:
421: /*
1.63 frystyk 422: ** Sort the anchor array and log last modified date
423: */
424: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
425: {
426: if (mr && array) {
427: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
428: if (log) {
429: void ** data = NULL;
430: HTParentAnchor * anchor = NULL;
431: HTArray_sort(array, LastModifiedSort);
432: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
433: while (anchor) {
434: char * uri = HTAnchor_address((HTAnchor *) anchor);
435: time_t lm = HTAnchor_lastModified(anchor);
436: if (uri && lm > 0)
437: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
438: HT_FREE(uri);
439: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
440: }
441: }
442: HTLog_close(log);
443: return YES;
444: }
445: return NO;
446: }
447:
448: PRIVATE int LastModifiedSort (const void * a, const void * b)
449: {
450: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
451: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
452: return bb - aa;
453: }
454:
455: /*
456: ** Sort the anchor array and log the document title
457: */
458: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
459: {
460: if (mr && array) {
461: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
462: if (log) {
463: void ** data = NULL;
464: HTParentAnchor * anchor = NULL;
465: HTArray_sort(array, TitleSort);
466: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
467: while (anchor) {
468: char * uri = HTAnchor_address((HTAnchor *) anchor);
469: const char * title = HTAnchor_title(anchor);
470: HTCharset charset = HTAnchor_charset(anchor);
471: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
472: charset ? HTAtom_name(charset) : "<none>",
473: title ? title : "<none>",
474: uri);
475: HT_FREE(uri);
476: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
477: }
478: }
479: HTLog_close(log);
480: return YES;
481: }
482: return NO;
483: }
484:
485: PRIVATE int TitleSort (const void * a, const void * b)
486: {
487: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
488: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
489: return strcasecomp(bb?bb:"", aa?aa:"");
490: }
491:
492: /*
1.58 frystyk 493: ** Calculate distributions for media types. The same mechanism
494: ** can be used for other characteristics with relatively
495: ** few outcomes.
496: */
497: PRIVATE HTList * mediatype_distribution (HTArray * array)
498: {
499: if (array) {
500: HTList * mt = HTList_new();
501: MetaDist * pres = NULL;
502: void ** data = NULL;
503: HTParentAnchor * anchor = NULL;
504: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
505: while (anchor) {
506: HTFormat format = HTAnchor_format(anchor);
507: if (format && format != WWW_UNKNOWN) {
508: HTList * cur = mt;
509:
510: /* If found then increase counter */
511: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
512: if (pres->name == format) {
513: pres->hits++;
514: break;
515: }
516: }
517:
518: /* If not found then add new format to list */
519: if (!pres) {
520: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
521: HT_OUTOFMEM("mediatype_distribution");
522: pres->name = format;
523: pres->hits = 1;
524: HTList_addObject(mt, pres);
525: HTList_insertionSort(mt, FormatSort);
526: }
527: }
528:
529: /* Find next anchor in array */
530: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
531: }
532: return mt;
533: }
534: return NULL;
535: }
536:
1.60 frystyk 537: /*
538: ** Calculate distributions for charsets. The same mechanism
539: ** can be used for other characteristics with relatively
540: ** few outcomes.
541: */
542: PRIVATE HTList * charset_distribution (HTArray * array)
543: {
544: if (array) {
545: HTList * cs = HTList_new();
546: MetaDist * pres = NULL;
547: void ** data = NULL;
548: HTParentAnchor * anchor = NULL;
549: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
550: while (anchor) {
551: HTCharset charset = HTAnchor_charset(anchor);
552: if (charset) {
553: HTList * cur = cs;
554:
555: /* If found then increase counter */
556: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
557: if (pres->name == charset) {
558: pres->hits++;
559: break;
560: }
561: }
562:
563: /* If not found then add new format to list */
564: if (!pres) {
565: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
566: HT_OUTOFMEM("charset_distribution");
567: pres->name = charset;
568: pres->hits = 1;
569: HTList_addObject(cs, pres);
570: HTList_insertionSort(cs, FormatSort);
571: }
572: }
573:
574: /* Find next anchor in array */
575: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
576: }
577: return cs;
578: }
579: return NULL;
580: }
581:
1.58 frystyk 582: PRIVATE int FormatSort (const void * a, const void * b)
583: {
584: MetaDist * aa = (MetaDist *) a;
585: MetaDist * bb = (MetaDist *) b;
586: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
587: }
588:
589: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
590: {
591: if (logfile && distribution) {
592: HTLog * log = HTLog_open(logfile, YES, YES);
593: if (log) {
594: HTList * cur = distribution;
595: MetaDist * pres;
596: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
597: if (pres->name) {
1.60 frystyk 598: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 599: }
600: }
601: HTLog_close(log);
602: }
603: }
604: return NO;
605: }
606:
607: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
608: {
609: if (distribution) {
610: HTList * cur = distribution;
611: MetaDist * pres;
612: while ((pres = (MetaDist *) HTList_nextObject(cur)))
613: HT_FREE(pres);
614: HTList_delete(distribution);
615: return YES;
616: }
617: return NO;
618: }
619:
620:
1.55 frystyk 621: /* Statistics
622: ** ----------
623: ** Calculates a bunch of statistics for the anchors traversed
624: */
625: PRIVATE BOOL calculate_statistics (Robot * mr)
626: {
1.59 frystyk 627: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 628: if (!mr) return NO;
629:
630: /* Calculate efficiency */
1.59 frystyk 631: if (mr->time > 0) {
1.56 frystyk 632: ms_t t = HTGetTimeInMillis() - mr->time;
633: if (t > 0) {
1.60 frystyk 634: double loadfactor = (mr->get_bytes / (t * 0.001));
635: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 636: double secs = t / 1000.0;
1.55 frystyk 637: char bytes[50];
1.62 frystyk 638: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 639: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 640: total_docs, secs, reqprsec);
1.59 frystyk 641:
642: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 643: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 644: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 645: mr->get_docs, bytes, loadfactor);
1.59 frystyk 646:
647: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 648: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 649: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 650: mr->head_docs, bytes);
1.55 frystyk 651: }
652: }
653:
654: /* Create an array of existing anchors */
1.59 frystyk 655: if (total_docs > 1) {
656: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 657: if (array) {
658:
1.63 frystyk 659: /* Distributions */
660: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 661: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 662: }
663:
1.55 frystyk 664: /* Sort after hit counts */
1.63 frystyk 665: if (mr->hitfile) {
666: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 667: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 668: mr->hitfile);
669: calculate_hits(mr, array);
670: }
671:
1.64 frystyk 672: /* Sort after link relations */
1.68 frystyk 673: #ifdef HT_MYSQL
674: if (mr->relfile || mr->sqllog) {
1.69 frystyk 675: #else
676: if (mr->relfile) {
677: #endif
1.68 frystyk 678: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.64 frystyk 679: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
680: mr->relfile);
681: calculate_linkRelations(mr, array);
682: }
683:
1.63 frystyk 684: /* Sort after modified date */
685: if (mr->lmfile) {
686: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 687: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 688: mr->lmfile);
689: calculate_lm(mr, array);
690: }
691:
692: /* Sort after title */
693: if (mr->titlefile) {
694: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 695: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 696: mr->titlefile);
697: calculate_title(mr, array);
698: }
1.55 frystyk 699:
1.58 frystyk 700: /* Find mediatype distribution */
701: if (mr->mtfile) {
702: HTList * mtdist = mediatype_distribution(array);
703: if (mtdist) {
1.63 frystyk 704: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 705: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 706: mr->mtfile);
1.58 frystyk 707: log_meta_distribution(mr->mtfile, mtdist);
708: delete_meta_distribution(mtdist);
709: }
710: }
1.55 frystyk 711:
1.60 frystyk 712: /* Find charset distribution */
713: if (mr->charsetfile) {
714: HTList * charsetdist = charset_distribution(array);
715: if (charsetdist) {
1.63 frystyk 716: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 717: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 718: mr->charsetfile);
1.60 frystyk 719: log_meta_distribution(mr->charsetfile, charsetdist);
720: delete_meta_distribution(charsetdist);
721: }
722: }
723:
1.55 frystyk 724: /* Add as may other stats here as you like */
1.60 frystyk 725: /* ... */
1.58 frystyk 726:
727: /* Delete the array */
1.55 frystyk 728: HTArray_delete(array);
729: }
730: }
731: return YES;
732: }
733:
1.1 frystyk 734: /* Create a Command Line Object
735: ** ----------------------------
736: */
737: PRIVATE Robot * Robot_new (void)
738: {
739: Robot * me;
1.41 frystyk 740: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 741: HT_OUTOFMEM("Robot_new");
1.2 frystyk 742: me->hyperdoc = HTList_new();
1.4 frystyk 743: me->htext = HTList_new();
1.40 frystyk 744: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 745: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 746: me->output = OUTPUT;
1.35 eric 747: me->cnt = 0;
1.34 eric 748: me->fingers = HTList_new();
1.1 frystyk 749: return me;
750: }
751:
752: /* Delete a Command Line Object
753: ** ----------------------------
754: */
1.62 frystyk 755: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 756: {
1.62 frystyk 757: if (mr) {
758: HTList_delete(mr->fingers);
1.55 frystyk 759:
760: /* Calculate statistics */
1.62 frystyk 761: calculate_statistics(mr);
1.55 frystyk 762:
1.62 frystyk 763: if (mr->hyperdoc) {
764: HTList * cur = mr->hyperdoc;
1.2 frystyk 765: HyperDoc * pres;
766: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
767: HyperDoc_delete(pres);
1.62 frystyk 768: HTList_delete(mr->hyperdoc);
1.2 frystyk 769: }
1.62 frystyk 770: if (mr->htext) {
771: HTList * cur = mr->htext;
1.4 frystyk 772: HText * pres;
773: while ((pres = (HText *) HTList_nextObject(cur)))
774: HText_free(pres);
1.62 frystyk 775: HTList_delete(mr->htext);
1.4 frystyk 776: }
1.62 frystyk 777:
778: /* Close all the log files */
1.63 frystyk 779: if (mr->flags & MR_LOGGING) {
1.64 frystyk 780: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 781: }
782:
1.62 frystyk 783: if (mr->log) {
784: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 785: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 786: HTLog_accessCount(mr->log), mr->logfile);
787: HTLog_close(mr->log);
788: }
789: if (mr->ref) {
790: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 791: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 792: HTLog_accessCount(mr->ref), mr->reffile);
793: HTLog_close(mr->ref);
794: }
795: if (mr->reject) {
796: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 797: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 798: HTLog_accessCount(mr->reject), mr->rejectfile);
799: HTLog_close(mr->reject);
800: }
801: if (mr->notfound) {
802: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 803: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 804: HTLog_accessCount(mr->notfound), mr->notfoundfile);
805: HTLog_close(mr->notfound);
806: }
807: if (mr->conneg) {
808: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 809: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 810: HTLog_accessCount(mr->conneg), mr->connegfile);
811: HTLog_close(mr->conneg);
812: }
813: if (mr->noalttag) {
814: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 815: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 816: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
817: HTLog_close(mr->noalttag);
818: }
819:
820: if (mr->output && mr->output != STDOUT) fclose(mr->output);
821:
822: if (mr->flags & MR_TIME) {
1.12 frystyk 823: time_t local = time(NULL);
1.62 frystyk 824: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 825: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 826: }
1.55 frystyk 827:
1.58 frystyk 828: #ifdef HT_POSIX_REGEX
1.62 frystyk 829: if (mr->include) {
830: regfree(mr->include);
831: HT_FREE(mr->include);
832: }
833: if (mr->exclude) {
834: regfree(mr->exclude);
835: HT_FREE(mr->exclude);
836: }
837: if (mr->check) {
838: regfree(mr->check);
839: HT_FREE(mr->check);
1.58 frystyk 840: }
841: #endif
842:
1.68 frystyk 843: #ifdef HT_MYSQL
844: if (mr->sqllog) {
845: HTSQLLog_close(mr->sqllog);
846: mr->sqllog = NULL;
847: }
848: #endif
849:
1.62 frystyk 850: HT_FREE(mr->cwd);
851: HT_FREE(mr->prefix);
852: HT_FREE(mr->img_prefix);
853: HT_FREE(mr);
1.1 frystyk 854: return YES;
855: }
856: return NO;
857: }
858:
1.2 frystyk 859: /*
1.34 eric 860: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 861: */
1.34 eric 862: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 863: {
1.34 eric 864: Finger * me;
865: HTRequest * request = HTRequest_new();
866: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
867: HT_OUTOFMEM("Finger_new");
868: me->robot = robot;
869: me->request = request;
870: me->dest = dest;
871: HTList_addObject(robot->fingers, (void *)me);
872:
1.48 frystyk 873: /* Set the context for this request */
1.34 eric 874: HTRequest_setContext (request, me);
1.48 frystyk 875:
876: /* Check the various flags to customize the request */
877: if (robot->flags & MR_PREEMPTIVE)
878: HTRequest_setPreemptive(request, YES);
879: if (robot->flags & MR_VALIDATE)
880: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
881: if (robot->flags & MR_END_VALIDATE)
882: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
883:
884: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 885: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 886:
887: /* Set the method for this request */
1.34 eric 888: HTRequest_setMethod(request, method);
889: robot->cnt++;
890: return me;
1.2 frystyk 891: }
892:
1.34 eric 893: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 894: {
1.34 eric 895: HTList_removeObject(me->robot->fingers, (void *)me);
896: me->robot->cnt--;
1.37 frystyk 897:
898: /*
899: ** If we are down at one request then flush the output buffer
900: */
901: if (me->request) {
902: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 903: HTRequest_delete(me->request);
1.37 frystyk 904: }
905:
906: /*
907: ** Delete the request and free myself
908: */
1.34 eric 909: HT_FREE(me);
910: return YES;
1.2 frystyk 911: }
912:
913: /*
914: ** Cleanup and make sure we close all connections including the persistent
915: ** ones
916: */
1.1 frystyk 917: PRIVATE void Cleanup (Robot * me, int status)
918: {
919: Robot_delete(me);
1.29 eric 920: HTProfile_delete();
1.50 frystyk 921: #ifdef HT_MEMLOG
1.39 eric 922: HTMemLog_close();
1.47 frystyk 923: #endif
924:
1.1 frystyk 925: #ifdef VMS
926: exit(status ? status : 1);
927: #else
928: exit(status ? status : 0);
929: #endif
930: }
931:
932: #ifdef CATCH_SIG
933: #include <signal.h>
934: /* SetSignal
935: ** This function sets up signal handlers. This might not be necessary to
936: ** call if the application has its own handlers (lossage on SVR4)
937: */
938: PRIVATE void SetSignal (void)
939: {
940: /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
941: ** when attemting to connect to a remote host where you normally should
942: ** get `connection refused' back
943: */
944: if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
1.13 eric 945: if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
1.1 frystyk 946: } else {
1.13 eric 947: if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
1.1 frystyk 948: }
1.47 frystyk 949:
1.50 frystyk 950: #ifdef HT_MEMLOG
1.44 eric 951: HTMemLog_flush();
1.47 frystyk 952: #endif
953:
1.1 frystyk 954: }
955: #endif /* CATCH_SIG */
956:
1.58 frystyk 957: #ifdef HT_POSIX_REGEX
958: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
959: {
960: size_t length = regerror (errcode, compiled, NULL, 0);
961: char * str = NULL;
962: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
963: HT_OUTOFMEM("get_regerror");
964: (void) regerror (errcode, compiled, str, length);
965: return str;
966: }
967:
1.60 frystyk 968: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 969: {
970: regex_t * regex = NULL;
971: if (regex_str && *regex_str) {
972: int status;
973: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
974: HT_OUTOFMEM("get_regtype");
1.60 frystyk 975: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 976: char * err_msg = get_regerror(status, regex);
1.62 frystyk 977: if (SHOW_REAL_QUIET(mr))
978: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 979: HT_FREE(err_msg);
980: Cleanup(mr, -1);
981: }
982: }
983: return regex;
984: }
985: #endif
986:
1.1 frystyk 987: PRIVATE void VersionInfo (void)
988: {
1.62 frystyk 989: OutputData("W3C Sample Software\n\n");
990: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
991: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
992: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 993: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 994: }
995:
996: /* terminate_handler
997: ** -----------------
1.2 frystyk 998: ** This function is registered to handle the result of the request.
999: ** If no more requests are pending then terminate program
1.1 frystyk 1000: */
1.32 frystyk 1001: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
1002: void * param, int status)
1.1 frystyk 1003: {
1.34 eric 1004: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 1005: Robot * mr = finger->robot;
1.62 frystyk 1006: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 1007:
1.68 frystyk 1008: #ifdef HT_MYSQL
1009: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
1010: #endif
1011:
1.58 frystyk 1012: /* Check if negotiated resource and whether we should log that*/
1013: if (mr->conneg) {
1014: HTAssocList * cur = HTResponse_variant(response);
1015: if (cur) {
1016: BOOL first = YES;
1017: HTChunk * buffer = HTChunk_new(128);
1018: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1019: HTAssoc * pres;
1.60 frystyk 1020: HTChunk_puts(buffer, uri);
1.58 frystyk 1021: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1022: char * value = HTAssoc_value(pres);
1023: if (first) {
1.60 frystyk 1024: HTChunk_puts(buffer, "\t(");
1.58 frystyk 1025: first = NO;
1026: } else
1027: HTChunk_puts(buffer, ", ");
1028:
1029: /* Output the name */
1030: HTChunk_puts(buffer, HTAssoc_name(pres));
1031:
1032: /* Only output the value if not empty string */
1.60 frystyk 1033: if (value && *value) {
1.58 frystyk 1034: HTChunk_puts(buffer, "=");
1035: HTChunk_puts(buffer, value);
1036: }
1037: }
1.60 frystyk 1038: if (!first) HTChunk_puts(buffer, ")");
1039: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 1040: HTChunk_delete(buffer);
1041: HT_FREE(uri);
1042: }
1043: }
1044:
1.55 frystyk 1045: /* Count the amount of body data that we have read */
1.59 frystyk 1046: if (HTRequest_method(request) == METHOD_GET) {
1047: int length = HTAnchor_length(HTRequest_anchor(request));
1048: if (length > 0) mr->get_bytes += length;
1049: mr->get_docs++;
1050: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 1051: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 1052: if (length > 0) mr->head_bytes += length;
1053: mr->head_docs++;
1054: } else {
1055: mr->other_docs++;
1.55 frystyk 1056: }
1057:
1.58 frystyk 1058: /* Cleanup the anchor so that we don't drown in metainformation */
1059: if (!(mr->flags & MR_KEEP_META))
1060: HTAnchor_clearHeader(HTRequest_anchor(request));
1061:
1.55 frystyk 1062: /* Delete this thread */
1.34 eric 1063: Finger_delete(finger);
1.55 frystyk 1064:
1065: /* Should we stop? */
1.46 eric 1066: if (mr->cnt <= 0) {
1.62 frystyk 1067: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 1068: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 1069: }
1.62 frystyk 1070: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 1071: return HT_OK;
1072: }
1073:
1074: /* ------------------------------------------------------------------------- */
1075: /* HTEXT INTERFACE */
1076: /* ------------------------------------------------------------------------- */
1077:
1078: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
1079: HTStream * stream)
1080: {
1081: HText * me;
1.34 eric 1082: Finger * finger = (Finger *) HTRequest_context(request);
1083: Robot * mr = finger->robot;
1.65 frystyk 1084: char * robots = NULL;
1085:
1.14 frystyk 1086: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1087: HT_OUTOFMEM("HText_new2");
1.4 frystyk 1088:
1089: /* Bind the HText object together with the Request Object */
1.1 frystyk 1090: me->request = request;
1.65 frystyk 1091: me->follow = YES;
1092:
1093: /* Check to see if we have any meta tags */
1094: if ((robots = HTAnchor_robots(anchor)) != NULL) {
1095: char * strval = NULL;
1096: char * ptr = NULL;
1097: char * token = NULL;
1098: StrAllocCopy(strval, robots);
1099: ptr = strval;
1100: while ((token = HTNextField(&ptr)) != NULL) {
1101: if (!strcasecomp(token, "nofollow")) {
1102: me->follow = NO;
1103: break;
1104: }
1105: }
1106: HT_FREE(strval);
1107: }
1.4 frystyk 1108:
1109: /* Add this HyperDoc object to our list */
1110: if (!mr->htext) mr->htext = HTList_new();
1111: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1112: return me;
1113: }
1114:
1.4 frystyk 1115: PUBLIC void HText_free (HText * me) {
1.11 frystyk 1116: if (me) HT_FREE (me);
1.4 frystyk 1117: }
1118:
1.1 frystyk 1119: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
1120: {
1121: if (text && anchor) {
1.34 eric 1122: Finger * finger = (Finger *) HTRequest_context(text->request);
1123: Robot * mr = finger->robot;
1.1 frystyk 1124: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1125: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1126: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1127: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1128: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1129: BOOL match = text->follow;
1.58 frystyk 1130: BOOL check = NO;
1.1 frystyk 1131:
1.55 frystyk 1132: if (!uri) return;
1.62 frystyk 1133: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 1134:
1135: if (hd) {
1.62 frystyk 1136: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 1137: hd->hits++;
1.68 frystyk 1138: #ifdef HT_MYSQL
1139: if (mr->sqllog) {
1140: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1141: if (ref_addr) {
1142: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1143: "referer", NULL);
1144: HT_FREE(ref_addr);
1145: }
1146: }
1147: #endif
1.58 frystyk 1148: HT_FREE(uri);
1149: return;
1150: }
1.70 frystyk 1151:
1.58 frystyk 1152: /* Check for prefix match */
1.65 frystyk 1153: if (match && mr->prefix) {
1154: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1155: }
1.58 frystyk 1156:
1157: #ifdef HT_POSIX_REGEX
1.69 frystyk 1158: /*
1159: ** Check for any regular expression. The include may override
1160: ** the prefix matching
1161: */
1162: if (mr->include) {
1.58 frystyk 1163: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1164: }
1165: if (match && mr->exclude) {
1166: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1167: }
1168: if (match && mr->check) {
1169: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1170: }
1171: #endif
1172:
1173: /* Test whether we already have a hyperdoc for this document */
1174: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 1175: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
1176: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1177: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 1178: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1179: HTRequest * newreq = newfinger->request;
1.2 frystyk 1180: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 1181: HTRequest_setParent(newreq, referer);
1.58 frystyk 1182: if (check || depth >= mr->depth) {
1.62 frystyk 1183: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 1184: HTRequest_setMethod(newreq, METHOD_HEAD);
1185: } else {
1.62 frystyk 1186: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 1187: }
1188: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 1189: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 1190: Finger_delete(newfinger);
1.2 frystyk 1191: }
1.7 frystyk 1192: } else {
1.62 frystyk 1193: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1194: #ifdef HT_MYSQL
1195: if (mr->reject || mr->sqllog) {
1196: #else
1.60 frystyk 1197: if (mr->reject) {
1.68 frystyk 1198: #endif
1.60 frystyk 1199: if (referer) {
1200: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1201: if (mr->reject && ref_addr)
1202: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1203: #ifdef HT_MYSQL
1204: if (mr->sqllog && mr->sqlexternals && ref_addr)
1205: HTSQLLog_addLinkRelationship(mr->sqllog,
1206: ref_addr, uri,
1207: "referer", NULL);
1208: #endif
1209:
1.60 frystyk 1210: HT_FREE(ref_addr);
1211: }
1212: }
1.2 frystyk 1213: }
1.11 frystyk 1214: HT_FREE(uri);
1.2 frystyk 1215: }
1216: }
1217:
1218: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1219: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1220: {
1221: if (text && anchor) {
1.34 eric 1222: Finger * finger = (Finger *) HTRequest_context(text->request);
1223: Robot * mr = finger->robot;
1.59 frystyk 1224: if (mr->flags & MR_IMG) {
1.60 frystyk 1225: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1226: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1227: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1228: HyperDoc * hd = HTAnchor_document(dest_parent);
1229: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1230: BOOL match = YES;
1231:
1232: if (hd) {
1.62 frystyk 1233: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1234: hd->hits++;
1.68 frystyk 1235: #ifdef HT_MYSQL
1236: if (mr->sqllog) {
1237: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1238: if (ref_addr) {
1239: HTSQLLog_addLinkRelationship(mr->sqllog,
1240: ref_addr, uri,
1241: "image", alt);
1242: HT_FREE(ref_addr);
1243: }
1244: }
1245: #endif
1.11 frystyk 1246: HT_FREE(uri);
1.59 frystyk 1247: return;
1.2 frystyk 1248: }
1.59 frystyk 1249:
1250: /* Check for prefix match */
1251: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1252:
1253: /* Test whether we already have a hyperdoc for this document */
1254: if (match && dest) {
1.60 frystyk 1255: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1256: mr->flags & MR_SAVE ?
1257: METHOD_GET : METHOD_HEAD);
1258: HTRequest * newreq = newfinger->request;
1.60 frystyk 1259: HyperDoc_new(mr, dest_parent, 1);
1260: HTRequest_setParent(newreq, referer);
1261:
1262: /* Check whether we should report missing ALT tags */
1263: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1264: if (referer) {
1265: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1266: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1267: HT_FREE(ref_addr);
1268: }
1269: }
1270:
1.62 frystyk 1271: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1272: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1273: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1274: Finger_delete(newfinger);
1275: }
1276: } else {
1.62 frystyk 1277: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1278: #ifdef HT_MYSQL
1279: if (mr->reject || mr->sqllog) {
1280: #else
1.60 frystyk 1281: if (mr->reject) {
1.68 frystyk 1282: #endif
1.60 frystyk 1283: if (referer) {
1284: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1285: if (mr->reject && ref_addr)
1286: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1287: #ifdef HT_MYSQL
1288: if (mr->sqllog && mr->sqlexternals && ref_addr)
1289: HTSQLLog_addLinkRelationship(mr->sqllog,
1290: ref_addr, uri,
1291: "image", alt);
1292: #endif
1293:
1.60 frystyk 1294: HT_FREE(ref_addr);
1295: }
1296: }
1.1 frystyk 1297: }
1.59 frystyk 1298: HT_FREE(uri);
1.1 frystyk 1299: }
1300: }
1301: }
1302:
1303: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1304: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1305: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1306: PUBLIC void HText_endAppend (HText * text) {}
1307: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1308: PUBLIC void HText_beginAppend (HText * text) {}
1309: PUBLIC void HText_appendParagraph (HText * text) {}
1310:
1.48 frystyk 1311: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1312: {
1313: return (vfprintf(stderr, fmt, pArgs));
1314: }
1315:
1.1 frystyk 1316: /* ------------------------------------------------------------------------- */
1317: /* MAIN PROGRAM */
1318: /* ------------------------------------------------------------------------- */
1319:
1320: int main (int argc, char ** argv)
1321: {
1.48 frystyk 1322: int status = 0;
1.1 frystyk 1323: int arg;
1.48 frystyk 1324: BOOL cache = NO; /* Use persistent cache */
1325: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1326: char * cache_root = NULL;
1.1 frystyk 1327: HTChunk * keywords = NULL; /* From command line */
1328: int keycnt = 0;
1.12 frystyk 1329: Robot * mr = NULL;
1.43 frystyk 1330: Finger * finger = NULL;
1331: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1332:
1333: /* Starts Mac GUSI socket library */
1334: #ifdef GUSI
1335: GUSISetup(GUSIwithSIOUXSockets);
1336: GUSISetup(GUSIwithInternetSockets);
1337: #endif
1338:
1339: #ifdef __MWERKS__ /* STR */
1340: InitGraf((Ptr) &qd.thePort);
1341: InitFonts();
1342: InitWindows();
1343: InitMenus(); TEInit();
1344: InitDialogs(nil);
1345: InitCursor();
1346: SIOUXSettings.asktosaveonclose = false;
1347: argc=ccommand(&argv);
1.50 frystyk 1348: #endif /* __MWERKS__ */
1.1 frystyk 1349:
1.50 frystyk 1350: #ifdef HT_MEMLOG
1.51 frystyk 1351: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1352: #endif
1.46 eric 1353:
1.27 frystyk 1354: /* Initiate W3C Reference Library with a robot profile */
1355: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1356: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1357:
1358: /* Add the default HTML parser to the set of converters */
1359: {
1360: HTList * converters = HTFormat_conversion();
1361: HTMLInit(converters);
1362: }
1.1 frystyk 1363:
1.12 frystyk 1364: /* Build a new robot object */
1365: mr = Robot_new();
1366:
1.1 frystyk 1367: /* Scan command Line for parameters */
1368: for (arg=1; arg<argc; arg++) {
1369: if (*argv[arg] == '-') {
1370:
1371: /* non-interactive */
1.17 frystyk 1372: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1373: HTAlert_setInteractive(NO);
1374:
1.62 frystyk 1375: /* help */
1376: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1377: VersionInfo();
1378: Cleanup(mr, 0);
1379:
1.63 frystyk 1380: /* clf log file */
1.1 frystyk 1381: } else if (!strcmp(argv[arg], "-l")) {
1382: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1383: argv[++arg] : DEFAULT_LOG_FILE;
1.63 frystyk 1384: mr->flags |= MR_LOGGING;
1.1 frystyk 1385:
1.63 frystyk 1386: /* referer log file */
1.58 frystyk 1387: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1388: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1389: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 frystyk 1390: mr->flags |= MR_LOGGING;
1.57 frystyk 1391:
1.58 frystyk 1392: /* Not found error log file */
1393: } else if (!strncmp(argv[arg], "-404", 4)) {
1394: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1395: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 frystyk 1396: mr->flags |= MR_LOGGING;
1.58 frystyk 1397:
1398: /* reject log file */
1399: } else if (!strncmp(argv[arg], "-rej", 4)) {
1400: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1401: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 frystyk 1402: mr->flags |= MR_LOGGING;
1.58 frystyk 1403:
1.63 frystyk 1404: /* no alt tags log file */
1405: } else if (!strncmp(argv[arg], "-alt", 4)) {
1406: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1407: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1408: mr->flags |= MR_LOGGING;
1409:
1410: /* negotiated resource log file */
1.58 frystyk 1411: } else if (!strncmp(argv[arg], "-neg", 4)) {
1412: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1413: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 frystyk 1414: mr->flags |= MR_LOGGING;
1415:
1416: /* hit file log */
1417: } else if (!strcmp(argv[arg], "-hit")) {
1418: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1419: argv[++arg] : DEFAULT_HIT_FILE;
1420: mr->flags |= MR_DISTRIBUTIONS;
1421:
1.64 frystyk 1422: /* link relations file log */
1423: } else if (!strcmp(argv[arg], "-rellog")) {
1424: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1425: argv[++arg] : DEFAULT_REL_FILE;
1426: mr->flags |= MR_DISTRIBUTIONS;
1427:
1428: /* Specific link relation to look for (only used i also -rellog) */
1429: } else if (!strcmp(argv[arg], "-relation")) {
1430: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
1431: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
1432: mr->flags |= MR_DISTRIBUTIONS;
1433:
1.63 frystyk 1434: /* last modified log file */
1435: } else if (!strcmp(argv[arg], "-lm")) {
1436: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1437: argv[++arg] : DEFAULT_LM_FILE;
1438: mr->flags |= MR_DISTRIBUTIONS;
1439:
1440: /* title log file */
1441: } else if (!strcmp(argv[arg], "-title")) {
1442: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
1443: argv[++arg] : DEFAULT_TITLE_FILE;
1444: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1445:
1446: /* mediatype distribution log file */
1447: } else if (!strncmp(argv[arg], "-for", 4)) {
1448: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1449: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 frystyk 1450: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1451:
1.60 frystyk 1452: /* charset distribution log file */
1453: } else if (!strncmp(argv[arg], "-char", 5)) {
1454: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1455: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 frystyk 1456: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1457:
1.55 frystyk 1458: /* rule file */
1.1 frystyk 1459: } else if (!strcmp(argv[arg], "-r")) {
1460: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1461: argv[++arg] : DEFAULT_RULE_FILE;
1462:
1463: /* output filename */
1464: } else if (!strcmp(argv[arg], "-o")) {
1465: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1466: argv[++arg] : DEFAULT_OUTPUT_FILE;
1467:
1.55 frystyk 1468: /* URI prefix */
1469: } else if (!strcmp(argv[arg], "-prefix")) {
1470: char * prefix = NULL;
1471: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1472: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1473: if (*prefix && *prefix != '*') {
1.55 frystyk 1474: StrAllocCopy(mr->prefix, prefix);
1475: StrAllocCat(mr->prefix, "*");
1476: }
1477:
1.1 frystyk 1478: /* timeout -- Change the default request timeout */
1479: } else if (!strcmp(argv[arg], "-timeout")) {
1480: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1481: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1482: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1483:
1.54 frystyk 1484: /* Force no pipelined requests */
1485: } else if (!strcmp(argv[arg], "-nopipe")) {
1.64 frystyk 1486: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
1.54 frystyk 1487:
1.48 frystyk 1488: /* Start the persistent cache */
1489: } else if (!strcmp(argv[arg], "-cache")) {
1490: cache = YES;
1491:
1.54 frystyk 1492: /* Determine the cache root */
1493: } else if (!strcmp(argv[arg], "-cacheroot")) {
1494: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1495: argv[++arg] : NULL;
1.51 frystyk 1496:
1.52 frystyk 1497: /* Stream write flush delay in ms */
1498: } else if (!strcmp(argv[arg], "-delay")) {
1499: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1500: atoi(argv[++arg]) : DEFAULT_DELAY;
1501: HTHost_setDefaultWriteDelay(delay);
1502:
1.48 frystyk 1503: /* Persistent cache flush */
1504: } else if (!strcmp(argv[arg], "-flush")) {
1505: flush = YES;
1506:
1507: /* Do a cache validation */
1508: } else if (!strcmp(argv[arg], "-validate")) {
1509: mr->flags |= MR_VALIDATE;
1510:
1511: /* Do an end-to-end cache-validation */
1512: } else if (!strcmp(argv[arg], "-endvalidate")) {
1513: mr->flags |= MR_END_VALIDATE;
1514:
1.7 frystyk 1515: /* preemptive or non-preemptive access */
1.1 frystyk 1516: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1517: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1518:
1519: /* test inlined images */
1520: } else if (!strcmp(argv[arg], "-img")) {
1521: mr->flags |= MR_IMG;
1.45 frystyk 1522:
1523: /* load inlined images */
1524: } else if (!strcmp(argv[arg], "-saveimg")) {
1525: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1526:
1527: /* URI prefix for inlined images */
1528: } else if (!strcmp(argv[arg], "-imgprefix")) {
1529: char * prefix = NULL;
1530: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1531: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1532: if (*prefix && *prefix!='*') {
1.59 frystyk 1533: StrAllocCopy(mr->img_prefix, prefix);
1534: StrAllocCat(mr->img_prefix, "*");
1535: }
1.2 frystyk 1536:
1537: /* load anchors */
1.58 frystyk 1538: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1539: mr->flags |= MR_LINK;
1.7 frystyk 1540: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1541: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1542:
1.12 frystyk 1543: /* Output start and end time */
1544: } else if (!strcmp(argv[arg], "-ss")) {
1545: mr->flags |= MR_TIME;
1546:
1.1 frystyk 1547: /* print version and exit */
1548: } else if (!strcmp(argv[arg], "-version")) {
1549: VersionInfo();
1550: Cleanup(mr, 0);
1.46 eric 1551:
1552: /* run in quiet mode */
1553: } else if (!strcmp(argv[arg], "-q")) {
1554: mr->flags |= MR_QUIET;
1.1 frystyk 1555:
1.62 frystyk 1556: /* run in really quiet mode */
1557: } else if (!strcmp(argv[arg], "-Q")) {
1558: mr->flags |= MR_REAL_QUIET;
1559:
1.1 frystyk 1560: #ifdef WWWTRACE
1561: /* trace flags */
1562: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1563: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1564: #endif
1565:
1.58 frystyk 1566: #ifdef HT_POSIX_REGEX
1567:
1568: /* If we can link against a POSIX regex library */
1569: } else if (!strncmp(argv[arg], "-inc", 4)) {
1570: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1571: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1572: }
1573: } else if (!strncmp(argv[arg], "-exc", 4)) {
1574: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1575: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1576: }
1577: } else if (!strncmp(argv[arg], "-check", 6)) {
1578: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1579: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1580: }
1581: #endif
1582:
1.68 frystyk 1583: #ifdef HT_MYSQL
1584: /* If we can link against a MYSQL database library */
1585: } else if (!strncmp(argv[arg], "-sqldb", 5)) {
1586: mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
1587: argv[++arg] : DEFAULT_SQL_DB;
1588:
1589: } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
1590: mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;
1591:
1592: } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
1593: mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;
1594:
1595: } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
1596: mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;
1597:
1598: } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
1599: mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;
1600:
1601: } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
1602: mr->sqlexternals = YES;
1603:
1604: } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
1605: mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
1606: argv[++arg] : DEFAULT_SQL_PW;
1607:
1608: } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
1609: mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
1610: argv[++arg] : NULL;
1611:
1612: } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
1613: mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
1614: argv[++arg] : DEFAULT_SQL_SERVER;
1615:
1616: } else if (!strncmp(argv[arg], "-sqluser", 5)) {
1617: mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
1618: argv[++arg] : DEFAULT_SQL_USER;
1619:
1620: #endif
1621:
1.1 frystyk 1622: } else {
1.62 frystyk 1623: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1624: }
1.17 frystyk 1625: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1626: if (!keycnt) {
1627: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1628: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1629: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1630: keycnt = 1;
1.11 frystyk 1631: HT_FREE(ref);
1.1 frystyk 1632: } else { /* Check for successive keyword arguments */
1633: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1634: if (keycnt++ <= 1)
1.5 frystyk 1635: keywords = HTChunk_new(128);
1.1 frystyk 1636: else
1.5 frystyk 1637: HTChunk_putc(keywords, ' ');
1638: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1639: HT_FREE(escaped);
1.1 frystyk 1640: }
1641: }
1642: }
1643:
1644: #ifdef CATCH_SIG
1645: SetSignal();
1646: #endif
1647:
1648: if (!keycnt) {
1.62 frystyk 1649: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1650: Cleanup(mr, -1);
1651: }
1652:
1653: if (mr->depth != DEFAULT_DEPTH &&
1654: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1655: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1656: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1657: mr->depth);
1.1 frystyk 1658: Cleanup(mr, -1);
1659: }
1660:
1.23 manoli 1661: /* Testing that HTTrace is working */
1.62 frystyk 1662: if (mr->flags & MR_TIME) {
1663: if (SHOW_REAL_QUIET(mr)) {
1664: time_t local = time(NULL);
1.67 frystyk 1665: HTTrace("Welcome to the W3C mini Robot version %s - started on %s\n",
1666: APP_VERSION, HTDateTimeStr(&local, YES));
1.62 frystyk 1667: }
1668: }
1.23 manoli 1669:
1.1 frystyk 1670: /* Rule file specified? */
1671: if (mr->rules) {
1672: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1673: if (!HTLoadRules(rules))
1.62 frystyk 1674: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1675: HT_FREE(rules);
1.1 frystyk 1676: }
1677:
1678: /* Output file specified? */
1679: if (mr->outputfile) {
1680: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1681: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1682: mr->output = OUTPUT;
1683: }
1684: }
1685:
1.48 frystyk 1686: /* Should we use persistent cache? */
1687: if (cache) {
1.54 frystyk 1688: HTCacheInit(cache_root, 20);
1.49 frystyk 1689: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1690: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1691: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1692:
1693: /* Should we start by flushing? */
1694: if (flush) HTCache_flushAll();
1695: }
1.68 frystyk 1696:
1697: /* SQL Log specified? */
1698: #ifdef HT_MYSQL
1699: if (mr->sqlserver) {
1700: if ((mr->sqllog =
1.69 frystyk 1701: HTSQLLog_open(mr->sqlserver,
1702: mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
1703: mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW,
1704: mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
1705: mr->sqlflags)) != NULL) {
1.68 frystyk 1706: if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
1707: }
1708: }
1709: #endif
1.48 frystyk 1710:
1.58 frystyk 1711: /* CLF Log file specified? */
1.55 frystyk 1712: if (mr->logfile) {
1713: mr->log = HTLog_open(mr->logfile, YES, YES);
1714: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1715: }
1716:
1.58 frystyk 1717: /* Referer Log file specified? */
1.57 frystyk 1718: if (mr->reffile) {
1719: mr->ref = HTLog_open(mr->reffile, YES, YES);
1720: if (mr->ref)
1721: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1722: }
1.1 frystyk 1723:
1.58 frystyk 1724: /* Not found error log specified? */
1725: if (mr->notfoundfile) {
1726: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1727: if (mr->notfound)
1728: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1729: }
1730:
1731: /* Negotiated resource log specified? */
1732: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1733:
1734: /* No alt tags log file specified? */
1735: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1736:
1737: /* Reject Log file specified? */
1738: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1739:
1740: /* Register our own terminate filter */
1.32 frystyk 1741: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1742:
1743: /* Setting event timeout */
1744: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1745:
1.56 frystyk 1746: mr->time = HTGetTimeInMillis();
1.37 frystyk 1747:
1.34 eric 1748: /* Start the request */
1749: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1750:
1751: /*
1752: ** Make sure that the first request is flushed immediately and not
1753: ** buffered in the output buffer
1754: */
1755: HTRequest_setFlush(finger->request, YES);
1756:
1757: /*
1.48 frystyk 1758: ** Check whether we should do some kind of cache validation on
1759: ** the load
1760: */
1761: if (mr->flags & MR_VALIDATE)
1762: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1763: if (mr->flags & MR_END_VALIDATE)
1764: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1765:
1766: /*
1.43 frystyk 1767: ** Now do the load
1768: */
1.34 eric 1769: if (mr->flags & MR_PREEMPTIVE)
1770: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1771:
1772: if (keywords) /* Search */
1.34 eric 1773: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1774: else
1.34 eric 1775: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1776:
1.5 frystyk 1777: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1778: if (status != YES) {
1.62 frystyk 1779: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1780: Cleanup(mr, -1);
1781: }
1782:
1783: /* Go into the event loop... */
1.34 eric 1784: HTEventList_loop(finger->request);
1.1 frystyk 1785:
1786: /* Only gets here if event loop fails */
1787: Cleanup(mr, 0);
1788: return 0;
1789: }
Webmaster