Annotation of libwww/Robot/src/HTRobot.c, revision 1.73
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.71 frystyk 20: #include "WWWSQL.h"
1.9 frystyk 21:
1.4 frystyk 22: #include "HText.h"
1.1 frystyk 23:
24: #include "HTRobot.h" /* Implemented here */
25:
1.58 frystyk 26: #ifdef HT_POSIX_REGEX
1.64 frystyk 27: #ifdef HAVE_RXPOSIX_H
28: #include <rxposix.h>
29: #else
1.62 frystyk 30: #ifdef HAVE_REGEX_H
31: #include <regex.h>
32: #endif
33: #endif
1.60 frystyk 34: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 35: #endif
36:
/*
**  Application identity. W3C_VERSION falls back to "Unspecified" when it
**  has not been defined before this point.
*/
#ifndef W3C_VERSION
#define W3C_VERSION             "Unspecified"
#endif

#define APP_NAME                "W3CRobot"
#define APP_VERSION             W3C_VERSION
#define COMMAND_LINE            "http://www.w3.org/Robot/User/CommandLine"

/*
**  Default file names for the output, rule and log files
*/
#define DEFAULT_OUTPUT_FILE     "robot.out"
#define DEFAULT_RULE_FILE       "robot.conf"
#define DEFAULT_LOG_FILE        "log-clf.txt"
#define DEFAULT_HIT_FILE        "log-hit.txt"
#define DEFAULT_REL_FILE        "log-rel.txt"
#define DEFAULT_LM_FILE         "log-lastmodified.txt"
#define DEFAULT_TITLE_FILE      "log-title.txt"
#define DEFAULT_REFERER_FILE    "log-referer.txt"
#define DEFAULT_REJECT_FILE     "log-reject.txt"
#define DEFAULT_NOTFOUND_FILE   "log-notfound.txt"
#define DEFAULT_CONNEG_FILE     "log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE   "log-alt.txt"
#define DEFAULT_FORMAT_FILE     "log-format.txt"
#define DEFAULT_CHARSET_FILE    "log-charset.txt"
#define DEFAULT_MEMLOG          "robot.mem"

/*
**  Default traversal parameters
*/
#define DEFAULT_PREFIX          ""
#define DEFAULT_IMG_PREFIX      ""
#define DEFAULT_DEPTH           0
#define DEFAULT_DELAY           50              /* Write delay in ms */

/*
**  Default SQL logging parameters
*/
#define DEFAULT_SQL_SERVER      "localhost"
#define DEFAULT_SQL_DB          "webbot"
#define DEFAULT_SQL_USER        "webbot"
#define DEFAULT_SQL_PW          ""

/* Memory logging is off by default because it is expensive in performance */
#if 0
#define HT_MEMLOG
#endif

/*
**  Verbosity predicates: each is true when a robot object exists and the
**  corresponding "quiet" flag is NOT set on it.
*/
/* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */
#define SHOW_QUIET(mr)          ((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr)     ((mr) && !((mr)->flags & MR_REAL_QUIET))

#define DEFAULT_TIMEOUT         50000           /* timeout in millis */

/* Install our own signal handling on SVR4 systems only */
#if defined(__svr4__)
#define CATCH_SIG
#endif
83:
/*
**  Bit flags stored in Robot.flags. Each constant occupies its own bit so
**  that the flags can be or'ed together and tested with `&'.
*/
typedef enum _MRFlags {
    MR_IMG              = 1 << 0,       /* 0x001 */
    MR_LINK             = 1 << 1,       /* 0x002 */
    MR_PREEMPTIVE       = 1 << 2,       /* 0x004 */
    MR_TIME             = 1 << 3,       /* 0x008 */
    MR_SAVE             = 1 << 4,       /* 0x010 */
    MR_QUIET            = 1 << 5,       /* 0x020 */
    MR_REAL_QUIET       = 1 << 6,       /* 0x040 */
    MR_VALIDATE         = 1 << 7,       /* 0x080 */
    MR_END_VALIDATE     = 1 << 8,       /* 0x100 */
    MR_KEEP_META        = 1 << 9,       /* 0x200 */
    MR_LOGGING          = 1 << 10,      /* 0x400 */
    MR_DISTRIBUTIONS    = 1 << 11       /* 0x800 */
} MRFlags;
98:
99: typedef struct _Robot {
1.2 frystyk 100: int depth; /* How deep is our tree */
1.30 frystyk 101: int cnt; /* Count of requests */
1.2 frystyk 102: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 103: HTList * htext; /* List of our HText Objects */
1.34 eric 104: HTList * fingers;
1.59 frystyk 105:
1.40 frystyk 106: int timer;
1.65 frystyk 107: char * cwd; /* Current dir URL */
1.1 frystyk 108: char * rules;
1.55 frystyk 109: char * prefix;
1.59 frystyk 110: char * img_prefix;
111:
1.60 frystyk 112: char * logfile; /* clf log */
1.55 frystyk 113: HTLog * log;
1.60 frystyk 114: char * reffile; /* referer log */
1.57 frystyk 115: HTLog * ref;
1.60 frystyk 116: char * rejectfile; /* unchecked links */
1.58 frystyk 117: HTLog * reject;
1.60 frystyk 118: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 119: HTLog * notfound;
1.60 frystyk 120: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 121: HTLog * conneg;
1.60 frystyk 122: char * noalttagfile; /* images without alt tags*/
123: HTLog * noalttag;
124:
125: char * hitfile; /* links sorted after hit counts */
1.64 frystyk 126: char * relfile; /* link sorted after relationships */
127: HTLinkType relation; /* Specific relation to look for */
1.63 frystyk 128: char * titlefile; /* links with titles */
1.60 frystyk 129: char * mtfile; /* media types encountered */
130: char * charsetfile; /* charsets encountered */
1.63 frystyk 131: char * lmfile; /* sortef after last modified dates */
1.60 frystyk 132:
133: char * outputfile;
1.1 frystyk 134: FILE * output;
1.59 frystyk 135:
1.1 frystyk 136: MRFlags flags;
1.55 frystyk 137:
1.59 frystyk 138: long get_bytes; /* Total number of bytes processed using GET*/
139: long get_docs; /* Total number of documents using GET */
140:
141: long head_bytes; /* bytes processed bytes processed using HEAD */
142: long head_docs; /* Total number of documents using HEAD*/
143:
144: long other_docs;
145:
1.56 frystyk 146: ms_t time; /* Time of run */
1.58 frystyk 147:
148: #ifdef HT_POSIX_REGEX
149: regex_t * include;
150: regex_t * exclude;
151: regex_t * check;
152: #endif
153:
1.68 frystyk 154: #ifdef HT_MYSQL
155: HTSQLLog * sqllog;
156: char * sqlserver;
157: char * sqldb;
158: char * sqluser;
159: char * sqlpw;
160: char * sqlrelative;
161: BOOL sqlexternals;
162: int sqlflags;
163: #endif
164:
1.1 frystyk 165: } Robot;
1.34 eric 166:
167: typedef struct _Finger {
168: Robot * robot;
169: HTRequest * request;
170: HTParentAnchor * dest;
171: } Finger;
172:
/*
**  Load state recorded in a HyperDoc object
*/
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   = 0,
    L_ERROR     = 1
} LoadState;
179:
180: /*
181: ** The HyperDoc object is bound to the anchor and contains information about
182: ** where we are in the search for recursive searches
183: */
184: typedef struct _HyperDoc {
185: HTParentAnchor * anchor;
186: LoadState state;
187: int depth;
1.55 frystyk 188: int hits;
1.1 frystyk 189: } HyperDoc;
190:
191: /*
1.65 frystyk 192: ** This is the HText object that is created every time we start parsing an
1.1 frystyk 193: ** HTML object
194: */
1.4 frystyk 195: struct _HText {
1.1 frystyk 196: HTRequest * request;
1.65 frystyk 197: BOOL follow;
1.4 frystyk 198: };
1.1 frystyk 199:
1.58 frystyk 200: /*
201: ** A structure for calculating metadata distributions
202: */
203: typedef struct _MetaDist {
204: HTAtom * name;
205: int hits;
206: } MetaDist;
207:
208: /*
209: ** Some sorting algorithms
210: */
1.63 frystyk 211: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 212:
1.1 frystyk 213: PUBLIC HText * HTMainText = NULL;
214: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
215: PUBLIC HTStyleSheet * styleSheet = NULL;
216:
217: /* ------------------------------------------------------------------------- */
218:
1.13 eric 219: /* Standard (non-error) Output
220: ** ---------------------------
221: */
222: PUBLIC int OutputData(const char * fmt, ...)
223: {
224: int ret;
225: va_list pArgs;
226: va_start(pArgs, fmt);
227: ret = vfprintf(stdout, fmt, pArgs);
228: va_end(pArgs);
229: return ret;
230: }
231:
232: /* ------------------------------------------------------------------------- */
233:
1.2 frystyk 234: /* Create a "HyperDoc" object
235: ** --------------------------
236: ** A HyperDoc object contains information about whether we have already
237: ** started checking the anchor and the depth in our search
238: */
239: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
240: {
241: HyperDoc * hd;
1.14 frystyk 242: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
243: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 244: hd->state = L_INVALID;
245: hd->depth = depth;
1.55 frystyk 246: hd->hits = 1;
1.2 frystyk 247:
248: /* Bind the HyperDoc object together with the Anchor Object */
249: hd->anchor = anchor;
250: HTAnchor_setDocument(anchor, (void *) hd);
251:
252: /* Add this HyperDoc object to our list */
253: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
254: HTList_addObject(mr->hyperdoc, (void *) hd);
255: return hd;
256: }
257:
258: /* Delete a "HyperDoc" object
259: ** --------------------------
260: */
261: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
262: {
263: if (hd) {
1.11 frystyk 264: HT_FREE (hd);
1.2 frystyk 265: return YES;
266: }
267: return NO;
268: }
269:
1.55 frystyk 270: /*
271: ** Sort the anchor array and log reference count
272: */
273: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
274: {
275: if (mr && array) {
276: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
277: if (log) {
278: void ** data = NULL;
279: HTParentAnchor * anchor = NULL;
280: HTArray_sort(array, HitSort);
281: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
282: while (anchor) {
283: char * uri = HTAnchor_address((HTAnchor *) anchor);
284: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 285: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 286: HT_FREE(uri);
287: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
288: }
289: }
290: HTLog_close(log);
291: return YES;
292: }
293: return NO;
294: }
295:
296: PRIVATE int HitSort (const void * a, const void * b)
297: {
298: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
299: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
300: if (aa && bb) return (bb->hits - aa->hits);
301: return bb - aa;
302: }
303:
1.58 frystyk 304: /*
1.64 frystyk 305: ** Sort the anchor array and log link relations
306: */
307: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
308: {
309: if (mr && array) {
1.68 frystyk 310: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
311: void ** data = NULL;
312: HTParentAnchor * anchor = NULL;
313: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
314: while (anchor) {
315:
316: /*
317: ** If we have a specific link relation to look for then do this.
318: ** Otherwise look for all link relations.
319: */
320: if (mr->relation) {
321: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
322: if (link) {
323: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
324: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
325: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
326: if (src_uri && dest_uri) {
327: #ifdef HT_MYSQL
328: if (mr->sqllog) {
329: HTSQLLog_addLinkRelationship (mr->sqllog,
330: src_uri, dest_uri,
331: HTAtom_name(mr->relation),
332: NULL);
333: }
334: #endif
335: if (log) {
336: HTFormat format = HTAnchor_format(dest);
337: HTLog_addText(log, "%s %s %s --> %s\n",
338: HTAtom_name(mr->relation),
339: format != WWW_UNKNOWN ?
340: HTAtom_name(format) : "<unknown>",
341: src_uri, dest_uri);
342: }
343:
344: /* Cleanup */
345: HT_FREE(src_uri);
346: HT_FREE(dest_uri);
347: }
348: }
349: } else {
350: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
351: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
352: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
353: HTLinkType linktype;
354:
355: /* First look in the main link */
356: if (link && (linktype = HTLink_type(link))) {
357: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
358: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
359: if (src_uri && dest_uri) {
360: #ifdef HT_MYSQL
361: if (mr->sqllog) {
362: HTSQLLog_addLinkRelationship (mr->sqllog,
363: src_uri, dest_uri,
364: HTAtom_name(linktype),
365: NULL);
366: }
367: #endif
368: if (log) {
369: HTFormat format = HTAnchor_format(dest);
370: HTLog_addText(log, "%s %s %s --> %s\n",
371: HTAtom_name(linktype),
372: format != WWW_UNKNOWN ?
373: HTAtom_name(format) : "<unknown>",
374: src_uri, dest_uri);
375: }
376: }
377: HT_FREE(dest_uri);
378: }
379:
380: /* and then in any sublinks */
381: if (sublinks) {
382: HTLink * pres;
383: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
384: if ((linktype = HTLink_type(pres))) {
385: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 386: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 frystyk 387: if (src_uri && dest_uri) {
388: #ifdef HT_MYSQL
389: if (mr->sqllog) {
390: HTSQLLog_addLinkRelationship (mr->sqllog,
391: src_uri, dest_uri,
392: HTAtom_name(linktype),
393: NULL);
394: }
395: #endif
396: if (log) {
397: HTFormat format = HTAnchor_format(dest);
398: HTLog_addText(log, "%s %s %s --> %s\n",
399: HTAtom_name(linktype),
400: format != WWW_UNKNOWN ?
401: HTAtom_name(format) : "<unknown>",
402: src_uri, dest_uri);
403: }
1.64 frystyk 404: HT_FREE(dest_uri);
405: }
406: }
407: }
408: }
1.68 frystyk 409:
410: /* Cleanup */
411: HT_FREE(src_uri);
412: }
413: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 414: }
1.68 frystyk 415: if (log) HTLog_close(log);
1.64 frystyk 416: return YES;
417: }
418: return NO;
419: }
420:
421: /*
1.63 frystyk 422: ** Sort the anchor array and log last modified date
423: */
424: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
425: {
426: if (mr && array) {
427: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
428: if (log) {
429: void ** data = NULL;
430: HTParentAnchor * anchor = NULL;
431: HTArray_sort(array, LastModifiedSort);
432: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
433: while (anchor) {
434: char * uri = HTAnchor_address((HTAnchor *) anchor);
435: time_t lm = HTAnchor_lastModified(anchor);
436: if (uri && lm > 0)
437: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
438: HT_FREE(uri);
439: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
440: }
441: }
442: HTLog_close(log);
443: return YES;
444: }
445: return NO;
446: }
447:
448: PRIVATE int LastModifiedSort (const void * a, const void * b)
449: {
450: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
451: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
452: return bb - aa;
453: }
454:
455: /*
456: ** Sort the anchor array and log the document title
457: */
458: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
459: {
460: if (mr && array) {
461: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
462: if (log) {
463: void ** data = NULL;
464: HTParentAnchor * anchor = NULL;
465: HTArray_sort(array, TitleSort);
466: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
467: while (anchor) {
468: char * uri = HTAnchor_address((HTAnchor *) anchor);
469: const char * title = HTAnchor_title(anchor);
470: HTCharset charset = HTAnchor_charset(anchor);
471: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
472: charset ? HTAtom_name(charset) : "<none>",
473: title ? title : "<none>",
474: uri);
475: HT_FREE(uri);
476: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
477: }
478: }
479: HTLog_close(log);
480: return YES;
481: }
482: return NO;
483: }
484:
485: PRIVATE int TitleSort (const void * a, const void * b)
486: {
487: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
488: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
489: return strcasecomp(bb?bb:"", aa?aa:"");
490: }
491:
492: /*
1.58 frystyk 493: ** Calculate distributions for media types. The same mechanism
494: ** can be used for other characteristics with relatively
495: ** few outcomes.
496: */
497: PRIVATE HTList * mediatype_distribution (HTArray * array)
498: {
499: if (array) {
500: HTList * mt = HTList_new();
501: MetaDist * pres = NULL;
502: void ** data = NULL;
503: HTParentAnchor * anchor = NULL;
504: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
505: while (anchor) {
506: HTFormat format = HTAnchor_format(anchor);
507: if (format && format != WWW_UNKNOWN) {
508: HTList * cur = mt;
509:
510: /* If found then increase counter */
511: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
512: if (pres->name == format) {
513: pres->hits++;
514: break;
515: }
516: }
517:
518: /* If not found then add new format to list */
519: if (!pres) {
520: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
521: HT_OUTOFMEM("mediatype_distribution");
522: pres->name = format;
523: pres->hits = 1;
524: HTList_addObject(mt, pres);
525: HTList_insertionSort(mt, FormatSort);
526: }
527: }
528:
529: /* Find next anchor in array */
530: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
531: }
532: return mt;
533: }
534: return NULL;
535: }
536:
1.60 frystyk 537: /*
538: ** Calculate distributions for charsets. The same mechanism
539: ** can be used for other characteristics with relatively
540: ** few outcomes.
541: */
542: PRIVATE HTList * charset_distribution (HTArray * array)
543: {
544: if (array) {
545: HTList * cs = HTList_new();
546: MetaDist * pres = NULL;
547: void ** data = NULL;
548: HTParentAnchor * anchor = NULL;
549: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
550: while (anchor) {
551: HTCharset charset = HTAnchor_charset(anchor);
552: if (charset) {
553: HTList * cur = cs;
554:
555: /* If found then increase counter */
556: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
557: if (pres->name == charset) {
558: pres->hits++;
559: break;
560: }
561: }
562:
563: /* If not found then add new format to list */
564: if (!pres) {
565: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
566: HT_OUTOFMEM("charset_distribution");
567: pres->name = charset;
568: pres->hits = 1;
569: HTList_addObject(cs, pres);
570: HTList_insertionSort(cs, FormatSort);
571: }
572: }
573:
574: /* Find next anchor in array */
575: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
576: }
577: return cs;
578: }
579: return NULL;
580: }
581:
1.58 frystyk 582: PRIVATE int FormatSort (const void * a, const void * b)
583: {
584: MetaDist * aa = (MetaDist *) a;
585: MetaDist * bb = (MetaDist *) b;
586: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
587: }
588:
589: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
590: {
591: if (logfile && distribution) {
592: HTLog * log = HTLog_open(logfile, YES, YES);
593: if (log) {
594: HTList * cur = distribution;
595: MetaDist * pres;
596: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
597: if (pres->name) {
1.60 frystyk 598: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 599: }
600: }
601: HTLog_close(log);
602: }
603: }
604: return NO;
605: }
606:
607: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
608: {
609: if (distribution) {
610: HTList * cur = distribution;
611: MetaDist * pres;
612: while ((pres = (MetaDist *) HTList_nextObject(cur)))
613: HT_FREE(pres);
614: HTList_delete(distribution);
615: return YES;
616: }
617: return NO;
618: }
619:
620:
1.55 frystyk 621: /* Statistics
622: ** ----------
623: ** Calculates a bunch of statistics for the anchors traversed
624: */
625: PRIVATE BOOL calculate_statistics (Robot * mr)
626: {
1.59 frystyk 627: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 628: if (!mr) return NO;
629:
630: /* Calculate efficiency */
1.59 frystyk 631: if (mr->time > 0) {
1.56 frystyk 632: ms_t t = HTGetTimeInMillis() - mr->time;
633: if (t > 0) {
1.60 frystyk 634: double loadfactor = (mr->get_bytes / (t * 0.001));
635: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 636: double secs = t / 1000.0;
1.55 frystyk 637: char bytes[50];
1.62 frystyk 638: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 639: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 640: total_docs, secs, reqprsec);
1.59 frystyk 641:
642: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 643: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 644: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 645: mr->get_docs, bytes, loadfactor);
1.59 frystyk 646:
647: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 648: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 649: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 650: mr->head_docs, bytes);
1.55 frystyk 651: }
652: }
653:
654: /* Create an array of existing anchors */
1.59 frystyk 655: if (total_docs > 1) {
656: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 657: if (array) {
658:
1.63 frystyk 659: /* Distributions */
660: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 661: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 662: }
663:
1.55 frystyk 664: /* Sort after hit counts */
1.63 frystyk 665: if (mr->hitfile) {
666: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 667: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 668: mr->hitfile);
669: calculate_hits(mr, array);
670: }
671:
1.64 frystyk 672: /* Sort after link relations */
1.68 frystyk 673: #ifdef HT_MYSQL
674: if (mr->relfile || mr->sqllog) {
1.69 frystyk 675: #else
676: if (mr->relfile) {
677: #endif
1.68 frystyk 678: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.64 frystyk 679: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
680: mr->relfile);
681: calculate_linkRelations(mr, array);
682: }
683:
1.63 frystyk 684: /* Sort after modified date */
685: if (mr->lmfile) {
686: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 687: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 688: mr->lmfile);
689: calculate_lm(mr, array);
690: }
691:
692: /* Sort after title */
693: if (mr->titlefile) {
694: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 695: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 696: mr->titlefile);
697: calculate_title(mr, array);
698: }
1.55 frystyk 699:
1.58 frystyk 700: /* Find mediatype distribution */
701: if (mr->mtfile) {
702: HTList * mtdist = mediatype_distribution(array);
703: if (mtdist) {
1.63 frystyk 704: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 705: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 706: mr->mtfile);
1.58 frystyk 707: log_meta_distribution(mr->mtfile, mtdist);
708: delete_meta_distribution(mtdist);
709: }
710: }
1.55 frystyk 711:
1.60 frystyk 712: /* Find charset distribution */
713: if (mr->charsetfile) {
714: HTList * charsetdist = charset_distribution(array);
715: if (charsetdist) {
1.63 frystyk 716: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 717: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 718: mr->charsetfile);
1.60 frystyk 719: log_meta_distribution(mr->charsetfile, charsetdist);
720: delete_meta_distribution(charsetdist);
721: }
722: }
723:
1.55 frystyk 724: /* Add as may other stats here as you like */
1.60 frystyk 725: /* ... */
1.58 frystyk 726:
727: /* Delete the array */
1.55 frystyk 728: HTArray_delete(array);
729: }
730: }
731: return YES;
732: }
733:
1.1 frystyk 734: /* Create a Command Line Object
735: ** ----------------------------
736: */
737: PRIVATE Robot * Robot_new (void)
738: {
739: Robot * me;
1.41 frystyk 740: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 741: HT_OUTOFMEM("Robot_new");
1.2 frystyk 742: me->hyperdoc = HTList_new();
1.4 frystyk 743: me->htext = HTList_new();
1.40 frystyk 744: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 745: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 746: me->output = OUTPUT;
1.35 eric 747: me->cnt = 0;
1.34 eric 748: me->fingers = HTList_new();
1.1 frystyk 749: return me;
750: }
751:
752: /* Delete a Command Line Object
753: ** ----------------------------
754: */
1.62 frystyk 755: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 756: {
1.62 frystyk 757: if (mr) {
758: HTList_delete(mr->fingers);
1.55 frystyk 759:
760: /* Calculate statistics */
1.62 frystyk 761: calculate_statistics(mr);
1.55 frystyk 762:
1.62 frystyk 763: if (mr->hyperdoc) {
764: HTList * cur = mr->hyperdoc;
1.2 frystyk 765: HyperDoc * pres;
766: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
767: HyperDoc_delete(pres);
1.62 frystyk 768: HTList_delete(mr->hyperdoc);
1.2 frystyk 769: }
1.62 frystyk 770: if (mr->htext) {
771: HTList * cur = mr->htext;
1.4 frystyk 772: HText * pres;
773: while ((pres = (HText *) HTList_nextObject(cur)))
774: HText_free(pres);
1.62 frystyk 775: HTList_delete(mr->htext);
1.4 frystyk 776: }
1.62 frystyk 777:
778: /* Close all the log files */
1.63 frystyk 779: if (mr->flags & MR_LOGGING) {
1.64 frystyk 780: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 781: }
782:
1.62 frystyk 783: if (mr->log) {
784: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 785: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 786: HTLog_accessCount(mr->log), mr->logfile);
787: HTLog_close(mr->log);
788: }
789: if (mr->ref) {
790: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 791: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 792: HTLog_accessCount(mr->ref), mr->reffile);
793: HTLog_close(mr->ref);
794: }
795: if (mr->reject) {
796: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 797: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 798: HTLog_accessCount(mr->reject), mr->rejectfile);
799: HTLog_close(mr->reject);
800: }
801: if (mr->notfound) {
802: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 803: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 804: HTLog_accessCount(mr->notfound), mr->notfoundfile);
805: HTLog_close(mr->notfound);
806: }
807: if (mr->conneg) {
808: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 809: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 810: HTLog_accessCount(mr->conneg), mr->connegfile);
811: HTLog_close(mr->conneg);
812: }
813: if (mr->noalttag) {
814: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 815: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 816: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
817: HTLog_close(mr->noalttag);
818: }
819:
820: if (mr->output && mr->output != STDOUT) fclose(mr->output);
821:
822: if (mr->flags & MR_TIME) {
1.12 frystyk 823: time_t local = time(NULL);
1.62 frystyk 824: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 825: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 826: }
1.55 frystyk 827:
1.58 frystyk 828: #ifdef HT_POSIX_REGEX
1.62 frystyk 829: if (mr->include) {
830: regfree(mr->include);
831: HT_FREE(mr->include);
832: }
833: if (mr->exclude) {
834: regfree(mr->exclude);
835: HT_FREE(mr->exclude);
836: }
837: if (mr->check) {
838: regfree(mr->check);
839: HT_FREE(mr->check);
1.58 frystyk 840: }
841: #endif
842:
1.68 frystyk 843: #ifdef HT_MYSQL
844: if (mr->sqllog) {
845: HTSQLLog_close(mr->sqllog);
846: mr->sqllog = NULL;
847: }
848: #endif
849:
1.62 frystyk 850: HT_FREE(mr->cwd);
851: HT_FREE(mr->prefix);
852: HT_FREE(mr->img_prefix);
853: HT_FREE(mr);
1.1 frystyk 854: return YES;
855: }
856: return NO;
857: }
858:
1.2 frystyk 859: /*
1.34 eric 860: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 861: */
1.34 eric 862: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 863: {
1.34 eric 864: Finger * me;
865: HTRequest * request = HTRequest_new();
866: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
867: HT_OUTOFMEM("Finger_new");
868: me->robot = robot;
869: me->request = request;
870: me->dest = dest;
871: HTList_addObject(robot->fingers, (void *)me);
872:
1.48 frystyk 873: /* Set the context for this request */
1.34 eric 874: HTRequest_setContext (request, me);
1.48 frystyk 875:
876: /* Check the various flags to customize the request */
877: if (robot->flags & MR_PREEMPTIVE)
878: HTRequest_setPreemptive(request, YES);
879: if (robot->flags & MR_VALIDATE)
880: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
881: if (robot->flags & MR_END_VALIDATE)
882: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
883:
884: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 885: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 886:
887: /* Set the method for this request */
1.34 eric 888: HTRequest_setMethod(request, method);
889: robot->cnt++;
890: return me;
1.2 frystyk 891: }
892:
1.34 eric 893: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 894: {
1.34 eric 895: HTList_removeObject(me->robot->fingers, (void *)me);
896: me->robot->cnt--;
1.37 frystyk 897:
898: /*
899: ** If we are down at one request then flush the output buffer
900: */
901: if (me->request) {
902: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 903: HTRequest_delete(me->request);
1.37 frystyk 904: }
905:
906: /*
907: ** Delete the request and free myself
908: */
1.34 eric 909: HT_FREE(me);
910: return YES;
1.2 frystyk 911: }
912:
913: /*
914: ** Cleanup and make sure we close all connections including the persistent
915: ** ones
916: */
1.1 frystyk 917: PRIVATE void Cleanup (Robot * me, int status)
918: {
919: Robot_delete(me);
1.29 eric 920: HTProfile_delete();
1.50 frystyk 921: #ifdef HT_MEMLOG
1.39 eric 922: HTMemLog_close();
1.47 frystyk 923: #endif
924:
1.1 frystyk 925: #ifdef VMS
926: exit(status ? status : 1);
927: #else
928: exit(status ? status : 0);
929: #endif
930: }
931:
#ifdef CATCH_SIG
#include <signal.h>
/*	SetSignal
**	This function sets up signal handlers. This might not be necessary to
**	call if the application has its own handlers (lossage on SVR4).
**	It asks for SIGPIPE to be ignored, traces the outcome when protocol
**	tracing is on, and flushes the memory log if that is compiled in.
*/
PRIVATE void SetSignal (void)
{
    /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
    ** when attempting to connect to a remote host where you normally should
    ** get `connection refused' back
    */
    if (signal(SIGPIPE, SIG_IGN) != SIG_ERR) {
        if (PROT_TRACE)
            HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    } else {
        if (PROT_TRACE)
            HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif

}
#endif /* CATCH_SIG */
956:
1.58 frystyk 957: #ifdef HT_POSIX_REGEX
958: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
959: {
960: size_t length = regerror (errcode, compiled, NULL, 0);
961: char * str = NULL;
962: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
963: HT_OUTOFMEM("get_regerror");
964: (void) regerror (errcode, compiled, str, length);
965: return str;
966: }
967:
1.60 frystyk 968: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 969: {
970: regex_t * regex = NULL;
971: if (regex_str && *regex_str) {
972: int status;
973: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
974: HT_OUTOFMEM("get_regtype");
1.60 frystyk 975: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 976: char * err_msg = get_regerror(status, regex);
1.62 frystyk 977: if (SHOW_REAL_QUIET(mr))
978: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 979: HT_FREE(err_msg);
980: Cleanup(mr, -1);
981: }
982: }
983: return regex;
984: }
985: #endif
986:
1.1 frystyk 987: PRIVATE void VersionInfo (void)
988: {
1.62 frystyk 989: OutputData("W3C Sample Software\n\n");
990: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
991: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
992: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 993: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 994: }
995:
996: /* terminate_handler
997: ** -----------------
1.2 frystyk 998: ** This function is registered to handle the result of the request.
999: ** If no more requests are pending then terminate program
1.1 frystyk 1000: */
1.32 frystyk 1001: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
1002: void * param, int status)
1.1 frystyk 1003: {
1.34 eric 1004: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 1005: Robot * mr = finger->robot;
1.62 frystyk 1006: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 1007:
1.68 frystyk 1008: #ifdef HT_MYSQL
1009: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
1010: #endif
1011:
1.58 frystyk 1012: /* Check if negotiated resource and whether we should log that*/
1013: if (mr->conneg) {
1014: HTAssocList * cur = HTResponse_variant(response);
1015: if (cur) {
1016: BOOL first = YES;
1017: HTChunk * buffer = HTChunk_new(128);
1018: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1019: HTAssoc * pres;
1.60 frystyk 1020: HTChunk_puts(buffer, uri);
1.58 frystyk 1021: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1022: char * value = HTAssoc_value(pres);
1023: if (first) {
1.60 frystyk 1024: HTChunk_puts(buffer, "\t(");
1.58 frystyk 1025: first = NO;
1026: } else
1027: HTChunk_puts(buffer, ", ");
1028:
1029: /* Output the name */
1030: HTChunk_puts(buffer, HTAssoc_name(pres));
1031:
1032: /* Only output the value if not empty string */
1.60 frystyk 1033: if (value && *value) {
1.58 frystyk 1034: HTChunk_puts(buffer, "=");
1035: HTChunk_puts(buffer, value);
1036: }
1037: }
1.60 frystyk 1038: if (!first) HTChunk_puts(buffer, ")");
1039: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 1040: HTChunk_delete(buffer);
1041: HT_FREE(uri);
1042: }
1043: }
1044:
1.55 frystyk 1045: /* Count the amount of body data that we have read */
1.59 frystyk 1046: if (HTRequest_method(request) == METHOD_GET) {
1047: int length = HTAnchor_length(HTRequest_anchor(request));
1048: if (length > 0) mr->get_bytes += length;
1049: mr->get_docs++;
1050: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 1051: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 1052: if (length > 0) mr->head_bytes += length;
1053: mr->head_docs++;
1054: } else {
1055: mr->other_docs++;
1.55 frystyk 1056: }
1057:
1.58 frystyk 1058: /* Cleanup the anchor so that we don't drown in metainformation */
1059: if (!(mr->flags & MR_KEEP_META))
1060: HTAnchor_clearHeader(HTRequest_anchor(request));
1061:
1.55 frystyk 1062: /* Delete this thread */
1.34 eric 1063: Finger_delete(finger);
1.55 frystyk 1064:
1065: /* Should we stop? */
1.46 eric 1066: if (mr->cnt <= 0) {
1.62 frystyk 1067: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 1068: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 1069: }
1.62 frystyk 1070: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 1071: return HT_OK;
1072: }
1073:
1074: /* ------------------------------------------------------------------------- */
1075: /* HTEXT INTERFACE */
1076: /* ------------------------------------------------------------------------- */
1077:
1078: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
1079: HTStream * stream)
1080: {
1081: HText * me;
1.34 eric 1082: Finger * finger = (Finger *) HTRequest_context(request);
1083: Robot * mr = finger->robot;
1.65 frystyk 1084: char * robots = NULL;
1085:
1.14 frystyk 1086: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1087: HT_OUTOFMEM("HText_new2");
1.4 frystyk 1088:
1089: /* Bind the HText object together with the Request Object */
1.1 frystyk 1090: me->request = request;
1.65 frystyk 1091: me->follow = YES;
1092:
1093: /* Check to see if we have any meta tags */
1094: if ((robots = HTAnchor_robots(anchor)) != NULL) {
1095: char * strval = NULL;
1096: char * ptr = NULL;
1097: char * token = NULL;
1098: StrAllocCopy(strval, robots);
1099: ptr = strval;
1100: while ((token = HTNextField(&ptr)) != NULL) {
1101: if (!strcasecomp(token, "nofollow")) {
1102: me->follow = NO;
1103: break;
1104: }
1105: }
1106: HT_FREE(strval);
1107: }
1.4 frystyk 1108:
1109: /* Add this HyperDoc object to our list */
1110: if (!mr->htext) mr->htext = HTList_new();
1111: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1112: return me;
1113: }
1114:
1.4 frystyk 1115: PUBLIC void HText_free (HText * me) {
1.11 frystyk 1116: if (me) HT_FREE (me);
1.4 frystyk 1117: }
1118:
1.1 frystyk 1119: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
1120: {
1121: if (text && anchor) {
1.34 eric 1122: Finger * finger = (Finger *) HTRequest_context(text->request);
1123: Robot * mr = finger->robot;
1.1 frystyk 1124: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1125: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1126: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1127: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1128: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1129: BOOL match = text->follow;
1.58 frystyk 1130: BOOL check = NO;
1.1 frystyk 1131:
1.55 frystyk 1132: if (!uri) return;
1.62 frystyk 1133: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 1134:
1135: if (hd) {
1.62 frystyk 1136: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 1137: hd->hits++;
1.68 frystyk 1138: #ifdef HT_MYSQL
1139: if (mr->sqllog) {
1140: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1141: if (ref_addr) {
1142: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1143: "referer", NULL);
1144: HT_FREE(ref_addr);
1145: }
1146: }
1147: #endif
1.58 frystyk 1148: HT_FREE(uri);
1149: return;
1150: }
1.70 frystyk 1151:
1.58 frystyk 1152: /* Check for prefix match */
1.65 frystyk 1153: if (match && mr->prefix) {
1154: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1155: }
1.58 frystyk 1156:
1157: #ifdef HT_POSIX_REGEX
1.69 frystyk 1158: /*
1159: ** Check for any regular expression. The include may override
1160: ** the prefix matching
1161: */
1162: if (mr->include) {
1.58 frystyk 1163: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1164: }
1165: if (match && mr->exclude) {
1166: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1167: }
1168: if (match && mr->check) {
1169: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1170: }
1171: #endif
1172:
1173: /* Test whether we already have a hyperdoc for this document */
1174: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 1175: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
1176: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1177: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 1178: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1179: HTRequest * newreq = newfinger->request;
1.2 frystyk 1180: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 1181: HTRequest_setParent(newreq, referer);
1.58 frystyk 1182: if (check || depth >= mr->depth) {
1.62 frystyk 1183: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 1184: HTRequest_setMethod(newreq, METHOD_HEAD);
1185: } else {
1.62 frystyk 1186: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 1187: }
1188: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 1189: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 1190: Finger_delete(newfinger);
1.2 frystyk 1191: }
1.7 frystyk 1192: } else {
1.62 frystyk 1193: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1194: #ifdef HT_MYSQL
1195: if (mr->reject || mr->sqllog) {
1196: #else
1.60 frystyk 1197: if (mr->reject) {
1.68 frystyk 1198: #endif
1.60 frystyk 1199: if (referer) {
1200: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1201: if (mr->reject && ref_addr)
1202: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1203: #ifdef HT_MYSQL
1204: if (mr->sqllog && mr->sqlexternals && ref_addr)
1205: HTSQLLog_addLinkRelationship(mr->sqllog,
1206: ref_addr, uri,
1207: "referer", NULL);
1208: #endif
1209:
1.60 frystyk 1210: HT_FREE(ref_addr);
1211: }
1212: }
1.2 frystyk 1213: }
1.11 frystyk 1214: HT_FREE(uri);
1.2 frystyk 1215: }
1216: }
1217:
1218: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1219: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1220: {
1221: if (text && anchor) {
1.34 eric 1222: Finger * finger = (Finger *) HTRequest_context(text->request);
1223: Robot * mr = finger->robot;
1.59 frystyk 1224: if (mr->flags & MR_IMG) {
1.60 frystyk 1225: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1226: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1227: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1228: HyperDoc * hd = HTAnchor_document(dest_parent);
1229: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1230: BOOL match = YES;
1231:
1.72 frystyk 1232: if (!uri) return;
1.59 frystyk 1233: if (hd) {
1.62 frystyk 1234: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1235: hd->hits++;
1.68 frystyk 1236: #ifdef HT_MYSQL
1237: if (mr->sqllog) {
1238: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1239: if (ref_addr) {
1240: HTSQLLog_addLinkRelationship(mr->sqllog,
1241: ref_addr, uri,
1242: "image", alt);
1243: HT_FREE(ref_addr);
1244: }
1245: }
1246: #endif
1.11 frystyk 1247: HT_FREE(uri);
1.59 frystyk 1248: return;
1.2 frystyk 1249: }
1.59 frystyk 1250:
1251: /* Check for prefix match */
1252: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1253:
1254: /* Test whether we already have a hyperdoc for this document */
1255: if (match && dest) {
1.60 frystyk 1256: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1257: mr->flags & MR_SAVE ?
1258: METHOD_GET : METHOD_HEAD);
1259: HTRequest * newreq = newfinger->request;
1.60 frystyk 1260: HyperDoc_new(mr, dest_parent, 1);
1261: HTRequest_setParent(newreq, referer);
1262:
1263: /* Check whether we should report missing ALT tags */
1264: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1265: if (referer) {
1266: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1267: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1268: HT_FREE(ref_addr);
1269: }
1270: }
1271:
1.62 frystyk 1272: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1273: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1274: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1275: Finger_delete(newfinger);
1276: }
1277: } else {
1.62 frystyk 1278: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1279: #ifdef HT_MYSQL
1280: if (mr->reject || mr->sqllog) {
1281: #else
1.60 frystyk 1282: if (mr->reject) {
1.68 frystyk 1283: #endif
1.60 frystyk 1284: if (referer) {
1285: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1286: if (mr->reject && ref_addr)
1287: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1288: #ifdef HT_MYSQL
1289: if (mr->sqllog && mr->sqlexternals && ref_addr)
1290: HTSQLLog_addLinkRelationship(mr->sqllog,
1291: ref_addr, uri,
1292: "image", alt);
1293: #endif
1294:
1.60 frystyk 1295: HT_FREE(ref_addr);
1296: }
1297: }
1.1 frystyk 1298: }
1.59 frystyk 1299: HT_FREE(uri);
1.72 frystyk 1300: }
1301: }
1302: }
1303:
1304: PUBLIC void HText_appendLink (HText * text, HTChildAnchor * anchor,
1305: const BOOL * present, const char ** value)
1306: {
1307: if (text && anchor) {
1308: Finger * finger = (Finger *) HTRequest_context(text->request);
1309: Robot * mr = finger->robot;
1310: if (SHOW_QUIET(mr))
1311: HTTrace("Robot....... Received Link element with anchor %p\n", anchor);
1312: HText_beginAnchor(text, anchor);
1313: }
1314: }
1315:
1316: PUBLIC void HText_appendObject (HText * text, int element_number,
1317: const BOOL * present, const char ** value)
1318: {
1319: /* Here we can look for frames, link tags, meta tags etc. */
1320: if (text && text->request) {
1321: Finger * finger = (Finger *) HTRequest_context(text->request);
1322: Robot * mr = finger->robot;
1323:
1324: if (SHOW_QUIET(mr))
1325: HTTrace("Robot....... HText Object %p called with HTML element number %d\n",
1326: text, element_number);
1327:
1328: switch (element_number) {
1329:
1330: case HTML_FRAME:
1331: {
1332: HTChildAnchor * source = HTAnchor_findChildAndLink(
1333: HTRequest_anchor(text->request), /* Parent */
1334: NULL, /* Tag */
1335: present[HTML_FRAME_SRC] ? value[HTML_FRAME_SRC] : NULL, /* Addresss */
1336: NULL); /* Rels */
1337: HText_beginAnchor(text, source);
1338: }
1339: break;
1340:
1341: case HTML_BODY:
1342: {
1343: HTChildAnchor * source = HTAnchor_findChildAndLink(
1344: HTRequest_anchor(text->request), /* Parent */
1345: NULL, /* Tag */
1346: present[HTML_BODY_BACKGROUND] ? value[HTML_BODY_BACKGROUND] : NULL, /* Addresss */
1347: NULL); /* Rels */
1348: HText_appendImage(text, source, NULL, NULL, NO);
1349: }
1350: break;
1351:
1352: default:
1353: break;
1.1 frystyk 1354: }
1355: }
1356: }
1357:
1358: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1359: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1360: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1361: PUBLIC void HText_endAppend (HText * text) {}
1362: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1363: PUBLIC void HText_beginAppend (HText * text) {}
1364: PUBLIC void HText_appendParagraph (HText * text) {}
1365:
1.48 frystyk 1366: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1367: {
1368: return (vfprintf(stderr, fmt, pArgs));
1369: }
1370:
1.1 frystyk 1371: /* ------------------------------------------------------------------------- */
1372: /* MAIN PROGRAM */
1373: /* ------------------------------------------------------------------------- */
1374:
1375: int main (int argc, char ** argv)
1376: {
1.48 frystyk 1377: int status = 0;
1.1 frystyk 1378: int arg;
1.48 frystyk 1379: BOOL cache = NO; /* Use persistent cache */
1380: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1381: char * cache_root = NULL;
1.1 frystyk 1382: HTChunk * keywords = NULL; /* From command line */
1383: int keycnt = 0;
1.12 frystyk 1384: Robot * mr = NULL;
1.43 frystyk 1385: Finger * finger = NULL;
1386: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1387:
1388: /* Starts Mac GUSI socket library */
1389: #ifdef GUSI
1390: GUSISetup(GUSIwithSIOUXSockets);
1391: GUSISetup(GUSIwithInternetSockets);
1392: #endif
1393:
1394: #ifdef __MWERKS__ /* STR */
1395: InitGraf((Ptr) &qd.thePort);
1396: InitFonts();
1397: InitWindows();
1398: InitMenus(); TEInit();
1399: InitDialogs(nil);
1400: InitCursor();
1401: SIOUXSettings.asktosaveonclose = false;
1402: argc=ccommand(&argv);
1.50 frystyk 1403: #endif /* __MWERKS__ */
1.1 frystyk 1404:
1.50 frystyk 1405: #ifdef HT_MEMLOG
1.51 frystyk 1406: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1407: #endif
1.46 eric 1408:
1.27 frystyk 1409: /* Initiate W3C Reference Library with a robot profile */
1410: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1411: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1412:
1413: /* Add the default HTML parser to the set of converters */
1414: {
1415: HTList * converters = HTFormat_conversion();
1416: HTMLInit(converters);
1417: }
1.1 frystyk 1418:
1.12 frystyk 1419: /* Build a new robot object */
1420: mr = Robot_new();
1421:
1.1 frystyk 1422: /* Scan command Line for parameters */
1423: for (arg=1; arg<argc; arg++) {
1424: if (*argv[arg] == '-') {
1425:
1426: /* non-interactive */
1.17 frystyk 1427: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1428: HTAlert_setInteractive(NO);
1429:
1.62 frystyk 1430: /* help */
1431: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1432: VersionInfo();
1433: Cleanup(mr, 0);
1434:
1.63 frystyk 1435: /* clf log file */
1.1 frystyk 1436: } else if (!strcmp(argv[arg], "-l")) {
1437: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1438: argv[++arg] : DEFAULT_LOG_FILE;
1.63 frystyk 1439: mr->flags |= MR_LOGGING;
1.1 frystyk 1440:
1.63 frystyk 1441: /* referer log file */
1.58 frystyk 1442: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1443: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1444: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 frystyk 1445: mr->flags |= MR_LOGGING;
1.57 frystyk 1446:
1.58 frystyk 1447: /* Not found error log file */
1448: } else if (!strncmp(argv[arg], "-404", 4)) {
1449: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1450: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 frystyk 1451: mr->flags |= MR_LOGGING;
1.58 frystyk 1452:
1453: /* reject log file */
1454: } else if (!strncmp(argv[arg], "-rej", 4)) {
1455: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1456: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 frystyk 1457: mr->flags |= MR_LOGGING;
1.58 frystyk 1458:
1.63 frystyk 1459: /* no alt tags log file */
1460: } else if (!strncmp(argv[arg], "-alt", 4)) {
1461: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1462: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1463: mr->flags |= MR_LOGGING;
1464:
1465: /* negotiated resource log file */
1.58 frystyk 1466: } else if (!strncmp(argv[arg], "-neg", 4)) {
1467: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1468: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 frystyk 1469: mr->flags |= MR_LOGGING;
1470:
1471: /* hit file log */
1472: } else if (!strcmp(argv[arg], "-hit")) {
1473: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1474: argv[++arg] : DEFAULT_HIT_FILE;
1475: mr->flags |= MR_DISTRIBUTIONS;
1476:
1.64 frystyk 1477: /* link relations file log */
1478: } else if (!strcmp(argv[arg], "-rellog")) {
1479: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1480: argv[++arg] : DEFAULT_REL_FILE;
1481: mr->flags |= MR_DISTRIBUTIONS;
1482:
1483: /* Specific link relation to look for (only used i also -rellog) */
1484: } else if (!strcmp(argv[arg], "-relation")) {
1485: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
1486: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
1487: mr->flags |= MR_DISTRIBUTIONS;
1488:
1.63 frystyk 1489: /* last modified log file */
1490: } else if (!strcmp(argv[arg], "-lm")) {
1491: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1492: argv[++arg] : DEFAULT_LM_FILE;
1493: mr->flags |= MR_DISTRIBUTIONS;
1494:
1495: /* title log file */
1496: } else if (!strcmp(argv[arg], "-title")) {
1497: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
1498: argv[++arg] : DEFAULT_TITLE_FILE;
1499: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1500:
1501: /* mediatype distribution log file */
1502: } else if (!strncmp(argv[arg], "-for", 4)) {
1503: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1504: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 frystyk 1505: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1506:
1.60 frystyk 1507: /* charset distribution log file */
1508: } else if (!strncmp(argv[arg], "-char", 5)) {
1509: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1510: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 frystyk 1511: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1512:
1.55 frystyk 1513: /* rule file */
1.1 frystyk 1514: } else if (!strcmp(argv[arg], "-r")) {
1515: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1516: argv[++arg] : DEFAULT_RULE_FILE;
1517:
1518: /* output filename */
1519: } else if (!strcmp(argv[arg], "-o")) {
1520: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1521: argv[++arg] : DEFAULT_OUTPUT_FILE;
1522:
1.55 frystyk 1523: /* URI prefix */
1524: } else if (!strcmp(argv[arg], "-prefix")) {
1525: char * prefix = NULL;
1526: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1527: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1528: if (*prefix && *prefix != '*') {
1.55 frystyk 1529: StrAllocCopy(mr->prefix, prefix);
1530: StrAllocCat(mr->prefix, "*");
1531: }
1532:
1.1 frystyk 1533: /* timeout -- Change the default request timeout */
1534: } else if (!strcmp(argv[arg], "-timeout")) {
1535: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1536: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1537: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1538:
1.54 frystyk 1539: /* Force no pipelined requests */
1540: } else if (!strcmp(argv[arg], "-nopipe")) {
1.64 frystyk 1541: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
1.54 frystyk 1542:
1.48 frystyk 1543: /* Start the persistent cache */
1544: } else if (!strcmp(argv[arg], "-cache")) {
1545: cache = YES;
1546:
1.54 frystyk 1547: /* Determine the cache root */
1548: } else if (!strcmp(argv[arg], "-cacheroot")) {
1549: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1550: argv[++arg] : NULL;
1.51 frystyk 1551:
1.52 frystyk 1552: /* Stream write flush delay in ms */
1553: } else if (!strcmp(argv[arg], "-delay")) {
1554: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1555: atoi(argv[++arg]) : DEFAULT_DELAY;
1556: HTHost_setDefaultWriteDelay(delay);
1557:
1.48 frystyk 1558: /* Persistent cache flush */
1559: } else if (!strcmp(argv[arg], "-flush")) {
1560: flush = YES;
1561:
1562: /* Do a cache validation */
1563: } else if (!strcmp(argv[arg], "-validate")) {
1564: mr->flags |= MR_VALIDATE;
1565:
1566: /* Do an end-to-end cache-validation */
1567: } else if (!strcmp(argv[arg], "-endvalidate")) {
1568: mr->flags |= MR_END_VALIDATE;
1569:
1.7 frystyk 1570: /* preemptive or non-preemptive access */
1.1 frystyk 1571: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1572: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1573:
1574: /* test inlined images */
1575: } else if (!strcmp(argv[arg], "-img")) {
1576: mr->flags |= MR_IMG;
1.45 frystyk 1577:
1578: /* load inlined images */
1579: } else if (!strcmp(argv[arg], "-saveimg")) {
1580: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1581:
1582: /* URI prefix for inlined images */
1583: } else if (!strcmp(argv[arg], "-imgprefix")) {
1584: char * prefix = NULL;
1585: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1586: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1587: if (*prefix && *prefix!='*') {
1.59 frystyk 1588: StrAllocCopy(mr->img_prefix, prefix);
1589: StrAllocCat(mr->img_prefix, "*");
1590: }
1.2 frystyk 1591:
1592: /* load anchors */
1.58 frystyk 1593: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1594: mr->flags |= MR_LINK;
1.7 frystyk 1595: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1596: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1597:
1.12 frystyk 1598: /* Output start and end time */
1599: } else if (!strcmp(argv[arg], "-ss")) {
1600: mr->flags |= MR_TIME;
1601:
1.1 frystyk 1602: /* print version and exit */
1603: } else if (!strcmp(argv[arg], "-version")) {
1604: VersionInfo();
1605: Cleanup(mr, 0);
1.46 eric 1606:
1607: /* run in quiet mode */
1608: } else if (!strcmp(argv[arg], "-q")) {
1609: mr->flags |= MR_QUIET;
1.1 frystyk 1610:
1.62 frystyk 1611: /* run in really quiet mode */
1612: } else if (!strcmp(argv[arg], "-Q")) {
1613: mr->flags |= MR_REAL_QUIET;
1614:
1.1 frystyk 1615: #ifdef WWWTRACE
1616: /* trace flags */
1617: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1618: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1619: #endif
1620:
1.58 frystyk 1621: #ifdef HT_POSIX_REGEX
1622:
1623: /* If we can link against a POSIX regex library */
1624: } else if (!strncmp(argv[arg], "-inc", 4)) {
1625: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1626: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1627: }
1628: } else if (!strncmp(argv[arg], "-exc", 4)) {
1629: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1630: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1631: }
1632: } else if (!strncmp(argv[arg], "-check", 6)) {
1633: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1634: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1635: }
1636: #endif
1637:
1.68 frystyk 1638: #ifdef HT_MYSQL
1639: /* If we can link against a MYSQL database library */
1640: } else if (!strncmp(argv[arg], "-sqldb", 5)) {
1641: mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
1642: argv[++arg] : DEFAULT_SQL_DB;
1643:
1644: } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
1645: mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;
1646:
1647: } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
1648: mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;
1649:
1650: } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
1651: mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;
1652:
1653: } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
1654: mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;
1655:
1656: } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
1657: mr->sqlexternals = YES;
1658:
1659: } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
1660: mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
1661: argv[++arg] : DEFAULT_SQL_PW;
1662:
1663: } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
1664: mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
1665: argv[++arg] : NULL;
1666:
1667: } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
1668: mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
1669: argv[++arg] : DEFAULT_SQL_SERVER;
1670:
1671: } else if (!strncmp(argv[arg], "-sqluser", 5)) {
1672: mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
1673: argv[++arg] : DEFAULT_SQL_USER;
1674:
1675: #endif
1676:
1.1 frystyk 1677: } else {
1.62 frystyk 1678: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1679: }
1.17 frystyk 1680: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1681: if (!keycnt) {
1682: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1683: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1684: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1685: keycnt = 1;
1.11 frystyk 1686: HT_FREE(ref);
1.1 frystyk 1687: } else { /* Check for successive keyword arguments */
1688: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1689: if (keycnt++ <= 1)
1.5 frystyk 1690: keywords = HTChunk_new(128);
1.1 frystyk 1691: else
1.5 frystyk 1692: HTChunk_putc(keywords, ' ');
1693: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1694: HT_FREE(escaped);
1.1 frystyk 1695: }
1696: }
1697: }
1698:
1699: #ifdef CATCH_SIG
1700: SetSignal();
1701: #endif
1702:
1703: if (!keycnt) {
1.62 frystyk 1704: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1705: Cleanup(mr, -1);
1706: }
1707:
1708: if (mr->depth != DEFAULT_DEPTH &&
1709: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1710: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1711: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1712: mr->depth);
1.1 frystyk 1713: Cleanup(mr, -1);
1714: }
1715:
1.23 manoli 1716: /* Testing that HTTrace is working */
1.62 frystyk 1717: if (mr->flags & MR_TIME) {
1718: if (SHOW_REAL_QUIET(mr)) {
1719: time_t local = time(NULL);
1.67 frystyk 1720: HTTrace("Welcome to the W3C mini Robot version %s - started on %s\n",
1721: APP_VERSION, HTDateTimeStr(&local, YES));
1.62 frystyk 1722: }
1723: }
1.23 manoli 1724:
1.1 frystyk 1725: /* Rule file specified? */
1726: if (mr->rules) {
1727: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.73 ! frystyk 1728: if (!HTLoadRulesAutomatically(rules))
1.62 frystyk 1729: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1730: HT_FREE(rules);
1.1 frystyk 1731: }
1732:
1733: /* Output file specified? */
1734: if (mr->outputfile) {
1735: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1736: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1737: mr->output = OUTPUT;
1738: }
1739: }
1740:
1.48 frystyk 1741: /* Should we use persistent cache? */
1742: if (cache) {
1.54 frystyk 1743: HTCacheInit(cache_root, 20);
1.49 frystyk 1744: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1745: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1746: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1747:
1748: /* Should we start by flushing? */
1749: if (flush) HTCache_flushAll();
1750: }
1.68 frystyk 1751:
1752: /* SQL Log specified? */
1753: #ifdef HT_MYSQL
1754: if (mr->sqlserver) {
1755: if ((mr->sqllog =
1.69 frystyk 1756: HTSQLLog_open(mr->sqlserver,
1757: mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
1758: mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW,
1759: mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
1760: mr->sqlflags)) != NULL) {
1.68 frystyk 1761: if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
1762: }
1763: }
1764: #endif
1.48 frystyk 1765:
1.58 frystyk 1766: /* CLF Log file specified? */
1.55 frystyk 1767: if (mr->logfile) {
1768: mr->log = HTLog_open(mr->logfile, YES, YES);
1769: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1770: }
1771:
1.58 frystyk 1772: /* Referer Log file specified? */
1.57 frystyk 1773: if (mr->reffile) {
1774: mr->ref = HTLog_open(mr->reffile, YES, YES);
1775: if (mr->ref)
1776: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1777: }
1.1 frystyk 1778:
1.58 frystyk 1779: /* Not found error log specified? */
1780: if (mr->notfoundfile) {
1781: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1782: if (mr->notfound)
1783: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1784: }
1785:
1786: /* Negotiated resource log specified? */
1787: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1788:
1789: /* No alt tags log file specified? */
1790: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1791:
1792: /* Reject Log file specified? */
1793: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1794:
1795: /* Register our own terminate filter */
1.32 frystyk 1796: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1797:
1798: /* Setting event timeout */
1799: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1800:
1.56 frystyk 1801: mr->time = HTGetTimeInMillis();
1.37 frystyk 1802:
1.34 eric 1803: /* Start the request */
1804: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1805:
1806: /*
1807: ** Make sure that the first request is flushed immediately and not
1808: ** buffered in the output buffer
1809: */
1810: HTRequest_setFlush(finger->request, YES);
1811:
1812: /*
1.48 frystyk 1813: ** Check whether we should do some kind of cache validation on
1814: ** the load
1815: */
1816: if (mr->flags & MR_VALIDATE)
1817: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1818: if (mr->flags & MR_END_VALIDATE)
1819: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1820:
1821: /*
1.43 frystyk 1822: ** Now do the load
1823: */
1.34 eric 1824: if (mr->flags & MR_PREEMPTIVE)
1825: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1826:
1827: if (keywords) /* Search */
1.34 eric 1828: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1829: else
1.34 eric 1830: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1831:
1.5 frystyk 1832: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1833: if (status != YES) {
1.62 frystyk 1834: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1835: Cleanup(mr, -1);
1836: }
1837:
1838: /* Go into the event loop... */
1.34 eric 1839: HTEventList_loop(finger->request);
1.1 frystyk 1840:
1841: /* Only gets here if event loop fails */
1842: Cleanup(mr, 0);
1843: return 0;
1844: }
Webmaster