Annotation of libwww/Robot/src/HTRobot.c, revision 1.68
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
1.58 frystyk 25: #ifdef HT_POSIX_REGEX
1.64 frystyk 26: #ifdef HAVE_RXPOSIX_H
27: #include <rxposix.h>
28: #else
1.62 frystyk 29: #ifdef HAVE_REGEX_H
30: #include <regex.h>
31: #endif
32: #endif
1.60 frystyk 33: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 34: #endif
35:
/* Version string: use the one supplied by the build, else a placeholder */
#ifndef W3C_VERSION
#define W3C_VERSION "Unspecified"
#endif

/* Application identity and help URL, printed by VersionInfo() */
#define APP_NAME "W3CRobot"
#define APP_VERSION W3C_VERSION
#define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine"

/*
** Default file names and settings. NOTE(review): these are presumably
** applied while parsing the command line, which is not part of this
** section of the file - confirm against main().
*/
#define DEFAULT_OUTPUT_FILE "robot.out"
#define DEFAULT_RULE_FILE "robot.conf"
#define DEFAULT_LOG_FILE "log-clf.txt"
#define DEFAULT_HIT_FILE "log-hit.txt"
#define DEFAULT_REL_FILE "log-rel.txt"
#define DEFAULT_LM_FILE "log-lastmodified.txt"
#define DEFAULT_TITLE_FILE "log-title.txt"
#define DEFAULT_REFERER_FILE "log-referer.txt"
#define DEFAULT_REJECT_FILE "log-reject.txt"
#define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
#define DEFAULT_CONNEG_FILE "log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE "log-alt.txt"
#define DEFAULT_FORMAT_FILE "log-format.txt"
#define DEFAULT_CHARSET_FILE "log-charset.txt"
#define DEFAULT_MEMLOG "robot.mem"
#define DEFAULT_PREFIX ""
#define DEFAULT_IMG_PREFIX ""
#define DEFAULT_DEPTH 0
#define DEFAULT_DELAY 50 /* Write delay in ms */

/* Defaults for the optional SQL log (the SQL members of Robot are HT_MYSQL only) */
#define DEFAULT_SQL_SERVER "localhost"
#define DEFAULT_SQL_DB "webbot"
#define DEFAULT_SQL_USER "webbot"
#define DEFAULT_SQL_PW ""

/* Memory logging is compiled out; change the 0 to 1 to enable it */
#if 0
#define HT_MEMLOG /* Is expensive in performance! */
#endif

/* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */

/*
** Both macros are true when `mr' is non-NULL and the corresponding
** quiet flag (MR_QUIET / MR_REAL_QUIET) is NOT set, i.e. when the
** message guarded by the macro should be printed.
*/
#define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
#define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))

/* Initial value of Robot.timer, see Robot_new() */
#define DEFAULT_TIMEOUT 50000 /* timeout in millis */

/* On SVR4 compile in SetSignal(), which ignores SIGPIPE */
#if defined(__svr4__)
#define CATCH_SIG
#endif
82:
/*
** Bit flags stored in Robot.flags. The comments only describe the uses
** that are visible in this part of the file; flags marked "TODO confirm"
** are consumed elsewhere.
*/
typedef enum _MRFlags {
    MR_IMG = 0x1,               /* presumably: follow images - TODO confirm */
    MR_LINK = 0x2,              /* presumably: follow links - TODO confirm */
    MR_PREEMPTIVE = 0x4,        /* Finger_new() makes requests preemptive */
    MR_TIME = 0x8,              /* Robot_delete() traces the termination time */
    MR_SAVE = 0x10,             /* use not visible here - TODO confirm */
    MR_QUIET = 0x20,            /* tested by SHOW_QUIET() */
    MR_REAL_QUIET = 0x40,       /* tested by SHOW_REAL_QUIET() */
    MR_VALIDATE = 0x80,         /* Finger_new() sets HT_CACHE_VALIDATE */
    MR_END_VALIDATE = 0x100,    /* Finger_new() sets HT_CACHE_END_VALIDATE */
    MR_KEEP_META = 0x200,       /* use not visible here - TODO confirm */
    MR_LOGGING = 0x400,         /* Robot_delete() traces a "Raw Log files" header */
    MR_DISTRIBUTIONS = 0x800    /* calculate_statistics() traces a "Distributions" header */
} MRFlags;
97:
/*
** The Robot object holds the complete state of one robot run:
** configuration, open log files and traffic counters. It is created by
** Robot_new() and torn down by Robot_delete().
*/
typedef struct _Robot {
    int depth; /* How deep is our tree */
    int cnt; /* Count of requests (++ in Finger_new, -- in Finger_delete) */
    HTList * hyperdoc; /* List of our HyperDoc Objects */
    HTList * htext; /* List of our HText Objects */
    HTList * fingers; /* List of our Finger objects */

    int timer; /* Initialised to DEFAULT_TIMEOUT */
    char * cwd; /* Current dir URL */
    char * rules;
    char * prefix;
    char * img_prefix;

    char * logfile; /* clf log */
    HTLog * log;
    char * reffile; /* referer log */
    HTLog * ref;
    char * rejectfile; /* unchecked links */
    HTLog * reject;
    char * notfoundfile; /* links that returned 404 */
    HTLog * notfound;
    char * connegfile; /* links that were conneg'ed */
    HTLog * conneg;
    char * noalttagfile; /* images without alt tags */
    HTLog * noalttag;

    char * hitfile; /* links sorted after hit counts */
    char * relfile; /* links sorted after relationships */
    HTLinkType relation; /* Specific relation to look for */
    char * titlefile; /* links with titles */
    char * mtfile; /* media types encountered */
    char * charsetfile; /* charsets encountered */
    char * lmfile; /* sorted after last modified dates */

    char * outputfile;
    FILE * output;

    MRFlags flags;

    long get_bytes; /* Total number of bytes processed using GET */
    long get_docs; /* Total number of documents using GET */

    long head_bytes; /* Total number of bytes processed using HEAD */
    long head_docs; /* Total number of documents using HEAD */

    long other_docs; /* Counted into the document total by calculate_statistics() */

    ms_t time; /* Time of run; used as start time (ms) in calculate_statistics() */

#ifdef HT_POSIX_REGEX
    /* Compiled expressions, released with regfree() in Robot_delete() */
    regex_t * include;
    regex_t * exclude;
    regex_t * check;
#endif

#ifdef HT_MYSQL
    HTSQLLog * sqllog; /* Closed in Robot_delete() */
    char * sqlserver;
    char * sqldb;
    char * sqluser;
    char * sqlpw;
    char * sqlrelative;
    BOOL sqlexternals;
    int sqlflags;
#endif

} Robot;
1.34 eric 165:
/*
** A Finger ties one request to the robot that issued it and to the
** anchor being fetched. It is installed as the request context in
** Finger_new().
*/
typedef struct _Finger {
    Robot * robot; /* Owning robot */
    HTRequest * request; /* Request created for this finger */
    HTParentAnchor * dest; /* Destination anchor of the request */
} Finger;
171:
/*
** Load state recorded in a HyperDoc. A new HyperDoc starts out as
** L_INVALID (see HyperDoc_new).
*/
typedef enum _LoadState {
    L_INVALID = -2,
    L_LOADING = -1,
    L_SUCCESS = 0,
    L_ERROR
} LoadState;
178:
/*
** The HyperDoc object is bound to the anchor and contains information about
** where we are in the search for recursive searches
*/
typedef struct _HyperDoc {
    HTParentAnchor * anchor; /* Anchor this object is bound to */
    LoadState state; /* L_INVALID until changed elsewhere */
    int depth; /* Depth passed to HyperDoc_new() */
    int hits; /* Starts at 1; logged by calculate_hits() */
} HyperDoc;
189:
/*
** This is the HText object that is created every time we start parsing an
** HTML object
*/
struct _HText {
    HTRequest * request; /* Request associated with this HText */
    BOOL follow; /* NOTE(review): presumably whether links are followed - confirm */
};
1.1 frystyk 198:
/*
** A structure for calculating metadata distributions: one entry per
** distinct atom (media type or charset) with the number of anchors
** that carry it
*/
typedef struct _MetaDist {
    HTAtom * name; /* The value being counted */
    int hits; /* Number of anchors with this value */
} MetaDist;
206:
/*
** Some sorting algorithms: forward declarations of the comparison
** functions defined further down in this file
*/
PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;

/*
** Global objects, all initially NULL.
** NOTE(review): not referenced in this part of the file; presumably
** required by other libwww modules - confirm.
*/
PUBLIC HText * HTMainText = NULL;
PUBLIC HTParentAnchor * HTMainAnchor = NULL;
PUBLIC HTStyleSheet * styleSheet = NULL;
215:
216: /* ------------------------------------------------------------------------- */
217:
1.13 eric 218: /* Standard (non-error) Output
219: ** ---------------------------
220: */
221: PUBLIC int OutputData(const char * fmt, ...)
222: {
223: int ret;
224: va_list pArgs;
225: va_start(pArgs, fmt);
226: ret = vfprintf(stdout, fmt, pArgs);
227: va_end(pArgs);
228: return ret;
229: }
230:
231: /* ------------------------------------------------------------------------- */
232:
1.2 frystyk 233: /* Create a "HyperDoc" object
234: ** --------------------------
235: ** A HyperDoc object contains information about whether we have already
236: ** started checking the anchor and the depth in our search
237: */
238: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
239: {
240: HyperDoc * hd;
1.14 frystyk 241: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
242: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 243: hd->state = L_INVALID;
244: hd->depth = depth;
1.55 frystyk 245: hd->hits = 1;
1.2 frystyk 246:
247: /* Bind the HyperDoc object together with the Anchor Object */
248: hd->anchor = anchor;
249: HTAnchor_setDocument(anchor, (void *) hd);
250:
251: /* Add this HyperDoc object to our list */
252: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
253: HTList_addObject(mr->hyperdoc, (void *) hd);
254: return hd;
255: }
256:
257: /* Delete a "HyperDoc" object
258: ** --------------------------
259: */
260: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
261: {
262: if (hd) {
1.11 frystyk 263: HT_FREE (hd);
1.2 frystyk 264: return YES;
265: }
266: return NO;
267: }
268:
1.55 frystyk 269: /*
270: ** Sort the anchor array and log reference count
271: */
272: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
273: {
274: if (mr && array) {
275: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
276: if (log) {
277: void ** data = NULL;
278: HTParentAnchor * anchor = NULL;
279: HTArray_sort(array, HitSort);
280: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
281: while (anchor) {
282: char * uri = HTAnchor_address((HTAnchor *) anchor);
283: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 284: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 285: HT_FREE(uri);
286: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
287: }
288: }
289: HTLog_close(log);
290: return YES;
291: }
292: return NO;
293: }
294:
295: PRIVATE int HitSort (const void * a, const void * b)
296: {
297: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
298: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
299: if (aa && bb) return (bb->hits - aa->hits);
300: return bb - aa;
301: }
302:
1.58 frystyk 303: /*
1.64 frystyk 304: ** Sort the anchor array and log link relations
305: */
306: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
307: {
308: if (mr && array) {
1.68 ! frystyk 309: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
! 310: void ** data = NULL;
! 311: HTParentAnchor * anchor = NULL;
! 312: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
! 313: while (anchor) {
! 314:
! 315: /*
! 316: ** If we have a specific link relation to look for then do this.
! 317: ** Otherwise look for all link relations.
! 318: */
! 319: if (mr->relation) {
! 320: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
! 321: if (link) {
! 322: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
! 323: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
! 324: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
! 325: if (src_uri && dest_uri) {
! 326: #ifdef HT_MYSQL
! 327: if (mr->sqllog) {
! 328: HTSQLLog_addLinkRelationship (mr->sqllog,
! 329: src_uri, dest_uri,
! 330: HTAtom_name(mr->relation),
! 331: NULL);
! 332: }
! 333: #endif
! 334: if (log) {
! 335: HTFormat format = HTAnchor_format(dest);
! 336: HTLog_addText(log, "%s %s %s --> %s\n",
! 337: HTAtom_name(mr->relation),
! 338: format != WWW_UNKNOWN ?
! 339: HTAtom_name(format) : "<unknown>",
! 340: src_uri, dest_uri);
! 341: }
! 342:
! 343: /* Cleanup */
! 344: HT_FREE(src_uri);
! 345: HT_FREE(dest_uri);
! 346: }
! 347: }
! 348: } else {
! 349: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
! 350: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
! 351: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
! 352: HTLinkType linktype;
! 353:
! 354: /* First look in the main link */
! 355: if (link && (linktype = HTLink_type(link))) {
! 356: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
! 357: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
! 358: if (src_uri && dest_uri) {
! 359: #ifdef HT_MYSQL
! 360: if (mr->sqllog) {
! 361: HTSQLLog_addLinkRelationship (mr->sqllog,
! 362: src_uri, dest_uri,
! 363: HTAtom_name(linktype),
! 364: NULL);
! 365: }
! 366: #endif
! 367: if (log) {
! 368: HTFormat format = HTAnchor_format(dest);
! 369: HTLog_addText(log, "%s %s %s --> %s\n",
! 370: HTAtom_name(linktype),
! 371: format != WWW_UNKNOWN ?
! 372: HTAtom_name(format) : "<unknown>",
! 373: src_uri, dest_uri);
! 374: }
! 375: }
! 376: HT_FREE(dest_uri);
! 377: }
! 378:
! 379: /* and then in any sublinks */
! 380: if (sublinks) {
! 381: HTLink * pres;
! 382: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
! 383: if ((linktype = HTLink_type(pres))) {
! 384: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 385: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 ! frystyk 386: if (src_uri && dest_uri) {
! 387: #ifdef HT_MYSQL
! 388: if (mr->sqllog) {
! 389: HTSQLLog_addLinkRelationship (mr->sqllog,
! 390: src_uri, dest_uri,
! 391: HTAtom_name(linktype),
! 392: NULL);
! 393: }
! 394: #endif
! 395: if (log) {
! 396: HTFormat format = HTAnchor_format(dest);
! 397: HTLog_addText(log, "%s %s %s --> %s\n",
! 398: HTAtom_name(linktype),
! 399: format != WWW_UNKNOWN ?
! 400: HTAtom_name(format) : "<unknown>",
! 401: src_uri, dest_uri);
! 402: }
1.64 frystyk 403: HT_FREE(dest_uri);
404: }
405: }
406: }
407: }
1.68 ! frystyk 408:
! 409: /* Cleanup */
! 410: HT_FREE(src_uri);
! 411: }
! 412: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 413: }
1.68 ! frystyk 414: if (log) HTLog_close(log);
1.64 frystyk 415: return YES;
416: }
417: return NO;
418: }
419:
420: /*
1.63 frystyk 421: ** Sort the anchor array and log last modified date
422: */
423: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
424: {
425: if (mr && array) {
426: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
427: if (log) {
428: void ** data = NULL;
429: HTParentAnchor * anchor = NULL;
430: HTArray_sort(array, LastModifiedSort);
431: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
432: while (anchor) {
433: char * uri = HTAnchor_address((HTAnchor *) anchor);
434: time_t lm = HTAnchor_lastModified(anchor);
435: if (uri && lm > 0)
436: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
437: HT_FREE(uri);
438: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
439: }
440: }
441: HTLog_close(log);
442: return YES;
443: }
444: return NO;
445: }
446:
447: PRIVATE int LastModifiedSort (const void * a, const void * b)
448: {
449: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
450: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
451: return bb - aa;
452: }
453:
454: /*
455: ** Sort the anchor array and log the document title
456: */
457: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
458: {
459: if (mr && array) {
460: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
461: if (log) {
462: void ** data = NULL;
463: HTParentAnchor * anchor = NULL;
464: HTArray_sort(array, TitleSort);
465: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
466: while (anchor) {
467: char * uri = HTAnchor_address((HTAnchor *) anchor);
468: const char * title = HTAnchor_title(anchor);
469: HTCharset charset = HTAnchor_charset(anchor);
470: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
471: charset ? HTAtom_name(charset) : "<none>",
472: title ? title : "<none>",
473: uri);
474: HT_FREE(uri);
475: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
476: }
477: }
478: HTLog_close(log);
479: return YES;
480: }
481: return NO;
482: }
483:
484: PRIVATE int TitleSort (const void * a, const void * b)
485: {
486: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
487: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
488: return strcasecomp(bb?bb:"", aa?aa:"");
489: }
490:
491: /*
1.58 frystyk 492: ** Calculate distributions for media types. The same mechanism
493: ** can be used for other characteristics with relatively
494: ** few outcomes.
495: */
496: PRIVATE HTList * mediatype_distribution (HTArray * array)
497: {
498: if (array) {
499: HTList * mt = HTList_new();
500: MetaDist * pres = NULL;
501: void ** data = NULL;
502: HTParentAnchor * anchor = NULL;
503: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
504: while (anchor) {
505: HTFormat format = HTAnchor_format(anchor);
506: if (format && format != WWW_UNKNOWN) {
507: HTList * cur = mt;
508:
509: /* If found then increase counter */
510: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
511: if (pres->name == format) {
512: pres->hits++;
513: break;
514: }
515: }
516:
517: /* If not found then add new format to list */
518: if (!pres) {
519: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
520: HT_OUTOFMEM("mediatype_distribution");
521: pres->name = format;
522: pres->hits = 1;
523: HTList_addObject(mt, pres);
524: HTList_insertionSort(mt, FormatSort);
525: }
526: }
527:
528: /* Find next anchor in array */
529: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
530: }
531: return mt;
532: }
533: return NULL;
534: }
535:
1.60 frystyk 536: /*
537: ** Calculate distributions for charsets. The same mechanism
538: ** can be used for other characteristics with relatively
539: ** few outcomes.
540: */
541: PRIVATE HTList * charset_distribution (HTArray * array)
542: {
543: if (array) {
544: HTList * cs = HTList_new();
545: MetaDist * pres = NULL;
546: void ** data = NULL;
547: HTParentAnchor * anchor = NULL;
548: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
549: while (anchor) {
550: HTCharset charset = HTAnchor_charset(anchor);
551: if (charset) {
552: HTList * cur = cs;
553:
554: /* If found then increase counter */
555: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
556: if (pres->name == charset) {
557: pres->hits++;
558: break;
559: }
560: }
561:
562: /* If not found then add new format to list */
563: if (!pres) {
564: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
565: HT_OUTOFMEM("charset_distribution");
566: pres->name = charset;
567: pres->hits = 1;
568: HTList_addObject(cs, pres);
569: HTList_insertionSort(cs, FormatSort);
570: }
571: }
572:
573: /* Find next anchor in array */
574: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
575: }
576: return cs;
577: }
578: return NULL;
579: }
580:
1.58 frystyk 581: PRIVATE int FormatSort (const void * a, const void * b)
582: {
583: MetaDist * aa = (MetaDist *) a;
584: MetaDist * bb = (MetaDist *) b;
585: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
586: }
587:
588: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
589: {
590: if (logfile && distribution) {
591: HTLog * log = HTLog_open(logfile, YES, YES);
592: if (log) {
593: HTList * cur = distribution;
594: MetaDist * pres;
595: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
596: if (pres->name) {
1.60 frystyk 597: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 598: }
599: }
600: HTLog_close(log);
601: }
602: }
603: return NO;
604: }
605:
606: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
607: {
608: if (distribution) {
609: HTList * cur = distribution;
610: MetaDist * pres;
611: while ((pres = (MetaDist *) HTList_nextObject(cur)))
612: HT_FREE(pres);
613: HTList_delete(distribution);
614: return YES;
615: }
616: return NO;
617: }
618:
619:
1.55 frystyk 620: /* Statistics
621: ** ----------
622: ** Calculates a bunch of statistics for the anchors traversed
623: */
624: PRIVATE BOOL calculate_statistics (Robot * mr)
625: {
1.59 frystyk 626: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 627: if (!mr) return NO;
628:
629: /* Calculate efficiency */
1.59 frystyk 630: if (mr->time > 0) {
1.56 frystyk 631: ms_t t = HTGetTimeInMillis() - mr->time;
632: if (t > 0) {
1.60 frystyk 633: double loadfactor = (mr->get_bytes / (t * 0.001));
634: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 635: double secs = t / 1000.0;
1.55 frystyk 636: char bytes[50];
1.62 frystyk 637: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 638: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 639: total_docs, secs, reqprsec);
1.59 frystyk 640:
641: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 642: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 643: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 644: mr->get_docs, bytes, loadfactor);
1.59 frystyk 645:
646: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 647: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 648: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 649: mr->head_docs, bytes);
1.55 frystyk 650: }
651: }
652:
653: /* Create an array of existing anchors */
1.59 frystyk 654: if (total_docs > 1) {
655: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 656: if (array) {
657:
1.63 frystyk 658: /* Distributions */
659: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 660: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 661: }
662:
1.55 frystyk 663: /* Sort after hit counts */
1.63 frystyk 664: if (mr->hitfile) {
665: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 666: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 667: mr->hitfile);
668: calculate_hits(mr, array);
669: }
670:
1.64 frystyk 671: /* Sort after link relations */
1.68 ! frystyk 672: #ifdef HT_MYSQL
! 673: if (mr->relfile || mr->sqllog) {
! 674: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.64 frystyk 675: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
676: mr->relfile);
677: calculate_linkRelations(mr, array);
678: }
1.68 ! frystyk 679: #endif
1.64 frystyk 680:
1.63 frystyk 681: /* Sort after modified date */
682: if (mr->lmfile) {
683: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 684: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 685: mr->lmfile);
686: calculate_lm(mr, array);
687: }
688:
689: /* Sort after title */
690: if (mr->titlefile) {
691: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 692: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 693: mr->titlefile);
694: calculate_title(mr, array);
695: }
1.55 frystyk 696:
1.58 frystyk 697: /* Find mediatype distribution */
698: if (mr->mtfile) {
699: HTList * mtdist = mediatype_distribution(array);
700: if (mtdist) {
1.63 frystyk 701: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 702: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 703: mr->mtfile);
1.58 frystyk 704: log_meta_distribution(mr->mtfile, mtdist);
705: delete_meta_distribution(mtdist);
706: }
707: }
1.55 frystyk 708:
1.60 frystyk 709: /* Find charset distribution */
710: if (mr->charsetfile) {
711: HTList * charsetdist = charset_distribution(array);
712: if (charsetdist) {
1.63 frystyk 713: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 714: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 715: mr->charsetfile);
1.60 frystyk 716: log_meta_distribution(mr->charsetfile, charsetdist);
717: delete_meta_distribution(charsetdist);
718: }
719: }
720:
1.55 frystyk 721: /* Add as may other stats here as you like */
1.60 frystyk 722: /* ... */
1.58 frystyk 723:
724: /* Delete the array */
1.55 frystyk 725: HTArray_delete(array);
726: }
727: }
728: return YES;
729: }
730:
1.1 frystyk 731: /* Create a Command Line Object
732: ** ----------------------------
733: */
734: PRIVATE Robot * Robot_new (void)
735: {
736: Robot * me;
1.41 frystyk 737: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 738: HT_OUTOFMEM("Robot_new");
1.2 frystyk 739: me->hyperdoc = HTList_new();
1.4 frystyk 740: me->htext = HTList_new();
1.40 frystyk 741: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 742: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 743: me->output = OUTPUT;
1.35 eric 744: me->cnt = 0;
1.34 eric 745: me->fingers = HTList_new();
1.1 frystyk 746: return me;
747: }
748:
749: /* Delete a Command Line Object
750: ** ----------------------------
751: */
1.62 frystyk 752: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 753: {
1.62 frystyk 754: if (mr) {
755: HTList_delete(mr->fingers);
1.55 frystyk 756:
757: /* Calculate statistics */
1.62 frystyk 758: calculate_statistics(mr);
1.55 frystyk 759:
1.62 frystyk 760: if (mr->hyperdoc) {
761: HTList * cur = mr->hyperdoc;
1.2 frystyk 762: HyperDoc * pres;
763: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
764: HyperDoc_delete(pres);
1.62 frystyk 765: HTList_delete(mr->hyperdoc);
1.2 frystyk 766: }
1.62 frystyk 767: if (mr->htext) {
768: HTList * cur = mr->htext;
1.4 frystyk 769: HText * pres;
770: while ((pres = (HText *) HTList_nextObject(cur)))
771: HText_free(pres);
1.62 frystyk 772: HTList_delete(mr->htext);
1.4 frystyk 773: }
1.62 frystyk 774:
775: /* Close all the log files */
1.63 frystyk 776: if (mr->flags & MR_LOGGING) {
1.64 frystyk 777: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 778: }
779:
1.62 frystyk 780: if (mr->log) {
781: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 782: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 783: HTLog_accessCount(mr->log), mr->logfile);
784: HTLog_close(mr->log);
785: }
786: if (mr->ref) {
787: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 788: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 789: HTLog_accessCount(mr->ref), mr->reffile);
790: HTLog_close(mr->ref);
791: }
792: if (mr->reject) {
793: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 794: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 795: HTLog_accessCount(mr->reject), mr->rejectfile);
796: HTLog_close(mr->reject);
797: }
798: if (mr->notfound) {
799: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 800: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 801: HTLog_accessCount(mr->notfound), mr->notfoundfile);
802: HTLog_close(mr->notfound);
803: }
804: if (mr->conneg) {
805: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 806: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 807: HTLog_accessCount(mr->conneg), mr->connegfile);
808: HTLog_close(mr->conneg);
809: }
810: if (mr->noalttag) {
811: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 812: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 813: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
814: HTLog_close(mr->noalttag);
815: }
816:
817: if (mr->output && mr->output != STDOUT) fclose(mr->output);
818:
819: if (mr->flags & MR_TIME) {
1.12 frystyk 820: time_t local = time(NULL);
1.62 frystyk 821: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 822: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 823: }
1.55 frystyk 824:
1.58 frystyk 825: #ifdef HT_POSIX_REGEX
1.62 frystyk 826: if (mr->include) {
827: regfree(mr->include);
828: HT_FREE(mr->include);
829: }
830: if (mr->exclude) {
831: regfree(mr->exclude);
832: HT_FREE(mr->exclude);
833: }
834: if (mr->check) {
835: regfree(mr->check);
836: HT_FREE(mr->check);
1.58 frystyk 837: }
838: #endif
839:
1.68 ! frystyk 840: #ifdef HT_MYSQL
! 841: if (mr->sqllog) {
! 842: HTSQLLog_close(mr->sqllog);
! 843: mr->sqllog = NULL;
! 844: }
! 845: #endif
! 846:
1.62 frystyk 847: HT_FREE(mr->cwd);
848: HT_FREE(mr->prefix);
849: HT_FREE(mr->img_prefix);
850: HT_FREE(mr);
1.1 frystyk 851: return YES;
852: }
853: return NO;
854: }
855:
1.2 frystyk 856: /*
1.34 eric 857: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 858: */
1.34 eric 859: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 860: {
1.34 eric 861: Finger * me;
862: HTRequest * request = HTRequest_new();
863: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
864: HT_OUTOFMEM("Finger_new");
865: me->robot = robot;
866: me->request = request;
867: me->dest = dest;
868: HTList_addObject(robot->fingers, (void *)me);
869:
1.48 frystyk 870: /* Set the context for this request */
1.34 eric 871: HTRequest_setContext (request, me);
1.48 frystyk 872:
873: /* Check the various flags to customize the request */
874: if (robot->flags & MR_PREEMPTIVE)
875: HTRequest_setPreemptive(request, YES);
876: if (robot->flags & MR_VALIDATE)
877: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
878: if (robot->flags & MR_END_VALIDATE)
879: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
880:
881: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 882: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 883:
884: /* Set the method for this request */
1.34 eric 885: HTRequest_setMethod(request, method);
886: robot->cnt++;
887: return me;
1.2 frystyk 888: }
889:
1.34 eric 890: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 891: {
1.34 eric 892: HTList_removeObject(me->robot->fingers, (void *)me);
893: me->robot->cnt--;
1.37 frystyk 894:
895: /*
896: ** If we are down at one request then flush the output buffer
897: */
898: if (me->request) {
899: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 900: HTRequest_delete(me->request);
1.37 frystyk 901: }
902:
903: /*
904: ** Delete the request and free myself
905: */
1.34 eric 906: HT_FREE(me);
907: return YES;
1.2 frystyk 908: }
909:
910: /*
911: ** Cleanup and make sure we close all connections including the persistent
912: ** ones
913: */
1.1 frystyk 914: PRIVATE void Cleanup (Robot * me, int status)
915: {
916: Robot_delete(me);
1.29 eric 917: HTProfile_delete();
1.50 frystyk 918: #ifdef HT_MEMLOG
1.39 eric 919: HTMemLog_close();
1.47 frystyk 920: #endif
921:
1.1 frystyk 922: #ifdef VMS
923: exit(status ? status : 1);
924: #else
925: exit(status ? status : 0);
926: #endif
927: }
928:
#ifdef CATCH_SIG
#include <signal.h>
/*	SetSignal
**	This function sets up signal handlers. This might not be necessary to
**	call if the application has its own handlers (lossage on SVR4).
**	Only SIGPIPE is handled: it is set to be ignored. In HT_MEMLOG builds
**	the memory log is flushed afterwards.
*/
PRIVATE void SetSignal (void)
{
    /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
    ** when attempting to connect to a remote host where you normally should
    ** get `connection refused' back
    */
    BOOL failed = (signal(SIGPIPE, SIG_IGN) == SIG_ERR) ? YES : NO;

    if (PROT_TRACE) {
        if (failed)
            HTTrace("HTSignal.... Can't catch SIGPIPE\n");
        else
            HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif

}
#endif /* CATCH_SIG */
953:
1.58 frystyk 954: #ifdef HT_POSIX_REGEX
955: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
956: {
957: size_t length = regerror (errcode, compiled, NULL, 0);
958: char * str = NULL;
959: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
960: HT_OUTOFMEM("get_regerror");
961: (void) regerror (errcode, compiled, str, length);
962: return str;
963: }
964:
1.60 frystyk 965: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 966: {
967: regex_t * regex = NULL;
968: if (regex_str && *regex_str) {
969: int status;
970: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
971: HT_OUTOFMEM("get_regtype");
1.60 frystyk 972: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 973: char * err_msg = get_regerror(status, regex);
1.62 frystyk 974: if (SHOW_REAL_QUIET(mr))
975: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 976: HT_FREE(err_msg);
977: Cleanup(mr, -1);
978: }
979: }
980: return regex;
981: }
982: #endif
983:
1.1 frystyk 984: PRIVATE void VersionInfo (void)
985: {
1.62 frystyk 986: OutputData("W3C Sample Software\n\n");
987: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
988: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
989: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 990: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 991: }
992:
993: /* terminate_handler
994: ** -----------------
1.2 frystyk 995: ** This function is registered to handle the result of the request.
996: ** If no more requests are pending then terminate program
1.1 frystyk 997: */
1.32 frystyk 998: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
999: void * param, int status)
1.1 frystyk 1000: {
1.34 eric 1001: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 1002: Robot * mr = finger->robot;
1.62 frystyk 1003: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 1004:
1.68 ! frystyk 1005: #ifdef HT_MYSQL
! 1006: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
! 1007: #endif
! 1008:
1.58 frystyk 1009: /* Check if negotiated resource and whether we should log that*/
1010: if (mr->conneg) {
1011: HTAssocList * cur = HTResponse_variant(response);
1012: if (cur) {
1013: BOOL first = YES;
1014: HTChunk * buffer = HTChunk_new(128);
1015: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1016: HTAssoc * pres;
1.60 frystyk 1017: HTChunk_puts(buffer, uri);
1.58 frystyk 1018: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1019: char * value = HTAssoc_value(pres);
1020: if (first) {
1.60 frystyk 1021: HTChunk_puts(buffer, "\t(");
1.58 frystyk 1022: first = NO;
1023: } else
1024: HTChunk_puts(buffer, ", ");
1025:
1026: /* Output the name */
1027: HTChunk_puts(buffer, HTAssoc_name(pres));
1028:
1029: /* Only output the value if not empty string */
1.60 frystyk 1030: if (value && *value) {
1.58 frystyk 1031: HTChunk_puts(buffer, "=");
1032: HTChunk_puts(buffer, value);
1033: }
1034: }
1.60 frystyk 1035: if (!first) HTChunk_puts(buffer, ")");
1036: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 1037: HTChunk_delete(buffer);
1038: HT_FREE(uri);
1039: }
1040: }
1041:
1.55 frystyk 1042: /* Count the amount of body data that we have read */
1.59 frystyk 1043: if (HTRequest_method(request) == METHOD_GET) {
1044: int length = HTAnchor_length(HTRequest_anchor(request));
1045: if (length > 0) mr->get_bytes += length;
1046: mr->get_docs++;
1047: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 1048: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 1049: if (length > 0) mr->head_bytes += length;
1050: mr->head_docs++;
1051: } else {
1052: mr->other_docs++;
1.55 frystyk 1053: }
1054:
1.58 frystyk 1055: /* Cleanup the anchor so that we don't drown in metainformation */
1056: if (!(mr->flags & MR_KEEP_META))
1057: HTAnchor_clearHeader(HTRequest_anchor(request));
1058:
1.55 frystyk 1059: /* Delete this thread */
1.34 eric 1060: Finger_delete(finger);
1.55 frystyk 1061:
1062: /* Should we stop? */
1.46 eric 1063: if (mr->cnt <= 0) {
1.62 frystyk 1064: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 1065: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 1066: }
1.62 frystyk 1067: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 1068: return HT_OK;
1069: }
1070:
1071: /* ------------------------------------------------------------------------- */
1072: /* HTEXT INTERFACE */
1073: /* ------------------------------------------------------------------------- */
1074:
1075: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
1076: HTStream * stream)
1077: {
1078: HText * me;
1.34 eric 1079: Finger * finger = (Finger *) HTRequest_context(request);
1080: Robot * mr = finger->robot;
1.65 frystyk 1081: char * robots = NULL;
1082:
1.14 frystyk 1083: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1084: HT_OUTOFMEM("HText_new2");
1.4 frystyk 1085:
1086: /* Bind the HText object together with the Request Object */
1.1 frystyk 1087: me->request = request;
1.65 frystyk 1088: me->follow = YES;
1089:
1090: /* Check to see if we have any meta tags */
1091: if ((robots = HTAnchor_robots(anchor)) != NULL) {
1092: char * strval = NULL;
1093: char * ptr = NULL;
1094: char * token = NULL;
1095: StrAllocCopy(strval, robots);
1096: ptr = strval;
1097: while ((token = HTNextField(&ptr)) != NULL) {
1098: if (!strcasecomp(token, "nofollow")) {
1099: me->follow = NO;
1100: break;
1101: }
1102: }
1103: HT_FREE(strval);
1104: }
1.4 frystyk 1105:
1106: /* Add this HyperDoc object to our list */
1107: if (!mr->htext) mr->htext = HTList_new();
1108: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1109: return me;
1110: }
1111:
1.4 frystyk 1112: PUBLIC void HText_free (HText * me) {
1.11 frystyk 1113: if (me) HT_FREE (me);
1.4 frystyk 1114: }
1115:
1.1 frystyk 1116: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
1117: {
1118: if (text && anchor) {
1.34 eric 1119: Finger * finger = (Finger *) HTRequest_context(text->request);
1120: Robot * mr = finger->robot;
1.1 frystyk 1121: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1122: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1123: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1124: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1125: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1126: BOOL match = text->follow;
1.58 frystyk 1127: BOOL check = NO;
1.1 frystyk 1128:
1.55 frystyk 1129: if (!uri) return;
1.62 frystyk 1130: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 1131:
1132: if (hd) {
1.62 frystyk 1133: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 1134: hd->hits++;
1.68 ! frystyk 1135: #ifdef HT_MYSQL
! 1136: if (mr->sqllog) {
! 1137: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
! 1138: if (ref_addr) {
! 1139: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
! 1140: "referer", NULL);
! 1141: HT_FREE(ref_addr);
! 1142: }
! 1143: }
! 1144: #endif
1.58 frystyk 1145: HT_FREE(uri);
1146: return;
1147: }
1148:
1149: /* Check for prefix match */
1.65 frystyk 1150: if (match && mr->prefix) {
1151: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1152: }
1.58 frystyk 1153:
1154: #ifdef HT_POSIX_REGEX
1155: /* Check for any regular expression */
1156: if (match && mr->include) {
1157: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1158: }
1159: if (match && mr->exclude) {
1160: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1161: }
1162: if (match && mr->check) {
1163: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1164: }
1165: #endif
1166:
1167: /* Test whether we already have a hyperdoc for this document */
1168: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 1169: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
1170: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1171: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 1172: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1173: HTRequest * newreq = newfinger->request;
1.2 frystyk 1174: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 1175: HTRequest_setParent(newreq, referer);
1.58 frystyk 1176: if (check || depth >= mr->depth) {
1.62 frystyk 1177: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 1178: HTRequest_setMethod(newreq, METHOD_HEAD);
1179: } else {
1.62 frystyk 1180: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 1181: }
1182: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 1183: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 1184: Finger_delete(newfinger);
1.2 frystyk 1185: }
1.7 frystyk 1186: } else {
1.62 frystyk 1187: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 ! frystyk 1188: #ifdef HT_MYSQL
! 1189: if (mr->reject || mr->sqllog) {
! 1190: #else
1.60 frystyk 1191: if (mr->reject) {
1.68 ! frystyk 1192: #endif
1.60 frystyk 1193: if (referer) {
1194: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 ! frystyk 1195: if (mr->reject && ref_addr)
! 1196: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
! 1197: #ifdef HT_MYSQL
! 1198: if (mr->sqllog && mr->sqlexternals && ref_addr)
! 1199: HTSQLLog_addLinkRelationship(mr->sqllog,
! 1200: ref_addr, uri,
! 1201: "referer", NULL);
! 1202: #endif
! 1203:
1.60 frystyk 1204: HT_FREE(ref_addr);
1205: }
1206: }
1.2 frystyk 1207: }
1.11 frystyk 1208: HT_FREE(uri);
1.2 frystyk 1209: }
1210: }
1211:
1212: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1213: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1214: {
1215: if (text && anchor) {
1.34 eric 1216: Finger * finger = (Finger *) HTRequest_context(text->request);
1217: Robot * mr = finger->robot;
1.59 frystyk 1218: if (mr->flags & MR_IMG) {
1.60 frystyk 1219: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1220: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1221: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1222: HyperDoc * hd = HTAnchor_document(dest_parent);
1223: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1224: BOOL match = YES;
1225:
1226: if (hd) {
1.62 frystyk 1227: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1228: hd->hits++;
1.68 ! frystyk 1229: #ifdef HT_MYSQL
! 1230: if (mr->sqllog) {
! 1231: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
! 1232: if (ref_addr) {
! 1233: HTSQLLog_addLinkRelationship(mr->sqllog,
! 1234: ref_addr, uri,
! 1235: "image", alt);
! 1236: HT_FREE(ref_addr);
! 1237: }
! 1238: }
! 1239: #endif
1.11 frystyk 1240: HT_FREE(uri);
1.59 frystyk 1241: return;
1.2 frystyk 1242: }
1.59 frystyk 1243:
1244: /* Check for prefix match */
1245: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1246:
1247: /* Test whether we already have a hyperdoc for this document */
1248: if (match && dest) {
1.60 frystyk 1249: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1250: mr->flags & MR_SAVE ?
1251: METHOD_GET : METHOD_HEAD);
1252: HTRequest * newreq = newfinger->request;
1.60 frystyk 1253: HyperDoc_new(mr, dest_parent, 1);
1254: HTRequest_setParent(newreq, referer);
1255:
1256: /* Check whether we should report missing ALT tags */
1257: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1258: if (referer) {
1259: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1260: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1261: HT_FREE(ref_addr);
1262: }
1263: }
1264:
1.62 frystyk 1265: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1266: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1267: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1268: Finger_delete(newfinger);
1269: }
1270: } else {
1.62 frystyk 1271: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 ! frystyk 1272: #ifdef HT_MYSQL
! 1273: if (mr->reject || mr->sqllog) {
! 1274: #else
1.60 frystyk 1275: if (mr->reject) {
1.68 ! frystyk 1276: #endif
1.60 frystyk 1277: if (referer) {
1278: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 ! frystyk 1279: if (mr->reject && ref_addr)
! 1280: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
! 1281: #ifdef HT_MYSQL
! 1282: if (mr->sqllog && mr->sqlexternals && ref_addr)
! 1283: HTSQLLog_addLinkRelationship(mr->sqllog,
! 1284: ref_addr, uri,
! 1285: "image", alt);
! 1286: #endif
! 1287:
1.60 frystyk 1288: HT_FREE(ref_addr);
1289: }
1290: }
1.1 frystyk 1291: }
1.59 frystyk 1292: HT_FREE(uri);
1.1 frystyk 1293: }
1294: }
1295: }
1296:
1297: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1298: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1299: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1300: PUBLIC void HText_endAppend (HText * text) {}
1301: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1302: PUBLIC void HText_beginAppend (HText * text) {}
1303: PUBLIC void HText_appendParagraph (HText * text) {}
1304:
1.48 frystyk 1305: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1306: {
1307: return (vfprintf(stderr, fmt, pArgs));
1308: }
1309:
1.1 frystyk 1310: /* ------------------------------------------------------------------------- */
1311: /* MAIN PROGRAM */
1312: /* ------------------------------------------------------------------------- */
1313:
1314: int main (int argc, char ** argv)
1315: {
1.48 frystyk 1316: int status = 0;
1.1 frystyk 1317: int arg;
1.48 frystyk 1318: BOOL cache = NO; /* Use persistent cache */
1319: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1320: char * cache_root = NULL;
1.1 frystyk 1321: HTChunk * keywords = NULL; /* From command line */
1322: int keycnt = 0;
1.12 frystyk 1323: Robot * mr = NULL;
1.43 frystyk 1324: Finger * finger = NULL;
1325: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1326:
1327: /* Starts Mac GUSI socket library */
1328: #ifdef GUSI
1329: GUSISetup(GUSIwithSIOUXSockets);
1330: GUSISetup(GUSIwithInternetSockets);
1331: #endif
1332:
1333: #ifdef __MWERKS__ /* STR */
1334: InitGraf((Ptr) &qd.thePort);
1335: InitFonts();
1336: InitWindows();
1337: InitMenus(); TEInit();
1338: InitDialogs(nil);
1339: InitCursor();
1340: SIOUXSettings.asktosaveonclose = false;
1341: argc=ccommand(&argv);
1.50 frystyk 1342: #endif /* __MWERKS__ */
1.1 frystyk 1343:
1.50 frystyk 1344: #ifdef HT_MEMLOG
1.51 frystyk 1345: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1346: #endif
1.46 eric 1347:
1.27 frystyk 1348: /* Initiate W3C Reference Library with a robot profile */
1349: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1350: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1351:
1352: /* Add the default HTML parser to the set of converters */
1353: {
1354: HTList * converters = HTFormat_conversion();
1355: HTMLInit(converters);
1356: }
1.1 frystyk 1357:
1.12 frystyk 1358: /* Build a new robot object */
1359: mr = Robot_new();
1360:
1.1 frystyk 1361: /* Scan command Line for parameters */
1362: for (arg=1; arg<argc; arg++) {
1363: if (*argv[arg] == '-') {
1364:
1365: /* non-interactive */
1.17 frystyk 1366: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1367: HTAlert_setInteractive(NO);
1368:
1.62 frystyk 1369: /* help */
1370: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1371: VersionInfo();
1372: Cleanup(mr, 0);
1373:
1.63 frystyk 1374: /* clf log file */
1.1 frystyk 1375: } else if (!strcmp(argv[arg], "-l")) {
1376: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1377: argv[++arg] : DEFAULT_LOG_FILE;
1.63 frystyk 1378: mr->flags |= MR_LOGGING;
1.1 frystyk 1379:
1.63 frystyk 1380: /* referer log file */
1.58 frystyk 1381: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1382: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1383: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 frystyk 1384: mr->flags |= MR_LOGGING;
1.57 frystyk 1385:
1.58 frystyk 1386: /* Not found error log file */
1387: } else if (!strncmp(argv[arg], "-404", 4)) {
1388: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1389: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 frystyk 1390: mr->flags |= MR_LOGGING;
1.58 frystyk 1391:
1392: /* reject log file */
1393: } else if (!strncmp(argv[arg], "-rej", 4)) {
1394: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1395: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 frystyk 1396: mr->flags |= MR_LOGGING;
1.58 frystyk 1397:
1.63 frystyk 1398: /* no alt tags log file */
1399: } else if (!strncmp(argv[arg], "-alt", 4)) {
1400: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1401: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1402: mr->flags |= MR_LOGGING;
1403:
1404: /* negotiated resource log file */
1.58 frystyk 1405: } else if (!strncmp(argv[arg], "-neg", 4)) {
1406: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1407: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 frystyk 1408: mr->flags |= MR_LOGGING;
1409:
1410: /* hit file log */
1411: } else if (!strcmp(argv[arg], "-hit")) {
1412: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1413: argv[++arg] : DEFAULT_HIT_FILE;
1414: mr->flags |= MR_DISTRIBUTIONS;
1415:
1.64 frystyk 1416: /* link relations file log */
1417: } else if (!strcmp(argv[arg], "-rellog")) {
1418: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1419: argv[++arg] : DEFAULT_REL_FILE;
1420: mr->flags |= MR_DISTRIBUTIONS;
1421:
1422: /* Specific link relation to look for (only used i also -rellog) */
1423: } else if (!strcmp(argv[arg], "-relation")) {
1424: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
1425: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
1426: mr->flags |= MR_DISTRIBUTIONS;
1427:
1.63 frystyk 1428: /* last modified log file */
1429: } else if (!strcmp(argv[arg], "-lm")) {
1430: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1431: argv[++arg] : DEFAULT_LM_FILE;
1432: mr->flags |= MR_DISTRIBUTIONS;
1433:
1434: /* title log file */
1435: } else if (!strcmp(argv[arg], "-title")) {
1436: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
1437: argv[++arg] : DEFAULT_TITLE_FILE;
1438: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1439:
1440: /* mediatype distribution log file */
1441: } else if (!strncmp(argv[arg], "-for", 4)) {
1442: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1443: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 frystyk 1444: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1445:
1.60 frystyk 1446: /* charset distribution log file */
1447: } else if (!strncmp(argv[arg], "-char", 5)) {
1448: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1449: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 frystyk 1450: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1451:
1.55 frystyk 1452: /* rule file */
1.1 frystyk 1453: } else if (!strcmp(argv[arg], "-r")) {
1454: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1455: argv[++arg] : DEFAULT_RULE_FILE;
1456:
1457: /* output filename */
1458: } else if (!strcmp(argv[arg], "-o")) {
1459: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1460: argv[++arg] : DEFAULT_OUTPUT_FILE;
1461:
1.55 frystyk 1462: /* URI prefix */
1463: } else if (!strcmp(argv[arg], "-prefix")) {
1464: char * prefix = NULL;
1465: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1466: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1467: if (*prefix && *prefix != '*') {
1.55 frystyk 1468: StrAllocCopy(mr->prefix, prefix);
1469: StrAllocCat(mr->prefix, "*");
1470: }
1471:
1.1 frystyk 1472: /* timeout -- Change the default request timeout */
1473: } else if (!strcmp(argv[arg], "-timeout")) {
1474: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1475: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1476: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1477:
1.54 frystyk 1478: /* Force no pipelined requests */
1479: } else if (!strcmp(argv[arg], "-nopipe")) {
1.64 frystyk 1480: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
1.54 frystyk 1481:
1.48 frystyk 1482: /* Start the persistent cache */
1483: } else if (!strcmp(argv[arg], "-cache")) {
1484: cache = YES;
1485:
1.54 frystyk 1486: /* Determine the cache root */
1487: } else if (!strcmp(argv[arg], "-cacheroot")) {
1488: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1489: argv[++arg] : NULL;
1.51 frystyk 1490:
1.52 frystyk 1491: /* Stream write flush delay in ms */
1492: } else if (!strcmp(argv[arg], "-delay")) {
1493: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1494: atoi(argv[++arg]) : DEFAULT_DELAY;
1495: HTHost_setDefaultWriteDelay(delay);
1496:
1.48 frystyk 1497: /* Persistent cache flush */
1498: } else if (!strcmp(argv[arg], "-flush")) {
1499: flush = YES;
1500:
1501: /* Do a cache validation */
1502: } else if (!strcmp(argv[arg], "-validate")) {
1503: mr->flags |= MR_VALIDATE;
1504:
1505: /* Do an end-to-end cache-validation */
1506: } else if (!strcmp(argv[arg], "-endvalidate")) {
1507: mr->flags |= MR_END_VALIDATE;
1508:
1.7 frystyk 1509: /* preemptive or non-preemptive access */
1.1 frystyk 1510: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1511: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1512:
1513: /* test inlined images */
1514: } else if (!strcmp(argv[arg], "-img")) {
1515: mr->flags |= MR_IMG;
1.45 frystyk 1516:
1517: /* load inlined images */
1518: } else if (!strcmp(argv[arg], "-saveimg")) {
1519: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1520:
1521: /* URI prefix for inlined images */
1522: } else if (!strcmp(argv[arg], "-imgprefix")) {
1523: char * prefix = NULL;
1524: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1525: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1526: if (*prefix && *prefix!='*') {
1.59 frystyk 1527: StrAllocCopy(mr->img_prefix, prefix);
1528: StrAllocCat(mr->img_prefix, "*");
1529: }
1.2 frystyk 1530:
1531: /* load anchors */
1.58 frystyk 1532: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1533: mr->flags |= MR_LINK;
1.7 frystyk 1534: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1535: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1536:
1.12 frystyk 1537: /* Output start and end time */
1538: } else if (!strcmp(argv[arg], "-ss")) {
1539: mr->flags |= MR_TIME;
1540:
1.1 frystyk 1541: /* print version and exit */
1542: } else if (!strcmp(argv[arg], "-version")) {
1543: VersionInfo();
1544: Cleanup(mr, 0);
1.46 eric 1545:
1546: /* run in quiet mode */
1547: } else if (!strcmp(argv[arg], "-q")) {
1548: mr->flags |= MR_QUIET;
1.1 frystyk 1549:
1.62 frystyk 1550: /* run in really quiet mode */
1551: } else if (!strcmp(argv[arg], "-Q")) {
1552: mr->flags |= MR_REAL_QUIET;
1553:
1.1 frystyk 1554: #ifdef WWWTRACE
1555: /* trace flags */
1556: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1557: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1558: #endif
1559:
1.58 frystyk 1560: #ifdef HT_POSIX_REGEX
1561:
1562: /* If we can link against a POSIX regex library */
1563: } else if (!strncmp(argv[arg], "-inc", 4)) {
1564: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1565: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1566: }
1567: } else if (!strncmp(argv[arg], "-exc", 4)) {
1568: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1569: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1570: }
1571: } else if (!strncmp(argv[arg], "-check", 6)) {
1572: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1573: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1574: }
1575: #endif
1576:
1.68 ! frystyk 1577: #ifdef HT_MYSQL
! 1578: /* If we can link against a MYSQL database library */
! 1579: } else if (!strncmp(argv[arg], "-sqldb", 5)) {
! 1580: mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1581: argv[++arg] : DEFAULT_SQL_DB;
! 1582:
! 1583: } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
! 1584: mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;
! 1585:
! 1586: } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
! 1587: mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;
! 1588:
! 1589: } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
! 1590: mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;
! 1591:
! 1592: } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
! 1593: mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;
! 1594:
! 1595: } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
! 1596: mr->sqlexternals = YES;
! 1597:
! 1598: } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
! 1599: mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1600: argv[++arg] : DEFAULT_SQL_PW;
! 1601:
! 1602: } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
! 1603: mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1604: argv[++arg] : NULL;
! 1605:
! 1606: } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
! 1607: mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1608: argv[++arg] : DEFAULT_SQL_SERVER;
! 1609:
! 1610: } else if (!strncmp(argv[arg], "-sqluser", 5)) {
! 1611: mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
! 1612: argv[++arg] : DEFAULT_SQL_USER;
! 1613:
! 1614: #endif
! 1615:
1.1 frystyk 1616: } else {
1.62 frystyk 1617: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1618: }
1.17 frystyk 1619: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1620: if (!keycnt) {
1621: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1622: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1623: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1624: keycnt = 1;
1.11 frystyk 1625: HT_FREE(ref);
1.1 frystyk 1626: } else { /* Check for successive keyword arguments */
1627: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1628: if (keycnt++ <= 1)
1.5 frystyk 1629: keywords = HTChunk_new(128);
1.1 frystyk 1630: else
1.5 frystyk 1631: HTChunk_putc(keywords, ' ');
1632: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1633: HT_FREE(escaped);
1.1 frystyk 1634: }
1635: }
1636: }
1637:
1638: #ifdef CATCH_SIG
1639: SetSignal();
1640: #endif
1641:
1642: if (!keycnt) {
1.62 frystyk 1643: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1644: Cleanup(mr, -1);
1645: }
1646:
1647: if (mr->depth != DEFAULT_DEPTH &&
1648: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1649: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1650: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1651: mr->depth);
1.1 frystyk 1652: Cleanup(mr, -1);
1653: }
1654:
1.23 manoli 1655: /* Testing that HTTrace is working */
1.62 frystyk 1656: if (mr->flags & MR_TIME) {
1657: if (SHOW_REAL_QUIET(mr)) {
1658: time_t local = time(NULL);
1.67 frystyk 1659: HTTrace("Welcome to the W3C mini Robot version %s - started on %s\n",
1660: APP_VERSION, HTDateTimeStr(&local, YES));
1.62 frystyk 1661: }
1662: }
1.23 manoli 1663:
1.1 frystyk 1664: /* Rule file specified? */
1665: if (mr->rules) {
1666: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1667: if (!HTLoadRules(rules))
1.62 frystyk 1668: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1669: HT_FREE(rules);
1.1 frystyk 1670: }
1671:
1672: /* Output file specified? */
1673: if (mr->outputfile) {
1674: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1675: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1676: mr->output = OUTPUT;
1677: }
1678: }
1679:
1.48 frystyk 1680: /* Should we use persistent cache? */
1681: if (cache) {
1.54 frystyk 1682: HTCacheInit(cache_root, 20);
1.49 frystyk 1683: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1684: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1685: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1686:
1687: /* Should we start by flushing? */
1688: if (flush) HTCache_flushAll();
1689: }
1.68 ! frystyk 1690:
! 1691: /* SQL Log specified? */
! 1692: #ifdef HT_MYSQL
! 1693: if (mr->sqlserver) {
! 1694: if ((mr->sqllog =
! 1695: HTSQLLog_connect(mr->sqlserver,
! 1696: mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
! 1697: mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW)) != NULL) {
! 1698: HTSQLLog_openDB(mr->sqllog, mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
! 1699: mr->sqlflags);
! 1700: if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
! 1701: }
! 1702: }
! 1703: #endif
1.48 frystyk 1704:
1.58 frystyk 1705: /* CLF Log file specified? */
1.55 frystyk 1706: if (mr->logfile) {
1707: mr->log = HTLog_open(mr->logfile, YES, YES);
1708: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1709: }
1710:
1.58 frystyk 1711: /* Referer Log file specified? */
1.57 frystyk 1712: if (mr->reffile) {
1713: mr->ref = HTLog_open(mr->reffile, YES, YES);
1714: if (mr->ref)
1715: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1716: }
1.1 frystyk 1717:
1.58 frystyk 1718: /* Not found error log specified? */
1719: if (mr->notfoundfile) {
1720: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1721: if (mr->notfound)
1722: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1723: }
1724:
1725: /* Negotiated resource log specified? */
1726: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1727:
1728: /* No alt tags log file specified? */
1729: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1730:
1731: /* Reject Log file specified? */
1732: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1733:
1734: /* Register our own terminate filter */
1.32 frystyk 1735: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1736:
1737: /* Setting event timeout */
1738: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1739:
1.56 frystyk 1740: mr->time = HTGetTimeInMillis();
1.37 frystyk 1741:
1.34 eric 1742: /* Start the request */
1743: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1744:
1745: /*
1746: ** Make sure that the first request is flushed immediately and not
1747: ** buffered in the output buffer
1748: */
1749: HTRequest_setFlush(finger->request, YES);
1750:
1751: /*
1.48 frystyk 1752: ** Check whether we should do some kind of cache validation on
1753: ** the load
1754: */
1755: if (mr->flags & MR_VALIDATE)
1756: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1757: if (mr->flags & MR_END_VALIDATE)
1758: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1759:
1760: /*
1.43 frystyk 1761: ** Now do the load
1762: */
1.34 eric 1763: if (mr->flags & MR_PREEMPTIVE)
1764: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1765:
1766: if (keywords) /* Search */
1.34 eric 1767: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1768: else
1.34 eric 1769: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1770:
1.5 frystyk 1771: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1772: if (status != YES) {
1.62 frystyk 1773: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1774: Cleanup(mr, -1);
1775: }
1776:
1777: /* Go into the event loop... */
1.34 eric 1778: HTEventList_loop(finger->request);
1.1 frystyk 1779:
1780: /* Only gets here if event loop fails */
1781: Cleanup(mr, 0);
1782: return 0;
1783: }
Webmaster