Annotation of libwww/Robot/src/HTRobot.c, revision 1.74
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.71 frystyk 20: #include "WWWSQL.h"
1.9 frystyk 21:
1.4 frystyk 22: #include "HText.h"
1.1 frystyk 23:
24: #include "HTRobot.h" /* Implemented here */
25:
1.58 frystyk 26: #ifdef HT_POSIX_REGEX
1.64 frystyk 27: #ifdef HAVE_RXPOSIX_H
28: #include <rxposix.h>
29: #else
1.62 frystyk 30: #ifdef HAVE_REGEX_H
31: #include <regex.h>
32: #endif
33: #endif
1.60 frystyk 34: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 35: #endif
36:
1.14 frystyk 37: #ifndef W3C_VERSION
1.33 eric 38: #define W3C_VERSION "Unspecified"
1.1 frystyk 39: #endif
40:
41: #define APP_NAME "W3CRobot"
1.14 frystyk 42: #define APP_VERSION W3C_VERSION
1.62 frystyk 43: #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine"
1.1 frystyk 44:
45: #define DEFAULT_OUTPUT_FILE "robot.out"
46: #define DEFAULT_RULE_FILE "robot.conf"
1.58 frystyk 47: #define DEFAULT_LOG_FILE "log-clf.txt"
48: #define DEFAULT_HIT_FILE "log-hit.txt"
1.64 frystyk 49: #define DEFAULT_REL_FILE "log-rel.txt"
1.63 frystyk 50: #define DEFAULT_LM_FILE "log-lastmodified.txt"
51: #define DEFAULT_TITLE_FILE "log-title.txt"
1.58 frystyk 52: #define DEFAULT_REFERER_FILE "log-referer.txt"
53: #define DEFAULT_REJECT_FILE "log-reject.txt"
54: #define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
55: #define DEFAULT_CONNEG_FILE "log-conneg.txt"
1.60 frystyk 56: #define DEFAULT_NOALTTAG_FILE "log-alt.txt"
1.58 frystyk 57: #define DEFAULT_FORMAT_FILE "log-format.txt"
1.60 frystyk 58: #define DEFAULT_CHARSET_FILE "log-charset.txt"
1.51 frystyk 59: #define DEFAULT_MEMLOG "robot.mem"
1.55 frystyk 60: #define DEFAULT_PREFIX ""
1.59 frystyk 61: #define DEFAULT_IMG_PREFIX ""
1.7 frystyk 62: #define DEFAULT_DEPTH 0
1.53 frystyk 63: #define DEFAULT_DELAY 50 /* Write delay in ms */
1.1 frystyk 64:
1.68 frystyk 65: #define DEFAULT_SQL_SERVER "localhost"
66: #define DEFAULT_SQL_DB "webbot"
67: #define DEFAULT_SQL_USER "webbot"
68: #define DEFAULT_SQL_PW ""
69:
1.51 frystyk 70: #if 0
1.65 frystyk 71: #define HT_MEMLOG /* Is expensive in performance! */
1.51 frystyk 72: #endif
73:
1.46 eric 74: /* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */
1.62 frystyk 75: #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
76: #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
1.1 frystyk 77:
1.74 ! frystyk 78: #define MILLIES 1000
! 79: #define DEFAULT_TIMEOUT 20 /* timeout in secs */
1.1 frystyk 80:
81: #if defined(__svr4__)
82: #define CATCH_SIG
83: #endif
84:
/*
**  Option bits stored in Robot.flags. Every constant owns exactly one bit so
**  that any combination can be OR'ed together and tested with `&'.
*/
typedef enum _MRFlags {
    MR_IMG           = 1 << 0,
    MR_LINK          = 1 << 1,
    MR_PREEMPTIVE    = 1 << 2,   /* Requests are made preemptive */
    MR_TIME          = 1 << 3,   /* Report the time of termination */
    MR_SAVE          = 1 << 4,
    MR_QUIET         = 1 << 5,   /* Tested by SHOW_QUIET() */
    MR_REAL_QUIET    = 1 << 6,   /* Tested by SHOW_REAL_QUIET() */
    MR_VALIDATE      = 1 << 7,   /* Reload mode HT_CACHE_VALIDATE */
    MR_END_VALIDATE  = 1 << 8,   /* Reload mode HT_CACHE_END_VALIDATE */
    MR_KEEP_META     = 1 << 9,
    MR_LOGGING       = 1 << 10,  /* Print the "Raw Log files" heading */
    MR_DISTRIBUTIONS = 1 << 11   /* Print the "Distributions" heading */
} MRFlags;
99:
100: typedef struct _Robot {
1.2 frystyk 101: int depth; /* How deep is our tree */
1.30 frystyk 102: int cnt; /* Count of requests */
1.2 frystyk 103: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 104: HTList * htext; /* List of our HText Objects */
1.34 eric 105: HTList * fingers;
1.59 frystyk 106:
1.40 frystyk 107: int timer;
1.65 frystyk 108: char * cwd; /* Current dir URL */
1.1 frystyk 109: char * rules;
1.55 frystyk 110: char * prefix;
1.59 frystyk 111: char * img_prefix;
112:
1.60 frystyk 113: char * logfile; /* clf log */
1.55 frystyk 114: HTLog * log;
1.60 frystyk 115: char * reffile; /* referer log */
1.57 frystyk 116: HTLog * ref;
1.60 frystyk 117: char * rejectfile; /* unchecked links */
1.58 frystyk 118: HTLog * reject;
1.60 frystyk 119: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 120: HTLog * notfound;
1.60 frystyk 121: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 122: HTLog * conneg;
1.60 frystyk 123: char * noalttagfile; /* images without alt tags*/
124: HTLog * noalttag;
125:
126: char * hitfile; /* links sorted after hit counts */
1.64 frystyk 127: char * relfile; /* link sorted after relationships */
128: HTLinkType relation; /* Specific relation to look for */
1.63 frystyk 129: char * titlefile; /* links with titles */
1.60 frystyk 130: char * mtfile; /* media types encountered */
131: char * charsetfile; /* charsets encountered */
1.63 frystyk 132: char * lmfile; /* sortef after last modified dates */
1.60 frystyk 133:
134: char * outputfile;
1.1 frystyk 135: FILE * output;
1.59 frystyk 136:
1.1 frystyk 137: MRFlags flags;
1.55 frystyk 138:
1.59 frystyk 139: long get_bytes; /* Total number of bytes processed using GET*/
140: long get_docs; /* Total number of documents using GET */
141:
142: long head_bytes; /* bytes processed bytes processed using HEAD */
143: long head_docs; /* Total number of documents using HEAD*/
144:
145: long other_docs;
146:
1.56 frystyk 147: ms_t time; /* Time of run */
1.58 frystyk 148:
149: #ifdef HT_POSIX_REGEX
150: regex_t * include;
151: regex_t * exclude;
152: regex_t * check;
153: #endif
154:
1.68 frystyk 155: #ifdef HT_MYSQL
156: HTSQLLog * sqllog;
157: char * sqlserver;
158: char * sqldb;
159: char * sqluser;
160: char * sqlpw;
161: char * sqlrelative;
162: BOOL sqlexternals;
163: int sqlflags;
164: #endif
165:
1.1 frystyk 166: } Robot;
1.34 eric 167:
168: typedef struct _Finger {
169: Robot * robot;
170: HTRequest * request;
171: HTParentAnchor * dest;
172: } Finger;
173:
/*
**  Load state recorded in a HyperDoc object. A new HyperDoc starts out as
**  L_INVALID.
*/
typedef enum _LoadState {
    L_INVALID = -2,
    L_LOADING = -1,
    L_SUCCESS =  0,
    L_ERROR   =  1
} LoadState;
180:
181: /*
182: ** The HyperDoc object is bound to the anchor and contains information about
183: ** where we are in the search for recursive searches
184: */
185: typedef struct _HyperDoc {
186: HTParentAnchor * anchor;
187: LoadState state;
188: int depth;
1.55 frystyk 189: int hits;
1.1 frystyk 190: } HyperDoc;
191:
192: /*
1.65 frystyk 193: ** This is the HText object that is created every time we start parsing an
1.1 frystyk 194: ** HTML object
195: */
1.4 frystyk 196: struct _HText {
1.1 frystyk 197: HTRequest * request;
1.65 frystyk 198: BOOL follow;
1.4 frystyk 199: };
1.1 frystyk 200:
1.58 frystyk 201: /*
202: ** A structure for calculating metadata distributions
203: */
204: typedef struct _MetaDist {
205: HTAtom * name;
206: int hits;
207: } MetaDist;
208:
209: /*
210: ** Some sorting algorithms
211: */
1.63 frystyk 212: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 213:
1.1 frystyk 214: PUBLIC HText * HTMainText = NULL;
215: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
216: PUBLIC HTStyleSheet * styleSheet = NULL;
217:
218: /* ------------------------------------------------------------------------- */
219:
/*	Standard (non-error) Output
**	---------------------------
**	printf() style output to stdout.
**
**	fmt	format string followed by the arguments it describes
**
**	Returns whatever vfprintf() returns: the number of characters
**	written, or a negative value if an output error occurred.
*/
PUBLIC int OutputData(const char * fmt, ...)
{
    va_list args;
    int written;

    va_start(args, fmt);
    written = vfprintf(stdout, fmt, args);
    va_end(args);

    return written;
}
232:
233: /* ------------------------------------------------------------------------- */
234:
1.2 frystyk 235: /* Create a "HyperDoc" object
236: ** --------------------------
237: ** A HyperDoc object contains information about whether we have already
238: ** started checking the anchor and the depth in our search
239: */
240: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
241: {
242: HyperDoc * hd;
1.14 frystyk 243: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
244: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 245: hd->state = L_INVALID;
246: hd->depth = depth;
1.55 frystyk 247: hd->hits = 1;
1.2 frystyk 248:
249: /* Bind the HyperDoc object together with the Anchor Object */
250: hd->anchor = anchor;
251: HTAnchor_setDocument(anchor, (void *) hd);
252:
253: /* Add this HyperDoc object to our list */
254: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
255: HTList_addObject(mr->hyperdoc, (void *) hd);
256: return hd;
257: }
258:
259: /* Delete a "HyperDoc" object
260: ** --------------------------
261: */
262: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
263: {
264: if (hd) {
1.11 frystyk 265: HT_FREE (hd);
1.2 frystyk 266: return YES;
267: }
268: return NO;
269: }
270:
1.55 frystyk 271: /*
272: ** Sort the anchor array and log reference count
273: */
274: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
275: {
276: if (mr && array) {
277: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
278: if (log) {
279: void ** data = NULL;
280: HTParentAnchor * anchor = NULL;
281: HTArray_sort(array, HitSort);
282: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
283: while (anchor) {
284: char * uri = HTAnchor_address((HTAnchor *) anchor);
285: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 286: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 287: HT_FREE(uri);
288: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
289: }
290: }
291: HTLog_close(log);
292: return YES;
293: }
294: return NO;
295: }
296:
297: PRIVATE int HitSort (const void * a, const void * b)
298: {
299: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
300: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
301: if (aa && bb) return (bb->hits - aa->hits);
302: return bb - aa;
303: }
304:
1.58 frystyk 305: /*
1.64 frystyk 306: ** Sort the anchor array and log link relations
307: */
308: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
309: {
310: if (mr && array) {
1.68 frystyk 311: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
312: void ** data = NULL;
313: HTParentAnchor * anchor = NULL;
314: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
315: while (anchor) {
316:
317: /*
318: ** If we have a specific link relation to look for then do this.
319: ** Otherwise look for all link relations.
320: */
321: if (mr->relation) {
322: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
323: if (link) {
324: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
325: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
326: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
327: if (src_uri && dest_uri) {
328: #ifdef HT_MYSQL
329: if (mr->sqllog) {
330: HTSQLLog_addLinkRelationship (mr->sqllog,
331: src_uri, dest_uri,
332: HTAtom_name(mr->relation),
333: NULL);
334: }
335: #endif
336: if (log) {
337: HTFormat format = HTAnchor_format(dest);
338: HTLog_addText(log, "%s %s %s --> %s\n",
339: HTAtom_name(mr->relation),
340: format != WWW_UNKNOWN ?
341: HTAtom_name(format) : "<unknown>",
342: src_uri, dest_uri);
343: }
344:
345: /* Cleanup */
346: HT_FREE(src_uri);
347: HT_FREE(dest_uri);
348: }
349: }
350: } else {
351: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
352: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
353: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
354: HTLinkType linktype;
355:
356: /* First look in the main link */
357: if (link && (linktype = HTLink_type(link))) {
358: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
359: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
360: if (src_uri && dest_uri) {
361: #ifdef HT_MYSQL
362: if (mr->sqllog) {
363: HTSQLLog_addLinkRelationship (mr->sqllog,
364: src_uri, dest_uri,
365: HTAtom_name(linktype),
366: NULL);
367: }
368: #endif
369: if (log) {
370: HTFormat format = HTAnchor_format(dest);
371: HTLog_addText(log, "%s %s %s --> %s\n",
372: HTAtom_name(linktype),
373: format != WWW_UNKNOWN ?
374: HTAtom_name(format) : "<unknown>",
375: src_uri, dest_uri);
376: }
377: }
378: HT_FREE(dest_uri);
379: }
380:
381: /* and then in any sublinks */
382: if (sublinks) {
383: HTLink * pres;
384: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
385: if ((linktype = HTLink_type(pres))) {
386: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 387: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 frystyk 388: if (src_uri && dest_uri) {
389: #ifdef HT_MYSQL
390: if (mr->sqllog) {
391: HTSQLLog_addLinkRelationship (mr->sqllog,
392: src_uri, dest_uri,
393: HTAtom_name(linktype),
394: NULL);
395: }
396: #endif
397: if (log) {
398: HTFormat format = HTAnchor_format(dest);
399: HTLog_addText(log, "%s %s %s --> %s\n",
400: HTAtom_name(linktype),
401: format != WWW_UNKNOWN ?
402: HTAtom_name(format) : "<unknown>",
403: src_uri, dest_uri);
404: }
1.64 frystyk 405: HT_FREE(dest_uri);
406: }
407: }
408: }
409: }
1.68 frystyk 410:
411: /* Cleanup */
412: HT_FREE(src_uri);
413: }
414: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 415: }
1.68 frystyk 416: if (log) HTLog_close(log);
1.64 frystyk 417: return YES;
418: }
419: return NO;
420: }
421:
422: /*
1.63 frystyk 423: ** Sort the anchor array and log last modified date
424: */
425: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
426: {
427: if (mr && array) {
428: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
429: if (log) {
430: void ** data = NULL;
431: HTParentAnchor * anchor = NULL;
432: HTArray_sort(array, LastModifiedSort);
433: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
434: while (anchor) {
435: char * uri = HTAnchor_address((HTAnchor *) anchor);
436: time_t lm = HTAnchor_lastModified(anchor);
437: if (uri && lm > 0)
438: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
439: HT_FREE(uri);
440: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
441: }
442: }
443: HTLog_close(log);
444: return YES;
445: }
446: return NO;
447: }
448:
449: PRIVATE int LastModifiedSort (const void * a, const void * b)
450: {
451: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
452: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
453: return bb - aa;
454: }
455:
456: /*
457: ** Sort the anchor array and log the document title
458: */
459: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
460: {
461: if (mr && array) {
462: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
463: if (log) {
464: void ** data = NULL;
465: HTParentAnchor * anchor = NULL;
466: HTArray_sort(array, TitleSort);
467: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
468: while (anchor) {
469: char * uri = HTAnchor_address((HTAnchor *) anchor);
470: const char * title = HTAnchor_title(anchor);
471: HTCharset charset = HTAnchor_charset(anchor);
472: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
473: charset ? HTAtom_name(charset) : "<none>",
474: title ? title : "<none>",
475: uri);
476: HT_FREE(uri);
477: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
478: }
479: }
480: HTLog_close(log);
481: return YES;
482: }
483: return NO;
484: }
485:
486: PRIVATE int TitleSort (const void * a, const void * b)
487: {
488: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
489: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
490: return strcasecomp(bb?bb:"", aa?aa:"");
491: }
492:
493: /*
1.58 frystyk 494: ** Calculate distributions for media types. The same mechanism
495: ** can be used for other characteristics with relatively
496: ** few outcomes.
497: */
498: PRIVATE HTList * mediatype_distribution (HTArray * array)
499: {
500: if (array) {
501: HTList * mt = HTList_new();
502: MetaDist * pres = NULL;
503: void ** data = NULL;
504: HTParentAnchor * anchor = NULL;
505: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
506: while (anchor) {
507: HTFormat format = HTAnchor_format(anchor);
508: if (format && format != WWW_UNKNOWN) {
509: HTList * cur = mt;
510:
511: /* If found then increase counter */
512: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
513: if (pres->name == format) {
514: pres->hits++;
515: break;
516: }
517: }
518:
519: /* If not found then add new format to list */
520: if (!pres) {
521: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
522: HT_OUTOFMEM("mediatype_distribution");
523: pres->name = format;
524: pres->hits = 1;
525: HTList_addObject(mt, pres);
526: HTList_insertionSort(mt, FormatSort);
527: }
528: }
529:
530: /* Find next anchor in array */
531: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
532: }
533: return mt;
534: }
535: return NULL;
536: }
537:
1.60 frystyk 538: /*
539: ** Calculate distributions for charsets. The same mechanism
540: ** can be used for other characteristics with relatively
541: ** few outcomes.
542: */
543: PRIVATE HTList * charset_distribution (HTArray * array)
544: {
545: if (array) {
546: HTList * cs = HTList_new();
547: MetaDist * pres = NULL;
548: void ** data = NULL;
549: HTParentAnchor * anchor = NULL;
550: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
551: while (anchor) {
552: HTCharset charset = HTAnchor_charset(anchor);
553: if (charset) {
554: HTList * cur = cs;
555:
556: /* If found then increase counter */
557: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
558: if (pres->name == charset) {
559: pres->hits++;
560: break;
561: }
562: }
563:
564: /* If not found then add new format to list */
565: if (!pres) {
566: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
567: HT_OUTOFMEM("charset_distribution");
568: pres->name = charset;
569: pres->hits = 1;
570: HTList_addObject(cs, pres);
571: HTList_insertionSort(cs, FormatSort);
572: }
573: }
574:
575: /* Find next anchor in array */
576: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
577: }
578: return cs;
579: }
580: return NULL;
581: }
582:
1.58 frystyk 583: PRIVATE int FormatSort (const void * a, const void * b)
584: {
585: MetaDist * aa = (MetaDist *) a;
586: MetaDist * bb = (MetaDist *) b;
587: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
588: }
589:
590: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
591: {
592: if (logfile && distribution) {
593: HTLog * log = HTLog_open(logfile, YES, YES);
594: if (log) {
595: HTList * cur = distribution;
596: MetaDist * pres;
597: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
598: if (pres->name) {
1.60 frystyk 599: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 600: }
601: }
602: HTLog_close(log);
603: }
604: }
605: return NO;
606: }
607:
608: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
609: {
610: if (distribution) {
611: HTList * cur = distribution;
612: MetaDist * pres;
613: while ((pres = (MetaDist *) HTList_nextObject(cur)))
614: HT_FREE(pres);
615: HTList_delete(distribution);
616: return YES;
617: }
618: return NO;
619: }
620:
621:
1.55 frystyk 622: /* Statistics
623: ** ----------
624: ** Calculates a bunch of statistics for the anchors traversed
625: */
626: PRIVATE BOOL calculate_statistics (Robot * mr)
627: {
1.59 frystyk 628: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 629: if (!mr) return NO;
630:
631: /* Calculate efficiency */
1.59 frystyk 632: if (mr->time > 0) {
1.56 frystyk 633: ms_t t = HTGetTimeInMillis() - mr->time;
634: if (t > 0) {
1.60 frystyk 635: double loadfactor = (mr->get_bytes / (t * 0.001));
636: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 637: double secs = t / 1000.0;
1.55 frystyk 638: char bytes[50];
1.62 frystyk 639: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 640: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 641: total_docs, secs, reqprsec);
1.59 frystyk 642:
643: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 644: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 645: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 646: mr->get_docs, bytes, loadfactor);
1.59 frystyk 647:
648: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 649: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 650: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 651: mr->head_docs, bytes);
1.55 frystyk 652: }
653: }
654:
655: /* Create an array of existing anchors */
1.59 frystyk 656: if (total_docs > 1) {
657: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 658: if (array) {
659:
1.63 frystyk 660: /* Distributions */
661: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 662: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 663: }
664:
1.55 frystyk 665: /* Sort after hit counts */
1.63 frystyk 666: if (mr->hitfile) {
667: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 668: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 669: mr->hitfile);
670: calculate_hits(mr, array);
671: }
672:
1.64 frystyk 673: /* Sort after link relations */
1.68 frystyk 674: #ifdef HT_MYSQL
675: if (mr->relfile || mr->sqllog) {
1.69 frystyk 676: #else
677: if (mr->relfile) {
678: #endif
1.68 frystyk 679: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.64 frystyk 680: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
681: mr->relfile);
682: calculate_linkRelations(mr, array);
683: }
684:
1.63 frystyk 685: /* Sort after modified date */
686: if (mr->lmfile) {
687: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 688: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 689: mr->lmfile);
690: calculate_lm(mr, array);
691: }
692:
693: /* Sort after title */
694: if (mr->titlefile) {
695: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 696: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 697: mr->titlefile);
698: calculate_title(mr, array);
699: }
1.55 frystyk 700:
1.58 frystyk 701: /* Find mediatype distribution */
702: if (mr->mtfile) {
703: HTList * mtdist = mediatype_distribution(array);
704: if (mtdist) {
1.63 frystyk 705: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 706: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 707: mr->mtfile);
1.58 frystyk 708: log_meta_distribution(mr->mtfile, mtdist);
709: delete_meta_distribution(mtdist);
710: }
711: }
1.55 frystyk 712:
1.60 frystyk 713: /* Find charset distribution */
714: if (mr->charsetfile) {
715: HTList * charsetdist = charset_distribution(array);
716: if (charsetdist) {
1.63 frystyk 717: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 718: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 719: mr->charsetfile);
1.60 frystyk 720: log_meta_distribution(mr->charsetfile, charsetdist);
721: delete_meta_distribution(charsetdist);
722: }
723: }
724:
1.55 frystyk 725: /* Add as may other stats here as you like */
1.60 frystyk 726: /* ... */
1.58 frystyk 727:
728: /* Delete the array */
1.55 frystyk 729: HTArray_delete(array);
730: }
731: }
732: return YES;
733: }
734:
1.1 frystyk 735: /* Create a Command Line Object
736: ** ----------------------------
737: */
738: PRIVATE Robot * Robot_new (void)
739: {
740: Robot * me;
1.41 frystyk 741: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 742: HT_OUTOFMEM("Robot_new");
1.2 frystyk 743: me->hyperdoc = HTList_new();
1.4 frystyk 744: me->htext = HTList_new();
1.74 ! frystyk 745: me->timer = DEFAULT_TIMEOUT*MILLIES;
1.25 frystyk 746: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 747: me->output = OUTPUT;
1.35 eric 748: me->cnt = 0;
1.34 eric 749: me->fingers = HTList_new();
1.1 frystyk 750: return me;
751: }
752:
753: /* Delete a Command Line Object
754: ** ----------------------------
755: */
1.62 frystyk 756: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 757: {
1.62 frystyk 758: if (mr) {
759: HTList_delete(mr->fingers);
1.55 frystyk 760:
761: /* Calculate statistics */
1.62 frystyk 762: calculate_statistics(mr);
1.55 frystyk 763:
1.62 frystyk 764: if (mr->hyperdoc) {
765: HTList * cur = mr->hyperdoc;
1.2 frystyk 766: HyperDoc * pres;
767: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
768: HyperDoc_delete(pres);
1.62 frystyk 769: HTList_delete(mr->hyperdoc);
1.2 frystyk 770: }
1.62 frystyk 771: if (mr->htext) {
772: HTList * cur = mr->htext;
1.4 frystyk 773: HText * pres;
774: while ((pres = (HText *) HTList_nextObject(cur)))
775: HText_free(pres);
1.62 frystyk 776: HTList_delete(mr->htext);
1.4 frystyk 777: }
1.62 frystyk 778:
779: /* Close all the log files */
1.63 frystyk 780: if (mr->flags & MR_LOGGING) {
1.64 frystyk 781: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 782: }
783:
1.62 frystyk 784: if (mr->log) {
785: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 786: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 787: HTLog_accessCount(mr->log), mr->logfile);
788: HTLog_close(mr->log);
789: }
790: if (mr->ref) {
791: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 792: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 793: HTLog_accessCount(mr->ref), mr->reffile);
794: HTLog_close(mr->ref);
795: }
796: if (mr->reject) {
797: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 798: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 799: HTLog_accessCount(mr->reject), mr->rejectfile);
800: HTLog_close(mr->reject);
801: }
802: if (mr->notfound) {
803: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 804: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 805: HTLog_accessCount(mr->notfound), mr->notfoundfile);
806: HTLog_close(mr->notfound);
807: }
808: if (mr->conneg) {
809: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 810: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 811: HTLog_accessCount(mr->conneg), mr->connegfile);
812: HTLog_close(mr->conneg);
813: }
814: if (mr->noalttag) {
815: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 816: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 817: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
818: HTLog_close(mr->noalttag);
819: }
820:
821: if (mr->output && mr->output != STDOUT) fclose(mr->output);
822:
823: if (mr->flags & MR_TIME) {
1.12 frystyk 824: time_t local = time(NULL);
1.62 frystyk 825: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 826: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 827: }
1.55 frystyk 828:
1.58 frystyk 829: #ifdef HT_POSIX_REGEX
1.62 frystyk 830: if (mr->include) {
831: regfree(mr->include);
832: HT_FREE(mr->include);
833: }
834: if (mr->exclude) {
835: regfree(mr->exclude);
836: HT_FREE(mr->exclude);
837: }
838: if (mr->check) {
839: regfree(mr->check);
840: HT_FREE(mr->check);
1.58 frystyk 841: }
842: #endif
843:
1.68 frystyk 844: #ifdef HT_MYSQL
845: if (mr->sqllog) {
846: HTSQLLog_close(mr->sqllog);
847: mr->sqllog = NULL;
848: }
849: #endif
850:
1.62 frystyk 851: HT_FREE(mr->cwd);
852: HT_FREE(mr->prefix);
853: HT_FREE(mr->img_prefix);
854: HT_FREE(mr);
1.1 frystyk 855: return YES;
856: }
857: return NO;
858: }
859:
1.2 frystyk 860: /*
1.34 eric 861: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 862: */
1.34 eric 863: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 864: {
1.34 eric 865: Finger * me;
866: HTRequest * request = HTRequest_new();
867: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
868: HT_OUTOFMEM("Finger_new");
869: me->robot = robot;
870: me->request = request;
871: me->dest = dest;
872: HTList_addObject(robot->fingers, (void *)me);
873:
1.48 frystyk 874: /* Set the context for this request */
1.34 eric 875: HTRequest_setContext (request, me);
1.48 frystyk 876:
877: /* Check the various flags to customize the request */
878: if (robot->flags & MR_PREEMPTIVE)
879: HTRequest_setPreemptive(request, YES);
880: if (robot->flags & MR_VALIDATE)
881: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
882: if (robot->flags & MR_END_VALIDATE)
883: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
884:
885: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 886: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 887:
888: /* Set the method for this request */
1.34 eric 889: HTRequest_setMethod(request, method);
890: robot->cnt++;
891: return me;
1.2 frystyk 892: }
893:
1.34 eric 894: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 895: {
1.34 eric 896: HTList_removeObject(me->robot->fingers, (void *)me);
897: me->robot->cnt--;
1.37 frystyk 898:
899: /*
900: ** If we are down at one request then flush the output buffer
901: */
902: if (me->request) {
903: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 904: HTRequest_delete(me->request);
1.37 frystyk 905: }
906:
907: /*
908: ** Delete the request and free myself
909: */
1.34 eric 910: HT_FREE(me);
911: return YES;
1.2 frystyk 912: }
913:
914: /*
915: ** Cleanup and make sure we close all connections including the persistent
916: ** ones
917: */
1.1 frystyk 918: PRIVATE void Cleanup (Robot * me, int status)
919: {
920: Robot_delete(me);
1.29 eric 921: HTProfile_delete();
1.50 frystyk 922: #ifdef HT_MEMLOG
1.39 eric 923: HTMemLog_close();
1.47 frystyk 924: #endif
925:
1.1 frystyk 926: #ifdef VMS
927: exit(status ? status : 1);
928: #else
929: exit(status ? status : 0);
930: #endif
931: }
932:
#ifdef CATCH_SIG
#include <signal.h>
/*	SetSignal
**	---------
**	This function sets up signal handlers. This might not be necessary to
**	call if the application has its own handlers (lossage on SVR4).
**
**	On some systems (SYSV) it is necessary to deal with the SIGPIPE signal
**	when attempting to connect to a remote host where you normally should
**	get `connection refused' back. We ask for the signal to be ignored and
**	trace how that went.
*/
PRIVATE void SetSignal (void)
{
    BOOL ignored = (signal(SIGPIPE, SIG_IGN) != SIG_ERR) ? YES : NO;

    if (PROT_TRACE) {
        if (ignored) {
            HTTrace("HTSignal.... Ignoring SIGPIPE\n");
        } else {
            HTTrace("HTSignal.... Can't catch SIGPIPE\n");
        }
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
957:
#ifdef HT_POSIX_REGEX
/*
**  Get the error message belonging to an error code from regcomp().
**
**  errcode	the error code
**  compiled	the regular expression that the error code belongs to
**
**  Returns a newly allocated string that the caller must free.
*/
PRIVATE char * get_regerror (int errcode, regex_t * compiled)
{
    /* Called without a buffer regerror() tells how large it must be */
    size_t needed = regerror (errcode, compiled, NULL, 0);
    char * message = (char *) HT_MALLOC(needed+1);
    if (message == NULL) HT_OUTOFMEM("get_regerror");
    (void) regerror (errcode, compiled, message, needed);
    return message;
}

/*
**  Compile a regular expression.
**
**  mr		the robot, handed to Cleanup() if the compilation fails
**  regex_str	the regular expression
**  cflags	the flags passed on to regcomp()
**
**  Returns the compiled expression, or NULL if regex_str is NULL or the
**  empty string. If the expression does not compile then the error is
**  traced (unless MR_REAL_QUIET is set) and Cleanup() is called with a
**  status of -1.
*/
PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
{
    regex_t * compiled;
    int rc;

    if (!regex_str || !*regex_str) return NULL;

    compiled = (regex_t *) HT_CALLOC(1, sizeof(regex_t));
    if (compiled == NULL) HT_OUTOFMEM("get_regtype");

    rc = regcomp(compiled, regex_str, cflags);
    if (rc != 0) {
        char * message = get_regerror(rc, compiled);
        if (SHOW_REAL_QUIET(mr))
            HTTrace("Regular expression error: %s\n", message);
        HT_FREE(message);
        Cleanup(mr, -1);
    }
    return compiled;
}
#endif
987:
1.1 frystyk 988: PRIVATE void VersionInfo (void)
989: {
1.62 frystyk 990: OutputData("W3C Sample Software\n\n");
991: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
992: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
993: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 994: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 995: }
996:
997: /* terminate_handler
998: ** -----------------
1.2 frystyk 999: ** This function is registered to handle the result of the request.
1000: ** If no more requests are pending then terminate program
1.1 frystyk 1001: */
1.32 frystyk 1002: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
1003: void * param, int status)
1.1 frystyk 1004: {
1.34 eric 1005: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 1006: Robot * mr = finger->robot;
1.62 frystyk 1007: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 1008:
1.68 frystyk 1009: #ifdef HT_MYSQL
1010: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
1011: #endif
1012:
1.58 frystyk 1013: /* Check if negotiated resource and whether we should log that*/
1014: if (mr->conneg) {
1015: HTAssocList * cur = HTResponse_variant(response);
1016: if (cur) {
1017: BOOL first = YES;
1018: HTChunk * buffer = HTChunk_new(128);
1019: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1020: HTAssoc * pres;
1.60 frystyk 1021: HTChunk_puts(buffer, uri);
1.58 frystyk 1022: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1023: char * value = HTAssoc_value(pres);
1024: if (first) {
1.60 frystyk 1025: HTChunk_puts(buffer, "\t(");
1.58 frystyk 1026: first = NO;
1027: } else
1028: HTChunk_puts(buffer, ", ");
1029:
1030: /* Output the name */
1031: HTChunk_puts(buffer, HTAssoc_name(pres));
1032:
1033: /* Only output the value if not empty string */
1.60 frystyk 1034: if (value && *value) {
1.58 frystyk 1035: HTChunk_puts(buffer, "=");
1036: HTChunk_puts(buffer, value);
1037: }
1038: }
1.60 frystyk 1039: if (!first) HTChunk_puts(buffer, ")");
1040: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 1041: HTChunk_delete(buffer);
1042: HT_FREE(uri);
1043: }
1044: }
1045:
1.55 frystyk 1046: /* Count the amount of body data that we have read */
1.59 frystyk 1047: if (HTRequest_method(request) == METHOD_GET) {
1048: int length = HTAnchor_length(HTRequest_anchor(request));
1049: if (length > 0) mr->get_bytes += length;
1050: mr->get_docs++;
1051: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 1052: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 1053: if (length > 0) mr->head_bytes += length;
1054: mr->head_docs++;
1055: } else {
1056: mr->other_docs++;
1.55 frystyk 1057: }
1058:
1.58 frystyk 1059: /* Cleanup the anchor so that we don't drown in metainformation */
1060: if (!(mr->flags & MR_KEEP_META))
1061: HTAnchor_clearHeader(HTRequest_anchor(request));
1062:
1.55 frystyk 1063: /* Delete this thread */
1.34 eric 1064: Finger_delete(finger);
1.55 frystyk 1065:
1066: /* Should we stop? */
1.46 eric 1067: if (mr->cnt <= 0) {
1.62 frystyk 1068: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 1069: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 1070: }
1.62 frystyk 1071: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 1072: return HT_OK;
1073: }
1074:
1075: /* ------------------------------------------------------------------------- */
1076: /* HTEXT INTERFACE */
1077: /* ------------------------------------------------------------------------- */
1078:
1079: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
1080: HTStream * stream)
1081: {
1082: HText * me;
1.34 eric 1083: Finger * finger = (Finger *) HTRequest_context(request);
1084: Robot * mr = finger->robot;
1.65 frystyk 1085: char * robots = NULL;
1086:
1.14 frystyk 1087: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1088: HT_OUTOFMEM("HText_new2");
1.4 frystyk 1089:
1090: /* Bind the HText object together with the Request Object */
1.1 frystyk 1091: me->request = request;
1.65 frystyk 1092: me->follow = YES;
1093:
1094: /* Check to see if we have any meta tags */
1095: if ((robots = HTAnchor_robots(anchor)) != NULL) {
1096: char * strval = NULL;
1097: char * ptr = NULL;
1098: char * token = NULL;
1099: StrAllocCopy(strval, robots);
1100: ptr = strval;
1101: while ((token = HTNextField(&ptr)) != NULL) {
1102: if (!strcasecomp(token, "nofollow")) {
1103: me->follow = NO;
1104: break;
1105: }
1106: }
1107: HT_FREE(strval);
1108: }
1.4 frystyk 1109:
1110: /* Add this HyperDoc object to our list */
1111: if (!mr->htext) mr->htext = HTList_new();
1112: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1113: return me;
1114: }
1115:
1.4 frystyk 1116: PUBLIC void HText_free (HText * me) {
1.11 frystyk 1117: if (me) HT_FREE (me);
1.4 frystyk 1118: }
1119:
1.1 frystyk 1120: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
1121: {
1122: if (text && anchor) {
1.34 eric 1123: Finger * finger = (Finger *) HTRequest_context(text->request);
1124: Robot * mr = finger->robot;
1.1 frystyk 1125: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1126: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1127: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1128: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1129: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1130: BOOL match = text->follow;
1.58 frystyk 1131: BOOL check = NO;
1.1 frystyk 1132:
1.55 frystyk 1133: if (!uri) return;
1.62 frystyk 1134: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 1135:
1136: if (hd) {
1.62 frystyk 1137: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 1138: hd->hits++;
1.68 frystyk 1139: #ifdef HT_MYSQL
1140: if (mr->sqllog) {
1141: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1142: if (ref_addr) {
1143: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1144: "referer", NULL);
1145: HT_FREE(ref_addr);
1146: }
1147: }
1148: #endif
1.58 frystyk 1149: HT_FREE(uri);
1150: return;
1151: }
1.70 frystyk 1152:
1.58 frystyk 1153: /* Check for prefix match */
1.65 frystyk 1154: if (match && mr->prefix) {
1155: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1156: }
1.58 frystyk 1157:
1158: #ifdef HT_POSIX_REGEX
1.69 frystyk 1159: /*
1160: ** Check for any regular expression. The include may override
1161: ** the prefix matching
1162: */
1163: if (mr->include) {
1.58 frystyk 1164: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1165: }
1166: if (match && mr->exclude) {
1167: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1168: }
1169: if (match && mr->check) {
1170: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1171: }
1172: #endif
1173:
1174: /* Test whether we already have a hyperdoc for this document */
1175: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 1176: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
1177: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1178: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 1179: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1180: HTRequest * newreq = newfinger->request;
1.2 frystyk 1181: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 1182: HTRequest_setParent(newreq, referer);
1.58 frystyk 1183: if (check || depth >= mr->depth) {
1.62 frystyk 1184: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 1185: HTRequest_setMethod(newreq, METHOD_HEAD);
1186: } else {
1.62 frystyk 1187: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 1188: }
1189: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 frystyk 1190: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 1191: Finger_delete(newfinger);
1.2 frystyk 1192: }
1.7 frystyk 1193: } else {
1.62 frystyk 1194: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1195: #ifdef HT_MYSQL
1196: if (mr->reject || mr->sqllog) {
1197: #else
1.60 frystyk 1198: if (mr->reject) {
1.68 frystyk 1199: #endif
1.60 frystyk 1200: if (referer) {
1201: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1202: if (mr->reject && ref_addr)
1203: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1204: #ifdef HT_MYSQL
1205: if (mr->sqllog && mr->sqlexternals && ref_addr)
1206: HTSQLLog_addLinkRelationship(mr->sqllog,
1207: ref_addr, uri,
1208: "referer", NULL);
1209: #endif
1210:
1.60 frystyk 1211: HT_FREE(ref_addr);
1212: }
1213: }
1.2 frystyk 1214: }
1.11 frystyk 1215: HT_FREE(uri);
1.2 frystyk 1216: }
1217: }
1218:
1219: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1220: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1221: {
1222: if (text && anchor) {
1.34 eric 1223: Finger * finger = (Finger *) HTRequest_context(text->request);
1224: Robot * mr = finger->robot;
1.59 frystyk 1225: if (mr->flags & MR_IMG) {
1.60 frystyk 1226: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1227: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1228: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1229: HyperDoc * hd = HTAnchor_document(dest_parent);
1230: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1231: BOOL match = YES;
1232:
1.72 frystyk 1233: if (!uri) return;
1.59 frystyk 1234: if (hd) {
1.62 frystyk 1235: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 1236: hd->hits++;
1.68 frystyk 1237: #ifdef HT_MYSQL
1238: if (mr->sqllog) {
1239: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1240: if (ref_addr) {
1241: HTSQLLog_addLinkRelationship(mr->sqllog,
1242: ref_addr, uri,
1243: "image", alt);
1244: HT_FREE(ref_addr);
1245: }
1246: }
1247: #endif
1.11 frystyk 1248: HT_FREE(uri);
1.59 frystyk 1249: return;
1.2 frystyk 1250: }
1.59 frystyk 1251:
1252: /* Check for prefix match */
1253: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1254:
1255: /* Test whether we already have a hyperdoc for this document */
1256: if (match && dest) {
1.60 frystyk 1257: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1258: mr->flags & MR_SAVE ?
1259: METHOD_GET : METHOD_HEAD);
1260: HTRequest * newreq = newfinger->request;
1.60 frystyk 1261: HyperDoc_new(mr, dest_parent, 1);
1262: HTRequest_setParent(newreq, referer);
1263:
1264: /* Check whether we should report missing ALT tags */
1265: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1266: if (referer) {
1267: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1268: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1269: HT_FREE(ref_addr);
1270: }
1271: }
1272:
1.62 frystyk 1273: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1274: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1275: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1276: Finger_delete(newfinger);
1277: }
1278: } else {
1.62 frystyk 1279: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.68 frystyk 1280: #ifdef HT_MYSQL
1281: if (mr->reject || mr->sqllog) {
1282: #else
1.60 frystyk 1283: if (mr->reject) {
1.68 frystyk 1284: #endif
1.60 frystyk 1285: if (referer) {
1286: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1287: if (mr->reject && ref_addr)
1288: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1289: #ifdef HT_MYSQL
1290: if (mr->sqllog && mr->sqlexternals && ref_addr)
1291: HTSQLLog_addLinkRelationship(mr->sqllog,
1292: ref_addr, uri,
1293: "image", alt);
1294: #endif
1295:
1.60 frystyk 1296: HT_FREE(ref_addr);
1297: }
1298: }
1.1 frystyk 1299: }
1.59 frystyk 1300: HT_FREE(uri);
1.72 frystyk 1301: }
1302: }
1303: }
1304:
1305: PUBLIC void HText_appendLink (HText * text, HTChildAnchor * anchor,
1306: const BOOL * present, const char ** value)
1307: {
1308: if (text && anchor) {
1309: Finger * finger = (Finger *) HTRequest_context(text->request);
1310: Robot * mr = finger->robot;
1311: if (SHOW_QUIET(mr))
1312: HTTrace("Robot....... Received Link element with anchor %p\n", anchor);
1313: HText_beginAnchor(text, anchor);
1314: }
1315: }
1316:
1317: PUBLIC void HText_appendObject (HText * text, int element_number,
1318: const BOOL * present, const char ** value)
1319: {
1320: /* Here we can look for frames, link tags, meta tags etc. */
1321: if (text && text->request) {
1322: Finger * finger = (Finger *) HTRequest_context(text->request);
1323: Robot * mr = finger->robot;
1324:
1325: if (SHOW_QUIET(mr))
1326: HTTrace("Robot....... HText Object %p called with HTML element number %d\n",
1327: text, element_number);
1328:
1329: switch (element_number) {
1330:
1331: case HTML_FRAME:
1332: {
1333: HTChildAnchor * source = HTAnchor_findChildAndLink(
1334: HTRequest_anchor(text->request), /* Parent */
1335: NULL, /* Tag */
1336: present[HTML_FRAME_SRC] ? value[HTML_FRAME_SRC] : NULL, /* Addresss */
1337: NULL); /* Rels */
1338: HText_beginAnchor(text, source);
1339: }
1340: break;
1341:
1342: case HTML_BODY:
1343: {
1344: HTChildAnchor * source = HTAnchor_findChildAndLink(
1345: HTRequest_anchor(text->request), /* Parent */
1346: NULL, /* Tag */
1347: present[HTML_BODY_BACKGROUND] ? value[HTML_BODY_BACKGROUND] : NULL, /* Addresss */
1348: NULL); /* Rels */
1349: HText_appendImage(text, source, NULL, NULL, NO);
1350: }
1351: break;
1352:
1353: default:
1354: break;
1.1 frystyk 1355: }
1356: }
1357: }
1358:
1359: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1360: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1361: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1362: PUBLIC void HText_endAppend (HText * text) {}
1363: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1364: PUBLIC void HText_beginAppend (HText * text) {}
1365: PUBLIC void HText_appendParagraph (HText * text) {}
1366:
1.48 frystyk 1367: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
1368: {
1369: return (vfprintf(stderr, fmt, pArgs));
1370: }
1371:
1.1 frystyk 1372: /* ------------------------------------------------------------------------- */
1373: /* MAIN PROGRAM */
1374: /* ------------------------------------------------------------------------- */
1375:
1376: int main (int argc, char ** argv)
1377: {
1.48 frystyk 1378: int status = 0;
1.1 frystyk 1379: int arg;
1.48 frystyk 1380: BOOL cache = NO; /* Use persistent cache */
1381: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 1382: char * cache_root = NULL;
1.1 frystyk 1383: HTChunk * keywords = NULL; /* From command line */
1384: int keycnt = 0;
1.12 frystyk 1385: Robot * mr = NULL;
1.43 frystyk 1386: Finger * finger = NULL;
1387: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 1388:
1389: /* Starts Mac GUSI socket library */
1390: #ifdef GUSI
1391: GUSISetup(GUSIwithSIOUXSockets);
1392: GUSISetup(GUSIwithInternetSockets);
1393: #endif
1394:
1395: #ifdef __MWERKS__ /* STR */
1396: InitGraf((Ptr) &qd.thePort);
1397: InitFonts();
1398: InitWindows();
1399: InitMenus(); TEInit();
1400: InitDialogs(nil);
1401: InitCursor();
1402: SIOUXSettings.asktosaveonclose = false;
1403: argc=ccommand(&argv);
1.50 frystyk 1404: #endif /* __MWERKS__ */
1.1 frystyk 1405:
1.50 frystyk 1406: #ifdef HT_MEMLOG
1.51 frystyk 1407: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1408: #endif
1.46 eric 1409:
1.27 frystyk 1410: /* Initiate W3C Reference Library with a robot profile */
1411: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1412: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1413:
1414: /* Add the default HTML parser to the set of converters */
1415: {
1416: HTList * converters = HTFormat_conversion();
1417: HTMLInit(converters);
1418: }
1.1 frystyk 1419:
1.12 frystyk 1420: /* Build a new robot object */
1421: mr = Robot_new();
1422:
1.1 frystyk 1423: /* Scan command Line for parameters */
1424: for (arg=1; arg<argc; arg++) {
1425: if (*argv[arg] == '-') {
1426:
1427: /* non-interactive */
1.17 frystyk 1428: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1429: HTAlert_setInteractive(NO);
1430:
1.62 frystyk 1431: /* help */
1432: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
1433: VersionInfo();
1434: Cleanup(mr, 0);
1435:
1.63 frystyk 1436: /* clf log file */
1.1 frystyk 1437: } else if (!strcmp(argv[arg], "-l")) {
1438: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1439: argv[++arg] : DEFAULT_LOG_FILE;
1.63 frystyk 1440: mr->flags |= MR_LOGGING;
1.1 frystyk 1441:
1.63 frystyk 1442: /* referer log file */
1.58 frystyk 1443: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1444: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1445: argv[++arg] : DEFAULT_REFERER_FILE;
1.63 frystyk 1446: mr->flags |= MR_LOGGING;
1.57 frystyk 1447:
1.58 frystyk 1448: /* Not found error log file */
1449: } else if (!strncmp(argv[arg], "-404", 4)) {
1450: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1451: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1.63 frystyk 1452: mr->flags |= MR_LOGGING;
1.58 frystyk 1453:
1454: /* reject log file */
1455: } else if (!strncmp(argv[arg], "-rej", 4)) {
1456: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1457: argv[++arg] : DEFAULT_REJECT_FILE;
1.63 frystyk 1458: mr->flags |= MR_LOGGING;
1.58 frystyk 1459:
1.63 frystyk 1460: /* no alt tags log file */
1461: } else if (!strncmp(argv[arg], "-alt", 4)) {
1462: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1463: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1464: mr->flags |= MR_LOGGING;
1465:
1466: /* negotiated resource log file */
1.58 frystyk 1467: } else if (!strncmp(argv[arg], "-neg", 4)) {
1468: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1469: argv[++arg] : DEFAULT_CONNEG_FILE;
1.63 frystyk 1470: mr->flags |= MR_LOGGING;
1471:
1472: /* hit file log */
1473: } else if (!strcmp(argv[arg], "-hit")) {
1474: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1475: argv[++arg] : DEFAULT_HIT_FILE;
1476: mr->flags |= MR_DISTRIBUTIONS;
1477:
1.64 frystyk 1478: /* link relations file log */
1479: } else if (!strcmp(argv[arg], "-rellog")) {
1480: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1481: argv[++arg] : DEFAULT_REL_FILE;
1482: mr->flags |= MR_DISTRIBUTIONS;
1483:
1484: /* Specific link relation to look for (only used i also -rellog) */
1485: } else if (!strcmp(argv[arg], "-relation")) {
1486: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
1487: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
1488: mr->flags |= MR_DISTRIBUTIONS;
1489:
1.63 frystyk 1490: /* last modified log file */
1491: } else if (!strcmp(argv[arg], "-lm")) {
1492: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1493: argv[++arg] : DEFAULT_LM_FILE;
1494: mr->flags |= MR_DISTRIBUTIONS;
1495:
1496: /* title log file */
1497: } else if (!strcmp(argv[arg], "-title")) {
1498: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
1499: argv[++arg] : DEFAULT_TITLE_FILE;
1500: mr->flags |= MR_DISTRIBUTIONS;
1.58 frystyk 1501:
1502: /* mediatype distribution log file */
1503: } else if (!strncmp(argv[arg], "-for", 4)) {
1504: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1505: argv[++arg] : DEFAULT_FORMAT_FILE;
1.63 frystyk 1506: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.58 frystyk 1507:
1.60 frystyk 1508: /* charset distribution log file */
1509: } else if (!strncmp(argv[arg], "-char", 5)) {
1510: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1511: argv[++arg] : DEFAULT_CHARSET_FILE;
1.63 frystyk 1512: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
1.60 frystyk 1513:
1.55 frystyk 1514: /* rule file */
1.1 frystyk 1515: } else if (!strcmp(argv[arg], "-r")) {
1516: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1517: argv[++arg] : DEFAULT_RULE_FILE;
1518:
1519: /* output filename */
1520: } else if (!strcmp(argv[arg], "-o")) {
1521: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1522: argv[++arg] : DEFAULT_OUTPUT_FILE;
1523:
1.55 frystyk 1524: /* URI prefix */
1525: } else if (!strcmp(argv[arg], "-prefix")) {
1526: char * prefix = NULL;
1527: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1528: argv[++arg] : DEFAULT_PREFIX;
1.62 frystyk 1529: if (*prefix && *prefix != '*') {
1.55 frystyk 1530: StrAllocCopy(mr->prefix, prefix);
1531: StrAllocCat(mr->prefix, "*");
1532: }
1533:
1.1 frystyk 1534: /* timeout -- Change the default request timeout */
1535: } else if (!strcmp(argv[arg], "-timeout")) {
1536: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1537: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.74 ! frystyk 1538: if (timeout > 1) mr->timer = timeout*MILLIES;
1.1 frystyk 1539:
1.54 frystyk 1540: /* Force no pipelined requests */
1541: } else if (!strcmp(argv[arg], "-nopipe")) {
1.64 frystyk 1542: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
1.54 frystyk 1543:
1.48 frystyk 1544: /* Start the persistent cache */
1545: } else if (!strcmp(argv[arg], "-cache")) {
1546: cache = YES;
1547:
1.54 frystyk 1548: /* Determine the cache root */
1549: } else if (!strcmp(argv[arg], "-cacheroot")) {
1550: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1551: argv[++arg] : NULL;
1.51 frystyk 1552:
1.52 frystyk 1553: /* Stream write flush delay in ms */
1554: } else if (!strcmp(argv[arg], "-delay")) {
1555: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1556: atoi(argv[++arg]) : DEFAULT_DELAY;
1557: HTHost_setDefaultWriteDelay(delay);
1558:
1.48 frystyk 1559: /* Persistent cache flush */
1560: } else if (!strcmp(argv[arg], "-flush")) {
1561: flush = YES;
1562:
1563: /* Do a cache validation */
1564: } else if (!strcmp(argv[arg], "-validate")) {
1565: mr->flags |= MR_VALIDATE;
1566:
1567: /* Do an end-to-end cache-validation */
1568: } else if (!strcmp(argv[arg], "-endvalidate")) {
1569: mr->flags |= MR_END_VALIDATE;
1570:
1.7 frystyk 1571: /* preemptive or non-preemptive access */
1.1 frystyk 1572: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1573: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1574:
1575: /* test inlined images */
1576: } else if (!strcmp(argv[arg], "-img")) {
1577: mr->flags |= MR_IMG;
1.45 frystyk 1578:
1579: /* load inlined images */
1580: } else if (!strcmp(argv[arg], "-saveimg")) {
1581: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1582:
1583: /* URI prefix for inlined images */
1584: } else if (!strcmp(argv[arg], "-imgprefix")) {
1585: char * prefix = NULL;
1586: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1587: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 frystyk 1588: if (*prefix && *prefix!='*') {
1.59 frystyk 1589: StrAllocCopy(mr->img_prefix, prefix);
1590: StrAllocCat(mr->img_prefix, "*");
1591: }
1.2 frystyk 1592:
1593: /* load anchors */
1.58 frystyk 1594: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1595: mr->flags |= MR_LINK;
1.7 frystyk 1596: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1597: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1598:
1.12 frystyk 1599: /* Output start and end time */
1600: } else if (!strcmp(argv[arg], "-ss")) {
1601: mr->flags |= MR_TIME;
1602:
1.1 frystyk 1603: /* print version and exit */
1604: } else if (!strcmp(argv[arg], "-version")) {
1605: VersionInfo();
1606: Cleanup(mr, 0);
1.46 eric 1607:
1608: /* run in quiet mode */
1609: } else if (!strcmp(argv[arg], "-q")) {
1610: mr->flags |= MR_QUIET;
1.1 frystyk 1611:
1.62 frystyk 1612: /* run in really quiet mode */
1613: } else if (!strcmp(argv[arg], "-Q")) {
1614: mr->flags |= MR_REAL_QUIET;
1615:
1.1 frystyk 1616: #ifdef WWWTRACE
1617: /* trace flags */
1618: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1619: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1620: #endif
1621:
1.58 frystyk 1622: #ifdef HT_POSIX_REGEX
1623:
1624: /* If we can link against a POSIX regex library */
1625: } else if (!strncmp(argv[arg], "-inc", 4)) {
1626: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1627: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1628: }
1629: } else if (!strncmp(argv[arg], "-exc", 4)) {
1630: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1631: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1632: }
1633: } else if (!strncmp(argv[arg], "-check", 6)) {
1634: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1635: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1636: }
1637: #endif
1638:
1.68 frystyk 1639: #ifdef HT_MYSQL
1640: /* If we can link against a MYSQL database library */
1641: } else if (!strncmp(argv[arg], "-sqldb", 5)) {
1642: mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
1643: argv[++arg] : DEFAULT_SQL_DB;
1644:
1645: } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
1646: mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;
1647:
1648: } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
1649: mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;
1650:
1651: } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
1652: mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;
1653:
1654: } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
1655: mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;
1656:
1657: } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
1658: mr->sqlexternals = YES;
1659:
1660: } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
1661: mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
1662: argv[++arg] : DEFAULT_SQL_PW;
1663:
1664: } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
1665: mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
1666: argv[++arg] : NULL;
1667:
1668: } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
1669: mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
1670: argv[++arg] : DEFAULT_SQL_SERVER;
1671:
1672: } else if (!strncmp(argv[arg], "-sqluser", 5)) {
1673: mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
1674: argv[++arg] : DEFAULT_SQL_USER;
1675:
1676: #endif
1677:
1.1 frystyk 1678: } else {
1.62 frystyk 1679: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1680: }
1.17 frystyk 1681: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1682: if (!keycnt) {
1683: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1684: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1685: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1686: keycnt = 1;
1.11 frystyk 1687: HT_FREE(ref);
1.1 frystyk 1688: } else { /* Check for successive keyword arguments */
1689: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1690: if (keycnt++ <= 1)
1.5 frystyk 1691: keywords = HTChunk_new(128);
1.1 frystyk 1692: else
1.5 frystyk 1693: HTChunk_putc(keywords, ' ');
1694: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1695: HT_FREE(escaped);
1.1 frystyk 1696: }
1697: }
1698: }
1699:
1700: #ifdef CATCH_SIG
1701: SetSignal();
1702: #endif
1703:
1704: if (!keycnt) {
1.62 frystyk 1705: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1706: Cleanup(mr, -1);
1707: }
1708:
1709: if (mr->depth != DEFAULT_DEPTH &&
1710: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 frystyk 1711: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1712: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1713: mr->depth);
1.1 frystyk 1714: Cleanup(mr, -1);
1715: }
1716:
1.23 manoli 1717: /* Testing that HTTrace is working */
1.62 frystyk 1718: if (mr->flags & MR_TIME) {
1719: if (SHOW_REAL_QUIET(mr)) {
1720: time_t local = time(NULL);
1.67 frystyk 1721: HTTrace("Welcome to the W3C mini Robot version %s - started on %s\n",
1722: APP_VERSION, HTDateTimeStr(&local, YES));
1.62 frystyk 1723: }
1724: }
1.23 manoli 1725:
1.1 frystyk 1726: /* Rule file specified? */
1727: if (mr->rules) {
1728: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.73 frystyk 1729: if (!HTLoadRulesAutomatically(rules))
1.62 frystyk 1730: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1731: HT_FREE(rules);
1.1 frystyk 1732: }
1733:
1734: /* Output file specified? */
1735: if (mr->outputfile) {
1736: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 frystyk 1737: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1738: mr->output = OUTPUT;
1739: }
1740: }
1741:
1.48 frystyk 1742: /* Should we use persistent cache? */
1743: if (cache) {
1.54 frystyk 1744: HTCacheInit(cache_root, 20);
1.49 frystyk 1745: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1746: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1747: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1748:
1749: /* Should we start by flushing? */
1750: if (flush) HTCache_flushAll();
1751: }
1.68 frystyk 1752:
1753: /* SQL Log specified? */
1754: #ifdef HT_MYSQL
1755: if (mr->sqlserver) {
1756: if ((mr->sqllog =
1.69 frystyk 1757: HTSQLLog_open(mr->sqlserver,
1758: mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
1759: mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW,
1760: mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
1761: mr->sqlflags)) != NULL) {
1.68 frystyk 1762: if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
1763: }
1764: }
1765: #endif
1.48 frystyk 1766:
1.58 frystyk 1767: /* CLF Log file specified? */
1.55 frystyk 1768: if (mr->logfile) {
1769: mr->log = HTLog_open(mr->logfile, YES, YES);
1770: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1771: }
1772:
1.58 frystyk 1773: /* Referer Log file specified? */
1.57 frystyk 1774: if (mr->reffile) {
1775: mr->ref = HTLog_open(mr->reffile, YES, YES);
1776: if (mr->ref)
1777: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1778: }
1.1 frystyk 1779:
1.58 frystyk 1780: /* Not found error log specified? */
1781: if (mr->notfoundfile) {
1782: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1783: if (mr->notfound)
1784: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1785: }
1786:
1787: /* Negotiated resource log specified? */
1788: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1789:
1790: /* No alt tags log file specified? */
1791: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1792:
1793: /* Reject Log file specified? */
1794: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1795:
1796: /* Register our own terminate filter */
1.32 frystyk 1797: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1798:
1799: /* Setting event timeout */
1800: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1801:
1.56 frystyk 1802: mr->time = HTGetTimeInMillis();
1.37 frystyk 1803:
1.34 eric 1804: /* Start the request */
1805: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1806:
1807: /*
1808: ** Make sure that the first request is flushed immediately and not
1809: ** buffered in the output buffer
1810: */
1811: HTRequest_setFlush(finger->request, YES);
1812:
1813: /*
1.48 frystyk 1814: ** Check whether we should do some kind of cache validation on
1815: ** the load
1816: */
1817: if (mr->flags & MR_VALIDATE)
1818: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1819: if (mr->flags & MR_END_VALIDATE)
1820: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1821:
1822: /*
1.43 frystyk 1823: ** Now do the load
1824: */
1.34 eric 1825: if (mr->flags & MR_PREEMPTIVE)
1826: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1827:
1828: if (keywords) /* Search */
1.34 eric 1829: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1830: else
1.34 eric 1831: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1832:
1.5 frystyk 1833: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1834: if (status != YES) {
1.62 frystyk 1835: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1836: Cleanup(mr, -1);
1837: }
1838:
1839: /* Go into the event loop... */
1.34 eric 1840: HTEventList_loop(finger->request);
1.1 frystyk 1841:
1842: /* Only gets here if event loop fails */
1843: Cleanup(mr, 0);
1844: return 0;
1845: }
Webmaster