Annotation of libwww/Robot/src/HTRobot.c, revision 1.62
1.1 frystyk 1: /* HTRobot.c
2: ** W3C MINI ROBOT
3: **
4: ** (c) COPYRIGHT MIT 1995.
5: ** Please first read the full copyright statement in the file COPYRIGH.
6: **
7: ** This program illustrates how to traverse links using the Anchor object
8: **
9: ** Authors:
10: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
11: **
12: ** History:
13: ** Dec 04 95 First version
14: */
15:
16: #include "WWWLib.h" /* Global Library Include file */
17: #include "WWWApp.h" /* Application stuff */
1.17 frystyk 18: #include "WWWTrans.h"
1.10 frystyk 19: #include "WWWInit.h"
1.9 frystyk 20:
1.4 frystyk 21: #include "HText.h"
1.1 frystyk 22:
23: #include "HTRobot.h" /* Implemented here */
24:
1.58 frystyk 25: #ifdef HT_POSIX_REGEX
1.62 ! frystyk 26: #ifdef HAVE_REGEX_H
! 27: #include <regex.h>
! 28: #else
! 29: #ifdef HAVE_RXPOSIX_H
! 30: #include <rxposix.h>
! 31: #endif
! 32: #endif
1.60 frystyk 33: #define W3C_REGEX_FLAGS (REG_EXTENDED | REG_NEWLINE)
1.58 frystyk 34: #endif
35:
1.14 frystyk 36: #ifndef W3C_VERSION
1.33 eric 37: #define W3C_VERSION "Unspecified"
1.1 frystyk 38: #endif
39:
40: #define APP_NAME "W3CRobot"
1.14 frystyk 41: #define APP_VERSION W3C_VERSION
1.62 ! frystyk 42: #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine"
1.1 frystyk 43:
44: #define DEFAULT_OUTPUT_FILE "robot.out"
45: #define DEFAULT_RULE_FILE "robot.conf"
1.58 frystyk 46: #define DEFAULT_LOG_FILE "log-clf.txt"
47: #define DEFAULT_HIT_FILE "log-hit.txt"
48: #define DEFAULT_REFERER_FILE "log-referer.txt"
49: #define DEFAULT_REJECT_FILE "log-reject.txt"
50: #define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
51: #define DEFAULT_CONNEG_FILE "log-conneg.txt"
1.60 frystyk 52: #define DEFAULT_NOALTTAG_FILE "log-alt.txt"
1.58 frystyk 53: #define DEFAULT_FORMAT_FILE "log-format.txt"
1.60 frystyk 54: #define DEFAULT_CHARSET_FILE "log-charset.txt"
1.51 frystyk 55: #define DEFAULT_MEMLOG "robot.mem"
1.55 frystyk 56: #define DEFAULT_PREFIX ""
1.59 frystyk 57: #define DEFAULT_IMG_PREFIX ""
1.7 frystyk 58: #define DEFAULT_DEPTH 0
1.53 frystyk 59: #define DEFAULT_DELAY 50 /* Write delay in ms */
1.1 frystyk 60:
1.51 frystyk 61: #if 0
1.53 frystyk 62: #define HT_MEMLOG /* May be expensive in performance! */
1.51 frystyk 63: #endif
64:
1.46 eric 65: /* #define SHOW_MSG (WWWTRACE || HTAlert_interactive()) */
1.62 ! frystyk 66: #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
! 67: #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
1.1 frystyk 68:
1.40 frystyk 69: #define DEFAULT_TIMEOUT 10000 /* timeout in millis */
1.1 frystyk 70:
71: #if defined(__svr4__)
72: #define CATCH_SIG
73: #endif
74:
/*
**  Option bits for the robot. Each value is a distinct bit so that several
**  options can be OR'ed together in the Robot "flags" field.
*/
typedef enum _MRFlags {
    MR_IMG          = 1 << 0,   /* Check inlined images */
    MR_LINK         = 1 << 1,   /* Follow anchors found in documents */
    MR_PREEMPTIVE   = 1 << 2,   /* Issue requests in preemptive mode */
    MR_TIME         = 1 << 3,   /* Print the termination time */
    MR_SAVE         = 1 << 4,   /* Use GET instead of HEAD for images */
    MR_QUIET        = 1 << 5,   /* Suppress per-link progress traces */
    MR_REAL_QUIET   = 1 << 6,   /* Suppress summary traces as well */
    MR_VALIDATE     = 1 << 7,   /* Request cache validation */
    MR_END_VALIDATE = 1 << 8,   /* Request end-to-end cache validation */
    MR_KEEP_META    = 1 << 9    /* Don't clear anchor headers after a request */
} MRFlags;
87:
88: typedef struct _Robot {
1.2 frystyk 89: int depth; /* How deep is our tree */
1.30 frystyk 90: int cnt; /* Count of requests */
1.2 frystyk 91: HTList * hyperdoc; /* List of our HyperDoc Objects */
1.4 frystyk 92: HTList * htext; /* List of our HText Objects */
1.34 eric 93: HTList * fingers;
1.59 frystyk 94:
1.40 frystyk 95: int timer;
1.1 frystyk 96: char * cwd; /* Current dir URL */
97: char * rules;
1.55 frystyk 98: char * prefix;
1.59 frystyk 99: char * img_prefix;
100:
1.60 frystyk 101: char * logfile; /* clf log */
1.55 frystyk 102: HTLog * log;
1.60 frystyk 103: char * reffile; /* referer log */
1.57 frystyk 104: HTLog * ref;
1.60 frystyk 105: char * rejectfile; /* unchecked links */
1.58 frystyk 106: HTLog * reject;
1.60 frystyk 107: char * notfoundfile; /* links that returned 404 */
1.58 frystyk 108: HTLog * notfound;
1.60 frystyk 109: char * connegfile; /* links that were conneg'ed */
1.58 frystyk 110: HTLog * conneg;
1.60 frystyk 111: char * noalttagfile; /* images without alt tags*/
112: HTLog * noalttag;
113:
114: char * hitfile; /* links sorted after hit counts */
115: char * mtfile; /* media types encountered */
116: char * charsetfile; /* charsets encountered */
117:
118: char * outputfile;
1.1 frystyk 119: FILE * output;
1.59 frystyk 120:
1.1 frystyk 121: MRFlags flags;
1.55 frystyk 122:
1.59 frystyk 123: long get_bytes; /* Total number of bytes processed using GET*/
124: long get_docs; /* Total number of documents using GET */
125:
126: long head_bytes; /* bytes processed bytes processed using HEAD */
127: long head_docs; /* Total number of documents using HEAD*/
128:
129: long other_docs;
130:
1.56 frystyk 131: ms_t time; /* Time of run */
1.58 frystyk 132:
133: #ifdef HT_POSIX_REGEX
134: regex_t * include;
135: regex_t * exclude;
136: regex_t * check;
137: #endif
138:
1.1 frystyk 139: } Robot;
1.34 eric 140:
141: typedef struct _Finger {
142: Robot * robot;
143: HTRequest * request;
144: HTParentAnchor * dest;
145: } Finger;
146:
/*
**  Load state recorded in a HyperDoc object
*/
typedef enum _LoadState {
    L_INVALID   = -2,
    L_LOADING   = -1,
    L_SUCCESS   = 0,
    L_ERROR     = 1
} LoadState;
153:
154: /*
155: ** The HyperDoc object is bound to the anchor and contains information about
156: ** where we are in the search for recursive searches
157: */
158: typedef struct _HyperDoc {
159: HTParentAnchor * anchor;
160: LoadState state;
161: int depth;
1.55 frystyk 162: int hits;
1.1 frystyk 163: } HyperDoc;
164:
165: /*
166: ** This is the HText object that is created every time we start parsing a
167: ** HTML object
168: */
1.4 frystyk 169: struct _HText {
1.1 frystyk 170: HTRequest * request;
1.4 frystyk 171: };
1.1 frystyk 172:
1.58 frystyk 173: /*
174: ** A structure for calculating metadata distributions
175: */
176: typedef struct _MetaDist {
177: HTAtom * name;
178: int hits;
179: } MetaDist;
180:
181: /*
182: ** Some sorting algorithms
183: */
184: PRIVATE HTComparer HitSort, FormatSort;
185:
1.1 frystyk 186: PUBLIC HText * HTMainText = NULL;
187: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
188: PUBLIC HTStyleSheet * styleSheet = NULL;
189:
190: /* ------------------------------------------------------------------------- */
191:
1.13 eric 192: /* Standard (non-error) Output
193: ** ---------------------------
194: */
195: PUBLIC int OutputData(const char * fmt, ...)
196: {
197: int ret;
198: va_list pArgs;
199: va_start(pArgs, fmt);
200: ret = vfprintf(stdout, fmt, pArgs);
201: va_end(pArgs);
202: return ret;
203: }
204:
205: /* ------------------------------------------------------------------------- */
206:
1.2 frystyk 207: /* Create a "HyperDoc" object
208: ** --------------------------
209: ** A HyperDoc object contains information about whether we have already
210: ** started checking the anchor and the depth in our search
211: */
212: PRIVATE HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
213: {
214: HyperDoc * hd;
1.14 frystyk 215: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
216: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 217: hd->state = L_INVALID;
218: hd->depth = depth;
1.55 frystyk 219: hd->hits = 1;
1.2 frystyk 220:
221: /* Bind the HyperDoc object together with the Anchor Object */
222: hd->anchor = anchor;
223: HTAnchor_setDocument(anchor, (void *) hd);
224:
225: /* Add this HyperDoc object to our list */
226: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
227: HTList_addObject(mr->hyperdoc, (void *) hd);
228: return hd;
229: }
230:
231: /* Delete a "HyperDoc" object
232: ** --------------------------
233: */
234: PRIVATE BOOL HyperDoc_delete (HyperDoc * hd)
235: {
236: if (hd) {
1.11 frystyk 237: HT_FREE (hd);
1.2 frystyk 238: return YES;
239: }
240: return NO;
241: }
242:
1.55 frystyk 243: /*
244: ** Sort the anchor array and log reference count
245: */
246: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
247: {
248: if (mr && array) {
249: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
250: if (log) {
251: void ** data = NULL;
252: HTParentAnchor * anchor = NULL;
253: HTArray_sort(array, HitSort);
254: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
255: while (anchor) {
256: char * str = NULL;
257: char * uri = HTAnchor_address((HTAnchor *) anchor);
258: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
259: if (uri && hd) {
260: if ((str = (char *) HT_MALLOC(strlen(uri) + 50)) == NULL)
261: HT_OUTOFMEM("calculate_hits");
1.58 frystyk 262: sprintf(str, "%8d %s", hd->hits, uri);
1.55 frystyk 263: HTLog_addLine(log, str);
264: HT_FREE(str);
265: }
266: HT_FREE(uri);
267: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
268: }
269: }
270: HTLog_close(log);
271: return YES;
272: }
273: return NO;
274: }
275:
276: PRIVATE int HitSort (const void * a, const void * b)
277: {
278: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
279: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
280: if (aa && bb) return (bb->hits - aa->hits);
281: return bb - aa;
282: }
283:
1.58 frystyk 284: /*
285: ** Calculate distributions for media types. The same mechanism
286: ** can be used for other characteristics with relatively
287: ** few outcomes.
288: */
289: PRIVATE HTList * mediatype_distribution (HTArray * array)
290: {
291: if (array) {
292: HTList * mt = HTList_new();
293: MetaDist * pres = NULL;
294: void ** data = NULL;
295: HTParentAnchor * anchor = NULL;
296: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
297: while (anchor) {
298: HTFormat format = HTAnchor_format(anchor);
299: if (format && format != WWW_UNKNOWN) {
300: HTList * cur = mt;
301:
302: /* If found then increase counter */
303: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
304: if (pres->name == format) {
305: pres->hits++;
306: break;
307: }
308: }
309:
310: /* If not found then add new format to list */
311: if (!pres) {
312: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
313: HT_OUTOFMEM("mediatype_distribution");
314: pres->name = format;
315: pres->hits = 1;
316: HTList_addObject(mt, pres);
317: HTList_insertionSort(mt, FormatSort);
318: }
319: }
320:
321: /* Find next anchor in array */
322: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
323: }
324: return mt;
325: }
326: return NULL;
327: }
328:
1.60 frystyk 329: /*
330: ** Calculate distributions for charsets. The same mechanism
331: ** can be used for other characteristics with relatively
332: ** few outcomes.
333: */
334: PRIVATE HTList * charset_distribution (HTArray * array)
335: {
336: if (array) {
337: HTList * cs = HTList_new();
338: MetaDist * pres = NULL;
339: void ** data = NULL;
340: HTParentAnchor * anchor = NULL;
341: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
342: while (anchor) {
343: HTCharset charset = HTAnchor_charset(anchor);
344: if (charset) {
345: HTList * cur = cs;
346:
347: /* If found then increase counter */
348: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
349: if (pres->name == charset) {
350: pres->hits++;
351: break;
352: }
353: }
354:
355: /* If not found then add new format to list */
356: if (!pres) {
357: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
358: HT_OUTOFMEM("charset_distribution");
359: pres->name = charset;
360: pres->hits = 1;
361: HTList_addObject(cs, pres);
362: HTList_insertionSort(cs, FormatSort);
363: }
364: }
365:
366: /* Find next anchor in array */
367: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
368: }
369: return cs;
370: }
371: return NULL;
372: }
373:
1.58 frystyk 374: PRIVATE int FormatSort (const void * a, const void * b)
375: {
376: MetaDist * aa = (MetaDist *) a;
377: MetaDist * bb = (MetaDist *) b;
378: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
379: }
380:
381: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
382: {
383: if (logfile && distribution) {
384: HTLog * log = HTLog_open(logfile, YES, YES);
385: if (log) {
386: HTList * cur = distribution;
387: MetaDist * pres;
388: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
389: if (pres->name) {
1.60 frystyk 390: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 391: }
392: }
393: HTLog_close(log);
394: }
395: }
396: return NO;
397: }
398:
399: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
400: {
401: if (distribution) {
402: HTList * cur = distribution;
403: MetaDist * pres;
404: while ((pres = (MetaDist *) HTList_nextObject(cur)))
405: HT_FREE(pres);
406: HTList_delete(distribution);
407: return YES;
408: }
409: return NO;
410: }
411:
412:
1.55 frystyk 413: /* Statistics
414: ** ----------
415: ** Calculates a bunch of statistics for the anchors traversed
416: */
417: PRIVATE BOOL calculate_statistics (Robot * mr)
418: {
1.59 frystyk 419: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 420: if (!mr) return NO;
421:
422: /* Calculate efficiency */
1.59 frystyk 423: if (mr->time > 0) {
1.56 frystyk 424: ms_t t = HTGetTimeInMillis() - mr->time;
425: if (t > 0) {
1.60 frystyk 426: double loadfactor = (mr->get_bytes / (t * 0.001));
427: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 428: double secs = t / 1000.0;
1.55 frystyk 429: char bytes[50];
1.62 ! frystyk 430: if (SHOW_REAL_QUIET(mr))
! 431: HTTrace("Accessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
! 432: total_docs, secs, reqprsec);
1.59 frystyk 433:
434: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 ! frystyk 435: if (SHOW_REAL_QUIET(mr))
! 436: HTTrace("Did a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
! 437: mr->get_docs, bytes, loadfactor);
1.59 frystyk 438:
439: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 ! frystyk 440: if (SHOW_REAL_QUIET(mr))
! 441: HTTrace("Did a HEAD on %ld document(s) with a total of %s bytes\n",
! 442: mr->head_docs, bytes);
1.55 frystyk 443: }
444: }
445:
446: /* Create an array of existing anchors */
1.59 frystyk 447: if (total_docs > 1) {
448: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 449: if (array) {
450:
451: /* Sort after hit counts */
452: if (mr->hitfile) calculate_hits(mr, array);
453:
1.58 frystyk 454: /* Find mediatype distribution */
455: if (mr->mtfile) {
456: HTList * mtdist = mediatype_distribution(array);
457: if (mtdist) {
458: log_meta_distribution(mr->mtfile, mtdist);
459: delete_meta_distribution(mtdist);
460: }
461: }
1.55 frystyk 462:
1.60 frystyk 463: /* Find charset distribution */
464: if (mr->charsetfile) {
465: HTList * charsetdist = charset_distribution(array);
466: if (charsetdist) {
467: log_meta_distribution(mr->charsetfile, charsetdist);
468: delete_meta_distribution(charsetdist);
469: }
470: }
471:
1.55 frystyk 472: /* Add as may other stats here as you like */
1.60 frystyk 473: /* ... */
1.58 frystyk 474:
475: /* Delete the array */
1.55 frystyk 476: HTArray_delete(array);
477: }
478: }
479: return YES;
480: }
481:
1.1 frystyk 482: /* Create a Command Line Object
483: ** ----------------------------
484: */
485: PRIVATE Robot * Robot_new (void)
486: {
487: Robot * me;
1.41 frystyk 488: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 489: HT_OUTOFMEM("Robot_new");
1.2 frystyk 490: me->hyperdoc = HTList_new();
1.4 frystyk 491: me->htext = HTList_new();
1.40 frystyk 492: me->timer = DEFAULT_TIMEOUT;
1.25 frystyk 493: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 494: me->output = OUTPUT;
1.35 eric 495: me->cnt = 0;
1.34 eric 496: me->fingers = HTList_new();
1.1 frystyk 497: return me;
498: }
499:
500: /* Delete a Command Line Object
501: ** ----------------------------
502: */
1.62 ! frystyk 503: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 504: {
1.62 ! frystyk 505: if (mr) {
! 506: HTList_delete(mr->fingers);
1.55 frystyk 507:
508: /* Calculate statistics */
1.62 ! frystyk 509: calculate_statistics(mr);
1.55 frystyk 510:
1.62 ! frystyk 511: if (mr->hyperdoc) {
! 512: HTList * cur = mr->hyperdoc;
1.2 frystyk 513: HyperDoc * pres;
514: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
515: HyperDoc_delete(pres);
1.62 ! frystyk 516: HTList_delete(mr->hyperdoc);
1.2 frystyk 517: }
1.62 ! frystyk 518: if (mr->htext) {
! 519: HTList * cur = mr->htext;
1.4 frystyk 520: HText * pres;
521: while ((pres = (HText *) HTList_nextObject(cur)))
522: HText_free(pres);
1.62 ! frystyk 523: HTList_delete(mr->htext);
1.4 frystyk 524: }
1.62 ! frystyk 525:
! 526: /* Close all the log files */
! 527: if (mr->log) {
! 528: if (SHOW_REAL_QUIET(mr))
! 529: HTTrace("Logged %5d entries in general log file `%s\'\n",
! 530: HTLog_accessCount(mr->log), mr->logfile);
! 531: HTLog_close(mr->log);
! 532: }
! 533: if (mr->ref) {
! 534: if (SHOW_REAL_QUIET(mr))
! 535: HTTrace("Logged %5d entries in referer log file `%s\'\n",
! 536: HTLog_accessCount(mr->ref), mr->reffile);
! 537: HTLog_close(mr->ref);
! 538: }
! 539: if (mr->reject) {
! 540: if (SHOW_REAL_QUIET(mr))
! 541: HTTrace("Logged %5d entries in rejected log file `%s\'\n",
! 542: HTLog_accessCount(mr->reject), mr->rejectfile);
! 543: HTLog_close(mr->reject);
! 544: }
! 545: if (mr->notfound) {
! 546: if (SHOW_REAL_QUIET(mr))
! 547: HTTrace("Logged %5d entries in not found log file `%s\'\n",
! 548: HTLog_accessCount(mr->notfound), mr->notfoundfile);
! 549: HTLog_close(mr->notfound);
! 550: }
! 551: if (mr->conneg) {
! 552: if (SHOW_REAL_QUIET(mr))
! 553: HTTrace("Logged %5d entries in content negotiation log file `%s\'\n",
! 554: HTLog_accessCount(mr->conneg), mr->connegfile);
! 555: HTLog_close(mr->conneg);
! 556: }
! 557: if (mr->noalttag) {
! 558: if (SHOW_REAL_QUIET(mr))
! 559: HTTrace("Logged %5d entries in missing alt tag log file `%s\'\n",
! 560: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
! 561: HTLog_close(mr->noalttag);
! 562: }
! 563:
! 564: if (mr->output && mr->output != STDOUT) fclose(mr->output);
! 565:
! 566: if (mr->flags & MR_TIME) {
1.12 frystyk 567: time_t local = time(NULL);
1.62 ! frystyk 568: if (SHOW_REAL_QUIET(mr))
! 569: HTTrace("Robot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 570: }
1.55 frystyk 571:
1.58 frystyk 572: #ifdef HT_POSIX_REGEX
1.62 ! frystyk 573: if (mr->include) {
! 574: regfree(mr->include);
! 575: HT_FREE(mr->include);
! 576: }
! 577: if (mr->exclude) {
! 578: regfree(mr->exclude);
! 579: HT_FREE(mr->exclude);
! 580: }
! 581: if (mr->check) {
! 582: regfree(mr->check);
! 583: HT_FREE(mr->check);
1.58 frystyk 584: }
585: #endif
586:
1.62 ! frystyk 587: HT_FREE(mr->cwd);
! 588: HT_FREE(mr->prefix);
! 589: HT_FREE(mr->img_prefix);
! 590: HT_FREE(mr);
1.1 frystyk 591: return YES;
592: }
593: return NO;
594: }
595:
1.2 frystyk 596: /*
1.34 eric 597: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 598: */
1.34 eric 599: PRIVATE Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 600: {
1.34 eric 601: Finger * me;
602: HTRequest * request = HTRequest_new();
603: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
604: HT_OUTOFMEM("Finger_new");
605: me->robot = robot;
606: me->request = request;
607: me->dest = dest;
608: HTList_addObject(robot->fingers, (void *)me);
609:
1.48 frystyk 610: /* Set the context for this request */
1.34 eric 611: HTRequest_setContext (request, me);
1.48 frystyk 612:
613: /* Check the various flags to customize the request */
614: if (robot->flags & MR_PREEMPTIVE)
615: HTRequest_setPreemptive(request, YES);
616: if (robot->flags & MR_VALIDATE)
617: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
618: if (robot->flags & MR_END_VALIDATE)
619: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
620:
621: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 622: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 623:
624: /* Set the method for this request */
1.34 eric 625: HTRequest_setMethod(request, method);
626: robot->cnt++;
627: return me;
1.2 frystyk 628: }
629:
1.34 eric 630: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 631: {
1.34 eric 632: HTList_removeObject(me->robot->fingers, (void *)me);
633: me->robot->cnt--;
1.37 frystyk 634:
635: /*
636: ** If we are down at one request then flush the output buffer
637: */
638: if (me->request) {
639: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 640: HTRequest_delete(me->request);
1.37 frystyk 641: }
642:
643: /*
644: ** Delete the request and free myself
645: */
1.34 eric 646: HT_FREE(me);
647: return YES;
1.2 frystyk 648: }
649:
650: /*
651: ** Cleanup and make sure we close all connections including the persistent
652: ** ones
653: */
1.1 frystyk 654: PRIVATE void Cleanup (Robot * me, int status)
655: {
656: Robot_delete(me);
1.29 eric 657: HTProfile_delete();
1.50 frystyk 658: #ifdef HT_MEMLOG
1.39 eric 659: HTMemLog_close();
1.47 frystyk 660: #endif
661:
1.1 frystyk 662: #ifdef VMS
663: exit(status ? status : 1);
664: #else
665: exit(status ? status : 0);
666: #endif
667: }
668:
669: #ifdef CATCH_SIG
670: #include <signal.h>
671: /* SetSignal
672: ** This function sets up signal handlers. This might not be necessary to
673: ** call if the application has its own handlers (lossage on SVR4)
674: */
675: PRIVATE void SetSignal (void)
676: {
677: /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
678: ** when attemting to connect to a remote host where you normally should
679: ** get `connection refused' back
680: */
681: if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
1.13 eric 682: if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
1.1 frystyk 683: } else {
1.13 eric 684: if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
1.1 frystyk 685: }
1.47 frystyk 686:
1.50 frystyk 687: #ifdef HT_MEMLOG
1.44 eric 688: HTMemLog_flush();
1.47 frystyk 689: #endif
690:
1.1 frystyk 691: }
692: #endif /* CATCH_SIG */
693:
1.58 frystyk 694: #ifdef HT_POSIX_REGEX
695: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
696: {
697: size_t length = regerror (errcode, compiled, NULL, 0);
698: char * str = NULL;
699: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
700: HT_OUTOFMEM("get_regerror");
701: (void) regerror (errcode, compiled, str, length);
702: return str;
703: }
704:
1.60 frystyk 705: PRIVATE regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 706: {
707: regex_t * regex = NULL;
708: if (regex_str && *regex_str) {
709: int status;
710: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
711: HT_OUTOFMEM("get_regtype");
1.60 frystyk 712: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 713: char * err_msg = get_regerror(status, regex);
1.62 ! frystyk 714: if (SHOW_REAL_QUIET(mr))
! 715: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 716: HT_FREE(err_msg);
717: Cleanup(mr, -1);
718: }
719: }
720: return regex;
721: }
722: #endif
723:
1.1 frystyk 724: PRIVATE void VersionInfo (void)
725: {
1.62 ! frystyk 726: OutputData("W3C Sample Software\n\n");
! 727: OutputData("\tW3C Mini Robot (%s) version %s\n", APP_NAME, APP_VERSION);
! 728: OutputData("\tW3C Sample Library (libwww) version %s\n\n", HTLib_version());
! 729: OutputData("For command line options, see\n\t%s\n\n", COMMAND_LINE);
1.13 eric 730: OutputData("Please send feedback to <libwww@w3.org>\n");
1.1 frystyk 731: }
732:
733: /* terminate_handler
734: ** -----------------
1.2 frystyk 735: ** This function is registered to handle the result of the request.
736: ** If no more requests are pending then terminate program
1.1 frystyk 737: */
1.32 frystyk 738: PRIVATE int terminate_handler (HTRequest * request, HTResponse * response,
739: void * param, int status)
1.1 frystyk 740: {
1.34 eric 741: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 742: Robot * mr = finger->robot;
1.62 ! frystyk 743: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 744:
1.58 frystyk 745: /* Check if negotiated resource and whether we should log that*/
746: if (mr->conneg) {
747: HTAssocList * cur = HTResponse_variant(response);
748: if (cur) {
749: BOOL first = YES;
750: HTChunk * buffer = HTChunk_new(128);
751: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
752: HTAssoc * pres;
1.60 frystyk 753: HTChunk_puts(buffer, uri);
1.58 frystyk 754: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
755: char * value = HTAssoc_value(pres);
756: if (first) {
1.60 frystyk 757: HTChunk_puts(buffer, "\t(");
1.58 frystyk 758: first = NO;
759: } else
760: HTChunk_puts(buffer, ", ");
761:
762: /* Output the name */
763: HTChunk_puts(buffer, HTAssoc_name(pres));
764:
765: /* Only output the value if not empty string */
1.60 frystyk 766: if (value && *value) {
1.58 frystyk 767: HTChunk_puts(buffer, "=");
768: HTChunk_puts(buffer, value);
769: }
770: }
1.60 frystyk 771: if (!first) HTChunk_puts(buffer, ")");
772: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 773: HTChunk_delete(buffer);
774: HT_FREE(uri);
775: }
776: }
777:
1.55 frystyk 778: /* Count the amount of body data that we have read */
1.59 frystyk 779: if (HTRequest_method(request) == METHOD_GET) {
780: int length = HTAnchor_length(HTRequest_anchor(request));
781: if (length > 0) mr->get_bytes += length;
782: mr->get_docs++;
783: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 784: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 785: if (length > 0) mr->head_bytes += length;
786: mr->head_docs++;
787: } else {
788: mr->other_docs++;
1.55 frystyk 789: }
790:
1.58 frystyk 791: /* Cleanup the anchor so that we don't drown in metainformation */
792: if (!(mr->flags & MR_KEEP_META))
793: HTAnchor_clearHeader(HTRequest_anchor(request));
794:
1.55 frystyk 795: /* Delete this thread */
1.34 eric 796: Finger_delete(finger);
1.55 frystyk 797:
798: /* Should we stop? */
1.46 eric 799: if (mr->cnt <= 0) {
1.62 ! frystyk 800: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 801: Cleanup(mr, 0); /* No way back from here */
1.30 frystyk 802: }
1.62 ! frystyk 803: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.1 frystyk 804: return HT_OK;
805: }
806:
807: /* ------------------------------------------------------------------------- */
808: /* HTEXT INTERFACE */
809: /* ------------------------------------------------------------------------- */
810:
811: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
812: HTStream * stream)
813: {
814: HText * me;
1.34 eric 815: Finger * finger = (Finger *) HTRequest_context(request);
816: Robot * mr = finger->robot;
1.14 frystyk 817: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
818: HT_OUTOFMEM("HText_new2");
1.4 frystyk 819:
820: /* Bind the HText object together with the Request Object */
1.1 frystyk 821: me->request = request;
1.4 frystyk 822:
823: /* Add this HyperDoc object to our list */
824: if (!mr->htext) mr->htext = HTList_new();
825: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 826: return me;
827: }
828:
1.4 frystyk 829: PUBLIC void HText_free (HText * me) {
1.11 frystyk 830: if (me) HT_FREE (me);
1.4 frystyk 831: }
832:
1.1 frystyk 833: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
834: {
835: if (text && anchor) {
1.34 eric 836: Finger * finger = (Finger *) HTRequest_context(text->request);
837: Robot * mr = finger->robot;
1.1 frystyk 838: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
839: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 840: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 841: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 842: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.58 frystyk 843: BOOL match = YES;
844: BOOL check = NO;
1.1 frystyk 845:
1.55 frystyk 846: if (!uri) return;
1.62 ! frystyk 847: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - ", uri ? uri : "NULL\n");
1.55 frystyk 848:
849: if (hd) {
1.62 ! frystyk 850: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.55 frystyk 851: hd->hits++;
1.58 frystyk 852: HT_FREE(uri);
853: return;
854: }
855:
856: /* Check for prefix match */
857: if (mr->prefix) match = HTStrMatch(mr->prefix, uri) ? YES : NO;
858:
859: #ifdef HT_POSIX_REGEX
860: /* Check for any regular expression */
861: if (match && mr->include) {
862: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
863: }
864: if (match && mr->exclude) {
865: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
866: }
867: if (match && mr->check) {
868: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
869: }
870: #endif
871:
872: /* Test whether we already have a hyperdoc for this document */
873: if (mr->flags & MR_LINK && match && dest_parent) {
1.60 frystyk 874: HTParentAnchor * last_anchor = HTRequest_parent(text->request);
875: HyperDoc * last_doc = HTAnchor_document(last_anchor);
876: int depth = last_doc ? last_doc->depth+1 : 0;
1.34 eric 877: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
878: HTRequest * newreq = newfinger->request;
1.2 frystyk 879: HyperDoc_new(mr, dest_parent, depth);
1.60 frystyk 880: HTRequest_setParent(newreq, referer);
1.58 frystyk 881: if (check || depth >= mr->depth) {
1.62 ! frystyk 882: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1.7 frystyk 883: HTRequest_setMethod(newreq, METHOD_HEAD);
1.30 frystyk 884: HTRequest_setOutputFormat(newreq, WWW_DEBUG);
1.7 frystyk 885: } else {
1.62 ! frystyk 886: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1.2 frystyk 887: }
888: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.62 ! frystyk 889: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1.34 eric 890: Finger_delete(newfinger);
1.2 frystyk 891: }
1.7 frystyk 892: } else {
1.62 ! frystyk 893: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 894: if (mr->reject) {
895: if (referer) {
896: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
897: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
898: HT_FREE(ref_addr);
899: }
900: }
1.2 frystyk 901: }
1.11 frystyk 902: HT_FREE(uri);
1.2 frystyk 903: }
904: }
905:
906: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 907: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 908: {
909: if (text && anchor) {
1.34 eric 910: Finger * finger = (Finger *) HTRequest_context(text->request);
911: Robot * mr = finger->robot;
1.59 frystyk 912: if (mr->flags & MR_IMG) {
1.60 frystyk 913: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
914: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
915: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
916: HyperDoc * hd = HTAnchor_document(dest_parent);
917: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 918: BOOL match = YES;
919:
920: if (hd) {
1.62 ! frystyk 921: if (SHOW_QUIET(mr)) HTTrace("Already checked\n");
1.59 frystyk 922: hd->hits++;
1.11 frystyk 923: HT_FREE(uri);
1.59 frystyk 924: return;
1.2 frystyk 925: }
1.59 frystyk 926:
927: /* Check for prefix match */
928: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
929:
930: /* Test whether we already have a hyperdoc for this document */
931: if (match && dest) {
1.60 frystyk 932: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 933: mr->flags & MR_SAVE ?
934: METHOD_GET : METHOD_HEAD);
935: HTRequest * newreq = newfinger->request;
1.60 frystyk 936: HyperDoc_new(mr, dest_parent, 1);
937: HTRequest_setParent(newreq, referer);
938:
939: /* Check whether we should report missing ALT tags */
940: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
941: if (referer) {
942: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
943: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
944: HT_FREE(ref_addr);
945: }
946: }
947:
1.62 ! frystyk 948: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 949: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 ! frystyk 950: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 951: Finger_delete(newfinger);
952: }
953: } else {
1.62 ! frystyk 954: if (SHOW_QUIET(mr)) HTTrace("does not fulfill constraints\n");
1.60 frystyk 955: if (mr->reject) {
956: if (referer) {
957: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
958: if (ref_addr) HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
959: HT_FREE(ref_addr);
960: }
961: }
1.1 frystyk 962: }
1.59 frystyk 963: HT_FREE(uri);
1.1 frystyk 964: }
965: }
966: }
967:
968: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 969: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 970: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
971: PUBLIC void HText_endAppend (HText * text) {}
972: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
973: PUBLIC void HText_beginAppend (HText * text) {}
974: PUBLIC void HText_appendParagraph (HText * text) {}
975:
1.48 frystyk 976: PRIVATE int RobotTrace (const char * fmt, va_list pArgs)
977: {
978: return (vfprintf(stderr, fmt, pArgs));
979: }
980:
1.1 frystyk 981: /* ------------------------------------------------------------------------- */
982: /* MAIN PROGRAM */
983: /* ------------------------------------------------------------------------- */
984:
985: int main (int argc, char ** argv)
986: {
1.48 frystyk 987: int status = 0;
1.1 frystyk 988: int arg;
1.48 frystyk 989: BOOL cache = NO; /* Use persistent cache */
990: BOOL flush = NO; /* flush the persistent cache */
1.54 frystyk 991: char * cache_root = NULL;
1.1 frystyk 992: HTChunk * keywords = NULL; /* From command line */
993: int keycnt = 0;
1.12 frystyk 994: Robot * mr = NULL;
1.43 frystyk 995: Finger * finger = NULL;
996: HTParentAnchor * startAnchor = NULL;
1.1 frystyk 997:
998: /* Starts Mac GUSI socket library */
999: #ifdef GUSI
1000: GUSISetup(GUSIwithSIOUXSockets);
1001: GUSISetup(GUSIwithInternetSockets);
1002: #endif
1003:
1004: #ifdef __MWERKS__ /* STR */
1005: InitGraf((Ptr) &qd.thePort);
1006: InitFonts();
1007: InitWindows();
1008: InitMenus(); TEInit();
1009: InitDialogs(nil);
1010: InitCursor();
1011: SIOUXSettings.asktosaveonclose = false;
1012: argc=ccommand(&argv);
1.50 frystyk 1013: #endif /* __MWERKS__ */
1.1 frystyk 1014:
1.50 frystyk 1015: #ifdef HT_MEMLOG
1.51 frystyk 1016: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
1.47 frystyk 1017: #endif
1.46 eric 1018:
1.27 frystyk 1019: /* Initiate W3C Reference Library with a robot profile */
1020: HTProfile_newRobot(APP_NAME, APP_VERSION);
1.48 frystyk 1021: HTTrace_setCallback(RobotTrace);
1.27 frystyk 1022:
1023: /* Add the default HTML parser to the set of converters */
1024: {
1025: HTList * converters = HTFormat_conversion();
1026: HTMLInit(converters);
1027: }
1.1 frystyk 1028:
1.12 frystyk 1029: /* Build a new robot object */
1030: mr = Robot_new();
1031:
1.1 frystyk 1032: /* Scan command Line for parameters */
1033: for (arg=1; arg<argc; arg++) {
1034: if (*argv[arg] == '-') {
1035:
1036: /* non-interactive */
1.17 frystyk 1037: if (!strcmp(argv[arg], "-n")) {
1.1 frystyk 1038: HTAlert_setInteractive(NO);
1039:
1.62 ! frystyk 1040: /* help */
! 1041: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
! 1042: VersionInfo();
! 1043: Cleanup(mr, 0);
! 1044:
1.55 frystyk 1045: /* log file */
1.1 frystyk 1046: } else if (!strcmp(argv[arg], "-l")) {
1047: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1048: argv[++arg] : DEFAULT_LOG_FILE;
1049:
1.55 frystyk 1050: /* hit file */
1051: } else if (!strcmp(argv[arg], "-hit")) {
1052: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1053: argv[++arg] : DEFAULT_HIT_FILE;
1054:
1.57 frystyk 1055: /* referer file */
1.58 frystyk 1056: } else if (!strncmp(argv[arg], "-ref", 4)) {
1.57 frystyk 1057: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
1058: argv[++arg] : DEFAULT_REFERER_FILE;
1059:
1.58 frystyk 1060: /* Not found error log file */
1061: } else if (!strncmp(argv[arg], "-404", 4)) {
1062: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1063: argv[++arg] : DEFAULT_NOTFOUND_FILE;
1064:
1065: /* reject log file */
1066: } else if (!strncmp(argv[arg], "-rej", 4)) {
1067: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1068: argv[++arg] : DEFAULT_REJECT_FILE;
1069:
1070: /* negoatiated resource log file */
1071: } else if (!strncmp(argv[arg], "-neg", 4)) {
1072: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1073: argv[++arg] : DEFAULT_CONNEG_FILE;
1074:
1075: /* mediatype distribution log file */
1076: } else if (!strncmp(argv[arg], "-for", 4)) {
1077: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1078: argv[++arg] : DEFAULT_FORMAT_FILE;
1079: mr->flags |= MR_KEEP_META;
1080:
1.60 frystyk 1081: /* charset distribution log file */
1082: } else if (!strncmp(argv[arg], "-char", 5)) {
1083: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1084: argv[++arg] : DEFAULT_CHARSET_FILE;
1085: mr->flags |= MR_KEEP_META;
1086:
1087: /* no alt tags log file */
1088: } else if (!strncmp(argv[arg], "-alt", 4)) {
1089: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1090: argv[++arg] : DEFAULT_NOALTTAG_FILE;
1091:
1.55 frystyk 1092: /* rule file */
1.1 frystyk 1093: } else if (!strcmp(argv[arg], "-r")) {
1094: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
1095: argv[++arg] : DEFAULT_RULE_FILE;
1096:
1097: /* output filename */
1098: } else if (!strcmp(argv[arg], "-o")) {
1099: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
1100: argv[++arg] : DEFAULT_OUTPUT_FILE;
1101:
1.55 frystyk 1102: /* URI prefix */
1103: } else if (!strcmp(argv[arg], "-prefix")) {
1104: char * prefix = NULL;
1105: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1106: argv[++arg] : DEFAULT_PREFIX;
1.62 ! frystyk 1107: if (*prefix && *prefix != '*') {
1.55 frystyk 1108: StrAllocCopy(mr->prefix, prefix);
1109: StrAllocCat(mr->prefix, "*");
1110: }
1111:
1.1 frystyk 1112: /* timeout -- Change the default request timeout */
1113: } else if (!strcmp(argv[arg], "-timeout")) {
1114: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
1115: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
1.40 frystyk 1116: if (timeout > 0) mr->timer = timeout;
1.1 frystyk 1117:
1.54 frystyk 1118: /* Force no pipelined requests */
1119: } else if (!strcmp(argv[arg], "-nopipe")) {
1120: HTTP_setConnectionMode(HTTP_NO_PIPELINING);
1121:
1.48 frystyk 1122: /* Start the persistent cache */
1123: } else if (!strcmp(argv[arg], "-cache")) {
1124: cache = YES;
1125:
1.54 frystyk 1126: /* Determine the cache root */
1127: } else if (!strcmp(argv[arg], "-cacheroot")) {
1128: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
1129: argv[++arg] : NULL;
1.51 frystyk 1130:
1.52 frystyk 1131: /* Stream write flush delay in ms */
1132: } else if (!strcmp(argv[arg], "-delay")) {
1133: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
1134: atoi(argv[++arg]) : DEFAULT_DELAY;
1135: HTHost_setDefaultWriteDelay(delay);
1136:
1.48 frystyk 1137: /* Persistent cache flush */
1138: } else if (!strcmp(argv[arg], "-flush")) {
1139: flush = YES;
1140:
1141: /* Do a cache validation */
1142: } else if (!strcmp(argv[arg], "-validate")) {
1143: mr->flags |= MR_VALIDATE;
1144:
1145: /* Do an end-to-end cache-validation */
1146: } else if (!strcmp(argv[arg], "-endvalidate")) {
1147: mr->flags |= MR_END_VALIDATE;
1148:
1.7 frystyk 1149: /* preemptive or non-preemptive access */
1.1 frystyk 1150: } else if (!strcmp(argv[arg], "-single")) {
1.7 frystyk 1151: mr->flags |= MR_PREEMPTIVE;
1.2 frystyk 1152:
1153: /* test inlined images */
1154: } else if (!strcmp(argv[arg], "-img")) {
1155: mr->flags |= MR_IMG;
1.45 frystyk 1156:
1157: /* load inlined images */
1158: } else if (!strcmp(argv[arg], "-saveimg")) {
1159: mr->flags |= (MR_IMG | MR_SAVE);
1.59 frystyk 1160:
1161: /* URI prefix for inlined images */
1162: } else if (!strcmp(argv[arg], "-imgprefix")) {
1163: char * prefix = NULL;
1164: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
1165: argv[++arg] : DEFAULT_IMG_PREFIX;
1.62 ! frystyk 1166: if (*prefix && *prefix!='*') {
1.59 frystyk 1167: StrAllocCopy(mr->img_prefix, prefix);
1168: StrAllocCat(mr->img_prefix, "*");
1169: }
1.2 frystyk 1170:
1171: /* load anchors */
1.58 frystyk 1172: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
1.2 frystyk 1173: mr->flags |= MR_LINK;
1.7 frystyk 1174: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
1175: atoi(argv[++arg]) : DEFAULT_DEPTH;
1.2 frystyk 1176:
1.12 frystyk 1177: /* Output start and end time */
1178: } else if (!strcmp(argv[arg], "-ss")) {
1179: mr->flags |= MR_TIME;
1180:
1.1 frystyk 1181: /* print version and exit */
1182: } else if (!strcmp(argv[arg], "-version")) {
1183: VersionInfo();
1184: Cleanup(mr, 0);
1.46 eric 1185:
1186: /* run in quiet mode */
1187: } else if (!strcmp(argv[arg], "-q")) {
1188: mr->flags |= MR_QUIET;
1.1 frystyk 1189:
1.62 ! frystyk 1190: /* run in really quiet mode */
! 1191: } else if (!strcmp(argv[arg], "-Q")) {
! 1192: mr->flags |= MR_REAL_QUIET;
! 1193:
1.1 frystyk 1194: #ifdef WWWTRACE
1195: /* trace flags */
1196: } else if (!strncmp(argv[arg], "-v", 2)) {
1.24 frystyk 1197: HTSetTraceMessageMask(argv[arg]+2);
1.1 frystyk 1198: #endif
1199:
1.58 frystyk 1200: #ifdef HT_POSIX_REGEX
1201:
1202: /* If we can link against a POSIX regex library */
1203: } else if (!strncmp(argv[arg], "-inc", 4)) {
1204: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1205: mr->include = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1206: }
1207: } else if (!strncmp(argv[arg], "-exc", 4)) {
1208: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1209: mr->exclude = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1210: }
1211: } else if (!strncmp(argv[arg], "-check", 6)) {
1212: if (arg+1 < argc && *argv[arg+1] != '-') {
1.60 frystyk 1213: mr->check = get_regtype(mr, argv[++arg], W3C_REGEX_FLAGS);
1.58 frystyk 1214: }
1215: #endif
1216:
1.1 frystyk 1217: } else {
1.62 ! frystyk 1218: if (SHOW_REAL_QUIET(mr)) HTTrace("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 1219: }
1.17 frystyk 1220: } else { /* If no leading `-' then check for URL or keywords */
1.1 frystyk 1221: if (!keycnt) {
1222: char * ref = HTParse(argv[arg], mr->cwd, PARSE_ALL);
1.56 frystyk 1223: startAnchor = HTAnchor_parent(HTAnchor_findAddress(ref));
1.34 eric 1224: HyperDoc_new(mr, startAnchor, 0);
1.1 frystyk 1225: keycnt = 1;
1.11 frystyk 1226: HT_FREE(ref);
1.1 frystyk 1227: } else { /* Check for successive keyword arguments */
1228: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
1229: if (keycnt++ <= 1)
1.5 frystyk 1230: keywords = HTChunk_new(128);
1.1 frystyk 1231: else
1.5 frystyk 1232: HTChunk_putc(keywords, ' ');
1233: HTChunk_puts(keywords, HTStrip(escaped));
1.11 frystyk 1234: HT_FREE(escaped);
1.1 frystyk 1235: }
1236: }
1237: }
1238:
1239: #ifdef CATCH_SIG
1240: SetSignal();
1241: #endif
1242:
1243: if (!keycnt) {
1.62 ! frystyk 1244: if (SHOW_REAL_QUIET(mr)) HTTrace("Please specify URL to check.\n");
1.61 frystyk 1245: Cleanup(mr, -1);
1246: }
1247:
1248: if (mr->depth != DEFAULT_DEPTH &&
1249: (mr->prefix == NULL || *mr->prefix == '*')) {
1.62 ! frystyk 1250: if (SHOW_REAL_QUIET(mr))
1.61 frystyk 1251: HTTrace("A depth of more than 0 requires that you also specify a URI prefix.\n",
1252: mr->depth);
1.1 frystyk 1253: Cleanup(mr, -1);
1254: }
1255:
1.23 manoli 1256: /* Testing that HTTrace is working */
1.62 ! frystyk 1257: if (mr->flags & MR_TIME) {
! 1258: if (SHOW_REAL_QUIET(mr)) {
! 1259: time_t local = time(NULL);
! 1260: HTTrace("Welcome to the W3C mini Robot - started on %s\n",
! 1261: HTDateTimeStr(&local, YES));
! 1262: }
! 1263: }
1.23 manoli 1264:
1.1 frystyk 1265: /* Rule file specified? */
1266: if (mr->rules) {
1267: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
1.27 frystyk 1268: if (!HTLoadRules(rules))
1.62 ! frystyk 1269: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access rules\n");
1.11 frystyk 1270: HT_FREE(rules);
1.1 frystyk 1271: }
1272:
1273: /* Output file specified? */
1274: if (mr->outputfile) {
1275: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.62 ! frystyk 1276: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 1277: mr->output = OUTPUT;
1278: }
1279: }
1280:
1.48 frystyk 1281: /* Should we use persistent cache? */
1282: if (cache) {
1.54 frystyk 1283: HTCacheInit(cache_root, 20);
1.49 frystyk 1284: HTNet_addBefore(HTCacheFilter, "http://*", NULL, HT_FILTER_MIDDLE);
1285: HTNet_addAfter(HTCacheUpdateFilter, "http://*", NULL,
1286: HT_NOT_MODIFIED, HT_FILTER_MIDDLE);
1.48 frystyk 1287:
1288: /* Should we start by flushing? */
1289: if (flush) HTCache_flushAll();
1290: }
1291:
1.58 frystyk 1292: /* CLF Log file specified? */
1.55 frystyk 1293: if (mr->logfile) {
1294: mr->log = HTLog_open(mr->logfile, YES, YES);
1295: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
1.57 frystyk 1296: }
1297:
1.58 frystyk 1298: /* Referer Log file specified? */
1.57 frystyk 1299: if (mr->reffile) {
1300: mr->ref = HTLog_open(mr->reffile, YES, YES);
1301: if (mr->ref)
1302: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
1.55 frystyk 1303: }
1.1 frystyk 1304:
1.58 frystyk 1305: /* Not found error log specified? */
1306: if (mr->notfoundfile) {
1307: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
1308: if (mr->notfound)
1309: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
1310: }
1311:
1312: /* Negotiated resource log specified? */
1313: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
1.60 frystyk 1314:
1315: /* No alt tags log file specified? */
1316: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
1.58 frystyk 1317:
1318: /* Reject Log file specified? */
1319: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
1320:
1321: /* Register our own terminate filter */
1.32 frystyk 1322: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.40 frystyk 1323:
1324: /* Setting event timeout */
1325: HTHost_setEventTimeout(mr->timer);
1.55 frystyk 1326:
1.56 frystyk 1327: mr->time = HTGetTimeInMillis();
1.37 frystyk 1328:
1.34 eric 1329: /* Start the request */
1330: finger = Finger_new(mr, startAnchor, METHOD_GET);
1.43 frystyk 1331:
1332: /*
1333: ** Make sure that the first request is flushed immediately and not
1334: ** buffered in the output buffer
1335: */
1336: HTRequest_setFlush(finger->request, YES);
1337:
1338: /*
1.48 frystyk 1339: ** Check whether we should do some kind of cache validation on
1340: ** the load
1341: */
1342: if (mr->flags & MR_VALIDATE)
1343: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
1344: if (mr->flags & MR_END_VALIDATE)
1345: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
1346:
1347: /*
1.43 frystyk 1348: ** Now do the load
1349: */
1.34 eric 1350: if (mr->flags & MR_PREEMPTIVE)
1351: HTRequest_setPreemptive(finger->request, YES);
1.1 frystyk 1352:
1353: if (keywords) /* Search */
1.34 eric 1354: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1355: else
1.34 eric 1356: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
1.1 frystyk 1357:
1.5 frystyk 1358: if (keywords) HTChunk_delete(keywords);
1.1 frystyk 1359: if (status != YES) {
1.62 ! frystyk 1360: if (SHOW_REAL_QUIET(mr)) HTTrace("Can't access resource\n");
1.1 frystyk 1361: Cleanup(mr, -1);
1362: }
1363:
1364: /* Go into the event loop... */
1.34 eric 1365: HTEventList_loop(finger->request);
1.1 frystyk 1366:
1367: /* Only gets here if event loop fails */
1368: Cleanup(mr, 0);
1369: return 0;
1370: }
Webmaster