Annotation of libwww/Robot/src/RobotMain.c, revision 1.14
1.1 frystyk 1: /*
1.14 ! vbancrof 2: ** @(#) $Id: RobotMain.c,v 1.13 2005/01/23 18:29:25 vbancrof Exp $
1.1 frystyk 3: **
4: ** W3C Webbot can be found at "http://www.w3.org/Robot/"
5: **
6: ** Copyright 1995-1998 World Wide Web Consortium, (Massachusetts
7: ** Institute of Technology, Institut National de Recherche en
8: ** Informatique et en Automatique, Keio University). All Rights
9: ** Reserved. This program is distributed under the W3C's Software
10: ** Intellectual Property License. This program is distributed in the hope
11: ** that it will be useful, but WITHOUT ANY WARRANTY; without even the
12: ** implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13: ** PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
14: ** details.
15: **
16: ** Authors:
17: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
18: ** BR Bob Racko
19: ** JP John Punin
20: **
21: ** History:
22: ** Dec 04 95 First version
23: ** Oct 1998 Split into separate files
24: */
25:
26: #include "HTRobMan.h"
27: #include "RobotTxt.h"
28:
/*
**  Output gating helpers: each evaluates to non-zero when a robot object
**  is present and the corresponding "quiet" bit is NOT set in its flags.
*/
#define SHOW_QUIET(mr) \
    (((mr) != NULL) && (((mr)->flags & MR_QUIET) == 0))
#define SHOW_REAL_QUIET(mr) \
    (((mr) != NULL) && (((mr)->flags & MR_REAL_QUIET) == 0))
31:
32: /* ------------------------------------------------------------------------- */
33: /* MAIN PROGRAM */
34: /* ------------------------------------------------------------------------- */
35:
1.7 frystyk 36: PRIVATE int printer (const char * fmt, va_list pArgs)
37: {
38: return (vfprintf(stdout, fmt, pArgs));
39: }
40:
41: PRIVATE int tracer (const char * fmt, va_list pArgs)
42: {
43: return (vfprintf(stderr, fmt, pArgs));
44: }
45:
1.1 frystyk 46: int main (int argc, char ** argv)
47: {
48: int status = 0;
49: int arg;
50: BOOL cache = NO; /* Use persistent cache */
51: BOOL flush = NO; /* flush the persistent cache */
52: char * cache_root = NULL;
1.11 frystyk 53: int cache_size = DEFAULT_CACHE_SIZE;
1.1 frystyk 54: HTChunk * keywords = NULL; /* From command line */
55: int keycnt = 0;
56: Robot * mr = NULL;
57: Finger * finger = NULL;
58: HTParentAnchor * startAnchor = NULL;
59:
60: /* Starts Mac GUSI socket library */
61: #ifdef GUSI
62: GUSISetup(GUSIwithSIOUXSockets);
63: GUSISetup(GUSIwithInternetSockets);
64: #endif
65:
66: #ifdef __MWERKS__ /* STR */
67: InitGraf((Ptr) &qd.thePort);
68: InitFonts();
69: InitWindows();
70: InitMenus(); TEInit();
71: InitDialogs(nil);
72: InitCursor();
73: SIOUXSettings.asktosaveonclose = false;
74: argc=ccommand(&argv);
75: #endif /* __MWERKS__ */
76:
77: #ifdef HT_MEMLOG
78: HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
79: #endif
80:
81: /* Initiate W3C Reference Library with a robot profile */
82: HTProfile_newRobot(APP_NAME, APP_VERSION);
83:
1.7 frystyk 84: /* Need our own trace and print functions */
85: HTPrint_setCallback(printer);
86: HTTrace_setCallback(tracer);
87:
1.1 frystyk 88: /* Build a new robot object */
89: mr = Robot_new();
90:
91: /* Scan command Line for parameters */
92: for (arg=1; arg<argc; arg++) {
93: if (*argv[arg] == '-') {
94:
95: /* non-interactive */
96: if (!strcmp(argv[arg], "-n")) {
97: HTAlert_setInteractive(NO);
98:
99: /* help */
100: } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
101: VersionInfo();
102: Cleanup(mr, 0);
103:
104: /* clf log file */
105: } else if (!strcmp(argv[arg], "-l")) {
106: mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
107: argv[++arg] : DEFAULT_LOG_FILE;
108: mr->flags |= MR_LOGGING;
109:
110: /* referer log file */
111: } else if (!strncmp(argv[arg], "-ref", 4)) {
112: mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
113: argv[++arg] : DEFAULT_REFERER_FILE;
114: mr->flags |= MR_LOGGING;
115:
116: /* Not found error log file */
117: } else if (!strncmp(argv[arg], "-404", 4)) {
118: mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
119: argv[++arg] : DEFAULT_NOTFOUND_FILE;
120: mr->flags |= MR_LOGGING;
121:
122: /* reject log file */
123: } else if (!strncmp(argv[arg], "-rej", 4)) {
124: mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
125: argv[++arg] : DEFAULT_REJECT_FILE;
126: mr->flags |= MR_LOGGING;
127:
128: /* no alt tags log file */
129: } else if (!strncmp(argv[arg], "-alt", 4)) {
130: mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
131: argv[++arg] : DEFAULT_NOALTTAG_FILE;
132: mr->flags |= MR_LOGGING;
133:
134: /* negotiated resource log file */
135: } else if (!strncmp(argv[arg], "-neg", 4)) {
136: mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
137: argv[++arg] : DEFAULT_CONNEG_FILE;
138: mr->flags |= MR_LOGGING;
139:
140: /* hit file log */
141: } else if (!strcmp(argv[arg], "-hit")) {
142: mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
143: argv[++arg] : DEFAULT_HIT_FILE;
144: mr->flags |= MR_DISTRIBUTIONS;
145:
146: /* link relations file log */
147: } else if (!strcmp(argv[arg], "-rellog")) {
148: mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
149: argv[++arg] : DEFAULT_REL_FILE;
150: mr->flags |= MR_DISTRIBUTIONS;
151:
152: /* Specific link relation to look for (only used i also -rellog) */
153: } else if (!strcmp(argv[arg], "-relation")) {
154: mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
155: (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
156: mr->flags |= MR_DISTRIBUTIONS;
157:
158: /* last modified log file */
159: } else if (!strcmp(argv[arg], "-lm")) {
160: mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
161: argv[++arg] : DEFAULT_LM_FILE;
162: mr->flags |= MR_DISTRIBUTIONS;
163:
164: /* title log file */
165: } else if (!strcmp(argv[arg], "-title")) {
166: mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
167: argv[++arg] : DEFAULT_TITLE_FILE;
168: mr->flags |= MR_DISTRIBUTIONS;
169:
170: /* mediatype distribution log file */
171: } else if (!strncmp(argv[arg], "-for", 4)) {
172: mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
173: argv[++arg] : DEFAULT_FORMAT_FILE;
174: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
175:
176: /* charset distribution log file */
177: } else if (!strncmp(argv[arg], "-char", 5)) {
178: mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
179: argv[++arg] : DEFAULT_CHARSET_FILE;
180: mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);
181:
182:
183: /* rule file */
184: } else if (!strcmp(argv[arg], "-r")) {
185: mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
186: argv[++arg] : DEFAULT_RULE_FILE;
1.2 frystyk 187:
188: /* Don't follow HTML META tags with robot information */
189: } else if (!strcmp(argv[arg], "-nometatags")) {
190: mr->flags |= MR_NOMETATAGS;
1.1 frystyk 191:
192: /* output filename */
193: } else if (!strcmp(argv[arg], "-o")) {
194: mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
195: argv[++arg] : DEFAULT_OUTPUT_FILE;
196:
197: /* URI prefix */
198: } else if (!strcmp(argv[arg], "-prefix")) {
199: char * prefix = NULL;
200: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
201: argv[++arg] : DEFAULT_PREFIX;
202: if (*prefix && *prefix != '*') {
203: StrAllocCopy(mr->prefix, prefix);
204: StrAllocCat(mr->prefix, "*");
205: }
206:
207: /* timeout -- Change the default request timeout */
208: } else if (!strcmp(argv[arg], "-timeout")) {
209: int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
210: atoi(argv[++arg]) : DEFAULT_TIMEOUT;
211: if (timeout > 1) mr->timer = timeout*MILLIES;
212:
213: /* wait -- Change the default pwait time */ /* This is new */
214: } else if (!strcmp(argv[arg], "-wait")) {
215: int waits = (arg+1 < argc && *argv[arg+1] != '-') ?
216: atoi(argv[++arg]) : 0;
217: if (waits > 0) mr->waits = waits;
218:
219: /* Force no pipelined requests */
220: } else if (!strcmp(argv[arg], "-nopipe")) {
221: HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);
222:
1.11 frystyk 223: /* Stream write flush delay in ms */
224: } else if (!strcmp(argv[arg], "-delay")) {
225: int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
226: atoi(argv[++arg]) : DEFAULT_DELAY;
227: HTHost_setDefaultWriteDelay(delay);
228:
1.1 frystyk 229: /* Start the persistent cache */
230: } else if (!strcmp(argv[arg], "-cache")) {
231: cache = YES;
232:
233: /* Determine the cache root */
234: } else if (!strcmp(argv[arg], "-cacheroot")) {
235: cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
236: argv[++arg] : NULL;
237:
238: /* Persistent cache flush */
239: } else if (!strcmp(argv[arg], "-flush")) {
240: flush = YES;
241:
242: /* Do a cache validation */
243: } else if (!strcmp(argv[arg], "-validate")) {
244: mr->flags |= MR_VALIDATE;
245:
1.11 frystyk 246: } else if (!strcmp(argv[arg], "-cache_size")) {
247: cache_size = (arg+1 < argc && *argv[arg+1] != '-') ?
248: atoi(argv[++arg]) : DEFAULT_CACHE_SIZE;
249:
1.1 frystyk 250: /* Do an end-to-end cache-validation */
251: } else if (!strcmp(argv[arg], "-endvalidate")) {
252: mr->flags |= MR_END_VALIDATE;
253:
254: /* preemptive or non-preemptive access */
255: } else if (!strcmp(argv[arg], "-single")) {
256: mr->flags |= MR_PREEMPTIVE;
257:
258: /* test inlined images */
259: } else if (!strcmp(argv[arg], "-img")) {
260: mr->flags |= MR_IMG;
261:
262: /* load inlined images */
263: } else if (!strcmp(argv[arg], "-saveimg")) {
264: mr->flags |= (MR_IMG | MR_SAVE);
265:
266: /* URI prefix for inlined images */
267: } else if (!strcmp(argv[arg], "-imgprefix")) {
268: char * prefix = NULL;
269: prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
270: argv[++arg] : DEFAULT_IMG_PREFIX;
271: if (*prefix && *prefix!='*') {
272: StrAllocCopy(mr->img_prefix, prefix);
273: StrAllocCat(mr->img_prefix, "*");
274: }
275:
276: /* load anchors */
277: } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
278: mr->flags |= MR_LINK;
279: mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
280: atoi(argv[++arg]) : DEFAULT_DEPTH;
281:
1.9 frystyk 282: /* load fixed number of anchors */
283: } else if (!strcmp(argv[arg], "-ndoc")) {
284: mr->ndoc = (arg+1 < argc && *argv[arg+1] != '-') ?
285: atoi(argv[++arg]) : -1 ;
286:
1.1 frystyk 287: /* Output start and end time */
288: } else if (!strcmp(argv[arg], "-ss")) {
289: mr->flags |= MR_TIME;
290:
291: /* print version and exit */
292: } else if (!strcmp(argv[arg], "-version")) {
293: VersionInfo();
294: Cleanup(mr, 0);
1.3 frystyk 295:
296: /* run in BFS mode */
297: } else if (!strcmp(argv[arg], "-bfs")) {
298: mr->flags |= MR_BFS;
1.1 frystyk 299:
300: /* run in quiet mode */
301: } else if (!strcmp(argv[arg], "-q")) {
302: mr->flags |= MR_QUIET;
303:
304: /* run in really quiet mode */
305: } else if (!strcmp(argv[arg], "-Q")) {
306: mr->flags |= MR_REAL_QUIET;
307:
1.9 frystyk 308: /* run in redirection mode */
309: } else if (!strcmp(argv[arg], "-redir")) {
310: mr->flags |= MR_REDIR;
1.10 frystyk 311: mr->redir_code = (arg+1 < argc && *argv[arg+1] != '-') ?
1.9 frystyk 312: atoi(argv[++arg]) : 0;
313:
1.1 frystyk 314: #ifdef WWWTRACE
315: /* trace flags */
316: } else if (!strncmp(argv[arg], "-v", 2)) {
317: HTSetTraceMessageMask(argv[arg]+2);
318: #endif
319:
320: #ifdef HT_POSIX_REGEX
321:
322: /* If we can link against a POSIX regex library */
323: } else if (!strncmp(argv[arg], "-inc", 4)) {
324: if (arg+1 < argc && *argv[arg+1] != '-') {
325: mr->include = get_regtype(mr, argv[++arg], W3C_DEFAULT_REGEX_FLAGS);
326: }
327: } else if (!strncmp(argv[arg], "-exc", 4)) {
328: if (arg+1 < argc && *argv[arg+1] != '-') {
329: mr->exclude = get_regtype(mr, argv[++arg], W3C_DEFAULT_REGEX_FLAGS);
330: }
331: } else if (!strncmp(argv[arg], "-check", 6)) {
332: if (arg+1 < argc && *argv[arg+1] != '-') {
333: mr->check = get_regtype(mr, argv[++arg], W3C_DEFAULT_REGEX_FLAGS);
334: }
335: } else if (!strcmp(argv[arg], "-norobotstxt")) {
336: mr->flags |= MR_NOROBOTSTXT;
337: #endif
338:
339: #ifdef HT_MYSQL
340: /* If we can link against a MYSQL database library */
341: } else if (!strncmp(argv[arg], "-sqldb", 5)) {
342: mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
343: argv[++arg] : DEFAULT_SQL_DB;
344:
345: } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
346: mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;
347:
348: } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
349: mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;
350:
351: } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
352: mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;
353:
354: } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
355: mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;
356:
357: } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
358: mr->sqlexternals = YES;
359:
360: } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
361: mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
362: argv[++arg] : DEFAULT_SQL_PW;
363:
364: } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
365: mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
366: argv[++arg] : NULL;
367:
368: } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
369: mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
370: argv[++arg] : DEFAULT_SQL_SERVER;
371:
372: } else if (!strncmp(argv[arg], "-sqluser", 5)) {
373: mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
374: argv[++arg] : DEFAULT_SQL_USER;
375:
376: #endif
377:
1.14 ! vbancrof 378: #ifdef HT_SSL
! 379: } else if (!strncmp(argv[arg], "-verifydepth", 12)) {
! 380: mr->sslverifydepth = (arg+1 < argc && *argv[arg+1] != '-') ?
! 381: atoi(argv[++arg]) : DEFAULT_SSL_VDEPTH;
! 382: } else if (!strncmp(argv[arg], "-sslprot", 8)) {
! 383: if (arg+1 < argc && *argv[arg+1] != '-') {
! 384: if (!strcmp(argv[arg], "v2")) {
! 385: mr->sslprot = HTSSL_V2;
! 386: } else if (!strcmp(argv[arg], "v3")) {
! 387: mr->sslprot = HTSSL_V3;
! 388: } else if (!strcmp(argv[arg], "v23")) {
! 389: mr->sslprot = HTSSL_V23;
! 390: } else {
! 391: mr->sslprot = DEFAULT_SSL_PROT;
! 392: }
! 393: }
! 394: } else if (!strncmp(argv[arg], "-keyfile", 8)) {
! 395: mr->sslkeyfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 396: argv[++arg] : DEFAULT_SSL_KFILE;
! 397: } else if (!strncmp(argv[arg], "-certfile", 9)) {
! 398: mr->sslcertfile = (arg+1 < argc && *argv[arg+1] != '-') ?
! 399: argv[++arg] : DEFAULT_SSL_CFILE;
! 400:
! 401: #endif
! 402:
1.1 frystyk 403: } else {
1.6 frystyk 404: if (SHOW_REAL_QUIET(mr)) HTPrint("Bad Argument (%s)\n", argv[arg]);
1.1 frystyk 405: }
406: } else { /* If no leading `-' then check for URL or keywords */
407: if (!keycnt) {
408: HyperDoc *hd; /* This is new variable */
409: mr->furl = HTParse(argv[arg], mr->cwd, PARSE_ALL);
410: startAnchor = HTAnchor_parent(HTAnchor_findAddress(mr->furl));
411: hd = HyperDoc_new(mr, startAnchor, 0);
412: hd->method = METHOD_GET;
413: keycnt = 1;
414: } else { /* Check for successive keyword arguments */
415: char *escaped = HTEscape(argv[arg], URL_XALPHAS);
416: if (keycnt++ <= 1)
417: keywords = HTChunk_new(128);
418: else
419: HTChunk_putc(keywords, ' ');
420: HTChunk_puts(keywords, HTStrip(escaped));
421: HT_FREE(escaped);
422: }
423: }
424: }
425:
426: if (!keycnt) {
427: VersionInfo();
428: Cleanup(mr, 0);
429: }
1.14 ! vbancrof 430: #ifdef HT_SSL
! 431: /* Set the SSL protocol method. By default, it is the highest
! 432: available protocol. Setting it up to SSL_V23 allows the client
! 433: to negotiate with the server and set up either TSLv1, SSLv3,
! 434: or SSLv2 */
! 435: HTSSL_protMethod_set (mr->sslprot ? mr->sslprot : DEFAULT_SSL_PROT);
! 436:
! 437: /* Set the certificate verification depth to 2 in order to be
! 438: able to
! 439: validate self signed certificates */
! 440: HTSSL_verifyDepth_set (mr->sslverifydepth ?
! 441: mr->sslverifydepth :
! 442: DEFAULT_SSL_VDEPTH);
! 443:
! 444: /* Setup cert stuff */
! 445: if (mr->sslcertfile) {
! 446: HTSSL_certFile_set(mr->sslcertfile);
! 447: HTPrint("Setting certfile %s\n", HTSSL_certFile());
! 448: }
! 449: /* Setup key stuff */
! 450: if (mr->sslkeyfile) {
! 451: HTSSL_keyFile_set(mr->sslkeyfile);
! 452: HTPrint("Setting keyfile %s\n", HTSSL_keyFile());
! 453: }
! 454:
! 455: /* Register SSL stuff for handling ssl access */
! 456: HTSSLhttps_init(YES);
! 457: #endif /* HT_SSL */
1.1 frystyk 458:
459: if (mr->depth != DEFAULT_DEPTH &&
460: (mr->prefix == NULL || *mr->prefix == '*')) {
461: if (SHOW_REAL_QUIET(mr))
1.6 frystyk 462: HTPrint("A depth of more than 0 requires that you also specify a URI prefix.\n",
1.1 frystyk 463: mr->depth);
464: Cleanup(mr, -1);
465: }
466:
1.6 frystyk 467: /* Testing that HTPrint is working */
1.1 frystyk 468: if (mr->flags & MR_TIME) {
469: if (SHOW_REAL_QUIET(mr)) {
470: time_t local = time(NULL);
1.6 frystyk 471: HTPrint("Welcome to the W3C mini Robot version %s - started on %s\n",
1.1 frystyk 472: APP_VERSION, HTDateTimeStr(&local, YES));
473: }
474: }
475:
476: /* Rule file specified? */
477: if (mr->rules) {
478: char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
479: if (!HTLoadRulesAutomatically(rules))
1.6 frystyk 480: if (SHOW_REAL_QUIET(mr)) HTPrint("Can't access rules\n");
1.1 frystyk 481: HT_FREE(rules);
482: }
483:
484: /* Output file specified? */
485: if (mr->outputfile) {
486: if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
1.6 frystyk 487: if (SHOW_REAL_QUIET(mr)) HTPrint("Can't open `%s'\n", mr->outputfile);
1.1 frystyk 488: mr->output = OUTPUT;
489: }
490: }
491:
492: /* This is new */
1.10 frystyk 493: if ((mr->cdepth = (int *) HT_CALLOC(mr->depth+2, sizeof(int)))==NULL)
1.7 frystyk 494: HT_OUTOFMEM("main");
1.1 frystyk 495:
496: /* Should we use persistent cache? */
497: if (cache) {
1.11 frystyk 498: HTCacheInit(cache_root, cache_size);
1.1 frystyk 499:
500: /* Should we start by flushing? */
501: if (flush) HTCache_flushAll();
502: }
503:
504: /* SQL Log specified? */
505: #ifdef HT_MYSQL
506: if (mr->sqlserver) {
507: if ((mr->sqllog =
508: HTSQLLog_open(mr->sqlserver,
509: mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
510: mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW,
511: mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
512: mr->sqlflags)) != NULL) {
513: if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
514: }
515: }
516: #endif
517:
518: /* CLF Log file specified? */
519: if (mr->logfile) {
520: mr->log = HTLog_open(mr->logfile, YES, YES);
521: if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
522: }
523:
524: /* Referer Log file specified? */
525: if (mr->reffile) {
526: mr->ref = HTLog_open(mr->reffile, YES, YES);
527: if (mr->ref)
528: HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
529: }
530:
531: /* Not found error log specified? */
532: if (mr->notfoundfile) {
533: mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
534: if (mr->notfound)
535: HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
536: }
537:
1.10 frystyk 538: /* Check that the redirection code is valid */
539: if (mr->flags & MR_REDIR) {
540: BOOL isredir = NO;
541: if (mr->redir_code == HT_PERM_REDIRECT || mr->redir_code == 0) {
542: HTNet_addAfter(redirection_handler, "http://*" , NULL, HT_PERM_REDIRECT, HT_FILTER_LATE);
543: isredir = YES;
544: }
545: if (mr->redir_code == HT_TEMP_REDIRECT || mr->redir_code == 0) {
546: HTNet_addAfter(redirection_handler, "http://*", NULL, HT_TEMP_REDIRECT, HT_FILTER_LATE);
547: isredir = YES;
548: }
549: if (mr->redir_code == HT_FOUND || mr->redir_code == 0) {
550: HTNet_addAfter(redirection_handler, "http://*", NULL, HT_FOUND, HT_FILTER_LATE);
551: isredir = YES;
552: }
553: if (mr->redir_code == HT_SEE_OTHER || mr->redir_code == 0) {
554: HTNet_addAfter(redirection_handler, "http://*", NULL, HT_SEE_OTHER, HT_FILTER_LATE);
555: isredir = YES;
556: }
557: if (!isredir) {
558: if (SHOW_REAL_QUIET(mr))
559: HTPrint("%d is not a valid redirection code\n", mr->redir_code);
560: Cleanup(mr, -1);
561: }
562: }
563:
1.1 frystyk 564: /* Negotiated resource log specified? */
565: if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);
566:
567: /* No alt tags log file specified? */
568: if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);
569:
570: /* Reject Log file specified? */
571: if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);
572:
573: #ifdef HT_POSIX_REGEX
574: if(!(mr->flags & MR_NOROBOTSTXT))
575: {
576: char *ruri = HTParse(ROBOTS_TXT, mr->furl, PARSE_ALL);
577: char *robot_str = get_robots_txt(ruri);
578: char *reg_exp_robot = robot_str ?
579: scan_robots_txt(robot_str,APP_NAME) : NULL;
1.6 frystyk 580: if (SHOW_REAL_QUIET(mr)) HTPrint("robots.txt uri is `%s'\n", ruri);
1.1 frystyk 581: if(robot_str)
582: HT_FREE(robot_str);
583: if(reg_exp_robot)
584: {
585: mr->exc_robot = get_regtype(mr, reg_exp_robot, W3C_DEFAULT_REGEX_FLAGS);
586: HT_FREE(reg_exp_robot);
587: }
588: HT_FREE(ruri);
589: }
590: #endif
591:
1.4 frystyk 592: /* Add our own HTML HText functions */
593: Robot_registerHTMLParser();
1.9 frystyk 594:
1.1 frystyk 595: /* Register our own terminate filter */
596: HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.3 frystyk 597:
598: /* If doing breath first search */
599: if (mr->flags & MR_BFS)
1.10 frystyk 600: HTNet_addAfter(bfs_terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);
1.1 frystyk 601:
602: /* Setting event timeout */
603: HTHost_setEventTimeout(mr->timer);
604:
605: mr->time = HTGetTimeInMillis();
606:
607: /* Start the request */
608: finger = Finger_new(mr, startAnchor, METHOD_GET);
609:
610: /*
611: ** Make sure that the first request is flushed immediately and not
612: ** buffered in the output buffer
613: */
614: HTRequest_setFlush(finger->request, YES);
615:
616: /*
617: ** Check whether we should do some kind of cache validation on
618: ** the load
619: */
620: if (mr->flags & MR_VALIDATE)
621: HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
622: if (mr->flags & MR_END_VALIDATE)
623: HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);
624:
625: /*
626: ** Now do the load
627: */
628: if (mr->flags & MR_PREEMPTIVE)
629: HTRequest_setPreemptive(finger->request, YES);
630:
631: if (keywords) /* Search */
632: status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
633: else
634: status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);
635:
636: if (keywords) HTChunk_delete(keywords);
637: if (status != YES) {
1.6 frystyk 638: if (SHOW_REAL_QUIET(mr)) HTPrint("Can't access resource\n");
1.1 frystyk 639: Cleanup(mr, -1);
640: }
641:
642: /* Go into the event loop... */
643:
1.3 frystyk 644: if((mr->flags & MR_PREEMPTIVE) && (mr->flags & MR_BFS))
1.1 frystyk 645: Serving_queue(mr);
646: else
647: HTEventList_loop(finger->request);
648:
649:
650: /* Only gets here if event loop fails */
651: Cleanup(mr, 0);
652: return 0;
653: }
Webmaster