Annotation of libwww/Robot/src/HTRobot.c, revision 1.81
1.75 frystyk 1: /*
1.81 ! frystyk 2: ** @(#) $Id: HTRobot.c,v 1.80 1999/01/06 15:38:48 frystyk Exp $
1.75 frystyk 3: **
4: ** W3C Webbot can be found at "http://www.w3.org/Robot/"
5: **
6: ** Copyright 1995-1998 World Wide Web Consortium, (Massachusetts
7: ** Institute of Technology, Institut National de Recherche en
8: ** Informatique et en Automatique, Keio University). All Rights
9: ** Reserved. This program is distributed under the W3C's Software
10: ** Intellectual Property License. This program is distributed in the hope
11: ** that it will be useful, but WITHOUT ANY WARRANTY; without even the
12: ** implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13: ** PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
14: ** details.
1.1 frystyk 15: **
16: ** Authors:
17: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
1.75 frystyk 18: ** BR Bob Racko
19: ** JP John Punin
1.1 frystyk 20: **
21: ** History:
22: ** Dec 04 95 First version
1.75 frystyk 23: ** Oct 1998 Split into separate files
1.1 frystyk 24: */
25:
1.75 frystyk 26: #include "HTRobMan.h"
27: #include "HTQueue.h"
28: #include "HTAncMan.h"
1.51 frystyk 29:
1.62 frystyk 30: #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
31: #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
1.1 frystyk 32:
1.75 frystyk 33: PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};
1.58 frystyk 34:
35: /*
36: ** Some sorting algorithms
37: */
1.63 frystyk 38: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 39:
1.80 frystyk 40: /*
41: ** Ths callbacks that we need from the libwww HTML parser
42: */
43: PRIVATE HText_new RHText_new;
44: PRIVATE HText_delete RHText_delete;
45: PRIVATE HText_foundLink RHText_foundLink;
1.1 frystyk 46:
47: /* ------------------------------------------------------------------------- */
48:
/*      Standard (non-error) Output
**      ---------------------------
**      printf-style formatting onto stdout. The return value is whatever
**      vfprintf() returns for the formatted arguments.
*/
PUBLIC int OutputData(const char * fmt, ...)
{
    va_list args;
    int written;

    va_start(args, fmt);
    written = vfprintf(stdout, fmt, args);
    va_end(args);

    return written;
}
61:
62: /* ------------------------------------------------------------------------- */
63:
1.2 frystyk 64: /* Create a "HyperDoc" object
65: ** --------------------------
66: ** A HyperDoc object contains information about whether we have already
67: ** started checking the anchor and the depth in our search
68: */
1.75 frystyk 69: PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
1.2 frystyk 70: {
71: HyperDoc * hd;
1.14 frystyk 72: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
73: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 74: hd->depth = depth;
1.55 frystyk 75: hd->hits = 1;
1.75 frystyk 76:
77: hd->code = -1;
78: hd->index = ++mr->cindex;
79:
1.2 frystyk 80: /* Bind the HyperDoc object together with the Anchor Object */
81: hd->anchor = anchor;
82: HTAnchor_setDocument(anchor, (void *) hd);
83:
84: /* Add this HyperDoc object to our list */
85: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
86: HTList_addObject(mr->hyperdoc, (void *) hd);
87: return hd;
88: }
89:
90: /* Delete a "HyperDoc" object
91: ** --------------------------
92: */
1.75 frystyk 93: PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
1.2 frystyk 94: {
95: if (hd) {
1.11 frystyk 96: HT_FREE (hd);
1.2 frystyk 97: return YES;
98: }
99: return NO;
100: }
101:
1.55 frystyk 102: /*
103: ** Sort the anchor array and log reference count
104: */
105: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
106: {
107: if (mr && array) {
108: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
109: if (log) {
110: void ** data = NULL;
111: HTParentAnchor * anchor = NULL;
112: HTArray_sort(array, HitSort);
113: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
114: while (anchor) {
115: char * uri = HTAnchor_address((HTAnchor *) anchor);
116: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 117: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 118: HT_FREE(uri);
119: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
120: }
121: }
122: HTLog_close(log);
123: return YES;
124: }
125: return NO;
126: }
127:
128: PRIVATE int HitSort (const void * a, const void * b)
129: {
130: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
131: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
132: if (aa && bb) return (bb->hits - aa->hits);
133: return bb - aa;
134: }
135:
1.58 frystyk 136: /*
1.64 frystyk 137: ** Sort the anchor array and log link relations
138: */
139: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
140: {
141: if (mr && array) {
1.68 frystyk 142: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
143: void ** data = NULL;
144: HTParentAnchor * anchor = NULL;
145: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
146: while (anchor) {
147:
148: /*
149: ** If we have a specific link relation to look for then do this.
150: ** Otherwise look for all link relations.
151: */
152: if (mr->relation) {
153: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
154: if (link) {
155: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
156: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
157: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
158: if (src_uri && dest_uri) {
159: #ifdef HT_MYSQL
160: if (mr->sqllog) {
161: HTSQLLog_addLinkRelationship (mr->sqllog,
162: src_uri, dest_uri,
163: HTAtom_name(mr->relation),
164: NULL);
165: }
166: #endif
167: if (log) {
168: HTFormat format = HTAnchor_format(dest);
169: HTLog_addText(log, "%s %s %s --> %s\n",
170: HTAtom_name(mr->relation),
171: format != WWW_UNKNOWN ?
172: HTAtom_name(format) : "<unknown>",
173: src_uri, dest_uri);
174: }
175:
176: /* Cleanup */
177: HT_FREE(src_uri);
178: HT_FREE(dest_uri);
179: }
180: }
181: } else {
182: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
183: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
184: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
185: HTLinkType linktype;
186:
187: /* First look in the main link */
188: if (link && (linktype = HTLink_type(link))) {
189: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
190: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
191: if (src_uri && dest_uri) {
192: #ifdef HT_MYSQL
193: if (mr->sqllog) {
194: HTSQLLog_addLinkRelationship (mr->sqllog,
195: src_uri, dest_uri,
196: HTAtom_name(linktype),
197: NULL);
198: }
199: #endif
200: if (log) {
201: HTFormat format = HTAnchor_format(dest);
202: HTLog_addText(log, "%s %s %s --> %s\n",
203: HTAtom_name(linktype),
204: format != WWW_UNKNOWN ?
205: HTAtom_name(format) : "<unknown>",
206: src_uri, dest_uri);
207: }
208: }
209: HT_FREE(dest_uri);
210: }
211:
212: /* and then in any sublinks */
213: if (sublinks) {
214: HTLink * pres;
215: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
216: if ((linktype = HTLink_type(pres))) {
217: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 218: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 frystyk 219: if (src_uri && dest_uri) {
220: #ifdef HT_MYSQL
221: if (mr->sqllog) {
222: HTSQLLog_addLinkRelationship (mr->sqllog,
223: src_uri, dest_uri,
224: HTAtom_name(linktype),
225: NULL);
226: }
227: #endif
228: if (log) {
229: HTFormat format = HTAnchor_format(dest);
230: HTLog_addText(log, "%s %s %s --> %s\n",
231: HTAtom_name(linktype),
232: format != WWW_UNKNOWN ?
233: HTAtom_name(format) : "<unknown>",
234: src_uri, dest_uri);
235: }
1.64 frystyk 236: HT_FREE(dest_uri);
237: }
238: }
239: }
240: }
1.68 frystyk 241:
242: /* Cleanup */
243: HT_FREE(src_uri);
244: }
245: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 246: }
1.68 frystyk 247: if (log) HTLog_close(log);
1.64 frystyk 248: return YES;
249: }
250: return NO;
251: }
252:
253: /*
1.63 frystyk 254: ** Sort the anchor array and log last modified date
255: */
256: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
257: {
258: if (mr && array) {
259: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
260: if (log) {
261: void ** data = NULL;
262: HTParentAnchor * anchor = NULL;
263: HTArray_sort(array, LastModifiedSort);
264: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
265: while (anchor) {
266: char * uri = HTAnchor_address((HTAnchor *) anchor);
267: time_t lm = HTAnchor_lastModified(anchor);
268: if (uri && lm > 0)
269: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
270: HT_FREE(uri);
271: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
272: }
273: }
274: HTLog_close(log);
275: return YES;
276: }
277: return NO;
278: }
279:
280: PRIVATE int LastModifiedSort (const void * a, const void * b)
281: {
282: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
283: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
284: return bb - aa;
285: }
286:
287: /*
288: ** Sort the anchor array and log the document title
289: */
290: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
291: {
292: if (mr && array) {
293: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
294: if (log) {
295: void ** data = NULL;
296: HTParentAnchor * anchor = NULL;
297: HTArray_sort(array, TitleSort);
298: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
299: while (anchor) {
300: char * uri = HTAnchor_address((HTAnchor *) anchor);
301: const char * title = HTAnchor_title(anchor);
302: HTCharset charset = HTAnchor_charset(anchor);
303: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
304: charset ? HTAtom_name(charset) : "<none>",
305: title ? title : "<none>",
306: uri);
307: HT_FREE(uri);
308: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
309: }
310: }
311: HTLog_close(log);
312: return YES;
313: }
314: return NO;
315: }
316:
317: PRIVATE int TitleSort (const void * a, const void * b)
318: {
319: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
320: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
321: return strcasecomp(bb?bb:"", aa?aa:"");
322: }
323:
324: /*
1.58 frystyk 325: ** Calculate distributions for media types. The same mechanism
326: ** can be used for other characteristics with relatively
327: ** few outcomes.
328: */
329: PRIVATE HTList * mediatype_distribution (HTArray * array)
330: {
331: if (array) {
332: HTList * mt = HTList_new();
333: MetaDist * pres = NULL;
334: void ** data = NULL;
335: HTParentAnchor * anchor = NULL;
336: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
337: while (anchor) {
338: HTFormat format = HTAnchor_format(anchor);
339: if (format && format != WWW_UNKNOWN) {
340: HTList * cur = mt;
341:
342: /* If found then increase counter */
343: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
344: if (pres->name == format) {
345: pres->hits++;
346: break;
347: }
348: }
349:
350: /* If not found then add new format to list */
351: if (!pres) {
352: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
353: HT_OUTOFMEM("mediatype_distribution");
354: pres->name = format;
355: pres->hits = 1;
356: HTList_addObject(mt, pres);
357: HTList_insertionSort(mt, FormatSort);
358: }
359: }
360:
361: /* Find next anchor in array */
362: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
363: }
364: return mt;
365: }
366: return NULL;
367: }
368:
1.60 frystyk 369: /*
370: ** Calculate distributions for charsets. The same mechanism
371: ** can be used for other characteristics with relatively
372: ** few outcomes.
373: */
374: PRIVATE HTList * charset_distribution (HTArray * array)
375: {
376: if (array) {
377: HTList * cs = HTList_new();
378: MetaDist * pres = NULL;
379: void ** data = NULL;
380: HTParentAnchor * anchor = NULL;
381: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
382: while (anchor) {
383: HTCharset charset = HTAnchor_charset(anchor);
384: if (charset) {
385: HTList * cur = cs;
386:
387: /* If found then increase counter */
388: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
389: if (pres->name == charset) {
390: pres->hits++;
391: break;
392: }
393: }
394:
395: /* If not found then add new format to list */
396: if (!pres) {
397: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
398: HT_OUTOFMEM("charset_distribution");
399: pres->name = charset;
400: pres->hits = 1;
401: HTList_addObject(cs, pres);
402: HTList_insertionSort(cs, FormatSort);
403: }
404: }
405:
406: /* Find next anchor in array */
407: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
408: }
409: return cs;
410: }
411: return NULL;
412: }
413:
1.58 frystyk 414: PRIVATE int FormatSort (const void * a, const void * b)
415: {
416: MetaDist * aa = (MetaDist *) a;
417: MetaDist * bb = (MetaDist *) b;
418: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
419: }
420:
421: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
422: {
423: if (logfile && distribution) {
424: HTLog * log = HTLog_open(logfile, YES, YES);
425: if (log) {
426: HTList * cur = distribution;
427: MetaDist * pres;
428: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
429: if (pres->name) {
1.60 frystyk 430: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 431: }
432: }
433: HTLog_close(log);
434: }
435: }
436: return NO;
437: }
438:
439: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
440: {
441: if (distribution) {
442: HTList * cur = distribution;
443: MetaDist * pres;
444: while ((pres = (MetaDist *) HTList_nextObject(cur)))
445: HT_FREE(pres);
446: HTList_delete(distribution);
447: return YES;
448: }
449: return NO;
450: }
451:
452:
1.55 frystyk 453: /* Statistics
454: ** ----------
455: ** Calculates a bunch of statistics for the anchors traversed
456: */
457: PRIVATE BOOL calculate_statistics (Robot * mr)
458: {
1.59 frystyk 459: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 460: if (!mr) return NO;
461:
462: /* Calculate efficiency */
1.59 frystyk 463: if (mr->time > 0) {
1.56 frystyk 464: ms_t t = HTGetTimeInMillis() - mr->time;
465: if (t > 0) {
1.60 frystyk 466: double loadfactor = (mr->get_bytes / (t * 0.001));
467: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 468: double secs = t / 1000.0;
1.55 frystyk 469: char bytes[50];
1.62 frystyk 470: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 471: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 472: total_docs, secs, reqprsec);
1.59 frystyk 473:
474: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 475: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 476: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 477: mr->get_docs, bytes, loadfactor);
1.59 frystyk 478:
479: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 480: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 481: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 482: mr->head_docs, bytes);
1.55 frystyk 483: }
484: }
485:
486: /* Create an array of existing anchors */
1.59 frystyk 487: if (total_docs > 1) {
488: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 489: if (array) {
490:
1.63 frystyk 491: /* Distributions */
492: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 493: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 494: }
495:
1.55 frystyk 496: /* Sort after hit counts */
1.63 frystyk 497: if (mr->hitfile) {
498: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 499: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 500: mr->hitfile);
501: calculate_hits(mr, array);
502: }
503:
1.64 frystyk 504: /* Sort after link relations */
1.68 frystyk 505: #ifdef HT_MYSQL
506: if (mr->relfile || mr->sqllog) {
1.69 frystyk 507: #else
508: if (mr->relfile) {
509: #endif
1.68 frystyk 510: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.64 frystyk 511: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
512: mr->relfile);
513: calculate_linkRelations(mr, array);
514: }
515:
1.63 frystyk 516: /* Sort after modified date */
517: if (mr->lmfile) {
518: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 519: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 520: mr->lmfile);
521: calculate_lm(mr, array);
522: }
523:
524: /* Sort after title */
525: if (mr->titlefile) {
526: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 527: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 528: mr->titlefile);
529: calculate_title(mr, array);
530: }
1.55 frystyk 531:
1.58 frystyk 532: /* Find mediatype distribution */
533: if (mr->mtfile) {
534: HTList * mtdist = mediatype_distribution(array);
535: if (mtdist) {
1.63 frystyk 536: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 537: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 538: mr->mtfile);
1.58 frystyk 539: log_meta_distribution(mr->mtfile, mtdist);
540: delete_meta_distribution(mtdist);
541: }
542: }
1.55 frystyk 543:
1.60 frystyk 544: /* Find charset distribution */
545: if (mr->charsetfile) {
546: HTList * charsetdist = charset_distribution(array);
547: if (charsetdist) {
1.63 frystyk 548: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 549: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 550: mr->charsetfile);
1.60 frystyk 551: log_meta_distribution(mr->charsetfile, charsetdist);
552: delete_meta_distribution(charsetdist);
553: }
554: }
555:
1.55 frystyk 556: /* Add as may other stats here as you like */
1.60 frystyk 557: /* ... */
1.58 frystyk 558:
559: /* Delete the array */
1.55 frystyk 560: HTArray_delete(array);
561: }
562: }
563: return YES;
564: }
565:
1.75 frystyk 566: PRIVATE HTParentAnchor *
567: get_last_parent(HTParentAnchor *anchor)
568: {
569: HTAnchor *anc;
570: HTList *sources = anchor->sources;
571:
572: while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL)
573: {
574: HTParentAnchor *panchor = HTAnchor_parent(anc);
575: return panchor;
576: }
577: return NULL;
578: }
579:
580: PRIVATE void
581: set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
582: {
583: HTList * cur = HTRequest_error(request);
584: HTError *pres;
585:
586: while((pres = (HTError *) HTList_nextObject(cur)) != NULL)
587: {
588: int code =HTErrors[HTError_index(pres)].code;
589:
590: hd->code = code;
591: }
592: }
593:
594:
/*
**  Return 1 if the string contains at least one space character (' ')
**  and 0 otherwise. `uri' must be a valid NUL-terminated string.
*/
PRIVATE int
test_for_blank_spaces(char *uri)
{
    return strchr(uri, ' ') != NULL;
}
604:
605:
1.1 frystyk 606: /* Create a Command Line Object
607: ** ----------------------------
608: */
1.75 frystyk 609: PUBLIC Robot * Robot_new (void)
1.1 frystyk 610: {
611: Robot * me;
1.41 frystyk 612: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 613: HT_OUTOFMEM("Robot_new");
1.2 frystyk 614: me->hyperdoc = HTList_new();
1.4 frystyk 615: me->htext = HTList_new();
1.74 frystyk 616: me->timer = DEFAULT_TIMEOUT*MILLIES;
1.75 frystyk 617: me->waits = 0;
1.25 frystyk 618: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 619: me->output = OUTPUT;
1.35 eric 620: me->cnt = 0;
1.75 frystyk 621: me->ndoc = -1;
1.34 eric 622: me->fingers = HTList_new();
1.75 frystyk 623:
624: /* This is new */
625: me->queue = HTQueue_new();
626: me->cq = 0;
627: me->furl = NULL;
628:
1.1 frystyk 629: return me;
630: }
631:
632: /* Delete a Command Line Object
633: ** ----------------------------
634: */
1.62 frystyk 635: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 636: {
1.62 frystyk 637: if (mr) {
638: HTList_delete(mr->fingers);
1.55 frystyk 639:
640: /* Calculate statistics */
1.62 frystyk 641: calculate_statistics(mr);
1.55 frystyk 642:
1.62 frystyk 643: if (mr->hyperdoc) {
644: HTList * cur = mr->hyperdoc;
1.2 frystyk 645: HyperDoc * pres;
646: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
647: HyperDoc_delete(pres);
1.62 frystyk 648: HTList_delete(mr->hyperdoc);
1.2 frystyk 649: }
1.62 frystyk 650: if (mr->htext) {
651: HTList * cur = mr->htext;
1.4 frystyk 652: HText * pres;
653: while ((pres = (HText *) HTList_nextObject(cur)))
1.80 frystyk 654: RHText_delete(pres);
1.62 frystyk 655: HTList_delete(mr->htext);
1.4 frystyk 656: }
1.62 frystyk 657:
658: /* Close all the log files */
1.63 frystyk 659: if (mr->flags & MR_LOGGING) {
1.64 frystyk 660: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 661: }
662:
1.62 frystyk 663: if (mr->log) {
664: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 665: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 666: HTLog_accessCount(mr->log), mr->logfile);
667: HTLog_close(mr->log);
668: }
669: if (mr->ref) {
670: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 671: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 672: HTLog_accessCount(mr->ref), mr->reffile);
673: HTLog_close(mr->ref);
674: }
675: if (mr->reject) {
676: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 677: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 678: HTLog_accessCount(mr->reject), mr->rejectfile);
679: HTLog_close(mr->reject);
680: }
681: if (mr->notfound) {
682: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 683: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 684: HTLog_accessCount(mr->notfound), mr->notfoundfile);
685: HTLog_close(mr->notfound);
686: }
687: if (mr->conneg) {
688: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 689: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 690: HTLog_accessCount(mr->conneg), mr->connegfile);
691: HTLog_close(mr->conneg);
692: }
693: if (mr->noalttag) {
694: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 695: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 696: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
697: HTLog_close(mr->noalttag);
698: }
699:
700: if (mr->output && mr->output != STDOUT) fclose(mr->output);
701:
702: if (mr->flags & MR_TIME) {
1.12 frystyk 703: time_t local = time(NULL);
1.62 frystyk 704: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 705: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 706: }
1.55 frystyk 707:
1.75 frystyk 708: /* This is new */
709: if(mr->cdepth)
710: HT_FREE(mr->cdepth);
711: if(mr->furl) HT_FREE(mr->furl);
712:
1.58 frystyk 713: #ifdef HT_POSIX_REGEX
1.62 frystyk 714: if (mr->include) {
715: regfree(mr->include);
716: HT_FREE(mr->include);
717: }
718: if (mr->exclude) {
719: regfree(mr->exclude);
720: HT_FREE(mr->exclude);
721: }
1.75 frystyk 722: if (mr->exc_robot) {
723: regfree(mr->exc_robot);
724: HT_FREE(mr->exc_robot);
725: }
1.62 frystyk 726: if (mr->check) {
727: regfree(mr->check);
728: HT_FREE(mr->check);
1.58 frystyk 729: }
730: #endif
731:
1.68 frystyk 732: #ifdef HT_MYSQL
733: if (mr->sqllog) {
734: HTSQLLog_close(mr->sqllog);
735: mr->sqllog = NULL;
736: }
737: #endif
738:
1.81 ! frystyk 739: if (mr->queue) HTQueue_delete(mr->queue);
1.62 frystyk 740: HT_FREE(mr->cwd);
741: HT_FREE(mr->prefix);
742: HT_FREE(mr->img_prefix);
743: HT_FREE(mr);
1.1 frystyk 744: return YES;
745: }
746: return NO;
747: }
748:
1.2 frystyk 749: /*
1.34 eric 750: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 751: */
1.75 frystyk 752: PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 753: {
1.34 eric 754: Finger * me;
755: HTRequest * request = HTRequest_new();
756: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
757: HT_OUTOFMEM("Finger_new");
758: me->robot = robot;
759: me->request = request;
760: me->dest = dest;
761: HTList_addObject(robot->fingers, (void *)me);
762:
1.48 frystyk 763: /* Set the context for this request */
1.34 eric 764: HTRequest_setContext (request, me);
1.48 frystyk 765:
766: /* Check the various flags to customize the request */
767: if (robot->flags & MR_PREEMPTIVE)
768: HTRequest_setPreemptive(request, YES);
769: if (robot->flags & MR_VALIDATE)
770: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
771: if (robot->flags & MR_END_VALIDATE)
772: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
773:
774: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 775: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 776:
777: /* Set the method for this request */
1.34 eric 778: HTRequest_setMethod(request, method);
779: robot->cnt++;
780: return me;
1.2 frystyk 781: }
782:
1.34 eric 783: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 784: {
1.34 eric 785: HTList_removeObject(me->robot->fingers, (void *)me);
786: me->robot->cnt--;
1.37 frystyk 787:
788: /*
789: ** If we are down at one request then flush the output buffer
790: */
791: if (me->request) {
792: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 793: HTRequest_delete(me->request);
1.37 frystyk 794: }
795:
796: /*
797: ** Delete the request and free myself
798: */
1.34 eric 799: HT_FREE(me);
800: return YES;
1.2 frystyk 801: }
802:
803: /*
804: ** Cleanup and make sure we close all connections including the persistent
805: ** ones
806: */
1.75 frystyk 807: PUBLIC void Cleanup (Robot * me, int status)
1.1 frystyk 808: {
1.81 ! frystyk 809: HTProfile_delete();
1.1 frystyk 810: Robot_delete(me);
1.50 frystyk 811: #ifdef HT_MEMLOG
1.39 eric 812: HTMemLog_close();
1.47 frystyk 813: #endif
814:
1.1 frystyk 815: #ifdef VMS
816: exit(status ? status : 1);
817: #else
818: exit(status ? status : 0);
819: #endif
820: }
821:
822: #ifdef CATCH_SIG
823: #include <signal.h>
824: /* SetSignal
825: ** This function sets up signal handlers. This might not be necessary to
826: ** call if the application has its own handlers (lossage on SVR4)
827: */
1.75 frystyk 828: PUBLIC void SetSignal (void)
1.1 frystyk 829: {
830: /* On some systems (SYSV) it is necessary to catch the SIGPIPE signal
831: ** when attemting to connect to a remote host where you normally should
832: ** get `connection refused' back
833: */
834: if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
1.13 eric 835: if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
1.1 frystyk 836: } else {
1.13 eric 837: if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
1.1 frystyk 838: }
1.47 frystyk 839:
1.50 frystyk 840: #ifdef HT_MEMLOG
1.44 eric 841: HTMemLog_flush();
1.47 frystyk 842: #endif
843:
1.1 frystyk 844: }
845: #endif /* CATCH_SIG */
846:
1.58 frystyk 847: #ifdef HT_POSIX_REGEX
848: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
849: {
850: size_t length = regerror (errcode, compiled, NULL, 0);
851: char * str = NULL;
852: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
853: HT_OUTOFMEM("get_regerror");
854: (void) regerror (errcode, compiled, str, length);
855: return str;
856: }
857:
1.75 frystyk 858: PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 859: {
860: regex_t * regex = NULL;
861: if (regex_str && *regex_str) {
862: int status;
863: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
864: HT_OUTOFMEM("get_regtype");
1.60 frystyk 865: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 866: char * err_msg = get_regerror(status, regex);
1.62 frystyk 867: if (SHOW_REAL_QUIET(mr))
868: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 869: HT_FREE(err_msg);
870: Cleanup(mr, -1);
871: }
872: }
873: return regex;
874: }
875: #endif
876:
1.75 frystyk 877: PUBLIC void VersionInfo (void)
1.1 frystyk 878: {
1.75 frystyk 879: OutputData("\nW3C OpenSource Software");
880: OutputData("\n-----------------------\n\n");
881: OutputData("\tWebbot version %s\n", APP_VERSION);
882: OutputData("\tusing the W3C libwww library version %s.\n\n",HTLib_version());
883: OutputData("\tSee \"%s\" for help\n", COMMAND_LINE);
884: OutputData("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
885: OutputData("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");
886: OutputData("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
887: OutputData("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
1.1 frystyk 888: }
889:
890: /* terminate_handler
891: ** -----------------
1.2 frystyk 892: ** This function is registered to handle the result of the request.
893: ** If no more requests are pending then terminate program
1.1 frystyk 894: */
1.75 frystyk 895: PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
1.32 frystyk 896: void * param, int status)
1.1 frystyk 897: {
1.34 eric 898: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 899: Robot * mr = finger->robot;
1.62 frystyk 900: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 901:
1.68 frystyk 902: #ifdef HT_MYSQL
903: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
904: #endif
905:
1.58 frystyk 906: /* Check if negotiated resource and whether we should log that*/
907: if (mr->conneg) {
908: HTAssocList * cur = HTResponse_variant(response);
909: if (cur) {
910: BOOL first = YES;
911: HTChunk * buffer = HTChunk_new(128);
912: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
913: HTAssoc * pres;
1.60 frystyk 914: HTChunk_puts(buffer, uri);
1.58 frystyk 915: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
916: char * value = HTAssoc_value(pres);
917: if (first) {
1.60 frystyk 918: HTChunk_puts(buffer, "\t(");
1.58 frystyk 919: first = NO;
920: } else
921: HTChunk_puts(buffer, ", ");
922:
923: /* Output the name */
924: HTChunk_puts(buffer, HTAssoc_name(pres));
925:
926: /* Only output the value if not empty string */
1.60 frystyk 927: if (value && *value) {
1.58 frystyk 928: HTChunk_puts(buffer, "=");
929: HTChunk_puts(buffer, value);
930: }
931: }
1.60 frystyk 932: if (!first) HTChunk_puts(buffer, ")");
933: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 934: HTChunk_delete(buffer);
935: HT_FREE(uri);
936: }
937: }
938:
1.55 frystyk 939: /* Count the amount of body data that we have read */
1.59 frystyk 940: if (HTRequest_method(request) == METHOD_GET) {
941: int length = HTAnchor_length(HTRequest_anchor(request));
942: if (length > 0) mr->get_bytes += length;
943: mr->get_docs++;
944: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 945: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 946: if (length > 0) mr->head_bytes += length;
947: mr->head_docs++;
948: } else {
949: mr->other_docs++;
1.55 frystyk 950: }
951:
1.78 frystyk 952: if (!(mr->flags & MR_BFS)) {
953:
954: /* Delete this thread */
955: Finger_delete(finger);
956:
957: /* Should we stop? */
958: if (mr->cnt <= 0) {
959: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
960: Cleanup(mr, 0); /* No way back from here */
961: }
962: }
963:
1.75 frystyk 964: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
965: return HT_OK;
966:
967: }
968: PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,
969: void * param, int status)
970: {
971: Finger * finger = (Finger *) HTRequest_context(request);
972: Robot * mr = finger->robot;
973: HTParentAnchor * dest = finger->dest;
974: HyperDoc * hd = HTAnchor_document(dest);
975: int depth = (hd ? hd->depth : -1);
976:
977: if (hd) set_error_state_hyperdoc(hd,request);
978:
979: if(hd && (HTRequest_method(request)== METHOD_HEAD) &&
980: (depth < mr->depth))
981: {
982: hd->method = METHOD_GET;
983: HTQueue_append(mr->queue, (void *)hd); (mr->cq)++;
984: }
1.58 frystyk 985:
1.34 eric 986: Finger_delete(finger);
1.55 frystyk 987:
1.75 frystyk 988: if(!(mr->flags & MR_PREEMPTIVE))
989: Serving_queue(mr);
990:
991: return HT_OK;
992: }
993:
994:
995: PUBLIC void Serving_queue(Robot *mr)
996: {
997: BOOL abort = NO;
998: Finger *nfinger;
999:
1000: while(!abort)
1001: {
1002: if(!HTQueue_isEmpty(mr->queue))
1003: {
1004: HTRequest *newreq;
1005:
1006: HyperDoc *nhd = (HyperDoc *)HTQueue_headOfQueue(mr->queue);
1007:
1008: if(nhd)
1009: {
1010: char *uri = HTAnchor_address((HTAnchor *)nhd->anchor);
1011: HTQueue_dequeue(mr->queue); (mr->cq)--;
1012:
1013: nfinger = Finger_new(mr, nhd->anchor, nhd->method);
1014:
1015: newreq = nfinger->request;
1016:
1017: if(SHOW_QUIET(mr)) HTTrace("Request from QUEUE %s\n",uri);
1018: HT_FREE(uri);
1019: if(SHOW_QUIET(mr)) HTTrace("%d elements in queue \n", mr->cq);
1020:
1021: HTRequest_setParent(newreq,get_last_parent(nhd->anchor));
1022:
1.76 frystyk 1023: /* @@@ Should be done using a timer and not sleep! @@@ */
1024: #if 0
1.75 frystyk 1025: if(mr->waits)
1026: sleep(mr->waits);
1.76 frystyk 1027: #endif
1.75 frystyk 1028:
1029: if (HTLoadAnchor((HTAnchor *)nhd->anchor , newreq) != YES)
1030: {
1031: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1032: Finger_delete(nfinger);
1033: }
1034: }
1035: else
1036: abort = YES;
1037: }
1038: else
1039: abort = YES;
1040: }
1041:
1042: if(SHOW_QUIET(mr)) HTTrace("Queue size: %d \n", mr->cq);
1043:
1044: if (mr->cnt <= 0 || (abort && (mr->flags & MR_PREEMPTIVE)))
1045: {
1046: if(mr->cnt > 0)
1047: if(SHOW_QUIET(mr)) HTTrace("%d requests were not served\n", mr->cnt);
1048:
1.62 frystyk 1049: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 1050: Cleanup(mr, 0); /* No way back from here */
1.75 frystyk 1051: }
1.1 frystyk 1052: }
1053:
1054: /* ------------------------------------------------------------------------- */
1055: /* HTEXT INTERFACE */
1056: /* ------------------------------------------------------------------------- */
1057:
1.80 frystyk 1058: PUBLIC BOOL Robot_registerHTMLParser (void)
1059: {
1060: HText_registerCDCallback(RHText_new, RHText_delete);
1061: HText_registerLinkCallback(RHText_foundLink);
1062: return YES;
1063: }
1064:
1065: PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor,
1066: HTStream * stream)
1.1 frystyk 1067: {
1068: HText * me;
1.34 eric 1069: Finger * finger = (Finger *) HTRequest_context(request);
1070: Robot * mr = finger->robot;
1.65 frystyk 1071: char * robots = NULL;
1072:
1.14 frystyk 1073: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1074: HT_OUTOFMEM("HText_new2");
1.4 frystyk 1075:
1076: /* Bind the HText object together with the Request Object */
1.1 frystyk 1077: me->request = request;
1.65 frystyk 1078: me->follow = YES;
1079:
1080: /* Check to see if we have any meta tags */
1.77 frystyk 1081: if (!(mr->flags & MR_NOMETATAGS) && (robots = HTAnchor_robots(anchor)) != NULL) {
1.65 frystyk 1082: char * strval = NULL;
1083: char * ptr = NULL;
1084: char * token = NULL;
1085: StrAllocCopy(strval, robots);
1086: ptr = strval;
1087: while ((token = HTNextField(&ptr)) != NULL) {
1088: if (!strcasecomp(token, "nofollow")) {
1089: me->follow = NO;
1090: break;
1091: }
1092: }
1093: HT_FREE(strval);
1094: }
1.4 frystyk 1095:
1096: /* Add this HyperDoc object to our list */
1097: if (!mr->htext) mr->htext = HTList_new();
1098: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1099: return me;
1100: }
1101:
1.80 frystyk 1102: PRIVATE BOOL RHText_delete (HText * me) {
1.81 ! frystyk 1103: if (me) {
! 1104: HT_FREE(me);
! 1105: return YES;
! 1106: }
! 1107: return NO;
1.4 frystyk 1108: }
1109:
1.80 frystyk 1110: PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor)
1.1 frystyk 1111: {
1112: if (text && anchor) {
1.34 eric 1113: Finger * finger = (Finger *) HTRequest_context(text->request);
1114: Robot * mr = finger->robot;
1.1 frystyk 1115: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1116: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1117: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1118: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1119: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1120: BOOL match = text->follow;
1.58 frystyk 1121: BOOL check = NO;
1.1 frystyk 1122:
1.75 frystyk 1123: /* These are new variables */
1124: HyperDoc * nhd = NULL;
1125: BOOL follow = YES;
1126:
1127: /* These three variables were moved */
1128: /*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
1129: HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
1130: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1131: int depth = last_doc ? last_doc->depth+1 : 0;
1132:
1.55 frystyk 1133: if (!uri) return;
1.75 frystyk 1134: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");
1.55 frystyk 1135:
1136: if (hd) {
1.75 frystyk 1137: if (SHOW_QUIET(mr)) HTTrace("............ Already checked\n");
1.55 frystyk 1138: hd->hits++;
1.68 frystyk 1139: #ifdef HT_MYSQL
1140: if (mr->sqllog) {
1141: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1142: if (ref_addr) {
1143: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1144: "referer", NULL);
1145: HT_FREE(ref_addr);
1146: }
1147: }
1148: #endif
1.58 frystyk 1149: HT_FREE(uri);
1150: return;
1151: }
1.70 frystyk 1152:
1.58 frystyk 1153: /* Check for prefix match */
1.65 frystyk 1154: if (match && mr->prefix) {
1155: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1156: }
1.58 frystyk 1157:
1158: #ifdef HT_POSIX_REGEX
1.69 frystyk 1159: /*
1160: ** Check for any regular expression. The include may override
1161: ** the prefix matching
1162: */
1163: if (mr->include) {
1.58 frystyk 1164: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1165: }
1.75 frystyk 1166: if (match && mr->exc_robot) {
1167: match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
1168: }
1.58 frystyk 1169: if (match && mr->exclude) {
1170: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1171: }
1172: if (match && mr->check) {
1173: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1174: }
1175: #endif
1.75 frystyk 1176: if(uri && test_for_blank_spaces(uri))
1177: follow = NO;
1178: else if (mr->ndoc == 0) /* Number of Documents is reached */
1179: follow = NO;
1180:
1181: /* Test whether we already have a hyperdoc for this document */
1182: if(!hd && dest_parent)
1183: {
1184: nhd = HyperDoc_new(mr, dest_parent, depth);
1185: mr->cdepth[depth]++;
1186: }
1.58 frystyk 1187:
1188: /* Test whether we already have a hyperdoc for this document */
1.78 frystyk 1189: if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {
1190: if (mr->flags & MR_BFS) {
1191: nhd->method = METHOD_HEAD;
1192: HTQueue_enqueue(mr->queue, (void *) nhd);
1193: (mr->cq)++;
1194: if(mr->ndoc > 0) mr->ndoc--;
1195: } else {
1196: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1197: HTRequest * newreq = newfinger->request;
1198: HTRequest_setParent(newreq, referer);
1199: if (check || depth >= mr->depth) {
1200: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1201: HTRequest_setMethod(newreq, METHOD_HEAD);
1202: } else {
1203: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1204: }
1205: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1206: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1207: Finger_delete(newfinger);
1208: }
1209: }
1.75 frystyk 1210:
1.7 frystyk 1211: } else {
1.75 frystyk 1212: if (SHOW_QUIET(mr)) HTTrace("............ does not fulfill constraints\n");
1.68 frystyk 1213: #ifdef HT_MYSQL
1214: if (mr->reject || mr->sqllog) {
1215: #else
1.60 frystyk 1216: if (mr->reject) {
1.68 frystyk 1217: #endif
1.60 frystyk 1218: if (referer) {
1219: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1220: if (mr->reject && ref_addr)
1221: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1222: #ifdef HT_MYSQL
1223: if (mr->sqllog && mr->sqlexternals && ref_addr)
1224: HTSQLLog_addLinkRelationship(mr->sqllog,
1225: ref_addr, uri,
1226: "referer", NULL);
1227: #endif
1228:
1.60 frystyk 1229: HT_FREE(ref_addr);
1230: }
1231: }
1.2 frystyk 1232: }
1.11 frystyk 1233: HT_FREE(uri);
1.2 frystyk 1234: }
1235: }
1236:
1.80 frystyk 1237: PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor,
1238: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1239: {
1240: if (text && anchor) {
1.34 eric 1241: Finger * finger = (Finger *) HTRequest_context(text->request);
1242: Robot * mr = finger->robot;
1.75 frystyk 1243:
1.59 frystyk 1244: if (mr->flags & MR_IMG) {
1.60 frystyk 1245: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1246: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1247: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1248: HyperDoc * hd = HTAnchor_document(dest_parent);
1249: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1250: BOOL match = YES;
1251:
1.72 frystyk 1252: if (!uri) return;
1.59 frystyk 1253: if (hd) {
1.75 frystyk 1254: if (SHOW_QUIET(mr)) HTTrace("............ Already checked\n");
1.59 frystyk 1255: hd->hits++;
1.68 frystyk 1256: #ifdef HT_MYSQL
1257: if (mr->sqllog) {
1258: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1259: if (ref_addr) {
1260: HTSQLLog_addLinkRelationship(mr->sqllog,
1261: ref_addr, uri,
1262: "image", alt);
1263: HT_FREE(ref_addr);
1264: }
1265: }
1266: #endif
1.11 frystyk 1267: HT_FREE(uri);
1.59 frystyk 1268: return;
1.2 frystyk 1269: }
1.59 frystyk 1270:
1271: /* Check for prefix match */
1272: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1273:
1.79 br334 1274: #ifdef HT_POSIX_REGEX
1275: /*
1276: ** Check for any regular expression. The include may override
1277: ** the prefix matching
1278: */
1279: if (mr->include) {
1280: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1281: }
1282: if (match && mr->exc_robot) {
1283: match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
1284: }
1285: if (match && mr->exclude) {
1286: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1287: }
1288: #endif
1.59 frystyk 1289: /* Test whether we already have a hyperdoc for this document */
1290: if (match && dest) {
1.60 frystyk 1291: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1292: mr->flags & MR_SAVE ?
1293: METHOD_GET : METHOD_HEAD);
1294: HTRequest * newreq = newfinger->request;
1.60 frystyk 1295: HyperDoc_new(mr, dest_parent, 1);
1296: HTRequest_setParent(newreq, referer);
1297:
1298: /* Check whether we should report missing ALT tags */
1299: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1300: if (referer) {
1301: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1302: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1303: HT_FREE(ref_addr);
1304: }
1305: }
1306:
1.62 frystyk 1307: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1308: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1309: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1310: Finger_delete(newfinger);
1311: }
1312: } else {
1.75 frystyk 1313: if (SHOW_QUIET(mr)) HTTrace("............ does not fulfill constraints\n");
1.68 frystyk 1314: #ifdef HT_MYSQL
1315: if (mr->reject || mr->sqllog) {
1316: #else
1.60 frystyk 1317: if (mr->reject) {
1.68 frystyk 1318: #endif
1.60 frystyk 1319: if (referer) {
1320: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1321: if (mr->reject && ref_addr)
1322: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1323: #ifdef HT_MYSQL
1324: if (mr->sqllog && mr->sqlexternals && ref_addr)
1325: HTSQLLog_addLinkRelationship(mr->sqllog,
1326: ref_addr, uri,
1327: "image", alt);
1328: #endif
1329:
1.60 frystyk 1330: HT_FREE(ref_addr);
1331: }
1332: }
1.1 frystyk 1333: }
1.59 frystyk 1334: HT_FREE(uri);
1.72 frystyk 1335: }
1336: }
1337: }
1338:
1.80 frystyk 1339: PRIVATE void RHText_foundLink (HText * text,
1340: int element_number, int attribute_number,
1341: HTChildAnchor * anchor,
1342: const BOOL * present, const char ** value)
1.72 frystyk 1343: {
1344: if (text && anchor) {
1345: Finger * finger = (Finger *) HTRequest_context(text->request);
1346: Robot * mr = finger->robot;
1347: if (SHOW_QUIET(mr))
1.80 frystyk 1348: HTTrace("Robot....... Received element %d, attribute %d with anchor %p\n",
1349: element_number, attribute_number, anchor);
1350: if ((element_number==HTML_IMG && attribute_number==HTML_IMG_SRC) ||
1351: (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND))
1352: RHText_foundImage(text, anchor, NULL, NULL, NO);
1353: else
1354: RHText_foundAnchor(text, anchor);
1.72 frystyk 1355: }
1356: }
1357:
1.80 frystyk 1358: PUBLIC char * get_robots_txt(char * uri)
1.48 frystyk 1359: {
1.75 frystyk 1360: char *str = NULL;
1361: HTChunk * chunk;
1362: HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
1363: HTRequest *request = HTRequest_new();
1364: HTRequest_setOutputFormat(request, WWW_SOURCE);
1365: HTRequest_setPreemptive(request, YES);
1366: HTRequest_setMethod(request, METHOD_GET);
1367: chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
1368: str = HTChunk_toCString(chunk);
1369: HTRequest_delete(request);
1370: return str;
1.48 frystyk 1371: }
1372:
Webmaster