Annotation of libwww/Robot/src/HTRobot.c, revision 1.79
1.75 frystyk 1: /*
1.79 ! br334 2: ** @(#) $Id: HTRobot.c,v 1.78 1998/11/01 15:52:00 frystyk Exp $
1.75 frystyk 3: **
4: ** W3C Webbot can be found at "http://www.w3.org/Robot/"
5: **
6: ** Copyright 1995-1998 World Wide Web Consortium, (Massachusetts
7: ** Institute of Technology, Institut National de Recherche en
8: ** Informatique et en Automatique, Keio University). All Rights
9: ** Reserved. This program is distributed under the W3C's Software
10: ** Intellectual Property License. This program is distributed in the hope
11: ** that it will be useful, but WITHOUT ANY WARRANTY; without even the
12: ** implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13: ** PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
14: ** details.
1.1 frystyk 15: **
16: ** Authors:
17: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
1.75 frystyk 18: ** BR Bob Racko
19: ** JP John Punin
1.1 frystyk 20: **
21: ** History:
22: ** Dec 04 95 First version
1.75 frystyk 23: ** Oct 1998 Split into separate files
1.1 frystyk 24: */
25:
1.75 frystyk 26: #include "HTRobMan.h"
27: #include "HTQueue.h"
28: #include "HTAncMan.h"
1.51 frystyk 29:
1.62 frystyk 30: #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
31: #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
1.1 frystyk 32:
1.75 frystyk 33: PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};
1.58 frystyk 34:
35: /*
36: ** Some sorting algorithms
37: */
1.63 frystyk 38: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 39:
1.1 frystyk 40: PUBLIC HText * HTMainText = NULL;
41: PUBLIC HTParentAnchor * HTMainAnchor = NULL;
42: PUBLIC HTStyleSheet * styleSheet = NULL;
43:
44: /* ------------------------------------------------------------------------- */
45:
1.13 eric 46: /* Standard (non-error) Output
47: ** ---------------------------
48: */
49: PUBLIC int OutputData(const char * fmt, ...)
50: {
51: int ret;
52: va_list pArgs;
53: va_start(pArgs, fmt);
54: ret = vfprintf(stdout, fmt, pArgs);
55: va_end(pArgs);
56: return ret;
57: }
58:
59: /* ------------------------------------------------------------------------- */
60:
1.2 frystyk 61: /* Create a "HyperDoc" object
62: ** --------------------------
63: ** A HyperDoc object contains information about whether we have already
64: ** started checking the anchor and the depth in our search
65: */
1.75 frystyk 66: PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
1.2 frystyk 67: {
68: HyperDoc * hd;
1.14 frystyk 69: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
70: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 71: hd->depth = depth;
1.55 frystyk 72: hd->hits = 1;
1.75 frystyk 73:
74: hd->code = -1;
75: hd->index = ++mr->cindex;
76:
1.2 frystyk 77: /* Bind the HyperDoc object together with the Anchor Object */
78: hd->anchor = anchor;
79: HTAnchor_setDocument(anchor, (void *) hd);
80:
81: /* Add this HyperDoc object to our list */
82: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
83: HTList_addObject(mr->hyperdoc, (void *) hd);
84: return hd;
85: }
86:
87: /* Delete a "HyperDoc" object
88: ** --------------------------
89: */
1.75 frystyk 90: PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
1.2 frystyk 91: {
92: if (hd) {
1.11 frystyk 93: HT_FREE (hd);
1.2 frystyk 94: return YES;
95: }
96: return NO;
97: }
98:
1.55 frystyk 99: /*
100: ** Sort the anchor array and log reference count
101: */
102: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
103: {
104: if (mr && array) {
105: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
106: if (log) {
107: void ** data = NULL;
108: HTParentAnchor * anchor = NULL;
109: HTArray_sort(array, HitSort);
110: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
111: while (anchor) {
112: char * uri = HTAnchor_address((HTAnchor *) anchor);
113: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 114: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 115: HT_FREE(uri);
116: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
117: }
118: }
119: HTLog_close(log);
120: return YES;
121: }
122: return NO;
123: }
124:
125: PRIVATE int HitSort (const void * a, const void * b)
126: {
127: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
128: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
129: if (aa && bb) return (bb->hits - aa->hits);
130: return bb - aa;
131: }
132:
1.58 frystyk 133: /*
1.64 frystyk 134: ** Sort the anchor array and log link relations
135: */
136: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
137: {
138: if (mr && array) {
1.68 frystyk 139: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
140: void ** data = NULL;
141: HTParentAnchor * anchor = NULL;
142: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
143: while (anchor) {
144:
145: /*
146: ** If we have a specific link relation to look for then do this.
147: ** Otherwise look for all link relations.
148: */
149: if (mr->relation) {
150: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
151: if (link) {
152: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
153: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
154: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
155: if (src_uri && dest_uri) {
156: #ifdef HT_MYSQL
157: if (mr->sqllog) {
158: HTSQLLog_addLinkRelationship (mr->sqllog,
159: src_uri, dest_uri,
160: HTAtom_name(mr->relation),
161: NULL);
162: }
163: #endif
164: if (log) {
165: HTFormat format = HTAnchor_format(dest);
166: HTLog_addText(log, "%s %s %s --> %s\n",
167: HTAtom_name(mr->relation),
168: format != WWW_UNKNOWN ?
169: HTAtom_name(format) : "<unknown>",
170: src_uri, dest_uri);
171: }
172:
173: /* Cleanup */
174: HT_FREE(src_uri);
175: HT_FREE(dest_uri);
176: }
177: }
178: } else {
179: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
180: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
181: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
182: HTLinkType linktype;
183:
184: /* First look in the main link */
185: if (link && (linktype = HTLink_type(link))) {
186: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
187: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
188: if (src_uri && dest_uri) {
189: #ifdef HT_MYSQL
190: if (mr->sqllog) {
191: HTSQLLog_addLinkRelationship (mr->sqllog,
192: src_uri, dest_uri,
193: HTAtom_name(linktype),
194: NULL);
195: }
196: #endif
197: if (log) {
198: HTFormat format = HTAnchor_format(dest);
199: HTLog_addText(log, "%s %s %s --> %s\n",
200: HTAtom_name(linktype),
201: format != WWW_UNKNOWN ?
202: HTAtom_name(format) : "<unknown>",
203: src_uri, dest_uri);
204: }
205: }
206: HT_FREE(dest_uri);
207: }
208:
209: /* and then in any sublinks */
210: if (sublinks) {
211: HTLink * pres;
212: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
213: if ((linktype = HTLink_type(pres))) {
214: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 215: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 frystyk 216: if (src_uri && dest_uri) {
217: #ifdef HT_MYSQL
218: if (mr->sqllog) {
219: HTSQLLog_addLinkRelationship (mr->sqllog,
220: src_uri, dest_uri,
221: HTAtom_name(linktype),
222: NULL);
223: }
224: #endif
225: if (log) {
226: HTFormat format = HTAnchor_format(dest);
227: HTLog_addText(log, "%s %s %s --> %s\n",
228: HTAtom_name(linktype),
229: format != WWW_UNKNOWN ?
230: HTAtom_name(format) : "<unknown>",
231: src_uri, dest_uri);
232: }
1.64 frystyk 233: HT_FREE(dest_uri);
234: }
235: }
236: }
237: }
1.68 frystyk 238:
239: /* Cleanup */
240: HT_FREE(src_uri);
241: }
242: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 243: }
1.68 frystyk 244: if (log) HTLog_close(log);
1.64 frystyk 245: return YES;
246: }
247: return NO;
248: }
249:
250: /*
1.63 frystyk 251: ** Sort the anchor array and log last modified date
252: */
253: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
254: {
255: if (mr && array) {
256: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
257: if (log) {
258: void ** data = NULL;
259: HTParentAnchor * anchor = NULL;
260: HTArray_sort(array, LastModifiedSort);
261: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
262: while (anchor) {
263: char * uri = HTAnchor_address((HTAnchor *) anchor);
264: time_t lm = HTAnchor_lastModified(anchor);
265: if (uri && lm > 0)
266: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
267: HT_FREE(uri);
268: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
269: }
270: }
271: HTLog_close(log);
272: return YES;
273: }
274: return NO;
275: }
276:
277: PRIVATE int LastModifiedSort (const void * a, const void * b)
278: {
279: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
280: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
281: return bb - aa;
282: }
283:
284: /*
285: ** Sort the anchor array and log the document title
286: */
287: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
288: {
289: if (mr && array) {
290: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
291: if (log) {
292: void ** data = NULL;
293: HTParentAnchor * anchor = NULL;
294: HTArray_sort(array, TitleSort);
295: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
296: while (anchor) {
297: char * uri = HTAnchor_address((HTAnchor *) anchor);
298: const char * title = HTAnchor_title(anchor);
299: HTCharset charset = HTAnchor_charset(anchor);
300: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
301: charset ? HTAtom_name(charset) : "<none>",
302: title ? title : "<none>",
303: uri);
304: HT_FREE(uri);
305: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
306: }
307: }
308: HTLog_close(log);
309: return YES;
310: }
311: return NO;
312: }
313:
314: PRIVATE int TitleSort (const void * a, const void * b)
315: {
316: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
317: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
318: return strcasecomp(bb?bb:"", aa?aa:"");
319: }
320:
321: /*
1.58 frystyk 322: ** Calculate distributions for media types. The same mechanism
323: ** can be used for other characteristics with relatively
324: ** few outcomes.
325: */
326: PRIVATE HTList * mediatype_distribution (HTArray * array)
327: {
328: if (array) {
329: HTList * mt = HTList_new();
330: MetaDist * pres = NULL;
331: void ** data = NULL;
332: HTParentAnchor * anchor = NULL;
333: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
334: while (anchor) {
335: HTFormat format = HTAnchor_format(anchor);
336: if (format && format != WWW_UNKNOWN) {
337: HTList * cur = mt;
338:
339: /* If found then increase counter */
340: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
341: if (pres->name == format) {
342: pres->hits++;
343: break;
344: }
345: }
346:
347: /* If not found then add new format to list */
348: if (!pres) {
349: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
350: HT_OUTOFMEM("mediatype_distribution");
351: pres->name = format;
352: pres->hits = 1;
353: HTList_addObject(mt, pres);
354: HTList_insertionSort(mt, FormatSort);
355: }
356: }
357:
358: /* Find next anchor in array */
359: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
360: }
361: return mt;
362: }
363: return NULL;
364: }
365:
1.60 frystyk 366: /*
367: ** Calculate distributions for charsets. The same mechanism
368: ** can be used for other characteristics with relatively
369: ** few outcomes.
370: */
371: PRIVATE HTList * charset_distribution (HTArray * array)
372: {
373: if (array) {
374: HTList * cs = HTList_new();
375: MetaDist * pres = NULL;
376: void ** data = NULL;
377: HTParentAnchor * anchor = NULL;
378: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
379: while (anchor) {
380: HTCharset charset = HTAnchor_charset(anchor);
381: if (charset) {
382: HTList * cur = cs;
383:
384: /* If found then increase counter */
385: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
386: if (pres->name == charset) {
387: pres->hits++;
388: break;
389: }
390: }
391:
392: /* If not found then add new format to list */
393: if (!pres) {
394: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
395: HT_OUTOFMEM("charset_distribution");
396: pres->name = charset;
397: pres->hits = 1;
398: HTList_addObject(cs, pres);
399: HTList_insertionSort(cs, FormatSort);
400: }
401: }
402:
403: /* Find next anchor in array */
404: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
405: }
406: return cs;
407: }
408: return NULL;
409: }
410:
1.58 frystyk 411: PRIVATE int FormatSort (const void * a, const void * b)
412: {
413: MetaDist * aa = (MetaDist *) a;
414: MetaDist * bb = (MetaDist *) b;
415: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
416: }
417:
418: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
419: {
420: if (logfile && distribution) {
421: HTLog * log = HTLog_open(logfile, YES, YES);
422: if (log) {
423: HTList * cur = distribution;
424: MetaDist * pres;
425: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
426: if (pres->name) {
1.60 frystyk 427: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 428: }
429: }
430: HTLog_close(log);
431: }
432: }
433: return NO;
434: }
435:
436: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
437: {
438: if (distribution) {
439: HTList * cur = distribution;
440: MetaDist * pres;
441: while ((pres = (MetaDist *) HTList_nextObject(cur)))
442: HT_FREE(pres);
443: HTList_delete(distribution);
444: return YES;
445: }
446: return NO;
447: }
448:
449:
1.55 frystyk 450: /* Statistics
451: ** ----------
452: ** Calculates a bunch of statistics for the anchors traversed
453: */
454: PRIVATE BOOL calculate_statistics (Robot * mr)
455: {
1.59 frystyk 456: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 457: if (!mr) return NO;
458:
459: /* Calculate efficiency */
1.59 frystyk 460: if (mr->time > 0) {
1.56 frystyk 461: ms_t t = HTGetTimeInMillis() - mr->time;
462: if (t > 0) {
1.60 frystyk 463: double loadfactor = (mr->get_bytes / (t * 0.001));
464: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 465: double secs = t / 1000.0;
1.55 frystyk 466: char bytes[50];
1.62 frystyk 467: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 468: HTTrace("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 469: total_docs, secs, reqprsec);
1.59 frystyk 470:
471: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 472: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 473: HTTrace("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 474: mr->get_docs, bytes, loadfactor);
1.59 frystyk 475:
476: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 477: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 478: HTTrace("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 479: mr->head_docs, bytes);
1.55 frystyk 480: }
481: }
482:
483: /* Create an array of existing anchors */
1.59 frystyk 484: if (total_docs > 1) {
485: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 486: if (array) {
487:
1.63 frystyk 488: /* Distributions */
489: if (mr->flags & MR_DISTRIBUTIONS) {
1.64 frystyk 490: if (SHOW_REAL_QUIET(mr)) HTTrace("\nDistributions:\n");
1.63 frystyk 491: }
492:
1.55 frystyk 493: /* Sort after hit counts */
1.63 frystyk 494: if (mr->hitfile) {
495: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 496: HTTrace("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 497: mr->hitfile);
498: calculate_hits(mr, array);
499: }
500:
1.64 frystyk 501: /* Sort after link relations */
1.68 frystyk 502: #ifdef HT_MYSQL
503: if (mr->relfile || mr->sqllog) {
1.69 frystyk 504: #else
505: if (mr->relfile) {
506: #endif
1.68 frystyk 507: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.64 frystyk 508: HTTrace("\tLogged link relationship distribution in file `%s\'\n",
509: mr->relfile);
510: calculate_linkRelations(mr, array);
511: }
512:
1.63 frystyk 513: /* Sort after modified date */
514: if (mr->lmfile) {
515: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 516: HTTrace("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 517: mr->lmfile);
518: calculate_lm(mr, array);
519: }
520:
521: /* Sort after title */
522: if (mr->titlefile) {
523: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 524: HTTrace("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 525: mr->titlefile);
526: calculate_title(mr, array);
527: }
1.55 frystyk 528:
1.58 frystyk 529: /* Find mediatype distribution */
530: if (mr->mtfile) {
531: HTList * mtdist = mediatype_distribution(array);
532: if (mtdist) {
1.63 frystyk 533: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 534: HTTrace("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 535: mr->mtfile);
1.58 frystyk 536: log_meta_distribution(mr->mtfile, mtdist);
537: delete_meta_distribution(mtdist);
538: }
539: }
1.55 frystyk 540:
1.60 frystyk 541: /* Find charset distribution */
542: if (mr->charsetfile) {
543: HTList * charsetdist = charset_distribution(array);
544: if (charsetdist) {
1.63 frystyk 545: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 546: HTTrace("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 547: mr->charsetfile);
1.60 frystyk 548: log_meta_distribution(mr->charsetfile, charsetdist);
549: delete_meta_distribution(charsetdist);
550: }
551: }
552:
1.55 frystyk 553: /* Add as may other stats here as you like */
1.60 frystyk 554: /* ... */
1.58 frystyk 555:
556: /* Delete the array */
1.55 frystyk 557: HTArray_delete(array);
558: }
559: }
560: return YES;
561: }
562:
1.75 frystyk 563: PRIVATE HTParentAnchor *
564: get_last_parent(HTParentAnchor *anchor)
565: {
566: HTAnchor *anc;
567: HTList *sources = anchor->sources;
568:
569: while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL)
570: {
571: HTParentAnchor *panchor = HTAnchor_parent(anc);
572: return panchor;
573: }
574: return NULL;
575: }
576:
577: PRIVATE void
578: set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
579: {
580: HTList * cur = HTRequest_error(request);
581: HTError *pres;
582:
583: while((pres = (HTError *) HTList_nextObject(cur)) != NULL)
584: {
585: int code =HTErrors[HTError_index(pres)].code;
586:
587: hd->code = code;
588: }
589: }
590:
591:
592: PRIVATE int
593: test_for_blank_spaces(char *uri)
594: {
595: char *ptr = uri;
596: for(;*ptr!='\0';ptr++)
597: if(*ptr == ' ')
598: return 1;
599: return 0;
600: }
601:
602:
1.1 frystyk 603: /* Create a Command Line Object
604: ** ----------------------------
605: */
1.75 frystyk 606: PUBLIC Robot * Robot_new (void)
1.1 frystyk 607: {
608: Robot * me;
1.41 frystyk 609: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 610: HT_OUTOFMEM("Robot_new");
1.2 frystyk 611: me->hyperdoc = HTList_new();
1.4 frystyk 612: me->htext = HTList_new();
1.74 frystyk 613: me->timer = DEFAULT_TIMEOUT*MILLIES;
1.75 frystyk 614: me->waits = 0;
1.25 frystyk 615: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 616: me->output = OUTPUT;
1.35 eric 617: me->cnt = 0;
1.75 frystyk 618: me->ndoc = -1;
1.34 eric 619: me->fingers = HTList_new();
1.75 frystyk 620:
621: /* This is new */
622: me->queue = HTQueue_new();
623: me->cq = 0;
624: me->furl = NULL;
625:
1.1 frystyk 626: return me;
627: }
628:
629: /* Delete a Command Line Object
630: ** ----------------------------
631: */
1.62 frystyk 632: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 633: {
1.62 frystyk 634: if (mr) {
635: HTList_delete(mr->fingers);
1.55 frystyk 636:
637: /* Calculate statistics */
1.62 frystyk 638: calculate_statistics(mr);
1.55 frystyk 639:
1.62 frystyk 640: if (mr->hyperdoc) {
641: HTList * cur = mr->hyperdoc;
1.2 frystyk 642: HyperDoc * pres;
643: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
644: HyperDoc_delete(pres);
1.62 frystyk 645: HTList_delete(mr->hyperdoc);
1.2 frystyk 646: }
1.62 frystyk 647: if (mr->htext) {
648: HTList * cur = mr->htext;
1.4 frystyk 649: HText * pres;
650: while ((pres = (HText *) HTList_nextObject(cur)))
651: HText_free(pres);
1.62 frystyk 652: HTList_delete(mr->htext);
1.4 frystyk 653: }
1.62 frystyk 654:
655: /* Close all the log files */
1.63 frystyk 656: if (mr->flags & MR_LOGGING) {
1.64 frystyk 657: if (SHOW_REAL_QUIET(mr)) HTTrace("\nRaw Log files:\n");
1.63 frystyk 658: }
659:
1.62 frystyk 660: if (mr->log) {
661: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 662: HTTrace("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 663: HTLog_accessCount(mr->log), mr->logfile);
664: HTLog_close(mr->log);
665: }
666: if (mr->ref) {
667: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 668: HTTrace("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 669: HTLog_accessCount(mr->ref), mr->reffile);
670: HTLog_close(mr->ref);
671: }
672: if (mr->reject) {
673: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 674: HTTrace("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 675: HTLog_accessCount(mr->reject), mr->rejectfile);
676: HTLog_close(mr->reject);
677: }
678: if (mr->notfound) {
679: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 680: HTTrace("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 681: HTLog_accessCount(mr->notfound), mr->notfoundfile);
682: HTLog_close(mr->notfound);
683: }
684: if (mr->conneg) {
685: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 686: HTTrace("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 687: HTLog_accessCount(mr->conneg), mr->connegfile);
688: HTLog_close(mr->conneg);
689: }
690: if (mr->noalttag) {
691: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 692: HTTrace("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 693: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
694: HTLog_close(mr->noalttag);
695: }
696:
697: if (mr->output && mr->output != STDOUT) fclose(mr->output);
698:
699: if (mr->flags & MR_TIME) {
1.12 frystyk 700: time_t local = time(NULL);
1.62 frystyk 701: if (SHOW_REAL_QUIET(mr))
1.64 frystyk 702: HTTrace("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 703: }
1.55 frystyk 704:
1.75 frystyk 705: /* This is new */
706: if(mr->cdepth)
707: HT_FREE(mr->cdepth);
708: if(mr->furl) HT_FREE(mr->furl);
709:
1.58 frystyk 710: #ifdef HT_POSIX_REGEX
1.62 frystyk 711: if (mr->include) {
712: regfree(mr->include);
713: HT_FREE(mr->include);
714: }
715: if (mr->exclude) {
716: regfree(mr->exclude);
717: HT_FREE(mr->exclude);
718: }
1.75 frystyk 719: if (mr->exc_robot) {
720: regfree(mr->exc_robot);
721: HT_FREE(mr->exc_robot);
722: }
1.62 frystyk 723: if (mr->check) {
724: regfree(mr->check);
725: HT_FREE(mr->check);
1.58 frystyk 726: }
727: #endif
728:
1.68 frystyk 729: #ifdef HT_MYSQL
730: if (mr->sqllog) {
731: HTSQLLog_close(mr->sqllog);
732: mr->sqllog = NULL;
733: }
734: #endif
735:
1.62 frystyk 736: HT_FREE(mr->cwd);
737: HT_FREE(mr->prefix);
738: HT_FREE(mr->img_prefix);
739: HT_FREE(mr);
1.1 frystyk 740: return YES;
741: }
742: return NO;
743: }
744:
1.2 frystyk 745: /*
1.34 eric 746: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 747: */
1.75 frystyk 748: PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 749: {
1.34 eric 750: Finger * me;
751: HTRequest * request = HTRequest_new();
752: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
753: HT_OUTOFMEM("Finger_new");
754: me->robot = robot;
755: me->request = request;
756: me->dest = dest;
757: HTList_addObject(robot->fingers, (void *)me);
758:
1.48 frystyk 759: /* Set the context for this request */
1.34 eric 760: HTRequest_setContext (request, me);
1.48 frystyk 761:
762: /* Check the various flags to customize the request */
763: if (robot->flags & MR_PREEMPTIVE)
764: HTRequest_setPreemptive(request, YES);
765: if (robot->flags & MR_VALIDATE)
766: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
767: if (robot->flags & MR_END_VALIDATE)
768: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
769:
770: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 771: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 772:
773: /* Set the method for this request */
1.34 eric 774: HTRequest_setMethod(request, method);
775: robot->cnt++;
776: return me;
1.2 frystyk 777: }
778:
1.34 eric 779: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 780: {
1.34 eric 781: HTList_removeObject(me->robot->fingers, (void *)me);
782: me->robot->cnt--;
1.37 frystyk 783:
784: /*
785: ** If we are down at one request then flush the output buffer
786: */
787: if (me->request) {
788: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 789: HTRequest_delete(me->request);
1.37 frystyk 790: }
791:
792: /*
793: ** Delete the request and free myself
794: */
1.34 eric 795: HT_FREE(me);
796: return YES;
1.2 frystyk 797: }
798:
799: /*
800: ** Cleanup and make sure we close all connections including the persistent
801: ** ones
802: */
1.75 frystyk 803: PUBLIC void Cleanup (Robot * me, int status)
1.1 frystyk 804: {
805: Robot_delete(me);
1.29 eric 806: HTProfile_delete();
1.50 frystyk 807: #ifdef HT_MEMLOG
1.39 eric 808: HTMemLog_close();
1.47 frystyk 809: #endif
810:
1.1 frystyk 811: #ifdef VMS
812: exit(status ? status : 1);
813: #else
814: exit(status ? status : 0);
815: #endif
816: }
817:
#ifdef CATCH_SIG
#include <signal.h>
/*  SetSignal
**  Installs the signal disposition used by the robot.  May be skipped by
**  applications that set up their own handlers (lossage on SVR4).
*/
PUBLIC void SetSignal (void)
{
    /*
    **  On some systems (SYSV) SIGPIPE has to be dealt with when attempting
    **  to connect to a remote host where one would normally just get
    **  `connection refused' back - so the signal is ignored.
    */
    BOOL ignored = (signal(SIGPIPE, SIG_IGN) != SIG_ERR);

    if (!ignored) {
        if (PROT_TRACE) HTTrace("HTSignal.... Can't catch SIGPIPE\n");
    } else {
        if (PROT_TRACE) HTTrace("HTSignal.... Ignoring SIGPIPE\n");
    }

#ifdef HT_MEMLOG
    HTMemLog_flush();
#endif
}
#endif /* CATCH_SIG */
842:
1.58 frystyk 843: #ifdef HT_POSIX_REGEX
844: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
845: {
846: size_t length = regerror (errcode, compiled, NULL, 0);
847: char * str = NULL;
848: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
849: HT_OUTOFMEM("get_regerror");
850: (void) regerror (errcode, compiled, str, length);
851: return str;
852: }
853:
1.75 frystyk 854: PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 855: {
856: regex_t * regex = NULL;
857: if (regex_str && *regex_str) {
858: int status;
859: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
860: HT_OUTOFMEM("get_regtype");
1.60 frystyk 861: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 862: char * err_msg = get_regerror(status, regex);
1.62 frystyk 863: if (SHOW_REAL_QUIET(mr))
864: HTTrace("Regular expression error: %s\n", err_msg);
1.58 frystyk 865: HT_FREE(err_msg);
866: Cleanup(mr, -1);
867: }
868: }
869: return regex;
870: }
871: #endif
872:
1.75 frystyk 873: PUBLIC void VersionInfo (void)
1.1 frystyk 874: {
1.75 frystyk 875: OutputData("\nW3C OpenSource Software");
876: OutputData("\n-----------------------\n\n");
877: OutputData("\tWebbot version %s\n", APP_VERSION);
878: OutputData("\tusing the W3C libwww library version %s.\n\n",HTLib_version());
879: OutputData("\tSee \"%s\" for help\n", COMMAND_LINE);
880: OutputData("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
881: OutputData("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");
882: OutputData("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
883: OutputData("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
1.1 frystyk 884: }
885:
886: /* terminate_handler
887: ** -----------------
1.2 frystyk 888: ** This function is registered to handle the result of the request.
889: ** If no more requests are pending then terminate program
1.1 frystyk 890: */
1.75 frystyk 891: PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
1.32 frystyk 892: void * param, int status)
1.1 frystyk 893: {
1.34 eric 894: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 895: Robot * mr = finger->robot;
1.62 frystyk 896: if (SHOW_QUIET(mr)) HTTrace("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 897:
1.68 frystyk 898: #ifdef HT_MYSQL
899: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
900: #endif
901:
1.58 frystyk 902: /* Check if negotiated resource and whether we should log that*/
903: if (mr->conneg) {
904: HTAssocList * cur = HTResponse_variant(response);
905: if (cur) {
906: BOOL first = YES;
907: HTChunk * buffer = HTChunk_new(128);
908: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
909: HTAssoc * pres;
1.60 frystyk 910: HTChunk_puts(buffer, uri);
1.58 frystyk 911: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
912: char * value = HTAssoc_value(pres);
913: if (first) {
1.60 frystyk 914: HTChunk_puts(buffer, "\t(");
1.58 frystyk 915: first = NO;
916: } else
917: HTChunk_puts(buffer, ", ");
918:
919: /* Output the name */
920: HTChunk_puts(buffer, HTAssoc_name(pres));
921:
922: /* Only output the value if not empty string */
1.60 frystyk 923: if (value && *value) {
1.58 frystyk 924: HTChunk_puts(buffer, "=");
925: HTChunk_puts(buffer, value);
926: }
927: }
1.60 frystyk 928: if (!first) HTChunk_puts(buffer, ")");
929: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 930: HTChunk_delete(buffer);
931: HT_FREE(uri);
932: }
933: }
934:
1.55 frystyk 935: /* Count the amount of body data that we have read */
1.59 frystyk 936: if (HTRequest_method(request) == METHOD_GET) {
937: int length = HTAnchor_length(HTRequest_anchor(request));
938: if (length > 0) mr->get_bytes += length;
939: mr->get_docs++;
940: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 941: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 942: if (length > 0) mr->head_bytes += length;
943: mr->head_docs++;
944: } else {
945: mr->other_docs++;
1.55 frystyk 946: }
947:
1.78 frystyk 948: if (!(mr->flags & MR_BFS)) {
949:
950: /* Delete this thread */
951: Finger_delete(finger);
952:
953: /* Should we stop? */
954: if (mr->cnt <= 0) {
955: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
956: Cleanup(mr, 0); /* No way back from here */
957: }
958: }
959:
1.75 frystyk 960: if (SHOW_QUIET(mr)) HTTrace(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
961: return HT_OK;
962:
963: }
964: PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,
965: void * param, int status)
966: {
967: Finger * finger = (Finger *) HTRequest_context(request);
968: Robot * mr = finger->robot;
969: HTParentAnchor * dest = finger->dest;
970: HyperDoc * hd = HTAnchor_document(dest);
971: int depth = (hd ? hd->depth : -1);
972:
973: if (hd) set_error_state_hyperdoc(hd,request);
974:
975: if(hd && (HTRequest_method(request)== METHOD_HEAD) &&
976: (depth < mr->depth))
977: {
978: hd->method = METHOD_GET;
979: HTQueue_append(mr->queue, (void *)hd); (mr->cq)++;
980: }
1.58 frystyk 981:
1.34 eric 982: Finger_delete(finger);
1.55 frystyk 983:
1.75 frystyk 984: if(!(mr->flags & MR_PREEMPTIVE))
985: Serving_queue(mr);
986:
987: return HT_OK;
988: }
989:
990:
991: PUBLIC void Serving_queue(Robot *mr)
992: {
993: BOOL abort = NO;
994: Finger *nfinger;
995:
996: while(!abort)
997: {
998: if(!HTQueue_isEmpty(mr->queue))
999: {
1000: HTRequest *newreq;
1001:
1002: HyperDoc *nhd = (HyperDoc *)HTQueue_headOfQueue(mr->queue);
1003:
1004: if(nhd)
1005: {
1006: char *uri = HTAnchor_address((HTAnchor *)nhd->anchor);
1007: HTQueue_dequeue(mr->queue); (mr->cq)--;
1008:
1009: nfinger = Finger_new(mr, nhd->anchor, nhd->method);
1010:
1011: newreq = nfinger->request;
1012:
1013: if(SHOW_QUIET(mr)) HTTrace("Request from QUEUE %s\n",uri);
1014: HT_FREE(uri);
1015: if(SHOW_QUIET(mr)) HTTrace("%d elements in queue \n", mr->cq);
1016:
1017: HTRequest_setParent(newreq,get_last_parent(nhd->anchor));
1018:
1.76 frystyk 1019: /* @@@ Should be done using a timer and not sleep! @@@ */
1020: #if 0
1.75 frystyk 1021: if(mr->waits)
1022: sleep(mr->waits);
1.76 frystyk 1023: #endif
1.75 frystyk 1024:
1025: if (HTLoadAnchor((HTAnchor *)nhd->anchor , newreq) != YES)
1026: {
1027: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1028: Finger_delete(nfinger);
1029: }
1030: }
1031: else
1032: abort = YES;
1033: }
1034: else
1035: abort = YES;
1036: }
1037:
1038: if(SHOW_QUIET(mr)) HTTrace("Queue size: %d \n", mr->cq);
1039:
1040: if (mr->cnt <= 0 || (abort && (mr->flags & MR_PREEMPTIVE)))
1041: {
1042: if(mr->cnt > 0)
1043: if(SHOW_QUIET(mr)) HTTrace("%d requests were not served\n", mr->cnt);
1044:
1.62 frystyk 1045: if (SHOW_QUIET(mr)) HTTrace(" Everything is finished...\n");
1.46 eric 1046: Cleanup(mr, 0); /* No way back from here */
1.75 frystyk 1047: }
1.1 frystyk 1048: }
1049:
1050: /* ------------------------------------------------------------------------- */
1051: /* HTEXT INTERFACE */
1052: /* ------------------------------------------------------------------------- */
1053:
1054: PUBLIC HText * HText_new2 (HTRequest * request, HTParentAnchor * anchor,
1055: HTStream * stream)
1056: {
1057: HText * me;
1.34 eric 1058: Finger * finger = (Finger *) HTRequest_context(request);
1059: Robot * mr = finger->robot;
1.65 frystyk 1060: char * robots = NULL;
1061:
1.14 frystyk 1062: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1063: HT_OUTOFMEM("HText_new2");
1.4 frystyk 1064:
1065: /* Bind the HText object together with the Request Object */
1.1 frystyk 1066: me->request = request;
1.65 frystyk 1067: me->follow = YES;
1068:
1069: /* Check to see if we have any meta tags */
1.77 frystyk 1070: if (!(mr->flags & MR_NOMETATAGS) && (robots = HTAnchor_robots(anchor)) != NULL) {
1.65 frystyk 1071: char * strval = NULL;
1072: char * ptr = NULL;
1073: char * token = NULL;
1074: StrAllocCopy(strval, robots);
1075: ptr = strval;
1076: while ((token = HTNextField(&ptr)) != NULL) {
1077: if (!strcasecomp(token, "nofollow")) {
1078: me->follow = NO;
1079: break;
1080: }
1081: }
1082: HT_FREE(strval);
1083: }
1.4 frystyk 1084:
1085: /* Add this HyperDoc object to our list */
1086: if (!mr->htext) mr->htext = HTList_new();
1087: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1088: return me;
1089: }
1090:
1.4 frystyk 1091: PUBLIC void HText_free (HText * me) {
1.11 frystyk 1092: if (me) HT_FREE (me);
1.4 frystyk 1093: }
1094:
1.1 frystyk 1095: PUBLIC void HText_beginAnchor (HText * text, HTChildAnchor * anchor)
1096: {
1097: if (text && anchor) {
1.34 eric 1098: Finger * finger = (Finger *) HTRequest_context(text->request);
1099: Robot * mr = finger->robot;
1.1 frystyk 1100: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1101: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1102: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1103: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1104: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1105: BOOL match = text->follow;
1.58 frystyk 1106: BOOL check = NO;
1.1 frystyk 1107:
1.75 frystyk 1108: /* These are new variables */
1109: HyperDoc * nhd = NULL;
1110: BOOL follow = YES;
1111:
1112: /* These three variables were moved */
1113: /*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
1114: HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
1115: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1116: int depth = last_doc ? last_doc->depth+1 : 0;
1117:
1.55 frystyk 1118: if (!uri) return;
1.75 frystyk 1119: if (SHOW_QUIET(mr)) HTTrace("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");
1.55 frystyk 1120:
1121: if (hd) {
1.75 frystyk 1122: if (SHOW_QUIET(mr)) HTTrace("............ Already checked\n");
1.55 frystyk 1123: hd->hits++;
1.68 frystyk 1124: #ifdef HT_MYSQL
1125: if (mr->sqllog) {
1126: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1127: if (ref_addr) {
1128: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1129: "referer", NULL);
1130: HT_FREE(ref_addr);
1131: }
1132: }
1133: #endif
1.58 frystyk 1134: HT_FREE(uri);
1135: return;
1136: }
1.70 frystyk 1137:
1.58 frystyk 1138: /* Check for prefix match */
1.65 frystyk 1139: if (match && mr->prefix) {
1140: match = HTStrMatch(mr->prefix, uri) ? YES : NO;
1141: }
1.58 frystyk 1142:
1143: #ifdef HT_POSIX_REGEX
1.69 frystyk 1144: /*
1145: ** Check for any regular expression. The include may override
1146: ** the prefix matching
1147: */
1148: if (mr->include) {
1.58 frystyk 1149: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
1150: }
1.75 frystyk 1151: if (match && mr->exc_robot) {
1152: match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
1153: }
1.58 frystyk 1154: if (match && mr->exclude) {
1155: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
1156: }
1157: if (match && mr->check) {
1158: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1159: }
1160: #endif
1.75 frystyk 1161: if(uri && test_for_blank_spaces(uri))
1162: follow = NO;
1163: else if (mr->ndoc == 0) /* Number of Documents is reached */
1164: follow = NO;
1165:
1166: /* Test whether we already have a hyperdoc for this document */
1167: if(!hd && dest_parent)
1168: {
1169: nhd = HyperDoc_new(mr, dest_parent, depth);
1170: mr->cdepth[depth]++;
1171: }
1.58 frystyk 1172:
1173: /* Test whether we already have a hyperdoc for this document */
1.78 frystyk 1174: if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {
1175: if (mr->flags & MR_BFS) {
1176: nhd->method = METHOD_HEAD;
1177: HTQueue_enqueue(mr->queue, (void *) nhd);
1178: (mr->cq)++;
1179: if(mr->ndoc > 0) mr->ndoc--;
1180: } else {
1181: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1182: HTRequest * newreq = newfinger->request;
1183: HTRequest_setParent(newreq, referer);
1184: if (check || depth >= mr->depth) {
1185: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d using HEAD\n", depth);
1186: HTRequest_setMethod(newreq, METHOD_HEAD);
1187: } else {
1188: if (SHOW_QUIET(mr)) HTTrace("loading at depth %d\n", depth);
1189: }
1190: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1191: if (SHOW_QUIET(mr)) HTTrace("not tested!\n");
1192: Finger_delete(newfinger);
1193: }
1194: }
1.75 frystyk 1195:
1.7 frystyk 1196: } else {
1.75 frystyk 1197: if (SHOW_QUIET(mr)) HTTrace("............ does not fulfill constraints\n");
1.68 frystyk 1198: #ifdef HT_MYSQL
1199: if (mr->reject || mr->sqllog) {
1200: #else
1.60 frystyk 1201: if (mr->reject) {
1.68 frystyk 1202: #endif
1.60 frystyk 1203: if (referer) {
1204: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1205: if (mr->reject && ref_addr)
1206: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1207: #ifdef HT_MYSQL
1208: if (mr->sqllog && mr->sqlexternals && ref_addr)
1209: HTSQLLog_addLinkRelationship(mr->sqllog,
1210: ref_addr, uri,
1211: "referer", NULL);
1212: #endif
1213:
1.60 frystyk 1214: HT_FREE(ref_addr);
1215: }
1216: }
1.2 frystyk 1217: }
1.11 frystyk 1218: HT_FREE(uri);
1.2 frystyk 1219: }
1220: }
1221:
1222: PUBLIC void HText_appendImage (HText * text, HTChildAnchor * anchor,
1.14 frystyk 1223: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1224: {
1225: if (text && anchor) {
1.34 eric 1226: Finger * finger = (Finger *) HTRequest_context(text->request);
1227: Robot * mr = finger->robot;
1.75 frystyk 1228:
1.59 frystyk 1229: if (mr->flags & MR_IMG) {
1.60 frystyk 1230: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1231: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1232: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1233: HyperDoc * hd = HTAnchor_document(dest_parent);
1234: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1235: BOOL match = YES;
1236:
1.72 frystyk 1237: if (!uri) return;
1.59 frystyk 1238: if (hd) {
1.75 frystyk 1239: if (SHOW_QUIET(mr)) HTTrace("............ Already checked\n");
1.59 frystyk 1240: hd->hits++;
1.68 frystyk 1241: #ifdef HT_MYSQL
1242: if (mr->sqllog) {
1243: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1244: if (ref_addr) {
1245: HTSQLLog_addLinkRelationship(mr->sqllog,
1246: ref_addr, uri,
1247: "image", alt);
1248: HT_FREE(ref_addr);
1249: }
1250: }
1251: #endif
1.11 frystyk 1252: HT_FREE(uri);
1.59 frystyk 1253: return;
1.2 frystyk 1254: }
1.59 frystyk 1255:
1256: /* Check for prefix match */
1257: if (mr->img_prefix) match = HTStrMatch(mr->img_prefix, uri) ? YES : NO;
1258:
1.79 ! br334 1259: #ifdef HT_POSIX_REGEX
! 1260: /*
! 1261: ** Check for any regular expression. The include may override
! 1262: ** the prefix matching
! 1263: */
! 1264: if (mr->include) {
! 1265: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
! 1266: }
! 1267: if (match && mr->exc_robot) {
! 1268: match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
! 1269: }
! 1270: if (match && mr->exclude) {
! 1271: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
! 1272: }
! 1273: #endif
1.59 frystyk 1274: /* Test whether we already have a hyperdoc for this document */
1275: if (match && dest) {
1.60 frystyk 1276: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1277: mr->flags & MR_SAVE ?
1278: METHOD_GET : METHOD_HEAD);
1279: HTRequest * newreq = newfinger->request;
1.60 frystyk 1280: HyperDoc_new(mr, dest_parent, 1);
1281: HTRequest_setParent(newreq, referer);
1282:
1283: /* Check whether we should report missing ALT tags */
1284: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1285: if (referer) {
1286: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1287: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1288: HT_FREE(ref_addr);
1289: }
1290: }
1291:
1.62 frystyk 1292: if (SHOW_QUIET(mr)) HTTrace("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1293: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.62 frystyk 1294: if (SHOW_QUIET(mr)) HTTrace("Robot....... Image not tested!\n");
1.59 frystyk 1295: Finger_delete(newfinger);
1296: }
1297: } else {
1.75 frystyk 1298: if (SHOW_QUIET(mr)) HTTrace("............ does not fulfill constraints\n");
1.68 frystyk 1299: #ifdef HT_MYSQL
1300: if (mr->reject || mr->sqllog) {
1301: #else
1.60 frystyk 1302: if (mr->reject) {
1.68 frystyk 1303: #endif
1.60 frystyk 1304: if (referer) {
1305: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1306: if (mr->reject && ref_addr)
1307: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1308: #ifdef HT_MYSQL
1309: if (mr->sqllog && mr->sqlexternals && ref_addr)
1310: HTSQLLog_addLinkRelationship(mr->sqllog,
1311: ref_addr, uri,
1312: "image", alt);
1313: #endif
1314:
1.60 frystyk 1315: HT_FREE(ref_addr);
1316: }
1317: }
1.1 frystyk 1318: }
1.59 frystyk 1319: HT_FREE(uri);
1.72 frystyk 1320: }
1321: }
1322: }
1323:
1324: PUBLIC void HText_appendLink (HText * text, HTChildAnchor * anchor,
1325: const BOOL * present, const char ** value)
1326: {
1327: if (text && anchor) {
1328: Finger * finger = (Finger *) HTRequest_context(text->request);
1329: Robot * mr = finger->robot;
1330: if (SHOW_QUIET(mr))
1331: HTTrace("Robot....... Received Link element with anchor %p\n", anchor);
1332: HText_beginAnchor(text, anchor);
1333: }
1334: }
1335:
1336: PUBLIC void HText_appendObject (HText * text, int element_number,
1337: const BOOL * present, const char ** value)
1338: {
1339: /* Here we can look for frames, link tags, meta tags etc. */
1340: if (text && text->request) {
1341: Finger * finger = (Finger *) HTRequest_context(text->request);
1342: Robot * mr = finger->robot;
1343:
1344: if (SHOW_QUIET(mr))
1345: HTTrace("Robot....... HText Object %p called with HTML element number %d\n",
1346: text, element_number);
1347:
1348: switch (element_number) {
1349:
1350: case HTML_FRAME:
1351: {
1352: HTChildAnchor * source = HTAnchor_findChildAndLink(
1353: HTRequest_anchor(text->request), /* Parent */
1354: NULL, /* Tag */
1355: present[HTML_FRAME_SRC] ? value[HTML_FRAME_SRC] : NULL, /* Addresss */
1356: NULL); /* Rels */
1357: HText_beginAnchor(text, source);
1358: }
1359: break;
1360:
1361: case HTML_BODY:
1362: {
1363: HTChildAnchor * source = HTAnchor_findChildAndLink(
1364: HTRequest_anchor(text->request), /* Parent */
1365: NULL, /* Tag */
1366: present[HTML_BODY_BACKGROUND] ? value[HTML_BODY_BACKGROUND] : NULL, /* Addresss */
1367: NULL); /* Rels */
1368: HText_appendImage(text, source, NULL, NULL, NO);
1369: }
1370: break;
1371:
1372: default:
1373: break;
1.1 frystyk 1374: }
1375: }
1376: }
1377:
1378: PUBLIC void HText_endAnchor (HText * text) {}
1.14 frystyk 1379: PUBLIC void HText_appendText (HText * text, const char * str) {}
1.1 frystyk 1380: PUBLIC void HText_appendCharacter (HText * text, char ch) {}
1381: PUBLIC void HText_endAppend (HText * text) {}
1382: PUBLIC void HText_setStyle (HText * text, HTStyle * style) {}
1383: PUBLIC void HText_beginAppend (HText * text) {}
1384: PUBLIC void HText_appendParagraph (HText * text) {}
1385:
1.75 frystyk 1386:
1387: PUBLIC char *
1388: get_robots_txt(char *uri)
1.48 frystyk 1389: {
1.75 frystyk 1390: char *str = NULL;
1391: HTChunk * chunk;
1392: HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
1393: HTRequest *request = HTRequest_new();
1394: HTRequest_setOutputFormat(request, WWW_SOURCE);
1395: HTRequest_setPreemptive(request, YES);
1396: HTRequest_setMethod(request, METHOD_GET);
1397: chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
1398: str = HTChunk_toCString(chunk);
1399: HTRequest_delete(request);
1400: return str;
1.48 frystyk 1401: }
1402:
Webmaster