Annotation of libwww/Robot/src/HTRobot.c, revision 1.88
1.75 frystyk 1: /*
1.88 ! frystyk 2: ** @(#) $Id: HTRobot.c,v 1.87 1999/03/07 17:17:19 frystyk Exp $
1.75 frystyk 3: **
4: ** W3C Webbot can be found at "http://www.w3.org/Robot/"
5: **
6: ** Copyright 1995-1998 World Wide Web Consortium, (Massachusetts
7: ** Institute of Technology, Institut National de Recherche en
8: ** Informatique et en Automatique, Keio University). All Rights
9: ** Reserved. This program is distributed under the W3C's Software
10: ** Intellectual Property License. This program is distributed in the hope
11: ** that it will be useful, but WITHOUT ANY WARRANTY; without even the
12: ** implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
13: ** PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
14: ** details.
1.1 frystyk 15: **
16: ** Authors:
17: ** HFN Henrik Frystyk Nielsen, (frystyk@w3.org)
1.75 frystyk 18: ** BR Bob Racko
19: ** JP John Punin
1.1 frystyk 20: **
21: ** History:
22: ** Dec 04 95 First version
1.75 frystyk 23: ** Oct 1998 Split into separate files
1.1 frystyk 24: */
25:
1.75 frystyk 26: #include "HTRobMan.h"
27: #include "HTQueue.h"
28: #include "HTAncMan.h"
1.51 frystyk 29:
1.62 frystyk 30: #define SHOW_QUIET(mr) ((mr) && !((mr)->flags & MR_QUIET))
31: #define SHOW_REAL_QUIET(mr) ((mr) && !((mr)->flags & MR_REAL_QUIET))
1.1 frystyk 32:
1.75 frystyk 33: PRIVATE HTErrorMessage HTErrors[HTERR_ELEMENTS] = {HTERR_ENGLISH_INITIALIZER};
1.58 frystyk 34:
35: /*
36: ** Some sorting algorithms
37: */
1.63 frystyk 38: PRIVATE HTComparer HitSort, FormatSort, LastModifiedSort, TitleSort;
1.58 frystyk 39:
1.80 frystyk 40: /*
41: ** The callbacks that we need from the libwww HTML parser
42: */
43: PRIVATE HText_new RHText_new;
44: PRIVATE HText_delete RHText_delete;
45: PRIVATE HText_foundLink RHText_foundLink;
1.1 frystyk 46:
47: /* ------------------------------------------------------------------------- */
48:
1.2 frystyk 49: /* Create a "HyperDoc" object
50: ** --------------------------
51: ** A HyperDoc object contains information about whether we have already
52: ** started checking the anchor and the depth in our search
53: */
1.75 frystyk 54: PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth)
1.2 frystyk 55: {
56: HyperDoc * hd;
1.14 frystyk 57: if ((hd = (HyperDoc *) HT_CALLOC(1, sizeof(HyperDoc))) == NULL)
58: HT_OUTOFMEM("HyperDoc_new");
1.2 frystyk 59: hd->depth = depth;
1.55 frystyk 60: hd->hits = 1;
1.75 frystyk 61:
1.86 frystyk 62: hd->code = NO_CODE;
1.75 frystyk 63: hd->index = ++mr->cindex;
64:
1.2 frystyk 65: /* Bind the HyperDoc object together with the Anchor Object */
66: hd->anchor = anchor;
67: HTAnchor_setDocument(anchor, (void *) hd);
68:
69: /* Add this HyperDoc object to our list */
70: if (!mr->hyperdoc) mr->hyperdoc = HTList_new();
71: HTList_addObject(mr->hyperdoc, (void *) hd);
72: return hd;
73: }
74:
75: /* Delete a "HyperDoc" object
76: ** --------------------------
77: */
1.75 frystyk 78: PUBLIC BOOL HyperDoc_delete (HyperDoc * hd)
1.2 frystyk 79: {
80: if (hd) {
1.11 frystyk 81: HT_FREE (hd);
1.2 frystyk 82: return YES;
83: }
84: return NO;
85: }
86:
1.55 frystyk 87: /*
88: ** Sort the anchor array and log reference count
89: */
90: PRIVATE BOOL calculate_hits (Robot * mr, HTArray * array)
91: {
92: if (mr && array) {
93: HTLog * log = HTLog_open(mr->hitfile, YES, YES);
94: if (log) {
95: void ** data = NULL;
96: HTParentAnchor * anchor = NULL;
97: HTArray_sort(array, HitSort);
98: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
99: while (anchor) {
100: char * uri = HTAnchor_address((HTAnchor *) anchor);
101: HyperDoc * hd = (HyperDoc *) HTAnchor_document(anchor);
1.63 frystyk 102: if (uri && hd) HTLog_addText(log, "%8d %s\n", hd->hits, uri);
1.55 frystyk 103: HT_FREE(uri);
104: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
105: }
106: }
107: HTLog_close(log);
108: return YES;
109: }
110: return NO;
111: }
112:
113: PRIVATE int HitSort (const void * a, const void * b)
114: {
115: HyperDoc * aa = HTAnchor_document(*(HTParentAnchor **) a);
116: HyperDoc * bb = HTAnchor_document(*(HTParentAnchor **) b);
117: if (aa && bb) return (bb->hits - aa->hits);
118: return bb - aa;
119: }
120:
1.58 frystyk 121: /*
1.64 frystyk 122: ** Sort the anchor array and log link relations
123: */
124: PRIVATE BOOL calculate_linkRelations (Robot * mr, HTArray * array)
125: {
126: if (mr && array) {
1.68 frystyk 127: HTLog * log = mr->relfile ? HTLog_open(mr->relfile, YES, YES) : NULL;
128: void ** data = NULL;
129: HTParentAnchor * anchor = NULL;
130: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
131: while (anchor) {
132:
133: /*
134: ** If we have a specific link relation to look for then do this.
135: ** Otherwise look for all link relations.
136: */
137: if (mr->relation) {
138: HTLink * link = HTAnchor_findLinkType((HTAnchor *) anchor, mr->relation);
139: if (link) {
140: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
141: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
142: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
143: if (src_uri && dest_uri) {
144: #ifdef HT_MYSQL
145: if (mr->sqllog) {
146: HTSQLLog_addLinkRelationship (mr->sqllog,
147: src_uri, dest_uri,
148: HTAtom_name(mr->relation),
149: NULL);
150: }
151: #endif
152: if (log) {
153: HTFormat format = HTAnchor_format(dest);
154: HTLog_addText(log, "%s %s %s --> %s\n",
155: HTAtom_name(mr->relation),
156: format != WWW_UNKNOWN ?
157: HTAtom_name(format) : "<unknown>",
158: src_uri, dest_uri);
159: }
160:
161: /* Cleanup */
162: HT_FREE(src_uri);
163: HT_FREE(dest_uri);
164: }
165: }
166: } else {
167: HTLink * link = HTAnchor_mainLink((HTAnchor *) anchor);
168: HTList * sublinks = HTAnchor_subLinks((HTAnchor *) anchor);
169: char * src_uri = HTAnchor_address((HTAnchor *) anchor);
170: HTLinkType linktype;
171:
172: /* First look in the main link */
173: if (link && (linktype = HTLink_type(link))) {
174: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link));
175: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
176: if (src_uri && dest_uri) {
177: #ifdef HT_MYSQL
178: if (mr->sqllog) {
179: HTSQLLog_addLinkRelationship (mr->sqllog,
180: src_uri, dest_uri,
181: HTAtom_name(linktype),
182: NULL);
183: }
184: #endif
185: if (log) {
186: HTFormat format = HTAnchor_format(dest);
187: HTLog_addText(log, "%s %s %s --> %s\n",
188: HTAtom_name(linktype),
189: format != WWW_UNKNOWN ?
190: HTAtom_name(format) : "<unknown>",
191: src_uri, dest_uri);
192: }
193: }
194: HT_FREE(dest_uri);
195: }
196:
197: /* and then in any sublinks */
198: if (sublinks) {
199: HTLink * pres;
200: while ((pres = (HTLink *) HTList_nextObject(sublinks))) {
201: if ((linktype = HTLink_type(pres))) {
202: HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(pres));
1.64 frystyk 203: char * dest_uri = HTAnchor_address((HTAnchor *) dest);
1.68 frystyk 204: if (src_uri && dest_uri) {
205: #ifdef HT_MYSQL
206: if (mr->sqllog) {
207: HTSQLLog_addLinkRelationship (mr->sqllog,
208: src_uri, dest_uri,
209: HTAtom_name(linktype),
210: NULL);
211: }
212: #endif
213: if (log) {
214: HTFormat format = HTAnchor_format(dest);
215: HTLog_addText(log, "%s %s %s --> %s\n",
216: HTAtom_name(linktype),
217: format != WWW_UNKNOWN ?
218: HTAtom_name(format) : "<unknown>",
219: src_uri, dest_uri);
220: }
1.64 frystyk 221: HT_FREE(dest_uri);
222: }
223: }
224: }
225: }
1.68 frystyk 226:
227: /* Cleanup */
228: HT_FREE(src_uri);
229: }
230: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
1.64 frystyk 231: }
1.68 frystyk 232: if (log) HTLog_close(log);
1.64 frystyk 233: return YES;
234: }
235: return NO;
236: }
237:
238: /*
1.63 frystyk 239: ** Sort the anchor array and log last modified date
240: */
241: PRIVATE BOOL calculate_lm (Robot * mr, HTArray * array)
242: {
243: if (mr && array) {
244: HTLog * log = HTLog_open(mr->lmfile, YES, YES);
245: if (log) {
246: void ** data = NULL;
247: HTParentAnchor * anchor = NULL;
248: HTArray_sort(array, LastModifiedSort);
249: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
250: while (anchor) {
251: char * uri = HTAnchor_address((HTAnchor *) anchor);
252: time_t lm = HTAnchor_lastModified(anchor);
253: if (uri && lm > 0)
254: HTLog_addText(log, "%s %s\n", HTDateTimeStr(&lm, NO), uri);
255: HT_FREE(uri);
256: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
257: }
258: }
259: HTLog_close(log);
260: return YES;
261: }
262: return NO;
263: }
264:
265: PRIVATE int LastModifiedSort (const void * a, const void * b)
266: {
267: time_t aa = HTAnchor_lastModified(*(HTParentAnchor **) a);
268: time_t bb = HTAnchor_lastModified(*(HTParentAnchor **) b);
269: return bb - aa;
270: }
271:
272: /*
273: ** Sort the anchor array and log the document title
274: */
275: PRIVATE BOOL calculate_title (Robot * mr, HTArray * array)
276: {
277: if (mr && array) {
278: HTLog * log = HTLog_open(mr->titlefile, YES, YES);
279: if (log) {
280: void ** data = NULL;
281: HTParentAnchor * anchor = NULL;
282: HTArray_sort(array, TitleSort);
283: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
284: while (anchor) {
285: char * uri = HTAnchor_address((HTAnchor *) anchor);
286: const char * title = HTAnchor_title(anchor);
287: HTCharset charset = HTAnchor_charset(anchor);
288: if (uri) HTLog_addText(log, "%s `%s\' %s\n",
289: charset ? HTAtom_name(charset) : "<none>",
290: title ? title : "<none>",
291: uri);
292: HT_FREE(uri);
293: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
294: }
295: }
296: HTLog_close(log);
297: return YES;
298: }
299: return NO;
300: }
301:
302: PRIVATE int TitleSort (const void * a, const void * b)
303: {
304: const char * aa = HTAnchor_title(*(HTParentAnchor **) a);
305: const char * bb = HTAnchor_title(*(HTParentAnchor **) b);
306: return strcasecomp(bb?bb:"", aa?aa:"");
307: }
308:
309: /*
1.58 frystyk 310: ** Calculate distributions for media types. The same mechanism
311: ** can be used for other characteristics with relatively
312: ** few outcomes.
313: */
314: PRIVATE HTList * mediatype_distribution (HTArray * array)
315: {
316: if (array) {
317: HTList * mt = HTList_new();
318: MetaDist * pres = NULL;
319: void ** data = NULL;
320: HTParentAnchor * anchor = NULL;
321: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
322: while (anchor) {
323: HTFormat format = HTAnchor_format(anchor);
324: if (format && format != WWW_UNKNOWN) {
325: HTList * cur = mt;
326:
327: /* If found then increase counter */
328: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
329: if (pres->name == format) {
330: pres->hits++;
331: break;
332: }
333: }
334:
335: /* If not found then add new format to list */
336: if (!pres) {
337: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
338: HT_OUTOFMEM("mediatype_distribution");
339: pres->name = format;
340: pres->hits = 1;
341: HTList_addObject(mt, pres);
342: HTList_insertionSort(mt, FormatSort);
343: }
344: }
345:
346: /* Find next anchor in array */
347: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
348: }
349: return mt;
350: }
351: return NULL;
352: }
353:
1.60 frystyk 354: /*
355: ** Calculate distributions for charsets. The same mechanism
356: ** can be used for other characteristics with relatively
357: ** few outcomes.
358: */
359: PRIVATE HTList * charset_distribution (HTArray * array)
360: {
361: if (array) {
362: HTList * cs = HTList_new();
363: MetaDist * pres = NULL;
364: void ** data = NULL;
365: HTParentAnchor * anchor = NULL;
366: anchor = (HTParentAnchor *) HTArray_firstObject(array, data);
367: while (anchor) {
368: HTCharset charset = HTAnchor_charset(anchor);
369: if (charset) {
370: HTList * cur = cs;
371:
372: /* If found then increase counter */
373: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
374: if (pres->name == charset) {
375: pres->hits++;
376: break;
377: }
378: }
379:
380: /* If not found then add new format to list */
381: if (!pres) {
382: if ((pres = (MetaDist *) HT_CALLOC(1, sizeof(MetaDist))) == NULL)
383: HT_OUTOFMEM("charset_distribution");
384: pres->name = charset;
385: pres->hits = 1;
386: HTList_addObject(cs, pres);
387: HTList_insertionSort(cs, FormatSort);
388: }
389: }
390:
391: /* Find next anchor in array */
392: anchor = (HTParentAnchor *) HTArray_nextObject(array, data);
393: }
394: return cs;
395: }
396: return NULL;
397: }
398:
1.58 frystyk 399: PRIVATE int FormatSort (const void * a, const void * b)
400: {
401: MetaDist * aa = (MetaDist *) a;
402: MetaDist * bb = (MetaDist *) b;
403: return strcmp(HTAtom_name(bb->name), HTAtom_name(aa->name));
404: }
405:
406: PRIVATE BOOL log_meta_distribution (const char * logfile, HTList * distribution)
407: {
408: if (logfile && distribution) {
409: HTLog * log = HTLog_open(logfile, YES, YES);
410: if (log) {
411: HTList * cur = distribution;
412: MetaDist * pres;
413: while ((pres = (MetaDist *) HTList_nextObject(cur))) {
414: if (pres->name) {
1.60 frystyk 415: HTLog_addText(log, "%8d %s\n", pres->hits, HTAtom_name(pres->name));
1.58 frystyk 416: }
417: }
418: HTLog_close(log);
419: }
420: }
421: return NO;
422: }
423:
424: PRIVATE BOOL delete_meta_distribution (HTList * distribution)
425: {
426: if (distribution) {
427: HTList * cur = distribution;
428: MetaDist * pres;
429: while ((pres = (MetaDist *) HTList_nextObject(cur)))
430: HT_FREE(pres);
431: HTList_delete(distribution);
432: return YES;
433: }
434: return NO;
435: }
436:
437:
1.55 frystyk 438: /* Statistics
439: ** ----------
440: ** Calculates a bunch of statistics for the anchors traversed
441: */
442: PRIVATE BOOL calculate_statistics (Robot * mr)
443: {
1.59 frystyk 444: long total_docs = mr->get_docs + mr->head_docs + mr->other_docs;
1.55 frystyk 445: if (!mr) return NO;
446:
447: /* Calculate efficiency */
1.59 frystyk 448: if (mr->time > 0) {
1.56 frystyk 449: ms_t t = HTGetTimeInMillis() - mr->time;
450: if (t > 0) {
1.60 frystyk 451: double loadfactor = (mr->get_bytes / (t * 0.001));
452: double reqprsec = (total_docs / (t * 0.001));
1.56 frystyk 453: double secs = t / 1000.0;
1.55 frystyk 454: char bytes[50];
1.62 frystyk 455: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 456: HTPrint("\nAccessed %ld documents in %.2f seconds (%.2f requests pr sec)\n",
1.62 frystyk 457: total_docs, secs, reqprsec);
1.59 frystyk 458:
459: HTNumToStr(mr->get_bytes, bytes, 50);
1.62 frystyk 460: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 461: HTPrint("\tDid a GET on %ld document(s) and downloaded %s bytes of document bodies (%2.1f bytes/sec)\n",
1.62 frystyk 462: mr->get_docs, bytes, loadfactor);
1.59 frystyk 463:
464: HTNumToStr(mr->head_bytes, bytes, 50);
1.62 frystyk 465: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 466: HTPrint("\tDid a HEAD on %ld document(s) with a total of %s bytes\n",
1.62 frystyk 467: mr->head_docs, bytes);
1.55 frystyk 468: }
469: }
470:
471: /* Create an array of existing anchors */
1.59 frystyk 472: if (total_docs > 1) {
473: HTArray * array = HTAnchor_getArray(total_docs);
1.55 frystyk 474: if (array) {
475:
1.63 frystyk 476: /* Distributions */
477: if (mr->flags & MR_DISTRIBUTIONS) {
1.82 frystyk 478: if (SHOW_REAL_QUIET(mr)) HTPrint("\nDistributions:\n");
1.63 frystyk 479: }
480:
1.55 frystyk 481: /* Sort after hit counts */
1.63 frystyk 482: if (mr->hitfile) {
483: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 484: HTPrint("\tLogged hit count distribution in file `%s\'\n",
1.63 frystyk 485: mr->hitfile);
486: calculate_hits(mr, array);
487: }
488:
1.64 frystyk 489: /* Sort after link relations */
1.68 frystyk 490: #ifdef HT_MYSQL
491: if (mr->relfile || mr->sqllog) {
1.69 frystyk 492: #else
493: if (mr->relfile) {
494: #endif
1.68 frystyk 495: if (mr->relfile && SHOW_REAL_QUIET(mr))
1.82 frystyk 496: HTPrint("\tLogged link relationship distribution in file `%s\'\n",
1.64 frystyk 497: mr->relfile);
498: calculate_linkRelations(mr, array);
499: }
500:
1.63 frystyk 501: /* Sort after modified date */
502: if (mr->lmfile) {
503: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 504: HTPrint("\tLogged last modified distribution in file `%s\'\n",
1.63 frystyk 505: mr->lmfile);
506: calculate_lm(mr, array);
507: }
508:
509: /* Sort after title */
510: if (mr->titlefile) {
511: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 512: HTPrint("\tLogged title distribution in file `%s\'\n",
1.63 frystyk 513: mr->titlefile);
514: calculate_title(mr, array);
515: }
1.55 frystyk 516:
1.58 frystyk 517: /* Find mediatype distribution */
518: if (mr->mtfile) {
519: HTList * mtdist = mediatype_distribution(array);
520: if (mtdist) {
1.63 frystyk 521: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 522: HTPrint("\tLogged media type distribution in file `%s\'\n",
1.63 frystyk 523: mr->mtfile);
1.58 frystyk 524: log_meta_distribution(mr->mtfile, mtdist);
525: delete_meta_distribution(mtdist);
526: }
527: }
1.55 frystyk 528:
1.60 frystyk 529: /* Find charset distribution */
530: if (mr->charsetfile) {
531: HTList * charsetdist = charset_distribution(array);
532: if (charsetdist) {
1.63 frystyk 533: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 534: HTPrint("\tLogged charset distribution in file `%s\'\n",
1.63 frystyk 535: mr->charsetfile);
1.60 frystyk 536: log_meta_distribution(mr->charsetfile, charsetdist);
537: delete_meta_distribution(charsetdist);
538: }
539: }
540:
1.55 frystyk 541: /* Add as may other stats here as you like */
1.60 frystyk 542: /* ... */
1.58 frystyk 543:
544: /* Delete the array */
1.55 frystyk 545: HTArray_delete(array);
546: }
547: }
548: return YES;
549: }
550:
1.75 frystyk 551: PRIVATE HTParentAnchor *
552: get_last_parent(HTParentAnchor *anchor)
553: {
554: HTAnchor *anc;
555: HTList *sources = anchor->sources;
556:
557: while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL)
558: {
559: HTParentAnchor *panchor = HTAnchor_parent(anc);
560: return panchor;
561: }
562: return NULL;
563: }
564:
1.86 frystyk 565: PRIVATE HTLink *
566: HTLink_find_type(HTAnchor * src, HTAnchor * dest, char *linktype)
567: {
568: if(src && dest && linktype)
569: {
570: HTLink * link = HTAnchor_mainLink(src);
571: HTList * sublinks = HTAnchor_subLinks(src);
572: HTLinkType type = (HTLinkType)HTAtom_caseFor(linktype);
573: HTAnchor *sdest = HTLink_destination(link);
574: if (link && sdest == dest && type == HTLink_type(link))
575: return link;
576: else if (sublinks) {
577: while ((link = (HTLink *) HTList_nextObject (sublinks))) {
578: sdest = HTLink_destination(link);
579: if (sdest == dest && HTLink_type(link) == type)
580: return link;
581:
582: }
583: }
584: }
585: return NULL;
586: }
587:
588: PRIVATE void
589: update_incoming_links(HTParentAnchor *anchor, HTParentAnchor *nanchor)
590: {
591: if(anchor && nanchor) {
592: HTAnchor *anc;
593: HTList *sources = anchor->sources;
594: while((anc = (HTAnchor *) HTList_nextObject(sources)) != NULL) {
595: HTParentAnchor *panchor = HTAnchor_parent(anc);
596: if((HTLink_find((HTAnchor *)panchor,(HTAnchor *)anchor)) &&
597: (!HTLink_find_type((HTAnchor *)panchor,
598: (HTAnchor *)nanchor,"redirection"))) {
599: HTLink_add((HTAnchor *)panchor,(HTAnchor *)nanchor,
600: (HTLinkType) HTAtom_caseFor("redirection"),
601: METHOD_HEAD);
602: }
603: }
604: }
605: }
606:
607: PRIVATE void
608: update_hyperdoc(HyperDoc *hd,HTRequest *request)
609: {
610: if(hd && request) {
611: HTParentAnchor *anchor = hd->anchor;
612: HTParentAnchor *nanchor = HTRequest_anchor(request);
613: HTParentAnchor *parent = HTRequest_parent(request);
614: HyperDoc *nhd = HTAnchor_document(nanchor);
615:
616: char *tit = (char *) HTAnchor_title(nanchor);
617:
618: if(nhd && tit)
619: StrAllocCopy(nhd->title,tit);
620:
621: if (anchor != nanchor) {
622: if(nhd) { /* The redirected anchor has a Hyperdoc */
623: if(nhd != hd) {
624: hd->code = REDIR_CODE;
625:
626: HTAnchor_setDocument(anchor,(void *)nhd);
627:
628: if(!HTLink_find_type((HTAnchor *)parent,
629: (HTAnchor *)nanchor,"redirection")) {
630: HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor,
631: (HTLinkType) HTAtom_caseFor("redirection"),
632: METHOD_HEAD);
633: }
634: }
635: } else { /* The redirected anchor does not have a Hyperdoc */
636: hd->anchor = nanchor;
637: HTAnchor_setDocument(nanchor,(void *) hd);
638:
639: if(!HTLink_find_type((HTAnchor *)parent,(HTAnchor *)nanchor,
640: "redirection")) {
641: HTLink_add((HTAnchor *)parent,(HTAnchor *)nanchor,
642: (HTLinkType) HTAtom_caseFor("redirection") ,
643: METHOD_HEAD);
644: }
645: }
646: update_incoming_links(anchor,nanchor);
647: }
648: }
649: }
650:
1.75 frystyk 651: PRIVATE void
652: set_error_state_hyperdoc(HyperDoc * hd, HTRequest *request)
653: {
1.86 frystyk 654: HTList * cur = HTRequest_error(request);
655: HTError *pres;
656: Finger * finger = (Finger *) HTRequest_context(request);
657: Robot * mr = finger->robot;
1.75 frystyk 658:
1.86 frystyk 659: while((pres = (HTError *) HTList_nextObject(cur)) != NULL) {
660: int code =HTErrors[HTError_index(pres)].code;
661:
662: hd->code = code;
1.75 frystyk 663:
1.86 frystyk 664: if((mr->flags & MR_REDIR) && code >= 200 && code < 300 )
665: update_hyperdoc(hd,request);
1.75 frystyk 666: }
667: }
668:
1.87 frystyk 669: #if 0
1.75 frystyk 670: PRIVATE int
671: test_for_blank_spaces(char *uri)
672: {
673: char *ptr = uri;
674: for(;*ptr!='\0';ptr++)
675: if(*ptr == ' ')
676: return 1;
677: return 0;
678: }
1.87 frystyk 679: #endif
1.75 frystyk 680:
1.1 frystyk 681: /* Create a Command Line Object
682: ** ----------------------------
683: */
1.75 frystyk 684: PUBLIC Robot * Robot_new (void)
1.1 frystyk 685: {
686: Robot * me;
1.41 frystyk 687: if ((me = (Robot *) HT_CALLOC(1, sizeof(Robot))) == NULL)
1.14 frystyk 688: HT_OUTOFMEM("Robot_new");
1.2 frystyk 689: me->hyperdoc = HTList_new();
1.4 frystyk 690: me->htext = HTList_new();
1.74 frystyk 691: me->timer = DEFAULT_TIMEOUT*MILLIES;
1.75 frystyk 692: me->waits = 0;
1.25 frystyk 693: me->cwd = HTGetCurrentDirectoryURL();
1.1 frystyk 694: me->output = OUTPUT;
1.35 eric 695: me->cnt = 0;
1.75 frystyk 696: me->ndoc = -1;
1.34 eric 697: me->fingers = HTList_new();
1.75 frystyk 698:
699: /* This is new */
700: me->queue = HTQueue_new();
701: me->cq = 0;
702: me->furl = NULL;
703:
1.1 frystyk 704: return me;
705: }
706:
707: /* Delete a Command Line Object
708: ** ----------------------------
709: */
1.62 frystyk 710: PRIVATE BOOL Robot_delete (Robot * mr)
1.1 frystyk 711: {
1.62 frystyk 712: if (mr) {
713: HTList_delete(mr->fingers);
1.55 frystyk 714:
715: /* Calculate statistics */
1.62 frystyk 716: calculate_statistics(mr);
1.55 frystyk 717:
1.62 frystyk 718: if (mr->hyperdoc) {
719: HTList * cur = mr->hyperdoc;
1.2 frystyk 720: HyperDoc * pres;
721: while ((pres = (HyperDoc *) HTList_nextObject(cur)))
722: HyperDoc_delete(pres);
1.62 frystyk 723: HTList_delete(mr->hyperdoc);
1.2 frystyk 724: }
1.62 frystyk 725: if (mr->htext) {
726: HTList * cur = mr->htext;
1.4 frystyk 727: HText * pres;
728: while ((pres = (HText *) HTList_nextObject(cur)))
1.80 frystyk 729: RHText_delete(pres);
1.62 frystyk 730: HTList_delete(mr->htext);
1.4 frystyk 731: }
1.62 frystyk 732:
733: /* Close all the log files */
1.63 frystyk 734: if (mr->flags & MR_LOGGING) {
1.82 frystyk 735: if (SHOW_REAL_QUIET(mr)) HTPrint("\nRaw Log files:\n");
1.63 frystyk 736: }
737:
1.62 frystyk 738: if (mr->log) {
739: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 740: HTPrint("\tLogged %5d entries in general log file `%s\'\n",
1.62 frystyk 741: HTLog_accessCount(mr->log), mr->logfile);
742: HTLog_close(mr->log);
743: }
744: if (mr->ref) {
745: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 746: HTPrint("\tLogged %5d entries in referer log file `%s\'\n",
1.62 frystyk 747: HTLog_accessCount(mr->ref), mr->reffile);
748: HTLog_close(mr->ref);
749: }
750: if (mr->reject) {
751: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 752: HTPrint("\tLogged %5d entries in rejected log file `%s\'\n",
1.62 frystyk 753: HTLog_accessCount(mr->reject), mr->rejectfile);
754: HTLog_close(mr->reject);
755: }
756: if (mr->notfound) {
757: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 758: HTPrint("\tLogged %5d entries in not found log file `%s\'\n",
1.62 frystyk 759: HTLog_accessCount(mr->notfound), mr->notfoundfile);
760: HTLog_close(mr->notfound);
761: }
762: if (mr->conneg) {
763: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 764: HTPrint("\tLogged %5d entries in content negotiation log file `%s\'\n",
1.62 frystyk 765: HTLog_accessCount(mr->conneg), mr->connegfile);
766: HTLog_close(mr->conneg);
767: }
768: if (mr->noalttag) {
769: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 770: HTPrint("\tLogged %5d entries in missing alt tag log file `%s\'\n",
1.62 frystyk 771: HTLog_accessCount(mr->noalttag), mr->noalttagfile);
772: HTLog_close(mr->noalttag);
773: }
774:
775: if (mr->output && mr->output != STDOUT) fclose(mr->output);
776:
777: if (mr->flags & MR_TIME) {
1.12 frystyk 778: time_t local = time(NULL);
1.62 frystyk 779: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 780: HTPrint("\nRobot terminated %s\n", HTDateTimeStr(&local, YES));
1.12 frystyk 781: }
1.55 frystyk 782:
1.75 frystyk 783: /* This is new */
1.88 ! frystyk 784: HT_FREE(mr->cdepth);
! 785: HT_FREE(mr->furl);
1.75 frystyk 786:
1.58 frystyk 787: #ifdef HT_POSIX_REGEX
1.62 frystyk 788: if (mr->include) {
789: regfree(mr->include);
790: HT_FREE(mr->include);
791: }
792: if (mr->exclude) {
793: regfree(mr->exclude);
794: HT_FREE(mr->exclude);
795: }
1.75 frystyk 796: if (mr->exc_robot) {
797: regfree(mr->exc_robot);
798: HT_FREE(mr->exc_robot);
799: }
1.62 frystyk 800: if (mr->check) {
801: regfree(mr->check);
802: HT_FREE(mr->check);
1.58 frystyk 803: }
804: #endif
805:
1.68 frystyk 806: #ifdef HT_MYSQL
807: if (mr->sqllog) {
808: HTSQLLog_close(mr->sqllog);
809: mr->sqllog = NULL;
810: }
811: #endif
812:
1.81 frystyk 813: if (mr->queue) HTQueue_delete(mr->queue);
1.62 frystyk 814: HT_FREE(mr->cwd);
815: HT_FREE(mr->prefix);
816: HT_FREE(mr->img_prefix);
817: HT_FREE(mr);
1.1 frystyk 818: return YES;
819: }
820: return NO;
821: }
822:
1.2 frystyk 823: /*
1.34 eric 824: ** This function creates a new finger object and initializes it with a new request
1.2 frystyk 825: */
1.75 frystyk 826: PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method)
1.2 frystyk 827: {
1.34 eric 828: Finger * me;
829: HTRequest * request = HTRequest_new();
830: if ((me = (Finger *) HT_CALLOC(1, sizeof(Finger))) == NULL)
831: HT_OUTOFMEM("Finger_new");
832: me->robot = robot;
833: me->request = request;
834: me->dest = dest;
835: HTList_addObject(robot->fingers, (void *)me);
836:
1.48 frystyk 837: /* Set the context for this request */
1.34 eric 838: HTRequest_setContext (request, me);
1.48 frystyk 839:
840: /* Check the various flags to customize the request */
841: if (robot->flags & MR_PREEMPTIVE)
842: HTRequest_setPreemptive(request, YES);
843: if (robot->flags & MR_VALIDATE)
844: HTRequest_setReloadMode(request, HT_CACHE_VALIDATE);
845: if (robot->flags & MR_END_VALIDATE)
846: HTRequest_setReloadMode(request, HT_CACHE_END_VALIDATE);
847:
848: /* We wanna make sure that we are sending a Host header (default) */
1.34 eric 849: HTRequest_addRqHd(request, HT_C_HOST);
1.48 frystyk 850:
851: /* Set the method for this request */
1.34 eric 852: HTRequest_setMethod(request, method);
853: robot->cnt++;
854: return me;
1.2 frystyk 855: }
856:
1.34 eric 857: PRIVATE int Finger_delete (Finger * me)
1.2 frystyk 858: {
1.34 eric 859: HTList_removeObject(me->robot->fingers, (void *)me);
860: me->robot->cnt--;
1.37 frystyk 861:
862: /*
863: ** If we are down at one request then flush the output buffer
864: */
865: if (me->request) {
866: if (me->robot->cnt == 1) HTRequest_forceFlush(me->request);
1.34 eric 867: HTRequest_delete(me->request);
1.37 frystyk 868: }
869:
870: /*
871: ** Delete the request and free myself
872: */
1.34 eric 873: HT_FREE(me);
874: return YES;
1.2 frystyk 875: }
876:
1.88 ! frystyk 877: PRIVATE BOOL check_constraints(Robot * mr, char *prefix, char *uri)
! 878: {
! 879: BOOL match = YES;
! 880: /* Check for prefix match */
! 881: if (prefix) {
! 882: match = HTStrMatch(prefix, uri) ? YES : NO;
! 883: }
! 884:
! 885: #ifdef HT_POSIX_REGEX
! 886: /* Check for any regular expression */
! 887: if (match && mr->include) {
! 888: match = regexec(mr->include, uri, 0, NULL, 0) ? NO : YES;
! 889: }
! 890: if (match && mr->exc_robot) {
! 891: match = regexec(mr->exc_robot, uri, 0, NULL, 0) ? YES : NO;
! 892: }
! 893: if (match && mr->exclude) {
! 894: match = regexec(mr->exclude, uri, 0, NULL, 0) ? YES : NO;
! 895: }
! 896:
! 897: #endif
! 898: return match;
! 899: }
! 900:
1.2 frystyk 901: /*
902: ** Cleanup and make sure we close all connections including the persistent
903: ** ones
904: */
1.75 frystyk 905: PUBLIC void Cleanup (Robot * me, int status)
1.1 frystyk 906: {
1.84 frystyk 907: /*
908: ** First we clean up the robot itself and calculate the various
909: ** statistics. This can actually take some time as a lot of data
910: ** has to be manipulated
911: */
912: Robot_delete(me);
913:
914: /*
915: ** Then we shut down libwww
916: */
1.81 frystyk 917: HTProfile_delete();
1.84 frystyk 918:
1.50 frystyk 919: #ifdef HT_MEMLOG
1.39 eric 920: HTMemLog_close();
1.47 frystyk 921: #endif
922:
1.1 frystyk 923: #ifdef VMS
924: exit(status ? status : 1);
925: #else
926: exit(status ? status : 0);
927: #endif
928: }
929:
1.58 frystyk 930: #ifdef HT_POSIX_REGEX
931: PRIVATE char * get_regerror (int errcode, regex_t * compiled)
932: {
933: size_t length = regerror (errcode, compiled, NULL, 0);
934: char * str = NULL;
935: if ((str = (char *) HT_MALLOC(length+1)) == NULL)
936: HT_OUTOFMEM("get_regerror");
937: (void) regerror (errcode, compiled, str, length);
938: return str;
939: }
940:
1.75 frystyk 941: PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags)
1.58 frystyk 942: {
943: regex_t * regex = NULL;
944: if (regex_str && *regex_str) {
945: int status;
946: if ((regex = (regex_t *) HT_CALLOC(1, sizeof(regex_t))) == NULL)
947: HT_OUTOFMEM("get_regtype");
1.60 frystyk 948: if ((status = regcomp(regex, regex_str, cflags))) {
1.58 frystyk 949: char * err_msg = get_regerror(status, regex);
1.62 frystyk 950: if (SHOW_REAL_QUIET(mr))
1.82 frystyk 951: HTPrint("Regular expression error: %s\n", err_msg);
1.58 frystyk 952: HT_FREE(err_msg);
953: Cleanup(mr, -1);
954: }
955: }
956: return regex;
957: }
958: #endif
959:
1.75 frystyk 960: PUBLIC void VersionInfo (void)
1.1 frystyk 961: {
1.82 frystyk 962: HTPrint("\nW3C OpenSource Software");
963: HTPrint("\n-----------------------\n\n");
964: HTPrint("\tWebbot version %s\n", APP_VERSION);
965: HTPrint("\tusing the W3C libwww library version %s.\n\n",HTLib_version());
966: HTPrint("\tSee \"%s\" for help\n", COMMAND_LINE);
967: HTPrint("\tSee \"http://www.w3.org/Robot/User/\" for user information\n");
968: HTPrint("\tSee \"http://www.w3.org/Robot/\" for general information\n\n");
969: HTPrint("\tPlease send feedback to the <www-lib@w3.org> mailing list,\n");
970: HTPrint("\tsee \"http://www.w3.org/Library/#Forums\" for details\n\n");
1.1 frystyk 971: }
972:
1.88 ! frystyk 973: /* redirection_handler
! 974: ** -------------------
! 975: ** If we are set up to handle redirections then handle it here.
! 976: */
! 977: PUBLIC int redirection_handler (HTRequest * request, HTResponse * response,
! 978: void * param, int status)
! 979: {
! 980: Finger * finger = (Finger *) HTRequest_context(request);
! 981: Robot * mr = finger->robot;
! 982: HTParentAnchor * me = HTRequest_anchor(request);
! 983: HTAnchor * redirection = HTResponse_redirection(response);
! 984: HTParentAnchor * redirection_parent = HTAnchor_parent(redirection);
! 985: HyperDoc * redirection_hd = HTAnchor_document(redirection_parent);
! 986: char * uri = NULL;
! 987: char * redirection_parent_addr = NULL;
! 988: BOOL match = YES;
! 989: BOOL check = NO;
! 990:
! 991: /* In case we didn't get any redirection destination */
! 992: if (!redirection) return HT_OK;
! 993:
! 994: /* Get the addresses */
! 995: uri = HTAnchor_address((HTAnchor *) me);
! 996: redirection_parent_addr = HTAnchor_address((HTAnchor *) redirection_parent);
! 997: if (SHOW_QUIET(mr))
! 998: HTPrint("Robot....... Checking redirecting from `%s\' to `%s\'\n",
! 999: uri, redirection_parent_addr);
! 1000:
! 1001: /* Log the event */
! 1002: #ifdef HT_MYSQL
! 1003: if (mr->sqllog && redirection_parent_addr)
! 1004: HTSQLLog_addLinkRelationship(mr->sqllog, redirection_parent_addr,
! 1005: uri, "redirection", NULL);
! 1006: #endif
! 1007:
! 1008: /* Check our constraints matcher */
! 1009: match = check_constraints(mr,mr->prefix, redirection_parent_addr);
! 1010:
! 1011: #ifdef HT_POSIX_REGEX
! 1012: /* See if we should do a HEAD or a GET on this URI */
! 1013: if (match && mr->check) {
! 1014: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
! 1015: }
! 1016: #endif
! 1017:
! 1018: /*
! 1019: ** If we already have a HyperDoc for the redirected anchor
! 1020: ** then update it
! 1021: */
! 1022: if (match) {
! 1023: if ((redirection_hd = HTAnchor_document(redirection_parent)) != NULL) {
! 1024: if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
! 1025: redirection_hd->hits++;
! 1026: HT_FREE(redirection_parent_addr);
! 1027: HT_FREE(uri);
! 1028: return HT_OK;
! 1029: }
! 1030:
! 1031: /* Now call the default libwww handler for actually carrying it out */
! 1032: if (mr->redir_code==0 || mr->redir_code==status) {
! 1033: HyperDoc * me_hd = HTAnchor_document(me);
! 1034: HyperDoc_new(mr, redirection_parent, me_hd->depth);
! 1035: if (check) {
! 1036: if (SHOW_QUIET(mr)) HTPrint("Checking redirection using HEAD\n");
! 1037: HTRequest_setMethod(request, METHOD_HEAD);
! 1038: }
! 1039: HT_FREE(redirection_parent_addr);
! 1040: HT_FREE(uri);
! 1041: return HTRedirectFilter(request, response, param, status);
! 1042: }
! 1043: } else {
! 1044: if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
! 1045: #ifdef HT_MYSQL
! 1046: if (mr->reject || mr->sqllog)
! 1047: #else
! 1048: if (mr->reject)
! 1049: #endif
! 1050: {
! 1051: if (mr->reject && redirection_parent_addr)
! 1052: HTLog_addText(mr->reject, "%s --> %s\n", redirection_parent_addr, uri);
! 1053: }
! 1054: }
! 1055:
! 1056: /* Just fall through */
! 1057: HT_FREE(redirection_parent_addr);
! 1058: HT_FREE(uri);
! 1059: return HT_OK;
! 1060: }
! 1061:
1.1 frystyk 1062: /* terminate_handler
1063: ** -----------------
1.2 frystyk 1064: ** This function is registered to handle the result of the request.
1065: ** If no more requests are pending then terminate program
1.1 frystyk 1066: */
1.75 frystyk 1067: PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
1.32 frystyk 1068: void * param, int status)
1.1 frystyk 1069: {
1.34 eric 1070: Finger * finger = (Finger *) HTRequest_context(request);
1.46 eric 1071: Robot * mr = finger->robot;
1.82 frystyk 1072: if (SHOW_QUIET(mr)) HTPrint("Robot....... done with %s\n", HTAnchor_physical(finger->dest));
1.55 frystyk 1073:
1.68 frystyk 1074: #ifdef HT_MYSQL
1075: if (mr->sqllog) HTSQLLog_addEntry(mr->sqllog, request, status);
1076: #endif
1077:
1.58 frystyk 1078: /* Check if negotiated resource and whether we should log that*/
1079: if (mr->conneg) {
1080: HTAssocList * cur = HTResponse_variant(response);
1081: if (cur) {
1082: BOOL first = YES;
1083: HTChunk * buffer = HTChunk_new(128);
1084: char * uri = HTAnchor_address((HTAnchor *) finger->dest);
1085: HTAssoc * pres;
1.60 frystyk 1086: HTChunk_puts(buffer, uri);
1.58 frystyk 1087: while ((pres = (HTAssoc *) HTAssocList_nextObject(cur))) {
1088: char * value = HTAssoc_value(pres);
1089: if (first) {
1.60 frystyk 1090: HTChunk_puts(buffer, "\t(");
1.58 frystyk 1091: first = NO;
1092: } else
1093: HTChunk_puts(buffer, ", ");
1094:
1095: /* Output the name */
1096: HTChunk_puts(buffer, HTAssoc_name(pres));
1097:
1098: /* Only output the value if not empty string */
1.60 frystyk 1099: if (value && *value) {
1.58 frystyk 1100: HTChunk_puts(buffer, "=");
1101: HTChunk_puts(buffer, value);
1102: }
1103: }
1.60 frystyk 1104: if (!first) HTChunk_puts(buffer, ")");
1105: HTLog_addLine(mr->conneg, HTChunk_data(buffer));
1.58 frystyk 1106: HTChunk_delete(buffer);
1107: HT_FREE(uri);
1108: }
1109: }
1110:
1.55 frystyk 1111: /* Count the amount of body data that we have read */
1.59 frystyk 1112: if (HTRequest_method(request) == METHOD_GET) {
1113: int length = HTAnchor_length(HTRequest_anchor(request));
1114: if (length > 0) mr->get_bytes += length;
1115: mr->get_docs++;
1116: } else if (HTRequest_method(request) == METHOD_HEAD) {
1.56 frystyk 1117: int length = HTAnchor_length(HTRequest_anchor(request));
1.59 frystyk 1118: if (length > 0) mr->head_bytes += length;
1119: mr->head_docs++;
1120: } else {
1121: mr->other_docs++;
1.55 frystyk 1122: }
1123:
1.78 frystyk 1124: if (!(mr->flags & MR_BFS)) {
1.88 ! frystyk 1125:
! 1126: #if 0
1.86 frystyk 1127: HyperDoc * hd = HTAnchor_document(finger->dest);
1128: if (hd) set_error_state_hyperdoc(hd,request);
1.88 ! frystyk 1129: #endif
1.78 frystyk 1130:
1131: /* Delete this thread */
1132: Finger_delete(finger);
1133:
1134: /* Should we stop? */
1135: if (mr->cnt <= 0) {
1.82 frystyk 1136: if (SHOW_QUIET(mr)) HTPrint(" Everything is finished...\n");
1.78 frystyk 1137: Cleanup(mr, 0); /* No way back from here */
1138: }
1139: }
1140:
1.82 frystyk 1141: if (SHOW_QUIET(mr)) HTPrint(" %d outstanding request%s\n", mr->cnt, mr->cnt == 1 ? "" : "s");
1.75 frystyk 1142: return HT_OK;
1143:
1144: }
1.88 ! frystyk 1145:
! 1146: PUBLIC int bfs_terminate_handler (HTRequest * request, HTResponse * response,
! 1147: void * param, int status)
1.75 frystyk 1148: {
1149: Finger * finger = (Finger *) HTRequest_context(request);
1150: Robot * mr = finger->robot;
1151: HTParentAnchor * dest = finger->dest;
1152: HyperDoc * hd = HTAnchor_document(dest);
1153: int depth = (hd ? hd->depth : -1);
1154:
1155: if (hd) set_error_state_hyperdoc(hd,request);
1156:
1157: if(hd && (HTRequest_method(request)== METHOD_HEAD) &&
1158: (depth < mr->depth))
1159: {
1160: hd->method = METHOD_GET;
1161: HTQueue_append(mr->queue, (void *)hd); (mr->cq)++;
1162: }
1.58 frystyk 1163:
1.34 eric 1164: Finger_delete(finger);
1.55 frystyk 1165:
1.75 frystyk 1166: if(!(mr->flags & MR_PREEMPTIVE))
1167: Serving_queue(mr);
1168:
1169: return HT_OK;
1170: }
1171:
1172: PUBLIC void Serving_queue(Robot *mr)
1173: {
1174: BOOL abort = NO;
1175: Finger *nfinger;
1176:
1177: while(!abort)
1178: {
1179: if(!HTQueue_isEmpty(mr->queue))
1180: {
1181: HTRequest *newreq;
1182:
1183: HyperDoc *nhd = (HyperDoc *)HTQueue_headOfQueue(mr->queue);
1184:
1185: if(nhd)
1186: {
1187: char *uri = HTAnchor_address((HTAnchor *)nhd->anchor);
1188: HTQueue_dequeue(mr->queue); (mr->cq)--;
1189:
1190: nfinger = Finger_new(mr, nhd->anchor, nhd->method);
1191:
1192: newreq = nfinger->request;
1193:
1.82 frystyk 1194: if(SHOW_QUIET(mr)) HTPrint("Request from QUEUE %s\n",uri);
1.75 frystyk 1195: HT_FREE(uri);
1.82 frystyk 1196: if(SHOW_QUIET(mr)) HTPrint("%d elements in queue \n", mr->cq);
1.75 frystyk 1197:
1198: HTRequest_setParent(newreq,get_last_parent(nhd->anchor));
1199:
1.76 frystyk 1200: /* @@@ Should be done using a timer and not sleep! @@@ */
1201: #if 0
1.75 frystyk 1202: if(mr->waits)
1203: sleep(mr->waits);
1.76 frystyk 1204: #endif
1.75 frystyk 1205:
1206: if (HTLoadAnchor((HTAnchor *)nhd->anchor , newreq) != YES)
1207: {
1.82 frystyk 1208: if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
1.75 frystyk 1209: Finger_delete(nfinger);
1210: }
1211: }
1212: else
1213: abort = YES;
1214: }
1215: else
1216: abort = YES;
1217: }
1218:
1.82 frystyk 1219: if(SHOW_QUIET(mr)) HTPrint("Queue size: %d \n", mr->cq);
1.75 frystyk 1220:
1221: if (mr->cnt <= 0 || (abort && (mr->flags & MR_PREEMPTIVE)))
1222: {
1223: if(mr->cnt > 0)
1.82 frystyk 1224: if(SHOW_QUIET(mr)) HTPrint("%d requests were not served\n", mr->cnt);
1.75 frystyk 1225:
1.82 frystyk 1226: if (SHOW_QUIET(mr)) HTPrint(" Everything is finished...\n");
1.46 eric 1227: Cleanup(mr, 0); /* No way back from here */
1.75 frystyk 1228: }
1.1 frystyk 1229: }
1230:
1231: /* ------------------------------------------------------------------------- */
1232: /* HTEXT INTERFACE */
1233: /* ------------------------------------------------------------------------- */
1234:
1.80 frystyk 1235: PUBLIC BOOL Robot_registerHTMLParser (void)
1236: {
1237: HText_registerCDCallback(RHText_new, RHText_delete);
1238: HText_registerLinkCallback(RHText_foundLink);
1239: return YES;
1240: }
1241:
1242: PRIVATE HText * RHText_new (HTRequest * request, HTParentAnchor * anchor,
1243: HTStream * stream)
1.1 frystyk 1244: {
1245: HText * me;
1.34 eric 1246: Finger * finger = (Finger *) HTRequest_context(request);
1247: Robot * mr = finger->robot;
1.65 frystyk 1248: char * robots = NULL;
1249:
1.14 frystyk 1250: if ((me = (HText *) HT_CALLOC(1, sizeof(HText))) == NULL)
1.86 frystyk 1251: HT_OUTOFMEM("RHText_new");
1.4 frystyk 1252:
1253: /* Bind the HText object together with the Request Object */
1.1 frystyk 1254: me->request = request;
1.65 frystyk 1255: me->follow = YES;
1256:
1257: /* Check to see if we have any meta tags */
1.77 frystyk 1258: if (!(mr->flags & MR_NOMETATAGS) && (robots = HTAnchor_robots(anchor)) != NULL) {
1.65 frystyk 1259: char * strval = NULL;
1260: char * ptr = NULL;
1261: char * token = NULL;
1262: StrAllocCopy(strval, robots);
1263: ptr = strval;
1264: while ((token = HTNextField(&ptr)) != NULL) {
1265: if (!strcasecomp(token, "nofollow")) {
1266: me->follow = NO;
1267: break;
1268: }
1269: }
1270: HT_FREE(strval);
1271: }
1.4 frystyk 1272:
1273: /* Add this HyperDoc object to our list */
1274: if (!mr->htext) mr->htext = HTList_new();
1275: HTList_addObject(mr->htext, (void *) me);
1.1 frystyk 1276: return me;
1277: }
1278:
1.80 frystyk 1279: PRIVATE BOOL RHText_delete (HText * me) {
1.81 frystyk 1280: if (me) {
1281: HT_FREE(me);
1282: return YES;
1283: }
1284: return NO;
1.4 frystyk 1285: }
1286:
1.80 frystyk 1287: PRIVATE void RHText_foundAnchor (HText * text, HTChildAnchor * anchor)
1.1 frystyk 1288: {
1289: if (text && anchor) {
1.34 eric 1290: Finger * finger = (Finger *) HTRequest_context(text->request);
1291: Robot * mr = finger->robot;
1.1 frystyk 1292: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1293: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1.7 frystyk 1294: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1.1 frystyk 1295: HyperDoc * hd = HTAnchor_document(dest_parent);
1.60 frystyk 1296: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.65 frystyk 1297: BOOL match = text->follow;
1.58 frystyk 1298: BOOL check = NO;
1.1 frystyk 1299:
1.75 frystyk 1300: /* These are new variables */
1301: HyperDoc * nhd = NULL;
1302: BOOL follow = YES;
1303:
1304: /* These three variables were moved */
1305: /*HTParentAnchor * last_anchor = HTRequest_parent(text->request);*/
1306: HTParentAnchor * last_anchor = HTRequest_anchor(text->request);
1307: HyperDoc * last_doc = HTAnchor_document(last_anchor);
1308: int depth = last_doc ? last_doc->depth+1 : 0;
1309:
1.55 frystyk 1310: if (!uri) return;
1.82 frystyk 1311: if (SHOW_QUIET(mr)) HTPrint("Robot....... Found `%s\' - \n", uri ? uri : "NULL\n");
1.55 frystyk 1312:
1313: if (hd) {
1.82 frystyk 1314: if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1.55 frystyk 1315: hd->hits++;
1.68 frystyk 1316: #ifdef HT_MYSQL
1317: if (mr->sqllog) {
1318: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1319: if (ref_addr) {
1320: HTSQLLog_addLinkRelationship(mr->sqllog, ref_addr, uri,
1321: "referer", NULL);
1322: HT_FREE(ref_addr);
1323: }
1324: }
1325: #endif
1.58 frystyk 1326: HT_FREE(uri);
1327: return;
1328: }
1.70 frystyk 1329:
1.88 ! frystyk 1330: /* Check our constraints matcher */
1.86 frystyk 1331: match = check_constraints(mr,mr->prefix, uri);
1.58 frystyk 1332:
1.87 frystyk 1333: #ifdef HT_POSIX_REGEX
1334: /* See if we should do a HEAD or a GET on this URI */
1335: if (match && mr->check) {
1336: check = regexec(mr->check, uri, 0, NULL, 0) ? NO : YES;
1337: }
1338: #endif
1339:
1340: #if 0
1341: /* This is already checked in HTParse.c */
1.75 frystyk 1342: if(uri && test_for_blank_spaces(uri))
1343: follow = NO;
1.87 frystyk 1344: else
1345: #endif
1346: if (mr->ndoc == 0) /* Number of Documents is reached */
1.75 frystyk 1347: follow = NO;
1348:
1349: /* Test whether we already have a hyperdoc for this document */
1.88 ! frystyk 1350: if (!hd && dest_parent) {
1.75 frystyk 1351: nhd = HyperDoc_new(mr, dest_parent, depth);
1352: mr->cdepth[depth]++;
1.88 ! frystyk 1353: }
1.58 frystyk 1354:
1355: /* Test whether we already have a hyperdoc for this document */
1.78 frystyk 1356: if (mr->flags & MR_LINK && match && dest_parent && follow && !hd) {
1357: if (mr->flags & MR_BFS) {
1358: nhd->method = METHOD_HEAD;
1359: HTQueue_enqueue(mr->queue, (void *) nhd);
1360: (mr->cq)++;
1361: if(mr->ndoc > 0) mr->ndoc--;
1362: } else {
1363: Finger * newfinger = Finger_new(mr, dest_parent, METHOD_GET);
1364: HTRequest * newreq = newfinger->request;
1.86 frystyk 1365: HTRequest_setParent(newreq, referer);
1366: nhd->method = METHOD_GET;
1367:
1.78 frystyk 1368: if (check || depth >= mr->depth) {
1.82 frystyk 1369: if (SHOW_QUIET(mr)) HTPrint("loading at depth %d using HEAD\n", depth);
1.78 frystyk 1370: HTRequest_setMethod(newreq, METHOD_HEAD);
1.86 frystyk 1371: nhd->method = METHOD_HEAD;
1372:
1.78 frystyk 1373: } else {
1.82 frystyk 1374: if (SHOW_QUIET(mr)) HTPrint("loading at depth %d\n", depth);
1.78 frystyk 1375: }
1376: if (HTLoadAnchor((HTAnchor *) dest_parent, newreq) != YES) {
1.82 frystyk 1377: if (SHOW_QUIET(mr)) HTPrint("not tested!\n");
1.78 frystyk 1378: Finger_delete(newfinger);
1379: }
1380: }
1.75 frystyk 1381:
1.7 frystyk 1382: } else {
1.82 frystyk 1383: if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1.68 frystyk 1384: #ifdef HT_MYSQL
1385: if (mr->reject || mr->sqllog) {
1386: #else
1.60 frystyk 1387: if (mr->reject) {
1.68 frystyk 1388: #endif
1.60 frystyk 1389: if (referer) {
1390: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1391: if (mr->reject && ref_addr)
1392: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1393: #ifdef HT_MYSQL
1394: if (mr->sqllog && mr->sqlexternals && ref_addr)
1395: HTSQLLog_addLinkRelationship(mr->sqllog,
1396: ref_addr, uri,
1397: "referer", NULL);
1398: #endif
1399:
1.60 frystyk 1400: HT_FREE(ref_addr);
1401: }
1402: }
1.2 frystyk 1403: }
1.11 frystyk 1404: HT_FREE(uri);
1.2 frystyk 1405: }
1406: }
1407:
1.80 frystyk 1408: PRIVATE void RHText_foundImage (HText * text, HTChildAnchor * anchor,
1409: const char *alt, const char * align, BOOL isMap)
1.2 frystyk 1410: {
1411: if (text && anchor) {
1.34 eric 1412: Finger * finger = (Finger *) HTRequest_context(text->request);
1413: Robot * mr = finger->robot;
1.75 frystyk 1414:
1.59 frystyk 1415: if (mr->flags & MR_IMG) {
1.60 frystyk 1416: HTAnchor * dest = HTAnchor_followMainLink((HTAnchor *) anchor);
1417: HTParentAnchor * dest_parent = HTAnchor_parent(dest);
1418: char * uri = HTAnchor_address((HTAnchor *) dest_parent);
1419: HyperDoc * hd = HTAnchor_document(dest_parent);
1420: HTParentAnchor * referer = HTRequest_anchor(text->request);
1.59 frystyk 1421: BOOL match = YES;
1422:
1.72 frystyk 1423: if (!uri) return;
1.59 frystyk 1424: if (hd) {
1.82 frystyk 1425: if (SHOW_QUIET(mr)) HTPrint("............ Already checked\n");
1.59 frystyk 1426: hd->hits++;
1.68 frystyk 1427: #ifdef HT_MYSQL
1428: if (mr->sqllog) {
1429: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1430: if (ref_addr) {
1431: HTSQLLog_addLinkRelationship(mr->sqllog,
1432: ref_addr, uri,
1433: "image", alt);
1434: HT_FREE(ref_addr);
1435: }
1436: }
1437: #endif
1.11 frystyk 1438: HT_FREE(uri);
1.59 frystyk 1439: return;
1.2 frystyk 1440: }
1.59 frystyk 1441:
1.88 ! frystyk 1442: /* Check our constraints matcher */
1.86 frystyk 1443: match = check_constraints(mr, mr->img_prefix, uri);
1.59 frystyk 1444:
1445: /* Test whether we already have a hyperdoc for this document */
1446: if (match && dest) {
1.60 frystyk 1447: Finger * newfinger = Finger_new(mr, dest_parent,
1.59 frystyk 1448: mr->flags & MR_SAVE ?
1449: METHOD_GET : METHOD_HEAD);
1450: HTRequest * newreq = newfinger->request;
1.60 frystyk 1451: HyperDoc_new(mr, dest_parent, 1);
1452: HTRequest_setParent(newreq, referer);
1453:
1454: /* Check whether we should report missing ALT tags */
1455: if (mr->noalttag && (alt==NULL || *alt=='\0')) {
1456: if (referer) {
1457: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1458: if (ref_addr) HTLog_addText(mr->noalttag, "%s --> %s\n", ref_addr, uri);
1459: HT_FREE(ref_addr);
1460: }
1461: }
1462:
1.82 frystyk 1463: if (SHOW_QUIET(mr)) HTPrint("Robot....... Checking Image `%s\'\n", uri);
1.59 frystyk 1464: if (HTLoadAnchor((HTAnchor *) dest, newreq) != YES) {
1.82 frystyk 1465: if (SHOW_QUIET(mr)) HTPrint("Robot....... Image not tested!\n");
1.59 frystyk 1466: Finger_delete(newfinger);
1467: }
1468: } else {
1.82 frystyk 1469: if (SHOW_QUIET(mr)) HTPrint("............ does not fulfill constraints\n");
1.68 frystyk 1470: #ifdef HT_MYSQL
1471: if (mr->reject || mr->sqllog) {
1472: #else
1.60 frystyk 1473: if (mr->reject) {
1.68 frystyk 1474: #endif
1.60 frystyk 1475: if (referer) {
1476: char * ref_addr = HTAnchor_address((HTAnchor *) referer);
1.68 frystyk 1477: if (mr->reject && ref_addr)
1478: HTLog_addText(mr->reject, "%s --> %s\n", ref_addr, uri);
1479: #ifdef HT_MYSQL
1480: if (mr->sqllog && mr->sqlexternals && ref_addr)
1481: HTSQLLog_addLinkRelationship(mr->sqllog,
1482: ref_addr, uri,
1483: "image", alt);
1484: #endif
1485:
1.60 frystyk 1486: HT_FREE(ref_addr);
1487: }
1488: }
1.1 frystyk 1489: }
1.59 frystyk 1490: HT_FREE(uri);
1.72 frystyk 1491: }
1492: }
1493: }
1494:
1.80 frystyk 1495: PRIVATE void RHText_foundLink (HText * text,
1496: int element_number, int attribute_number,
1497: HTChildAnchor * anchor,
1498: const BOOL * present, const char ** value)
1.72 frystyk 1499: {
1500: if (text && anchor) {
1501: Finger * finger = (Finger *) HTRequest_context(text->request);
1502: Robot * mr = finger->robot;
1503: if (SHOW_QUIET(mr))
1.82 frystyk 1504: HTPrint("Robot....... Received element %d, attribute %d with anchor %p\n",
1.80 frystyk 1505: element_number, attribute_number, anchor);
1506: if ((element_number==HTML_IMG && attribute_number==HTML_IMG_SRC) ||
1.85 frystyk 1507: (element_number==HTML_BODY && attribute_number==HTML_BODY_BACKGROUND) ||
1508: (element_number==HTML_INPUT && attribute_number==HTML_INPUT_SRC))
1.80 frystyk 1509: RHText_foundImage(text, anchor, NULL, NULL, NO);
1510: else
1511: RHText_foundAnchor(text, anchor);
1.72 frystyk 1512: }
1513: }
1.88 ! frystyk 1514:
1.80 frystyk 1515: PUBLIC char * get_robots_txt(char * uri)
1.48 frystyk 1516: {
1.88 ! frystyk 1517: char *str = NULL;
! 1518: HTChunk * chunk;
! 1519: HTParentAnchor *anchor = HTAnchor_parent(HTAnchor_findAddress(uri));
! 1520: HTRequest *request = HTRequest_new();
! 1521: HTRequest_setOutputFormat(request, WWW_SOURCE);
! 1522: HTRequest_setPreemptive(request, YES);
! 1523: HTRequest_setMethod(request, METHOD_GET);
! 1524: chunk = HTLoadAnchorToChunk ((HTAnchor *)anchor, request);
! 1525: str = HTChunk_toCString(chunk);
! 1526: HTRequest_delete(request);
! 1527: return str;
1.86 frystyk 1528: }
1529:
1.48 frystyk 1530:
Webmaster