Annotation of libwww/Robot/src/HTRobMan.html, revision 1.9
1.1 frystyk 1: <HTML>
2: <HEAD>
3: <TITLE>Webbot - the W3C Mini Robot</TITLE>
4: </HEAD>
5: <BODY>
6: <H1>
7: Webbot - the W3C Mini Robot
8: </H1>
9: <PRE>
10: /*
11: ** (c) COPYRIGHT MIT 1995.
12: ** Please first read the full copyright statement in the file COPYRIGH.
13: */
14: </PRE>
15: <P>
16: This program illustrates how to traverse links using the Anchor object
17: <PRE>
18: #ifndef HTROBMAN_H
19: #define HTROBMAN_H
20:
21: #include "WWWLib.h" /* Global Library Include file */
22: #include "WWWApp.h" /* Application stuff */
23: #include "WWWTrans.h"
24: #include "WWWInit.h"
25: #include "WWWSQL.h"
26:
27: #include "HText.h"
28:
29: #include "HTRobot.h" /* Implemented here */
30:
31: #ifndef W3C_VERSION /* Fallback value, used only when W3C_VERSION is not already defined */
32: #define W3C_VERSION "Unspecified"
33: #endif
34:
35: #define APP_NAME "W3CRobot"
36: #define APP_VERSION W3C_VERSION /* Application version follows W3C_VERSION */
37: #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine" /* presumably the URL of the command line documentation - confirm */
38: #define ROBOTS_TXT "/robots.txt"
39:
40: #define DEFAULT_OUTPUT_FILE "robot.out" /* Default file names start here; NOTE(review): presumably the initial values of the file name members of Robot - confirm in the implementation */
41: #define DEFAULT_RULE_FILE "robot.conf"
42: #define DEFAULT_LOG_FILE "log-clf.txt"
43: #define DEFAULT_HIT_FILE "log-hit.txt"
44: #define DEFAULT_REL_FILE "log-rel.txt"
45: #define DEFAULT_LM_FILE "log-lastmodified.txt"
46: #define DEFAULT_TITLE_FILE "log-title.txt"
47: #define DEFAULT_REFERER_FILE "log-referer.txt"
48: #define DEFAULT_REJECT_FILE "log-reject.txt"
49: #define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
50: #define DEFAULT_CONNEG_FILE "log-conneg.txt"
51: #define DEFAULT_NOALTTAG_FILE "log-alt.txt"
52: #define DEFAULT_FORMAT_FILE "log-format.txt"
53: #define DEFAULT_CHARSET_FILE "log-charset.txt"
54: #define DEFAULT_MEMLOG "robot.mem"
55: #define DEFAULT_PREFIX ""
56: #define DEFAULT_IMG_PREFIX ""
57: #define DEFAULT_DEPTH 0
58: #define DEFAULT_DELAY 50 /* Write delay in ms */
59:
1.9 ! frystyk 60: #define DEFAULT_CACHE_SIZE 20 /* Default cache size (unit is not stated in this header - TODO confirm) */
! 61:
1.1 frystyk 62: #define DEFAULT_SQL_SERVER "localhost" /* SQL defaults; NOTE(review): presumably for the HT_MYSQL members of Robot - confirm */
63: #define DEFAULT_SQL_DB "webbot"
64: #define DEFAULT_SQL_USER "webbot"
65: #define DEFAULT_SQL_PW ""
66:
67: #if 0 /* Disabled section: HT_MEMLOG is not defined by this header unless this is changed */
68: #define HT_MEMLOG /* Is expensive in performance! */
69: #endif
70:
71: #define MILLIES 1000 /* presumably milliseconds per second, for converting second values such as DEFAULT_TIMEOUT - confirm */
72: #define DEFAULT_TIMEOUT 20 /* timeout in secs */
73:
74: typedef enum _MRFlags { /* Robot option flags; each enumerator is a distinct single-bit mask (0x1 .. 0x8000) */
75: MR_IMG = 0x1,
76: MR_LINK = 0x2,
77: MR_PREEMPTIVE = 0x4,
78: MR_TIME = 0x8,
79: MR_SAVE = 0x10,
80: MR_QUIET = 0x20,
81: MR_REAL_QUIET = 0x40,
82: MR_VALIDATE = 0x80,
83: MR_END_VALIDATE = 0x100,
84: MR_KEEP_META = 0x200,
85: MR_LOGGING = 0x400,
86: MR_DISTRIBUTIONS = 0x800,
1.3 frystyk 87: MR_NOROBOTSTXT = 0x1000,
1.4 frystyk 88: MR_NOMETATAGS = 0x2000,
1.7 frystyk 89: MR_BFS = 0x4000,
90: MR_REDIR = 0x8000
1.1 frystyk 91: } MRFlags; /* Type of the flags member of Robot */
92:
93: typedef struct _Robot { /* Robot state: traversal depth, object lists, log file names and handles, flags and byte/document counters */
94: int depth; /* How deep is our tree */
95: int ndoc; /* NOTE(review): presumably a document count - confirm */
96: int *cdepth; /* Number of nodes per level */
97: int cnt; /* Count of requests */
98: int cindex; /* Number assigned to each document */
99:
100: HTList * hyperdoc; /* List of our HyperDoc Objects */
101: HTList * htext; /* List of our HText Objects */
102: HTList * fingers; /* presumably the list of our Finger objects - confirm */
103:
104: HTList * queue; /* Queue */
105: int cq;
106:
107: int timer;
108: int waits;
109:
110: char * cwd; /* Current dir URL */
111: char * rules; /* presumably the rule file name (cf. DEFAULT_RULE_FILE) - confirm */
112: char * prefix;
113: char * img_prefix;
114:
115: char * logfile; /* clf log */
116: HTLog * log;
117: char * reffile; /* referer log */
118: HTLog * ref;
119: char * rejectfile; /* unchecked links */
120: HTLog * reject;
121: char * notfoundfile; /* links that returned 404 */
122: HTLog * notfound;
123: char * connegfile; /* links that were conneg'ed */
124: HTLog * conneg;
125: char * noalttagfile; /* images without alt tags */
126: HTLog * noalttag;
127:
128:
129: char * hitfile; /* links sorted after hit counts */
130: char * relfile; /* link sorted after relationships */
131: HTLinkType relation; /* Specific relation to look for */
132: char * titlefile; /* links with titles */
133: char * mtfile; /* media types encountered */
134: char * charsetfile; /* charsets encountered */
135: char * lmfile; /* sorted after last modified dates */
136:
137: char * outputfile;
138: FILE * output;
139:
140: char * furl; /* First url */
141:
1.8 frystyk 142: MRFlags flags; /* Value of type MRFlags, whose enumerators are single-bit masks */
1.1 frystyk 143:
1.8 frystyk 144: int redir_code; /* 0 means all, otherwise 301, 302, 305... */
1.1 frystyk 145:
146: long get_bytes; /* Total number of bytes processed using GET */
147: long get_docs; /* Total number of documents using GET */
148:
149: long head_bytes; /* Total number of bytes processed using HEAD */
150: long head_docs; /* Total number of documents using HEAD */
151:
152: long other_docs; /* presumably documents using methods other than GET and HEAD - confirm */
153:
154: ms_t time; /* Time of run */
155:
156: #ifdef HT_POSIX_REGEX /* The regex members exist only when HT_POSIX_REGEX is defined */
157: regex_t * include;
158: regex_t * exclude;
159: regex_t * check;
160: regex_t * exc_robot; /* Robots.txt exclusion */
161: #endif
162:
163: #ifdef HT_MYSQL /* The SQL members exist only when HT_MYSQL is defined */
164: HTSQLLog * sqllog;
165: char * sqlserver;
166: char * sqldb;
167: char * sqluser;
168: char * sqlpw;
169: char * sqlrelative;
170: BOOL sqlexternals;
171: int sqlflags;
172: #endif
173:
174: } Robot;
175:
176: typedef struct _Finger { /* Groups a Robot, an HTRequest and a destination HTParentAnchor */
177: Robot * robot; /* The robot this finger refers to */
178: HTRequest * request; /* NOTE(review): presumably the request issued for dest - confirm */
179: HTParentAnchor * dest; /* Destination anchor */
180: } Finger;
181:
182: /*
183: ** The HyperDoc object is bound to the anchor and contains information about
184: ** where we are in the search for recursive searches
185: */
1.7 frystyk 186:
187: #define NO_CODE -1 /* NOTE(review): presumably a special value for the code member of HyperDoc - confirm */
188: #define REDIR_CODE -2 /* NOTE(review): presumably a special value for the code member of HyperDoc - confirm */
189:
1.1 frystyk 190: typedef struct _HyperDoc {
191: HTParentAnchor * anchor; /* The anchor this object is bound to */
192: int depth; /* presumably the depth at which the document was found - confirm */
193: int hits; /* presumably a hit count (cf. hitfile in Robot) - confirm */
194: int code; /* presumably a status code, or NO_CODE / REDIR_CODE - confirm */
195: int index; /* presumably the number taken from cindex in Robot - confirm */
196: char * title;
197: HTMethod method;
198: } HyperDoc;
199:
200: /*
201: ** This is the HText object that is created every time we start parsing an
202: ** HTML object
203: */
204: struct _HText { /* No typedef here; NOTE(review): the HText type name is presumably declared in HText.h - confirm */
205: HTRequest * request; /* Request associated with this HText object */
206: BOOL follow; /* NOTE(review): presumably whether links should be followed - confirm */
207: };
208:
209: /*
210: ** A structure for calculating metadata distributions
211: */
212: typedef struct _MetaDist {
213: HTAtom * name; /* Atom naming the metadata value being counted - TODO confirm */
214: int hits; /* presumably the number of occurrences of name - confirm */
215: } MetaDist;
216:
1.2 frystyk 217: #ifdef HT_POSIX_REGEX /* get_regtype is declared only when HT_POSIX_REGEX is defined */
218: PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags);
219: #endif
1.1 frystyk 220:
221: PUBLIC int OutputData(const char * fmt, ...); /* Takes a format string followed by variable arguments */
222: PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth);
223: PUBLIC BOOL HyperDoc_delete (HyperDoc * hd);
224: PUBLIC Robot * Robot_new (void);
225: PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method);
1.5 frystyk 226: PUBLIC BOOL Robot_registerHTMLParser (void);
1.1 frystyk 227: PUBLIC void Cleanup (Robot * me, int status);
228: PUBLIC void VersionInfo (void);
1.8 frystyk 229:
1.1 frystyk 230: PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
231: void * param, int status) ; /* NOTE(review): signature looks like a libwww after-filter callback - confirm */
232:
1.8 frystyk 233: PUBLIC int bfs_terminate_handler (HTRequest * request, HTResponse * response,
234: void * param, int status) ; /* Same parameter list and return type as terminate_handler */
235:
236: PUBLIC int redirection_handler (HTRequest * request, HTResponse * response,
237: void * param, int status) ; /* Same parameter list and return type as terminate_handler */
1.1 frystyk 238:
239: PUBLIC void Serving_queue(Robot *mr);
240:
241: PUBLIC char *get_robots_txt(char *uri); /* NOTE(review): ownership of the returned string is not stated here - confirm who frees it */
242:
243: #endif
244: </PRE>
245: <P>
246: <HR>
247: <ADDRESS>
1.9 ! frystyk 248: @(#) $Id: HTRobMan.html,v 1.8 1999/03/08 16:54:33 frystyk Exp $
1.1 frystyk 249: </ADDRESS>
250: </BODY></HTML>
Webmaster