Annotation of libwww/Robot/src/HTRobMan.html, revision 1.10
1.1 frystyk 1: <HTML>
2: <HEAD>
3: <TITLE>Webbot - the W3C Mini Robot</TITLE>
4: </HEAD>
5: <BODY>
6: <H1>
7: Webbot - the W3C Mini Robot
8: </H1>
9: <PRE>
10: /*
11: ** (c) COPYRIGHT MIT 1995.
12: ** Please first read the full copyright statement in the file COPYRIGH.
13: */
14: </PRE>
15: <P>
16: This program illustrates how to traverse links using the Anchor object.
17: <PRE>
18: #ifndef HTROBMAN_H
19: #define HTROBMAN_H
20:
21: #include "WWWLib.h" /* Global Library Include file */
22: #include "WWWApp.h" /* Application stuff */
23: #include "WWWTrans.h"
24: #include "WWWInit.h"
25: #include "WWWSQL.h"
26:
1.10 ! vbancrof 27: #ifdef HT_SSL
! 28: #include "WWWSSL.h"
! 29: #endif /* HT_SSL */
! 30:
1.1 frystyk 31: #include "HText.h"
32: #include "HTRobot.h" /* Implemented here */
33:
34: #ifndef W3C_VERSION
35: #define W3C_VERSION "Unspecified"
36: #endif /* W3C_VERSION */
37:
38: #define APP_NAME "W3CRobot"
39: #define APP_VERSION W3C_VERSION
40: #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine"
41: #define ROBOTS_TXT "/robots.txt"
42:
43: #define DEFAULT_OUTPUT_FILE "robot.out"
44: #define DEFAULT_RULE_FILE "robot.conf"
45: #define DEFAULT_LOG_FILE "log-clf.txt"
46: #define DEFAULT_HIT_FILE "log-hit.txt"
47: #define DEFAULT_REL_FILE "log-rel.txt"
48: #define DEFAULT_LM_FILE "log-lastmodified.txt"
49: #define DEFAULT_TITLE_FILE "log-title.txt"
50: #define DEFAULT_REFERER_FILE "log-referer.txt"
51: #define DEFAULT_REJECT_FILE "log-reject.txt"
52: #define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
53: #define DEFAULT_CONNEG_FILE "log-conneg.txt"
54: #define DEFAULT_NOALTTAG_FILE "log-alt.txt"
55: #define DEFAULT_FORMAT_FILE "log-format.txt"
56: #define DEFAULT_CHARSET_FILE "log-charset.txt"
57: #define DEFAULT_MEMLOG "robot.mem"
58: #define DEFAULT_PREFIX ""
59: #define DEFAULT_IMG_PREFIX ""
60: #define DEFAULT_DEPTH 0
61: #define DEFAULT_DELAY 50 /* Write delay in ms */
62:
1.9 frystyk 63: #define DEFAULT_CACHE_SIZE 20 /* Default cache size */
64:
1.1 frystyk 65: #define DEFAULT_SQL_SERVER "localhost"
66: #define DEFAULT_SQL_DB "webbot"
67: #define DEFAULT_SQL_USER "webbot"
68: #define DEFAULT_SQL_PW ""
69:
1.10 ! vbancrof 70: #ifdef HT_SSL
! 71: #define DEFAULT_SSL_PROT HTSSL_V23
! 72: #define DEFAULT_SSL_VDEPTH 2
! 73: #define DEFAULT_SSL_CFILE ""
! 74: #define DEFAULT_SSL_KFILE ""
! 75: #endif /* HT_SSL */
! 76:
1.1 frystyk 77: #if 0
78: #define HT_MEMLOG /* Disabled by the surrounding #if 0: is expensive in performance! */
79: #endif
80:
81: #define MILLIES 1000 /* NOTE(review): presumably milliseconds per second - confirm */
82: #define DEFAULT_TIMEOUT 20 /* Timeout in seconds */
83:
84: typedef enum _MRFlags { /* Robot option flags; each enumerator is a distinct single-bit value */
85: MR_IMG = 0x1,
86: MR_LINK = 0x2,
87: MR_PREEMPTIVE = 0x4,
88: MR_TIME = 0x8,
89: MR_SAVE = 0x10,
90: MR_QUIET = 0x20,
91: MR_REAL_QUIET = 0x40,
92: MR_VALIDATE = 0x80,
93: MR_END_VALIDATE = 0x100,
94: MR_KEEP_META = 0x200,
95: MR_LOGGING = 0x400,
96: MR_DISTRIBUTIONS = 0x800,
1.3 frystyk 97: MR_NOROBOTSTXT = 0x1000,
1.4 frystyk 98: MR_NOMETATAGS = 0x2000,
1.7 frystyk 99: MR_BFS = 0x4000,
100: MR_REDIR = 0x8000
1.1 frystyk 101: } MRFlags;
102:
103: typedef struct _Robot { /* Robot state: counters, object lists, log file names and handles, flags, statistics */
104: int depth; /* How deep is our tree */
105: int ndoc;
106: int *cdepth; /* Number of nodes per level */
107: int cnt; /* Count of requests */
108: int cindex; /* Number assigned to each document */
109:
110: HTList * hyperdoc; /* List of our HyperDoc Objects */
111: HTList * htext; /* List of our HText Objects */
112: HTList * fingers; /* NOTE(review): presumably the list of Finger objects - confirm */
113:
114: HTList * queue; /* Queue */
115: int cq;
116:
117: int timer;
118: int waits;
119:
120: char * cwd; /* Current dir URL */
121: char * rules;
122: char * prefix;
123: char * img_prefix;
124:
125: char * logfile; /* clf log */
126: HTLog * log;
127: char * reffile; /* referer log */
128: HTLog * ref;
129: char * rejectfile; /* unchecked links */
130: HTLog * reject;
131: char * notfoundfile; /* links that returned 404 */
132: HTLog * notfound;
133: char * connegfile; /* links that were conneg'ed */
134: HTLog * conneg;
135: char * noalttagfile; /* images without alt tags */
136: HTLog * noalttag;
137:
138:
139: char * hitfile; /* links sorted by hit count */
140: char * relfile; /* links sorted by relationship */
141: HTLinkType relation; /* Specific relation to look for */
142: char * titlefile; /* links with titles */
143: char * mtfile; /* media types encountered */
144: char * charsetfile; /* charsets encountered */
145: char * lmfile; /* links sorted by last-modified date */
146:
147: char * outputfile;
148: FILE * output;
149:
150: char * furl; /* First url */
151:
1.8 frystyk 152: MRFlags flags; /* NOTE(review): presumably a bitwise OR of MRFlags values - confirm */
1.1 frystyk 153:
1.8 frystyk 154: int redir_code; /* 0 means all, otherwise 301, 302, 305... */
1.1 frystyk 155:
156: long get_bytes; /* Total number of bytes processed using GET */
157: long get_docs; /* Total number of documents using GET */
158:
159: long head_bytes; /* Total number of bytes processed using HEAD */
160: long head_docs; /* Total number of documents using HEAD */
161:
162: long other_docs;
163:
164: ms_t time; /* Time of run */
165:
166: #ifdef HT_POSIX_REGEX /* Fields below exist only when HT_POSIX_REGEX is defined */
167: regex_t * include;
168: regex_t * exclude;
169: regex_t * check;
170: regex_t * exc_robot; /* Robots.txt exclusion */
171: #endif /* HT_POSIX_REGEX */
172:
173: #ifdef HT_MYSQL /* Fields below exist only when HT_MYSQL is defined */
174: HTSQLLog * sqllog;
175: char * sqlserver;
176: char * sqldb;
177: char * sqluser;
178: char * sqlpw;
179: char * sqlrelative;
180: BOOL sqlexternals;
181: int sqlflags;
182: #endif /* HT_MYSQL */
183:
1.10 ! vbancrof 184: #ifdef HT_SSL /* Fields below exist only when HT_SSL is defined */
! 185: HTSSL_PROTOCOL sslprot;
! 186: int sslverifydepth;
! 187: char * sslcertfile;
! 188: char * sslkeyfile;
! 189: #endif /* HT_SSL */
! 190:
1.1 frystyk 191: } Robot;
192:
193: typedef struct _Finger { /* Groups a Robot, an HTRequest and a destination parent anchor */
194: Robot * robot;
195: HTRequest * request;
196: HTParentAnchor * dest;
197: } Finger;
198:
199: /*
200: ** The HyperDoc object is bound to the anchor and contains information about
201: ** where we are in the search for recursive searches
202: */
1.7 frystyk 203:
204: #define NO_CODE -1
205: #define REDIR_CODE -2
206:
1.1 frystyk 207: typedef struct _HyperDoc {
208: HTParentAnchor * anchor; /* NOTE(review): presumably the anchor this object is bound to - confirm */
209: int depth;
210: int hits;
211: int code; /* NOTE(review): presumably a status code, or NO_CODE / REDIR_CODE - confirm */
212: int index;
213: char * title;
214: HTMethod method;
215: } HyperDoc;
216:
217: /*
218: ** This is the HText object that is created every time we start parsing an
219: ** HTML object
220: */
221: struct _HText {
222: HTRequest * request;
223: BOOL follow; /* NOTE(review): presumably whether links in this document are followed - confirm */
224: };
225:
226: /*
227: ** A structure for calculating metadata distributions
228: */
229: typedef struct _MetaDist {
230: HTAtom * name;
231: int hits; /* NOTE(review): presumably the occurrence count for this name - confirm */
232: } MetaDist;
233:
1.2 frystyk 234: #ifdef HT_POSIX_REGEX
235: PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags);
236: #endif /* HT_POSIX_REGEX */
1.1 frystyk 237:
238: PUBLIC int OutputData(const char * fmt, ...);
239: PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth);
240: PUBLIC BOOL HyperDoc_delete (HyperDoc * hd);
241: PUBLIC Robot * Robot_new (void);
242: PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method);
1.5 frystyk 243: PUBLIC BOOL Robot_registerHTMLParser (void);
1.1 frystyk 244: PUBLIC void Cleanup (Robot * me, int status);
245: PUBLIC void VersionInfo (void);
1.8 frystyk 246:
1.1 frystyk 247: PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
248: void * param, int status) ; /* Same parameter list as the two handlers below */
249:
1.8 frystyk 250: PUBLIC int bfs_terminate_handler (HTRequest * request, HTResponse * response,
251: void * param, int status) ;
252:
253: PUBLIC int redirection_handler (HTRequest * request, HTResponse * response,
254: void * param, int status) ;
1.1 frystyk 255:
256: PUBLIC void Serving_queue(Robot *mr);
257:
258: PUBLIC char *get_robots_txt(char *uri);
259:
260: #endif /* HTROBMAN_H */
261: </PRE>
262: <P>
263: <HR>
264: <ADDRESS>
1.10 ! vbancrof 265: @(#) $Id: HTRobMan.html,v 1.9 1999/03/14 02:21:09 frystyk Exp $
1.1 frystyk 266: </ADDRESS>
267: </BODY></HTML>
Webmaster