Annotation of libwww/Robot/src/HTRobMan.html, revision 1.1
1.1 ! frystyk 1: <HTML>
! 2: <HEAD>
! 3: <TITLE>Webbot - the W3C Mini Robot</TITLE>
! 4: </HEAD>
! 5: <BODY>
! 6: <H1>
! 7: Webbot - the W3C Mini Robot
! 8: </H1>
! 9: <PRE>
! 10: /*
! 11: ** (c) COPYRIGHT MIT 1995.
! 12: ** Please first read the full copyright statement in the file COPYRIGH.
! 13: */
! 14: </PRE>
! 15: <P>
! 16: This program illustrates how to traverse links using the Anchor object.
! 17: <PRE>
! 18: #ifndef HTROBMAN_H
! 19: #define HTROBMAN_H
! 20:
! 21: #include "WWWLib.h" /* Global Library Include file */
! 22: #include "WWWApp.h" /* Application stuff */
! 23: #include "WWWTrans.h"
! 24: #include "WWWInit.h"
! 25: #include "WWWSQL.h"
! 26:
! 27: #include "HText.h"
! 28:
! 29: #include "HTRobot.h" /* Implemented here */
! 30:
! 31: #ifndef W3C_VERSION /* Fallback used when the build does not supply W3C_VERSION */
! 32: #define W3C_VERSION "Unspecified"
! 33: #endif
! 34:
! 35: #define APP_NAME "W3CRobot"
! 36: #define APP_VERSION W3C_VERSION /* Same string as W3C_VERSION */
! 37: #define COMMAND_LINE "http://www.w3.org/Robot/User/CommandLine"
! 38: #define ROBOTS_TXT "/robots.txt"
! 39:
! 40: #define DEFAULT_OUTPUT_FILE "robot.out"
! 41: #define DEFAULT_RULE_FILE "robot.conf"
! 42: #define DEFAULT_LOG_FILE "log-clf.txt"
! 43: #define DEFAULT_HIT_FILE "log-hit.txt"
! 44: #define DEFAULT_REL_FILE "log-rel.txt"
! 45: #define DEFAULT_LM_FILE "log-lastmodified.txt"
! 46: #define DEFAULT_TITLE_FILE "log-title.txt"
! 47: #define DEFAULT_REFERER_FILE "log-referer.txt"
! 48: #define DEFAULT_REJECT_FILE "log-reject.txt"
! 49: #define DEFAULT_NOTFOUND_FILE "log-notfound.txt"
! 50: #define DEFAULT_CONNEG_FILE "log-conneg.txt"
! 51: #define DEFAULT_NOALTTAG_FILE "log-alt.txt"
! 52: #define DEFAULT_FORMAT_FILE "log-format.txt"
! 53: #define DEFAULT_CHARSET_FILE "log-charset.txt"
! 54: #define DEFAULT_MEMLOG "robot.mem"
! 55: #define DEFAULT_PREFIX ""
! 56: #define DEFAULT_IMG_PREFIX ""
! 57: #define DEFAULT_DEPTH 0
! 58: #define DEFAULT_DELAY 50 /* Write delay in ms */
! 59:
! 60: #define DEFAULT_SQL_SERVER "localhost"
! 61: #define DEFAULT_SQL_DB "webbot"
! 62: #define DEFAULT_SQL_USER "webbot"
! 63: #define DEFAULT_SQL_PW ""
! 64:
! 65: #if 0 /* Disabled block: HT_MEMLOG is not defined unless this is changed */
! 66: #define HT_MEMLOG /* Is expensive in performance! */
! 67: #endif
! 68:
! 69: #define MILLIES 1000 /* NOTE(review): presumably milliseconds per second - confirm at the use sites */
! 70: #define DEFAULT_TIMEOUT 20 /* timeout in secs */
! 71:
! 72: #if defined(__svr4__) /* CATCH_SIG is defined only when __svr4__ is defined */
! 73: #define CATCH_SIG
! 74: #endif
! 75:
! 76: typedef enum _MRFlags { /* Type of Robot.flags; every enumerator is a distinct single-bit value */
! 77: MR_IMG = 0x1,
! 78: MR_LINK = 0x2,
! 79: MR_PREEMPTIVE = 0x4,
! 80: MR_TIME = 0x8,
! 81: MR_SAVE = 0x10,
! 82: MR_QUIET = 0x20,
! 83: MR_REAL_QUIET = 0x40,
! 84: MR_VALIDATE = 0x80,
! 85: MR_END_VALIDATE = 0x100,
! 86: MR_KEEP_META = 0x200,
! 87: MR_LOGGING = 0x400,
! 88: MR_DISTRIBUTIONS = 0x800,
! 89: MR_NOROBOTSTXT = 0x1000 /* Highest bit currently assigned */
! 90: } MRFlags;
! 91:
! 92: typedef struct _Robot { /* Robot state: counters, object lists, log files, options and statistics */
! 93: int depth; /* How deep is our tree */
! 94: int ndoc;
! 95: int *cdepth; /* Number of nodes per level */
! 96: int cnt; /* Count of requests */
! 97: int cindex; /* Number assigned to each document */
! 98:
! 99: HTList * hyperdoc; /* List of our HyperDoc Objects */
! 100: HTList * htext; /* List of our HText Objects */
! 101: HTList * fingers; /* NOTE(review): presumably the list of Finger objects - confirm */
! 102:
! 103: HTList * queue; /* Queue */
! 104: int cq;
! 105:
! 106: int timer;
! 107: int waits;
! 108:
! 109: char * cwd; /* Current dir URL */
! 110: char * rules;
! 111: char * prefix;
! 112: char * img_prefix;
! 113:
! 114: char * logfile; /* clf log */
! 115: HTLog * log;
! 116: char * reffile; /* referer log */
! 117: HTLog * ref;
! 118: char * rejectfile; /* unchecked links */
! 119: HTLog * reject;
! 120: char * notfoundfile; /* links that returned 404 */
! 121: HTLog * notfound;
! 122: char * connegfile; /* links that were conneg'ed */
! 123: HTLog * conneg;
! 124: char * noalttagfile; /* images without alt tags */
! 125: HTLog * noalttag;
! 126:
! 127:
! 128: char * hitfile; /* links sorted by hit count */
! 129: char * relfile; /* links sorted by relationship */
! 130: HTLinkType relation; /* Specific relation to look for */
! 131: char * titlefile; /* links with titles */
! 132: char * mtfile; /* media types encountered */
! 133: char * charsetfile; /* charsets encountered */
! 134: char * lmfile; /* sorted by last-modified date */
! 135:
! 136: char * outputfile;
! 137: FILE * output;
! 138:
! 139: char * furl; /* First url */
! 140:
! 141:
! 142: MRFlags flags; /* MRFlags bit values */
! 143:
! 144: long get_bytes; /* Total number of bytes processed using GET */
! 145: long get_docs; /* Total number of documents using GET */
! 146:
! 147: long head_bytes; /* Total number of bytes processed using HEAD */
! 148: long head_docs; /* Total number of documents using HEAD */
! 149:
! 150: long other_docs;
! 151:
! 152: ms_t time; /* Time of run */
! 153:
! 154: #ifdef HT_POSIX_REGEX /* The regex members exist only when HT_POSIX_REGEX is defined */
! 155: regex_t * include;
! 156: regex_t * exclude;
! 157: regex_t * check;
! 158: regex_t * exc_robot; /* Robots.txt exclusion */
! 159: #endif
! 160:
! 161: #ifdef HT_MYSQL /* The SQL members exist only when HT_MYSQL is defined */
! 162: HTSQLLog * sqllog;
! 163: char * sqlserver;
! 164: char * sqldb;
! 165: char * sqluser;
! 166: char * sqlpw;
! 167: char * sqlrelative;
! 168: BOOL sqlexternals;
! 169: int sqlflags;
! 170: #endif
! 171:
! 172: } Robot;
! 173:
! 174: typedef struct _Finger { /* Groups a Robot, a request and a destination anchor */
! 175: Robot * robot;
! 176: HTRequest * request;
! 177: HTParentAnchor * dest; /* Destination anchor */
! 178: } Finger;
! 179:
! 180: /*
! 181: ** The HyperDoc object is bound to the anchor and contains information about
! 182: ** where we are in the search for recursive searches
! 183: */
! 184: typedef struct _HyperDoc {
! 185: HTParentAnchor * anchor; /* Anchor this object is bound to */
! 186: int depth; /* NOTE(review): presumably the depth given to HyperDoc_new() - confirm */
! 187: int hits;
! 188: int code;
! 189: int index; /* NOTE(review): possibly the number taken from Robot.cindex - confirm */
! 190: char * title;
! 191: HTMethod method;
! 192: } HyperDoc;
! 193:
! 194: /*
! 195: ** This is the HText object that is created every time we start parsing an
! 196: ** HTML object
! 197: */
! 198: struct _HText { /* No typedef here; NOTE(review): presumably the HText name comes from HText.h - confirm */
! 199: HTRequest * request;
! 200: BOOL follow;
! 201: };
! 202:
! 203: /*
! 204: ** A structure for calculating metadata distributions
! 205: */
! 206: typedef struct _MetaDist {
! 207: HTAtom * name;
! 208: int hits; /* NOTE(review): presumably an occurrence count for name - confirm */
! 209: } MetaDist;
! 210:
! 211:
! 212: PUBLIC int OutputData(const char * fmt, ...); /* Variadic: a format string followed by further arguments */
! 213: PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth);
! 214: PUBLIC BOOL HyperDoc_delete (HyperDoc * hd);
! 215: PUBLIC Robot * Robot_new (void);
! 216: PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method);
! 217: PUBLIC void Cleanup (Robot * me, int status);
! 218: PUBLIC void SetSignal (void);
! 219: PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags); /* NOTE(review): regex_t is used unguarded here, while the regex_t members of struct _Robot are under #ifdef HT_POSIX_REGEX - confirm this builds without HT_POSIX_REGEX */
! 220: PUBLIC void VersionInfo (void);
! 221: PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
! 222: void * param, int status) ;
! 223:
! 224: PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,
! 225: void * param, int status) ; /* Same parameter list as terminate_handler */
! 226:
! 227: PUBLIC void Serving_queue(Robot *mr);
! 228:
! 229: PUBLIC char *get_robots_txt(char *uri);
! 230:
! 231: #endif
! 232: </PRE>
! 233: <P>
! 234: <HR>
! 235: <ADDRESS>
! 236: @(#) $Id: HTAccess.html,v 2.87 1998/09/30 21:41:51 frystyk Exp $
! 237: </ADDRESS>
! 238: </BODY></HTML>
Webmaster