File:  [Public] / libwww / Robot / src / HTRobMan.html
Revision 1.5: download - view: text, annotated - select for diffs
Wed Jan 6 15:38:48 1999 UTC (25 years, 5 months ago) by frystyk
Branches: MAIN
CVS tags: HEAD, Before-New-Trace-Messages
A completely rewritten HText interface that makes it a lot easier to use the libwww HTML parser and also doesn't throw away tags or entities that are not defined in the libwww HTML DTD. The new interface is based on registering callbacks so that the application doesn't have to provide the old HText functions that it didn't use. The HTML parser itself has also improved as it now knows about several HTML 4.0 tags. Finally, there are three new sample applications called showtags, showtext, and showlinks which illustrate how to use the new HText interface. Currently the line mode browser hasn't been updated so it doesn't compile for the moment.

<HTML>
<HEAD>
  <TITLE>Webbot - the W3C Mini Robot</TITLE>
</HEAD>
<BODY>
<H1>
  Webbot - the W3C Mini Robot
</H1>
<PRE>
/*
**	(c) COPYRIGHT MIT 1995.
**	Please first read the full copyright statement in the file COPYRIGH.
*/
</PRE>
<P>
This program illustrates how to traverse links using the Anchor object.
<PRE>
#ifndef HTROBMAN_H
#define HTROBMAN_H

#include "WWWLib.h"			      /* Global Library Include file */
#include "WWWApp.h"				        /* Application stuff */
#include "WWWTrans.h"
#include "WWWInit.h"
#include "WWWSQL.h"

#include "HText.h"

#include "HTRobot.h"			     		 /* Implemented here */

/*
**  Compile-time configuration for the robot: application identity, default
**  file names, default SQL connection settings and timing constants.
*/

/* Fall back to a placeholder when the build does not supply W3C_VERSION */
#ifndef W3C_VERSION
#define W3C_VERSION 		"Unspecified"
#endif

/* Application identity; APP_VERSION simply mirrors W3C_VERSION */
#define APP_NAME		"W3CRobot"
#define APP_VERSION		W3C_VERSION
#define COMMAND_LINE		"http://www.w3.org/Robot/User/CommandLine"
#define ROBOTS_TXT              "/robots.txt"

/* Default file names; the log-*.txt names parallel the *file members of
** the Robot structure (NOTE(review): confirm the mapping in the .c file) */
#define DEFAULT_OUTPUT_FILE	"robot.out"
#define DEFAULT_RULE_FILE	"robot.conf"
#define DEFAULT_LOG_FILE       	"log-clf.txt"
#define DEFAULT_HIT_FILE       	"log-hit.txt"
#define DEFAULT_REL_FILE      	"log-rel.txt"
#define DEFAULT_LM_FILE       	"log-lastmodified.txt"
#define DEFAULT_TITLE_FILE     	"log-title.txt"
#define DEFAULT_REFERER_FILE   	"log-referer.txt"
#define DEFAULT_REJECT_FILE   	"log-reject.txt"
#define DEFAULT_NOTFOUND_FILE  	"log-notfound.txt"
#define DEFAULT_CONNEG_FILE  	"log-conneg.txt"
#define DEFAULT_NOALTTAG_FILE  	"log-alt.txt"
#define DEFAULT_FORMAT_FILE  	"log-format.txt"
#define DEFAULT_CHARSET_FILE  	"log-charset.txt"
#define DEFAULT_MEMLOG		"robot.mem"
/* Both prefixes default to the empty string */
#define DEFAULT_PREFIX		""
#define DEFAULT_IMG_PREFIX	""
#define DEFAULT_DEPTH		0
#define DEFAULT_DELAY		50			/* Write delay in ms */

/* Default SQL connection settings; the password defaults to empty */
#define DEFAULT_SQL_SERVER	"localhost"
#define DEFAULT_SQL_DB		"webbot"
#define DEFAULT_SQL_USER	"webbot"
#define DEFAULT_SQL_PW		""

/* HT_MEMLOG is compiled out here; change the 0 to 1 to define it */
#if 0
#define HT_MEMLOG		/* Is expensive in performance! */
#endif

/* NOTE(review): MILLIES is presumably the ms-per-second factor used to
** convert DEFAULT_TIMEOUT - confirm against its users */
#define MILLIES			1000
#define DEFAULT_TIMEOUT		20		          /* timeout in secs */

/* CATCH_SIG is only defined when the compiler defines __svr4__ */
#if defined(__svr4__)
#define CATCH_SIG
#endif

/*
**  Option flags for the robot. Every enumerator occupies its own bit, so
**  any combination can be OR-ed together into a single MRFlags value.
**  The values are written as zero-padded hex to make the bit positions
**  line up visually.
*/
typedef enum _MRFlags {
    MR_IMG           = 0x0001,
    MR_LINK          = 0x0002,
    MR_PREEMPTIVE    = 0x0004,
    MR_TIME          = 0x0008,
    MR_SAVE          = 0x0010,
    MR_QUIET         = 0x0020,
    MR_REAL_QUIET    = 0x0040,
    MR_VALIDATE      = 0x0080,
    MR_END_VALIDATE  = 0x0100,
    MR_KEEP_META     = 0x0200,
    MR_LOGGING       = 0x0400,
    MR_DISTRIBUTIONS = 0x0800,
    MR_NOROBOTSTXT   = 0x1000,
    MR_NOMETATAGS    = 0x2000,
    MR_BFS           = 0x4000
} MRFlags;

/*
**  The Robot object collects the robot's state in one place: traversal
**  counters, lists of helper objects, the request queue, log file names
**  together with their HTLog handles, option flags and transfer statistics.
**  The regex members exist only when HT_POSIX_REGEX is defined and the SQL
**  members only when HT_MYSQL is defined.
*/
typedef struct _Robot {
    int			depth;			     /* How deep is our tree */
    int                 ndoc;		/* NOTE(review): presumably a document count - confirm */
    int                *cdepth;                /* Number of nodes per level */
    int			cnt;				/* Count of requests */
    int                 cindex;         /* Number assigned to each document */

    HTList *		hyperdoc;	     /* List of our HyperDoc Objects */
    HTList *		htext;			/* List of our HText Objects */
    HTList *		fingers;		/* NOTE(review): presumably list of Finger objects - confirm */

    HTList *            queue;                  /* Queue */
    int                 cq;		/* NOTE(review): presumably a queue counter - confirm */

    int 		timer;		/* NOTE(review): units not visible here - confirm */
    int 		waits;		/* NOTE(review): meaning not visible here - confirm */

    char *		cwd;			/* Current dir URL */
    char *		rules;		/* NOTE(review): presumably rule file name, see DEFAULT_RULE_FILE - confirm */
    char *		prefix;
    char *		img_prefix;

    /* Each log below is a file name paired with its HTLog handle */
    char *		logfile;		/* clf log */
    HTLog *             log;
    char *		reffile;		/* referer log */
    HTLog *             ref;
    char *		rejectfile;		/* unchecked links */
    HTLog *	        reject;
    char *		notfoundfile;		/* links that returned 404 */
    HTLog *	        notfound;
    char *		connegfile;		/* links that were conneg'ed */
    HTLog *	        conneg;
    char *		noalttagfile;		/* images without alt tags */
    HTLog *	        noalttag;


    /* The following outputs have a file name only, no HTLog handle */
    char *		hitfile;		/* links sorted after hit counts */
    char *		relfile;		/* link sorted after relationships */
    HTLinkType		relation;		/* Specific relation to look for */
    char *		titlefile;		/* links with titles */
    char *		mtfile;			/* media types encountered */
    char *		charsetfile;		/* charsets encountered */
    char *		lmfile;			/* sorted after last modified dates */

    char *		outputfile;		/* NOTE(review): presumably name of the file behind output - confirm */
    FILE *	        output;

    char *              furl;                              /* First url */


    MRFlags		flags;		/* Combination of MRFlags bits */

    long		get_bytes;	/* Total number of bytes processed using GET */
    long                get_docs;     	/* Total number of documents using GET */

    long		head_bytes;	/* Total number of bytes processed using HEAD */
    long                head_docs;   	/* Total number of documents using HEAD */

    long		other_docs;	/* NOTE(review): presumably documents fetched with other methods - confirm */

    ms_t		time;		/* Time of run */

#ifdef HT_POSIX_REGEX
    regex_t *		include;
    regex_t *		exclude;
    regex_t *		check;
    regex_t *		exc_robot;     /* Robots.txt exclusion */
#endif

#ifdef HT_MYSQL
    /* SQL logging state; see the DEFAULT_SQL_* macros for default values */
    HTSQLLog *		sqllog;
    char *		sqlserver;
    char *		sqldb;
    char *		sqluser;
    char *		sqlpw;
    char *		sqlrelative;
    BOOL		sqlexternals;
    int			sqlflags;
#endif

} Robot;

/*
**  A Finger ties together the owning Robot, one HTRequest and the
**  destination anchor of that request. Instances are created with
**  Finger_new().
*/
typedef struct _Finger {
    Robot * robot;			/* Robot this finger belongs to */
    HTRequest * request;		/* Request associated with this finger */
    HTParentAnchor * dest;		/* Destination anchor */
} Finger;

/*
**  The HyperDoc object is bound to the anchor and contains information about
**  where we are in the search for recursive searches
*/
typedef struct _HyperDoc {
    HTParentAnchor * 	anchor;		/* Anchor this object is bound to */
    int			depth;		/* Depth value passed to HyperDoc_new() */
    int                 hits;		/* NOTE(review): presumably a hit count, see hitfile in Robot - confirm */
    int                 code;		/* NOTE(review): presumably a status code - confirm */
    int                 index;		/* NOTE(review): presumably taken from Robot cindex - confirm */
    char *              title;		/* NOTE(review): presumably the document title - confirm ownership */
    HTMethod            method;		/* NOTE(review): presumably the method used for the request - confirm */
} HyperDoc;

/*
** This is the HText object that is created every time we start parsing an 
** HTML object
*/
struct _HText {
    HTRequest *		request;	/* Request associated with this HText object */
    BOOL		follow;		/* NOTE(review): presumably whether links are followed - confirm */
};

/*
**  A structure for calculating metadata distributions
*/
typedef struct _MetaDist {
    HTAtom *		name;		/* Atom identifying the entry being counted */
    int			hits;		/* NOTE(review): presumably how often name was seen - confirm */
} MetaDist;
 
/*
**  Public functions of the robot. The definitions are not part of this
**  header, so the notes below describe the signatures only.
*/

/* Only declared when HT_POSIX_REGEX is defined; cflags is presumably
** handed on to regcomp() - TODO confirm in the implementation */
#ifdef HT_POSIX_REGEX
PUBLIC regex_t * get_regtype (Robot * mr, const char * regex_str, int cflags);
#endif

/* printf-style variadic output function taking a format string */
PUBLIC int OutputData(const char  * fmt, ...);

/* Creation and deletion of the HyperDoc, Robot and Finger objects */
PUBLIC HyperDoc * HyperDoc_new (Robot * mr,HTParentAnchor * anchor, int depth);
PUBLIC BOOL HyperDoc_delete (HyperDoc * hd);
PUBLIC Robot * Robot_new (void);
PUBLIC Finger * Finger_new (Robot * robot, HTParentAnchor * dest, HTMethod method);

PUBLIC BOOL Robot_registerHTMLParser (void);
PUBLIC void Cleanup (Robot * me, int status);
PUBLIC void SetSignal (void);
PUBLIC void VersionInfo (void);

/* The two terminate handlers share one signature: request, response, an
** opaque parameter and a status code */
PUBLIC int terminate_handler (HTRequest * request, HTResponse * response,
			       void * param, int status) ;

PUBLIC int my_terminate_handler (HTRequest * request, HTResponse * response,
			       void * param, int status) ;

PUBLIC void Serving_queue(Robot *mr);

/* NOTE(review): ownership of the returned string is not visible here -
** confirm whether the caller must free it */
PUBLIC char *get_robots_txt(char *uri);

#endif
</PRE>
<P>
  <HR>
<ADDRESS>
  @(#) $Id: HTRobMan.html,v 1.5 1999/01/06 15:38:48 frystyk Exp $
</ADDRESS>
</BODY></HTML>

Webmaster