#!/usr/local/bin/perl5 -w
######################################
# update-archive
# Author: Jose Kahan <kahan@w3.org> 
#####################################
# Description 
#############################################
# Script for transforming a smartlist email
# archive into an HTML archive. Features:
# * Automatic division of the HTML archive
#   into periods, whose size can be chosen
#   by the user
# * Automatic generation of a script for
#   invoking update-period (the hypermail
#   front end)
# * Automatic generation of an HTML master index
#   for the archive
######################################
# Assumptions: All mails are stored in the
# same directory, one mail per file. 
# Moreover, each file is associated with
# a number corresponding to the order of
# reception.
######################################
# History:
# 8/Mar/97: Genesis from finddates
# Mar-June 97: Different corrections, updates
# 6 June 97: Merged this script with the old cronjob one
# 11 June 97: Added the smartlist $ARCHIVE feature
# 13 June 97: Started adding recovery conditions
# 16 June 97: Changed the condition for detecting the date in
#          the received file
# 20 June 97: unlinking of old catalog files in regen option
# June 98: Made the parser independent of the format of the 
#          first Received header (used to sort out dates)
# July 98: Legacy upgrade: if the year in the Received header
#          is less than 1900, I add 1900 to it (welcome y2k bug)
# Sept 99: General rewrite of the script
#          Rewrote the first Received: parser (it was broken)
#          Added a search form
#          Upgraded the generated HTML
#          Added a /latest shortcut (using Apache's asis stuff)
# Dec 99:  Changed the master index format to use tables
#          Added a version number for having an automatic regeneration if we
#          change versions
#          Added a protection (hopefully) against the n + 1 problem
#          Added code to automatically set an rc.lock when the script
#          is called by hand
#          Changed the way we get the filenames from an MH directory. 
#          The other one was broken when the directory contained other
#          filenames that weren't digits exclusively.
#          
# to be done ... Add a received: header to msgs that don't have it!
# or get the date from the From header and use it if we don't find
# a received header
# ... write the latest.asis file only when there's a period change
# Add a protection so that if we receive a message previous to the
# current one, we'll add it to the current period, rather than saying
# that the date wasn't found
#######################################################
#
# USAGE:
# update-archive options
#
# OPTIONS:
#
# -list name pf the mailing list
# -addone increase archive by one mail
# -latestmail filename of the latest mail (generated by Smartlist)
# -quiet don't output progress messages
# -debug create a master index, but don't regenerate the index file
# -recover recover the archive from a server crash/shutdown
######################################

############################# 
# Pre-initializationon      #
#############################

BEGIN {
    require 5.005;
    use lib '/home/lists/.bin/';

    ###############################
    # Global custom configuration #
    ###############################
    use hm_config;

    #############################
    ## External modules        ##
    #############################
    # our ad-hoc lock module
    use hm_lock;
}


#############################
# Global variables         ##
#############################

## The version number of the script
## If we change it, archives generated under older versions will be
## force-regenerated
$hm_version = "1";

## The current "archive" version of hypermess used to generate the
##  archives up to now.
$hm_old_version = -1;

## the desc of the list: "www-html@w3.org JanFeb96"
$verbose_archive_name = "";
## link to the Master Index 

## has member, team, or public value, which allows us to 
## connect to a specific search engine
$archive_type = undef;

# set to 1 if we managed to set a lock when running manually
$rc_lock_status = 0;
$archive_lock_status = 0;

$progress = 1;
$debug = 0;
#add_one = 0;
$latest_mail = "";

$index_header = "";
$index_footer = "";
$author_email = "";

$autoscript = "";
$list_dir =  ""; 
$mail_dir = "";
$archive_dir = "";
$mainindex_css = "";
$messagelist_css = "";
$message_css = "";

$first_date = ""; #"Jan 1997";
$last_date = "";
$list_name = undef;
$marchive_name = "default";
$add_one = 0;
$mpperiod = 0;
$max_period = 0;
$recover_archive = 0;
$use_lock = 1;

$last_catalog_entry = undef;
$recover_catalog_entry = undef;

$first_index = -1;

@MONTHS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep",
           "Oct", "Nov", "Dec");

%MONTHS = ("Jan", 0,
	   "Feb", 1,
	   "Mar", 2,
	   "Apr", 3,
	   "May", 4,
	   "Jun", 5,
	   "Jul", 6,
	   "Aug", 7,
	   "Sep", 8,
	   "Oct", 9,
	   "Nov", 10,
	   "Dec", 11);

%VERBOSE_MONTHS = ("Jan", "January",
		"Feb", "February",
		"Mar", "March",
		"Apr", "April",
		"May", "May",
		"Jun", "June",
		"Jul", "July",
		"Aug", "August",
		"Sep", "September", 
		"Oct", "October", 
		"Nov", "November",
		"Dec", "December");

## lock cleanup file

END {
    if ($list_dir ne "") {
	if ($rc_lock_status) {
	    lock_clear ("$list_dir/$lock_file");
	    $rc_lock_status = 0;
	}
	if ($archive_lock_status) {
	    lock_clear ("$list_dir/$archive_lock_file");
	    $archive_lock_status = 0;
	}

    }
}

##############################
### Main Program           ###
##############################

{

###
## for debug
#    open (DEBUG, ">>/tmp/hm_debug");

## Configure the script and choose the rebuild mode

    &Configure;

# Build an index of date and filenames corresponding to the beginning of 
# the period

    if ($add_one) {
	&build_indexes_one;
    } else {
	
	if ($recover_archive) {
	    &build_indexes_recover_archive;
	} else {
	    &build_indexes_regen;
	}

        # create the script which will call update-period
	&make_autoscript;

        # print an on-screen report
	if ($progress) {
	    &write_report;
	}
    }

    # write the CATALOG file
    &write_catalog_file;

    # write the master index
    &write_master_index;

    # write the redirection info for latest
    #@@ I could use_period_change here and avoid writing this file unless
    # necessary
    &write_latest_redirection;

    # add the message (or list of messages) to the archive
    if ($add_one) {
	&callHypermail;
    } else {
	if ($debug == 0) {
	    system ($autoscript);
	}
    }

    # update the version info if it changed
    if ($hm_version ne $hm_old_version) {
	write_hm_version ();
    }

    &ResetUpdatingFlag;

## DEBUG
#    close (DEBUG);

    exit (0);
}

##########
# END of MAIN program
##########

#################
## SUBROUTINES
#################

#########################
## Common subroutinres ##
#########################

#
# sort subroutine
#

sub by_number {
        $a <=> $b;

} #by_number

#
# increases a MMM YYYY Structure
#

sub increase_date {
    $current_month_index = ($current_month_index + 1 ) % 12;
    if ($current_month_index == 0) {
	# previous month was Dec, increase the tmp year;
	$current_year++;
    } 
    $current_month = $MONTHS[$current_month_index];
    $current_date = $current_month . " " . $current_year;

} #increase_date

#
# increases a MMM YYYY Structure, while maintaining the corresponding indexes
#

sub increase_date_current_index {

    $current_month_index = ($current_month_index + 1 ) % 12;

    if ($PERIOD{$MONTHS[$current_month_index]} != $current_period_index) {
	# period has changed
	$period_change = 1;
	$tmp = join (":", $mpperiod,
		     $current_year,
		     $VERBOSE_PERIOD[$current_period_index],
		     $current_year. $PERIOD[$current_period_index],
		     $first_file,
		     $last_file);
	
	push (@date_list, $tmp);
	$current_period_index = ($current_period_index + 1 ) % $max_period;


	if ($current_month_index == 0) {
	    # previous month was Dec, increase the tmp year;
	    $current_year++;
	} 

	$first_file = $last_file = "";
    }

    $current_month = $MONTHS[$current_month_index];
    $current_date = $current_month . " " . $current_year;

} ##increase_date_current_index

#
# Read the file names
# on exit:  $totalfiles = number of files read
#           @fileNames = all the filenames, sorted by numerical order


sub ReadFileNames {
    opendir (SOURCEDIR, "$mail_dir") or
	die ("Could not read dir $mail_dir: $!");

# The following line works only under Perl5 or more recent
    #@fileNames = grep -T, map "$mail_dir/$_", readdir SOURCEDIR;
    @fileNames = grep (/^\d*$/, readdir (SOURCEDIR));
    closedir (SOURCEDIR);
    
    $totalfiles = $#fileNames;

    #### either there was no file or the . file ??
    #### verify this

    if ($totalfiles == -1) {
	die ("Could not read dir $mail_dir (maybe it's empty): $!");
    }

    # Sort the file names by numerical order

    if ($totalfiles > 0) {
	@fileNames = sort by_number @fileNames;
    }

} ## ReadFileNames

sub write_report {

    my ($last_file);
    my ($first_file);
    my ($last_entry);
    my ($number_mails);

# we use the following for outputing verbose info when
# the script is manually executed

format top1 =
Proposed mail periods for @*
                          $list_name
using a period size of ^||||| months per period.
                               $mpperiod
                                                        Number                 
Year       Period            First File   Last File   of mails
----  --------------------   ----------   ---------   --------

.

format line1 =
@<<<<  @<<<<<<<<<<<<<<<<<<<  @>>>>>>      @>>>>>>     @>>>>>         
$current_year, $verbose_archive_name,  $first_file,   $last_file,  $number_mails
.

    $first_file = "";
    $last_entry = $#date_list + 1;
    
    $^ = top1;
    $~ = line1;

    for ($i = 0; $i < $last_entry; $i++) {
	($m_months, $current_year, $verbose_archive_name, $archive_name, $first_file, $last_file) = split (':', $date_list[$i]);

	if ($first_file eq "") {
	    next;
	}

	$number_mails = $last_file - $first_file + 1;

	write;
    }

} # write_report

#
# write a catalog reflecting the master index to a file
#

sub write_catalog_file {
	
    my ($last_entry);

    ### Make a backup copy of the catalog

    if ($add_one == 1) {
	BackupFile ("$list_dir/$archive_status/",
		    "catalog");
    }

    $last_entry = $#date_list + 1;
     
    open (OUT, ">$list_dir/$archive_status/catalog") or
	die "could not open file $list_dir/$archive_status/catalog: $!";

    print OUT "###########################################\n";
    print OUT "## Generated by update-archive on ", scalar localtime, "\n";
    print OUT "## DO NOT MODIFY BY HAND\n";
    print OUT "###########################################\n";
    print OUT "## mpperiod:year:master_index_name:directory_name:first_mail_last_mail\n";
    print OUT "###########################################\n";


    for ($i = 0; $i < $last_entry; $i++) {
	($m_months, $current_year, $verbose_archive_name, $archive_name, $first_file, $last_file) = split (':', $date_list[$i]);

	if ($first_file eq "") { # dont' write empty periods
	    next;
	}
	print OUT $date_list[$i], "\n";
    }

    close (OUT);

} # write_catalog_file


#
# write the master index
#

sub write_master_index {
    my ($old_umask);
    my ($first_file);
    my ($last_file);
    my ($last_entry);
    my ($last_year);

    $first_file = "";

    # save the current umask
    $old_umask = umask;
    umask (02);

    open (OUT, ">$list_dir/Archive/$master_index_name") 
	or die "could not open file $list_dir/Archive/$master_index_name: $!";

    # write the header
## add this if I want a whie bg
## <body bgcolor="#FFFFFF">
    print OUT <<END;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
                      "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>    
<title>$marchive_name Mail Archives</title>
<LINK REL="Stylesheet" HREF="$mainindex_css">
</head>
<body>

<a href="http://www.w3.org/"><img alt="W3C"
src="http://www.w3.org/Icons/WWW/w3c_home" border="0"></a>

<h1>$marchive_name Mail Archives</h1>
<hr>
END

    # write the search engine form if there really is a "standard" archive
    if (defined $archive_type) {
	print OUT <<END;
<form method="GET" 
action="http://search.w3.org/$archive_type/cgi-bin/query">
<input type="hidden" name="mss" value="simple">
<input type="hidden" name="pg" value="q">
<input type="hidden" name="what" value="web">
<input type="hidden" name="filter" value="lists">
<input type="hidden" name="fmt" value=".">
<input name="q" size="40" maxlength=800 value="+$list_name "> 
<input type="submit" name="search" value="Search">
<a href="http://search.w3.org/$archive_type/cgi-bin/query?mss=advanced&amp;pg=aq">
Advanced Search</a>
<a href="http://search.w3.org/$archive_type/tmpl/default/help_query_body.html#general">
Help</a>
</form>
END
    }

# insert the archive description (if it exists)
    if (-e $index_header and open (DESC, "<$index_header")) {
	while (<DESC>) {
	    print OUT $_;
	}
	close (DESC);
    }
    
# write the archive period index 

    # we first write the table header

    print OUT "<table border=\"0\" cellspacing=\"3\" cellpadding=\"2\">\n";
    # give the number of columns
    print OUT "<colgroup span=\"6\"></colgroup>\n";
    print OUT "<thead>\n";
    print OUT "<tr>"
	. "<th align=\"center\"><em>period</em></td>"
	. "<th align=\"center\"colspan=\"4\"><em>sorted by</em></td>"
	. "<th align=\"right\"><em>messages</em></td>"
        . "</tr>\n";
    print OUT "</thead>\n";
    print OUT "<tbody>\n";

    # now we write every entry in the index
    
    $current_year = -1;
    $last_year = -1;
    $last_entry = $#date_list;
    for ($i = $last_entry;  $i > -1; $i--) {

        ($m_months, $current_year, $verbose_archive_name,
         $archive_name, $first_file, $last_file) =
             split (':', $date_list[$i]);
 
        if ($first_file eq "") { # don't write empty periods
            next;
        }

	# skip a line if the year has changed
	if ($last_year != -1 && $last_year != $current_year) {
	    printf OUT "<tr><td colspan=\"6\"></td></tr>\n"
	}
	$last_year = $current_year;

        $num_mails = $last_file - $first_file + 1;
 
        printf OUT "<tr>"
	    . "<th align=\"right\"><strong>%s %s</strong></th>"
         . "<td><a href=\"%s\">date</a></td>"
         . "<td><a href=\"%s\">thread</a></td>"
         . "<td><a href=\"%s\">author</a></td>"
         . "<td><a href=\"%s\">subject</a></td>"
         . "<td align=\"center\">%s</td>"
         . "</tr>\n",
             $verbose_archive_name,
             $current_year,
             "$archive_name/",
             "$archive_name/thread.html",
             "$archive_name/author.html",
             "$archive_name/subject.html",
             $num_mails;
    }
    print OUT "</tbody>\n";
    print OUT "</table>\n";

    # write the user's custom footer if it exists

    if (-e $index_footer and open (DESC, "<$index_footer")) {
        while (<DESC>) {
            print OUT $_;
        }
        close (DESC);
    }

    # write the footer
    print OUT <<END;
<p></p>
<hr>
<address>
END

    if (-e $author_email and open (DESC, "<$author_email")) {
	while (<DESC>) {
	    chomp ($_);
	    print OUT $_;
	}
	close (DESC);
    }
    print OUT "<br>\n";
    print OUT "<a href=\"http://www.w3.org/Help/Webmaster\">Webmaster</A><br>\n";
    print OUT  "Last update on: ", scalar localtime, "\n";
    print OUT <<END;
</address>
</body></html>
END

    close (OUT);
    # restore the old umask
    umask ($old_umask);

} # write_master_index

#
# Creates a local apache conf file that redirect the /latest request
# to the URL of the most recent period
#

sub write_latest_redirection {
    my $old_umask;
    my $last_entry;
    my $archive_name;

    $last_entry = $#date_list;
    $archive_name = (split (':', $date_list[$last_entry]))[3];

    # save the current umask
    $old_umask = umask;
    umask (02);

    if (defined $archive_type 
	and open (OUT, ">$list_dir/Archive/$latest_redirection_file")) {
	print OUT "Status: 302 Found\n";
	printf  OUT $latest_redirection_format, $archive_type, $list_name,
	$archive_name;
	close (OUT);
    }
    umask ($old_umask);

} ## write_latest_redirection


## Read the command line options
sub getOpts {
    local($_);

    while (@ARGV && ($_ = $ARGV[0]) =~ /^-(.)(.*)/) {
        $curr = shift(@ARGV);
	if ($curr eq "-addone") {
	    $add_one = 1;
	} elsif ($curr eq "-first") {
	    $first_date = shift(@ARGV);	    
	    $first_date = $first_date . " " . shift (@ARGV);
	} elsif ($curr eq "-last") {
	    $last_date = shift(@ARGV);	    
	    $last_date = $last_date . " " . shift (@ARGV);
	} elsif ($curr eq "-list") {
	    $list_name = shift(@ARGV);
	} elsif ($curr eq "-quiet") {
	    $progress = 0;
	} elsif ($curr eq "-debug") {
	    $debug = 1;
	} elsif ($curr eq "-latestmail") {
	    $latest_mail = shift(@ARGV);
	} elsif ($curr eq "-recover") {
	    $recover_archive = 1;
        } elsif ($curr eq "-archive_name") {
            $marchive_name = shift(@ARGV);
        } elsif ($curr eq "-no_lock") {
            $use_lock = 0;
        }

    }

} ## getOpts


# read_last_archived_mail
# Returns the filename of the last (sequentially) stored email
# We suppose nothing is erased in the archives
#

sub read_last_archived_mail {
    my ($last_archived_mail);
    my ($lastline);

    if (-e "$list_dir/$archive_status/catalog") {
	open (TAIL, "$tail -1 $list_dir/$archive_status/catalog |") 
	    or return -1;
	$lastline = <TAIL>;
	close (TAIL);
    }
    if (!defined ($lastline)) {
	return -1;
    }

    $last_archived_mail = (split (':', $lastline))[5];
    if (!defined ($last_archived_mail)) {
	return -1;
    }
    else {
	return $last_archived_mail;
    }
} # read_last_archived_mail;
    

# Configure the script

sub Configure {

    my ($tmp);
    my ($regenerate);
    my ($css_type);

#
# Read the command_line options
#
    &getOpts();

    ################################################
    ## Local custom configuration                 ##
    ## (I need to lock the access to these files) ##
    ## (and it's a bit dangerous... anything could ##
    ## change in the script unless we parse it.    ##
    ################################################
    if (-e "$list_dir/hm_local_config.dat") {
	do '$list_dir/hm_local_config.dat';
    }
    
    if ((!defined $list_name) ||
       ($list_name eq "-addone") ||
	($add_one == 1 && $latest_mail eq "")) {
	if ($progress != 0) {
		die "Usage: update-archive "
		    ."-list <name of the list> "
		    ."[-lastmailfile <last smartlist archived msg filename>]"
		    ."[-addone <add one message to the archive>]"
                    ."[-quiet] [-debug] [-archive_name <add a custom archive name>]\n";
	} else {
	  # quietly exit, without showing any error message
	  exit (0);
	}
    }

    # if Smartlist is invoking this script, then list_name has the form
    # list-name@site. Here we chop the \@ and anything after it.
	
    if (($tmp = index ($list_name, "\@")) > -1) {
	$list_name = substr($list_name, 0, $tmp);
    }

    # if Smartlist is giving the latest mail filename, it'll have the
    # form path/filename. Here we chop anything before the /, including the /

    if (($tmp = rindex ($latest_mail, "/")) > -1) {
	$latest_mail = substr($latest_mail, $tmp + 1);
    }

    # if the user didn't specify a custom archive_name, we'll use
    # $archive_name@domain_name

    if ($marchive_name eq "default") {
	$marchive_name = "$list_name\@$domain_name";
    }

    # What is the visibility of this mailing list archive?
    # We use the Archive path to infer it
    $tmp = readlink "$mail_list_path/$list_name/Archive";
    if (defined $tmp) {
	if ($tmp =~ /Team/i) {
	    $archive_type = "Team";
	} elsif ($tmp =~ /Member/i) {
	    $archive_type = "Member";
	} else {
	    $archive_type = "Public";
	}
    }
    
    ## setup the CSS urls
    if (defined $archive_type) {
	$css_type = $archive_type;
	$css_type =~ tr/A-Z/a-z/;
    } else {
	$css_type = "public";
    }
    $mainindex_css = "$css_base/$css_type-mainindex";
    $messagelist_css = "$css_base/$css_type-messagelist";
    $message_css = "$css_base/$css_type-message";

    $list_dir = "$mail_list_path/$list_name";
    $archive_dir = "$list_dir/Archive";
    $mail_dir = "$list_dir/archive/latest";
    $autoscript = "$list_dir/$archive_status/autoscript";
    $index_header = "$list_dir/$archive_status/header";
    $index_footer = "$list_dir/$archive_status/footer";
    $author_email = "$list_dir/$archive_status/email";

## try to set the rc.lock or die trying if we're running interactively
    if ($use_lock) {

	if (lock_set ("$list_dir/$rc_lock_file")) {
	    $rc_lock_status = 1;
	} else {
	    die "Aborting... Couldn't set the $list_dir/$rc_lock_file\n";
	}

	if (lock_set ("$list_dir/$archive_lock_file")) {
	    $archive_lock_status = 1;
	} else {
	    die "Aborting... Couldn't set the $list_dir/$archive_lock_file\n";
	}

    }

    
# get the months per period user configurable parameter
    &read_mpperiod_file;

# set the recover flags and recover if they're set!
   
    &TestSetUpdatingFlag;

# more tests that I haven't placed elsewhere to see if we should or not
# regenerate the archive

#    print DEBUG "before my fixes, we have recover_archive $recover_archive addone $add_one\n";

    if ($recover_archive || $add_one) {
	$regenerate = 0;

# was the archived generated using the same version of hypermess?
	$hm_old_version = read_hm_old_version ();
#	print DEBUG "hm_old version is $hm_old_version hm_version is $hm_version\n";
	if ($hm_old_version ne $hm_version) {
	    $regenerate = 1;
	}

# is the msgno that smartlist is giving is equal to the last archived
# msgno + 1 ?  We could here of course avoid regenerating EVERYTHING and
# just do so with the latest period
	if ($regenerate == 0) {
	    $last_archived_mail = read_last_archived_mail ();
#	    print DEBUG "last_archived_mail number is $last_archived_mail, latest mail (sl) is $latest_mail \n";
	    if ($latest_mail != $last_archived_mail + 1) {
		$regenerate = 1;
	    }
	}

	# if the versions are different, we force the regeneration of 
	# the whole archive
	if ($regenerate) {
	    $recover_archive = 0;
	    $add_one = 0;
	    # erase the existing catalog, flag.backup, and catalog.bak
	    # as we're forcing a recovery
	    unlink ("$list_dir/$archive_status/catalog");
	    unlink ("$list_dir/$archive_status/catalog.bak");
	    unlink ("$list_dir/$archive_status/flag.backup");
	}
    }

#    print DEBUG "after my fixes, we have recover_archive $recover_archive addone $add_one\n";

# Setup the configuration parameters
    
    if ($recover_archive) {
	&setup_configuration_recover_archive;
	$add_one = 0;
    } elsif ($add_one) {
	&setup_configuration_one;
    } else {
	&setup_configuration_regen;
    }

} ## Configure


#
# Setup the period arrays according to the mpperiod value
#

sub setup_period_arrays {

    for ($i = 0, $j = $mpperiod, $max_period = 0; $i <12; $i++) {
    
	$PERIOD{$MONTHS[$i]} = $max_period;

	if (--$j == 0) {
	    if ($mpperiod >1) {
		push (@PERIOD, $MONTHS[$i-$mpperiod+1] . $MONTHS[$i]);
		push (@VERBOSE_PERIOD, "$VERBOSE_MONTHS{$MONTHS[$i-$mpperiod+1]} to $VERBOSE_MONTHS{$MONTHS[$i]}");
	    } else {
		push (@PERIOD, $MONTHS[$i]);
		push (@VERBOSE_PERIOD, $VERBOSE_MONTHS{$MONTHS[$i]});
	    }

	    $max_period++;
	    $j = $mpperiod;
	}
    }

} #setup_period_array

#
# Setup the configuration parameters
#


###############################################
# Subroutines for regeneratin an HTML archive #
###############################################

sub setup_configuration_regen {

    # build the periods arrays according to the mpperiod value
    &setup_period_arrays;
    
#
# Increment the last_date value (we verify up to the month after) 
#

    if ($last_date) {
	($current_month, $current_year) = split(/ /, $last_date);
	if (exists $MONTHS{$current_month}) {
	    $current_month_index = $MONTHS{$current_month};
	} else {
	    die "Last date parameter has an incorrect syntax";
	} 
	&increase_date;
	$last_date = $current_date;
    }

#
# Initialize the state variables
#
    
    if ($first_date) {
	### $current_date = $current_date;  ### BUG?
	($current_month, $current_year) = split(/ /, $current_date);

	if (exists $MONTHS{$current_month}) {
	    $current_month_index = $MONTHS{$current_month};
	} else {
	    die ("Error: Invalid first date: $first_date");
	}
	    
	if (exists $PERIOD{$current_month}) {
	    $current_period_index = $PERIOD{$current_month};
	}  else {
	    die ("Error: Could not find the corresponding period");
	}
    }

    # Remove any trailing files

    if (-e "$list_dir/$archive_status/catalog.bak") {
	unlink ("$list_dir/$archive_status/catalog.bak");
    }

    if (-e "$list_dir/$archive_status/catalog") {
	unlink ("$list_dir/$archive_status/catalog.");
    }

    # Read the file names in the smartlist directory
    &ReadFileNames;

} # setup_configuration_regen


#
# Build an index of date and filenames corresponding to the beginning of 
# the period
#

sub build_indexes_regen {
    my $found;

    # find the first file that has a valid Received: header

    for ($i = 0, $first_index = -1; $i <= $totalfiles 
	 && $first_index == -1; $i++) {

	# get the Received header from the message
	$found = getdate_fromreceived ("$mail_dir/$fileNames[$i]");
	unless ($found) {
	    next;
	}

	if ($first_date) {
	    $testr = index ($_, $current_date);	    
	    unless ($testr == -1) {
		$first_file = $fileNames[$i];
	    }
	    $first_index = $i;
	    last;
	} else {
	    # the was no first_date parameter, so we look it up
	    for ($j=0; $j <12; $j++) {
		$testr = index ($_, $MONTHS[$j]);
		unless ($testr == -1) { #found a month!
		    $first_index = $i;
		    # set up the status variables
		    $current_date = substr ($_, $testr, 8);
		    $current_month_index = $j;
		    ($current_month, $current_year) =
			split(/ /, $current_date);
		    $current_period_index = $PERIOD{$current_month};
		    last;
		} 
	    }
	    last;
	}
    }

    if ($first_index == -1) {
	die ("could not find the first date $first_date");
    } else {
	$first_file = $last_file = $fileNames[$first_index];
    }

    # index the remaining files

    ($sec, $min, $hour, $this_day, $this_month, $this_year) = localtime(time);
    # localtime returns the # of years since 1900
    $this_year = $this_year + 1900;
    $this_date = join (":",$hour, $min, $sec);
    $this_date = "$this_date " . $this_year . $this_month .$this_day;
    
  INDEX:
    for ($i = $first_index + 1; $i <= $totalfiles; $i++) {
	
	# get the date from a message
	$found = getdate_fromreceived ("$mail_dir/$fileNames[$i]");

	if ($found == 0) {
	    # we somehow found an empty file!, let's remove it from
	    # the fileNames table and skip it
	    splice(@fileNames, $i, 1);
	    $totalfiles--;
	    $i--;
	    next;
	} elsif (index ($_, $current_date) > -1) {
	    # we are still in the same month
	    next;
	} else {
	    # the month changed!
	    # increase the date (will correctly fix up the date );
	    $last_file = $fileNames[$i-1];
	    &increase_date_current_index;
	    
	    if ($last_date && $current_date eq $last_date) {
		# we've reached the last date
		last INDEX;
	    }
	    
	    # if there's a gap in the archives, try to find the current year
	    while ($current_year <= $this_year) {
		$test = index ($_, $current_date);
		if ($test == -1) {
		    # didn't find it, so continue searching 
		    &increase_date_current_index; 
		    if ($last_date && $current_date eq $last_date) {
			# we've reached the last date
			last INDEX;
		    } elsif ($current_year > $this_year) {
			die ("ERROR: Could not find a date for mail id $mail_dir/$fileNames[$i]");
		    }
		} else {
		    if ($period_change) {
			$period_change = 0;
			$first_file = $fileNames[$i];
		    }
		    last;
		}
	    }
	}
    }

    # Add an entry for the last filename in the directory
    unless ($last_date) {
	$last_file = $fileNames[$i-1];
	$tmp = join (":", $mpperiod,
		     $current_year,
		     $VERBOSE_PERIOD[$current_period_index],
		     $current_year. $PERIOD[$current_period_index],
		     $first_file,
		     $last_file);
	push (@date_list, $tmp);
    }

} ##  build_indexes_regen

#
# create the script which will call update-period
#

sub make_autoscript {

    my ($i);
    my ($last_entry);
    my ($first_file);

    $first_file = "";
    $last_entry = $#date_list + 1;

    if ($progress == 0) {
	$update_period = $update_period . " -quiet";
    }

    open (OUT, ">$autoscript") or die "could not open file $autoscript: $!";
    print (OUT "#!/usr/bin/sh\n");
    print (OUT "#### Script for regenerating html archives\n");
    print (OUT "#### of W3C's mailing lists archives\n\n");
    print (OUT "#### \(Generated by update-archive\)\n\n");

    if ($recover_archive && $recover_catalog_entry) {
	$i = $recover_catalog_entry;
    } else {
	$i = 0;
    }

    for (; $i < $last_entry; $i++) {
	($m_months, $current_year, $verbose_archive_name, $archive_name, $first_file, $last_file) = split (':', $date_list[$i]);

	if ($first_file eq "") {
	    next;
	}

##	print (OUT "mkdir $archive_dir/$archive_name\n");
##      hypermail makes its own directories

	print (OUT "$update_period -list $list_name -first $first_file -last $last_file -pdir $archive_name -overwrite -archive_name $marchive_name -desc $marchive_name from $verbose_archive_name $current_year\n");
    }
    close (OUT);
    # make it executable
    chmod(0774, $autoscript);

} # make_autoscript


###################
### Here below are the routines for dealing with one mail at the time
###################
### Some ideas to speed it up: Just regenerate the master index
### when there has been some changes
### don't read the whole master index, etc.

#
# read the archive's catalog file
#

sub read_catalog_file {
    
    if (-e "$list_dir/$archive_status/catalog") {
	open (IN, "<$list_dir/$archive_status/catalog");
	
	while (<IN>) {
	    if ((/^\#/ == 1) || (/^ / == 1)) { 
		next;
	    }
	    push (@date_list, $_);
	}
	close (IN);
	chop (@date_list);
	$last_catalog_entry = $#date_list;
    } else {
	$last_catalog_entry = -1;
    }

} # read_catalog_file

# 
# read_number_from_file 
# reads a number from a line. Any whitespace or # comments are ignored
# returns -1 if no number was found or if the file didn't exist

sub read_number_from_file {
    my ($file_name) = @_;
    my ($number);

    open (IN, "<$file_name")
	or return -1;

    $number = -1;
    while (<IN>) {
	# skip any comments in the file 
	if ( (/^\#/ == 1) || (/^ / == 1)) {
	    next;
	} else {
	    chop ($_);
	    $number = $_;
	    last;
	}
    }
    close (IN);

    return $number;

} # read_file

sub read_mpperiod_file {
    
    $mpperiod = read_number_from_file ("$list_dir/$archive_status/mpperiod");

#    print DEBUG "mpperiod is $mpperiod\n";

    if ($mpperiod <1 || $mpperiod >12) {
	die "Invalid month period value: $mpperiod";
    }
    
} # read_mpperiod_file 

#
# Reads the hypermess version number last used to generate the archive
#
sub read_hm_old_version {

    my ($hm_old_version);

    $hm_old_version = read_number_from_file ("$list_dir/$archive_status/hm_version");

#    print DEBUG "reading hm_old_version: $hm_old_version\n";

    return $hm_old_version;

} # read_hm_old_version

#
# Writes the hypermess version number used to generate the archive
#
sub write_hm_version {

#    print DEBUG "writing new hm_version: $hm_version\n";

    open (OUT, ">$list_dir/$archive_status/hm_version")
	or die "Couldn't write hm_version number to $list_dir/$archive_status/hm_version: $!";
    print  OUT "$hm_version\n";
    close (OUT);

} # write_hm_version_file

#
# Setup the configuration parameters for processing only one mail
#

sub setup_configuration_one {

    # Read the existing catalog file
    &read_catalog_file;

    # get the variables stored in the last entry of the archive's
    # catalog. If it does not exist, then use the mpperiod
    # parameter and process the first file in the archive

    if(@date_list) {
	($mpperiod, $current_year, $verbose_archive_name, $archive_name, 
	 $first_file, $last_file) = 
	     split (':',$date_list[$last_catalog_entry]);
    } else {
	$last_file = -1;
    }

    # build the periods arrays according to the mpperiod value
    &setup_period_arrays;

} # setup_configuration_one


#
# Very simple routine: Opens the last and latests received mails,
# extracts the date,  and if the period has changed, then updates the
# period array. Easy!
#

sub build_indexes_one {

    # get the characteristics of the latest (hypertext) converted 
    # mail (if it exists)

    if ($last_file != -1) {

	# Get the characteristicss of the last processed msgl in the
	# hypertext archive
	$found = getdate_fromreceived ("$mail_dir/$last_file");
	# @@ we could control $found, but we should have a date here (unless
	# this message was erased)

	# we normalize the date of the mail
	for ($j=0; $j <12; $j++) {
	    $testr = index ($_, $MONTHS[$j]);
	    unless ($testr == -1) { #found a month!
		$tmp_index = $i;
		# set up the status variables
		$current_date = substr ($_, $testr, 8);
		$current_month_index = $j;
		($current_month, $current_year) =
		    split(/ /, $current_date);
		$current_period_index = $PERIOD{$current_month};
		last;
	    } 
	}
    }

    # Get the characteristics of the latest MH stored mail
    $found = getdate_fromreceived ("$mail_dir/$latest_mail");
    ## @@ should we add some error protection in case someone edits, purges
    ## this file?

    # normalize the date of the mail
	
    for ($j=0; $j <12; $j++) {
	$testr = index ($_, $MONTHS[$j]);
	unless ($testr == -1) { #found a month!
	    $tmp_index = $i;
	    # set up the status variables
	    $mail_date = substr ($_, $testr, 8);
	    ($mail_month, $mail_year) =
		split(/ /, $mail_date);
	    $mail_period_index = $PERIOD{$mail_month};
	    last;
	} 
    }

    # prepare the index
    if (@date_list) {
	if ($current_period_index == $mail_period_index) {
	    # we are still in the same period, update the catalog
	    
	    # remove the last element of the date list
	    # as we'll update it in this step
	    pop (@date_list);

	    # Add an entry for the last filename in the directory
	    $tmp = join (":", $mpperiod,
			     $current_year,
			     $VERBOSE_PERIOD[$current_period_index],
			     $current_year. $PERIOD[$current_period_index],
			     $first_file,
			     $latest_mail);
	    push (@date_list, $tmp);
 	} else {
	    #the period has changed!

	    ($sec, $min, $hour, $this_day, $this_month, $this_year) 
		= localtime(time);
	    # localtime returns the # of years since 1900
	    $this_year = $this_year + 1900;
	    # remove the last element of the date list
	    # as we'll update it in this step
	    pop (@date_list);
	    &increase_date_current_index;
	    while ($current_year <= $this_year) {
		$test = index ($mail_date, $current_date);
		if ($test > -1) {
		    last;
		}
		# didn't find it, so continue searching 
		&increase_date_current_index;
	    }

	    if ($current_year > $this_year) {
		die ("ERROR: Could not find a date for mail id $mail_dir/$fileNames[$i]");
	    }
	    
	    $tmp = join (":", $mpperiod,
			 $mail_year,
			 $VERBOSE_PERIOD[$current_period_index],
			 $mail_year. $PERIOD[$current_period_index],
			 $latest_mail,
			 $latest_mail);
	    push (@date_list, $tmp);
	    # regenerate the master_index??
	} 
    } else {
	# this is the first mail in the archive
	$current_period_index = $PERIOD{$mail_month};
	$tmp = join (":", $mpperiod,
		     $mail_year,
		     $VERBOSE_PERIOD[$current_period_index],
		     $mail_year. $PERIOD[$current_period_index],
		     $latest_mail,
		     $latest_mail);
	push (@date_list, $tmp);
    }

} # build_indexes_one


sub callHypermail {

    ($m_months, $current_year, $verbose_archive_name, $period_dir, 
     $first_file, $last_file) 
	= split (':', $date_list[$#date_list]);

    # we temporarily convert the file
    if ($filter_message) {
        $command ="$nkf -e -J $mail_dir/$latest_mail"
            ." >$archive_dir/$latest_mail";
        system ($command);
        $mail_dir = $archive_dir;
    }

    $options = "-c $hm_conf_file -M -u -o readone=1 -m $mail_dir/$latest_mail -l \"$marchive_name from $verbose_archive_name $current_year\" -d $archive_dir/$period_dir -o icss_url=$messagelist_css -o mcss_url=$message_css -o hmail=$marchive_name";

    if ($progress) {
	$options = "-p " . $options;
    }

    $command ="$hypermail $options";
    print "trying $command \n";
    system($command);

    # we remove the temporary nkf translated file
    if ($filter_message) {
        unlink ("$archive_dir/$latest_mail");
    }

} ## callHypermail


##### New subroutines for protecting the system

# Backups a file (using the .bak extension). To know when the backup process
# is over, a file called "doing_backup" is created before the process starts,
# and deleted when it's over.

# A rebuild is like a combination of build_one together with update_archive.
# In case of error, the last archived period is regenerated (if it
# exists); otherwise, if we can't find out the last period, the whole
# archive is regenerated.

sub CopyFile {
    my ($src, $dest);
  
    ($src, $dest) = @_;

    open (IN, "<$src") or
	die "Could not open $src: $!";    
    open (OUT, ">$dest") or
	die "Could not open $dest: $!";    

    while (<IN>) {
	print OUT $_;
    }

    close (IN);
    close (OUT);
} ## CopyFile

sub BackupFile {

    my ($file);
    ($dir, $file) = @_;

    unless (-e "$dir/$file") {
	return;
    }

    # Set the flag for detecting system crashes/shutdowns
    open (OUT, ">$dir/flag.backup") or
	die "Could not open $dir/flag.backup: $!";    
   
    if (-e "$dir/$file") {
	CopyFile ("$dir/$file", "$dir/$file.bak");
    }

    # Reset the flag for detecting system crashes/shutdowns
    unlink ("$dir/flag.backup");

} ## BackupFile

#
# Tests a flag (creates a file), to know if we the precedent archiving
# processes had an abnormal error. If the flag does not exists, it creates
# one
#

sub TestSetUpdatingFlag {

    unless (-e "$list_dir/$archive_status/flag.updating") {
	open (OUT, ">$list_dir/$archive_status/flag.updating") or
	    die "Could not open $list_dir/$archive_status/flag.updating: $!";
    	close (OUT);
    } else {
	# the server crashed/was shutdown during the last archive updating
	# process

	# Recover the latest catalog backup (if it exists)
	
	unless (-e "$list_dir/$archive_status/flag.backup") {
	    # the crash/shutdown did not occur during the catalog backup phase
	    # so we retrieve the latest saved catalog
	    if (-e "$list_dir/$archive_status/catalog.bak") {
		CopyFile ("$list_dir/$archive_status/catalog.bak", 
			  "$list_dir/$archive_status/catalog");
		$recover_archive = 1; 
	    } else {
		# as there was no catalog.bak, we have to remove the latest
		# catalog
		unlink ("$list_dir/$archive_status/catalog");
		# and force the regeneration of the whole archive
		if ($add_one == 1) {
		    $add_one = 0;
		}
	    }
	} else {
	    # erase the existing catalog, flag.backup, and catalog.bak
	    # as they may well be corrupted
	    unlink ("$list_dir/$archive_status/catalog");
	    unlink ("$list_dir/$archive_status/catalog.bak");
	    unlink ("$list_dir/$archive_status/flag.backup");
	    # and force the regeneration of the whole archive
	    if ($add_one == 1) {
		$add_one = 0;
	    }
	}
    }
} ## TestSetUpdatingFlag


# 
# Resets the flag (removes the file) set in SetUpdatingFlag
#
    sub ResetUpdatingFlag {

    unlink ("$list_dir/$archive_status/flag.updating");

} ## ResetUpdatingFlag


#
#
#

sub setup_configuration_recover_archive {

    # Read the existing catalog file
    &read_catalog_file;

    # get the variables stored in the entry before the last one in
    # the catalog (the last entry is presumably corrupt).

    if(@date_list) {
	# remove the last archived period, as it may be wrong
	pop (@date_list);
	$last_catalog_entry--;

	# get the latest safe archive
	if (@date_list) {
	    ($mpperiod, $current_year, $verbose_archive_name, $archive_name, 
	     $first_file, $last_file) = 
		 split (':',$date_list[$last_catalog_entry]);
	    # remove this entry too, as it'll be regenerated during the
	    # build indexes phase;
	    pop (@date_list);
	    $last_catalog_entry--;
	} else {
	    $last_file = -1;
	    # there's no precedent catalog entry. We'll do a standard
	    # archive rebuild
	    $recover_archive = 0;
	}
    } else {
	# there was no precedent catalog entry. We'll do a standard archive
	# rebuild
	$last_file = -1;
	$recover_archive = 0;
    }

    # build the periods arrays according to the mpperiod value
    &setup_period_arrays;

    # Read the file names in the MH directory
    &ReadFileNames;

} # setup_configuration_recover_archive


#
# Build an index of date and filenames corresponding to the beginning of 
# the period
#

sub build_indexes_recover_archive {

    # find the index value of the last archived file
    for ($i = 0, $first_index = -1; $i <= $totalfiles; $i++) {
	if ($fileNames[$i] == $last_file) {
	    $last_index = $i;
	    last;
	}
    }

    if ($last_index == -1) {
	die "index seems to be corrupt: I can't find the index entry for $last_file";
    }

    # get the characteristics of the latest safe stored mail (if it exists)
    $found = getdate_fromreceived ("$mail_dir/$last_file");

    # we normalize the date of the last safe stored mail
    for ($j=0; $j <12; $j++) {
	$testr = index ($_, $MONTHS[$j]);
	unless ($testr == -1) { #found a month!
	    $tmp_index = $i;
	    # set up the status variables
	    $current_date = substr ($_, $testr, 8);
	    $current_month_index = $j;
	    ($current_month, $current_year) =
		split(/ /, $current_date);
	    $current_period_index = $PERIOD{$current_month};
	    last;
	} 
    }

   # index the remaining files

    ($sec, $min, $hour, $this_day, $this_month, $this_year) = localtime(time);
    $this_year = $this_year + 1900;
    $this_date = join (":",$hour, $min, $sec);
    $this_date = "$this_date " . $this_year . $this_month .$this_day;
    
  INDEX: 
    for ($i = $last_index + 1; $i <= $totalfiles; $i++) {

	# get the received date of this message
	$found = getdate_fromreceived ("$mail_dir/$fileNames[$i]");
      
	if (!$found) {
	    # we somehow found an empty file!, let's remove it from
	    # the fileNames table and skip it
	    splice(@fileNames, $i, 1);
	    $totalfiles--;
	    $i--;
	    next;
	} elsif (index ($_, $current_date) > -1) {
	    # we are still in the same month
	    next;
	} else {
	    # the month changed!
	    # increase the date (will correctly fix up the date );
	    $last_file = $fileNames[$i-1];
	    
	    &increase_date_current_index;
	    
	    if ($last_date && $current_date eq $last_date) {
		# we've reached the last date
		last INDEX;
	    }
	    
	    # if there's a gap in the archives, try to find the current year
	    while ($current_year <= $this_year) {
		$test = index ($_, $current_date);
		if ($test == -1) {
		    # didn't find it, so continue searching 
		    &increase_date_current_index; 
		    if ($last_date && $current_date eq $last_date) {
			# we've reached the last date
			last INDEX;
		    } elsif ($current_year > $this_year) {
			die ("ERROR: Could not find a date for mail id $mail_dir/$fileNames[$i]");
		    }
		} else {
		    if ($period_change) {
			$period_change = 0;
			$first_file = $fileNames[$i];
		    }
		    last;
		}
	    }
	}
    }
    
    # Add an entry for the last filename in the directory
    unless ($last_date) {
	$last_file = $fileNames[$i-1];
	$tmp = join (":", $mpperiod,
		     $current_year,
		     $VERBOSE_PERIOD[$current_period_index],
		     $current_year. $PERIOD[$current_period_index],
		     $first_file,
		     $last_file);
	push (@date_list, $tmp);

	# get the latest generated entry in the catalog file,
	# so that we regenerate the archives starting from that entry
	# this is not yet perfect, as the last entry also gets regenerated :-/
	# it's + 1, because we do a push at the end of this routine
	$recover_catalog_entry = $last_catalog_entry + 1;
    }

} ##  build_indexes_recover_archive

#
# make sure the year in the date has 4 digits
#

sub normalize_date {
    my $recv_line = shift;
    my ($testr, $wday, $mday, $month, $year, @rest);

    $testr = index ($recv_line, ";");
    $_ = substr ($recv_line, $testr + 2);
    ($wday, $mday, $month, $year, @rest) =  split (/ /);
    ## @@ y2k bug ... what will be the date of y2k messages?
    if ($year < 60) {           ## Y2K+ messages?
	$year = $year + 2000;
    }
    elsif ($year <= 99) {       ## current messages, but on 2 digits
	$year = $year + 1900;
    }
    $_ = join (' ', $wday, $mday, $month, $year, @rest);
    $_ = $_ . "\n";
}				

#
# Extracts the date from a Received: header
# Input: filename where a message is stored
# Returns: 1 if date was found and date in $_,
#          0 otherwise

sub getdate_fromreceived {
    my $file = shift;
    my $found;
    my $recv_line;

    open (IN, "<$file") or
	die "Could not open $file: $!";
    $found = 0;
    while (<IN>) {
	if (/^Received: /) {
	    # found an email msg!
	    $found = 1;
	    chop;
	    $recv_line = $_;
	    # read any folded lines that may follow
	    while (<IN>) {
		if (/^[ \t]/) {
		    chop;
		    # remove initial white space
		    s/[ \t]+//;
		    $recv_line = $recv_line . " $_";
		} else {
		    last;
		}
	    }
	    normalize_date ($recv_line);
	    last;
	}
    }
    close (IN);
    return ($found);

} ## getdate_fromreceived


