#!/usr/bin/perl

# Some description of process in #rdfig log [1].
# [1] http://ilrt.org/discovery/chatlogs/rdfig/2003-03-25#T19-56-42
# [2] http://rdfig.xmlhack.com/2003/03/25/2003-03-25.html#1048622430.568575

$REVISION = '$Id: compileTables,v 1.6 2004/06/08 06:51:00 eric Exp $ ';

use strict;

#BEGIN {unshift@INC,('../../..');}

package Predicate;

sub new {
    my ($proto, $pId, $pName) = @_;
    my $class = ref($proto) || $proto;
    my $self = {ID => $pId, NAME => $pName};
    bless ($self, $class);
    return $self;
}

sub getId {return $_[0]->{ID}}
sub getName {return $_[0]->{NAME}}
sub toString {return $_[0]->getName}

package Table;

sub new {
    my ($proto, $name) = @_;
    my $class = ref($proto) || $proto;
    my $self = {FIELDS => [], NAME => $name};
    bless ($self, $class);
    return $self;
}

sub addField {
    my ($self, $field, $ratio) = @_;
    push (@{$self->{FIELDS}}, [$field, $ratio]);
}

sub toString {
    my ($self) = @_;
    my @fields;
    foreach my $fieldPair (@{$self->{FIELDS}}) {
	my ($field, $ratio) = @$fieldPair;
	push (@fields, $field->toString()."($ratio)");
    }
    my $fields = join (' ', @fields);
    return "$self->{NAME}: $fields";
}

package SqlTableCompiler;
use vars qw(@ISA $MIN_ASSOC_INSTANCES);
@ISA = qw(W3C::Util::NamedParmObject);

use W3C::Util::Exception;
use W3C::Util::Object;
use W3C::Util::Properties;
use W3C::Database::DBIInterface;

$MIN_ASSOC_INSTANCES = .1;

eval {
    my $compiler = new SqlTableCompiler(-properties => new W3C::Util::Properties('../../../Conf/rdf.prop'));
    $compiler->execute(\@ARGV);
}; if ($@) {if (my $ex = &catch('W3C::Util::Exception')) {
	die $ex->toString;
    } else {
	die $@;
    }
}

sub new {
    my ($proto, @parms) = @_;
    my $class = ref($proto) || $proto;
    my $self = $class->SUPER::new(@parms);
    my $props = delete $self->{-properties};
    $self->{DB} = new W3C::Database::DBIInterface($props);
    my $typeStr = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
    my $typeId = $self->{DB}->executeSingleQuery("SELECT RdfIds.id FROM Uris,RdfIds WHERE Uris.uri='$typeStr' AND RdfIds.uri=Uris.id");
    $self->{RDF_TYPE} = new Predicate($typeId, $typeStr);
    return $self;
}

sub execute {
    my ($self, $args) = @_;

    my %ignorablePredicates = ($self->{RDF_TYPE}->getId => 1); # predicates we can ignore;

    # Find frequent ?p in '((?p ?s ?o))
    my @predicateCounts; # Instances of each predicate

    my $orig = 'Statements';
    my $dupTable = $self->createDup($orig, 'id', 1000, 200);
    $self->{OPERTABLE} = $dupTable;

    $self->{DB}->executeArrayQuery(\@predicateCounts, "SELECT predicate,COUNT(*) FROM $self->{OPERTABLE} GROUP BY predicate");
    my @predicateCounts = sort {$b->[1] == $a->[1] ? $a->[0] <=> $b->[0] : $b->[1] <=> $a->[1]} @predicateCounts; # grep {$_->[0] == 136} @predicateCounts; # 

    # For each predicate, starting with most frequent...
    for (my $iPredicateCount = 0; $iPredicateCount < @predicateCounts; $iPredicateCount++) {
	my $p = $predicateCounts[$iPredicateCount][0];
	if (exists $ignorablePredicates{$p}) {
	    next;
	}
	my $c = $predicateCounts[$iPredicateCount][1];
	print "$p used $c times\n";

	# Find frequent ?ap in '((?p ?s ?o)(?ap ?s ?o2))
	my @associatedPredicates; # Other predicates with same subject

	$self->{DB}->executeArrayQuery(\@associatedPredicates, "SELECT ap.predicate,count(*) FROM $self->{OPERTABLE} AS a INNER JOIN $self->{OPERTABLE} AS ap ON a.subject=ap.subject WHERE a.predicate=$p AND ap.predicate!=$p GROUP BY ap.predicate;");
	my @associatedPredicates = sort {$b->[1] == $a->[1] ? $a->[0] <=> $b->[0] : $b->[1] <=> $a->[1]} @associatedPredicates;

	# For each associated predicate (instance data has same subject)...
	for (my $iAssocidatedPredicates = 0; $iAssocidatedPredicates < @associatedPredicates; $iAssocidatedPredicates++) {
	    my $ap = $associatedPredicates[$iAssocidatedPredicates][0];
	    if (exists $ignorablePredicates{$ap}) {
		next;
	    }
	    my $cap = $associatedPredicates[$iAssocidatedPredicates][1];

	    print "  trying $ap:\n";

	    # Find count(*) of '((?ap ?s ?o2)!(?p ?s ?o))
	    my $missociations = $self->{DB}->executeSingleQuery("SELECT count(*) FROM $self->{OPERTABLE} AS ap LEFT OUTER JOIN $self->{OPERTABLE} AS p ON ap.subject=p.subject WHERE ap.predicate=$ap AND p.predicate=$p AND p.predicate IS NULL;");

	    # If ?ap is used on any subjects where ?p is not, forget this ?ap
	    if ($missociations > 0) {
		# may want to try again with tramp predicate
		print "  -- loose associations\n";
		next;
	    }

	    if (0) {
		my @trampPredicates; # Predicates also associated with $ap
		$self->{DB}->executeArrayQuery(\@trampPredicates, "SELECT tp.predicate,count(*) FROM $self->{OPERTABLE} AS ap LEFT OUTER JOIN $self->{OPERTABLE} AS tp ON ap.subject=tp.subject WHERE ap.predicate=$ap AND tp.predicate!=$ap AND tp.predicate!=$p GROUP BY ap.predicate;");
		my @trampPredicates = sort {$b->[1] == $a->[1] ? $a->[0] <=> $b->[0] : $b->[1] <=> $a->[1]} @trampPredicates;

		# If it is used on any subjects where $p is not, forget this $ap
		if (@trampPredicates > 0) {
		    # may want to try again with tramp predicate
		    print "  -- loose associations\n";
		    next;
		}
	    }

	    # Find count(*) of '((?ap ?s ?o2)(?ap ?s ?o3))
	    # Is it a repeated property?
	    if ($self->{DB}->executeSingleQuery("SELECT COUNT(*) FROM $self->{OPERTABLE} AS ap INNER JOIN $self->{OPERTABLE} AS rp ON rp.subject=ap.subject WHERE ap.predicate=$ap AND rp.predicate=$ap AND rp.id!=ap.id") > 0) {
		# candidate for separate table
		print "  -- repeated property\n";
		next;
	    }

	    # How often is it associated with $p?
	    my $used = $c - $self->{DB}->executeSingleQuery("SELECT COUNT(*) FROM $self->{OPERTABLE} AS a LEFT OUTER JOIN $self->{OPERTABLE} AS ap ON a.subject=ap.subject WHERE a.predicate=$p AND ap.predicate=$ap AND ap.predicate IS NULL");
	    my $ratio = $used/$c;
	    if ($ratio < $MIN_ASSOC_INSTANCES) {
		print "  -- $used/$c = $ratio\n";
		next;
	    }

	    $self->getTable($self->getPredicate($p))->addField($self->getPredicate($ap), $ratio);
	    $ignorablePredicates{$ap} = 1;
	}
    }
    $self->{DB}->executeUpdate("DROP TABLE $dupTable");
    print $self->toString();
}

sub getPredicate {
    my ($self, $pId) = @_;
    if (exists $self->{PREDICATES_FOR_IDS}{$pId}) {
	return $self->{PREDICATES_FOR_IDS}{$pId};
    } else {
	my $p = new Predicate($pId, $self->{DB}->executeSingleQuery("SELECT Uris.uri FROM RdfIds,Uris WHERE RdfIds.id=$pId AND Uris.id=RdfIds.uri"));
	$self->{PREDICATES_FOR_IDS}{$pId} = $p;
	return $p;
    }
}

sub getTable {
    my ($self, $p) = @_;
    if (exists $self->{TABLES_FOR_FIELDS}{$p}) {
	return $self->{TABLES_FOR_FIELDS}{$p};
    } else {
	my $name = $self->makeUpName($p);
	my $table = new Table($name);
	$table->addField($p, 1);
	$self->{TABLES_FOR_FIELDS}{$p} = $table;
	return $table;
    }
}

sub makeUpName {
    my ($self, $p) = @_;
    # Look for a type on this object
    my $typeId = $self->{RDF_TYPE}->getId;
    my $pId = $p->getId;
    my @types;
    $self->{DB}->executeArrayQuery(\@types, "SELECT RdfIds.id,Uris.uri FROM $self->{OPERTABLE} AS pStatements INNER JOIN $self->{OPERTABLE} typeStatements ON typeStatements.subject=pStatements.subject INNER JOIN RdfIds ON RdfIds.id=typeStatements.object INNER JOIN Uris ON Uris.id=RdfIds.uri WHERE pStatements.predicate=$pId AND typeStatements.predicate=$typeId");
    if (@types) {
	return $types[0][1];
    } else {
	return "a$pId";
    }
}

sub toString {
    my ($self) = @_;
    foreach my $field (keys %{$self->{TABLES_FOR_FIELDS}}) {
	print $self->{TABLES_FOR_FIELDS}{$field}->toString,"\n";
    }
}


sub createDup {
    my ($self, $copyMe, $pk, $sampleSize, $maxPerAttrib) = @_;
    my $newTable = '__Statements_copy';

    # Make sure tables are as expected.
    {
	my %tableList;
    eval {$self->{DB}->executeUpdate("DROP TABLE $newTable")}; # !!!
	$self->{DB}->executeHashArrayQuery(\%tableList, 'SHOW TABLES');
	if (!exists $tableList{$copyMe}) {
	    &throw(new W3C::Util::Exception(-message => "table \"$copyMe\" not found"));
	}
	if (exists $tableList{$newTable}) {
	    &throw(new W3C::Util::Exception(-message => "table \"$newTable\" not expected"));
	}
    }

    # Get table creation information.
    {
	my @row;
	$self->{DB}->executeArrayQuery(\@row, "SHOW CREATE TABLE $copyMe");
	if (@row != 1) {
	    &throw(new W3C::Util::Exception(-message => "expected 1 row, not ".@row));
	}
	if ($row[0][1] !~ m/^CREATE TABLE \`\Q$copyMe\E\` (\(.*)$/s) {
	    &throw(new W3C::Util::Exception(-message => "could not find create for $copyMe in \"$row[0][1]\""));
	}
	my $create = $1;

	# Create table for sample of data.
	$self->{DB}->executeUpdate("CREATE TABLE $newTable $create");
    }

    # Populate randomly
    {
	my $max = $self->{DB}->executeSingleQuery("SELECT MAX($pk) FROM $copyMe");
	my $count = $self->{DB}->executeSingleQuery("SELECT COUNT(*) FROM $copyMe");
	my (%pks, %attribs);
	for (my $copied = 0; $copied < $sampleSize;) {
	    my $try = int(rand($max)+1);
	    if ($pks{$try}) {
		next;
	    }
	    $pks{$try} = 1;
	    my $attrib;
	    eval {
		$attrib = $self->{DB}->executeSingleQuery("SELECT attribution FROM $copyMe WHERE $pk=$try");
	    };
	    if ($@) {if (my $ex = &catch('W3C::Database::DBIInterface::SingleQueryException')) {
		my $next = $self->{DB}->executeSingleQuery("SELECT MIN($pk) FROM Statements WHERE $pk > $try");
		if (!defined $next) {
		    $next = $self->{DB}->executeSingleQuery("SELECT MAX($pk) FROM Statements WHERE $pk < $try");
		}
		if ($pks{$next}) {
		    next;
		}
		$try = $next;
		$attrib = $self->{DB}->executeSingleQuery("SELECT attribution FROM $copyMe WHERE $pk=$try");
	    } else {&throw();}}
	    if ($attribs{$attrib}) {
		next;
	    }
	    $attribs{$attrib} = 1;
	    if ($self->{DB}->executeSingleQuery("SELECT COUNT(*) FROM $copyMe WHERE attribution=$attrib") > $maxPerAttrib) {
		next;
	    }
	    $self->{DB}->executeUpdate("INSERT INTO $newTable SELECT * FROM $copyMe WHERE attribution=$attrib");
	    $copied += $self->{DB}->executeSingleQuery("SELECT COUNT(*) FROM $newTable WHERE attribution=$attrib");
	}
    }
    return $newTable;
}

