#!/usr/bin/perl
use strict;

#BEGIN {unshift@INC,('../../..');}
use W3C::Database::DBIInterface;
use Digest::MD5;

if ($ARGV[0] =~ m/\-*h(?:elp)?/) {
    print "$0 [propFile]
  transform Strings table to use md5s for unique id.
  propFile - properties file with database name and access parameters.
";
    exit(0);
}

my $propFileLocation = $ARGV[0];
$propFileLocation ||= '../../../Conf/rdf.prop';
my $props = new W3C::Util::Properties($propFileLocation);
my $db = new W3C::Database::DBIInterface($props);

$db->executeUpdate('LOCK TABLE Strings WRITE');
$db->executeUpdate('ALTER TABLE Strings ADD md5hex CHAR(32)');
my $strings = [];
$db->executeArrayQuery($strings, 'SELECT id, substring, encoding, string FROM Strings');
foreach my $entry (@$strings) {
    my $md5hex = Digest::MD5::md5_hex($entry->[3]);
    $db->executeUpdate("UPDATE Strings SET md5hex='$md5hex' WHERE id=$entry->[0]");
}
print scalar @$strings," entries updated\n";
$db->executeUpdate('ALTER TABLE Strings CHANGE md5hex md5hex CHAR(32) NOT NULL');
$db->executeUpdate('ALTER TABLE Strings CHANGE encoding encoding enum("PLAIN","XML") NOT NULL');
$db->executeUpdate('UNLOCK TABLE');

# there may be some entries that are duplicated if there were unsufficient
# unique indexes.
print "checking for non-unique fields. this may take a couple minutes.\n";
my $dupStrings = [];
$db->executeArrayQuery($dupStrings, 'SELECT a.id, b.id FROM Strings AS a,Strings AS b WHERE a.md5hex=b.md5hex AND a.id!=b.id');
#+------+------+
#| id   | id   |
#+------+------+
#| 6943 | 9918 |
#| 9918 | 6943 |
#| 7683 | 7547 |
#| 7684 | 7552 |
#| 7685 | 7557 |
#| 7547 | 7683 |
#| 7552 | 7684 |
#| 7557 | 7685 |
#+------+------+
#8 rows in set (1 min 11.68 sec)
my $stringIds = [];
foreach my $entry (@$dupStrings) {
    my ($replace, $with) = @$entry;
    if ($replace > $with) {
	push (@$stringIds, ($replace, $with));
    }
}
if (@$stringIds) {
    my $genIds = {};
    $db->executeHashArrayQuery($genIds, 'SELECT string,id FROM RdfIds WHERE string IN ('.join (',', @$stringIds).')');

    foreach my $entry (@$dupStrings) {
	my ($replaceString, $withString) = @$entry;
	my $replaceRdfId = $genIds->{$replaceString}[0];
	my $withRdfId = $genIds->{$withString}[0];
	$db->executeUpdate("UPDATE Statements SET predicate=$withRdfId WHERE predicate=$replaceRdfId");
	$db->executeUpdate("UPDATE Statements SET subject=$withRdfId WHERE subject=$replaceRdfId");
	$db->executeUpdate("UPDATE Statements SET object=$withRdfId WHERE object=$replaceRdfId");
	$db->executeUpdate("DELETE FROM RdfIds WHERE id=$replaceRdfId");
	$db->executeUpdate("DELETE FROM Strings WHERE id=$replaceString");
    }
}
print scalar @$dupStrings," strings fixed\n";

$db->executeUpdate('ALTER TABLE Strings ADD UNIQUE u_md5hex_encoding(md5hex,encoding)');
$db->executeUpdate('ALTER TABLE Strings DROP INDEX u_substring');
$db->executeUpdate('ALTER TABLE Strings DROP substring');
