#!/usr/bin/perl
# Copyright notice:
# (c) Copyright Keio University 1999, 2000
# This software is made available under the terms of the
# W3C Software Licence available at
# http://www.w3.org/Consortium/Legal/copyright-software.
# Acknowledgements: Mark Davis for various discussions about Unicode TR #15
# and for the test suite
# Paul Hoffman for suggestions and encouragement
# Roland Mas for how to use Storable
# James Briggs and Masayasu Ishikawa for error reports
# Tim Bray for help with CDATA
# Authors:
# MJD Martin J. Du"rst, duerst@w3.org
$version = 'Version 0.43';
# History:
# 2000/11/09: 0.44, implemented -s/-S (Storable data) MJD
# 2000/11/05: 0.43, implemented -K (kompatibility decomposition) MJD
# 2000/11/05: 0.42, updated for 3.0.1, fixed line ends MJD
# 2000/11/05: 0.41, added 2000 to copyright, tested CVS commit MJD
# 2000/08/03: 0.40, added Hangul support and did quite some testing MJD
# 2000/08/02: 0.37, added -x and -X for decomposition MJD
# 2000/07/27: 0.36, fixed a bug for non-starter decompositions MJD
# 2000/07/24: 0.35, adapted exclusions to 3.0.0 final (+Tibetan) MJD
# 2000/07/24: 0.34, $chClass = $CombClass{ch}; should read $chClass = $CombClass{$ch};
# implemented -C MJD
# 1999/08/16: 0.33, updated for second version of 3.0.0.beta MJD
# 1999/07/01: 0.32, adapted surrogates/exclusions to 3.0.0.beta MJD
# 1999/06/25: 0.31, fixed reordering bug, going public MJD
# 1999/06/23: 0.30, preparation for W3C member test, without Hangul MJD
# CVS last revised $Date: 2000/11/09 10:58:51 $ by $Author: duerst $
#package CHARLINT; ## tried these, but need more info on correct use
#use strict;
#use diagnostics;
use Storable;
#
# SUBROUTINES
#
sub addtext {
local (&r, $t) = @_;
$r = $r ? ($r."; ".$t) : $t;
}
# Check problems in UTF-8
sub CheckUTF8 {
local ($s) = @_;
local ($return) = "";
local ($st);
if ($s =~ /[\355][\240-\257][\200-\277][\355][\260-\277][\200-\277]/) {
&addtext (*return, "surrogate pair") if !$OPTU;
}
if ($s =~ /[\355][\240-\277][\200-\277]/) {
&addtext (*return, "single surrogate") if !$OPTU;
}
if ($s =~ /[\300-\301][\200-\277]/) {
&addtext (*return, "ASCII in 2 bytes") if !$OPTU;
}
if ($s =~ /[\340][\200-\237][\200-\277]/) {
&addtext (*return, "3 bytes instead of 2 or less") if !$OPTU;
}
if ($s =~ /[\360][\200-\217][\200-\277]{2}/) {
&addtext (*return, "4 bytes instead of 3 or less") if !$OPTU;
}
if ($s =~ /[\370][\200-\207][\200-\277]{3}/) {
&addtext (*return, "5 bytes instead of 4 or less") if !$OPTU;
}
if ($s =~ /[\374][\200-\203][\200-\277]{4}/) {
&addtext (*return, "6 bytes instead of 5 or less") if !$OPTU;
}
# non-synchronized cases
$s =~ s{ [\000-\177]
| [\300-\337][\200-\277]
| [\340-\357][\200-\277]{2}
| [\360-\367][\200-\277]{3}
| [\370-\373][\200-\277]{4}
| [\374-\375][\200-\277]{5}
}{}gx;
# forbidden bytes
if ($s =~ /[\376\377]/) {
&addtext (*return, "0xFE or 0xFF byte") if !$OPTU;
}
if ($s ne "") {
&addtext (*return, "synchronization problem") if !$OPTU;
}
$return;
} # end CheckUTF8
sub exists {
my $s = shift;
return ($exists{$s}
or ("\344\270\200" le $s and $s le "\351\276\245") # CJK
or ("\352\260\200" le $s and $s le "\355\236\243") # Hangul
or ('\343\220\200' le $s and $s le '\344\266\265')); # CJK Extension A
}
sub CheckExists {
my ($s) = @_;
my $news = join "",
grep (&exists($_), $s =~ m/([\000-\177]|[\300-\377][\200-\277]+)/go);
return ($s ne $news);
} # end CheckExists
sub CheckPrivate {
my ($s) = @_;
if ($s =~ /[\356][\200-\277]{2}|[\357][\200-\237][\200-\277]/) {
return "BMP";
}
if ($s =~ /[\363][\260-\277][\200-\277]{2}/) {
return "plane 15";
}
if ($s =~ /[\364][\200-\217][\200-\277]{2}/) {
return "plane 16";
}
} # end CheckPrivate
#### convert hex to UTF-8
sub hex2utf8 {
num2utf8 (hex($_[0]));
} # end hex2utf8
sub Xhex2utf8 { #### avoid to covert <>&"
local ($t) = hex($_[0]);
if ($t < 0x40) { return "&#x".$_[0].";"; }
num2utf8 ($t);
} # end Xhex2utf8
sub Xnum2utf8 { #### avoid to covert <>&"
local ($t) = @_;
if ($t < 64) { return "&#".$t.";"; }
num2utf8 ($t);
} # end Xnum2utf8
#### convert number to UTF-8
sub num2utf8 {
local($t) = @_;
local($trail, $firstbits, @result);
if ($t<0x00000080) { $firstbits=0x00; $trail=0; }
elsif ($t<0x00000800) { $firstbits=0xC0; $trail=1; }
elsif ($t<0x00010000) { $firstbits=0xE0; $trail=2; }
elsif ($t<0x00200000) { $firstbits=0xF0; $trail=3; }
elsif ($t<0x04000000) { $firstbits=0xF8; $trail=4; }
elsif ($t<0x80000000) { $firstbits=0xFC; $trail=5; }
else {
die "Too large scalar value, cannot be converted to UTF-8.\n";
}
for (1 .. $trail) {
unshift (@result, ($t & 0x3F) | 0x80);
$t >>= 6; # slight danger of non-portability
}
unshift (@result, $t | $firstbits);
pack ("C*", @result);
} # end num2utf8
sub utf82ncr { # works for more than one character
my $r;
foreach $c (splitutf8(shift)) {
$r .= ($c =~ /[\040-\177]/) ? $c : sprintf ("&#x%lX;",&utf82num($c));
}
$r;
} # end utf82ncr
#### convert UTF-8 to number
sub utf82num {
local(@t, $t, $result);
@t = unpack ("C*", $_[0]);
$t = shift (@t);
if ($t<0x80) { $result= $t ; $trail=0; }
elsif ($t<0xC0) { die "Illegal leading byte in UTF-8.\n"; }
elsif ($t<0xE0) { $result= $t & 0x1F; $trail=1; }
elsif ($t<0xF0) { $result= $t & 0x0F; $trail=2; }
elsif ($t<0xF8) { $result= $t & 0x07; $trail=3; }
elsif ($t<0xFC) { $result= $t & 0x03; $trail=4; }
elsif ($t<0xFE) { $result= $t & 0x01; $trail=5; }
else { die "Illegal byte in UTF-8.\n"; }
if ($trail != $#t + 1) { die "Not right number of trailing bytes.\n"; }
while ($trail--) {
# maybe check for 01xxxxxx
$result <<= 6;
$result += 0x3F & shift (@t);
}
return $result;
} # end utf82num
#### variant of hex2utf8 to get rid of spaces
sub spacehex2utf8 {
my ($t) = @_;
return "" if ($t eq " ");
hex2utf8($t);
} # end spacehex2utf8
#### split an utf-8 string into codepoints
sub splitutf8 {
split (/(?=[\000-\177\300-\377])/, shift);
} # end splitutf8
#### canonical sort of combining marks
# input: utf-8 string
# return: utf-8 string
sub sortCano {
my @a = @_;
my ($i, $ccHere, $ccPrev, $temp);
return @a if (@a <= 1);
for ($i=1; $i < @a; $i++) {
$ccHere = $CombClass{$a[$i]};
$ccPrev = $CombClass{$a[$i-1]};
$ccHere = 0 if (!defined($ccHere));
$ccPrev = 0 if (!defined($ccPrev));
if ($ccHere != 0 && $ccPrev > $ccHere) {
$temp = $a[$i]; # exchange
$a[$i] = $a[$i-1];
$a[$i-1] = $temp;
$i -= 2 if ($i > 1); # backtrack and check again
}
}
return @a;
} # end sortCano
#
# add algorithmic Hangul and unity transform to DecoCano lookup
#
sub DecoCano {
my ($s) = @_;
my $h = utf82num($s);
if ($h >= 0xAC00 && $h < 0xD7A4) {
my $hindex = $h - 0xAC00;
my $l = 0x1100 + $hindex/(21*28);
my $v = 0x1161 + ($hindex % (21*28)) / 28;
my $t = $hindex % 28;
if ($t) {
return join "", num2utf8($l), num2utf8($v), num2utf8(0x11A7 + $t);
}
else {
return join "", num2utf8($l), num2utf8($v);
}
}
else {
my $r = $DecoCano{$s};
return $r if defined $r;
return $s;
}
} # end DecoCano
#
# add algorithmic Hangul and unity transform to DecoKomp lookup
#
sub DecoKomp {
my ($s) = @_;
my $h = utf82num($s);
if ($h >= 0xAC00 && $h < 0xD7A4) {
return DecoCano($s); #refer to DecoCano for Hangul decomposition
}
else {
my $r = $DecoKompData{$s};
return $r if defined $r;
return $s;
}
} # end DecoKomp
#
# add algorithmic Hangul to CompCano lookup
#
sub CompCano {
my ($starterCh, $ch) = @_;
my $s = utf82num($starterCh);
my $c = utf82num($ch);
if ($s >= 0x1100 && $s < 0x1113 && $c >= 0x1161 && $c < 0x1176) {
return num2utf8((($s-0x1100)*21+$c-0x1161) * 28 + 0xAC00);
}
elsif ($s >= 0xAC00 && $s < 0xD7A4 && !(($s-0xAC00)%28) && $c >= 0x11A8 && $c < 0x11C3) {
return num2utf8($s + $c - 0x11A7);
}
else {
return $CompCano{join "", ($starterCh, $ch)};
}
} # end CompCano
sub printoctal {
my $s = shift;
$s =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/ge;
print STDERR $s;
} # end printoctal
#### output data considering all relevant options
sub printOPT {
my $t = shift;
if ($OPTN) {
# hexadecimal numeric character references
$t =~ s/([\300-\377][\200-\277]+)/utf82ncr($1)/eg;
}
elsif ($OPTo) {
$t =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/eg;
}
print $t;
}
#
# read in base data file
#
sub ReadCharacterDataFile {
my ($dataFile) = @_;
my $line = 0;
open (BASE, $dataFile)
or die "Cannot open character data file $dataFile.\n";
BASE:
while (<BASE>) {
print STDERR "Reading data file, line $line\n"
if !($line % 1000 && !$OPTq); $line++;
chop;
($hex, $name, $category, $combClass, $t4, $dec) = split(/;/);
#### Check ranges for consistency with handcoded pieces, then skip
if ($name =~ /^<(.*), (.*)/) {
if ( $_ !~ /^4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;$/
and $_ !~ /^9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;$/
and $_ !~ /^AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;$/
and $_ !~ /^D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;$/
and $_ !~ /^D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
and $_ !~ /^DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
and $_ !~ /^DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
and $_ !~ /^DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
and $_ !~ /^DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
and $_ !~ /^DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
and $_ !~ /^E000;<Private Use, First>;Co;0;L;;;;;N;;;;;$/
and $_ !~ /^F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;$/
and $_ !~ /^3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;$/
and $_ !~ /^4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;$/
and $_ !~ /^F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;$/
and $_ !~ /^FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;$/
and $_ !~ /^100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;$/
and $_ !~ /^10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;$/
) {
die "Problem with data file consistency, line $line: \n\t$_.\n";
}
}
else { # normal line processing
$u = &hex2utf8($hex);
$exists{$u} = 1; # to check characters that exist
#### Decompositions
if ($dec eq "") { } # no decomposition
elsif ($dec =~ /^<(.*)>(.*)/) { # compatibility
$decKind = $1;
$dec = $2;
$DecoKompKind{$u} = $decKind;
$dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
$DecoKompData{$u} = $dec;
}
else { # canonical decomposition
$dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
$DecoCanoData{$u} = $dec;
$DecoKompData{$u} = $dec; # add to Komp, to expand everything
}
#### Canonical Combining Class
$CombClass{$u} = $combClass if ($combClass);
}
}
close (BASE);
print STDERR "Finished reading character database.\n" if (!$OPTq);
%DecoCanoRest = %DecoCano = %DecoCanoData; # keep original data as is, and
# copy to restrict for composition
# list of compatibility kinds for later work
# the idea is to group them (e.g. sub and super) and allow
# normalization by group
#Kompatibility Kind: circle
#Kompatibility Kind: compat
#Kompatibility Kind: final
#Kompatibility Kind: font
#Kompatibility Kind: fraction
#Kompatibility Kind: initial
#Kompatibility Kind: isolated
#Kompatibility Kind: medial
#Kompatibility Kind: narrow
#Kompatibility Kind: noBreak
#Kompatibility Kind: small
#Kompatibility Kind: square
#Kompatibility Kind: sub
#Kompatibility Kind: super
#Kompatibility Kind: vertical
#Kompatibility Kind: wide
# fully expand canonical decompositions
$fixpoint = 0; # set to false
while (!$fixpoint) {
$fixpoint = 1; # set to true
print "Fixpoint\n" if ($OPTd);
foreach $key (sort keys %DecoCano) {
my @s = splitutf8($DecoCano{$key});
my $i = 0;
foreach $c (@s) {
my $d;
if ($d = $DecoCano{$c}) {
print "replacing ", utf82ncr($c), " with ",
utf82ncr($d), " in ", utf82ncr($key), "\n"
if $OPTd;
if ($i > 0) {
print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n";
die "Giving up!\n";
}
$c = $d;
$fixpoint = 0; # changed something; need one more pass
}
$i++;
}
$DecoCano{$key} = join "", @s;
}
}
# fully expand kompatibility decompositions
$fixpoint = 0; # set to false
while (!$fixpoint) {
$fixpoint = 1; # set to true
print "Fixpoint\n" if ($OPTd);
foreach $key (sort keys %DecoKompData) {
my @s = splitutf8($DecoKompData{$key});
my $i = 0;
foreach $c (@s) {
my $d;
if ($d = $DecoKompData{$c}) {
print "replacing ", utf82ncr($c), " with ",
utf82ncr($d), " in ", utf82ncr($key), "\n"
if $OPTd;
if ($i > 0 && splitutf8($d) > 1) {
# print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n";
# die "Giving up!\n";
}
$c = $d;
$fixpoint = 0; # changed something; need one more pass
}
$i++;
}
$DecoKompData{$key} = join "", @s;
}
}
# reorder combining marks for canonical decomposition
foreach $key (sort keys %DecoCano) { # sort to sort the output
my $s = $DecoCano{$key};
my $t = join "", sortCano(splitutf8($s));
if ($s ne $t) {
print STDERR "Error: Had to reorder ", utf82ncr($key), " from ",
utf82ncr($s), " to ", utf82ncr($t), "\n";
die "Giving up!\n";
}
$DecoCano{$key} = $t;
}
# reorder combining marks for kompatibility decomposition
foreach $key (sort keys %DecoKompData) { # sort to sort the output
my $s = $DecoKompData{$key};
my $t = join "", sortCano(splitutf8($s));
if ($s ne $t) {
print STDERR "Error: Had to reorder ", utf82ncr($key), " from ",
utf82ncr($s), " to ", utf82ncr($t), "\n";
die "Giving up!\n";
}
$DecoKompData{$key} = $t;
}
# detect singular compositions
foreach $key (sort keys %DecoCanoRest) { # sort to sort output
if (1 == scalar(splitutf8($DecoCanoRest{$key}))) {
print 'Singular composition: ', utf82ncr($key), ' from ',
utf82ncr($DecoCanoRest{$key}), ", removed.\n"
if ($OPTd);
delete $DecoCanoRest{$key};
}
}
# detect 'non-zero' compositions
foreach $key (sort keys %DecoCanoRest) { # sort to sort output
my @a = splitutf8($DecoCanoRest{$key});
if ($CombClass{shift @a}) {
print 'Non-zero composition: ', utf82ncr($key), ' from ',
utf82ncr($DecoCanoRest{$key}), ", removed.\n"
if ($OPTd);
delete $DecoCanoRest{$key};
}
}
# detect 'all-zero' compositions
foreach $key (sort keys %DecoCanoRest) { # sort to sort output
my $allzero = 1;
my @a = splitutf8($DecoCanoRest{$key});
foreach $c (@a) {
$allzero = 0 if ($CombClass{$c});
}
if ($allzero) {
print 'All-zero composition: ', utf82ncr($key), ' from ',
utf82ncr($DecoCanoRest{$key}), ".\n"
if ($OPTd);
}
}
@NoRecomp = ( # Script-specific, table-based
# according to ftp://ftp.unicode.org/Public/3.0-Update/CompositionExclusions-1.beta.txt
'0958', # DEVANAGARI LETTER QA
'0959', # DEVANAGARI LETTER KHHA
'095A', # DEVANAGARI LETTER GHHA
'095B', # DEVANAGARI LETTER ZA
'095C', # DEVANAGARI LETTER DDDHA
'095D', # DEVANAGARI LETTER RHA
'095E', # DEVANAGARI LETTER FA
'095F', # DEVANAGARI LETTER YYA
'09DC', # BENGALI LETTER RRA
'09DD', # BENGALI LETTER RHA
'09DF', # BENGALI LETTER YYA
'0A33', # GURMUKHI LETTER LLA
'0A36', # GURMUKHI LETTER SHA
'0A59', # GURMUKHI LETTER KHHA
'0A5A', # GURMUKHI LETTER GHHA
'0A5B', # GURMUKHI LETTER ZA
'0A5E', # GURMUKHI LETTER FA
'0B5C', # ORIYA LETTER RRA
'0B5D', # ORIYA LETTER RHA
'0F43', # TIBETAN LETTER GHA
'0F4D', # TIBETAN LETTER DDHA
'0F52', # TIBETAN LETTER DHA
'0F57', # TIBETAN LETTER BHA
'0F5C', # TIBETAN LETTER DZHA
'0F69', # TIBETAN LETTER KSSA
'0F76', # TIBETAN VOWEL SIGN VOCALIC R
'0F78', # TIBETAN VOWEL SIGN VOCALIC L
'0F93', # TIBETAN SUBJOINED LETTER GHA
'0F9D', # TIBETAN SUBJOINED LETTER DDHA
'0FA2', # TIBETAN SUBJOINED LETTER DHA
'0FA7', # TIBETAN SUBJOINED LETTER BHA
'0FAC', # TIBETAN SUBJOINED LETTER DZHA
'0FB9', # TIBETAN SUBJOINED LETTER KSSA
'FB1F', # HEBREW LIGATURE YIDDISH YOD YOD PATAH
'FB2A', # HEBREW LETTER SHIN WITH SHIN DOT
'FB2B', # HEBREW LETTER SHIN WITH SIN DOT
'FB2C', # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
'FB2D', # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
'FB2E', # HEBREW LETTER ALEF WITH PATAH
'FB2F', # HEBREW LETTER ALEF WITH QAMATS
'FB30', # HEBREW LETTER ALEF WITH MAPIQ
'FB31', # HEBREW LETTER BET WITH DAGESH
'FB32', # HEBREW LETTER GIMEL WITH DAGESH
'FB33', # HEBREW LETTER DALET WITH DAGESH
'FB34', # HEBREW LETTER HE WITH MAPIQ
'FB35', # HEBREW LETTER VAV WITH DAGESH
'FB36', # HEBREW LETTER ZAYIN WITH DAGESH
'FB38', # HEBREW LETTER TET WITH DAGESH
'FB39', # HEBREW LETTER YOD WITH DAGESH
'FB3A', # HEBREW LETTER FINAL KAF WITH DAGESH
'FB3B', # HEBREW LETTER KAF WITH DAGESH
'FB3C', # HEBREW LETTER LAMED WITH DAGESH
'FB3E', # HEBREW LETTER MEM WITH DAGESH
'FB40', # HEBREW LETTER NUN WITH DAGESH
'FB41', # HEBREW LETTER SAMEKH WITH DAGESH
'FB43', # HEBREW LETTER FINAL PE WITH DAGESH
'FB44', # HEBREW LETTER PE WITH DAGESH
'FB46', # HEBREW LETTER TSADI WITH DAGESH
'FB47', # HEBREW LETTER QOF WITH DAGESH
'FB48', # HEBREW LETTER RESH WITH DAGESH
'FB49', # HEBREW LETTER SHIN WITH DAGESH
'FB4A', # HEBREW LETTER TAV WITH DAGESH
'FB4B', # HEBREW LETTER VAV WITH HOLAM
'FB4C', # HEBREW LETTER BET WITH RAFE
'FB4D', # HEBREW LETTER KAF WITH RAFE
'FB4E' # HEBREW LETTER PE WITH RAFE
);
# remove script-specific
foreach $hex (@NoRecomp) {
my $key = &hex2utf8($hex);
print 'Non-recomposing composition: ', utf82ncr($key), ' from ',
utf82ncr($DecoCanoRest{$key}), ", removed.\n"
if ($OPTd);
delete $DecoCanoRest{$key};
}
# replace with fully expanded decompositions
foreach $key (keys %DecoCanoRest) {
$DecoCanoRest{$key} = $DecoCano{$key}
if ($DecoCanoRest{$key} ne $DecoCano{$key});
}
# detect duplicate compositions ## should not find any
if ($OPTd) {
print "Checking duplicates, takes some time.\n";
foreach $key (sort keys %DecoCanoRest) { # sort to sort output
my $s = $DecoCanoRest{$key};
foreach $key2 (keys %DecoCanoRest) {
if (($key lt $key2) # don't compare with itself, don't warn twice
&& ($DecoCanoRest{$key2} eq $s)) { # duplicate compositions
print STDERR 'Duplicate composition: ', utf82ncr($key),
' and ', utf82ncr($key2),
' to ', utf82ncr($s), "\n";
die "Giving up!\n";
}
else { next; } # shortcut loop
}
}
}
# invert for composition
foreach $key (keys %DecoCanoRest) { # use reduced decomps for selection
if (defined $CompCano{$DecoCanoData{$key}}) # use original data
{ # (strictly binary)
die "Duplicate compositions, giving up.\n";
}
$CompCano{$DecoCanoData{$key}} = $key;
}
if ($OPTd) {
foreach $key (sort keys %DecoCano) { # sort to sort output
print 'Final canonical decomposition: ', utf82ncr($key),
' to ', utf82ncr($DecoCano{$key}), "\n";
}
foreach $key (sort keys %CompCano) { # sort to sort output
print 'Final composition: ', utf82ncr($key),
' to ', utf82ncr($CompCano{$key}), "\n";
}
foreach $key (sort keys %DecoKompData) { # sort to sort output
print 'Final kompatibility decomposition: ', utf82ncr($key),
' to ', utf82ncr($DecoKompData{$key}), "\n";
}
}
print STDERR "Finished processing character data file(s).\n" if (!$OPTq);
} # end ReadCharacterDataFile
#
# store data to file for fast reread
#
sub StoreData {
my ($dataFile) = @_;
my %all_data = ();
$all_data{exists} = \%exists;
$all_data{DecoCano} = \%DecoCano;
$all_data{CompCano} = \%CompCano;
$all_data{DecoKompData} = \%DecoKompData;
$all_data{CombClass} = \%CombClass;
require Storable; # in line, to not require module if not needed
&Storable::nstore (\%all_data, $dataFile);
}
#
# read data from file
#
sub ReadStoredData {
my ($dataFile) = @_;
require Storable; # in line, to not require module if not needed
my %all_data = %{&Storable::retrieve ($dataFile)};
%exists = %{$all_data{exists}};
%DecoCano = %{$all_data{DecoCano}};
%CompCano = %{$all_data{CompCano}};
%DecoKompData = %{$all_data{DecoKompData}};
%CombClass = %{$all_data{CombClass}};
}
#
# Print instructions (-h)
#
sub PrintInstructions {
print STDERR <<EOF;
charlint (code name Charly)
Character Check and Normalization
According to W3C and Unicode Specifications
===========================================
$version
(c) Keio University 1999, see perl source or
http://www.w3.org/International/charlint for details
Available options:
(options prefixed by # are currently not available)
-b: Remove initial 'Byte Order Mark'
-B: Supress warning about initial 'Byte Order Mark'
-C: Do not normalize
-d: Debug: Thoroughly check character data table input
-D: Leave after reading in character data
-e: # remove undefined codepoints
-E: Do not warn about undefined codepoints
-f file: Read data from file (no default anymore)
(please use newest V3.0 datafiles)
-h: Prints out this short description
-k: # Warn about compatibility codepoints
-K: Normalize out (i.e. decompose) compatibility codepoints
-n: Accept &#ddddd; and &#xhhhh; on input
(beware of <![CDATA[, <SCRIPT>, <STYLE>)
-N: Produce &#xhhhh; on output
-o: Print out 'unprintable' bytes as \octal
-p: # Remove stuff in private zone
-P: Supress checking private zone
-q: Quiet, don't output progress messages
-s file: Read data from file produced with -S
-S file: Write data to file for fast reload with -s
-u: # Fix UTF-8 (convert or remove)
-U: Supress checking correctness of UTF-8
-v: Print version
-x: Do decomposition only
-X: Don't do decomposition (assume input is decomposed)
EOF
# end of raw in-place text
# ideas for more options:
# * don't normalize, just check
# * allow to do kompatibility processing by category
# * warn/remove plane 14 language tag codes and other crap
# * convert crap to what it's supposed to be (difficult)
# * directionality control codes
} # end &PrintInstructions
#
# Start of main program
#
#
# Read options
#
OPTIONS:
while ($ARGV[0] =~ /^-/) {
$_ = shift(@ARGV);
$OPTb= 1, next OPTIONS if /^-b$/;
$OPTB= 1, next OPTIONS if /^-B$/;
$OPTC= 1, next OPTIONS if /^-C$/;
$OPTd= 1, next OPTIONS if /^-d$/;
$OPTD= 1, next OPTIONS if /^-D$/;
$OPTE= 1, next OPTIONS if /^-E$/;
if (/^-f$/) {
$OPTf = 1;
$dataFile = shift(@ARGV);
print STDERR "Using character data file $dataFile.",
" Maybe not what you intend.\n" if ($dataFile =~ /^-.$/ && !$OPTq);
next OPTIONS;
}
$OPTh= 1, next OPTIONS if /^-h$/;
$OPTK= 1, next OPTIONS if /^-K$/;
$OPTn= 1, next OPTIONS if /^-n$/;
$OPTN= 1, next OPTIONS if /^-N$/;
$OPTo= 1, next OPTIONS if /^-o$/;
$OPTP= 1, next OPTIONS if /^-P$/;
$OPTq= 1, next OPTIONS if /^-P$/;
if (/^-s$/) {
$OPTs = 1;
$readStoreFile = shift(@ARGV);
print STDERR "Reading from store file $readStoreFile.",
" Maybe not what you intend.\n" if ($readStoreFile =~ /^-.$/ && !$OPTq);
next OPTIONS;
}
if (/^-S$/) {
$OPTS = 1;
$writeStoreFile = shift(@ARGV);
print STDERR "Writing to store file $writeStoreFile.",
" Maybe not what you intend.\n" if ($writeStoreFile =~ /^-.$/ && !$OPTq);
next OPTIONS;
}
$OPTU= 1, next OPTIONS if /^-U$/;
$OPTv= 1, next OPTIONS if /^-v$/;
$OPTx= 1, next OPTIONS if /^-x$/;
$OPTX= 1, next OPTIONS if /^-X$/;
print STDERR "Unrecognized argument: \"", $_, "\"; ignored.\n";
}
if ($OPTh) { &PrintInstructions(); }
if ($OPTv) {
print $version, ",\nfor more information, use 'charlint -h'.\n";
}
#
# Read/write data files
#
if ($OPTf && !$OPTs && $OPTS) {
ReadCharacterDataFile($dataFile);
StoreData ($writeStoreFile);
}
elsif ($OPTf && !$OPTs && !$OPTS) {
ReadCharacterDataFile($dataFile);
}
elsif (!$OPTf && $OPTs && !$OPTS) {
ReadStoredData ($readStoreFile);
}
elsif (!$OPTf && !$OPTs && !$OPTS) {
# default amounts to -f UnicodeData-Latest.txt
ReadCharacterDataFile("UnicodeData-Latest.txt");
}
else {
die "Inappropriate file option combination (-f/-s/-S).\n";
}
exit 0 if ($OPTD);
#
# PROCESS ACTUAL FILE(S)
#
$line = 0;
LINE:
while (<>) {
$line++;
# Convert NCRs on input
if ($OPTn) {
# decimal numeric character references
s/&#([0-9]+)\;/Xnum2utf8($1)/eg;
# hexadecimal numeric character references
s/&#x([0-9a-fA-F]+)\;/Xhex2utf8($1)/eg;
}
# Check BOM
if ($line == 1) {
if (!$OPTb && /^\357\273\277/) {
print STDERR "Initial BOM.\n";
}
if ($OPTB) {
s/^\357\273\277// # remove initial BOM
}
}
# Check UTF-8
if (!$OPTU && ($r = CheckUTF8 ($_))) {
print STDERR "Line $line: Non-UTF-8 ($r).\n";
die "Giving up!\n";
}
# Check nonexisting characters
if (!$OPTE && CheckExists ($_)) {
die "Line $line: Non-Existing codepoints.\nGiving up!\n";
}
# Check private characters
if (!$OPTP && ($r = CheckPrivate ($_))) {
die "Line $line: Private charaters ($r).\nGiving up!\n";
}
@line = splitutf8($_);
@line2 = ();
if (!$OPTC) {
if ($OPTX) {
@line2 = @line;
}
else { # decompose
while (defined($s = shift @line)) {
if ($OPTK) {
push @line2, splitutf8(DecoKomp($s));
}
else {
push @line2, splitutf8(DecoCano($s));
}
}
}
# canonical reordering
@line = sortCano(@line2);
# recompose
if (!$OPTx) {
$starterPos = 0;
$sourceLength = @line;
if ($sourceLength == 0) {
$targetPos = 0;
}
else {
$targetPos = 1;
$starterCh = $line[0];
$lastClass = -1; # this eliminates a special check
}
for ($sourcePos = 1; $sourcePos < $sourceLength; $sourcePos++) {
$ch = $line[$sourcePos];
$chClass = $CombClass{$ch};
$chClass = 0 if (!defined($chClass));
$composite = CompCano($starterCh, $ch);
if (defined($composite) && $lastClass < $chClass) {
$line[$starterPos] = $composite;
$starterCh = $composite;
}
elsif ($chClass == 0) {
$starterPos = $targetPos;
$starterCh = $ch;
$lastClass = -1;
$line[$targetPos++] = $ch;
}
else {
$lastClass = $chClass;
$line[$targetPos++] = $ch;
}
}
$#line = $targetPos-1;
} # end of recomposition
} #if (!OPTC)
printOPT (join "", @line);
} # end while <>
Webmaster