#!/usr/bin/perl # Copyright notice: # (c) Copyright Keio University 1999, 2000 # This software is made available under the terms of the # W3C Software Licence available at # http://www.w3.org/Consortium/Legal/copyright-software. # Acknowledgements: Mark Davis for various discussions about Unicode TR #15 # and for the test suite # Paul Hoffman for suggestions, encouragement, and bug reports # Roland Mas for how to use Storable # James Briggs and Masayasu Ishikawa for error reports # Tim Bray for help with CDATA # Authors: # MJD Martin J. Du"rst, duerst@w3.org $version = 'Version 0.45'; # History: # 2000/11/12: 0.45, bug fix for CJK extension A MJD # 2000/11/09: 0.44, implemented -s/-S (Storable data) MJD # 2000/11/05: 0.43, implemented -K (kompatibility decomposition) MJD # 2000/11/05: 0.42, updated for 3.0.1, fixed line ends MJD # 2000/11/05: 0.41, added 2000 to copyright, tested CVS commit MJD # 2000/08/03: 0.40, added Hangul support and did quite some testing MJD # 2000/08/02: 0.37, added -x and -X for decomposition MJD # 2000/07/27: 0.36, fixed a bug for non-starter decompositions MJD # 2000/07/24: 0.35, adapted exclusions to 3.0.0 final (+Tibetan) MJD # 2000/07/24: 0.34, $chClass = $CombClass{ch}; should read $chClass = $CombClass{$ch}; # implemented -C MJD # 1999/08/16: 0.33, updated for second version of 3.0.0.beta MJD # 1999/07/01: 0.32, adapted surrogates/exclusions to 3.0.0.beta MJD # 1999/06/25: 0.31, fixed reordering bug, going public MJD # 1999/06/23: 0.30, preparation for W3C member test, without Hangul MJD # CVS last revised $Date: 2000/11/17 16:46:16 $ by $Author: duerst $ #package CHARLINT; ## tried these, but need more time to get it right #use strict; #use diagnostics; use Storable; # # SUBROUTINES # sub addtext { local (&r, $t) = @_; $r = $r ? ($r."; ".$t) : $t; } # Check problems in UTF-8 sub CheckUTF8 { local ($s) = @_; local ($return) = ""; local ($st); if ($s =~ /[\355][\240-\257][\200-\277][\355][\260-\277][\200-\277]/) { &addtext (*return, "surrogate pair") if !$OPTU; } if ($s =~ /[\355][\240-\277][\200-\277]/) { &addtext (*return, "single surrogate") if !$OPTU; } if ($s =~ /[\300-\301][\200-\277]/) { &addtext (*return, "ASCII in 2 bytes") if !$OPTU; } if ($s =~ /[\340][\200-\237][\200-\277]/) { &addtext (*return, "3 bytes instead of 2 or less") if !$OPTU; } if ($s =~ /[\360][\200-\217][\200-\277]{2}/) { &addtext (*return, "4 bytes instead of 3 or less") if !$OPTU; } if ($s =~ /[\370][\200-\207][\200-\277]{3}/) { &addtext (*return, "5 bytes instead of 4 or less") if !$OPTU; } if ($s =~ /[\374][\200-\203][\200-\277]{4}/) { &addtext (*return, "6 bytes instead of 5 or less") if !$OPTU; } # non-synchronized cases $s =~ s{ [\000-\177] | [\300-\337][\200-\277] | [\340-\357][\200-\277]{2} | [\360-\367][\200-\277]{3} | [\370-\373][\200-\277]{4} | [\374-\375][\200-\277]{5} }{}gx; # forbidden bytes if ($s =~ /[\376\377]/) { &addtext (*return, "0xFE or 0xFF byte") if !$OPTU; } if ($s ne "") { &addtext (*return, "synchronization problem") if !$OPTU; } $return; } # end CheckUTF8 sub exists { my $s = shift; return ($exists{$s} or ("\344\270\200" le $s and $s le "\351\276\245") # CJK or ("\352\260\200" le $s and $s le "\355\236\243") # Hangul or ("\343\220\200" le $s and $s le "\344\266\265")); # CJK Extension A } sub CheckExists { my ($s) = @_; my $news = join "", grep (&exists($_), $s =~ m/([\000-\177]|[\300-\377][\200-\277]+)/go); return ($s ne $news); } # end CheckExists sub CheckPrivate { my ($s) = @_; if ($s =~ /[\356][\200-\277]{2}|[\357][\200-\237][\200-\277]/) { return "BMP"; } if ($s =~ /[\363][\260-\277][\200-\277]{2}/) { return "plane 15"; } if ($s =~ /[\364][\200-\217][\200-\277]{2}/) { return "plane 16"; } } # end CheckPrivate #### convert hex to UTF-8 sub hex2utf8 { num2utf8 (hex($_[0])); } # end hex2utf8 sub Xhex2utf8 { #### avoid to covert <>&" local ($t) = hex($_[0]); if ($t < 0x40) { return "&#x".$_[0].";"; } num2utf8 ($t); } # end Xhex2utf8 sub Xnum2utf8 { #### avoid to covert <>&" local ($t) = @_; if ($t < 64) { return "&#".$t.";"; } num2utf8 ($t); } # end Xnum2utf8 #### convert number to UTF-8 sub num2utf8 { local($t) = @_; local($trail, $firstbits, @result); if ($t<0x00000080) { $firstbits=0x00; $trail=0; } elsif ($t<0x00000800) { $firstbits=0xC0; $trail=1; } elsif ($t<0x00010000) { $firstbits=0xE0; $trail=2; } elsif ($t<0x00200000) { $firstbits=0xF0; $trail=3; } elsif ($t<0x04000000) { $firstbits=0xF8; $trail=4; } elsif ($t<0x80000000) { $firstbits=0xFC; $trail=5; } else { die "Too large scalar value, cannot be converted to UTF-8.\n"; } for (1 .. $trail) { unshift (@result, ($t & 0x3F) | 0x80); $t >>= 6; # slight danger of non-portability } unshift (@result, $t | $firstbits); pack ("C*", @result); } # end num2utf8 sub utf82ncr { # works for more than one character my $r; foreach $c (splitutf8(shift)) { $r .= ($c =~ /[\040-\177]/) ? $c : sprintf ("&#x%lX;",&utf82num($c)); } $r; } # end utf82ncr #### convert UTF-8 to number sub utf82num { local(@t, $t, $result); @t = unpack ("C*", $_[0]); $t = shift (@t); if ($t<0x80) { $result= $t ; $trail=0; } elsif ($t<0xC0) { die "Illegal leading byte in UTF-8.\n"; } elsif ($t<0xE0) { $result= $t & 0x1F; $trail=1; } elsif ($t<0xF0) { $result= $t & 0x0F; $trail=2; } elsif ($t<0xF8) { $result= $t & 0x07; $trail=3; } elsif ($t<0xFC) { $result= $t & 0x03; $trail=4; } elsif ($t<0xFE) { $result= $t & 0x01; $trail=5; } else { die "Illegal byte in UTF-8.\n"; } if ($trail != $#t + 1) { die "Not right number of trailing bytes.\n"; } while ($trail--) { # maybe check for 01xxxxxx $result <<= 6; $result += 0x3F & shift (@t); } return $result; } # end utf82num #### variant of hex2utf8 to get rid of spaces sub spacehex2utf8 { my ($t) = @_; return "" if ($t eq " "); hex2utf8($t); } # end spacehex2utf8 #### split an utf-8 string into codepoints sub splitutf8 { split (/(?=[\000-\177\300-\377])/, shift); } # end splitutf8 #### canonical sort of combining marks # input: utf-8 string # return: utf-8 string sub sortCano { my @a = @_; my ($i, $ccHere, $ccPrev, $temp); return @a if (@a <= 1); for ($i=1; $i < @a; $i++) { $ccHere = $CombClass{$a[$i]}; $ccPrev = $CombClass{$a[$i-1]}; $ccHere = 0 if (!defined($ccHere)); $ccPrev = 0 if (!defined($ccPrev)); if ($ccHere != 0 && $ccPrev > $ccHere) { $temp = $a[$i]; # exchange $a[$i] = $a[$i-1]; $a[$i-1] = $temp; $i -= 2 if ($i > 1); # backtrack and check again } } return @a; } # end sortCano # # add algorithmic Hangul and unity transform to DecoCano lookup # sub DecoCano { my ($s) = @_; my $h = utf82num($s); if ($h >= 0xAC00 && $h < 0xD7A4) { my $hindex = $h - 0xAC00; my $l = 0x1100 + $hindex/(21*28); my $v = 0x1161 + ($hindex % (21*28)) / 28; my $t = $hindex % 28; if ($t) { return join "", num2utf8($l), num2utf8($v), num2utf8(0x11A7 + $t); } else { return join "", num2utf8($l), num2utf8($v); } } else { my $r = $DecoCano{$s}; return $r if defined $r; return $s; } } # end DecoCano # # add algorithmic Hangul and unity transform to DecoKomp lookup # sub DecoKomp { my ($s) = @_; my $h = utf82num($s); if ($h >= 0xAC00 && $h < 0xD7A4) { return DecoCano($s); #refer to DecoCano for Hangul decomposition } else { my $r = $DecoKompData{$s}; return $r if defined $r; return $s; } } # end DecoKomp # # add algorithmic Hangul to CompCano lookup # sub CompCano { my ($starterCh, $ch) = @_; my $s = utf82num($starterCh); my $c = utf82num($ch); if ($s >= 0x1100 && $s < 0x1113 && $c >= 0x1161 && $c < 0x1176) { return num2utf8((($s-0x1100)*21+$c-0x1161) * 28 + 0xAC00); } elsif ($s >= 0xAC00 && $s < 0xD7A4 && !(($s-0xAC00)%28) && $c >= 0x11A8 && $c < 0x11C3) { return num2utf8($s + $c - 0x11A7); } else { return $CompCano{join "", ($starterCh, $ch)}; } } # end CompCano sub printoctal { my $s = shift; $s =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/ge; print STDERR $s; } # end printoctal #### output data considering all relevant options sub printOPT { my $t = shift; if ($OPTN) { # hexadecimal numeric character references $t =~ s/([\300-\377][\200-\277]+)/utf82ncr($1)/eg; } elsif ($OPTo) { $t =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/eg; } print $t; } # # read in base data file # sub ReadCharacterDataFile { my ($dataFile) = @_; my $line = 0; open (BASE, $dataFile) or die "Cannot open character data file $dataFile.\n"; BASE: while () { print STDERR "Reading data file, line $line\n" if !($line % 1000 && !$OPTq); $line++; chop; ($hex, $name, $category, $combClass, $t4, $dec) = split(/;/); #### Check ranges for consistency with handcoded pieces, then skip if ($name =~ /^<(.*), (.*)/) { if ( $_ !~ /^4E00;;Lo;0;L;;;;;N;;;;;$/ and $_ !~ /^9FA5;;Lo;0;L;;;;;N;;;;;$/ and $_ !~ /^AC00;;Lo;0;L;;;;;N;;;;;$/ and $_ !~ /^D7A3;;Lo;0;L;;;;;N;;;;;$/ and $_ !~ /^D800;;Cs;0;L;;;;;N;;;;;$/ and $_ !~ /^DB7F;;Cs;0;L;;;;;N;;;;;$/ and $_ !~ /^DB80;;Cs;0;L;;;;;N;;;;;$/ and $_ !~ /^DBFF;;Cs;0;L;;;;;N;;;;;$/ and $_ !~ /^DC00;;Cs;0;L;;;;;N;;;;;$/ and $_ !~ /^DFFF;;Cs;0;L;;;;;N;;;;;$/ and $_ !~ /^E000;;Co;0;L;;;;;N;;;;;$/ and $_ !~ /^F8FF;;Co;0;L;;;;;N;;;;;$/ and $_ !~ /^3400;;Lo;0;L;;;;;N;;;;;$/ and $_ !~ /^4DB5;;Lo;0;L;;;;;N;;;;;$/ and $_ !~ /^F0000;;Co;0;L;;;;;N;;;;;$/ and $_ !~ /^FFFFD;;Co;0;L;;;;;N;;;;;$/ and $_ !~ /^100000;;Co;0;L;;;;;N;;;;;$/ and $_ !~ /^10FFFD;;Co;0;L;;;;;N;;;;;$/ ) { die "Problem with data file consistency, line $line: \n\t$_.\n"; } } else { # normal line processing $u = &hex2utf8($hex); $exists{$u} = 1; # to check characters that exist #### Decompositions if ($dec eq "") { } # no decomposition elsif ($dec =~ /^<(.*)>(.*)/) { # compatibility $decKind = $1; $dec = $2; $DecoKompKind{$u} = $decKind; $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg; $DecoKompData{$u} = $dec; } else { # canonical decomposition $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg; $DecoCanoData{$u} = $dec; $DecoKompData{$u} = $dec; # add to Komp, to expand everything } #### Canonical Combining Class $CombClass{$u} = $combClass if ($combClass); } } close (BASE); print STDERR "Finished reading character database.\n" if (!$OPTq); %DecoCanoRest = %DecoCano = %DecoCanoData; # keep original data as is, and # copy to restrict for composition # list of compatibility kinds for later work # the idea is to group them (e.g. sub and super) and allow # normalization by group #Kompatibility Kind: circle #Kompatibility Kind: compat #Kompatibility Kind: final #Kompatibility Kind: font #Kompatibility Kind: fraction #Kompatibility Kind: initial #Kompatibility Kind: isolated #Kompatibility Kind: medial #Kompatibility Kind: narrow #Kompatibility Kind: noBreak #Kompatibility Kind: small #Kompatibility Kind: square #Kompatibility Kind: sub #Kompatibility Kind: super #Kompatibility Kind: vertical #Kompatibility Kind: wide # fully expand canonical decompositions $fixpoint = 0; # set to false while (!$fixpoint) { $fixpoint = 1; # set to true print "Fixpoint\n" if ($OPTd); foreach $key (sort keys %DecoCano) { my @s = splitutf8($DecoCano{$key}); my $i = 0; foreach $c (@s) { my $d; if ($d = $DecoCano{$c}) { print "replacing ", utf82ncr($c), " with ", utf82ncr($d), " in ", utf82ncr($key), "\n" if $OPTd; if ($i > 0) { print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n"; die "Giving up!\n"; } $c = $d; $fixpoint = 0; # changed something; need one more pass } $i++; } $DecoCano{$key} = join "", @s; } } # fully expand kompatibility decompositions $fixpoint = 0; # set to false while (!$fixpoint) { $fixpoint = 1; # set to true print "Fixpoint\n" if ($OPTd); foreach $key (sort keys %DecoKompData) { my @s = splitutf8($DecoKompData{$key}); my $i = 0; foreach $c (@s) { my $d; if ($d = $DecoKompData{$c}) { print "replacing ", utf82ncr($c), " with ", utf82ncr($d), " in ", utf82ncr($key), "\n" if $OPTd; if ($i > 0 && splitutf8($d) > 1) { # print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n"; # die "Giving up!\n"; } $c = $d; $fixpoint = 0; # changed something; need one more pass } $i++; } $DecoKompData{$key} = join "", @s; } } # reorder combining marks for canonical decomposition foreach $key (sort keys %DecoCano) { # sort to sort the output my $s = $DecoCano{$key}; my $t = join "", sortCano(splitutf8($s)); if ($s ne $t) { print STDERR "Error: Had to reorder ", utf82ncr($key), " from ", utf82ncr($s), " to ", utf82ncr($t), "\n"; die "Giving up!\n"; } $DecoCano{$key} = $t; } # reorder combining marks for kompatibility decomposition foreach $key (sort keys %DecoKompData) { # sort to sort the output my $s = $DecoKompData{$key}; my $t = join "", sortCano(splitutf8($s)); if ($s ne $t) { print STDERR "Error: Had to reorder ", utf82ncr($key), " from ", utf82ncr($s), " to ", utf82ncr($t), "\n"; die "Giving up!\n"; } $DecoKompData{$key} = $t; } # detect singular compositions foreach $key (sort keys %DecoCanoRest) { # sort to sort output if (1 == scalar(splitutf8($DecoCanoRest{$key}))) { print 'Singular composition: ', utf82ncr($key), ' from ', utf82ncr($DecoCanoRest{$key}), ", removed.\n" if ($OPTd); delete $DecoCanoRest{$key}; } } # detect 'non-zero' compositions foreach $key (sort keys %DecoCanoRest) { # sort to sort output my @a = splitutf8($DecoCanoRest{$key}); if ($CombClass{shift @a}) { print 'Non-zero composition: ', utf82ncr($key), ' from ', utf82ncr($DecoCanoRest{$key}), ", removed.\n" if ($OPTd); delete $DecoCanoRest{$key}; } } # detect 'all-zero' compositions foreach $key (sort keys %DecoCanoRest) { # sort to sort output my $allzero = 1; my @a = splitutf8($DecoCanoRest{$key}); foreach $c (@a) { $allzero = 0 if ($CombClass{$c}); } if ($allzero) { print 'All-zero composition: ', utf82ncr($key), ' from ', utf82ncr($DecoCanoRest{$key}), ".\n" if ($OPTd); } } @NoRecomp = ( # Script-specific, table-based # according to ftp://ftp.unicode.org/Public/3.0-Update/CompositionExclusions-1.beta.txt '0958', # DEVANAGARI LETTER QA '0959', # DEVANAGARI LETTER KHHA '095A', # DEVANAGARI LETTER GHHA '095B', # DEVANAGARI LETTER ZA '095C', # DEVANAGARI LETTER DDDHA '095D', # DEVANAGARI LETTER RHA '095E', # DEVANAGARI LETTER FA '095F', # DEVANAGARI LETTER YYA '09DC', # BENGALI LETTER RRA '09DD', # BENGALI LETTER RHA '09DF', # BENGALI LETTER YYA '0A33', # GURMUKHI LETTER LLA '0A36', # GURMUKHI LETTER SHA '0A59', # GURMUKHI LETTER KHHA '0A5A', # GURMUKHI LETTER GHHA '0A5B', # GURMUKHI LETTER ZA '0A5E', # GURMUKHI LETTER FA '0B5C', # ORIYA LETTER RRA '0B5D', # ORIYA LETTER RHA '0F43', # TIBETAN LETTER GHA '0F4D', # TIBETAN LETTER DDHA '0F52', # TIBETAN LETTER DHA '0F57', # TIBETAN LETTER BHA '0F5C', # TIBETAN LETTER DZHA '0F69', # TIBETAN LETTER KSSA '0F76', # TIBETAN VOWEL SIGN VOCALIC R '0F78', # TIBETAN VOWEL SIGN VOCALIC L '0F93', # TIBETAN SUBJOINED LETTER GHA '0F9D', # TIBETAN SUBJOINED LETTER DDHA '0FA2', # TIBETAN SUBJOINED LETTER DHA '0FA7', # TIBETAN SUBJOINED LETTER BHA '0FAC', # TIBETAN SUBJOINED LETTER DZHA '0FB9', # TIBETAN SUBJOINED LETTER KSSA 'FB1F', # HEBREW LIGATURE YIDDISH YOD YOD PATAH 'FB2A', # HEBREW LETTER SHIN WITH SHIN DOT 'FB2B', # HEBREW LETTER SHIN WITH SIN DOT 'FB2C', # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT 'FB2D', # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT 'FB2E', # HEBREW LETTER ALEF WITH PATAH 'FB2F', # HEBREW LETTER ALEF WITH QAMATS 'FB30', # HEBREW LETTER ALEF WITH MAPIQ 'FB31', # HEBREW LETTER BET WITH DAGESH 'FB32', # HEBREW LETTER GIMEL WITH DAGESH 'FB33', # HEBREW LETTER DALET WITH DAGESH 'FB34', # HEBREW LETTER HE WITH MAPIQ 'FB35', # HEBREW LETTER VAV WITH DAGESH 'FB36', # HEBREW LETTER ZAYIN WITH DAGESH 'FB38', # HEBREW LETTER TET WITH DAGESH 'FB39', # HEBREW LETTER YOD WITH DAGESH 'FB3A', # HEBREW LETTER FINAL KAF WITH DAGESH 'FB3B', # HEBREW LETTER KAF WITH DAGESH 'FB3C', # HEBREW LETTER LAMED WITH DAGESH 'FB3E', # HEBREW LETTER MEM WITH DAGESH 'FB40', # HEBREW LETTER NUN WITH DAGESH 'FB41', # HEBREW LETTER SAMEKH WITH DAGESH 'FB43', # HEBREW LETTER FINAL PE WITH DAGESH 'FB44', # HEBREW LETTER PE WITH DAGESH 'FB46', # HEBREW LETTER TSADI WITH DAGESH 'FB47', # HEBREW LETTER QOF WITH DAGESH 'FB48', # HEBREW LETTER RESH WITH DAGESH 'FB49', # HEBREW LETTER SHIN WITH DAGESH 'FB4A', # HEBREW LETTER TAV WITH DAGESH 'FB4B', # HEBREW LETTER VAV WITH HOLAM 'FB4C', # HEBREW LETTER BET WITH RAFE 'FB4D', # HEBREW LETTER KAF WITH RAFE 'FB4E' # HEBREW LETTER PE WITH RAFE ); # remove script-specific foreach $hex (@NoRecomp) { my $key = &hex2utf8($hex); print 'Non-recomposing composition: ', utf82ncr($key), ' from ', utf82ncr($DecoCanoRest{$key}), ", removed.\n" if ($OPTd); delete $DecoCanoRest{$key}; } # replace with fully expanded decompositions foreach $key (keys %DecoCanoRest) { $DecoCanoRest{$key} = $DecoCano{$key} if ($DecoCanoRest{$key} ne $DecoCano{$key}); } # detect duplicate compositions ## should not find any if ($OPTd) { print "Checking duplicates, takes some time.\n"; foreach $key (sort keys %DecoCanoRest) { # sort to sort output my $s = $DecoCanoRest{$key}; foreach $key2 (keys %DecoCanoRest) { if (($key lt $key2) # don't compare with itself, don't warn twice && ($DecoCanoRest{$key2} eq $s)) { # duplicate compositions print STDERR 'Duplicate composition: ', utf82ncr($key), ' and ', utf82ncr($key2), ' to ', utf82ncr($s), "\n"; die "Giving up!\n"; } else { next; } # shortcut loop } } } # invert for composition foreach $key (keys %DecoCanoRest) { # use reduced decomps for selection if (defined $CompCano{$DecoCanoData{$key}}) # use original data { # (strictly binary) die "Duplicate compositions, giving up.\n"; } $CompCano{$DecoCanoData{$key}} = $key; } if ($OPTd) { foreach $key (sort keys %DecoCano) { # sort to sort output print 'Final canonical decomposition: ', utf82ncr($key), ' to ', utf82ncr($DecoCano{$key}), "\n"; } foreach $key (sort keys %CompCano) { # sort to sort output print 'Final composition: ', utf82ncr($key), ' to ', utf82ncr($CompCano{$key}), "\n"; } foreach $key (sort keys %DecoKompData) { # sort to sort output print 'Final kompatibility decomposition: ', utf82ncr($key), ' to ', utf82ncr($DecoKompData{$key}), "\n"; } } print STDERR "Finished processing character data file(s).\n" if (!$OPTq); } # end ReadCharacterDataFile # # store data to file for fast reread # sub StoreData { my ($dataFile) = @_; my %all_data = (); $all_data{exists} = \%exists; $all_data{DecoCano} = \%DecoCano; $all_data{CompCano} = \%CompCano; $all_data{DecoKompData} = \%DecoKompData; $all_data{CombClass} = \%CombClass; require Storable; # in line, to not require module if not needed &Storable::nstore (\%all_data, $dataFile); } # # read data from file # sub ReadStoredData { my ($dataFile) = @_; require Storable; # in line, to not require module if not needed my %all_data = %{&Storable::retrieve ($dataFile)}; %exists = %{$all_data{exists}}; %DecoCano = %{$all_data{DecoCano}}; %CompCano = %{$all_data{CompCano}}; %DecoKompData = %{$all_data{DecoKompData}}; %CombClass = %{$all_data{CombClass}}; } # # Print instructions (-h) # sub PrintInstructions { print STDERR <,