Annotation of charlint/charlint.pl, revision 1.25
1.12 duerst 1: #!/usr/bin/perl
2:
3: # Copyright notice:
1.22 duerst 4: # (c) Copyright Keio University 1999-2002
1.12 duerst 5: # This software is made available under the terms of the
6: # W3C Software Licence available at
7: # http://www.w3.org/Consortium/Legal/copyright-software.
8:
9:
1.13 duerst 10: # Acknowledgements: Mark Davis for various discussions about Unicode TR #15
1.12 duerst 11: # and for the test suite
1.16 duerst 12: # Paul Hoffman for suggestions, encouragement, and bug reports
1.15 duerst 13: # Roland Mas for how to use Storable
1.12 duerst 14: # James Briggs and Masayasu Ishikawa for error reports
1.13 duerst 15: # Tim Bray for help with CDATA
1.21 duerst 16: # Kai Henningsen for finally getting me to clean up
17: # for 'use strict' and 'perl -w'
1.24 duerst 18: # Roozbeh Pournader for proposing to deal with HTML &#Xhhhh;
1.12 duerst 19:
1.22 duerst 20: # Author:
1.12 duerst 21: # MJD Martin J. Du"rst, duerst@w3.org
22:
1.25 ! duerst 23: my $version = 'Version 0.52';
1.12 duerst 24:
25: # History:
1.25 ! duerst 26: # 2002/08/23: 0.52, changed default file to UnicodeData.txt MJD
1.24 duerst 27: # 2002/05/21: 0.51, added option -nX (use for HTML only!) MJD
1.22 duerst 28: # 2002/04/03: 0.50, updated for 3.2.0; added -F951; added -c MJD
1.21 duerst 29: # 2001/10/03: 0.49, code cleanup for use strict and -w MJD
30: # 2001/04/01: 0.48, updated for 3.1.0 (final) MJD
1.20 duerst 31: # 2001/03/07: 0.47, YOD WITH HIRIQ corrigendum MJD
1.19 duerst 32: # 2000/12/19: 0.46, updated for 3.1.0 (beta) MJD
1.16 duerst 33: # 2000/11/12: 0.45, bug fix for CJK extension A MJD
1.15 duerst 34: # 2000/11/09: 0.44, implemented -s/-S (Storable data) MJD
35: # 2000/11/05: 0.43, implemented -K (kompatibility decomposition) MJD
36: # 2000/11/05: 0.42, updated for 3.0.1, fixed line ends MJD
37: # 2000/11/05: 0.41, added 2000 to copyright, tested CVS commit MJD
1.12 duerst 38: # 2000/08/03: 0.40, added Hangul support and did quite some testing MJD
39: # 2000/08/02: 0.37, added -x and -X for decomposition MJD
40: # 2000/07/27: 0.36, fixed a bug for non-starter decompositions MJD
41: # 2000/07/24: 0.35, adapted exclusions to 3.0.0 final (+Tibetan) MJD
42: # 2000/07/24: 0.34, $chClass = $CombClass{ch}; should read $chClass = $CombClass{$ch};
43: # implemented -C MJD
44: # 1999/08/16: 0.33, updated for second version of 3.0.0.beta MJD
45: # 1999/07/01: 0.32, adapted surrogates/exclusions to 3.0.0.beta MJD
46: # 1999/06/25: 0.31, fixed reordering bug, going public MJD
47: # 1999/06/23: 0.30, preparation for W3C member test, without Hangul MJD
48:
1.25 ! duerst 49: # CVS last revised $Date: 2002/05/21 16:11:18 $ by $Author: duerst $
1.12 duerst 50:
1.16 duerst 51: #package CHARLINT; ## tried these, but need more time to get it right
1.12 duerst 52: #use diagnostics;
1.21 duerst 53: use strict;
1.14 duerst 54: use Storable;
1.12 duerst 55:
1.21 duerst 56: # Global variables (options and data arrays)
57: use vars qw($OPTB $OPTC $OPTD $OPTE $OPTK
58: $OPTN $OPTP $OPTS $OPTU $OPTX $OPTYWH
1.22 duerst 59: $OPTb $OPTc $OPTd $OPTf $OPTF951 $OPTh
1.23 duerst 60: $OPTn $OPTnX $OPTo $OPTq $OPTs $OPTv $OPTx
1.21 duerst 61: %CombClass %CompCano %DecoCano %DecoCanoData
62: %DecoCanoRest %DecoKompData %DecoKompKind %exists);
63:
64:
1.12 duerst 65: #
66: # SUBROUTINES
67: #
68:
69: sub addtext {
1.21 duerst 70: my ($r, $t) = @_;
71: $$r = $$r ? ($$r."; ".$t) : $t;
1.12 duerst 72: }
73:
74: # Check problems in UTF-8
75:
76: sub CheckUTF8 {
1.21 duerst 77: my ($s) = @_;
78: my ($return) = "";
79: my ($st);
1.12 duerst 80: if ($s =~ /[\355][\240-\257][\200-\277][\355][\260-\277][\200-\277]/) {
1.21 duerst 81: &addtext (\$return, "surrogate pair") if !$OPTU;
1.12 duerst 82: }
83: if ($s =~ /[\355][\240-\277][\200-\277]/) {
1.21 duerst 84: &addtext (\$return, "single surrogate") if !$OPTU;
1.12 duerst 85: }
86: if ($s =~ /[\300-\301][\200-\277]/) {
1.21 duerst 87: &addtext (\$return, "ASCII in 2 bytes") if !$OPTU;
1.12 duerst 88: }
89: if ($s =~ /[\340][\200-\237][\200-\277]/) {
1.21 duerst 90: &addtext (\$return, "3 bytes instead of 2 or less") if !$OPTU;
1.12 duerst 91: }
92: if ($s =~ /[\360][\200-\217][\200-\277]{2}/) {
1.21 duerst 93: &addtext (\$return, "4 bytes instead of 3 or less") if !$OPTU;
1.12 duerst 94: }
95: if ($s =~ /[\370][\200-\207][\200-\277]{3}/) {
1.21 duerst 96: &addtext (\$return, "5 bytes instead of 4 or less") if !$OPTU;
1.12 duerst 97: }
98: if ($s =~ /[\374][\200-\203][\200-\277]{4}/) {
1.21 duerst 99: &addtext (\$return, "6 bytes instead of 5 or less") if !$OPTU;
1.12 duerst 100: }
101: # non-synchronized cases
102: $s =~ s{ [\000-\177]
103: | [\300-\337][\200-\277]
104: | [\340-\357][\200-\277]{2}
105: | [\360-\367][\200-\277]{3}
106: | [\370-\373][\200-\277]{4}
107: | [\374-\375][\200-\277]{5}
108: }{}gx;
109: # forbidden bytes
110: if ($s =~ /[\376\377]/) {
1.21 duerst 111: &addtext (\$return, "0xFE or 0xFF byte") if !$OPTU;
1.12 duerst 112: }
113: if ($s ne "") {
1.21 duerst 114: &addtext (\$return, "synchronization problem") if !$OPTU;
1.12 duerst 115: }
116: $return;
117: } # end CheckUTF8
118:
119: sub exists {
120: my $s = shift;
121: return ($exists{$s}
1.19 duerst 122: or ("\344\270\200" le $s and $s le "\351\276\245") # CJK
123: or ("\352\260\200" le $s and $s le "\355\236\243") # Hangul
124: or ("\343\220\200" le $s and $s le "\344\266\265") # CJK Extension A
125: or ("\xF0\xA0\x80\x80" le $s and $s le "\xF0\xAA\x9B\x96")) # CJK Extension B
1.12 duerst 126: }
127:
128: sub CheckExists {
129: my ($s) = @_;
130: my $news = join "",
131: grep (&exists($_), $s =~ m/([\000-\177]|[\300-\377][\200-\277]+)/go);
132: return ($s ne $news);
133: } # end CheckExists
134:
135: sub CheckPrivate {
136: my ($s) = @_;
137: if ($s =~ /[\356][\200-\277]{2}|[\357][\200-\237][\200-\277]/) {
138: return "BMP";
139: }
140: if ($s =~ /[\363][\260-\277][\200-\277]{2}/) {
141: return "plane 15";
142: }
143: if ($s =~ /[\364][\200-\217][\200-\277]{2}/) {
144: return "plane 16";
145: }
146: } # end CheckPrivate
147:
148:
149: #### convert hex to UTF-8
150:
151: sub hex2utf8 {
152: num2utf8 (hex($_[0]));
153: } # end hex2utf8
154:
155: sub Xhex2utf8 { #### avoid to covert <>&"
1.21 duerst 156: my ($t) = hex($_[0]);
1.12 duerst 157: if ($t < 0x40) { return "&#x".$_[0].";"; }
158: num2utf8 ($t);
159: } # end Xhex2utf8
160:
161: sub Xnum2utf8 { #### avoid to covert <>&"
1.21 duerst 162: my ($t) = @_;
1.12 duerst 163: if ($t < 64) { return "&#".$t.";"; }
164: num2utf8 ($t);
165: } # end Xnum2utf8
166:
167: #### convert number to UTF-8
168:
169: sub num2utf8 {
1.21 duerst 170: my ($t) = @_;
171: my ($trail, $firstbits, @result);
1.12 duerst 172:
173: if ($t<0x00000080) { $firstbits=0x00; $trail=0; }
174: elsif ($t<0x00000800) { $firstbits=0xC0; $trail=1; }
175: elsif ($t<0x00010000) { $firstbits=0xE0; $trail=2; }
176: elsif ($t<0x00200000) { $firstbits=0xF0; $trail=3; }
177: elsif ($t<0x04000000) { $firstbits=0xF8; $trail=4; }
178: elsif ($t<0x80000000) { $firstbits=0xFC; $trail=5; }
179: else {
180: die "Too large scalar value, cannot be converted to UTF-8.\n";
181: }
182: for (1 .. $trail) {
183: unshift (@result, ($t & 0x3F) | 0x80);
184: $t >>= 6; # slight danger of non-portability
185: }
186: unshift (@result, $t | $firstbits);
187: pack ("C*", @result);
188: } # end num2utf8
189:
190:
191: sub utf82ncr { # works for more than one character
192: my $r;
1.21 duerst 193: foreach my $c (splitutf8(shift)) {
1.12 duerst 194: $r .= ($c =~ /[\040-\177]/) ? $c : sprintf ("&#x%lX;",&utf82num($c));
195: }
196: $r;
197: } # end utf82ncr
198:
199:
200: #### convert UTF-8 to number
201:
202: sub utf82num {
1.21 duerst 203: my (@t, $t, $result);
204: my $trail;
1.12 duerst 205: @t = unpack ("C*", $_[0]);
206: $t = shift (@t);
207: if ($t<0x80) { $result= $t ; $trail=0; }
208: elsif ($t<0xC0) { die "Illegal leading byte in UTF-8.\n"; }
209: elsif ($t<0xE0) { $result= $t & 0x1F; $trail=1; }
210: elsif ($t<0xF0) { $result= $t & 0x0F; $trail=2; }
211: elsif ($t<0xF8) { $result= $t & 0x07; $trail=3; }
212: elsif ($t<0xFC) { $result= $t & 0x03; $trail=4; }
213: elsif ($t<0xFE) { $result= $t & 0x01; $trail=5; }
214: else { die "Illegal byte in UTF-8.\n"; }
215:
216: if ($trail != $#t + 1) { die "Not right number of trailing bytes.\n"; }
217: while ($trail--) {
218: # maybe check for 01xxxxxx
219: $result <<= 6;
220: $result += 0x3F & shift (@t);
221: }
222: return $result;
223: } # end utf82num
224:
225: #### variant of hex2utf8 to get rid of spaces
226: sub spacehex2utf8 {
227: my ($t) = @_;
228: return "" if ($t eq " ");
229: hex2utf8($t);
230: } # end spacehex2utf8
231:
232: #### split an utf-8 string into codepoints
233: sub splitutf8 {
234: split (/(?=[\000-\177\300-\377])/, shift);
235: } # end splitutf8
236:
237:
238: #### canonical sort of combining marks
239: # input: utf-8 string
240: # return: utf-8 string
241: sub sortCano {
242: my @a = @_;
243: my ($i, $ccHere, $ccPrev, $temp);
244:
245: return @a if (@a <= 1);
246: for ($i=1; $i < @a; $i++) {
247: $ccHere = $CombClass{$a[$i]};
248: $ccPrev = $CombClass{$a[$i-1]};
249: $ccHere = 0 if (!defined($ccHere));
250: $ccPrev = 0 if (!defined($ccPrev));
251: if ($ccHere != 0 && $ccPrev > $ccHere) {
252: $temp = $a[$i]; # exchange
253: $a[$i] = $a[$i-1];
254: $a[$i-1] = $temp;
255: $i -= 2 if ($i > 1); # backtrack and check again
256: }
257: }
258: return @a;
259: } # end sortCano
260:
261: #
1.13 duerst 262: # add algorithmic Hangul and unity transform to DecoCano lookup
1.12 duerst 263: #
264: sub DecoCano {
265: my ($s) = @_;
266: my $h = utf82num($s);
267: if ($h >= 0xAC00 && $h < 0xD7A4) {
268: my $hindex = $h - 0xAC00;
269: my $l = 0x1100 + $hindex/(21*28);
270: my $v = 0x1161 + ($hindex % (21*28)) / 28;
271: my $t = $hindex % 28;
272: if ($t) {
273: return join "", num2utf8($l), num2utf8($v), num2utf8(0x11A7 + $t);
274: }
275: else {
276: return join "", num2utf8($l), num2utf8($v);
277: }
278: }
279: else {
1.13 duerst 280: my $r = $DecoCano{$s};
281: return $r if defined $r;
282: return $s;
1.12 duerst 283: }
284: } # end DecoCano
285:
286: #
1.13 duerst 287: # add algorithmic Hangul and unity transform to DecoKomp lookup
288: #
289: sub DecoKomp {
290: my ($s) = @_;
291: my $h = utf82num($s);
292: if ($h >= 0xAC00 && $h < 0xD7A4) {
293: return DecoCano($s); #refer to DecoCano for Hangul decomposition
294: }
295: else {
296: my $r = $DecoKompData{$s};
297: return $r if defined $r;
298: return $s;
299: }
300: } # end DecoKomp
301:
302: #
1.12 duerst 303: # add algorithmic Hangul to CompCano lookup
304: #
305: sub CompCano {
306: my ($starterCh, $ch) = @_;
307: my $s = utf82num($starterCh);
308: my $c = utf82num($ch);
309: if ($s >= 0x1100 && $s < 0x1113 && $c >= 0x1161 && $c < 0x1176) {
310: return num2utf8((($s-0x1100)*21+$c-0x1161) * 28 + 0xAC00);
311: }
312: elsif ($s >= 0xAC00 && $s < 0xD7A4 && !(($s-0xAC00)%28) && $c >= 0x11A8 && $c < 0x11C3) {
313: return num2utf8($s + $c - 0x11A7);
314: }
315: else {
316: return $CompCano{join "", ($starterCh, $ch)};
317: }
318: } # end CompCano
319:
320: sub printoctal {
321: my $s = shift;
322: $s =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/ge;
323: print STDERR $s;
324: } # end printoctal
325:
326: #### output data considering all relevant options
327: sub printOPT {
328: my $t = shift;
329: if ($OPTN) {
330: # hexadecimal numeric character references
331: $t =~ s/([\300-\377][\200-\277]+)/utf82ncr($1)/eg;
332: }
333: elsif ($OPTo) {
334: $t =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/eg;
335: }
336: print $t;
337: }
338:
339:
340: #
1.15 duerst 341: # read in base data file
1.12 duerst 342: #
343:
1.15 duerst 344: sub ReadCharacterDataFile {
345: my ($dataFile) = @_;
346: my $line = 0;
1.12 duerst 347: open (BASE, $dataFile)
348: or die "Cannot open character data file $dataFile.\n";
349: BASE:
350: while (<BASE>) {
1.15 duerst 351: print STDERR "Reading data file, line $line\n"
1.18 duerst 352: if !(($line % 1000) && !$OPTq); $line++;
1.12 duerst 353: chop;
1.21 duerst 354: my ($hex, $name, $category, $combClass, $t4, $dec) = split(/;/);
1.12 duerst 355: #### Check ranges for consistency with handcoded pieces, then skip
356: if ($name =~ /^<(.*), (.*)/) {
357: if ( $_ !~ /^4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;$/
358: and $_ !~ /^9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;$/
359: and $_ !~ /^AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;$/
360: and $_ !~ /^D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;$/
361: and $_ !~ /^D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
362: and $_ !~ /^DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
363: and $_ !~ /^DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
364: and $_ !~ /^DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
365: and $_ !~ /^DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
366: and $_ !~ /^DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
367: and $_ !~ /^E000;<Private Use, First>;Co;0;L;;;;;N;;;;;$/
368: and $_ !~ /^F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;$/
369: and $_ !~ /^3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;$/
370: and $_ !~ /^4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;$/
371: and $_ !~ /^F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;$/
372: and $_ !~ /^FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;$/
373: and $_ !~ /^100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;$/
374: and $_ !~ /^10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;$/
1.19 duerst 375: and $_ !~ /^20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;$/
376: and $_ !~ /^2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;$/
1.12 duerst 377: ) {
378: die "Problem with data file consistency, line $line: \n\t$_.\n";
379: }
380: }
381: else { # normal line processing
1.21 duerst 382: my $u = &hex2utf8($hex);
1.12 duerst 383: $exists{$u} = 1; # to check characters that exist
384:
385: #### Decompositions
386: if ($dec eq "") { } # no decomposition
387: elsif ($dec =~ /^<(.*)>(.*)/) { # compatibility
1.21 duerst 388: my $decKind = $1;
1.12 duerst 389: $dec = $2;
390: $DecoKompKind{$u} = $decKind;
391: $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
392: $DecoKompData{$u} = $dec;
393: }
394: else { # canonical decomposition
1.13 duerst 395: $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
396: $DecoCanoData{$u} = $dec;
397: $DecoKompData{$u} = $dec; # add to Komp, to expand everything
1.12 duerst 398: }
399:
400: #### Canonical Combining Class
401: $CombClass{$u} = $combClass if ($combClass);
402: }
403: }
404: close (BASE);
1.15 duerst 405: print STDERR "Finished reading character database.\n" if (!$OPTq);
1.12 duerst 406:
1.22 duerst 407: if ($OPTF951) {
408: $DecoCanoData{"\xEF\xA5\x91"} =
409: $DecoKompData{"\xEF\xA5\x91"} = "\xE9\x9B\xBB";
410: }
1.12 duerst 411: %DecoCanoRest = %DecoCano = %DecoCanoData; # keep original data as is, and
412: # copy to restrict for composition
413:
414: # list of compatibility kinds for later work
415: # the idea is to group them (e.g. sub and super) and allow
416: # normalization by group
417: #Kompatibility Kind: circle
418: #Kompatibility Kind: compat
419: #Kompatibility Kind: final
420: #Kompatibility Kind: font
421: #Kompatibility Kind: fraction
422: #Kompatibility Kind: initial
423: #Kompatibility Kind: isolated
424: #Kompatibility Kind: medial
425: #Kompatibility Kind: narrow
426: #Kompatibility Kind: noBreak
427: #Kompatibility Kind: small
428: #Kompatibility Kind: square
429: #Kompatibility Kind: sub
430: #Kompatibility Kind: super
431: #Kompatibility Kind: vertical
432: #Kompatibility Kind: wide
433:
1.13 duerst 434: # fully expand canonical decompositions
1.21 duerst 435: my $fixpoint = 0; # set to false
1.12 duerst 436: while (!$fixpoint) {
437: $fixpoint = 1; # set to true
438: print "Fixpoint\n" if ($OPTd);
1.21 duerst 439: foreach my $key (sort keys %DecoCano) {
1.12 duerst 440: my @s = splitutf8($DecoCano{$key});
441: my $i = 0;
1.21 duerst 442: foreach my $c (@s) {
1.12 duerst 443: my $d;
444: if ($d = $DecoCano{$c}) {
445: print "replacing ", utf82ncr($c), " with ",
446: utf82ncr($d), " in ", utf82ncr($key), "\n"
447: if $OPTd;
448: if ($i > 0) {
449: print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n";
450: die "Giving up!\n";
451: }
452: $c = $d;
453: $fixpoint = 0; # changed something; need one more pass
454: }
455: $i++;
456: }
457: $DecoCano{$key} = join "", @s;
458: }
459: }
460:
1.13 duerst 461: # fully expand kompatibility decompositions
462: $fixpoint = 0; # set to false
463: while (!$fixpoint) {
464: $fixpoint = 1; # set to true
465: print "Fixpoint\n" if ($OPTd);
1.21 duerst 466: foreach my $key (sort keys %DecoKompData) {
1.13 duerst 467: my @s = splitutf8($DecoKompData{$key});
468: my $i = 0;
1.21 duerst 469: foreach my $c (@s) {
1.13 duerst 470: my $d;
471: if ($d = $DecoKompData{$c}) {
472: print "replacing ", utf82ncr($c), " with ",
473: utf82ncr($d), " in ", utf82ncr($key), "\n"
474: if $OPTd;
475: if ($i > 0 && splitutf8($d) > 1) {
476: # print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n";
477: # die "Giving up!\n";
478: }
479: $c = $d;
480: $fixpoint = 0; # changed something; need one more pass
481: }
482: $i++;
483: }
484: $DecoKompData{$key} = join "", @s;
485: }
486: }
487:
488: # reorder combining marks for canonical decomposition
1.21 duerst 489: foreach my $key (sort keys %DecoCano) { # sort to sort the output
1.12 duerst 490: my $s = $DecoCano{$key};
491: my $t = join "", sortCano(splitutf8($s));
492: if ($s ne $t) {
493: print STDERR "Error: Had to reorder ", utf82ncr($key), " from ",
494: utf82ncr($s), " to ", utf82ncr($t), "\n";
495: die "Giving up!\n";
496: }
497: $DecoCano{$key} = $t;
498: }
499:
1.13 duerst 500: # reorder combining marks for kompatibility decomposition
1.21 duerst 501: foreach my $key (sort keys %DecoKompData) { # sort to sort the output
1.13 duerst 502: my $s = $DecoKompData{$key};
503: my $t = join "", sortCano(splitutf8($s));
504: if ($s ne $t) {
505: print STDERR "Error: Had to reorder ", utf82ncr($key), " from ",
506: utf82ncr($s), " to ", utf82ncr($t), "\n";
507: die "Giving up!\n";
508: }
509: $DecoKompData{$key} = $t;
510: }
511:
1.12 duerst 512: # detect singular compositions
1.21 duerst 513: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 514: if (1 == scalar(splitutf8($DecoCanoRest{$key}))) {
515: print 'Singular composition: ', utf82ncr($key), ' from ',
516: utf82ncr($DecoCanoRest{$key}), ", removed.\n"
517: if ($OPTd);
518: delete $DecoCanoRest{$key};
519: }
520: }
521:
522: # detect 'non-zero' compositions
1.21 duerst 523: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 524: my @a = splitutf8($DecoCanoRest{$key});
525: if ($CombClass{shift @a}) {
526: print 'Non-zero composition: ', utf82ncr($key), ' from ',
527: utf82ncr($DecoCanoRest{$key}), ", removed.\n"
528: if ($OPTd);
529: delete $DecoCanoRest{$key};
530: }
531: }
532:
533: # detect 'all-zero' compositions
1.21 duerst 534: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 535: my $allzero = 1;
536: my @a = splitutf8($DecoCanoRest{$key});
1.21 duerst 537: foreach my $c (@a) {
1.12 duerst 538: $allzero = 0 if ($CombClass{$c});
539: }
540: if ($allzero) {
541: print 'All-zero composition: ', utf82ncr($key), ' from ',
542: utf82ncr($DecoCanoRest{$key}), ".\n"
543: if ($OPTd);
544: }
545: }
546:
547:
1.21 duerst 548: my @NoRecomp = ( # Script-specific and post composition, table-based
549: # according to http://www.unicode.org/Public/3.1-Update/CompositionExclusions-3.txt
1.12 duerst 550: '0958', # DEVANAGARI LETTER QA
551: '0959', # DEVANAGARI LETTER KHHA
552: '095A', # DEVANAGARI LETTER GHHA
553: '095B', # DEVANAGARI LETTER ZA
554: '095C', # DEVANAGARI LETTER DDDHA
555: '095D', # DEVANAGARI LETTER RHA
556: '095E', # DEVANAGARI LETTER FA
557: '095F', # DEVANAGARI LETTER YYA
558: '09DC', # BENGALI LETTER RRA
559: '09DD', # BENGALI LETTER RHA
560: '09DF', # BENGALI LETTER YYA
561: '0A33', # GURMUKHI LETTER LLA
562: '0A36', # GURMUKHI LETTER SHA
563: '0A59', # GURMUKHI LETTER KHHA
564: '0A5A', # GURMUKHI LETTER GHHA
565: '0A5B', # GURMUKHI LETTER ZA
566: '0A5E', # GURMUKHI LETTER FA
567: '0B5C', # ORIYA LETTER RRA
568: '0B5D', # ORIYA LETTER RHA
569: '0F43', # TIBETAN LETTER GHA
570: '0F4D', # TIBETAN LETTER DDHA
571: '0F52', # TIBETAN LETTER DHA
572: '0F57', # TIBETAN LETTER BHA
573: '0F5C', # TIBETAN LETTER DZHA
574: '0F69', # TIBETAN LETTER KSSA
575: '0F76', # TIBETAN VOWEL SIGN VOCALIC R
576: '0F78', # TIBETAN VOWEL SIGN VOCALIC L
577: '0F93', # TIBETAN SUBJOINED LETTER GHA
578: '0F9D', # TIBETAN SUBJOINED LETTER DDHA
579: '0FA2', # TIBETAN SUBJOINED LETTER DHA
580: '0FA7', # TIBETAN SUBJOINED LETTER BHA
581: '0FAC', # TIBETAN SUBJOINED LETTER DZHA
582: '0FB9', # TIBETAN SUBJOINED LETTER KSSA
1.21 duerst 583: # 'FB1D' # HEBREW LETTER YOD WITH HIRIQ: see below for $OPTYWH
1.12 duerst 584: 'FB1F', # HEBREW LIGATURE YIDDISH YOD YOD PATAH
585: 'FB2A', # HEBREW LETTER SHIN WITH SHIN DOT
586: 'FB2B', # HEBREW LETTER SHIN WITH SIN DOT
587: 'FB2C', # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
588: 'FB2D', # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
589: 'FB2E', # HEBREW LETTER ALEF WITH PATAH
590: 'FB2F', # HEBREW LETTER ALEF WITH QAMATS
591: 'FB30', # HEBREW LETTER ALEF WITH MAPIQ
592: 'FB31', # HEBREW LETTER BET WITH DAGESH
593: 'FB32', # HEBREW LETTER GIMEL WITH DAGESH
594: 'FB33', # HEBREW LETTER DALET WITH DAGESH
595: 'FB34', # HEBREW LETTER HE WITH MAPIQ
596: 'FB35', # HEBREW LETTER VAV WITH DAGESH
597: 'FB36', # HEBREW LETTER ZAYIN WITH DAGESH
598: 'FB38', # HEBREW LETTER TET WITH DAGESH
599: 'FB39', # HEBREW LETTER YOD WITH DAGESH
600: 'FB3A', # HEBREW LETTER FINAL KAF WITH DAGESH
601: 'FB3B', # HEBREW LETTER KAF WITH DAGESH
602: 'FB3C', # HEBREW LETTER LAMED WITH DAGESH
603: 'FB3E', # HEBREW LETTER MEM WITH DAGESH
604: 'FB40', # HEBREW LETTER NUN WITH DAGESH
605: 'FB41', # HEBREW LETTER SAMEKH WITH DAGESH
606: 'FB43', # HEBREW LETTER FINAL PE WITH DAGESH
607: 'FB44', # HEBREW LETTER PE WITH DAGESH
608: 'FB46', # HEBREW LETTER TSADI WITH DAGESH
609: 'FB47', # HEBREW LETTER QOF WITH DAGESH
610: 'FB48', # HEBREW LETTER RESH WITH DAGESH
611: 'FB49', # HEBREW LETTER SHIN WITH DAGESH
612: 'FB4A', # HEBREW LETTER TAV WITH DAGESH
613: 'FB4B', # HEBREW LETTER VAV WITH HOLAM
614: 'FB4C', # HEBREW LETTER BET WITH RAFE
615: 'FB4D', # HEBREW LETTER KAF WITH RAFE
1.21 duerst 616: 'FB4E', # HEBREW LETTER PE WITH RAFE
617: ## post composition exclusion
1.22 duerst 618: '2ADC', # FORKING
1.21 duerst 619: '1D15E', # MUSICAL SYMBOL HALF NOTE
1.22 duerst 620: '1D15F', # MUSICAL SYMBOL QUARTER NOTE
621: '1D160', # MUSICAL SYMBOL EIGHTH NOTE
622: '1D161', # MUSICAL SYMBOL SIXTEENTH NOTE
623: '1D162', # MUSICAL SYMBOL THIRTY-SECOND NOTE
624: '1D163', # MUSICAL SYMBOL SIXTY-FOURTH NOTE
625: '1D164', # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
626: '1D1BB', # MUSICAL SYMBOL MINIMA
627: '1D1BC', # MUSICAL SYMBOL MINIMA BLACK
628: '1D1BD', # MUSICAL SYMBOL SEMIMINIMA WHITE
629: '1D1BE', # MUSICAL SYMBOL SEMIMINIMA BLACK
630: '1D1BF', # MUSICAL SYMBOL FUSA WHITE
631: '1D1C0' # MUSICAL SYMBOL FUSA BLACK
1.12 duerst 632: );
1.20 duerst 633:
634: if (!$OPTYWH) {
635: push @NoRecomp, 'FB1D'; # HEBREW LETTER YOD WITH HIRIQ
636: } # see http://www.unicode.org/unicode/uni2errata/Normalization_Corrigendum.html
1.12 duerst 637:
638: # remove script-specific
1.21 duerst 639: foreach my $hex (@NoRecomp) {
1.12 duerst 640: my $key = &hex2utf8($hex);
641: print 'Non-recomposing composition: ', utf82ncr($key), ' from ',
642: utf82ncr($DecoCanoRest{$key}), ", removed.\n"
643: if ($OPTd);
644: delete $DecoCanoRest{$key};
645: }
646:
647: # replace with fully expanded decompositions
1.21 duerst 648: foreach my $key (keys %DecoCanoRest) {
1.12 duerst 649: $DecoCanoRest{$key} = $DecoCano{$key}
650: if ($DecoCanoRest{$key} ne $DecoCano{$key});
651: }
652:
653: # detect duplicate compositions ## should not find any
654: if ($OPTd) {
655: print "Checking duplicates, takes some time.\n";
1.21 duerst 656: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 657: my $s = $DecoCanoRest{$key};
1.21 duerst 658: foreach my $key2 (keys %DecoCanoRest) {
1.12 duerst 659: if (($key lt $key2) # don't compare with itself, don't warn twice
660: && ($DecoCanoRest{$key2} eq $s)) { # duplicate compositions
661: print STDERR 'Duplicate composition: ', utf82ncr($key),
662: ' and ', utf82ncr($key2),
663: ' to ', utf82ncr($s), "\n";
664: die "Giving up!\n";
665: }
666: else { next; } # shortcut loop
667: }
668: }
669: }
670:
671: # invert for composition
1.21 duerst 672: foreach my $key (keys %DecoCanoRest) { # use reduced decomps for selection
1.12 duerst 673: if (defined $CompCano{$DecoCanoData{$key}}) # use original data
674: { # (strictly binary)
675: die "Duplicate compositions, giving up.\n";
676: }
677: $CompCano{$DecoCanoData{$key}} = $key;
678: }
679:
680: if ($OPTd) {
1.21 duerst 681: foreach my $key (sort keys %DecoCano) { # sort to sort output
1.13 duerst 682: print 'Final canonical decomposition: ', utf82ncr($key),
1.12 duerst 683: ' to ', utf82ncr($DecoCano{$key}), "\n";
684: }
1.21 duerst 685: foreach my $key (sort keys %CompCano) { # sort to sort output
1.12 duerst 686: print 'Final composition: ', utf82ncr($key),
687: ' to ', utf82ncr($CompCano{$key}), "\n";
688: }
1.21 duerst 689: foreach my $key (sort keys %DecoKompData) { # sort to sort output
1.13 duerst 690: print 'Final kompatibility decomposition: ', utf82ncr($key),
691: ' to ', utf82ncr($DecoKompData{$key}), "\n";
692: }
1.12 duerst 693: }
694:
1.15 duerst 695: print STDERR "Finished processing character data file(s).\n" if (!$OPTq);
696:
697: } # end ReadCharacterDataFile
698:
699: #
700: # store data to file for fast reread
701: #
702:
703: sub StoreData {
704: my ($dataFile) = @_;
705: my %all_data = ();
706:
707: $all_data{exists} = \%exists;
708: $all_data{DecoCano} = \%DecoCano;
709: $all_data{CompCano} = \%CompCano;
710: $all_data{DecoKompData} = \%DecoKompData;
711: $all_data{CombClass} = \%CombClass;
712:
713: require Storable; # in line, to not require module if not needed
714: &Storable::nstore (\%all_data, $dataFile);
715: }
716:
717: #
718: # read data from file
719: #
720:
721: sub ReadStoredData {
722: my ($dataFile) = @_;
723: require Storable; # in line, to not require module if not needed
724: my %all_data = %{&Storable::retrieve ($dataFile)};
725:
726: %exists = %{$all_data{exists}};
727: %DecoCano = %{$all_data{DecoCano}};
728: %CompCano = %{$all_data{CompCano}};
729: %DecoKompData = %{$all_data{DecoKompData}};
730: %CombClass = %{$all_data{CombClass}};
731: }
732:
733: #
734: # Print instructions (-h)
735: #
736:
737: sub PrintInstructions {
738: print STDERR <<EOF;
739:
740: charlint (code name Charly)
741: Character Check and Normalization
742: According to W3C and Unicode Specifications
743: ===========================================
744:
745: $version
746:
747: (c) Keio University 1999, see perl source or
748: http://www.w3.org/International/charlint for details
749:
750: Available options:
751:
752: (options prefixed by # are currently not available)
753: -b: Remove initial 'Byte Order Mark'
754: -B: Supress warning about initial 'Byte Order Mark'
1.22 duerst 755: -c: Detect non-normalized data (but do not normalize)
1.15 duerst 756: -C: Do not normalize
757: -d: Debug: Thoroughly check character data table input
758: -D: Leave after reading in character data
759: -e: # remove undefined codepoints
760: -E: Do not warn about undefined codepoints
1.25 ! duerst 761: -f file: Read data from file (default is UnicodeData.txt;
! 762: please use newest V3.2.0 datafiles)
1.22 duerst 763: -F951: Use old (wrong) mapping for U+F951 (use this option
764: if you really need 3.1.0 behaviour)
1.15 duerst 765: -h: Prints out this short description
766: -k: # Warn about compatibility codepoints
767: -K: Normalize out (i.e. decompose) compatibility codepoints
768: -n: Accept &#ddddd; and &#xhhhh; on input
769: (beware of <![CDATA[, <SCRIPT>, <STYLE>)
1.23 duerst 770: -nX: same as -n, plus &#Xhhhh; (use for HTML only!)
1.15 duerst 771: -N: Produce &#xhhhh; on output
1.21 duerst 772: -o: Print out 'unprintable' bytes as \\octal
1.15 duerst 773: -p: # Remove stuff in private zone
774: -P: Supress checking private zone
775: -q: Quiet, don't output progress messages
776: -s file: Read data from file produced with -S
777: -S file: Write data to file for fast reload with -s
778: -u: # Fix UTF-8 (convert or remove)
779: -U: Supress checking correctness of UTF-8
780: -v: Print version
781: -x: Do decomposition only
782: -X: Don't do decomposition (assume input is decomposed)
1.20 duerst 783: -YWH: Treat YOD WITH HIRIQ as precomposed (use this option
1.22 duerst 784: if you really need 3.0.0 behaviour)
1.15 duerst 785:
786: EOF
787: # end of raw in-place text
788:
789: # ideas for more options:
790: # * allow to do kompatibility processing by category
791: # * warn/remove plane 14 language tag codes and other crap
792: # * convert crap to what it's supposed to be (difficult)
793: # * directionality control codes
794:
795: } # end &PrintInstructions
796:
797:
798: #
1.21 duerst 799: # Print instructions (-h)
1.15 duerst 800: #
801:
1.21 duerst 802: sub initialize {
803: my ($readStoreFile, $writeStoreFile, $dataFile);
804:
805: # Read options
1.15 duerst 806:
1.21 duerst 807: OPTIONS:
808: while (@ARGV and $ARGV[0] =~ /^-/) {
809: $_ = shift(@ARGV);
810: $OPTb= 1, next OPTIONS if /^-b$/;
811: $OPTB= 1, next OPTIONS if /^-B$/;
1.22 duerst 812: $OPTc= 1, next OPTIONS if /^-c$/;
1.21 duerst 813: $OPTC= 1, next OPTIONS if /^-C$/;
814: $OPTd= 1, next OPTIONS if /^-d$/;
815: $OPTD= 1, next OPTIONS if /^-D$/;
816: $OPTE= 1, next OPTIONS if /^-E$/;
817: if (/^-f$/) {
818: $OPTf = 1;
819: $dataFile = shift(@ARGV);
820: print STDERR "Using character data file $dataFile.",
821: " Maybe not what you intend.\n" if ($dataFile =~ /^-.$/ && !$OPTq);
822: next OPTIONS;
823: }
1.22 duerst 824: $OPTF951= 1, next OPTIONS if /^-F951$/;
1.21 duerst 825: $OPTh= 1, next OPTIONS if /^-h$/;
826: $OPTK= 1, next OPTIONS if /^-K$/;
827: $OPTn= 1, next OPTIONS if /^-n$/;
1.23 duerst 828: $OPTnX=1, next OPTIONS if /^-nX$/;
1.21 duerst 829: $OPTN= 1, next OPTIONS if /^-N$/;
830: $OPTo= 1, next OPTIONS if /^-o$/;
831: $OPTP= 1, next OPTIONS if /^-P$/;
832: $OPTq= 1, next OPTIONS if /^-q$/;
833: if (/^-s$/) {
834: $OPTs = 1;
835: $readStoreFile = shift(@ARGV);
1.24 duerst 836: print STDERR "Reading from store file $readStoreFile.",
837: " Maybe not what you intend.\n"
838: if ($readStoreFile =~ /^-.$/ && !$OPTq);
1.21 duerst 839: next OPTIONS;
840: }
841: if (/^-S$/) {
842: $OPTS = 1;
843: $writeStoreFile = shift(@ARGV);
1.24 duerst 844: print STDERR "Writing to store file $writeStoreFile.",
845: " Maybe not what you intend.\n"
846: if ($writeStoreFile =~ /^-.$/ && !$OPTq);
1.21 duerst 847: next OPTIONS;
848: }
849: $OPTU= 1, next OPTIONS if /^-U$/;
850: $OPTv= 1, next OPTIONS if /^-v$/;
851: $OPTx= 1, next OPTIONS if /^-x$/;
852: $OPTX= 1, next OPTIONS if /^-X$/;
853: $OPTYWH= 1, next OPTIONS if /^-YWH$/;
854: print STDERR "Unrecognized argument: \"", $_, "\"; ignored.\n";
855: }
856:
857: &PrintInstructions() if $OPTh;
858:
859: if ($OPTv) {
860: print $version, ",\nfor more information, use 'charlint -h'.\n";
861: }
862:
863: # Read/write data files
864:
1.25 ! duerst 865: if ($OPTs && $OPTS) {
! 866: die "Cannot read and store at the same time (-s/-S).\n";
! 867: }
! 868: if ($OPTf) {
! 869: if ($OPTs) {
! 870: die "Conflicting character data sources (-f/-s).\n";
! 871: }
! 872: else {
! 873: ReadCharacterDataFile($dataFile);
! 874: }
! 875: }
! 876: else {
! 877: if ($OPTs) {
! 878: ReadStoredData ($readStoreFile);
! 879: }
! 880: else {
! 881: # default amounts to -f UnicodeData.txt
! 882: ReadCharacterDataFile("UnicodeData.txt");
! 883: }
! 884: }
! 885:
! 886: if ($OPTS) {
1.21 duerst 887: StoreData ($writeStoreFile);
888: }
1.15 duerst 889:
1.21 duerst 890: exit 0 if ($OPTD);
1.15 duerst 891: }
892:
1.21 duerst 893:
1.15 duerst 894: #
1.21 duerst 895: # Start of main program
1.15 duerst 896: #
897:
1.21 duerst 898: #
899: # initialize: read options, read/write character data files
900: #
1.12 duerst 901:
1.21 duerst 902: &initialize;
1.12 duerst 903:
904: #
905: # PROCESS ACTUAL FILE(S)
906: #
907:
1.21 duerst 908: my $line = 0;
1.12 duerst 909:
910: LINE:
911: while (<>) {
912: $line++;
913:
914: # Convert NCRs on input
1.23 duerst 915: if ($OPTn || $OPTnX) {
1.12 duerst 916: # decimal numeric character references
917: s/&#([0-9]+)\;/Xnum2utf8($1)/eg;
918: # hexadecimal numeric character references
919: s/&#x([0-9a-fA-F]+)\;/Xhex2utf8($1)/eg;
1.23 duerst 920: if ($OPTnX) {
921: # hexadecimal numeric character references (HTML only)
922: s/&#X([0-9a-fA-F]+)\;/Xhex2utf8($1)/eg;
923: }
1.12 duerst 924: }
925:
926: # Check BOM
927: if ($line == 1) {
928: if (!$OPTb && /^\357\273\277/) {
929: print STDERR "Initial BOM.\n";
930: }
931: if ($OPTB) {
932: s/^\357\273\277// # remove initial BOM
933: }
934: }
935:
936: # Check UTF-8
1.21 duerst 937: my $r;
1.12 duerst 938: if (!$OPTU && ($r = CheckUTF8 ($_))) {
1.21 duerst 939: print STDERR "Line $line: Non-UTF-8 ($r).\n";
1.12 duerst 940: die "Giving up!\n";
941: }
942:
943: # Check nonexisting characters
944: if (!$OPTE && CheckExists ($_)) {
945: die "Line $line: Non-Existing codepoints.\nGiving up!\n";
946: }
947:
948: # Check private characters
949: if (!$OPTP && ($r = CheckPrivate ($_))) {
1.21 duerst 950: die "Line $line: Private charaters ($r).\nGiving up!\n";
1.12 duerst 951: }
952:
1.21 duerst 953: my @line = splitutf8($_);
1.22 duerst 954: my @lineoriginal = @line;
1.21 duerst 955: my @line2 = ();
1.12 duerst 956:
957: if (!$OPTC) {
1.13 duerst 958: if ($OPTX) {
959: @line2 = @line;
960: }
961: else { # decompose
1.21 duerst 962: while (defined(my $s = shift @line)) {
1.13 duerst 963: if ($OPTK) {
964: push @line2, splitutf8(DecoKomp($s));
965: }
966: else {
967: push @line2, splitutf8(DecoCano($s));
968: }
1.12 duerst 969: }
970: }
971:
972: # canonical reordering
973: @line = sortCano(@line2);
974:
975: # recompose
1.21 duerst 976: if (!$OPTx && length (@line)) {
977: my $lastClass = -1; # this eliminates a special check
978: my $starterPos = 0;
979: my $sourceLength = @line;
980: my $targetPos = 1;
981: my $starterCh = $line[0];
982: for (my $sourcePos = 1; $sourcePos < $sourceLength; $sourcePos++) {
983: my $ch = $line[$sourcePos];
984: my $chClass = $CombClass{$ch};
1.12 duerst 985: $chClass = 0 if (!defined($chClass));
1.21 duerst 986: my $composite = CompCano($starterCh, $ch);
1.12 duerst 987: if (defined($composite) && $lastClass < $chClass) {
988: $line[$starterPos] = $composite;
989: $starterCh = $composite;
990: }
991: elsif ($chClass == 0) {
992: $starterPos = $targetPos;
993: $starterCh = $ch;
994: $lastClass = -1;
995: $line[$targetPos++] = $ch;
996: }
997: else {
998: $lastClass = $chClass;
999: $line[$targetPos++] = $ch;
1000: }
1001: }
1002: $#line = $targetPos-1;
1003: } # end of recomposition
1.22 duerst 1004: if ($OPTc && join("",@line) ne join("",@lineoriginal)) {
1005: die "Line $line: Non-normalized data.\nGiving up!\n";
1006: }
1007: } #if (!$OPTC)
1.12 duerst 1008:
1009: printOPT (join "", @line);
1010:
1011: } # end while <>
Webmaster