Annotation of charlint/charlint.pl, revision 1.28
1.12 duerst 1: #!/usr/bin/perl
2:
3: # Copyright notice:
1.28 ! rishida 4: # (c) Copyright Keio University 1999-
1.12 duerst 5: # This software is made available under the terms of the
6: # W3C Software Licence available at
7: # http://www.w3.org/Consortium/Legal/copyright-software.
8:
9:
1.13 duerst 10: # Acknowledgements: Mark Davis for various discussions about Unicode TR #15
1.12 duerst 11: # and for the test suite
1.16 duerst 12: # Paul Hoffman for suggestions, encouragement, and bug reports
1.15 duerst 13: # Roland Mas for how to use Storable
1.12 duerst 14: # James Briggs and Masayasu Ishikawa for error reports
1.13 duerst 15: # Tim Bray for help with CDATA
1.21 duerst 16: # Kai Henningsen for finally getting me to clean up
17: # for 'use strict' and 'perl -w'
1.24 duerst 18: # Roozbeh Pournader for proposing to deal with HTML &#Xhhhh;
1.28 ! rishida 19: # Tim Kempton for nudging me to update to Unicode 5.2.0
1.12 duerst 20:
1.22 duerst 21: # Author:
1.28 ! rishida 22: # MJD Martin J. Du"rst, duerst@w3.org (1999-2005)
! 23: # duerst@it.aoyama.ac.jp (2005-2009)
1.12 duerst 24:
1.28 ! rishida 25: my $version = 'Version 0.55';
1.12 duerst 26:
27: # History:
1.28 ! rishida 28: # 2009/11/28: 0.55, updated to Unicode Version 5.2.0 MJD
1.27 duerst 29: # 2002/06/24: 0.54, improving -nf16check (compiler warnings, speed) MJD
1.26 duerst 30: # 2002/06/08: 0.53, added -nf16check data file production MJD
1.25 duerst 31: # 2002/08/23: 0.52, changed default file to UnicodeData.txt MJD
1.24 duerst 32: # 2002/05/21: 0.51, added option -nX (use for HTML only!) MJD
1.22 duerst 33: # 2002/04/03: 0.50, updated for 3.2.0; added -F951; added -c MJD
1.21 duerst 34: # 2001/10/03: 0.49, code cleanup for use strict and -w MJD
35: # 2001/04/01: 0.48, updated for 3.1.0 (final) MJD
1.20 duerst 36: # 2001/03/07: 0.47, YOD WITH HIRIQ corrigendum MJD
1.19 duerst 37: # 2000/12/19: 0.46, updated for 3.1.0 (beta) MJD
1.16 duerst 38: # 2000/11/12: 0.45, bug fix for CJK extension A MJD
1.15 duerst 39: # 2000/11/09: 0.44, implemented -s/-S (Storable data) MJD
40: # 2000/11/05: 0.43, implemented -K (kompatibility decomposition) MJD
41: # 2000/11/05: 0.42, updated for 3.0.1, fixed line ends MJD
42: # 2000/11/05: 0.41, added 2000 to copyright, tested CVS commit MJD
1.12 duerst 43: # 2000/08/03: 0.40, added Hangul support and did quite some testing MJD
44: # 2000/08/02: 0.37, added -x and -X for decomposition MJD
45: # 2000/07/27: 0.36, fixed a bug for non-starter decompositions MJD
46: # 2000/07/24: 0.35, adapted exclusions to 3.0.0 final (+Tibetan) MJD
1.26 duerst 47: # 2000/07/24: 0.34, bug fix for $CombClass{ch}; implemented -C MJD
1.12 duerst 48: # 1999/08/16: 0.33, updated for second version of 3.0.0.beta MJD
49: # 1999/07/01: 0.32, adapted surrogates/exclusions to 3.0.0.beta MJD
50: # 1999/06/25: 0.31, fixed reordering bug, going public MJD
51: # 1999/06/23: 0.30, preparation for W3C member test, without Hangul MJD
52:
1.28 ! rishida 53: # CVS last revised $Date: 2003/06/25 20:52:46 $ by $Author: duerst $
1.12 duerst 54:
1.16 duerst 55: #package CHARLINT; ## tried these, but need more time to get it right
1.12 duerst 56: #use diagnostics;
1.21 duerst 57: use strict;
1.14 duerst 58: use Storable;
1.12 duerst 59:
1.21 duerst 60: # Global variables (options and data arrays)
61: use vars qw($OPTB $OPTC $OPTD $OPTE $OPTK
62: $OPTN $OPTP $OPTS $OPTU $OPTX $OPTYWH
1.22 duerst 63: $OPTb $OPTc $OPTd $OPTf $OPTF951 $OPTh
1.26 duerst 64: $OPTn $OPTnX $OPTnf16 $OPTo $OPTq $OPTs $OPTv $OPTx
1.21 duerst 65: %CombClass %CompCano %DecoCano %DecoCanoData
1.26 duerst 66: %DecoCanoRest %DecoKompData %DecoKompKind %exists
67: %NFCforbid);
1.21 duerst 68:
69:
1.12 duerst 70: #
71: # SUBROUTINES
72: #
73:
74: sub addtext {
1.21 duerst 75: my ($r, $t) = @_;
76: $$r = $$r ? ($$r."; ".$t) : $t;
1.12 duerst 77: }
78:
79: # Check problems in UTF-8
80:
81: sub CheckUTF8 {
1.21 duerst 82: my ($s) = @_;
83: my ($return) = "";
84: my ($st);
1.12 duerst 85: if ($s =~ /[\355][\240-\257][\200-\277][\355][\260-\277][\200-\277]/) {
1.21 duerst 86: &addtext (\$return, "surrogate pair") if !$OPTU;
1.12 duerst 87: }
88: if ($s =~ /[\355][\240-\277][\200-\277]/) {
1.21 duerst 89: &addtext (\$return, "single surrogate") if !$OPTU;
1.12 duerst 90: }
91: if ($s =~ /[\300-\301][\200-\277]/) {
1.21 duerst 92: &addtext (\$return, "ASCII in 2 bytes") if !$OPTU;
1.12 duerst 93: }
94: if ($s =~ /[\340][\200-\237][\200-\277]/) {
1.21 duerst 95: &addtext (\$return, "3 bytes instead of 2 or less") if !$OPTU;
1.12 duerst 96: }
97: if ($s =~ /[\360][\200-\217][\200-\277]{2}/) {
1.21 duerst 98: &addtext (\$return, "4 bytes instead of 3 or less") if !$OPTU;
1.12 duerst 99: }
100: if ($s =~ /[\370][\200-\207][\200-\277]{3}/) {
1.21 duerst 101: &addtext (\$return, "5 bytes instead of 4 or less") if !$OPTU;
1.12 duerst 102: }
103: if ($s =~ /[\374][\200-\203][\200-\277]{4}/) {
1.21 duerst 104: &addtext (\$return, "6 bytes instead of 5 or less") if !$OPTU;
1.12 duerst 105: }
106: # non-synchronized cases
107: $s =~ s{ [\000-\177]
108: | [\300-\337][\200-\277]
109: | [\340-\357][\200-\277]{2}
110: | [\360-\367][\200-\277]{3}
111: | [\370-\373][\200-\277]{4}
112: | [\374-\375][\200-\277]{5}
113: }{}gx;
114: # forbidden bytes
115: if ($s =~ /[\376\377]/) {
1.21 duerst 116: &addtext (\$return, "0xFE or 0xFF byte") if !$OPTU;
1.12 duerst 117: }
118: if ($s ne "") {
1.21 duerst 119: &addtext (\$return, "synchronization problem") if !$OPTU;
1.12 duerst 120: }
121: $return;
122: } # end CheckUTF8
123:
124: sub exists {
125: my $s = shift;
126: return ($exists{$s}
1.28 ! rishida 127: or ("\344\270\200" le $s and $s le "\351\277\213") # CJK
1.19 duerst 128: or ("\352\260\200" le $s and $s le "\355\236\243") # Hangul
129: or ("\343\220\200" le $s and $s le "\344\266\265") # CJK Extension A
1.28 ! rishida 130: or ("\xF0\xA0\x80\x80" le $s and $s le "\xF0\xAA\x9B\x96") # CJK Extension B
! 131: or ("\xF0\xAA\x9C\x80" le $s and $s le "\xF0\xAB\x9C\xB4")) # CJK Extension C
1.12 duerst 132: }
133:
134: sub CheckExists {
135: my ($s) = @_;
136: my $news = join "",
137: grep (&exists($_), $s =~ m/([\000-\177]|[\300-\377][\200-\277]+)/go);
138: return ($s ne $news);
139: } # end CheckExists
140:
141: sub CheckPrivate {
142: my ($s) = @_;
143: if ($s =~ /[\356][\200-\277]{2}|[\357][\200-\237][\200-\277]/) {
144: return "BMP";
145: }
146: if ($s =~ /[\363][\260-\277][\200-\277]{2}/) {
147: return "plane 15";
148: }
149: if ($s =~ /[\364][\200-\217][\200-\277]{2}/) {
150: return "plane 16";
151: }
152: } # end CheckPrivate
153:
154:
155: #### convert hex to UTF-8
156:
157: sub hex2utf8 {
158: num2utf8 (hex($_[0]));
159: } # end hex2utf8
160:
161: sub Xhex2utf8 { #### avoid to covert <>&"
1.21 duerst 162: my ($t) = hex($_[0]);
1.12 duerst 163: if ($t < 0x40) { return "&#x".$_[0].";"; }
164: num2utf8 ($t);
165: } # end Xhex2utf8
166:
167: sub Xnum2utf8 { #### avoid to covert <>&"
1.21 duerst 168: my ($t) = @_;
1.12 duerst 169: if ($t < 64) { return "&#".$t.";"; }
170: num2utf8 ($t);
171: } # end Xnum2utf8
172:
173: #### convert number to UTF-8
174:
175: sub num2utf8 {
1.21 duerst 176: my ($t) = @_;
177: my ($trail, $firstbits, @result);
1.12 duerst 178:
179: if ($t<0x00000080) { $firstbits=0x00; $trail=0; }
180: elsif ($t<0x00000800) { $firstbits=0xC0; $trail=1; }
181: elsif ($t<0x00010000) { $firstbits=0xE0; $trail=2; }
182: elsif ($t<0x00200000) { $firstbits=0xF0; $trail=3; }
183: elsif ($t<0x04000000) { $firstbits=0xF8; $trail=4; }
184: elsif ($t<0x80000000) { $firstbits=0xFC; $trail=5; }
185: else {
186: die "Too large scalar value, cannot be converted to UTF-8.\n";
187: }
188: for (1 .. $trail) {
189: unshift (@result, ($t & 0x3F) | 0x80);
190: $t >>= 6; # slight danger of non-portability
191: }
192: unshift (@result, $t | $firstbits);
193: pack ("C*", @result);
194: } # end num2utf8
195:
196:
197: sub utf82ncr { # works for more than one character
198: my $r;
1.21 duerst 199: foreach my $c (splitutf8(shift)) {
1.12 duerst 200: $r .= ($c =~ /[\040-\177]/) ? $c : sprintf ("&#x%lX;",&utf82num($c));
201: }
202: $r;
203: } # end utf82ncr
204:
205:
206: #### convert UTF-8 to number
207:
208: sub utf82num {
1.21 duerst 209: my (@t, $t, $result);
210: my $trail;
1.12 duerst 211: @t = unpack ("C*", $_[0]);
212: $t = shift (@t);
213: if ($t<0x80) { $result= $t ; $trail=0; }
214: elsif ($t<0xC0) { die "Illegal leading byte in UTF-8.\n"; }
215: elsif ($t<0xE0) { $result= $t & 0x1F; $trail=1; }
216: elsif ($t<0xF0) { $result= $t & 0x0F; $trail=2; }
217: elsif ($t<0xF8) { $result= $t & 0x07; $trail=3; }
218: elsif ($t<0xFC) { $result= $t & 0x03; $trail=4; }
219: elsif ($t<0xFE) { $result= $t & 0x01; $trail=5; }
220: else { die "Illegal byte in UTF-8.\n"; }
221:
222: if ($trail != $#t + 1) { die "Not right number of trailing bytes.\n"; }
223: while ($trail--) {
224: # maybe check for 01xxxxxx
225: $result <<= 6;
226: $result += 0x3F & shift (@t);
227: }
228: return $result;
229: } # end utf82num
230:
231: #### variant of hex2utf8 to get rid of spaces
232: sub spacehex2utf8 {
233: my ($t) = @_;
234: return "" if ($t eq " ");
235: hex2utf8($t);
236: } # end spacehex2utf8
237:
238: #### split an utf-8 string into codepoints
239: sub splitutf8 {
240: split (/(?=[\000-\177\300-\377])/, shift);
241: } # end splitutf8
242:
243:
244: #### canonical sort of combining marks
245: # input: utf-8 string
246: # return: utf-8 string
247: sub sortCano {
248: my @a = @_;
249: my ($i, $ccHere, $ccPrev, $temp);
250:
251: return @a if (@a <= 1);
252: for ($i=1; $i < @a; $i++) {
253: $ccHere = $CombClass{$a[$i]};
254: $ccPrev = $CombClass{$a[$i-1]};
255: $ccHere = 0 if (!defined($ccHere));
256: $ccPrev = 0 if (!defined($ccPrev));
257: if ($ccHere != 0 && $ccPrev > $ccHere) {
258: $temp = $a[$i]; # exchange
259: $a[$i] = $a[$i-1];
260: $a[$i-1] = $temp;
261: $i -= 2 if ($i > 1); # backtrack and check again
262: }
263: }
264: return @a;
265: } # end sortCano
266:
267: #
1.13 duerst 268: # add algorithmic Hangul and unity transform to DecoCano lookup
1.12 duerst 269: #
270: sub DecoCano {
271: my ($s) = @_;
272: my $h = utf82num($s);
273: if ($h >= 0xAC00 && $h < 0xD7A4) {
274: my $hindex = $h - 0xAC00;
275: my $l = 0x1100 + $hindex/(21*28);
276: my $v = 0x1161 + ($hindex % (21*28)) / 28;
277: my $t = $hindex % 28;
278: if ($t) {
279: return join "", num2utf8($l), num2utf8($v), num2utf8(0x11A7 + $t);
280: }
281: else {
282: return join "", num2utf8($l), num2utf8($v);
283: }
284: }
285: else {
1.26 duerst 286: my $r = $DecoCano{$s};
1.13 duerst 287: return $r if defined $r;
1.26 duerst 288: return $s;
1.12 duerst 289: }
290: } # end DecoCano
291:
292: #
1.13 duerst 293: # add algorithmic Hangul and unity transform to DecoKomp lookup
294: #
295: sub DecoKomp {
296: my ($s) = @_;
297: my $h = utf82num($s);
298: if ($h >= 0xAC00 && $h < 0xD7A4) {
1.26 duerst 299: return DecoCano($s); #refer to DecoCano for Hangul decomposition
1.13 duerst 300: }
301: else {
1.26 duerst 302: my $r = $DecoKompData{$s};
1.13 duerst 303: return $r if defined $r;
1.26 duerst 304: return $s;
1.13 duerst 305: }
306: } # end DecoKomp
307:
308: #
1.12 duerst 309: # add algorithmic Hangul to CompCano lookup
310: #
311: sub CompCano {
312: my ($starterCh, $ch) = @_;
313: my $s = utf82num($starterCh);
314: my $c = utf82num($ch);
315: if ($s >= 0x1100 && $s < 0x1113 && $c >= 0x1161 && $c < 0x1176) {
316: return num2utf8((($s-0x1100)*21+$c-0x1161) * 28 + 0xAC00);
317: }
318: elsif ($s >= 0xAC00 && $s < 0xD7A4 && !(($s-0xAC00)%28) && $c >= 0x11A8 && $c < 0x11C3) {
319: return num2utf8($s + $c - 0x11A7);
320: }
321: else {
322: return $CompCano{join "", ($starterCh, $ch)};
323: }
324: } # end CompCano
325:
326: sub printoctal {
327: my $s = shift;
328: $s =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/ge;
329: print STDERR $s;
330: } # end printoctal
331:
332: #### output data considering all relevant options
333: sub printOPT {
334: my $t = shift;
335: if ($OPTN) {
336: # hexadecimal numeric character references
337: $t =~ s/([\300-\377][\200-\277]+)/utf82ncr($1)/eg;
338: }
339: elsif ($OPTo) {
340: $t =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/eg;
341: }
342: print $t;
343: }
344:
345:
346: #
1.15 duerst 347: # read in base data file
1.12 duerst 348: #
349:
1.15 duerst 350: sub ReadCharacterDataFile {
351: my ($dataFile) = @_;
352: my $line = 0;
1.12 duerst 353: open (BASE, $dataFile)
354: or die "Cannot open character data file $dataFile.\n";
355: BASE:
356: while (<BASE>) {
1.15 duerst 357: print STDERR "Reading data file, line $line\n"
1.28 ! rishida 358: if !(($line % 1000) || $OPTq); $line++;
1.12 duerst 359: chop;
1.21 duerst 360: my ($hex, $name, $category, $combClass, $t4, $dec) = split(/;/);
1.12 duerst 361: #### Check ranges for consistency with handcoded pieces, then skip
362: if ($name =~ /^<(.*), (.*)/) {
1.28 ! rishida 363: if (!( /^3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;$/
! 364: or /^4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;$/
! 365: or /^4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;$/
! 366: or /^9FCB;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;$/
1.26 duerst 367: or /^AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;$/
368: or /^D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;$/
369: or /^D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
370: or /^DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
371: or /^DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
372: or /^DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
373: or /^DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
374: or /^DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
375: or /^E000;<Private Use, First>;Co;0;L;;;;;N;;;;;$/
376: or /^F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;$/
1.28 ! rishida 377: or /^20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;$/
! 378: or /^2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;$/
! 379: or /^2A700;<CJK Ideograph Extension C, First>;Lo;0;L;;;;;N;;;;;$/
! 380: or /^2B734;<CJK Ideograph Extension C, Last>;Lo;0;L;;;;;N;;;;;$/
1.26 duerst 381: or /^F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;$/
382: or /^FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;$/
383: or /^100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;$/
384: or /^10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;$/
385: )) {
1.12 duerst 386: die "Problem with data file consistency, line $line: \n\t$_.\n";
387: }
388: }
389: else { # normal line processing
1.21 duerst 390: my $u = &hex2utf8($hex);
1.12 duerst 391: $exists{$u} = 1; # to check characters that exist
392:
393: #### Decompositions
394: if ($dec eq "") { } # no decomposition
395: elsif ($dec =~ /^<(.*)>(.*)/) { # compatibility
1.21 duerst 396: my $decKind = $1;
1.12 duerst 397: $dec = $2;
398: $DecoKompKind{$u} = $decKind;
399: $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
400: $DecoKompData{$u} = $dec;
401: }
402: else { # canonical decomposition
1.13 duerst 403: $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
404: $DecoCanoData{$u} = $dec;
405: $DecoKompData{$u} = $dec; # add to Komp, to expand everything
1.12 duerst 406: }
407:
408: #### Canonical Combining Class
409: $CombClass{$u} = $combClass if ($combClass);
410: }
411: }
412: close (BASE);
1.15 duerst 413: print STDERR "Finished reading character database.\n" if (!$OPTq);
1.12 duerst 414:
1.22 duerst 415: if ($OPTF951) {
416: $DecoCanoData{"\xEF\xA5\x91"} =
417: $DecoKompData{"\xEF\xA5\x91"} = "\xE9\x9B\xBB";
418: }
1.12 duerst 419: %DecoCanoRest = %DecoCano = %DecoCanoData; # keep original data as is, and
420: # copy to restrict for composition
421:
422: # list of compatibility kinds for later work
423: # the idea is to group them (e.g. sub and super) and allow
424: # normalization by group
425: #Kompatibility Kind: circle
426: #Kompatibility Kind: compat
427: #Kompatibility Kind: final
428: #Kompatibility Kind: font
429: #Kompatibility Kind: fraction
430: #Kompatibility Kind: initial
431: #Kompatibility Kind: isolated
432: #Kompatibility Kind: medial
433: #Kompatibility Kind: narrow
434: #Kompatibility Kind: noBreak
435: #Kompatibility Kind: small
436: #Kompatibility Kind: square
437: #Kompatibility Kind: sub
438: #Kompatibility Kind: super
439: #Kompatibility Kind: vertical
440: #Kompatibility Kind: wide
441:
1.13 duerst 442: # fully expand canonical decompositions
1.21 duerst 443: my $fixpoint = 0; # set to false
1.12 duerst 444: while (!$fixpoint) {
445: $fixpoint = 1; # set to true
446: print "Fixpoint\n" if ($OPTd);
1.21 duerst 447: foreach my $key (sort keys %DecoCano) {
1.12 duerst 448: my @s = splitutf8($DecoCano{$key});
449: my $i = 0;
1.21 duerst 450: foreach my $c (@s) {
1.12 duerst 451: my $d;
452: if ($d = $DecoCano{$c}) {
453: print "replacing ", utf82ncr($c), " with ",
454: utf82ncr($d), " in ", utf82ncr($key), "\n"
455: if $OPTd;
456: if ($i > 0) {
457: print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n";
458: die "Giving up!\n";
459: }
460: $c = $d;
461: $fixpoint = 0; # changed something; need one more pass
462: }
463: $i++;
464: }
465: $DecoCano{$key} = join "", @s;
466: }
467: }
468:
1.13 duerst 469: # fully expand kompatibility decompositions
470: $fixpoint = 0; # set to false
471: while (!$fixpoint) {
472: $fixpoint = 1; # set to true
473: print "Fixpoint\n" if ($OPTd);
1.21 duerst 474: foreach my $key (sort keys %DecoKompData) {
1.13 duerst 475: my @s = splitutf8($DecoKompData{$key});
476: my $i = 0;
1.21 duerst 477: foreach my $c (@s) {
1.13 duerst 478: my $d;
479: if ($d = $DecoKompData{$c}) {
480: print "replacing ", utf82ncr($c), " with ",
481: utf82ncr($d), " in ", utf82ncr($key), "\n"
482: if $OPTd;
483: if ($i > 0 && splitutf8($d) > 1) {
484: # print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n";
485: # die "Giving up!\n";
486: }
487: $c = $d;
488: $fixpoint = 0; # changed something; need one more pass
489: }
490: $i++;
491: }
492: $DecoKompData{$key} = join "", @s;
493: }
494: }
495:
496: # reorder combining marks for canonical decomposition
1.21 duerst 497: foreach my $key (sort keys %DecoCano) { # sort to sort the output
1.12 duerst 498: my $s = $DecoCano{$key};
499: my $t = join "", sortCano(splitutf8($s));
500: if ($s ne $t) {
501: print STDERR "Error: Had to reorder ", utf82ncr($key), " from ",
502: utf82ncr($s), " to ", utf82ncr($t), "\n";
503: die "Giving up!\n";
504: }
505: $DecoCano{$key} = $t;
506: }
507:
1.13 duerst 508: # reorder combining marks for kompatibility decomposition
1.21 duerst 509: foreach my $key (sort keys %DecoKompData) { # sort to sort the output
1.13 duerst 510: my $s = $DecoKompData{$key};
511: my $t = join "", sortCano(splitutf8($s));
512: if ($s ne $t) {
513: print STDERR "Error: Had to reorder ", utf82ncr($key), " from ",
514: utf82ncr($s), " to ", utf82ncr($t), "\n";
515: die "Giving up!\n";
516: }
517: $DecoKompData{$key} = $t;
518: }
519:
1.26 duerst 520: # detect singular compositions, add to %NFCforbid
1.21 duerst 521: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 522: if (1 == scalar(splitutf8($DecoCanoRest{$key}))) {
523: print 'Singular composition: ', utf82ncr($key), ' from ',
524: utf82ncr($DecoCanoRest{$key}), ", removed.\n"
1.26 duerst 525: if ($OPTd);
1.12 duerst 526: delete $DecoCanoRest{$key};
1.26 duerst 527: $NFCforbid{$key} = 1;
1.12 duerst 528: }
529: }
530:
1.26 duerst 531: # detect 'non-zero' compositions, add to %NFCforbid
1.21 duerst 532: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 533: my @a = splitutf8($DecoCanoRest{$key});
534: if ($CombClass{shift @a}) {
535: print 'Non-zero composition: ', utf82ncr($key), ' from ',
536: utf82ncr($DecoCanoRest{$key}), ", removed.\n"
537: if ($OPTd);
538: delete $DecoCanoRest{$key};
1.26 duerst 539: $NFCforbid{$key} = 1;
1.12 duerst 540: }
541: }
542:
543: # detect 'all-zero' compositions
1.21 duerst 544: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 545: my $allzero = 1;
546: my @a = splitutf8($DecoCanoRest{$key});
1.21 duerst 547: foreach my $c (@a) {
1.12 duerst 548: $allzero = 0 if ($CombClass{$c});
549: }
550: if ($allzero) {
551: print 'All-zero composition: ', utf82ncr($key), ' from ',
552: utf82ncr($DecoCanoRest{$key}), ".\n"
553: if ($OPTd);
554: }
555: }
556:
557:
1.21 duerst 558: my @NoRecomp = ( # Script-specific and post composition, table-based
559: # according to http://www.unicode.org/Public/3.1-Update/CompositionExclusions-3.txt
1.12 duerst 560: '0958', # DEVANAGARI LETTER QA
561: '0959', # DEVANAGARI LETTER KHHA
562: '095A', # DEVANAGARI LETTER GHHA
563: '095B', # DEVANAGARI LETTER ZA
564: '095C', # DEVANAGARI LETTER DDDHA
565: '095D', # DEVANAGARI LETTER RHA
566: '095E', # DEVANAGARI LETTER FA
567: '095F', # DEVANAGARI LETTER YYA
568: '09DC', # BENGALI LETTER RRA
569: '09DD', # BENGALI LETTER RHA
570: '09DF', # BENGALI LETTER YYA
571: '0A33', # GURMUKHI LETTER LLA
572: '0A36', # GURMUKHI LETTER SHA
573: '0A59', # GURMUKHI LETTER KHHA
574: '0A5A', # GURMUKHI LETTER GHHA
575: '0A5B', # GURMUKHI LETTER ZA
576: '0A5E', # GURMUKHI LETTER FA
577: '0B5C', # ORIYA LETTER RRA
578: '0B5D', # ORIYA LETTER RHA
579: '0F43', # TIBETAN LETTER GHA
580: '0F4D', # TIBETAN LETTER DDHA
581: '0F52', # TIBETAN LETTER DHA
582: '0F57', # TIBETAN LETTER BHA
583: '0F5C', # TIBETAN LETTER DZHA
584: '0F69', # TIBETAN LETTER KSSA
585: '0F76', # TIBETAN VOWEL SIGN VOCALIC R
586: '0F78', # TIBETAN VOWEL SIGN VOCALIC L
587: '0F93', # TIBETAN SUBJOINED LETTER GHA
588: '0F9D', # TIBETAN SUBJOINED LETTER DDHA
589: '0FA2', # TIBETAN SUBJOINED LETTER DHA
590: '0FA7', # TIBETAN SUBJOINED LETTER BHA
591: '0FAC', # TIBETAN SUBJOINED LETTER DZHA
592: '0FB9', # TIBETAN SUBJOINED LETTER KSSA
1.21 duerst 593: # 'FB1D' # HEBREW LETTER YOD WITH HIRIQ: see below for $OPTYWH
1.12 duerst 594: 'FB1F', # HEBREW LIGATURE YIDDISH YOD YOD PATAH
595: 'FB2A', # HEBREW LETTER SHIN WITH SHIN DOT
596: 'FB2B', # HEBREW LETTER SHIN WITH SIN DOT
597: 'FB2C', # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
598: 'FB2D', # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
599: 'FB2E', # HEBREW LETTER ALEF WITH PATAH
600: 'FB2F', # HEBREW LETTER ALEF WITH QAMATS
601: 'FB30', # HEBREW LETTER ALEF WITH MAPIQ
602: 'FB31', # HEBREW LETTER BET WITH DAGESH
603: 'FB32', # HEBREW LETTER GIMEL WITH DAGESH
604: 'FB33', # HEBREW LETTER DALET WITH DAGESH
605: 'FB34', # HEBREW LETTER HE WITH MAPIQ
606: 'FB35', # HEBREW LETTER VAV WITH DAGESH
607: 'FB36', # HEBREW LETTER ZAYIN WITH DAGESH
608: 'FB38', # HEBREW LETTER TET WITH DAGESH
609: 'FB39', # HEBREW LETTER YOD WITH DAGESH
610: 'FB3A', # HEBREW LETTER FINAL KAF WITH DAGESH
611: 'FB3B', # HEBREW LETTER KAF WITH DAGESH
612: 'FB3C', # HEBREW LETTER LAMED WITH DAGESH
613: 'FB3E', # HEBREW LETTER MEM WITH DAGESH
614: 'FB40', # HEBREW LETTER NUN WITH DAGESH
615: 'FB41', # HEBREW LETTER SAMEKH WITH DAGESH
616: 'FB43', # HEBREW LETTER FINAL PE WITH DAGESH
617: 'FB44', # HEBREW LETTER PE WITH DAGESH
618: 'FB46', # HEBREW LETTER TSADI WITH DAGESH
619: 'FB47', # HEBREW LETTER QOF WITH DAGESH
620: 'FB48', # HEBREW LETTER RESH WITH DAGESH
621: 'FB49', # HEBREW LETTER SHIN WITH DAGESH
622: 'FB4A', # HEBREW LETTER TAV WITH DAGESH
623: 'FB4B', # HEBREW LETTER VAV WITH HOLAM
624: 'FB4C', # HEBREW LETTER BET WITH RAFE
625: 'FB4D', # HEBREW LETTER KAF WITH RAFE
1.21 duerst 626: 'FB4E', # HEBREW LETTER PE WITH RAFE
627: ## post composition exclusion
1.22 duerst 628: '2ADC', # FORKING
1.21 duerst 629: '1D15E', # MUSICAL SYMBOL HALF NOTE
1.22 duerst 630: '1D15F', # MUSICAL SYMBOL QUARTER NOTE
631: '1D160', # MUSICAL SYMBOL EIGHTH NOTE
632: '1D161', # MUSICAL SYMBOL SIXTEENTH NOTE
633: '1D162', # MUSICAL SYMBOL THIRTY-SECOND NOTE
634: '1D163', # MUSICAL SYMBOL SIXTY-FOURTH NOTE
635: '1D164', # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE
636: '1D1BB', # MUSICAL SYMBOL MINIMA
637: '1D1BC', # MUSICAL SYMBOL MINIMA BLACK
638: '1D1BD', # MUSICAL SYMBOL SEMIMINIMA WHITE
639: '1D1BE', # MUSICAL SYMBOL SEMIMINIMA BLACK
640: '1D1BF', # MUSICAL SYMBOL FUSA WHITE
641: '1D1C0' # MUSICAL SYMBOL FUSA BLACK
1.12 duerst 642: );
1.20 duerst 643:
644: if (!$OPTYWH) {
645: push @NoRecomp, 'FB1D'; # HEBREW LETTER YOD WITH HIRIQ
646: } # see http://www.unicode.org/unicode/uni2errata/Normalization_Corrigendum.html
1.12 duerst 647:
1.26 duerst 648: # remove recomposition exclusions, add to %NFCforbid
1.21 duerst 649: foreach my $hex (@NoRecomp) {
1.12 duerst 650: my $key = &hex2utf8($hex);
651: print 'Non-recomposing composition: ', utf82ncr($key), ' from ',
652: utf82ncr($DecoCanoRest{$key}), ", removed.\n"
653: if ($OPTd);
654: delete $DecoCanoRest{$key};
1.26 duerst 655: $NFCforbid{$key} = 1;
1.12 duerst 656: }
657:
658: # replace with fully expanded decompositions
1.21 duerst 659: foreach my $key (keys %DecoCanoRest) {
1.12 duerst 660: $DecoCanoRest{$key} = $DecoCano{$key}
661: if ($DecoCanoRest{$key} ne $DecoCano{$key});
662: }
663:
664: # detect duplicate compositions ## should not find any
665: if ($OPTd) {
666: print "Checking duplicates, takes some time.\n";
1.21 duerst 667: foreach my $key (sort keys %DecoCanoRest) { # sort to sort output
1.12 duerst 668: my $s = $DecoCanoRest{$key};
1.21 duerst 669: foreach my $key2 (keys %DecoCanoRest) {
1.12 duerst 670: if (($key lt $key2) # don't compare with itself, don't warn twice
671: && ($DecoCanoRest{$key2} eq $s)) { # duplicate compositions
672: print STDERR 'Duplicate composition: ', utf82ncr($key),
673: ' and ', utf82ncr($key2),
674: ' to ', utf82ncr($s), "\n";
675: die "Giving up!\n";
676: }
677: else { next; } # shortcut loop
678: }
679: }
680: }
681:
682: # invert for composition
1.21 duerst 683: foreach my $key (keys %DecoCanoRest) { # use reduced decomps for selection
1.12 duerst 684: if (defined $CompCano{$DecoCanoData{$key}}) # use original data
685: { # (strictly binary)
686: die "Duplicate compositions, giving up.\n";
687: }
688: $CompCano{$DecoCanoData{$key}} = $key;
689: }
690:
691: if ($OPTd) {
1.21 duerst 692: foreach my $key (sort keys %DecoCano) { # sort to sort output
1.13 duerst 693: print 'Final canonical decomposition: ', utf82ncr($key),
1.12 duerst 694: ' to ', utf82ncr($DecoCano{$key}), "\n";
695: }
1.21 duerst 696: foreach my $key (sort keys %CompCano) { # sort to sort output
1.12 duerst 697: print 'Final composition: ', utf82ncr($key),
698: ' to ', utf82ncr($CompCano{$key}), "\n";
699: }
1.21 duerst 700: foreach my $key (sort keys %DecoKompData) { # sort to sort output
1.13 duerst 701: print 'Final kompatibility decomposition: ', utf82ncr($key),
702: ' to ', utf82ncr($DecoKompData{$key}), "\n";
703: }
1.12 duerst 704: }
705:
1.15 duerst 706: print STDERR "Finished processing character data file(s).\n" if (!$OPTq);
707:
708: } # end ReadCharacterDataFile
709:
710: #
711: # store data to file for fast reread
712: #
713:
714: sub StoreData {
715: my ($dataFile) = @_;
716: my %all_data = ();
717:
718: $all_data{exists} = \%exists;
719: $all_data{DecoCano} = \%DecoCano;
720: $all_data{CompCano} = \%CompCano;
721: $all_data{DecoKompData} = \%DecoKompData;
722: $all_data{CombClass} = \%CombClass;
723:
724: require Storable; # in line, to not require module if not needed
725: &Storable::nstore (\%all_data, $dataFile);
726: }
727:
728: #
729: # read data from file
730: #
731:
732: sub ReadStoredData {
733: my ($dataFile) = @_;
734: require Storable; # in line, to not require module if not needed
735: my %all_data = %{&Storable::retrieve ($dataFile)};
736:
737: %exists = %{$all_data{exists}};
738: %DecoCano = %{$all_data{DecoCano}};
739: %CompCano = %{$all_data{CompCano}};
740: %DecoKompData = %{$all_data{DecoKompData}};
741: %CombClass = %{$all_data{CombClass}};
742: }
743:
744: #
745: # Print instructions (-h)
746: #
747:
748: sub PrintInstructions {
749: print STDERR <<EOF;
750:
751: charlint (code name Charly)
752: Character Check and Normalization
753: According to W3C and Unicode Specifications
754: ===========================================
755:
756: $version
757:
758: (c) Keio University 1999, see perl source or
759: http://www.w3.org/International/charlint for details
760:
761: Available options:
762:
763: (options prefixed by # are currently not available)
764: -b: Remove initial 'Byte Order Mark'
765: -B: Supress warning about initial 'Byte Order Mark'
1.22 duerst 766: -c: Detect non-normalized data (but do not normalize)
1.15 duerst 767: -C: Do not normalize
768: -d: Debug: Thoroughly check character data table input
769: -D: Leave after reading in character data
770: -e: # remove undefined codepoints
771: -E: Do not warn about undefined codepoints
1.25 duerst 772: -f file: Read data from file (default is UnicodeData.txt;
773: please use newest V3.2.0 datafiles)
1.22 duerst 774: -F951: Use old (wrong) mapping for U+F951 (use this option
775: if you really need 3.1.0 behaviour)
1.15 duerst 776: -h: Prints out this short description
777: -k: # Warn about compatibility codepoints
778: -K: Normalize out (i.e. decompose) compatibility codepoints
779: -n: Accept &#ddddd; and &#xhhhh; on input
780: (beware of <![CDATA[, <SCRIPT>, <STYLE>)
1.23 duerst 781: -nX: same as -n, plus &#Xhhhh; (use for HTML only!)
1.15 duerst 782: -N: Produce &#xhhhh; on output
1.26 duerst 783: -nf16check: Produce nf16check data tables
1.21 duerst 784: -o: Print out 'unprintable' bytes as \\octal
1.28 ! rishida 785: -p: # Remove stuff in private use areas
! 786: -P: Supress checking private use areas
1.15 duerst 787: -q: Quiet, don't output progress messages
788: -s file: Read data from file produced with -S
789: -S file: Write data to file for fast reload with -s
790: -u: # Fix UTF-8 (convert or remove)
791: -U: Supress checking correctness of UTF-8
792: -v: Print version
793: -x: Do decomposition only
794: -X: Don't do decomposition (assume input is decomposed)
1.20 duerst 795: -YWH: Treat YOD WITH HIRIQ as precomposed (use this option
1.22 duerst 796: if you really need 3.0.0 behaviour)
1.15 duerst 797:
798: EOF
799: # end of raw in-place text
800:
801: # ideas for more options:
802: # * allow to do kompatibility processing by category
803: # * warn/remove plane 14 language tag codes and other crap
804: # * convert crap to what it's supposed to be (difficult)
805: # * directionality control codes
806:
807: } # end &PrintInstructions
808:
809:
810: #
1.21 duerst 811: # Print instructions (-h)
1.15 duerst 812: #
813:
1.21 duerst 814: sub initialize {
815: my ($readStoreFile, $writeStoreFile, $dataFile);
816:
817: # Read options
1.15 duerst 818:
1.21 duerst 819: OPTIONS:
820: while (@ARGV and $ARGV[0] =~ /^-/) {
821: $_ = shift(@ARGV);
822: $OPTb= 1, next OPTIONS if /^-b$/;
823: $OPTB= 1, next OPTIONS if /^-B$/;
1.22 duerst 824: $OPTc= 1, next OPTIONS if /^-c$/;
1.21 duerst 825: $OPTC= 1, next OPTIONS if /^-C$/;
826: $OPTd= 1, next OPTIONS if /^-d$/;
827: $OPTD= 1, next OPTIONS if /^-D$/;
828: $OPTE= 1, next OPTIONS if /^-E$/;
829: if (/^-f$/) {
830: $OPTf = 1;
831: $dataFile = shift(@ARGV);
832: print STDERR "Using character data file $dataFile.",
833: " Maybe not what you intend.\n" if ($dataFile =~ /^-.$/ && !$OPTq);
834: next OPTIONS;
835: }
1.22 duerst 836: $OPTF951= 1, next OPTIONS if /^-F951$/;
1.21 duerst 837: $OPTh= 1, next OPTIONS if /^-h$/;
838: $OPTK= 1, next OPTIONS if /^-K$/;
839: $OPTn= 1, next OPTIONS if /^-n$/;
1.23 duerst 840: $OPTnX=1, next OPTIONS if /^-nX$/;
1.21 duerst 841: $OPTN= 1, next OPTIONS if /^-N$/;
1.26 duerst 842: $OPTnf16= 1, next OPTIONS if /^-nf16check$/;
1.21 duerst 843: $OPTo= 1, next OPTIONS if /^-o$/;
844: $OPTP= 1, next OPTIONS if /^-P$/;
845: $OPTq= 1, next OPTIONS if /^-q$/;
846: if (/^-s$/) {
847: $OPTs = 1;
848: $readStoreFile = shift(@ARGV);
1.24 duerst 849: print STDERR "Reading from store file $readStoreFile.",
850: " Maybe not what you intend.\n"
851: if ($readStoreFile =~ /^-.$/ && !$OPTq);
1.21 duerst 852: next OPTIONS;
853: }
854: if (/^-S$/) {
855: $OPTS = 1;
856: $writeStoreFile = shift(@ARGV);
1.24 duerst 857: print STDERR "Writing to store file $writeStoreFile.",
858: " Maybe not what you intend.\n"
859: if ($writeStoreFile =~ /^-.$/ && !$OPTq);
1.21 duerst 860: next OPTIONS;
861: }
862: $OPTU= 1, next OPTIONS if /^-U$/;
863: $OPTv= 1, next OPTIONS if /^-v$/;
864: $OPTx= 1, next OPTIONS if /^-x$/;
865: $OPTX= 1, next OPTIONS if /^-X$/;
866: $OPTYWH= 1, next OPTIONS if /^-YWH$/;
867: print STDERR "Unrecognized argument: \"", $_, "\"; ignored.\n";
868: }
869:
870: &PrintInstructions() if $OPTh;
871:
872: if ($OPTv) {
873: print $version, ",\nfor more information, use 'charlint -h'.\n";
874: }
875:
876: # Read/write data files
877:
1.25 duerst 878: if ($OPTs && $OPTS) {
879: die "Cannot read and store at the same time (-s/-S).\n";
880: }
881: if ($OPTf) {
882: if ($OPTs) {
883: die "Conflicting character data sources (-f/-s).\n";
884: }
885: else {
886: ReadCharacterDataFile($dataFile);
887: }
888: }
889: else {
890: if ($OPTs) {
891: ReadStoredData ($readStoreFile);
892: }
893: else {
894: # default amounts to -f UnicodeData.txt
895: ReadCharacterDataFile("UnicodeData.txt");
896: }
897: }
898:
899: if ($OPTS) {
1.21 duerst 900: StoreData ($writeStoreFile);
901: }
1.15 duerst 902:
1.21 duerst 903: exit 0 if ($OPTD);
1.15 duerst 904: }
905:
1.21 duerst 906:
1.15 duerst 907: #
1.26 duerst 908: # Produce data for nf16check
909: #
910:
911: # Helper function: check whether a combining character attached
912: # to some other character recombines (and thus, whether the sequence
913: # is not normalized
914: sub recombines
915: {
916: my ($start, $end) = @_;
917: my $orig = $start . $end;
918: my @t = splitutf8(DecoCano($start));
919: push @t, $end;
920: @t = sortCano(@t);
921:
922: my $lastClass = -1; # this eliminates a special check
923: for (my $targetPos = 1; $targetPos < @t; ) {
924: my $chClass = $CombClass{$t[$targetPos]};
925: $chClass = 0 if (!defined($chClass));
926: my $composite = CompCano($t[0], $t[$targetPos]);
927: if (defined($composite) && $lastClass < $chClass) {
928: $t[0] = $composite;
929: splice @t, $targetPos, 1;
930: }
931: elsif ($chClass == 0) {
932: last;
933: }
934: else {
935: $lastClass = $chClass;
936: $targetPos++;
937: }
938: }
939: return 0 if ($orig eq (join "", @t));
940: return 1;
941: }
942:
943: sub produceNF16check
944: {
1.27 duerst 945: print "/* produced by charlint.pl, with option -nf16check */\n";
946: print "/* charlint.pl by Martin J. Du\"rst, W3C, 1999-2003 */\n";
947: print "/* data derived from Unicode Character Database, see */\n";
948: print "/* http://www.unicode.org/Public/UNIDATA/UCD.html#UCD_Terms */\n";
949:
950: print "\n#include \"nf16data.h\"\n\n";
1.26 duerst 951:
952: #### combining classes
953: print "combiningClass combiningClasses[] = {\n "; # header
954: my $count = 0;
955: foreach my $key (sort keys %CombClass) {
1.27 duerst 956: printf "{0x%05X, %3d}", utf82num($key), $CombClass{$key};
1.26 duerst 957: print (!(++$count % 4) ? ",\n " : ", ");
958: }
959: print "\n };\nint combiningClassCount = $count;\n"; # ending
960:
961: #### recombiners
962: # collect all characters potentially relevant for recombination
1.27 duerst 963: my %Recomguesses;
1.26 duerst 964: my %Recombiners;
965: foreach my $key (keys %DecoCanoRest) {
1.27 duerst 966: $Recomguesses{$key} = 1;
1.26 duerst 967: my @deco = splitutf8($DecoCanoRest{$key});
1.27 duerst 968: $Recomguesses{shift @deco} = 1;
1.26 duerst 969: foreach my $k (@deco) {
970: $Recombiners{$k} = 1;
971: }
972: }
973: # try recombination
974: print "\n\nrecombining recombiners[] = {\n "; # header
1.27 duerst 975: my %Recombases;
1.26 duerst 976: $count = 0;
1.27 duerst 977: foreach my $key (sort keys %Recomguesses) {
1.26 duerst 978: foreach my $key2 (sort keys %Recombiners) {
979: if (recombines($key, $key2)) {
1.27 duerst 980: $Recombases{$key} = 1;
981: printf "{0x%04X, 0x%04X}", utf82num($key), utf82num($key2);
1.26 duerst 982: print (!(++$count % 4) ? ",\n " : ", ");
983: }
1.27 duerst 984: if ($key eq $key2) {
985: print STDERR "ERROR: assumption that recombases and recombiners ";
986: print STDERR "do not overlap is violated: base=", utf82ncr($key),
987: ", combiner=", utf82ncr($key2), "\n";
988: }
1.26 duerst 989: }
990: }
1.27 duerst 991: print "\n {0xFFFF, 0xFFFF} /* cieling off */\n"; $count++;
1.26 duerst 992: print " };\nint recombinerCount = $count;\n";
993:
994:
995: #### flags
996: print "\n\nunsigned char flags[] = {\n"; # header
997:
998: for (my $c=0; $c < 0x1D800; $c++) {
999: next if ($c >= 0x10900 && $c < 0x1D000); # jump a big piece
1000: printf " /* U+%04X */ ", $c if (!($c % 8));
1001: my $cu = num2utf8($c);
1002: # surrogates don't exist, but are separate anyway, so do them first
1003: if ($c>=0xD800 && $c<0xDC00 ) { print "HIGH"; } # high surrogates
1004: elsif ($c>=0xDC00 && $c<0xE000 ) { print "loww"; } # low surrogate
1005: elsif (!&exists($cu)) { print "NoNo"; } # does not exist
1006: elsif ($NFCforbid{$cu}==1) { print "NOFC"; } # singleton/excluded
1.27 duerst 1007: elsif ($CombClass{$cu}) { # class > 0
1008: if ($Recombiners{$cu}) { print "ReCo"; } # recombining
1009: else { print "NoRe"; } # not recombining
1010: }
1011: elsif ($Recombiners{$cu}) { print "COM0"; } # class==0, but composing
1.26 duerst 1012: elsif ($c>=0x1100 && $c<=0x1112) { print "Hang"; } # hangul initial consonants
1013: elsif ($c>=0x1161 && $c<=0x1175) { print "hAng"; } # hangul medial vowel
1014: elsif ($c>=0x11A8 && $c<=0x11C2) { print "haNG"; } # hangul trailing consonants
1015: elsif ($c>=0xAC00 && $c<=0xD7A3
1016: && !(($c-0xAC00) % 28)) { print "HAng"; } # initial/medial syllable
1.27 duerst 1017: elsif ($Recombases{$cu}) { print "Base"; } # base that combines
1.26 duerst 1018: else { print "simp"; } # nothing special
1019:
1020: if (!($c % 2)) { print "*16+"; }
1021: elsif (($c+1) % 8) { print ", "; }
1022: else { printf ",\n", $c+1; }
1023: }
1.27 duerst 1024: print "\n };\n\n"; # ending
1.26 duerst 1025: }
1026:
1027:
1028: #
1.21 duerst 1029: # Start of main program
1.15 duerst 1030: #
1031:
1.21 duerst 1032: #
1033: # initialize: read options, read/write character data files
1034: #
1.12 duerst 1035:
1.21 duerst 1036: &initialize;
1.12 duerst 1037:
1.26 duerst 1038: if ($OPTnf16) {
1039: produceNF16check();
1040: exit; # won't do anything else
1041: }
1042:
1.12 duerst 1043: #
1044: # PROCESS ACTUAL FILE(S)
1045: #
1046:
1.21 duerst 1047: my $line = 0;
1.12 duerst 1048:
1049: LINE:
1050: while (<>) {
1051: $line++;
1052:
1053: # Convert NCRs on input
1.23 duerst 1054: if ($OPTn || $OPTnX) {
1.12 duerst 1055: # decimal numeric character references
1056: s/&#([0-9]+)\;/Xnum2utf8($1)/eg;
1057: # hexadecimal numeric character references
1058: s/&#x([0-9a-fA-F]+)\;/Xhex2utf8($1)/eg;
1.23 duerst 1059: if ($OPTnX) {
1060: # hexadecimal numeric character references (HTML only)
1061: s/&#X([0-9a-fA-F]+)\;/Xhex2utf8($1)/eg;
1062: }
1.12 duerst 1063: }
1064:
1065: # Check BOM
1066: if ($line == 1) {
1067: if (!$OPTb && /^\357\273\277/) {
1068: print STDERR "Initial BOM.\n";
1069: }
1070: if ($OPTB) {
1071: s/^\357\273\277// # remove initial BOM
1072: }
1073: }
1074:
1075: # Check UTF-8
1.21 duerst 1076: my $r;
1.12 duerst 1077: if (!$OPTU && ($r = CheckUTF8 ($_))) {
1.21 duerst 1078: print STDERR "Line $line: Non-UTF-8 ($r).\n";
1.12 duerst 1079: die "Giving up!\n";
1080: }
1081:
1082: # Check nonexisting characters
1083: if (!$OPTE && CheckExists ($_)) {
1084: die "Line $line: Non-Existing codepoints.\nGiving up!\n";
1085: }
1086:
1087: # Check private characters
1088: if (!$OPTP && ($r = CheckPrivate ($_))) {
1.21 duerst 1089: die "Line $line: Private charaters ($r).\nGiving up!\n";
1.12 duerst 1090: }
1091:
1.21 duerst 1092: my @line = splitutf8($_);
1.22 duerst 1093: my @lineoriginal = @line;
1.21 duerst 1094: my @line2 = ();
1.12 duerst 1095:
1096: if (!$OPTC) {
1.13 duerst 1097: if ($OPTX) {
1.26 duerst 1098: @line2 = @line;
1099: }
1100: else { # decompose
1101: while (defined(my $s = shift @line)) {
1102: if ($OPTK) {
1103: push @line2, splitutf8(DecoKomp($s));
1104: }
1105: else {
1106: push @line2, splitutf8(DecoCano($s));
1.13 duerst 1107: }
1.12 duerst 1108: }
1109: }
1110:
1111: # canonical reordering
1112: @line = sortCano(@line2);
1113:
1114: # recompose
1.21 duerst 1115: if (!$OPTx && length (@line)) {
1.26 duerst 1116: my $lastClass = -1; # this eliminates a special check
1.21 duerst 1117: my $starterPos = 0;
1118: my $sourceLength = @line;
1119: my $targetPos = 1;
1.26 duerst 1120: my $starterCh = $line[0];
1.21 duerst 1121: for (my $sourcePos = 1; $sourcePos < $sourceLength; $sourcePos++) {
1122: my $ch = $line[$sourcePos];
1123: my $chClass = $CombClass{$ch};
1.12 duerst 1124: $chClass = 0 if (!defined($chClass));
1.21 duerst 1125: my $composite = CompCano($starterCh, $ch);
1.12 duerst 1126: if (defined($composite) && $lastClass < $chClass) {
1127: $line[$starterPos] = $composite;
1128: $starterCh = $composite;
1129: }
1130: elsif ($chClass == 0) {
1131: $starterPos = $targetPos;
1132: $starterCh = $ch;
1133: $lastClass = -1;
1134: $line[$targetPos++] = $ch;
1135: }
1136: else {
1137: $lastClass = $chClass;
1138: $line[$targetPos++] = $ch;
1139: }
1140: }
1141: $#line = $targetPos-1;
1142: } # end of recomposition
1.22 duerst 1143: if ($OPTc && join("",@line) ne join("",@lineoriginal)) {
1144: die "Line $line: Non-normalized data.\nGiving up!\n";
1145: }
1146: } #if (!$OPTC)
1.12 duerst 1147:
1148: printOPT (join "", @line);
1149:
1150: } # end while <>
Webmaster