#!/usr/bin/perl # Copyrigth notice: # (c) Copyright Keio University 1999 # This software is made available under the terms of the # W3C Software Licence available at # http://www.w3.org/Consortium/Legal/copyright-software. # Acknowledgements: Tim Bray for help with CDATA, # Mark Davis for various discussions about Unicode TR #15 # James Briggs for an error report # Authors: # MJD Martin J. Du"rst, duerst@w3.org # History: # 1999/06/23: 0.30, preparation for W3C member test, without Hangul MJD # 1999/06/25: 0.31, fixed reordering bug, going public MJD # 1999/07/01: 0.32, adapted surrogates/exclusions to 3.0.0.beta MJD # 1999/08/16: 0.33, updated for second version of 3.0.0.beta MJD # 2000/07/24: 0.34, $chClass = $CombClass{ch}; should read $chClass = $CombClass{$ch}; # implemented -C MJD # CVS last revised $Date: 2000/07/24 07:20:35 $ by $Author: duerst $ # # SUBROUTINES # sub addtext { local (*r, $t) = @_; $r = $r ? ($r."; ".$t) : $t; } # Check problems in UTF-8 sub CheckUTF8 { local ($s) = @_; local ($return) = ""; local ($st); if ($s =~ /[\355][\240-\257][\200-\277][\355][\260-\277][\200-\277]/) { &addtext (*return, "surrogate pair") if !$OPTU; } if ($s =~ /[\355][\240-\277][\200-\277]/) { &addtext (*return, "single surrogate") if !$OPTU; } if ($s =~ /[\300-\301][\200-\277]/) { &addtext (*return, "ASCII in 2 bytes") if !$OPTU; } if ($s =~ /[\340][\200-\237][\200-\277]/) { &addtext (*return, "3 bytes instead of 2 or less") if !$OPTU; } if ($s =~ /[\360][\200-\217][\200-\277]{2}/) { &addtext (*return, "4 bytes instead of 3 or less") if !$OPTU; } if ($s =~ /[\370][\200-\207][\200-\277]{3}/) { &addtext (*return, "5 bytes instead of 4 or less") if !$OPTU; } if ($s =~ /[\374][\200-\203][\200-\277]{4}/) { &addtext (*return, "6 bytes instead of 5 or less") if !$OPTU; } # non-synchronized cases $s =~ s{ [\000-\177] | [\300-\337][\200-\277] | [\340-\357][\200-\277]{2} | [\360-\367][\200-\277]{3} | [\370-\373][\200-\277]{4} | [\374-\375][\200-\277]{5} }{}gx; # forbidden bytes if ($s =~ /[\376\377]/) { &addtext (*return, "0xFE or 0xFF byte") if !$OPTU; } if ($s ne "") { &addtext (*return, "synchronization problem") if !$OPTU; } $return; } # end CheckUTF8 sub exists { my $s = shift; return ($exists{$s} or ("\344\270\200" le $s and $s le "\351\276\245") # CJK or ("\352\260\200" le $s and $s le "\355\236\243") # Hangul or ('\343\220\200' le $s and $s le '\344\266\265')); # CJK Extension A } sub CheckExists { my $s = @_; my $news = join "", grep (&exists($_), $s =~ m/([\000-\177]|[\300-\377][\200-\277]+)/go); return ($s ne $news); } # end CheckExists sub CheckPrivate { my $s = @_; if ($s =~ /[\356][\200-\277]{2}|[\357][\200-\237][\200-\277]/) { return "BMP"; } if ($s =~ /[\363][\260-\277][\200-\277]{2}/) { return "plane 15"; } if ($s =~ /[\364][\200-\217][\200-\277]{2}/) { return "plane 16"; } } # end CheckPrivate #### convert hex to UTF-8 sub hex2utf8 { num2utf8 (hex($_[0])); } # end hex2utf8 sub Xhex2utf8 { #### avoid to covert <>&" local ($t) = hex($_[0]); if ($t < 0x40) { return "&#x".$_[0].";"; } num2utf8 ($t); } # end Xhex2utf8 sub Xnum2utf8 { #### avoid to covert <>&" local ($t) = @_; if ($t < 64) { return "&#".$t.";"; } num2utf8 ($t); } # end Xnum2utf8 #### convert number to UTF-8 sub num2utf8 { local($t) = @_; local($trail, $firstbits, @result); if ($t<0x00000080) { $firstbits=0x00; $trail=0; } elsif ($t<0x00000800) { $firstbits=0xC0; $trail=1; } elsif ($t<0x00010000) { $firstbits=0xE0; $trail=2; } elsif ($t<0x00200000) { $firstbits=0xF0; $trail=3; } elsif ($t<0x04000000) { $firstbits=0xF8; $trail=4; } elsif ($t<0x80000000) { $firstbits=0xFC; $trail=5; } else { die "Too large scalar value, cannot be converted to UTF-8.\n"; } for (1 .. $trail) { unshift (@result, ($t & 0x3F) | 0x80); $t >>= 6; # slight danger of non-portability } unshift (@result, $t | $firstbits); pack ("C*", @result); } # end num2utf8 sub utf82ncr { # works for more than one character my $r; foreach $c (splitutf8(shift)) { $r .= ($c =~ /[\040-\177]/) ? $c : sprintf ("&#x%lX;",&utf82num($c)); } $r; } # end utf82ncr #### convert UTF-8 to number sub utf82num { local(@t, $t, $result); @t = unpack ("C*", $_[0]); $t = shift (@t); if ($t<0x80) { $result= $t ; $trail=0; } elsif ($t<0xC0) { die "Illegal leading byte in UTF-8.\n"; } elsif ($t<0xE0) { $result= $t & 0x1F; $trail=1; } elsif ($t<0xF0) { $result= $t & 0x0F; $trail=2; } elsif ($t<0xF8) { $result= $t & 0x07; $trail=3; } elsif ($t<0xFC) { $result= $t & 0x03; $trail=4; } elsif ($t<0xFE) { $result= $t & 0x01; $trail=5; } else { die "Illegal byte in UTF-8.\n"; } if ($trail != $#t + 1) { die "Not right number of trailing bytes.\n"; } while ($trail--) { # maybe check for 01xxxxxx $result <<= 6; $result += 0x3F & shift (@t); } return $result; } # end utf82num #### variant of hex2utf8 to get rid of spaces sub spacehex2utf8 { my ($t) = @_; return "" if ($t eq " "); hex2utf8($t); } # end spacehex2utf8 #### split an utf-8 string into codepoints sub splitutf8 { split (/(?=[\000-\177\300-\377])/, shift); } # end splitutf8 #### canonical sort of combining marks # input: utf-8 string # return: utf-8 string sub sortCano { my @a = @_; my $i, $ccHere, $ccPrev, $temp; return @a if (@a <= 1); for ($i=1; $i < @a; $i++) { $ccHere = $CombClass{$a[$i]}; $ccPrev = $CombClass{$a[$i-1]}; $ccHere = 0 if (!defined($ccHere)); $ccPrev = 0 if (!defined($ccPrev)); if ($ccHere != 0 && $ccPrev > $ccHere) { $temp = $a[$i]; # exchange $a[$i] = $a[$i-1]; $a[$i-1] = $temp; $i -= 2 if ($i > 1); # backtrack and check again } } return @a; } # end sortCano sub printoctal { my $s = shift; $s =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/ge; print STDERR $s; } # end printoctal #### output data considering all relevant options sub printOPT { my $t = shift; if ($OPTN) { # hexadecimal numeric character references $t =~ s/([\300-\377][\200-\277]+)/utf82ncr($1)/eg; } elsif ($OPTo) { $t =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/eg; } print $t; } # # Print instructions (-h) # $version = 'Version 0.32, without Hangul'; sub PrintInstructions { print STDERR <,