#!/usr/bin/perl

# Copyrigth notice:
#    (c) Copyright Keio University 1999
#    This software is made available under the terms of the
#    W3C Software Licence available at
#    http://www.w3.org/Consortium/Legal/copyright-software.


# Acknowledgements: Tim Bray for help with CDATA,
#                   Mark Davis for various discussions about Unicode TR #15
#                   James Briggs for an error report

# Authors:
# MJD Martin J. Du"rst, duerst@w3.org

# History:
# 1999/06/23: 0.30, preparation for W3C member test, without Hangul  MJD
# 1999/06/25: 0.31, fixed reordering bug, going public               MJD
# 1999/07/01: 0.32, adapted surrogates/exclusions to 3.0.0.beta      MJD
# 1999/08/16: 0.33, updated for second version of 3.0.0.beta         MJD
# 2000/07/24: 0.34, $chClass = $CombClass{ch}; should read $chClass = $CombClass{$ch};
#                   implemented -C                                   MJD

# CVS last revised $Date: 2000/07/24 07:20:35 $ by $Author: duerst $


#
#   SUBROUTINES
#

sub addtext {
    local (*r, $t) = @_;
    $r =  $r ? ($r."; ".$t) : $t;
}

# Check problems in UTF-8

sub CheckUTF8 {
    local ($s) = @_;
    local ($return) = "";
    local ($st);
    if ($s =~ /[\355][\240-\257][\200-\277][\355][\260-\277][\200-\277]/) {
	&addtext (*return, "surrogate pair") if !$OPTU;
    }
    if ($s =~ /[\355][\240-\277][\200-\277]/) {
	&addtext (*return, "single surrogate") if !$OPTU;
    }
    if ($s =~ /[\300-\301][\200-\277]/) {
	&addtext (*return, "ASCII in 2 bytes") if !$OPTU;
    }
    if ($s =~ /[\340][\200-\237][\200-\277]/) {
	&addtext (*return, "3 bytes instead of 2 or less") if !$OPTU;
    }
    if ($s =~ /[\360][\200-\217][\200-\277]{2}/) {
	&addtext (*return, "4 bytes instead of 3 or less") if !$OPTU;
    }
    if ($s =~ /[\370][\200-\207][\200-\277]{3}/) {
	&addtext (*return, "5 bytes instead of 4 or less") if !$OPTU;
    }
    if ($s =~ /[\374][\200-\203][\200-\277]{4}/) {
	&addtext (*return, "6 bytes instead of 5 or less") if !$OPTU;
    }
    # non-synchronized cases
    $s =~ s{   [\000-\177]
             | [\300-\337][\200-\277]
             | [\340-\357][\200-\277]{2}
             | [\360-\367][\200-\277]{3}
             | [\370-\373][\200-\277]{4}
             | [\374-\375][\200-\277]{5}
           }{}gx;
    # forbidden bytes
    if ($s =~ /[\376\377]/) {
	&addtext (*return, "0xFE or 0xFF byte") if !$OPTU;
    }
    if ($s ne "") {
	&addtext (*return, "synchronization problem") if !$OPTU;
    }
    $return;
} # end CheckUTF8

sub exists {
    my $s = shift;
    return ($exists{$s}
        or ("\344\270\200" le $s and $s le "\351\276\245")  # CJK
        or ("\352\260\200" le $s and $s le "\355\236\243")  # Hangul
	or ('\343\220\200' le $s and $s le '\344\266\265')); # CJK Extension A
}


sub CheckExists {
    my $s = @_;
    my $news = join "",
       grep (&exists($_), $s =~ m/([\000-\177]|[\300-\377][\200-\277]+)/go);
    return ($s ne $news);
} # end CheckExists

sub CheckPrivate {
    my $s = @_;
    if ($s =~ /[\356][\200-\277]{2}|[\357][\200-\237][\200-\277]/) {
	return "BMP";
    }
    if ($s =~ /[\363][\260-\277][\200-\277]{2}/) {
        return "plane 15";
    }
    if ($s =~ /[\364][\200-\217][\200-\277]{2}/) {
        return "plane 16";
    }
} # end CheckPrivate


#### convert hex to UTF-8

sub hex2utf8 {
    num2utf8 (hex($_[0]));
} # end hex2utf8

sub Xhex2utf8 {      #### avoid to covert <>&"
    local ($t) = hex($_[0]);
    if ($t < 0x40)  { return "&#x".$_[0].";"; }
    num2utf8 ($t);
} # end Xhex2utf8

sub Xnum2utf8 {      #### avoid to covert <>&"
    local ($t) = @_;
    if ($t < 64)  { return "&#".$t.";"; }
    num2utf8 ($t);
} # end Xnum2utf8

#### convert number to UTF-8

sub num2utf8 {
    local($t) = @_;
    local($trail, $firstbits, @result);

    if    ($t<0x00000080) { $firstbits=0x00; $trail=0; }
    elsif ($t<0x00000800) { $firstbits=0xC0; $trail=1; }
    elsif ($t<0x00010000) { $firstbits=0xE0; $trail=2; }
    elsif ($t<0x00200000) { $firstbits=0xF0; $trail=3; }
    elsif ($t<0x04000000) { $firstbits=0xF8; $trail=4; }
    elsif ($t<0x80000000) { $firstbits=0xFC; $trail=5; }
    else {
	die "Too large scalar value, cannot be converted to UTF-8.\n";
    }
    for (1 .. $trail) {
	unshift (@result, ($t & 0x3F) | 0x80);
        $t >>= 6;         # slight danger of non-portability
    }
    unshift (@result, $t | $firstbits);
    pack ("C*", @result);
} # end num2utf8


sub utf82ncr {   # works for more than one character
    my $r;
    foreach $c (splitutf8(shift)) {
	$r .= ($c =~ /[\040-\177]/) ? $c : sprintf ("&#x%lX;",&utf82num($c));
    }
    $r;
} # end utf82ncr


#### convert UTF-8 to number

sub utf82num {
    local(@t, $t, $result);

    @t = unpack ("C*", $_[0]);
    $t = shift (@t);
    if    ($t<0x80) { $result= $t       ; $trail=0; }
    elsif ($t<0xC0) { die "Illegal leading byte in UTF-8.\n"; }
    elsif ($t<0xE0) { $result= $t & 0x1F; $trail=1; }
    elsif ($t<0xF0) { $result= $t & 0x0F; $trail=2; }
    elsif ($t<0xF8) { $result= $t & 0x07; $trail=3; }
    elsif ($t<0xFC) { $result= $t & 0x03; $trail=4; }
    elsif ($t<0xFE) { $result= $t & 0x01; $trail=5; }
    else            { die "Illegal byte in UTF-8.\n"; }

    if ($trail != $#t + 1) { die "Not right number of trailing bytes.\n"; }
    while ($trail--) {
        # maybe check for 01xxxxxx
        $result <<= 6;
        $result += 0x3F & shift (@t);
    }
    return $result;
} # end utf82num

#### variant of hex2utf8 to get rid of spaces
sub spacehex2utf8 {
    my ($t) = @_;
    return "" if ($t eq " ");
    hex2utf8($t);
} # end spacehex2utf8

#### split an utf-8 string into codepoints
sub splitutf8 {
    split (/(?=[\000-\177\300-\377])/, shift);
} # end splitutf8


#### canonical sort of combining marks
# input: utf-8 string
# return: utf-8 string
sub sortCano {
    my @a = @_;
    my $i, $ccHere, $ccPrev, $temp;

    return @a  if (@a <= 1);
    for ($i=1; $i < @a; $i++) {
        $ccHere = $CombClass{$a[$i]};
        $ccPrev = $CombClass{$a[$i-1]};
        $ccHere = 0  if (!defined($ccHere));
        $ccPrev = 0  if (!defined($ccPrev));
        if ($ccHere != 0  &&  $ccPrev > $ccHere) {
            $temp    = $a[$i];     # exchange
            $a[$i]   = $a[$i-1];
            $a[$i-1] = $temp;
            $i -= 2  if ($i > 1);  # backtrack and check again
        }
    }
    return @a;
} # end sortCano

sub printoctal {
    my $s = shift;
    $s =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/ge;
    print STDERR $s;
} # end printoctal


#### output data considering all relevant options
sub printOPT {
    my $t = shift;
    if ($OPTN) {
        # hexadecimal numeric character references
        $t =~ s/([\300-\377][\200-\277]+)/utf82ncr($1)/eg;
    }
    elsif ($OPTo) {
	$t =~ s/([\200-\377])/sprintf("\\%lo",ord($1))/eg;
    }
    print $t;
}


#
# Print instructions (-h)
#

$version = 'Version 0.32, without Hangul';

sub PrintInstructions {
    print STDERR <<EOF;

charlint (code name Charly)
Character Check and Normalization
According to W3C and Unicode Specifications
===========================================

$version

(c) Keio University 1999, see perl source or
    http://www.w3.org/International/charlint for details

Available options:

(options prefixed by # are currently not available)
-b: Remove initial 'Byte Order Mark'
-B: Supress warning about initial 'Byte Order Mark'
-C: Do not normalize
-d: Debug: Thoroughly check character data table input
-D: Leave after reading in character data
-e: # remove undefined codepoints
-E: Do not warn about undefined codepoints
-f file: Read data from file
         (please use newest V3.0 beta datafiles)
-h: Prints out this short description
-k: # Warn about compatibility codepoints
-K: # Normalize out compatibility codepoints
-n: Accept &#ddddd; and &#xhhhh; on input
        (beware of <![CDATA[, <SCRIPT>, <STYLE>)
-N: Produce &#xhhhh; on output
-o: Print out 'unprintable' bytes as \octal
-p: # Remove stuff in private zone
-P: Supress checking private zone
-u: # Fix UTF-8 (convert or remove)
-U: Supress checking correctness of UTF-8
-v: Print version

EOF
# end of raw in-place text

# ideas for more options:
# * don't normalize, just check
# * warn/remove plane 14 language tag codes and other crap
# * convert crap to what it's supposed to be (difficult)
# * directionality control codes

} # end &PrintInstructions


#
# Start of main program
#

#
# Read options
#

$dataFile = "UnicodeData-Latest.txt";    # default for datafile, change with -f

OPTIONS:
while ($ARGV[0] =~ /^-/) {
    $_ = shift(@ARGV);
    $OPTb= 1, next OPTIONS  if /^-b$/;
    $OPTB= 1, next OPTIONS  if /^-B$/;
    $OPTC= 1, next OPTIONS  if /^-C$/;
    $OPTd= 1, next OPTIONS  if /^-d$/;
    $OPTD= 1, next OPTIONS  if /^-D$/;
    $OPTE= 1, next OPTIONS  if /^-E$/;
    if (/^-f$/) {
	$dataFile = shift(@ARGV);
	print STDERR "Using character data file $dataFile.";
	print STDERR " Maybe not what you intend." if ($dataFile =~ /^-.$/);
	print STDERR "\n";
	next OPTIONS;
    }
    $OPTh= 1, next OPTIONS  if /^-h$/;
    $OPTn= 1, next OPTIONS  if /^-n$/;
    $OPTN= 1, next OPTIONS  if /^-N$/;
    $OPTo= 1, next OPTIONS  if /^-o$/;
    $OPTP= 1, next OPTIONS  if /^-P$/;
    $OPTU= 1, next OPTIONS  if /^-U$/;
    $OPTv= 1, next OPTIONS  if /^-v$/;
    print STDERR "Unrecognized argument: \"", $_, "\"; ignored.\n";
}

if ($OPTh) { &PrintInstructions(); }

if ($OPTv) {
    print $version, ",\nfor more information, use 'charlint -h'.\n";
}


#
# read in base data file
#
$line = 0;
if (1) {                        # was if (!$OptC), don't want to realign everything
    open (BASE, $dataFile)
        or die "Cannot open character data file $dataFile.\n";
  BASE:
    while (<BASE>) {
        print STDERR "Reading data file, line $line\n" if !($line % 1000); $line++;
	chop;
	($hex, $name, $category, $combClass, $t4, $dec) = split(/;/);
        #### Check ranges for consistency with handcoded pieces, then skip
        if ($name =~ /^<(.*), (.*)/) {
	    if (    $_ !~ /^4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;$/
                and $_ !~ /^9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;$/
                and $_ !~ /^AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;$/
                and $_ !~ /^D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;$/
                and $_ !~ /^D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
                and $_ !~ /^DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
                and $_ !~ /^DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
                and $_ !~ /^DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
                and $_ !~ /^DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;$/
                and $_ !~ /^DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;$/
                and $_ !~ /^E000;<Private Use, First>;Co;0;L;;;;;N;;;;;$/
                and $_ !~ /^F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;$/
		and $_ !~ /^3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;$/
	        and $_ !~ /^4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;$/
                ) {
		die "Problem with data file consistency, line $line: \n\t$_.\n";
	    }
        }
	else { # normal line processing
	    $u = &hex2utf8($hex);
            $exists{$u} = 1; # to check characters that exist

	    #### Decompositions
	    if ($dec eq "") { }  # no decomposition
	    elsif ($dec =~ /^<(.*)>(.*)/) { # compatibility
                $decKind = $1;
                $dec = $2;
                $DecoKompKind{$u} = $decKind;
                $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
                $DecoKompData{$u} = $dec;
            }
	    else { # canonical decomposition
                $dec =~ s/([0-9a-fA-F]+|\040)/spacehex2utf8($1)/eg;
		$DecoCanoData{$u} = $dec;
	    }

	    #### Canonical Combining Class
	    $CombClass{$u} = $combClass  if ($combClass);
	}
    }
    close (BASE);
    print STDERR "Finished reading character database.\n";

    %DecoCanoRest = %DecoCano = %DecoCanoData;    # keep original data as is, and
                                                  # copy to restrict for composition

# list of compatibility kinds for later work
# the idea is to group them (e.g. sub and super) and allow
# normalization by group
#Kompatibility Kind: circle
#Kompatibility Kind: compat
#Kompatibility Kind: final
#Kompatibility Kind: font
#Kompatibility Kind: fraction
#Kompatibility Kind: initial
#Kompatibility Kind: isolated
#Kompatibility Kind: medial
#Kompatibility Kind: narrow
#Kompatibility Kind: noBreak
#Kompatibility Kind: small
#Kompatibility Kind: square
#Kompatibility Kind: sub
#Kompatibility Kind: super
#Kompatibility Kind: vertical
#Kompatibility Kind: wide

    # fully expand decompositions
    $fixpoint = 0;  # set to false
    while (!$fixpoint) {
        $fixpoint = 1;  # set to true
        print "Fixpoint\n"  if ($OPTd);
	foreach $key  (sort keys %DecoCano) {
	    my @s = splitutf8($DecoCano{$key});
            my $i = 0;
	    foreach $c (@s) {
		my $d;
		if ($d = $DecoCano{$c}) {
		    print "replacing ", utf82ncr($c), " with ",
                            utf82ncr($d), " in ", utf82ncr($key), "\n"
                        if $OPTd;
                    if ($i > 0) {
                        print STDERR "Rear expansion, against assumptions (use data from V3.0 upwards)!\n";
                        die "Giving up!\n";
                    }
		    $c = $d;
		    $fixpoint = 0;  # changed something; need one more pass 
		}
                $i++;
	    }
	    $DecoCano{$key} = join "", @s;
	}
    }

    # reorder combining marks
    foreach $key  (sort keys %DecoCano) {   # sort to sort the output
	my $s = $DecoCano{$key};
	my $t = join "", sortCano(splitutf8($s));
        if ($s ne $t) {
	    print STDERR "Error: Had to reorder ", utf82ncr($key), " from ",
                     utf82ncr($s), " to ", utf82ncr($t), "\n";
            die "Giving up!\n";
	}
        $DecoCano{$key} = $t;
    }

    # detect singular compositions
    foreach $key  (sort keys %DecoCanoRest) {   # sort to sort output
	if (1 == scalar(splitutf8($DecoCanoRest{$key}))) {
	    print 'Singular composition: ', utf82ncr($key), ' from ',
                      utf82ncr($DecoCanoRest{$key}), ", removed.\n"
                if ($OPTd);
	    delete $DecoCanoRest{$key};
	}
    }

    # detect 'non-zero' compositions
    foreach $key  (sort keys %DecoCanoRest) {   # sort to sort output
	my @a = splitutf8($DecoCanoRest{$key});
	if ($CombClass{shift @a}) {
            print 'Non-zero composition: ', utf82ncr($key), ' from ',
                    utf82ncr($DecoCanoRest{$key}), ", removed.\n"
                if ($OPTd);
	    delete $DecoCanoRest{$key};
	    last;
        }
    }

    # detect 'all-zero' compositions
    foreach $key  (sort keys %DecoCanoRest) {   # sort to sort output
	my $allzero = 1;
	my @a = splitutf8($DecoCanoRest{$key});
	foreach $c (@a) {
	    $allzero = 0  if ($CombClass{$c});
	}
	if ($allzero) {
	    print 'All-zero composition: ', utf82ncr($key), ' from ',
                      utf82ncr($DecoCanoRest{$key}), ".\n"
                if ($OPTd);
	}
    }


    @NoRecomp = (  # Script-specific, table-based
                   # according to ftp://ftp.unicode.org/Public/3.0-Update/CompositionExclusions-1.beta.txt
    '0958',  # DEVANAGARI LETTER QA
    '0959',  # DEVANAGARI LETTER KHHA
    '095A',  # DEVANAGARI LETTER GHHA
    '095B',  # DEVANAGARI LETTER ZA
    '095C',  # DEVANAGARI LETTER DDDHA
    '095D',  # DEVANAGARI LETTER RHA
    '095E',  # DEVANAGARI LETTER FA
    '095F',  # DEVANAGARI LETTER YYA
    '09DC',  # BENGALI LETTER RRA
    '09DD',  # BENGALI LETTER RHA
    '09DF',  # BENGALI LETTER YYA
    '0A33',  # GURMUKHI LETTER LLA
    '0A36',  # GURMUKHI LETTER SHA
    '0A59',  # GURMUKHI LETTER KHHA
    '0A5A',  # GURMUKHI LETTER GHHA
    '0A5B',  # GURMUKHI LETTER ZA
    '0A5E',  # GURMUKHI LETTER FA
    '0B5C',  # ORIYA LETTER RRA
    '0B5D',  # ORIYA LETTER RHA
    'FB1F',  # HEBREW LIGATURE YIDDISH YOD YOD PATAH
    'FB2A',  # HEBREW LETTER SHIN WITH SHIN DOT
    'FB2B',  # HEBREW LETTER SHIN WITH SIN DOT
    'FB2C',  # HEBREW LETTER SHIN WITH DAGESH AND SHIN DOT
    'FB2D',  # HEBREW LETTER SHIN WITH DAGESH AND SIN DOT
    'FB2E',  # HEBREW LETTER ALEF WITH PATAH
    'FB2F',  # HEBREW LETTER ALEF WITH QAMATS
    'FB30',  # HEBREW LETTER ALEF WITH MAPIQ
    'FB31',  # HEBREW LETTER BET WITH DAGESH
    'FB32',  # HEBREW LETTER GIMEL WITH DAGESH
    'FB33',  # HEBREW LETTER DALET WITH DAGESH
    'FB34',  # HEBREW LETTER HE WITH MAPIQ
    'FB35',  # HEBREW LETTER VAV WITH DAGESH
    'FB36',  # HEBREW LETTER ZAYIN WITH DAGESH
    'FB38',  # HEBREW LETTER TET WITH DAGESH
    'FB39',  # HEBREW LETTER YOD WITH DAGESH
    'FB3A',  # HEBREW LETTER FINAL KAF WITH DAGESH
    'FB3B',  # HEBREW LETTER KAF WITH DAGESH
    'FB3C',  # HEBREW LETTER LAMED WITH DAGESH
    'FB3E',  # HEBREW LETTER MEM WITH DAGESH
    'FB40',  # HEBREW LETTER NUN WITH DAGESH
    'FB41',  # HEBREW LETTER SAMEKH WITH DAGESH
    'FB43',  # HEBREW LETTER FINAL PE WITH DAGESH
    'FB44',  # HEBREW LETTER PE WITH DAGESH
    'FB46',  # HEBREW LETTER TSADI WITH DAGESH
    'FB47',  # HEBREW LETTER QOF WITH DAGESH
    'FB48',  # HEBREW LETTER RESH WITH DAGESH
    'FB49',  # HEBREW LETTER SHIN WITH DAGESH
    'FB4A',  # HEBREW LETTER TAV WITH DAGESH
    'FB4B',  # HEBREW LETTER VAV WITH HOLAM
    'FB4C',  # HEBREW LETTER BET WITH RAFE
    'FB4D',  # HEBREW LETTER KAF WITH RAFE
    'FB4E'   # HEBREW LETTER PE WITH RAFE
    );

    # remove script-specific
    foreach $hex  (@NoRecomp) {
	my $key = &hex2utf8($hex);
	print 'Non-recomposing composition: ', utf82ncr($key), ' from ',
                  utf82ncr($DecoCanoRest{$key}), ", removed.\n"
            if ($OPTd);
	delete $DecoCanoRest{$key};
    }

    # replace with fully expanded decompositions
    foreach $key  (keys %DecoCanoRest) {
        $DecoCanoRest{$key} =  $DecoCano{$key}
            if ($DecoCanoRest{$key} ne $DecoCano{$key});
    }

    # detect duplicate compositions ## should not find any
    if ($OPTd) {
        print "Checking duplicates, takes some time.\n";
        foreach $key  (sort keys %DecoCanoRest) {   # sort to sort output
	    my $s = $DecoCanoRest{$key};
	    foreach $key2  (keys %DecoCanoRest) {
	        if (($key lt $key2)   # don't compare with itself, don't warn twice
		        && ($DecoCanoRest{$key2} eq $s)) { # duplicate compositions
		    print STDERR 'Duplicate composition: ', utf82ncr($key),
		                        ' and ', utf82ncr($key2),
                                         ' to ', utf82ncr($s), "\n";
                    die "Giving up!\n";
	        }
	        else { next; }  # shortcut loop
	    }
        }
    }

    # invert for composition
    foreach $key  (keys %DecoCanoRest) {   # use reduced decomps for selection
	if (defined $CompCano{$DecoCanoData{$key}})  # use original data
	{                                            # (strictly binary)
	    die "Duplicate compositions, giving up.\n";
	}
        $CompCano{$DecoCanoData{$key}} = $key;
    }

    if ($OPTd) {
        foreach $key  (sort keys %DecoCano) {   # sort to sort output
	    print 'Final decomposition: ', utf82ncr($key),
		      ' to ', utf82ncr($DecoCano{$key}), "\n";
	}
        foreach $key  (sort keys %CompCano) {   # sort to sort output
	    print 'Final composition: ', utf82ncr($key),
		      ' to ', utf82ncr($CompCano{$key}), "\n";
	}
    }

print STDERR "Finished processing character data file(s).\n";

} # end (formerly if (!OPTC))

exit 0  if ($OPTD);

#
# PROCESS ACTUAL FILE(S)
#

$line = 0;

LINE:
while (<>) {
    $line++;

    # Convert NCRs on input
    if ($OPTn) {
        # decimal numeric character references
        s/&#([0-9]+)\;/Xnum2utf8($1)/eg;
        # hexadecimal numeric character references
        s/&#x([0-9a-fA-F]+)\;/Xhex2utf8($1)/eg;
    }

    # Check BOM
    if ($line == 1) {
        if (!$OPTb && /^\357\273\277/) {
            print STDERR "Initial BOM.\n";
        }
        if ($OPTB) {
            s/^\357\273\277//    # remove initial BOM
        }
    }

    # Check UTF-8
    if (!$OPTU && ($r = CheckUTF8 ($_))) {
	print STDERR "Line $line: Non-UTF-8 ($r).\n";
        die "Giving up!\n";
    }

    # Check nonexisting characters
    if (!$OPTE && CheckExists ($_)) {
        die "Line $line: Non-Existing codepoints.\nGiving up!\n";
    }

    # Check private characters
    if (!$OPTP && ($r = CheckPrivate ($_))) {
	die "Line $line: Private charaters ($r).\nGiving up!\n";
    }

    @line = splitutf8($_);
    @line2 = ();

    if (!$OPTC) {
        # decompose
        while (defined($s = shift @line)) {
            if (!($r = $DecoCano{$s})) {
                $r = $s;
            }
            push @line2, splitutf8($r);
        }


        # canonical reordering
        @line = sortCano(@line2);

        # recompose
        $starterPos   = 0;
        $sourceLength = @line;
        if ($sourceLength == 0) {
            $targetPos = 0;
        }
        else {
            $targetPos = 1;
            $starterCh = $line[0];
            $lastClass = -1;     # this eliminates a special check
        }
        for ($sourcePos = 1; $sourcePos < $sourceLength; $sourcePos++) {
            $ch = $line[$sourcePos];
            $chClass = $CombClass{$ch};
            $chClass = 0  if (!defined($chClass));
            $composite = $CompCano{join "", ($starterCh, $ch)};
            if (defined($composite) && $lastClass < $chClass) {
                $line[$starterPos] = $composite;
                $starterCh = $composite;
            }
            elsif ($chClass == 0) {
                $starterPos = $targetPos;
                $starterCh  = $ch;
                $lastClass  = -1;
                $line[$targetPos++] = $ch;
            }
            else {
                $lastClass = $chClass;
                $line[$targetPos++] = $ch;
            }
        }
        $#line = $targetPos-1;
        # end of recomposition
    } #if (!OPTC)

    printOPT (join "", @line);

} # end while <>