version 1.21, 2001/10/03 06:40:15
|
version 1.22, 2002/04/03 08:23:42
|
Line 1
|
Line 1
|
#!/usr/bin/perl |
#!/usr/bin/perl |
|
|
# Copyright notice: |
# Copyright notice: |
# (c) Copyright Keio University 1999, 2000 |
# (c) Copyright Keio University 1999-2002 |
# This software is made available under the terms of the |
# This software is made available under the terms of the |
# W3C Software Licence available at |
# W3C Software Licence available at |
# http://www.w3.org/Consortium/Legal/copyright-software. |
# http://www.w3.org/Consortium/Legal/copyright-software. |
Line 16
|
Line 16
|
# Kai Henningsen for finally getting me to clean up |
# Kai Henningsen for finally getting me to clean up |
# for 'use strict' and 'perl -w' |
# for 'use strict' and 'perl -w' |
|
|
# Authors: |
# Author: |
# MJD Martin J. Du"rst, duerst@w3.org |
# MJD Martin J. Du"rst, duerst@w3.org |
|
|
my $version = 'Version 0.49'; |
my $version = 'Version 0.50'; |
|
|
# History: |
# History: |
|
# 2002/04/03: 0.50, updated for 3.2.0; added -F951; added -c MJD |
# 2001/10/03: 0.49, code cleanup for use strict and -w MJD |
# 2001/10/03: 0.49, code cleanup for use strict and -w MJD |
# 2001/04/01: 0.48, updated for 3.1.0 (final) MJD |
# 2001/04/01: 0.48, updated for 3.1.0 (final) MJD |
# 2001/03/07: 0.47, YOD WITH HIRIQ corrigendum MJD |
# 2001/03/07: 0.47, YOD WITH HIRIQ corrigendum MJD |
Line 52 use Storable;
|
Line 53 use Storable;
|
# Global variables (options and data arrays) |
# Global variables (options and data arrays) |
use vars qw($OPTB $OPTC $OPTD $OPTE $OPTK |
use vars qw($OPTB $OPTC $OPTD $OPTE $OPTK |
$OPTN $OPTP $OPTS $OPTU $OPTX $OPTYWH |
$OPTN $OPTP $OPTS $OPTU $OPTX $OPTYWH |
$OPTb $OPTd $OPTf $OPTh $OPTn $OPTo |
$OPTb $OPTc $OPTd $OPTf $OPTF951 $OPTh |
$OPTq $OPTs $OPTv $OPTx |
$OPTn $OPTo $OPTq $OPTs $OPTv $OPTx |
%CombClass %CompCano %DecoCano %DecoCanoData |
%CombClass %CompCano %DecoCano %DecoCanoData |
%DecoCanoRest %DecoKompData %DecoKompKind %exists); |
%DecoCanoRest %DecoKompData %DecoKompKind %exists); |
|
|
Line 400 sub ReadCharacterDataFile {
|
Line 401 sub ReadCharacterDataFile {
|
close (BASE); |
close (BASE); |
print STDERR "Finished reading character database.\n" if (!$OPTq); |
print STDERR "Finished reading character database.\n" if (!$OPTq); |
|
|
|
if ($OPTF951) { |
|
$DecoCanoData{"\xEF\xA5\x91"} = |
|
$DecoKompData{"\xEF\xA5\x91"} = "\xE9\x9B\xBB"; |
|
} |
%DecoCanoRest = %DecoCano = %DecoCanoData; # keep original data as is, and |
%DecoCanoRest = %DecoCano = %DecoCanoData; # keep original data as is, and |
# copy to restrict for composition |
# copy to restrict for composition |
|
|
Line 607 sub ReadCharacterDataFile {
|
Line 612 sub ReadCharacterDataFile {
|
'FB4D', # HEBREW LETTER KAF WITH RAFE |
'FB4D', # HEBREW LETTER KAF WITH RAFE |
'FB4E', # HEBREW LETTER PE WITH RAFE |
'FB4E', # HEBREW LETTER PE WITH RAFE |
## post composition exclusion |
## post composition exclusion |
|
'2ADC', # FORKING |
'1D15E', # MUSICAL SYMBOL HALF NOTE |
'1D15E', # MUSICAL SYMBOL HALF NOTE |
'1D15F', # MUSICAL SYMBOL QUARTER NOTE |
'1D15F', # MUSICAL SYMBOL QUARTER NOTE |
'1D160', # MUSICAL SYMBOL EIGHTH NOTE |
'1D160', # MUSICAL SYMBOL EIGHTH NOTE |
'1D161', # MUSICAL SYMBOL SIXTEENTH NOTE |
'1D161', # MUSICAL SYMBOL SIXTEENTH NOTE |
'1D162', # MUSICAL SYMBOL THIRTY-SECOND NOTE |
'1D162', # MUSICAL SYMBOL THIRTY-SECOND NOTE |
'1D163', # MUSICAL SYMBOL SIXTY-FOURTH NOTE |
'1D163', # MUSICAL SYMBOL SIXTY-FOURTH NOTE |
'1D164', # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE |
'1D164', # MUSICAL SYMBOL ONE HUNDRED TWENTY-EIGHTH NOTE |
'1D1BB', # MUSICAL SYMBOL MINIMA |
'1D1BB', # MUSICAL SYMBOL MINIMA |
'1D1BC', # MUSICAL SYMBOL MINIMA BLACK |
'1D1BC', # MUSICAL SYMBOL MINIMA BLACK |
'1D1BD', # MUSICAL SYMBOL SEMIMINIMA WHITE |
'1D1BD', # MUSICAL SYMBOL SEMIMINIMA WHITE |
'1D1BE', # MUSICAL SYMBOL SEMIMINIMA BLACK |
'1D1BE', # MUSICAL SYMBOL SEMIMINIMA BLACK |
'1D1BF', # MUSICAL SYMBOL FUSA WHITE |
'1D1BF', # MUSICAL SYMBOL FUSA WHITE |
'1D1C0' # MUSICAL SYMBOL FUSA BLACK |
'1D1C0' # MUSICAL SYMBOL FUSA BLACK |
); |
); |
|
|
if (!$OPTYWH) { |
if (!$OPTYWH) { |
Line 743 Available options:
|
Line 749 Available options:
|
(options prefixed by # are currently not available) |
(options prefixed by # are currently not available) |
-b: Remove initial 'Byte Order Mark' |
-b: Remove initial 'Byte Order Mark' |
-B: Supress warning about initial 'Byte Order Mark' |
-B: Supress warning about initial 'Byte Order Mark' |
|
-c: Detect non-normalized data (but do not normalize) |
-C: Do not normalize |
-C: Do not normalize |
-d: Debug: Thoroughly check character data table input |
-d: Debug: Thoroughly check character data table input |
-D: Leave after reading in character data |
-D: Leave after reading in character data |
-e: # remove undefined codepoints |
-e: # remove undefined codepoints |
-E: Do not warn about undefined codepoints |
-E: Do not warn about undefined codepoints |
-f file: Read data from file (no default anymore) |
-f file: Read data from file (no default anymore) |
(please use newest V3.0 datafiles) |
(please use newest V3.2.0 datafiles) |
|
-F951: Use old (wrong) mapping for U+F951 (use this option |
|
if you really need 3.1.0 behaviour) |
-h: Prints out this short description |
-h: Prints out this short description |
-k: # Warn about compatibility codepoints |
-k: # Warn about compatibility codepoints |
-K: Normalize out (i.e. decompose) compatibility codepoints |
-K: Normalize out (i.e. decompose) compatibility codepoints |
Line 768 Available options:
|
Line 777 Available options:
|
-x: Do decomposition only |
-x: Do decomposition only |
-X: Don't do decomposition (assume input is decomposed) |
-X: Don't do decomposition (assume input is decomposed) |
-YWH: Treat YOD WITH HIRIQ as precomposed (use this option |
-YWH: Treat YOD WITH HIRIQ as precomposed (use this option |
if you really need pre-corrigendum behaviour) |
if you really need 3.0.0 behaviour) |
|
|
EOF |
EOF |
# end of raw in-place text |
# end of raw in-place text |
|
|
# ideas for more options: |
# ideas for more options: |
# * don't normalize, just check |
|
# * allow to do kompatibility processing by category |
# * allow to do kompatibility processing by category |
# * warn/remove plane 14 language tag codes and other crap |
# * warn/remove plane 14 language tag codes and other crap |
# * convert crap to what it's supposed to be (difficult) |
# * convert crap to what it's supposed to be (difficult) |
Line 797 sub initialize {
|
Line 805 sub initialize {
|
$_ = shift(@ARGV); |
$_ = shift(@ARGV); |
$OPTb= 1, next OPTIONS if /^-b$/; |
$OPTb= 1, next OPTIONS if /^-b$/; |
$OPTB= 1, next OPTIONS if /^-B$/; |
$OPTB= 1, next OPTIONS if /^-B$/; |
|
$OPTc= 1, next OPTIONS if /^-c$/; |
$OPTC= 1, next OPTIONS if /^-C$/; |
$OPTC= 1, next OPTIONS if /^-C$/; |
$OPTd= 1, next OPTIONS if /^-d$/; |
$OPTd= 1, next OPTIONS if /^-d$/; |
$OPTD= 1, next OPTIONS if /^-D$/; |
$OPTD= 1, next OPTIONS if /^-D$/; |
Line 808 sub initialize {
|
Line 817 sub initialize {
|
" Maybe not what you intend.\n" if ($dataFile =~ /^-.$/ && !$OPTq); |
" Maybe not what you intend.\n" if ($dataFile =~ /^-.$/ && !$OPTq); |
next OPTIONS; |
next OPTIONS; |
} |
} |
|
$OPTF951= 1, next OPTIONS if /^-F951$/; |
$OPTh= 1, next OPTIONS if /^-h$/; |
$OPTh= 1, next OPTIONS if /^-h$/; |
$OPTK= 1, next OPTIONS if /^-K$/; |
$OPTK= 1, next OPTIONS if /^-K$/; |
$OPTn= 1, next OPTIONS if /^-n$/; |
$OPTn= 1, next OPTIONS if /^-n$/; |
Line 924 while (<>) {
|
Line 934 while (<>) {
|
} |
} |
|
|
my @line = splitutf8($_); |
my @line = splitutf8($_); |
|
my @lineoriginal = @line; |
my @line2 = (); |
my @line2 = (); |
|
|
if (!$OPTC) { |
if (!$OPTC) { |
Line 973 while (<>) {
|
Line 984 while (<>) {
|
} |
} |
$#line = $targetPos-1; |
$#line = $targetPos-1; |
} # end of recomposition |
} # end of recomposition |
} #if (!OPTC) |
if ($OPTc && join("",@line) ne join("",@lineoriginal)) { |
|
die "Line $line: Non-normalized data.\nGiving up!\n"; |
|
} |
|
} #if (!$OPTC) |
|
|
printOPT (join "", @line); |
printOPT (join "", @line); |
|
|