#!/usr/bin/perl -w
=pod
=head1 NAME
poxhtml - translate an XHTML file or generate a PO file for it
=head1 SYNOPSIS
poxhtml [ -a ] [ -c comment ] [ -r XHTML-file ] [ -- ] [ XHTML-file ]
poxhtml [ -t po-file ] [ -w ] [ -- ] [ XHTML-file ]
=head1 DESCRIPTION
Without the B<-t> option, B<poxhtml> takes an XHTML file and generates
a PO file with all translatable text. With the B<-t> option, it takes
an XHTML file and uses the PO file of the B<-t> option to generate a
translation of the XHTML file.
Thus a typical sequence of actions is:
poxhtml my-english-file.xhtml >french.po
... translate french.po to French...
poxhtml -t french.po my-english-file.xhtml >my-french-file.xhtml
If no XHTML file is given, B<poxhtml> reads from the standard input.
Output is always to standard output.
The translated output isn't validated and may contain mark-up errors
if the translations in the PO file contain invalid mark-up.
For best results, the XHTML file should be normalized in some
way. The translate program itself does not do so. E.g., use
=begin text
hxnormalize -l 10000 -i 0 -x
=end text
=head1 OPTIONS
=over
=item B<-a>
Normally, B<poxhtml> only outputs the text that is likely to need
translation. It doesn't output chunks of text that only consist of
mark-up. With this option, it outputs everything.
In principle, it is possible to reconstruct the XHTML file from the PO
file (except for the DTD subset, if any). The process would be as
follows: start with the very last msgid in the PO file and look for
placeholders of the form E<lt>I<N>E<gt> (where I<N> is a number). Then
find the msgid with I<N> preceded by S<"# I<N>"> and replace the
placeholder by that msgid. And so on, recursively.
=item B<-c> I<comment>
Adds I<comment> as a comment before every translatable string, i.e.,
as a line of the form S<"# I<comment>">.
=item B<-r> I<XHTML-file>
The I<XHTML-file> is assumed to be an already translated version of
the XHTML file for which a PO file is generated. B<poxhtml> will try
to use the translated file to pre-fill translations in the PO file. If
the two XHTML files cannot be aligned, i.e., if they do not have the
same number of translatable chunks, B<poxhtml> will give an error and
will not output any PO file.
=item B<-t> I<po-file>
I<po-file> must be a valid PO file. The presence of the B<-t> option
indicates that B<poxhtml> should not output a PO file for the given
XHTML file, but should try to translate the file, using the strings in
I<po-file>.
If there are any chunks of translatable text in the XHTML file for
which no translation exists in I<po-file>, they will be left
untranslated in the output.
=item B<-w>
With option B<-w>, any translatable text for which no translation was
found in the PO file (see option B<-t>) will be enclosed in C<E<lt>ins
class=untranslatedE<gt>...E<lt>/insE<gt>>.
=back
=head1 EXIT STATUS
Exit code is non-zero if the command line could not be parsed, a file
could not be opened, or a warning (see B<-w>) was inserted.
=head1 BUGS
Currently only handles XHTML (1.0 or 1.1). When converting HTML
documents in the XML encoding of HTML5 to PO files, some chunks may be
missing (unless B<-a> is used).
=cut
# TODO: Check character encoding of PO file
#
# TODO: Do something smart with msgctxt to allow different
# translations for the same string in different contexts.
#
# TODO: Add the new elements from HTML5.
#
# Created: 27 April 2012
# Author: Bert Bos
#
# Copyright © 2012 World Wide Web Consortium
# See http://www.w3.org/Consortium/Legal/copyright-software
use strict;
use utf8;
use Getopt::Std;
# USAGE -- return the two-line usage message, with the program named by
# the basename of $0 (everything up to the last "/" is dropped).
sub USAGE {
  (my $prog = $0) =~ s/.*\///;
  return "Usage: $prog [ -a ] [ -c comment ] [ -r translated-file ] [ XHTML-file ]\
or: $prog -t PO-file [ -w ] [ XHTML-file ]\n";
}
# Building blocks for recognizing XHTML mark-up.
#
# The inside of a tag after the element name: any run of characters
# other than "/", ">" and quotes, where a quoted attribute value counts
# as a single unit (so it may itself contain "/" or ">").
my $INTAG = qr/(?:[^\/"'>]|"[^"]*"|'[^']*')+/so;
# Names of elements treated as inline (matched case-insensitively).
my $INLINE_ELT = qr/\b(?:EM|STRONG|DFN|CODE|SAMP|KBD|VAR|CITE|ABBR|ACRONYM|A|Q|
                      SUB|SUP|SPAN|BDO|TT|I|B|BIG|SMALL|TEXTAREA|LABEL|OBJECT|
                      IMG|BR|DEL|INS)\b/xio;
# Names of elements treated as blocks (matched case-insensitively).
# BLOCKQUOTE was misspelled "BLOCQUOTE" and H5 was missing from the
# heading list, so neither element was ever recognized as a block.
my $BLOCK_ELT = qr/\b(?:ADDRESS|BLOCKQUOTE|CAPTION|DIV|DD|DL|DT|FIELDSET|FORM|
                     H1|H2|H3|H4|H5|H6|LI|P|OL|TABLE|TBODY|TD|TH|THEAD|TITLE|TR|
                     UL|PRE)\b/xio;
# A mark-up comment. This pattern was empty, which matches the empty
# string at every position and never matches an actual comment.
my $COMMENT = qr/<!--.*?-->/so;
# A processing instruction: from "<?" up to the first ">".
my $PI = qr/<\?.*?>/so;
# Character data: a non-empty run without "<".
my $CONTENT = qr/(?:[^<])+/so;
# A run of character data and inline start/end/empty tags.
my $INLINE = qr/(?:${CONTENT}|<\/?${INLINE_ELT}${INTAG}?\/?>)+/so;
# Attributes whose values are human-readable text.
my $TEXTATTR = qr/\b(?:value|summary|alt|title)\b/io;
#my $NOTRANS = qr/\bclass\b\s*=\s*(?:notranslate\b|(?:"[^"]*|'[^']*)\bnotranslate\b)/io;
#my $NOTRANSNOTE = "Do not translate or show both translation and original.";
# The content of a string (or sequence of strings) in a PO file:
# ordinary characters, backslash escapes, and the quote-whitespace-quote
# joints between adjacent string segments.
my $INSTR = qr/(?:[^\\"]|\\.|"\s*")*/o;
# The parts of a PO file to skip between a msgid and its msgstr:
# whitespace, "#" comments, and the listed keywords with their strings.
my $SKIP = qr/(?:\s|#.*|\b(?:msgctxt|msgstr_plural|msgstr\[[0-9]*\])\s*${INSTR})*/mo;
# escape -- prepare a chunk of text for use inside a PO string
#
# Placeholders "&N;" are written as "<N>". Backslash, carriage return,
# tab, form feed and double quote get their backslash escapes. Each
# newline becomes "\n" followed by a break into a new string segment,
# and long lines are additionally broken after a space, "/" or "=".
# The result has no enclosing quotes; the caller supplies those.
sub escape($) {
  my ($text) = @_;
  my %backslashed = (
    "\\" => '\\\\',
    "\r" => '\\r',
    "\t" => '\\t',
    "\f" => '\\f',
    '"'  => '\\"',
  );

  $text =~ s/&([0-9]+);/<$1>/g;
  # One pass suffices: none of the replacements contains a character
  # that is itself in the table, except as part of a finished escape.
  $text =~ s/([\\\r\t\f"])/$backslashed{$1}/g;
  $text =~ s/\n/\\n"\n "/g;                      # Escape and start a new segment
  $text =~ s/(.{55,67}[ \/=])/$1"\n "/mg;        # Break over-long lines
  $text =~ s/"\s*"$//s;                          # Drop a trailing empty segment
  return $text;
}
# unescape -- unescape characters and concatenate strings
#
# Takes the text between the outermost quotes of a PO string, which may
# consist of several segments separated by quote-whitespace-quote, and
# returns the decoded text. Finally "<N>" is turned back into the
# placeholder "&N;".
#
# Decoding is done in a single left-to-right pass. Applying one
# substitution per escape in sequence mis-reads an escaped backslash
# that happens to be followed by n, t, r, f or a quote (e.g. the three
# characters \\n would become backslash + newline), and fails to join
# segments when the first one ends in an escaped backslash.
sub unescape($) {
  my ($s) = @_;
  my %char_for = (
    'n'  => "\n",
    '"'  => '"',
    'f'  => "\f",
    't'  => "\t",
    'r'  => "\r",
    "\\" => "\\",
  );

  # Each match is either a backslash escape ($1 is the escaped
  # character) or a joint between two segments ($1 is undefined).
  # Escapes that are not in the table are left as they are.
  $s =~ s{\\(.)|"\s*"}{
      !defined $1          ? ""
    : exists $char_for{$1} ? $char_for{$1}
    :                        $&
  }gse;
  $s =~ s/<([0-9]+)>/&$1;/g;
  return $s;
}
# read_messages -- read a PO file and return the string pairs as a hash
#
# Takes the name of a PO file and returns a hash mapping each unescaped
# msgid to its unescaped msgstr. Entries with an empty msgstr are left
# out. The header entry, if any, is stored under the empty key.
# Dies if the file cannot be opened. Warns if the header declares a
# character set other than UTF-8.
sub read_messages($) {
  my ($po_file) = @_;
  my %h;

  # Three-argument open, so that characters in the file name cannot be
  # taken for an open mode or a pipe.
  open my $handle, '<', $po_file or die $po_file . ": " . $! . "\n";
  my $s = do { local $/; <$handle> };   # Slurp the whole file
  close $handle;

  while ($s =~ /\G.*?\bmsgid\s*"(${INSTR})"${SKIP}\bmsgstr\s*"(${INSTR})"/gso) {
    # Copy the captures before calling anything that does its own matching.
    my ($id, $str) = ($1, $2);
    $h{unescape($id)} = unescape($str) if $str ne "";
  }

  # Look for the charset in the header's Content-Type line. The line
  # start is anchored with ^ (an escaped \^ would ask for a literal
  # caret and never match), and the charset value stops at whitespace,
  # so the newline that ends the header line is not included in it.
  if (defined $h{""} &&
      $h{""} =~ /^Content-Type\s*:.*charset\s*=\s*([^\s,;]+)/im &&
      $1 !~ /^utf-8$/i) {
    warn "PO file is not in UTF-8. Results may be incorrect.\n";
  }
  return %h;
}
# marker -- return the placeholder "&N;" that stands for a string
#
# $textref is a reference to the array of strings collected so far and
# $indexref a reference to the hash that maps each of those strings to
# its position in the array. A string not seen before is appended to
# the array and recorded in the hash. An empty string is never stored;
# its placeholder is the empty string.
sub marker($$$) {
  my ($s, $textref, $indexref) = @_;

  return "" if $s eq "";

  my $slot = $indexref->{$s};
  if (!defined $slot) {                 # First occurrence: store it
    push @$textref, $s;
    $slot = $#$textref;
    $indexref->{$s} = $slot;
  }
  return '&' . $slot . ';';
}
# make_chunks -- return an array with all translatable chunks
#
# Takes the document as a single string. Pieces of the document are
# replaced by numbered "&N;" placeholders through marker(), which
# stores every distinct piece once in @text. What remains of the
# document after the substitutions is pushed as the last element of
# @text, and the array is returned.
sub make_chunks($) {
my ($s) = @_;
# NOTE(review): $n is not referenced in the visible part of this sub.
my $n = 0; # The # of translatable strings
my @text; # Array of translatable strings
my %index; # Inverted @text, to find duplicate strings
# Get rid of comments and processing instructions first.
#
$s =~ s/${COMMENT}|${PI}/marker($&,\@text,\%index)/gseo;
# Next, replace translatable attributes on block elements, because
# block elements themselves are not put in the PO file (unless with
# option -a): TITLE, VALUE, ALT, SUMMARY
#
# Only the attribute value is replaced by a placeholder; the value is
# always written back between double quotes, whichever quotes it had.
#
$s =~ s/(<${BLOCK_ELT}${INTAG}${TEXTATTR}\s*=\s*)(?:"([^"]+)"|'([^']+)')/
$1 . '"' . marker(defined $2?$2:$3,\@text,\%index) . '"'/sego;
# Some special cases as well:
# and and
#
# NOTE(review): the statement below looks damaged. The element names in
# the comment above are missing, the first capture group is empty, and
# the trailing ") {}" has no matching loop or condition header (it
# presumably belonged to a "while (...) {}" -- confirm against the
# upstream source before relying on this code).
$s =~ s/()(${INLINE}?)(<\/${BLOCK_ELT}\s*>)/
marker($1.marker($2,\@text,\%index).$3,\@text,\%index)/xgse ||
$s =~ s/<\w${INTAG}>${CONTENT}?<\/[^>]*>/marker($&,\@text,\%index)/se) {}
# The remainder of the document is the "root chunk" and goes last.
push @text, $s;
return @text;
}
# translatable -- check if a string is likely to need translation
#
# Takes one chunk of text and returns a true value if the chunk
# probably contains text for a translator, a false value otherwise.
sub translatable($) {
  my ($s) = @_;
  # True if the string starts with content (and the content is not
  # just placeholders), or if the string contains an inline element
  # with non-empty content, or if the string contains a human-readable
  # attribute (VALUE, SUMMARY, ALT or TITLE).
  #
  # A chunk made up of nothing but whitespace, comments, processing
  # instructions and placeholders is never translatable.
  #
  # NOTE(review): the "starts with content" test was cut off in this
  # copy of the file (only "/^" was left, which does not compile). It
  # is restored here as "does not start with <", going by the
  # description above -- confirm against the upstream source.
  #
  return $s !~ /^(?:\s|${COMMENT}|${PI}|&[0-9]+;)*$/ &&
    ($s !~ /^</ ||
     $s =~ /<${INLINE_ELT}${INTAG}?(?:${TEXTATTR}|>${CONTENT})/);
}
# Main body
my @text; # List of chunks
my $orig = ""; # Original language
my @reference; # Translated strings
my $target = ""; # Target language
my %dict; # Translation dictionary read from PO file
my %opts; # Command line options
my $exitcode = 0; # 1 if warnings/errors are detected
getopts('wat:c:r:', \%opts) or die USAGE;
%dict = read_messages($opts{'t'}) if (defined $opts{'t'});
local $/; # Undefine $/, enable slurp mode
# my $_ = <>; # Read the file into $_
$_ = <>; # Read the file into $_
# See if we know the original language, so that we can put it back in
# if the option -w calls for warnings.
#
if (defined $opts{'w'} &&
/, but "&N;" is more cconvenient at this
# stage.
#
@text = make_chunks($_);
# Generate the translated input (option -t) or a PO/POT file.
#
if (defined $opts{'t'}) {
# Translate all strings.
#
for (my $i = 0; $i < @text; $i++) {
if (defined $dict{$text[$i]}) {
$text[$i] = $dict{$text[$i]};
} elsif (defined $opts{'w'} && translatable($text[$i])) {
$text[$i]= "$text[$i]";
$exitcode = 1;
}
}
# Replace all placeholders with the translated strings. The last
# item in @text is the "root chunk."
#
$_ = pop @text;
while (s/&([0-9]+);/$text[$1]/go) {}
# If the PO file contained a Language header, put a corresponding
# LANG attribute on the HTML element.
#
if (defined $dict{""} && $dict{""} =~ /^Language *: *(\S+)/mi) {
my $lang = unescape($1);
$lang =~ s/"/"/g;
s/(.\n" if ($exitcode);
} else {
# If we have an already translated reference file, analyze that, too.
#
if (defined $opts{'r'}) {
open(my $ref, $opts{'r'}) or die "Cannot open $opts{'r'}\n";
my $s = <$ref>;
$target = $1 if ($s =~ /