#!/usr/bin/perl -w
#
# Translate an XHTML file or generate a POT or PO file for it.
#
# Usage: translate [ -a ] [ -c comment ] [ -r XHTML-file ] [ -- ] [ XHTML-file ]
# or: translate [ -t po-file ] [ -w ] [ -- ] [ XHTML-file ]
#
# The input must be a well-formed XHTML file. If no file is given, the
# standard input is read.
#
# Without options, the program writes a POT file to standard output
# with all parts of the input that are likely to need translation:
# non-empty inline elements, the text of certain attributes, and the
# content of block elements that contain text.
#
# -a
# Output all parts of the input, including the parts that usually
# do not need to be translated, such as comments, empty elements
# and block elements. (In principle, the whole input can be
# reconstructed from the generated output, except for any local
# DTD subset.)
#
# -t PO-FILE
# Instead of generating a PO or POT file, use the given PO file to
# translate the input. The output is an XHTML file.
#
# -w
# Add warnings in the output when no translations are found in the
# PO file (see option -t) for parts that should normally be
# translated. The warning consist of enclosing the untranslated
# part in ""
#
# -c comment
# Add the comment as a developer comment ("#.") at each string.
#
# -r XHTML-file
# Use this XHTML file as a reference. It is assumed to be an
# already translated version of the input file and it must have
# the same structure. The output will be a PO files with
# translations taken from this file.
#
# Exit code is non-zero if the command line could not be parsed, a
# file could not be opened, or a warning (see -w) was inserted.
#
# The translated output isn't validated and may contain mark-up errors
# if the translations in the PO file contain invalid mark-up.
#
# For best results, the XHTML file should be normalized in some
# way. The translate program itself does not do so. E.g., use
#
# hxnormalize -l 10000 -i 0 -x
#
# To do: check character encoding of PO file
#
# To do: check some heuristics to see if the file from option -r
# indeed has the same structure.
#
# Created: 27 April 2012
# Author: Bert Bos
#
# Copyright © 2012 World Wide Web Consortium
# See http://www.w3.org/Consortium/Legal/copyright-software
use strict;
use utf8;
use Getopt::Std;
sub USAGE {
"Usage: translate.pl [ -a ] [ -c comment ] [ -r translated-file ] [ XHTML-file ]\
or: translate.pl -t PO-file [ -w ] [ XHTML-file ]\n"}
my $INTAG = qr/(?:[^\/"'>]|"[^"]*"|'[^']*')+/so;
my $INLINE_ELT = qr/\b(?:EM|STRONG|DFN|CODE|SAMP|KBD|VAR|CITE|ABBR|ACRONYM|A|Q|
SUB|SUP|SPAN|BDO|TT|I|B|BIG|SMALL|TEXTAREA|LABEL|OBJECT|
IMG|BR|DEL|INS)\b/xio;
my $BLOCK_ELT = qr/\b(?:ADDRESS|BLOCQUOTE|CAPTION|DIV|DD|DL|DT|FIELDSET|FORM|
H1|H2|H3|H4|H6|LI|P|OL|TABLE|TBODY|TD|TH|THEAD|TITLE|TR|
UL|PRE)\b/xio;
my $COMMENT = qr//so;
my $PI = qr/<\?.*?>/so;
my $CONTENT = qr/(?:[^<])+/so;
my $INLINE = qr/(?:${CONTENT}|<\/?${INLINE_ELT}${INTAG}?\/?>)+/so;
my $TEXTATTR = qr/\b(?:value|summary|alt|title)\b/io;
# The content of a string in a PO file
my $INSTR = qr/(?:[^\\"]|\\.|"\s*")*/o;
# The parts of a PO file to skip
my $SKIP = qr/(?:\s|#.*|\b(?:msgctxt|msgstr_plural|msgstr\[[0-9]*\])\s*${INSTR})*/mo;
# escape -- escape characters for putting them in a PO file
sub escape($) {
my ($s) = @_;
$s =~ s/&([0-9]+);/<$1>/g;
$s =~ s/\\/\\\\/g;
$s =~ s/\r/\\r/g;
$s =~ s/\t/\\t/g;
$s =~ s/\f/\\f/g;
$s =~ s/"/\\"/g;
$s =~ s/\n/\\n"\n "/go; # Escape and also split the string
$s =~ s/(.{55,67}[ \/=])/$1"\n "/mgo; # Try to split long lines
$s =~ s/"\s*"$//so; # Remove last empty string
return $s;
}
# unescape -- unescape characters and concatenate strings
sub unescape($) {
my ($s) = @_;
$s =~ s/"\s*"//g;
$s =~ s/\\n/\n/g;
$s =~ s/\\"/"/g;
$s =~ s/\\f/\f/g;
$s =~ s/\\t/\t/g;
$s =~ s/\\r/\r/g;
$s =~ s/\\\\/\\/g;
$s =~ s/<([0-9]+)>/&$1;/g;
return $s;
}
# read_messages -- read a PO file and return the string pairs as a hash
sub read_messages($) {
my ($po_file) = @_;
my (%h, $s, $handle);
local $/; # Undefine $/, enable slurp mode
open $handle, $po_file or die $po_file . ": " . $! . "\n";
$s = <$handle>;
close $handle ;
while ($s =~ /\G.*?\bmsgid\s*"(${INSTR})"${SKIP}\bmsgstr\s*"(${INSTR})"/gso) {
$h{unescape($1)} = unescape($2) if ($2 ne "");
}
if (defined $h{""} &&
$h{""} =~ /\^Content-Type\s*:.*charset\s*=\s*([^ ,;]+)/im &&
$1 !~ /^utf-8$/i) {
warn "PO file is not in UTF-8. Results may be inocrrect.\n"
}
return %h;
}
# marker -- generate numbered placeholder and remember the corresponding string
sub marker($$$) {
my ($s, $textref, $indexref) = @_;
return "" if ($s eq ""); # Don't replace an empty string
my $i = $$indexref{$s};
if (defined $i) { # We've seen this string before
return '&' . $i . ';';
} else { # It's a new string
my $n = @$textref;
$$textref[$n] = $s;
$$indexref{$s} = $n;
return '&' . $n . ';';
}
}
# make_chunks -- return an array with all translatable chunks
sub make_chunks($) {
my ($s) = @_;
my $n = 0; # The # of translatable strings
my @text; # Array of translatable strings
my %index; # Hash to translate text to its index in @text
# Get rid of comments and processing instructions first.
#
$s =~ s/${COMMENT}|${PI}/marker($&,\@text,\%index)/gseo;
# Next, replace translatable attributes on block elements, because
# block elements themselvs are not put in the PO file (unless with
# option -a): TITLE, VALUE, ALT, SUMMARY
#
$s =~ s/(<${BLOCK_ELT}${INTAG}${TEXTATTR}\s*=\s*)(?:"([^"]+)"|'([^']+)')/
$1 . '"' . marker(defined $2?$2:$3,\@text,\%index) . '"'/sego;
# Some special cases as well:
# and and
#
$s =~ s/()(${INLINE}?)(<\/${BLOCK_ELT}\s*>)/
marker($1.marker($2,\@text,\%index).$3,\@text,\%index)/xgse ||
$s =~ s/<\w${INTAG}>${CONTENT}?<\/[^>]*>/marker($&,\@text,\%index)/se) {}
push @text, $s;
return @text;
}
# translatable -- check if a string is likely to need translation
sub translatable($) {
my ($s) = @_;
# True if the string starts with content (and the content is not
# just placeholders), or if the string contains an inline element
# with non-empty content, or if the string contains a human-readable
# attribute (VALUE, SUMMARY, ALT or TITLE).
#
return $s !~ /^(?:\s|${COMMENT}|${PI}|&[0-9]+;)*$/ &&
($s !~ /^ || $s =~ /<${INLINE_ELT}${INTAG}?(?:${TEXTATTR}|>${CONTENT})/);
}
# Main body
my @text; # List of chunks
my $orig = ""; # Original language
my @reference; # Translated strings
my $target = ""; # Target language
my %dict; # Translation dictionary read from PO file
my %opts; # Command line options
my $exitcode = 0; # 1 if warnings/errors are detected
getopts('wat:c:r:', \%opts) or die USAGE;
%dict = read_messages($opts{'t'}) if (defined $opts{'t'});
local $/; # Undefine $/, enable slurp mode
my $_ = <>; # Read the file into $_
# See if we know the original language, so that we can put it back in
# if the option -w calls for warnings.
#
if (defined $opts{'w'} &&
/, but "&N;" is more cconvenient at this
# stage.
#
@text = make_chunks($_);
# Generate the translated input (option -t) or a PO/POT file.
#
if (defined $opts{'t'}) {
# Translate all strings.
#
for (my $i = 0; $i < @text; $i++) {
if (defined $dict{$text[$i]}) {
$text[$i] = $dict{$text[$i]};
} elsif (defined $opts{'w'} && translatable($text[$i])) {
$text[$i] = "$text[$i]";
$exitcode = 1;
}
}
# Replace all placeholders with the translated strings. The last
# item in @text is the "root chunk."
#
$_ = pop @text;
while (s/&([0-9]+);/$text[$1]/go) {}
# If the PO file contained a Language header, put a corresponding
# LANG attribute on the HTML element.
#
if (defined $dict{""} && $dict{""} =~ /^Language *: *(\S+)/mi) {
my $lang = unescape($1);
$lang =~ s/"/"/g;
s/(.\n" if ($exitcode);
} else {
# If we have an already translated reference file, analyze that, too.
#
if (defined $opts{'r'}) {
open(my $ref, $opts{'r'}) or die "Cannot open $opts{'r'}\n";
my $s = <$ref>;
$target = $1 if ($s =~ /