#!/usr/bin/perl -w

=pod

=head1 NAME

poxhtml - translate an XHTML file or generate a PO file for it

=head1 SYNOPSIS

poxhtml [ -a ] [ -c comment ] [ -r XHTML-file ] [ -- ] [ XHTML-file ]

poxhtml [ -t po-file ] [ -w ] [ -- ] [ XHTML-file ]

=head1 DESCRIPTION

Without the B<-t> option, B<poxhtml> takes an XHTML file and generates
a PO file with all translatable text. With the B<-t> option, it takes
an XHTML file and uses the PO file of the B<-t> option to generate a
translation of the XHTML file.

Thus a typical sequence of actions is:

  poxhtml my-english-file.xhtml >french.po
  ... translate french.po to French...
  poxhtml -t french.po my-english-file.xhtml >my-french-file.xhtml

If no XHTML file is given, B<poxhtml> reads from the standard
input. Output is always to standard output.

The translated output isn't validated and may contain mark-up errors
if the translations in the PO file contain invalid mark-up.

For best results, the XHTML file should be normalized in some way. The
translate program itself does not do so. E.g., use

=begin text

  hxnormalize -l 10000 -i 0 -x

=end text

=head1 OPTIONS

=over

=item B<-a>

Normally, B<poxhtml> only outputs the text that is likely to need
translation. It doesn't output chunks of text that only consist of
mark-up. With this option, it outputs everything.

In principle, it is possible to reconstruct the XHTML file from the PO
file (except for the DTD subset, if any). The process would be as
follows: start with the very last msgid in the PO file and look for
placeholders of the form E<lt>I<N>E<gt> (where I<N> is a number). Then
find the msgid with I<N> preceded by S<"# I<N>"> and replace the
placeholder by that msgid. And so on, recursively.

=item B<-c> I<comment>

Adds I<comment> as a comment before every translatable string, i.e.,
as a line of the form S<"# I<comment>">.

=item B<-r> I<XHTML-file>

The I<XHTML-file> is assumed to be an already translated version of
the XHTML file for which a PO file is generated. B<poxhtml> will try
to use the translated file to pre-fill translations in the PO file.
If the two XHTML files cannot be aligned, i.e., if they do not have
the same number of translatable chunks, B<poxhtml> will give an error
and will not output any PO file.

=item B<-t> I<po-file>

I<po-file> must be a valid PO file. The presence of the B<-t> option
indicates that B<poxhtml> should not output a PO file for the given
XHTML file, but should try to translate the file, using the strings in
I<po-file>. If there are any chunks of translatable text in the XHTML
file for which no translation exists in I<po-file>, they will be left
untranslated in the output.

=item B<-w>

With option B<-w>, any translatable text for which no translation was
found in the PO file (see option B<-t>) will be enclosed in
C<E<lt>ins class=untranslatedE<gt>...E<lt>/insE<gt>>.

=back

=head1 EXIT STATUS

Exit code is non-zero if the command line could not be parsed, a file
could not be opened, or a warning (see B<-w>) was inserted.

=head1 BUGS

Currently only handles XHTML (1.0 or 1.1). When converting HTML
documents in the XML encoding of HTML5 to PO files, some chunks may be
missing (unless B<-a> is used).

=cut

# TODO: Check character encoding of PO file
#
# TODO: Do something smart with msgctxt to allow different
# translations for the same string in different contexts.
#
# TODO: Add the new elements from HTML5.
#
# Created: 27 April 2012
# Author: Bert Bos
#
# Copyright © 2012 World Wide Web Consortium
# See http://www.w3.org/Consortium/Legal/copyright-software

use strict;
use utf8;
use Getopt::Std;


# USAGE -- return the usage message (two lines, one per invocation form)
sub USAGE {
  my $p = $0 =~ s/.*\///r;              # Program name without its directory
  "Usage: $p [ -a ] [ -c comment ] [ -r translated-file ] [ XHTML-file ]\n" .
  "   or: $p -t PO-file [ -w ] [ XHTML-file ]\n"
}


# The inside of a tag up to (not including) the closing "/" or ">".
# Quoted attribute values are matched as a whole, so they may contain
# "/" and ">".
my $INTAG = qr/(?:[^\/"'>]|"[^"]*"|'[^']*')+/so;

# Names of elements that are treated as part of the running text.
my $INLINE_ELT = qr/\b(?:EM|STRONG|DFN|CODE|SAMP|KBD|VAR|CITE|ABBR|ACRONYM|A|Q|
  SUB|SUP|SPAN|BDO|TT|I|B|BIG|SMALL|TEXTAREA|LABEL|OBJECT|
  IMG|BR|DEL|INS)\b/xio;

# Names of elements whose content forms a chunk of its own.
my $BLOCK_ELT = qr/\b(?:ADDRESS|BLOCKQUOTE|CAPTION|DIV|DD|DL|DT|FIELDSET|FORM|
  H1|H2|H3|H4|H5|H6|LI|P|OL|TABLE|TBODY|TD|TH|THEAD|TITLE|TR|
  UL|PRE)\b/xio;

# An XML comment and a processing instruction.
my $COMMENT = qr/<!--.*?-->/so;
my $PI = qr/<\?.*?>/so;

# Character data, and a run of character data mixed with inline tags.
my $CONTENT = qr/(?:[^<])+/so;
my $INLINE = qr/(?:${CONTENT}|<\/?${INLINE_ELT}${INTAG}?\/?>)+/so;

# Attributes whose value is human-readable text.
my $TEXTATTR = qr/\b(?:value|summary|alt|title)\b/io;

#my $NOTRANS = qr/\bclass\b\s*=\s*(?:notranslate\b|(?:"[^"]*|'[^']*)\bnotranslate\b)/io;
#my $NOTRANSNOTE = "Do not translate or show both translation and original.";

# The content of a string (or sequence of strings) in a PO file
my $INSTR = qr/(?:[^\\"]|\\.|"\s*")*/o;

# The parts of a PO file to skip
# NOTE(review): "msgstr_plural" is not a PO keyword (gettext uses
# "msgid_plural"), and ${INSTR} is not wrapped in quotes here -- confirm
# that this skips what it is meant to skip.
my $SKIP = qr/(?:\s|#.*|\b(?:msgctxt|msgstr_plural|msgstr\[[0-9]*\])\s*${INSTR})*/mo;


# escape -- escape characters for putting them in a PO file
#
# Takes the text of one chunk and returns it in the form used between
# the outer quotes of a msgid/msgstr: placeholders "&N;" are written as
# "<N>", special characters are backslash-escaped, and the text is broken
# into several adjacent strings after each newline and in long lines.
sub escape($) {
  my ($s) = @_;

  $s =~ s/&([0-9]+);/<$1>/g;            # Placeholder &N; becomes <N>
  $s =~ s/\\/\\\\/g;                    # Backslashes first, before adding any
  $s =~ s/\r/\\r/g;
  $s =~ s/\t/\\t/g;
  $s =~ s/\f/\\f/g;
  $s =~ s/"/\\"/g;
  $s =~ s/\n/\\n"\n "/go;               # Escape and also split the string
  $s =~ s/(.{55,67}[ \/=])/$1"\n "/mgo; # Try to split long lines
  $s =~ s/"\s*"$//so;                   # Remove last empty string
  return $s;
}


# unescape -- unescape characters and concatenate strings
#
# The inverse of escape(): takes the text between the first and last
# quote of a msgid/msgstr (which may consist of several adjacent strings)
# and returns the plain text, with "<N>" turned back into "&N;".
#
# The string is decoded in a single left-to-right pass, so that an
# escaped backslash followed by a letter (e.g., the four characters
# "\\n" + ...) is not mistaken for an escape sequence, and so that
# adjacent strings are joined even when the first one ends in an
# escaped backslash. Escape sequences other than the ones escape()
# produces are left untouched.
sub unescape($) {
  my ($s) = @_;
  my %char_for = ('n' => "\n", 't' => "\t", 'r' => "\r", 'f' => "\f",
                  '"' => '"', '\\' => '\\');

  # Each match is either an escape sequence (then $1 is defined) or the
  # boundary between two adjacent strings (closing quote, optional white
  # space, opening quote), which is simply dropped.
  $s =~ s{\\(.)|"\s*"}{ defined $1 ? ($char_for{$1} // "\\$1") : "" }gse;

  $s =~ s/<([0-9]+)>/&$1;/g;            # <N> becomes placeholder &N; again
  return $s;
}


# read_messages -- read a PO file and return the string pairs as a hash
#
# Returns a hash that maps each (unescaped) msgid to its (unescaped)
# msgstr. Entries with an empty msgstr are omitted. Dies if the file
# cannot be opened. Warns if the header entry (the one with an empty
# msgid) declares a charset other than UTF-8.
sub read_messages($) {
  my ($po_file) = @_;
  my (%h, $s, $handle);

  local $/;                             # Undefine $/, enable slurp mode
  open $handle, '<', $po_file or die $po_file . ": " . $! . "\n";
  $s = <$handle>;
  close $handle;

  while ($s =~ /\G.*?\bmsgid\s*"(${INSTR})"${SKIP}\bmsgstr\s*"(${INSTR})"/gso) {
    $h{unescape($1)} = unescape($2) if ($2 ne "");
  }

  # The charset value ends at white space (the header lines are separated
  # by newlines), a comma or a semicolon.
  if (defined $h{""} &&
      $h{""} =~ /^Content-Type\s*:.*charset\s*=\s*([^\s,;]+)/im &&
      $1 !~ /^utf-8$/i) {
    warn "PO file is not in UTF-8. Results may be incorrect.\n"
  }
  return %h;
}


# marker -- generate numbered placeholder and remember the corresponding string
#
# $s is the string to replace, $textref a reference to the array of
# strings seen so far and $indexref a reference to the hash that maps
# each of those strings to its index in the array. Returns "&N;", where
# N is the index of $s in the array; $s is appended to the array if it
# wasn't in it yet. An empty string is returned unchanged.
sub marker($$$) {
  my ($s, $textref, $indexref) = @_;

  return "" if ($s eq "");              # Don't replace an empty string

  my $i = $indexref->{$s};
  if (!defined $i) {                    # It's a new string
    $i = @$textref;
    $textref->[$i] = $s;
    $indexref->{$s} = $i;
  }
  return '&' . $i . ';';
}


# make_chunks -- return an array with all translatable chunks
sub make_chunks($) {
  my ($s) = @_;
  my $n = 0;                            # The # of translatable strings
  my @text;                             # Array of translatable strings
  my %index;                            # Inverted @text, to find duplicate strings

  # Get rid of comments and processing instructions first.
  #
  $s =~ s/${COMMENT}|${PI}/marker($&,\@text,\%index)/gseo;

  # Next, replace translatable attributes on block elements, because
  # block elements themselves are not put in the PO file (unless with
  # option -a): TITLE, VALUE, ALT, SUMMARY
  #
  $s =~ s/(<${BLOCK_ELT}${INTAG}${TEXTATTR}\s*=\s*)(?:"([^"]+)"|'([^']+)')/
    $1 . '"' . marker(defined $2?$2:$3,\@text,\%index) .
'"'/sego; # Some special cases as well: # and and # $s =~ s/()(${INLINE}?)(<\/${BLOCK_ELT}\s*>)/ marker($1.marker($2,\@text,\%index).$3,\@text,\%index)/xgse || $s =~ s/<\w${INTAG}>${CONTENT}?<\/[^>]*>/marker($&,\@text,\%index)/se) {} push @text, $s; return @text; } # translatable -- check if a string is likely to need translation sub translatable($) { my ($s) = @_; # True if the string starts with content (and the content is not # just placeholders), or if the string contains an inline element # with non-empty content, or if the string contains a human-readable # attribute (VALUE, SUMMARY, ALT or TITLE). # return $s !~ /^(?:\s|${COMMENT}|${PI}|&[0-9]+;)*$/ && ($s !~ /^${CONTENT})/); } # Main body my @text; # List of chunks my $orig = ""; # Original language my @reference; # Translated strings my $target = ""; # Target language my %dict; # Translation dictionary read from PO file my %opts; # Command line options my $exitcode = 0; # 1 if warnings/errors are detected getopts('wat:c:r:', \%opts) or die USAGE; %dict = read_messages($opts{'t'}) if (defined $opts{'t'}); local $/; # Undefine $/, enable slurp mode # my $_ = <>; # Read the file into $_ $_ = <>; # Read the file into $_ # See if we know the original language, so that we can put it back in # if the option -w calls for warnings. # if (defined $opts{'w'} && /, but "&N;" is more cconvenient at this # stage. # @text = make_chunks($_); # Generate the translated input (option -t) or a PO/POT file. # if (defined $opts{'t'}) { # Translate all strings. # for (my $i = 0; $i < @text; $i++) { if (defined $dict{$text[$i]}) { $text[$i] = $dict{$text[$i]}; } elsif (defined $opts{'w'} && translatable($text[$i])) { $text[$i]= "$text[$i]"; $exitcode = 1; } } # Replace all placeholders with the translated strings. The last # item in @text is the "root chunk." # $_ = pop @text; while (s/&([0-9]+);/$text[$1]/go) {} # If the PO file contained a Language header, put a corresponding # LANG attribute on the HTML element. 
# if (defined $dict{""} && $dict{""} =~ /^Language *: *(\S+)/mi) { my $lang = unescape($1); $lang =~ s/"/"/g; s/(.\n" if ($exitcode); } else { # If we have an already translated reference file, analyze that, too. # if (defined $opts{'r'}) { open(my $ref, $opts{'r'}) or die "Cannot open $opts{'r'}\n"; my $s = <$ref>; $target = $1 if ($s =~ /