#!/usr/bin/perl -w # # Translate an XHTML file or generate a POT or PO file for it. # # Usage: translate [ -a ] [ -c comment ] [ -r XHTML-file ] [ -- ] [ XHTML-file ] # or: translate [ -t po-file ] [ -w ] [ -- ] [ XHTML-file ] # # The input must be a well-formed XHTML file. If no file is given, the # standard input is read. # # Without options, the program writes a POT file to standard output # with all parts of the input that are likely to need translation: # non-empty inline elements, the text of certain attributes, and the # content of block elements that contain text. # # -a # Output all parts of the input, including the parts that usually # do not need to be translated, such as comments, empty elements # and block elements. (In principle, the whole input can be # reconstructed from the generated output, except for any local # DTD subset.) # # -t PO-FILE # Instead of generating a PO or POT file, use the given PO file to # translate the input. The output is an XHTML file. # # -w # Add warnings in the output when no translations are found in the # PO file (see option -t) for parts that should normally be # translated. The warning consist of enclosing the untranslated # part in "" # # -c comment # Add the comment as a developer comment ("#.") at each string. # # -r XHTML-file # Use this XHTML file as a reference. It is assumed to be an # already translated version of the input file and it must have # the same structure. The output will be a PO files with # translations taken from this file. # # Exit code is non-zero if the command line could not be parsed, a # file could not be opened, or a warning (see -w) was inserted. # # The translated output isn't validated and may contain mark-up errors # if the translations in the PO file contain invalid mark-up. # # For best results, the XHTML file should be normalized in some # way. The translate program itself does not do so. E.g., use # # hxnormalize -l 10000 -i 0 -x # # To do: check character encoding of PO file # # To do: check some heuristics to see if the file from option -r # indeed has the same structure. # # Created: 27 April 2012 # Author: Bert Bos # # Copyright © 2012 World Wide Web Consortium # See http://www.w3.org/Consortium/Legal/copyright-software use strict; use utf8; use Getopt::Std; sub USAGE { "Usage: translate.pl [ -a ] [ -c comment ] [ -r translated-file ] [ XHTML-file ]\ or: translate.pl -t PO-file [ -w ] [ XHTML-file ]\n"} my $INTAG = qr/(?:[^\/"'>]|"[^"]*"|'[^']*')+/so; my $INLINE_ELT = qr/\b(?:EM|STRONG|DFN|CODE|SAMP|KBD|VAR|CITE|ABBR|ACRONYM|A|Q| SUB|SUP|SPAN|BDO|TT|I|B|BIG|SMALL|TEXTAREA|LABEL|OBJECT| IMG|BR|DEL|INS)\b/xio; my $BLOCK_ELT = qr/\b(?:ADDRESS|BLOCQUOTE|CAPTION|DIV|DD|DL|DT|FIELDSET|FORM| H1|H2|H3|H4|H6|LI|P|OL|TABLE|TBODY|TD|TH|THEAD|TITLE|TR| UL|PRE)\b/xio; my $COMMENT = qr//so; my $PI = qr/<\?.*?>/so; my $CONTENT = qr/(?:[^<])+/so; my $INLINE = qr/(?:${CONTENT}|<\/?${INLINE_ELT}${INTAG}?\/?>)+/so; my $TEXTATTR = qr/\b(?:value|summary|alt|title)\b/io; # The content of a string in a PO file my $INSTR = qr/(?:[^\\"]|\\.|"\s*")*/o; # The parts of a PO file to skip my $SKIP = qr/(?:\s|#.*|\b(?:msgctxt|msgstr_plural|msgstr\[[0-9]*\])\s*${INSTR})*/mo; # escape -- escape characters for putting them in a PO file sub escape($) { my ($s) = @_; $s =~ s/&([0-9]+);/<$1>/g; $s =~ s/\\/\\\\/g; $s =~ s/\r/\\r/g; $s =~ s/\t/\\t/g; $s =~ s/\f/\\f/g; $s =~ s/"/\\"/g; $s =~ s/\n/\\n"\n "/go; # Escape and also split the string $s =~ s/(.{55,67}[ \/=])/$1"\n "/mgo; # Try to split long lines $s =~ s/"\s*"$//so; # Remove last empty string return $s; } # unescape -- unescape characters and concatenate strings sub unescape($) { my ($s) = @_; $s =~ s/"\s*"//g; $s =~ s/\\n/\n/g; $s =~ s/\\"/"/g; $s =~ s/\\f/\f/g; $s =~ s/\\t/\t/g; $s =~ s/\\r/\r/g; $s =~ s/\\\\/\\/g; $s =~ s/<([0-9]+)>/&$1;/g; return $s; } # read_messages -- read a PO file and return the string pairs as a hash sub read_messages($) { my ($po_file) = @_; my (%h, $s, $handle); local $/; # Undefine $/, enable slurp mode open $handle, $po_file or die $po_file . ": " . $! . "\n"; $s = <$handle>; close $handle ; while ($s =~ /\G.*?\bmsgid\s*"(${INSTR})"${SKIP}\bmsgstr\s*"(${INSTR})"/gso) { $h{unescape($1)} = unescape($2) if ($2 ne ""); } if (defined $h{""} && $h{""} =~ /\^Content-Type\s*:.*charset\s*=\s*([^ ,;]+)/im && $1 !~ /^utf-8$/i) { warn "PO file is not in UTF-8. Results may be inocrrect.\n" } return %h; } # marker -- generate numbered placeholder and remember the corresponding string sub marker($$$) { my ($s, $textref, $indexref) = @_; return "" if ($s eq ""); # Don't replace an empty string my $i = $$indexref{$s}; if (defined $i) { # We've seen this string before return '&' . $i . ';'; } else { # It's a new string my $n = @$textref; $$textref[$n] = $s; $$indexref{$s} = $n; return '&' . $n . ';'; } } # make_chunks -- return an array with all translatable chunks sub make_chunks($) { my ($s) = @_; my $n = 0; # The # of translatable strings my @text; # Array of translatable strings my %index; # Hash to translate text to its index in @text # Get rid of comments and processing instructions first. # $s =~ s/${COMMENT}|${PI}/marker($&,\@text,\%index)/gseo; # Next, replace translatable attributes on block elements, because # block elements themselvs are not put in the PO file (unless with # option -a): TITLE, VALUE, ALT, SUMMARY # $s =~ s/(<${BLOCK_ELT}${INTAG}${TEXTATTR}\s*=\s*)(?:"([^"]+)"|'([^']+)')/ $1 . '"' . marker(defined $2?$2:$3,\@text,\%index) . '"'/sego; # Some special cases as well: # and and # $s =~ s/()(${INLINE}?)(<\/${BLOCK_ELT}\s*>)/ marker($1.marker($2,\@text,\%index).$3,\@text,\%index)/xgse || $s =~ s/<\w${INTAG}>${CONTENT}?<\/[^>]*>/marker($&,\@text,\%index)/se) {} push @text, $s; return @text; } # translatable -- check if a string is likely to need translation sub translatable($) { my ($s) = @_; # True if the string starts with content (and the content is not # just placeholders), or if the string contains an inline element # with non-empty content, or if the string contains a human-readable # attribute (VALUE, SUMMARY, ALT or TITLE). # return $s !~ /^(?:\s|${COMMENT}|${PI}|&[0-9]+;)*$/ && ($s !~ /^${CONTENT})/); } # Main body my @text; # List of chunks my $orig = ""; # Original language my @reference; # Translated strings my $target = ""; # Target language my %dict; # Translation dictionary read from PO file my %opts; # Command line options my $exitcode = 0; # 1 if warnings/errors are detected getopts('wat:c:r:', \%opts) or die USAGE; %dict = read_messages($opts{'t'}) if (defined $opts{'t'}); local $/; # Undefine $/, enable slurp mode my $_ = <>; # Read the file into $_ # See if we know the original language, so that we can put it back in # if the option -w calls for warnings. # if (defined $opts{'w'} && /, but "&N;" is more cconvenient at this # stage. # @text = make_chunks($_); # Generate the translated input (option -t) or a PO/POT file. # if (defined $opts{'t'}) { # Translate all strings. # for (my $i = 0; $i < @text; $i++) { if (defined $dict{$text[$i]}) { $text[$i] = $dict{$text[$i]}; } elsif (defined $opts{'w'} && translatable($text[$i])) { $text[$i] = "$text[$i]"; $exitcode = 1; } } # Replace all placeholders with the translated strings. The last # item in @text is the "root chunk." # $_ = pop @text; while (s/&([0-9]+);/$text[$1]/go) {} # If the PO file contained a Language header, put a corresponding # LANG attribute on the HTML element. # if (defined $dict{""} && $dict{""} =~ /^Language *: *(\S+)/mi) { my $lang = unescape($1); $lang =~ s/"/"/g; s/(.\n" if ($exitcode); } else { # If we have an already translated reference file, analyze that, too. # if (defined $opts{'r'}) { open(my $ref, $opts{'r'}) or die "Cannot open $opts{'r'}\n"; my $s = <$ref>; $target = $1 if ($s =~ /