From da59f283350343a623820fa9ea48dd1ebb817064 Mon Sep 17 00:00:00 2001
From: "Wladimir J. van der Laan"
Date: Thu, 28 Aug 2014 13:09:19 +0200
Subject: [PATCH] Add deeper XML checking to update-translation script

- Catch problems such as mismatched formatting characters. Remove
  messages that can give problems at runtime.
- Also remove unfinished/untranslated messages, they just take up space
  in the ts and waste parsing time.

Fixes #4774.
---
 contrib/devtools/update-translations.py | 148 +++++++++++++++++++++---
 1 file changed, 134 insertions(+), 14 deletions(-)

diff --git a/contrib/devtools/update-translations.py b/contrib/devtools/update-translations.py
index 1950a426..0be63206 100755
--- a/contrib/devtools/update-translations.py
+++ b/contrib/devtools/update-translations.py
@@ -14,13 +14,14 @@ It will do the following automatically:
 
 TODO:
 - auto-add new translations to the build system according to the translation process
-- remove 'unfinished' translation items
 '''
 from __future__ import division, print_function
 import subprocess
 import re
 import sys
 import os
+import io
+import xml.etree.ElementTree as ET
 
 # Name of transifex tool
 TX = 'tx'
@@ -40,24 +41,143 @@ def fetch_all_translations():
         print('Error while fetching translations', file=sys.stderr)
         exit(1)
 
-def postprocess_translations():
-    print('Postprocessing...')
+def find_format_specifiers(s):
+    '''Find all format specifiers in a string.'''
+    pos = 0
+    specifiers = []
+    while True:
+        percent = s.find('%', pos)
+        if percent < 0:
+            break
+        specifiers.append(s[percent+1])
+        pos = percent+2
+    return specifiers
+
+def split_format_specifiers(specifiers):
+    '''Split format specifiers between numeric (Qt) and others (strprintf)'''
+    numeric = []
+    other = []
+    for s in specifiers:
+        if s in {'1','2','3','4','5','6','7','8','9'}:
+            numeric.append(s)
+        else:
+            other.append(s)
+
+    # numeric (Qt) can be present in any order, others (strprintf) must be in specified order
+    return set(numeric),other
+
+def sanitize_string(s):
+    '''Sanitize string for printing'''
+    return s.replace('\n',' ')
+
+def check_format_specifiers(source, translation, errors):
+    source_f = split_format_specifiers(find_format_specifiers(source))
+    # assert that no source messages contain both Qt and strprintf format specifiers
+    # if this fails, go change the source as this is hacky and confusing!
+    assert(not(source_f[0] and source_f[1]))
+    try:
+        translation_f = split_format_specifiers(find_format_specifiers(translation))
+    except IndexError:
+        errors.append("Parse error in translation '%s'" % sanitize_string(translation))
+        return False
+    else:
+        if source_f != translation_f:
+            errors.append("Mismatch between '%s' and '%s'" % (sanitize_string(source), sanitize_string(translation)))
+            return False
+    return True
+
+def all_ts_files(suffix=''):
     for filename in os.listdir(LOCALE_DIR):
         # process only language files, and do not process source language
-        if not filename.endswith('.ts') or filename == SOURCE_LANG:
+        if not filename.endswith('.ts'+suffix) or filename == SOURCE_LANG+suffix:
             continue
+        if suffix: # remove provided suffix
+            filename = filename[0:-len(suffix)]
         filepath = os.path.join(LOCALE_DIR, filename)
-        with open(filepath, 'rb') as f:
+        yield(filename, filepath)
+
+FIX_RE = re.compile(b'[\x00-\x09\x0b\x0c\x0e-\x1f]')
+def remove_invalid_characters(s):
+    '''Remove invalid characters from translation string'''
+    return FIX_RE.sub(b'', s)
+
+# Override cdata escape function to make our output match Qt's (optional, just for cleaner diffs for
+# comparison, disable by default)
+_orig_escape_cdata = None
+def escape_cdata(text):
+    text = _orig_escape_cdata(text)
+    text = text.replace("'", '&apos;')
+    text = text.replace('"', '&quot;')
+    return text
+
+def postprocess_translations(reduce_diff_hacks=False):
+    print('Checking and postprocessing...')
+
+    if reduce_diff_hacks:
+        global _orig_escape_cdata
+        _orig_escape_cdata = ET._escape_cdata
+        ET._escape_cdata = escape_cdata
+
+    for (filename,filepath) in all_ts_files():
+        os.rename(filepath, filepath+'.orig')
+
+    have_errors = False
+    for (filename,filepath) in all_ts_files('.orig'):
+        # pre-fixups to cope with transifex output
+        parser = ET.XMLParser(encoding='utf-8') # need to override encoding because 'utf8' is not understood only 'utf-8'
+        with open(filepath + '.orig', 'rb') as f:
             data = f.read()
-        # remove non-allowed control characters
-        data = re.sub('[\x00-\x09\x0b\x0c\x0e-\x1f]', '', data)
-        data = data.split('\n')
-        # strip locations from non-origin translation
-        # location tags are used to guide translators, they are not necessary for compilation
-        # TODO: actually process XML instead of relying on Transifex's one-tag-per-line output format
-        data = [line for line in data if not '<location' in line]
-        with open(filepath, 'wb') as f:
-            f.write('\n'.join(data))
+        # remove control characters; this must be done over the entire file otherwise the XML parser will fail
+        data = remove_invalid_characters(data)
+        tree = ET.parse(io.BytesIO(data), parser=parser)
+
+        # iterate over all messages in file
+        root = tree.getroot()
+        for context in root.findall('context'):
+            for message in context.findall('message'):
+                numerus = message.get('numerus') == 'yes'
+                source = message.find('source').text
+                translation_node = message.find('translation')
+                # pick all numerusforms
+                if numerus:
+                    translations = [i.text for i in translation_node.findall('numerusform')]
+                else:
+                    translations = [translation_node.text]
+
+                for translation in translations:
+                    if translation is None:
+                        continue
+                    errors = []
+                    valid = check_format_specifiers(source, translation, errors)
+
+                    for error in errors:
+                        print('%s: %s' % (filename, error))
+
+                    if not valid: # set type to unfinished and clear string if invalid
+                        translation_node.clear()
+                        translation_node.set('type', 'unfinished')
+                        have_errors = True
+
+                # Remove location tags
+                for location in message.findall('location'):
+                    message.remove(location)
+
+                # Remove entire message if it is an unfinished translation
+                if translation_node.get('type') == 'unfinished':
+                    context.remove(message)
+
+        # write fixed-up tree
+        # if diff reduction requested, replace some XML to 'sanitize' to qt formatting
+        if reduce_diff_hacks:
+            out = io.BytesIO()
+            tree.write(out, encoding='utf-8')
+            out = out.getvalue()
+            out = out.replace(b' />', b'/>')
+            with open(filepath, 'wb') as f:
+                f.write(out)
+        else:
+            tree.write(filepath, encoding='utf-8')
+    return have_errors
 
 if __name__ == '__main__':
     check_at_repository_root()