def test_bad_language(self):
  """Fail if a finished translation contains a word from the LDNOOBW lists.

  For every entry in ``self.translation_files`` the language code is parsed
  from the file name, the matching word list is downloaded from the LDNOOBW
  GitHub repository, and every finished ``<translation>`` in the ``.ts``
  file is checked word by word against it.

  Needs network access: the word lists are fetched over HTTPS.

  Raises:
    AssertionError: a language code cannot be parsed from a file name, or
      a banned word is found in a translation.
    requests.HTTPError: the word list for a language cannot be downloaded.
  """
  # Legitimate words that also appear on one of the upstream lists.
  IGNORED_WORDS = {'pédale'}
  BANNED_WORDS_URL = "https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{}"
  # Seconds; without a timeout a stalled connection blocks the test forever.
  REQUEST_TIMEOUT = 10

  # Delete punctuation only. The "%n" placeholder is removed separately
  # below: putting "%n" in the delete set would strip every letter "n".
  strip_punctuation = str.maketrans('', '', string.punctuation)

  # Several files can map to the same language code; download each list once.
  banned_by_language = {}

  for name, file in self.translation_files.items():
    match = re.search(r'_([a-zA-Z]{2,3})', file)
    assert match, f"{name} - could not parse language"
    language = match.group(1)

    if language not in banned_by_language:
      response = requests.get(BANNED_WORDS_URL.format(language), timeout=REQUEST_TIMEOUT)
      response.raise_for_status()
      # Lower-cased because the translation words are lower-cased before
      # the comparison as well.
      listed_words = {line.strip().lower() for line in response.text.splitlines()}
      banned_by_language[language] = listed_words - IGNORED_WORDS
    banned_words = banned_by_language[language]

    for context in ET.parse(os.path.join(TRANSLATIONS_DIR, f"{file}.ts")).getroot():
      for message in context.iterfind("message"):
        translation = message.find("translation")
        if translation.get("type") == "unfinished":
          continue

        if message.get("numerus") == "yes":
          # An empty <numerusform/> has text None, which str.join rejects.
          translation_text = " ".join(t.text or "" for t in translation.findall("numerusform"))
        else:
          translation_text = translation.text

        if not translation_text:
          continue

        normalized = translation_text.replace('%n', ' ').translate(strip_punctuation).lower()
        bad_words_found = set(normalized.split()) & banned_words
        assert not bad_words_found, f"Bad language found in {name}: '{translation_text}'. Bad word(s): {', '.join(sorted(bad_words_found))}"