multilang: bad language translation check (#30783)

* compare bad against list * use web * Update test_translations.py * uncomment * override * wrap * AssertionError * better * detent * selfish * check numerusforms * already checked * use name * not again * combined * sets * assume available * fix assume * check regardless of other tests * assert not print * raise for status * better * done * useless * happy ruff * better set * quiet * clean * obvious * clearer Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com> * IGNORED_WORDS * assert match * direct assert * show bad word * fix numerous empty string checks * fix IGNORED_WORDS --------- Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com> old-commit-hash: fd88990006
2 years ago · b2112b81af
parent 0312d3d0f3
commit b2112b81af
1 changed files with 29 additions and 0 deletions
--- a/selfdrive/ui/tests/test_translations.py
+++ b/selfdrive/ui/tests/test_translations.py
@@ -5,6 +5,8 @@ import re
 import shutil
 import unittest
 import xml.etree.ElementTree as ET
+import string
+import requests

 from openpilot.selfdrive.ui.update_translations import TRANSLATIONS_DIR, LANGUAGES_FILE, update_translations

@@ -121,6 +123,33 @@ class TestTranslations(unittest.TestCase):
        matches = re.findall(r'@(\w+);', cur_translations)
        self.assertEqual(len(matches), 0, f"The string(s) {matches} were found with '@' instead of '&'")

+  def test_bad_language(self):
+    """Fail if a finished translation contains a word from the LDNOOBW list fetched for its language code."""
+    IGNORED_WORDS = {'pédale'}
+    for name, file in self.translation_files.items():
+      match = re.search(r'_([a-zA-Z]{2,3})', file)
+      assert match, f"{name} - could not parse language"
+      # explicit timeout: without one, requests waits forever on an unresponsive host and the test hangs
+      response = requests.get(f"https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{match.group(1)}", timeout=10)
+      response.raise_for_status()
+      banned_words = {line.strip() for line in response.text.splitlines()}
+
+      for context in ET.parse(os.path.join(TRANSLATIONS_DIR, f"{file}.ts")).getroot():
+        for message in context.iterfind("message"):
+          translation = message.find("translation")
+          if translation.get("type") == "unfinished":
+            continue
+
+          # an empty <numerusform> element has text None, which str.join rejects, so fall back to ""
+          translation_text = " ".join([t.text or "" for t in translation.findall("numerusform")]) if message.get("numerus") == "yes" else translation.text
+          if not translation_text:
+            continue
+
+          # remove the "%n" placeholder as a unit; listing '%n' in maketrans would delete every letter 'n'
+          words = set(translation_text.replace('%n', '').translate(str.maketrans('', '', string.punctuation)).lower().split())
+          bad_words_found = words & (banned_words - IGNORED_WORDS)
+          assert not bad_words_found, f"Bad language found in {name}: '{translation_text}'. Bad word(s): {', '.join(bad_words_found)}"
+

 if __name__ == "__main__":
  unittest.main()