From b2112b81af8e47c21de063dbfe66505fa43c8b72 Mon Sep 17 00:00:00 2001
From: royjr <royjr96@gmail.com>
Date: Sun, 31 Dec 2023 13:58:15 -0500
Subject: [PATCH] multilang: bad language translation check (#30783)

* compare bad against list

* use web

* Update test_translations.py

* uncomment

* override

* wrap

* AssertionError

* better

* dedent

* selfish

* check numerusforms

* already checked

* use name

* not again

* combined

* sets

* assume available

* fix assume

* check regardless of other tests

* assert not print

* raise for status

* better

* done

* useless

* happy ruff

* better set

* quiet

* clean

* obvious

* clearer

Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com>

* IGNORED_WORDS

* assert match

* direct assert

* show bad word

* fix numerus empty string checks

* fix IGNORED_WORDS

---------

Co-authored-by: Adeeb Shihadeh <adeebshihadeh@gmail.com>
old-commit-hash: fd88990006d8d5d07f4cc79af64ae51999d1159c
---
 selfdrive/ui/tests/test_translations.py | 29 +++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/selfdrive/ui/tests/test_translations.py b/selfdrive/ui/tests/test_translations.py
index 622d198f7d..c3c1760dbd 100755
--- a/selfdrive/ui/tests/test_translations.py
+++ b/selfdrive/ui/tests/test_translations.py
@@ -5,6 +5,8 @@ import re
 import shutil
 import unittest
 import xml.etree.ElementTree as ET
+import string
+import requests
 
 from openpilot.selfdrive.ui.update_translations import TRANSLATIONS_DIR, LANGUAGES_FILE, update_translations
 
@@ -121,6 +123,33 @@ class TestTranslations(unittest.TestCase):
         matches = re.findall(r'@(\w+);', cur_translations)
         self.assertEqual(len(matches), 0, f"The string(s) {matches} were found with '@' instead of '&'")
 
+  def test_bad_language(self):
+    """Fail if a finished translation contains a word from the LDNOOBW list fetched for its language code (needs network access)."""
+    IGNORED_WORDS = {'pédale'}  # subtracted from every fetched list before matching
+    # Delete punctuation only: '%n' has to be removed as a substring, listing it in maketrans() would also delete every letter 'n'.
+    strip_punctuation = str.maketrans('', '', string.punctuation)
+    for name, file in self.translation_files.items():
+      match = re.search(r'_([a-zA-Z]{2,3})', file)
+      assert match, f"{name} - could not parse language"
+      # explicit timeout so an unresponsive host fails the test instead of hanging it
+      url = f"https://raw.githubusercontent.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/{match.group(1)}"
+      response = requests.get(url, timeout=10)
+      response.raise_for_status()
+      banned_words = {line.strip() for line in response.text.splitlines()} - IGNORED_WORDS
+      for context in ET.parse(os.path.join(TRANSLATIONS_DIR, f"{file}.ts")).getroot():
+        for message in context.iterfind("message"):
+          translation = message.find("translation")
+          if translation.get("type") == "unfinished":
+            continue
+          # numerus messages hold one <numerusform> per plural form; an element without text has .text None, so fall back to ""
+          forms = translation.findall("numerusform") if message.get("numerus") == "yes" else [translation]
+          translation_text = " ".join(t.text or "" for t in forms)
+          if not translation_text:
+            continue
+          words = set(translation_text.replace('%n', '').translate(strip_punctuation).lower().split())
+          bad_words_found = words & banned_words
+          assert not bad_words_found, f"Bad language found in {name}: '{translation_text}'. Bad word(s): {', '.join(bad_words_found)}"
+
 
 if __name__ == "__main__":
   unittest.main()