From 452a89d1e5b080d74ea2e9fc17361dbbe70b43c0 Mon Sep 17 00:00:00 2001 From: gers Date: Wed, 18 Sep 2024 15:42:43 +0300 Subject: [PATCH] feat(machinery): aws glossary support (#12520) * feature implementation * documentation updates * update download_languages implementation Fixes #10527 --- docs/admin/machine.rst | 2 + docs/changes.rst | 3 +- docs/user/glossary.rst | 1 + scripts/generate-aws-languages | 31 ------ weblate/machinery/aws.py | 160 +++++++++++++++---------------- weblate/machinery/tests.py | 166 ++++++++++++++++++++++++++++++++- 6 files changed, 245 insertions(+), 118 deletions(-) delete mode 100755 scripts/generate-aws-languages diff --git a/docs/admin/machine.rst b/docs/admin/machine.rst index cf1dc9c0586f..8019b451308e 100644 --- a/docs/admin/machine.rst +++ b/docs/admin/machine.rst @@ -88,6 +88,8 @@ Amazon Translate Amazon Translate is a neural machine translation service for translating text to and from English across a breadth of supported languages. +The service automatically uses :ref:`glossary`, see :ref:`glossary-mt`. + .. seealso:: `Amazon Translate Documentation `_ diff --git a/docs/changes.rst b/docs/changes.rst index 9058ef24219d..0895c257162b 100644 --- a/docs/changes.rst +++ b/docs/changes.rst @@ -10,7 +10,8 @@ Not yet released. **Improvements** -* :ref`mt-deepl` now supports specifying translation context. +* :ref:`mt-deepl` now supports specifying translation context. +* :ref:`mt-aws` now supports :ref:`glossary-mt`. **Bug fixes** diff --git a/docs/user/glossary.rst b/docs/user/glossary.rst index 38b1f9107093..bcc78f90d239 100644 --- a/docs/user/glossary.rst +++ b/docs/user/glossary.rst @@ -127,6 +127,7 @@ Following automatic suggestion services utilize glossaries during the translatio * :ref:`mt-openai` * :ref:`mt-deepl` * :ref:`mt-microsoft-translator` +* :ref:`mt-aws` The glossary is processed before exposed to the service: diff --git a/scripts/generate-aws-languages b/scripts/generate-aws-languages deleted file mode 100755 index 19590d018761..000000000000 --- a/scripts/generate-aws-languages +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python - -# Copyright © Michal Čihař -# -# SPDX-License-Identifier: GPL-3.0-or-later - -import re - -import requests - -URL = "https://docs.aws.amazon.com/translate/latest/dg/what-is.html" - -LANG_RE = re.compile(".* *([a-z][a-z](-[A-Z][A-Z])?) *.*") - -CODES = set() - -response = requests.get(URL, timeout=1) -parse = False -for line in response.text.splitlines(): - if parse: - match = LANG_RE.match(line) - if match: - CODES.add(match[1]) - elif "" in line: - break - elif "Language Code" in line: - parse = True - - -for code in sorted(CODES): - print(f" {code!r},") diff --git a/weblate/machinery/aws.py b/weblate/machinery/aws.py index 7a5277969a97..77fba46ccf1a 100644 --- a/weblate/machinery/aws.py +++ b/weblate/machinery/aws.py @@ -2,13 +2,16 @@ # # SPDX-License-Identifier: GPL-3.0-or-later + +import operator + from django.utils.functional import cached_property -from .base import DownloadTranslations, MachineTranslation +from .base import DownloadTranslations, GlossaryMachineTranslationMixin from .forms import AWSMachineryForm -class AWSTranslation(MachineTranslation): +class AWSTranslation(GlossaryMachineTranslationMixin): """AWS machine translation.""" name = "Amazon Translate" @@ -19,6 +22,13 @@ class AWSTranslation(MachineTranslation): } settings_form = AWSMachineryForm + # glossary name must match the pattern ^([A-Za-z0-9-]_?)+$ + glossary_name_format = ( + "weblate_-_{project}_-_{source_language}_-_{target_language}_-_{checksum}" + ) + + glossary_count_limit = 100 + @classmethod def get_identifier(cls) -> str: return "aws" @@ -39,85 +49,9 @@ def map_language_code(self, code): return super().map_language_code(code).replace("_", "-").split("@")[0] def download_languages(self): - """ - Hardcoded list of supported languages as there is no API to get this. - - Can be generated by HTML scraping using - ./scripts/generate-aws-languages - """ - return ( - "af", - "am", - "ar", - "az", - "bg", - "bn", - "bs", - "ca", - "cs", - "cy", - "da", - "de", - "el", - "en", - "es", - "es-MX", - "et", - "fa", - "fa-AF", - "fi", - "fr", - "fr-CA", - "gu", - "ha", - "he", - "hi", - "hr", - "ht", - "hu", - "hy", - "id", - "is", - "it", - "ja", - "ka", - "kk", - "kn", - "ko", - "lt", - "lv", - "mk", - "ml", - "mn", - "ms", - "mt", - "nl", - "no", - "pl", - "ps", - "pt", - "ro", - "ru", - "si", - "sk", - "sl", - "so", - "sq", - "sr", - "sv", - "sw", - "ta", - "te", # codespell:ignore te - "th", - "tl", - "tr", - "uk", - "ur", - "uz", - "vi", - "zh", - "zh-TW", - ) + """List of supported languages.""" + result = self.client.list_languages() + return [lang["LanguageCode"] for lang in result["Languages"]] def download_translations( self, @@ -128,12 +62,70 @@ def download_translations( user, threshold: int = 75, ) -> DownloadTranslations: - response = self.client.translate_text( - Text=text, SourceLanguageCode=source, TargetLanguageCode=language - ) + params = { + "Text": text, + "SourceLanguageCode": source, + "TargetLanguageCode": language, + } + + glossary_name: str | None = self.get_glossary_id(source, language, unit) + if glossary_name: + params["TerminologyNames"] = [glossary_name] + + response = self.client.translate_text(**params) yield { "text": response["TranslatedText"], "quality": self.max_score, "service": self.name, "source": text, } + + def create_glossary( + self, source_language: str, target_language: str, name: str, tsv: str + ) -> None: + """Create glossary in the service.""" + # add header with source and target languages + tsv = f"{source_language}\t{target_language}\n{tsv}" + self.client.import_terminology( + Name=name, + MergeStrategy="OVERWRITE", + TerminologyData={ + "File": tsv.encode(), + "Format": "TSV", + "Directionality": "UNI", + }, + ) + + def is_glossary_supported(self, source_language: str, target_language: str) -> bool: + """Check whether given language combination is supported for glossary.""" + return self.is_supported(source_language, target_language) + + def list_glossaries(self) -> dict[str, str]: + """List all glossaries from service.""" + result = ( + self.client.get_paginator("list_terminologies") + .paginate() + .build_full_result() + ) + return { + terminology["Name"]: terminology["Name"] + for terminology in result["TerminologyPropertiesList"] + } + + def delete_glossary(self, glossary_id: str) -> None: + """Delete a single glossary from service.""" + self.client.delete_terminology(Name=glossary_id) + + def delete_oldest_glossary(self) -> None: + """Delete oldest glossary if any.""" + result = ( + self.client.get_paginator("list_terminologies") + .paginate() + .build_full_result() + ) + glossaries = sorted( + result["TerminologyPropertiesList"], + key=operator.itemgetter("CreatedAt"), + ) + if glossaries: + self.delete_glossary(glossaries[0]["Name"]) diff --git a/weblate/machinery/tests.py b/weblate/machinery/tests.py index fb868bc0f165..a4e2e936f02c 100644 --- a/weblate/machinery/tests.py +++ b/weblate/machinery/tests.py @@ -225,6 +225,15 @@ "translation": {"cs": "data", "en": "data", "es": "data", "de": "data"} } +AWS_LANGUAGES_RESPONSE = { + "Languages": [ + {"LanguageName": "Afrikaans", "LanguageCode": "af"}, + {"LanguageName": "Czech", "LanguageCode": "cs"}, + {"LanguageName": "German", "LanguageCode": "de"}, + {"LanguageName": "English", "LanguageCode": "en"}, + ] +} + class BaseMachineTranslationTest(TestCase): """Testing of machine translation core.""" @@ -259,9 +268,10 @@ def test_english_map(self) -> None: @responses.activate @respx.mock - def test_support(self) -> None: + def test_support(self, machine_translation=None) -> None: self.mock_response() - machine_translation = self.get_machine() + if machine_translation is None: + machine_translation = self.get_machine() self.assertTrue(machine_translation.is_supported(self.ENGLISH, self.SUPPORTED)) if self.NOTSUPPORTED: self.assertFalse( @@ -1378,9 +1388,23 @@ def mock_error(self) -> NoReturn: def mock_response(self) -> None: pass + def test_support(self) -> None: + machine = self.get_machine() + machine.delete_cache() + with Stubber(machine.client) as stubber: + stubber.add_response( + "list_languages", + AWS_LANGUAGES_RESPONSE, + ) + super().test_support(machine) + def test_validate_settings(self) -> None: machine = self.get_machine() with Stubber(machine.client) as stubber: + stubber.add_response( + "list_languages", + AWS_LANGUAGES_RESPONSE, + ) stubber.add_response( "translate_text", { @@ -1395,6 +1419,10 @@ def test_validate_settings(self) -> None: def test_translate(self, **kwargs) -> None: machine = self.get_machine() with Stubber(machine.client) as stubber: + stubber.add_response( + "list_languages", + AWS_LANGUAGES_RESPONSE, + ) stubber.add_response( "translate_text", { @@ -1414,6 +1442,10 @@ def test_translate(self, **kwargs) -> None: def test_translate_language_map(self, **kwargs) -> None: machine = self.get_machine() with Stubber(machine.client) as stubber: + stubber.add_response( + "list_languages", + AWS_LANGUAGES_RESPONSE, + ) stubber.add_response( "translate_text", { @@ -1446,6 +1478,10 @@ def test_batch(self, machine=None) -> None: if machine is None: machine = self.get_machine() with Stubber(machine.client) as stubber: + stubber.add_response( + "list_languages", + AWS_LANGUAGES_RESPONSE, + ) stubber.add_response( "translate_text", { @@ -1461,6 +1497,132 @@ def test_clean(self) -> NoReturn: # Stubbing here is tricky raise SkipTest("Not tested") + @patch("weblate.glossary.models.get_glossary_tsv", new=lambda _: "foo\tbar") + def test_glossary(self) -> None: + """Test translation with glossary (terminology).""" + machine = self.get_machine() + + with ( + Stubber(machine.client) as stubber, + patch( + "weblate.machinery.aws.AWSTranslation.glossary_count_limit", + new=1, + ), + ): + stubber.add_response( + "list_languages", + AWS_LANGUAGES_RESPONSE, + ) + # glossary list with stale glossary response + stubber.add_response( + "list_terminologies", + { + "TerminologyPropertiesList": [ + { + "Name": "weblate_-_1_-_en_-_de_-_a85e314d2f7614eb", + "SourceLanguageCode": "en", + "TargetLanguageCodes": ["de"], + "CreatedAt": "2021-03-03T14:16:18.329Z", + "Directionality": "UNI", + "Format": "TSV", + } + ] + }, + ) + + # glossary list with stale glossary response + stubber.add_response( + "list_terminologies", + { + "TerminologyPropertiesList": [ + { + "Name": "weblate_-_1_-_en_-_de_-_a85e314d2f7614eb", + "SourceLanguageCode": "en", + "TargetLanguageCodes": ["de"], + "CreatedAt": "2021-03-03T14:16:18.329Z", + "Directionality": "UNI", + "Format": "TSV", + } + ] + }, + ) + + # delete stale glossary response + stubber.add_response( + "delete_terminology", + {}, + {"Name": "weblate_-_1_-_en_-_de_-_a85e314d2f7614eb"}, + ) + + # create glossary response + stubber.add_response( + "import_terminology", + { + "AuxiliaryDataLocation": { + "Location": "location", + "RepositoryType": "type", + }, + "TerminologyProperties": {}, + }, + { + "Name": "weblate_-_1_-_en_-_cs_-_9e250d830c11d70f", + "MergeStrategy": "OVERWRITE", + "TerminologyData": { + "File": b"en\tcs\nfoo\tbar", + "Format": "TSV", + "Directionality": "UNI", + }, + }, + ) + + # return glossary list with newly created glossary + stubber.add_response( + "list_terminologies", + { + "TerminologyPropertiesList": [ + { + "Name": "weblate_-_1_-_en_-_cs_-_9e250d830c11d70f", + "SourceLanguageCode": "en", + "TargetLanguageCodes": ["cs"], + "CreatedAt": "2021-08-03T14:16:18.329Z", + "Directionality": "UNI", + "Format": "TSV", + }, + ] + }, + ) + + # translate with glossary + stubber.add_response( + "translate_text", + { + "TranslatedText": "Ahoj", + "SourceLanguageCode": "en", + "TargetLanguageCode": "cs", + "AppliedTerminologies": [ + { + "Name": "weblate_-_1_-_en_-_cs_-_9e250d830c11d70f", + "Terms": [ + {"SourceText": "foo", "TargetText": "bar"}, + ], + }, + ], + }, + { + "SourceLanguageCode": ANY, + "TargetLanguageCode": ANY, + "Text": ANY, + "TerminologyNames": ["weblate_-_1_-_en_-_cs_-_9e250d830c11d70f"], + }, + ) + + self.assert_translate( + self.SUPPORTED, + self.SOURCE_TRANSLATED, + self.EXPECTED_LEN, + machine=machine, + ) + class AlibabaTranslationTest(BaseMachineTranslationTest): MACHINE_CLS = AlibabaTranslation