Skip to content

Commit

Permalink
feat(machinery): aws glossary support (#12520)
Browse files Browse the repository at this point in the history
* feature implementation

* documentation updates

* update download_languages implementation

Fixes #10527
  • Loading branch information
gersona committed Sep 18, 2024
1 parent f76d41a commit 452a89d
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 118 deletions.
2 changes: 2 additions & 0 deletions docs/admin/machine.rst
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ Amazon Translate
Amazon Translate is a neural machine translation service for translating text
to and from English across a breadth of supported languages.

The service automatically uses :ref:`glossary`, see :ref:`glossary-mt`.

.. seealso::

`Amazon Translate Documentation <https://docs.aws.amazon.com/translate/>`_
Expand Down
3 changes: 2 additions & 1 deletion docs/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ Not yet released.

**Improvements**

* :ref`mt-deepl` now supports specifying translation context.
* :ref:`mt-deepl` now supports specifying translation context.
* :ref:`mt-aws` now supports :ref:`glossary-mt`.

**Bug fixes**

Expand Down
1 change: 1 addition & 0 deletions docs/user/glossary.rst
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ Following automatic suggestion services utilize glossaries during the translatio
* :ref:`mt-openai`
* :ref:`mt-deepl`
* :ref:`mt-microsoft-translator`
* :ref:`mt-aws`

The glossary is processed before being exposed to the service:

Expand Down
31 changes: 0 additions & 31 deletions scripts/generate-aws-languages

This file was deleted.

160 changes: 76 additions & 84 deletions weblate/machinery/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
#
# SPDX-License-Identifier: GPL-3.0-or-later


import operator

from django.utils.functional import cached_property

from .base import DownloadTranslations, MachineTranslation
from .base import DownloadTranslations, GlossaryMachineTranslationMixin
from .forms import AWSMachineryForm


class AWSTranslation(MachineTranslation):
class AWSTranslation(GlossaryMachineTranslationMixin):
"""AWS machine translation."""

name = "Amazon Translate"
Expand All @@ -19,6 +22,13 @@ class AWSTranslation(MachineTranslation):
}
settings_form = AWSMachineryForm

# Template for the name under which a glossary is stored in the service.
# glossary name must match the pattern ^([A-Za-z0-9-]_?)+$
# That pattern never allows two underscores in a row, which the "_-_"
# separators between the fields satisfy.
glossary_name_format = (
"weblate_-_{project}_-_{source_language}_-_{target_language}_-_{checksum}"
)

# NOTE(review): presumably the maximum number of glossaries kept in the
# service; how the limit is enforced is not visible here - confirm against
# GlossaryMachineTranslationMixin.
glossary_count_limit = 100

@classmethod
def get_identifier(cls) -> str:
    """Return the identifier string of this machinery service."""
    return "aws"
Expand All @@ -39,85 +49,9 @@ def map_language_code(self, code):
return super().map_language_code(code).replace("_", "-").split("@")[0]

def download_languages(self):
"""
Hardcoded list of supported languages as there is no API to get this.
Can be generated by HTML scraping using
./scripts/generate-aws-languages
"""
return (
"af",
"am",
"ar",
"az",
"bg",
"bn",
"bs",
"ca",
"cs",
"cy",
"da",
"de",
"el",
"en",
"es",
"es-MX",
"et",
"fa",
"fa-AF",
"fi",
"fr",
"fr-CA",
"gu",
"ha",
"he",
"hi",
"hr",
"ht",
"hu",
"hy",
"id",
"is",
"it",
"ja",
"ka",
"kk",
"kn",
"ko",
"lt",
"lv",
"mk",
"ml",
"mn",
"ms",
"mt",
"nl",
"no",
"pl",
"ps",
"pt",
"ro",
"ru",
"si",
"sk",
"sl",
"so",
"sq",
"sr",
"sv",
"sw",
"ta",
"te", # codespell:ignore te
"th",
"tl",
"tr",
"uk",
"ur",
"uz",
"vi",
"zh",
"zh-TW",
)
"""List of supported languages."""
result = self.client.list_languages()
return [lang["LanguageCode"] for lang in result["Languages"]]

def download_translations(
self,
Expand All @@ -128,12 +62,70 @@ def download_translations(
user,
threshold: int = 75,
) -> DownloadTranslations:
response = self.client.translate_text(
Text=text, SourceLanguageCode=source, TargetLanguageCode=language
)
params = {
"Text": text,
"SourceLanguageCode": source,
"TargetLanguageCode": language,
}

glossary_name: str | None = self.get_glossary_id(source, language, unit)
if glossary_name:
params["TerminologyNames"] = [glossary_name]

response = self.client.translate_text(**params)
yield {
"text": response["TranslatedText"],
"quality": self.max_score,
"service": self.name,
"source": text,
}

def create_glossary(
    self, source_language: str, target_language: str, name: str, tsv: str
) -> None:
    """
    Upload a glossary to the service as a terminology.

    A header row naming the source and target language is put in front of
    the tab-separated entries in ``tsv``; the result is imported under
    ``name`` using the ``OVERWRITE`` merge strategy.
    """
    # The header row carries the source and target language codes.
    header = f"{source_language}\t{target_language}"
    payload = f"{header}\n{tsv}".encode()
    terminology_data = {
        "File": payload,
        "Format": "TSV",
        "Directionality": "UNI",
    }
    self.client.import_terminology(
        Name=name,
        MergeStrategy="OVERWRITE",
        TerminologyData=terminology_data,
    )

def is_glossary_supported(self, source_language: str, target_language: str) -> bool:
    """
    Tell whether a glossary can be used for the given language pair.

    The answer is whatever ``is_supported`` reports for the same pair.
    """
    supported = self.is_supported(source_language, target_language)
    return supported

def list_glossaries(self) -> dict[str, str]:
    """
    Return the terminologies currently stored in the service.

    Every page of ``list_terminologies`` is fetched and each terminology
    name is mapped to itself.
    """
    paginator = self.client.get_paginator("list_terminologies")
    full_result = paginator.paginate().build_full_result()
    names = (entry["Name"] for entry in full_result["TerminologyPropertiesList"])
    return {name: name for name in names}

def delete_glossary(self, glossary_id: str) -> None:
    """Remove the terminology named ``glossary_id`` from the service."""
    client = self.client
    client.delete_terminology(Name=glossary_id)

def delete_oldest_glossary(self) -> None:
    """
    Remove the terminology with the earliest ``CreatedAt`` value.

    Does nothing when the service reports no terminologies.
    """
    paginator = self.client.get_paginator("list_terminologies")
    listing = paginator.paginate().build_full_result()
    # On equal timestamps min() picks the first entry, as a stable sort would.
    oldest = min(
        listing["TerminologyPropertiesList"],
        key=operator.itemgetter("CreatedAt"),
        default=None,
    )
    if oldest is not None:
        self.delete_glossary(oldest["Name"])
Loading

0 comments on commit 452a89d

Please sign in to comment.