diff --git a/g2p/mappings/langs/langs.json.gz b/g2p/mappings/langs/langs.json.gz
index 7ceba3c8..aa8685bb 100644
Binary files a/g2p/mappings/langs/langs.json.gz and b/g2p/mappings/langs/langs.json.gz differ
diff --git a/g2p/mappings/utils.py b/g2p/mappings/utils.py
index cad3a28e..fcd3e294 100644
--- a/g2p/mappings/utils.py
+++ b/g2p/mappings/utils.py
@@ -13,7 +13,18 @@
 from copy import deepcopy
 from enum import Enum
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Pattern, Tuple, TypeVar, Union, cast
+from typing import (
+    Any,
+    Dict,
+    List,
+    Optional,
+    Pattern,
+    Sequence,
+    Tuple,
+    TypeVar,
+    Union,
+    cast,
+)
 
 import regex as re
 import yaml
@@ -495,16 +506,50 @@ def get_alignment_sequence(alignment: str, delimiter="") -> List[Tuple[int, str]
 # The joiner between key and value must be 0 so that it sorts before all
 # characters and thus won't break bisect_left()
 _JOINER = "\0"
+# For compacting a group of lexicon entries into one string.
+# This just has to be something that does not occur in the lexicon data
+_BLOCK_JOINER = "\1"
 
 
 def find_alignment(alignments: List[str], word: str) -> List[Tuple[int, str]]:
-    """Given a sorted list of (word, alignment), find word and return its parsed alignment."""
+    """Given a sorted list of (word, alignment), find word and return its parsed alignment.
+
+    Algorithm: double bisect over blocks and then entries within blocks.
+    """
     i = bisect_left(alignments, word)
-    if i != len(alignments):
-        k, v = alignments[i].split(_JOINER, maxsplit=1)
-        if k == word:
-            return get_alignment_sequence(v)
-    return []
+    if i != len(alignments) and alignments[i].startswith(word + _JOINER):
+        # Looking for the first entry of a block bisects to the correct block
+        alignment_entry, _, _ = alignments[i].partition(_BLOCK_JOINER)
+    elif i > 0:
+        # Looking for the remaining entries of a block bisects one block too far:
+        # bisect again within the previous block
+        alignment_block = alignments[i - 1].split(_BLOCK_JOINER)
+        j = bisect_left(alignment_block, word)
+        if j != len(alignment_block):
+            alignment_entry = alignment_block[j]
+        else:
+            return []  # word not found: would have been between this and next block
+    else:
+        return []  # word not found: would have been before the first block
+
+    k, _, v = alignment_entry.partition(_JOINER)
+    if k == word:
+        return get_alignment_sequence(v)  # word found
+    else:
+        return []  # word not found: key in bisected location does not match word
+
+
+def compact_alignments(alignments: Sequence[str]) -> List[str]:
+    """Memory footprint optimization: compact the list of alignments into blocks.
+
+    Each Python string has a significant overhead: grouping them into blocks of 16
+    saves 15MB of RAM for the cmudict English lexicon, at no significant speed cost.
+    """
+    _BLOCK_SIZE = 16
+    return [
+        _BLOCK_JOINER.join(alignments[i : i + _BLOCK_SIZE])
+        for i in range(0, len(alignments), _BLOCK_SIZE)
+    ]
 
 
 def load_alignments_from_file(path, delimiter="") -> List[str]:
@@ -526,7 +571,7 @@ def load_alignments_from_file(path, delimiter="") -> List[str]:
             continue
         word = get_alignment_input_string(spam)
         alignments.append(word + _JOINER + spam)
-    return sorted(alignments)
+    return compact_alignments(sorted(alignments))
 
 
 def is_ipa(lang: str) -> bool:
diff --git a/g2p/tests/test_lexicon_transducer.py b/g2p/tests/test_lexicon_transducer.py
index d6d10c14..59f104f2 100644
--- a/g2p/tests/test_lexicon_transducer.py
+++ b/g2p/tests/test_lexicon_transducer.py
@@ -200,8 +200,7 @@ def test_eng_lexicon(self):
         )
 
     def test_eng_transducer(self):
-        """Test the cached eng to eng-ipa lexicon from make_g2p
-        ."""
+        """Test the cached eng to eng-ipa lexicon from make_g2p."""
         transducer = make_g2p("eng", "eng-arpabet")
         tg = transducer("hello")
         self.assertEqual(tg.output_string, "HH AH L OW ")
@@ -211,6 +210,66 @@ def test_eng_transducer(self):
             transducer("hello my friend").output_string, "HH AH L OW M AY F R EH N D "
         )
 
+    def test_eng_lexicon_corner_cases(self):
+        """White-box testing for compact storage of lexicon mappings."""
+        test_cases = (
+            ("'bout", "baʊt"),  # first entry in eng->eng-ipa
+            ("'cause", "kʌz"),  # second entry
+            ("'course", "kɔɹs"),  # third
+            ("'tis", "tɪz"),  # 15th entry
+            ("'twas", "twʌz"),  # 16th entry
+            ("a", "ʌ"),  # 17th entry
+            ("buttering", "bʌtɜ˞ɪŋ"),  # 15998th, which is -2 mod 16
+            ("buttermilk", "bʌtɜ˞mɪlk"),  # 15999th, -1 mod 16
+            ("buttermore", "bʌtɜ˞mɔɹ"),  # 16000th, 0 mod 16
+            ("butters", "bʌtɜ˞z"),  # 16001st, 1 mod 16
+            ("butterscotch", "bʌtɜ˞skɑtʃ"),
+            ("butterworth", "bʌtɜ˞wɜ˞θ"),
+            ("buttery", "bʌtɜ˞i"),
+            ("butthead", "bʌthɛd"),
+            ("butting", "bʌtɪŋ"),
+            ("buttitta", "butitʌ"),
+            ("buttke", "bʌtki"),
+            ("buttler", "bʌtlɜ˞"),
+            ("buttner", "bʌtnɜ˞"),
+            ("buttock", "bʌtʌk"),
+            ("buttocks", "bʌtʌks"),
+            ("button", "bʌtʌn"),
+            ("buttoned", "bʌtʌnd"),
+            ("buttonhole", "bʌtʌnhoʊl"),
+            ("buttonholed", "bʌtʌnhoʊld"),
+            ("buttonholes", "bʌtʌnhoʊlz"),
+            ("buttons", "bʌtʌnz"),  # 16018th
+            ("zwieg", "zwiɡ"),  # last block of the lexicon
+            ("zwilling", "zwɪlɪŋ"),
+            ("zwolinski", "zvʌlɪnski"),
+            ("zycad", "zɪkæd"),
+            ("zych", "zaɪtʃ"),
+            ("zycher", "zɪkɜ˞"),
+            ("zydeco", "zaɪdʌkoʊ"),
+            ("zygmunt", "zɪɡmʌnt"),
+            ("zygote", "zaɪɡoʊt"),
+            ("zyla", "zɪlʌ"),
+            ("zylka", "zɪlkʌ"),
+            ("zylstra", "zɪlstɹʌ"),
+            ("zyman", "zaɪmʌn"),
+            ("zynda", "zɪndʌ"),
+            ("zysk", "zaɪsk"),
+            ("zyskowski", "zɪskɔfski"),
+            ("zyuganov", "zjuɡɑnɑv"),
+            ("zyuganov's", "zjuɡɑnɑvz"),
+            ("zywicki", "zɪwɪki"),
+        )
+
+        transducer = make_g2p("eng", "eng-ipa", tokenize=False)
+        for word, expected in test_cases:
+            tg = transducer(word)
+            self.assertEqual(tg.output_string, expected)
+            before = word[:-1] + chr(ord(word[-1]) - 1) + "z"
+            self.assertEqual(transducer(before).output_string, "", f"word={word} before={before}")
+            after = word[:-1] + chr(ord(word[-1]) + 1) + "z"
+            self.assertEqual(transducer(after).output_string, "", f"word={word} after={after}")
+
 
 if __name__ == "__main__":
     main()