Dev.ej/compact lexicon #400

Merged: 4 commits, Sep 16, 2024
Binary file modified g2p/mappings/langs/langs.json.gz
Binary file not shown.
61 changes: 53 additions & 8 deletions g2p/mappings/utils.py
@@ -13,7 +13,18 @@
from copy import deepcopy
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional, Pattern, Tuple, TypeVar, Union, cast
from typing import (
Any,
Dict,
List,
Optional,
Pattern,
Sequence,
Tuple,
TypeVar,
Union,
cast,
)

import regex as re
import yaml
@@ -495,16 +506,50 @@ def get_alignment_sequence(alignment: str, delimiter="") -> List[Tuple[int, str]]
# The joiner between key and value must be 0 so that it sorts before all
# characters and thus won't break bisect_left()
_JOINER = "\0"
# For compacting a group of lexicon entries into one string.
# This just has to be something that does not occur in the lexicon data
_BLOCK_JOINER = "\1"


def find_alignment(alignments: List[str], word: str) -> List[Tuple[int, str]]:
"""Given a sorted list of (word, alignment), find word and return its parsed alignment."""
"""Given a sorted list of (word, alignment), find word and return its parsed alignment.

Algorithm: double bisect over blocks and then entries within blocks.
"""
i = bisect_left(alignments, word)
if i != len(alignments):
k, v = alignments[i].split(_JOINER, maxsplit=1)
if k == word:
return get_alignment_sequence(v)
return []
if i != len(alignments) and alignments[i].startswith(word + _JOINER):
# Looking for the first entry of a block bisects to the correct block
alignment_entry, _, _ = alignments[i].partition(_BLOCK_JOINER)
elif i > 0:
# Looking for the remaining entries of a block bisects one block too far:
# bisect again within the previous block
alignment_block = alignments[i - 1].split(_BLOCK_JOINER)
j = bisect_left(alignment_block, word)
if j != len(alignment_block):
alignment_entry = alignment_block[j]
else:
return [] # word not found: would have been between this and next block
else:
return [] # word not found: would have been before the first block

k, _, v = alignment_entry.partition(_JOINER)
if k == word:
return get_alignment_sequence(v) # word found
else:
return [] # word not found: key in bisected location does not match word
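
A minimal, self-contained sketch of this two-level lookup may help reviewers. The toy
entries, the block size of 2, and the lookup() helper below are illustrative
assumptions, not the library's API:

from bisect import bisect_left
from typing import Optional

JOINER = "\0"        # sorts before every real character, so bisect_left() stays correct
BLOCK_JOINER = "\1"  # only needs to be absent from the lexicon data

# Toy lexicon: sorted "word\0alignment" entries, compacted into blocks of 2.
entries = sorted(
    w + JOINER + a
    for w, a in [("cat", "k æ t"), ("dog", "d ɔ ɡ"), ("emu", "i m u"), ("fox", "f ɑ k s")]
)
blocks = [BLOCK_JOINER.join(entries[i : i + 2]) for i in range(0, len(entries), 2)]

def lookup(word: str) -> Optional[str]:
    """Bisect over blocks, then bisect again inside the matching block."""
    i = bisect_left(blocks, word)
    if i != len(blocks) and blocks[i].startswith(word + JOINER):
        entry = blocks[i].partition(BLOCK_JOINER)[0]  # word heads block i
    elif i > 0:
        block = blocks[i - 1].split(BLOCK_JOINER)  # if present, word is in the previous block
        j = bisect_left(block, word)
        if j == len(block):
            return None
        entry = block[j]
    else:
        return None
    key, _, value = entry.partition(JOINER)
    return value if key == word else None

assert lookup("dog") == "d ɔ ɡ"  # hit in the middle of a block
assert lookup("emu") == "i m u"  # hit at a block boundary
assert lookup("dot") is None     # miss that lands between two entries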


def compact_alignments(alignments: Sequence[str]) -> List[str]:
"""Memory footprint optimization: compact the list of alignments into blocks.

Each Python string has a significant overhead: grouping them into blocks of 16
saves 15MB of RAM for the cmudict English lexicon, at no significant speed cost.
"""
_BLOCK_SIZE = 16
return [
_BLOCK_JOINER.join(alignments[i : i + _BLOCK_SIZE])
for i in range(0, len(alignments), _BLOCK_SIZE)
]
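
The overhead figure is easy to sanity-check with sys.getsizeof (illustrative only;
exact byte counts vary across CPython versions and builds):

import sys

entries = ["word%05d\0alignment%05d" % (i, i) for i in range(16)]
as_objects = sum(sys.getsizeof(s) for s in entries)  # 16 separate string headers
as_block = sys.getsizeof("\1".join(entries))         # one header plus 15 joiner bytes
print(as_objects, as_block)  # e.g. 1168 vs. 448 with CPython's compact ASCII strings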


def load_alignments_from_file(path, delimiter="") -> List[str]:
@@ -526,7 +571,7 @@ def load_alignments_from_file(path, delimiter="") -> List[str]:
continue
word = get_alignment_input_string(spam)
alignments.append(word + _JOINER + spam)
return sorted(alignments)
return compact_alignments(sorted(alignments))


def is_ipa(lang: str) -> bool:
63 changes: 61 additions & 2 deletions g2p/tests/test_lexicon_transducer.py
@@ -200,8 +200,7 @@ def test_eng_lexicon(self):
)

def test_eng_transducer(self):
"""Test the cached eng to eng-ipa lexicon from make_g2p
."""
"""Test the cached eng to eng-ipa lexicon from make_g2p."""
transducer = make_g2p("eng", "eng-arpabet")
tg = transducer("hello")
self.assertEqual(tg.output_string, "HH AH L OW ")
@@ -211,6 +210,66 @@ def test_eng_transducer(self):
transducer("hello my friend").output_string, "HH AH L OW M AY F R EH N D "
)

def test_eng_lexicon_corner_cases(self):
"""White-box testing for compact storage of lexicon mappings."""
test_cases = (
("'bout", "baʊt"), # first entry in eng->eng-ipa
("'cause", "kʌz"), # second entry
("'course", "kɔɹs"), # third
("'tis", "tɪz"), # 15th entry
("'twas", "twʌz"), # 16th entry
("a", "ʌ"), # 17th entry
("buttering", "bʌtɜ˞ɪŋ"), # 15998th, which is -2 mod 16
("buttermilk", "bʌtɜ˞mɪlk"), # 15999th, -1 mod 16
("buttermore", "bʌtɜ˞mɔɹ"), # 16000th, 0 mod 16
("butters", "bʌtɜ˞z"), # 16001th, 1 mod 16
("butterscotch", "bʌtɜ˞skɑtʃ"),
("butterworth", "bʌtɜ˞wɜ˞θ"),
("buttery", "bʌtɜ˞i"),
("butthead", "bʌthɛd"),
("butting", "bʌtɪŋ"),
("buttitta", "butitʌ"),
("buttke", "bʌtki"),
("buttler", "bʌtlɜ˞"),
("buttner", "bʌtnɜ˞"),
("buttock", "bʌtʌk"),
("buttocks", "bʌtʌks"),
("button", "bʌtʌn"),
("buttoned", "bʌtʌnd"),
("buttonhole", "bʌtʌnhoʊl"),
("buttonholed", "bʌtʌnhoʊld"),
("buttonholes", "bʌtʌnhoʊlz"),
("buttons", "bʌtʌnz"), # 16018th
("zwieg", "zwiɡ"), # last block of the lexicon
("zwilling", "zwɪlɪŋ"),
("zwolinski", "zvʌlɪnski"),
("zycad", "zɪkæd"),
("zych", "zaɪtʃ"),
("zycher", "zɪkɜ˞"),
("zydeco", "zaɪdʌkoʊ"),
("zygmunt", "zɪɡmʌnt"),
("zygote", "zaɪɡoʊt"),
("zyla", "zɪlʌ"),
("zylka", "zɪlkʌ"),
("zylstra", "zɪlstɹʌ"),
("zyman", "zaɪmʌn"),
("zynda", "zɪndʌ"),
("zysk", "zaɪsk"),
("zyskowski", "zɪskɔfski"),
("zyuganov", "zjuɡɑnɑv"),
("zyuganov's", "zjuɡɑnɑvz"),
("zywicki", "zɪwɪki"),
)

transducer = make_g2p("eng", "eng-ipa", tokenize=False)
for word, expected in test_cases:
tg = transducer(word)
self.assertEqual(tg.output_string, expected)
# Neighbors that sort immediately before/after each word are absent
# from the lexicon and must map to the empty string.
before = word[:-1] + chr(ord(word[-1]) - 1) + "z"
self.assertEqual(transducer(before).output_string, "", f"word={word} before={before}")
after = word[:-1] + chr(ord(word[-1]) + 1) + "z"
self.assertEqual(transducer(after).output_string, "", f"word={word} after={after}")


if __name__ == "__main__":
main()