From 926d6e834f63dbcc1ae78ff95fb9ddecf77b58ff Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Mon, 13 Apr 2015 15:10:07 -0700 Subject: [PATCH 01/19] working edges() for all but utf-8 --- dawg_python/dawgs.py | 20 ++++++++++++++++++++ dawg_python/wrapper.py | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index ae0f5b5..18a6e61 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -141,6 +141,26 @@ def keys(self, prefix=""): return res + def edges(self, prefix=""): + b_prefix = prefix.encode('utf8') + res = [] + + index = self.dct.follow_bytes(b_prefix, self.dct.ROOT) + if index is None: + return res + + completer = wrapper.Completer(self.dct, self.guide) + if not completer.start_edges(index, b_prefix): + return res + + key = completer.key.decode('utf8') + res.append(key) + while completer.next_edge(): + key = completer.key.decode('utf8') + res.append(key) + + return res + def iterkeys(self, prefix=""): b_prefix = prefix.encode('utf8') index = self.dct.follow_bytes(b_prefix, self.dct.ROOT) diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 863faf8..21b3ee4 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, unicode_literals import struct import array +import pdb from . import units from .compat import int_from_byte @@ -105,6 +106,8 @@ def value(self): return self._dic.value(self._last_index) def start(self, index, prefix=b""): + "initial setup for a completer next() action on some prefix" + self.key = bytearray(prefix) if self._guide.size(): @@ -113,6 +116,40 @@ def start(self, index, prefix=b""): else: self._index_stack = [] + def start_edges(self, index, prefix=b""): + """initial setup for a completer next_edge() action on some prefix. If + there's a child for this prefix, we add that as the one item on the + index_stack. Otherwise, leave the stack empty, so next_edge() fails""" + + self.key = bytearray(prefix) + self._parent_index = index + self._sib_index = None + if self._guide.size(): + child_label = self._guide.child(index) # UCharType + + if child_label: + # Follows a transition to the first child. + next_index = self._dic.follow_char(child_label, index) + if index is not None: + self._sib_index = next_index + self.key.append(child_label) + return True + + def next_edge(self): + "Gets the next edge (not necessarily a terminal)" + + if not self._sib_index: + return False + + sibling_label = self._guide.sibling(self._sib_index) + self._sib_index = self._dic.follow_char(sibling_label, + self._parent_index) + if not self._sib_index: + return False + + self.key.pop() + self.key.append(sibling_label) + return True def next(self): "Gets the next key" @@ -153,7 +190,6 @@ def next(self): return self._find_terminal(index) - def _follow(self, label, index): next_index = self._dic.follow_char(label, index) if next_index is None: From fa6cd76748a273d8cf05103cf209e1e1d89af834 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Mon, 13 Apr 2015 16:22:17 -0700 Subject: [PATCH 02/19] edges() working with unicode; added working iteredges(); added basic tests --- dawg_python/dawgs.py | 21 +++++++++++++++++---- dawg_python/wrapper.py | 24 +++++++++++++++++++++--- tests/test_dawg.py | 10 ++++++++++ 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 18a6e61..6e4d594 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -153,14 +153,27 @@ def edges(self, prefix=""): if not completer.start_edges(index, b_prefix): return res - key = completer.key.decode('utf8') - res.append(key) + res.append(completer.decoded_key) while completer.next_edge(): - key = completer.key.decode('utf8') - res.append(key) + res.append(completer.decoded_key) return res + def iteredges(self, prefix=""): + b_prefix = prefix.encode('utf8') + + index = self.dct.follow_bytes(b_prefix, self.dct.ROOT) + if index is None: + return + + completer = wrapper.Completer(self.dct, self.guide) + if not completer.start_edges(index, b_prefix): + return + + yield completer.decoded_key + while completer.next_edge(): + yield completer.decoded_key + def iterkeys(self, prefix=""): b_prefix = prefix.encode('utf8') index = self.dct.follow_bytes(b_prefix, self.dct.ROOT) diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 21b3ee4..1e741df 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -2,7 +2,6 @@ from __future__ import absolute_import, unicode_literals import struct import array -import pdb from . import units from .compat import int_from_byte @@ -122,6 +121,7 @@ def start_edges(self, index, prefix=b""): index_stack. Otherwise, leave the stack empty, so next_edge() fails""" self.key = bytearray(prefix) + self.base_key_len = len(self.key) self._parent_index = index self._sib_index = None if self._guide.size(): @@ -133,6 +133,7 @@ def start_edges(self, index, prefix=b""): if index is not None: self._sib_index = next_index self.key.append(child_label) + self.decoded_key = self.key.decode('utf-8') return True def next_edge(self): @@ -146,9 +147,26 @@ def next_edge(self): self._parent_index) if not self._sib_index: return False - - self.key.pop() + + self.key = self.key[:self.base_key_len] self.key.append(sibling_label) + try: + self.decoded_key = self.key.decode('utf-8') + except UnicodeDecodeError: + #this sibling is multi-character. keep following its children til + #something is decodable + cur_index = self._sib_index + while True: + child_label = self._guide.child(self._sib_index) + cur_index = self._dic.follow_char(child_label, cur_index) + if not cur_index: + return False + self.key.append(child_label) + try: + self.decoded_key = self.key.decode('utf-8') + break + except UnicodeDecodeError: + pass return True def next(self): diff --git a/tests/test_dawg.py b/tests/test_dawg.py index 0c74e90..29924df 100644 --- a/tests/test_dawg.py +++ b/tests/test_dawg.py @@ -46,10 +46,20 @@ def test_keys(self): d = self.dawg() assert d.keys() == sorted(self.keys) + def test_edges(self): + d = self.dawg() + assert d.edges() == ['b', 'f'] + assert d.edges('f') == ['fo'] + def test_iterkeys(self): d = self.dawg() assert list(d.iterkeys()) == d.keys() + def test_iter_edges(self): + d = self.dawg() + assert list(d.iteredges()) == ['b', 'f'] + assert list(d.edges('f')) == ['fo'] + def test_completion(self): d = self.dawg() From 8e7390a7e869b7762c307b542c2f2163d17b6dc4 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Tue, 14 Apr 2015 14:19:53 -0700 Subject: [PATCH 03/19] adding terminal indicators on edges; adding edge values for IntCompletionDawgs; adding tests for all --- dawg_python/dawgs.py | 85 ++++++++++++++++++++++++++++++++++-------- dawg_python/wrapper.py | 73 ++++++++++++++++++++++-------------- tests/test_dawg.py | 18 +++++++-- 3 files changed, 129 insertions(+), 47 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 6e4d594..92209a5 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -149,13 +149,13 @@ def edges(self, prefix=""): if index is None: return res - completer = wrapper.Completer(self.dct, self.guide) - if not completer.start_edges(index, b_prefix): + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, b_prefix): return res - res.append(completer.decoded_key) - while completer.next_edge(): - res.append(completer.decoded_key) + res.append(edge_follower.get_cur_edge()) + while edge_follower.next(): + res.append(edge_follower.get_cur_edge()) return res @@ -166,13 +166,13 @@ def iteredges(self, prefix=""): if index is None: return - completer = wrapper.Completer(self.dct, self.guide) - if not completer.start_edges(index, b_prefix): + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, b_prefix): return - yield completer.decoded_key - while completer.next_edge(): - yield completer.decoded_key + yield edge_follower.get_cur_edge() + while edge_follower.next(): + yield edge_follower.get_cur_edge() def iterkeys(self, prefix=""): b_prefix = prefix.encode('utf8') @@ -312,15 +312,14 @@ def iterkeys(self, prefix=""): yield u_key def items(self, prefix=""): + index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') - res = [] - - index = self.dct.ROOT if prefix: index = self.dct.follow_bytes(prefix, index) if not index: - return res + return + res = [] completer = wrapper.Completer(self.dct, self.guide) completer.start(index, prefix) @@ -333,11 +332,30 @@ def items(self, prefix=""): return res - def iteritems(self, prefix=""): + def edges(self, prefix=""): + index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + res = [] + + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return res + + res.append(edge_follower.decoded_key) + while edge_follower.next(): + res.append(edge_follower.decoded_key) + return res + + def iteritems(self, prefix=""): index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') if prefix: index = self.dct.follow_bytes(prefix, index) if not index: @@ -497,6 +515,43 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG): Dict-like class based on DAWG. It can store integer values for unicode keys and support key completion. """ + def edges(self, prefix=""): + index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + res = [] + + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return res + + res.append((edge_follower.decoded_key, edge_follower.value())) + while edge_follower.next(): + res.append((edge_follower.decoded_key, edge_follower.value())) + + return res + + def iteredges(self, prefix=""): + index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return + + yield (edge_follower.decoded_key, edge_follower.value()) + while edge_follower.next(): + yield (edge_follower.decoded_key, edge_follower.value()) + def items(self, prefix=""): if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 1e741df..74b6cbd 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -17,29 +17,29 @@ def __init__(self): "Root index" def has_value(self, index): - "Checks if a given index is related to the end of a key." + #Checks if a given index is related to the end of a key. return units.has_leaf(self._units[index]) def value(self, index): - "Gets a value from a given index." + #Gets a value from a given index. offset = units.offset(self._units[index]) value_index = (index ^ offset) & units.PRECISION_MASK return units.value(self._units[value_index]) def read(self, fp): - "Reads a dictionary from an input stream." + #Reads a dictionary from an input stream. base_size = struct.unpack(str("=I"), fp.read(4))[0] self._units.fromfile(fp, base_size) def contains(self, key): - "Exact matching." + #Exact matching. index = self.follow_bytes(key, self.ROOT) if index is None: return False return self.has_value(index) def find(self, key): - "Exact matching (returns value)" + #Exact matching (returns value) index = self.follow_bytes(key, self.ROOT) if index is None: return -1 @@ -48,7 +48,7 @@ def find(self, key): return self.value(index) def follow_char(self, label, index): - "Follows a transition" + #Follows a transition offset = units.offset(self._units[index]) next_index = (index ^ offset ^ label) & units.PRECISION_MASK @@ -58,7 +58,7 @@ def follow_char(self, label, index): return next_index def follow_bytes(self, s, index): - "Follows transitions." + #Follows transitions. for ch in s: index = self.follow_char(int_from_byte(ch), index) if index is None: @@ -95,27 +95,17 @@ def size(self): return len(self._units) -class Completer(object): - +class EdgeFollower(object): def __init__(self, dic=None, guide=None): self._dic = dic self._guide = guide def value(self): - return self._dic.value(self._last_index) + if self._dic.has_value(self._cur_index): + return self._dic.value(self._cur_index) + return False def start(self, index, prefix=b""): - "initial setup for a completer next() action on some prefix" - - self.key = bytearray(prefix) - - if self._guide.size(): - self._index_stack = [index] - self._last_index = self._dic.ROOT - else: - self._index_stack = [] - - def start_edges(self, index, prefix=b""): """initial setup for a completer next_edge() action on some prefix. If there's a child for this prefix, we add that as the one item on the index_stack. Otherwise, leave the stack empty, so next_edge() fails""" @@ -124,6 +114,7 @@ def start_edges(self, index, prefix=b""): self.base_key_len = len(self.key) self._parent_index = index self._sib_index = None + self._cur_index = None if self._guide.size(): child_label = self._guide.child(index) # UCharType @@ -132,12 +123,13 @@ def start_edges(self, index, prefix=b""): next_index = self._dic.follow_char(child_label, index) if index is not None: self._sib_index = next_index + self._cur_index = self._sib_index self.key.append(child_label) self.decoded_key = self.key.decode('utf-8') return True - def next_edge(self): - "Gets the next edge (not necessarily a terminal)" + def next(self): + #Gets the next edge (not necessarily a terminal) if not self._sib_index: return False @@ -145,6 +137,7 @@ def next_edge(self): sibling_label = self._guide.sibling(self._sib_index) self._sib_index = self._dic.follow_char(sibling_label, self._parent_index) + self._cur_index = self._sib_index if not self._sib_index: return False @@ -153,13 +146,13 @@ def next_edge(self): try: self.decoded_key = self.key.decode('utf-8') except UnicodeDecodeError: - #this sibling is multi-character. keep following its children til + #this sibling is a multibyte char. keep following its children til #something is decodable - cur_index = self._sib_index while True: child_label = self._guide.child(self._sib_index) - cur_index = self._dic.follow_char(child_label, cur_index) - if not cur_index: + self._cur_index = self._dic.follow_char(child_label, + self._cur_index) + if not self._cur_index: return False self.key.append(child_label) try: @@ -169,8 +162,32 @@ def next_edge(self): pass return True + def get_cur_edge(self): + return (self.decoded_key, self._dic.has_value(self._cur_index)) + + +class Completer(object): + + def __init__(self, dic=None, guide=None): + self._dic = dic + self._guide = guide + + def value(self): + return self._dic.value(self._last_index) + + def start(self, index, prefix=b""): + #initial setup for a completer next() action on some prefix + + self.key = bytearray(prefix) + + if self._guide.size(): + self._index_stack = [index] + self._last_index = self._dic.ROOT + else: + self._index_stack = [] + def next(self): - "Gets the next key" + #Gets the next key if not self._index_stack: return False diff --git a/tests/test_dawg.py b/tests/test_dawg.py index 29924df..e70aaa0 100644 --- a/tests/test_dawg.py +++ b/tests/test_dawg.py @@ -48,8 +48,9 @@ def test_keys(self): def test_edges(self): d = self.dawg() - assert d.edges() == ['b', 'f'] - assert d.edges('f') == ['fo'] + assert d.edges() == [('b', False), ('f', True)] + assert d.edges('b') == [('ba', False)] + assert d.edges('fo') == [('foo', True)] def test_iterkeys(self): d = self.dawg() @@ -57,8 +58,9 @@ def test_iterkeys(self): def test_iter_edges(self): d = self.dawg() - assert list(d.iteredges()) == ['b', 'f'] - assert list(d.edges('f')) == ['fo'] + assert list(d.iteredges()) == [('b', False), ('f', True)] + assert list(d.iteredges('b')) == [('ba', False)] + assert list(d.edges('fo')) == [('foo', True)] def test_completion(self): d = self.dawg() @@ -129,3 +131,11 @@ def test_completion_keys_with_prefix(self): def test_completion_items(self): assert self.dawg().items() == sorted(self.payload.items(), key=lambda r: r[0]) + + def test_completion_edges(self): + assert self.dawg().edges('ba') == [('bar', 5)] + assert self.dawg().edges('foob') == [('fooba', False)] + + def test_completion_iteredges(self): + assert list(self.dawg().iteredges('ba')) == [('bar', 5)] + assert list(self.dawg().iteredges('foob')) == [('fooba', False)] From 0211c195394af6bed7aa1060368e24db02a52a4e Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Tue, 14 Apr 2015 14:30:40 -0700 Subject: [PATCH 04/19] adding tests for larger values to intdawg and intcompletiondawg; replacing dev data for those --- dev_data/small/int_completion_dawg.dawg | Bin 1544 -> 1544 bytes dev_data/small/int_dawg.dawg | Bin 1028 -> 1028 bytes tests/test_dawg.py | 4 +++- 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dev_data/small/int_completion_dawg.dawg b/dev_data/small/int_completion_dawg.dawg index a25033ecb750db775bb7221502ee98ba0f5b6156..8d8794ca58e3871968d9f9aa4dfb053d8a741ce3 100644 GIT binary patch delta 12 TcmeC+>EM}Qz$mxT@H#617$O74 delta 12 TcmeC+>EM}Qz{tGO@H#617n=jC diff --git a/dev_data/small/int_dawg.dawg b/dev_data/small/int_dawg.dawg index 2c42a5cf5bb8cc648d15c544300486f8855853a7..db39e42a77c1d4c176881ab04a5e03d2ae69365f 100644 GIT binary patch delta 12 TcmZqSXyKS(z$mxT@G>(17jOfx delta 12 TcmZqSXyKS(z{tGO@G>(17U=_( diff --git a/tests/test_dawg.py b/tests/test_dawg.py index e70aaa0..fd16768 100644 --- a/tests/test_dawg.py +++ b/tests/test_dawg.py @@ -91,7 +91,7 @@ def test_prefixes(self): class TestIntDAWG(object): - payload = {'foo': 1, 'bar': 5, 'foobar': 3} + payload = {'foo': 1, 'bar': 5, 'foobar': 30} def dawg(self): return dawg_python.IntDAWG().load(data_path('small', 'int_dawg.dawg')) @@ -135,7 +135,9 @@ def test_completion_items(self): def test_completion_edges(self): assert self.dawg().edges('ba') == [('bar', 5)] assert self.dawg().edges('foob') == [('fooba', False)] + assert self.dawg().edges('fooba') == [('foobar', 30)] def test_completion_iteredges(self): assert list(self.dawg().iteredges('ba')) == [('bar', 5)] assert list(self.dawg().iteredges('foob')) == [('fooba', False)] + assert list(self.dawg().iteredges('fooba')) == [('foobar', 30)] From 30bf53bebcd7d4eda64dbda8c247ad866e168b4b Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Wed, 15 Apr 2015 00:32:53 -0700 Subject: [PATCH 05/19] edges() and iteredges() now work for all applicable dawgs; tests added for all new edges methods --- dawg_python/dawgs.py | 43 +++++++++++++++++++++++++++++--------- tests/test_payload_dawg.py | 14 +++++++++++++ 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 92209a5..1d0127e 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -332,6 +332,24 @@ def items(self, prefix=""): return res + def iteritems(self, prefix=""): + index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + + completer = wrapper.Completer(self.dct, self.guide) + completer.start(index, prefix) + + while completer.next(): + key, value = completer.key.split(self._payload_separator) + # bytes() cast is a python 2.6 fix + item = (key.decode('utf8'), a2b_base64(bytes(value))) + yield item + def edges(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): @@ -346,13 +364,15 @@ def edges(self, prefix=""): if not edge_follower.start(index, prefix): return res - res.append(edge_follower.decoded_key) + vals = self.b_get_value(edge_follower.decoded_key) or [False] + res.extend([(edge_follower.decoded_key, val) for val in vals]) while edge_follower.next(): - res.append(edge_follower.decoded_key) + vals = self.b_get_value(edge_follower.decoded_key) or [False] + res.extend([(edge_follower.decoded_key, val) for val in vals]) return res - def iteritems(self, prefix=""): + def iteredges(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -361,14 +381,17 @@ def iteritems(self, prefix=""): if not index: return - completer = wrapper.Completer(self.dct, self.guide) - completer.start(index, prefix) - - while completer.next(): - key, value = completer.key.split(self._payload_separator) - item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix - yield item + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return + vals = self.b_get_value(edge_follower.decoded_key) or [False] + for val in vals: + yield (edge_follower.decoded_key, val or False) + while edge_follower.next(): + vals = self.b_get_value(edge_follower.decoded_key) or [False] + for val in vals: + yield (edge_follower.decoded_key, val or False) def _has_value(self, index): return self.dct.follow_bytes(PAYLOAD_SEPARATOR, index) diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index 4f9060d..b43ffda 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -58,6 +58,18 @@ def test_iterkeys(self): d = self.dawg() assert list(d.iterkeys()) == d.keys() + def test_edges(self): + d = self.dawg() + assert d.edges('foob') == [('fooba', False)] + assert d.edges('fooba') == [('foobar', b'data4')] + assert d.edges('fo') == [('foo', b'data1'), ('foo', b'data3')] + + def test_iteredges(self): + d = self.dawg() + assert list(d.iteredges('foob')) == [('fooba', False)] + assert list(d.iteredges('fooba')) == [('foobar', b'data4')] + assert list(d.iteredges('fo')) == [('foo', b'data1'), ('foo', b'data3')] + def test_key_completion(self): d = self.dawg() assert d.keys('fo') == ['foo', 'foo', 'foobar'] @@ -75,6 +87,8 @@ def test_iteritems(self): def test_items_completion(self): d = self.dawg() assert d.items('foob') == [('foobar', b'data4')] + assert d.items('foo') == [('foo', b'data1'), ('foo', b'data3'), + ('foobar', b'data4')] def test_prefixes(self): d = self.dawg() From 15355be17f1433adca05549e649fdd001593a729 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Wed, 15 Apr 2015 00:53:10 -0700 Subject: [PATCH 06/19] items() miss returns empty list; adding test for this; moving appropriate comments to doc strings --- dawg_python/dawgs.py | 2 +- dawg_python/wrapper.py | 19 +++++++++++++------ tests/test_payload_dawg.py | 1 + 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 1d0127e..967eca1 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -318,7 +318,7 @@ def items(self, prefix=""): if prefix: index = self.dct.follow_bytes(prefix, index) if not index: - return + return [] res = [] completer = wrapper.Completer(self.dct, self.guide) diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 74b6cbd..b73933d 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -101,14 +101,16 @@ def __init__(self, dic=None, guide=None): self._guide = guide def value(self): + "provides list of values at current index" + if self._dic.has_value(self._cur_index): return self._dic.value(self._cur_index) return False def start(self, index, prefix=b""): - """initial setup for a completer next_edge() action on some prefix. If - there's a child for this prefix, we add that as the one item on the - index_stack. Otherwise, leave the stack empty, so next_edge() fails""" + """initial setup for the next() action on some prefix. If there's a + child for this prefix, we add that as the one item on the index_stack. + Otherwise, leave the stack empty, so next() fails""" self.key = bytearray(prefix) self.base_key_len = len(self.key) @@ -129,7 +131,7 @@ def start(self, index, prefix=b""): return True def next(self): - #Gets the next edge (not necessarily a terminal) + "Gets the next edge (not necessarily a terminal)" if not self._sib_index: return False @@ -163,6 +165,9 @@ def next(self): return True def get_cur_edge(self): + """helper method for getting the decoded key along with whether or not + it is a terminal""" + return (self.decoded_key, self._dic.has_value(self._cur_index)) @@ -173,10 +178,12 @@ def __init__(self, dic=None, guide=None): self._guide = guide def value(self): + "provides list of values at current index" + return self._dic.value(self._last_index) def start(self, index, prefix=b""): - #initial setup for a completer next() action on some prefix + "initial setup for a completer next() action on some prefix" self.key = bytearray(prefix) @@ -187,7 +194,7 @@ def start(self, index, prefix=b""): self._index_stack = [] def next(self): - #Gets the next key + "Gets the next key" if not self._index_stack: return False diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index b43ffda..22866ca 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -77,6 +77,7 @@ def test_key_completion(self): def test_items(self): d = self.dawg() assert d.items() == sorted(self.DATA) + assert d.items('not a real key') == [] def test_iteritems(self): d = self.dawg() From dee560c4079d7e15127385416d4889318f70defb Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Wed, 15 Apr 2015 01:26:16 -0700 Subject: [PATCH 07/19] b_get_value should always get bytes, not decoded unicode; utf8 should always be used-- not utf-8 --- dawg_python/dawgs.py | 8 ++++---- dawg_python/wrapper.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 967eca1..22c2791 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -364,10 +364,10 @@ def edges(self, prefix=""): if not edge_follower.start(index, prefix): return res - vals = self.b_get_value(edge_follower.decoded_key) or [False] + vals = self.b_get_value(edge_follower.key) or [False] res.extend([(edge_follower.decoded_key, val) for val in vals]) while edge_follower.next(): - vals = self.b_get_value(edge_follower.decoded_key) or [False] + vals = self.b_get_value(edge_follower.key) or [False] res.extend([(edge_follower.decoded_key, val) for val in vals]) return res @@ -385,11 +385,11 @@ def iteredges(self, prefix=""): if not edge_follower.start(index, prefix): return - vals = self.b_get_value(edge_follower.decoded_key) or [False] + vals = self.b_get_value(edge_follower.key) or [False] for val in vals: yield (edge_follower.decoded_key, val or False) while edge_follower.next(): - vals = self.b_get_value(edge_follower.decoded_key) or [False] + vals = self.b_get_value(edge_follower.key) or [False] for val in vals: yield (edge_follower.decoded_key, val or False) diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index b73933d..68a0da4 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -127,7 +127,7 @@ def start(self, index, prefix=b""): self._sib_index = next_index self._cur_index = self._sib_index self.key.append(child_label) - self.decoded_key = self.key.decode('utf-8') + self.decoded_key = self.key.decode('utf8') return True def next(self): @@ -146,7 +146,7 @@ def next(self): self.key = self.key[:self.base_key_len] self.key.append(sibling_label) try: - self.decoded_key = self.key.decode('utf-8') + self.decoded_key = self.key.decode('utf8') except UnicodeDecodeError: #this sibling is a multibyte char. keep following its children til #something is decodable @@ -158,7 +158,7 @@ def next(self): return False self.key.append(child_label) try: - self.decoded_key = self.key.decode('utf-8') + self.decoded_key = self.key.decode('utf8') break except UnicodeDecodeError: pass From 2a931734694c90e7475d510736fe3e64bc72ba9a Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Wed, 15 Apr 2015 01:52:58 -0700 Subject: [PATCH 08/19] adding explicit bytes() cast for b_get_value() for python 2.x compatibility --- dawg_python/dawgs.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 22c2791..5926659 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -364,10 +364,10 @@ def edges(self, prefix=""): if not edge_follower.start(index, prefix): return res - vals = self.b_get_value(edge_follower.key) or [False] + vals = self.b_get_value(bytes(edge_follower.key)) or [False] res.extend([(edge_follower.decoded_key, val) for val in vals]) while edge_follower.next(): - vals = self.b_get_value(edge_follower.key) or [False] + vals = self.b_get_value(bytes(edge_follower.key)) or [False] res.extend([(edge_follower.decoded_key, val) for val in vals]) return res @@ -385,11 +385,11 @@ def iteredges(self, prefix=""): if not edge_follower.start(index, prefix): return - vals = self.b_get_value(edge_follower.key) or [False] + vals = self.b_get_value(bytes(edge_follower.key)) or [False] for val in vals: yield (edge_follower.decoded_key, val or False) while edge_follower.next(): - vals = self.b_get_value(edge_follower.key) or [False] + vals = self.b_get_value(bytes(edge_follower.key)) or [False] for val in vals: yield (edge_follower.decoded_key, val or False) From c94b4d87d9e8ecb05d8622865ee6342c681e9db9 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Wed, 15 Apr 2015 18:53:17 -0700 Subject: [PATCH 09/19] edges and iter_edges always return boolean terminal; adding edges_data and iteredges_data for appropriate dawgs; adding tests for new methods --- dawg_python/dawgs.py | 94 ++++++++++++++++++++++++++++++++++---- dawg_python/wrapper.py | 6 +++ tests/test_payload_dawg.py | 21 +++++++-- 3 files changed, 108 insertions(+), 13 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 5926659..eae1628 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -364,15 +364,54 @@ def edges(self, prefix=""): if not edge_follower.start(index, prefix): return res - vals = self.b_get_value(bytes(edge_follower.key)) or [False] + val = True if self._follow_key(bytes(edge_follower.key)) else False + res.append((edge_follower.decoded_key, val)) + while edge_follower.next(): + val = True if self._follow_key(bytes(edge_follower.key)) else False + res.append((edge_follower.decoded_key, val)) + return res + + def iteredges(self, prefix=""): + index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return + + val = True if self._follow_key(bytes(edge_follower.key)) else False + yield (edge_follower.decoded_key, val) + while edge_follower.next(): + val = True if self._follow_key(bytes(edge_follower.key)) else False + yield (edge_follower.decoded_key, val) + + def edges_data(self, prefix=""): + index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + res = [] + + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return res + + vals = self.b_get_value(bytes(edge_follower.key)) or [None] res.extend([(edge_follower.decoded_key, val) for val in vals]) while edge_follower.next(): - vals = self.b_get_value(bytes(edge_follower.key)) or [False] + vals = self.b_get_value(bytes(edge_follower.key)) or [None] res.extend([(edge_follower.decoded_key, val) for val in vals]) - return res - def iteredges(self, prefix=""): + def iteredges_data(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -385,13 +424,13 @@ def iteredges(self, prefix=""): if not edge_follower.start(index, prefix): return - vals = self.b_get_value(bytes(edge_follower.key)) or [False] + vals = self.b_get_value(bytes(edge_follower.key)) or [None] for val in vals: - yield (edge_follower.decoded_key, val or False) + yield (edge_follower.decoded_key, val) while edge_follower.next(): - vals = self.b_get_value(bytes(edge_follower.key)) or [False] + vals = self.b_get_value(bytes(edge_follower.key)) or [None] for val in vals: - yield (edge_follower.decoded_key, val or False) + yield (edge_follower.decoded_key, val) def _has_value(self, index): return self.dct.follow_bytes(PAYLOAD_SEPARATOR, index) @@ -548,6 +587,43 @@ def edges(self, prefix=""): return res = [] + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return res + + res.append((edge_follower.decoded_key, edge_follower.has_value())) + while edge_follower.next(): + res.append((edge_follower.decoded_key, edge_follower.has_value())) + + return res + + def iteredges(self, prefix=""): + index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + if not edge_follower.start(index, prefix): + return + + yield (edge_follower.decoded_key, edge_follower.has_value()) + while edge_follower.next(): + yield (edge_follower.decoded_key, edge_follower.has_value()) + + def edges_data(self, prefix=""): + index = self.dct.ROOT + if not isinstance(prefix, bytes): + prefix = prefix.encode('utf8') + if prefix: + index = self.dct.follow_bytes(prefix, index) + if not index: + return + res = [] + edge_follower = wrapper.EdgeFollower(self.dct, self.guide) if not edge_follower.start(index, prefix): return res @@ -558,7 +634,7 @@ def edges(self, prefix=""): return res - def iteredges(self, prefix=""): + def iteredges_data(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 68a0da4..3b2f3cd 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -105,6 +105,12 @@ def value(self): if self._dic.has_value(self._cur_index): return self._dic.value(self._cur_index) + return None + + def has_value(self): + "boolean telling whether or not cur_index has a value" + if self._dic.has_value(self._cur_index): + return True return False def start(self, index, prefix=b""): diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index 22866ca..d0d7042 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -61,14 +61,27 @@ def test_iterkeys(self): def test_edges(self): d = self.dawg() assert d.edges('foob') == [('fooba', False)] - assert d.edges('fooba') == [('foobar', b'data4')] - assert d.edges('fo') == [('foo', b'data1'), ('foo', b'data3')] + assert d.edges('fooba') == [('foobar', True)] + assert d.edges('fo') == [('foo', True)] def test_iteredges(self): d = self.dawg() assert list(d.iteredges('foob')) == [('fooba', False)] - assert list(d.iteredges('fooba')) == [('foobar', b'data4')] - assert list(d.iteredges('fo')) == [('foo', b'data1'), ('foo', b'data3')] + assert list(d.iteredges('fooba')) == [('foobar', True)] + assert list(d.iteredges('fo')) == [('foo', True)] + + def test_edges_data(self): + d = self.dawg() + assert d.edges_data('foob') == [('fooba', None)] + assert d.edges_data('fooba') == [('foobar', b'data4')] + assert d.edges_data('fo') == [('foo', b'data1'), ('foo', b'data3')] + + def test_iteredges_data(self): + d = self.dawg() + assert list(d.iteredges_data('foob')) == [('fooba', None)] + assert list(d.iteredges_data('fooba')) == [('foobar', b'data4')] + assert list(d.iteredges_data('fo')) == \ + [('foo', b'data1'), ('foo', b'data3')] def test_key_completion(self): d = self.dawg() From 8cb08f348bd9d0a70ff59bc12197b76fb2511fc4 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Wed, 15 Apr 2015 19:02:21 -0700 Subject: [PATCH 10/19] forgot to add one test in last commit --- tests/test_dawg.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/test_dawg.py b/tests/test_dawg.py index fd16768..b3d2796 100644 --- a/tests/test_dawg.py +++ b/tests/test_dawg.py @@ -133,11 +133,21 @@ def test_completion_items(self): assert self.dawg().items() == sorted(self.payload.items(), key=lambda r: r[0]) def test_completion_edges(self): - assert self.dawg().edges('ba') == [('bar', 5)] + assert self.dawg().edges('ba') == [('bar', True)] assert self.dawg().edges('foob') == [('fooba', False)] - assert self.dawg().edges('fooba') == [('foobar', 30)] + assert self.dawg().edges('fooba') == [('foobar', True)] def test_completion_iteredges(self): - assert list(self.dawg().iteredges('ba')) == [('bar', 5)] + assert list(self.dawg().iteredges('ba')) == [('bar', True)] assert list(self.dawg().iteredges('foob')) == [('fooba', False)] - assert list(self.dawg().iteredges('fooba')) == [('foobar', 30)] + assert list(self.dawg().iteredges('fooba')) == [('foobar', True)] + + def test_completion_edges_data(self): + assert self.dawg().edges_data('ba') == [('bar', 5)] + assert self.dawg().edges_data('foob') == [('fooba', None)] + assert self.dawg().edges_data('fooba') == [('foobar', 30)] + + def test_completion_iteredges_data(self): + assert list(self.dawg().iteredges_data('ba')) == [('bar', 5)] + assert list(self.dawg().iteredges_data('foob')) == [('fooba', None)] + assert list(self.dawg().iteredges_data('fooba')) == [('foobar', 30)] From f3baac8455a2e48f1edc5db8188d6cc53dd6db8a Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Wed, 15 Apr 2015 19:46:51 -0700 Subject: [PATCH 11/19] adding tests for RecordDawg edges_data() and edgesiter_data() --- dawg_python/dawgs.py | 7 +++++++ tests/test_payload_dawg.py | 13 +++++++++++++ 2 files changed, 20 insertions(+) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index eae1628..64cd6ef 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -537,12 +537,19 @@ def _value_for_index(self, index): def items(self, prefix=""): res = super(RecordDAWG, self).items(prefix) + print("items data:") + print(res) return [(key, self._struct.unpack(val)) for (key, val) in res] def iteritems(self, prefix=""): res = super(RecordDAWG, self).iteritems(prefix) return ((key, self._struct.unpack(val)) for (key, val) in res) + #def edges_data(self, prefix=""): + # return super(RecordDAWG, self).edges_data(prefix) + + #def iteredges_data(self, prefix=""): + # return super(RecordDAWG, self).iteredges_data(prefix) LOOKUP_ERROR = -1 diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index d0d7042..c6fbe3c 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -149,6 +149,19 @@ def test_record_items(self): d = self.dawg() assert d.items() == sorted(self.STRUCTURED_DATA) + def test_edges_data(self): + d = self.dawg() + assert d.edges_data('foob') == [('fooba', None)] + assert d.edges_data('fooba') == [('foobar', (6, 3, 0))] + assert d.edges_data('fo') == [('foo', (3, 2, 1)), ('foo', (3, 2, 256))] + + def test_iteredges_data(self): + d = self.dawg() + assert list(d.iteredges_data('foob')) == [('fooba', None)] + assert list(d.iteredges_data('fooba')) == [('foobar', (6, 3, 0))] + assert list(d.iteredges_data('fo')) == [('foo', (3, 2, 1)), + ('foo', (3, 2, 256))] + def test_record_keys(self): d = self.dawg() assert d.keys() == ['bar', 'foo', 'foo', 'foobar',] From 77f38024980005c216035dff4c2d727109a84870 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Sun, 19 Apr 2015 22:49:03 -0400 Subject: [PATCH 12/19] don't treat payload_separator as a normal edge --- dawg_python/dawgs.py | 13 +++++++++---- dawg_python/wrapper.py | 16 +++++++++++----- tests/test_dawg.py | 6 ++++++ tests/test_payload_dawg.py | 2 ++ 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index 64cd6ef..c5cff41 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -215,6 +215,7 @@ class BytesDAWG(CompletionDAWG): def __init__(self, payload_separator=PAYLOAD_SEPARATOR): self._payload_separator = payload_separator + self._payload_separator_byte = bytearray(payload_separator).pop() def __contains__(self, key): if not isinstance(key, bytes): @@ -360,7 +361,8 @@ def edges(self, prefix=""): return res = [] - edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + edge_follower = wrapper.EdgeFollower(self.dct, self.guide, + self._payload_separator_byte) if not edge_follower.start(index, prefix): return res @@ -380,7 +382,8 @@ def iteredges(self, prefix=""): if not index: return - edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + edge_follower = wrapper.EdgeFollower(self.dct, self.guide, + self._payload_separator_byte) if not edge_follower.start(index, prefix): return @@ -400,7 +403,8 @@ def edges_data(self, prefix=""): return res = [] - edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + edge_follower = wrapper.EdgeFollower(self.dct, self.guide, + self._payload_separator_byte) if not edge_follower.start(index, prefix): return res @@ -420,7 +424,8 @@ def iteredges_data(self, prefix=""): if not index: return - edge_follower = wrapper.EdgeFollower(self.dct, self.guide) + edge_follower = wrapper.EdgeFollower(self.dct, self.guide, + self._payload_separator_byte) if not edge_follower.start(index, prefix): return diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 3b2f3cd..9450028 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -96,7 +96,8 @@ def size(self): class EdgeFollower(object): - def __init__(self, dic=None, guide=None): + def __init__(self, dic=None, guide=None, payload_separator=1): + self._payload_separator = payload_separator self._dic = dic self._guide = guide @@ -132,9 +133,13 @@ def start(self, index, prefix=b""): if index is not None: self._sib_index = next_index self._cur_index = self._sib_index - self.key.append(child_label) - self.decoded_key = self.key.decode('utf8') - return True + #skip if the child is \x01 (the divider char) + if child_label == self._payload_separator: + self.next() + else: + self.key.append(child_label) + self.decoded_key = self.key.decode('utf8') + return True def next(self): "Gets the next edge (not necessarily a terminal)" @@ -148,7 +153,8 @@ def next(self): self._cur_index = self._sib_index if not self._sib_index: return False - + if sibling_label == self._payload_separator: + self.next() self.key = self.key[:self.base_key_len] self.key.append(sibling_label) try: diff --git a/tests/test_dawg.py b/tests/test_dawg.py index b3d2796..b4fd0d1 100644 --- a/tests/test_dawg.py +++ b/tests/test_dawg.py @@ -51,6 +51,7 @@ def test_edges(self): assert d.edges() == [('b', False), ('f', True)] assert d.edges('b') == [('ba', False)] assert d.edges('fo') == [('foo', True)] + assert d.edges('foobar') == [] def test_iterkeys(self): d = self.dawg() @@ -61,6 +62,7 @@ def test_iter_edges(self): assert list(d.iteredges()) == [('b', False), ('f', True)] assert list(d.iteredges('b')) == [('ba', False)] assert list(d.edges('fo')) == [('foo', True)] + assert list(d.edges('foobar')) == [] def test_completion(self): d = self.dawg() @@ -136,18 +138,22 @@ def test_completion_edges(self): assert self.dawg().edges('ba') == [('bar', True)] assert self.dawg().edges('foob') == [('fooba', False)] assert self.dawg().edges('fooba') == [('foobar', True)] + assert self.dawg().edges('foobar') == [] def test_completion_iteredges(self): assert list(self.dawg().iteredges('ba')) == [('bar', True)] assert list(self.dawg().iteredges('foob')) == [('fooba', False)] assert list(self.dawg().iteredges('fooba')) == [('foobar', True)] + assert list(self.dawg().iteredges('foobar')) == [] def test_completion_edges_data(self): assert self.dawg().edges_data('ba') == [('bar', 5)] assert self.dawg().edges_data('foob') == [('fooba', None)] assert self.dawg().edges_data('fooba') == [('foobar', 30)] + assert self.dawg().edges_data('foobar') == [] def test_completion_iteredges_data(self): assert list(self.dawg().iteredges_data('ba')) == [('bar', 5)] assert list(self.dawg().iteredges_data('foob')) == [('fooba', None)] assert list(self.dawg().iteredges_data('fooba')) == [('foobar', 30)] + assert list(self.dawg().iteredges_data('foobar')) == [] diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index c6fbe3c..709158e 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -75,6 +75,7 @@ def test_edges_data(self): assert d.edges_data('foob') == [('fooba', None)] assert d.edges_data('fooba') == [('foobar', b'data4')] assert d.edges_data('fo') == [('foo', b'data1'), ('foo', b'data3')] + assert d.edges_data('foobar') == [] def test_iteredges_data(self): d = self.dawg() @@ -82,6 +83,7 @@ def test_iteredges_data(self): assert list(d.iteredges_data('fooba')) == [('foobar', b'data4')] assert list(d.iteredges_data('fo')) == \ [('foo', b'data1'), ('foo', b'data3')] + assert list(d.iteredges_data('foobar')) == [] def test_key_completion(self): d = self.dawg() From ae7472ab7c5d9e7f6257e5f97da705b0d3a8cdb2 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Mon, 20 Apr 2015 01:19:52 -0400 Subject: [PATCH 13/19] use ord instead of hacking with bytearray --- dawg_python/dawgs.py | 9 ++++----- dawg_python/wrapper.py | 8 ++++---- tests/test_payload_dawg.py | 4 ++++ 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index c5cff41..e13549a 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -215,7 +215,6 @@ class BytesDAWG(CompletionDAWG): def __init__(self, payload_separator=PAYLOAD_SEPARATOR): self._payload_separator = payload_separator - self._payload_separator_byte = bytearray(payload_separator).pop() def __contains__(self, key): if not isinstance(key, bytes): @@ -362,7 +361,7 @@ def edges(self, prefix=""): res = [] edge_follower = wrapper.EdgeFollower(self.dct, self.guide, - self._payload_separator_byte) + self._payload_separator) if not edge_follower.start(index, prefix): return res @@ -383,7 +382,7 @@ def iteredges(self, prefix=""): return edge_follower = wrapper.EdgeFollower(self.dct, self.guide, - self._payload_separator_byte) + self._payload_separator) if not edge_follower.start(index, prefix): return @@ -404,7 +403,7 @@ def edges_data(self, prefix=""): res = [] edge_follower = wrapper.EdgeFollower(self.dct, self.guide, - self._payload_separator_byte) + self._payload_separator) if not edge_follower.start(index, prefix): return res @@ -425,7 +424,7 @@ def iteredges_data(self, prefix=""): return edge_follower = wrapper.EdgeFollower(self.dct, self.guide, - self._payload_separator_byte) + self._payload_separator) if not edge_follower.start(index, prefix): return diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 9450028..41b3fd0 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -96,8 +96,8 @@ def size(self): class EdgeFollower(object): - def __init__(self, dic=None, guide=None, payload_separator=1): - self._payload_separator = payload_separator + def __init__(self, dic=None, guide=None, payload_separator=b'\x01'): + self._payload_separator = ord(payload_separator) self._dic = dic self._guide = guide @@ -135,7 +135,7 @@ def start(self, index, prefix=b""): self._cur_index = self._sib_index #skip if the child is \x01 (the divider char) if child_label == self._payload_separator: - self.next() + return self.next() else: self.key.append(child_label) self.decoded_key = self.key.decode('utf8') @@ -154,7 +154,7 @@ def next(self): if not self._sib_index: return False if sibling_label == self._payload_separator: - self.next() + return self.next() self.key = self.key[:self.base_key_len] self.key.append(sibling_label) try: diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index 709158e..9156246 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -63,12 +63,14 @@ def test_edges(self): assert d.edges('foob') == [('fooba', False)] assert d.edges('fooba') == [('foobar', True)] assert d.edges('fo') == [('foo', True)] + assert d.edges('foo') == [('foob', False)] def test_iteredges(self): d = self.dawg() assert list(d.iteredges('foob')) == [('fooba', False)] assert list(d.iteredges('fooba')) == [('foobar', True)] assert list(d.iteredges('fo')) == [('foo', True)] + assert list(d.iteredges('foo')) == [('foob', False)] def test_edges_data(self): d = self.dawg() @@ -76,6 +78,7 @@ def test_edges_data(self): assert d.edges_data('fooba') == [('foobar', b'data4')] assert d.edges_data('fo') == [('foo', b'data1'), ('foo', b'data3')] assert d.edges_data('foobar') == [] + assert d.edges_data('foo') == [('foob', None)] def test_iteredges_data(self): d = self.dawg() @@ -84,6 +87,7 @@ def test_iteredges_data(self): assert list(d.iteredges_data('fo')) == \ [('foo', b'data1'), ('foo', b'data3')] assert list(d.iteredges_data('foobar')) == [] + assert list(d.iteredges_data('foo')) == [('foob', None)] def test_key_completion(self): d = self.dawg() From 4975f07c0b971c366e79e34958c4c2bb213ddd3a Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Mon, 27 Apr 2015 11:52:25 -0700 Subject: [PATCH 14/19] adding unicode tests; starting to fix multibyte unicode issues --- dawg_python/dawgs.py | 36 ++++++++---------- dawg_python/wrapper.py | 19 +++++++++- dev_data/small/bytes.dawg | Bin 1544 -> 1544 bytes tests/test_dawg.py | 71 ++++++++++++++++++------------------ tests/test_payload_dawg.py | 73 +++++++++++++++++++++---------------- 5 files changed, 109 insertions(+), 90 deletions(-) diff --git a/dawg_python/dawgs.py b/dawg_python/dawgs.py index e13549a..93e2522 100644 --- a/dawg_python/dawgs.py +++ b/dawg_python/dawgs.py @@ -141,7 +141,7 @@ def keys(self, prefix=""): return res - def edges(self, prefix=""): + def children(self, prefix=""): b_prefix = prefix.encode('utf8') res = [] @@ -153,13 +153,13 @@ def edges(self, prefix=""): if not edge_follower.start(index, b_prefix): return res - res.append(edge_follower.get_cur_edge()) + res.append(edge_follower.get_cur_child()) while edge_follower.next(): - res.append(edge_follower.get_cur_edge()) + res.append(edge_follower.get_cur_child()) return res - def iteredges(self, prefix=""): + def iterchildren(self, prefix=""): b_prefix = prefix.encode('utf8') index = self.dct.follow_bytes(b_prefix, self.dct.ROOT) @@ -170,9 +170,9 @@ def iteredges(self, prefix=""): if not edge_follower.start(index, b_prefix): return - yield edge_follower.get_cur_edge() + yield edge_follower.get_cur_child() while edge_follower.next(): - yield edge_follower.get_cur_edge() + yield edge_follower.get_cur_child() def iterkeys(self, prefix=""): b_prefix = prefix.encode('utf8') @@ -350,7 +350,7 @@ def iteritems(self, prefix=""): item = (key.decode('utf8'), a2b_base64(bytes(value))) yield item - def edges(self, prefix=""): + def children(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -372,7 +372,7 @@ def edges(self, prefix=""): res.append((edge_follower.decoded_key, val)) return res - def iteredges(self, prefix=""): + def iterchildren(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -392,7 +392,7 @@ def iteredges(self, prefix=""): val = True if self._follow_key(bytes(edge_follower.key)) else False yield (edge_follower.decoded_key, val) - def edges_data(self, prefix=""): + def children_data(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -414,7 +414,7 @@ def edges_data(self, prefix=""): res.extend([(edge_follower.decoded_key, val) for val in vals]) return res - def iteredges_data(self, prefix=""): + def iterchildren_data(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -485,7 +485,6 @@ def similar_items(self, key, replaces): """ return self._similar_items("", key, self.dct.ROOT, replaces) - def _similar_item_values(self, start_pos, key, index, replace_chars): res = [] end_pos = len(key) @@ -549,14 +548,9 @@ def iteritems(self, prefix=""): res = super(RecordDAWG, self).iteritems(prefix) return ((key, self._struct.unpack(val)) for (key, val) in res) - #def edges_data(self, prefix=""): - # return super(RecordDAWG, self).edges_data(prefix) - - #def iteredges_data(self, prefix=""): - # return super(RecordDAWG, self).iteredges_data(prefix) - LOOKUP_ERROR = -1 + class IntDAWG(DAWG): """ Dict-like class based on DAWG. @@ -588,7 +582,7 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG): Dict-like class based on DAWG. It can store integer values for unicode keys and support key completion. """ - def edges(self, prefix=""): + def children(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -608,7 +602,7 @@ def edges(self, prefix=""): return res - def iteredges(self, prefix=""): + def iterchildren(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -625,7 +619,7 @@ def iteredges(self, prefix=""): while edge_follower.next(): yield (edge_follower.decoded_key, edge_follower.has_value()) - def edges_data(self, prefix=""): + def children_data(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') @@ -645,7 +639,7 @@ def edges_data(self, prefix=""): return res - def iteredges_data(self, prefix=""): + def iterchildren_data(self, prefix=""): index = self.dct.ROOT if not isinstance(prefix, bytes): prefix = prefix.encode('utf8') diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 41b3fd0..d2dd363 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -2,6 +2,7 @@ from __future__ import absolute_import, unicode_literals import struct import array +import pdb from . import units from .compat import int_from_byte @@ -142,7 +143,7 @@ def start(self, index, prefix=b""): return True def next(self): - "Gets the next edge (not necessarily a terminal)" + "Gets the next child (not necessarily a terminal)" if not self._sib_index: return False @@ -176,7 +177,7 @@ def next(self): pass return True - def get_cur_edge(self): + def get_cur_child(self): """helper method for getting the decoded key along with whether or not it is a terminal""" @@ -266,3 +267,17 @@ def _find_terminal(self, index): self._last_index = index return True + + +#the first byte in a utf-8 char determines how many total bytes are in the char. +#the number of bytes = number of leading ones in first byte (i.e. e5 = 225 = +#3 bytes (including the first) +def levels_to_descend(byte_val): + if byte_val < 128: + return 0 + elif byte_val < 192: + return 1 + elif byte_val < 224: + return 2 + else: + return 3 diff --git a/dev_data/small/bytes.dawg b/dev_data/small/bytes.dawg index debaacb3857aa0fda98084b04eafaecbb49f7a01..9ca2377fcf9d52a86165dd774dfedb6dc84438ed 100644 GIT binary patch literal 1544 zcmd_p+f&S89LMp`GcjYjcF~0k7hSs8Xa@Y_uKFDdA`r{+uyS?l8TLd$mu6}n~Y3` zQC6FgC(0->+9;txvSk~kJB^%!jFLJed$bhUo4HbcsNN{lM=}BC^fI!MOYFd$J{n4t%5%F^lzb0 zHua&@ZS0S|7+CxBaSaw51qK_9gw)jv2_uG|a+m6ksM+U?s}1 z9ILSg~VLd9a78|h%mDqs&H~_YjH*pX5aR+yC z1y{i<*JWJC4K(8#Uf?A-HhPZNc!PGl!b3d51GM5Pp5ZZ`;2XZ<3%=qPe&Yvz;vL?j z18?yOpYZ`7A!D5}Z*vZNYxsPXDr96=wH^Jl1k04 z{GtM?Ztl|LVpf*d78%E1{9P>W$K$f%E8y!>R`q)d&Mu~@PW(pD-+-cQQ<9i*`n rH)B>J#5LTRaWLW`X{CalyG&t@z4Na7KOT=iZ}(D<_k}%H|BvDys7~H% literal 1544 zcmd7R>9f~U7y$6+%w(qNwHIFWvTa0e*;@RR>h>$Ta_fh>h5NhJZPgu#EJ=GL$xaI) zyNXl_5w5yOSyGl3va}#c6d|8)W9m<+7oYDr^PJ~7=R0Sf6DcC5r>J6N!-(SA6sa;r z_0AL(r=-X?O3|e@qGWK2K7|y8)+xH}jmR~MXmFC+h;cQ`Q`FRtNW<0COHm>(f3mu| zh(et@@nWt0C8tsU@0+b>`_4>J+Cg4-dHqt9caLZ+&-T%jCX`i1G&45mR8HqK&Y%S? zDd9|7aW-dhF6YpO*0iH7InJX!=W_ua=tM_K=}ew3bfYU5aUo^+&FI0!lyeEa=*guN zxSY%ALvQ+0!4>r5Dz2nRCH<*lAOjdgHA5K8Fotq9!>M5e*HGoct~F+Nd>x}0&Gp>K z4cyF4+`<@cWh~xYc$_Dh&(l1`vpmD|JjViFU=a&>i5Gd9#VlnBuds|2Eax>|WhJlkCU5W- zt9XaEd6!z&u$uR%;{)F3L)NmMb$rZ6e99+$#s)UAkuUh1ulSNJY-Sr<`G&9gmhF7c zckJK?cCnKm+0D=V#4qgOSN8H7`}m#x9N-WB9^Z6(C@&5sYI-vXj diff --git a/tests/test_dawg.py b/tests/test_dawg.py index b4fd0d1..81fdd77 100644 --- a/tests/test_dawg.py +++ b/tests/test_dawg.py @@ -8,6 +8,7 @@ from .utils import data_path + def test_c_dawg_contains(): dawg = pytest.importorskip("dawg") # import dawg bin_dawg = dawg.IntDAWG({'foo': 1, 'bar': 2, 'foobar': 3}) @@ -30,7 +31,8 @@ class TestCompletionDAWG(object): keys = ['f', 'bar', 'foo', 'foobar'] def dawg(self): - return dawg_python.CompletionDAWG().load(data_path('small', 'completion.dawg')) + return dawg_python.CompletionDAWG().load(data_path('small', + 'completion.dawg')) def test_contains(self): d = self.dawg() @@ -46,23 +48,23 @@ def test_keys(self): d = self.dawg() assert d.keys() == sorted(self.keys) - def test_edges(self): + def test_children(self): d = self.dawg() - assert d.edges() == [('b', False), ('f', True)] - assert d.edges('b') == [('ba', False)] - assert d.edges('fo') == [('foo', True)] - assert d.edges('foobar') == [] + assert d.children() == [('b', False), ('f', True)] + assert d.children('b') == [('ba', False)] + assert d.children('fo') == [('foo', True)] + assert d.children('foobar') == [] def test_iterkeys(self): d = self.dawg() assert list(d.iterkeys()) == d.keys() - def test_iter_edges(self): + def test_iter_children(self): d = self.dawg() - assert list(d.iteredges()) == [('b', False), ('f', True)] - assert list(d.iteredges('b')) == [('ba', False)] - assert list(d.edges('fo')) == [('foo', True)] - assert list(d.edges('foobar')) == [] + assert list(d.iterchildren()) == [('b', False), ('f', True)] + assert list(d.iterchildren('b')) == [('ba', False)] + assert list(d.children('fo')) == [('foo', True)] + assert list(d.children('foobar')) == [] def test_completion(self): d = self.dawg() @@ -91,7 +93,6 @@ def test_prefixes(self): assert d.prefixes("bar") == ["bar"] - class TestIntDAWG(object): payload = {'foo': 1, 'bar': 5, 'foobar': 30} @@ -134,26 +135,26 @@ def test_completion_keys_with_prefix(self): def test_completion_items(self): assert self.dawg().items() == sorted(self.payload.items(), key=lambda r: r[0]) - def test_completion_edges(self): - assert self.dawg().edges('ba') == [('bar', True)] - assert self.dawg().edges('foob') == [('fooba', False)] - assert self.dawg().edges('fooba') == [('foobar', True)] - assert self.dawg().edges('foobar') == [] - - def test_completion_iteredges(self): - assert list(self.dawg().iteredges('ba')) == [('bar', True)] - assert list(self.dawg().iteredges('foob')) == [('fooba', False)] - assert list(self.dawg().iteredges('fooba')) == [('foobar', True)] - assert list(self.dawg().iteredges('foobar')) == [] - - def test_completion_edges_data(self): - assert self.dawg().edges_data('ba') == [('bar', 5)] - assert self.dawg().edges_data('foob') == [('fooba', None)] - assert self.dawg().edges_data('fooba') == [('foobar', 30)] - assert self.dawg().edges_data('foobar') == [] - - def test_completion_iteredges_data(self): - assert list(self.dawg().iteredges_data('ba')) == [('bar', 5)] - assert list(self.dawg().iteredges_data('foob')) == [('fooba', None)] - assert list(self.dawg().iteredges_data('fooba')) == [('foobar', 30)] - assert list(self.dawg().iteredges_data('foobar')) == [] + def test_completion_children(self): + assert self.dawg().children('ba') == [('bar', True)] + assert self.dawg().children('foob') == [('fooba', False)] + assert self.dawg().children('fooba') == [('foobar', True)] + assert self.dawg().children('foobar') == [] + + def test_completion_iterchildren(self): + assert list(self.dawg().iterchildren('ba')) == [('bar', True)] + assert list(self.dawg().iterchildren('foob')) == [('fooba', False)] + assert list(self.dawg().iterchildren('fooba')) == [('foobar', True)] + assert list(self.dawg().iterchildren('foobar')) == [] + + def test_completion_children_data(self): + assert self.dawg().children_data('ba') == [('bar', 5)] + assert self.dawg().children_data('foob') == [('fooba', None)] + assert self.dawg().children_data('fooba') == [('foobar', 30)] + assert self.dawg().children_data('foobar') == [] + + def test_completion_iterchildren_data(self): + assert list(self.dawg().iterchildren_data('ba')) == [('bar', 5)] + assert list(self.dawg().iterchildren_data('foob')) == [('fooba', None)] + assert list(self.dawg().iterchildren_data('fooba')) == [('foobar', 30)] + assert list(self.dawg().iterchildren_data('foobar')) == [] diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index 9156246..3257414 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -11,7 +11,10 @@ class TestBytesDAWG(object): ('foo', b'data1'), ('bar', b'data2'), ('foo', b'data3'), - ('foobar', b'data4') + ('foobar', b'data4'), + (u'ሀ', b'ethiopic_sign1'), + (u'ሮ', b'ethiopic_sign2'), + (u'ቄ', b'ethiopic_sign3') ) def dawg(self): @@ -33,6 +36,7 @@ def test_getitem(self): assert d['foo'] == [b'data1', b'data3'] assert d['bar'] == [b'data2'] assert d['foobar'] == [b'data4'] + assert d[u'ቄ'] == [b'ethiopic_sign3'] def test_getitem_missing(self): @@ -52,42 +56,47 @@ def test_getitem_missing(self): def test_keys(self): d = self.dawg() - assert d.keys() == ['bar', 'foo', 'foo', 'foobar'] + assert d.keys() == [u'bar', u'foo', u'foo', u'foobar', u'ሀ', u'ሮ', + u'ቄ'] def test_iterkeys(self): d = self.dawg() assert list(d.iterkeys()) == d.keys() - def test_edges(self): + def test_children(self): d = self.dawg() - assert d.edges('foob') == [('fooba', False)] - assert d.edges('fooba') == [('foobar', True)] - assert d.edges('fo') == [('foo', True)] - assert d.edges('foo') == [('foob', False)] + assert d.children('foob') == [('fooba', False)] + assert d.children('fooba') == [('foobar', True)] + assert d.children('fo') == [('foo', True)] + assert d.children('foo') == [('foob', False)] - def test_iteredges(self): + def test_iterchildren(self): d = self.dawg() - assert list(d.iteredges('foob')) == [('fooba', False)] - assert list(d.iteredges('fooba')) == [('foobar', True)] - assert list(d.iteredges('fo')) == [('foo', True)] - assert list(d.iteredges('foo')) == [('foob', False)] + assert list(d.iterchildren('foob')) == [('fooba', False)] + assert list(d.iterchildren('fooba')) == [('foobar', True)] + assert list(d.iterchildren('fo')) == [('foo', True)] + assert list(d.iterchildren('foo')) == [('foob', False)] - def test_edges_data(self): + def test_children_data(self): d = self.dawg() - assert d.edges_data('foob') == [('fooba', None)] - assert d.edges_data('fooba') == [('foobar', b'data4')] - assert d.edges_data('fo') == [('foo', b'data1'), ('foo', b'data3')] - assert d.edges_data('foobar') == [] - assert d.edges_data('foo') == [('foob', None)] + assert d.children_data('foob') == [('fooba', None)] + assert d.children_data('fooba') == [('foobar', b'data4')] + assert d.children_data('fo') == [('foo', b'data1'), ('foo', b'data3')] + assert d.children_data('foobar') == [] + assert d.children_data('foo') == [('foob', None)] + assert set(d.children_data('')) == set([('b', None), ('f', None), + (u'ሀ', b'ethiopic_sign1'), + (u'ሮ', b'ethiopic_sign2'), + (u'ቄ', b'ethiopic_sign3')]) - def test_iteredges_data(self): + def test_iterchildren_data(self): d = self.dawg() - assert list(d.iteredges_data('foob')) == [('fooba', None)] - assert list(d.iteredges_data('fooba')) == [('foobar', b'data4')] - assert list(d.iteredges_data('fo')) == \ + assert list(d.iterchildren_data('foob')) == [('fooba', None)] + assert list(d.iterchildren_data('fooba')) == [('foobar', b'data4')] + assert list(d.iterchildren_data('fo')) == \ [('foo', b'data1'), ('foo', b'data3')] - assert list(d.iteredges_data('foobar')) == [] - assert list(d.iteredges_data('foo')) == [('foob', None)] + assert list(d.iterchildren_data('foobar')) == [] + assert list(d.iterchildren_data('foo')) == [('foob', None)] def test_key_completion(self): d = self.dawg() @@ -155,17 +164,17 @@ def test_record_items(self): d = self.dawg() assert d.items() == sorted(self.STRUCTURED_DATA) - def test_edges_data(self): + def test_children_data(self): d = self.dawg() - assert d.edges_data('foob') == [('fooba', None)] - assert d.edges_data('fooba') == [('foobar', (6, 3, 0))] - assert d.edges_data('fo') == [('foo', (3, 2, 1)), ('foo', (3, 2, 256))] + assert d.children_data('foob') == [('fooba', None)] + assert d.children_data('fooba') == [('foobar', (6, 3, 0))] + assert d.children_data('fo') == [('foo', (3, 2, 1)), ('foo', (3, 2, 256))] - def test_iteredges_data(self): + def test_iterchildren_data(self): d = self.dawg() - assert list(d.iteredges_data('foob')) == [('fooba', None)] - assert list(d.iteredges_data('fooba')) == [('foobar', (6, 3, 0))] - assert list(d.iteredges_data('fo')) == [('foo', (3, 2, 1)), + assert list(d.iterchildren_data('foob')) == [('fooba', None)] + assert list(d.iterchildren_data('fooba')) == [('foobar', (6, 3, 0))] + assert list(d.iterchildren_data('fo')) == [('foo', (3, 2, 1)), ('foo', (3, 2, 256))] def test_record_keys(self): From 1207380bf2db90e2844f2a81c9f5659b2d93b757 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Tue, 28 Apr 2015 00:33:29 -0700 Subject: [PATCH 15/19] working for all but multibyte --- dawg_python/wrapper.py | 82 ++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index d2dd363..f105101 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -123,59 +123,63 @@ def start(self, index, prefix=b""): self.key = bytearray(prefix) self.base_key_len = len(self.key) self._parent_index = index - self._sib_index = None - self._cur_index = None + self._sib_index_stack = [] if self._guide.size(): - child_label = self._guide.child(index) # UCharType - + child_label = self._guide.child(index) if child_label: # Follows a transition to the first child. - next_index = self._dic.follow_char(child_label, index) + child_index = self._dic.follow_char(child_label, index) if index is not None: - self._sib_index = next_index - self._cur_index = self._sib_index + self._sib_index_stack.append( + (child_index, 0, None, bytearray())) #skip if the child is \x01 (the divider char) if child_label == self._payload_separator: return self.next() else: - self.key.append(child_label) - self.decoded_key = self.key.decode('utf8') - return True + return self._get_next_multibyte( + child_label, child_index, None, bytearray()) + return False + + def _get_next_multibyte(self, child_label, index, lvls=None, + part_key=None): + """given some child_label and its index, goes down the approp num levels + to get the first decodable chr""" + part_key.append(child_label) + if lvls is None: + lvls = levels_to_descend(child_label) + if lvls > 0: + pdb.set_trace() + for i in xrange(lvls): + next_child_label = self._guide.child(index) + prev_index = index + index = self._dic.follow_char(next_child_label, index) + self._sib_index_stack.append( + (index, i, prev_index, part_key[:])) + part_key.append(next_child_label) + self.key.extend(part_key) + self.decoded_key = self.key.decode('utf8') + self._cur_index = index + return True def next(self): "Gets the next child (not necessarily a terminal)" - if not self._sib_index: + if not self._sib_index_stack: return False - - sibling_label = self._guide.sibling(self._sib_index) - self._sib_index = self._dic.follow_char(sibling_label, - self._parent_index) - self._cur_index = self._sib_index - if not self._sib_index: + sib_index, lvls, parent_index, part_key = self._sib_index_stack.pop() + if not parent_index: + parent_index = self._parent_index + sibling_label = self._guide.sibling(sib_index) + sib_index = self._dic.follow_char(sibling_label, parent_index) + #pdb.set_trace() + if not sib_index: return False + self._sib_index_stack.append((sib_index, None, None, bytearray())) if sibling_label == self._payload_separator: return self.next() self.key = self.key[:self.base_key_len] - self.key.append(sibling_label) - try: - self.decoded_key = self.key.decode('utf8') - except UnicodeDecodeError: - #this sibling is a multibyte char. keep following its children til - #something is decodable - while True: - child_label = self._guide.child(self._sib_index) - self._cur_index = self._dic.follow_char(child_label, - self._cur_index) - if not self._cur_index: - return False - self.key.append(child_label) - try: - self.decoded_key = self.key.decode('utf8') - break - except UnicodeDecodeError: - pass - return True + return self._get_next_multibyte(sibling_label, sib_index, lvls, + part_key) def get_cur_child(self): """helper method for getting the decoded key along with whether or not @@ -273,11 +277,11 @@ def _find_terminal(self, index): #the number of bytes = number of leading ones in first byte (i.e. e5 = 225 = #3 bytes (including the first) def levels_to_descend(byte_val): - if byte_val < 128: + if byte_val < 192: return 0 - elif byte_val < 192: - return 1 elif byte_val < 224: + return 1 + elif byte_val < 240: return 2 else: return 3 From 54629163f010d6da620b04a3d32d438569aa3738 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Tue, 28 Apr 2015 01:48:42 -0700 Subject: [PATCH 16/19] working with multibyte unicode tests --- dawg_python/wrapper.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index f105101..104a771 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -2,7 +2,6 @@ from __future__ import absolute_import, unicode_literals import struct import array -import pdb from . import units from .compat import int_from_byte @@ -148,8 +147,7 @@ def _get_next_multibyte(self, child_label, index, lvls=None, if lvls is None: lvls = levels_to_descend(child_label) if lvls > 0: - pdb.set_trace() - for i in xrange(lvls): + for i in reversed(xrange(lvls)): next_child_label = self._guide.child(index) prev_index = index index = self._dic.follow_char(next_child_label, index) @@ -171,10 +169,12 @@ def next(self): parent_index = self._parent_index sibling_label = self._guide.sibling(sib_index) sib_index = self._dic.follow_char(sibling_label, parent_index) - #pdb.set_trace() if not sib_index: - return False - self._sib_index_stack.append((sib_index, None, None, bytearray())) + return self.next() + if lvls == 0: + lvls = None + self._sib_index_stack.append( + (sib_index, lvls, parent_index, part_key[:])) if sibling_label == self._payload_separator: return self.next() self.key = self.key[:self.base_key_len] From 0b81a9fede3ec31906a1a0415206e2512bc9808e Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Tue, 28 Apr 2015 01:52:14 -0700 Subject: [PATCH 17/19] xrange -> range for py34 compatibility --- dawg_python/wrapper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dawg_python/wrapper.py b/dawg_python/wrapper.py index 104a771..be49230 100644 --- a/dawg_python/wrapper.py +++ b/dawg_python/wrapper.py @@ -147,7 +147,7 @@ def _get_next_multibyte(self, child_label, index, lvls=None, if lvls is None: lvls = levels_to_descend(child_label) if lvls > 0: - for i in reversed(xrange(lvls)): + for i in reversed(range(lvls)): next_child_label = self._guide.child(index) prev_index = index index = self._dic.follow_char(next_child_label, index) From 2cbd3404148f627fe03cb850194d150a122d7abd Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Tue, 28 Apr 2015 02:16:46 -0700 Subject: [PATCH 18/19] removing literal unicode for py32 compatibility in tests --- tests/test_payload_dawg.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index 3257414..28cba8c 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -12,9 +12,9 @@ class TestBytesDAWG(object): ('bar', b'data2'), ('foo', b'data3'), ('foobar', b'data4'), - (u'ሀ', b'ethiopic_sign1'), - (u'ሮ', b'ethiopic_sign2'), - (u'ቄ', b'ethiopic_sign3') + (u'\u1200', b'ethiopic_sign1'), + (u'\u122e', b'ethiopic_sign2'), + (u'\u1244', b'ethiopic_sign3') ) def dawg(self): @@ -36,7 +36,7 @@ def test_getitem(self): assert d['foo'] == [b'data1', b'data3'] assert d['bar'] == [b'data2'] assert d['foobar'] == [b'data4'] - assert d[u'ቄ'] == [b'ethiopic_sign3'] + assert d[u'\u1244'] == [b'ethiopic_sign3'] def test_getitem_missing(self): From f56e2b9530dbc39cf2c3b1cceb8adefe48fba1a2 Mon Sep 17 00:00:00 2001 From: Eli Finkelshteyn Date: Tue, 28 Apr 2015 03:01:53 -0700 Subject: [PATCH 19/19] removing 'u' literal prefixes --- tests/test_payload_dawg.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tests/test_payload_dawg.py b/tests/test_payload_dawg.py index 28cba8c..0fa0825 100644 --- a/tests/test_payload_dawg.py +++ b/tests/test_payload_dawg.py @@ -12,9 +12,9 @@ class TestBytesDAWG(object): ('bar', b'data2'), ('foo', b'data3'), ('foobar', b'data4'), - (u'\u1200', b'ethiopic_sign1'), - (u'\u122e', b'ethiopic_sign2'), - (u'\u1244', b'ethiopic_sign3') + ('ሀ', b'ethiopic_sign1'), + ('ሮ', b'ethiopic_sign2'), + ('ቄ', b'ethiopic_sign3') ) def dawg(self): @@ -36,7 +36,7 @@ def test_getitem(self): assert d['foo'] == [b'data1', b'data3'] assert d['bar'] == [b'data2'] assert d['foobar'] == [b'data4'] - assert d[u'\u1244'] == [b'ethiopic_sign3'] + assert d['\u1244'] == [b'ethiopic_sign3'] def test_getitem_missing(self): @@ -56,8 +56,7 @@ def test_getitem_missing(self): def test_keys(self): d = self.dawg() - assert d.keys() == [u'bar', u'foo', u'foo', u'foobar', u'ሀ', u'ሮ', - u'ቄ'] + assert d.keys() == ['bar', 'foo', 'foo', 'foobar', 'ሀ', 'ሮ', 'ቄ'] def test_iterkeys(self): d = self.dawg() @@ -85,9 +84,9 @@ def test_children_data(self): assert d.children_data('foobar') == [] assert d.children_data('foo') == [('foob', None)] assert set(d.children_data('')) == set([('b', None), ('f', None), - (u'ሀ', b'ethiopic_sign1'), - (u'ሮ', b'ethiopic_sign2'), - (u'ቄ', b'ethiopic_sign3')]) + ('ሀ', b'ethiopic_sign1'), + ('ሮ', b'ethiopic_sign2'), + ('ቄ', b'ethiopic_sign3')]) def test_iterchildren_data(self): d = self.dawg()