Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding edges() and iteredges() Functions for DAWGs #1

Open
wants to merge 19 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
926d6e8
working edges() for all but utf-8
EliFinkelshteyn Apr 13, 2015
fa6cd76
edges() working with unicode; added working iteredges(); added basic …
EliFinkelshteyn Apr 13, 2015
8e7390a
adding terminal indicators on edges; adding edge values for IntComple…
EliFinkelshteyn Apr 14, 2015
0211c19
adding tests for larger values to intdawg and intcompletiondawg; repl…
EliFinkelshteyn Apr 14, 2015
30bf53b
edges() and iteredges() now work for all applicable dawgs; tests adde…
EliFinkelshteyn Apr 15, 2015
15355be
items() miss returns empty list; adding test for this; moving appropr…
EliFinkelshteyn Apr 15, 2015
dee560c
b_get_value should always get bytes, not decoded unicode; utf8 should…
EliFinkelshteyn Apr 15, 2015
2a93173
adding explicit bytes() cast for b_get_value() for python 2.x compati…
EliFinkelshteyn Apr 15, 2015
c94b4d8
edges and iter_edges always return boolean terminal; adding edges_dat…
EliFinkelshteyn Apr 16, 2015
8cb08f3
forgot to add one test in last commit
EliFinkelshteyn Apr 16, 2015
f3baac8
adding tests for RecordDawg edges_data() and edgesiter_data()
EliFinkelshteyn Apr 16, 2015
77f3802
don't treat payload_separator as a normal edge
EliFinkelshteyn Apr 20, 2015
ae7472a
use ord instead of hacking with bytearray
EliFinkelshteyn Apr 20, 2015
4975f07
adding unicode tests; starting to fix multibyte unicode issues
EliFinkelshteyn Apr 27, 2015
1207380
working for all but multibyte
EliFinkelshteyn Apr 28, 2015
5462916
working with multibyte unicode tests
EliFinkelshteyn Apr 28, 2015
0b81a9f
xrange -> range for py34 compatibility
EliFinkelshteyn Apr 28, 2015
2cbd340
removing literal unicode for py32 compatibility in tests
EliFinkelshteyn Apr 28, 2015
f56e2b9
removing 'u' literal prefixes
EliFinkelshteyn Apr 28, 2015
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 118 additions & 7 deletions dawg_python/dawgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,39 @@ def keys(self, prefix=""):

return res

def edges(self, prefix=""):
    """Return the edges that leave ``prefix`` as a list.

    Each element is what ``EdgeFollower.get_cur_edge()`` reports for one
    outgoing edge.  The list is empty when ``prefix`` cannot be followed
    in the dictionary or when the follower finds no edge to start on.
    """
    encoded_prefix = prefix.encode('utf8')

    start_index = self.dct.follow_bytes(encoded_prefix, self.dct.ROOT)
    if start_index is None:
        return []

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    if not follower.start(start_index, encoded_prefix):
        return []

    found = [follower.get_cur_edge()]
    while follower.next():
        found.append(follower.get_cur_edge())
    return found

def iteredges(self, prefix=""):
    """Lazily yield the edges that leave ``prefix``.

    Yields what ``EdgeFollower.get_cur_edge()`` reports for each outgoing
    edge; yields nothing when ``prefix`` cannot be followed in the
    dictionary or the follower finds no edge to start on.
    """
    encoded_prefix = prefix.encode('utf8')

    start_index = self.dct.follow_bytes(encoded_prefix, self.dct.ROOT)
    if start_index is None:
        return

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    more = follower.start(start_index, encoded_prefix)
    while more:
        yield follower.get_cur_edge()
        more = follower.next()

def iterkeys(self, prefix=""):
b_prefix = prefix.encode('utf8')
index = self.dct.follow_bytes(b_prefix, self.dct.ROOT)
Expand Down Expand Up @@ -279,15 +312,14 @@ def iterkeys(self, prefix=""):
yield u_key

def items(self, prefix=""):
index = self.dct.ROOT
if not isinstance(prefix, bytes):
prefix = prefix.encode('utf8')
res = []

index = self.dct.ROOT
if prefix:
index = self.dct.follow_bytes(prefix, index)
if not index:
return res
return []
res = []

completer = wrapper.Completer(self.dct, self.guide)
completer.start(index, prefix)
Expand All @@ -301,10 +333,9 @@ def items(self, prefix=""):
return res

def iteritems(self, prefix=""):
index = self.dct.ROOT
if not isinstance(prefix, bytes):
prefix = prefix.encode('utf8')

index = self.dct.ROOT
if prefix:
index = self.dct.follow_bytes(prefix, index)
if not index:
Expand All @@ -315,9 +346,52 @@ def iteritems(self, prefix=""):

while completer.next():
key, value = completer.key.split(self._payload_separator)
item = (key.decode('utf8'), a2b_base64(bytes(value))) # bytes() cast is a python 2.6 fix
# bytes() cast is a python 2.6 fix
item = (key.decode('utf8'), a2b_base64(bytes(value)))
yield item

def edges(self, prefix=""):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that the .edges method should return the same data regardless of DAWG class. If it returns a list of strings in a base class, it should return a list of strings in all subclasses.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For BytesDAWG it could make sense to filter out edges leading to the values.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's similar data for all. It never returns a list of strings. It always returns a list of 2-tuples. For dawgs with no data, the tuples are (str, True) for terminal edges and (str, False) for non-terminals.

For dawgs with data, they're (str, data) for terminal edges, and (str, False) for non-terminals. Since data evaluates to true in a boolean situation, this seems most logical to me. If you want the data in an edge, you have it. If you want to just use the edges and know whether they're terminals or not, you can do that the same way across dawgs.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we really want them to be the same, we could make them return (str, True) for terminal edges always, and just add an extra edges_with_data() method for dawgs that provide any kind of data storage. That actually seems most consistent to me. If you agree, I'll make that addition.

index = self.dct.ROOT
if not isinstance(prefix, bytes):
prefix = prefix.encode('utf8')
if prefix:
index = self.dct.follow_bytes(prefix, index)
if not index:
return
res = []

edge_follower = wrapper.EdgeFollower(self.dct, self.guide)
if not edge_follower.start(index, prefix):
return res

vals = self.b_get_value(bytes(edge_follower.key)) or [False]
res.extend([(edge_follower.decoded_key, val) for val in vals])
while edge_follower.next():
vals = self.b_get_value(bytes(edge_follower.key)) or [False]
res.extend([(edge_follower.decoded_key, val) for val in vals])

return res

def iteredges(self, prefix=""):
    """Lazily yield ``(decoded_key, value)`` pairs for edges under ``prefix``.

    ``prefix`` may be text (encoded as UTF-8 here) or bytes.  For every
    edge, one pair is yielded per value that ``b_get_value`` returns for
    the edge's key; when it returns nothing truthy the edge is yielded
    once with ``False``.  Nothing is yielded for a non-empty prefix that
    cannot be followed, or when the follower finds no edge to start on.
    """
    start_index = self.dct.ROOT
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')
    if prefix:
        start_index = self.dct.follow_bytes(prefix, start_index)
        if not start_index:
            return

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    more = follower.start(start_index, prefix)
    while more:
        payloads = self.b_get_value(bytes(follower.key)) or [False]
        for payload in payloads:
            yield (follower.decoded_key, payload or False)
        more = follower.next()

def _has_value(self, index):
    """Follow the payload separator from ``index``.

    Returns whatever ``dct.follow_bytes`` returns for that transition.
    """
    separator_index = self.dct.follow_bytes(PAYLOAD_SEPARATOR, index)
    return separator_index
Expand Down Expand Up @@ -464,6 +538,43 @@ class IntCompletionDAWG(CompletionDAWG, IntDAWG):
Dict-like class based on DAWG.
It can store integer values for unicode keys and support key completion.
"""
def edges(self, prefix=""):
    """Return the edges that leave ``prefix`` together with their values.

    ``prefix`` may be text (encoded as UTF-8 here) or bytes.

    :return: a list of ``(decoded_key, value)`` tuples, where ``value`` is
        what ``EdgeFollower.value()`` reports for the edge.  The list is
        empty when a non-empty prefix cannot be followed or when the
        follower finds no edge to start on.
    """
    index = self.dct.ROOT
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')
    if prefix:
        index = self.dct.follow_bytes(prefix, index)
        if not index:
            # Unknown prefix: return an empty list (this used to fall
            # through a bare ``return`` and hand callers None).
            return []

    edge_follower = wrapper.EdgeFollower(self.dct, self.guide)
    if not edge_follower.start(index, prefix):
        return []

    res = [(edge_follower.decoded_key, edge_follower.value())]
    while edge_follower.next():
        res.append((edge_follower.decoded_key, edge_follower.value()))
    return res

def iteredges(self, prefix=""):
    """Lazily yield ``(decoded_key, value)`` pairs for edges under ``prefix``.

    ``prefix`` may be text (encoded as UTF-8 here) or bytes.  ``value`` is
    what ``EdgeFollower.value()`` reports for the edge.  Nothing is
    yielded for a non-empty prefix that cannot be followed, or when the
    follower finds no edge to start on.
    """
    start_index = self.dct.ROOT
    if not isinstance(prefix, bytes):
        prefix = prefix.encode('utf8')
    if prefix:
        start_index = self.dct.follow_bytes(prefix, start_index)
        if not start_index:
            return

    follower = wrapper.EdgeFollower(self.dct, self.guide)
    more = follower.start(start_index, prefix)
    while more:
        yield (follower.decoded_key, follower.value())
        more = follower.next()

def items(self, prefix=""):
if not isinstance(prefix, bytes):
prefix = prefix.encode('utf8')
Expand Down
96 changes: 87 additions & 9 deletions dawg_python/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,29 +17,29 @@ def __init__(self):
"Root index"

def has_value(self, index):
"Checks if a given index is related to the end of a key."
#Checks if a given index is related to the end of a key.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why are comments better than docstrings? It is nice to have some docs available at runtime, e.g. in REPL.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair. I changed it because PEP8 checks were complaining, and I never use docs at runtime, so forgot about that. I'm pretty neutral on this, so I'll change them back.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think using triple quotes and replacing "checks/gets/reads/..." with "check/get/read" should make them pep8-compatible.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code currently isn't pep8 compatible anyway because some older lines go over 80 chars, etc... I'm thinking let's leave this as is for now, and I'll do a later run through to get everything pep8 compatible so the whole repo passes pep8 compatibility checks.

return units.has_leaf(self._units[index])

def value(self, index):
    """Get the value stored for the given index.

    The unit holding the value is located by XOR-ing ``index`` with the
    offset of the unit at ``index`` and masking with ``PRECISION_MASK``.
    """
    unit_offset = units.offset(self._units[index])
    value_index = (index ^ unit_offset) & units.PRECISION_MASK
    value_unit = self._units[value_index]
    return units.value(value_unit)

def read(self, fp):
    """Read a dictionary from an input stream.

    The first four bytes of ``fp`` are unpacked with the ``"=I"`` struct
    format to get the number of units; that many units are then loaded
    from ``fp`` into ``self._units``.
    """
    header = fp.read(4)
    (unit_count,) = struct.unpack(str("=I"), header)
    self._units.fromfile(fp, unit_count)

def contains(self, key):
    """Exact matching.

    Follows ``key`` from the root; returns False when it cannot be
    followed, otherwise whatever ``has_value`` reports for the index
    reached.
    """
    end_index = self.follow_bytes(key, self.ROOT)
    return False if end_index is None else self.has_value(end_index)

def find(self, key):
"Exact matching (returns value)"
#Exact matching (returns value)
index = self.follow_bytes(key, self.ROOT)
if index is None:
return -1
Expand All @@ -48,7 +48,7 @@ def find(self, key):
return self.value(index)

def follow_char(self, label, index):
"Follows a transition"
#Follows a transition
offset = units.offset(self._units[index])
next_index = (index ^ offset ^ label) & units.PRECISION_MASK

Expand All @@ -58,7 +58,7 @@ def follow_char(self, label, index):
return next_index

def follow_bytes(self, s, index):
"Follows transitions."
#Follows transitions.
for ch in s:
index = self.follow_char(int_from_byte(ch), index)
if index is None:
Expand Down Expand Up @@ -95,16 +95,96 @@ def size(self):
return len(self._units)


class EdgeFollower(object):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 for separating Completer and EdgeFollower

def __init__(self, dic=None, guide=None):
    """Remember the dictionary and guide used to walk edges.

    ``dic`` is asked for transitions and values (``follow_char``,
    ``has_value``, ``value``); ``guide`` is asked for child and sibling
    labels (``size``, ``child``, ``sibling``).
    """
    self._dic = dic
    self._guide = guide

def value(self):
    """Return the value at the current index, or False if it has none."""
    current = self._cur_index
    dic = self._dic
    return dic.value(current) if dic.has_value(current) else False

def start(self, index, prefix=b""):
    """Position the follower on the first outgoing edge of ``index``.

    Resets all traversal state, then follows the first child label the
    guide reports for ``index``.  If the resulting key is not yet valid
    UTF-8 (the label is the lead byte of a multibyte character), first
    children keep being followed until the key decodes, mirroring what
    ``next()`` does for siblings.

    :param index: dictionary index of the node whose edges are walked.
    :param prefix: bytes leading to ``index``; every key starts with them.
    :return: True when an edge was found, in which case ``key``,
        ``decoded_key`` and the current index describe it; False when
        there is no edge, in which case the current edge must not be read.
    """
    self.key = bytearray(prefix)
    self.base_key_len = len(self.key)
    self._parent_index = index
    self._sib_index = None
    self._cur_index = None

    if not self._guide.size():
        return False

    child_label = self._guide.child(index)  # UCharType
    if not child_label:
        return False

    # Follows a transition to the first child.
    next_index = self._dic.follow_char(child_label, index)
    # Check the transition result, not ``index``: testing ``index`` let a
    # failed transition through as if it were a valid edge.
    if next_index is None:
        return False

    self._sib_index = next_index
    self._cur_index = next_index
    self.key.append(child_label)

    while True:
        try:
            self.decoded_key = self.key.decode('utf8')
        except UnicodeDecodeError:
            # The key ends inside a multibyte character; extend it with
            # the first child of the node reached so far.
            child_label = self._guide.child(self._cur_index)
            self._cur_index = self._dic.follow_char(child_label,
                                                    self._cur_index)
            if not self._cur_index:
                return False
            self.key.append(child_label)
        else:
            return True

def next(self):
    """Move to the next sibling edge (not necessarily a terminal).

    Asks the guide for the sibling label of the current sibling node and
    follows it from the parent.  The key is reset to the prefix plus that
    label.  If the key is not valid UTF-8 yet (the label starts a
    multibyte character), first children are followed from the node
    reached so far until the key decodes.

    :return: True when the follower now sits on another edge; False when
        there is no further sibling or the multibyte descent cannot
        continue.
    """
    if not self._sib_index:
        return False

    sibling_label = self._guide.sibling(self._sib_index)
    self._sib_index = self._dic.follow_char(sibling_label,
                                            self._parent_index)
    self._cur_index = self._sib_index
    if not self._sib_index:
        return False

    # Drop whatever the previous edge appended after the prefix.
    self.key = self.key[:self.base_key_len]
    self.key.append(sibling_label)

    while True:
        try:
            self.decoded_key = self.key.decode('utf8')
        except UnicodeDecodeError:
            # This sibling starts a multibyte char.  Descend from the node
            # reached so far; asking for the child of ``_sib_index`` on
            # every pass repeats the second byte's label and breaks
            # characters of three or more bytes.
            child_label = self._guide.child(self._cur_index)
            self._cur_index = self._dic.follow_char(child_label,
                                                    self._cur_index)
            if not self._cur_index:
                return False
            self.key.append(child_label)
        else:
            return True

def get_cur_edge(self):
    """Return ``(decoded_key, has_value)`` for the current edge.

    The second element is what the dictionary's ``has_value`` reports for
    the current index, i.e. whether the edge ends a key.
    """
    decoded = self.decoded_key
    terminal = self._dic.has_value(self._cur_index)
    return (decoded, terminal)


class Completer(object):

def __init__(self, dic=None, guide=None):
    """Remember the dictionary and guide this completer works with."""
    self._dic = dic
    self._guide = guide

def value(self):
    """Return what the dictionary stores at the last visited index."""
    last_index = self._last_index
    return self._dic.value(last_index)

def start(self, index, prefix=b""):
"initial setup for a completer next() action on some prefix"

self.key = bytearray(prefix)

if self._guide.size():
Expand All @@ -113,7 +193,6 @@ def start(self, index, prefix=b""):
else:
self._index_stack = []


def next(self):
"Gets the next key"

Expand Down Expand Up @@ -153,7 +232,6 @@ def next(self):

return self._find_terminal(index)


def _follow(self, label, index):
next_index = self._dic.follow_char(label, index)
if next_index is None:
Expand Down
Binary file modified dev_data/small/int_completion_dawg.dawg
Binary file not shown.
Binary file modified dev_data/small/int_dawg.dawg
Binary file not shown.
24 changes: 23 additions & 1 deletion tests/test_dawg.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,22 @@ def test_keys(self):
d = self.dawg()
assert d.keys() == sorted(self.keys)

def test_edges(self):
    """edges() lists the child edges of a prefix with a terminal flag."""
    dawg = self.dawg()

    root_edges = dawg.edges()
    assert root_edges == [('b', False), ('f', True)]

    under_b = dawg.edges('b')
    assert under_b == [('ba', False)]

    under_fo = dawg.edges('fo')
    assert under_fo == [('foo', True)]

def test_iterkeys(self):
    """iterkeys() must produce exactly what keys() returns, in order."""
    dawg = self.dawg()
    lazy_keys = list(dawg.iterkeys())
    assert lazy_keys == dawg.keys()

def test_iter_edges(self):
    """iteredges() yields the child edges of a prefix with a terminal flag."""
    d = self.dawg()
    assert list(d.iteredges()) == [('b', False), ('f', True)]
    assert list(d.iteredges('b')) == [('ba', False)]
    # Exercise iteredges() here too; this assertion used to call edges(),
    # leaving the generator untested for a terminal child.
    assert list(d.iteredges('fo')) == [('foo', True)]

def test_completion(self):
d = self.dawg()

Expand Down Expand Up @@ -79,7 +91,7 @@ def test_prefixes(self):


class TestIntDAWG(object):
payload = {'foo': 1, 'bar': 5, 'foobar': 3}
payload = {'foo': 1, 'bar': 5, 'foobar': 30}

def dawg(self):
return dawg_python.IntDAWG().load(data_path('small', 'int_dawg.dawg'))
Expand Down Expand Up @@ -119,3 +131,13 @@ def test_completion_keys_with_prefix(self):

def test_completion_items(self):
    """items() returns the payload's (key, value) pairs sorted by key."""
    actual = self.dawg().items()
    expected = sorted(self.payload.items(), key=lambda pair: pair[0])
    assert actual == expected

def test_completion_edges(self):
    """edges() pairs each child edge with its stored value, or False."""
    cases = (
        ('ba', [('bar', 5)]),
        ('foob', [('fooba', False)]),
        ('fooba', [('foobar', 30)]),
    )
    for prefix, expected in cases:
        # A fresh dawg is loaded for every case, as before.
        assert self.dawg().edges(prefix) == expected

def test_completion_iteredges(self):
    """iteredges() pairs each child edge with its stored value, or False."""
    cases = (
        ('ba', [('bar', 5)]),
        ('foob', [('fooba', False)]),
        ('fooba', [('foobar', 30)]),
    )
    for prefix, expected in cases:
        # A fresh dawg is loaded for every case, as before.
        assert list(self.dawg().iteredges(prefix)) == expected
Loading