Skip to content

Commit

Permalink
SentencePieceProcessor: Handle zero-length protobufs during deserialization
Browse files Browse the repository at this point in the history
  • Loading branch information
shadeMe committed Jul 4, 2023
1 parent 4bdaf4c commit 74288ba
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
2 changes: 2 additions & 0 deletions curated_tokenizers/_spp.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ cdef class SentencePieceProcessor:
protocol buffer.
"""
cdef SentencePieceProcessor processor = SentencePieceProcessor.__new__(SentencePieceProcessor)
if len(protobuf) == 0:
return processor
cdef string_view protobuf_view = string_view(protobuf, len(protobuf))
_check_status(deref(processor.spp).LoadFromSerializedProto(protobuf_view))
return processor
Expand Down
6 changes: 5 additions & 1 deletion curated_tokenizers/tests/test_sp.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pathlib import Path
import pytest

import pytest
from curated_tokenizers import SentencePieceProcessor


Expand All @@ -22,6 +22,10 @@ def test_load_proto(test_dir):
serialized_data = spp.to_protobuf()
assert serialized_data == data

# Zero-length buffer.
spp = SentencePieceProcessor.from_protobuf(bytes())
assert spp.to_protobuf() == bytes()


def test_load_unknown_file():
with pytest.raises(OSError, match=r"No such file"):
Expand Down

0 comments on commit 74288ba

Please sign in to comment.