diff --git a/curated_tokenizers/_bbpe.pyx b/curated_tokenizers/_bbpe.pyx
index 823776e..98a9c70 100644
--- a/curated_tokenizers/_bbpe.pyx
+++ b/curated_tokenizers/_bbpe.pyx
@@ -70,9 +70,9 @@ cdef class ByteBPEProcessor:
         return ByteBPEProcessor(vocab=self.vocab, merges=self.merges)
 
     def __deepcopy__(self, memo):
-        result = ByteBPEProcessor(vocab=self.vocab, merges=self.merges)
-        memo[id(self)] = result
-        return result
+        # We don't need a deepcopy of the vocab and merges dicts as their
+        # contents will be copied into a backing store in the c'tor.
+        return ByteBPEProcessor(vocab=self.vocab, merges=self.merges)
 
     @staticmethod
     def load_from_files(*, vocab: Path, merges: Path) -> ByteBPEProcessor:
diff --git a/curated_tokenizers/_spp.pyx b/curated_tokenizers/_spp.pyx
index 4383ed0..ad1cba9 100644
--- a/curated_tokenizers/_spp.pyx
+++ b/curated_tokenizers/_spp.pyx
@@ -43,6 +43,7 @@ cdef class SentencePieceProcessor:
         """
         cdef SentencePieceProcessor processor = SentencePieceProcessor.__new__(SentencePieceProcessor)
         if len(protobuf) == 0:
+            # SentencePiece returns an empty protobuf for uninitialized models.
             return processor
         cdef string_view protobuf_view = string_view(protobuf, len(protobuf))
         _check_status(deref(processor.spp).LoadFromSerializedProto(protobuf_view))
diff --git a/curated_tokenizers/_wordpiece.pyx b/curated_tokenizers/_wordpiece.pyx
index 84ffd3b..204ff1c 100644
--- a/curated_tokenizers/_wordpiece.pyx
+++ b/curated_tokenizers/_wordpiece.pyx
@@ -19,17 +19,11 @@ cdef class WordPieceProcessor:
             self._pieces.add_piece(byte_array, is_initial)
 
     def __copy__(self):
-        cls = self.__class__
-        data = self.to_list()
-        result = cls(data)
-        return result
+        # This is essentially a deepcopy, but there's no better way to do it.
+        return WordPieceProcessor(self.to_list())
 
     def __deepcopy__(self, memo):
-        cls = self.__class__
-        data = self.to_list()
-        result = cls(data)
-        memo[id(self)] = result
-        return result
+        return WordPieceProcessor(self.to_list())
 
     def encode(self, token: str) -> Tuple[List[int], List[str]]:
         """
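
Usage sketch (not part of the patch): with the changes above, copy.copy() and copy.deepcopy() rebuild each processor through its constructor, so the result is an independent object with its own backing store. The top-level import path and the file names below are assumptions for illustration only; load_from_files is the method shown in the patch.

    import copy
    from pathlib import Path

    from curated_tokenizers import ByteBPEProcessor  # import path assumed

    # Hypothetical vocab/merges files.
    proc = ByteBPEProcessor.load_from_files(
        vocab=Path("vocab.json"), merges=Path("merges.txt")
    )

    clone = copy.deepcopy(proc)  # reconstructed from proc.vocab / proc.merges
    assert clone is not proc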