Remove code violating deepcopy semantics

Add comments
explosion · Jul 6, 2023 · 88f983c
1 parent 74288ba
commit 88f983c
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 12 deletions.
diff --git a/curated_tokenizers/_bbpe.pyx b/curated_tokenizers/_bbpe.pyx
@@ -70,9 +70,9 @@ cdef class ByteBPEProcessor:
         return ByteBPEProcessor(vocab=self.vocab, merges=self.merges)
 
     def __deepcopy__(self, memo):
-        result = ByteBPEProcessor(vocab=self.vocab, merges=self.merges)
-        memo[id(self)] = result
-        return result
+        # We don't need a deepcopy of the vocab and merges dicts as their
+        # contents will be copied into a backing store in the c'tor.
+        return ByteBPEProcessor(vocab=self.vocab, merges=self.merges)
 
     @staticmethod
     def load_from_files(*, vocab: Path, merges: Path) -> ByteBPEProcessor:

diff --git a/curated_tokenizers/_spp.pyx b/curated_tokenizers/_spp.pyx
@@ -43,6 +43,7 @@ cdef class SentencePieceProcessor:
         """
         cdef SentencePieceProcessor processor = SentencePieceProcessor.__new__(SentencePieceProcessor)
         if len(protobuf) == 0:
+            # SentencePiece returns an empty protobuf for uninitialized models.
             return processor
         cdef string_view protobuf_view = string_view(protobuf, len(protobuf))
         _check_status(deref(processor.spp).LoadFromSerializedProto(protobuf_view))

diff --git a/curated_tokenizers/_wordpiece.pyx b/curated_tokenizers/_wordpiece.pyx
@@ -19,17 +19,11 @@ cdef class WordPieceProcessor:
             self._pieces.add_piece(byte_array, is_initial)
 
     def __copy__(self):
-        cls = self.__class__
-        data = self.to_list()
-        result = cls(data)
-        return result
+        # This is essentially a deepcopy, but there's no better way to do it.
+        return WordPieceProcessor(self.to_list())
 
     def __deepcopy__(self, memo):
-        cls = self.__class__
-        data = self.to_list()
-        result = cls(data)
-        memo[id(self)] = result
-        return result
+        return WordPieceProcessor(self.to_list())
 
     def encode(self, token: str) -> Tuple[List[int], List[str]]:
         """