From 64967ebb821cfb05ee014e99032259766205948c Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 17 Aug 2022 17:37:52 +0200 Subject: [PATCH 01/30] Remove `thinc.extra.search` module and related tests (moved to spaCy) (#743) --- setup.py | 5 +- thinc/extra/__init__.pxd | 0 thinc/extra/__init__.py | 0 thinc/extra/search.pxd | 92 -------- thinc/extra/search.pyx | 302 -------------------------- thinc/extra/tests/__init__.py | 0 thinc/extra/tests/c_test_search.pyx | 81 ------- thinc/tests/extra/__init__.py | 0 thinc/tests/extra/test_beam_search.py | 5 - 9 files changed, 3 insertions(+), 482 deletions(-) delete mode 100644 thinc/extra/__init__.pxd delete mode 100644 thinc/extra/__init__.py delete mode 100644 thinc/extra/search.pxd delete mode 100644 thinc/extra/search.pyx delete mode 100644 thinc/extra/tests/__init__.py delete mode 100644 thinc/extra/tests/c_test_search.pyx delete mode 100644 thinc/tests/extra/__init__.py delete mode 100644 thinc/tests/extra/test_beam_search.py diff --git a/setup.py b/setup.py index 27873beeb..50f1c65cc 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ "thinc.backends.cblas", "thinc.backends.linalg", "thinc.backends.numpy_ops", - "thinc.extra.search", "thinc.layers.sparselinear", ] COMPILE_OPTIONS = { @@ -106,7 +105,9 @@ def setup_package(): ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs) ext_modules.append(ext) print("Cythonizing sources") - ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES, language_level=2) + ext_modules = cythonize( + ext_modules, compiler_directives=COMPILER_DIRECTIVES, language_level=2 + ) setup( name="thinc", diff --git a/thinc/extra/__init__.pxd b/thinc/extra/__init__.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/extra/__init__.py b/thinc/extra/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/extra/search.pxd b/thinc/extra/search.pxd deleted file mode 100644 index daccbf58e..000000000 --- a/thinc/extra/search.pxd +++ /dev/null @@ -1,92 +0,0 @@ -from cymem.cymem cimport Pool - -from libc.stdint cimport uint32_t -from libc.stdint cimport uint64_t -from libcpp.pair cimport pair -from libcpp.queue cimport priority_queue -from libcpp.vector cimport vector - -ctypedef uint64_t hash_t -ctypedef uint64_t class_t -ctypedef float weight_t - - -ctypedef pair[weight_t, size_t] Entry -ctypedef priority_queue[Entry] Queue - - -ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1 - -ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL - -ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1 - -ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1 - -ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0 - - -cdef struct _State: - void* content - class_t* hist - weight_t score - weight_t loss - int i - int t - bint is_done - - -cdef class Beam: - cdef Pool mem - cdef class_t nr_class - cdef class_t width - cdef class_t size - cdef public weight_t min_density - cdef int t - cdef readonly bint is_done - cdef list histories - cdef list _parent_histories - cdef weight_t** scores - cdef int** is_valid - cdef weight_t** costs - cdef _State* _parents - cdef _State* _states - cdef del_func_t del_func - - cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1 - - cdef inline void* at(self, int i) nogil: - return self._states[i].content - - cdef int initialize(self, init_func_t init_func, del_func_t del_func, int 
n, void* extra_args) except -1 - cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func, - void* extra_args) except -1 - cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1 - - - cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil: - self.scores[i][j] = score - self.is_valid[i][j] = is_valid - self.costs[i][j] = cost - - cdef int set_row(self, int i, const weight_t* scores, const int* is_valid, - const weight_t* costs) except -1 - cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1 - - -cdef class MaxViolation: - cdef Pool mem - cdef weight_t cost - cdef weight_t delta - cdef readonly weight_t p_score - cdef readonly weight_t g_score - cdef readonly double Z - cdef readonly double gZ - cdef class_t n - cdef readonly list p_hist - cdef readonly list g_hist - cdef readonly list p_probs - cdef readonly list g_probs - - cpdef int check(self, Beam pred, Beam gold) except -1 - cpdef int check_crf(self, Beam pred, Beam gold) except -1 diff --git a/thinc/extra/search.pyx b/thinc/extra/search.pyx deleted file mode 100644 index d69756551..000000000 --- a/thinc/extra/search.pyx +++ /dev/null @@ -1,302 +0,0 @@ -# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True -cimport cython -from libc.string cimport memset, memcpy -from libc.math cimport log, exp -import math - -from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap - - -cdef class Beam: - def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0): - assert nr_class != 0 - assert width != 0 - self.nr_class = nr_class - self.width = width - self.min_density = min_density - self.size = 1 - self.t = 0 - self.mem = Pool() - self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State)) - self._states = <_State*>self.mem.alloc(self.width, sizeof(_State)) - cdef int i - self.histories = [[] for i in range(self.width)] - self._parent_histories = [[] for i in range(self.width)] - - self.scores = self.mem.alloc(self.width, sizeof(weight_t*)) - self.is_valid = self.mem.alloc(self.width, sizeof(weight_t*)) - self.costs = self.mem.alloc(self.width, sizeof(weight_t*)) - for i in range(self.width): - self.scores[i] = self.mem.alloc(self.nr_class, sizeof(weight_t)) - self.is_valid[i] = self.mem.alloc(self.nr_class, sizeof(int)) - self.costs[i] = self.mem.alloc(self.nr_class, sizeof(weight_t)) - - def __len__(self): - return self.size - - property score: - def __get__(self): - return self._states[0].score - - property min_score: - def __get__(self): - return self._states[self.size-1].score - - property loss: - def __get__(self): - return self._states[0].loss - - property probs: - def __get__(self): - return _softmax([self._states[i].score for i in range(self.size)]) - - property scores: - def __get__(self): - return [self._states[i].score for i in range(self.size)] - - property histories: - def __get__(self): - return self.histories - - cdef int set_row(self, int i, const weight_t* scores, const int* is_valid, - const weight_t* costs) except -1: - cdef int j - for j in range(self.nr_class): - self.scores[i][j] = scores[j] - self.is_valid[i][j] = is_valid[j] - self.costs[i][j] = costs[j] - - cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1: - cdef int i, j - for i in range(self.width): - memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class) - memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class) - 
memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class) - - cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1: - for i in range(self.width): - self._states[i].content = init_func(self.mem, n, extra_args) - self._parents[i].content = init_func(self.mem, n, extra_args) - self.del_func = del_func - - def __dealloc__(self): - for i in range(self.width): - self.del_func(self.mem, self._states[i].content, NULL) - self.del_func(self.mem, self._parents[i].content, NULL) - - @cython.cdivision(True) - cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func, - void* extra_args) except -1: - cdef weight_t** scores = self.scores - cdef int** is_valid = self.is_valid - cdef weight_t** costs = self.costs - - cdef Queue* q = new Queue() - self._fill(q, scores, is_valid) - # For a beam of width k, we only ever need 2k state objects. How? - # Each transition takes a parent and a class and produces a new state. - # So, we don't need the whole history --- just the parent. So at - # each step, we take a parent, and apply one or more extensions to - # it. - self._parents, self._states = self._states, self._parents - self._parent_histories, self.histories = self.histories, self._parent_histories - cdef weight_t score - cdef int p_i - cdef int i = 0 - cdef class_t clas - cdef _State* parent - cdef _State* state - cdef hash_t key - cdef PreshMap seen_states = PreshMap(self.width) - cdef uint64_t is_seen - cdef uint64_t one = 1 - while i < self.width and not q.empty(): - data = q.top() - p_i = data.second / self.nr_class - clas = data.second % self.nr_class - score = data.first - q.pop() - parent = &self._parents[p_i] - # Indicates terminal state reached; i.e. state is done - if parent.is_done: - # Now parent will not be changed, so we don't have to copy. - # Once finished, should also be unbranching. 
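 # (For example, with beam width k the two buffers `_parents`/`_states` hold
 # at most 2k states in total: the swap below moves a finished parent into
 # `_states[i]` unchanged, so it stays in the beam without being extended
 # further.)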
- self._states[i], parent[0] = parent[0], self._states[i] - parent.i = self._states[i].i - parent.t = self._states[i].t - parent.is_done = self._states[i].t - self._states[i].score = score - self.histories[i] = list(self._parent_histories[p_i]) - i += 1 - else: - state = &self._states[i] - # The supplied transition function should adjust the destination - # state to be the result of applying the class to the source state - transition_func(state.content, parent.content, clas, extra_args) - key = hash_func(state.content, extra_args) if hash_func is not NULL else 0 - is_seen = seen_states.get(key) - if key == 0 or key == 1 or not is_seen: - if key != 0 and key != 1: - seen_states.set(key, one) - state.score = score - state.loss = parent.loss + costs[p_i][clas] - self.histories[i] = list(self._parent_histories[p_i]) - self.histories[i].append(clas) - i += 1 - del q - self.size = i - assert self.size >= 1 - for i in range(self.width): - memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class) - memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class) - memset(self.is_valid[i], 0, sizeof(int) * self.nr_class) - self.t += 1 - - cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1: - cdef int i - for i in range(self.size): - if not self._states[i].is_done: - self._states[i].is_done = finish_func(self._states[i].content, extra_args) - for i in range(self.size): - if not self._states[i].is_done: - self.is_done = False - break - else: - self.is_done = True - - @cython.cdivision(True) - cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1: - """Populate the queue from a k * n matrix of scores, where k is the - beam-width, and n is the number of classes. - """ - cdef Entry entry - cdef weight_t score - cdef _State* s - cdef int i, j, move_id - assert self.size >= 1 - cdef vector[Entry] entries - for i in range(self.size): - s = &self._states[i] - move_id = i * self.nr_class - if s.is_done: - # Update score by path average, following TACL '13 paper. - if self.histories[i]: - entry.first = s.score + (s.score / self.t) - else: - entry.first = s.score - entry.second = move_id - entries.push_back(entry) - else: - for j in range(self.nr_class): - if is_valid[i][j]: - entry.first = s.score + scores[i][j] - entry.second = move_id + j - entries.push_back(entry) - cdef double max_, Z, cutoff - if self.min_density == 0.0: - for i in range(entries.size()): - q.push(entries[i]) - elif not entries.empty(): - max_ = entries[0].first - Z = 0. - cutoff = 0. - # Softmax into probabilities, so we can prune - for i in range(entries.size()): - if entries[i].first > max_: - max_ = entries[i].first - for i in range(entries.size()): - Z += exp(entries[i].first-max_) - cutoff = (1. 
/ Z) * self.min_density - for i in range(entries.size()): - prob = exp(entries[i].first-max_) / Z - if prob >= cutoff: - q.push(entries[i]) - - -cdef class MaxViolation: - def __init__(self): - self.p_score = 0.0 - self.g_score = 0.0 - self.Z = 0.0 - self.gZ = 0.0 - self.delta = -1 - self.cost = 0 - self.p_hist = [] - self.g_hist = [] - self.p_probs = [] - self.g_probs = [] - - cpdef int check(self, Beam pred, Beam gold) except -1: - cdef _State* p = &pred._states[0] - cdef _State* g = &gold._states[0] - cdef weight_t d = p.score - g.score - if p.loss >= 1 and (self.cost == 0 or d > self.delta): - self.cost = p.loss - self.delta = d - self.p_hist = list(pred.histories[0]) - self.g_hist = list(gold.histories[0]) - self.p_score = p.score - self.g_score = g.score - self.Z = 1e-10 - self.gZ = 1e-10 - for i in range(pred.size): - if pred._states[i].loss > 0: - self.Z += exp(pred._states[i].score) - for i in range(gold.size): - if gold._states[i].loss == 0: - prob = exp(gold._states[i].score) - self.Z += prob - self.gZ += prob - - cpdef int check_crf(self, Beam pred, Beam gold) except -1: - d = pred.score - gold.score - seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)]) - if pred.loss > 0 and (self.cost == 0 or d > self.delta): - p_hist = [] - p_scores = [] - g_hist = [] - g_scores = [] - for i in range(pred.size): - if pred._states[i].loss > 0: - p_scores.append(pred._states[i].score) - p_hist.append(list(pred.histories[i])) - # This can happen from non-monotonic actions - # If we find a better gold analysis this way, be sure to keep it. - elif pred._states[i].loss <= 0 \ - and tuple(pred.histories[i]) not in seen_golds: - g_scores.append(pred._states[i].score) - g_hist.append(list(pred.histories[i])) - for i in range(gold.size): - if gold._states[i].loss == 0: - g_scores.append(gold._states[i].score) - g_hist.append(list(gold.histories[i])) - - all_probs = _softmax(p_scores + g_scores) - p_probs = all_probs[:len(p_scores)] - g_probs_all = all_probs[len(p_scores):] - g_probs = _softmax(g_scores) - - self.cost = pred.loss - self.delta = d - self.p_hist = p_hist - self.g_hist = g_hist - # TODO: These variables are misnamed! These are the gradients of the loss. - self.p_probs = p_probs - # Intuition here: - # The gradient of the loss is: - # P(model) - P(truth) - # Normally, P(truth) is 1 for the gold - # But, if we want to do the "partial credit" scheme, we want - # to create a distribution over the gold, proportional to the scores - # awarded. 
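 # For example, if two gold analyses score 1.0 and 0.0, the softened target
 # is softmax([1.0, 0.0]) ~ [0.73, 0.27] instead of [1, 0], and each entry of
 # `self.g_probs` (set below) is the model's probability for that analysis
 # (from the joint softmax over wrong and gold paths) minus its share of
 # this target.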
- self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)] - - -def _softmax(nums): - if not nums: - return [] - max_ = max(nums) - nums = [(exp(n-max_) if n is not None else None) for n in nums] - Z = sum(n for n in nums if n is not None) - return [(n/Z if n is not None else None) for n in nums] diff --git a/thinc/extra/tests/__init__.py b/thinc/extra/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/extra/tests/c_test_search.pyx b/thinc/extra/tests/c_test_search.pyx deleted file mode 100644 index a727d3364..000000000 --- a/thinc/extra/tests/c_test_search.pyx +++ /dev/null @@ -1,81 +0,0 @@ -from thinc.extra.search cimport Beam -from cymem.cymem cimport Pool -from thinc.typedefs cimport class_t, weight_t - - -cdef struct TestState: - int length - int x - Py_UNICODE* string - - -cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1: - dest_state = dest - src_state = src - dest_state.length = src_state.length - dest_state.x = src_state.x - dest_state.x += clas - if extra_args != NULL: - dest_state.string = extra_args - else: - dest_state.string = src_state.string - - -cdef void* initialize(Pool mem, int n, void* extra_args) except NULL: - state = mem.alloc(1, sizeof(TestState)) - state.length = n - state.x = 1 - if extra_args == NULL: - state.string = 'default' - else: - state.string = extra_args - return state - - -cdef int destroy(Pool mem, void* state, void* extra_args) except -1: - state = state - mem.free(state) - - -def test_init(nr_class, beam_width): - b = Beam(nr_class, beam_width) - assert b.size == 1 - assert b.width == beam_width - assert b.nr_class == nr_class - - -def test_initialize(nr_class, beam_width, length): - b = Beam(nr_class, beam_width) - b.initialize(initialize, destroy, length, NULL) - for i in range(b.width): - s = b.at(i) - assert s.length == length, s.length - assert s.string == 'default' - - -def test_initialize_extra(nr_class, beam_width, length, unicode extra): - b = Beam(nr_class, beam_width) - b.initialize(initialize, destroy, length, extra) - for i in range(b.width): - s = b.at(i) - assert s.length == length - - -def test_transition(nr_class=3, beam_width=6, length=3): - b = Beam(nr_class, beam_width) - b.initialize(initialize, destroy, length, NULL) - b.set_cell(0, 2, 30, True, 0) - b.set_cell(0, 1, 42, False, 0) - b.advance(transition, NULL, NULL) - assert b.size == 1, b.size - assert b.score == 30, b.score - s = b.at(0) - assert s.x == 3 - assert b._states[0].score == 30, b._states[0].score - b.set_cell(0, 1, 10, True, 0) - b.set_cell(0, 2, 20, True, 0) - b.advance(transition, NULL, NULL) - assert b._states[0].score == 50, b._states[0].score - assert b._states[1].score == 40 - s = b.at(0) - assert s.x == 5 diff --git a/thinc/tests/extra/__init__.py b/thinc/tests/extra/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/tests/extra/test_beam_search.py b/thinc/tests/extra/test_beam_search.py deleted file mode 100644 index ab7ab9f11..000000000 --- a/thinc/tests/extra/test_beam_search.py +++ /dev/null @@ -1,5 +0,0 @@ -from thinc.extra.search import MaxViolation - - -def test_init_violn(): - MaxViolation() From 43ef766e3bfd5c52d8f0a58ef54de252326f63dc Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 13 Sep 2022 09:45:04 +0200 Subject: [PATCH 02/30] `NumpyOps` cleanup (#760) * `NumpyOps`: Remove unused/vestigial free functions, reuse functions in `Ops` * Remove superfluous `typedef` --- thinc/backends/numpy_ops.pyx | 111 
++++------------------------------- 1 file changed, 11 insertions(+), 100 deletions(-) diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index c980e6c5d..130aec643 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -22,7 +22,7 @@ from ..util import copy_array, get_array_module from ..types import DeviceTypes, DTypes, Shape, ArrayXd from .cblas cimport CBlas, daxpy, saxpy from .linalg cimport VecVec, Vec -from .ops import Ops +from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights try: import blis.py @@ -31,9 +31,6 @@ except ImportError: has_blis = False -ctypedef float weight_t - - cdef extern from "math.h": float logf(float x) nogil float sqrtf(float x) nogil @@ -118,12 +115,12 @@ class NumpyOps(Ops): _check_compatible_shape(dY, Y) cdef size_t size = Y.size - cdef weight_t* dX_ptr - cdef const weight_t* Y_ptr = Y.data + cdef float* dX_ptr + cdef const float* Y_ptr = Y.data cdef np.ndarray dX if dY.dtype == "float32" and Y.dtype == "float32": dX = _inplace_or_copy(dY, inplace) - dX_ptr = dX.data + dX_ptr = dX.data for i in range(size): if Y_ptr[i] <= 0: dX_ptr[i] = 0. @@ -522,21 +519,6 @@ def check_seq2col_lengths(ops, lengths, B): return lengths -def cpu_clip_gradient(weight_t[::1] gradient, weight_t threshold): - grad_norm = Vec.norm(&gradient[0], gradient.shape[0]) - if grad_norm >= threshold: - Vec.mul_i(&gradient[0], threshold / grad_norm, gradient.shape[0]) - - -def add_gradient_noise(float[::1] gradient, weight_t noise_level, - weight_t timestep): - cdef weight_t variance = noise_level / ((1 + timestep) ** 0.55) - if variance >= 0.000001: - gradient += numpy.asarray( - numpy.random.normal(scale=variance, loc=0., size=len(gradient)), - dtype='float32') - - cdef void cpu_position_encode(float* output, float period, int N, int D) nogil: cdef float pos, d cdef int j @@ -567,18 +549,18 @@ cdef void cpu_scatter_add(float* dest, @cython.cdivision(True) -cdef void _adam_momentum(weight_t* gradient, weight_t* mom1, weight_t* mom2, - int nr_weight, weight_t beta1, weight_t beta2, weight_t eps, - weight_t learn_rate) nogil: +cdef void _adam_momentum(float* gradient, float* mom1, float* mom2, + int nr_weight, float beta1, float beta2, float eps, + float learn_rate) nogil: # Calculate Adam on CPU, fused. # Assumes the learning rate adjustment is calculated by the caller; # a_t = learn_rate * sqrt(1-beta2**timestep) / (1-beta1**timestep) - cdef weight_t one_minus_beta1 = 1-beta1 - cdef weight_t one_minus_beta2 = 1-beta2 - cdef weight_t m1, m2, g + cdef float one_minus_beta1 = 1-beta1 + cdef float one_minus_beta2 = 1-beta2 + cdef float m1, m2, g cdef int i # Blockwise implementation is a bit faster. 
Adam is slooow :( - cdef weight_t[64] buff + cdef float[64] buff cdef int steps = nr_weight // 64 if steps * 64 < nr_weight: steps += 1 @@ -604,18 +586,6 @@ cdef void _adam_momentum(weight_t* gradient, weight_t* mom1, weight_t* mom2, idx += step_size -@cython.cdivision(True) -cdef void cpu_update_averages(weight_t* ema, - const weight_t* weights, int nr_weight, weight_t t, weight_t max_decay) nogil: - cdef weight_t decay = (1.0 + t) / (10.0 + t) - if decay > max_decay: - decay = max_decay - cdef weight_t one_minus_decay = 1-decay - cdef int i - for i in range(nr_weight): # num_threads=4, schedule='static'): - ema[i] -= one_minus_decay * (ema[i] - weights[i]) - - def lstm_forward_training( np.ndarray params, np.ndarray c_init, np.ndarray h_init, np.ndarray X, np.ndarray lengths @@ -847,17 +817,6 @@ def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_stat return dX, numpy.concatenate(grad_parts) -def _split_directions(X, dirs): - if dirs == 1: - return [X] - else: - X_ = X.reshape((X.shape[0], -1, dirs)) - Xs = [] - for d in range(dirs): - Xs.append(numpy.ascontiguousarray(X_[:, d])) - return Xs - - cdef int _lstm_backward_training( int d, int N, int nO, int nI, int nT, float* dX, @@ -950,54 +909,6 @@ cdef int _lstm_backward_training( ) -def _split_weights(np.ndarray params, int i, int nO, int nI, int params_i): - Wx_size = 4 * nO * nI - bx_size = 4 * nO - Wh_size = 4 * nO * nO - bh_size = 4 * nO - Wx = params[params_i : params_i + Wx_size].reshape((4 * nO, nI)) - params_i += Wx_size - bx = params[params_i : params_i + bx_size].reshape((4 * nO,)) - params_i += bx_size - Wh = params[params_i : params_i + Wh_size].reshape((4 * nO, nO)) - params_i += Wh_size - bh = params[params_i : params_i + bh_size].reshape((4 * nO,)) - params_i += bh_size - return ((Wx, bx), (Wh, bh)), params_i - - -def _transpose_weights(params): - # Transpose the parameters so that the gates are the last dimension. This - # makes it easier to fuse. - (Wx, bx), (Wh, bh) = params - Wx = Wx.reshape((4, -1, Wx.shape[-1])) - Wx = Wx.transpose((1, 0, 2)).reshape((-1, Wx.shape[-1])) - bx = bx.reshape((4, -1)).transpose((1, 0)).reshape((-1,)) - Wh = Wh.reshape((4, -1, Wh.shape[-1])) - Wh = Wh.transpose((1, 0, 2)).reshape((-1, Wh.shape[-1])) - bh = bh.reshape((4, -1)).transpose((1, 0)).reshape((-1,)) - ascontig = numpy.ascontiguousarray - Wx = ascontig(Wx) - Wh = ascontig(Wh) - bias = ascontig(bx) + bh - return Wx, Wh, bias - - -def _untranspose_unsplit_weights(params): - Wx, Wh, bias = params - nO = Wh.shape[1] - nI = Wx.shape[1] - Wx = Wx.reshape((-1, 4, nI)).transpose((1, 0, 2)).reshape((-1, nI)) - Wh = Wh.reshape((-1, 4, nO)).transpose((1, 0, 2)).reshape((-1, nO)) - bias = bias.reshape((-1, 4)).transpose((1, 0)).reshape((-1,)) - zeros = numpy.zeros(bias.shape, dtype="f") - return numpy.concatenate((Wx.ravel(), bias, Wh.ravel(), zeros)) - - -cdef inline float sigmoid(float X) nogil: - return 1./(1. 
+ expf(-X)) - - cdef inline float dsigmoid(float y) nogil: return y*(1-y) From 17c823e06120d18441b47e0724723e69376126b2 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 15 Sep 2022 17:41:47 +0200 Subject: [PATCH 03/30] disable mypy run for Python 3.10 (#768) (#769) * disable mypy run for Python 3.10 * dot --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c81c62689..f9096029c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,6 +63,7 @@ jobs: - script: | python -m mypy thinc displayName: 'Run mypy' + condition: ne(variables['python.version'], '3.10') - task: DeleteFiles@1 inputs: From 0366934a144f244714f36b62a82b8ac32386b9ed Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 16 Sep 2022 14:34:41 +0200 Subject: [PATCH 04/30] Remove vestigial/mostly unused `backends.linalg` module (#742) * `CBlas`: Add `sscalv` * `NumpyOps`: Replace usage of `.linalg` with `numpy` and `BLAS` calls * Remove vestigial/mostly unused `backends.linalg` module * Use BLAS notation for `sscal`, add `dscal` --- setup.py | 1 - thinc/backends/cblas.pxd | 6 + thinc/backends/cblas.pyx | 24 +++ thinc/backends/linalg.pxd | 276 ----------------------------------- thinc/backends/linalg.pyx | 4 - thinc/backends/numpy_ops.pyx | 25 ++-- 6 files changed, 42 insertions(+), 294 deletions(-) delete mode 100644 thinc/backends/linalg.pxd delete mode 100644 thinc/backends/linalg.pyx diff --git a/setup.py b/setup.py index 50f1c65cc..c76034945 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,6 @@ PACKAGES = find_packages() MOD_NAMES = [ "thinc.backends.cblas", - "thinc.backends.linalg", "thinc.backends.numpy_ops", "thinc.layers.sparselinear", ] diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd index 15837e5e7..0ec778dde 100644 --- a/thinc/backends/cblas.pxd +++ b/thinc/backends/cblas.pxd @@ -13,6 +13,8 @@ ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX, ctypedef void (*daxpy_ptr)(int N, double alpha, const double* X, int incX, double *Y, int incY) nogil +ctypedef void (*sscal_ptr)(int N, float alpha, float* X, int incX) nogil +ctypedef void (*dscal_ptr)(int N, double alpha, double* X, int incX) nogil # Forward-declaration of the BlasFuncs struct. 
This struct must be opaque, so # that consumers of the CBlas class cannot become dependent on its size or @@ -33,6 +35,10 @@ cdef class CBlas: cdef daxpy_ptr daxpy(CBlas cblas) nogil cdef saxpy_ptr saxpy(CBlas cblas) nogil cdef sgemm_ptr sgemm(CBlas cblas) nogil +cdef sscal_ptr sscal(CBlas cblas) nogil +cdef dscal_ptr dscal(CBlas cblas) nogil cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil +cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil +cdef void set_dscal(CBlas cblas, dscal_ptr dscal) nogil diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx index 9eb4514d8..9348096b8 100644 --- a/thinc/backends/cblas.pyx +++ b/thinc/backends/cblas.pyx @@ -3,10 +3,20 @@ from cython.operator cimport dereference as deref from libcpp.memory cimport make_shared +# Single- and double-precision wrappers for `blis.cy.scalv` +cdef void blis_sscal(int N, float alpha, float* X, int incX) nogil: + blis.cy.scalv(blis.cy.NO_CONJUGATE, N, alpha, X, incX) + +cdef void blis_dscal(int N, double alpha, double* X, int incX) nogil: + blis.cy.scalv(blis.cy.NO_CONJUGATE, N, alpha, X, incX) + + cdef struct BlasFuncs: daxpy_ptr daxpy saxpy_ptr saxpy sgemm_ptr sgemm + sscal_ptr sscal + dscal_ptr dscal cdef class CBlas: @@ -19,6 +29,8 @@ cdef class CBlas: funcs.daxpy = blis.cy.daxpy funcs.saxpy = blis.cy.saxpy funcs.sgemm = blis.cy.sgemm + funcs.sscal = blis_sscal + funcs.dscal = blis_dscal self.ptr = make_shared[BlasFuncs](funcs) cdef daxpy_ptr daxpy(CBlas cblas) nogil: @@ -30,6 +42,12 @@ cdef saxpy_ptr saxpy(CBlas cblas) nogil: cdef sgemm_ptr sgemm(CBlas cblas) nogil: return deref(cblas.ptr).sgemm +cdef sscal_ptr sscal(CBlas cblas) nogil: + return deref(cblas.ptr).sscal + +cdef dscal_ptr dscal(CBlas cblas) nogil: + return deref(cblas.ptr).dscal + cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil: deref(cblas.ptr).daxpy = daxpy @@ -38,3 +56,9 @@ cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil: cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil: deref(cblas.ptr).sgemm = sgemm + +cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil: + deref(cblas.ptr).sscal = sscal + +cdef void set_dscal(CBlas cblas, dscal_ptr dscal) nogil: + deref(cblas.ptr).dscal = dscal diff --git a/thinc/backends/linalg.pxd b/thinc/backends/linalg.pxd deleted file mode 100644 index 494a26c30..000000000 --- a/thinc/backends/linalg.pxd +++ /dev/null @@ -1,276 +0,0 @@ -# cython: infer_types=True -# cython: cdivision=True - -cimport cython -from libc.stdint cimport int32_t -from libc.string cimport memset, memcpy -from cymem.cymem cimport Pool - - -ctypedef float weight_t - -DEF USE_BLAS = False -DEF EPS = 1e-5 - - -IF USE_BLAS: - cimport blis.cy - -cdef extern from "math.h" nogil: - weight_t exp(weight_t x) - weight_t sqrt(weight_t x) - - -cdef class Matrix: - cdef readonly Pool mem - cdef weight_t* data - cdef readonly int32_t nr_row - cdef readonly int32_t nr_col - - -cdef class Vec: - @staticmethod - cdef inline int arg_max(const weight_t* scores, const int n_classes) nogil: - if n_classes == 2: - return 0 if scores[0] > scores[1] else 1 - cdef int i - cdef int best = 0 - cdef weight_t mode = scores[0] - for i in range(1, n_classes): - if scores[i] > mode: - mode = scores[i] - best = i - return best - - @staticmethod - cdef inline weight_t max(const weight_t* x, int32_t nr) nogil: - if nr == 0: - return 0 - cdef int i - cdef weight_t mode = x[0] - for i in range(1, nr): - if x[i] > mode: - mode = 
x[i] - return mode - - @staticmethod - cdef inline weight_t sum(const weight_t* vec, int32_t nr) nogil: - cdef int i - cdef weight_t total = 0 - for i in range(nr): - total += vec[i] - return total - - @staticmethod - cdef inline weight_t norm(const weight_t* vec, int32_t nr) nogil: - cdef weight_t total = 0 - for i in range(nr): - total += vec[i] ** 2 - return sqrt(total) - - @staticmethod - cdef inline void add(weight_t* output, const weight_t* x, - weight_t inc, int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - Vec.add_i(output, inc, nr) - - @staticmethod - cdef inline void add_i(weight_t* vec, weight_t inc, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] += inc - - @staticmethod - cdef inline void mul(weight_t* output, const weight_t* vec, weight_t scal, - int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.mul_i(output, scal, nr) - - @staticmethod - cdef inline void mul_i(weight_t* vec, weight_t scal, int32_t nr) nogil: - cdef int i - IF USE_BLAS: - blis.cy.scalv(BLIS_NO_CONJUGATE, nr, scal, vec, 1) - ELSE: - for i in range(nr): - vec[i] *= scal - - @staticmethod - cdef inline void pow(weight_t* output, const weight_t* vec, weight_t scal, - int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.pow_i(output, scal, nr) - - @staticmethod - cdef inline void pow_i(weight_t* vec, const weight_t scal, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] **= scal - - @staticmethod - @cython.cdivision(True) - cdef inline void div(weight_t* output, const weight_t* vec, weight_t scal, - int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.div_i(output, scal, nr) - - @staticmethod - @cython.cdivision(True) - cdef inline void div_i(weight_t* vec, const weight_t scal, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] /= scal - - @staticmethod - cdef inline void exp(weight_t* output, const weight_t* vec, int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.exp_i(output, nr) - - @staticmethod - cdef inline void exp_i(weight_t* vec, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] = exp(vec[i]) - - @staticmethod - cdef inline void reciprocal_i(weight_t* vec, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] = 1.0 / vec[i] - - @staticmethod - cdef inline weight_t mean(const weight_t* X, int32_t nr_dim) nogil: - cdef weight_t mean = 0. - for x in X[:nr_dim]: - mean += x - return mean / nr_dim - - @staticmethod - cdef inline weight_t variance(const weight_t* X, int32_t nr_dim) nogil: - # See https://www.johndcook.com/blog/standard_deviation/ - cdef double m = X[0] - cdef double v = 0. 
- for i in range(1, nr_dim): - diff = X[i]-m - m += diff / (i+1) - v += diff * (X[i] - m) - return v / nr_dim - - -cdef class VecVec: - @staticmethod - cdef inline void add(weight_t* output, - const weight_t* x, - const weight_t* y, - weight_t scale, - int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - VecVec.add_i(output, y, scale, nr) - - @staticmethod - cdef inline void add_i(weight_t* x, - const weight_t* y, - weight_t scale, - int32_t nr) nogil: - cdef int i - IF USE_BLAS: - blis.cy.axpyv(BLIS_NO_CONJUGATE, nr, scale, y, 1, x, 1) - ELSE: - for i in range(nr): - x[i] += y[i] * scale - - @staticmethod - cdef inline void batch_add_i(weight_t* x, - const weight_t* y, - weight_t scale, - int32_t nr, int32_t nr_batch) nogil: - # For fixed x, matrix of y - cdef int i, _ - for _ in range(nr_batch): - VecVec.add_i(x, - y, scale, nr) - y += nr - - @staticmethod - cdef inline void add_pow(weight_t* output, - const weight_t* x, const weight_t* y, weight_t power, int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - VecVec.add_pow_i(output, y, power, nr) - - - @staticmethod - cdef inline void add_pow_i(weight_t* x, - const weight_t* y, weight_t power, int32_t nr) nogil: - cdef int i - for i in range(nr): - x[i] += y[i] ** power - - @staticmethod - cdef inline void mul(weight_t* output, - const weight_t* x, const weight_t* y, int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - VecVec.mul_i(output, y, nr) - - @staticmethod - cdef inline void mul_i(weight_t* x, - const weight_t* y, int32_t nr) nogil: - cdef int i - for i in range(nr): - x[i] *= y[i] - - @staticmethod - cdef inline weight_t dot( - const weight_t* x, const weight_t* y, int32_t nr) nogil: - cdef int i - cdef weight_t total = 0 - for i in range(nr): - total += x[i] * y[i] - return total - - @staticmethod - cdef inline int arg_max_if_true( - const weight_t* scores, const int* is_valid, const int n_classes) nogil: - cdef int i - cdef int best = -1 - for i in range(n_classes): - if is_valid[i] and (best == -1 or scores[i] > scores[best]): - best = i - return best - - @staticmethod - cdef inline int arg_max_if_zero( - const weight_t* scores, const weight_t* costs, const int n_classes) nogil: - cdef int i - cdef int best = -1 - for i in range(n_classes): - if costs[i] == 0 and (best == -1 or scores[i] > scores[best]): - best = i - return best - - -cdef class Mat: - @staticmethod - cdef inline void mean_row(weight_t* Ex, - const weight_t* mat, int32_t nr_row, int32_t nr_col) nogil: - memset(Ex, 0, sizeof(Ex[0]) * nr_col) - for i in range(nr_row): - VecVec.add_i(Ex, &mat[i * nr_col], 1.0, nr_col) - Vec.mul_i(Ex, 1.0 / nr_row, nr_col) - - @staticmethod - cdef inline void var_row(weight_t* Vx, - const weight_t* mat, const weight_t* Ex, - int32_t nr_row, int32_t nr_col, weight_t eps) nogil: - # From https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - if nr_row == 0 or nr_col == 0: - return - cdef weight_t sum_, sum2 - for i in range(nr_col): - sum_ = 0.0 - sum2 = 0.0 - for j in range(nr_row): - x = mat[j * nr_col + i] - sum2 += (x - Ex[i]) ** 2 - sum_ += x - Ex[i] - Vx[i] = (sum2 - sum_**2 / nr_row) / nr_row - Vx[i] += eps diff --git a/thinc/backends/linalg.pyx b/thinc/backends/linalg.pyx deleted file mode 100644 index 4979e8aa9..000000000 --- a/thinc/backends/linalg.pyx +++ /dev/null @@ -1,4 +0,0 @@ -try: - import blis.py -except ImportError: - pass diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 130aec643..884c74941 100644 --- a/thinc/backends/numpy_ops.pyx +++ 
b/thinc/backends/numpy_ops.pyx @@ -20,8 +20,7 @@ cimport blis.cy from .. import registry from ..util import copy_array, get_array_module from ..types import DeviceTypes, DTypes, Shape, ArrayXd -from .cblas cimport CBlas, daxpy, saxpy -from .linalg cimport VecVec, Vec +from .cblas cimport CBlas, daxpy, saxpy, sscal from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights try: @@ -463,7 +462,7 @@ class NumpyOps(Ops): and values.ndim == 2 \ and values.shape[0] == indices.shape[0] \ and values.shape[1] == table.shape[1]: - cpu_scatter_add(table.data, + cpu_scatter_add(self.cblas(), table.data, indices.data, values.data, indices.shape[0], table.shape[1]) else: @@ -479,10 +478,11 @@ class NumpyOps(Ops): _check_compatible_shape(weights, mom1) _check_compatible_shape(weights, mom2) - _adam_momentum(gradient.data, mom1.data, mom2.data, + cdef CBlas cblas = self.cblas() + _adam_momentum(cblas, gradient.data, mom1.data, mom2.data, weights.shape[0], beta1, beta2, eps, learn_rate) - VecVec.add_i(weights.data, - gradient.data, -learn_rate, weights.shape[0]) + saxpy(cblas)(weights.shape[0], -learn_rate, gradient.data, 1, weights.data, 1) + memset(gradient.data, 0, gradient.size * sizeof(float)) return weights, gradient, mom1, mom2 @@ -537,19 +537,18 @@ cdef void cpu_position_encode(float* output, float period, int N, int D) nogil: output += D -cdef void cpu_scatter_add(float* dest, +cdef void cpu_scatter_add(CBlas cblas, float* dest, const int* indices, const float* src, int nr_id, int nr_col) nogil: cdef int i for i in range(nr_id): id_ = indices[i] if id_ >= 0: - VecVec.add_i(&dest[id_*nr_col], - &src[i*nr_col], 1., nr_col) + saxpy(cblas)(nr_col, 1., &src[i*nr_col], 1, &dest[id_*nr_col], 1) @cython.cdivision(True) -cdef void _adam_momentum(float* gradient, float* mom1, float* mom2, +cdef void _adam_momentum(CBlas cblas, float* gradient, float* mom1, float* mom2, int nr_weight, float beta1, float beta2, float eps, float learn_rate) nogil: # Calculate Adam on CPU, fused. 
@@ -567,9 +566,9 @@ cdef void _adam_momentum(float* gradient, float* mom1, float* mom2, idx = 0 for i in range(steps): step_size = min(64, nr_weight-idx) - Vec.mul_i(mom1, beta1, step_size) - VecVec.add_i(mom1, gradient, one_minus_beta1, step_size) - Vec.mul_i(mom2, beta2, step_size) + sscal(cblas)(step_size, beta1, mom1, 1) + saxpy(cblas)(step_size, one_minus_beta1, gradient, 1, mom1, 1) + sscal(cblas)(step_size, beta2, mom2, 1) for j in range(step_size): mom2[j] += one_minus_beta2 * gradient[j] ** 2 for j in range(step_size): From de40bdf352e58f8808d11099adb3b157dc91de49 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 16 Sep 2022 19:25:58 +0200 Subject: [PATCH 05/30] Standardize `blis` calls in `NumpyOps` (#763) * `NumpyOps`: Move `blis` detection to `compat` module, replace `blis.cy.gemm` calls with `CBlas` calls * `NumpOps`: Call `self.cblas()` instead of directly instantiating `CBlas` * `CBlas`: Add `dgemm` * `NumpyOps`: Use `CBlas.?gemm` in `gemm` --- thinc/backends/cblas.pxd | 7 ++- thinc/backends/cblas.pyx | 8 +++ thinc/backends/numpy_ops.pyx | 109 ++++++++++++++++++++++------------- thinc/compat.py | 9 +++ 4 files changed, 93 insertions(+), 40 deletions(-) diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd index 0ec778dde..a789ef4a3 100644 --- a/thinc/backends/cblas.pxd +++ b/thinc/backends/cblas.pxd @@ -2,8 +2,11 @@ from libcpp.memory cimport shared_ptr ctypedef void (*sgemm_ptr)(bint transA, bint transB, int M, int N, int K, - float alpha, const float* A, int lda, const float *B, + float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, int ldc) nogil +ctypedef void (*dgemm_ptr)(bint transA, bint transB, int M, int N, int K, + double alpha, const double* A, int lda, const double* B, + int ldb, double beta, double* C, int ldc) nogil ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX, @@ -35,10 +38,12 @@ cdef class CBlas: cdef daxpy_ptr daxpy(CBlas cblas) nogil cdef saxpy_ptr saxpy(CBlas cblas) nogil cdef sgemm_ptr sgemm(CBlas cblas) nogil +cdef dgemm_ptr dgemm(CBlas cblas) nogil cdef sscal_ptr sscal(CBlas cblas) nogil cdef dscal_ptr dscal(CBlas cblas) nogil cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil +cdef void set_dgemm(CBlas cblas, dgemm_ptr dgemm) nogil cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil cdef void set_dscal(CBlas cblas, dscal_ptr dscal) nogil diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx index 9348096b8..bb479e56d 100644 --- a/thinc/backends/cblas.pyx +++ b/thinc/backends/cblas.pyx @@ -15,6 +15,7 @@ cdef struct BlasFuncs: daxpy_ptr daxpy saxpy_ptr saxpy sgemm_ptr sgemm + dgemm_ptr dgemm sscal_ptr sscal dscal_ptr dscal @@ -29,6 +30,7 @@ cdef class CBlas: funcs.daxpy = blis.cy.daxpy funcs.saxpy = blis.cy.saxpy funcs.sgemm = blis.cy.sgemm + funcs.dgemm = blis.cy.dgemm funcs.sscal = blis_sscal funcs.dscal = blis_dscal self.ptr = make_shared[BlasFuncs](funcs) @@ -42,6 +44,9 @@ cdef saxpy_ptr saxpy(CBlas cblas) nogil: cdef sgemm_ptr sgemm(CBlas cblas) nogil: return deref(cblas.ptr).sgemm +cdef dgemm_ptr dgemm(CBlas cblas) nogil: + return deref(cblas.ptr).dgemm + cdef sscal_ptr sscal(CBlas cblas) nogil: return deref(cblas.ptr).sscal @@ -57,6 +62,9 @@ cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil: cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil: deref(cblas.ptr).sgemm = sgemm +cdef void set_dgemm(CBlas cblas, dgemm_ptr dgemm) nogil: + 
deref(cblas.ptr).dgemm = dgemm + cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil: deref(cblas.ptr).sscal = sscal diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 884c74941..45d3d9093 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -15,19 +15,13 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from murmurhash.mrmr cimport hash64 cimport numpy as np -cimport blis.cy from .. import registry from ..util import copy_array, get_array_module from ..types import DeviceTypes, DTypes, Shape, ArrayXd -from .cblas cimport CBlas, daxpy, saxpy, sscal +from .cblas cimport CBlas, daxpy, saxpy, sgemm, dgemm, sscal from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights - -try: - import blis.py - has_blis = True -except ImportError: - has_blis = False +from ..compat import has_blis cdef extern from "math.h": @@ -90,11 +84,45 @@ class NumpyOps(Ops): raise ValueError(f"Provided 'y' array should be 2-dimensional, but found {y.ndim} dimension(s).") if not self.use_blis: # delegate to base Ops return super().gemm(x, y, out=out, trans1=trans1, trans2=trans2) + x = self.as_contig(x) y = self.as_contig(y) + + cdef int nM = x.shape[0] if not trans1 else x.shape[1] + cdef int nK = x.shape[1] if not trans1 else x.shape[0] + cdef int nK_b = y.shape[0] if not trans2 else y.shape[1] + cdef int nN = y.shape[1] if not trans2 else y.shape[0] + if nK != nK_b: + msg = "Shape mismatch for blis.gemm: (%d, %d), (%d, %d)" + raise ValueError(msg % (nM, nK, nK_b, nN)) + if out is not None: out = self.as_contig(out) - return blis.py.gemm(x, y, out=out, trans1=trans1, trans2=trans2, beta=0.) + else: + # Can be uninitialized as 'beta' is zero. + out = numpy.empty((nM, nN), dtype=x.dtype) + + cblas = self.cblas() + if x.dtype == "float32" and y.dtype == "float32" and out.dtype == "float32": + sgemm(cblas)(trans1, trans2, + nM, nN, nK, + 1.0, + (x.data), x.shape[1], + (y.data), y.shape[1], + 0.0, + (out.data), out.shape[1]) + elif x.dtype == "float64" and y.dtype == "float64" and out.dtype == "float64": + dgemm(cblas)(trans1, trans2, + nM, nN, nK, + 1.0, + (x.data), x.shape[1], + (y.data), y.shape[1], + 0.0, + (out.data), out.shape[1]) + else: + raise ValueError(f"unsupported or mismatching array data types; got '{x.dtype}', '{y.dtype}', '{out.dtype}'") + + return out def relu(self, np.ndarray X, inplace=False): cdef np.ndarray Y @@ -137,7 +165,7 @@ class NumpyOps(Ops): ): assert H0.shape[0] == C0.shape[0] assert H0.shape[1] == C0.shape[1] - Y, fwd_state = lstm_forward_training(params, H0, C0, X, size_at_t) + Y, fwd_state = lstm_forward_training(self.cblas(), params, H0, C0, X, size_at_t) return Y, fwd_state def lstm_forward_inference( @@ -148,13 +176,13 @@ class NumpyOps(Ops): np.ndarray X, np.ndarray size_at_t ): - Y, _ = lstm_forward_training(params, H0, C0, X, size_at_t) + Y, _ = lstm_forward_training(self.cblas(), params, H0, C0, X, size_at_t) return Y def backprop_lstm( self, np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state ): - dX, d_params = backprop_lstm(dY, lengths, params, fwd_state) + dX, d_params = backprop_lstm(self.cblas(), dY, lengths, params, fwd_state) return dX, d_params def maxout(self, reals3d_ft X): @@ -585,7 +613,7 @@ cdef void _adam_momentum(CBlas cblas, float* gradient, float* mom1, float* mom2, idx += step_size -def lstm_forward_training( +def lstm_forward_training(CBlas cblas, np.ndarray params, np.ndarray c_init, np.ndarray h_init, np.ndarray X, np.ndarray lengths ): @@ -627,6 
+655,7 @@ def lstm_forward_training( Cid = C[i, d] Gid = G[i, d] _lstm_forward_training( + cblas, d, N, nO, nI, nT, Gid, Yid.data, @@ -647,6 +676,7 @@ def lstm_forward_training( cdef int _lstm_forward_training( + CBlas cblas, int d, int N, int nO, int nI, int nT, np.ndarray G, float* Y, @@ -660,13 +690,13 @@ cdef int _lstm_forward_training( float* Ct2, ) except -1: cdef double one = 1.0 - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, + sgemm(cblas)(False, True, N, nO*4, nI, one, - X, nI, 1, - Wx, nI, 1, + X, nI, + Wx, nI, one, - G.data, nO*4, 1 + G.data, nO*4 ) cdef int t, batch_size cdef int seq_i = 0 if d == 0 else N @@ -684,13 +714,13 @@ cdef int _lstm_forward_training( Gt3_ = G[seq_i : seq_i+batch_size] Gt3 = Gt3_.data # Now do the actual calculation - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, + sgemm(cblas)(False, True, batch_size, nO*4, nO, one, - Yt2, nO, 1, - Wh, nO, 1, + Yt2, nO, + Wh, nO, one, - Gt3, nO*4, 1 + Gt3, nO*4 ) # This is super weird: if we remove this add, it gets slower? I guess # it does cache prefetching or something? @@ -714,7 +744,7 @@ cdef int _lstm_forward_training( memcpy(Ct2, Ct3, sizeof(Ct3[0]) * batch_size * nO) -def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state): +def backprop_lstm(CBlas cblas, np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state): xp = numpy cdef np.ndarray Y cdef np.ndarray G @@ -791,7 +821,7 @@ def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_stat assert (dYid.shape[0], dYid.shape[1]) == (N, nO) assert (dC.shape[0], dC.shape[1]) == (N, nO) assert (dG.shape[0], dG.shape[1]) == (N, nO*4) - _lstm_backward_training(d, N, nO, dX.shape[1], nT, + _lstm_backward_training(cblas, d, N, nO, dX.shape[1], nT, dX.data, dYid.data, dC.data, @@ -817,6 +847,7 @@ def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_stat cdef int _lstm_backward_training( + CBlas cblas, int d, int N, int nO, int nI, int nT, float* dX, float* dY, @@ -861,36 +892,36 @@ cdef int _lstm_backward_training( ) # Backprop hidden-to-hidden w.r.t. hidden. # dYt2 += dGt3 @ Wh - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(False, False, batch_size, nO, nO*4, one, - dGt3, nO*4, 1, - Wh, nO, 1, + dGt3, nO*4, + Wh, nO, one, - dYt2, nO, 1 + dYt2, nO ) seq_t3 = seq_t2 size_t3 = size_t2 # Backprop input-to-hidden w.r.t. weights. # dWx += dG @ X - blis.cy.gemm(blis.cy.TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(True, False, nO*4, nI, N, one, - dG, nO*4, 1, - X, nI, 1, + dG, nO*4, + X, nI, one, - dWx, nI, 1 + dWx, nI ) # Backprop hidden-to-hidden w.r.t weights. # dWh += dG @ Y - blis.cy.gemm(blis.cy.TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(True, False, nO*4, nO, N, one, - dG, nO*4, 1, - Y, nO, 1, + dG, nO*4, + Y, nO, one, - dWh, nO, 1 + dWh, nO ) # Backprop bias for i in range(N): @@ -898,13 +929,13 @@ cdef int _lstm_backward_training( d_bias[j] += dG[i*nO*4+j] # Backprop input-to-hidden w.r.t. 
input - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(False, False, N, nI, nO*4, one, - dG, nO*4, 1, - Wx, nI, 1, + dG, nO*4, + Wx, nI, one, - dX, nI, 1 + dX, nI ) diff --git a/thinc/compat.py b/thinc/compat.py index 2d8b40345..9e80f8dfe 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -87,4 +87,13 @@ has_os_signpost = False +try: # pragma: no cover + import blis + + has_blis = True +except ImportError: + blis = None + has_blis = False + + has_gpu = has_cupy_gpu or has_torch_mps_gpu From c8ac07fe734aaee43d8197bbf5c9a370f692766b Mon Sep 17 00:00:00 2001 From: kadarakos Date: Fri, 28 Oct 2022 12:26:18 +0200 Subject: [PATCH 06/30] Cross entropy fix (#647) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * return logloss instead of squared differrence * check whether to comput binary or categorical loss value * function to apply label smoothing to 2d array * force exclusive classes * formatting * mypy debug * bugfix * compare cross entropy to torch * fix type and error message * updating cross-entropy tests * all categorical crossentropy tests updated * sequence crossentropy test * rearrange if statements * sequence ce negprefix test start * all tests for (sequence) cross entropy * use CategoricalCrossentropy as loss * don't run conversion and validation twice in __call__ * add type for truths in convert_truths (thnx @ richardpaulhudson) * fix one-hot check and no unexpected error branch * cupy support for torch comparison * import floats2d * hopefully right type to pass old torch cross-entropy * nonstrict sum to 1 * typo * remove redundant work for sequential cross entropy * type typo * fix imports * remove misleading comments * assertion for clarity * add back mistakenly removed imports * throw error rather than assert * legacy versions and tests for crossentropy + sequential * type fix * Update thinc/legacy/loss.py Co-authored-by: Sofie Van Landeghem * legacy cross-entropy import through registry * no legacy test module * type fix * hacking types for mypy * return type * Update thinc/legacy/loss.py Co-authored-by: Sofie Van Landeghem * Update thinc/legacy/__init__.py Co-authored-by: Sofie Van Landeghem * initial functional sparse ce los * separate functionality for SparseCE and CategoricalCrossentropy * fix missing value type * correcting label smoothing param contraint * test new label smooth validation error * less than 0 input validation * string concat * small update to error msg * fix max smoothing coefficient * double check error message * Categorical and Sparse factories and tests * Update thinc/util.py Co-authored-by: Adriane Boyd * update test with less strict match * Fix types, pair-hacked with @kadarakos * (Sparse)CategoricalEntropy: support Ragged guesses Since we can encoder sequences as Ragged, this could replace (Sparse)SequenceCategoricalEntropy. 
* follow updated api * Update thinc/util.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * indent fix * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * remove unnecessary list copy * add type to truths * fix missing assignment * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * rever suggestion * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/tests/test_loss.py Co-authored-by: Madeesh Kannan * Update thinc/util.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * move check1d out of loss and more general signature * mypy fix * SparseCE rename Co-authored-by: Kádár Ákos Co-authored-by: Sofie Van Landeghem Co-authored-by: Adriane Boyd Co-authored-by: Daniël de Kok Co-authored-by: Madeesh Kannan --- examples/mnist.py | 5 +- thinc/legacy/__init__.py | 8 + thinc/legacy/loss.py | 282 ++++++++++++++++++ thinc/loss.py | 503 +++++++++++++++++++++----------- thinc/tests/test_loss.py | 602 +++++++++++++++++++++++++++++++-------- thinc/util.py | 28 +- 6 files changed, 1146 insertions(+), 282 deletions(-) create mode 100644 thinc/legacy/__init__.py create mode 100644 thinc/legacy/loss.py diff --git a/examples/mnist.py b/examples/mnist.py index 971f4645b..790bcc640 100644 --- a/examples/mnist.py +++ b/examples/mnist.py @@ -4,6 +4,7 @@ """ # pip install thinc ml_datasets typer from thinc.api import Model, chain, Relu, Softmax, Adam +from thinc.api import CategoricalCrossentropy import ml_datasets from wasabi import msg from tqdm import tqdm @@ -21,6 +22,7 @@ def main( ) # Load the data (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() + loss_func = CategoricalCrossentropy() # Set any missing shapes for the model. 
model.initialize(X=train_X[:5], Y=train_Y[:5]) train_data = model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True) @@ -30,7 +32,8 @@ def main( for i in range(n_iter): for X, Y in tqdm(train_data, leave=False): Yh, backprop = model.begin_update(X) - backprop(Yh - Y) + grad, loss = loss_func(Yh, Y) + backprop(grad) model.finish_update(optimizer) # Evaluate and print progress correct = 0 diff --git a/thinc/legacy/__init__.py b/thinc/legacy/__init__.py new file mode 100644 index 000000000..ced5121ba --- /dev/null +++ b/thinc/legacy/__init__.py @@ -0,0 +1,8 @@ +from .loss import LegacyCategoricalCrossentropy +from .loss import LegacySequenceCategoricalCrossentropy + + +__all__ = [ + "LegacyCategoricalCrossentropy", + "LegacySequenceCategoricalCrossentropy" +] diff --git a/thinc/legacy/loss.py b/thinc/legacy/loss.py new file mode 100644 index 000000000..439a2ca21 --- /dev/null +++ b/thinc/legacy/loss.py @@ -0,0 +1,282 @@ +from typing import Optional, Sequence, Dict, Union, Tuple +from typing import cast, List +from ..types import Floats2d, Ints1d, Ints2d +from ..config import registry +from ..util import to_categorical, get_array_module +from ..loss import IntsOrFloatsOrStrs, Loss +from ..loss import _make_mask, _make_mask_by_value + + +TruthsT = Union[List[str], List[int], Ints1d, Floats2d] + + +class LegacyCategoricalCrossentropy(Loss): + names: Optional[Sequence[str]] + missing_value: Optional[Union[str, int]] + _name_to_i: Dict[str, int] + + def __init__( + self, + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, + ): + self.normalize = normalize + self.names = names + self.missing_value = missing_value + self.neg_prefix = neg_prefix + self.label_smoothing = label_smoothing + if names is not None: + self._name_to_i = {name: i for i, name in enumerate(names)} + else: + self._name_to_i = {} + + def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: + xp = get_array_module(guesses) + missing = [] + negatives_mask = None + if self.names: + negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + missing_value = self.missing_value + # Convert list of ints or list of strings + if isinstance(truths, list): + if len(truths): + if isinstance(truths[0], int): + for i, value in enumerate(truths): + if not isinstance(value, int): + raise ValueError( + "All values in the truths list have to " + "have the same time. The first value was " + f"detected to be integer, but found {type(value)}." + ) + if value == missing_value: + missing.append(i) + else: + truths = cast(List[str], truths) + if self.names is None: + msg = ( + "Cannot calculate loss from list of strings without names. " + "You can pass the names as a keyword argument when you " + "create the loss object, " + "e.g. CategoricalCrossentropy(names=['dog', 'cat'])" + ) + raise ValueError(msg) + for i, value in enumerate(truths): + if not isinstance(value, str): + raise ValueError( + "All values in the truths list have to " + "have the same time. The first value was " + f"detected to be string, but found {type(value)}." 
+ ) + if value == missing_value: + truths[i] = self.names[0] + missing.append(i) + elif ( + value + and self.neg_prefix + and value.startswith(self.neg_prefix) + ): + truths[i] = value[len(self.neg_prefix) :] + neg_index = self._name_to_i[truths[i]] + negatives_mask[i] = 0 # type: ignore + negatives_mask[i][neg_index] = -1 # type: ignore + truths = [self._name_to_i[name] for name in truths] + truths = xp.asarray(truths, dtype="i") + mask = _make_mask(guesses, missing) + else: + mask = _make_mask_by_value(truths, guesses, missing_value) + truths = cast(Union[Ints1d, Floats2d], truths) + if truths.ndim != guesses.ndim: + # transform categorical values to one-hot encoding + truths_2d = to_categorical( + truths, + n_classes=guesses.shape[-1], + label_smoothing=self.label_smoothing, + ) + else: + if self.label_smoothing: + raise ValueError( + "Label smoothing is only applied, when truths have type " + "List[str], List[int] or Ints1d, but it seems like Floats2d " + "was provided." + ) + truths_2d = cast(Floats2d, truths) + # Transform negative annotations to a 0 for the negated value + # + mask all other values for that row + if negatives_mask is not None: + truths_2d *= negatives_mask + truths_2d[truths_2d == -1] = 0 + negatives_mask[negatives_mask == -1] = 1 + mask *= negatives_mask + return cast(Floats2d, truths_2d), mask + + def __call__( + self, guesses: Floats2d, truths: TruthsT + ) -> Tuple[Floats2d, float]: + d_truth = self.get_grad(guesses, truths) + return (d_truth, self._get_loss_from_grad(d_truth)) + + def get_grad(self, guesses: Floats2d, truths: TruthsT) -> Floats2d: + target, mask = self.convert_truths(truths, guesses) + xp = get_array_module(target) + if guesses.shape != target.shape: # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." + raise ValueError(err) + elif xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." + raise ValueError(err) + elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
+ raise ValueError(err) + difference = guesses - target + difference *= mask + if self.normalize: + difference = difference / guesses.shape[0] + return difference + + def get_loss(self, guesses: Floats2d, truths: TruthsT) -> float: + d_truth = self.get_grad(guesses, truths) + return self._get_loss_from_grad(d_truth) + + def _get_loss_from_grad(self, d_truth: Floats2d) -> float: + # TODO: Add overload for axis=None case to sum + return (d_truth**2).sum() # type: ignore + + +class LegacySequenceCategoricalCrossentropy(Loss): + def __init__( + self, + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, + ): + self.cc = LegacyCategoricalCrossentropy( + normalize=False, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) + self.normalize = normalize + + def __call__( + self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] + ) -> Tuple[List[Floats2d], float]: + grads = self.get_grad(guesses, truths) + loss = self._get_loss_from_grad(grads) + return grads, loss + + def get_grad( + self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] + ) -> List[Floats2d]: + if len(guesses) != len(truths): # pragma: no cover + err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" + raise ValueError(err) + n = len(guesses) + d_scores = [] + for yh, y in zip(guesses, truths): + d_yh = self.cc.get_grad(yh, y) + if self.normalize: + d_yh /= n + d_scores.append(d_yh) + return d_scores + + def get_loss( + self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] + ) -> float: + return self._get_loss_from_grad(self.get_grad(guesses, truths)) + + def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: + loss = 0.0 + for grad in grads: + loss += self.cc._get_loss_from_grad(grad) # type: ignore + return loss + + +@registry.losses("CategoricalCrossentropy.v1") +def configure_CategoricalCrossentropy_v1( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, +) -> LegacyCategoricalCrossentropy: + return LegacyCategoricalCrossentropy( + normalize=normalize, names=names, missing_value=missing_value + ) + + +@registry.losses("CategoricalCrossentropy.v2") +def configure_CategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, +) -> LegacyCategoricalCrossentropy: + return LegacyCategoricalCrossentropy( + normalize=normalize, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + ) + + +@registry.losses("CategoricalCrossentropy.v3") +def configure_CategoricalCrossentropy_v3( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, +) -> LegacyCategoricalCrossentropy: + return LegacyCategoricalCrossentropy( + normalize=normalize, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) + + +@registry.losses("SequenceCategoricalCrossentropy.v1") +def configure_SequenceCategoricalCrossentropy_v1( + *, normalize: bool = True, names: Optional[Sequence[str]] = None +) -> LegacySequenceCategoricalCrossentropy: + return LegacySequenceCategoricalCrossentropy(normalize=normalize, 
names=names) + + +@registry.losses("SequenceCategoricalCrossentropy.v2") +def configure_SequenceCategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + neg_prefix: Optional[str] = None, +) -> LegacySequenceCategoricalCrossentropy: + return LegacySequenceCategoricalCrossentropy( + normalize=normalize, names=names, neg_prefix=neg_prefix + ) + + +@registry.losses("SequenceCategoricalCrossentropy.v3") +def configure_SequenceCategoricalCrossentropy_v3( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, +) -> LegacySequenceCategoricalCrossentropy: + return LegacySequenceCategoricalCrossentropy( + normalize=normalize, + names=names, + neg_prefix=neg_prefix, + missing_value=missing_value, + label_smoothing=label_smoothing, + ) diff --git a/thinc/loss.py b/thinc/loss.py index 990b30df1..5a81170d0 100644 --- a/thinc/loss.py +++ b/thinc/loss.py @@ -1,17 +1,19 @@ from typing import Tuple, Sequence, cast, TypeVar, Generic, Any, Union, Optional, List from typing import Dict -from .types import Floats2d, Ints1d -from .util import get_array_module, to_categorical +from .types import Floats2d, Ints1d, Ragged, ArrayXd +from .util import get_array_module, to_categorical, smooth_one_hot +from .util import is_xp_array from .config import registry - LossT = TypeVar("LossT") GradT = TypeVar("GradT") GuessT = TypeVar("GuessT") TruthT = TypeVar("TruthT") +FloatsOrRaggedT = TypeVar("FloatsOrRaggedT", Floats2d, Ragged) IntsOrFloats = Union[Ints1d, Floats2d] IntsOrFloatsOrStrs = Union[Ints1d, Floats2d, Sequence[int], Sequence[str]] +Categories1d = Union[Ints1d, Sequence[int], Sequence[str]] class Loss(Generic[GuessT, TruthT, GradT, LossT]): # pragma: no cover @@ -34,7 +36,118 @@ def get_loss(self, guesses: GuessT, truths: TruthT) -> LossT: ... -class CategoricalCrossentropy(Loss): +class CategoricalCrossentropyBase(Loss): + normalize: bool + + def _validate_input(self, guesses: FloatsOrRaggedT, target: Floats2d) -> None: + guesses_f2d = _to_array(guesses) + xp = get_array_module(target) + if not xp.allclose(guesses_f2d.sum(axis=1), 1.0): + raise ValueError( + "Cannot calculate CategoricalCrossentropy if " + "some rows of 'guesses' are not " + "valid categorical distributions (do not sum to 1)." + ) + elif guesses_f2d.shape != target.shape: # pragma: no cover + raise ValueError( + "Cannot calculate CategoricalCrossentropy loss " + f"with mismatching shapes: {guesses_f2d.shape} vs {target.shape}." + ) + elif xp.any(guesses_f2d > 1) or xp.any(guesses_f2d < 0): # pragma: no cover + raise ValueError( + "Cannot calculate CategoricalCrossentropy loss " + "with guesses outside the [0,1] interval." + ) + elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover + raise ValueError( + "Cannot calculate CategoricalCrossentropy loss " + "with truth values outside the [0,1] interval." + ) + + def _get_grad( + self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d + ) -> FloatsOrRaggedT: + difference = _to_array(guesses) - target + difference *= mask + if self.normalize: + # FIXME: normalized by the number of sequences, also support normalizing + # by the number of instances. 
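# (A note on the division below, grounded in the `_normalization_length`
# helper defined later in this module: a Ragged input is divided by its
# number of sequences, len(guesses.lengths), while a plain Floats2d input
# is divided by its number of rows, guesses.shape[0].)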
+ difference /= _normalization_length(guesses) + + return _array_like(difference, guesses) + + def _get_loss( + self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d + ) -> float: + guesses_f2d = _to_array(guesses) + xp = get_array_module(guesses_f2d) + logprobs = xp.log(guesses_f2d + 1e-9) + logprobs *= mask + if self.normalize: + return -(target * logprobs).sum() / _normalization_length(guesses) + else: + return -(target * logprobs).sum() + + +class CategoricalCrossentropy(CategoricalCrossentropyBase): + missing_value: Optional[Union[str, int]] + + def __init__( + self, + *, + normalize: bool = True, + missing_value: Optional[int] = None, + label_smoothing: float = 0.0, + ): + self.normalize = normalize + self.missing_value = missing_value + self.label_smoothing = label_smoothing + + def __call__( + self, guesses: FloatsOrRaggedT, truths: Floats2d + ) -> Tuple[FloatsOrRaggedT, float]: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + d_truth = self._get_grad(guesses, target, mask) + loss = self._get_loss(guesses, target, mask) + + return d_truth, loss + + def convert_truths( + self, truths: Floats2d, guesses: FloatsOrRaggedT + ) -> Tuple[Floats2d, Floats2d]: + if truths.ndim != 2: + raise ValueError(f"'truths' have to have 2 axes, but found {truths.ndim}") + guesses_2d = _to_array(guesses) + missing_value = self.missing_value + xp = get_array_module(guesses_2d) + mask = _make_mask_by_value(truths, guesses_2d, missing_value) + if not xp.allclose(truths.sum(axis=1), 1.0): + raise ValueError( + "Cannot calculate CategoricalCrossentropy. " + "All rows of 'truths' have to be a " + "valid categorical distribution (sum to 1)." + ) + if self.label_smoothing: + # Validate that array is binary, ergo one-hot at this point + if ((truths == 0) | (truths == 1)).all(): + truths = smooth_one_hot(truths, self.label_smoothing) + else: + raise ValueError("Can only apply label-smoothing to one-hot target.") + return truths, mask + + def get_grad(self, guesses: FloatsOrRaggedT, truths: Floats2d) -> FloatsOrRaggedT: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + return self._get_grad(guesses, target, mask) + + def get_loss(self, guesses: Floats2d, truths: Floats2d) -> float: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + return self._get_loss(guesses, target, mask) + + +class SparseCategoricalCrossentropy(CategoricalCrossentropyBase): names: Optional[Sequence[str]] missing_value: Optional[Union[str, int]] _name_to_i: Dict[str, int] @@ -58,142 +171,174 @@ def __init__( else: self._name_to_i = {} - def convert_truths(self, truths, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: - xp = get_array_module(guesses) + def __call__( + self, guesses: Floats2d, truths: Union[Sequence[int], Sequence[str]] + ) -> Tuple[Floats2d, float]: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + d_truth = self._get_grad(guesses, target, mask) + loss = self._get_loss(guesses, target, mask) + return (d_truth, loss) + + def _convert_ints( + self, guesses: Floats2d, truths: Sequence[int] + ) -> Tuple[Floats2d, Floats2d]: + """ + Convert Sequence[int] into a Floats2d one-hot array. 
+ """ + missing_value = self.missing_value + if missing_value is not None and not isinstance(missing_value, int): + raise ValueError( + "'truths' provided in Sequence[int] format, but " + f"'missing_value' was set to be {self.missing_value} " + f", which has type {type(self.missing_value)}." + ) missing = [] - negatives_mask = None - if self.names: - negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + for i, value in enumerate(truths): + if not isinstance(value, int): + raise ValueError( + "The first value of `truths` was of type " + f"integer, but found {type(value)} during iteration." + ) + if value == missing_value: + missing.append(i) + xp = get_array_module(guesses) + # FIXME: convert using ops? + xp_truths = cast(Ints1d, xp.asarray(truths, dtype="i")) + truths_2d = to_categorical( + xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing + ) + mask = _make_mask(guesses, missing) + return cast(Floats2d, truths_2d), mask + + def _convert_strs( + self, guesses: Floats2d, truths: Sequence[str] + ) -> Tuple[Floats2d, Floats2d]: + """ + Convert Sequence[int] into a Floats2d one-hot array. + """ + missing_value = self.missing_value - # Convert list of ints or list of strings - if isinstance(truths, list): - truths = list(truths) - if len(truths): - if isinstance(truths[0], int): - for i, value in enumerate(truths): - if value == missing_value: - missing.append(i) - else: - if self.names is None: - msg = ( - "Cannot calculate loss from list of strings without names. " - "You can pass the names as a keyword argument when you " - "create the loss object, " - "e.g. CategoricalCrossentropy(names=['dog', 'cat'])" - ) - raise ValueError(msg) - for i, value in enumerate(truths): - if value == missing_value: - truths[i] = self.names[0] - missing.append(i) - elif ( - value - and self.neg_prefix - and value.startswith(self.neg_prefix) - ): - truths[i] = value[len(self.neg_prefix) :] - neg_index = self._name_to_i[truths[i]] - negatives_mask[i] = 0 # type: ignore - negatives_mask[i][neg_index] = -1 # type: ignore - truths = [self._name_to_i[name] for name in truths] - truths = xp.asarray(truths, dtype="i") - mask = _make_mask(guesses, missing) - else: - mask = _make_mask_by_value(truths, guesses, missing_value) - if truths.ndim != guesses.ndim: - # transform categorical values to one-hot encoding - truths = to_categorical( - cast(Ints1d, truths), - n_classes=guesses.shape[-1], + if self.names is None: + raise ValueError( + "Cannot calculate loss from Sequence[str] without names. " + "You can pass the names as a keyword argument when you " + "create the loss object" + ) + elif missing_value is not None and not isinstance(missing_value, str): + raise ValueError( + "'truths' provided in Sequence[str] format, but " + f"'missing_value' was set to be {self.missing_value} " + f", which has type {type(self.missing_value)}." + ) + xp = get_array_module(guesses) + missing = [] + negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + truths_int = [] + for i, value in enumerate(truths): + if not isinstance(value, str): + raise ValueError( + "The first value of the 'truths' was of type " + f"string, but found {type(value)} during iteration." 
+ ) + # missing value + if value == missing_value: + label_i = self._name_to_i[self.names[0]] + missing.append(i) + # negative labels + elif self.neg_prefix and value.startswith(self.neg_prefix): + label_i = self._name_to_i[value[len(self.neg_prefix) :]] + negatives_mask[i] = 0 # type: ignore + negatives_mask[i][label_i] = -1 # type: ignore + # nothing special + else: + label_i = self._name_to_i[value] + truths_int.append(label_i) + xp_truths = cast(Ints1d, xp.asarray(truths_int, dtype="i")) + truths_2d = to_categorical( + xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing + ) + mask = _make_mask(guesses, missing) + truths_2d *= negatives_mask + truths_2d[truths_2d == -1] = 0 + negatives_mask[negatives_mask == -1] = 1 + mask *= negatives_mask + return cast(Floats2d, truths_2d), mask + + def convert_truths( + self, truths: Categories1d, guesses: Floats2d + ) -> Tuple[Floats2d, Floats2d]: + guesses_f2d = _to_array(guesses) + + if is_xp_array(truths): + _check_ints1d(cast(ArrayXd, truths)) + xp_truths = cast(Ints1d, truths) + truths_2d = to_categorical( + xp_truths, label_smoothing=self.label_smoothing, + n_classes=guesses_f2d.shape[1], ) - else: - if self.label_smoothing: + mask = _make_mask_by_value(truths_2d, guesses_f2d, self.missing_value) + elif isinstance(truths, Sequence): + if isinstance(truths[0], int): + truths_2d, mask = self._convert_ints( + guesses_f2d, cast(Sequence[int], truths) + ) + elif isinstance(truths[0], str): + truths_2d, mask = self._convert_strs( + guesses_f2d, cast(Sequence[str], truths) + ) + else: raise ValueError( - "Label smoothing is only applied, when truths have type " - "List[str], List[int] or Ints1d, but it seems like Floats2d " - "was provided." + "When truths to SparseCategoricalCrossentropy is provided " + "in Sequence format, elements need to be " + "of type str or int, but first element " + f"was found to be {type(truths[0])}." ) - # Transform negative annotations to a 0 for the negated value - # + mask all other values for that row - if negatives_mask is not None: - truths *= negatives_mask - truths[truths == -1] = 0 - negatives_mask[negatives_mask == -1] = 1 - mask *= negatives_mask - return truths, mask + else: + raise ValueError( + "Truths have to be provided either as 1D " + "numpy/cupy integer array or as Sequence[int] or " + "Sequence[str], but truths has different type." + ) - def __call__( - self, guesses: Floats2d, truths: IntsOrFloatsOrStrs - ) -> Tuple[Floats2d, float]: - d_truth = self.get_grad(guesses, truths) - return (d_truth, self._get_loss_from_grad(d_truth)) + return cast(Floats2d, truths_2d), mask - def get_grad(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> Floats2d: + def get_grad(self, guesses: Floats2d, truths: Categories1d) -> Floats2d: target, mask = self.convert_truths(truths, guesses) - xp = get_array_module(target) - if guesses.shape != target.shape: # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." - raise ValueError(err) - if xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." - raise ValueError(err) - if xp.any(target > 1) or xp.any(target < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
- raise ValueError(err) - difference = guesses - target - difference *= mask - if self.normalize: - difference = difference / guesses.shape[0] - return difference + self._validate_input(guesses, target) + return self._get_grad(guesses, target, mask) - def get_loss(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> float: - d_truth = self.get_grad(guesses, truths) - return self._get_loss_from_grad(d_truth) - - def _get_loss_from_grad(self, d_truth: Floats2d) -> float: - # TODO: Add overload for axis=None case to sum - return (d_truth**2).sum() # type: ignore - - -@registry.losses("CategoricalCrossentropy.v1") -def configure_CategoricalCrossentropy_v1( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, -) -> CategoricalCrossentropy: - return CategoricalCrossentropy( - normalize=normalize, names=names, missing_value=missing_value - ) + def get_loss(self, guesses: Floats2d, truths: Categories1d) -> float: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + return self._get_loss(guesses, target, mask) -@registry.losses("CategoricalCrossentropy.v2") -def configure_CategoricalCrossentropy_v2( +@registry.losses("CategoricalCrossentropy.v4") +def configure_CategoricalCrossentropy_v4( *, normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, + missing_value: Optional[int] = None, + label_smoothing: float = 0.0, ) -> CategoricalCrossentropy: return CategoricalCrossentropy( normalize=normalize, - names=names, missing_value=missing_value, - neg_prefix=neg_prefix, + label_smoothing=label_smoothing, ) -@registry.losses("CategoricalCrossentropy.v3") -def configure_CategoricalCrossentropy_v3( +@registry.losses("SparseCategoricalCrossentropy.v4") +def configure_SparseCategoricalCrossentropy_v4( *, normalize: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, -) -> CategoricalCrossentropy: - return CategoricalCrossentropy( +) -> SparseCategoricalCrossentropy: + return SparseCategoricalCrossentropy( normalize=normalize, names=names, missing_value=missing_value, @@ -206,38 +351,44 @@ class SequenceCategoricalCrossentropy(Loss): def __init__( self, *, + cross_entropy: Union[CategoricalCrossentropy, SparseCategoricalCrossentropy], normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, ): - self.cc = CategoricalCrossentropy( - normalize=False, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) + self.cc = cross_entropy self.normalize = normalize def __call__( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> Tuple[List[Floats2d], float]: - grads = self.get_grad(guesses, truths) - loss = self._get_loss_from_grad(grads) - return grads, loss + self._validate_input(guesses, truths) + n = len(guesses) + d_scores = [] + loss = 0.0 + for yh, y in zip(guesses, truths): + d_yh, l = self.cc(yh, y) # type: ignore + if self.normalize: + d_yh /= n + d_scores.append(d_yh) + loss += l + return d_scores, loss + + def _validate_input( + self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] + ): + if len(guesses) != len(truths): # pragma: no cover + raise ValueError( + "Cannot 
calculate SequenceCategoricalCrossentropy loss: " + "guesses and truths must be same length!" + ) def get_grad( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> List[Floats2d]: - err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" - if len(guesses) != len(truths): # pragma: no cover - raise ValueError(err) + self._validate_input(guesses, truths) n = len(guesses) d_scores = [] for yh, y in zip(guesses, truths): - d_yh = self.cc.get_grad(yh, y) + d_yh = self.cc.get_grad(yh, y) # type: ignore if self.normalize: d_yh /= n d_scores.append(d_yh) @@ -246,49 +397,42 @@ def get_grad( def get_loss( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> float: - return self._get_loss_from_grad(self.get_grad(guesses, truths)) - - def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: + self._validate_input(guesses, truths) loss = 0.0 - for grad in grads: - loss += self.cc._get_loss_from_grad(grad) + for guess, truth in zip(guesses, truths): + loss += self.cc.get_loss(guess, truth) # type: ignore return loss -@registry.losses("SequenceCategoricalCrossentropy.v1") -def configure_SequenceCategoricalCrossentropy_v1( - *, normalize: bool = True, names: Optional[Sequence[str]] = None -) -> SequenceCategoricalCrossentropy: - return SequenceCategoricalCrossentropy(normalize=normalize, names=names) - - -@registry.losses("SequenceCategoricalCrossentropy.v2") -def configure_SequenceCategoricalCrossentropy_v2( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - neg_prefix: Optional[str] = None, -) -> SequenceCategoricalCrossentropy: - return SequenceCategoricalCrossentropy( - normalize=normalize, names=names, neg_prefix=neg_prefix - ) - - -@registry.losses("SequenceCategoricalCrossentropy.v3") -def configure_SequenceCategoricalCrossentropy_v3( +@registry.losses("SequenceCategoricalCrossentropy.v4") +def configure_SequenceCategoricalCrossentropy_v4( *, normalize: bool = True, + sparse: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, ) -> SequenceCategoricalCrossentropy: + if names is None and neg_prefix is None and not sparse: + cross_entropy: Union[ + CategoricalCrossentropy, SparseCategoricalCrossentropy + ] = CategoricalCrossentropy( + normalize=False, + missing_value=cast(Optional[int], missing_value), + label_smoothing=label_smoothing, + ) + else: + cross_entropy = SparseCategoricalCrossentropy( + normalize=False, + names=names, + missing_value=cast(Optional[Union[str, int]], missing_value), + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) return SequenceCategoricalCrossentropy( + cross_entropy=cross_entropy, normalize=normalize, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, ) @@ -419,6 +563,43 @@ def _make_mask_by_value(truths, guesses, missing_value) -> Floats2d: return mask +def _array_like(a: Floats2d, like: FloatsOrRaggedT) -> FloatsOrRaggedT: + if isinstance(like, Ragged): + return Ragged(a, lengths=like.lengths) + else: + return a + + +def _to_array(guesses: FloatsOrRaggedT) -> Floats2d: + if isinstance(guesses, Ragged): + return cast(Floats2d, guesses.data.astype("float32")) + else: + return guesses + + +def _normalization_length(guesses: FloatsOrRaggedT) -> int: + if isinstance(guesses, Ragged): + return len(guesses.lengths) + else: + return guesses.shape[0] + + +def 
_check_ints1d(arr: ArrayXd): + """ + Check whether array is 1D and has type integer. + """ + if arr.ndim != 1: + raise ValueError( + "SparseCategoricalCrossentropy only accepts 1D arrays, but " + f"array with shape {arr.shape} was given." + ) + if arr.dtype.kind != "i": # type: ignore + raise ValueError( + "SparseCategoricalCrossentropy only accepts integer arrays, but " + f"array with {arr.dtype} was given." + ) + + __all__ = [ "SequenceCategoricalCrossentropy", "CategoricalCrossentropy", diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 75206d240..47e170ec0 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -1,108 +1,375 @@ import pytest import numpy -from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy -from thinc.api import L2Distance, CosineDistance +from functools import partial +from thinc.api import CategoricalCrossentropy +from thinc.api import L2Distance, CosineDistance, softmax_activation +from thinc.api import Ragged from thinc import registry +from thinc.util import has_torch, to_categorical +from hypothesis import given, settings +from hypothesis.strategies import integers, floats +from thinc.legacy import loss + +ALL_XP = [numpy] +try: + import cupy + + ALL_XP.append(cupy) +except ImportError: + pass + + +softmax_func = partial(softmax_activation(), is_train=False) +MAX_EXAMPLES = 50 # some simple arrays scores0 = numpy.zeros((3, 3), dtype="f") labels0 = numpy.asarray([0, 1, 1], dtype="i") # a few more diverse ones to test realistic values -guesses1 = numpy.asarray([[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]]) +guesses1 = numpy.asarray( + [[0.1, 0.5, 0.4], [0.4, 0.3, 0.3], [0, 1, 0], [0.1, 0.05, 0.85]], dtype="f" +) +guesses1_legacy = numpy.asarray( + [[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]], dtype="f" +) labels1 = numpy.asarray([2, 1, 0, 2]) -labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]]) +labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype="f") labels1_strings = ["C", "B", "A", "C"] - -guesses2 = numpy.asarray([[0.2, 0.3, 0.0]]) +d_guesses1 = numpy.array( + [ + [0.025, 0.125, -0.15], + [0.1, -0.175, 0.075], + [-0.25, 0.25, 0.0], + [0.025, 0.0125, -0.0375], + ], + dtype="f", +) +d_guesses1_seq = numpy.array( + [ + [0.05, 0.25, -0.3], + [0.2, -0.35, 0.15], + [-0.5, 0.5, 0.0], + [0.05, 0.025, -0.075], + ], + dtype="f", +) +d_guesses1_0_missing = numpy.array( + [ + [0.025, 0.125, -0.15], + [0.1, -0.175, 0.075], + [0.0, 0.0, 0.0], + [0.025, 0.0125, -0.0375], + ], + dtype="f", +) +d_guesses1_sum = numpy.array( + [ + [0.1, 0.5, -0.6], + [0.4, -0.7, 0.3], + [-1.0, 1.0, 0.0], + [0.1, 0.05, -0.15], + ], + dtype="f", +) +loss1 = 5.75151207 +loss1_seq = 11.50302410 +loss1_0_missing = 0.57069561 +guesses2 = numpy.asarray([[0.2, 0.3, 0.5]]) +guesses2_legacy = numpy.asarray([[0.2, 0.3, 0.0]]) labels2 = numpy.asarray([1]) labels2_strings = ["B"] +d_guesses2_sum = numpy.asarray([[0.2, -0.7, 0.5]]) +sequence_loss = 24.210021096627 +eps = 1e-6 + + +ce_factory = registry.get("losses", "CategoricalCrossentropy.v4") + +sparse_ce_factory = registry.get("losses", "SparseCategoricalCrossentropy.v4") + +seq_ce_factory = registry.get("losses", "SequenceCategoricalCrossentropy.v4") + + +def _get_legacy_cross_entropy(version: int, **kwargs): + return registry.get("losses", f"CategoricalCrossentropy.v{version}")(**kwargs) -eps = 0.0001 +def _get_legacy_seq_cross_entropy(version: int, **kwargs): + return registry.get("losses", 
f"SequenceCategoricalCrossentropy.v{version}")( + **kwargs + ) + + +def test_cross_entropy_types_shapes(): + sparse_cross_entropy = ce_factory() + cross_entropy = ce_factory() + sparse_seq_cross_entropy = seq_ce_factory() + seq_cross_entropy = seq_ce_factory(sparse=False) + d_scores_sparse = sparse_cross_entropy.get_grad(guesses1, labels1_full) + d_scores = cross_entropy.get_grad(guesses1, labels1_full) + assert d_scores_sparse.dtype == "float32" + assert d_scores.dtype == "float32" + assert d_scores_sparse.shape == guesses1.shape + assert d_scores.shape == guesses1.shape + d_scores_sparse = sparse_seq_cross_entropy.get_grad([guesses1], [labels1]) + d_scores = seq_cross_entropy.get_grad([guesses1], [labels1_full]) + assert d_scores_sparse[0].dtype == "float32" + assert d_scores[0].dtype == "float32" + assert d_scores_sparse[0].shape == guesses1.shape + assert d_scores[0].shape == guesses1.shape + assert sparse_seq_cross_entropy.get_grad([], []) == [] + assert seq_cross_entropy.get_grad([], []) == [] + d_scores_ragged = cross_entropy.get_grad( + Ragged(numpy.array(guesses1), lengths=[3, 1]), labels1_full + ) + assert isinstance(d_scores_ragged, Ragged) + assert d_scores_ragged.dataXd.dtype == "float32" + assert d_scores_ragged.dataXd.shape == guesses1.shape -def test_loss(): - d_scores = CategoricalCrossentropy().get_grad(scores0, labels0) + +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_cross_entropy_types_shapes(version): + cross_entropy = _get_legacy_cross_entropy(version) + seq_cross_entropy = _get_legacy_seq_cross_entropy(version) + d_scores = cross_entropy.get_grad(scores0, labels0) assert d_scores.dtype == "float32" assert d_scores.shape == scores0.shape - d_scores = SequenceCategoricalCrossentropy().get_grad([scores0], [labels0]) + d_scores = seq_cross_entropy.get_grad([scores0], [labels0]) assert d_scores[0].dtype == "float32" assert d_scores[0].shape == scores0.shape - assert SequenceCategoricalCrossentropy().get_grad([], []) == [] + assert seq_cross_entropy.get_grad([], []) == [] -@pytest.mark.parametrize( - "dist", [CategoricalCrossentropy(), CosineDistance(ignore_zeros=True), L2Distance()] +@pytest.mark.skipif(not has_torch, reason="needs PyTorch") +@pytest.mark.parametrize("xp", ALL_XP) +@settings(max_examples=MAX_EXAMPLES, deadline=None) +@given( + n_samples=integers(min_value=1, max_value=100), + n_classes=integers(min_value=1, max_value=100), + low=floats(min_value=-20, max_value=10), + offset=floats(min_value=1, max_value=10), ) +def test_compare_cross_entropy_to_torch(xp, n_samples, n_classes, low, offset): + import torch + + sparse_loss_sum = sparse_ce_factory(normalize=False) + sparse_loss_mean = sparse_ce_factory() + loss_sum = ce_factory(normalize=False) + loss_mean = ce_factory() + torch_loss_sum = torch.nn.CrossEntropyLoss(reduction="sum") + torch_loss_mean = torch.nn.CrossEntropyLoss() + logits = xp.random.uniform(low, low + offset, (n_samples, n_classes)) + labels = xp.random.randint(0, n_classes, n_samples) + labels_full = to_categorical(labels, n_classes=n_classes) + torch_logits = torch.tensor(logits, requires_grad=True) + torch_labels = torch.tensor(labels, dtype=torch.long) + probs, _ = softmax_func(logits) + d_sum_sparse, l_sum_sparse = sparse_loss_sum(probs, labels) + d_sum, l_sum = loss_sum(probs, labels_full) + torch_l_sum = torch_loss_sum(torch_logits, torch_labels) + torch_l_sum.backward() + torch_d_sum = torch_logits.grad + torch_logits = torch.tensor(logits, requires_grad=True) + d_mean_sparse, l_mean_sparse = sparse_loss_mean(probs, 
labels) + d_mean, l_mean = loss_mean(probs, labels_full) + torch_l_mean = torch_loss_mean(torch_logits, torch_labels) + torch_l_mean.backward() + torch_d_mean = torch_logits.grad + assert xp.isclose(float(l_sum), float(torch_l_sum), atol=1e-06) + assert xp.allclose(d_sum, torch_d_sum.numpy()) + assert xp.isclose(float(l_mean), float(torch_l_mean)) + assert xp.allclose(d_mean, torch_d_mean.numpy()) + assert xp.isclose(float(l_sum_sparse), float(torch_l_sum), atol=1e-06) + assert xp.allclose(d_sum_sparse, torch_d_sum.numpy()) + assert xp.isclose(float(l_mean_sparse), float(torch_l_mean)) + assert xp.allclose(d_mean_sparse, torch_d_mean.numpy()) + + +@pytest.mark.parametrize("dist", [CosineDistance(ignore_zeros=True), L2Distance()]) @pytest.mark.parametrize("vect", [scores0, guesses1, guesses2]) -def test_equality(dist, vect): - assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, eps) - assert dist.get_loss(vect, vect) == pytest.approx(0, eps) +def test_equal_distance(dist, vect): + assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) + assert dist.get_loss(vect, vect) == pytest.approx(0, abs=eps) + + +@pytest.mark.parametrize("version", [1, 2, 3]) +@pytest.mark.parametrize("vect", [scores0, guesses1_legacy, guesses2_legacy]) +def test_equal_legacy_cross_entropy(vect, version): + cross_entropy = _get_legacy_cross_entropy(version) + assert int(cross_entropy.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) + assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, grad, grad_seq, loss, loss_seq", + [ + (guesses1, labels1_full, d_guesses1, d_guesses1_seq, loss1, loss1_seq), + ], +) +def test_categorical_crossentropy(guesses, labels, grad, grad_seq, loss, loss_seq): + cross_entropy = ce_factory() + d_scores = cross_entropy.get_grad(guesses, labels) + loss_val = cross_entropy.get_loss(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) + assert numpy.isclose(loss_val, loss) + + # Test with Ragged inputs + d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) + loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) + assert d_scores_ragged.dataXd.shape == guesses.shape + assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) + assert numpy.isclose(loss_ragged, loss_seq) + + +@pytest.mark.parametrize( + "guesses, labels, grad, grad_seq, loss, loss_seq", + [ + (guesses1, labels1, d_guesses1, d_guesses1_seq, loss1, loss1_seq), + ], +) +def test_sparse_categorical_crossentropy( + guesses, labels, grad, grad_seq, loss, loss_seq +): + cross_entropy = sparse_ce_factory() + d_scores = cross_entropy.get_grad(guesses, labels) + loss_val = cross_entropy.get_loss(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) + assert numpy.isclose(loss_val, loss) + + # Test with Ragged inputs + d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) + loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) + assert d_scores_ragged.dataXd.shape == guesses.shape + assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) + assert numpy.isclose(loss_ragged, loss_seq) @pytest.mark.parametrize( - "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] + "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] ) -def test_categorical_crossentropy(guesses, labels): - d_scores = 
CategoricalCrossentropy(normalize=True).get_grad(guesses, labels) +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_crossentropy(guesses, labels, version): + cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) + d_scores = cross_entropy_normalize.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, eps) - assert d_scores[1][1] == pytest.approx(-0.1, eps) + assert d_scores[1][0] == pytest.approx(0.1, abs=eps) + assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores[2][0] == pytest.approx(0, eps) - assert d_scores[2][1] == pytest.approx(0.25, eps) - assert d_scores[2][2] == pytest.approx(0.25, eps) + assert d_scores[2][0] == pytest.approx(0, abs=eps) + assert d_scores[2][1] == pytest.approx(0.25, abs=eps) + assert d_scores[2][2] == pytest.approx(0.25, abs=eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, eps) - assert d_scores[3][1] == pytest.approx(0, eps) - assert d_scores[3][2] == pytest.approx(-0.25, eps) + assert d_scores[3][0] == pytest.approx(0, abs=eps) + assert d_scores[3][1] == pytest.approx(0, abs=eps) + assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - loss = CategoricalCrossentropy(normalize=True).get_loss(guesses, labels) - assert loss == pytest.approx(0.239375, eps) + loss = cross_entropy_normalize.get_loss(guesses, labels) + assert loss == pytest.approx(0.239375, abs=eps) def test_crossentropy_incorrect_scores_targets(): labels = numpy.asarray([2]) + labels_full = numpy.asarray([[0.0, 0.0, 1.0]]) + cross_entropy = ce_factory() + sparse_cross_entropy = sparse_ce_factory() guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - CategoricalCrossentropy(normalize=True).get_grad(guesses_neg, labels) + cross_entropy.get_grad(guesses_neg, labels_full) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + sparse_cross_entropy.get_grad(guesses_neg, labels) + + guesses_dont_sum_one = numpy.asarray([[0.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + cross_entropy.get_grad(guesses_dont_sum_one, labels_full) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + sparse_cross_entropy.get_grad(guesses_dont_sum_one, labels) guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - CategoricalCrossentropy(normalize=True).get_grad( - guesses_larger_than_one, labels - ) + cross_entropy.get_grad(guesses_larger_than_one, labels_full) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + sparse_cross_entropy.get_grad(guesses_larger_than_one, labels) guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - CategoricalCrossentropy(normalize=True).get_grad(guesses_ok, targets_neg) + cross_entropy.get_grad(guesses_ok, targets_neg) targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - CategoricalCrossentropy(normalize=True).get_grad( - guesses_ok, targets_larger_than_one - ) + cross_entropy.get_grad(guesses_ok, targets_larger_than_one) + + 
targets_dont_sum_one = numpy.asarray([[0.9, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): + cross_entropy.get_grad(guesses_ok, targets_dont_sum_one) + + +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_cross_entropy_incorrect_scores_targets(version): + labels = numpy.asarray([2]) + cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) + guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + cross_entropy_normalize.get_grad(guesses_neg, labels) + + guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + cross_entropy_normalize.get_grad(guesses_larger_than_one, labels) + + guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) + targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): + cross_entropy_normalize.get_grad(guesses_ok, targets_neg) + + targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): + cross_entropy_normalize.get_grad(guesses_ok, targets_larger_than_one) + + +@pytest.mark.parametrize( + "guesses, labels, grad, missing_value", + [ + (guesses1, [2, 1, 0, 2], d_guesses1_0_missing, 0), + (guesses1, labels1, d_guesses1_0_missing, 0), + (guesses1, labels1_strings, d_guesses1_0_missing, "A"), + ], +) +def test_sparse_crossentropy_missing(guesses, labels, grad, missing_value): + if missing_value == "A": + names = ["A", "B", "C"] + else: + names = None + sparse_cross_entropy = sparse_ce_factory(missing_value=missing_value, names=names) + d_scores = sparse_cross_entropy.get_grad(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) + loss = sparse_cross_entropy.get_loss(guesses, labels) + assert numpy.isclose(loss, loss1_0_missing) @pytest.mark.parametrize( "guesses, labels", - [(guesses1, [2, 1, 0, 2])], + [(guesses1_legacy, [2, 1, 0, 2])], ) -def test_categorical_crossentropy_int_list_missing(guesses, labels): - d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( - guesses, labels +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_crossentropy_int_list_missing(guesses, labels, version): + cross_entropy_normalize_missing = _get_legacy_cross_entropy( + version, normalize=True, missing_value=0 ) + d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, eps) - assert d_scores[1][1] == pytest.approx(-0.1, eps) + assert d_scores[1][0] == pytest.approx(0.1, abs=eps) + assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -110,28 +377,46 @@ def test_categorical_crossentropy_int_list_missing(guesses, labels): assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, eps) - assert d_scores[3][1] == pytest.approx(0, eps) - assert d_scores[3][2] == pytest.approx(-0.25, eps) + assert d_scores[3][0] == pytest.approx(0, abs=eps) + assert d_scores[3][1] == pytest.approx(0, abs=eps) + assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) + + loss = cross_entropy_normalize_missing.get_loss(guesses, labels) + assert loss == pytest.approx(0.114375, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, grad", + [ + (guesses1, labels1_full, d_guesses1_0_missing), + ], +) +def test_categorical_crossentropy_missing(guesses, labels, grad): + cross_entropy = ce_factory(missing_value=0) + d_scores = cross_entropy.get_grad(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( guesses, labels ) - assert loss == pytest.approx(0.114375, eps) + assert numpy.isclose(loss, loss1_0_missing) @pytest.mark.parametrize( - "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] + "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] ) -def test_categorical_crossentropy_missing(guesses, labels): - d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( - guesses, labels +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_crossentropy_missing(guesses, labels, version): + cross_entropy_normalize_missing = _get_legacy_cross_entropy( + version, normalize=True, missing_value=0 ) + d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, eps) - assert d_scores[1][1] == pytest.approx(-0.1, eps) + assert d_scores[1][0] == pytest.approx(0.1, abs=eps) + assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -139,95 +424,179 @@ def test_categorical_crossentropy_missing(guesses, labels): assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, eps) - assert d_scores[3][1] == pytest.approx(0, eps) - assert d_scores[3][2] == pytest.approx(-0.25, eps) + assert d_scores[3][0] == pytest.approx(0, abs=eps) + assert d_scores[3][1] == pytest.approx(0, abs=eps) + assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( - guesses, labels - ) - assert loss == pytest.approx(0.114375, eps) + loss = cross_entropy_normalize_missing.get_loss(guesses, labels) + assert loss == pytest.approx(0.114375, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, names, grad, loss", + [ + ( + [guesses1, guesses2], + [labels1, labels2], + [], + [d_guesses1_sum, d_guesses2_sum], + sequence_loss, + ), + ( + [guesses1, guesses2], + [labels1_strings, labels2_strings], + ["A", "B", "C"], + [d_guesses1_sum, d_guesses2_sum], + sequence_loss, + ), + ], +) +def test_sequence_sparse_crossentropy(guesses, labels, names, grad, loss): + sparse_seq_cross_entropy_sum = seq_ce_factory(names=names, normalize=False) + sparse_seq_cross_entropy = seq_ce_factory(names=names, normalize=True) + d_scores = sparse_seq_cross_entropy_sum.get_grad(guesses, labels) + assert numpy.allclose(d_scores[0], grad[0]) + assert numpy.allclose(d_scores[1], grad[1]) + # The normalization divides the difference (e.g. 0.4) by the number of seqs + d_scores = sparse_seq_cross_entropy.get_grad(guesses, labels) + assert numpy.allclose(d_scores[0], grad[0] / 2.0) + assert numpy.allclose(d_scores[1], grad[1] / 2.0) + loss_val = sparse_seq_cross_entropy.get_loss(guesses, labels) + assert numpy.isclose(loss_val, loss) + d_scores, loss_val = sparse_seq_cross_entropy_sum(guesses, labels) + assert numpy.isclose(loss_val, loss) + assert numpy.allclose(d_scores[0], grad[0]) + assert numpy.allclose(d_scores[1], grad[1]) + + +@pytest.mark.parametrize( + "guesses, labels, grad, loss", + [([guesses1], [labels1_full], [d_guesses1_sum], [23.00604829563447])], +) +def test_sequence_crossentropy(guesses, labels, grad, loss): + seq_cross_entropy = seq_ce_factory(sparse=False, normalize=False) + d_scores = seq_cross_entropy.get_grad(guesses, labels) + assert numpy.allclose(d_scores[0], grad[0]) + # The normalization divides the difference (e.g. 
0.4) by the number of seqs + loss_val = seq_cross_entropy.get_loss(guesses, labels) + assert numpy.isclose(loss_val, loss) + d_scores, loss_val = seq_cross_entropy(guesses, labels) + assert numpy.isclose(loss_val, loss) + assert numpy.allclose(d_scores[0], grad[0]) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1, guesses2], [labels1, labels2], []), - ([guesses1, guesses2], [labels1_full, labels2], []), - ([guesses1, guesses2], [labels1_strings, labels2_strings], ["A", "B", "C"]), + ([guesses1_legacy, guesses2_legacy], [labels1, labels2], []), + ([guesses1_legacy, guesses2_legacy], [labels1_full, labels2], []), + ( + [guesses1_legacy, guesses2_legacy], + [labels1_strings, labels2_strings], + ["A", "B", "C"], + ), ], ) -def test_sequence_categorical_crossentropy(guesses, labels, names): - d_scores = SequenceCategoricalCrossentropy(normalize=False, names=names).get_grad( - guesses, labels +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_sequence_categorical_crossentropy(guesses, labels, names, version): + seq_cross_entropy_names = _get_legacy_seq_cross_entropy( + version, normalize=False, names=names ) + seq_cross_entropy_names_normalize = _get_legacy_seq_cross_entropy( + version, normalize=True, names=names + ) + d_scores = seq_cross_entropy_names.get_grad(guesses, labels) d_scores1 = d_scores[0] d_scores2 = d_scores[1] assert d_scores1.shape == guesses1.shape assert d_scores2.shape == guesses2.shape - assert d_scores1[1][0] == pytest.approx(0.4, eps) - assert d_scores1[1][1] == pytest.approx(-0.4, eps) + assert d_scores1[1][0] == pytest.approx(0.4, abs=eps) + assert d_scores1[1][1] == pytest.approx(-0.4, abs=eps) # The normalization divides the difference (e.g. 0.4) by the number of seqs - d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad( - guesses, labels - ) + d_scores = seq_cross_entropy_names_normalize.get_grad(guesses, labels) d_scores1 = d_scores[0] d_scores2 = d_scores[1] - assert d_scores1[1][0] == pytest.approx(0.2, eps) - assert d_scores1[1][1] == pytest.approx(-0.2, eps) + assert d_scores1[1][0] == pytest.approx(0.2, abs=eps) + assert d_scores1[1][1] == pytest.approx(-0.2, abs=eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores1[2][0] == pytest.approx(0, eps) - assert d_scores1[2][1] == pytest.approx(0.5, eps) - assert d_scores1[2][2] == pytest.approx(0.5, eps) + assert d_scores1[2][0] == pytest.approx(0, abs=eps) + assert d_scores1[2][1] == pytest.approx(0.5, abs=eps) + assert d_scores1[2][2] == pytest.approx(0.5, abs=eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores1[3][0] == pytest.approx(0, eps) - assert d_scores1[3][1] == pytest.approx(0, eps) - assert d_scores1[3][2] == pytest.approx(-0.5, eps) + assert d_scores1[3][0] == pytest.approx(0, abs=eps) + assert d_scores1[3][1] == pytest.approx(0, abs=eps) + assert d_scores1[3][2] == pytest.approx(-0.5, abs=eps) # Test the second batch - assert d_scores2[0][0] == pytest.approx(0.1, eps) - assert d_scores2[0][1] == pytest.approx(-0.35, eps) + assert d_scores2[0][0] == pytest.approx(0.1, abs=eps) + assert d_scores2[0][1] == pytest.approx(-0.35, abs=eps) - loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss( - guesses, labels + loss = seq_cross_entropy_names_normalize.get_loss(guesses, labels) + assert loss == pytest.approx(1.09, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, names, grad", + [ + ( + [guesses1], + [["A", "!A", "", 
"!C"]], + ["A", "B", "C"], + numpy.array( + [ + [-0.9, 0.5, 0.4], # First is correct + [0.4, 0.0, 0.0], # Not first one + [0.0, 0.0, 0.0], # Missing + [0.0, 0.0, 0.85], # Not last one + ] + ), + ) + ], +) +def test_sequence_crossentropy_missing_negative(guesses, labels, names, grad): + sparse_seq_ce = seq_ce_factory( + names=names, normalize=False, neg_prefix="!", missing_value="" ) - assert loss == pytest.approx(1.09, eps) + d_scores = sparse_seq_ce.get_grad(guesses, labels) + assert numpy.allclose(d_scores, grad) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1], [["A", "!A", "", "!C"]], ["A", "B", "C"]), + ([guesses1_legacy], [["A", "!A", "", "!C"]], ["A", "B", "C"]), ], ) -def test_sequence_categorical_missing_negative(guesses, labels, names): - d_scores = SequenceCategoricalCrossentropy( - normalize=False, names=names, neg_prefix="!", missing_value="" - ).get_grad(guesses, labels) +@pytest.mark.parametrize("version", [3]) +def test_legacy_sequence_categorical_missing_negative(guesses, labels, names, version): + seq_cross_entropy = _get_legacy_seq_cross_entropy( + version, normalize=False, names=names, neg_prefix="!", missing_value="" + ) + d_scores = seq_cross_entropy.get_grad(guesses, labels) d_scores0 = d_scores[0] # [0.1, 0.5, 0.6] should be A - assert d_scores0[0][0] == pytest.approx(-0.9, eps) - assert d_scores0[0][1] == pytest.approx(0.5, eps) - assert d_scores0[0][2] == pytest.approx(0.6, eps) + assert d_scores0[0][0] == pytest.approx(-0.9, abs=eps) + assert d_scores0[0][1] == pytest.approx(0.5, abs=eps) + assert d_scores0[0][2] == pytest.approx(0.6, abs=eps) # [0.4, 0.6, 0.3] should NOT be A - assert d_scores0[1][0] == pytest.approx(0.4, eps) - assert d_scores0[1][1] == pytest.approx(0.0, eps) - assert d_scores0[1][2] == pytest.approx(0.0, eps) + assert d_scores0[1][0] == pytest.approx(0.4, abs=eps) + assert d_scores0[1][1] == pytest.approx(0.0, abs=eps) + assert d_scores0[1][2] == pytest.approx(0.0, abs=eps) # [1, 1, 1] has missing gold label - assert d_scores0[2][0] == pytest.approx(0.0, eps) - assert d_scores0[2][1] == pytest.approx(0.0, eps) - assert d_scores0[2][2] == pytest.approx(0.0, eps) + assert d_scores0[2][0] == pytest.approx(0.0, abs=eps) + assert d_scores0[2][1] == pytest.approx(0.0, abs=eps) + assert d_scores0[2][2] == pytest.approx(0.0, abs=eps) # [0.0, 0.0, 0.0] should NOT be C - assert d_scores0[3][0] == pytest.approx(0.0, eps) - assert d_scores0[3][1] == pytest.approx(0.0, eps) - assert d_scores0[3][2] == pytest.approx(0.0, eps) + assert d_scores0[3][0] == pytest.approx(0.0, abs=eps) + assert d_scores0[3][1] == pytest.approx(0.0, abs=eps) + assert d_scores0[3][2] == pytest.approx(0.0, abs=eps) def test_L2(): @@ -241,10 +610,10 @@ def test_L2(): ) loss_not_normalized = L2Distance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(20, eps) + assert loss_not_normalized == pytest.approx(20, abs=eps) loss_normalized = L2Distance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(5, eps) + assert loss_normalized == pytest.approx(5, abs=eps) def test_cosine_orthogonal(): @@ -260,10 +629,10 @@ def test_cosine_orthogonal(): assert d_vecs[1][1] > 0 loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(2, eps) + assert loss_not_normalized == pytest.approx(2, abs=eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(1, eps) + assert loss_normalized == 
pytest.approx(1, abs=eps) def test_cosine_equal(): @@ -276,10 +645,10 @@ def test_cosine_equal(): numpy.testing.assert_allclose(d_vec1, numpy.zeros(d_vec1.shape), rtol=eps, atol=eps) loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(0, eps) + assert loss_not_normalized == pytest.approx(0, abs=eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(0, eps) + assert loss_normalized == pytest.approx(0, abs=eps) def test_cosine_unmatched(): @@ -292,19 +661,26 @@ def test_cosine_unmatched(): @pytest.mark.parametrize( "name,kwargs,args", [ - ("CategoricalCrossentropy.v1", {}, (scores0, labels0)), - ("SequenceCategoricalCrossentropy.v1", {}, ([scores0], [labels0])), - ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (scores0, labels0)), - ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (scores0, labels0)), + ("CategoricalCrossentropy.v1", {}, (guesses1, labels1)), + ("SequenceCategoricalCrossentropy.v1", {}, ([guesses1], [labels1])), + ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (guesses1, labels1)), + ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (guesses1, labels1)), + ("SparseCategoricalCrossentropy.v4", {"neg_prefix": "!"}, (guesses1, labels1)), + ("CategoricalCrossentropy.v4", {}, (guesses1, labels1_full)), ( "SequenceCategoricalCrossentropy.v2", {"neg_prefix": "!"}, - ([scores0], [labels0]), + ([guesses1], [labels1]), ), ( "SequenceCategoricalCrossentropy.v3", {"neg_prefix": "!"}, - ([scores0], [labels0]), + ([guesses1], [labels1]), + ), + ( + "SequenceCategoricalCrossentropy.v4", + {"neg_prefix": "!"}, + ([guesses1], [labels1]), ), ("L2Distance.v1", {}, (scores0, scores0)), ( diff --git a/thinc/util.py b/thinc/util.py index b87ca4e5f..059f2c235 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,7 +1,8 @@ from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar -from typing import List, Mapping, Tuple +from typing import List, Mapping +from typing import TYPE_CHECKING + import numpy -from packaging.version import Version import random import functools from wasabi import table @@ -15,18 +16,16 @@ from dataclasses import dataclass from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu -from .compat import has_torch_mps_gpu from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack -DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) - -from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 +from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd, Floats2d # noqa: E402 from . import types # noqa: E402 -from typing import TYPE_CHECKING if TYPE_CHECKING: from .api import Ops +DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) + def get_torch_default_device() -> "torch.device": if torch is None: @@ -254,6 +253,21 @@ def to_categorical( return label_distr[Y] +def smooth_one_hot(X: Floats2d, label_smoothing: float) -> Floats2d: + """ + Apply label-smoothing to one-hot array. + """ + if not 0.0 <= label_smoothing < 0.5: + raise ValueError( + "label_smoothing should be greater or " + "equal to 0.0 and less than 0.5, " + f"but {label_smoothing} was provided." 
+ ) + X[X == 1] = 1 - label_smoothing + X[X == 0] = label_smoothing / (X.shape[1] - 1) + return X + + def get_width( X: Union[ArrayXd, Ragged, Padded, Sequence[ArrayXd]], *, dim: int = -1 ) -> int: From cdc971702dd2cd14e45dfb06ed5cbad816771239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 9 Dec 2022 08:49:13 +0100 Subject: [PATCH 07/30] Bring back support for missing labels to legacy cross entropy (#809) * Bring back support for missing labels to legacy cross entropy * Use `missing_value` to detect missing values * Typing fixes --- thinc/legacy/loss.py | 35 +++++++++++++++++++---------------- thinc/tests/test_loss.py | 9 +++++++++ 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/thinc/legacy/loss.py b/thinc/legacy/loss.py index 439a2ca21..ab9871625 100644 --- a/thinc/legacy/loss.py +++ b/thinc/legacy/loss.py @@ -1,13 +1,13 @@ from typing import Optional, Sequence, Dict, Union, Tuple from typing import cast, List -from ..types import Floats2d, Ints1d, Ints2d +from ..types import Floats2d, Ints1d from ..config import registry from ..util import to_categorical, get_array_module from ..loss import IntsOrFloatsOrStrs, Loss from ..loss import _make_mask, _make_mask_by_value -TruthsT = Union[List[str], List[int], Ints1d, Floats2d] +TruthsT = Union[List[Optional[str]], List[int], Ints1d, Floats2d] class LegacyCategoricalCrossentropy(Loss): @@ -34,7 +34,9 @@ def __init__( else: self._name_to_i = {} - def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: + def convert_truths( + self, truths: TruthsT, guesses: Floats2d + ) -> Tuple[Floats2d, Floats2d]: xp = get_array_module(guesses) missing = [] negatives_mask = None @@ -49,13 +51,13 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, if not isinstance(value, int): raise ValueError( "All values in the truths list have to " - "have the same time. The first value was " + "have the same type. The first value was " f"detected to be integer, but found {type(value)}." ) if value == missing_value: missing.append(i) else: - truths = cast(List[str], truths) + truths = cast(List[Optional[str]], truths) if self.names is None: msg = ( "Cannot calculate loss from list of strings without names. " @@ -65,10 +67,10 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, ) raise ValueError(msg) for i, value in enumerate(truths): - if not isinstance(value, str): + if not (isinstance(value, str) or value == missing_value): raise ValueError( "All values in the truths list have to " - "have the same time. The first value was " + "have the same type. The first value was " f"detected to be string, but found {type(value)}." ) if value == missing_value: @@ -79,11 +81,16 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, and self.neg_prefix and value.startswith(self.neg_prefix) ): - truths[i] = value[len(self.neg_prefix) :] - neg_index = self._name_to_i[truths[i]] + neg_value = value[len(self.neg_prefix) :] + truths[i] = neg_value + neg_index = self._name_to_i[neg_value] negatives_mask[i] = 0 # type: ignore negatives_mask[i][neg_index] = -1 # type: ignore - truths = [self._name_to_i[name] for name in truths] + # In the loop above, we have ensured that `truths` doesn't + # contain `None` (anymore). However, mypy can't infer this + # and doesn't like the shadowing. 
+ truths_str = cast(List[str], truths) + truths = [self._name_to_i[name] for name in truths_str] truths = xp.asarray(truths, dtype="i") mask = _make_mask(guesses, missing) else: @@ -113,9 +120,7 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, mask *= negatives_mask return cast(Floats2d, truths_2d), mask - def __call__( - self, guesses: Floats2d, truths: TruthsT - ) -> Tuple[Floats2d, float]: + def __call__(self, guesses: Floats2d, truths: TruthsT) -> Tuple[Floats2d, float]: d_truth = self.get_grad(guesses, truths) return (d_truth, self._get_loss_from_grad(d_truth)) @@ -187,9 +192,7 @@ def get_grad( d_scores.append(d_yh) return d_scores - def get_loss( - self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] - ) -> float: + def get_loss(self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT]) -> float: return self._get_loss_from_grad(self.get_grad(guesses, truths)) def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 47e170ec0..2cb49e466 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -199,6 +199,15 @@ def test_equal_legacy_cross_entropy(vect, version): cross_entropy = _get_legacy_cross_entropy(version) assert int(cross_entropy.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) + assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) + + +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_cross_entropy_absent_labels(version): + cross_entropy = _get_legacy_cross_entropy(version, names=["cat", "dog", "rat"]) + assert cross_entropy.get_loss(scores0, [None, None, None]) == pytest.approx( + 0, abs=eps + ) @pytest.mark.parametrize( From 9743709d5a705366c79c15e03f1b2bf5ead96955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 9 Dec 2022 13:42:14 +0100 Subject: [PATCH 08/30] Set version to v9.0.0.dev0 (#816) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 087ac261f..bcdeb1bbc 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.5" +__version__ = "v9.0.0.dev0" __release__ = True From 07f8f888308b9ed453ff4e1bb09c3eb505c98558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 9 Dec 2022 14:28:03 +0100 Subject: [PATCH 09/30] Fix spurious `v` prefix in the version number (#818) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index bcdeb1bbc..3c68811b6 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "v9.0.0.dev0" +__version__ = "9.0.0.dev0" __release__ = True From 717c70e03007b5b376dc353daf501a2b5d0043b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 22 Dec 2022 19:51:44 +0100 Subject: [PATCH 10/30] Give schedules access to the key, step, and last eval score (#804) * Give schedulers access to the key, step, and last eval score Before this change schedules were generators that generate a value for each training step. This, however has the limitation that scheduler cannot use other information that is available in the optimizer such as the parameter key. This information is useful for e.g. discriminative learning rates, where certain parameters are on a different schedule than others. 
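As a hedged illustration of that use case (not part of the patch itself), the sketch below shows what a key-aware learning-rate schedule could look like on top of the `Schedule` class this commit introduces; the `"embed"` name prefix and the `0.1` scale factor are assumptions invented for the example.

```python
# Hypothetical sketch: a discriminative learning-rate schedule. The optimizer
# passes each parameter's key as key=(id, name) plus the last evaluation
# score; here only the name is inspected, and parameters whose name starts
# with "embed" (an assumed naming convention) get a smaller rate.
from typing import Tuple

from thinc.api import Schedule


def per_param_lr(base_rate: float, embed_scale: float = 0.1) -> Schedule:
    return Schedule(
        "per_param_lr",
        _per_param_lr_schedule,
        attrs={"base_rate": base_rate, "embed_scale": embed_scale},
    )


def _per_param_lr_schedule(
    schedule: Schedule, step: int, *, key: Tuple[int, str] = (0, ""), **kwargs
) -> float:
    base_rate = schedule.attrs["base_rate"]
    embed_scale = schedule.attrs["embed_scale"]
    _, name = key
    return base_rate * embed_scale if name.startswith("embed") else base_rate
```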
To accommodate passing additional information, this change converts schedules to callables. These callables are passed the training step, the parameter key, and the last evaluation score (when available). Traditional scalar and generated schedules are converted to callables by the optimizer for compatibility. * Fix use of the `t` parameter where used in the schedules Also add tests, so that doesn't break again. * Fixes from @shadeMe * Call _schedule_args once * Make Optimizer.step private * Fix two missed step uses in tests * Float fix Co-authored-by: Madeesh Kannan * Fix schedule call * Move `ScheduleCallable` to `thinc.types` * Move from callables to a `Schedule` class The new learning rate functionality used `Callable`s. However, the issue with callables it that they cannot be pickled. This is problematic, because schedules can end up in spaCy pipelines (e.g. through the optimizer associated with the `Language` object). This change solves this issue by refactoring the schedules into regular objects. This now works similar to Thinc `Model`s -- there is a new `Scheduler` class which can be constructed with composition. I tested the changes with spaCy and pickling as well as usin existing configurations works. * Remove stray `runtime_checkable` import * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem Co-authored-by: Madeesh Kannan Co-authored-by: Sofie Van Landeghem --- thinc/api.py | 6 +- thinc/optimizers.py | 213 +++++++++++++++++++++++---------- thinc/schedules.py | 209 ++++++++++++++++++++++++-------- thinc/tests/test_config.py | 5 +- thinc/tests/test_optimizers.py | 74 ++++++++++-- thinc/tests/test_schedules.py | 53 ++++---- website/docs/api-optimizers.md | 33 +++-- website/docs/api-schedules.md | 111 ++++++++++++++--- website/docs/usage-config.md | 53 ++++---- website/docs/usage-training.md | 74 ++++++------ 10 files changed, 588 insertions(+), 243 deletions(-) diff --git a/thinc/api.py b/thinc/api.py index 8f5b3247e..f9e392048 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -8,8 +8,8 @@ from .shims import Shim, PyTorchGradScaler, PyTorchShim, TensorFlowShim, keras_model_fns from .shims import MXNetShim, TorchScriptShim, maybe_handshake_model from .optimizers import Adam, RAdam, SGD, Optimizer -from .schedules import cyclic_triangular, warmup_linear, constant, constant_then -from .schedules import decaying, slanted_triangular, compounding +from .schedules import Schedule, cyclic_triangular, warmup_linear, constant +from .schedules import constant_then, decaying, slanted_triangular, compounding from .types import Ragged, Padded, ArgsKwargs, Unserializable from .util import fix_random_seed, is_cupy_array, set_active_gpu from .util import prefer_gpu, require_gpu, require_cpu @@ -66,7 +66,7 @@ # .optimizers "Adam", "RAdam", "SGD", "Optimizer", # .schedules - "cyclic_triangular", "warmup_linear", "constant", "constant_then", + "Schedule", "cyclic_triangular", "warmup_linear", "constant", "constant_then", "decaying", "slanted_triangular", "compounding", # .types "Ragged", "Padded", "ArgsKwargs", "Unserializable", diff --git a/thinc/optimizers.py b/thinc/optimizers.py index f34cd2ff8..b0636fd87 100644 --- a/thinc/optimizers.py +++ b/thinc/optimizers.py @@ -1,16 +1,17 @@ -import math - -from typing import Dict, Optional, Union, Tuple, List, cast +from typing import Any, Dict, Optional, Union, Tuple, List, cast from collections import defaultdict +import itertools +import math +from types import GeneratorType from .backends import get_array_ops from .types import 
Generator, FloatsXd from .config import registry +from .schedules import constant, Schedule KeyT = Tuple[int, str] -FloatOrSeq = Union[float, List[float], Generator] -IntOrSeq = Union[int, List[int], Generator] +ScheduleT = Union[float, List[float], Generator, Schedule] SGD_DEFAULTS: Dict[str, Union[float, bool, int]] = { "L2": 0.0, @@ -32,14 +33,14 @@ @registry.optimizers("RAdam.v1") def RAdam( - learn_rate: FloatOrSeq = ADAM_DEFAULTS["learn_rate"], + learn_rate: ScheduleT = ADAM_DEFAULTS["learn_rate"], *, - beta1: FloatOrSeq = ADAM_DEFAULTS["beta1"], - beta2: FloatOrSeq = ADAM_DEFAULTS["beta2"], - eps: FloatOrSeq = ADAM_DEFAULTS["eps"], - L2: FloatOrSeq = ADAM_DEFAULTS["L2"], + beta1: ScheduleT = ADAM_DEFAULTS["beta1"], + beta2: ScheduleT = ADAM_DEFAULTS["beta2"], + eps: ScheduleT = ADAM_DEFAULTS["eps"], + L2: ScheduleT = ADAM_DEFAULTS["L2"], L2_is_weight_decay: bool = cast(bool, ADAM_DEFAULTS["L2_is_weight_decay"]), - grad_clip: FloatOrSeq = ADAM_DEFAULTS["grad_clip"], + grad_clip: ScheduleT = ADAM_DEFAULTS["grad_clip"], use_averages: bool = True, ): return Optimizer( @@ -57,13 +58,13 @@ def RAdam( @registry.optimizers("Adam.v1") def Adam( - learn_rate: FloatOrSeq = ADAM_DEFAULTS["learn_rate"], + learn_rate: ScheduleT = ADAM_DEFAULTS["learn_rate"], *, - L2: FloatOrSeq = ADAM_DEFAULTS["L2"], - beta1: FloatOrSeq = ADAM_DEFAULTS["beta1"], - beta2: FloatOrSeq = ADAM_DEFAULTS["beta2"], - eps: FloatOrSeq = ADAM_DEFAULTS["eps"], - grad_clip: FloatOrSeq = ADAM_DEFAULTS["grad_clip"], + L2: ScheduleT = ADAM_DEFAULTS["L2"], + beta1: ScheduleT = ADAM_DEFAULTS["beta1"], + beta2: ScheduleT = ADAM_DEFAULTS["beta2"], + eps: ScheduleT = ADAM_DEFAULTS["eps"], + grad_clip: ScheduleT = ADAM_DEFAULTS["grad_clip"], L2_is_weight_decay: bool = cast(bool, ADAM_DEFAULTS["L2_is_weight_decay"]), use_averages: bool = True, ): @@ -82,10 +83,10 @@ def Adam( @registry.optimizers("SGD.v1") def SGD( - learn_rate: FloatOrSeq, + learn_rate: ScheduleT, *, - L2: FloatOrSeq = SGD_DEFAULTS["L2"], - grad_clip: FloatOrSeq = SGD_DEFAULTS["grad_clip"], + L2: ScheduleT = SGD_DEFAULTS["L2"], + grad_clip: ScheduleT = SGD_DEFAULTS["grad_clip"], L2_is_weight_decay: bool = cast(bool, SGD_DEFAULTS["L2_is_weight_decay"]), use_averages: bool = True, ): @@ -111,15 +112,17 @@ class Optimizer(object): schedules: Dict[str, Generator] nr_update: Dict[KeyT, int] last_seen: Dict[KeyT, int] - grad_clip: float - learn_rate: float - b1: float - b2: float - eps: float - L2: float + grad_clip: Schedule + learn_rate: Schedule + b1: Schedule + b2: Schedule + eps: Schedule + L2: Schedule use_radam: bool L2_is_weight_decay: bool _radam_buffer: List[List[Optional[FloatsXd]]] + _step: int + _last_score: Optional[Tuple[int, float]] # This "locks" the class, so we get an error if you try to assign to # an unexpected variable. 
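Since every hyperparameter attribute above is now a `Schedule`, downstream code reads it by calling it rather than treating it as a float. A minimal sketch, using the same `(0, "")` stub key as the test suite:

```python
# Minimal sketch: hyperparameters such as learn_rate are Schedule objects
# after this change, so they are called with the current step and a parameter
# key instead of being read directly. (0, "") is a stub key, as in the tests.
from thinc.api import Adam

optimizer = Adam(learn_rate=0.001)  # plain floats are wrapped in constant()
lr = optimizer.learn_rate(step=optimizer.step, key=(0, ""))
assert lr == 0.001
```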
@@ -139,17 +142,19 @@ class Optimizer(object): "use_radam", "L2_is_weight_decay", "_radam_buffer", + "_step", + "_last_score", ] def __init__( self, - learn_rate: FloatOrSeq, + learn_rate: ScheduleT, *, - L2: FloatOrSeq = ADAM_DEFAULTS["L2"], - beta1: FloatOrSeq = ADAM_DEFAULTS["beta1"], - beta2: FloatOrSeq = ADAM_DEFAULTS["beta2"], - eps: FloatOrSeq = ADAM_DEFAULTS["eps"], - grad_clip: FloatOrSeq = ADAM_DEFAULTS["grad_clip"], + L2: ScheduleT = ADAM_DEFAULTS["L2"], + beta1: ScheduleT = ADAM_DEFAULTS["beta1"], + beta2: ScheduleT = ADAM_DEFAULTS["beta2"], + eps: ScheduleT = ADAM_DEFAULTS["eps"], + grad_clip: ScheduleT = ADAM_DEFAULTS["grad_clip"], use_averages: bool = True, use_radam: bool = False, L2_is_weight_decay: bool = True, @@ -168,13 +173,14 @@ def __init__( L2_is_weight_decay (bool): Whether to interpret the L2 parameter as a weight decay term, in the style of the AdamW optimizer. """ + self._step = 0 + self._last_score = None self.mom1 = {} self.mom2 = {} if use_averages: self.averages = {} else: self.averages = None - self.schedules = {} self.nr_update = defaultdict(int) self.last_seen = defaultdict(int) self._set_attr_or_schedule("grad_clip", grad_clip) @@ -189,24 +195,38 @@ def __init__( def _set_attr_or_schedule(self, name, value): if isinstance(value, (float, bool, int)): + setattr(self, name, constant(value)) + elif isinstance(value, list): + value = iter(value) + setattr(self, name, _wrap_generator(name, value)) + elif isinstance(value, GeneratorType): + setattr(self, name, _wrap_generator(name, value)) + elif isinstance(value, Schedule): setattr(self, name, value) else: - if isinstance(value, list): - value = iter(value) - self.schedules[name] = value - try: - setattr(self, name, next(value)) - except (StopIteration, TypeError) as e: - err = f"Invalid schedule for '{name}' ({type(value)})\n{e}" - raise ValueError(err) + err = f"Invalid schedule for '{name}' ({type(value)})" + raise ValueError(err) def step_schedules(self): - for key, schedule in self.schedules.items(): - try: - value = next(schedule) - except StopIteration: # schedule exhausted, use last value - value = getattr(self, key) - setattr(self, key, value) + self._step += 1 + + @property + def last_score(self) -> Optional[Tuple[int, float]]: + return self._last_score + + @last_score.setter + def last_score(self, score: float): + self._last_score = (self._step, score) + + @property + def step(self) -> int: + return self._step + + def _schedule_args(self, key: KeyT) -> Dict[str, Any]: + return { + "key": key, + "last_score": self.last_score, + } def __call__( self, @@ -221,28 +241,42 @@ def __call__( """ if len(gradient) < 1: return weights, gradient + ops = get_array_ops(weights) self.nr_update[key] += 1 nr_upd = self.nr_update[key] - if self.L2 != 0 and not self.L2_is_weight_decay: - gradient += self.L2 * weights - if self.grad_clip: - gradient = ops.clip_gradient(gradient, self.grad_clip) + schedule_args = self._schedule_args(key) + + if self.L2(self.step, **schedule_args) != 0 and not self.L2_is_weight_decay: + gradient += self.L2(self.step, **schedule_args) * weights + if self.grad_clip(self.step, **schedule_args): + gradient = ops.clip_gradient( + gradient, + self.grad_clip(self.step, **schedule_args), + ) if self.use_radam: weights, gradient = self._radam( ops, weights, gradient, lr_scale, key, nr_upd ) - elif self.b1 > 0.0 and self.b2 > 0.0: + elif ( + self.b1(self.step, **schedule_args) > 0.0 + and self.b2(self.step, **schedule_args) > 0.0 + ): weights, gradient = self._adam( ops, weights, gradient, 
lr_scale, key, nr_upd ) - elif self.b2 > 0.0: # pragma: no cover + elif self.b2(self.step, **schedule_args) > 0.0: # pragma: no cover raise NotImplementedError # TODO: error message else: - weights -= lr_scale * self.learn_rate * gradient + weights -= lr_scale * self.learn_rate(self.step, **schedule_args) * gradient gradient *= 0 - if self.L2 != 0 and self.L2_is_weight_decay: - weights -= lr_scale * self.learn_rate * self.L2 * weights + if self.L2(self.step, **schedule_args) != 0 and self.L2_is_weight_decay: + weights -= ( + lr_scale + * self.learn_rate(self.step, **schedule_args) + * self.L2(self.step, **schedule_args) + * weights + ) if self.averages is not None: if key not in self.averages: self.averages[key] = ops.alloc(weights.shape, dtype="float32") @@ -258,6 +292,8 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd): weights_1D = ops.reshape1f(weights, weights.size) gradient_1D = ops.reshape1f(grad, grad.size) + schedule_args = self._schedule_args(key) + # While we port from the pytorch implementation, keep some of the same # naming state = { @@ -266,9 +302,12 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd): "exp_avg_sq": self.mom2[key], } group = { - "lr": self.learn_rate, - "betas": [self.b1, self.b2], - "eps": self.eps, + "lr": self.learn_rate(self.step, **schedule_args), + "betas": [ + self.b1(self.step, **schedule_args), + self.b2(self.step, **schedule_args), + ], + "eps": self.eps(self.step, **schedule_args), "weight_decay": 0.0, "buffer": self._radam_buffer, } @@ -330,18 +369,21 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd): def _adam(self, ops, weights, gradient, lr_scale, key, nr_upd): weights_1D = ops.reshape1f(weights, weights.size) gradient_1D = ops.reshape1f(gradient, gradient.size) + + schedule_args = self._schedule_args(key) + if key not in self.mom1: self.mom1[key] = ops.alloc1f(weights.size) if key not in self.mom2: self.mom2[key] = ops.alloc1f(weights.size) mom1 = self.mom1[key] mom2 = self.mom2[key] - b1 = self.b1 - b2 = self.b2 + b1 = self.b1(self.step, **schedule_args) + b2 = self.b2(self.step, **schedule_args) fix1 = 1.0 - (b1**nr_upd) fix2 = 1.0 - (b2**nr_upd) - lr = self.learn_rate * fix2**0.5 / fix1 - eps = self.eps + lr = self.learn_rate(self.step, **schedule_args) * fix2**0.5 / fix1 + eps = self.eps(self.step, **schedule_args) # needs to be 1D going into the adam function weights_1D, gradient_1D, mom1, mom2 = ops.adam( weights_1D, gradient_1D, mom1, mom2, b1, b2, eps, lr * lr_scale @@ -354,4 +396,49 @@ def _adam(self, ops, weights, gradient, lr_scale, key, nr_upd): ) +def _wrap_generator(attr_name: str, generator: Generator) -> Schedule[Any]: + try: + peek = next(generator) + except (StopIteration, TypeError) as e: + err = f"Invalid schedule for '{attr_name}' ({type(generator)})\n{e}" + raise ValueError(err) + return Schedule( + "wrap_generator", + _wrap_generator_schedule, + attrs={ + "attr_name": attr_name, + "last_step": -1, + "last_value": peek, + "generator": itertools.chain([peek], generator), + }, + ) + + +def _wrap_generator_schedule(schedule: Schedule, step, **kwargs) -> float: + attr_name = schedule.attrs["attr_name"] + last_step = schedule.attrs["last_step"] + last_value = schedule.attrs["last_value"] + generator = schedule.attrs["generator"] + + if step < last_step: + raise ValueError( + f"'step' of the generator-based schedule for {attr_name} must not decrease" + ) + + # Ensure that we have a value when we didn't step or when the + # generator is exhausted. 
+ value = last_value + + for i in range(step - last_step): + try: + value = next(generator) + except StopIteration: # schedule exhausted, use last value + break + + schedule.attrs["last_step"] = step + schedule.attrs["last_value"] = value + + return value + + __all__ = ["Adam", "RAdam", "SGD", "Optimizer", "ADAM_DEFAULTS", "SGD_DEFAULTS"] diff --git a/thinc/schedules.py b/thinc/schedules.py index 87581af74..73711f87e 100644 --- a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,32 +1,83 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Iterable +from typing import Any, Callable, Dict, Generic, TypeVar import numpy from .config import registry +OutT = TypeVar("OutT") + + +class Schedule(Generic[OutT]): + """Class for implementing Thinc schedules.""" + + name: str + _schedule: Callable + _attrs: Dict[str, Any] + + __slots__ = ["name", "_schedule", "_attrs"] + + def __init__( + self, name: str, schedule: Callable, *, attrs: Dict[str, Any] = {} + ) -> None: + """Initialize a new schedule. + + name (str): The name of the schedule type. + schedule (Callable): The schedule function. + """ + self.name = name + self._schedule = schedule + self._attrs = dict(attrs) + + def __call__(self, step: int, **extra) -> OutT: + """Compute the schedule for a given step.""" + + if step < 0: + raise ValueError(f"Step must be non-negative, was: {step}") + + return self._schedule(self, step, **extra) + + @property + def attrs(self): + """Schedule attributes.""" + return self._attrs + @registry.schedules("constant_then.v1") -def constant_then( - rate: float, steps: int, schedule: Iterable[float] -) -> Iterable[float]: +def constant_then(rate: OutT, steps: int, schedule: Schedule[OutT]) -> Schedule[OutT]: """Yield a constant rate for N steps, before starting a schedule.""" - for i in range(steps): - yield rate - for value in schedule: - yield value + return Schedule( + "constant_then", + _constant_then_schedule, + attrs={"rate": rate, "steps": steps, "schedule": schedule}, + ) + + +def _constant_then_schedule(schedule: Schedule, step: int, **kwargs) -> float: + rate = schedule.attrs["rate"] + steps = schedule.attrs["steps"] + schedule = schedule.attrs["schedule"] + + if step < steps: + return rate + else: + return schedule(step=step, **kwargs) @registry.schedules("constant.v1") -def constant(rate: float) -> Iterable[float]: +def constant(rate: OutT) -> Schedule[OutT]: """Yield a constant rate.""" - while True: - yield rate + return Schedule("constant", _constant_schedule, attrs={"rate": rate}) + + +def _constant_schedule(schedule: Schedule, step: int, **kwargs) -> float: + rate = schedule.attrs["rate"] + return rate @registry.schedules("decaying.v1") -def decaying(base_rate: float, decay: float, *, t: int = 0) -> Iterable[float]: +def decaying(base_rate: float, decay: float, *, t: float = 0.0) -> Schedule[float]: """Yield an infinite series of linearly decaying values, - following the schedule: base_rate * 1 / (1 + decay * t) + following the schedule: base_rate * 1 / (1 + decay * (t + step)) EXAMPLE: >>> learn_rates = decaying(0.001, 1e-4) @@ -35,15 +86,24 @@ def decaying(base_rate: float, decay: float, *, t: int = 0) -> Iterable[float]: >>> next(learn_rates) 0.00999 """ - while True: - yield base_rate * (1.0 / (1.0 + decay * t)) - t += 1 + return Schedule( + "decaying", + _decaying_schedule, + attrs={"base_rate": base_rate, "decay": decay, "t": t}, + ) + + +def _decaying_schedule(schedule: Schedule, step: int, **kwargs) -> float: + base_rate = 
schedule.attrs["base_rate"] + decay = schedule.attrs["decay"] + t = schedule.attrs["t"] + return base_rate * (1.0 / (1.0 + decay * (step + t))) @registry.schedules("compounding.v1") def compounding( start: float, stop: float, compound: float, *, t: float = 0.0 -) -> Iterable[float]: +) -> Schedule[float]: """Yield an infinite series of compounding values. Each time the generator is called, a value is produced by multiplying the previous value by the compound rate. @@ -54,10 +114,19 @@ def compounding( >>> assert next(sizes) == 1 * 1.5 >>> assert next(sizes) == 1.5 * 1.5 """ - curr = float(start) - while True: - yield _clip(curr, start, stop) - curr *= compound + return Schedule( + "compounding", + _compounding_schedule, + attrs={"start": start, "stop": stop, "compound": compound, "t": t}, + ) + + +def _compounding_schedule(schedule: Schedule, step: int, **kwargs) -> float: + start = schedule.attrs["start"] + stop = schedule.attrs["stop"] + compound = schedule.attrs["compound"] + t = schedule.attrs["t"] + return _clip(start * (compound ** (step + t)), start, stop) def _clip(value: float, start: float, stop: float) -> float: @@ -71,52 +140,90 @@ def slanted_triangular( *, cut_frac: float = 0.1, ratio: int = 32, - decay: float = 1.0, t: float = 0.0, -) -> Iterable[float]: +) -> Schedule[float]: """Yield an infinite series of values according to Howard and Ruder's "slanted triangular learning rate" schedule. """ cut = int(num_steps * cut_frac) - while True: - t += 1 - if t < cut: - p = t / cut - else: - p = 1 - ((t - cut) / (cut * (1 / cut_frac - 1))) - learn_rate = max_rate * (1 + p * (ratio - 1)) * (1 / ratio) - yield learn_rate + return Schedule( + "slanted_triangular", + _slanted_triangular_schedule, + attrs={ + "max_rate": max_rate, + "cut": cut, + "cut_frac": cut_frac, + "ratio": ratio, + "t": t, + }, + ) + + +def _slanted_triangular_schedule(schedule: Schedule, step: int, **kwargs) -> float: + max_rate = schedule.attrs["max_rate"] + cut = schedule.attrs["cut"] + cut_frac = schedule.attrs["cut_frac"] + ratio = schedule.attrs["ratio"] + t = schedule.attrs["t"] + + t_step = step + t + 1.0 + if t_step < cut: + p = t_step / cut + else: + p = 1 - ((t_step - cut) / (cut * (1 / cut_frac - 1))) + return max_rate * (1 + p * (ratio - 1)) * (1 / ratio) @registry.schedules("warmup_linear.v1") def warmup_linear( initial_rate: float, warmup_steps: int, total_steps: int -) -> Iterable[float]: +) -> Schedule[float]: """Generate a series, starting from an initial rate, and then with a warmup period, and then a linear decline. Used for learning rates. 
""" - step = 0 - while True: - if step < warmup_steps: - factor = step / max(1, warmup_steps) - else: - factor = max( - 0.0, (total_steps - step) / max(1.0, total_steps - warmup_steps) - ) - yield factor * initial_rate - step += 1 + return Schedule( + "warmup_linear", + _warmup_linear_schedule, + attrs={ + "initial_rate": initial_rate, + "warmup_steps": warmup_steps, + "total_steps": total_steps, + }, + ) + + +def _warmup_linear_schedule(schedule: Schedule, step: int, **kwargs) -> float: + initial_rate = schedule.attrs["initial_rate"] + warmup_steps = schedule.attrs["warmup_steps"] + total_steps = schedule.attrs["total_steps"] + + if step < warmup_steps: + factor = step / max(1, warmup_steps) + else: + factor = max(0.0, (total_steps - step) / max(1.0, total_steps - warmup_steps)) + return factor * initial_rate @registry.schedules("cyclic_triangular.v1") -def cyclic_triangular(min_lr: float, max_lr: float, period: int) -> Iterable[float]: - it = 1 - while True: - # https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee - cycle = numpy.floor(1 + it / (2 * period)) - x = numpy.abs(it / period - 2 * cycle + 1) - relative = max(0, 1 - x) - yield min_lr + (max_lr - min_lr) * relative - it += 1 +def cyclic_triangular(min_lr: float, max_lr: float, period: int) -> Schedule[float]: + return Schedule( + "cyclic_triangular", + _cyclic_triangular_schedule, + attrs={"min_lr": min_lr, "max_lr": max_lr, "period": period}, + ) + + +def _cyclic_triangular_schedule(schedule: Schedule, step: int, **kwargs) -> float: + min_lr = schedule.attrs["min_lr"] + max_lr = schedule.attrs["max_lr"] + period = schedule.attrs["period"] + + it = step + 1 + # https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee + cycle = numpy.floor(1 + it / (2 * period)) + x = numpy.abs(it / period - 2 * cycle + 1) + relative = max(0, 1 - x) + return min_lr + (max_lr - min_lr) * relative __all__ = [ diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index 0dceadfc4..e028937da 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -178,9 +178,8 @@ def decaying(base_rate: float, repeat: int) -> List[float]: return repeat * [base_rate] optimizer = my_registry.resolve(config)["optimizer"] - assert optimizer.b1 == 0.2 - assert "learn_rate" in optimizer.schedules - assert optimizer.learn_rate == 0.001 + assert optimizer.b1(step=optimizer._step, key=(0, "")) == 0.2 + assert optimizer.learn_rate(step=optimizer._step, key=(0, "")) == 0.001 def test_handle_generic_model_type(): diff --git a/thinc/tests/test_optimizers.py b/thinc/tests/test_optimizers.py index a31dbce32..0fab737f9 100644 --- a/thinc/tests/test_optimizers.py +++ b/thinc/tests/test_optimizers.py @@ -1,8 +1,12 @@ import pytest from thinc.api import registry, Optimizer +from thinc.optimizers import KeyT, _wrap_generator import numpy +STUB_KEY: KeyT = (0, "") + + def _test_schedule_valid(): while True: yield 0.456 @@ -29,6 +33,22 @@ def schedule_valid(request): return r_func(), r1, r2, r3 +@pytest.fixture( + params=[ + (lambda: 0.123, 0.123, 0.123, 0.123), + (lambda: (i for i in [0.2, 0.1, 0.4, 0.5, 0.6, 0.7, 0.8]), 0.2, 0.1, 0.4), + (lambda: (i for i in [0.333, 0.666]), 0.333, 0.666, 0.666), + (lambda: [0.9, 0.8, 0.7], 0.9, 0.8, 0.7), + (lambda: [0.0, 0.123], 0.0, 0.123, 0.123), + ], + scope="function", +) +def schedule_config_valid(request): + # Use lambda to prevent iterator from being consumed by first test + r_func, r1, r2, r3 = request.param + return r_func(), r1, 
r2, r3 + + @pytest.fixture( params=[ (lambda: "hello"), @@ -49,32 +69,32 @@ def test_optimizers_from_config(name): learn_rate = 0.123 cfg = {"@optimizers": name, "learn_rate": learn_rate} optimizer = registry.resolve({"config": cfg})["config"] - assert optimizer.learn_rate == learn_rate + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == learn_rate -def test_optimizer_schedules_from_config(schedule_valid): - lr, lr_next1, lr_next2, lr_next3 = schedule_valid +def test_optimizer_schedules_from_config(schedule_config_valid): + lr, lr_next1, lr_next2, lr_next3 = schedule_config_valid cfg = {"@optimizers": "Adam.v1", "learn_rate": lr} optimizer = registry.resolve({"cfg": cfg})["cfg"] - assert optimizer.learn_rate == lr_next1 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next1 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next2 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next2 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next3 - optimizer.learn_rate = 1.0 - assert optimizer.learn_rate == 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next3 + optimizer.learn_rate = lambda *, step, key: 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == 1.0 def test_optimizer_schedules_valid(schedule_valid): lr, lr_next1, lr_next2, lr_next3 = schedule_valid optimizer = Optimizer(learn_rate=lr) - assert optimizer.learn_rate == lr_next1 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next1 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next2 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next2 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next3 - optimizer.learn_rate = 1.0 - assert optimizer.learn_rate == 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next3 + optimizer.learn_rate = lambda *, step, key: 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == 1.0 def test_optimizer_schedules_invalid(schedule_invalid): @@ -97,3 +117,31 @@ def test_optimizer_init(): optimizer((0, "x"), W, dW) optimizer = Optimizer(learn_rate=0.123, beta1=0.1, beta2=0.1) optimizer((1, "x"), W, dW) + + +def test_optimizer_last_score(): + optimizer = Optimizer( + learn_rate=0.123, + ) + + assert optimizer.last_score is None + optimizer.last_score = 1.0 + assert optimizer.last_score == (0, 1.0) + optimizer.step_schedules() + optimizer.step_schedules() + assert optimizer.last_score == (0, 1.0) + optimizer.last_score = 2.0 + assert optimizer.last_score == (2, 2.0) + + +def test_generator_schedule(): + s = _wrap_generator("test", iter([0.0, 1.0, 2.0, 3.0])) + assert s(step=0, key=STUB_KEY, last_score=None) == 0.0 + assert s(step=0, key=STUB_KEY, last_score=None) == 0.0 + assert s(step=1, key=STUB_KEY, last_score=None) == 1.0 + assert s(step=1, key=STUB_KEY, last_score=None) == 1.0 + assert s(step=3, key=STUB_KEY, last_score=None) == 3.0 + assert s(step=10, key=STUB_KEY, last_score=None) == 3.0 + + with pytest.raises(ValueError, match=r"must not decrease"): + s(step=1, key=STUB_KEY, last_score=None) diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index d975d2dbd..d8da928e4 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,63 +1,76 @@ from thinc.api import decaying, compounding, slanted_triangular, constant_then from thinc.api import constant, warmup_linear, cyclic_triangular +from thinc.optimizers import 
KeyT def test_decaying_rate(): rates = decaying(0.001, 1e-4) - rate = next(rates) + rate = rates(step=0) assert rate == 0.001 - next_rate = next(rates) + next_rate = rates(step=1) assert next_rate < rate assert next_rate > 0 - assert next_rate > next(rates) + assert next_rate > rates(step=2) + + rates_offset = decaying(0.001, 1e-4, t=1.0) + assert rates(step=1) == rates_offset(step=0) + assert rates(step=2) == rates_offset(step=1) def test_compounding_rate(): rates = compounding(1, 16, 1.01) - rate0 = next(rates) + rate0 = rates(step=0) assert rate0 == 1.0 - rate1 = next(rates) - rate2 = next(rates) - rate3 = next(rates) + rate1 = rates(step=1) + rate2 = rates(step=2) + rate3 = rates(step=3) assert rate3 > rate2 > rate1 > rate0 assert (rate3 - rate2) > (rate2 - rate1) > (rate1 - rate0) + rates_offset = compounding(1, 16, 1.01, t=1.0) + assert rates(step=1) == rates_offset(step=0) + assert rates(step=2) == rates_offset(step=1) + def test_slanted_triangular_rate(): rates = slanted_triangular(1.0, 20.0, ratio=10) - rate0 = next(rates) + rate0 = rates(step=0) assert rate0 < 1.0 - rate1 = next(rates) + rate1 = rates(step=1) assert rate1 > rate0 - rate2 = next(rates) + rate2 = rates(step=2) assert rate2 < rate1 - rate3 = next(rates) + rate3 = rates(step=3) assert rate0 < rate3 < rate2 + rates_offset = slanted_triangular(1.0, 20.0, ratio=10, t=1.0) + assert rates(step=1) == rates_offset(step=0) + assert rates(step=2) == rates_offset(step=1) + def test_constant_then_schedule(): - rates = constant_then(1.0, 2, [100, 200]) - assert next(rates) == 1.0 - assert next(rates) == 1.0 - assert next(rates) == 100 - assert next(rates) == 200 + rates = constant_then(1.0, 2, constant(100)) + assert rates(step=0) == 1.0 + assert rates(step=1) == 1.0 + assert rates(step=2) == 100 + assert rates(step=3) == 100 def test_constant(): rates = constant(123) - assert next(rates) == 123 - assert next(rates) == 123 + assert rates(step=0, key=(0, "")) == 123 + assert rates(step=0, key=(0, "")) == 123 def test_warmup_linear(): rates = warmup_linear(1.0, 2, 10) expected = [0.0, 0.5, 1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125, 0.0] for i in range(11): - assert next(rates) == expected[i] + assert rates(step=i, key=(0, "")) == expected[i] def test_cyclic_triangular(): rates = cyclic_triangular(0.1, 1.0, 2) expected = [0.55, 1.0, 0.55, 0.1, 0.55, 1.0, 0.55, 0.1, 0.55, 1.0] for i in range(10): - assert next(rates) == expected[i] + assert rates(step=i, key=(0, "")) == expected[i] diff --git a/website/docs/api-optimizers.md b/website/docs/api-optimizers.md index 47873cc1c..2deab184e 100644 --- a/website/docs/api-optimizers.md +++ b/website/docs/api-optimizers.md @@ -14,10 +14,9 @@ zero the gradients in place. The optimizers are registered in the ### SGD {#sgd tag="function"} -If a hyperparameter specifies a schedule as a list or generator, its value will -be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +Function to create a SGD optimizer. If a hyperparameter specifies a schedule, +the step that is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). @@ -58,10 +57,9 @@ use_averages = true ### Adam {#adam tag="function"} Function to create an Adam optimizer. Returns an instance of -[`Optimizer`](#optimizer). 
If a hyperparameter specifies a schedule as a list or -generator, its value will be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +[`Optimizer`](#optimizer). If a hyperparameter specifies a schedule, the step +that is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). @@ -113,10 +111,9 @@ use_averages = true ### RAdam {#radam tag="function"} Function to create an RAdam optimizer. Returns an instance of -[`Optimizer`](#optimizer). If a hyperparameter specifies a schedule as a list or -generator, its value will be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +[`Optimizer`](#optimizer). If a hyperparameter specifies a schedule, the step +that is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). @@ -171,10 +168,9 @@ momentum. Currently support "vanilla" SGD, Adam, and RAdam. ### Optimizer.\_\_init\_\_ {#init tag="method"} -Initialize an optimizer. If a hyperparameter specifies a schedule as a list or -generator, its value will be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +Initialize an optimizer. If a hyperparameter specifies a schedule, the step that +is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). ```python ### Example @@ -213,9 +209,8 @@ and parameter name. ### Optimizer.step_schedules {#step_schedules tag="method"} -Replace the the named hyperparameters with the next item from the schedules -iterator, if available. Once the schedule is exhausted, its last value will be -used. +Increase the current step of the optimizer. This step will be used by schedules +to determine their next value. ```python ### Example diff --git a/website/docs/api-schedules.md b/website/docs/api-schedules.md index f15877111..0a395ff6d 100644 --- a/website/docs/api-schedules.md +++ b/website/docs/api-schedules.md @@ -5,11 +5,94 @@ next: /docs/api-loss Schedules are generators that provide different rates, schedules, decays or series. They're typically used for batch sizes or learning rates. You can easily -implement your own schedules as well: just write your own generator function, -that produces whatever series of values you need. A common use case for -schedules is within [`Optimizer`](/docs/api-optimizer) objects, which accept -iterators for most of their parameters. See the -[training guide](/docs/usage-training) for details. +implement your own schedules as well: just write your own +[`Schedule`](#schedule) implementation, that produces whatever series of values +you need. A common use case for schedules is within +[`Optimizer`](/docs/api-optimizer) objects, which accept iterators for most of +their parameters. See the [training guide](/docs/usage-training) for details. + +## Schedule {#schedule tag="class" new="9"} + +Class for implementing Thinc schedules. + + + +There's only one `Schedule` class in Thinc and schedules are built using +**composition**, not inheritance. This means that a schedule or composed +schedule will return an **instance** of `Schedule` – it doesn't subclass it. To +read more about this concept, see the pages on +[Thinc's philosophy](/docs/concept). 
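For a concrete usage sketch of that composition style: a composed schedule such as `constant_then` simply wraps another `Schedule` and returns a new `Schedule` instance.

```python
# Composition in practice: constant_then() wraps another schedule; the result
# is an ordinary Schedule instance, not a subclass.
from thinc.api import Schedule, constant_then, warmup_linear

schedule = constant_then(0.001, 1000, warmup_linear(0.001, 2000, 10000))
assert isinstance(schedule, Schedule)
assert schedule(step=0) == 0.001       # constant phase
assert schedule(step=1000) == 0.0005   # handed off to warmup_linear
```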
+ + + +### Typing {#typing} + +`Schedule` can be used as a +[generic type](https://docs.python.org/3/library/typing.html#generics) with one +parameter. This parameter specifies the type that is returned by the schedule. +For instance, `Schedule[int]` denotes a scheduler that returns integers when +called. A mismatch will cause a type error. For more details, see the docs on +[type checking](/docs/usage-type-checking). + +```python +from thinc.api import Schedule + +def my_function(schedule: Schedule[int]): + ... +``` + +### Attributes {#attributes} + +| Name | Type | Description | +| ------ | ------------ | ------------------------------- | +| `name` | str | The name of the scheduler type. | + +### Properties {#properties} + +| Name | Type | Description | +| ------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | Dict[str, Any] | The scheduler attributes. You can use the dict directly and assign _to_ it – but you cannot reassign `schedule.attrs` to a new variable: `schedule.attrs = {}` will fail. | + +### Schedule.\_\_init\_\_ {#init tag="method"} + +Initialize a new schedule. + +```python +### Example +schedule = Schedule( + "constant", + constant_schedule, + attrs={"rate": rate}, +) +``` + +| Argument | Type | Description | +| -------------- | ----------------------- | -------------------------------------------------------- | +| `name` | str | The name of the schedule type. | +| `schedule` | Callable | Function to compute the schedule value for a given step. | +| _keyword-only_ | | | +| `attrs` | Dict[str, Any] | Dictionary of non-parameter attributes. | + +### Schedule.\_\_call\_\_ {#call tag="method"} + +Call the schedule function, returning the value for the given step. The +`step` positional argument is always required. Some schedules may require +additional keyword arguments. + +```python +### Example +from thinc.api import constant + +schedule = constant(0.1) +assert schedule(0) == 0.1 +assert schedule(1000) == 0.1 +``` + +| Argument | Type | Description | +| ----------- | ------------ | ------------------------------------------ | +| `step` | int | The step to compute the schedule for. | +| `**kwargs` | | Optional arguments passed to the schedule. | +| **RETURNS** | Any | The schedule value for the step. | ## constant {#constant tag="function"} @@ -24,7 +107,7 @@ Yield a constant rate. from thinc.api import constant batch_sizes = constant(0.001) -batch_size = next(batch_sizes) +batch_size = batch_sizes(step=0) ``` ```ini @@ -58,7 +141,7 @@ learn_rates = constant_then( 1000, decaying(0.005, 1e-4) ) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini @@ -97,8 +180,8 @@ Yield an infinite series of linearly decaying values, following the schedule from thinc.api import decaying learn_rates = decaying(0.005, 1e-4) -learn_rate = next(learn_rates) # 0.001 -learn_rate = next(learn_rates) # 0.00999 +learn_rate = learn_rates(step=0) # 0.001 +learn_rate = learn_rates(step=1) # 0.00999 ``` ```ini @@ -135,8 +218,8 @@ rate. from thinc.api import compounding batch_sizes = compounding(1.0, 32.0, 1.001) -batch_size = next(batch_sizes) # 1.0 -batch_size = next(batch_sizes) # 1.0 * 1.001 +batch_size = batch_sizes(step=0) # 1.0 +batch_size = batch_sizes(step=1) # 1.0 * 1.001 ``` ```ini @@ -174,7 +257,7 @@ and then a linear decline. Used for learning rates. 
from thinc.api import warmup_linear learn_rates = warmup_linear(0.01, 3000, 6000) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini @@ -210,7 +293,7 @@ triangular learning rate" schedule. from thinc.api import slanted_triangular learn_rates = slanted_triangular(0.1, 5000) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini @@ -251,7 +334,7 @@ Linearly increasing then linearly decreasing the rate at each cycle. from thinc.api import cyclic_triangular learn_rates = cyclic_triangular(0.005, 0.001, 1000) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini diff --git a/website/docs/usage-config.md b/website/docs/usage-config.md index 73a1638ac..2887c39d5 100644 --- a/website/docs/usage-config.md +++ b/website/docs/usage-config.md @@ -190,21 +190,30 @@ For details and examples, see the The function registry integration becomes even more powerful when used to build **recursive structures**. Let's say you want to use a learning rate schedule and -pass in a generator as the `learn_rate` argument. Here's an example of a -function that yields an infinite series of decaying values, following the -schedule `base_rate * 1 / (1 + decay * t)`. It's also available in Thinc as +pass in a schedule as the `learn_rate` argument. Here's an example of a function +that yields an infinite series of decaying values, following the schedule +`base_rate * 1 / (1 + decay * t)`. It's also available in Thinc as [`schedules.decaying`](/docs/api-schedules#decaying). The decorator registers the function `"my_cool_decaying_schedule.v1"` in the registry `schedules`: ```python -from typing import Iterable import thinc +from thinc.schedules import Schedule @thinc.registry.schedules("my_cool_decaying_schedule.v1") -def decaying(base_rate: float, decay: float, *, t: int = 0) -> Iterable[float]: - while True: - yield base_rate * (1.0 / (1.0 + decay * t)) - t += 1 +def decaying(base_rate: float, decay: float, *, t: int = 0) -> Schedule[float]: + return Schedule( + "decaying", + _decaying_schedule, + attrs={"base_rate": base_rate, "decay": decay, "t": t} + ) + + +def _decaying_schedule(schedule: Schedule, step: int, **kwargs) -> float: + base_rate = schedule.attrs["base_rate"] + decay = schedule.attrs["decay"] + t = schedule.attrs["t"] + return base_rate * (1.0 / (1.0 + decay * (step + t))) ``` In your config, you can now define the `learn_rate` as a subsection of @@ -230,15 +239,6 @@ argument. If type annotations are available for the return value and it's a type that can be evaluated, the return value of the function will be validated as well. - - -**A note on validating generators:** If a value is a generator, it won't be -validated further, since this would mean having to execute and consume it. -Generators can potentially be infinite – like the decaying schedule in this -example – so checking its return value isn't viable. 
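Since schedules are now plain objects rather than generators, the caveat above about validating generators no longer applies. One motivation called out in the commit that introduced `Schedule` is that these objects, unlike generators, can be pickled, which matters when a schedule ends up inside a larger serialized object. A small sketch of that round-trip:

```python
# Sketch: Schedule objects keep their state in plain attributes, so they
# survive a pickle round-trip.
import pickle

from thinc.api import decaying

schedule = decaying(0.005, 1e-4)
restored = pickle.loads(pickle.dumps(schedule))
assert restored(step=10) == schedule(step=10)
```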
- - - ```python ### Under the hood learn_rate_func = thinc.registry.get("schedules", "my_cool_decaying_schedule.v1") @@ -290,11 +290,22 @@ values: ```python ### {small="true"} +import thinc +from thinc.schedules import Schedule + @thinc.registry.schedules("my_cool_schedule.v1") -def schedule(*steps: float, final: float = 1.0) -> Iterable[float]: - yield from steps - while True: - yield final +def step_values(*steps: float, final: float = 1.0) -> Schedule[float]: + step_list = list(steps) + return Schedule( + "step_values", + _step_values_schedule, + attrs={"steps": list(steps), "final": final} + ) + +def _step_values_schedule(schedule: Schedule, step: int, **kwargs) -> float: + steps = schedule.attrs["steps"] + final = schedule.attrs["final"] + return steps[step] if step < len(steps) else final ``` ```ini diff --git a/website/docs/usage-training.md b/website/docs/usage-training.md index c34648b89..8df7127a3 100644 --- a/website/docs/usage-training.md +++ b/website/docs/usage-training.md @@ -120,10 +120,9 @@ also simply consume the entire generator, by calling `list()` on it. Finally, `minibatch` and `multibatch` support **variable length batching**, based on a schedule you can provide as the `batch_size` argument. Simply pass in -an iterable (such as a generator from the -[built-in schedules](/docs/api-schedules)) instead of an integer. Variable -length batching is non-standard, but we regularly use it for some of -[spaCy](https://spacy.io)'s models, especially the parser and entity recognizer. +an iterable. Variable length batching is non-standard, but we regularly use it +for some of [spaCy](https://spacy.io)'s models, especially the parser and entity +recognizer. ```python from thinc.api import compounding @@ -225,37 +224,39 @@ normalize = true A common trick for stochastic gradient descent is to **vary the learning rate or other hyperparameters** over the course of training. Since there are many possible ways to vary the learning rate, Thinc lets you implement hyperparameter -schedules as simple generator functions. Thinc also provides a number of -[popular schedules](/docs/api-schedules) built-in. - -You can use schedules directly, by calling `next()` on the schedule and using it -to update hyperparameters in your training loop. Since schedules are -particularly common for optimization settings, the -[`Optimizer`](/docs/api-optimizer) object accepts floats, lists and iterators -for most of its parameters. When you call -[`Optimizer.step_schedules`](/docs/api-optimizer#step_schedules), the optimizer -will draw the next value from the generators and use them to change the given -attributes. For instance, here's how to create an instance of the `Adam` -optimizer with a custom learning rate schedule: +schedules as instances of the [`Schedule`](/docs/api-schedules#schedule) class. +Thinc also provides a number of [popular schedules](/docs/api-schedules) +built-in. + +You can use schedules directly, by calling the schedule with the `step` keyword +argument and using it to update hyperparameters in your training loop. Since +schedules are particularly common for optimization settings, the +[`Optimizer`](/docs/api-optimizer) object accepts floats, lists, iterators, and +[`Schedule`](/docs/api-schedules#schedule) instances for most of its parameters. +When you call [`Optimizer.step_schedules`](/docs/api-optimizer#step_schedules), +the optimizer will increase its step count and pass it to the schedules. 
For +instance, this is how one creates an instance of the `Adam` optimizer with a +custom learning rate schedule: ```python ### Custom learning rate schedule -from thinc.api import Adam +from thinc.api import Adam, Schedule -def my_schedule(): +def cycle(): values = [0.001, 0.01, 0.1] - while True: - for value in values: - yield value - for value in reversed(values): - yield value - -optimizer = Adam(learn_rate=my_schedule()) -assert optimizer.learn_rate == 0.001 + all_values = values + list(reversed(values)) + return Schedule("cycle", _cycle_schedule, attrs={"all_values": all_values}) + +def _cycle_schedule(schedule: Schedule, step: int, **kwargs) -> float: + all_values = schedule.attrs["all_values"] + return all_values[step % len(all_values)] + +optimizer = Adam(learn_rate=cycle()) +assert optimizer.learn_rate(optimizer.step) == 0.001 optimizer.step_schedules() -assert optimizer.learn_rate == 0.01 +assert optimizer.learn_rate(optimizer.step) == 0.01 optimizer.step_schedules() -assert optimizer.learn_rate == 0.1 +assert optimizer.learn_rate(optimizer.step) == 0.1 ``` ![](images/schedules_custom1.svg) @@ -271,13 +272,14 @@ of the optimizer. Check out the ```python ### Registered function {small="true"} -@thinc.registry.schedules("my_schedule.v1") -def my_schedule(values): - while True: - for value in values: - yield value - for value in reversed(values): - yield value +@thinc.registry.schedules("cycle.v1") +def cycle(values): + all_values = values + list(reversed(values)) + return Schedule("cycle", _cycle_schedule, attrs={"all_values": all_values}) + +def _cycle_schedule(schedule: Schedule, step: int, **kwargs) -> float: + all_values = schedule.attrs["all_values"] + return all_values[step % len(all_values)] ``` ```ini @@ -286,7 +288,7 @@ def my_schedule(values): @optimizers = "Adam.v1" [optimizer.learn_rate] -@schedules = "my_schedule.v1" +@schedules = "cycle.v1" values = [0.001, 0.01, 0.1] ``` From f6f6c81b4b60ccab4988d9b30acac5d08303e1fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 23 Dec 2022 09:49:28 +0100 Subject: [PATCH 11/30] Set version to v9.0.0.dev1 (#829) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 3c68811b6..bb8e99dad 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev0" +__version__ = "9.0.0.dev1" __release__ = True From 7f35b3c48955e8e88deaf6c75a3d03d38c6be1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 12 Jan 2023 13:44:27 +0100 Subject: [PATCH 12/30] Add `Schedule.to_generator` (#837) * Add `Schedule.to_generator` This method turns a `Schedule` into a generator by feeding the `Schedule` steps with a given starting step and increment. 
* Doc fix Co-authored-by: Madeesh Kannan * docs: add default values for Schedule.to_generator * fix anchor Co-authored-by: Madeesh Kannan Co-authored-by: Sofie Van Landeghem --- thinc/schedules.py | 24 +++++++++++++++++++++++- thinc/tests/test_schedules.py | 7 +++++++ website/docs/api-schedules.md | 27 ++++++++++++++++++++++++--- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/thinc/schedules.py b/thinc/schedules.py index 73711f87e..37a3cc04c 100644 --- a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,5 +1,6 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Any, Callable, Dict, Generic, TypeVar +from typing import Any, Callable, Dict, Generator, Generic, TypeVar +import itertools import numpy from .config import registry @@ -41,6 +42,27 @@ def attrs(self): """Schedule attributes.""" return self._attrs + def to_generator( + self, start: int = 0, step_size=1, **extra + ) -> Generator[OutT, None, None]: + """Turn the schedule into a generator. + + start (int): The schedule initial step. + step_size (int): The amount to increase the step for each generated value. + **extra: Additional arguments that are passed to the schedule. + RETURNS (Generator[OutT, None, None]): The generator. + """ + if start < 0: + raise ValueError(f"Schedule start must be non-negative, was: {start}") + if step_size < 0: + raise ValueError(f"Step size must be non-negative, was: {step_size}") + + def generate(): + for step in itertools.count(start, step_size): + yield self(step, **extra) + + return generate() + @registry.schedules("constant_then.v1") def constant_then(rate: OutT, steps: int, schedule: Schedule[OutT]) -> Schedule[OutT]: diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index d8da928e4..710d304de 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,3 +1,4 @@ +from itertools import islice from thinc.api import decaying, compounding, slanted_triangular, constant_then from thinc.api import constant, warmup_linear, cyclic_triangular from thinc.optimizers import KeyT @@ -74,3 +75,9 @@ def test_cyclic_triangular(): expected = [0.55, 1.0, 0.55, 0.1, 0.55, 1.0, 0.55, 0.1, 0.55, 1.0] for i in range(10): assert rates(step=i, key=(0, "")) == expected[i] + + +def test_to_generator(): + rates = warmup_linear(1.0, 2, 10) + expected = [0.0, 0.5, 1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125, 0.0] + assert list(islice(rates.to_generator(), len(expected))) == expected diff --git a/website/docs/api-schedules.md b/website/docs/api-schedules.md index 0a395ff6d..872d73cdf 100644 --- a/website/docs/api-schedules.md +++ b/website/docs/api-schedules.md @@ -75,9 +75,9 @@ schedule = Schedule( ### Schedule.\_\_call\_\_ {#call tag="method"} -Call the schedule function, returning the value for the given step. The -`step` positional argument is always required. Some schedules may require -additional keyword arguments. +Call the schedule function, returning the value for the given step. The `step` +positional argument is always required. Some schedules may require additional +keyword arguments. ```python ### Example @@ -94,6 +94,27 @@ assert schedule(1000) == 0.1 | `**kwargs` | | Optional arguments passed to the schedule. | | **RETURNS** | Any | The schedule value for the step. | +### Schedule.to_generator {#to_generator tag="method"} + +Turn the schedule into a generator by passing monotonically increasing step +count into the schedule. 
+ +```python +### Example +from thinc.api import constant + +g = constant(0.1).to_generator() +assert next(g) == 0.1 +assert next(g) == 0.1 +``` + +| Argument | Type | Description | +| ----------- | ------------------------------------ | ------------------------------------------------------------------------------- | +| `start` | int | The initial schedule step. Defaults to `0`. | +| `step_size` | int | The amount to increase the step with for each generated value. Defaults to `1`. | +| `**kwargs` | | Optional arguments passed to the schedule. | +| **RETURNS** | Generator[OutT, None, None] | The generator. | + ## constant {#constant tag="function"} Yield a constant rate. From bbe8f537cfad778f2b6f3753a9a4d7a4b9e0c933 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 12 Jan 2023 17:36:17 +0100 Subject: [PATCH 13/30] Set version to v9.0.0.dev2 --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index bb8e99dad..b8ed8d6e7 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev1" +__version__ = "9.0.0.dev2" __release__ = True From f576d1e2b3e8e5c9223bb21ed8f3321727fbe5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 19 Jan 2023 11:14:17 +0100 Subject: [PATCH 14/30] Add plateau.v1 schedule (#842) * Add plateau.v1 schedule This schedule yields values from the wrapped schedule, exponentially scaled by the number of times optimization has plateaued. * Fix anchor * Remove stagnant wording in favor of plateaus * Type annotation: last_score is Optional Also set a default value, to that the schedule does not fail when the last_score argument is not provided. * Update docs to clarify that passing last_score is not mandatory * Document plateau arguments --- thinc/api.py | 3 +- thinc/schedules.py | 107 +++++++++++++++++++++++++++++++++- thinc/tests/test_schedules.py | 19 ++++++ website/docs/api-schedules.md | 44 ++++++++++++++ 4 files changed, 171 insertions(+), 2 deletions(-) diff --git a/thinc/api.py b/thinc/api.py index 3d904fe29..b296875b6 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -10,6 +10,7 @@ from .optimizers import Adam, RAdam, SGD, Optimizer from .schedules import Schedule, cyclic_triangular, warmup_linear, constant from .schedules import constant_then, decaying, slanted_triangular, compounding +from .schedules import plateau from .types import Ragged, Padded, ArgsKwargs, Unserializable from .util import fix_random_seed, is_cupy_array, set_active_gpu from .util import prefer_gpu, require_gpu, require_cpu @@ -67,7 +68,7 @@ "Adam", "RAdam", "SGD", "Optimizer", # .schedules "Schedule", "cyclic_triangular", "warmup_linear", "constant", "constant_then", - "decaying", "slanted_triangular", "compounding", + "decaying", "slanted_triangular", "compounding", "plateau", # .types "Ragged", "Padded", "ArgsKwargs", "Unserializable", # .util diff --git a/thinc/schedules.py b/thinc/schedules.py index 37a3cc04c..49e43a0c8 100644 --- a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,5 +1,7 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Any, Callable, Dict, Generator, Generic, TypeVar +from typing import Any, Callable, Dict, Generator, Generic, Tuple, TypeVar +from typing import Optional +from dataclasses import dataclass import itertools import numpy @@ -155,6 +157,109 @@ def _clip(value: float, start: float, stop: float) -> float: return max(value, stop) if (start > stop) else min(value, stop) 
+@registry.schedules("plateau.v1") +def plateau( + max_patience: int, scale: float, schedule: Schedule[float] +) -> Schedule[float]: + + """Yields values from the wrapped schedule, exponentially scaled by the + number of times optimization has plateaued. The caller must pass model + evaluation scores through the last_score argument for the scaling to be + adjusted. The last evaluation score is passed through the last_score argument + as a tuple (last_score_step, last_score). This tuple indicates when a model + was last evaluated (last_score_step) and with what score (last_score). + + max_patience (int): the number of evaluations without improvement when + we consider the model to have plateaued. + scale (float): scaling of the inner schedule (scale**n_plateaus * inner). + schedule (Schedule[float]): the schedule to wrap. + """ + + return Schedule( + "plateau", + _plateau_schedule, + attrs={ + "scale": scale, + "max_patience": max_patience, + "schedule": schedule, + "state": _PlateauState( + best_score=None, last_score_step=None, patience=0, n_plateaus=0 + ), + }, + ) + + +def _plateau_schedule( + schedule: Schedule, + step: int, + *, + last_score: Optional[Tuple[int, float]] = None, + **kwargs, +) -> float: + inner_schedule: Schedule[float] = schedule.attrs["schedule"] + max_patience: int = schedule.attrs["max_patience"] + scale: float = schedule.attrs["scale"] + state: _PlateauState = schedule.attrs["state"] + + if last_score is None: + return (scale**state.n_plateaus) * inner_schedule( + step=step, last_score=last_score, **kwargs + ) + + last_score_step, last_score_ = last_score + + if ( + state.best_score is None + or state.last_score_step is None + or last_score_ > state.best_score + ): + state.best_score = last_score_ + state.patience = 0 + elif last_score_step < state.last_score_step: + raise ValueError( + f"Expected score with step >= {state.last_score_step}, was: {last_score_step}" + ) + elif last_score_step > state.last_score_step: + # If the score didn't improve and we are not seeing the last + # score again, we may be at a plateau, so increase patience. + state.patience += 1 + + # If we are at the maximum patience, we consider the optimization + # to have reached a plateau. + if state.patience == max_patience: + state.n_plateaus += 1 + state.patience = 0 + + state.last_score_step = last_score_step + + return (scale**state.n_plateaus) * inner_schedule( + step=step, last_score=last_score, **kwargs + ) + + +@dataclass +class _PlateauState: + """Plateau schedule state. + + best_score (Optional[float]): the best score so far, or None when no + score has been observed. + last_score_step (Optional[int]): the step of the last score that was + observed. + patience (int): the number of scores so far which do not improve over + the best score (reset after reaching the maximum patience). + n_plateaus (int): the number of times the maximum patience has been + reached. 
+ """ + + best_score: Optional[float] + last_score_step: Optional[int] + patience: int + n_plateaus: int + + # @dataclass(slots=True) is only supported in Python >= 3.10 + __slots__ = ["best_score", "last_score_step", "patience", "n_plateaus"] + + @registry.schedules("slanted_triangular.v1") def slanted_triangular( max_rate: float, diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index 710d304de..c404fe128 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,7 +1,9 @@ from itertools import islice +import pytest from thinc.api import decaying, compounding, slanted_triangular, constant_then from thinc.api import constant, warmup_linear, cyclic_triangular from thinc.optimizers import KeyT +from thinc.schedules import plateau def test_decaying_rate(): @@ -77,6 +79,23 @@ def test_cyclic_triangular(): assert rates(step=i, key=(0, "")) == expected[i] +def test_plateau(): + schedule = plateau(2, 0.5, constant(1.0)) + assert schedule(step=0, last_score=None) == 1.0 + assert schedule(step=1, last_score=(1, 1.0)) == 1.0 # patience == 0 + assert schedule(step=2, last_score=(2, 1.0)) == 1.0 # patience == 1 + assert schedule(step=3, last_score=None) == 1.0 # patience == 1 + assert schedule(step=4, last_score=(4, 1.0)) == 0.5 # patience == 2, reset + assert schedule(step=5, last_score=(4, 1.0)) == 0.5 # patience == 0 + assert schedule(step=6, last_score=(6, 0.9)) == 0.5 # patience == 1 + assert schedule(step=7, last_score=(7, 2.0)) == 0.5 # patience == 0 + assert schedule(step=8, last_score=(8, 1.0)) == 0.5 # patience == 1 + assert schedule(step=9, last_score=(9, 2.0)) == 0.25 # patience == 2, reset + + with pytest.raises(ValueError, match=r"Expected score with step"): + schedule(step=1, last_score=(1, 1.0)) == 1.0 + + def test_to_generator(): rates = warmup_linear(1.0, 2, 10) expected = [0.0, 0.5, 1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125, 0.0] diff --git a/website/docs/api-schedules.md b/website/docs/api-schedules.md index 872d73cdf..c3837055b 100644 --- a/website/docs/api-schedules.md +++ b/website/docs/api-schedules.md @@ -375,3 +375,47 @@ period = 1000 | `max_lr` | float | | `period` | int | | **YIELDS** | float | + +## plateau {#plateau tag="function" new="9"} + +Yields values from the wrapped schedule, exponentially scaled by the number of +times optimization has plateaued. The caller must pass model evaluation scores +through the `last_score` argument for the scaling to be adjusted. The last +evaluation score is passed through the `last_score` argument as a tuple +(`last_score_step`, `last_score`). This tuple indicates when a model was last +evaluated (`last_score_step`) and with what score (`last_score`). 
+
+
+```python
+### {small="true"}
+from thinc.api import constant, plateau
+
+schedule = plateau(2, 0.5, constant(1.0))
+assert schedule(step=0, last_score=(0, 1.0)) == 1.0
+assert schedule(step=1, last_score=(1, 1.0)) == 1.0
+assert schedule(step=2, last_score=(2, 1.0)) == 0.5
+assert schedule(step=3, last_score=(3, 1.0)) == 0.5
+assert schedule(step=4, last_score=(4, 1.0)) == 0.25
+```
+
+```ini
+### config {small="true"}
+[learn_rate]
+@schedules = "plateau.v1"
+scale = 0.5
+max_patience = 2
+
+[learn_rate.schedule]
+@schedules = "constant.v1"
+rate = 1.0
+```
+
+
+
+| Argument       | Type            | Description                                                                            |
+| -------------- | --------------- | -------------------------------------------------------------------------------------- |
+| `max_patience` | int             | Number of evaluations without an improvement to consider the model to have plateaued.  |
+| `scale`        | float           | Scaling of the inner schedule after plateauing.                                         |
+| `schedule`     | Schedule[float] | The schedule to wrap.                                                                   |
+| **RETURNS**    | Schedule[float] | The plateau schedule.                                                                   |

From fc24e8a7cf1d56ae069f824b1b8fb9f87ebef1a3 Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Wed, 1 Feb 2023 13:54:15 +0100
Subject: [PATCH 15/30] Smooth one hot fix (#830)

* fix valid label smoothing parameter
* remove print
* fix typo
* ensure number of classes larger than one
---
 thinc/tests/test_util.py | 22 ++++++++++++++++++++++
 thinc/util.py            | 22 +++++++++++++++++-----
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py
index 8d2d0058d..f525a5133 100644
--- a/thinc/tests/test_util.py
+++ b/thinc/tests/test_util.py
@@ -5,8 +5,10 @@
 from thinc.util import get_array_module, is_numpy_array, to_categorical
 from thinc.util import is_cupy_array
 from thinc.util import convert_recursive
+from thinc.util import smooth_one_hot
 from thinc.types import ArgsKwargs
+
 from . import strategies
 
 ALL_XP = [numpy]
@@ -145,6 +147,26 @@ def test_to_categorical(label_smoothing):
         to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.88)
 
 
+@given(
+    n_classes=strategies.lengths(lo=2, hi=100),
+    n_samples=strategies.lengths(lo=1, hi=100),
+    label_smoothing=strategies.floats(min_value=0.0, max_value=1.0)
+)
+def test_smooth_one_hot(n_samples, n_classes, label_smoothing):
+    one_hot = numpy.zeros((n_samples, n_classes))
+    labels = numpy.random.randint(0, n_classes, (n_samples,))
+    one_hot[numpy.arange(n_samples), labels] = 1
+    max_smooth = (n_classes - 1) / n_classes
+    if label_smoothing >= max_smooth:
+        with pytest.raises(ValueError, match=r"label_smoothing parameter has to be less than"):
+            smooth_one_hot(one_hot, label_smoothing)
+    else:
+        smoothed = smooth_one_hot(one_hot, label_smoothing)
+        assert numpy.all(numpy.argmax(smoothed, axis=1) == labels)
+        assert smoothed.shape == one_hot.shape
+        assert numpy.allclose(smoothed.sum(1), 1.0)
+
+
 def test_convert_recursive():
     is_match = lambda obj: obj == "foo"
     convert_item = lambda obj: obj.upper()
diff --git a/thinc/util.py b/thinc/util.py
index 059f2c235..08ad6c3d7 100644
--- a/thinc/util.py
+++ b/thinc/util.py
@@ -257,14 +257,25 @@ def smooth_one_hot(X: Floats2d, label_smoothing: float) -> Floats2d:
     """
     Apply label-smoothing to one-hot array.
""" - if not 0.0 <= label_smoothing < 0.5: + n_classes = X.shape[1] + max_smooth = (n_classes - 1) / n_classes + if label_smoothing < 0.0: + raise ValueError( + "Label-smoothing parameter has to be greater than or equal to 0" + ) + if not n_classes > 1: raise ValueError( - "label_smoothing should be greater or " - "equal to 0.0 and less than 0.5, " - f"but {label_smoothing} was provided." + "n_classes should be greater than 1 when label smoothing is enabled," + f"but {n_classes} was provided." + ) + if label_smoothing >= max_smooth: + raise ValueError( + f"For {n_classes} classes " + "label_smoothing parameter has to be less than " + f"{max_smooth}, but found {label_smoothing}." ) X[X == 1] = 1 - label_smoothing - X[X == 0] = label_smoothing / (X.shape[1] - 1) + X[X == 0] = label_smoothing / (n_classes - 1) return X @@ -631,6 +642,7 @@ def check_consistency(self, arr: ArrayXd): "require_gpu", "copy_array", "to_categorical", + "smooth_one_hot", "get_width", "xp2torch", "torch2xp", From bf0e2762c674973d56f09bf666d08ab7d84e2bef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 22 Mar 2023 15:29:48 +0100 Subject: [PATCH 16/30] Set version to v9.0.0.dev3 (#868) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index b8ed8d6e7..502500b04 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev2" +__version__ = "9.0.0.dev3" __release__ = True From 816ea330f1a47e476e1dab75d771c28a8837699b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 8 Jan 2024 16:50:15 +0100 Subject: [PATCH 17/30] Temporarily revert new loss implementations (#916) * Revert "Cross entropy fix (#647)" This reverts commit c8ac07fe734aaee43d8197bbf5c9a370f692766b. 
* Cherry pick MPS Torch bug to get CI unstuck --- .github/workflows/tests.yml | 6 +- examples/mnist.py | 5 +- thinc/legacy/__init__.py | 8 - thinc/legacy/loss.py | 285 ----------------- thinc/loss.py | 503 ++++++++++------------------- thinc/tests/test_loss.py | 611 +++++++----------------------------- thinc/tests/test_util.py | 21 -- thinc/util.py | 36 +-- 8 files changed, 284 insertions(+), 1191 deletions(-) delete mode 100644 thinc/legacy/__init__.py delete mode 100644 thinc/legacy/loss.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 90ea34aa2..8c868d876 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -101,7 +101,11 @@ jobs: run: | pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" pip install "mxnet; sys_platform != 'win32'" - pip install "torch!=1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu + pip install "torch!=1.13.0; sys_platform!='darwin'" --extra-index-url https://download.pytorch.org/whl/cpu + # there is a bug related to MPS devices in github macos runners that + # will be fixed in torch v2.1.1 + # https://github.com/pytorch/pytorch/pull/111576 + pip install "torch>=2.1.1; sys_platform=='darwin'" --extra-index-url https://download.pytorch.org/whl/cpu pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" pip install "numpy<1.24.0" pip install -r requirements.txt diff --git a/examples/mnist.py b/examples/mnist.py index 790bcc640..971f4645b 100644 --- a/examples/mnist.py +++ b/examples/mnist.py @@ -4,7 +4,6 @@ """ # pip install thinc ml_datasets typer from thinc.api import Model, chain, Relu, Softmax, Adam -from thinc.api import CategoricalCrossentropy import ml_datasets from wasabi import msg from tqdm import tqdm @@ -22,7 +21,6 @@ def main( ) # Load the data (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() - loss_func = CategoricalCrossentropy() # Set any missing shapes for the model. 
model.initialize(X=train_X[:5], Y=train_Y[:5]) train_data = model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True) @@ -32,8 +30,7 @@ def main( for i in range(n_iter): for X, Y in tqdm(train_data, leave=False): Yh, backprop = model.begin_update(X) - grad, loss = loss_func(Yh, Y) - backprop(grad) + backprop(Yh - Y) model.finish_update(optimizer) # Evaluate and print progress correct = 0 diff --git a/thinc/legacy/__init__.py b/thinc/legacy/__init__.py deleted file mode 100644 index ced5121ba..000000000 --- a/thinc/legacy/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .loss import LegacyCategoricalCrossentropy -from .loss import LegacySequenceCategoricalCrossentropy - - -__all__ = [ - "LegacyCategoricalCrossentropy", - "LegacySequenceCategoricalCrossentropy" -] diff --git a/thinc/legacy/loss.py b/thinc/legacy/loss.py deleted file mode 100644 index ab9871625..000000000 --- a/thinc/legacy/loss.py +++ /dev/null @@ -1,285 +0,0 @@ -from typing import Optional, Sequence, Dict, Union, Tuple -from typing import cast, List -from ..types import Floats2d, Ints1d -from ..config import registry -from ..util import to_categorical, get_array_module -from ..loss import IntsOrFloatsOrStrs, Loss -from ..loss import _make_mask, _make_mask_by_value - - -TruthsT = Union[List[Optional[str]], List[int], Ints1d, Floats2d] - - -class LegacyCategoricalCrossentropy(Loss): - names: Optional[Sequence[str]] - missing_value: Optional[Union[str, int]] - _name_to_i: Dict[str, int] - - def __init__( - self, - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, - ): - self.normalize = normalize - self.names = names - self.missing_value = missing_value - self.neg_prefix = neg_prefix - self.label_smoothing = label_smoothing - if names is not None: - self._name_to_i = {name: i for i, name in enumerate(names)} - else: - self._name_to_i = {} - - def convert_truths( - self, truths: TruthsT, guesses: Floats2d - ) -> Tuple[Floats2d, Floats2d]: - xp = get_array_module(guesses) - missing = [] - negatives_mask = None - if self.names: - negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") - missing_value = self.missing_value - # Convert list of ints or list of strings - if isinstance(truths, list): - if len(truths): - if isinstance(truths[0], int): - for i, value in enumerate(truths): - if not isinstance(value, int): - raise ValueError( - "All values in the truths list have to " - "have the same type. The first value was " - f"detected to be integer, but found {type(value)}." - ) - if value == missing_value: - missing.append(i) - else: - truths = cast(List[Optional[str]], truths) - if self.names is None: - msg = ( - "Cannot calculate loss from list of strings without names. " - "You can pass the names as a keyword argument when you " - "create the loss object, " - "e.g. CategoricalCrossentropy(names=['dog', 'cat'])" - ) - raise ValueError(msg) - for i, value in enumerate(truths): - if not (isinstance(value, str) or value == missing_value): - raise ValueError( - "All values in the truths list have to " - "have the same type. The first value was " - f"detected to be string, but found {type(value)}." 
- ) - if value == missing_value: - truths[i] = self.names[0] - missing.append(i) - elif ( - value - and self.neg_prefix - and value.startswith(self.neg_prefix) - ): - neg_value = value[len(self.neg_prefix) :] - truths[i] = neg_value - neg_index = self._name_to_i[neg_value] - negatives_mask[i] = 0 # type: ignore - negatives_mask[i][neg_index] = -1 # type: ignore - # In the loop above, we have ensured that `truths` doesn't - # contain `None` (anymore). However, mypy can't infer this - # and doesn't like the shadowing. - truths_str = cast(List[str], truths) - truths = [self._name_to_i[name] for name in truths_str] - truths = xp.asarray(truths, dtype="i") - mask = _make_mask(guesses, missing) - else: - mask = _make_mask_by_value(truths, guesses, missing_value) - truths = cast(Union[Ints1d, Floats2d], truths) - if truths.ndim != guesses.ndim: - # transform categorical values to one-hot encoding - truths_2d = to_categorical( - truths, - n_classes=guesses.shape[-1], - label_smoothing=self.label_smoothing, - ) - else: - if self.label_smoothing: - raise ValueError( - "Label smoothing is only applied, when truths have type " - "List[str], List[int] or Ints1d, but it seems like Floats2d " - "was provided." - ) - truths_2d = cast(Floats2d, truths) - # Transform negative annotations to a 0 for the negated value - # + mask all other values for that row - if negatives_mask is not None: - truths_2d *= negatives_mask - truths_2d[truths_2d == -1] = 0 - negatives_mask[negatives_mask == -1] = 1 - mask *= negatives_mask - return cast(Floats2d, truths_2d), mask - - def __call__(self, guesses: Floats2d, truths: TruthsT) -> Tuple[Floats2d, float]: - d_truth = self.get_grad(guesses, truths) - return (d_truth, self._get_loss_from_grad(d_truth)) - - def get_grad(self, guesses: Floats2d, truths: TruthsT) -> Floats2d: - target, mask = self.convert_truths(truths, guesses) - xp = get_array_module(target) - if guesses.shape != target.shape: # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." - raise ValueError(err) - elif xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." - raise ValueError(err) - elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
- raise ValueError(err) - difference = guesses - target - difference *= mask - if self.normalize: - difference = difference / guesses.shape[0] - return difference - - def get_loss(self, guesses: Floats2d, truths: TruthsT) -> float: - d_truth = self.get_grad(guesses, truths) - return self._get_loss_from_grad(d_truth) - - def _get_loss_from_grad(self, d_truth: Floats2d) -> float: - # TODO: Add overload for axis=None case to sum - return (d_truth**2).sum() # type: ignore - - -class LegacySequenceCategoricalCrossentropy(Loss): - def __init__( - self, - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, - ): - self.cc = LegacyCategoricalCrossentropy( - normalize=False, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) - self.normalize = normalize - - def __call__( - self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] - ) -> Tuple[List[Floats2d], float]: - grads = self.get_grad(guesses, truths) - loss = self._get_loss_from_grad(grads) - return grads, loss - - def get_grad( - self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] - ) -> List[Floats2d]: - if len(guesses) != len(truths): # pragma: no cover - err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" - raise ValueError(err) - n = len(guesses) - d_scores = [] - for yh, y in zip(guesses, truths): - d_yh = self.cc.get_grad(yh, y) - if self.normalize: - d_yh /= n - d_scores.append(d_yh) - return d_scores - - def get_loss(self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT]) -> float: - return self._get_loss_from_grad(self.get_grad(guesses, truths)) - - def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: - loss = 0.0 - for grad in grads: - loss += self.cc._get_loss_from_grad(grad) # type: ignore - return loss - - -@registry.losses("CategoricalCrossentropy.v1") -def configure_CategoricalCrossentropy_v1( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, -) -> LegacyCategoricalCrossentropy: - return LegacyCategoricalCrossentropy( - normalize=normalize, names=names, missing_value=missing_value - ) - - -@registry.losses("CategoricalCrossentropy.v2") -def configure_CategoricalCrossentropy_v2( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, -) -> LegacyCategoricalCrossentropy: - return LegacyCategoricalCrossentropy( - normalize=normalize, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - ) - - -@registry.losses("CategoricalCrossentropy.v3") -def configure_CategoricalCrossentropy_v3( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, -) -> LegacyCategoricalCrossentropy: - return LegacyCategoricalCrossentropy( - normalize=normalize, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) - - -@registry.losses("SequenceCategoricalCrossentropy.v1") -def configure_SequenceCategoricalCrossentropy_v1( - *, normalize: bool = True, names: Optional[Sequence[str]] = None -) -> LegacySequenceCategoricalCrossentropy: - return LegacySequenceCategoricalCrossentropy(normalize=normalize, 
names=names) - - -@registry.losses("SequenceCategoricalCrossentropy.v2") -def configure_SequenceCategoricalCrossentropy_v2( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - neg_prefix: Optional[str] = None, -) -> LegacySequenceCategoricalCrossentropy: - return LegacySequenceCategoricalCrossentropy( - normalize=normalize, names=names, neg_prefix=neg_prefix - ) - - -@registry.losses("SequenceCategoricalCrossentropy.v3") -def configure_SequenceCategoricalCrossentropy_v3( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, -) -> LegacySequenceCategoricalCrossentropy: - return LegacySequenceCategoricalCrossentropy( - normalize=normalize, - names=names, - neg_prefix=neg_prefix, - missing_value=missing_value, - label_smoothing=label_smoothing, - ) diff --git a/thinc/loss.py b/thinc/loss.py index e194516de..e8edb194d 100644 --- a/thinc/loss.py +++ b/thinc/loss.py @@ -2,19 +2,17 @@ from typing import Dict from abc import abstractmethod -from .types import Floats2d, Ints1d, Ragged, ArrayXd -from .util import get_array_module, to_categorical, smooth_one_hot -from .util import is_xp_array +from .types import Floats2d, Ints1d +from .util import get_array_module, to_categorical from .config import registry + LossT = TypeVar("LossT") GradT = TypeVar("GradT") GuessT = TypeVar("GuessT") TruthT = TypeVar("TruthT") -FloatsOrRaggedT = TypeVar("FloatsOrRaggedT", Floats2d, Ragged) IntsOrFloats = Union[Ints1d, Floats2d] IntsOrFloatsOrStrs = Union[Ints1d, Floats2d, Sequence[int], Sequence[str]] -Categories1d = Union[Ints1d, Sequence[int], Sequence[str]] class Loss(Generic[GuessT, TruthT, GradT, LossT]): # pragma: no cover @@ -39,118 +37,7 @@ def get_loss(self, guesses: GuessT, truths: TruthT) -> LossT: ... -class CategoricalCrossentropyBase(Loss): - normalize: bool - - def _validate_input(self, guesses: FloatsOrRaggedT, target: Floats2d) -> None: - guesses_f2d = _to_array(guesses) - xp = get_array_module(target) - if not xp.allclose(guesses_f2d.sum(axis=1), 1.0): - raise ValueError( - "Cannot calculate CategoricalCrossentropy if " - "some rows of 'guesses' are not " - "valid categorical distributions (do not sum to 1)." - ) - elif guesses_f2d.shape != target.shape: # pragma: no cover - raise ValueError( - "Cannot calculate CategoricalCrossentropy loss " - f"with mismatching shapes: {guesses_f2d.shape} vs {target.shape}." - ) - elif xp.any(guesses_f2d > 1) or xp.any(guesses_f2d < 0): # pragma: no cover - raise ValueError( - "Cannot calculate CategoricalCrossentropy loss " - "with guesses outside the [0,1] interval." - ) - elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover - raise ValueError( - "Cannot calculate CategoricalCrossentropy loss " - "with truth values outside the [0,1] interval." - ) - - def _get_grad( - self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d - ) -> FloatsOrRaggedT: - difference = _to_array(guesses) - target - difference *= mask - if self.normalize: - # FIXME: normalized by the number of sequences, also support normalizing - # by the number of instances. 
- difference /= _normalization_length(guesses) - - return _array_like(difference, guesses) - - def _get_loss( - self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d - ) -> float: - guesses_f2d = _to_array(guesses) - xp = get_array_module(guesses_f2d) - logprobs = xp.log(guesses_f2d + 1e-9) - logprobs *= mask - if self.normalize: - return -(target * logprobs).sum() / _normalization_length(guesses) - else: - return -(target * logprobs).sum() - - -class CategoricalCrossentropy(CategoricalCrossentropyBase): - missing_value: Optional[Union[str, int]] - - def __init__( - self, - *, - normalize: bool = True, - missing_value: Optional[int] = None, - label_smoothing: float = 0.0, - ): - self.normalize = normalize - self.missing_value = missing_value - self.label_smoothing = label_smoothing - - def __call__( - self, guesses: FloatsOrRaggedT, truths: Floats2d - ) -> Tuple[FloatsOrRaggedT, float]: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - d_truth = self._get_grad(guesses, target, mask) - loss = self._get_loss(guesses, target, mask) - - return d_truth, loss - - def convert_truths( - self, truths: Floats2d, guesses: FloatsOrRaggedT - ) -> Tuple[Floats2d, Floats2d]: - if truths.ndim != 2: - raise ValueError(f"'truths' have to have 2 axes, but found {truths.ndim}") - guesses_2d = _to_array(guesses) - missing_value = self.missing_value - xp = get_array_module(guesses_2d) - mask = _make_mask_by_value(truths, guesses_2d, missing_value) - if not xp.allclose(truths.sum(axis=1), 1.0): - raise ValueError( - "Cannot calculate CategoricalCrossentropy. " - "All rows of 'truths' have to be a " - "valid categorical distribution (sum to 1)." - ) - if self.label_smoothing: - # Validate that array is binary, ergo one-hot at this point - if ((truths == 0) | (truths == 1)).all(): - truths = smooth_one_hot(truths, self.label_smoothing) - else: - raise ValueError("Can only apply label-smoothing to one-hot target.") - return truths, mask - - def get_grad(self, guesses: FloatsOrRaggedT, truths: Floats2d) -> FloatsOrRaggedT: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_grad(guesses, target, mask) - - def get_loss(self, guesses: Floats2d, truths: Floats2d) -> float: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_loss(guesses, target, mask) - - -class SparseCategoricalCrossentropy(CategoricalCrossentropyBase): +class CategoricalCrossentropy(Loss): names: Optional[Sequence[str]] missing_value: Optional[Union[str, int]] _name_to_i: Dict[str, int] @@ -174,174 +61,142 @@ def __init__( else: self._name_to_i = {} - def __call__( - self, guesses: Floats2d, truths: Union[Sequence[int], Sequence[str]] - ) -> Tuple[Floats2d, float]: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - d_truth = self._get_grad(guesses, target, mask) - loss = self._get_loss(guesses, target, mask) - return (d_truth, loss) - - def _convert_ints( - self, guesses: Floats2d, truths: Sequence[int] - ) -> Tuple[Floats2d, Floats2d]: - """ - Convert Sequence[int] into a Floats2d one-hot array. - """ - missing_value = self.missing_value - if missing_value is not None and not isinstance(missing_value, int): - raise ValueError( - "'truths' provided in Sequence[int] format, but " - f"'missing_value' was set to be {self.missing_value} " - f", which has type {type(self.missing_value)}." 
- ) - missing = [] - for i, value in enumerate(truths): - if not isinstance(value, int): - raise ValueError( - "The first value of `truths` was of type " - f"integer, but found {type(value)} during iteration." - ) - if value == missing_value: - missing.append(i) - xp = get_array_module(guesses) - # FIXME: convert using ops? - xp_truths = cast(Ints1d, xp.asarray(truths, dtype="i")) - truths_2d = to_categorical( - xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing - ) - mask = _make_mask(guesses, missing) - return cast(Floats2d, truths_2d), mask - - def _convert_strs( - self, guesses: Floats2d, truths: Sequence[str] - ) -> Tuple[Floats2d, Floats2d]: - """ - Convert Sequence[int] into a Floats2d one-hot array. - """ - - missing_value = self.missing_value - if self.names is None: - raise ValueError( - "Cannot calculate loss from Sequence[str] without names. " - "You can pass the names as a keyword argument when you " - "create the loss object" - ) - elif missing_value is not None and not isinstance(missing_value, str): - raise ValueError( - "'truths' provided in Sequence[str] format, but " - f"'missing_value' was set to be {self.missing_value} " - f", which has type {type(self.missing_value)}." - ) + def convert_truths(self, truths, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: xp = get_array_module(guesses) missing = [] - negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") - truths_int = [] - for i, value in enumerate(truths): - if not isinstance(value, str): - raise ValueError( - "The first value of the 'truths' was of type " - f"string, but found {type(value)} during iteration." - ) - # missing value - if value == missing_value: - label_i = self._name_to_i[self.names[0]] - missing.append(i) - # negative labels - elif self.neg_prefix and value.startswith(self.neg_prefix): - label_i = self._name_to_i[value[len(self.neg_prefix) :]] - negatives_mask[i] = 0 # type: ignore - negatives_mask[i][label_i] = -1 # type: ignore - # nothing special - else: - label_i = self._name_to_i[value] - truths_int.append(label_i) - xp_truths = cast(Ints1d, xp.asarray(truths_int, dtype="i")) - truths_2d = to_categorical( - xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing - ) - mask = _make_mask(guesses, missing) - truths_2d *= negatives_mask - truths_2d[truths_2d == -1] = 0 - negatives_mask[negatives_mask == -1] = 1 - mask *= negatives_mask - return cast(Floats2d, truths_2d), mask - - def convert_truths( - self, truths: Categories1d, guesses: Floats2d - ) -> Tuple[Floats2d, Floats2d]: - guesses_f2d = _to_array(guesses) - - if is_xp_array(truths): - _check_ints1d(cast(ArrayXd, truths)) - xp_truths = cast(Ints1d, truths) - truths_2d = to_categorical( - xp_truths, + negatives_mask = None + if self.names: + negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + missing_value = self.missing_value + # Convert list of ints or list of strings + if isinstance(truths, list): + truths = list(truths) + if len(truths): + if isinstance(truths[0], int): + for i, value in enumerate(truths): + if value == missing_value: + missing.append(i) + else: + if self.names is None: + msg = ( + "Cannot calculate loss from list of strings without names. " + "You can pass the names as a keyword argument when you " + "create the loss object, " + "e.g. 
CategoricalCrossentropy(names=['dog', 'cat'])" + ) + raise ValueError(msg) + for i, value in enumerate(truths): + if value == missing_value: + truths[i] = self.names[0] + missing.append(i) + elif ( + value + and self.neg_prefix + and value.startswith(self.neg_prefix) + ): + truths[i] = value[len(self.neg_prefix) :] + neg_index = self._name_to_i[truths[i]] + negatives_mask[i] = 0 # type: ignore + negatives_mask[i][neg_index] = -1 # type: ignore + truths = [self._name_to_i[name] for name in truths] + truths = xp.asarray(truths, dtype="i") + mask = _make_mask(guesses, missing) + else: + mask = _make_mask_by_value(truths, guesses, missing_value) + if truths.ndim != guesses.ndim: + # transform categorical values to one-hot encoding + truths = to_categorical( + cast(Ints1d, truths), + n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing, - n_classes=guesses_f2d.shape[1], ) - mask = _make_mask_by_value(truths_2d, guesses_f2d, self.missing_value) - elif isinstance(truths, Sequence): - if isinstance(truths[0], int): - truths_2d, mask = self._convert_ints( - guesses_f2d, cast(Sequence[int], truths) - ) - elif isinstance(truths[0], str): - truths_2d, mask = self._convert_strs( - guesses_f2d, cast(Sequence[str], truths) - ) - else: + else: + if self.label_smoothing: raise ValueError( - "When truths to SparseCategoricalCrossentropy is provided " - "in Sequence format, elements need to be " - "of type str or int, but first element " - f"was found to be {type(truths[0])}." + "Label smoothing is only applied, when truths have type " + "List[str], List[int] or Ints1d, but it seems like Floats2d " + "was provided." ) - else: - raise ValueError( - "Truths have to be provided either as 1D " - "numpy/cupy integer array or as Sequence[int] or " - "Sequence[str], but truths has different type." - ) + # Transform negative annotations to a 0 for the negated value + # + mask all other values for that row + if negatives_mask is not None: + truths *= negatives_mask + truths[truths == -1] = 0 + negatives_mask[negatives_mask == -1] = 1 + mask *= negatives_mask + return truths, mask - return cast(Floats2d, truths_2d), mask + def __call__( + self, guesses: Floats2d, truths: IntsOrFloatsOrStrs + ) -> Tuple[Floats2d, float]: + d_truth = self.get_grad(guesses, truths) + return (d_truth, self._get_loss_from_grad(d_truth)) - def get_grad(self, guesses: Floats2d, truths: Categories1d) -> Floats2d: + def get_grad(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> Floats2d: target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_grad(guesses, target, mask) + xp = get_array_module(target) + if guesses.shape != target.shape: # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." + raise ValueError(err) + if xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." + raise ValueError(err) + if xp.any(target > 1) or xp.any(target < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
+ raise ValueError(err) + difference = guesses - target + difference *= mask + if self.normalize: + difference = difference / guesses.shape[0] + return difference - def get_loss(self, guesses: Floats2d, truths: Categories1d) -> float: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_loss(guesses, target, mask) + def get_loss(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> float: + d_truth = self.get_grad(guesses, truths) + return self._get_loss_from_grad(d_truth) + + def _get_loss_from_grad(self, d_truth: Floats2d) -> float: + # TODO: Add overload for axis=None case to sum + return (d_truth**2).sum() # type: ignore -@registry.losses("CategoricalCrossentropy.v4") -def configure_CategoricalCrossentropy_v4( +@registry.losses("CategoricalCrossentropy.v1") +def configure_CategoricalCrossentropy_v1( *, normalize: bool = True, - missing_value: Optional[int] = None, - label_smoothing: float = 0.0, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, +) -> CategoricalCrossentropy: + return CategoricalCrossentropy( + normalize=normalize, names=names, missing_value=missing_value + ) + + +@registry.losses("CategoricalCrossentropy.v2") +def configure_CategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, ) -> CategoricalCrossentropy: return CategoricalCrossentropy( normalize=normalize, + names=names, missing_value=missing_value, - label_smoothing=label_smoothing, + neg_prefix=neg_prefix, ) -@registry.losses("SparseCategoricalCrossentropy.v4") -def configure_SparseCategoricalCrossentropy_v4( +@registry.losses("CategoricalCrossentropy.v3") +def configure_CategoricalCrossentropy_v3( *, normalize: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, -) -> SparseCategoricalCrossentropy: - return SparseCategoricalCrossentropy( +) -> CategoricalCrossentropy: + return CategoricalCrossentropy( normalize=normalize, names=names, missing_value=missing_value, @@ -354,44 +209,38 @@ class SequenceCategoricalCrossentropy(Loss): def __init__( self, *, - cross_entropy: Union[CategoricalCrossentropy, SparseCategoricalCrossentropy], normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, ): - self.cc = cross_entropy + self.cc = CategoricalCrossentropy( + normalize=False, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) self.normalize = normalize def __call__( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> Tuple[List[Floats2d], float]: - self._validate_input(guesses, truths) - n = len(guesses) - d_scores = [] - loss = 0.0 - for yh, y in zip(guesses, truths): - d_yh, l = self.cc(yh, y) # type: ignore - if self.normalize: - d_yh /= n - d_scores.append(d_yh) - loss += l - return d_scores, loss - - def _validate_input( - self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] - ): - if len(guesses) != len(truths): # pragma: no cover - raise ValueError( - "Cannot calculate SequenceCategoricalCrossentropy loss: " - "guesses and truths must be same length!" 
- ) + grads = self.get_grad(guesses, truths) + loss = self._get_loss_from_grad(grads) + return grads, loss def get_grad( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> List[Floats2d]: - self._validate_input(guesses, truths) + err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" + if len(guesses) != len(truths): # pragma: no cover + raise ValueError(err) n = len(guesses) d_scores = [] for yh, y in zip(guesses, truths): - d_yh = self.cc.get_grad(yh, y) # type: ignore + d_yh = self.cc.get_grad(yh, y) if self.normalize: d_yh /= n d_scores.append(d_yh) @@ -400,42 +249,49 @@ def get_grad( def get_loss( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> float: - self._validate_input(guesses, truths) + return self._get_loss_from_grad(self.get_grad(guesses, truths)) + + def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: loss = 0.0 - for guess, truth in zip(guesses, truths): - loss += self.cc.get_loss(guess, truth) # type: ignore + for grad in grads: + loss += self.cc._get_loss_from_grad(grad) return loss -@registry.losses("SequenceCategoricalCrossentropy.v4") -def configure_SequenceCategoricalCrossentropy_v4( +@registry.losses("SequenceCategoricalCrossentropy.v1") +def configure_SequenceCategoricalCrossentropy_v1( + *, normalize: bool = True, names: Optional[Sequence[str]] = None +) -> SequenceCategoricalCrossentropy: + return SequenceCategoricalCrossentropy(normalize=normalize, names=names) + + +@registry.losses("SequenceCategoricalCrossentropy.v2") +def configure_SequenceCategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + neg_prefix: Optional[str] = None, +) -> SequenceCategoricalCrossentropy: + return SequenceCategoricalCrossentropy( + normalize=normalize, names=names, neg_prefix=neg_prefix + ) + + +@registry.losses("SequenceCategoricalCrossentropy.v3") +def configure_SequenceCategoricalCrossentropy_v3( *, normalize: bool = True, - sparse: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, ) -> SequenceCategoricalCrossentropy: - if names is None and neg_prefix is None and not sparse: - cross_entropy: Union[ - CategoricalCrossentropy, SparseCategoricalCrossentropy - ] = CategoricalCrossentropy( - normalize=False, - missing_value=cast(Optional[int], missing_value), - label_smoothing=label_smoothing, - ) - else: - cross_entropy = SparseCategoricalCrossentropy( - normalize=False, - names=names, - missing_value=cast(Optional[Union[str, int]], missing_value), - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) return SequenceCategoricalCrossentropy( - cross_entropy=cross_entropy, normalize=normalize, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, ) @@ -566,43 +422,6 @@ def _make_mask_by_value(truths, guesses, missing_value) -> Floats2d: return mask -def _array_like(a: Floats2d, like: FloatsOrRaggedT) -> FloatsOrRaggedT: - if isinstance(like, Ragged): - return Ragged(a, lengths=like.lengths) - else: - return a - - -def _to_array(guesses: FloatsOrRaggedT) -> Floats2d: - if isinstance(guesses, Ragged): - return cast(Floats2d, guesses.data.astype("float32")) - else: - return guesses - - -def _normalization_length(guesses: FloatsOrRaggedT) -> int: - if isinstance(guesses, Ragged): - return len(guesses.lengths) - else: - return guesses.shape[0] - - -def 
_check_ints1d(arr: ArrayXd): - """ - Check whether array is 1D and has type integer. - """ - if arr.ndim != 1: - raise ValueError( - "SparseCategoricalCrossentropy only accepts 1D arrays, but " - f"array with shape {arr.shape} was given." - ) - if arr.dtype.kind != "i": # type: ignore - raise ValueError( - "SparseCategoricalCrossentropy only accepts integer arrays, but " - f"array with {arr.dtype} was given." - ) - - __all__ = [ "SequenceCategoricalCrossentropy", "CategoricalCrossentropy", diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 2cb49e466..75206d240 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -1,384 +1,108 @@ import pytest import numpy -from functools import partial -from thinc.api import CategoricalCrossentropy -from thinc.api import L2Distance, CosineDistance, softmax_activation -from thinc.api import Ragged +from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy +from thinc.api import L2Distance, CosineDistance from thinc import registry -from thinc.util import has_torch, to_categorical -from hypothesis import given, settings -from hypothesis.strategies import integers, floats -from thinc.legacy import loss - -ALL_XP = [numpy] -try: - import cupy - - ALL_XP.append(cupy) -except ImportError: - pass - - -softmax_func = partial(softmax_activation(), is_train=False) -MAX_EXAMPLES = 50 # some simple arrays scores0 = numpy.zeros((3, 3), dtype="f") labels0 = numpy.asarray([0, 1, 1], dtype="i") # a few more diverse ones to test realistic values -guesses1 = numpy.asarray( - [[0.1, 0.5, 0.4], [0.4, 0.3, 0.3], [0, 1, 0], [0.1, 0.05, 0.85]], dtype="f" -) -guesses1_legacy = numpy.asarray( - [[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]], dtype="f" -) +guesses1 = numpy.asarray([[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]]) labels1 = numpy.asarray([2, 1, 0, 2]) -labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype="f") +labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]]) labels1_strings = ["C", "B", "A", "C"] -d_guesses1 = numpy.array( - [ - [0.025, 0.125, -0.15], - [0.1, -0.175, 0.075], - [-0.25, 0.25, 0.0], - [0.025, 0.0125, -0.0375], - ], - dtype="f", -) -d_guesses1_seq = numpy.array( - [ - [0.05, 0.25, -0.3], - [0.2, -0.35, 0.15], - [-0.5, 0.5, 0.0], - [0.05, 0.025, -0.075], - ], - dtype="f", -) -d_guesses1_0_missing = numpy.array( - [ - [0.025, 0.125, -0.15], - [0.1, -0.175, 0.075], - [0.0, 0.0, 0.0], - [0.025, 0.0125, -0.0375], - ], - dtype="f", -) -d_guesses1_sum = numpy.array( - [ - [0.1, 0.5, -0.6], - [0.4, -0.7, 0.3], - [-1.0, 1.0, 0.0], - [0.1, 0.05, -0.15], - ], - dtype="f", -) -loss1 = 5.75151207 -loss1_seq = 11.50302410 -loss1_0_missing = 0.57069561 -guesses2 = numpy.asarray([[0.2, 0.3, 0.5]]) -guesses2_legacy = numpy.asarray([[0.2, 0.3, 0.0]]) + +guesses2 = numpy.asarray([[0.2, 0.3, 0.0]]) labels2 = numpy.asarray([1]) labels2_strings = ["B"] -d_guesses2_sum = numpy.asarray([[0.2, -0.7, 0.5]]) -sequence_loss = 24.210021096627 -eps = 1e-6 - - -ce_factory = registry.get("losses", "CategoricalCrossentropy.v4") - -sparse_ce_factory = registry.get("losses", "SparseCategoricalCrossentropy.v4") -seq_ce_factory = registry.get("losses", "SequenceCategoricalCrossentropy.v4") +eps = 0.0001 -def _get_legacy_cross_entropy(version: int, **kwargs): - return registry.get("losses", f"CategoricalCrossentropy.v{version}")(**kwargs) - - -def _get_legacy_seq_cross_entropy(version: int, **kwargs): - return registry.get("losses", 
f"SequenceCategoricalCrossentropy.v{version}")( - **kwargs - ) - - -def test_cross_entropy_types_shapes(): - sparse_cross_entropy = ce_factory() - cross_entropy = ce_factory() - sparse_seq_cross_entropy = seq_ce_factory() - seq_cross_entropy = seq_ce_factory(sparse=False) - d_scores_sparse = sparse_cross_entropy.get_grad(guesses1, labels1_full) - d_scores = cross_entropy.get_grad(guesses1, labels1_full) - assert d_scores_sparse.dtype == "float32" - assert d_scores.dtype == "float32" - assert d_scores_sparse.shape == guesses1.shape - assert d_scores.shape == guesses1.shape - d_scores_sparse = sparse_seq_cross_entropy.get_grad([guesses1], [labels1]) - d_scores = seq_cross_entropy.get_grad([guesses1], [labels1_full]) - assert d_scores_sparse[0].dtype == "float32" - assert d_scores[0].dtype == "float32" - assert d_scores_sparse[0].shape == guesses1.shape - assert d_scores[0].shape == guesses1.shape - assert sparse_seq_cross_entropy.get_grad([], []) == [] - assert seq_cross_entropy.get_grad([], []) == [] - d_scores_ragged = cross_entropy.get_grad( - Ragged(numpy.array(guesses1), lengths=[3, 1]), labels1_full - ) - assert isinstance(d_scores_ragged, Ragged) - assert d_scores_ragged.dataXd.dtype == "float32" - assert d_scores_ragged.dataXd.shape == guesses1.shape - - -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_cross_entropy_types_shapes(version): - cross_entropy = _get_legacy_cross_entropy(version) - seq_cross_entropy = _get_legacy_seq_cross_entropy(version) - d_scores = cross_entropy.get_grad(scores0, labels0) +def test_loss(): + d_scores = CategoricalCrossentropy().get_grad(scores0, labels0) assert d_scores.dtype == "float32" assert d_scores.shape == scores0.shape - d_scores = seq_cross_entropy.get_grad([scores0], [labels0]) + d_scores = SequenceCategoricalCrossentropy().get_grad([scores0], [labels0]) assert d_scores[0].dtype == "float32" assert d_scores[0].shape == scores0.shape - assert seq_cross_entropy.get_grad([], []) == [] - - -@pytest.mark.skipif(not has_torch, reason="needs PyTorch") -@pytest.mark.parametrize("xp", ALL_XP) -@settings(max_examples=MAX_EXAMPLES, deadline=None) -@given( - n_samples=integers(min_value=1, max_value=100), - n_classes=integers(min_value=1, max_value=100), - low=floats(min_value=-20, max_value=10), - offset=floats(min_value=1, max_value=10), -) -def test_compare_cross_entropy_to_torch(xp, n_samples, n_classes, low, offset): - import torch - - sparse_loss_sum = sparse_ce_factory(normalize=False) - sparse_loss_mean = sparse_ce_factory() - loss_sum = ce_factory(normalize=False) - loss_mean = ce_factory() - torch_loss_sum = torch.nn.CrossEntropyLoss(reduction="sum") - torch_loss_mean = torch.nn.CrossEntropyLoss() - logits = xp.random.uniform(low, low + offset, (n_samples, n_classes)) - labels = xp.random.randint(0, n_classes, n_samples) - labels_full = to_categorical(labels, n_classes=n_classes) - torch_logits = torch.tensor(logits, requires_grad=True) - torch_labels = torch.tensor(labels, dtype=torch.long) - probs, _ = softmax_func(logits) - d_sum_sparse, l_sum_sparse = sparse_loss_sum(probs, labels) - d_sum, l_sum = loss_sum(probs, labels_full) - torch_l_sum = torch_loss_sum(torch_logits, torch_labels) - torch_l_sum.backward() - torch_d_sum = torch_logits.grad - torch_logits = torch.tensor(logits, requires_grad=True) - d_mean_sparse, l_mean_sparse = sparse_loss_mean(probs, labels) - d_mean, l_mean = loss_mean(probs, labels_full) - torch_l_mean = torch_loss_mean(torch_logits, torch_labels) - torch_l_mean.backward() - torch_d_mean = 
torch_logits.grad - assert xp.isclose(float(l_sum), float(torch_l_sum), atol=1e-06) - assert xp.allclose(d_sum, torch_d_sum.numpy()) - assert xp.isclose(float(l_mean), float(torch_l_mean)) - assert xp.allclose(d_mean, torch_d_mean.numpy()) - assert xp.isclose(float(l_sum_sparse), float(torch_l_sum), atol=1e-06) - assert xp.allclose(d_sum_sparse, torch_d_sum.numpy()) - assert xp.isclose(float(l_mean_sparse), float(torch_l_mean)) - assert xp.allclose(d_mean_sparse, torch_d_mean.numpy()) - - -@pytest.mark.parametrize("dist", [CosineDistance(ignore_zeros=True), L2Distance()]) -@pytest.mark.parametrize("vect", [scores0, guesses1, guesses2]) -def test_equal_distance(dist, vect): - assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) - assert dist.get_loss(vect, vect) == pytest.approx(0, abs=eps) - - -@pytest.mark.parametrize("version", [1, 2, 3]) -@pytest.mark.parametrize("vect", [scores0, guesses1_legacy, guesses2_legacy]) -def test_equal_legacy_cross_entropy(vect, version): - cross_entropy = _get_legacy_cross_entropy(version) - assert int(cross_entropy.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) - assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) - assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) - - -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_cross_entropy_absent_labels(version): - cross_entropy = _get_legacy_cross_entropy(version, names=["cat", "dog", "rat"]) - assert cross_entropy.get_loss(scores0, [None, None, None]) == pytest.approx( - 0, abs=eps - ) + assert SequenceCategoricalCrossentropy().get_grad([], []) == [] @pytest.mark.parametrize( - "guesses, labels, grad, grad_seq, loss, loss_seq", - [ - (guesses1, labels1_full, d_guesses1, d_guesses1_seq, loss1, loss1_seq), - ], -) -def test_categorical_crossentropy(guesses, labels, grad, grad_seq, loss, loss_seq): - cross_entropy = ce_factory() - d_scores = cross_entropy.get_grad(guesses, labels) - loss_val = cross_entropy.get_loss(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) - assert numpy.isclose(loss_val, loss) - - # Test with Ragged inputs - d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) - loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) - assert d_scores_ragged.dataXd.shape == guesses.shape - assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) - assert numpy.isclose(loss_ragged, loss_seq) - - -@pytest.mark.parametrize( - "guesses, labels, grad, grad_seq, loss, loss_seq", - [ - (guesses1, labels1, d_guesses1, d_guesses1_seq, loss1, loss1_seq), - ], + "dist", [CategoricalCrossentropy(), CosineDistance(ignore_zeros=True), L2Distance()] ) -def test_sparse_categorical_crossentropy( - guesses, labels, grad, grad_seq, loss, loss_seq -): - cross_entropy = sparse_ce_factory() - d_scores = cross_entropy.get_grad(guesses, labels) - loss_val = cross_entropy.get_loss(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) - assert numpy.isclose(loss_val, loss) - - # Test with Ragged inputs - d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) - loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) - assert d_scores_ragged.dataXd.shape == guesses.shape - assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) - assert numpy.isclose(loss_ragged, loss_seq) +@pytest.mark.parametrize("vect", [scores0, guesses1, guesses2]) +def test_equality(dist, 
vect): + assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, eps) + assert dist.get_loss(vect, vect) == pytest.approx(0, eps) @pytest.mark.parametrize( - "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] + "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_crossentropy(guesses, labels, version): - cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) - d_scores = cross_entropy_normalize.get_grad(guesses, labels) +def test_categorical_crossentropy(guesses, labels): + d_scores = CategoricalCrossentropy(normalize=True).get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, abs=eps) - assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) + assert d_scores[1][0] == pytest.approx(0.1, eps) + assert d_scores[1][1] == pytest.approx(-0.1, eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores[2][0] == pytest.approx(0, abs=eps) - assert d_scores[2][1] == pytest.approx(0.25, abs=eps) - assert d_scores[2][2] == pytest.approx(0.25, abs=eps) + assert d_scores[2][0] == pytest.approx(0, eps) + assert d_scores[2][1] == pytest.approx(0.25, eps) + assert d_scores[2][2] == pytest.approx(0.25, eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, abs=eps) - assert d_scores[3][1] == pytest.approx(0, abs=eps) - assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) + assert d_scores[3][0] == pytest.approx(0, eps) + assert d_scores[3][1] == pytest.approx(0, eps) + assert d_scores[3][2] == pytest.approx(-0.25, eps) - loss = cross_entropy_normalize.get_loss(guesses, labels) - assert loss == pytest.approx(0.239375, abs=eps) + loss = CategoricalCrossentropy(normalize=True).get_loss(guesses, labels) + assert loss == pytest.approx(0.239375, eps) def test_crossentropy_incorrect_scores_targets(): labels = numpy.asarray([2]) - labels_full = numpy.asarray([[0.0, 0.0, 1.0]]) - cross_entropy = ce_factory() - sparse_cross_entropy = sparse_ce_factory() guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy.get_grad(guesses_neg, labels_full) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - sparse_cross_entropy.get_grad(guesses_neg, labels) - - guesses_dont_sum_one = numpy.asarray([[0.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy.get_grad(guesses_dont_sum_one, labels_full) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - sparse_cross_entropy.get_grad(guesses_dont_sum_one, labels) + CategoricalCrossentropy(normalize=True).get_grad(guesses_neg, labels) guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy.get_grad(guesses_larger_than_one, labels_full) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - sparse_cross_entropy.get_grad(guesses_larger_than_one, labels) - - guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) - targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy.get_grad(guesses_ok, targets_neg) - - targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) 
- with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy.get_grad(guesses_ok, targets_larger_than_one) - - targets_dont_sum_one = numpy.asarray([[0.9, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy.get_grad(guesses_ok, targets_dont_sum_one) - - -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_cross_entropy_incorrect_scores_targets(version): - labels = numpy.asarray([2]) - cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) - guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy_normalize.get_grad(guesses_neg, labels) - - guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy_normalize.get_grad(guesses_larger_than_one, labels) + CategoricalCrossentropy(normalize=True).get_grad( + guesses_larger_than_one, labels + ) guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy_normalize.get_grad(guesses_ok, targets_neg) + CategoricalCrossentropy(normalize=True).get_grad(guesses_ok, targets_neg) targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy_normalize.get_grad(guesses_ok, targets_larger_than_one) - - -@pytest.mark.parametrize( - "guesses, labels, grad, missing_value", - [ - (guesses1, [2, 1, 0, 2], d_guesses1_0_missing, 0), - (guesses1, labels1, d_guesses1_0_missing, 0), - (guesses1, labels1_strings, d_guesses1_0_missing, "A"), - ], -) -def test_sparse_crossentropy_missing(guesses, labels, grad, missing_value): - if missing_value == "A": - names = ["A", "B", "C"] - else: - names = None - sparse_cross_entropy = sparse_ce_factory(missing_value=missing_value, names=names) - d_scores = sparse_cross_entropy.get_grad(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) - loss = sparse_cross_entropy.get_loss(guesses, labels) - assert numpy.isclose(loss, loss1_0_missing) + CategoricalCrossentropy(normalize=True).get_grad( + guesses_ok, targets_larger_than_one + ) @pytest.mark.parametrize( "guesses, labels", - [(guesses1_legacy, [2, 1, 0, 2])], + [(guesses1, [2, 1, 0, 2])], ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_crossentropy_int_list_missing(guesses, labels, version): - cross_entropy_normalize_missing = _get_legacy_cross_entropy( - version, normalize=True, missing_value=0 +def test_categorical_crossentropy_int_list_missing(guesses, labels): + d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( + guesses, labels ) - d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, abs=eps) - assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) + assert d_scores[1][0] == pytest.approx(0.1, eps) + assert d_scores[1][1] == pytest.approx(-0.1, eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -386,46 +110,28 @@ def test_legacy_categorical_crossentropy_int_list_missing(guesses, labels, versi assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, abs=eps) - assert d_scores[3][1] == pytest.approx(0, abs=eps) - assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - - loss = cross_entropy_normalize_missing.get_loss(guesses, labels) - assert loss == pytest.approx(0.114375, abs=eps) - - -@pytest.mark.parametrize( - "guesses, labels, grad", - [ - (guesses1, labels1_full, d_guesses1_0_missing), - ], -) -def test_categorical_crossentropy_missing(guesses, labels, grad): - cross_entropy = ce_factory(missing_value=0) - d_scores = cross_entropy.get_grad(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) + assert d_scores[3][0] == pytest.approx(0, eps) + assert d_scores[3][1] == pytest.approx(0, eps) + assert d_scores[3][2] == pytest.approx(-0.25, eps) loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( guesses, labels ) - assert numpy.isclose(loss, loss1_0_missing) + assert loss == pytest.approx(0.114375, eps) @pytest.mark.parametrize( - "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] + "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_crossentropy_missing(guesses, labels, version): - cross_entropy_normalize_missing = _get_legacy_cross_entropy( - version, normalize=True, missing_value=0 +def test_categorical_crossentropy_missing(guesses, labels): + d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( + guesses, labels ) - d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, abs=eps) - assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) + assert d_scores[1][0] == pytest.approx(0.1, eps) + assert d_scores[1][1] == pytest.approx(-0.1, eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -433,179 +139,95 @@ def test_legacy_categorical_crossentropy_missing(guesses, labels, version): assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, abs=eps) - assert d_scores[3][1] == pytest.approx(0, abs=eps) - assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - - loss = cross_entropy_normalize_missing.get_loss(guesses, labels) - assert loss == pytest.approx(0.114375, abs=eps) - - -@pytest.mark.parametrize( - "guesses, labels, names, grad, loss", - [ - ( - [guesses1, guesses2], - [labels1, labels2], - [], - [d_guesses1_sum, d_guesses2_sum], - sequence_loss, - ), - ( - [guesses1, guesses2], - [labels1_strings, labels2_strings], - ["A", "B", "C"], - [d_guesses1_sum, d_guesses2_sum], - sequence_loss, - ), - ], -) -def test_sequence_sparse_crossentropy(guesses, labels, names, grad, loss): - sparse_seq_cross_entropy_sum = seq_ce_factory(names=names, normalize=False) - sparse_seq_cross_entropy = seq_ce_factory(names=names, normalize=True) - d_scores = sparse_seq_cross_entropy_sum.get_grad(guesses, labels) - assert numpy.allclose(d_scores[0], grad[0]) - assert numpy.allclose(d_scores[1], grad[1]) - # The normalization divides the difference (e.g. 0.4) by the number of seqs - d_scores = sparse_seq_cross_entropy.get_grad(guesses, labels) - assert numpy.allclose(d_scores[0], grad[0] / 2.0) - assert numpy.allclose(d_scores[1], grad[1] / 2.0) - loss_val = sparse_seq_cross_entropy.get_loss(guesses, labels) - assert numpy.isclose(loss_val, loss) - d_scores, loss_val = sparse_seq_cross_entropy_sum(guesses, labels) - assert numpy.isclose(loss_val, loss) - assert numpy.allclose(d_scores[0], grad[0]) - assert numpy.allclose(d_scores[1], grad[1]) + assert d_scores[3][0] == pytest.approx(0, eps) + assert d_scores[3][1] == pytest.approx(0, eps) + assert d_scores[3][2] == pytest.approx(-0.25, eps) - -@pytest.mark.parametrize( - "guesses, labels, grad, loss", - [([guesses1], [labels1_full], [d_guesses1_sum], [23.00604829563447])], -) -def test_sequence_crossentropy(guesses, labels, grad, loss): - seq_cross_entropy = seq_ce_factory(sparse=False, normalize=False) - d_scores = seq_cross_entropy.get_grad(guesses, labels) - assert numpy.allclose(d_scores[0], grad[0]) - # The normalization divides the difference (e.g. 
0.4) by the number of seqs - loss_val = seq_cross_entropy.get_loss(guesses, labels) - assert numpy.isclose(loss_val, loss) - d_scores, loss_val = seq_cross_entropy(guesses, labels) - assert numpy.isclose(loss_val, loss) - assert numpy.allclose(d_scores[0], grad[0]) + loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( + guesses, labels + ) + assert loss == pytest.approx(0.114375, eps) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1_legacy, guesses2_legacy], [labels1, labels2], []), - ([guesses1_legacy, guesses2_legacy], [labels1_full, labels2], []), - ( - [guesses1_legacy, guesses2_legacy], - [labels1_strings, labels2_strings], - ["A", "B", "C"], - ), + ([guesses1, guesses2], [labels1, labels2], []), + ([guesses1, guesses2], [labels1_full, labels2], []), + ([guesses1, guesses2], [labels1_strings, labels2_strings], ["A", "B", "C"]), ], ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_sequence_categorical_crossentropy(guesses, labels, names, version): - seq_cross_entropy_names = _get_legacy_seq_cross_entropy( - version, normalize=False, names=names - ) - seq_cross_entropy_names_normalize = _get_legacy_seq_cross_entropy( - version, normalize=True, names=names +def test_sequence_categorical_crossentropy(guesses, labels, names): + d_scores = SequenceCategoricalCrossentropy(normalize=False, names=names).get_grad( + guesses, labels ) - d_scores = seq_cross_entropy_names.get_grad(guesses, labels) d_scores1 = d_scores[0] d_scores2 = d_scores[1] assert d_scores1.shape == guesses1.shape assert d_scores2.shape == guesses2.shape - assert d_scores1[1][0] == pytest.approx(0.4, abs=eps) - assert d_scores1[1][1] == pytest.approx(-0.4, abs=eps) + assert d_scores1[1][0] == pytest.approx(0.4, eps) + assert d_scores1[1][1] == pytest.approx(-0.4, eps) # The normalization divides the difference (e.g. 
0.4) by the number of seqs - d_scores = seq_cross_entropy_names_normalize.get_grad(guesses, labels) + d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad( + guesses, labels + ) d_scores1 = d_scores[0] d_scores2 = d_scores[1] - assert d_scores1[1][0] == pytest.approx(0.2, abs=eps) - assert d_scores1[1][1] == pytest.approx(-0.2, abs=eps) + assert d_scores1[1][0] == pytest.approx(0.2, eps) + assert d_scores1[1][1] == pytest.approx(-0.2, eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores1[2][0] == pytest.approx(0, abs=eps) - assert d_scores1[2][1] == pytest.approx(0.5, abs=eps) - assert d_scores1[2][2] == pytest.approx(0.5, abs=eps) + assert d_scores1[2][0] == pytest.approx(0, eps) + assert d_scores1[2][1] == pytest.approx(0.5, eps) + assert d_scores1[2][2] == pytest.approx(0.5, eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores1[3][0] == pytest.approx(0, abs=eps) - assert d_scores1[3][1] == pytest.approx(0, abs=eps) - assert d_scores1[3][2] == pytest.approx(-0.5, abs=eps) + assert d_scores1[3][0] == pytest.approx(0, eps) + assert d_scores1[3][1] == pytest.approx(0, eps) + assert d_scores1[3][2] == pytest.approx(-0.5, eps) # Test the second batch - assert d_scores2[0][0] == pytest.approx(0.1, abs=eps) - assert d_scores2[0][1] == pytest.approx(-0.35, abs=eps) - - loss = seq_cross_entropy_names_normalize.get_loss(guesses, labels) - assert loss == pytest.approx(1.09, abs=eps) + assert d_scores2[0][0] == pytest.approx(0.1, eps) + assert d_scores2[0][1] == pytest.approx(-0.35, eps) - -@pytest.mark.parametrize( - "guesses, labels, names, grad", - [ - ( - [guesses1], - [["A", "!A", "", "!C"]], - ["A", "B", "C"], - numpy.array( - [ - [-0.9, 0.5, 0.4], # First is correct - [0.4, 0.0, 0.0], # Not first one - [0.0, 0.0, 0.0], # Missing - [0.0, 0.0, 0.85], # Not last one - ] - ), - ) - ], -) -def test_sequence_crossentropy_missing_negative(guesses, labels, names, grad): - sparse_seq_ce = seq_ce_factory( - names=names, normalize=False, neg_prefix="!", missing_value="" + loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss( + guesses, labels ) - d_scores = sparse_seq_ce.get_grad(guesses, labels) - assert numpy.allclose(d_scores, grad) + assert loss == pytest.approx(1.09, eps) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1_legacy], [["A", "!A", "", "!C"]], ["A", "B", "C"]), + ([guesses1], [["A", "!A", "", "!C"]], ["A", "B", "C"]), ], ) -@pytest.mark.parametrize("version", [3]) -def test_legacy_sequence_categorical_missing_negative(guesses, labels, names, version): - seq_cross_entropy = _get_legacy_seq_cross_entropy( - version, normalize=False, names=names, neg_prefix="!", missing_value="" - ) - d_scores = seq_cross_entropy.get_grad(guesses, labels) +def test_sequence_categorical_missing_negative(guesses, labels, names): + d_scores = SequenceCategoricalCrossentropy( + normalize=False, names=names, neg_prefix="!", missing_value="" + ).get_grad(guesses, labels) d_scores0 = d_scores[0] # [0.1, 0.5, 0.6] should be A - assert d_scores0[0][0] == pytest.approx(-0.9, abs=eps) - assert d_scores0[0][1] == pytest.approx(0.5, abs=eps) - assert d_scores0[0][2] == pytest.approx(0.6, abs=eps) + assert d_scores0[0][0] == pytest.approx(-0.9, eps) + assert d_scores0[0][1] == pytest.approx(0.5, eps) + assert d_scores0[0][2] == pytest.approx(0.6, eps) # [0.4, 0.6, 0.3] should NOT be A - assert d_scores0[1][0] == pytest.approx(0.4, abs=eps) - assert 
d_scores0[1][1] == pytest.approx(0.0, abs=eps) - assert d_scores0[1][2] == pytest.approx(0.0, abs=eps) + assert d_scores0[1][0] == pytest.approx(0.4, eps) + assert d_scores0[1][1] == pytest.approx(0.0, eps) + assert d_scores0[1][2] == pytest.approx(0.0, eps) # [1, 1, 1] has missing gold label - assert d_scores0[2][0] == pytest.approx(0.0, abs=eps) - assert d_scores0[2][1] == pytest.approx(0.0, abs=eps) - assert d_scores0[2][2] == pytest.approx(0.0, abs=eps) + assert d_scores0[2][0] == pytest.approx(0.0, eps) + assert d_scores0[2][1] == pytest.approx(0.0, eps) + assert d_scores0[2][2] == pytest.approx(0.0, eps) # [0.0, 0.0, 0.0] should NOT be C - assert d_scores0[3][0] == pytest.approx(0.0, abs=eps) - assert d_scores0[3][1] == pytest.approx(0.0, abs=eps) - assert d_scores0[3][2] == pytest.approx(0.0, abs=eps) + assert d_scores0[3][0] == pytest.approx(0.0, eps) + assert d_scores0[3][1] == pytest.approx(0.0, eps) + assert d_scores0[3][2] == pytest.approx(0.0, eps) def test_L2(): @@ -619,10 +241,10 @@ def test_L2(): ) loss_not_normalized = L2Distance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(20, abs=eps) + assert loss_not_normalized == pytest.approx(20, eps) loss_normalized = L2Distance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(5, abs=eps) + assert loss_normalized == pytest.approx(5, eps) def test_cosine_orthogonal(): @@ -638,10 +260,10 @@ def test_cosine_orthogonal(): assert d_vecs[1][1] > 0 loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(2, abs=eps) + assert loss_not_normalized == pytest.approx(2, eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(1, abs=eps) + assert loss_normalized == pytest.approx(1, eps) def test_cosine_equal(): @@ -654,10 +276,10 @@ def test_cosine_equal(): numpy.testing.assert_allclose(d_vec1, numpy.zeros(d_vec1.shape), rtol=eps, atol=eps) loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(0, abs=eps) + assert loss_not_normalized == pytest.approx(0, eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(0, abs=eps) + assert loss_normalized == pytest.approx(0, eps) def test_cosine_unmatched(): @@ -670,26 +292,19 @@ def test_cosine_unmatched(): @pytest.mark.parametrize( "name,kwargs,args", [ - ("CategoricalCrossentropy.v1", {}, (guesses1, labels1)), - ("SequenceCategoricalCrossentropy.v1", {}, ([guesses1], [labels1])), - ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (guesses1, labels1)), - ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (guesses1, labels1)), - ("SparseCategoricalCrossentropy.v4", {"neg_prefix": "!"}, (guesses1, labels1)), - ("CategoricalCrossentropy.v4", {}, (guesses1, labels1_full)), + ("CategoricalCrossentropy.v1", {}, (scores0, labels0)), + ("SequenceCategoricalCrossentropy.v1", {}, ([scores0], [labels0])), + ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (scores0, labels0)), + ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (scores0, labels0)), ( "SequenceCategoricalCrossentropy.v2", {"neg_prefix": "!"}, - ([guesses1], [labels1]), + ([scores0], [labels0]), ), ( "SequenceCategoricalCrossentropy.v3", {"neg_prefix": "!"}, - ([guesses1], [labels1]), - ), - ( - "SequenceCategoricalCrossentropy.v4", - {"neg_prefix": "!"}, - ([guesses1], [labels1]), + ([scores0], [labels0]), ), ("L2Distance.v1", 
{}, (scores0, scores0)), ( diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py index f525a5133..133efbe60 100644 --- a/thinc/tests/test_util.py +++ b/thinc/tests/test_util.py @@ -5,7 +5,6 @@ from thinc.util import get_array_module, is_numpy_array, to_categorical from thinc.util import is_cupy_array from thinc.util import convert_recursive -from thinc.util import smooth_one_hot from thinc.types import ArgsKwargs @@ -147,26 +146,6 @@ def test_to_categorical(label_smoothing): to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.88) -@given( - n_classes=strategies.lengths(lo=2, hi=100), - n_samples=strategies.lengths(lo=1, hi=100), - label_smoothing=strategies.floats(min_value=0.0, max_value=1.0) -) -def test_smooth_one_hot(n_samples, n_classes, label_smoothing): - one_hot = numpy.zeros((n_samples, n_classes)) - labels = numpy.random.randint(0, n_classes, (n_samples,)) - one_hot[numpy.arange(n_samples), labels] = 1 - max_smooth = (n_classes - 1) / n_classes - if label_smoothing >= max_smooth: - with pytest.raises(ValueError, match=r"label_smoothing parameter has to be less than"): - smooth_one_hot(one_hot, label_smoothing) - else: - smoothed = smooth_one_hot(one_hot, label_smoothing) - assert numpy.all(numpy.argmax(smoothed, axis=1) == labels) - assert smoothed.shape == one_hot.shape - assert numpy.allclose(smoothed.sum(1), 1.0) - - def test_convert_recursive(): is_match = lambda obj: obj == "foo" convert_item = lambda obj: obj.upper() diff --git a/thinc/util.py b/thinc/util.py index c7212818f..9afec29ba 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,7 +1,5 @@ from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar -from typing import List, Mapping -from typing import TYPE_CHECKING - +from typing import List, Mapping, TYPE_CHECKING import numpy import platform import random @@ -15,17 +13,18 @@ import contextlib from contextvars import ContextVar from dataclasses import dataclass + from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu from .compat import has_torch_mps from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack - -from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd, Floats2d # noqa: E402 +from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 from . import types # noqa: E402 if TYPE_CHECKING: from .api import Ops + DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) @@ -261,32 +260,6 @@ def to_categorical( return label_distr[Y] -def smooth_one_hot(X: Floats2d, label_smoothing: float) -> Floats2d: - """ - Apply label-smoothing to one-hot array. - """ - n_classes = X.shape[1] - max_smooth = (n_classes - 1) / n_classes - if label_smoothing < 0.0: - raise ValueError( - "Label-smoothing parameter has to be greater than or equal to 0" - ) - if not n_classes > 1: - raise ValueError( - "n_classes should be greater than 1 when label smoothing is enabled," - f"but {n_classes} was provided." - ) - if label_smoothing >= max_smooth: - raise ValueError( - f"For {n_classes} classes " - "label_smoothing parameter has to be less than " - f"{max_smooth}, but found {label_smoothing}." 
- ) - X[X == 1] = 1 - label_smoothing - X[X == 0] = label_smoothing / (n_classes - 1) - return X - - def get_width( X: Union[ArrayXd, Ragged, Padded, Sequence[ArrayXd]], *, dim: int = -1 ) -> int: @@ -650,7 +623,6 @@ def check_consistency(self, arr: ArrayXd): "require_gpu", "copy_array", "to_categorical", - "smooth_one_hot", "get_width", "xp2torch", "torch2xp", From 95f894f3c4b5e6df5cf26fae280d3864e3f25423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 9 Jan 2024 09:19:28 +0100 Subject: [PATCH 18/30] isort --- thinc/__init__.py | 1 - thinc/api.py | 210 ++++++++++++++---- thinc/backends/__init__.py | 27 ++- thinc/backends/_cupy_allocators.py | 2 +- thinc/backends/_custom_kernels.py | 11 +- thinc/backends/_param_server.py | 3 +- thinc/backends/cblas.pxd | 1 - thinc/backends/cupy_ops.py | 21 +- thinc/backends/mps_ops.py | 4 +- thinc/backends/numpy_ops.pyx | 24 +- thinc/backends/ops.py | 55 ++++- thinc/compat.py | 4 +- thinc/config.py | 3 +- thinc/initializers.py | 1 + thinc/layers/__init__.py | 94 ++++---- thinc/layers/add.py | 5 +- thinc/layers/array_getitem.py | 6 +- thinc/layers/bidirectional.py | 5 +- thinc/layers/cauchysimilarity.py | 5 +- thinc/layers/chain.py | 7 +- thinc/layers/clipped_linear.py | 10 +- thinc/layers/clone.py | 9 +- thinc/layers/concatenate.py | 20 +- thinc/layers/dish.py | 10 +- thinc/layers/dropout.py | 7 +- thinc/layers/embed.py | 11 +- thinc/layers/expand_window.py | 5 +- thinc/layers/gelu.py | 10 +- thinc/layers/hard_swish.py | 10 +- thinc/layers/hard_swish_mobilenet.py | 10 +- thinc/layers/hashembed.py | 11 +- thinc/layers/layernorm.py | 7 +- thinc/layers/linear.py | 7 +- thinc/layers/list2array.py | 5 +- thinc/layers/list2padded.py | 7 +- thinc/layers/list2ragged.py | 7 +- thinc/layers/logistic.py | 5 +- thinc/layers/lstm.py | 13 +- thinc/layers/map_list.py | 4 +- thinc/layers/maxout.py | 7 +- thinc/layers/mish.py | 9 +- thinc/layers/multisoftmax.py | 7 +- thinc/layers/mxnetwrapper.py | 7 +- thinc/layers/noop.py | 5 +- thinc/layers/padded2list.py | 7 +- thinc/layers/parametricattention.py | 5 +- thinc/layers/premap_ids.pyx | 8 +- thinc/layers/pytorchwrapper.py | 16 +- thinc/layers/ragged2list.py | 7 +- thinc/layers/reduce_first.py | 5 +- thinc/layers/reduce_last.py | 4 +- thinc/layers/reduce_max.py | 7 +- thinc/layers/reduce_mean.py | 7 +- thinc/layers/reduce_sum.py | 3 +- thinc/layers/relu.py | 11 +- thinc/layers/remap_ids.py | 8 +- thinc/layers/residual.py | 6 +- thinc/layers/resizable.py | 2 +- thinc/layers/siamese.py | 5 +- thinc/layers/sigmoid.py | 7 +- thinc/layers/sigmoid_activation.py | 4 +- thinc/layers/softmax.py | 9 +- thinc/layers/softmax_activation.py | 5 +- thinc/layers/sparselinear.pyx | 15 +- thinc/layers/strings2arrays.py | 6 +- thinc/layers/swish.py | 10 +- thinc/layers/tensorflowwrapper.py | 14 +- thinc/layers/torchscriptwrapper.py | 7 +- thinc/layers/tuplify.py | 4 +- thinc/layers/uniqued.py | 8 +- thinc/layers/with_array.py | 7 +- thinc/layers/with_array2d.py | 5 +- thinc/layers/with_cpu.py | 5 +- thinc/layers/with_debug.py | 2 +- thinc/layers/with_flatten.py | 4 +- thinc/layers/with_flatten_v2.py | 5 +- thinc/layers/with_getitem.py | 5 +- thinc/layers/with_list.py | 6 +- thinc/layers/with_nvtx_range.py | 3 +- thinc/layers/with_padded.py | 7 +- thinc/layers/with_ragged.py | 7 +- thinc/layers/with_reshape.py | 7 +- thinc/layers/with_signpost_interval.py | 3 +- thinc/loss.py | 17 +- thinc/model.py | 37 ++- thinc/mypy.py | 13 +- thinc/optimizers.py | 9 +- thinc/schedules.py | 6 +- thinc/shims/__init__.py | 7 +- 
thinc/shims/mxnet.py | 14 +- thinc/shims/pytorch.py | 17 +- thinc/shims/shim.py | 6 +- thinc/shims/tensorflow.py | 9 +- thinc/shims/torchscript.py | 3 +- thinc/tests/backends/test_mem.py | 3 +- thinc/tests/backends/test_ops.py | 29 ++- thinc/tests/conftest.py | 5 +- thinc/tests/layers/test_basic_tagger.py | 17 +- thinc/tests/layers/test_combinators.py | 16 +- thinc/tests/layers/test_feed_forward.py | 8 +- thinc/tests/layers/test_hash_embed.py | 1 + thinc/tests/layers/test_layers_api.py | 13 +- thinc/tests/layers/test_linear.py | 7 +- thinc/tests/layers/test_lstm.py | 9 +- thinc/tests/layers/test_mappers.py | 3 +- thinc/tests/layers/test_mnist.py | 16 +- thinc/tests/layers/test_mxnet_wrapper.py | 15 +- thinc/tests/layers/test_pytorch_wrapper.py | 38 +++- thinc/tests/layers/test_reduce.py | 3 +- thinc/tests/layers/test_resizable.py | 8 +- thinc/tests/layers/test_shim.py | 2 + thinc/tests/layers/test_softmax.py | 2 +- thinc/tests/layers/test_sparse_linear.py | 4 +- thinc/tests/layers/test_tensorflow_wrapper.py | 16 +- thinc/tests/layers/test_torchscriptwrapper.py | 9 +- thinc/tests/layers/test_transforms.py | 3 +- thinc/tests/layers/test_uniqued.py | 9 +- thinc/tests/layers/test_with_debug.py | 3 +- thinc/tests/layers/test_with_flatten.py | 1 + thinc/tests/layers/test_with_transforms.py | 19 +- thinc/tests/model/test_model.py | 29 ++- thinc/tests/model/test_validation.py | 13 +- thinc/tests/mypy/modules/fail_no_plugin.py | 2 +- thinc/tests/mypy/modules/fail_plugin.py | 2 +- thinc/tests/mypy/modules/success_no_plugin.py | 2 +- thinc/tests/mypy/modules/success_plugin.py | 2 +- thinc/tests/mypy/test_mypy.py | 2 +- thinc/tests/regression/issue519/program.py | 2 +- thinc/tests/regression/test_issue208.py | 2 +- thinc/tests/shims/test_pytorch_grad_scaler.py | 4 +- thinc/tests/strategies.py | 5 +- thinc/tests/test_config.py | 17 +- thinc/tests/test_import__all__.py | 4 +- thinc/tests/test_indexing.py | 5 +- thinc/tests/test_initializers.py | 12 +- thinc/tests/test_loss.py | 11 +- thinc/tests/test_optimizers.py | 6 +- thinc/tests/test_schedules.py | 13 +- thinc/tests/test_serialize.py | 13 +- thinc/tests/test_types.py | 15 +- thinc/tests/test_util.py | 16 +- thinc/tests/util.py | 8 +- thinc/types.py | 31 ++- thinc/util.py | 53 +++-- 144 files changed, 1006 insertions(+), 608 deletions(-) diff --git a/thinc/__init__.py b/thinc/__init__.py index dfa821c4f..8f4a8a5a5 100644 --- a/thinc/__init__.py +++ b/thinc/__init__.py @@ -4,7 +4,6 @@ from .about import __version__ from .config import registry - # fmt: off __all__ = [ "registry", diff --git a/thinc/api.py b/thinc/api.py index 74633addd..74a654622 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -1,53 +1,165 @@ -from .config import Config, registry, ConfigValidationError -from .initializers import normal_init, uniform_init, glorot_uniform_init, zero_init -from .initializers import configure_normal_init -from .loss import CategoricalCrossentropy, L2Distance, CosineDistance -from .loss import SequenceCategoricalCrossentropy -from .model import Model, serialize_attr, deserialize_attr -from .model import set_dropout_rate, change_attr_values, wrap_model_recursive -from .shims import Shim, PyTorchGradScaler, PyTorchShim, TensorFlowShim, keras_model_fns -from .shims import MXNetShim, TorchScriptShim, maybe_handshake_model -from .optimizers import Adam, RAdam, SGD, Optimizer -from .schedules import Schedule, cyclic_triangular, warmup_linear, constant -from .schedules import constant_then, decaying, slanted_triangular, compounding -from .schedules import plateau 
-from .types import Ragged, Padded, ArgsKwargs, Unserializable -from .util import fix_random_seed, is_cupy_array, set_active_gpu -from .util import prefer_gpu, require_gpu, require_cpu -from .util import DataValidationError, data_validation -from .util import to_categorical, get_width, get_array_module, to_numpy -from .util import torch2xp, xp2torch, tensorflow2xp, xp2tensorflow, mxnet2xp, xp2mxnet -from .util import get_torch_default_device +from .backends import ( + CupyOps, + MPSOps, + NumpyOps, + Ops, + get_current_ops, + get_ops, + set_current_ops, + set_gpu_allocator, + use_ops, + use_pytorch_for_gpu_memory, + use_tensorflow_for_gpu_memory, +) from .compat import has_cupy -from .backends import get_ops, set_current_ops, get_current_ops, use_ops -from .backends import Ops, CupyOps, MPSOps, NumpyOps, set_gpu_allocator -from .backends import use_pytorch_for_gpu_memory, use_tensorflow_for_gpu_memory - -from .layers import Dropout, Embed, expand_window, HashEmbed, LayerNorm, Linear -from .layers import Maxout, Mish, MultiSoftmax, Relu, softmax_activation, Softmax, LSTM -from .layers import CauchySimilarity, ParametricAttention, Logistic -from .layers import resizable, sigmoid_activation, Sigmoid, SparseLinear -from .layers import SparseLinear_v2, ClippedLinear, ReluK, HardTanh, HardSigmoid -from .layers import Dish, HardSwish, HardSwishMobilenet, Swish, Gelu -from .layers import PyTorchWrapper, PyTorchRNNWrapper, PyTorchLSTM -from .layers import TensorFlowWrapper, keras_subclass, MXNetWrapper -from .layers import PyTorchWrapper_v2, Softmax_v2, PyTorchWrapper_v3 -from .layers import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper - -from .layers import add, bidirectional, chain, clone, concatenate, noop -from .layers import residual, uniqued, siamese, list2ragged, ragged2list -from .layers import map_list -from .layers import with_array, with_array2d -from .layers import with_padded, with_list, with_ragged, with_flatten -from .layers import with_reshape, with_getitem, strings2arrays, list2array -from .layers import list2ragged, ragged2list, list2padded, padded2list -from .layers import remap_ids, remap_ids_v2, premap_ids -from .layers import array_getitem, with_cpu, with_debug, with_nvtx_range -from .layers import with_signpost_interval -from .layers import tuplify, with_flatten_v2 - -from .layers import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum - +from .config import Config, ConfigValidationError, registry +from .initializers import ( + configure_normal_init, + glorot_uniform_init, + normal_init, + uniform_init, + zero_init, +) +from .layers import ( + LSTM, + CauchySimilarity, + ClippedLinear, + Dish, + Dropout, + Embed, + Gelu, + HardSigmoid, + HardSwish, + HardSwishMobilenet, + HardTanh, + HashEmbed, + LayerNorm, + Linear, + Logistic, + Maxout, + Mish, + MultiSoftmax, + MXNetWrapper, + ParametricAttention, + PyTorchLSTM, + PyTorchRNNWrapper, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, + Relu, + ReluK, + Sigmoid, + Softmax, + Softmax_v2, + SparseLinear, + SparseLinear_v2, + Swish, + TensorFlowWrapper, + TorchScriptWrapper_v1, + add, + array_getitem, + bidirectional, + chain, + clone, + concatenate, + expand_window, + keras_subclass, + list2array, + list2padded, + list2ragged, + map_list, + noop, + padded2list, + premap_ids, + pytorch_to_torchscript_wrapper, + ragged2list, + reduce_first, + reduce_last, + reduce_max, + reduce_mean, + reduce_sum, + remap_ids, + remap_ids_v2, + residual, + resizable, + siamese, + sigmoid_activation, + 
softmax_activation, + strings2arrays, + tuplify, + uniqued, + with_array, + with_array2d, + with_cpu, + with_debug, + with_flatten, + with_flatten_v2, + with_getitem, + with_list, + with_nvtx_range, + with_padded, + with_ragged, + with_reshape, + with_signpost_interval, +) +from .loss import ( + CategoricalCrossentropy, + CosineDistance, + L2Distance, + SequenceCategoricalCrossentropy, +) +from .model import ( + Model, + change_attr_values, + deserialize_attr, + serialize_attr, + set_dropout_rate, + wrap_model_recursive, +) +from .optimizers import SGD, Adam, Optimizer, RAdam +from .schedules import ( + Schedule, + compounding, + constant, + constant_then, + cyclic_triangular, + decaying, + plateau, + slanted_triangular, + warmup_linear, +) +from .shims import ( + MXNetShim, + PyTorchGradScaler, + PyTorchShim, + Shim, + TensorFlowShim, + TorchScriptShim, + keras_model_fns, + maybe_handshake_model, +) +from .types import ArgsKwargs, Padded, Ragged, Unserializable +from .util import ( + DataValidationError, + data_validation, + fix_random_seed, + get_array_module, + get_torch_default_device, + get_width, + is_cupy_array, + mxnet2xp, + prefer_gpu, + require_cpu, + require_gpu, + set_active_gpu, + tensorflow2xp, + to_categorical, + to_numpy, + torch2xp, + xp2mxnet, + xp2tensorflow, + xp2torch, +) # fmt: off __all__ = [ diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py index c21620126..8973c8836 100644 --- a/thinc/backends/__init__.py +++ b/thinc/backends/__init__.py @@ -1,20 +1,23 @@ import contextlib -from typing import Type, Dict, Any, Callable, Optional, cast - -from contextvars import ContextVar import threading +from contextvars import ContextVar +from typing import Any, Callable, Dict, Optional, Type, cast -from .ops import Ops -from .cupy_ops import CupyOps -from .numpy_ops import NumpyOps -from .mps_ops import MPSOps -from ._cupy_allocators import cupy_tensorflow_allocator, cupy_pytorch_allocator -from ._param_server import ParamServer -from ..util import assert_tensorflow_installed, assert_pytorch_installed -from ..util import get_torch_default_device, is_cupy_array, require_cpu from .. 
import registry from ..compat import cupy, has_cupy - +from ..util import ( + assert_pytorch_installed, + assert_tensorflow_installed, + get_torch_default_device, + is_cupy_array, + require_cpu, +) +from ._cupy_allocators import cupy_pytorch_allocator, cupy_tensorflow_allocator +from ._param_server import ParamServer +from .cupy_ops import CupyOps +from .mps_ops import MPSOps +from .numpy_ops import NumpyOps +from .ops import Ops context_ops: ContextVar[Optional[Ops]] = ContextVar("context_ops", default=None) context_pools: ContextVar[dict] = ContextVar("context_pools", default={}) diff --git a/thinc/backends/_cupy_allocators.py b/thinc/backends/_cupy_allocators.py index f2b6faee9..77c958e36 100644 --- a/thinc/backends/_cupy_allocators.py +++ b/thinc/backends/_cupy_allocators.py @@ -1,8 +1,8 @@ from typing import cast +from ..compat import cupy, tensorflow, torch from ..types import ArrayXd from ..util import get_torch_default_device, tensorflow2xp -from ..compat import torch, cupy, tensorflow def cupy_tensorflow_allocator(size_in_bytes: int): diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index 0b868e6d6..fa837017d 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -1,12 +1,13 @@ -from typing import Callable, Optional, Tuple -from functools import reduce -import numpy import operator import re -from pathlib import Path from collections import defaultdict -from ..compat import cupy, has_cupy_gpu +from functools import reduce +from pathlib import Path +from typing import Callable, Optional, Tuple +import numpy + +from ..compat import cupy, has_cupy_gpu PWD = Path(__file__).parent KERNELS_SRC = (PWD / "_custom_kernels.cu").read_text(encoding="utf8") diff --git a/thinc/backends/_param_server.py b/thinc/backends/_param_server.py index 4ce374a4e..db7b5a505 100644 --- a/thinc/backends/_param_server.py +++ b/thinc/backends/_param_server.py @@ -1,9 +1,8 @@ -from typing import Dict, Tuple, Optional, Any +from typing import Any, Dict, Optional, Tuple from ..types import FloatsXd from ..util import get_array_module - KeyT = Tuple[int, str] diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd index a789ef4a3..c608d8702 100644 --- a/thinc/backends/cblas.pxd +++ b/thinc/backends/cblas.pxd @@ -1,6 +1,5 @@ from libcpp.memory cimport shared_ptr - ctypedef void (*sgemm_ptr)(bint transA, bint transB, int M, int N, int K, float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, int ldc) nogil diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 506276380..366faf70a 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -1,13 +1,20 @@ import numpy + from .. import registry -from .ops import Ops -from .numpy_ops import NumpyOps -from . import _custom_kernels -from ..types import DeviceTypes -from ..util import torch2xp, tensorflow2xp, mxnet2xp -from ..util import is_cupy_array -from ..util import is_torch_cuda_array, is_tensorflow_gpu_array, is_mxnet_gpu_array from ..compat import cupy, cupyx +from ..types import DeviceTypes +from ..util import ( + is_cupy_array, + is_mxnet_gpu_array, + is_tensorflow_gpu_array, + is_torch_cuda_array, + mxnet2xp, + tensorflow2xp, + torch2xp, +) +from . 
import _custom_kernels +from .numpy_ops import NumpyOps +from .ops import Ops @registry.ops("CupyOps") diff --git a/thinc/backends/mps_ops.py b/thinc/backends/mps_ops.py index 8ebbd4e4b..c6ba71f11 100644 --- a/thinc/backends/mps_ops.py +++ b/thinc/backends/mps_ops.py @@ -1,8 +1,10 @@ from typing import TYPE_CHECKING + import numpy from .. import registry -from . import NumpyOps, Ops +from .numpy_ops import NumpyOps +from .ops import Ops if TYPE_CHECKING: # Type checking does not work with dynamic base classes, since MyPy cannot diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 45d3d9093..87c6b9d01 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -1,27 +1,29 @@ # cython: cdivision=True # cython: infer_types=True # cython: profile=True -from typing import Optional from collections.abc import Sized +from typing import Optional + import numpy cimport cython -from libc.string cimport memcpy, memset -from libc.stdlib cimport calloc, malloc, free -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memcpy -from libc.math cimport isnan +cimport numpy as np from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap +from libc.math cimport isnan +from libc.stdint cimport uint32_t, uint64_t +from libc.stdlib cimport calloc, free, malloc +from libc.string cimport memcpy, memset from murmurhash.mrmr cimport hash64 -cimport numpy as np +from preshed.maps cimport PreshMap from .. import registry +from ..types import ArrayXd, DeviceTypes, DTypes, Shape from ..util import copy_array, get_array_module -from ..types import DeviceTypes, DTypes, Shape, ArrayXd -from .cblas cimport CBlas, daxpy, saxpy, sgemm, dgemm, sscal -from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights + +from .cblas cimport CBlas, daxpy, dgemm, saxpy, sgemm, sscal + from ..compat import has_blis +from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights cdef extern from "math.h": diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 8bb770023..01bb2f852 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -1,18 +1,53 @@ +import itertools import math +from typing import ( + Any, + Iterator, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, + overload, +) -from typing import Optional, List, Tuple, Sequence, Type, Union, cast, TypeVar -from typing import Iterator, overload, Any import numpy -import itertools -from ..types import Xp, Shape, DTypes, DTypesInt, DTypesFloat, List2d, ArrayXd -from ..types import Floats1d, Floats2d, Floats3d, Floats4d -from ..types import Array1d, Array2d, Array3d, Array4d, ListXd -from ..types import FloatsXd, Ints1d, Ints2d, Ints3d, Ints4d, IntsXd, _Floats -from ..types import FloatsXdT -from ..types import DeviceTypes, Generator, Padded, Batchable, SizedGenerator +from ..types import ( + Array1d, + Array2d, + Array3d, + Array4d, + ArrayXd, + Batchable, + DeviceTypes, + DTypes, + DTypesFloat, + DTypesInt, + Floats1d, + Floats2d, + Floats3d, + Floats4d, + FloatsXd, + FloatsXdT, + Generator, + Ints1d, + Ints2d, + Ints3d, + Ints4d, + IntsXd, + List2d, + ListXd, + Padded, + Shape, + SizedGenerator, + Xp, + _Floats, +) from ..util import get_array_module, is_xp_array, to_numpy - from .cblas import CBlas ArrayT = TypeVar("ArrayT", bound=ArrayXd) diff --git a/thinc/compat.py b/thinc/compat.py index 6d8b139fe..7e79cdaf9 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -27,8 +27,8 @@ try: # pragma: no cover - import 
torch.utils.dlpack import torch + import torch.utils.dlpack has_torch = True has_torch_cuda_gpu = torch.cuda.device_count() != 0 @@ -51,8 +51,8 @@ torch_version = Version("0.0.0") try: # pragma: no cover - import tensorflow.experimental.dlpack import tensorflow + import tensorflow.experimental.dlpack has_tensorflow = True has_tensorflow_gpu = len(tensorflow.config.get_visible_devices("GPU")) > 0 diff --git a/thinc/config.py b/thinc/config.py index e5452819b..434c96085 100644 --- a/thinc/config.py +++ b/thinc/config.py @@ -1,6 +1,7 @@ import catalogue import confection -from confection import Config, ConfigValidationError, Promise, VARIABLE_RE +from confection import VARIABLE_RE, Config, ConfigValidationError, Promise + from .types import Decorator diff --git a/thinc/initializers.py b/thinc/initializers.py index 1333911a3..feb02889d 100644 --- a/thinc/initializers.py +++ b/thinc/initializers.py @@ -1,4 +1,5 @@ from typing import Callable, cast + import numpy from .backends import Ops diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 4b73a2dce..032af5fde 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -1,48 +1,48 @@ # Weights layers +# Combinators +from .add import add + +# Array manipulation +from .array_getitem import array_getitem +from .bidirectional import bidirectional from .cauchysimilarity import CauchySimilarity +from .chain import chain +from .clipped_linear import ClippedLinear, HardSigmoid, HardTanh, ReluK +from .clone import clone +from .concatenate import concatenate from .dish import Dish from .dropout import Dropout from .embed import Embed from .expand_window import expand_window +from .gelu import Gelu +from .hard_swish import HardSwish +from .hard_swish_mobilenet import HardSwishMobilenet from .hashembed import HashEmbed from .layernorm import LayerNorm from .linear import Linear -from .lstm import LSTM, PyTorchLSTM + +# Data-type transfers +from .list2array import list2array +from .list2padded import list2padded +from .list2ragged import list2ragged from .logistic import Logistic +from .lstm import LSTM, PyTorchLSTM +from .map_list import map_list from .maxout import Maxout from .mish import Mish from .multisoftmax import MultiSoftmax -from .parametricattention import ParametricAttention -from .pytorchwrapper import PyTorchWrapper, PyTorchWrapper_v2, PyTorchWrapper_v3 -from .pytorchwrapper import PyTorchRNNWrapper -from .relu import Relu -from .clipped_linear import ClippedLinear, ReluK, HardSigmoid, HardTanh -from .hard_swish import HardSwish -from .hard_swish_mobilenet import HardSwishMobilenet -from .swish import Swish -from .gelu import Gelu -from .resizable import resizable -from .sigmoid_activation import sigmoid_activation -from .sigmoid import Sigmoid -from .softmax_activation import softmax_activation -from .softmax import Softmax, Softmax_v2 -from .sparselinear import SparseLinear, SparseLinear_v2 -from .tensorflowwrapper import TensorFlowWrapper, keras_subclass -from .torchscriptwrapper import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper from .mxnetwrapper import MXNetWrapper - -# Combinators -from .add import add -from .bidirectional import bidirectional -from .chain import chain -from .clone import clone -from .concatenate import concatenate -from .map_list import map_list from .noop import noop -from .residual import residual -from .uniqued import uniqued -from .siamese import siamese -from .tuplify import tuplify +from .padded2list import padded2list +from .parametricattention import ParametricAttention 
+from .premap_ids import premap_ids +from .pytorchwrapper import ( + PyTorchRNNWrapper, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, +) +from .ragged2list import ragged2list # Pooling from .reduce_first import reduce_first @@ -50,34 +50,36 @@ from .reduce_max import reduce_max from .reduce_mean import reduce_mean from .reduce_sum import reduce_sum - -# Array manipulation -from .array_getitem import array_getitem - -# Data-type transfers -from .list2array import list2array -from .list2ragged import list2ragged -from .list2padded import list2padded -from .ragged2list import ragged2list -from .padded2list import padded2list +from .relu import Relu from .remap_ids import remap_ids, remap_ids_v2 -from .premap_ids import premap_ids +from .residual import residual +from .resizable import resizable +from .siamese import siamese +from .sigmoid import Sigmoid +from .sigmoid_activation import sigmoid_activation +from .softmax import Softmax, Softmax_v2 +from .softmax_activation import softmax_activation +from .sparselinear import SparseLinear, SparseLinear_v2 from .strings2arrays import strings2arrays +from .swish import Swish +from .tensorflowwrapper import TensorFlowWrapper, keras_subclass +from .torchscriptwrapper import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper +from .tuplify import tuplify +from .uniqued import uniqued from .with_array import with_array from .with_array2d import with_array2d from .with_cpu import with_cpu +from .with_debug import with_debug from .with_flatten import with_flatten from .with_flatten_v2 import with_flatten_v2 -from .with_padded import with_padded +from .with_getitem import with_getitem from .with_list import with_list +from .with_nvtx_range import with_nvtx_range +from .with_padded import with_padded from .with_ragged import with_ragged from .with_reshape import with_reshape -from .with_getitem import with_getitem -from .with_debug import with_debug -from .with_nvtx_range import with_nvtx_range from .with_signpost_interval import with_signpost_interval - # fmt: off __all__ = [ "CauchySimilarity", diff --git a/thinc/layers/add.py b/thinc/layers/add.py index 60b1f46b9..a3aa1af17 100644 --- a/thinc/layers/add.py +++ b/thinc/layers/add.py @@ -1,11 +1,10 @@ -from typing import Any, Tuple, Callable, Optional, TypeVar, Dict +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry +from ..model import Model from ..types import ArrayXd, XY_XY_OutT from ..util import get_width - InT = TypeVar("InT", bound=Any) OutT = TypeVar("OutT", bound=ArrayXd) diff --git a/thinc/layers/array_getitem.py b/thinc/layers/array_getitem.py index 17ffcb7ee..219b4ea1c 100644 --- a/thinc/layers/array_getitem.py +++ b/thinc/layers/array_getitem.py @@ -1,7 +1,7 @@ -from typing import Union, Sequence, Tuple, TypeVar -from ..types import ArrayXd, FloatsXd, IntsXd -from ..model import Model +from typing import Sequence, Tuple, TypeVar, Union +from ..model import Model +from ..types import ArrayXd, FloatsXd, IntsXd AxisIndex = Union[int, slice, Sequence[int]] Index = Union[AxisIndex, Tuple[AxisIndex, ...]] diff --git a/thinc/layers/bidirectional.py b/thinc/layers/bidirectional.py index 1ff73f013..8cea04e30 100644 --- a/thinc/layers/bidirectional.py +++ b/thinc/layers/bidirectional.py @@ -1,11 +1,10 @@ -from typing import Optional, Tuple, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..backends import Ops -from ..model import Model from ..config import registry +from ..model import 
Model from ..types import Padded - InT = Padded OutT = Padded diff --git a/thinc/layers/cauchysimilarity.py b/thinc/layers/cauchysimilarity.py index 25af8d9df..57e5932ec 100644 --- a/thinc/layers/cauchysimilarity.py +++ b/thinc/layers/cauchysimilarity.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats1d, Floats2d from ..util import get_width - InT = Tuple[Floats2d, Floats2d] OutT = Floats1d diff --git a/thinc/layers/chain.py b/thinc/layers/chain.py index 258ee0902..a7e3ee7da 100644 --- a/thinc/layers/chain.py +++ b/thinc/layers/chain.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, Any, Dict, List, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..util import get_width +from ..model import Model from ..types import XY_YZ_OutT - +from ..util import get_width InT = TypeVar("InT") MidT = TypeVar("MidT") diff --git a/thinc/layers/clipped_linear.py b/thinc/layers/clipped_linear.py index 34bb8ade8..efe295fa6 100644 --- a/thinc/layers/clipped_linear.py +++ b/thinc/layers/clipped_linear.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import glorot_uniform_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import glorot_uniform_init, zero_init +from .layernorm import LayerNorm @registry.layers("ClippedLinear.v1") diff --git a/thinc/layers/clone.py b/thinc/layers/clone.py index 8b433407d..1758f5fe7 100644 --- a/thinc/layers/clone.py +++ b/thinc/layers/clone.py @@ -1,10 +1,9 @@ -from typing import TypeVar, cast, List +from typing import List, TypeVar, cast -from .noop import noop -from .chain import chain -from ..model import Model from ..config import registry - +from ..model import Model +from .chain import chain +from .noop import noop InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/concatenate.py b/thinc/layers/concatenate.py index 4cce96954..e810cefc3 100644 --- a/thinc/layers/concatenate.py +++ b/thinc/layers/concatenate.py @@ -1,14 +1,22 @@ -from typing import Any, List, Tuple, Callable, Optional -from typing import TypeVar, cast, Dict, Union, Sequence +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) from ..backends import NumpyOps -from ..model import Model from ..config import registry -from ..types import Array2d, Ragged +from ..model import Model +from ..types import Array2d, Ragged, XY_XY_OutT from ..util import get_width from .noop import noop -from ..types import XY_XY_OutT - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/dish.py b/thinc/layers/dish.py index 1092638e7..dc871ad24 100644 --- a/thinc/layers/dish.py +++ b/thinc/layers/dish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d 
+from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Dish.v1") diff --git a/thinc/layers/dropout.py b/thinc/layers/dropout.py index f4fa29445..7db35261a 100644 --- a/thinc/layers/dropout.py +++ b/thinc/layers/dropout.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, List, TypeVar, cast, Union, Sequence +from typing import Callable, List, Sequence, Tuple, TypeVar, Union, cast -from ..model import Model from ..config import registry -from ..types import ArrayXd, Ragged, Padded - +from ..model import Model +from ..types import ArrayXd, Padded, Ragged InT = TypeVar("InT", bound=Union[ArrayXd, Sequence[ArrayXd], Ragged, Padded]) diff --git a/thinc/layers/embed.py b/thinc/layers/embed.py index 703baf475..9d8d34e4a 100644 --- a/thinc/layers/embed.py +++ b/thinc/layers/embed.py @@ -1,13 +1,12 @@ -from typing import Dict, Callable, Tuple, Optional, Union, cast, TypeVar +from typing import Callable, Dict, Optional, Tuple, TypeVar, Union, cast -from .chain import chain -from .array_getitem import ints_getitem -from ..model import Model from ..config import registry -from ..types import Ints1d, Ints2d, Floats1d, Floats2d from ..initializers import uniform_init +from ..model import Model +from ..types import Floats1d, Floats2d, Ints1d, Ints2d from ..util import get_width, partial - +from .array_getitem import ints_getitem +from .chain import chain InT = TypeVar("InT", bound=Union[Ints1d, Ints2d]) OutT = Floats2d diff --git a/thinc/layers/expand_window.py b/thinc/layers/expand_window.py index 1075a49a2..193b82d39 100644 --- a/thinc/layers/expand_window.py +++ b/thinc/layers/expand_window.py @@ -1,10 +1,9 @@ -from typing import Tuple, TypeVar, Callable, Union, cast +from typing import Callable, Tuple, TypeVar, Union, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d, Ragged - InT = TypeVar("InT", Floats2d, Ragged) diff --git a/thinc/layers/gelu.py b/thinc/layers/gelu.py index 686b1f0d8..f51ee4545 100644 --- a/thinc/layers/gelu.py +++ b/thinc/layers/gelu.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Gelu.v1") diff --git a/thinc/layers/hard_swish.py b/thinc/layers/hard_swish.py index 773314a38..2fc135e41 100644 --- a/thinc/layers/hard_swish.py +++ b/thinc/layers/hard_swish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width 
-from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("HardSwish.v1") diff --git a/thinc/layers/hard_swish_mobilenet.py b/thinc/layers/hard_swish_mobilenet.py index 9f5f3fb9f..400622497 100644 --- a/thinc/layers/hard_swish_mobilenet.py +++ b/thinc/layers/hard_swish_mobilenet.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("HardSwishMobilenet.v1") diff --git a/thinc/layers/hashembed.py b/thinc/layers/hashembed.py index 8c85fdb02..7ecd9b26a 100644 --- a/thinc/layers/hashembed.py +++ b/thinc/layers/hashembed.py @@ -1,13 +1,12 @@ -from typing import Callable, Dict, Tuple, Optional, Any, Union, cast, TypeVar +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, Union, cast -from .chain import chain -from .array_getitem import ints_getitem -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d, Ints2d, Ints1d from ..initializers import uniform_init +from ..model import Model +from ..types import Floats1d, Floats2d, Ints1d, Ints2d from ..util import partial - +from .array_getitem import ints_getitem +from .chain import chain InT = TypeVar("InT", bound=Union[Ints1d, Ints2d]) OutT = Floats2d diff --git a/thinc/layers/layernorm.py b/thinc/layers/layernorm.py index 684489c54..2090ed9a8 100644 --- a/thinc/layers/layernorm.py +++ b/thinc/layers/layernorm.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model +from ..backends import Ops from ..config import registry +from ..model import Model from ..types import Floats2d -from ..backends import Ops from ..util import get_width - InT = Floats2d diff --git a/thinc/layers/linear.py b/thinc/layers/linear.py index bbf7b7874..ef24ec044 100644 --- a/thinc/layers/linear.py +++ b/thinc/layers/linear.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d from ..initializers import glorot_uniform_init, zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/list2array.py b/thinc/layers/list2array.py index a52d6e6c6..a31d5d80d 100644 --- a/thinc/layers/list2array.py +++ b/thinc/layers/list2array.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, TypeVar, List +from typing import Callable, List, Tuple, TypeVar from ..backends import NumpyOps -from ..model import Model from ..config import registry +from ..model import Model from ..types import Array2d - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/list2padded.py b/thinc/layers/list2padded.py index 2a02f90e0..e98e88a5c 100644 --- a/thinc/layers/list2padded.py +++ b/thinc/layers/list2padded.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, 
TypeVar, cast -from ..types import Padded, List2d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import List2d, Padded InT = TypeVar("InT", bound=List2d) OutT = Padded diff --git a/thinc/layers/list2ragged.py b/thinc/layers/list2ragged.py index a63237dfe..25ad7bed3 100644 --- a/thinc/layers/list2ragged.py +++ b/thinc/layers/list2ragged.py @@ -1,9 +1,8 @@ -from typing import Tuple, List, Callable, cast, TypeVar +from typing import Callable, List, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import ListXd, ArrayXd, Ragged - +from ..model import Model +from ..types import ArrayXd, ListXd, Ragged InT = TypeVar("InT", bound=ListXd) OutT = Ragged diff --git a/thinc/layers/logistic.py b/thinc/layers/logistic.py index cda0c7dd5..43d45a330 100644 --- a/thinc/layers/logistic.py +++ b/thinc/layers/logistic.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable +from typing import Callable, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/lstm.py b/thinc/layers/lstm.py index 266fee6e3..c817cd4db 100644 --- a/thinc/layers/lstm.py +++ b/thinc/layers/lstm.py @@ -1,13 +1,13 @@ -from typing import Optional, Tuple, Callable, cast from functools import partial +from typing import Callable, Optional, Tuple, cast -from ..model import Model +from ..backends import Ops from ..config import registry -from ..util import get_width +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats1d, Floats2d, Floats4d, Padded, Ragged +from ..util import get_width from .noop import noop -from ..initializers import glorot_uniform_init, zero_init -from ..backends import Ops @registry.layers("LSTM.v1") @@ -45,8 +45,9 @@ def PyTorchLSTM( nO: int, nI: int, *, bi: bool = False, depth: int = 1, dropout: float = 0.0 ) -> Model[Padded, Padded]: import torch.nn - from .with_padded import with_padded + from .pytorchwrapper import PyTorchRNNWrapper + from .with_padded import with_padded if depth == 0: return noop() # type: ignore[misc] diff --git a/thinc/layers/map_list.py b/thinc/layers/map_list.py index b05a934b1..aaadf0b55 100644 --- a/thinc/layers/map_list.py +++ b/thinc/layers/map_list.py @@ -1,6 +1,6 @@ -from typing import Callable, TypeVar, List, Tuple, Optional -from ..model import Model +from typing import Callable, List, Optional, Tuple, TypeVar +from ..model import Model InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/maxout.py b/thinc/layers/maxout.py index 72788a5c7..ff0e52037 100644 --- a/thinc/layers/maxout.py +++ b/thinc/layers/maxout.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats2d from ..util import get_width, partial +from .chain import chain from .dropout import Dropout from .layernorm import LayerNorm -from .chain import chain - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/mish.py b/thinc/layers/mish.py index ab7a2a76c..32542b963 100644 --- a/thinc/layers/mish.py +++ b/thinc/layers/mish.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model -from ..initializers 
import glorot_uniform_init, zero_init from ..config import registry +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats1d, Floats2d from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout - +from .layernorm import LayerNorm InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/multisoftmax.py b/thinc/layers/multisoftmax.py index cf55ecc37..d07b684f4 100644 --- a/thinc/layers/multisoftmax.py +++ b/thinc/layers/multisoftmax.py @@ -1,11 +1,10 @@ -from typing import Optional, Tuple, Callable, cast +from typing import Callable, Optional, Tuple, cast -from ..types import Floats2d, Floats1d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/mxnetwrapper.py b/thinc/layers/mxnetwrapper.py index 642d01f38..2303871fb 100644 --- a/thinc/layers/mxnetwrapper.py +++ b/thinc/layers/mxnetwrapper.py @@ -1,11 +1,10 @@ -from typing import Callable, Tuple, Optional, Any, Type +from typing import Any, Callable, Optional, Tuple, Type +from ..config import registry from ..model import Model from ..shims import MXNetShim -from ..config import registry -from ..util import is_xp_array, is_mxnet_array -from ..util import mxnet2xp, xp2mxnet, convert_recursive from ..types import ArgsKwargs +from ..util import convert_recursive, is_mxnet_array, is_xp_array, mxnet2xp, xp2mxnet @registry.layers("MXNetWrapper.v1") diff --git a/thinc/layers/noop.py b/thinc/layers/noop.py index d1c83d1cd..2e855b875 100644 --- a/thinc/layers/noop.py +++ b/thinc/layers/noop.py @@ -1,8 +1,7 @@ -from typing import Tuple, Callable, TypeVar +from typing import Callable, Tuple, TypeVar -from ..model import Model from ..config import registry - +from ..model import Model InOutT = TypeVar("InOutT") diff --git a/thinc/layers/padded2list.py b/thinc/layers/padded2list.py index 8f1bee7e8..a4d374e6b 100644 --- a/thinc/layers/padded2list.py +++ b/thinc/layers/padded2list.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, TypeVar, cast -from ..types import Padded, List2d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import List2d, Padded InT = Padded OutT = TypeVar("OutT", bound=List2d) diff --git a/thinc/layers/parametricattention.py b/thinc/layers/parametricattention.py index d54a2f19e..a03906f51 100644 --- a/thinc/layers/parametricattention.py +++ b/thinc/layers/parametricattention.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Ragged from ..util import get_width - InT = Ragged OutT = Ragged diff --git a/thinc/layers/premap_ids.pyx b/thinc/layers/premap_ids.pyx index 74bc8dc6a..17acafa8e 100644 --- a/thinc/layers/premap_ids.pyx +++ b/thinc/layers/premap_ids.pyx @@ -1,13 +1,15 @@ # cython: binding=True, infer_types=True import numpy + from preshed.maps cimport PreshMap -from typing import Dict, Union, Optional, cast, Callable, Tuple, Mapping -from ..types import Ints1d, Ints2d + +from typing import Callable, Dict, Mapping, Optional, Tuple, Union, cast + from ..config import registry from ..model import Model +from ..types import Ints1d, Ints2d from ..util import to_numpy - InT = 
Union[Ints1d, Ints2d] OutT = Ints2d diff --git a/thinc/layers/pytorchwrapper.py b/thinc/layers/pytorchwrapper.py index a1b0c462a..39c8b95c1 100644 --- a/thinc/layers/pytorchwrapper.py +++ b/thinc/layers/pytorchwrapper.py @@ -1,12 +1,18 @@ -from typing import Callable, Dict, Tuple, Optional, Any, cast +from typing import Any, Callable, Dict, Optional, Tuple, cast from ..compat import torch +from ..config import registry from ..model import Model from ..shims import PyTorchGradScaler, PyTorchShim -from ..config import registry -from ..util import is_xp_array, is_torch_array, partial -from ..util import xp2torch, torch2xp, convert_recursive -from ..types import Floats3d, ArgsKwargs, Padded +from ..types import ArgsKwargs, Floats3d, Padded +from ..util import ( + convert_recursive, + is_torch_array, + is_xp_array, + partial, + torch2xp, + xp2torch, +) @registry.layers("PyTorchRNNWrapper.v1") diff --git a/thinc/layers/ragged2list.py b/thinc/layers/ragged2list.py index 35af28f2f..3d8463f11 100644 --- a/thinc/layers/ragged2list.py +++ b/thinc/layers/ragged2list.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import Ragged, ListXd - +from ..model import Model +from ..types import ListXd, Ragged InT = Ragged OutT = TypeVar("OutT", bound=ListXd) diff --git a/thinc/layers/reduce_first.py b/thinc/layers/reduce_first.py index ab72cb5e3..ede42c5d0 100644 --- a/thinc/layers/reduce_first.py +++ b/thinc/layers/reduce_first.py @@ -1,11 +1,10 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Ragged, Floats2d +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_last.py b/thinc/layers/reduce_last.py index b8194ec2b..d2de6a877 100644 --- a/thinc/layers/reduce_last.py +++ b/thinc/layers/reduce_last.py @@ -1,8 +1,8 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Ragged, Floats2d +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo InT = Ragged diff --git a/thinc/layers/reduce_max.py b/thinc/layers/reduce_max.py index ebafb5172..e6f033e48 100644 --- a/thinc/layers/reduce_max.py +++ b/thinc/layers/reduce_max.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, cast +from typing import Callable, Tuple, cast -from ..types import Floats2d, Ragged -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_mean.py b/thinc/layers/reduce_mean.py index f37ae8253..f1bd04898 100644 --- a/thinc/layers/reduce_mean.py +++ b/thinc/layers/reduce_mean.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, cast +from typing import Callable, Tuple, cast -from ..types import Floats2d, Ragged -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_sum.py b/thinc/layers/reduce_sum.py index e93a362d8..62ade00f6 100644 --- a/thinc/layers/reduce_sum.py +++ b/thinc/layers/reduce_sum.py @@ -1,11 +1,10 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import 
registry +from ..model import Model from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/relu.py b/thinc/layers/relu.py index d1d3ebf74..488a1eff7 100644 --- a/thinc/layers/relu.py +++ b/thinc/layers/relu.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model -from ..initializers import glorot_uniform_init, zero_init from ..config import registry -from ..types import Floats2d, Floats1d +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout - +from .layernorm import LayerNorm InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/remap_ids.py b/thinc/layers/remap_ids.py index 265b24a9d..3801b703f 100644 --- a/thinc/layers/remap_ids.py +++ b/thinc/layers/remap_ids.py @@ -1,12 +1,10 @@ -from typing import Tuple, Callable, Sequence, cast -from typing import Dict, Union, Optional, Hashable, Any +from typing import Any, Callable, Dict, Hashable, Optional, Sequence, Tuple, Union, cast -from ..model import Model from ..config import registry -from ..types import Ints1d, Ints2d, DTypes +from ..model import Model +from ..types import DTypes, Ints1d, Ints2d from ..util import is_xp_array, to_numpy - InT = Union[Sequence[Hashable], Ints1d, Ints2d] OutT = Ints2d diff --git a/thinc/layers/residual.py b/thinc/layers/residual.py index 3793ee1d5..f213e9bf5 100644 --- a/thinc/layers/residual.py +++ b/thinc/layers/residual.py @@ -1,8 +1,8 @@ -from typing import Tuple, Callable, Optional, List, TypeVar +from typing import Callable, List, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d, Floats3d, Floats4d, FloatsXd, Ragged, Padded +from ..model import Model +from ..types import Floats1d, Floats2d, Floats3d, Floats4d, FloatsXd, Padded, Ragged # fmt: off InT = TypeVar( diff --git a/thinc/layers/resizable.py b/thinc/layers/resizable.py index 2dd4dde1a..606d50dae 100644 --- a/thinc/layers/resizable.py +++ b/thinc/layers/resizable.py @@ -1,7 +1,7 @@ from typing import Callable, Optional, TypeVar -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d InT = TypeVar("InT") diff --git a/thinc/layers/siamese.py b/thinc/layers/siamese.py index 82bafacbb..33579a4de 100644 --- a/thinc/layers/siamese.py +++ b/thinc/layers/siamese.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar +from typing import Callable, Optional, Tuple, TypeVar +from ..config import registry from ..model import Model from ..types import ArrayXd -from ..config import registry from ..util import get_width - LayerT = TypeVar("LayerT") SimT = TypeVar("SimT") InT = Tuple[LayerT, LayerT] diff --git a/thinc/layers/sigmoid.py b/thinc/layers/sigmoid.py index d8933b66e..157047e37 100644 --- a/thinc/layers/sigmoid.py +++ b/thinc/layers/sigmoid.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats2d, Floats1d from ..initializers import zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial - InT = Floats2d OutT = Floats2d diff --git 
a/thinc/layers/sigmoid_activation.py b/thinc/layers/sigmoid_activation.py index b87261075..37e188ab8 100644 --- a/thinc/layers/sigmoid_activation.py +++ b/thinc/layers/sigmoid_activation.py @@ -1,7 +1,7 @@ -from typing import TypeVar, Tuple, Callable, cast +from typing import Callable, Tuple, TypeVar, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import FloatsXdT diff --git a/thinc/layers/softmax.py b/thinc/layers/softmax.py index 9d766f1db..8b7301af0 100644 --- a/thinc/layers/softmax.py +++ b/thinc/layers/softmax.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats2d, Floats1d from ..initializers import zero_init -from ..util import get_width, partial, ArrayInfo - +from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import ArrayInfo, get_width, partial InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/softmax_activation.py b/thinc/layers/softmax_activation.py index 858320143..974ed2c8c 100644 --- a/thinc/layers/softmax_activation.py +++ b/thinc/layers/softmax_activation.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable +from typing import Callable, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/sparselinear.pyx b/thinc/layers/sparselinear.pyx index b9a982f4b..a1be75ccc 100644 --- a/thinc/layers/sparselinear.pyx +++ b/thinc/layers/sparselinear.pyx @@ -1,16 +1,15 @@ # cython: infer_types=True, cdivision=True, bounds_check=False, wraparound=False -cimport numpy as np -from libc.stdint cimport uint64_t, int32_t, uint32_t cimport cython +cimport numpy as np +from libc.stdint cimport int32_t, uint32_t, uint64_t -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple -from ..types import ArrayXd -from ..model import Model +from ..backends import CupyOps, NumpyOps from ..config import registry -from ..util import get_width, is_cupy_array, is_numpy_array, get_array_module -from ..backends import NumpyOps, CupyOps - +from ..model import Model +from ..types import ArrayXd +from ..util import get_array_module, get_width, is_cupy_array, is_numpy_array InT = Tuple[ArrayXd, ArrayXd, ArrayXd] OutT = ArrayXd diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py index 469b1636d..91a6b1a31 100644 --- a/thinc/layers/strings2arrays.py +++ b/thinc/layers/strings2arrays.py @@ -1,11 +1,11 @@ -from typing import Tuple, List, Callable, Sequence +from typing import Callable, List, Sequence, Tuple + from murmurhash import hash_unicode -from ..model import Model from ..config import registry +from ..model import Model from ..types import Ints2d - InT = Sequence[Sequence[str]] OutT = List[Ints2d] diff --git a/thinc/layers/swish.py b/thinc/layers/swish.py index 4f3fe49d5..5cf8be50f 100644 --- a/thinc/layers/swish.py +++ b/thinc/layers/swish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from 
..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Swish.v1") diff --git a/thinc/layers/tensorflowwrapper.py b/thinc/layers/tensorflowwrapper.py index 7e166ea50..a77e0b3af 100644 --- a/thinc/layers/tensorflowwrapper.py +++ b/thinc/layers/tensorflowwrapper.py @@ -2,12 +2,18 @@ import srsly +from ..compat import tensorflow as tf from ..model import Model from ..shims import TensorFlowShim, keras_model_fns, maybe_handshake_model -from ..util import xp2tensorflow, tensorflow2xp, assert_tensorflow_installed -from ..util import is_tensorflow_array, convert_recursive, is_xp_array -from ..types import ArrayXd, ArgsKwargs -from ..compat import tensorflow as tf +from ..types import ArgsKwargs, ArrayXd +from ..util import ( + assert_tensorflow_installed, + convert_recursive, + is_tensorflow_array, + is_xp_array, + tensorflow2xp, + xp2tensorflow, +) InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/torchscriptwrapper.py b/thinc/layers/torchscriptwrapper.py index a74db9225..a3a8e1ac0 100644 --- a/thinc/layers/torchscriptwrapper.py +++ b/thinc/layers/torchscriptwrapper.py @@ -3,8 +3,11 @@ from ..compat import torch from ..model import Model from ..shims import PyTorchGradScaler, PyTorchShim, TorchScriptShim -from .pytorchwrapper import forward, convert_pytorch_default_inputs -from .pytorchwrapper import convert_pytorch_default_outputs +from .pytorchwrapper import ( + convert_pytorch_default_inputs, + convert_pytorch_default_outputs, + forward, +) def TorchScriptWrapper_v1( diff --git a/thinc/layers/tuplify.py b/thinc/layers/tuplify.py index 99b4d7589..35dfdc66f 100644 --- a/thinc/layers/tuplify.py +++ b/thinc/layers/tuplify.py @@ -1,7 +1,7 @@ -from typing import Optional, Tuple, Any, TypeVar +from typing import Any, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry +from ..model import Model InT = TypeVar("InT") OutT = Tuple diff --git a/thinc/layers/uniqued.py b/thinc/layers/uniqued.py index 582b31093..26f2cdf16 100644 --- a/thinc/layers/uniqued.py +++ b/thinc/layers/uniqued.py @@ -1,10 +1,10 @@ -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple + import numpy -from ..model import Model from ..config import registry -from ..types import Ints2d, Floats2d - +from ..model import Model +from ..types import Floats2d, Ints2d InT = Ints2d OutT = Floats2d diff --git a/thinc/layers/with_array.py b/thinc/layers/with_array.py index 2511b3c17..31b9fa494 100644 --- a/thinc/layers/with_array.py +++ b/thinc/layers/with_array.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, Union, cast +from typing import Callable, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..model import Model from ..config import registry -from ..types import Padded, Ragged, ArrayXd, Array3d, ListXd - +from ..model import Model +from ..types import Array3d, ArrayXd, ListXd, Padded, Ragged NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_array2d.py b/thinc/layers/with_array2d.py index 740593a26..98eba8b96 100644 --- a/thinc/layers/with_array2d.py +++ b/thinc/layers/with_array2d.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..model import Model from ..config import registry +from ..model import Model from ..types import Array2d, Floats2d, List2d, 
Padded, Ragged - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_cpu.py b/thinc/layers/with_cpu.py index 3fc7645a8..39e5965f2 100644 --- a/thinc/layers/with_cpu.py +++ b/thinc/layers/with_cpu.py @@ -1,10 +1,11 @@ -from typing import Tuple, Callable, Any +from typing import Any, Callable, Tuple import numpy + from thinc.backends import Ops -from ..model import Model from ..config import registry +from ..model import Model @registry.layers("with_cpu.v1") diff --git a/thinc/layers/with_debug.py b/thinc/layers/with_debug.py index 91505c9f6..21790e468 100644 --- a/thinc/layers/with_debug.py +++ b/thinc/layers/with_debug.py @@ -1,4 +1,4 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..model import Model diff --git a/thinc/layers/with_flatten.py b/thinc/layers/with_flatten.py index 5cf8a85cf..9658a788f 100644 --- a/thinc/layers/with_flatten.py +++ b/thinc/layers/with_flatten.py @@ -1,7 +1,7 @@ -from typing import Tuple, Callable, Sequence, Any, cast, TypeVar, Optional, List +from typing import Any, Callable, List, Optional, Sequence, Tuple, TypeVar, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import ArrayXd, ListXd ItemT = TypeVar("ItemT") diff --git a/thinc/layers/with_flatten_v2.py b/thinc/layers/with_flatten_v2.py index 4dd75e0d1..95549994f 100644 --- a/thinc/layers/with_flatten_v2.py +++ b/thinc/layers/with_flatten_v2.py @@ -1,8 +1,7 @@ -from typing import Tuple, Callable, Sequence, Any, cast, TypeVar, Optional, List +from typing import Any, Callable, List, Optional, Sequence, Tuple, TypeVar, cast -from ..model import Model from ..config import registry - +from ..model import Model InItemT = TypeVar("InItemT") OutItemT = TypeVar("OutItemT") diff --git a/thinc/layers/with_getitem.py b/thinc/layers/with_getitem.py index 9f6b93459..fb6a3cccf 100644 --- a/thinc/layers/with_getitem.py +++ b/thinc/layers/with_getitem.py @@ -1,8 +1,7 @@ -from typing import Callable, Optional, Tuple, Any +from typing import Any, Callable, Optional, Tuple -from ..model import Model from ..config import registry - +from ..model import Model InT = Tuple[Any, ...] OutT = Tuple[Any, ...] 
diff --git a/thinc/layers/with_list.py b/thinc/layers/with_list.py index 9f86c24dc..5331758a5 100644 --- a/thinc/layers/with_list.py +++ b/thinc/layers/with_list.py @@ -1,8 +1,8 @@ -from typing import Tuple, Callable, List, Optional, TypeVar, Union, cast +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast -from ..types import Padded, Ragged, Array2d, List2d, Floats2d, Ints2d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Array2d, Floats2d, Ints2d, List2d, Padded, Ragged SeqT = TypeVar("SeqT", Padded, Ragged, List2d, List[Floats2d], List[Ints2d]) diff --git a/thinc/layers/with_nvtx_range.py b/thinc/layers/with_nvtx_range.py index bf270abce..480f82a7c 100644 --- a/thinc/layers/with_nvtx_range.py +++ b/thinc/layers/with_nvtx_range.py @@ -1,9 +1,8 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..model import Model from ..util import use_nvtx_range - _ModelT = TypeVar("_ModelT", bound=Model) diff --git a/thinc/layers/with_padded.py b/thinc/layers/with_padded.py index 379df1bef..b92c6308a 100644 --- a/thinc/layers/with_padded.py +++ b/thinc/layers/with_padded.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar, Union, cast, List +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast -from ..types import Padded, Ragged, Floats3d, Ints1d, List2d, Array2d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Array2d, Floats3d, Ints1d, List2d, Padded, Ragged from ..util import is_xp_array - PaddedData = Tuple[Floats3d, Ints1d, Ints1d, Ints1d] SeqT = TypeVar("SeqT", bound=Union[Padded, Ragged, List2d, Floats3d, PaddedData]) diff --git a/thinc/layers/with_ragged.py b/thinc/layers/with_ragged.py index cbff6f59d..6cf45d9e8 100644 --- a/thinc/layers/with_ragged.py +++ b/thinc/layers/with_ragged.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..types import Padded, Ragged, Array2d, ListXd, List2d, Ints1d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import Array2d, Ints1d, List2d, ListXd, Padded, Ragged NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_reshape.py b/thinc/layers/with_reshape.py index 5bd3e9025..b40ada757 100644 --- a/thinc/layers/with_reshape.py +++ b/thinc/layers/with_reshape.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, Optional, cast, TypeVar, List +from typing import Callable, List, Optional, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import Array3d, Array2d - +from ..model import Model +from ..types import Array2d, Array3d InT = TypeVar("InT", bound=Array3d) OutT = TypeVar("OutT", bound=Array2d) diff --git a/thinc/layers/with_signpost_interval.py b/thinc/layers/with_signpost_interval.py index 9a468d896..58f5d4165 100644 --- a/thinc/layers/with_signpost_interval.py +++ b/thinc/layers/with_signpost_interval.py @@ -1,9 +1,8 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..compat import has_os_signpost, os_signpost from ..model import Model - _ModelT = TypeVar("_ModelT", bound=Model) diff --git a/thinc/loss.py b/thinc/loss.py index e8edb194d..756dac4c3 100644 --- a/thinc/loss.py +++ 
b/thinc/loss.py @@ -1,11 +1,20 @@ -from typing import Tuple, Sequence, cast, TypeVar, Generic, Any, Union, Optional, List -from typing import Dict from abc import abstractmethod +from typing import ( + Any, + Dict, + Generic, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) +from .config import registry from .types import Floats2d, Ints1d from .util import get_array_module, to_categorical -from .config import registry - LossT = TypeVar("LossT") GradT = TypeVar("GradT") diff --git a/thinc/model.py b/thinc/model.py index e094d5294..ba49215c1 100644 --- a/thinc/model.py +++ b/thinc/model.py @@ -1,20 +1,39 @@ -from typing import Dict, List, Callable, Optional, Any, Union, Iterable, Set, cast -from typing import Generic, Sequence, Tuple, TypeVar, Iterator import contextlib -from contextvars import ContextVar -import srsly -from pathlib import Path import copy import functools import threading +from contextvars import ContextVar +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + Union, + cast, +) + +import srsly -from .backends import ParamServer, Ops, NumpyOps, CupyOps, get_current_ops +from .backends import CupyOps, NumpyOps, Ops, ParamServer, get_current_ops from .optimizers import Optimizer # noqa: F401 from .shims import Shim -from .util import convert_recursive, is_xp_array, DATA_VALIDATION -from .util import partial, validate_fwd_input_output from .types import FloatsXd - +from .util import ( + DATA_VALIDATION, + convert_recursive, + is_xp_array, + partial, + validate_fwd_input_output, +) InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/mypy.py b/thinc/mypy.py index e02f6d5be..73c6e72f6 100644 --- a/thinc/mypy.py +++ b/thinc/mypy.py @@ -1,13 +1,14 @@ -from typing import Dict, List import itertools -from mypy.errors import Errors +from typing import Dict, List + +from mypy.checker import TypeChecker from mypy.errorcodes import ErrorCode +from mypy.errors import Errors +from mypy.nodes import CallExpr, Decorator, Expression, FuncDef, MypyFile, NameExpr from mypy.options import Options -from mypy.plugin import FunctionContext, Plugin, CheckerPluginInterface -from mypy.types import Instance, Type, CallableType, TypeVarType -from mypy.nodes import Expression, CallExpr, NameExpr, FuncDef, Decorator, MypyFile -from mypy.checker import TypeChecker +from mypy.plugin import CheckerPluginInterface, FunctionContext, Plugin from mypy.subtypes import is_subtype +from mypy.types import CallableType, Instance, Type, TypeVarType thinc_model_fullname = "thinc.model.Model" chained_out_fullname = "thinc.types.XY_YZ_OutT" diff --git a/thinc/optimizers.py b/thinc/optimizers.py index b0636fd87..071ad4e85 100644 --- a/thinc/optimizers.py +++ b/thinc/optimizers.py @@ -1,14 +1,13 @@ -from typing import Any, Dict, Optional, Union, Tuple, List, cast -from collections import defaultdict import itertools import math +from collections import defaultdict from types import GeneratorType +from typing import Any, Dict, List, Optional, Tuple, Union, cast from .backends import get_array_ops -from .types import Generator, FloatsXd from .config import registry -from .schedules import constant, Schedule - +from .schedules import Schedule, constant +from .types import FloatsXd, Generator KeyT = Tuple[int, str] ScheduleT = Union[float, List[float], Generator, Schedule] diff --git a/thinc/schedules.py b/thinc/schedules.py index 49e43a0c8..2f99a536a 100644 --- 
a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,8 +1,8 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Any, Callable, Dict, Generator, Generic, Tuple, TypeVar -from typing import Optional -from dataclasses import dataclass import itertools +from dataclasses import dataclass +from typing import Any, Callable, Dict, Generator, Generic, Optional, Tuple, TypeVar + import numpy from .config import registry diff --git a/thinc/shims/__init__.py b/thinc/shims/__init__.py index 9cd8bd030..fb246c9f2 100644 --- a/thinc/shims/__init__.py +++ b/thinc/shims/__init__.py @@ -1,10 +1,9 @@ -from .shim import Shim +from .mxnet import MXNetShim from .pytorch import PyTorchShim from .pytorch_grad_scaler import PyTorchGradScaler -from .tensorflow import keras_model_fns, TensorFlowShim, maybe_handshake_model +from .shim import Shim +from .tensorflow import TensorFlowShim, keras_model_fns, maybe_handshake_model from .torchscript import TorchScriptShim -from .mxnet import MXNetShim - # fmt: off __all__ = [ diff --git a/thinc/shims/mxnet.py b/thinc/shims/mxnet.py index 3962a2ef5..2dd36a62f 100644 --- a/thinc/shims/mxnet.py +++ b/thinc/shims/mxnet.py @@ -1,13 +1,19 @@ +import copy from typing import Any, cast + import srsly -import copy -from ..util import mxnet2xp, convert_recursive, make_tempfile, xp2mxnet -from ..util import get_array_module +from ..compat import mxnet as mx from ..optimizers import Optimizer from ..types import ArgsKwargs, FloatsXd +from ..util import ( + convert_recursive, + get_array_module, + make_tempfile, + mxnet2xp, + xp2mxnet, +) from .shim import Shim -from ..compat import mxnet as mx class MXNetShim(Shim): diff --git a/thinc/shims/pytorch.py b/thinc/shims/pytorch.py index 9582c8616..505669867 100644 --- a/thinc/shims/pytorch.py +++ b/thinc/shims/pytorch.py @@ -1,16 +1,21 @@ -from typing import Any, Dict, Optional, cast, Callable import contextlib -from io import BytesIO import itertools +from io import BytesIO +from typing import Any, Callable, Dict, Optional, cast + import srsly -from ..util import torch2xp, xp2torch, convert_recursive, iterate_recursive -from ..util import get_torch_default_device +from ..backends import CupyOps, context_pools, get_current_ops, set_gpu_allocator from ..compat import torch -from ..backends import get_current_ops, context_pools, CupyOps -from ..backends import set_gpu_allocator from ..optimizers import Optimizer from ..types import ArgsKwargs, FloatsXd +from ..util import ( + convert_recursive, + get_torch_default_device, + iterate_recursive, + torch2xp, + xp2torch, +) from .pytorch_grad_scaler import PyTorchGradScaler from .shim import Shim diff --git a/thinc/shims/shim.py b/thinc/shims/shim.py index 0c246e8d4..ef88408a3 100644 --- a/thinc/shims/shim.py +++ b/thinc/shims/shim.py @@ -1,8 +1,8 @@ -from typing import Any, Optional, Tuple, Callable, Dict, Union -import copy import contextlib -from pathlib import Path +import copy import threading +from pathlib import Path +from typing import Any, Callable, Dict, Optional, Tuple, Union class Shim: # pragma: no cover diff --git a/thinc/shims/tensorflow.py b/thinc/shims/tensorflow.py index d630d86f9..bcaae3aac 100644 --- a/thinc/shims/tensorflow.py +++ b/thinc/shims/tensorflow.py @@ -1,17 +1,18 @@ -from typing import Any, Dict, List, Optional -import catalogue import contextlib import copy from io import BytesIO +from typing import Any, Dict, List, Optional + +import catalogue import numpy from ..backends import Ops, get_current_ops +from ..compat import 
cupy, h5py +from ..compat import tensorflow as tf from ..optimizers import Optimizer from ..types import ArgsKwargs, ArrayXd from ..util import get_array_module from .shim import Shim -from ..compat import tensorflow as tf -from ..compat import cupy, h5py keras_model_fns = catalogue.create("thinc", "keras", entry_points=True) diff --git a/thinc/shims/torchscript.py b/thinc/shims/torchscript.py index 675718cd1..6c05c8a9b 100644 --- a/thinc/shims/torchscript.py +++ b/thinc/shims/torchscript.py @@ -1,5 +1,6 @@ -from typing import Any, Optional from io import BytesIO +from typing import Any, Optional + import srsly from ..compat import torch diff --git a/thinc/tests/backends/test_mem.py b/thinc/tests/backends/test_mem.py index cb26e24e0..bf867726d 100644 --- a/thinc/tests/backends/test_mem.py +++ b/thinc/tests/backends/test_mem.py @@ -1,6 +1,7 @@ -from thinc.backends._param_server import ParamServer import numpy +from thinc.backends._param_server import ParamServer + def test_param_server_init(): array = numpy.zeros((5,), dtype="f") diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 83dd582ea..3cec4b6fa 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -1,26 +1,32 @@ +import inspect +import platform from typing import Tuple, cast -import pytest import numpy -import platform +import pytest from hypothesis import given, settings from hypothesis.strategies import composite, integers from numpy.testing import assert_allclose from packaging.version import Version -from thinc.api import NumpyOps, CupyOps, Ops, get_ops -from thinc.api import get_current_ops, use_ops -from thinc.util import torch2xp, xp2torch + +from thinc.api import ( + LSTM, + CupyOps, + NumpyOps, + Ops, + fix_random_seed, + get_current_ops, + get_ops, + use_ops, +) +from thinc.backends._custom_kernels import KERNELS, KERNELS_LIST, compile_mmh from thinc.compat import has_cupy_gpu, has_torch, torch_version -from thinc.api import fix_random_seed -from thinc.api import LSTM from thinc.types import Floats2d -from thinc.backends._custom_kernels import KERNELS_LIST, KERNELS, compile_mmh -import inspect +from thinc.util import torch2xp, xp2torch from .. 
import strategies from ..strategies import arrays_BI, ndarrays_of_shape - MAX_EXAMPLES = 10 VANILLA_OPS = Ops(numpy) # type:ignore @@ -37,9 +43,10 @@ def create_pytorch_funcs(): - import torch import math + import torch + def torch_relu(x): return torch.nn.functional.relu(x) diff --git a/thinc/tests/conftest.py b/thinc/tests/conftest.py index 19b5137d3..026f3eb06 100644 --- a/thinc/tests/conftest.py +++ b/thinc/tests/conftest.py @@ -52,9 +52,10 @@ def getopt(opt): @pytest.fixture() def pathy_fixture(): pytest.importorskip("pathy") - import tempfile import shutil - from pathy import use_fs, Pathy + import tempfile + + from pathy import Pathy, use_fs temp_folder = tempfile.mkdtemp(prefix="thinc-pathy") use_fs(temp_folder) diff --git a/thinc/tests/layers/test_basic_tagger.py b/thinc/tests/layers/test_basic_tagger.py index 3046c1b04..855a6d6ad 100644 --- a/thinc/tests/layers/test_basic_tagger.py +++ b/thinc/tests/layers/test_basic_tagger.py @@ -1,7 +1,18 @@ -import pytest import random -from thinc.api import Model, Relu, Softmax, HashEmbed, expand_window -from thinc.api import chain, with_array, Adam, strings2arrays + +import pytest + +from thinc.api import ( + Adam, + HashEmbed, + Model, + Relu, + Softmax, + chain, + expand_window, + strings2arrays, + with_array, +) @pytest.fixture(scope="module") diff --git a/thinc/tests/layers/test_combinators.py b/thinc/tests/layers/test_combinators.py index ea5583108..c7b4fbe9f 100644 --- a/thinc/tests/layers/test_combinators.py +++ b/thinc/tests/layers/test_combinators.py @@ -1,8 +1,18 @@ -import pytest import numpy +import pytest from numpy.testing import assert_allclose -from thinc.api import clone, concatenate, noop, add, map_list -from thinc.api import Linear, Dropout, Model, NumpyOps + +from thinc.api import ( + Dropout, + Linear, + Model, + NumpyOps, + add, + clone, + concatenate, + map_list, + noop, +) from thinc.layers import chain, tuplify diff --git a/thinc/tests/layers/test_feed_forward.py b/thinc/tests/layers/test_feed_forward.py index b18a0fc0b..a808bb445 100644 --- a/thinc/tests/layers/test_feed_forward.py +++ b/thinc/tests/layers/test_feed_forward.py @@ -1,8 +1,10 @@ -import pytest -import numpy from functools import partial + +import numpy +import pytest from numpy.testing import assert_allclose -from thinc.api import chain, Linear, Relu, NumpyOps + +from thinc.api import Linear, NumpyOps, Relu, chain @pytest.fixture(params=[1, 2, 9]) diff --git a/thinc/tests/layers/test_hash_embed.py b/thinc/tests/layers/test_hash_embed.py index 8df50a03f..5b79539fa 100644 --- a/thinc/tests/layers/test_hash_embed.py +++ b/thinc/tests/layers/test_hash_embed.py @@ -1,4 +1,5 @@ import numpy + from thinc.api import HashEmbed diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 761cad880..0ef559d96 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -1,14 +1,15 @@ from typing import List, Optional -from numpy.testing import assert_almost_equal -from thinc.api import registry, with_padded, Dropout, NumpyOps, Model -from thinc.backends import NumpyOps -from thinc.util import data_validation, get_width -from thinc.types import Ragged, Padded, Array2d, Floats2d, FloatsXd, Shape -from thinc.compat import has_torch import numpy import pytest import srsly +from numpy.testing import assert_almost_equal + +from thinc.api import Dropout, Model, NumpyOps, registry, with_padded +from thinc.backends import NumpyOps +from thinc.compat import has_torch +from thinc.types import Array2d, 
Floats2d, FloatsXd, Padded, Ragged, Shape +from thinc.util import data_validation, get_width OPS = NumpyOps() diff --git a/thinc/tests/layers/test_linear.py b/thinc/tests/layers/test_linear.py index 2362b556b..345669d87 100644 --- a/thinc/tests/layers/test_linear.py +++ b/thinc/tests/layers/test_linear.py @@ -1,9 +1,10 @@ +import numpy import pytest -from mock import MagicMock from hypothesis import given, settings -import numpy +from mock import MagicMock from numpy.testing import assert_allclose -from thinc.api import Linear, chain, Dropout, SGD + +from thinc.api import SGD, Dropout, Linear, chain from ..strategies import arrays_OI_O_BI from ..util import get_model, get_shape diff --git a/thinc/tests/layers/test_lstm.py b/thinc/tests/layers/test_lstm.py index 208ffb58b..44c90ed4c 100644 --- a/thinc/tests/layers/test_lstm.py +++ b/thinc/tests/layers/test_lstm.py @@ -1,10 +1,11 @@ -import numpy import timeit -from thinc.api import NumpyOps, LSTM, PyTorchLSTM, with_padded, fix_random_seed -from thinc.api import Ops -from thinc.compat import has_torch + +import numpy import pytest +from thinc.api import LSTM, NumpyOps, Ops, PyTorchLSTM, fix_random_seed, with_padded +from thinc.compat import has_torch + @pytest.fixture(params=[1, 6]) def nI(request): diff --git a/thinc/tests/layers/test_mappers.py b/thinc/tests/layers/test_mappers.py index e890dd086..85e984bc4 100644 --- a/thinc/tests/layers/test_mappers.py +++ b/thinc/tests/layers/test_mappers.py @@ -1,5 +1,6 @@ -import pytest import numpy +import pytest + from thinc.layers import premap_ids, remap_ids, remap_ids_v2 diff --git a/thinc/tests/layers/test_mnist.py b/thinc/tests/layers/test_mnist.py index 321de3a0f..060007cfd 100644 --- a/thinc/tests/layers/test_mnist.py +++ b/thinc/tests/layers/test_mnist.py @@ -1,8 +1,16 @@ import pytest -from thinc.api import Relu, Softmax, chain, clone, Adam -from thinc.api import PyTorchWrapper, TensorFlowWrapper -from thinc.api import get_current_ops -from thinc.compat import has_torch, has_tensorflow + +from thinc.api import ( + Adam, + PyTorchWrapper, + Relu, + Softmax, + TensorFlowWrapper, + chain, + clone, + get_current_ops, +) +from thinc.compat import has_tensorflow, has_torch @pytest.fixture(scope="module") diff --git a/thinc/tests/layers/test_mxnet_wrapper.py b/thinc/tests/layers/test_mxnet_wrapper.py index b954a8ec5..8ddf5dfce 100644 --- a/thinc/tests/layers/test_mxnet_wrapper.py +++ b/thinc/tests/layers/test_mxnet_wrapper.py @@ -2,10 +2,19 @@ import numpy import pytest -from thinc.api import Adam, ArgsKwargs, Model, Ops, MXNetWrapper -from thinc.api import get_current_ops, mxnet2xp, xp2mxnet -from thinc.types import Array2d, Array1d, IntsXd + +from thinc.api import ( + Adam, + ArgsKwargs, + Model, + MXNetWrapper, + Ops, + get_current_ops, + mxnet2xp, + xp2mxnet, +) from thinc.compat import has_cupy_gpu, has_mxnet +from thinc.types import Array1d, Array2d, IntsXd from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py index f4f83cb60..aa40d9044 100644 --- a/thinc/tests/layers/test_pytorch_wrapper.py +++ b/thinc/tests/layers/test_pytorch_wrapper.py @@ -1,20 +1,34 @@ -from thinc.api import Linear, SGD, PyTorchWrapper, PyTorchWrapper_v2, PyTorchWrapper_v3 -from thinc.api import xp2torch, torch2xp, ArgsKwargs, use_ops -from thinc.api import chain, get_current_ops, Relu -from thinc.api import CupyOps, MPSOps, NumpyOps +import numpy +import pytest + +from thinc.api 
import ( + SGD, + ArgsKwargs, + CupyOps, + Linear, + MPSOps, + NumpyOps, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, + Relu, + chain, + get_current_ops, + torch2xp, + use_ops, + xp2torch, +) from thinc.backends import context_pools +from thinc.compat import has_cupy_gpu, has_torch, has_torch_amp, has_torch_mps_gpu from thinc.layers.pytorchwrapper import PyTorchWrapper_v3 +from thinc.shims.pytorch import ( + default_deserialize_torch_model, + default_serialize_torch_model, +) from thinc.shims.pytorch_grad_scaler import PyTorchGradScaler -from thinc.shims.pytorch import default_deserialize_torch_model -from thinc.shims.pytorch import default_serialize_torch_model -from thinc.compat import has_torch, has_torch_amp -from thinc.compat import has_cupy_gpu, has_torch_mps_gpu -import numpy -import pytest from thinc.util import get_torch_default_device -from ..util import make_tempdir, check_input_converters - +from ..util import check_input_converters, make_tempdir XP_OPS = [NumpyOps()] if has_cupy_gpu: diff --git a/thinc/tests/layers/test_reduce.py b/thinc/tests/layers/test_reduce.py index d26065c4a..608561e13 100644 --- a/thinc/tests/layers/test_reduce.py +++ b/thinc/tests/layers/test_reduce.py @@ -1,5 +1,6 @@ -import pytest import numpy +import pytest + from thinc.api import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum from thinc.types import Ragged diff --git a/thinc/tests/layers/test_resizable.py b/thinc/tests/layers/test_resizable.py index dfb6c67fd..ffa256de5 100644 --- a/thinc/tests/layers/test_resizable.py +++ b/thinc/tests/layers/test_resizable.py @@ -1,7 +1,9 @@ -import pytest from functools import partial -from thinc.api import resizable, Linear -from thinc.layers.resizable import resize_model, resize_linear_weighted + +import pytest + +from thinc.api import Linear, resizable +from thinc.layers.resizable import resize_linear_weighted, resize_model @pytest.fixture diff --git a/thinc/tests/layers/test_shim.py b/thinc/tests/layers/test_shim.py index bacde5cf6..dcb43ab1e 100644 --- a/thinc/tests/layers/test_shim.py +++ b/thinc/tests/layers/test_shim.py @@ -1,5 +1,7 @@ from typing import List + from thinc.shims.shim import Shim + from ..util import make_tempdir diff --git a/thinc/tests/layers/test_softmax.py b/thinc/tests/layers/test_softmax.py index 69072b558..95e2f41c7 100644 --- a/thinc/tests/layers/test_softmax.py +++ b/thinc/tests/layers/test_softmax.py @@ -1,8 +1,8 @@ from typing import Tuple, cast import numpy -from numpy.testing import assert_allclose import pytest +from numpy.testing import assert_allclose from thinc.api import Model, NumpyOps, Softmax_v2 from thinc.types import Floats2d, Ints1d diff --git a/thinc/tests/layers/test_sparse_linear.py b/thinc/tests/layers/test_sparse_linear.py index 87c5a3a75..cce0d1023 100644 --- a/thinc/tests/layers/test_sparse_linear.py +++ b/thinc/tests/layers/test_sparse_linear.py @@ -1,7 +1,9 @@ import math + import numpy import pytest -from thinc.api import SGD, to_categorical, SparseLinear, SparseLinear_v2 + +from thinc.api import SGD, SparseLinear, SparseLinear_v2, to_categorical @pytest.fixture diff --git a/thinc/tests/layers/test_tensorflow_wrapper.py b/thinc/tests/layers/test_tensorflow_wrapper.py index c1b85da3b..4741f6dc3 100644 --- a/thinc/tests/layers/test_tensorflow_wrapper.py +++ b/thinc/tests/layers/test_tensorflow_wrapper.py @@ -1,9 +1,19 @@ import numpy import pytest -from thinc.api import Adam, ArgsKwargs, Linear, Model, TensorFlowWrapper -from thinc.api import get_current_ops, keras_subclass, 
tensorflow2xp, xp2tensorflow -from thinc.util import to_categorical + +from thinc.api import ( + Adam, + ArgsKwargs, + Linear, + Model, + TensorFlowWrapper, + get_current_ops, + keras_subclass, + tensorflow2xp, + xp2tensorflow, +) from thinc.compat import has_cupy_gpu, has_tensorflow +from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/layers/test_torchscriptwrapper.py b/thinc/tests/layers/test_torchscriptwrapper.py index 37ff9ef08..b37afa3c3 100644 --- a/thinc/tests/layers/test_torchscriptwrapper.py +++ b/thinc/tests/layers/test_torchscriptwrapper.py @@ -1,8 +1,11 @@ -import pytest import numpy +import pytest -from thinc.api import PyTorchWrapper_v2, TorchScriptWrapper_v1 -from thinc.api import pytorch_to_torchscript_wrapper +from thinc.api import ( + PyTorchWrapper_v2, + TorchScriptWrapper_v1, + pytorch_to_torchscript_wrapper, +) from thinc.compat import has_torch, torch diff --git a/thinc/tests/layers/test_transforms.py b/thinc/tests/layers/test_transforms.py index 8de5341d7..3a9a110f1 100644 --- a/thinc/tests/layers/test_transforms.py +++ b/thinc/tests/layers/test_transforms.py @@ -1,7 +1,8 @@ -from thinc.api import strings2arrays, NumpyOps, Ragged, registry import numpy import pytest +from thinc.api import NumpyOps, Ragged, registry, strings2arrays + from ..util import get_data_checker diff --git a/thinc/tests/layers/test_uniqued.py b/thinc/tests/layers/test_uniqued.py index 9cb207ca5..685da1deb 100644 --- a/thinc/tests/layers/test_uniqued.py +++ b/thinc/tests/layers/test_uniqued.py @@ -1,10 +1,11 @@ -import pytest import numpy +import pytest +from hypothesis import given, settings +from hypothesis.strategies import composite, integers, lists +from numpy.testing import assert_allclose + from thinc.layers import Embed from thinc.layers.uniqued import uniqued -from numpy.testing import assert_allclose -from hypothesis import given, settings -from hypothesis.strategies import integers, lists, composite ROWS = 10 diff --git a/thinc/tests/layers/test_with_debug.py b/thinc/tests/layers/test_with_debug.py index 679c1f21e..3f65a3ac3 100644 --- a/thinc/tests/layers/test_with_debug.py +++ b/thinc/tests/layers/test_with_debug.py @@ -1,5 +1,6 @@ from mock import MagicMock -from thinc.api import with_debug, Linear + +from thinc.api import Linear, with_debug def test_with_debug(): diff --git a/thinc/tests/layers/test_with_flatten.py b/thinc/tests/layers/test_with_flatten.py index 1ff622026..86d18eb67 100644 --- a/thinc/tests/layers/test_with_flatten.py +++ b/thinc/tests/layers/test_with_flatten.py @@ -1,4 +1,5 @@ from typing import List + from thinc.api import Model, with_flatten_v2 INPUT = [[1, 2, 3], [4, 5], [], [6, 7, 8]] diff --git a/thinc/tests/layers/test_with_transforms.py b/thinc/tests/layers/test_with_transforms.py index c23db1463..82cdaed36 100644 --- a/thinc/tests/layers/test_with_transforms.py +++ b/thinc/tests/layers/test_with_transforms.py @@ -1,11 +1,20 @@ -import pytest import numpy import numpy.testing -from thinc.api import NumpyOps, Model, Linear, noop -from thinc.api import with_array2d, with_array, with_padded, with_list -from thinc.api import with_ragged, with_getitem -from thinc.types import Padded, Ragged +import pytest +from thinc.api import ( + Linear, + Model, + NumpyOps, + noop, + with_array, + with_array2d, + with_getitem, + with_list, + with_padded, + with_ragged, +) +from thinc.types import Padded, Ragged from ..util import get_data_checker diff --git a/thinc/tests/model/test_model.py 
b/thinc/tests/model/test_model.py index 733b3329f..f93b46c8c 100644 --- a/thinc/tests/model/test_model.py +++ b/thinc/tests/model/test_model.py @@ -1,13 +1,28 @@ -from collections import Counter -import pytest import threading import time -from thinc.api import Adam, CupyOps, Dropout, Linear, Model, Relu -from thinc.api import Shim, Softmax, chain, change_attr_values -from thinc.api import concatenate, set_dropout_rate -from thinc.api import use_ops, with_debug, wrap_model_recursive -from thinc.compat import has_cupy_gpu +from collections import Counter + import numpy +import pytest + +from thinc.api import ( + Adam, + CupyOps, + Dropout, + Linear, + Model, + Relu, + Shim, + Softmax, + chain, + change_attr_values, + concatenate, + set_dropout_rate, + use_ops, + with_debug, + wrap_model_recursive, +) +from thinc.compat import has_cupy_gpu from ..util import make_tempdir diff --git a/thinc/tests/model/test_validation.py b/thinc/tests/model/test_validation.py index adecdd6d5..c58efd015 100644 --- a/thinc/tests/model/test_validation.py +++ b/thinc/tests/model/test_validation.py @@ -1,6 +1,15 @@ import pytest -from thinc.api import chain, Relu, reduce_max, Softmax, with_ragged -from thinc.api import ParametricAttention, list2ragged, reduce_sum + +from thinc.api import ( + ParametricAttention, + Relu, + Softmax, + chain, + list2ragged, + reduce_max, + reduce_sum, + with_ragged, +) from thinc.util import DataValidationError, data_validation diff --git a/thinc/tests/mypy/modules/fail_no_plugin.py b/thinc/tests/mypy/modules/fail_no_plugin.py index 807fd672b..f53e33ef3 100644 --- a/thinc/tests/mypy/modules/fail_no_plugin.py +++ b/thinc/tests/mypy/modules/fail_no_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add +from thinc.api import Relu, Softmax, add, chain, reduce_max bad_model = chain(Relu(10), reduce_max(), Softmax()) diff --git a/thinc/tests/mypy/modules/fail_plugin.py b/thinc/tests/mypy/modules/fail_plugin.py index b14fcecf0..6f23c82b1 100644 --- a/thinc/tests/mypy/modules/fail_plugin.py +++ b/thinc/tests/mypy/modules/fail_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add, concatenate +from thinc.api import Relu, Softmax, add, chain, concatenate, reduce_max bad_model = chain(Relu(10), reduce_max(), Softmax()) diff --git a/thinc/tests/mypy/modules/success_no_plugin.py b/thinc/tests/mypy/modules/success_no_plugin.py index b17cff053..058573e5b 100644 --- a/thinc/tests/mypy/modules/success_no_plugin.py +++ b/thinc/tests/mypy/modules/success_no_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add +from thinc.api import Relu, Softmax, add, chain, reduce_max good_model = chain(Relu(10), Relu(10), Softmax()) reveal_type(good_model) diff --git a/thinc/tests/mypy/modules/success_plugin.py b/thinc/tests/mypy/modules/success_plugin.py index 85879a88a..3214bdcb7 100644 --- a/thinc/tests/mypy/modules/success_plugin.py +++ b/thinc/tests/mypy/modules/success_plugin.py @@ -1,6 +1,6 @@ from typing import Any, TypeVar -from thinc.api import chain, Relu, reduce_max, Softmax, add, Model +from thinc.api import Model, Relu, Softmax, add, chain, reduce_max good_model = chain(Relu(10), Relu(10), Softmax()) reveal_type(good_model) diff --git a/thinc/tests/mypy/test_mypy.py b/thinc/tests/mypy/test_mypy.py index 2f2976882..f144128f4 100644 --- a/thinc/tests/mypy/test_mypy.py +++ b/thinc/tests/mypy/test_mypy.py @@ -1,8 +1,8 @@ import os import re -from pathlib import Path import shutil import sys +from pathlib 
import Path import pytest diff --git a/thinc/tests/regression/issue519/program.py b/thinc/tests/regression/issue519/program.py index b3e6dc9ba..bce5f3234 100644 --- a/thinc/tests/regression/issue519/program.py +++ b/thinc/tests/regression/issue519/program.py @@ -1,4 +1,4 @@ -from thinc.api import chain, concatenate, Relu, Softmax +from thinc.api import Relu, Softmax, chain, concatenate from thinc.model import Model from thinc.types import Floats2d diff --git a/thinc/tests/regression/test_issue208.py b/thinc/tests/regression/test_issue208.py index 25d7280f1..0c574d6d1 100644 --- a/thinc/tests/regression/test_issue208.py +++ b/thinc/tests/regression/test_issue208.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Linear +from thinc.api import Linear, chain def test_issue208(): diff --git a/thinc/tests/shims/test_pytorch_grad_scaler.py b/thinc/tests/shims/test_pytorch_grad_scaler.py index 2ab0fa738..d4ac10fec 100644 --- a/thinc/tests/shims/test_pytorch_grad_scaler.py +++ b/thinc/tests/shims/test_pytorch_grad_scaler.py @@ -1,10 +1,10 @@ import pytest - from hypothesis import given, settings from hypothesis.strategies import lists, one_of, tuples + +from thinc.api import PyTorchGradScaler from thinc.compat import has_torch, has_torch_amp, has_torch_cuda_gpu, torch from thinc.util import is_torch_array -from thinc.api import PyTorchGradScaler from ..strategies import ndarrays diff --git a/thinc/tests/strategies.py b/thinc/tests/strategies.py index 322728cd9..bc12975aa 100644 --- a/thinc/tests/strategies.py +++ b/thinc/tests/strategies.py @@ -1,7 +1,8 @@ import numpy -from hypothesis.strategies import just, tuples, integers, floats from hypothesis.extra.numpy import arrays -from thinc.api import NumpyOps, Linear +from hypothesis.strategies import floats, integers, just, tuples + +from thinc.api import Linear, NumpyOps def get_ops(): diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index e028937da..254fcf078 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -1,20 +1,21 @@ -import pytest -from typing import Iterable, Union, Optional, List, Callable, Dict, Any +import inspect +import pickle from types import GeneratorType -from pydantic import BaseModel, StrictBool, StrictFloat, PositiveInt, constr +from typing import Any, Callable, Dict, Iterable, List, Optional, Union + import catalogue +import numpy +import pytest +from pydantic import BaseModel, PositiveInt, StrictBool, StrictFloat, constr + import thinc.config +from thinc.api import Config, Model, NumpyOps, RAdam from thinc.config import ConfigValidationError from thinc.types import Generator, Ragged -from thinc.api import Config, RAdam, Model, NumpyOps from thinc.util import partial -import numpy -import inspect -import pickle from .util import make_tempdir - EXAMPLE_CONFIG = """ [optimizer] @optimizers = "Adam.v1" diff --git a/thinc/tests/test_import__all__.py b/thinc/tests/test_import__all__.py index 226783ec2..fb0a08a20 100644 --- a/thinc/tests/test_import__all__.py +++ b/thinc/tests/test_import__all__.py @@ -1,9 +1,9 @@ import ast +import importlib from collections import namedtuple -from typing import Tuple, List +from typing import List, Tuple import pytest -import importlib _Import = namedtuple("_Import", ["module", "name", "alias"]) diff --git a/thinc/tests/test_indexing.py b/thinc/tests/test_indexing.py index 98fbc4437..2703e5dfa 100644 --- a/thinc/tests/test_indexing.py +++ b/thinc/tests/test_indexing.py @@ -1,7 +1,8 @@ -import pytest import numpy +import pytest from numpy.testing import 
assert_allclose -from thinc.types import Ragged, Pairs + +from thinc.types import Pairs, Ragged @pytest.fixture diff --git a/thinc/tests/test_initializers.py b/thinc/tests/test_initializers.py index 4f7c8f2cc..628398be0 100644 --- a/thinc/tests/test_initializers.py +++ b/thinc/tests/test_initializers.py @@ -1,8 +1,14 @@ +import numpy import pytest -from thinc.api import glorot_uniform_init, zero_init, uniform_init, normal_init -from thinc.api import NumpyOps + from thinc import registry -import numpy +from thinc.api import ( + NumpyOps, + glorot_uniform_init, + normal_init, + uniform_init, + zero_init, +) @pytest.mark.parametrize( diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 75206d240..fc100dd3a 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -1,8 +1,13 @@ -import pytest import numpy -from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy -from thinc.api import L2Distance, CosineDistance +import pytest + from thinc import registry +from thinc.api import ( + CategoricalCrossentropy, + CosineDistance, + L2Distance, + SequenceCategoricalCrossentropy, +) # some simple arrays scores0 = numpy.zeros((3, 3), dtype="f") diff --git a/thinc/tests/test_optimizers.py b/thinc/tests/test_optimizers.py index 0fab737f9..57b5a27ff 100644 --- a/thinc/tests/test_optimizers.py +++ b/thinc/tests/test_optimizers.py @@ -1,8 +1,8 @@ -import pytest -from thinc.api import registry, Optimizer -from thinc.optimizers import KeyT, _wrap_generator import numpy +import pytest +from thinc.api import Optimizer, registry +from thinc.optimizers import KeyT, _wrap_generator STUB_KEY: KeyT = (0, "") diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index c404fe128..693dcfcc7 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,7 +1,16 @@ from itertools import islice + import pytest -from thinc.api import decaying, compounding, slanted_triangular, constant_then -from thinc.api import constant, warmup_linear, cyclic_triangular + +from thinc.api import ( + compounding, + constant, + constant_then, + cyclic_triangular, + decaying, + slanted_triangular, + warmup_linear, +) from thinc.optimizers import KeyT from thinc.schedules import plateau diff --git a/thinc/tests/test_serialize.py b/thinc/tests/test_serialize.py index b89fc2d94..a457cd237 100644 --- a/thinc/tests/test_serialize.py +++ b/thinc/tests/test_serialize.py @@ -1,7 +1,16 @@ import pytest import srsly -from thinc.api import with_array, Linear, Maxout, chain, Model, Shim -from thinc.api import serialize_attr, deserialize_attr + +from thinc.api import ( + Linear, + Maxout, + Model, + Shim, + chain, + deserialize_attr, + serialize_attr, + with_array, +) @pytest.fixture diff --git a/thinc/tests/test_types.py b/thinc/tests/test_types.py index 249ce2b80..ebfbb6fb6 100644 --- a/thinc/tests/test_types.py +++ b/thinc/tests/test_types.py @@ -1,8 +1,17 @@ import numpy -from pydantic import create_model, ValidationError -from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d -from thinc.types import Ints1d, Ints2d, Ints3d, Ints4d import pytest +from pydantic import ValidationError, create_model + +from thinc.types import ( + Floats1d, + Floats2d, + Floats3d, + Floats4d, + Ints1d, + Ints2d, + Ints3d, + Ints4d, +) @pytest.mark.parametrize( diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py index 133efbe60..77f6a7b86 100644 --- a/thinc/tests/test_util.py +++ b/thinc/tests/test_util.py @@ -1,12 +1,16 @@ -import pytest import numpy 
+import pytest from hypothesis import given -from thinc.api import get_width, Ragged, Padded -from thinc.util import get_array_module, is_numpy_array, to_categorical -from thinc.util import is_cupy_array -from thinc.util import convert_recursive -from thinc.types import ArgsKwargs +from thinc.api import Padded, Ragged, get_width +from thinc.types import ArgsKwargs +from thinc.util import ( + convert_recursive, + get_array_module, + is_cupy_array, + is_numpy_array, + to_categorical, +) from . import strategies diff --git a/thinc/tests/util.py b/thinc/tests/util.py index 7440a4b6e..defb9a2f6 100644 --- a/thinc/tests/util.py +++ b/thinc/tests/util.py @@ -1,10 +1,12 @@ import contextlib -from pathlib import Path -import tempfile import shutil -from thinc.api import Linear, Ragged, Padded, ArgsKwargs +import tempfile +from pathlib import Path + import numpy import pytest + +from thinc.api import ArgsKwargs, Linear, Padded, Ragged from thinc.util import has_cupy, is_cupy_array, is_numpy_array diff --git a/thinc/types.py b/thinc/types.py index c7e6a00f6..9a9487cb4 100644 --- a/thinc/types.py +++ b/thinc/types.py @@ -1,11 +1,28 @@ -from typing import Union, Tuple, Sized, Container, Any, TypeVar, Callable -from typing import Iterable, Iterator, Sequence, Dict, Generic, cast -from typing import Optional, List, overload +import sys from abc import abstractmethod from dataclasses import dataclass +from typing import ( + Any, + Callable, + Container, + Dict, + Generic, + Iterable, + Iterator, + List, + Optional, + Sequence, + Sized, + Tuple, + TypeVar, + Union, + cast, + overload, +) + import numpy -import sys -from .compat import has_cupy, cupy + +from .compat import cupy, has_cupy if has_cupy: get_array_module = cupy.get_array_module @@ -14,9 +31,9 @@ # Use typing_extensions for Python versions < 3.8 if sys.version_info < (3, 8): - from typing_extensions import Protocol, Literal + from typing_extensions import Literal, Protocol else: - from typing import Protocol, Literal # noqa: F401 + from typing import Literal, Protocol # noqa: F401 # fmt: off diff --git a/thinc/util.py b/thinc/util.py index 9afec29ba..6f47f38df 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,25 +1,48 @@ -from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar -from typing import List, Mapping, TYPE_CHECKING -import numpy -import platform -import random +import contextlib import functools -from wasabi import table -from pydantic import create_model, ValidationError import inspect import os +import platform +import random import tempfile import threading -import contextlib from contextvars import ContextVar from dataclasses import dataclass +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + TypeVar, + Union, + cast, +) + +import numpy +from pydantic import ValidationError, create_model +from wasabi import table -from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow -from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu -from .compat import has_torch_mps -from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack -from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 from . 
import types  # noqa: E402
+from .compat import (
+    cupy,
+    cupy_from_dlpack,
+    has_cupy,
+    has_cupy_gpu,
+    has_gpu,
+    has_mxnet,
+    has_tensorflow,
+    has_torch,
+    has_torch_cuda_gpu,
+    has_torch_mps,
+)
+from .compat import mxnet as mx
+from .compat import tensorflow as tf
+from .compat import torch
+from .types import ArgsKwargs, ArrayXd, FloatsXd, IntsXd, Padded, Ragged  # noqa: E402
 
 if TYPE_CHECKING:
     from .api import Ops
@@ -173,7 +196,7 @@ def set_active_gpu(gpu_id: int) -> "cupy.cuda.Device":  # pragma: no cover
 
 def require_cpu() -> bool:  # pragma: no cover
     """Use CPU through best available backend."""
-    from .backends import set_current_ops, get_ops
+    from .backends import get_ops, set_current_ops
 
     ops = get_ops("cpu")
     set_current_ops(ops)
@@ -189,7 +212,7 @@ def prefer_gpu(gpu_id: int = 0) -> bool:  # pragma: no cover
 
 
 def require_gpu(gpu_id: int = 0) -> bool:  # pragma: no cover
-    from .backends import set_current_ops, CupyOps, MPSOps
+    from .backends import CupyOps, MPSOps, set_current_ops
 
     if platform.system() == "Darwin" and not has_torch_mps:
         if has_torch:

From d34f536ea6cb2df6ad2f72e1e5b7511aafe3c66d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?=
Date: Tue, 9 Jan 2024 10:24:20 +0100
Subject: [PATCH 19/30] strings2arrays: make work again for sequences of
 unequal length

PR #897 fixed the dtypes in strings2arrays, but it also broke
strings2arrays for batches with sequences of unequal lengths.
---
 thinc/layers/strings2arrays.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py
index 91a6b1a31..eba2c983d 100644
--- a/thinc/layers/strings2arrays.py
+++ b/thinc/layers/strings2arrays.py
@@ -1,3 +1,4 @@
+from ctypes import c_uint64
 from typing import Callable, List, Sequence, Tuple
 
 from murmurhash import hash_unicode
@@ -17,8 +18,10 @@ def strings2arrays() -> Model[InT, OutT]:
 
 
 def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]:
-    hashes = [[hash_unicode(word) for word in X] for X in Xs]
-    hash_arrays = [model.ops.asarray2i(h, dtype="uint64") for h in hashes]
+    # Cast 32-bit (signed) integer to 64-bit unsigned, since such casting
+    # is deprecated in NumPy.
+    hashes = [[c_uint64(hash_unicode(word)).value for word in X] for X in Xs]
+    hash_arrays = [model.ops.asarray1i(h, dtype="uint64") for h in hashes]
     arrays = [model.ops.reshape2i(array, -1, 1) for array in hash_arrays]
 
     def backprop(dX: OutT) -> InT:

From 5c46b82a47781ebff91c953c42c2361a3cdd4f15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?=
Date: Tue, 9 Jan 2024 11:05:46 +0100
Subject: [PATCH 20/30] Fix thread-local storage usage and make it typecheck

The way we used thread-local storage before did not typecheck, since we
assigned to `Thread`. Thread-local storage can be a global variable; the
state of this object will be different for each thread.
---
 thinc/backends/__init__.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py
index 8973c8836..eb954370f 100644
--- a/thinc/backends/__init__.py
+++ b/thinc/backends/__init__.py
@@ -26,6 +26,9 @@
 # notebook might not have preserved contextvars across cells.
 _GLOBAL_STATE = {"ops": None}
 
+# Thread-local state.
+_LOCAL_STATE = threading.local()
+
 
 def set_gpu_allocator(allocator: str) -> None:  # pragma: no cover
     """Route GPU memory allocation via PyTorch or tensorflow.
@@ -152,22 +155,14 @@ def contextvars_eq_thread_ops() -> bool: return False -def _get_thread_state(): +def _get_thread_state() -> threading.local: """Get a thread-specific state variable that inherits from a global state when it's created.""" - thread: threading.Thread = threading.current_thread() - if not hasattr(thread, "__local"): - thread.__local = _create_thread_local(_GLOBAL_STATE) - return thread.__local - - -def _create_thread_local( - attrs: Dict[str, Any], local_class: Type[threading.local] = threading.local -): - obj = local_class() - for name, value in attrs.items(): - setattr(obj, name, value) - return obj + if not hasattr(_LOCAL_STATE, "initialized") or not _LOCAL_STATE.initialized: + for name, value in _GLOBAL_STATE.items(): + setattr(_LOCAL_STATE, name, value) + _LOCAL_STATE.initialized = True + return _LOCAL_STATE __all__ = [ From 09e955586fa4f84308508257642cae17e61bad1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 9 Jan 2024 11:07:34 +0100 Subject: [PATCH 21/30] Fixup imports that lead to type checking issues --- thinc/tests/test_types.py | 1 - thinc/util.py | 1 - 2 files changed, 2 deletions(-) diff --git a/thinc/tests/test_types.py b/thinc/tests/test_types.py index 6bdf4ea23..bf2740bbb 100644 --- a/thinc/tests/test_types.py +++ b/thinc/tests/test_types.py @@ -1,6 +1,5 @@ import numpy import pytest -from pydantic import ValidationError, create_model from thinc.types import ( Floats1d, diff --git a/thinc/util.py b/thinc/util.py index c6b4bcbc5..529faf875 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -32,7 +32,6 @@ from pydantic import ValidationError, create_model # type: ignore import numpy -from pydantic import ValidationError, create_model from wasabi import table from . import types # noqa: E402 From 6c314d27f350e0bc2a9206ad33b10b3ccf1d1282 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 16 Jan 2024 11:34:48 +0100 Subject: [PATCH 22/30] Fix strings2array (#918) * remove slow marker from basic tagger test * fix strings2array * isort --- thinc/layers/strings2arrays.py | 7 ++++--- thinc/tests/layers/test_basic_tagger.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py index ed40b1e88..eba2c983d 100644 --- a/thinc/layers/strings2arrays.py +++ b/thinc/layers/strings2arrays.py @@ -1,3 +1,4 @@ +from ctypes import c_uint64 from typing import Callable, List, Sequence, Tuple from murmurhash import hash_unicode @@ -17,9 +18,9 @@ def strings2arrays() -> Model[InT, OutT]: def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]: - hashes = model.ops.asarray2i( - [[hash_unicode(word) for word in X] for X in Xs], dtype="int32" - ) + # Cast 32-bit (signed) integer to 64-bit unsigned, since such casting + # is deprecated in NumPy. 
+ hashes = [[c_uint64(hash_unicode(word)).value for word in X] for X in Xs] hash_arrays = [model.ops.asarray1i(h, dtype="uint64") for h in hashes] arrays = [model.ops.reshape2i(array, -1, 1) for array in hash_arrays] diff --git a/thinc/tests/layers/test_basic_tagger.py b/thinc/tests/layers/test_basic_tagger.py index 855a6d6ad..3bc772940 100644 --- a/thinc/tests/layers/test_basic_tagger.py +++ b/thinc/tests/layers/test_basic_tagger.py @@ -60,7 +60,6 @@ def get_shuffled_batches(Xs, Ys, batch_size): yield list(batch_X), list(batch_Y) -@pytest.mark.slow @pytest.mark.parametrize( ("depth", "width", "vector_width", "nb_epoch"), [(2, 32, 16, 5)] ) From 40d41487ed5f8270e974b16ec8e2edc097faef22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 16 Jan 2024 12:12:43 +0100 Subject: [PATCH 23/30] Set version to v9.0.0.dev4 (#919) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 502500b04..19a87d71d 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev3" +__version__ = "9.0.0.dev4" __release__ = True From 307a4f83465c132f0a4d6d6af83c029d9eaea3bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 7 Feb 2024 15:37:29 +0100 Subject: [PATCH 24/30] Fix `cupy.cublas` import (#921) * Fix `cupy.cublas` import Reported in #920. * Update mypy to work with recent Torch versions * CI: Do not run MyPy on Python 3.6/3.7. --- .github/workflows/tests.yml | 4 +++- requirements.txt | 2 +- thinc/backends/cupy_ops.py | 4 ++-- thinc/compat.py | 3 +++ thinc/shims/torchscript.py | 2 +- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 035be0baf..1ed106d59 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -87,7 +87,9 @@ jobs: - name: Run mypy run: python -m mypy thinc --no-implicit-reexport - if: matrix.python_version != '3.6' + if: | + matrix.python_version != '3.6' && + matrix.python_version != '3.7' - name: Delete source directory run: rm -rf thinc diff --git a/requirements.txt b/requirements.txt index b7682e738..3e3c9901e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,7 +25,7 @@ pytest-cov>=2.7.0,<5.0.0 coverage>=5.0.0,<8.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -mypy>=1.0.0,<1.1.0; python_version >= "3.7" +mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8" types-mock>=0.1.1 types-contextvars>=0.1.2; python_version < "3.7" types-dataclasses>=0.1.3; python_version < "3.7" diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 1e1e5b92b..472b6c542 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -1,7 +1,7 @@ import numpy from .. import registry -from ..compat import cupy, cupyx +from ..compat import cublas, cupy, cupyx from ..types import DeviceTypes from ..util import ( is_cupy_array, @@ -257,7 +257,7 @@ def clip_gradient(self, gradient, threshold): # implementation. 
def frobenius_norm(X): X_vec = X.reshape(-1) - return cupy.cublas.nrm2(X_vec) + return cublas.nrm2(X_vec) grad_norm = cupy.maximum(frobenius_norm(gradient), 1e-12) gradient *= cupy.minimum(threshold, grad_norm) / grad_norm diff --git a/thinc/compat.py b/thinc/compat.py index 5d600796a..c7b47cbe6 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -4,9 +4,11 @@ try: # pragma: no cover import cupy + import cupy.cublas import cupyx has_cupy = True + cublas = cupy.cublas cupy_version = Version(cupy.__version__) try: cupy.cuda.runtime.getDeviceCount() @@ -20,6 +22,7 @@ else: cupy_from_dlpack = cupy.fromDlpack except (ImportError, AttributeError): + cublas = None cupy = None cupyx = None cupy_version = Version("0.0.0") diff --git a/thinc/shims/torchscript.py b/thinc/shims/torchscript.py index 6c05c8a9b..9d413f93a 100644 --- a/thinc/shims/torchscript.py +++ b/thinc/shims/torchscript.py @@ -30,7 +30,7 @@ class TorchScriptShim(PyTorchShim): def __init__( self, - model: Optional["torch.ScriptModule"], + model: Optional["torch.jit.ScriptModule"], config=None, optimizer: Any = None, mixed_precision: bool = False, From 3aae298d32adc167ac57e0791a5a8c0544b1e8a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 7 Feb 2024 16:14:19 +0100 Subject: [PATCH 25/30] Set version to v8.2.3 (#922) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 394a8253e..e7455c55b 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.2.2" +__version__ = "8.2.3" __release__ = True From ec68d7d558783a40ccb7a4f4627070f9aa4fb195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 8 Apr 2024 15:56:35 +0200 Subject: [PATCH 26/30] Set version to 9.0.0.dev5 (#925) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 19a87d71d..ebf1604dc 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev4" +__version__ = "9.0.0.dev5" __release__ = True From c998bf2a6d304a0539b987e289245f5311820fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 16 Apr 2024 12:07:19 +0200 Subject: [PATCH 27/30] Merge `thinc-apple-ops` into Thinc (#927) This change adds `AppleOps` to Thinc, to ensure that the AMX unit is always used on Apple Silicon Macs. Before this change, a user would get much worse performance if they forgot to install `thinc-apple-ops`. The `apple_ops` and `_accelerate` modules are built conditionally. When detecting the best CPU implementation, we rely on a `try...except` import to determine whether Apple ops are available. Even though x86_64 Macs do not have an AMX unit, Accelerate is competitive with BLIS, so it does not hurt to enable Apple ops on all Macs. 
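For illustration only (this sketch is not part of the patch), the fallback behaviour described above can be checked after installation; the expected output assumes the registry names used in this patch (`"apple"` for `AppleOps`, `"numpy"` for `NumpyOps`):

```python
# Sketch: check which CPU ops implementation Thinc picked. On a Mac this is
# expected to print "apple" (AppleOps, backed by Accelerate and the AMX unit);
# on other platforms it falls back to "numpy" (NumpyOps).
from thinc.api import get_ops

ops = get_ops("cpu")
print(ops.name)
```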
--- .github/workflows/tests.yml | 11 --- setup.py | 16 +++- thinc/api.py | 7 +- thinc/backends/__init__.py | 10 ++- thinc/backends/_accelerate.pxd | 40 ++++++++++ thinc/backends/_accelerate.pyx | 75 ++++++++++++++++++ thinc/backends/apple_ops.pyx | 39 +++++++++ thinc/backends/mps_ops.py | 7 +- thinc/compat.py | 4 + thinc/tests/backends/_apple_blas/__init__.py | 0 thinc/tests/backends/_apple_blas/test_gemm.py | 79 +++++++++++++++++++ thinc/tests/backends/test_mps_ops.py | 11 +++ thinc/tests/backends/test_ops.py | 2 +- 13 files changed, 279 insertions(+), 22 deletions(-) create mode 100644 thinc/backends/_accelerate.pxd create mode 100644 thinc/backends/_accelerate.pyx create mode 100644 thinc/backends/apple_ops.pyx create mode 100644 thinc/tests/backends/_apple_blas/__init__.py create mode 100644 thinc/tests/backends/_apple_blas/test_gemm.py create mode 100644 thinc/tests/backends/test_mps_ops.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1ed106d59..cd569bafa 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -152,14 +152,3 @@ jobs: - name: Run tests with extras run: python -m pytest --pyargs thinc --cov=thinc --cov-report=term -p thinc.tests.enable_tensorflow -p thinc.tests.enable_mxnet - - - name: Run tests for thinc-apple-ops - run: | - pip uninstall -y tensorflow - pip install thinc-apple-ops - python -m pytest --pyargs thinc_apple_ops - if: matrix.os == 'macos-latest' && matrix.python_version == '3.10' - - - name: Run tests with thinc-apple-ops - run: python -m pytest --pyargs thinc - if: matrix.os == 'macos-latest' && matrix.python_version == '3.10' diff --git a/setup.py b/setup.py index 231f7298b..e380c815c 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import platform import sys from setuptools.command.build_ext import build_ext from sysconfig import get_path @@ -13,6 +14,8 @@ # http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options Options.docstrings = True +ACCELERATE = "thinc.backends._accelerate" +APPLE_OPS = ["thinc.backends.apple_ops", ACCELERATE] PACKAGES = find_packages() MOD_NAMES = [ @@ -20,7 +23,7 @@ "thinc.backends.numpy_ops", "thinc.layers.sparselinear", "thinc.layers.premap_ids", -] +] + (APPLE_OPS if platform.system() == "Darwin" else []) COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], "other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function", "-std=c++11"], @@ -78,7 +81,16 @@ def setup_package(): ext_modules = [] for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" - ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs) + if name == ACCELERATE: + ext = Extension( + name, + [mod_path], + language="c++", + include_dirs=include_dirs, + libraries=["blas"], + ) + else: + ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs) ext_modules.append(ext) print("Cythonizing sources") ext_modules = cythonize( diff --git a/thinc/api.py b/thinc/api.py index 0c4d0a0e1..798ef6f08 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -162,6 +162,11 @@ xp2torch, ) +try: + from .backends import AppleOps +except ImportError: + AppleOps = None + # fmt: off __all__ = [ # .config @@ -198,7 +203,7 @@ "has_cupy", # .backends "get_ops", "set_current_ops", "get_current_ops", "use_ops", - "Ops", "CupyOps", "MPSOps", "NumpyOps", "set_gpu_allocator", + "Ops", "AppleOps", "CupyOps", "MPSOps", "NumpyOps", "set_gpu_allocator", "use_pytorch_for_gpu_memory", "use_tensorflow_for_gpu_memory", # .layers "Dropout", 
"Embed", "expand_window", "HashEmbed", "LayerNorm", "Linear", diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py index eb954370f..5d33c2c34 100644 --- a/thinc/backends/__init__.py +++ b/thinc/backends/__init__.py @@ -19,6 +19,11 @@ from .numpy_ops import NumpyOps from .ops import Ops +try: + from .apple_ops import AppleOps +except ImportError: + AppleOps = None + context_ops: ContextVar[Optional[Ops]] = ContextVar("context_ops", default=None) context_pools: ContextVar[dict] = ContextVar("context_pools", default={}) @@ -83,10 +88,6 @@ def use_tensorflow_for_gpu_memory() -> None: # pragma: no cover def _import_extra_cpu_backends(): - try: - from thinc_apple_ops import AppleOps - except ImportError: - pass try: from thinc_bigendian_ops import BigEndianOps except ImportError: @@ -171,6 +172,7 @@ def _get_thread_state() -> threading.local: "use_ops", "ParamServer", "Ops", + "AppleOps", "CupyOps", "MPSOps", "NumpyOps", diff --git a/thinc/backends/_accelerate.pxd b/thinc/backends/_accelerate.pxd new file mode 100644 index 000000000..8bc0ce233 --- /dev/null +++ b/thinc/backends/_accelerate.pxd @@ -0,0 +1,40 @@ +cdef extern from "Accelerate/Accelerate.h": + enum CBLAS_ORDER: CblasRowMajor, CblasColMajor + enum CBLAS_TRANSPOSE: CblasNoTrans, CblasTrans, CblasConjTrans + enum CBLAS_UPLO: CblasUpper, CblasLower + enum CBLAS_DIAG: CblasNonUnit, CblasUnit + enum CBLAS_SIDE: CblasLeft, CblasRight + + # BLAS level 1 routines + + void cblas_sswap(int M, float *x, int incX, float *y, int incY) nogil + void cblas_sscal(int N, float alpha, float *x, int incX) nogil + void cblas_scopy(int N, float *x, int incX, float *y, int incY) nogil + void cblas_saxpy(int N, float alpha, float *x, int incX, float *y, int incY ) nogil + float cblas_sdot(int N, float *x, int incX, float *y, int incY ) nogil + float cblas_snrm2(int N, float *x, int incX) nogil + float cblas_sasum(int N, float *x, int incX) nogil + int cblas_isamax(int N, float *x, int incX) nogil + + # BLAS level 2 routines + void cblas_sgemv(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, int M, int N, + float alpha, float *A, int lda, float *x, int incX, + float beta, float *y, int incY) nogil + + void cblas_sger(CBLAS_ORDER Order, int M, int N, float alpha, float *x, + int incX, float *y, int incY, float *A, int lda) nogil + + # BLAS level 3 routines + void cblas_sgemm(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, int M, int N, int K, + float alpha, float *A, int lda, float *B, int ldb, + float beta, float *C, int ldc) nogil + + +cdef void sgemm(bint TransA, bint TransB, int M, int N, int K, + float alpha, const float* A, int lda, const float *B, + int ldb, float beta, float* C, int ldc) nogil + + +cdef void saxpy(int N, float alpha, const float* X, int incX, + float *Y, int incY) nogil diff --git a/thinc/backends/_accelerate.pyx b/thinc/backends/_accelerate.pyx new file mode 100644 index 000000000..094cb9443 --- /dev/null +++ b/thinc/backends/_accelerate.pyx @@ -0,0 +1,75 @@ +cimport numpy as np +from libc.stdint cimport uintptr_t + +import numpy + + +cpdef np.ndarray gemm(float[:, ::1] A, float[:, ::1] B, + bint trans1=False, bint trans2=False, + np.ndarray out=None): + cdef int nM = A.shape[0] if not trans1 else A.shape[1] + cdef int nK = A.shape[1] if not trans1 else A.shape[0] + cdef int nK_b = B.shape[0] if not trans2 else B.shape[1] + cdef int nN = B.shape[1] if not trans2 else B.shape[0] + + cdef float[:, ::1] C = out + + if out is None: + out = numpy.empty((nM, nN), dtype="f") + C = out + else: + if C.shape[0] 
!= nM or C.shape[1] != nN: + msg = "Shape mismatch for output matrix, was: (%d, %d), expected (%d, %d)" + raise ValueError(msg % (C.shape[0], C.shape[1], nM, nN)) + + + if nK != nK_b: + msg = "Shape mismatch for gemm: (%d, %d), (%d, %d)" + raise ValueError(msg % (nM, nK, nK_b, nN)) + + if nM == 0 or nK == 0 or nN == 0: + return out + + cblas_sgemm( + CblasRowMajor, + CblasTrans if trans1 else CblasNoTrans, + CblasTrans if trans2 else CblasNoTrans, + nM, + nN, + nK, + 1.0, + &A[0, 0], + A.shape[1], + &B[0, 0], + B.shape[1], + 0.0, + &C[0, 0], + C.shape[1] + ) + return out + + +cdef void sgemm(bint TransA, bint TransB, int M, int N, int K, + float alpha, const float* A, int lda, const float *B, + int ldb, float beta, float* C, int ldc) nogil: + cblas_sgemm( + CblasRowMajor, + CblasTrans if TransA else CblasNoTrans, + CblasTrans if TransB else CblasNoTrans, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc + ) + + +cdef void saxpy(int N, float alpha, const float* X, int incX, + float *Y, int incY) nogil: + cblas_saxpy(N, alpha, X, incX, Y, incY) diff --git a/thinc/backends/apple_ops.pyx b/thinc/backends/apple_ops.pyx new file mode 100644 index 000000000..95a710c0d --- /dev/null +++ b/thinc/backends/apple_ops.pyx @@ -0,0 +1,39 @@ +from typing import Optional + +import numpy + +from ._accelerate import gemm + +from ._accelerate cimport saxpy, sgemm +from .cblas cimport CBlas, set_saxpy, set_sgemm + +from .. import registry +from ..types import Floats2d +from .numpy_ops import NumpyOps + + +@registry.ops("AppleOps") +class AppleOps(NumpyOps): + """Thinc Ops class that calls into Apple's native libraries for some + operations. Other operations fall back to numpy.""" + name = "apple" + xp = numpy + + def cblas(self) -> CBlas: + cdef CBlas cblas = CBlas() + set_saxpy(cblas, saxpy) + set_sgemm(cblas, sgemm) + return cblas + + def gemm( + self, + x: Floats2d, + y: Floats2d, + out: Optional[Floats2d] = None, + trans1: bool = False, + trans2: bool = False, + ) -> Floats2d: + """Perform General Matrix Multiplication (GeMM) and optionally store + the result in the specified output variable. + """ + return gemm(x, y, out=out, trans1=trans1, trans2=trans2) diff --git a/thinc/backends/mps_ops.py b/thinc/backends/mps_ops.py index c6ba71f11..fb242f0f1 100644 --- a/thinc/backends/mps_ops.py +++ b/thinc/backends/mps_ops.py @@ -3,6 +3,7 @@ import numpy from .. import registry +from ..compat import has_apple_ops from .numpy_ops import NumpyOps from .ops import Ops @@ -12,11 +13,11 @@ # during type checking. _Ops = Ops else: - try: - from thinc_apple_ops import AppleOps + if has_apple_ops: + from .apple_ops import AppleOps _Ops = AppleOps - except ImportError: + else: _Ops = NumpyOps diff --git a/thinc/compat.py b/thinc/compat.py index 7275bfc6e..2ec91de48 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -1,3 +1,4 @@ +import platform import warnings from packaging.version import Version @@ -119,6 +120,9 @@ def enable_mxnet(): has_blis = False +# AppleOps is available unconditionally on macOS. 
+has_apple_ops = platform.system() == "Darwin" + has_gpu = has_cupy_gpu or has_torch_mps_gpu __all__ = [ diff --git a/thinc/tests/backends/_apple_blas/__init__.py b/thinc/tests/backends/_apple_blas/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/thinc/tests/backends/_apple_blas/test_gemm.py b/thinc/tests/backends/_apple_blas/test_gemm.py new file mode 100644 index 000000000..10e662110 --- /dev/null +++ b/thinc/tests/backends/_apple_blas/test_gemm.py @@ -0,0 +1,79 @@ +import numpy +import pytest + +from thinc.compat import has_apple_ops + +try: + import thinc.backends._accelerate as accelerate +except: + pass + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +def test_basic_sgemm(): + A = numpy.random.randn(5, 4).astype("f") + B = numpy.random.randn(4, 7).astype("f") + C = accelerate.gemm(A, B) + assert C.shape == (A.shape[0], B.shape[1]) + + C_out = numpy.empty((5, 7), dtype="f") + accelerate.gemm(A, B, out=C_out) + + numpy.testing.assert_allclose(C, C_out) + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +def test_incorrect_output_size(): + A = numpy.ndarray((5, 4), dtype="f") + B = numpy.ndarray((4, 7), dtype="f") + + with pytest.raises(ValueError, match=r"Shape mismatch for output matrix"): + accelerate.gemm(A, B, out=numpy.ndarray((3, 7), dtype="f")) + + with pytest.raises(ValueError, match=r"Shape mismatch for output matrix"): + accelerate.gemm(A, B, out=numpy.ndarray((5, 3), dtype="f")) + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +@pytest.mark.parametrize( + "A_shape,B_shape,transA,transB", + [ + [(0, 0), (0, 0), False, False], + [(0, 0), (0, 0), True, False], + [(0, 0), (0, 0), False, True], + [(0, 0), (0, 0), True, True], + [(0, 5), (5, 0), False, False], + [(5, 0), (5, 0), False, True], + [(5, 0), (5, 0), True, False], + ], +) +def test_zero_size(A_shape, B_shape, transA, transB): + A = numpy.ndarray(A_shape, dtype="f") + B = numpy.ndarray(B_shape, dtype="f") + if not transA and not transB: + C = numpy.dot(A, B) + elif transA: + C = numpy.dot(A.T, B) + elif transB: + C = numpy.dot(A, B.T) + else: + C = numpy.dot(A.T, B.T) + C_ = accelerate.gemm(A, B, trans1=transA, trans2=transB) + assert C.shape == C_.shape + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +@pytest.mark.parametrize( + "A_shape,B_shape,transA,transB", + [ + [(4, 5), (4, 5), False, False], + [(5, 4), (4, 5), True, False], + [(4, 5), (5, 4), False, True], + [(5, 4), (5, 4), True, True], + ], +) +def test_incorrect_shapes(A_shape, B_shape, transA, transB): + A = numpy.ndarray(A_shape, dtype="f") + B = numpy.ndarray(B_shape, dtype="f") + with pytest.raises(ValueError, match=r"Shape mismatch"): + accelerate.gemm(A, B, trans1=transA, trans2=transB) diff --git a/thinc/tests/backends/test_mps_ops.py b/thinc/tests/backends/test_mps_ops.py new file mode 100644 index 000000000..1bd5838b1 --- /dev/null +++ b/thinc/tests/backends/test_mps_ops.py @@ -0,0 +1,11 @@ +from thinc.api import NumpyOps, get_ops +from thinc.compat import has_apple_ops + + +def test_mps_ops_inherits_apple_ops(): + ops = get_ops("mps") + assert isinstance(ops, NumpyOps) + if has_apple_ops: + # We can't import AppleOps directly, because its' not + # available on non-Darwin systems. 
+ assert "AppleOps" in [base.__name__ for base in type(ops).__bases__] diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 9f03c0438..7cf4a935d 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -1403,7 +1403,7 @@ def test_get_ops(): # If Apple ops are available, "cpu" should return AppleOps or # NumpyOps otherwise. try: - from thinc_apple_ops import AppleOps + from thinc.backends.apple_ops import AppleOps assert isinstance(get_ops("cpu"), AppleOps) except ImportError: From 2a0b9c1e41ef29052bfc422076f65a2e522a850d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 17 Apr 2024 10:59:21 +0200 Subject: [PATCH 28/30] Set version to 9.0.0.dev6 (#928) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index ebf1604dc..1d2afbabb 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev5" +__version__ = "9.0.0.dev6" __release__ = True From ccae25849587dda3dbdb4e6cbc8836cba506220e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 18 Apr 2024 09:48:26 +0200 Subject: [PATCH 29/30] Document `AppleOps` and `MPSOps` (#929) * Document AppleOps and MPSOps * Reformat Ops table - Sort alphabetically. - Note that `AppleOps` is new in 9.0. * Missing comma --- website/docs/api-backends.md | 22 ++++++++-------- website/docs/api-model.md | 50 ++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/website/docs/api-backends.md b/website/docs/api-backends.md index fc69a775d..853fada3b 100644 --- a/website/docs/api-backends.md +++ b/website/docs/api-backends.md @@ -17,16 +17,18 @@ specialized versions can be called for different backends. You can also create your own `Ops` subclasses with specialized routines for your layers, and use the [`set_current_ops`](#set_current_ops) function to change the default. -| Backend | CPU | GPU | TPU | Description | -| ---------- | :----------------: | :----------------: | :---------------: | ----------------------------------------------------------------------------------------------------- | -| `NumpyOps` | | | | Execute via `numpy`, [`blis`](https://github.com/explosion/cython-blis) (optional) and custom Cython. | -| `CupyOps` | | | | Execute via [`cupy`](https://cupy.chainer.org/) and custom CUDA. | +| Backend | CPU | GPU | TPU | Description | +| ---------- | :----------------: | :----------------: | :---------------: | ----------------------------------------------------------------------------------------------------------- | +| `AppleOps` | | | | Use AMX matrix multiplication units on Apple Silicon Macs. Added in Thinc 9.0. | +| `CupyOps` | | | | Execute via [`cupy`](https://cupy.chainer.org/) and custom CUDA. | +| `MPSOps` | | | | Use the GPU on Apple Silicon Macs for PyTorch models, use AMX matrix multiplication units for Thinc Models. | +| `NumpyOps` | | | | Execute via `numpy`, [`blis`](https://github.com/explosion/cython-blis) (optional) and custom Cython. | ## Ops {#ops tag="class"} -The `Ops` class is typically not used directly but via `NumpyOps` or `CupyOps`, -which are subclasses of `Ops` and implement a **more efficient subset of the -methods**. You also have access to the ops via the +The `Ops` class is typically not used directly but via `NumpyOps`, `AppleOps`, +`CupyOps` or `MPSOps`, which are subclasses of `Ops` and implement a **more +efficient subset of the methods**. 
You also have access to the ops via the [`Model.ops`](/docs/api-model#attributes) attribute. The documented methods below list which backends provide optimized and more efficient versions (indicated by ), and which use the default implementation. @@ -56,7 +58,7 @@ use_ops(blis_ops) | Name | Type | Description | | ------------- | ------------ | ---------------------------------------------------------------------------------------- | -| `name` | str | **Class attribute:** Backend name, `"numpy"` or `"cupy"`. | +| `name` | str | **Class attribute:** Backend name, `"numpy"`, `"apple"`, `"cupy"` or `"mps"`. | | `xp` | Xp | **Class attribute:** `numpy` or `cupy`. | | `device_type` | str | The device type to use, if available for the given backend: `"cpu"`, `"gpu"` or `"tpu"`. | | `device_id` | int | The device ID to use, if available for the given backend. | @@ -1553,7 +1555,7 @@ numpy_ops = get_ops("numpy") | Argument | Type | Description | | ----------- | ------------ | ----------------------------------------------------- | -| `ops` | str | `"numpy"` or `"cupy"`. | +| `ops` | str | `"numpy"`, `"apple"`, `"cupy"` or `"mps"`. | | `**kwargs` | | Optional arguments passed to [`Ops.__init__`](#init). | | **RETURNS** | Ops | The backend object. | @@ -1572,7 +1574,7 @@ with use_ops("cupy"): | Argument | Type | Description | | ---------- | ------------ | ----------------------------------------------------- | -| `ops` | str | `"numpy"` or `"cupy"`. | +| `ops` | str | `"numpy"`, `"apple"`, `"cupy"` or `"mps"`. | | `**kwargs` | | Optional arguments passed to [`Ops.__init__`](#init). | ### get_current_ops {#get_current_ops tag="function"} diff --git a/website/docs/api-model.md b/website/docs/api-model.md index 597f67ec9..193fd1acb 100644 --- a/website/docs/api-model.md +++ b/website/docs/api-model.md @@ -84,19 +84,19 @@ model = Model( ) ``` -| Argument | Type | Description | -| -------------- | ------------------------------------------- | --------------------------------------------------------------------------------------- | -| `name` | str | The name of the layer type. | -| `forward` | Callable | Function to compute the forward result and the backpropagation callback. | -| _keyword-only_ | | | -| `init` | Callable | Function to define the initialization logic. | -| `dims` | Dict[str, Optional[int]] | Dictionary describing the model's dimensions. Map unknown dimensions to `None`. | -| `params` | Dict[str, Optional[FloatsXd]] | Dictionary with the model's parameters. Set currently unavailable parameters to `None`. | -| `refs` | Dict[str, Optional[Model]] | Dictionary mapping specific nodes (sublayers) of the network to a name. | -| `attrs` | Dict[str, Any] | Dictionary of non-parameter attributes. | -| `layers` | List[Model] | List of child layers. | -| `shims` | List[Shim] | List of interfaces for external models. | -| `ops` | Optional[Union[NumpyOps, CupyOps]] | An `Ops` instance, which provides mathematical and memory operations. | +| Argument | Type | Description | +| -------------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------- | +| `name` | str | The name of the layer type. | +| `forward` | Callable | Function to compute the forward result and the backpropagation callback. | +| _keyword-only_ | | | +| `init` | Callable | Function to define the initialization logic. | +| `dims` | Dict[str, Optional[int]] | Dictionary describing the model's dimensions. Map unknown dimensions to `None`. 
| +| `params` | Dict[str, Optional[FloatsXd]] | Dictionary with the model's parameters. Set currently unavailable parameters to `None`. | +| `refs` | Dict[str, Optional[Model]] | Dictionary mapping specific nodes (sublayers) of the network to a name. | +| `attrs` | Dict[str, Any] | Dictionary of non-parameter attributes. | +| `layers` | List[Model] | List of child layers. | +| `shims` | List[Shim] | List of interfaces for external models. | +| `ops` | Optional[Union[NumpyOps, AppleOps, CupyOps, MPSOps]] | An `Ops` instance, which provides mathematical and memory operations. | ### Model.define_operators {#define_operators tag="classmethod,contextmanager"} @@ -260,17 +260,17 @@ for node in model.walk(): The `walk` method supports three iteration orders through the `order` argument: -* `"bfs"`: breadth-first. Iteration order of the example above: - *1 - 2 - 4 - 3 - 5* -* `"dfs_pre"`: depth-first preorder, outputs a node before its children. - Iteration order of the example above: *1 - 2 - 3 - 4 - 5* -* `"dfs_post"`: depth-first postorder, outputs children before a node itself. - Iteration order of the example above: *3 - 2 - 5 - 4 - 1* +- `"bfs"`: breadth-first. Iteration order of the example above: _1 - 2 - 4 - 3 - + 5_ +- `"dfs_pre"`: depth-first preorder, outputs a node before its children. + Iteration order of the example above: _1 - 2 - 3 - 4 - 5_ +- `"dfs_post"`: depth-first postorder, outputs children before a node itself. + Iteration order of the example above: _3 - 2 - 5 - 4 - 1_ -| Argument | Type | Description | -|-------------|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------| +| Argument | Type | Description | +| ----------- | ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | | `order` | str | Node iteration order. `"bfs"` (breadth-first), `"dfs_pre"` (depth-first preorder), `"dfs_post"` (depth-first postorder) Default: `"bfs"`. | -| **RETURNS** | Iterable[Model] | The layers of the model. | +| **RETURNS** | Iterable[Model] | The layers of the model. | ### Model.remove_node {#remove_node tag="method"} @@ -329,9 +329,9 @@ assert model.get_dim("nI") == 16 Retrieve the value of a dimension of the given name, or `None` if the dimension is either unregistered or the value is currently unset. -| Argument | Type | Description | -| ----------- | --------------------- | --------------------------------------- | -| `name` | str | The name of the dimension, e.g. `"nO"`. | +| Argument | Type | Description | +| ----------- | ---------------------- | --------------------------------------- | +| `name` | str | The name of the dimension, e.g. `"nO"`. | | **RETURNS** | Optional[int] | The size of the dimension, or `None`. | ### Model.set_dim {#set_dim tag="method"} From 5be631e9623434ec2b4a3dc5989ce1cb13062ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 18 Apr 2024 10:22:55 +0200 Subject: [PATCH 30/30] Set version to 9.0.0 (#930) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 1d2afbabb..d2a73d579 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev6" +__version__ = "9.0.0" __release__ = True
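For illustration only (this sketch is not part of the patch series), the backend names documented in `api-backends.md` can be exercised as follows; the behaviour on non-macOS systems is an assumption based on the `MPSOps` fallback to `NumpyOps` introduced earlier in this series:

```python
# Sketch: select the "mps" backend documented above. On Apple Silicon, MPSOps
# reuses AppleOps (Accelerate/AMX) for CPU-side matrix multiplication;
# elsewhere it behaves like NumpyOps.
from thinc.api import get_current_ops, get_ops, set_current_ops

mps_ops = get_ops("mps")
set_current_ops(mps_ops)
print(type(get_current_ops()).__name__)  # "MPSOps"
```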