From 64967ebb821cfb05ee014e99032259766205948c Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 17 Aug 2022 17:37:52 +0200 Subject: [PATCH 01/30] Remove `thinc.extra.search` module and related tests (moved to spaCy) (#743) --- setup.py | 5 +- thinc/extra/__init__.pxd | 0 thinc/extra/__init__.py | 0 thinc/extra/search.pxd | 92 -------- thinc/extra/search.pyx | 302 -------------------------- thinc/extra/tests/__init__.py | 0 thinc/extra/tests/c_test_search.pyx | 81 ------- thinc/tests/extra/__init__.py | 0 thinc/tests/extra/test_beam_search.py | 5 - 9 files changed, 3 insertions(+), 482 deletions(-) delete mode 100644 thinc/extra/__init__.pxd delete mode 100644 thinc/extra/__init__.py delete mode 100644 thinc/extra/search.pxd delete mode 100644 thinc/extra/search.pyx delete mode 100644 thinc/extra/tests/__init__.py delete mode 100644 thinc/extra/tests/c_test_search.pyx delete mode 100644 thinc/tests/extra/__init__.py delete mode 100644 thinc/tests/extra/test_beam_search.py diff --git a/setup.py b/setup.py index 27873beeb..50f1c65cc 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ "thinc.backends.cblas", "thinc.backends.linalg", "thinc.backends.numpy_ops", - "thinc.extra.search", "thinc.layers.sparselinear", ] COMPILE_OPTIONS = { @@ -106,7 +105,9 @@ def setup_package(): ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs) ext_modules.append(ext) print("Cythonizing sources") - ext_modules = cythonize(ext_modules, compiler_directives=COMPILER_DIRECTIVES, language_level=2) + ext_modules = cythonize( + ext_modules, compiler_directives=COMPILER_DIRECTIVES, language_level=2 + ) setup( name="thinc", diff --git a/thinc/extra/__init__.pxd b/thinc/extra/__init__.pxd deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/extra/__init__.py b/thinc/extra/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/extra/search.pxd b/thinc/extra/search.pxd deleted file mode 100644 index daccbf58e..000000000 --- a/thinc/extra/search.pxd +++ /dev/null @@ -1,92 +0,0 @@ -from cymem.cymem cimport Pool - -from libc.stdint cimport uint32_t -from libc.stdint cimport uint64_t -from libcpp.pair cimport pair -from libcpp.queue cimport priority_queue -from libcpp.vector cimport vector - -ctypedef uint64_t hash_t -ctypedef uint64_t class_t -ctypedef float weight_t - - -ctypedef pair[weight_t, size_t] Entry -ctypedef priority_queue[Entry] Queue - - -ctypedef int (*trans_func_t)(void* dest, void* src, class_t clas, void* x) except -1 - -ctypedef void* (*init_func_t)(Pool mem, int n, void* extra_args) except NULL - -ctypedef int (*del_func_t)(Pool mem, void* state, void* extra_args) except -1 - -ctypedef int (*finish_func_t)(void* state, void* extra_args) except -1 - -ctypedef hash_t (*hash_func_t)(void* state, void* x) except 0 - - -cdef struct _State: - void* content - class_t* hist - weight_t score - weight_t loss - int i - int t - bint is_done - - -cdef class Beam: - cdef Pool mem - cdef class_t nr_class - cdef class_t width - cdef class_t size - cdef public weight_t min_density - cdef int t - cdef readonly bint is_done - cdef list histories - cdef list _parent_histories - cdef weight_t** scores - cdef int** is_valid - cdef weight_t** costs - cdef _State* _parents - cdef _State* _states - cdef del_func_t del_func - - cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1 - - cdef inline void* at(self, int i) nogil: - return self._states[i].content - - cdef int initialize(self, init_func_t init_func, del_func_t del_func, int 
n, void* extra_args) except -1 - cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func, - void* extra_args) except -1 - cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1 - - - cdef inline void set_cell(self, int i, int j, weight_t score, int is_valid, weight_t cost) nogil: - self.scores[i][j] = score - self.is_valid[i][j] = is_valid - self.costs[i][j] = cost - - cdef int set_row(self, int i, const weight_t* scores, const int* is_valid, - const weight_t* costs) except -1 - cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1 - - -cdef class MaxViolation: - cdef Pool mem - cdef weight_t cost - cdef weight_t delta - cdef readonly weight_t p_score - cdef readonly weight_t g_score - cdef readonly double Z - cdef readonly double gZ - cdef class_t n - cdef readonly list p_hist - cdef readonly list g_hist - cdef readonly list p_probs - cdef readonly list g_probs - - cpdef int check(self, Beam pred, Beam gold) except -1 - cpdef int check_crf(self, Beam pred, Beam gold) except -1 diff --git a/thinc/extra/search.pyx b/thinc/extra/search.pyx deleted file mode 100644 index d69756551..000000000 --- a/thinc/extra/search.pyx +++ /dev/null @@ -1,302 +0,0 @@ -# cython: profile=True, experimental_cpp_class_def=True, cdivision=True, infer_types=True -cimport cython -from libc.string cimport memset, memcpy -from libc.math cimport log, exp -import math - -from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap - - -cdef class Beam: - def __init__(self, class_t nr_class, class_t width, weight_t min_density=0.0): - assert nr_class != 0 - assert width != 0 - self.nr_class = nr_class - self.width = width - self.min_density = min_density - self.size = 1 - self.t = 0 - self.mem = Pool() - self._parents = <_State*>self.mem.alloc(self.width, sizeof(_State)) - self._states = <_State*>self.mem.alloc(self.width, sizeof(_State)) - cdef int i - self.histories = [[] for i in range(self.width)] - self._parent_histories = [[] for i in range(self.width)] - - self.scores = self.mem.alloc(self.width, sizeof(weight_t*)) - self.is_valid = self.mem.alloc(self.width, sizeof(weight_t*)) - self.costs = self.mem.alloc(self.width, sizeof(weight_t*)) - for i in range(self.width): - self.scores[i] = self.mem.alloc(self.nr_class, sizeof(weight_t)) - self.is_valid[i] = self.mem.alloc(self.nr_class, sizeof(int)) - self.costs[i] = self.mem.alloc(self.nr_class, sizeof(weight_t)) - - def __len__(self): - return self.size - - property score: - def __get__(self): - return self._states[0].score - - property min_score: - def __get__(self): - return self._states[self.size-1].score - - property loss: - def __get__(self): - return self._states[0].loss - - property probs: - def __get__(self): - return _softmax([self._states[i].score for i in range(self.size)]) - - property scores: - def __get__(self): - return [self._states[i].score for i in range(self.size)] - - property histories: - def __get__(self): - return self.histories - - cdef int set_row(self, int i, const weight_t* scores, const int* is_valid, - const weight_t* costs) except -1: - cdef int j - for j in range(self.nr_class): - self.scores[i][j] = scores[j] - self.is_valid[i][j] = is_valid[j] - self.costs[i][j] = costs[j] - - cdef int set_table(self, weight_t** scores, int** is_valid, weight_t** costs) except -1: - cdef int i, j - for i in range(self.width): - memcpy(self.scores[i], scores[i], sizeof(weight_t) * self.nr_class) - memcpy(self.is_valid[i], is_valid[i], sizeof(bint) * self.nr_class) - 
memcpy(self.costs[i], costs[i], sizeof(int) * self.nr_class) - - cdef int initialize(self, init_func_t init_func, del_func_t del_func, int n, void* extra_args) except -1: - for i in range(self.width): - self._states[i].content = init_func(self.mem, n, extra_args) - self._parents[i].content = init_func(self.mem, n, extra_args) - self.del_func = del_func - - def __dealloc__(self): - for i in range(self.width): - self.del_func(self.mem, self._states[i].content, NULL) - self.del_func(self.mem, self._parents[i].content, NULL) - - @cython.cdivision(True) - cdef int advance(self, trans_func_t transition_func, hash_func_t hash_func, - void* extra_args) except -1: - cdef weight_t** scores = self.scores - cdef int** is_valid = self.is_valid - cdef weight_t** costs = self.costs - - cdef Queue* q = new Queue() - self._fill(q, scores, is_valid) - # For a beam of width k, we only ever need 2k state objects. How? - # Each transition takes a parent and a class and produces a new state. - # So, we don't need the whole history --- just the parent. So at - # each step, we take a parent, and apply one or more extensions to - # it. - self._parents, self._states = self._states, self._parents - self._parent_histories, self.histories = self.histories, self._parent_histories - cdef weight_t score - cdef int p_i - cdef int i = 0 - cdef class_t clas - cdef _State* parent - cdef _State* state - cdef hash_t key - cdef PreshMap seen_states = PreshMap(self.width) - cdef uint64_t is_seen - cdef uint64_t one = 1 - while i < self.width and not q.empty(): - data = q.top() - p_i = data.second / self.nr_class - clas = data.second % self.nr_class - score = data.first - q.pop() - parent = &self._parents[p_i] - # Indicates terminal state reached; i.e. state is done - if parent.is_done: - # Now parent will not be changed, so we don't have to copy. - # Once finished, should also be unbranching. 
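 # (For example, with beam width k the two buffers `_parents`/`_states` hold
 # at most 2k states in total: the swap below moves a finished parent into
 # `_states[i]` unchanged, so it stays in the beam without being extended
 # further.)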
- self._states[i], parent[0] = parent[0], self._states[i] - parent.i = self._states[i].i - parent.t = self._states[i].t - parent.is_done = self._states[i].t - self._states[i].score = score - self.histories[i] = list(self._parent_histories[p_i]) - i += 1 - else: - state = &self._states[i] - # The supplied transition function should adjust the destination - # state to be the result of applying the class to the source state - transition_func(state.content, parent.content, clas, extra_args) - key = hash_func(state.content, extra_args) if hash_func is not NULL else 0 - is_seen = seen_states.get(key) - if key == 0 or key == 1 or not is_seen: - if key != 0 and key != 1: - seen_states.set(key, one) - state.score = score - state.loss = parent.loss + costs[p_i][clas] - self.histories[i] = list(self._parent_histories[p_i]) - self.histories[i].append(clas) - i += 1 - del q - self.size = i - assert self.size >= 1 - for i in range(self.width): - memset(self.scores[i], 0, sizeof(weight_t) * self.nr_class) - memset(self.costs[i], 0, sizeof(weight_t) * self.nr_class) - memset(self.is_valid[i], 0, sizeof(int) * self.nr_class) - self.t += 1 - - cdef int check_done(self, finish_func_t finish_func, void* extra_args) except -1: - cdef int i - for i in range(self.size): - if not self._states[i].is_done: - self._states[i].is_done = finish_func(self._states[i].content, extra_args) - for i in range(self.size): - if not self._states[i].is_done: - self.is_done = False - break - else: - self.is_done = True - - @cython.cdivision(True) - cdef int _fill(self, Queue* q, weight_t** scores, int** is_valid) except -1: - """Populate the queue from a k * n matrix of scores, where k is the - beam-width, and n is the number of classes. - """ - cdef Entry entry - cdef weight_t score - cdef _State* s - cdef int i, j, move_id - assert self.size >= 1 - cdef vector[Entry] entries - for i in range(self.size): - s = &self._states[i] - move_id = i * self.nr_class - if s.is_done: - # Update score by path average, following TACL '13 paper. - if self.histories[i]: - entry.first = s.score + (s.score / self.t) - else: - entry.first = s.score - entry.second = move_id - entries.push_back(entry) - else: - for j in range(self.nr_class): - if is_valid[i][j]: - entry.first = s.score + scores[i][j] - entry.second = move_id + j - entries.push_back(entry) - cdef double max_, Z, cutoff - if self.min_density == 0.0: - for i in range(entries.size()): - q.push(entries[i]) - elif not entries.empty(): - max_ = entries[0].first - Z = 0. - cutoff = 0. - # Softmax into probabilities, so we can prune - for i in range(entries.size()): - if entries[i].first > max_: - max_ = entries[i].first - for i in range(entries.size()): - Z += exp(entries[i].first-max_) - cutoff = (1. 
/ Z) * self.min_density - for i in range(entries.size()): - prob = exp(entries[i].first-max_) / Z - if prob >= cutoff: - q.push(entries[i]) - - -cdef class MaxViolation: - def __init__(self): - self.p_score = 0.0 - self.g_score = 0.0 - self.Z = 0.0 - self.gZ = 0.0 - self.delta = -1 - self.cost = 0 - self.p_hist = [] - self.g_hist = [] - self.p_probs = [] - self.g_probs = [] - - cpdef int check(self, Beam pred, Beam gold) except -1: - cdef _State* p = &pred._states[0] - cdef _State* g = &gold._states[0] - cdef weight_t d = p.score - g.score - if p.loss >= 1 and (self.cost == 0 or d > self.delta): - self.cost = p.loss - self.delta = d - self.p_hist = list(pred.histories[0]) - self.g_hist = list(gold.histories[0]) - self.p_score = p.score - self.g_score = g.score - self.Z = 1e-10 - self.gZ = 1e-10 - for i in range(pred.size): - if pred._states[i].loss > 0: - self.Z += exp(pred._states[i].score) - for i in range(gold.size): - if gold._states[i].loss == 0: - prob = exp(gold._states[i].score) - self.Z += prob - self.gZ += prob - - cpdef int check_crf(self, Beam pred, Beam gold) except -1: - d = pred.score - gold.score - seen_golds = set([tuple(gold.histories[i]) for i in range(gold.size)]) - if pred.loss > 0 and (self.cost == 0 or d > self.delta): - p_hist = [] - p_scores = [] - g_hist = [] - g_scores = [] - for i in range(pred.size): - if pred._states[i].loss > 0: - p_scores.append(pred._states[i].score) - p_hist.append(list(pred.histories[i])) - # This can happen from non-monotonic actions - # If we find a better gold analysis this way, be sure to keep it. - elif pred._states[i].loss <= 0 \ - and tuple(pred.histories[i]) not in seen_golds: - g_scores.append(pred._states[i].score) - g_hist.append(list(pred.histories[i])) - for i in range(gold.size): - if gold._states[i].loss == 0: - g_scores.append(gold._states[i].score) - g_hist.append(list(gold.histories[i])) - - all_probs = _softmax(p_scores + g_scores) - p_probs = all_probs[:len(p_scores)] - g_probs_all = all_probs[len(p_scores):] - g_probs = _softmax(g_scores) - - self.cost = pred.loss - self.delta = d - self.p_hist = p_hist - self.g_hist = g_hist - # TODO: These variables are misnamed! These are the gradients of the loss. - self.p_probs = p_probs - # Intuition here: - # The gradient of the loss is: - # P(model) - P(truth) - # Normally, P(truth) is 1 for the gold - # But, if we want to do the "partial credit" scheme, we want - # to create a distribution over the gold, proportional to the scores - # awarded. 
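 # For example, if two gold analyses score 1.0 and 0.0, the softened target
 # is softmax([1.0, 0.0]) ~ [0.73, 0.27] instead of [1, 0], and each entry of
 # `self.g_probs` (set below) is the model's probability for that analysis
 # (from the joint softmax over wrong and gold paths) minus its share of
 # this target.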
- self.g_probs = [x-y for x, y in zip(g_probs_all, g_probs)] - - -def _softmax(nums): - if not nums: - return [] - max_ = max(nums) - nums = [(exp(n-max_) if n is not None else None) for n in nums] - Z = sum(n for n in nums if n is not None) - return [(n/Z if n is not None else None) for n in nums] diff --git a/thinc/extra/tests/__init__.py b/thinc/extra/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/extra/tests/c_test_search.pyx b/thinc/extra/tests/c_test_search.pyx deleted file mode 100644 index a727d3364..000000000 --- a/thinc/extra/tests/c_test_search.pyx +++ /dev/null @@ -1,81 +0,0 @@ -from thinc.extra.search cimport Beam -from cymem.cymem cimport Pool -from thinc.typedefs cimport class_t, weight_t - - -cdef struct TestState: - int length - int x - Py_UNICODE* string - - -cdef int transition(void* dest, void* src, class_t clas, void* extra_args) except -1: - dest_state = dest - src_state = src - dest_state.length = src_state.length - dest_state.x = src_state.x - dest_state.x += clas - if extra_args != NULL: - dest_state.string = extra_args - else: - dest_state.string = src_state.string - - -cdef void* initialize(Pool mem, int n, void* extra_args) except NULL: - state = mem.alloc(1, sizeof(TestState)) - state.length = n - state.x = 1 - if extra_args == NULL: - state.string = 'default' - else: - state.string = extra_args - return state - - -cdef int destroy(Pool mem, void* state, void* extra_args) except -1: - state = state - mem.free(state) - - -def test_init(nr_class, beam_width): - b = Beam(nr_class, beam_width) - assert b.size == 1 - assert b.width == beam_width - assert b.nr_class == nr_class - - -def test_initialize(nr_class, beam_width, length): - b = Beam(nr_class, beam_width) - b.initialize(initialize, destroy, length, NULL) - for i in range(b.width): - s = b.at(i) - assert s.length == length, s.length - assert s.string == 'default' - - -def test_initialize_extra(nr_class, beam_width, length, unicode extra): - b = Beam(nr_class, beam_width) - b.initialize(initialize, destroy, length, extra) - for i in range(b.width): - s = b.at(i) - assert s.length == length - - -def test_transition(nr_class=3, beam_width=6, length=3): - b = Beam(nr_class, beam_width) - b.initialize(initialize, destroy, length, NULL) - b.set_cell(0, 2, 30, True, 0) - b.set_cell(0, 1, 42, False, 0) - b.advance(transition, NULL, NULL) - assert b.size == 1, b.size - assert b.score == 30, b.score - s = b.at(0) - assert s.x == 3 - assert b._states[0].score == 30, b._states[0].score - b.set_cell(0, 1, 10, True, 0) - b.set_cell(0, 2, 20, True, 0) - b.advance(transition, NULL, NULL) - assert b._states[0].score == 50, b._states[0].score - assert b._states[1].score == 40 - s = b.at(0) - assert s.x == 5 diff --git a/thinc/tests/extra/__init__.py b/thinc/tests/extra/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/thinc/tests/extra/test_beam_search.py b/thinc/tests/extra/test_beam_search.py deleted file mode 100644 index ab7ab9f11..000000000 --- a/thinc/tests/extra/test_beam_search.py +++ /dev/null @@ -1,5 +0,0 @@ -from thinc.extra.search import MaxViolation - - -def test_init_violn(): - MaxViolation() From 43ef766e3bfd5c52d8f0a58ef54de252326f63dc Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 13 Sep 2022 09:45:04 +0200 Subject: [PATCH 02/30] `NumpyOps` cleanup (#760) * `NumpyOps`: Remove unused/vestigial free functions, reuse functions in `Ops` * Remove superfluous `typedef` --- thinc/backends/numpy_ops.pyx | 111 
++++------------------------------- 1 file changed, 11 insertions(+), 100 deletions(-) diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index c980e6c5d..130aec643 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -22,7 +22,7 @@ from ..util import copy_array, get_array_module from ..types import DeviceTypes, DTypes, Shape, ArrayXd from .cblas cimport CBlas, daxpy, saxpy from .linalg cimport VecVec, Vec -from .ops import Ops +from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights try: import blis.py @@ -31,9 +31,6 @@ except ImportError: has_blis = False -ctypedef float weight_t - - cdef extern from "math.h": float logf(float x) nogil float sqrtf(float x) nogil @@ -118,12 +115,12 @@ class NumpyOps(Ops): _check_compatible_shape(dY, Y) cdef size_t size = Y.size - cdef weight_t* dX_ptr - cdef const weight_t* Y_ptr = Y.data + cdef float* dX_ptr + cdef const float* Y_ptr = Y.data cdef np.ndarray dX if dY.dtype == "float32" and Y.dtype == "float32": dX = _inplace_or_copy(dY, inplace) - dX_ptr = dX.data + dX_ptr = dX.data for i in range(size): if Y_ptr[i] <= 0: dX_ptr[i] = 0. @@ -522,21 +519,6 @@ def check_seq2col_lengths(ops, lengths, B): return lengths -def cpu_clip_gradient(weight_t[::1] gradient, weight_t threshold): - grad_norm = Vec.norm(&gradient[0], gradient.shape[0]) - if grad_norm >= threshold: - Vec.mul_i(&gradient[0], threshold / grad_norm, gradient.shape[0]) - - -def add_gradient_noise(float[::1] gradient, weight_t noise_level, - weight_t timestep): - cdef weight_t variance = noise_level / ((1 + timestep) ** 0.55) - if variance >= 0.000001: - gradient += numpy.asarray( - numpy.random.normal(scale=variance, loc=0., size=len(gradient)), - dtype='float32') - - cdef void cpu_position_encode(float* output, float period, int N, int D) nogil: cdef float pos, d cdef int j @@ -567,18 +549,18 @@ cdef void cpu_scatter_add(float* dest, @cython.cdivision(True) -cdef void _adam_momentum(weight_t* gradient, weight_t* mom1, weight_t* mom2, - int nr_weight, weight_t beta1, weight_t beta2, weight_t eps, - weight_t learn_rate) nogil: +cdef void _adam_momentum(float* gradient, float* mom1, float* mom2, + int nr_weight, float beta1, float beta2, float eps, + float learn_rate) nogil: # Calculate Adam on CPU, fused. # Assumes the learning rate adjustment is calculated by the caller; # a_t = learn_rate * sqrt(1-beta2**timestep) / (1-beta1**timestep) - cdef weight_t one_minus_beta1 = 1-beta1 - cdef weight_t one_minus_beta2 = 1-beta2 - cdef weight_t m1, m2, g + cdef float one_minus_beta1 = 1-beta1 + cdef float one_minus_beta2 = 1-beta2 + cdef float m1, m2, g cdef int i # Blockwise implementation is a bit faster. 
Adam is slooow :( - cdef weight_t[64] buff + cdef float[64] buff cdef int steps = nr_weight // 64 if steps * 64 < nr_weight: steps += 1 @@ -604,18 +586,6 @@ cdef void _adam_momentum(weight_t* gradient, weight_t* mom1, weight_t* mom2, idx += step_size -@cython.cdivision(True) -cdef void cpu_update_averages(weight_t* ema, - const weight_t* weights, int nr_weight, weight_t t, weight_t max_decay) nogil: - cdef weight_t decay = (1.0 + t) / (10.0 + t) - if decay > max_decay: - decay = max_decay - cdef weight_t one_minus_decay = 1-decay - cdef int i - for i in range(nr_weight): # num_threads=4, schedule='static'): - ema[i] -= one_minus_decay * (ema[i] - weights[i]) - - def lstm_forward_training( np.ndarray params, np.ndarray c_init, np.ndarray h_init, np.ndarray X, np.ndarray lengths @@ -847,17 +817,6 @@ def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_stat return dX, numpy.concatenate(grad_parts) -def _split_directions(X, dirs): - if dirs == 1: - return [X] - else: - X_ = X.reshape((X.shape[0], -1, dirs)) - Xs = [] - for d in range(dirs): - Xs.append(numpy.ascontiguousarray(X_[:, d])) - return Xs - - cdef int _lstm_backward_training( int d, int N, int nO, int nI, int nT, float* dX, @@ -950,54 +909,6 @@ cdef int _lstm_backward_training( ) -def _split_weights(np.ndarray params, int i, int nO, int nI, int params_i): - Wx_size = 4 * nO * nI - bx_size = 4 * nO - Wh_size = 4 * nO * nO - bh_size = 4 * nO - Wx = params[params_i : params_i + Wx_size].reshape((4 * nO, nI)) - params_i += Wx_size - bx = params[params_i : params_i + bx_size].reshape((4 * nO,)) - params_i += bx_size - Wh = params[params_i : params_i + Wh_size].reshape((4 * nO, nO)) - params_i += Wh_size - bh = params[params_i : params_i + bh_size].reshape((4 * nO,)) - params_i += bh_size - return ((Wx, bx), (Wh, bh)), params_i - - -def _transpose_weights(params): - # Transpose the parameters so that the gates are the last dimension. This - # makes it easier to fuse. - (Wx, bx), (Wh, bh) = params - Wx = Wx.reshape((4, -1, Wx.shape[-1])) - Wx = Wx.transpose((1, 0, 2)).reshape((-1, Wx.shape[-1])) - bx = bx.reshape((4, -1)).transpose((1, 0)).reshape((-1,)) - Wh = Wh.reshape((4, -1, Wh.shape[-1])) - Wh = Wh.transpose((1, 0, 2)).reshape((-1, Wh.shape[-1])) - bh = bh.reshape((4, -1)).transpose((1, 0)).reshape((-1,)) - ascontig = numpy.ascontiguousarray - Wx = ascontig(Wx) - Wh = ascontig(Wh) - bias = ascontig(bx) + bh - return Wx, Wh, bias - - -def _untranspose_unsplit_weights(params): - Wx, Wh, bias = params - nO = Wh.shape[1] - nI = Wx.shape[1] - Wx = Wx.reshape((-1, 4, nI)).transpose((1, 0, 2)).reshape((-1, nI)) - Wh = Wh.reshape((-1, 4, nO)).transpose((1, 0, 2)).reshape((-1, nO)) - bias = bias.reshape((-1, 4)).transpose((1, 0)).reshape((-1,)) - zeros = numpy.zeros(bias.shape, dtype="f") - return numpy.concatenate((Wx.ravel(), bias, Wh.ravel(), zeros)) - - -cdef inline float sigmoid(float X) nogil: - return 1./(1. 
+ expf(-X)) - - cdef inline float dsigmoid(float y) nogil: return y*(1-y) From 17c823e06120d18441b47e0724723e69376126b2 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 15 Sep 2022 17:41:47 +0200 Subject: [PATCH 03/30] disable mypy run for Python 3.10 (#768) (#769) * disable mypy run for Python 3.10 * dot --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c81c62689..f9096029c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,6 +63,7 @@ jobs: - script: | python -m mypy thinc displayName: 'Run mypy' + condition: ne(variables['python.version'], '3.10') - task: DeleteFiles@1 inputs: From 0366934a144f244714f36b62a82b8ac32386b9ed Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 16 Sep 2022 14:34:41 +0200 Subject: [PATCH 04/30] Remove vestigial/mostly unused `backends.linalg` module (#742) * `CBlas`: Add `sscalv` * `NumpyOps`: Replace usage of `.linalg` with `numpy` and `BLAS` calls * Remove vestigial/mostly unused `backends.linalg` module * Use BLAS notation for `sscal`, add `dscal` --- setup.py | 1 - thinc/backends/cblas.pxd | 6 + thinc/backends/cblas.pyx | 24 +++ thinc/backends/linalg.pxd | 276 ----------------------------------- thinc/backends/linalg.pyx | 4 - thinc/backends/numpy_ops.pyx | 25 ++-- 6 files changed, 42 insertions(+), 294 deletions(-) delete mode 100644 thinc/backends/linalg.pxd delete mode 100644 thinc/backends/linalg.pyx diff --git a/setup.py b/setup.py index 50f1c65cc..c76034945 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,6 @@ PACKAGES = find_packages() MOD_NAMES = [ "thinc.backends.cblas", - "thinc.backends.linalg", "thinc.backends.numpy_ops", "thinc.layers.sparselinear", ] diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd index 15837e5e7..0ec778dde 100644 --- a/thinc/backends/cblas.pxd +++ b/thinc/backends/cblas.pxd @@ -13,6 +13,8 @@ ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX, ctypedef void (*daxpy_ptr)(int N, double alpha, const double* X, int incX, double *Y, int incY) nogil +ctypedef void (*sscal_ptr)(int N, float alpha, float* X, int incX) nogil +ctypedef void (*dscal_ptr)(int N, double alpha, double* X, int incX) nogil # Forward-declaration of the BlasFuncs struct. 
This struct must be opaque, so # that consumers of the CBlas class cannot become dependent on its size or @@ -33,6 +35,10 @@ cdef class CBlas: cdef daxpy_ptr daxpy(CBlas cblas) nogil cdef saxpy_ptr saxpy(CBlas cblas) nogil cdef sgemm_ptr sgemm(CBlas cblas) nogil +cdef sscal_ptr sscal(CBlas cblas) nogil +cdef dscal_ptr dscal(CBlas cblas) nogil cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil +cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil +cdef void set_dscal(CBlas cblas, dscal_ptr dscal) nogil diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx index 9eb4514d8..9348096b8 100644 --- a/thinc/backends/cblas.pyx +++ b/thinc/backends/cblas.pyx @@ -3,10 +3,20 @@ from cython.operator cimport dereference as deref from libcpp.memory cimport make_shared +# Single- and double-precision wrappers for `blis.cy.scalv` +cdef void blis_sscal(int N, float alpha, float* X, int incX) nogil: + blis.cy.scalv(blis.cy.NO_CONJUGATE, N, alpha, X, incX) + +cdef void blis_dscal(int N, double alpha, double* X, int incX) nogil: + blis.cy.scalv(blis.cy.NO_CONJUGATE, N, alpha, X, incX) + + cdef struct BlasFuncs: daxpy_ptr daxpy saxpy_ptr saxpy sgemm_ptr sgemm + sscal_ptr sscal + dscal_ptr dscal cdef class CBlas: @@ -19,6 +29,8 @@ cdef class CBlas: funcs.daxpy = blis.cy.daxpy funcs.saxpy = blis.cy.saxpy funcs.sgemm = blis.cy.sgemm + funcs.sscal = blis_sscal + funcs.dscal = blis_dscal self.ptr = make_shared[BlasFuncs](funcs) cdef daxpy_ptr daxpy(CBlas cblas) nogil: @@ -30,6 +42,12 @@ cdef saxpy_ptr saxpy(CBlas cblas) nogil: cdef sgemm_ptr sgemm(CBlas cblas) nogil: return deref(cblas.ptr).sgemm +cdef sscal_ptr sscal(CBlas cblas) nogil: + return deref(cblas.ptr).sscal + +cdef dscal_ptr dscal(CBlas cblas) nogil: + return deref(cblas.ptr).dscal + cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil: deref(cblas.ptr).daxpy = daxpy @@ -38,3 +56,9 @@ cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil: cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil: deref(cblas.ptr).sgemm = sgemm + +cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil: + deref(cblas.ptr).sscal = sscal + +cdef void set_dscal(CBlas cblas, dscal_ptr dscal) nogil: + deref(cblas.ptr).dscal = dscal diff --git a/thinc/backends/linalg.pxd b/thinc/backends/linalg.pxd deleted file mode 100644 index 494a26c30..000000000 --- a/thinc/backends/linalg.pxd +++ /dev/null @@ -1,276 +0,0 @@ -# cython: infer_types=True -# cython: cdivision=True - -cimport cython -from libc.stdint cimport int32_t -from libc.string cimport memset, memcpy -from cymem.cymem cimport Pool - - -ctypedef float weight_t - -DEF USE_BLAS = False -DEF EPS = 1e-5 - - -IF USE_BLAS: - cimport blis.cy - -cdef extern from "math.h" nogil: - weight_t exp(weight_t x) - weight_t sqrt(weight_t x) - - -cdef class Matrix: - cdef readonly Pool mem - cdef weight_t* data - cdef readonly int32_t nr_row - cdef readonly int32_t nr_col - - -cdef class Vec: - @staticmethod - cdef inline int arg_max(const weight_t* scores, const int n_classes) nogil: - if n_classes == 2: - return 0 if scores[0] > scores[1] else 1 - cdef int i - cdef int best = 0 - cdef weight_t mode = scores[0] - for i in range(1, n_classes): - if scores[i] > mode: - mode = scores[i] - best = i - return best - - @staticmethod - cdef inline weight_t max(const weight_t* x, int32_t nr) nogil: - if nr == 0: - return 0 - cdef int i - cdef weight_t mode = x[0] - for i in range(1, nr): - if x[i] > mode: - mode = 
x[i] - return mode - - @staticmethod - cdef inline weight_t sum(const weight_t* vec, int32_t nr) nogil: - cdef int i - cdef weight_t total = 0 - for i in range(nr): - total += vec[i] - return total - - @staticmethod - cdef inline weight_t norm(const weight_t* vec, int32_t nr) nogil: - cdef weight_t total = 0 - for i in range(nr): - total += vec[i] ** 2 - return sqrt(total) - - @staticmethod - cdef inline void add(weight_t* output, const weight_t* x, - weight_t inc, int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - Vec.add_i(output, inc, nr) - - @staticmethod - cdef inline void add_i(weight_t* vec, weight_t inc, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] += inc - - @staticmethod - cdef inline void mul(weight_t* output, const weight_t* vec, weight_t scal, - int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.mul_i(output, scal, nr) - - @staticmethod - cdef inline void mul_i(weight_t* vec, weight_t scal, int32_t nr) nogil: - cdef int i - IF USE_BLAS: - blis.cy.scalv(BLIS_NO_CONJUGATE, nr, scal, vec, 1) - ELSE: - for i in range(nr): - vec[i] *= scal - - @staticmethod - cdef inline void pow(weight_t* output, const weight_t* vec, weight_t scal, - int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.pow_i(output, scal, nr) - - @staticmethod - cdef inline void pow_i(weight_t* vec, const weight_t scal, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] **= scal - - @staticmethod - @cython.cdivision(True) - cdef inline void div(weight_t* output, const weight_t* vec, weight_t scal, - int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.div_i(output, scal, nr) - - @staticmethod - @cython.cdivision(True) - cdef inline void div_i(weight_t* vec, const weight_t scal, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] /= scal - - @staticmethod - cdef inline void exp(weight_t* output, const weight_t* vec, int32_t nr) nogil: - memcpy(output, vec, sizeof(output[0]) * nr) - Vec.exp_i(output, nr) - - @staticmethod - cdef inline void exp_i(weight_t* vec, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] = exp(vec[i]) - - @staticmethod - cdef inline void reciprocal_i(weight_t* vec, int32_t nr) nogil: - cdef int i - for i in range(nr): - vec[i] = 1.0 / vec[i] - - @staticmethod - cdef inline weight_t mean(const weight_t* X, int32_t nr_dim) nogil: - cdef weight_t mean = 0. - for x in X[:nr_dim]: - mean += x - return mean / nr_dim - - @staticmethod - cdef inline weight_t variance(const weight_t* X, int32_t nr_dim) nogil: - # See https://www.johndcook.com/blog/standard_deviation/ - cdef double m = X[0] - cdef double v = 0. 
- for i in range(1, nr_dim): - diff = X[i]-m - m += diff / (i+1) - v += diff * (X[i] - m) - return v / nr_dim - - -cdef class VecVec: - @staticmethod - cdef inline void add(weight_t* output, - const weight_t* x, - const weight_t* y, - weight_t scale, - int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - VecVec.add_i(output, y, scale, nr) - - @staticmethod - cdef inline void add_i(weight_t* x, - const weight_t* y, - weight_t scale, - int32_t nr) nogil: - cdef int i - IF USE_BLAS: - blis.cy.axpyv(BLIS_NO_CONJUGATE, nr, scale, y, 1, x, 1) - ELSE: - for i in range(nr): - x[i] += y[i] * scale - - @staticmethod - cdef inline void batch_add_i(weight_t* x, - const weight_t* y, - weight_t scale, - int32_t nr, int32_t nr_batch) nogil: - # For fixed x, matrix of y - cdef int i, _ - for _ in range(nr_batch): - VecVec.add_i(x, - y, scale, nr) - y += nr - - @staticmethod - cdef inline void add_pow(weight_t* output, - const weight_t* x, const weight_t* y, weight_t power, int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - VecVec.add_pow_i(output, y, power, nr) - - - @staticmethod - cdef inline void add_pow_i(weight_t* x, - const weight_t* y, weight_t power, int32_t nr) nogil: - cdef int i - for i in range(nr): - x[i] += y[i] ** power - - @staticmethod - cdef inline void mul(weight_t* output, - const weight_t* x, const weight_t* y, int32_t nr) nogil: - memcpy(output, x, sizeof(output[0]) * nr) - VecVec.mul_i(output, y, nr) - - @staticmethod - cdef inline void mul_i(weight_t* x, - const weight_t* y, int32_t nr) nogil: - cdef int i - for i in range(nr): - x[i] *= y[i] - - @staticmethod - cdef inline weight_t dot( - const weight_t* x, const weight_t* y, int32_t nr) nogil: - cdef int i - cdef weight_t total = 0 - for i in range(nr): - total += x[i] * y[i] - return total - - @staticmethod - cdef inline int arg_max_if_true( - const weight_t* scores, const int* is_valid, const int n_classes) nogil: - cdef int i - cdef int best = -1 - for i in range(n_classes): - if is_valid[i] and (best == -1 or scores[i] > scores[best]): - best = i - return best - - @staticmethod - cdef inline int arg_max_if_zero( - const weight_t* scores, const weight_t* costs, const int n_classes) nogil: - cdef int i - cdef int best = -1 - for i in range(n_classes): - if costs[i] == 0 and (best == -1 or scores[i] > scores[best]): - best = i - return best - - -cdef class Mat: - @staticmethod - cdef inline void mean_row(weight_t* Ex, - const weight_t* mat, int32_t nr_row, int32_t nr_col) nogil: - memset(Ex, 0, sizeof(Ex[0]) * nr_col) - for i in range(nr_row): - VecVec.add_i(Ex, &mat[i * nr_col], 1.0, nr_col) - Vec.mul_i(Ex, 1.0 / nr_row, nr_col) - - @staticmethod - cdef inline void var_row(weight_t* Vx, - const weight_t* mat, const weight_t* Ex, - int32_t nr_row, int32_t nr_col, weight_t eps) nogil: - # From https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - if nr_row == 0 or nr_col == 0: - return - cdef weight_t sum_, sum2 - for i in range(nr_col): - sum_ = 0.0 - sum2 = 0.0 - for j in range(nr_row): - x = mat[j * nr_col + i] - sum2 += (x - Ex[i]) ** 2 - sum_ += x - Ex[i] - Vx[i] = (sum2 - sum_**2 / nr_row) / nr_row - Vx[i] += eps diff --git a/thinc/backends/linalg.pyx b/thinc/backends/linalg.pyx deleted file mode 100644 index 4979e8aa9..000000000 --- a/thinc/backends/linalg.pyx +++ /dev/null @@ -1,4 +0,0 @@ -try: - import blis.py -except ImportError: - pass diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 130aec643..884c74941 100644 --- a/thinc/backends/numpy_ops.pyx +++ 
b/thinc/backends/numpy_ops.pyx @@ -20,8 +20,7 @@ cimport blis.cy from .. import registry from ..util import copy_array, get_array_module from ..types import DeviceTypes, DTypes, Shape, ArrayXd -from .cblas cimport CBlas, daxpy, saxpy -from .linalg cimport VecVec, Vec +from .cblas cimport CBlas, daxpy, saxpy, sscal from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights try: @@ -463,7 +462,7 @@ class NumpyOps(Ops): and values.ndim == 2 \ and values.shape[0] == indices.shape[0] \ and values.shape[1] == table.shape[1]: - cpu_scatter_add(table.data, + cpu_scatter_add(self.cblas(), table.data, indices.data, values.data, indices.shape[0], table.shape[1]) else: @@ -479,10 +478,11 @@ class NumpyOps(Ops): _check_compatible_shape(weights, mom1) _check_compatible_shape(weights, mom2) - _adam_momentum(gradient.data, mom1.data, mom2.data, + cdef CBlas cblas = self.cblas() + _adam_momentum(cblas, gradient.data, mom1.data, mom2.data, weights.shape[0], beta1, beta2, eps, learn_rate) - VecVec.add_i(weights.data, - gradient.data, -learn_rate, weights.shape[0]) + saxpy(cblas)(weights.shape[0], -learn_rate, gradient.data, 1, weights.data, 1) + memset(gradient.data, 0, gradient.size * sizeof(float)) return weights, gradient, mom1, mom2 @@ -537,19 +537,18 @@ cdef void cpu_position_encode(float* output, float period, int N, int D) nogil: output += D -cdef void cpu_scatter_add(float* dest, +cdef void cpu_scatter_add(CBlas cblas, float* dest, const int* indices, const float* src, int nr_id, int nr_col) nogil: cdef int i for i in range(nr_id): id_ = indices[i] if id_ >= 0: - VecVec.add_i(&dest[id_*nr_col], - &src[i*nr_col], 1., nr_col) + saxpy(cblas)(nr_col, 1., &src[i*nr_col], 1, &dest[id_*nr_col], 1) @cython.cdivision(True) -cdef void _adam_momentum(float* gradient, float* mom1, float* mom2, +cdef void _adam_momentum(CBlas cblas, float* gradient, float* mom1, float* mom2, int nr_weight, float beta1, float beta2, float eps, float learn_rate) nogil: # Calculate Adam on CPU, fused. 
@@ -567,9 +566,9 @@ cdef void _adam_momentum(float* gradient, float* mom1, float* mom2, idx = 0 for i in range(steps): step_size = min(64, nr_weight-idx) - Vec.mul_i(mom1, beta1, step_size) - VecVec.add_i(mom1, gradient, one_minus_beta1, step_size) - Vec.mul_i(mom2, beta2, step_size) + sscal(cblas)(step_size, beta1, mom1, 1) + saxpy(cblas)(step_size, one_minus_beta1, gradient, 1, mom1, 1) + sscal(cblas)(step_size, beta2, mom2, 1) for j in range(step_size): mom2[j] += one_minus_beta2 * gradient[j] ** 2 for j in range(step_size): From de40bdf352e58f8808d11099adb3b157dc91de49 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 16 Sep 2022 19:25:58 +0200 Subject: [PATCH 05/30] Standardize `blis` calls in `NumpyOps` (#763) * `NumpyOps`: Move `blis` detection to `compat` module, replace `blis.cy.gemm` calls with `CBlas` calls * `NumpOps`: Call `self.cblas()` instead of directly instantiating `CBlas` * `CBlas`: Add `dgemm` * `NumpyOps`: Use `CBlas.?gemm` in `gemm` --- thinc/backends/cblas.pxd | 7 ++- thinc/backends/cblas.pyx | 8 +++ thinc/backends/numpy_ops.pyx | 109 ++++++++++++++++++++++------------- thinc/compat.py | 9 +++ 4 files changed, 93 insertions(+), 40 deletions(-) diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd index 0ec778dde..a789ef4a3 100644 --- a/thinc/backends/cblas.pxd +++ b/thinc/backends/cblas.pxd @@ -2,8 +2,11 @@ from libcpp.memory cimport shared_ptr ctypedef void (*sgemm_ptr)(bint transA, bint transB, int M, int N, int K, - float alpha, const float* A, int lda, const float *B, + float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, int ldc) nogil +ctypedef void (*dgemm_ptr)(bint transA, bint transB, int M, int N, int K, + double alpha, const double* A, int lda, const double* B, + int ldb, double beta, double* C, int ldc) nogil ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX, @@ -35,10 +38,12 @@ cdef class CBlas: cdef daxpy_ptr daxpy(CBlas cblas) nogil cdef saxpy_ptr saxpy(CBlas cblas) nogil cdef sgemm_ptr sgemm(CBlas cblas) nogil +cdef dgemm_ptr dgemm(CBlas cblas) nogil cdef sscal_ptr sscal(CBlas cblas) nogil cdef dscal_ptr dscal(CBlas cblas) nogil cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil +cdef void set_dgemm(CBlas cblas, dgemm_ptr dgemm) nogil cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil cdef void set_dscal(CBlas cblas, dscal_ptr dscal) nogil diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx index 9348096b8..bb479e56d 100644 --- a/thinc/backends/cblas.pyx +++ b/thinc/backends/cblas.pyx @@ -15,6 +15,7 @@ cdef struct BlasFuncs: daxpy_ptr daxpy saxpy_ptr saxpy sgemm_ptr sgemm + dgemm_ptr dgemm sscal_ptr sscal dscal_ptr dscal @@ -29,6 +30,7 @@ cdef class CBlas: funcs.daxpy = blis.cy.daxpy funcs.saxpy = blis.cy.saxpy funcs.sgemm = blis.cy.sgemm + funcs.dgemm = blis.cy.dgemm funcs.sscal = blis_sscal funcs.dscal = blis_dscal self.ptr = make_shared[BlasFuncs](funcs) @@ -42,6 +44,9 @@ cdef saxpy_ptr saxpy(CBlas cblas) nogil: cdef sgemm_ptr sgemm(CBlas cblas) nogil: return deref(cblas.ptr).sgemm +cdef dgemm_ptr dgemm(CBlas cblas) nogil: + return deref(cblas.ptr).dgemm + cdef sscal_ptr sscal(CBlas cblas) nogil: return deref(cblas.ptr).sscal @@ -57,6 +62,9 @@ cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil: cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil: deref(cblas.ptr).sgemm = sgemm +cdef void set_dgemm(CBlas cblas, dgemm_ptr dgemm) nogil: + 
deref(cblas.ptr).dgemm = dgemm + cdef void set_sscal(CBlas cblas, sscal_ptr sscal) nogil: deref(cblas.ptr).sscal = sscal diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 884c74941..45d3d9093 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -15,19 +15,13 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap from murmurhash.mrmr cimport hash64 cimport numpy as np -cimport blis.cy from .. import registry from ..util import copy_array, get_array_module from ..types import DeviceTypes, DTypes, Shape, ArrayXd -from .cblas cimport CBlas, daxpy, saxpy, sscal +from .cblas cimport CBlas, daxpy, saxpy, sgemm, dgemm, sscal from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights - -try: - import blis.py - has_blis = True -except ImportError: - has_blis = False +from ..compat import has_blis cdef extern from "math.h": @@ -90,11 +84,45 @@ class NumpyOps(Ops): raise ValueError(f"Provided 'y' array should be 2-dimensional, but found {y.ndim} dimension(s).") if not self.use_blis: # delegate to base Ops return super().gemm(x, y, out=out, trans1=trans1, trans2=trans2) + x = self.as_contig(x) y = self.as_contig(y) + + cdef int nM = x.shape[0] if not trans1 else x.shape[1] + cdef int nK = x.shape[1] if not trans1 else x.shape[0] + cdef int nK_b = y.shape[0] if not trans2 else y.shape[1] + cdef int nN = y.shape[1] if not trans2 else y.shape[0] + if nK != nK_b: + msg = "Shape mismatch for blis.gemm: (%d, %d), (%d, %d)" + raise ValueError(msg % (nM, nK, nK_b, nN)) + if out is not None: out = self.as_contig(out) - return blis.py.gemm(x, y, out=out, trans1=trans1, trans2=trans2, beta=0.) + else: + # Can be uninitialized as 'beta' is zero. + out = numpy.empty((nM, nN), dtype=x.dtype) + + cblas = self.cblas() + if x.dtype == "float32" and y.dtype == "float32" and out.dtype == "float32": + sgemm(cblas)(trans1, trans2, + nM, nN, nK, + 1.0, + (x.data), x.shape[1], + (y.data), y.shape[1], + 0.0, + (out.data), out.shape[1]) + elif x.dtype == "float64" and y.dtype == "float64" and out.dtype == "float64": + dgemm(cblas)(trans1, trans2, + nM, nN, nK, + 1.0, + (x.data), x.shape[1], + (y.data), y.shape[1], + 0.0, + (out.data), out.shape[1]) + else: + raise ValueError(f"unsupported or mismatching array data types; got '{x.dtype}', '{y.dtype}', '{out.dtype}'") + + return out def relu(self, np.ndarray X, inplace=False): cdef np.ndarray Y @@ -137,7 +165,7 @@ class NumpyOps(Ops): ): assert H0.shape[0] == C0.shape[0] assert H0.shape[1] == C0.shape[1] - Y, fwd_state = lstm_forward_training(params, H0, C0, X, size_at_t) + Y, fwd_state = lstm_forward_training(self.cblas(), params, H0, C0, X, size_at_t) return Y, fwd_state def lstm_forward_inference( @@ -148,13 +176,13 @@ class NumpyOps(Ops): np.ndarray X, np.ndarray size_at_t ): - Y, _ = lstm_forward_training(params, H0, C0, X, size_at_t) + Y, _ = lstm_forward_training(self.cblas(), params, H0, C0, X, size_at_t) return Y def backprop_lstm( self, np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state ): - dX, d_params = backprop_lstm(dY, lengths, params, fwd_state) + dX, d_params = backprop_lstm(self.cblas(), dY, lengths, params, fwd_state) return dX, d_params def maxout(self, reals3d_ft X): @@ -585,7 +613,7 @@ cdef void _adam_momentum(CBlas cblas, float* gradient, float* mom1, float* mom2, idx += step_size -def lstm_forward_training( +def lstm_forward_training(CBlas cblas, np.ndarray params, np.ndarray c_init, np.ndarray h_init, np.ndarray X, np.ndarray lengths ): @@ -627,6 
+655,7 @@ def lstm_forward_training( Cid = C[i, d] Gid = G[i, d] _lstm_forward_training( + cblas, d, N, nO, nI, nT, Gid, Yid.data, @@ -647,6 +676,7 @@ def lstm_forward_training( cdef int _lstm_forward_training( + CBlas cblas, int d, int N, int nO, int nI, int nT, np.ndarray G, float* Y, @@ -660,13 +690,13 @@ cdef int _lstm_forward_training( float* Ct2, ) except -1: cdef double one = 1.0 - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, + sgemm(cblas)(False, True, N, nO*4, nI, one, - X, nI, 1, - Wx, nI, 1, + X, nI, + Wx, nI, one, - G.data, nO*4, 1 + G.data, nO*4 ) cdef int t, batch_size cdef int seq_i = 0 if d == 0 else N @@ -684,13 +714,13 @@ cdef int _lstm_forward_training( Gt3_ = G[seq_i : seq_i+batch_size] Gt3 = Gt3_.data # Now do the actual calculation - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE, + sgemm(cblas)(False, True, batch_size, nO*4, nO, one, - Yt2, nO, 1, - Wh, nO, 1, + Yt2, nO, + Wh, nO, one, - Gt3, nO*4, 1 + Gt3, nO*4 ) # This is super weird: if we remove this add, it gets slower? I guess # it does cache prefetching or something? @@ -714,7 +744,7 @@ cdef int _lstm_forward_training( memcpy(Ct2, Ct3, sizeof(Ct3[0]) * batch_size * nO) -def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state): +def backprop_lstm(CBlas cblas, np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state): xp = numpy cdef np.ndarray Y cdef np.ndarray G @@ -791,7 +821,7 @@ def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_stat assert (dYid.shape[0], dYid.shape[1]) == (N, nO) assert (dC.shape[0], dC.shape[1]) == (N, nO) assert (dG.shape[0], dG.shape[1]) == (N, nO*4) - _lstm_backward_training(d, N, nO, dX.shape[1], nT, + _lstm_backward_training(cblas, d, N, nO, dX.shape[1], nT, dX.data, dYid.data, dC.data, @@ -817,6 +847,7 @@ def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_stat cdef int _lstm_backward_training( + CBlas cblas, int d, int N, int nO, int nI, int nT, float* dX, float* dY, @@ -861,36 +892,36 @@ cdef int _lstm_backward_training( ) # Backprop hidden-to-hidden w.r.t. hidden. # dYt2 += dGt3 @ Wh - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(False, False, batch_size, nO, nO*4, one, - dGt3, nO*4, 1, - Wh, nO, 1, + dGt3, nO*4, + Wh, nO, one, - dYt2, nO, 1 + dYt2, nO ) seq_t3 = seq_t2 size_t3 = size_t2 # Backprop input-to-hidden w.r.t. weights. # dWx += dG @ X - blis.cy.gemm(blis.cy.TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(True, False, nO*4, nI, N, one, - dG, nO*4, 1, - X, nI, 1, + dG, nO*4, + X, nI, one, - dWx, nI, 1 + dWx, nI ) # Backprop hidden-to-hidden w.r.t weights. # dWh += dG @ Y - blis.cy.gemm(blis.cy.TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(True, False, nO*4, nO, N, one, - dG, nO*4, 1, - Y, nO, 1, + dG, nO*4, + Y, nO, one, - dWh, nO, 1 + dWh, nO ) # Backprop bias for i in range(N): @@ -898,13 +929,13 @@ cdef int _lstm_backward_training( d_bias[j] += dG[i*nO*4+j] # Backprop input-to-hidden w.r.t. 
input - blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE, + sgemm(cblas)(False, False, N, nI, nO*4, one, - dG, nO*4, 1, - Wx, nI, 1, + dG, nO*4, + Wx, nI, one, - dX, nI, 1 + dX, nI ) diff --git a/thinc/compat.py b/thinc/compat.py index 2d8b40345..9e80f8dfe 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -87,4 +87,13 @@ has_os_signpost = False +try: # pragma: no cover + import blis + + has_blis = True +except ImportError: + blis = None + has_blis = False + + has_gpu = has_cupy_gpu or has_torch_mps_gpu From c8ac07fe734aaee43d8197bbf5c9a370f692766b Mon Sep 17 00:00:00 2001 From: kadarakos Date: Fri, 28 Oct 2022 12:26:18 +0200 Subject: [PATCH 06/30] Cross entropy fix (#647) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * return logloss instead of squared differrence * check whether to comput binary or categorical loss value * function to apply label smoothing to 2d array * force exclusive classes * formatting * mypy debug * bugfix * compare cross entropy to torch * fix type and error message * updating cross-entropy tests * all categorical crossentropy tests updated * sequence crossentropy test * rearrange if statements * sequence ce negprefix test start * all tests for (sequence) cross entropy * use CategoricalCrossentropy as loss * don't run conversion and validation twice in __call__ * add type for truths in convert_truths (thnx @ richardpaulhudson) * fix one-hot check and no unexpected error branch * cupy support for torch comparison * import floats2d * hopefully right type to pass old torch cross-entropy * nonstrict sum to 1 * typo * remove redundant work for sequential cross entropy * type typo * fix imports * remove misleading comments * assertion for clarity * add back mistakenly removed imports * throw error rather than assert * legacy versions and tests for crossentropy + sequential * type fix * Update thinc/legacy/loss.py Co-authored-by: Sofie Van Landeghem * legacy cross-entropy import through registry * no legacy test module * type fix * hacking types for mypy * return type * Update thinc/legacy/loss.py Co-authored-by: Sofie Van Landeghem * Update thinc/legacy/__init__.py Co-authored-by: Sofie Van Landeghem * initial functional sparse ce los * separate functionality for SparseCE and CategoricalCrossentropy * fix missing value type * correcting label smoothing param contraint * test new label smooth validation error * less than 0 input validation * string concat * small update to error msg * fix max smoothing coefficient * double check error message * Categorical and Sparse factories and tests * Update thinc/util.py Co-authored-by: Adriane Boyd * update test with less strict match * Fix types, pair-hacked with @kadarakos * (Sparse)CategoricalEntropy: support Ragged guesses Since we can encoder sequences as Ragged, this could replace (Sparse)SequenceCategoricalEntropy. 
* follow updated api * Update thinc/util.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * indent fix * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * remove unnecessary list copy * add type to truths * fix missing assignment * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * rever suggestion * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/legacy/loss.py Co-authored-by: Madeesh Kannan * Update thinc/tests/test_loss.py Co-authored-by: Madeesh Kannan * Update thinc/util.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * Update thinc/loss.py Co-authored-by: Madeesh Kannan * move check1d out of loss and more general signature * mypy fix * SparseCE rename Co-authored-by: Kádár Ákos Co-authored-by: Sofie Van Landeghem Co-authored-by: Adriane Boyd Co-authored-by: Daniël de Kok Co-authored-by: Madeesh Kannan --- examples/mnist.py | 5 +- thinc/legacy/__init__.py | 8 + thinc/legacy/loss.py | 282 ++++++++++++++++++ thinc/loss.py | 503 +++++++++++++++++++++----------- thinc/tests/test_loss.py | 602 +++++++++++++++++++++++++++++++-------- thinc/util.py | 28 +- 6 files changed, 1146 insertions(+), 282 deletions(-) create mode 100644 thinc/legacy/__init__.py create mode 100644 thinc/legacy/loss.py diff --git a/examples/mnist.py b/examples/mnist.py index 971f4645b..790bcc640 100644 --- a/examples/mnist.py +++ b/examples/mnist.py @@ -4,6 +4,7 @@ """ # pip install thinc ml_datasets typer from thinc.api import Model, chain, Relu, Softmax, Adam +from thinc.api import CategoricalCrossentropy import ml_datasets from wasabi import msg from tqdm import tqdm @@ -21,6 +22,7 @@ def main( ) # Load the data (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() + loss_func = CategoricalCrossentropy() # Set any missing shapes for the model. 
model.initialize(X=train_X[:5], Y=train_Y[:5]) train_data = model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True) @@ -30,7 +32,8 @@ def main( for i in range(n_iter): for X, Y in tqdm(train_data, leave=False): Yh, backprop = model.begin_update(X) - backprop(Yh - Y) + grad, loss = loss_func(Yh, Y) + backprop(grad) model.finish_update(optimizer) # Evaluate and print progress correct = 0 diff --git a/thinc/legacy/__init__.py b/thinc/legacy/__init__.py new file mode 100644 index 000000000..ced5121ba --- /dev/null +++ b/thinc/legacy/__init__.py @@ -0,0 +1,8 @@ +from .loss import LegacyCategoricalCrossentropy +from .loss import LegacySequenceCategoricalCrossentropy + + +__all__ = [ + "LegacyCategoricalCrossentropy", + "LegacySequenceCategoricalCrossentropy" +] diff --git a/thinc/legacy/loss.py b/thinc/legacy/loss.py new file mode 100644 index 000000000..439a2ca21 --- /dev/null +++ b/thinc/legacy/loss.py @@ -0,0 +1,282 @@ +from typing import Optional, Sequence, Dict, Union, Tuple +from typing import cast, List +from ..types import Floats2d, Ints1d, Ints2d +from ..config import registry +from ..util import to_categorical, get_array_module +from ..loss import IntsOrFloatsOrStrs, Loss +from ..loss import _make_mask, _make_mask_by_value + + +TruthsT = Union[List[str], List[int], Ints1d, Floats2d] + + +class LegacyCategoricalCrossentropy(Loss): + names: Optional[Sequence[str]] + missing_value: Optional[Union[str, int]] + _name_to_i: Dict[str, int] + + def __init__( + self, + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, + ): + self.normalize = normalize + self.names = names + self.missing_value = missing_value + self.neg_prefix = neg_prefix + self.label_smoothing = label_smoothing + if names is not None: + self._name_to_i = {name: i for i, name in enumerate(names)} + else: + self._name_to_i = {} + + def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: + xp = get_array_module(guesses) + missing = [] + negatives_mask = None + if self.names: + negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + missing_value = self.missing_value + # Convert list of ints or list of strings + if isinstance(truths, list): + if len(truths): + if isinstance(truths[0], int): + for i, value in enumerate(truths): + if not isinstance(value, int): + raise ValueError( + "All values in the truths list have to " + "have the same time. The first value was " + f"detected to be integer, but found {type(value)}." + ) + if value == missing_value: + missing.append(i) + else: + truths = cast(List[str], truths) + if self.names is None: + msg = ( + "Cannot calculate loss from list of strings without names. " + "You can pass the names as a keyword argument when you " + "create the loss object, " + "e.g. CategoricalCrossentropy(names=['dog', 'cat'])" + ) + raise ValueError(msg) + for i, value in enumerate(truths): + if not isinstance(value, str): + raise ValueError( + "All values in the truths list have to " + "have the same time. The first value was " + f"detected to be string, but found {type(value)}." 
+ ) + if value == missing_value: + truths[i] = self.names[0] + missing.append(i) + elif ( + value + and self.neg_prefix + and value.startswith(self.neg_prefix) + ): + truths[i] = value[len(self.neg_prefix) :] + neg_index = self._name_to_i[truths[i]] + negatives_mask[i] = 0 # type: ignore + negatives_mask[i][neg_index] = -1 # type: ignore + truths = [self._name_to_i[name] for name in truths] + truths = xp.asarray(truths, dtype="i") + mask = _make_mask(guesses, missing) + else: + mask = _make_mask_by_value(truths, guesses, missing_value) + truths = cast(Union[Ints1d, Floats2d], truths) + if truths.ndim != guesses.ndim: + # transform categorical values to one-hot encoding + truths_2d = to_categorical( + truths, + n_classes=guesses.shape[-1], + label_smoothing=self.label_smoothing, + ) + else: + if self.label_smoothing: + raise ValueError( + "Label smoothing is only applied, when truths have type " + "List[str], List[int] or Ints1d, but it seems like Floats2d " + "was provided." + ) + truths_2d = cast(Floats2d, truths) + # Transform negative annotations to a 0 for the negated value + # + mask all other values for that row + if negatives_mask is not None: + truths_2d *= negatives_mask + truths_2d[truths_2d == -1] = 0 + negatives_mask[negatives_mask == -1] = 1 + mask *= negatives_mask + return cast(Floats2d, truths_2d), mask + + def __call__( + self, guesses: Floats2d, truths: TruthsT + ) -> Tuple[Floats2d, float]: + d_truth = self.get_grad(guesses, truths) + return (d_truth, self._get_loss_from_grad(d_truth)) + + def get_grad(self, guesses: Floats2d, truths: TruthsT) -> Floats2d: + target, mask = self.convert_truths(truths, guesses) + xp = get_array_module(target) + if guesses.shape != target.shape: # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." + raise ValueError(err) + elif xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." + raise ValueError(err) + elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
+ raise ValueError(err) + difference = guesses - target + difference *= mask + if self.normalize: + difference = difference / guesses.shape[0] + return difference + + def get_loss(self, guesses: Floats2d, truths: TruthsT) -> float: + d_truth = self.get_grad(guesses, truths) + return self._get_loss_from_grad(d_truth) + + def _get_loss_from_grad(self, d_truth: Floats2d) -> float: + # TODO: Add overload for axis=None case to sum + return (d_truth**2).sum() # type: ignore + + +class LegacySequenceCategoricalCrossentropy(Loss): + def __init__( + self, + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, + ): + self.cc = LegacyCategoricalCrossentropy( + normalize=False, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) + self.normalize = normalize + + def __call__( + self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] + ) -> Tuple[List[Floats2d], float]: + grads = self.get_grad(guesses, truths) + loss = self._get_loss_from_grad(grads) + return grads, loss + + def get_grad( + self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] + ) -> List[Floats2d]: + if len(guesses) != len(truths): # pragma: no cover + err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" + raise ValueError(err) + n = len(guesses) + d_scores = [] + for yh, y in zip(guesses, truths): + d_yh = self.cc.get_grad(yh, y) + if self.normalize: + d_yh /= n + d_scores.append(d_yh) + return d_scores + + def get_loss( + self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] + ) -> float: + return self._get_loss_from_grad(self.get_grad(guesses, truths)) + + def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: + loss = 0.0 + for grad in grads: + loss += self.cc._get_loss_from_grad(grad) # type: ignore + return loss + + +@registry.losses("CategoricalCrossentropy.v1") +def configure_CategoricalCrossentropy_v1( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, +) -> LegacyCategoricalCrossentropy: + return LegacyCategoricalCrossentropy( + normalize=normalize, names=names, missing_value=missing_value + ) + + +@registry.losses("CategoricalCrossentropy.v2") +def configure_CategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, +) -> LegacyCategoricalCrossentropy: + return LegacyCategoricalCrossentropy( + normalize=normalize, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + ) + + +@registry.losses("CategoricalCrossentropy.v3") +def configure_CategoricalCrossentropy_v3( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, +) -> LegacyCategoricalCrossentropy: + return LegacyCategoricalCrossentropy( + normalize=normalize, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) + + +@registry.losses("SequenceCategoricalCrossentropy.v1") +def configure_SequenceCategoricalCrossentropy_v1( + *, normalize: bool = True, names: Optional[Sequence[str]] = None +) -> LegacySequenceCategoricalCrossentropy: + return LegacySequenceCategoricalCrossentropy(normalize=normalize, 
names=names) + + +@registry.losses("SequenceCategoricalCrossentropy.v2") +def configure_SequenceCategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + neg_prefix: Optional[str] = None, +) -> LegacySequenceCategoricalCrossentropy: + return LegacySequenceCategoricalCrossentropy( + normalize=normalize, names=names, neg_prefix=neg_prefix + ) + + +@registry.losses("SequenceCategoricalCrossentropy.v3") +def configure_SequenceCategoricalCrossentropy_v3( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, +) -> LegacySequenceCategoricalCrossentropy: + return LegacySequenceCategoricalCrossentropy( + normalize=normalize, + names=names, + neg_prefix=neg_prefix, + missing_value=missing_value, + label_smoothing=label_smoothing, + ) diff --git a/thinc/loss.py b/thinc/loss.py index 990b30df1..5a81170d0 100644 --- a/thinc/loss.py +++ b/thinc/loss.py @@ -1,17 +1,19 @@ from typing import Tuple, Sequence, cast, TypeVar, Generic, Any, Union, Optional, List from typing import Dict -from .types import Floats2d, Ints1d -from .util import get_array_module, to_categorical +from .types import Floats2d, Ints1d, Ragged, ArrayXd +from .util import get_array_module, to_categorical, smooth_one_hot +from .util import is_xp_array from .config import registry - LossT = TypeVar("LossT") GradT = TypeVar("GradT") GuessT = TypeVar("GuessT") TruthT = TypeVar("TruthT") +FloatsOrRaggedT = TypeVar("FloatsOrRaggedT", Floats2d, Ragged) IntsOrFloats = Union[Ints1d, Floats2d] IntsOrFloatsOrStrs = Union[Ints1d, Floats2d, Sequence[int], Sequence[str]] +Categories1d = Union[Ints1d, Sequence[int], Sequence[str]] class Loss(Generic[GuessT, TruthT, GradT, LossT]): # pragma: no cover @@ -34,7 +36,118 @@ def get_loss(self, guesses: GuessT, truths: TruthT) -> LossT: ... -class CategoricalCrossentropy(Loss): +class CategoricalCrossentropyBase(Loss): + normalize: bool + + def _validate_input(self, guesses: FloatsOrRaggedT, target: Floats2d) -> None: + guesses_f2d = _to_array(guesses) + xp = get_array_module(target) + if not xp.allclose(guesses_f2d.sum(axis=1), 1.0): + raise ValueError( + "Cannot calculate CategoricalCrossentropy if " + "some rows of 'guesses' are not " + "valid categorical distributions (do not sum to 1)." + ) + elif guesses_f2d.shape != target.shape: # pragma: no cover + raise ValueError( + "Cannot calculate CategoricalCrossentropy loss " + f"with mismatching shapes: {guesses_f2d.shape} vs {target.shape}." + ) + elif xp.any(guesses_f2d > 1) or xp.any(guesses_f2d < 0): # pragma: no cover + raise ValueError( + "Cannot calculate CategoricalCrossentropy loss " + "with guesses outside the [0,1] interval." + ) + elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover + raise ValueError( + "Cannot calculate CategoricalCrossentropy loss " + "with truth values outside the [0,1] interval." + ) + + def _get_grad( + self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d + ) -> FloatsOrRaggedT: + difference = _to_array(guesses) - target + difference *= mask + if self.normalize: + # FIXME: normalized by the number of sequences, also support normalizing + # by the number of instances. 
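# (A note on the division below, grounded in the `_normalization_length`
# helper defined later in this module: a Ragged input is divided by its
# number of sequences, len(guesses.lengths), while a plain Floats2d input
# is divided by its number of rows, guesses.shape[0].)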
+ difference /= _normalization_length(guesses) + + return _array_like(difference, guesses) + + def _get_loss( + self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d + ) -> float: + guesses_f2d = _to_array(guesses) + xp = get_array_module(guesses_f2d) + logprobs = xp.log(guesses_f2d + 1e-9) + logprobs *= mask + if self.normalize: + return -(target * logprobs).sum() / _normalization_length(guesses) + else: + return -(target * logprobs).sum() + + +class CategoricalCrossentropy(CategoricalCrossentropyBase): + missing_value: Optional[Union[str, int]] + + def __init__( + self, + *, + normalize: bool = True, + missing_value: Optional[int] = None, + label_smoothing: float = 0.0, + ): + self.normalize = normalize + self.missing_value = missing_value + self.label_smoothing = label_smoothing + + def __call__( + self, guesses: FloatsOrRaggedT, truths: Floats2d + ) -> Tuple[FloatsOrRaggedT, float]: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + d_truth = self._get_grad(guesses, target, mask) + loss = self._get_loss(guesses, target, mask) + + return d_truth, loss + + def convert_truths( + self, truths: Floats2d, guesses: FloatsOrRaggedT + ) -> Tuple[Floats2d, Floats2d]: + if truths.ndim != 2: + raise ValueError(f"'truths' have to have 2 axes, but found {truths.ndim}") + guesses_2d = _to_array(guesses) + missing_value = self.missing_value + xp = get_array_module(guesses_2d) + mask = _make_mask_by_value(truths, guesses_2d, missing_value) + if not xp.allclose(truths.sum(axis=1), 1.0): + raise ValueError( + "Cannot calculate CategoricalCrossentropy. " + "All rows of 'truths' have to be a " + "valid categorical distribution (sum to 1)." + ) + if self.label_smoothing: + # Validate that array is binary, ergo one-hot at this point + if ((truths == 0) | (truths == 1)).all(): + truths = smooth_one_hot(truths, self.label_smoothing) + else: + raise ValueError("Can only apply label-smoothing to one-hot target.") + return truths, mask + + def get_grad(self, guesses: FloatsOrRaggedT, truths: Floats2d) -> FloatsOrRaggedT: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + return self._get_grad(guesses, target, mask) + + def get_loss(self, guesses: Floats2d, truths: Floats2d) -> float: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + return self._get_loss(guesses, target, mask) + + +class SparseCategoricalCrossentropy(CategoricalCrossentropyBase): names: Optional[Sequence[str]] missing_value: Optional[Union[str, int]] _name_to_i: Dict[str, int] @@ -58,142 +171,174 @@ def __init__( else: self._name_to_i = {} - def convert_truths(self, truths, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: - xp = get_array_module(guesses) + def __call__( + self, guesses: Floats2d, truths: Union[Sequence[int], Sequence[str]] + ) -> Tuple[Floats2d, float]: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + d_truth = self._get_grad(guesses, target, mask) + loss = self._get_loss(guesses, target, mask) + return (d_truth, loss) + + def _convert_ints( + self, guesses: Floats2d, truths: Sequence[int] + ) -> Tuple[Floats2d, Floats2d]: + """ + Convert Sequence[int] into a Floats2d one-hot array. 
+ """ + missing_value = self.missing_value + if missing_value is not None and not isinstance(missing_value, int): + raise ValueError( + "'truths' provided in Sequence[int] format, but " + f"'missing_value' was set to be {self.missing_value} " + f", which has type {type(self.missing_value)}." + ) missing = [] - negatives_mask = None - if self.names: - negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + for i, value in enumerate(truths): + if not isinstance(value, int): + raise ValueError( + "The first value of `truths` was of type " + f"integer, but found {type(value)} during iteration." + ) + if value == missing_value: + missing.append(i) + xp = get_array_module(guesses) + # FIXME: convert using ops? + xp_truths = cast(Ints1d, xp.asarray(truths, dtype="i")) + truths_2d = to_categorical( + xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing + ) + mask = _make_mask(guesses, missing) + return cast(Floats2d, truths_2d), mask + + def _convert_strs( + self, guesses: Floats2d, truths: Sequence[str] + ) -> Tuple[Floats2d, Floats2d]: + """ + Convert Sequence[int] into a Floats2d one-hot array. + """ + missing_value = self.missing_value - # Convert list of ints or list of strings - if isinstance(truths, list): - truths = list(truths) - if len(truths): - if isinstance(truths[0], int): - for i, value in enumerate(truths): - if value == missing_value: - missing.append(i) - else: - if self.names is None: - msg = ( - "Cannot calculate loss from list of strings without names. " - "You can pass the names as a keyword argument when you " - "create the loss object, " - "e.g. CategoricalCrossentropy(names=['dog', 'cat'])" - ) - raise ValueError(msg) - for i, value in enumerate(truths): - if value == missing_value: - truths[i] = self.names[0] - missing.append(i) - elif ( - value - and self.neg_prefix - and value.startswith(self.neg_prefix) - ): - truths[i] = value[len(self.neg_prefix) :] - neg_index = self._name_to_i[truths[i]] - negatives_mask[i] = 0 # type: ignore - negatives_mask[i][neg_index] = -1 # type: ignore - truths = [self._name_to_i[name] for name in truths] - truths = xp.asarray(truths, dtype="i") - mask = _make_mask(guesses, missing) - else: - mask = _make_mask_by_value(truths, guesses, missing_value) - if truths.ndim != guesses.ndim: - # transform categorical values to one-hot encoding - truths = to_categorical( - cast(Ints1d, truths), - n_classes=guesses.shape[-1], + if self.names is None: + raise ValueError( + "Cannot calculate loss from Sequence[str] without names. " + "You can pass the names as a keyword argument when you " + "create the loss object" + ) + elif missing_value is not None and not isinstance(missing_value, str): + raise ValueError( + "'truths' provided in Sequence[str] format, but " + f"'missing_value' was set to be {self.missing_value} " + f", which has type {type(self.missing_value)}." + ) + xp = get_array_module(guesses) + missing = [] + negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + truths_int = [] + for i, value in enumerate(truths): + if not isinstance(value, str): + raise ValueError( + "The first value of the 'truths' was of type " + f"string, but found {type(value)} during iteration." 
+ ) + # missing value + if value == missing_value: + label_i = self._name_to_i[self.names[0]] + missing.append(i) + # negative labels + elif self.neg_prefix and value.startswith(self.neg_prefix): + label_i = self._name_to_i[value[len(self.neg_prefix) :]] + negatives_mask[i] = 0 # type: ignore + negatives_mask[i][label_i] = -1 # type: ignore + # nothing special + else: + label_i = self._name_to_i[value] + truths_int.append(label_i) + xp_truths = cast(Ints1d, xp.asarray(truths_int, dtype="i")) + truths_2d = to_categorical( + xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing + ) + mask = _make_mask(guesses, missing) + truths_2d *= negatives_mask + truths_2d[truths_2d == -1] = 0 + negatives_mask[negatives_mask == -1] = 1 + mask *= negatives_mask + return cast(Floats2d, truths_2d), mask + + def convert_truths( + self, truths: Categories1d, guesses: Floats2d + ) -> Tuple[Floats2d, Floats2d]: + guesses_f2d = _to_array(guesses) + + if is_xp_array(truths): + _check_ints1d(cast(ArrayXd, truths)) + xp_truths = cast(Ints1d, truths) + truths_2d = to_categorical( + xp_truths, label_smoothing=self.label_smoothing, + n_classes=guesses_f2d.shape[1], ) - else: - if self.label_smoothing: + mask = _make_mask_by_value(truths_2d, guesses_f2d, self.missing_value) + elif isinstance(truths, Sequence): + if isinstance(truths[0], int): + truths_2d, mask = self._convert_ints( + guesses_f2d, cast(Sequence[int], truths) + ) + elif isinstance(truths[0], str): + truths_2d, mask = self._convert_strs( + guesses_f2d, cast(Sequence[str], truths) + ) + else: raise ValueError( - "Label smoothing is only applied, when truths have type " - "List[str], List[int] or Ints1d, but it seems like Floats2d " - "was provided." + "When truths to SparseCategoricalCrossentropy is provided " + "in Sequence format, elements need to be " + "of type str or int, but first element " + f"was found to be {type(truths[0])}." ) - # Transform negative annotations to a 0 for the negated value - # + mask all other values for that row - if negatives_mask is not None: - truths *= negatives_mask - truths[truths == -1] = 0 - negatives_mask[negatives_mask == -1] = 1 - mask *= negatives_mask - return truths, mask + else: + raise ValueError( + "Truths have to be provided either as 1D " + "numpy/cupy integer array or as Sequence[int] or " + "Sequence[str], but truths has different type." + ) - def __call__( - self, guesses: Floats2d, truths: IntsOrFloatsOrStrs - ) -> Tuple[Floats2d, float]: - d_truth = self.get_grad(guesses, truths) - return (d_truth, self._get_loss_from_grad(d_truth)) + return cast(Floats2d, truths_2d), mask - def get_grad(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> Floats2d: + def get_grad(self, guesses: Floats2d, truths: Categories1d) -> Floats2d: target, mask = self.convert_truths(truths, guesses) - xp = get_array_module(target) - if guesses.shape != target.shape: # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." - raise ValueError(err) - if xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." - raise ValueError(err) - if xp.any(target > 1) or xp.any(target < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
- raise ValueError(err) - difference = guesses - target - difference *= mask - if self.normalize: - difference = difference / guesses.shape[0] - return difference + self._validate_input(guesses, target) + return self._get_grad(guesses, target, mask) - def get_loss(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> float: - d_truth = self.get_grad(guesses, truths) - return self._get_loss_from_grad(d_truth) - - def _get_loss_from_grad(self, d_truth: Floats2d) -> float: - # TODO: Add overload for axis=None case to sum - return (d_truth**2).sum() # type: ignore - - -@registry.losses("CategoricalCrossentropy.v1") -def configure_CategoricalCrossentropy_v1( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, -) -> CategoricalCrossentropy: - return CategoricalCrossentropy( - normalize=normalize, names=names, missing_value=missing_value - ) + def get_loss(self, guesses: Floats2d, truths: Categories1d) -> float: + target, mask = self.convert_truths(truths, guesses) + self._validate_input(guesses, target) + return self._get_loss(guesses, target, mask) -@registry.losses("CategoricalCrossentropy.v2") -def configure_CategoricalCrossentropy_v2( +@registry.losses("CategoricalCrossentropy.v4") +def configure_CategoricalCrossentropy_v4( *, normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, + missing_value: Optional[int] = None, + label_smoothing: float = 0.0, ) -> CategoricalCrossentropy: return CategoricalCrossentropy( normalize=normalize, - names=names, missing_value=missing_value, - neg_prefix=neg_prefix, + label_smoothing=label_smoothing, ) -@registry.losses("CategoricalCrossentropy.v3") -def configure_CategoricalCrossentropy_v3( +@registry.losses("SparseCategoricalCrossentropy.v4") +def configure_SparseCategoricalCrossentropy_v4( *, normalize: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, -) -> CategoricalCrossentropy: - return CategoricalCrossentropy( +) -> SparseCategoricalCrossentropy: + return SparseCategoricalCrossentropy( normalize=normalize, names=names, missing_value=missing_value, @@ -206,38 +351,44 @@ class SequenceCategoricalCrossentropy(Loss): def __init__( self, *, + cross_entropy: Union[CategoricalCrossentropy, SparseCategoricalCrossentropy], normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, ): - self.cc = CategoricalCrossentropy( - normalize=False, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) + self.cc = cross_entropy self.normalize = normalize def __call__( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> Tuple[List[Floats2d], float]: - grads = self.get_grad(guesses, truths) - loss = self._get_loss_from_grad(grads) - return grads, loss + self._validate_input(guesses, truths) + n = len(guesses) + d_scores = [] + loss = 0.0 + for yh, y in zip(guesses, truths): + d_yh, l = self.cc(yh, y) # type: ignore + if self.normalize: + d_yh /= n + d_scores.append(d_yh) + loss += l + return d_scores, loss + + def _validate_input( + self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] + ): + if len(guesses) != len(truths): # pragma: no cover + raise ValueError( + "Cannot 
calculate SequenceCategoricalCrossentropy loss: " + "guesses and truths must be same length!" + ) def get_grad( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> List[Floats2d]: - err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" - if len(guesses) != len(truths): # pragma: no cover - raise ValueError(err) + self._validate_input(guesses, truths) n = len(guesses) d_scores = [] for yh, y in zip(guesses, truths): - d_yh = self.cc.get_grad(yh, y) + d_yh = self.cc.get_grad(yh, y) # type: ignore if self.normalize: d_yh /= n d_scores.append(d_yh) @@ -246,49 +397,42 @@ def get_grad( def get_loss( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> float: - return self._get_loss_from_grad(self.get_grad(guesses, truths)) - - def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: + self._validate_input(guesses, truths) loss = 0.0 - for grad in grads: - loss += self.cc._get_loss_from_grad(grad) + for guess, truth in zip(guesses, truths): + loss += self.cc.get_loss(guess, truth) # type: ignore return loss -@registry.losses("SequenceCategoricalCrossentropy.v1") -def configure_SequenceCategoricalCrossentropy_v1( - *, normalize: bool = True, names: Optional[Sequence[str]] = None -) -> SequenceCategoricalCrossentropy: - return SequenceCategoricalCrossentropy(normalize=normalize, names=names) - - -@registry.losses("SequenceCategoricalCrossentropy.v2") -def configure_SequenceCategoricalCrossentropy_v2( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - neg_prefix: Optional[str] = None, -) -> SequenceCategoricalCrossentropy: - return SequenceCategoricalCrossentropy( - normalize=normalize, names=names, neg_prefix=neg_prefix - ) - - -@registry.losses("SequenceCategoricalCrossentropy.v3") -def configure_SequenceCategoricalCrossentropy_v3( +@registry.losses("SequenceCategoricalCrossentropy.v4") +def configure_SequenceCategoricalCrossentropy_v4( *, normalize: bool = True, + sparse: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, ) -> SequenceCategoricalCrossentropy: + if names is None and neg_prefix is None and not sparse: + cross_entropy: Union[ + CategoricalCrossentropy, SparseCategoricalCrossentropy + ] = CategoricalCrossentropy( + normalize=False, + missing_value=cast(Optional[int], missing_value), + label_smoothing=label_smoothing, + ) + else: + cross_entropy = SparseCategoricalCrossentropy( + normalize=False, + names=names, + missing_value=cast(Optional[Union[str, int]], missing_value), + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) return SequenceCategoricalCrossentropy( + cross_entropy=cross_entropy, normalize=normalize, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, ) @@ -419,6 +563,43 @@ def _make_mask_by_value(truths, guesses, missing_value) -> Floats2d: return mask +def _array_like(a: Floats2d, like: FloatsOrRaggedT) -> FloatsOrRaggedT: + if isinstance(like, Ragged): + return Ragged(a, lengths=like.lengths) + else: + return a + + +def _to_array(guesses: FloatsOrRaggedT) -> Floats2d: + if isinstance(guesses, Ragged): + return cast(Floats2d, guesses.data.astype("float32")) + else: + return guesses + + +def _normalization_length(guesses: FloatsOrRaggedT) -> int: + if isinstance(guesses, Ragged): + return len(guesses.lengths) + else: + return guesses.shape[0] + + +def 
_check_ints1d(arr: ArrayXd): + """ + Check whether array is 1D and has type integer. + """ + if arr.ndim != 1: + raise ValueError( + "SparseCategoricalCrossentropy only accepts 1D arrays, but " + f"array with shape {arr.shape} was given." + ) + if arr.dtype.kind != "i": # type: ignore + raise ValueError( + "SparseCategoricalCrossentropy only accepts integer arrays, but " + f"array with {arr.dtype} was given." + ) + + __all__ = [ "SequenceCategoricalCrossentropy", "CategoricalCrossentropy", diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 75206d240..47e170ec0 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -1,108 +1,375 @@ import pytest import numpy -from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy -from thinc.api import L2Distance, CosineDistance +from functools import partial +from thinc.api import CategoricalCrossentropy +from thinc.api import L2Distance, CosineDistance, softmax_activation +from thinc.api import Ragged from thinc import registry +from thinc.util import has_torch, to_categorical +from hypothesis import given, settings +from hypothesis.strategies import integers, floats +from thinc.legacy import loss + +ALL_XP = [numpy] +try: + import cupy + + ALL_XP.append(cupy) +except ImportError: + pass + + +softmax_func = partial(softmax_activation(), is_train=False) +MAX_EXAMPLES = 50 # some simple arrays scores0 = numpy.zeros((3, 3), dtype="f") labels0 = numpy.asarray([0, 1, 1], dtype="i") # a few more diverse ones to test realistic values -guesses1 = numpy.asarray([[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]]) +guesses1 = numpy.asarray( + [[0.1, 0.5, 0.4], [0.4, 0.3, 0.3], [0, 1, 0], [0.1, 0.05, 0.85]], dtype="f" +) +guesses1_legacy = numpy.asarray( + [[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]], dtype="f" +) labels1 = numpy.asarray([2, 1, 0, 2]) -labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]]) +labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype="f") labels1_strings = ["C", "B", "A", "C"] - -guesses2 = numpy.asarray([[0.2, 0.3, 0.0]]) +d_guesses1 = numpy.array( + [ + [0.025, 0.125, -0.15], + [0.1, -0.175, 0.075], + [-0.25, 0.25, 0.0], + [0.025, 0.0125, -0.0375], + ], + dtype="f", +) +d_guesses1_seq = numpy.array( + [ + [0.05, 0.25, -0.3], + [0.2, -0.35, 0.15], + [-0.5, 0.5, 0.0], + [0.05, 0.025, -0.075], + ], + dtype="f", +) +d_guesses1_0_missing = numpy.array( + [ + [0.025, 0.125, -0.15], + [0.1, -0.175, 0.075], + [0.0, 0.0, 0.0], + [0.025, 0.0125, -0.0375], + ], + dtype="f", +) +d_guesses1_sum = numpy.array( + [ + [0.1, 0.5, -0.6], + [0.4, -0.7, 0.3], + [-1.0, 1.0, 0.0], + [0.1, 0.05, -0.15], + ], + dtype="f", +) +loss1 = 5.75151207 +loss1_seq = 11.50302410 +loss1_0_missing = 0.57069561 +guesses2 = numpy.asarray([[0.2, 0.3, 0.5]]) +guesses2_legacy = numpy.asarray([[0.2, 0.3, 0.0]]) labels2 = numpy.asarray([1]) labels2_strings = ["B"] +d_guesses2_sum = numpy.asarray([[0.2, -0.7, 0.5]]) +sequence_loss = 24.210021096627 +eps = 1e-6 + + +ce_factory = registry.get("losses", "CategoricalCrossentropy.v4") + +sparse_ce_factory = registry.get("losses", "SparseCategoricalCrossentropy.v4") + +seq_ce_factory = registry.get("losses", "SequenceCategoricalCrossentropy.v4") + + +def _get_legacy_cross_entropy(version: int, **kwargs): + return registry.get("losses", f"CategoricalCrossentropy.v{version}")(**kwargs) -eps = 0.0001 +def _get_legacy_seq_cross_entropy(version: int, **kwargs): + return registry.get("losses", 
f"SequenceCategoricalCrossentropy.v{version}")( + **kwargs + ) + + +def test_cross_entropy_types_shapes(): + sparse_cross_entropy = ce_factory() + cross_entropy = ce_factory() + sparse_seq_cross_entropy = seq_ce_factory() + seq_cross_entropy = seq_ce_factory(sparse=False) + d_scores_sparse = sparse_cross_entropy.get_grad(guesses1, labels1_full) + d_scores = cross_entropy.get_grad(guesses1, labels1_full) + assert d_scores_sparse.dtype == "float32" + assert d_scores.dtype == "float32" + assert d_scores_sparse.shape == guesses1.shape + assert d_scores.shape == guesses1.shape + d_scores_sparse = sparse_seq_cross_entropy.get_grad([guesses1], [labels1]) + d_scores = seq_cross_entropy.get_grad([guesses1], [labels1_full]) + assert d_scores_sparse[0].dtype == "float32" + assert d_scores[0].dtype == "float32" + assert d_scores_sparse[0].shape == guesses1.shape + assert d_scores[0].shape == guesses1.shape + assert sparse_seq_cross_entropy.get_grad([], []) == [] + assert seq_cross_entropy.get_grad([], []) == [] + d_scores_ragged = cross_entropy.get_grad( + Ragged(numpy.array(guesses1), lengths=[3, 1]), labels1_full + ) + assert isinstance(d_scores_ragged, Ragged) + assert d_scores_ragged.dataXd.dtype == "float32" + assert d_scores_ragged.dataXd.shape == guesses1.shape -def test_loss(): - d_scores = CategoricalCrossentropy().get_grad(scores0, labels0) + +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_cross_entropy_types_shapes(version): + cross_entropy = _get_legacy_cross_entropy(version) + seq_cross_entropy = _get_legacy_seq_cross_entropy(version) + d_scores = cross_entropy.get_grad(scores0, labels0) assert d_scores.dtype == "float32" assert d_scores.shape == scores0.shape - d_scores = SequenceCategoricalCrossentropy().get_grad([scores0], [labels0]) + d_scores = seq_cross_entropy.get_grad([scores0], [labels0]) assert d_scores[0].dtype == "float32" assert d_scores[0].shape == scores0.shape - assert SequenceCategoricalCrossentropy().get_grad([], []) == [] + assert seq_cross_entropy.get_grad([], []) == [] -@pytest.mark.parametrize( - "dist", [CategoricalCrossentropy(), CosineDistance(ignore_zeros=True), L2Distance()] +@pytest.mark.skipif(not has_torch, reason="needs PyTorch") +@pytest.mark.parametrize("xp", ALL_XP) +@settings(max_examples=MAX_EXAMPLES, deadline=None) +@given( + n_samples=integers(min_value=1, max_value=100), + n_classes=integers(min_value=1, max_value=100), + low=floats(min_value=-20, max_value=10), + offset=floats(min_value=1, max_value=10), ) +def test_compare_cross_entropy_to_torch(xp, n_samples, n_classes, low, offset): + import torch + + sparse_loss_sum = sparse_ce_factory(normalize=False) + sparse_loss_mean = sparse_ce_factory() + loss_sum = ce_factory(normalize=False) + loss_mean = ce_factory() + torch_loss_sum = torch.nn.CrossEntropyLoss(reduction="sum") + torch_loss_mean = torch.nn.CrossEntropyLoss() + logits = xp.random.uniform(low, low + offset, (n_samples, n_classes)) + labels = xp.random.randint(0, n_classes, n_samples) + labels_full = to_categorical(labels, n_classes=n_classes) + torch_logits = torch.tensor(logits, requires_grad=True) + torch_labels = torch.tensor(labels, dtype=torch.long) + probs, _ = softmax_func(logits) + d_sum_sparse, l_sum_sparse = sparse_loss_sum(probs, labels) + d_sum, l_sum = loss_sum(probs, labels_full) + torch_l_sum = torch_loss_sum(torch_logits, torch_labels) + torch_l_sum.backward() + torch_d_sum = torch_logits.grad + torch_logits = torch.tensor(logits, requires_grad=True) + d_mean_sparse, l_mean_sparse = sparse_loss_mean(probs, 
labels) + d_mean, l_mean = loss_mean(probs, labels_full) + torch_l_mean = torch_loss_mean(torch_logits, torch_labels) + torch_l_mean.backward() + torch_d_mean = torch_logits.grad + assert xp.isclose(float(l_sum), float(torch_l_sum), atol=1e-06) + assert xp.allclose(d_sum, torch_d_sum.numpy()) + assert xp.isclose(float(l_mean), float(torch_l_mean)) + assert xp.allclose(d_mean, torch_d_mean.numpy()) + assert xp.isclose(float(l_sum_sparse), float(torch_l_sum), atol=1e-06) + assert xp.allclose(d_sum_sparse, torch_d_sum.numpy()) + assert xp.isclose(float(l_mean_sparse), float(torch_l_mean)) + assert xp.allclose(d_mean_sparse, torch_d_mean.numpy()) + + +@pytest.mark.parametrize("dist", [CosineDistance(ignore_zeros=True), L2Distance()]) @pytest.mark.parametrize("vect", [scores0, guesses1, guesses2]) -def test_equality(dist, vect): - assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, eps) - assert dist.get_loss(vect, vect) == pytest.approx(0, eps) +def test_equal_distance(dist, vect): + assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) + assert dist.get_loss(vect, vect) == pytest.approx(0, abs=eps) + + +@pytest.mark.parametrize("version", [1, 2, 3]) +@pytest.mark.parametrize("vect", [scores0, guesses1_legacy, guesses2_legacy]) +def test_equal_legacy_cross_entropy(vect, version): + cross_entropy = _get_legacy_cross_entropy(version) + assert int(cross_entropy.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) + assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, grad, grad_seq, loss, loss_seq", + [ + (guesses1, labels1_full, d_guesses1, d_guesses1_seq, loss1, loss1_seq), + ], +) +def test_categorical_crossentropy(guesses, labels, grad, grad_seq, loss, loss_seq): + cross_entropy = ce_factory() + d_scores = cross_entropy.get_grad(guesses, labels) + loss_val = cross_entropy.get_loss(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) + assert numpy.isclose(loss_val, loss) + + # Test with Ragged inputs + d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) + loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) + assert d_scores_ragged.dataXd.shape == guesses.shape + assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) + assert numpy.isclose(loss_ragged, loss_seq) + + +@pytest.mark.parametrize( + "guesses, labels, grad, grad_seq, loss, loss_seq", + [ + (guesses1, labels1, d_guesses1, d_guesses1_seq, loss1, loss1_seq), + ], +) +def test_sparse_categorical_crossentropy( + guesses, labels, grad, grad_seq, loss, loss_seq +): + cross_entropy = sparse_ce_factory() + d_scores = cross_entropy.get_grad(guesses, labels) + loss_val = cross_entropy.get_loss(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) + assert numpy.isclose(loss_val, loss) + + # Test with Ragged inputs + d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) + loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) + assert d_scores_ragged.dataXd.shape == guesses.shape + assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) + assert numpy.isclose(loss_ragged, loss_seq) @pytest.mark.parametrize( - "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] + "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] ) -def test_categorical_crossentropy(guesses, labels): - d_scores = 
CategoricalCrossentropy(normalize=True).get_grad(guesses, labels) +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_crossentropy(guesses, labels, version): + cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) + d_scores = cross_entropy_normalize.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, eps) - assert d_scores[1][1] == pytest.approx(-0.1, eps) + assert d_scores[1][0] == pytest.approx(0.1, abs=eps) + assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores[2][0] == pytest.approx(0, eps) - assert d_scores[2][1] == pytest.approx(0.25, eps) - assert d_scores[2][2] == pytest.approx(0.25, eps) + assert d_scores[2][0] == pytest.approx(0, abs=eps) + assert d_scores[2][1] == pytest.approx(0.25, abs=eps) + assert d_scores[2][2] == pytest.approx(0.25, abs=eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, eps) - assert d_scores[3][1] == pytest.approx(0, eps) - assert d_scores[3][2] == pytest.approx(-0.25, eps) + assert d_scores[3][0] == pytest.approx(0, abs=eps) + assert d_scores[3][1] == pytest.approx(0, abs=eps) + assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - loss = CategoricalCrossentropy(normalize=True).get_loss(guesses, labels) - assert loss == pytest.approx(0.239375, eps) + loss = cross_entropy_normalize.get_loss(guesses, labels) + assert loss == pytest.approx(0.239375, abs=eps) def test_crossentropy_incorrect_scores_targets(): labels = numpy.asarray([2]) + labels_full = numpy.asarray([[0.0, 0.0, 1.0]]) + cross_entropy = ce_factory() + sparse_cross_entropy = sparse_ce_factory() guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - CategoricalCrossentropy(normalize=True).get_grad(guesses_neg, labels) + cross_entropy.get_grad(guesses_neg, labels_full) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + sparse_cross_entropy.get_grad(guesses_neg, labels) + + guesses_dont_sum_one = numpy.asarray([[0.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + cross_entropy.get_grad(guesses_dont_sum_one, labels_full) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + sparse_cross_entropy.get_grad(guesses_dont_sum_one, labels) guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - CategoricalCrossentropy(normalize=True).get_grad( - guesses_larger_than_one, labels - ) + cross_entropy.get_grad(guesses_larger_than_one, labels_full) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + sparse_cross_entropy.get_grad(guesses_larger_than_one, labels) guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - CategoricalCrossentropy(normalize=True).get_grad(guesses_ok, targets_neg) + cross_entropy.get_grad(guesses_ok, targets_neg) targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - CategoricalCrossentropy(normalize=True).get_grad( - guesses_ok, targets_larger_than_one - ) + cross_entropy.get_grad(guesses_ok, targets_larger_than_one) + + 
targets_dont_sum_one = numpy.asarray([[0.9, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): + cross_entropy.get_grad(guesses_ok, targets_dont_sum_one) + + +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_cross_entropy_incorrect_scores_targets(version): + labels = numpy.asarray([2]) + cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) + guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + cross_entropy_normalize.get_grad(guesses_neg, labels) + + guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): + cross_entropy_normalize.get_grad(guesses_larger_than_one, labels) + + guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) + targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): + cross_entropy_normalize.get_grad(guesses_ok, targets_neg) + + targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) + with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): + cross_entropy_normalize.get_grad(guesses_ok, targets_larger_than_one) + + +@pytest.mark.parametrize( + "guesses, labels, grad, missing_value", + [ + (guesses1, [2, 1, 0, 2], d_guesses1_0_missing, 0), + (guesses1, labels1, d_guesses1_0_missing, 0), + (guesses1, labels1_strings, d_guesses1_0_missing, "A"), + ], +) +def test_sparse_crossentropy_missing(guesses, labels, grad, missing_value): + if missing_value == "A": + names = ["A", "B", "C"] + else: + names = None + sparse_cross_entropy = sparse_ce_factory(missing_value=missing_value, names=names) + d_scores = sparse_cross_entropy.get_grad(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) + loss = sparse_cross_entropy.get_loss(guesses, labels) + assert numpy.isclose(loss, loss1_0_missing) @pytest.mark.parametrize( "guesses, labels", - [(guesses1, [2, 1, 0, 2])], + [(guesses1_legacy, [2, 1, 0, 2])], ) -def test_categorical_crossentropy_int_list_missing(guesses, labels): - d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( - guesses, labels +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_crossentropy_int_list_missing(guesses, labels, version): + cross_entropy_normalize_missing = _get_legacy_cross_entropy( + version, normalize=True, missing_value=0 ) + d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, eps) - assert d_scores[1][1] == pytest.approx(-0.1, eps) + assert d_scores[1][0] == pytest.approx(0.1, abs=eps) + assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -110,28 +377,46 @@ def test_categorical_crossentropy_int_list_missing(guesses, labels): assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, eps) - assert d_scores[3][1] == pytest.approx(0, eps) - assert d_scores[3][2] == pytest.approx(-0.25, eps) + assert d_scores[3][0] == pytest.approx(0, abs=eps) + assert d_scores[3][1] == pytest.approx(0, abs=eps) + assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) + + loss = cross_entropy_normalize_missing.get_loss(guesses, labels) + assert loss == pytest.approx(0.114375, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, grad", + [ + (guesses1, labels1_full, d_guesses1_0_missing), + ], +) +def test_categorical_crossentropy_missing(guesses, labels, grad): + cross_entropy = ce_factory(missing_value=0) + d_scores = cross_entropy.get_grad(guesses, labels) + assert d_scores.shape == guesses.shape + assert numpy.allclose(d_scores, grad) loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( guesses, labels ) - assert loss == pytest.approx(0.114375, eps) + assert numpy.isclose(loss, loss1_0_missing) @pytest.mark.parametrize( - "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] + "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] ) -def test_categorical_crossentropy_missing(guesses, labels): - d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( - guesses, labels +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_categorical_crossentropy_missing(guesses, labels, version): + cross_entropy_normalize_missing = _get_legacy_cross_entropy( + version, normalize=True, missing_value=0 ) + d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, eps) - assert d_scores[1][1] == pytest.approx(-0.1, eps) + assert d_scores[1][0] == pytest.approx(0.1, abs=eps) + assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -139,95 +424,179 @@ def test_categorical_crossentropy_missing(guesses, labels): assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, eps) - assert d_scores[3][1] == pytest.approx(0, eps) - assert d_scores[3][2] == pytest.approx(-0.25, eps) + assert d_scores[3][0] == pytest.approx(0, abs=eps) + assert d_scores[3][1] == pytest.approx(0, abs=eps) + assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( - guesses, labels - ) - assert loss == pytest.approx(0.114375, eps) + loss = cross_entropy_normalize_missing.get_loss(guesses, labels) + assert loss == pytest.approx(0.114375, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, names, grad, loss", + [ + ( + [guesses1, guesses2], + [labels1, labels2], + [], + [d_guesses1_sum, d_guesses2_sum], + sequence_loss, + ), + ( + [guesses1, guesses2], + [labels1_strings, labels2_strings], + ["A", "B", "C"], + [d_guesses1_sum, d_guesses2_sum], + sequence_loss, + ), + ], +) +def test_sequence_sparse_crossentropy(guesses, labels, names, grad, loss): + sparse_seq_cross_entropy_sum = seq_ce_factory(names=names, normalize=False) + sparse_seq_cross_entropy = seq_ce_factory(names=names, normalize=True) + d_scores = sparse_seq_cross_entropy_sum.get_grad(guesses, labels) + assert numpy.allclose(d_scores[0], grad[0]) + assert numpy.allclose(d_scores[1], grad[1]) + # The normalization divides the difference (e.g. 0.4) by the number of seqs + d_scores = sparse_seq_cross_entropy.get_grad(guesses, labels) + assert numpy.allclose(d_scores[0], grad[0] / 2.0) + assert numpy.allclose(d_scores[1], grad[1] / 2.0) + loss_val = sparse_seq_cross_entropy.get_loss(guesses, labels) + assert numpy.isclose(loss_val, loss) + d_scores, loss_val = sparse_seq_cross_entropy_sum(guesses, labels) + assert numpy.isclose(loss_val, loss) + assert numpy.allclose(d_scores[0], grad[0]) + assert numpy.allclose(d_scores[1], grad[1]) + + +@pytest.mark.parametrize( + "guesses, labels, grad, loss", + [([guesses1], [labels1_full], [d_guesses1_sum], [23.00604829563447])], +) +def test_sequence_crossentropy(guesses, labels, grad, loss): + seq_cross_entropy = seq_ce_factory(sparse=False, normalize=False) + d_scores = seq_cross_entropy.get_grad(guesses, labels) + assert numpy.allclose(d_scores[0], grad[0]) + # The normalization divides the difference (e.g. 
0.4) by the number of seqs + loss_val = seq_cross_entropy.get_loss(guesses, labels) + assert numpy.isclose(loss_val, loss) + d_scores, loss_val = seq_cross_entropy(guesses, labels) + assert numpy.isclose(loss_val, loss) + assert numpy.allclose(d_scores[0], grad[0]) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1, guesses2], [labels1, labels2], []), - ([guesses1, guesses2], [labels1_full, labels2], []), - ([guesses1, guesses2], [labels1_strings, labels2_strings], ["A", "B", "C"]), + ([guesses1_legacy, guesses2_legacy], [labels1, labels2], []), + ([guesses1_legacy, guesses2_legacy], [labels1_full, labels2], []), + ( + [guesses1_legacy, guesses2_legacy], + [labels1_strings, labels2_strings], + ["A", "B", "C"], + ), ], ) -def test_sequence_categorical_crossentropy(guesses, labels, names): - d_scores = SequenceCategoricalCrossentropy(normalize=False, names=names).get_grad( - guesses, labels +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_sequence_categorical_crossentropy(guesses, labels, names, version): + seq_cross_entropy_names = _get_legacy_seq_cross_entropy( + version, normalize=False, names=names ) + seq_cross_entropy_names_normalize = _get_legacy_seq_cross_entropy( + version, normalize=True, names=names + ) + d_scores = seq_cross_entropy_names.get_grad(guesses, labels) d_scores1 = d_scores[0] d_scores2 = d_scores[1] assert d_scores1.shape == guesses1.shape assert d_scores2.shape == guesses2.shape - assert d_scores1[1][0] == pytest.approx(0.4, eps) - assert d_scores1[1][1] == pytest.approx(-0.4, eps) + assert d_scores1[1][0] == pytest.approx(0.4, abs=eps) + assert d_scores1[1][1] == pytest.approx(-0.4, abs=eps) # The normalization divides the difference (e.g. 0.4) by the number of seqs - d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad( - guesses, labels - ) + d_scores = seq_cross_entropy_names_normalize.get_grad(guesses, labels) d_scores1 = d_scores[0] d_scores2 = d_scores[1] - assert d_scores1[1][0] == pytest.approx(0.2, eps) - assert d_scores1[1][1] == pytest.approx(-0.2, eps) + assert d_scores1[1][0] == pytest.approx(0.2, abs=eps) + assert d_scores1[1][1] == pytest.approx(-0.2, abs=eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores1[2][0] == pytest.approx(0, eps) - assert d_scores1[2][1] == pytest.approx(0.5, eps) - assert d_scores1[2][2] == pytest.approx(0.5, eps) + assert d_scores1[2][0] == pytest.approx(0, abs=eps) + assert d_scores1[2][1] == pytest.approx(0.5, abs=eps) + assert d_scores1[2][2] == pytest.approx(0.5, abs=eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores1[3][0] == pytest.approx(0, eps) - assert d_scores1[3][1] == pytest.approx(0, eps) - assert d_scores1[3][2] == pytest.approx(-0.5, eps) + assert d_scores1[3][0] == pytest.approx(0, abs=eps) + assert d_scores1[3][1] == pytest.approx(0, abs=eps) + assert d_scores1[3][2] == pytest.approx(-0.5, abs=eps) # Test the second batch - assert d_scores2[0][0] == pytest.approx(0.1, eps) - assert d_scores2[0][1] == pytest.approx(-0.35, eps) + assert d_scores2[0][0] == pytest.approx(0.1, abs=eps) + assert d_scores2[0][1] == pytest.approx(-0.35, abs=eps) - loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss( - guesses, labels + loss = seq_cross_entropy_names_normalize.get_loss(guesses, labels) + assert loss == pytest.approx(1.09, abs=eps) + + +@pytest.mark.parametrize( + "guesses, labels, names, grad", + [ + ( + [guesses1], + [["A", "!A", "", 
"!C"]], + ["A", "B", "C"], + numpy.array( + [ + [-0.9, 0.5, 0.4], # First is correct + [0.4, 0.0, 0.0], # Not first one + [0.0, 0.0, 0.0], # Missing + [0.0, 0.0, 0.85], # Not last one + ] + ), + ) + ], +) +def test_sequence_crossentropy_missing_negative(guesses, labels, names, grad): + sparse_seq_ce = seq_ce_factory( + names=names, normalize=False, neg_prefix="!", missing_value="" ) - assert loss == pytest.approx(1.09, eps) + d_scores = sparse_seq_ce.get_grad(guesses, labels) + assert numpy.allclose(d_scores, grad) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1], [["A", "!A", "", "!C"]], ["A", "B", "C"]), + ([guesses1_legacy], [["A", "!A", "", "!C"]], ["A", "B", "C"]), ], ) -def test_sequence_categorical_missing_negative(guesses, labels, names): - d_scores = SequenceCategoricalCrossentropy( - normalize=False, names=names, neg_prefix="!", missing_value="" - ).get_grad(guesses, labels) +@pytest.mark.parametrize("version", [3]) +def test_legacy_sequence_categorical_missing_negative(guesses, labels, names, version): + seq_cross_entropy = _get_legacy_seq_cross_entropy( + version, normalize=False, names=names, neg_prefix="!", missing_value="" + ) + d_scores = seq_cross_entropy.get_grad(guesses, labels) d_scores0 = d_scores[0] # [0.1, 0.5, 0.6] should be A - assert d_scores0[0][0] == pytest.approx(-0.9, eps) - assert d_scores0[0][1] == pytest.approx(0.5, eps) - assert d_scores0[0][2] == pytest.approx(0.6, eps) + assert d_scores0[0][0] == pytest.approx(-0.9, abs=eps) + assert d_scores0[0][1] == pytest.approx(0.5, abs=eps) + assert d_scores0[0][2] == pytest.approx(0.6, abs=eps) # [0.4, 0.6, 0.3] should NOT be A - assert d_scores0[1][0] == pytest.approx(0.4, eps) - assert d_scores0[1][1] == pytest.approx(0.0, eps) - assert d_scores0[1][2] == pytest.approx(0.0, eps) + assert d_scores0[1][0] == pytest.approx(0.4, abs=eps) + assert d_scores0[1][1] == pytest.approx(0.0, abs=eps) + assert d_scores0[1][2] == pytest.approx(0.0, abs=eps) # [1, 1, 1] has missing gold label - assert d_scores0[2][0] == pytest.approx(0.0, eps) - assert d_scores0[2][1] == pytest.approx(0.0, eps) - assert d_scores0[2][2] == pytest.approx(0.0, eps) + assert d_scores0[2][0] == pytest.approx(0.0, abs=eps) + assert d_scores0[2][1] == pytest.approx(0.0, abs=eps) + assert d_scores0[2][2] == pytest.approx(0.0, abs=eps) # [0.0, 0.0, 0.0] should NOT be C - assert d_scores0[3][0] == pytest.approx(0.0, eps) - assert d_scores0[3][1] == pytest.approx(0.0, eps) - assert d_scores0[3][2] == pytest.approx(0.0, eps) + assert d_scores0[3][0] == pytest.approx(0.0, abs=eps) + assert d_scores0[3][1] == pytest.approx(0.0, abs=eps) + assert d_scores0[3][2] == pytest.approx(0.0, abs=eps) def test_L2(): @@ -241,10 +610,10 @@ def test_L2(): ) loss_not_normalized = L2Distance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(20, eps) + assert loss_not_normalized == pytest.approx(20, abs=eps) loss_normalized = L2Distance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(5, eps) + assert loss_normalized == pytest.approx(5, abs=eps) def test_cosine_orthogonal(): @@ -260,10 +629,10 @@ def test_cosine_orthogonal(): assert d_vecs[1][1] > 0 loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(2, eps) + assert loss_not_normalized == pytest.approx(2, abs=eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(1, eps) + assert loss_normalized == 
pytest.approx(1, abs=eps) def test_cosine_equal(): @@ -276,10 +645,10 @@ def test_cosine_equal(): numpy.testing.assert_allclose(d_vec1, numpy.zeros(d_vec1.shape), rtol=eps, atol=eps) loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(0, eps) + assert loss_not_normalized == pytest.approx(0, abs=eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(0, eps) + assert loss_normalized == pytest.approx(0, abs=eps) def test_cosine_unmatched(): @@ -292,19 +661,26 @@ def test_cosine_unmatched(): @pytest.mark.parametrize( "name,kwargs,args", [ - ("CategoricalCrossentropy.v1", {}, (scores0, labels0)), - ("SequenceCategoricalCrossentropy.v1", {}, ([scores0], [labels0])), - ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (scores0, labels0)), - ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (scores0, labels0)), + ("CategoricalCrossentropy.v1", {}, (guesses1, labels1)), + ("SequenceCategoricalCrossentropy.v1", {}, ([guesses1], [labels1])), + ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (guesses1, labels1)), + ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (guesses1, labels1)), + ("SparseCategoricalCrossentropy.v4", {"neg_prefix": "!"}, (guesses1, labels1)), + ("CategoricalCrossentropy.v4", {}, (guesses1, labels1_full)), ( "SequenceCategoricalCrossentropy.v2", {"neg_prefix": "!"}, - ([scores0], [labels0]), + ([guesses1], [labels1]), ), ( "SequenceCategoricalCrossentropy.v3", {"neg_prefix": "!"}, - ([scores0], [labels0]), + ([guesses1], [labels1]), + ), + ( + "SequenceCategoricalCrossentropy.v4", + {"neg_prefix": "!"}, + ([guesses1], [labels1]), ), ("L2Distance.v1", {}, (scores0, scores0)), ( diff --git a/thinc/util.py b/thinc/util.py index b87ca4e5f..059f2c235 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,7 +1,8 @@ from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar -from typing import List, Mapping, Tuple +from typing import List, Mapping +from typing import TYPE_CHECKING + import numpy -from packaging.version import Version import random import functools from wasabi import table @@ -15,18 +16,16 @@ from dataclasses import dataclass from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu -from .compat import has_torch_mps_gpu from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack -DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) - -from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 +from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd, Floats2d # noqa: E402 from . import types # noqa: E402 -from typing import TYPE_CHECKING if TYPE_CHECKING: from .api import Ops +DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) + def get_torch_default_device() -> "torch.device": if torch is None: @@ -254,6 +253,21 @@ def to_categorical( return label_distr[Y] +def smooth_one_hot(X: Floats2d, label_smoothing: float) -> Floats2d: + """ + Apply label-smoothing to one-hot array. + """ + if not 0.0 <= label_smoothing < 0.5: + raise ValueError( + "label_smoothing should be greater or " + "equal to 0.0 and less than 0.5, " + f"but {label_smoothing} was provided." 
+ ) + X[X == 1] = 1 - label_smoothing + X[X == 0] = label_smoothing / (X.shape[1] - 1) + return X + + def get_width( X: Union[ArrayXd, Ragged, Padded, Sequence[ArrayXd]], *, dim: int = -1 ) -> int: From cdc971702dd2cd14e45dfb06ed5cbad816771239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 9 Dec 2022 08:49:13 +0100 Subject: [PATCH 07/30] Bring back support for missing labels to legacy cross entropy (#809) * Bring back support for missing labels to legacy cross entropy * Use `missing_value` to detect missing values * Typing fixes --- thinc/legacy/loss.py | 35 +++++++++++++++++++---------------- thinc/tests/test_loss.py | 9 +++++++++ 2 files changed, 28 insertions(+), 16 deletions(-) diff --git a/thinc/legacy/loss.py b/thinc/legacy/loss.py index 439a2ca21..ab9871625 100644 --- a/thinc/legacy/loss.py +++ b/thinc/legacy/loss.py @@ -1,13 +1,13 @@ from typing import Optional, Sequence, Dict, Union, Tuple from typing import cast, List -from ..types import Floats2d, Ints1d, Ints2d +from ..types import Floats2d, Ints1d from ..config import registry from ..util import to_categorical, get_array_module from ..loss import IntsOrFloatsOrStrs, Loss from ..loss import _make_mask, _make_mask_by_value -TruthsT = Union[List[str], List[int], Ints1d, Floats2d] +TruthsT = Union[List[Optional[str]], List[int], Ints1d, Floats2d] class LegacyCategoricalCrossentropy(Loss): @@ -34,7 +34,9 @@ def __init__( else: self._name_to_i = {} - def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: + def convert_truths( + self, truths: TruthsT, guesses: Floats2d + ) -> Tuple[Floats2d, Floats2d]: xp = get_array_module(guesses) missing = [] negatives_mask = None @@ -49,13 +51,13 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, if not isinstance(value, int): raise ValueError( "All values in the truths list have to " - "have the same time. The first value was " + "have the same type. The first value was " f"detected to be integer, but found {type(value)}." ) if value == missing_value: missing.append(i) else: - truths = cast(List[str], truths) + truths = cast(List[Optional[str]], truths) if self.names is None: msg = ( "Cannot calculate loss from list of strings without names. " @@ -65,10 +67,10 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, ) raise ValueError(msg) for i, value in enumerate(truths): - if not isinstance(value, str): + if not (isinstance(value, str) or value == missing_value): raise ValueError( "All values in the truths list have to " - "have the same time. The first value was " + "have the same type. The first value was " f"detected to be string, but found {type(value)}." ) if value == missing_value: @@ -79,11 +81,16 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, and self.neg_prefix and value.startswith(self.neg_prefix) ): - truths[i] = value[len(self.neg_prefix) :] - neg_index = self._name_to_i[truths[i]] + neg_value = value[len(self.neg_prefix) :] + truths[i] = neg_value + neg_index = self._name_to_i[neg_value] negatives_mask[i] = 0 # type: ignore negatives_mask[i][neg_index] = -1 # type: ignore - truths = [self._name_to_i[name] for name in truths] + # In the loop above, we have ensured that `truths` doesn't + # contain `None` (anymore). However, mypy can't infer this + # and doesn't like the shadowing. 
+ truths_str = cast(List[str], truths) + truths = [self._name_to_i[name] for name in truths_str] truths = xp.asarray(truths, dtype="i") mask = _make_mask(guesses, missing) else: @@ -113,9 +120,7 @@ def convert_truths(self, truths: TruthsT, guesses: Floats2d) -> Tuple[Floats2d, mask *= negatives_mask return cast(Floats2d, truths_2d), mask - def __call__( - self, guesses: Floats2d, truths: TruthsT - ) -> Tuple[Floats2d, float]: + def __call__(self, guesses: Floats2d, truths: TruthsT) -> Tuple[Floats2d, float]: d_truth = self.get_grad(guesses, truths) return (d_truth, self._get_loss_from_grad(d_truth)) @@ -187,9 +192,7 @@ def get_grad( d_scores.append(d_yh) return d_scores - def get_loss( - self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] - ) -> float: + def get_loss(self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT]) -> float: return self._get_loss_from_grad(self.get_grad(guesses, truths)) def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 47e170ec0..2cb49e466 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -199,6 +199,15 @@ def test_equal_legacy_cross_entropy(vect, version): cross_entropy = _get_legacy_cross_entropy(version) assert int(cross_entropy.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) + assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) + + +@pytest.mark.parametrize("version", [1, 2, 3]) +def test_legacy_cross_entropy_absent_labels(version): + cross_entropy = _get_legacy_cross_entropy(version, names=["cat", "dog", "rat"]) + assert cross_entropy.get_loss(scores0, [None, None, None]) == pytest.approx( + 0, abs=eps + ) @pytest.mark.parametrize( From 9743709d5a705366c79c15e03f1b2bf5ead96955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 9 Dec 2022 13:42:14 +0100 Subject: [PATCH 08/30] Set version to v9.0.0.dev0 (#816) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 087ac261f..bcdeb1bbc 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.5" +__version__ = "v9.0.0.dev0" __release__ = True From 07f8f888308b9ed453ff4e1bb09c3eb505c98558 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 9 Dec 2022 14:28:03 +0100 Subject: [PATCH 09/30] Fix spurious `v` prefix in the version number (#818) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index bcdeb1bbc..3c68811b6 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "v9.0.0.dev0" +__version__ = "9.0.0.dev0" __release__ = True From 717c70e03007b5b376dc353daf501a2b5d0043b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 22 Dec 2022 19:51:44 +0100 Subject: [PATCH 10/30] Give schedules access to the key, step, and last eval score (#804) * Give schedulers access to the key, step, and last eval score Before this change schedules were generators that generate a value for each training step. This, however has the limitation that scheduler cannot use other information that is available in the optimizer such as the parameter key. This information is useful for e.g. discriminative learning rates, where certain parameters are on a different schedule than others. 
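As a hedged illustration of that use case (not part of the patch itself), the sketch below shows what a key-aware learning-rate schedule could look like on top of the `Schedule` class this commit introduces; the `"embed"` name prefix and the `0.1` scale factor are assumptions invented for the example.

```python
# Hypothetical sketch: a discriminative learning-rate schedule. The optimizer
# passes each parameter's key as key=(id, name) plus the last evaluation
# score; here only the name is inspected, and parameters whose name starts
# with "embed" (an assumed naming convention) get a smaller rate.
from typing import Tuple

from thinc.api import Schedule


def per_param_lr(base_rate: float, embed_scale: float = 0.1) -> Schedule:
    return Schedule(
        "per_param_lr",
        _per_param_lr_schedule,
        attrs={"base_rate": base_rate, "embed_scale": embed_scale},
    )


def _per_param_lr_schedule(
    schedule: Schedule, step: int, *, key: Tuple[int, str] = (0, ""), **kwargs
) -> float:
    base_rate = schedule.attrs["base_rate"]
    embed_scale = schedule.attrs["embed_scale"]
    _, name = key
    return base_rate * embed_scale if name.startswith("embed") else base_rate
```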
To accommodate passing additional information, this change converts schedules to callables. These callables are passed the training step, the parameter key, and the last evaluation score (when available). Traditional scalar and generated schedules are converted to callables by the optimizer for compatibility. * Fix use of the `t` parameter where used in the schedules Also add tests, so that doesn't break again. * Fixes from @shadeMe * Call _schedule_args once * Make Optimizer.step private * Fix two missed step uses in tests * Float fix Co-authored-by: Madeesh Kannan * Fix schedule call * Move `ScheduleCallable` to `thinc.types` * Move from callables to a `Schedule` class The new learning rate functionality used `Callable`s. However, the issue with callables it that they cannot be pickled. This is problematic, because schedules can end up in spaCy pipelines (e.g. through the optimizer associated with the `Language` object). This change solves this issue by refactoring the schedules into regular objects. This now works similar to Thinc `Model`s -- there is a new `Scheduler` class which can be constructed with composition. I tested the changes with spaCy and pickling as well as usin existing configurations works. * Remove stray `runtime_checkable` import * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem Co-authored-by: Madeesh Kannan Co-authored-by: Sofie Van Landeghem --- thinc/api.py | 6 +- thinc/optimizers.py | 213 +++++++++++++++++++++++---------- thinc/schedules.py | 209 ++++++++++++++++++++++++-------- thinc/tests/test_config.py | 5 +- thinc/tests/test_optimizers.py | 74 ++++++++++-- thinc/tests/test_schedules.py | 53 ++++---- website/docs/api-optimizers.md | 33 +++-- website/docs/api-schedules.md | 111 ++++++++++++++--- website/docs/usage-config.md | 53 ++++---- website/docs/usage-training.md | 74 ++++++------ 10 files changed, 588 insertions(+), 243 deletions(-) diff --git a/thinc/api.py b/thinc/api.py index 8f5b3247e..f9e392048 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -8,8 +8,8 @@ from .shims import Shim, PyTorchGradScaler, PyTorchShim, TensorFlowShim, keras_model_fns from .shims import MXNetShim, TorchScriptShim, maybe_handshake_model from .optimizers import Adam, RAdam, SGD, Optimizer -from .schedules import cyclic_triangular, warmup_linear, constant, constant_then -from .schedules import decaying, slanted_triangular, compounding +from .schedules import Schedule, cyclic_triangular, warmup_linear, constant +from .schedules import constant_then, decaying, slanted_triangular, compounding from .types import Ragged, Padded, ArgsKwargs, Unserializable from .util import fix_random_seed, is_cupy_array, set_active_gpu from .util import prefer_gpu, require_gpu, require_cpu @@ -66,7 +66,7 @@ # .optimizers "Adam", "RAdam", "SGD", "Optimizer", # .schedules - "cyclic_triangular", "warmup_linear", "constant", "constant_then", + "Schedule", "cyclic_triangular", "warmup_linear", "constant", "constant_then", "decaying", "slanted_triangular", "compounding", # .types "Ragged", "Padded", "ArgsKwargs", "Unserializable", diff --git a/thinc/optimizers.py b/thinc/optimizers.py index f34cd2ff8..b0636fd87 100644 --- a/thinc/optimizers.py +++ b/thinc/optimizers.py @@ -1,16 +1,17 @@ -import math - -from typing import Dict, Optional, Union, Tuple, List, cast +from typing import Any, Dict, Optional, Union, Tuple, List, cast from collections import defaultdict +import itertools +import math +from types import GeneratorType from .backends import get_array_ops from .types import 
Generator, FloatsXd from .config import registry +from .schedules import constant, Schedule KeyT = Tuple[int, str] -FloatOrSeq = Union[float, List[float], Generator] -IntOrSeq = Union[int, List[int], Generator] +ScheduleT = Union[float, List[float], Generator, Schedule] SGD_DEFAULTS: Dict[str, Union[float, bool, int]] = { "L2": 0.0, @@ -32,14 +33,14 @@ @registry.optimizers("RAdam.v1") def RAdam( - learn_rate: FloatOrSeq = ADAM_DEFAULTS["learn_rate"], + learn_rate: ScheduleT = ADAM_DEFAULTS["learn_rate"], *, - beta1: FloatOrSeq = ADAM_DEFAULTS["beta1"], - beta2: FloatOrSeq = ADAM_DEFAULTS["beta2"], - eps: FloatOrSeq = ADAM_DEFAULTS["eps"], - L2: FloatOrSeq = ADAM_DEFAULTS["L2"], + beta1: ScheduleT = ADAM_DEFAULTS["beta1"], + beta2: ScheduleT = ADAM_DEFAULTS["beta2"], + eps: ScheduleT = ADAM_DEFAULTS["eps"], + L2: ScheduleT = ADAM_DEFAULTS["L2"], L2_is_weight_decay: bool = cast(bool, ADAM_DEFAULTS["L2_is_weight_decay"]), - grad_clip: FloatOrSeq = ADAM_DEFAULTS["grad_clip"], + grad_clip: ScheduleT = ADAM_DEFAULTS["grad_clip"], use_averages: bool = True, ): return Optimizer( @@ -57,13 +58,13 @@ def RAdam( @registry.optimizers("Adam.v1") def Adam( - learn_rate: FloatOrSeq = ADAM_DEFAULTS["learn_rate"], + learn_rate: ScheduleT = ADAM_DEFAULTS["learn_rate"], *, - L2: FloatOrSeq = ADAM_DEFAULTS["L2"], - beta1: FloatOrSeq = ADAM_DEFAULTS["beta1"], - beta2: FloatOrSeq = ADAM_DEFAULTS["beta2"], - eps: FloatOrSeq = ADAM_DEFAULTS["eps"], - grad_clip: FloatOrSeq = ADAM_DEFAULTS["grad_clip"], + L2: ScheduleT = ADAM_DEFAULTS["L2"], + beta1: ScheduleT = ADAM_DEFAULTS["beta1"], + beta2: ScheduleT = ADAM_DEFAULTS["beta2"], + eps: ScheduleT = ADAM_DEFAULTS["eps"], + grad_clip: ScheduleT = ADAM_DEFAULTS["grad_clip"], L2_is_weight_decay: bool = cast(bool, ADAM_DEFAULTS["L2_is_weight_decay"]), use_averages: bool = True, ): @@ -82,10 +83,10 @@ def Adam( @registry.optimizers("SGD.v1") def SGD( - learn_rate: FloatOrSeq, + learn_rate: ScheduleT, *, - L2: FloatOrSeq = SGD_DEFAULTS["L2"], - grad_clip: FloatOrSeq = SGD_DEFAULTS["grad_clip"], + L2: ScheduleT = SGD_DEFAULTS["L2"], + grad_clip: ScheduleT = SGD_DEFAULTS["grad_clip"], L2_is_weight_decay: bool = cast(bool, SGD_DEFAULTS["L2_is_weight_decay"]), use_averages: bool = True, ): @@ -111,15 +112,17 @@ class Optimizer(object): schedules: Dict[str, Generator] nr_update: Dict[KeyT, int] last_seen: Dict[KeyT, int] - grad_clip: float - learn_rate: float - b1: float - b2: float - eps: float - L2: float + grad_clip: Schedule + learn_rate: Schedule + b1: Schedule + b2: Schedule + eps: Schedule + L2: Schedule use_radam: bool L2_is_weight_decay: bool _radam_buffer: List[List[Optional[FloatsXd]]] + _step: int + _last_score: Optional[Tuple[int, float]] # This "locks" the class, so we get an error if you try to assign to # an unexpected variable. 
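Since every hyperparameter attribute above is now a `Schedule`, downstream code reads it by calling it rather than treating it as a float. A minimal sketch, using the same `(0, "")` stub key as the test suite:

```python
# Minimal sketch: hyperparameters such as learn_rate are Schedule objects
# after this change, so they are called with the current step and a parameter
# key instead of being read directly. (0, "") is a stub key, as in the tests.
from thinc.api import Adam

optimizer = Adam(learn_rate=0.001)  # plain floats are wrapped in constant()
lr = optimizer.learn_rate(step=optimizer.step, key=(0, ""))
assert lr == 0.001
```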
@@ -139,17 +142,19 @@ class Optimizer(object): "use_radam", "L2_is_weight_decay", "_radam_buffer", + "_step", + "_last_score", ] def __init__( self, - learn_rate: FloatOrSeq, + learn_rate: ScheduleT, *, - L2: FloatOrSeq = ADAM_DEFAULTS["L2"], - beta1: FloatOrSeq = ADAM_DEFAULTS["beta1"], - beta2: FloatOrSeq = ADAM_DEFAULTS["beta2"], - eps: FloatOrSeq = ADAM_DEFAULTS["eps"], - grad_clip: FloatOrSeq = ADAM_DEFAULTS["grad_clip"], + L2: ScheduleT = ADAM_DEFAULTS["L2"], + beta1: ScheduleT = ADAM_DEFAULTS["beta1"], + beta2: ScheduleT = ADAM_DEFAULTS["beta2"], + eps: ScheduleT = ADAM_DEFAULTS["eps"], + grad_clip: ScheduleT = ADAM_DEFAULTS["grad_clip"], use_averages: bool = True, use_radam: bool = False, L2_is_weight_decay: bool = True, @@ -168,13 +173,14 @@ def __init__( L2_is_weight_decay (bool): Whether to interpret the L2 parameter as a weight decay term, in the style of the AdamW optimizer. """ + self._step = 0 + self._last_score = None self.mom1 = {} self.mom2 = {} if use_averages: self.averages = {} else: self.averages = None - self.schedules = {} self.nr_update = defaultdict(int) self.last_seen = defaultdict(int) self._set_attr_or_schedule("grad_clip", grad_clip) @@ -189,24 +195,38 @@ def __init__( def _set_attr_or_schedule(self, name, value): if isinstance(value, (float, bool, int)): + setattr(self, name, constant(value)) + elif isinstance(value, list): + value = iter(value) + setattr(self, name, _wrap_generator(name, value)) + elif isinstance(value, GeneratorType): + setattr(self, name, _wrap_generator(name, value)) + elif isinstance(value, Schedule): setattr(self, name, value) else: - if isinstance(value, list): - value = iter(value) - self.schedules[name] = value - try: - setattr(self, name, next(value)) - except (StopIteration, TypeError) as e: - err = f"Invalid schedule for '{name}' ({type(value)})\n{e}" - raise ValueError(err) + err = f"Invalid schedule for '{name}' ({type(value)})" + raise ValueError(err) def step_schedules(self): - for key, schedule in self.schedules.items(): - try: - value = next(schedule) - except StopIteration: # schedule exhausted, use last value - value = getattr(self, key) - setattr(self, key, value) + self._step += 1 + + @property + def last_score(self) -> Optional[Tuple[int, float]]: + return self._last_score + + @last_score.setter + def last_score(self, score: float): + self._last_score = (self._step, score) + + @property + def step(self) -> int: + return self._step + + def _schedule_args(self, key: KeyT) -> Dict[str, Any]: + return { + "key": key, + "last_score": self.last_score, + } def __call__( self, @@ -221,28 +241,42 @@ def __call__( """ if len(gradient) < 1: return weights, gradient + ops = get_array_ops(weights) self.nr_update[key] += 1 nr_upd = self.nr_update[key] - if self.L2 != 0 and not self.L2_is_weight_decay: - gradient += self.L2 * weights - if self.grad_clip: - gradient = ops.clip_gradient(gradient, self.grad_clip) + schedule_args = self._schedule_args(key) + + if self.L2(self.step, **schedule_args) != 0 and not self.L2_is_weight_decay: + gradient += self.L2(self.step, **schedule_args) * weights + if self.grad_clip(self.step, **schedule_args): + gradient = ops.clip_gradient( + gradient, + self.grad_clip(self.step, **schedule_args), + ) if self.use_radam: weights, gradient = self._radam( ops, weights, gradient, lr_scale, key, nr_upd ) - elif self.b1 > 0.0 and self.b2 > 0.0: + elif ( + self.b1(self.step, **schedule_args) > 0.0 + and self.b2(self.step, **schedule_args) > 0.0 + ): weights, gradient = self._adam( ops, weights, gradient, 
lr_scale, key, nr_upd ) - elif self.b2 > 0.0: # pragma: no cover + elif self.b2(self.step, **schedule_args) > 0.0: # pragma: no cover raise NotImplementedError # TODO: error message else: - weights -= lr_scale * self.learn_rate * gradient + weights -= lr_scale * self.learn_rate(self.step, **schedule_args) * gradient gradient *= 0 - if self.L2 != 0 and self.L2_is_weight_decay: - weights -= lr_scale * self.learn_rate * self.L2 * weights + if self.L2(self.step, **schedule_args) != 0 and self.L2_is_weight_decay: + weights -= ( + lr_scale + * self.learn_rate(self.step, **schedule_args) + * self.L2(self.step, **schedule_args) + * weights + ) if self.averages is not None: if key not in self.averages: self.averages[key] = ops.alloc(weights.shape, dtype="float32") @@ -258,6 +292,8 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd): weights_1D = ops.reshape1f(weights, weights.size) gradient_1D = ops.reshape1f(grad, grad.size) + schedule_args = self._schedule_args(key) + # While we port from the pytorch implementation, keep some of the same # naming state = { @@ -266,9 +302,12 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd): "exp_avg_sq": self.mom2[key], } group = { - "lr": self.learn_rate, - "betas": [self.b1, self.b2], - "eps": self.eps, + "lr": self.learn_rate(self.step, **schedule_args), + "betas": [ + self.b1(self.step, **schedule_args), + self.b2(self.step, **schedule_args), + ], + "eps": self.eps(self.step, **schedule_args), "weight_decay": 0.0, "buffer": self._radam_buffer, } @@ -330,18 +369,21 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd): def _adam(self, ops, weights, gradient, lr_scale, key, nr_upd): weights_1D = ops.reshape1f(weights, weights.size) gradient_1D = ops.reshape1f(gradient, gradient.size) + + schedule_args = self._schedule_args(key) + if key not in self.mom1: self.mom1[key] = ops.alloc1f(weights.size) if key not in self.mom2: self.mom2[key] = ops.alloc1f(weights.size) mom1 = self.mom1[key] mom2 = self.mom2[key] - b1 = self.b1 - b2 = self.b2 + b1 = self.b1(self.step, **schedule_args) + b2 = self.b2(self.step, **schedule_args) fix1 = 1.0 - (b1**nr_upd) fix2 = 1.0 - (b2**nr_upd) - lr = self.learn_rate * fix2**0.5 / fix1 - eps = self.eps + lr = self.learn_rate(self.step, **schedule_args) * fix2**0.5 / fix1 + eps = self.eps(self.step, **schedule_args) # needs to be 1D going into the adam function weights_1D, gradient_1D, mom1, mom2 = ops.adam( weights_1D, gradient_1D, mom1, mom2, b1, b2, eps, lr * lr_scale @@ -354,4 +396,49 @@ def _adam(self, ops, weights, gradient, lr_scale, key, nr_upd): ) +def _wrap_generator(attr_name: str, generator: Generator) -> Schedule[Any]: + try: + peek = next(generator) + except (StopIteration, TypeError) as e: + err = f"Invalid schedule for '{attr_name}' ({type(generator)})\n{e}" + raise ValueError(err) + return Schedule( + "wrap_generator", + _wrap_generator_schedule, + attrs={ + "attr_name": attr_name, + "last_step": -1, + "last_value": peek, + "generator": itertools.chain([peek], generator), + }, + ) + + +def _wrap_generator_schedule(schedule: Schedule, step, **kwargs) -> float: + attr_name = schedule.attrs["attr_name"] + last_step = schedule.attrs["last_step"] + last_value = schedule.attrs["last_value"] + generator = schedule.attrs["generator"] + + if step < last_step: + raise ValueError( + f"'step' of the generator-based schedule for {attr_name} must not decrease" + ) + + # Ensure that we have a value when we didn't step or when the + # generator is exhausted. 
+ value = last_value + + for i in range(step - last_step): + try: + value = next(generator) + except StopIteration: # schedule exhausted, use last value + break + + schedule.attrs["last_step"] = step + schedule.attrs["last_value"] = value + + return value + + __all__ = ["Adam", "RAdam", "SGD", "Optimizer", "ADAM_DEFAULTS", "SGD_DEFAULTS"] diff --git a/thinc/schedules.py b/thinc/schedules.py index 87581af74..73711f87e 100644 --- a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,32 +1,83 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Iterable +from typing import Any, Callable, Dict, Generic, TypeVar import numpy from .config import registry +OutT = TypeVar("OutT") + + +class Schedule(Generic[OutT]): + """Class for implementing Thinc schedules.""" + + name: str + _schedule: Callable + _attrs: Dict[str, Any] + + __slots__ = ["name", "_schedule", "_attrs"] + + def __init__( + self, name: str, schedule: Callable, *, attrs: Dict[str, Any] = {} + ) -> None: + """Initialize a new schedule. + + name (str): The name of the schedule type. + schedule (Callable): The schedule function. + """ + self.name = name + self._schedule = schedule + self._attrs = dict(attrs) + + def __call__(self, step: int, **extra) -> OutT: + """Compute the schedule for a given step.""" + + if step < 0: + raise ValueError(f"Step must be non-negative, was: {step}") + + return self._schedule(self, step, **extra) + + @property + def attrs(self): + """Schedule attributes.""" + return self._attrs + @registry.schedules("constant_then.v1") -def constant_then( - rate: float, steps: int, schedule: Iterable[float] -) -> Iterable[float]: +def constant_then(rate: OutT, steps: int, schedule: Schedule[OutT]) -> Schedule[OutT]: """Yield a constant rate for N steps, before starting a schedule.""" - for i in range(steps): - yield rate - for value in schedule: - yield value + return Schedule( + "constant_then", + _constant_then_schedule, + attrs={"rate": rate, "steps": steps, "schedule": schedule}, + ) + + +def _constant_then_schedule(schedule: Schedule, step: int, **kwargs) -> float: + rate = schedule.attrs["rate"] + steps = schedule.attrs["steps"] + schedule = schedule.attrs["schedule"] + + if step < steps: + return rate + else: + return schedule(step=step, **kwargs) @registry.schedules("constant.v1") -def constant(rate: float) -> Iterable[float]: +def constant(rate: OutT) -> Schedule[OutT]: """Yield a constant rate.""" - while True: - yield rate + return Schedule("constant", _constant_schedule, attrs={"rate": rate}) + + +def _constant_schedule(schedule: Schedule, step: int, **kwargs) -> float: + rate = schedule.attrs["rate"] + return rate @registry.schedules("decaying.v1") -def decaying(base_rate: float, decay: float, *, t: int = 0) -> Iterable[float]: +def decaying(base_rate: float, decay: float, *, t: float = 0.0) -> Schedule[float]: """Yield an infinite series of linearly decaying values, - following the schedule: base_rate * 1 / (1 + decay * t) + following the schedule: base_rate * 1 / (1 + decay * (t + step)) EXAMPLE: >>> learn_rates = decaying(0.001, 1e-4) @@ -35,15 +86,24 @@ def decaying(base_rate: float, decay: float, *, t: int = 0) -> Iterable[float]: >>> next(learn_rates) 0.00999 """ - while True: - yield base_rate * (1.0 / (1.0 + decay * t)) - t += 1 + return Schedule( + "decaying", + _decaying_schedule, + attrs={"base_rate": base_rate, "decay": decay, "t": t}, + ) + + +def _decaying_schedule(schedule: Schedule, step: int, **kwargs) -> float: + base_rate = 
schedule.attrs["base_rate"] + decay = schedule.attrs["decay"] + t = schedule.attrs["t"] + return base_rate * (1.0 / (1.0 + decay * (step + t))) @registry.schedules("compounding.v1") def compounding( start: float, stop: float, compound: float, *, t: float = 0.0 -) -> Iterable[float]: +) -> Schedule[float]: """Yield an infinite series of compounding values. Each time the generator is called, a value is produced by multiplying the previous value by the compound rate. @@ -54,10 +114,19 @@ def compounding( >>> assert next(sizes) == 1 * 1.5 >>> assert next(sizes) == 1.5 * 1.5 """ - curr = float(start) - while True: - yield _clip(curr, start, stop) - curr *= compound + return Schedule( + "compounding", + _compounding_schedule, + attrs={"start": start, "stop": stop, "compound": compound, "t": t}, + ) + + +def _compounding_schedule(schedule: Schedule, step: int, **kwargs) -> float: + start = schedule.attrs["start"] + stop = schedule.attrs["stop"] + compound = schedule.attrs["compound"] + t = schedule.attrs["t"] + return _clip(start * (compound ** (step + t)), start, stop) def _clip(value: float, start: float, stop: float) -> float: @@ -71,52 +140,90 @@ def slanted_triangular( *, cut_frac: float = 0.1, ratio: int = 32, - decay: float = 1.0, t: float = 0.0, -) -> Iterable[float]: +) -> Schedule[float]: """Yield an infinite series of values according to Howard and Ruder's "slanted triangular learning rate" schedule. """ cut = int(num_steps * cut_frac) - while True: - t += 1 - if t < cut: - p = t / cut - else: - p = 1 - ((t - cut) / (cut * (1 / cut_frac - 1))) - learn_rate = max_rate * (1 + p * (ratio - 1)) * (1 / ratio) - yield learn_rate + return Schedule( + "slanted_triangular", + _slanted_triangular_schedule, + attrs={ + "max_rate": max_rate, + "cut": cut, + "cut_frac": cut_frac, + "ratio": ratio, + "t": t, + }, + ) + + +def _slanted_triangular_schedule(schedule: Schedule, step: int, **kwargs) -> float: + max_rate = schedule.attrs["max_rate"] + cut = schedule.attrs["cut"] + cut_frac = schedule.attrs["cut_frac"] + ratio = schedule.attrs["ratio"] + t = schedule.attrs["t"] + + t_step = step + t + 1.0 + if t_step < cut: + p = t_step / cut + else: + p = 1 - ((t_step - cut) / (cut * (1 / cut_frac - 1))) + return max_rate * (1 + p * (ratio - 1)) * (1 / ratio) @registry.schedules("warmup_linear.v1") def warmup_linear( initial_rate: float, warmup_steps: int, total_steps: int -) -> Iterable[float]: +) -> Schedule[float]: """Generate a series, starting from an initial rate, and then with a warmup period, and then a linear decline. Used for learning rates. 
""" - step = 0 - while True: - if step < warmup_steps: - factor = step / max(1, warmup_steps) - else: - factor = max( - 0.0, (total_steps - step) / max(1.0, total_steps - warmup_steps) - ) - yield factor * initial_rate - step += 1 + return Schedule( + "warmup_linear", + _warmup_linear_schedule, + attrs={ + "initial_rate": initial_rate, + "warmup_steps": warmup_steps, + "total_steps": total_steps, + }, + ) + + +def _warmup_linear_schedule(schedule: Schedule, step: int, **kwargs) -> float: + initial_rate = schedule.attrs["initial_rate"] + warmup_steps = schedule.attrs["warmup_steps"] + total_steps = schedule.attrs["total_steps"] + + if step < warmup_steps: + factor = step / max(1, warmup_steps) + else: + factor = max(0.0, (total_steps - step) / max(1.0, total_steps - warmup_steps)) + return factor * initial_rate @registry.schedules("cyclic_triangular.v1") -def cyclic_triangular(min_lr: float, max_lr: float, period: int) -> Iterable[float]: - it = 1 - while True: - # https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee - cycle = numpy.floor(1 + it / (2 * period)) - x = numpy.abs(it / period - 2 * cycle + 1) - relative = max(0, 1 - x) - yield min_lr + (max_lr - min_lr) * relative - it += 1 +def cyclic_triangular(min_lr: float, max_lr: float, period: int) -> Schedule[float]: + return Schedule( + "cyclic_triangular", + _cyclic_triangular_schedule, + attrs={"min_lr": min_lr, "max_lr": max_lr, "period": period}, + ) + + +def _cyclic_triangular_schedule(schedule: Schedule, step: int, **kwargs) -> float: + min_lr = schedule.attrs["min_lr"] + max_lr = schedule.attrs["max_lr"] + period = schedule.attrs["period"] + + it = step + 1 + # https://towardsdatascience.com/adaptive-and-cyclical-learning-rates-using-pytorch-2bf904d18dee + cycle = numpy.floor(1 + it / (2 * period)) + x = numpy.abs(it / period - 2 * cycle + 1) + relative = max(0, 1 - x) + return min_lr + (max_lr - min_lr) * relative __all__ = [ diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index 0dceadfc4..e028937da 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -178,9 +178,8 @@ def decaying(base_rate: float, repeat: int) -> List[float]: return repeat * [base_rate] optimizer = my_registry.resolve(config)["optimizer"] - assert optimizer.b1 == 0.2 - assert "learn_rate" in optimizer.schedules - assert optimizer.learn_rate == 0.001 + assert optimizer.b1(step=optimizer._step, key=(0, "")) == 0.2 + assert optimizer.learn_rate(step=optimizer._step, key=(0, "")) == 0.001 def test_handle_generic_model_type(): diff --git a/thinc/tests/test_optimizers.py b/thinc/tests/test_optimizers.py index a31dbce32..0fab737f9 100644 --- a/thinc/tests/test_optimizers.py +++ b/thinc/tests/test_optimizers.py @@ -1,8 +1,12 @@ import pytest from thinc.api import registry, Optimizer +from thinc.optimizers import KeyT, _wrap_generator import numpy +STUB_KEY: KeyT = (0, "") + + def _test_schedule_valid(): while True: yield 0.456 @@ -29,6 +33,22 @@ def schedule_valid(request): return r_func(), r1, r2, r3 +@pytest.fixture( + params=[ + (lambda: 0.123, 0.123, 0.123, 0.123), + (lambda: (i for i in [0.2, 0.1, 0.4, 0.5, 0.6, 0.7, 0.8]), 0.2, 0.1, 0.4), + (lambda: (i for i in [0.333, 0.666]), 0.333, 0.666, 0.666), + (lambda: [0.9, 0.8, 0.7], 0.9, 0.8, 0.7), + (lambda: [0.0, 0.123], 0.0, 0.123, 0.123), + ], + scope="function", +) +def schedule_config_valid(request): + # Use lambda to prevent iterator from being consumed by first test + r_func, r1, r2, r3 = request.param + return r_func(), r1, 
r2, r3 + + @pytest.fixture( params=[ (lambda: "hello"), @@ -49,32 +69,32 @@ def test_optimizers_from_config(name): learn_rate = 0.123 cfg = {"@optimizers": name, "learn_rate": learn_rate} optimizer = registry.resolve({"config": cfg})["config"] - assert optimizer.learn_rate == learn_rate + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == learn_rate -def test_optimizer_schedules_from_config(schedule_valid): - lr, lr_next1, lr_next2, lr_next3 = schedule_valid +def test_optimizer_schedules_from_config(schedule_config_valid): + lr, lr_next1, lr_next2, lr_next3 = schedule_config_valid cfg = {"@optimizers": "Adam.v1", "learn_rate": lr} optimizer = registry.resolve({"cfg": cfg})["cfg"] - assert optimizer.learn_rate == lr_next1 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next1 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next2 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next2 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next3 - optimizer.learn_rate = 1.0 - assert optimizer.learn_rate == 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next3 + optimizer.learn_rate = lambda *, step, key: 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == 1.0 def test_optimizer_schedules_valid(schedule_valid): lr, lr_next1, lr_next2, lr_next3 = schedule_valid optimizer = Optimizer(learn_rate=lr) - assert optimizer.learn_rate == lr_next1 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next1 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next2 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next2 optimizer.step_schedules() - assert optimizer.learn_rate == lr_next3 - optimizer.learn_rate = 1.0 - assert optimizer.learn_rate == 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == lr_next3 + optimizer.learn_rate = lambda *, step, key: 1.0 + assert optimizer.learn_rate(step=optimizer._step, key=STUB_KEY) == 1.0 def test_optimizer_schedules_invalid(schedule_invalid): @@ -97,3 +117,31 @@ def test_optimizer_init(): optimizer((0, "x"), W, dW) optimizer = Optimizer(learn_rate=0.123, beta1=0.1, beta2=0.1) optimizer((1, "x"), W, dW) + + +def test_optimizer_last_score(): + optimizer = Optimizer( + learn_rate=0.123, + ) + + assert optimizer.last_score is None + optimizer.last_score = 1.0 + assert optimizer.last_score == (0, 1.0) + optimizer.step_schedules() + optimizer.step_schedules() + assert optimizer.last_score == (0, 1.0) + optimizer.last_score = 2.0 + assert optimizer.last_score == (2, 2.0) + + +def test_generator_schedule(): + s = _wrap_generator("test", iter([0.0, 1.0, 2.0, 3.0])) + assert s(step=0, key=STUB_KEY, last_score=None) == 0.0 + assert s(step=0, key=STUB_KEY, last_score=None) == 0.0 + assert s(step=1, key=STUB_KEY, last_score=None) == 1.0 + assert s(step=1, key=STUB_KEY, last_score=None) == 1.0 + assert s(step=3, key=STUB_KEY, last_score=None) == 3.0 + assert s(step=10, key=STUB_KEY, last_score=None) == 3.0 + + with pytest.raises(ValueError, match=r"must not decrease"): + s(step=1, key=STUB_KEY, last_score=None) diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index d975d2dbd..d8da928e4 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,63 +1,76 @@ from thinc.api import decaying, compounding, slanted_triangular, constant_then from thinc.api import constant, warmup_linear, cyclic_triangular +from thinc.optimizers import 
KeyT def test_decaying_rate(): rates = decaying(0.001, 1e-4) - rate = next(rates) + rate = rates(step=0) assert rate == 0.001 - next_rate = next(rates) + next_rate = rates(step=1) assert next_rate < rate assert next_rate > 0 - assert next_rate > next(rates) + assert next_rate > rates(step=2) + + rates_offset = decaying(0.001, 1e-4, t=1.0) + assert rates(step=1) == rates_offset(step=0) + assert rates(step=2) == rates_offset(step=1) def test_compounding_rate(): rates = compounding(1, 16, 1.01) - rate0 = next(rates) + rate0 = rates(step=0) assert rate0 == 1.0 - rate1 = next(rates) - rate2 = next(rates) - rate3 = next(rates) + rate1 = rates(step=1) + rate2 = rates(step=2) + rate3 = rates(step=3) assert rate3 > rate2 > rate1 > rate0 assert (rate3 - rate2) > (rate2 - rate1) > (rate1 - rate0) + rates_offset = compounding(1, 16, 1.01, t=1.0) + assert rates(step=1) == rates_offset(step=0) + assert rates(step=2) == rates_offset(step=1) + def test_slanted_triangular_rate(): rates = slanted_triangular(1.0, 20.0, ratio=10) - rate0 = next(rates) + rate0 = rates(step=0) assert rate0 < 1.0 - rate1 = next(rates) + rate1 = rates(step=1) assert rate1 > rate0 - rate2 = next(rates) + rate2 = rates(step=2) assert rate2 < rate1 - rate3 = next(rates) + rate3 = rates(step=3) assert rate0 < rate3 < rate2 + rates_offset = slanted_triangular(1.0, 20.0, ratio=10, t=1.0) + assert rates(step=1) == rates_offset(step=0) + assert rates(step=2) == rates_offset(step=1) + def test_constant_then_schedule(): - rates = constant_then(1.0, 2, [100, 200]) - assert next(rates) == 1.0 - assert next(rates) == 1.0 - assert next(rates) == 100 - assert next(rates) == 200 + rates = constant_then(1.0, 2, constant(100)) + assert rates(step=0) == 1.0 + assert rates(step=1) == 1.0 + assert rates(step=2) == 100 + assert rates(step=3) == 100 def test_constant(): rates = constant(123) - assert next(rates) == 123 - assert next(rates) == 123 + assert rates(step=0, key=(0, "")) == 123 + assert rates(step=0, key=(0, "")) == 123 def test_warmup_linear(): rates = warmup_linear(1.0, 2, 10) expected = [0.0, 0.5, 1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125, 0.0] for i in range(11): - assert next(rates) == expected[i] + assert rates(step=i, key=(0, "")) == expected[i] def test_cyclic_triangular(): rates = cyclic_triangular(0.1, 1.0, 2) expected = [0.55, 1.0, 0.55, 0.1, 0.55, 1.0, 0.55, 0.1, 0.55, 1.0] for i in range(10): - assert next(rates) == expected[i] + assert rates(step=i, key=(0, "")) == expected[i] diff --git a/website/docs/api-optimizers.md b/website/docs/api-optimizers.md index 47873cc1c..2deab184e 100644 --- a/website/docs/api-optimizers.md +++ b/website/docs/api-optimizers.md @@ -14,10 +14,9 @@ zero the gradients in place. The optimizers are registered in the ### SGD {#sgd tag="function"} -If a hyperparameter specifies a schedule as a list or generator, its value will -be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +Function to create a SGD optimizer. If a hyperparameter specifies a schedule, +the step that is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). @@ -58,10 +57,9 @@ use_averages = true ### Adam {#adam tag="function"} Function to create an Adam optimizer. Returns an instance of -[`Optimizer`](#optimizer). 
If a hyperparameter specifies a schedule as a list or -generator, its value will be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +[`Optimizer`](#optimizer). If a hyperparameter specifies a schedule, the step +that is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). @@ -113,10 +111,9 @@ use_averages = true ### RAdam {#radam tag="function"} Function to create an RAdam optimizer. Returns an instance of -[`Optimizer`](#optimizer). If a hyperparameter specifies a schedule as a list or -generator, its value will be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +[`Optimizer`](#optimizer). If a hyperparameter specifies a schedule, the step +that is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). @@ -171,10 +168,9 @@ momentum. Currently support "vanilla" SGD, Adam, and RAdam. ### Optimizer.\_\_init\_\_ {#init tag="method"} -Initialize an optimizer. If a hyperparameter specifies a schedule as a list or -generator, its value will be replaced with the next item on each call to -[`Optimizer.step_schedules`](#step-schedules). Once the schedule is exhausted, -its last value will be used. +Initialize an optimizer. If a hyperparameter specifies a schedule, the step that +is passed to the schedule will be incremented on each call to +[`Optimizer.step_schedules`](#step-schedules). ```python ### Example @@ -213,9 +209,8 @@ and parameter name. ### Optimizer.step_schedules {#step_schedules tag="method"} -Replace the the named hyperparameters with the next item from the schedules -iterator, if available. Once the schedule is exhausted, its last value will be -used. +Increase the current step of the optimizer. This step will be used by schedules +to determine their next value. ```python ### Example diff --git a/website/docs/api-schedules.md b/website/docs/api-schedules.md index f15877111..0a395ff6d 100644 --- a/website/docs/api-schedules.md +++ b/website/docs/api-schedules.md @@ -5,11 +5,94 @@ next: /docs/api-loss Schedules are generators that provide different rates, schedules, decays or series. They're typically used for batch sizes or learning rates. You can easily -implement your own schedules as well: just write your own generator function, -that produces whatever series of values you need. A common use case for -schedules is within [`Optimizer`](/docs/api-optimizer) objects, which accept -iterators for most of their parameters. See the -[training guide](/docs/usage-training) for details. +implement your own schedules as well: just write your own +[`Schedule`](#schedule) implementation, that produces whatever series of values +you need. A common use case for schedules is within +[`Optimizer`](/docs/api-optimizer) objects, which accept iterators for most of +their parameters. See the [training guide](/docs/usage-training) for details. + +## Schedule {#schedule tag="class" new="9"} + +Class for implementing Thinc schedules. + + + +There's only one `Schedule` class in Thinc and schedules are built using +**composition**, not inheritance. This means that a schedule or composed +schedule will return an **instance** of `Schedule` – it doesn't subclass it. To +read more about this concept, see the pages on +[Thinc's philosophy](/docs/concept). 
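For a concrete usage sketch of that composition style: a composed schedule such as `constant_then` simply wraps another `Schedule` and returns a new `Schedule` instance.

```python
# Composition in practice: constant_then() wraps another schedule; the result
# is an ordinary Schedule instance, not a subclass.
from thinc.api import Schedule, constant_then, warmup_linear

schedule = constant_then(0.001, 1000, warmup_linear(0.001, 2000, 10000))
assert isinstance(schedule, Schedule)
assert schedule(step=0) == 0.001       # constant phase
assert schedule(step=1000) == 0.0005   # handed off to warmup_linear
```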
+ + + +### Typing {#typing} + +`Schedule` can be used as a +[generic type](https://docs.python.org/3/library/typing.html#generics) with one +parameter. This parameter specifies the type that is returned by the schedule. +For instance, `Schedule[int]` denotes a scheduler that returns integers when +called. A mismatch will cause a type error. For more details, see the docs on +[type checking](/docs/usage-type-checking). + +```python +from thinc.api import Schedule + +def my_function(schedule: Schedule[int]): + ... +``` + +### Attributes {#attributes} + +| Name | Type | Description | +| ------ | ------------ | ------------------------------- | +| `name` | str | The name of the scheduler type. | + +### Properties {#properties} + +| Name | Type | Description | +| ------- | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `attrs` | Dict[str, Any] | The scheduler attributes. You can use the dict directly and assign _to_ it – but you cannot reassign `schedule.attrs` to a new variable: `schedule.attrs = {}` will fail. | + +### Schedule.\_\_init\_\_ {#init tag="method"} + +Initialize a new schedule. + +```python +### Example +schedule = Schedule( + "constant", + constant_schedule, + attrs={"rate": rate}, +) +``` + +| Argument | Type | Description | +| -------------- | ----------------------- | -------------------------------------------------------- | +| `name` | str | The name of the schedule type. | +| `schedule` | Callable | Function to compute the schedule value for a given step. | +| _keyword-only_ | | | +| `attrs` | Dict[str, Any] | Dictionary of non-parameter attributes. | + +### Schedule.\_\_call\_\_ {#call tag="method"} + +Call the schedule function, returning the value for the given step. The +`step` positional argument is always required. Some schedules may require +additional keyword arguments. + +```python +### Example +from thinc.api import constant + +schedule = constant(0.1) +assert schedule(0) == 0.1 +assert schedule(1000) == 0.1 +``` + +| Argument | Type | Description | +| ----------- | ------------ | ------------------------------------------ | +| `step` | int | The step to compute the schedule for. | +| `**kwargs` | | Optional arguments passed to the schedule. | +| **RETURNS** | Any | The schedule value for the step. | ## constant {#constant tag="function"} @@ -24,7 +107,7 @@ Yield a constant rate. from thinc.api import constant batch_sizes = constant(0.001) -batch_size = next(batch_sizes) +batch_size = batch_sizes(step=0) ``` ```ini @@ -58,7 +141,7 @@ learn_rates = constant_then( 1000, decaying(0.005, 1e-4) ) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini @@ -97,8 +180,8 @@ Yield an infinite series of linearly decaying values, following the schedule from thinc.api import decaying learn_rates = decaying(0.005, 1e-4) -learn_rate = next(learn_rates) # 0.001 -learn_rate = next(learn_rates) # 0.00999 +learn_rate = learn_rates(step=0) # 0.001 +learn_rate = learn_rates(step=1) # 0.00999 ``` ```ini @@ -135,8 +218,8 @@ rate. from thinc.api import compounding batch_sizes = compounding(1.0, 32.0, 1.001) -batch_size = next(batch_sizes) # 1.0 -batch_size = next(batch_sizes) # 1.0 * 1.001 +batch_size = batch_sizes(step=0) # 1.0 +batch_size = batch_sizes(step=1) # 1.0 * 1.001 ``` ```ini @@ -174,7 +257,7 @@ and then a linear decline. Used for learning rates. 
from thinc.api import warmup_linear learn_rates = warmup_linear(0.01, 3000, 6000) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini @@ -210,7 +293,7 @@ triangular learning rate" schedule. from thinc.api import slanted_triangular learn_rates = slanted_triangular(0.1, 5000) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini @@ -251,7 +334,7 @@ Linearly increasing then linearly decreasing the rate at each cycle. from thinc.api import cyclic_triangular learn_rates = cyclic_triangular(0.005, 0.001, 1000) -learn_rate = next(learn_rates) +learn_rate = learn_rates(step=0) ``` ```ini diff --git a/website/docs/usage-config.md b/website/docs/usage-config.md index 73a1638ac..2887c39d5 100644 --- a/website/docs/usage-config.md +++ b/website/docs/usage-config.md @@ -190,21 +190,30 @@ For details and examples, see the The function registry integration becomes even more powerful when used to build **recursive structures**. Let's say you want to use a learning rate schedule and -pass in a generator as the `learn_rate` argument. Here's an example of a -function that yields an infinite series of decaying values, following the -schedule `base_rate * 1 / (1 + decay * t)`. It's also available in Thinc as +pass in a schedule as the `learn_rate` argument. Here's an example of a function +that yields an infinite series of decaying values, following the schedule +`base_rate * 1 / (1 + decay * t)`. It's also available in Thinc as [`schedules.decaying`](/docs/api-schedules#decaying). The decorator registers the function `"my_cool_decaying_schedule.v1"` in the registry `schedules`: ```python -from typing import Iterable import thinc +from thinc.schedules import Schedule @thinc.registry.schedules("my_cool_decaying_schedule.v1") -def decaying(base_rate: float, decay: float, *, t: int = 0) -> Iterable[float]: - while True: - yield base_rate * (1.0 / (1.0 + decay * t)) - t += 1 +def decaying(base_rate: float, decay: float, *, t: int = 0) -> Schedule[float]: + return Schedule( + "decaying", + _decaying_schedule, + attrs={"base_rate": base_rate, "decay": decay, "t": t} + ) + + +def _decaying_schedule(schedule: Schedule, step: int, **kwargs) -> float: + base_rate = schedule.attrs["base_rate"] + decay = schedule.attrs["decay"] + t = schedule.attrs["t"] + return base_rate * (1.0 / (1.0 + decay * (step + t))) ``` In your config, you can now define the `learn_rate` as a subsection of @@ -230,15 +239,6 @@ argument. If type annotations are available for the return value and it's a type that can be evaluated, the return value of the function will be validated as well. - - -**A note on validating generators:** If a value is a generator, it won't be -validated further, since this would mean having to execute and consume it. -Generators can potentially be infinite – like the decaying schedule in this -example – so checking its return value isn't viable. 
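Since schedules are now plain objects rather than generators, the caveat above about validating generators no longer applies. One motivation called out in the commit that introduced `Schedule` is that these objects, unlike generators, can be pickled, which matters when a schedule ends up inside a larger serialized object. A small sketch of that round-trip:

```python
# Sketch: Schedule objects keep their state in plain attributes, so they
# survive a pickle round-trip.
import pickle

from thinc.api import decaying

schedule = decaying(0.005, 1e-4)
restored = pickle.loads(pickle.dumps(schedule))
assert restored(step=10) == schedule(step=10)
```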
- - - ```python ### Under the hood learn_rate_func = thinc.registry.get("schedules", "my_cool_decaying_schedule.v1") @@ -290,11 +290,22 @@ values: ```python ### {small="true"} +import thinc +from thinc.schedules import Schedule + @thinc.registry.schedules("my_cool_schedule.v1") -def schedule(*steps: float, final: float = 1.0) -> Iterable[float]: - yield from steps - while True: - yield final +def step_values(*steps: float, final: float = 1.0) -> Schedule[float]: + step_list = list(steps) + return Schedule( + "step_values", + _step_values_schedule, + attrs={"steps": list(steps), "final": final} + ) + +def _step_values_schedule(schedule: Schedule, step: int, **kwargs) -> float: + steps = schedule.attrs["steps"] + final = schedule.attrs["final"] + return steps[step] if step < len(steps) else final ``` ```ini diff --git a/website/docs/usage-training.md b/website/docs/usage-training.md index c34648b89..8df7127a3 100644 --- a/website/docs/usage-training.md +++ b/website/docs/usage-training.md @@ -120,10 +120,9 @@ also simply consume the entire generator, by calling `list()` on it. Finally, `minibatch` and `multibatch` support **variable length batching**, based on a schedule you can provide as the `batch_size` argument. Simply pass in -an iterable (such as a generator from the -[built-in schedules](/docs/api-schedules)) instead of an integer. Variable -length batching is non-standard, but we regularly use it for some of -[spaCy](https://spacy.io)'s models, especially the parser and entity recognizer. +an iterable. Variable length batching is non-standard, but we regularly use it +for some of [spaCy](https://spacy.io)'s models, especially the parser and entity +recognizer. ```python from thinc.api import compounding @@ -225,37 +224,39 @@ normalize = true A common trick for stochastic gradient descent is to **vary the learning rate or other hyperparameters** over the course of training. Since there are many possible ways to vary the learning rate, Thinc lets you implement hyperparameter -schedules as simple generator functions. Thinc also provides a number of -[popular schedules](/docs/api-schedules) built-in. - -You can use schedules directly, by calling `next()` on the schedule and using it -to update hyperparameters in your training loop. Since schedules are -particularly common for optimization settings, the -[`Optimizer`](/docs/api-optimizer) object accepts floats, lists and iterators -for most of its parameters. When you call -[`Optimizer.step_schedules`](/docs/api-optimizer#step_schedules), the optimizer -will draw the next value from the generators and use them to change the given -attributes. For instance, here's how to create an instance of the `Adam` -optimizer with a custom learning rate schedule: +schedules as instances of the [`Schedule`](/docs/api-schedules#schedule) class. +Thinc also provides a number of [popular schedules](/docs/api-schedules) +built-in. + +You can use schedules directly, by calling the schedule with the `step` keyword +argument and using it to update hyperparameters in your training loop. Since +schedules are particularly common for optimization settings, the +[`Optimizer`](/docs/api-optimizer) object accepts floats, lists, iterators, and +[`Schedule`](/docs/api-schedules#schedule) instances for most of its parameters. +When you call [`Optimizer.step_schedules`](/docs/api-optimizer#step_schedules), +the optimizer will increase its step count and pass it to the schedules. 
For +instance, this is how one creates an instance of the `Adam` optimizer with a +custom learning rate schedule: ```python ### Custom learning rate schedule -from thinc.api import Adam +from thinc.api import Adam, Schedule -def my_schedule(): +def cycle(): values = [0.001, 0.01, 0.1] - while True: - for value in values: - yield value - for value in reversed(values): - yield value - -optimizer = Adam(learn_rate=my_schedule()) -assert optimizer.learn_rate == 0.001 + all_values = values + list(reversed(values)) + return Schedule("cycle", _cycle_schedule, attrs={"all_values": all_values}) + +def _cycle_schedule(schedule: Schedule, step: int, **kwargs) -> float: + all_values = schedule.attrs["all_values"] + return all_values[step % len(all_values)] + +optimizer = Adam(learn_rate=cycle()) +assert optimizer.learn_rate(optimizer.step) == 0.001 optimizer.step_schedules() -assert optimizer.learn_rate == 0.01 +assert optimizer.learn_rate(optimizer.step) == 0.01 optimizer.step_schedules() -assert optimizer.learn_rate == 0.1 +assert optimizer.learn_rate(optimizer.step) == 0.1 ``` ![](images/schedules_custom1.svg) @@ -271,13 +272,14 @@ of the optimizer. Check out the ```python ### Registered function {small="true"} -@thinc.registry.schedules("my_schedule.v1") -def my_schedule(values): - while True: - for value in values: - yield value - for value in reversed(values): - yield value +@thinc.registry.schedules("cycle.v1") +def cycle(values): + all_values = values + list(reversed(values)) + return Schedule("cycle", _cycle_schedule, attrs={"all_values": all_values}) + +def _cycle_schedule(schedule: Schedule, step: int, **kwargs) -> float: + all_values = schedule.attrs["all_values"] + return all_values[step % len(all_values)] ``` ```ini @@ -286,7 +288,7 @@ def my_schedule(values): @optimizers = "Adam.v1" [optimizer.learn_rate] -@schedules = "my_schedule.v1" +@schedules = "cycle.v1" values = [0.001, 0.01, 0.1] ``` From f6f6c81b4b60ccab4988d9b30acac5d08303e1fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 23 Dec 2022 09:49:28 +0100 Subject: [PATCH 11/30] Set version to v9.0.0.dev1 (#829) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 3c68811b6..bb8e99dad 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev0" +__version__ = "9.0.0.dev1" __release__ = True From 7f35b3c48955e8e88deaf6c75a3d03d38c6be1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 12 Jan 2023 13:44:27 +0100 Subject: [PATCH 12/30] Add `Schedule.to_generator` (#837) * Add `Schedule.to_generator` This method turns a `Schedule` into a generator by feeding the `Schedule` steps with a given starting step and increment. 
* Doc fix Co-authored-by: Madeesh Kannan * docs: add default values for Schedule.to_generator * fix anchor Co-authored-by: Madeesh Kannan Co-authored-by: Sofie Van Landeghem --- thinc/schedules.py | 24 +++++++++++++++++++++++- thinc/tests/test_schedules.py | 7 +++++++ website/docs/api-schedules.md | 27 ++++++++++++++++++++++++--- 3 files changed, 54 insertions(+), 4 deletions(-) diff --git a/thinc/schedules.py b/thinc/schedules.py index 73711f87e..37a3cc04c 100644 --- a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,5 +1,6 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Any, Callable, Dict, Generic, TypeVar +from typing import Any, Callable, Dict, Generator, Generic, TypeVar +import itertools import numpy from .config import registry @@ -41,6 +42,27 @@ def attrs(self): """Schedule attributes.""" return self._attrs + def to_generator( + self, start: int = 0, step_size=1, **extra + ) -> Generator[OutT, None, None]: + """Turn the schedule into a generator. + + start (int): The schedule initial step. + step_size (int): The amount to increase the step for each generated value. + **extra: Additional arguments that are passed to the schedule. + RETURNS (Generator[OutT, None, None]): The generator. + """ + if start < 0: + raise ValueError(f"Schedule start must be non-negative, was: {start}") + if step_size < 0: + raise ValueError(f"Step size must be non-negative, was: {step_size}") + + def generate(): + for step in itertools.count(start, step_size): + yield self(step, **extra) + + return generate() + @registry.schedules("constant_then.v1") def constant_then(rate: OutT, steps: int, schedule: Schedule[OutT]) -> Schedule[OutT]: diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index d8da928e4..710d304de 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,3 +1,4 @@ +from itertools import islice from thinc.api import decaying, compounding, slanted_triangular, constant_then from thinc.api import constant, warmup_linear, cyclic_triangular from thinc.optimizers import KeyT @@ -74,3 +75,9 @@ def test_cyclic_triangular(): expected = [0.55, 1.0, 0.55, 0.1, 0.55, 1.0, 0.55, 0.1, 0.55, 1.0] for i in range(10): assert rates(step=i, key=(0, "")) == expected[i] + + +def test_to_generator(): + rates = warmup_linear(1.0, 2, 10) + expected = [0.0, 0.5, 1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125, 0.0] + assert list(islice(rates.to_generator(), len(expected))) == expected diff --git a/website/docs/api-schedules.md b/website/docs/api-schedules.md index 0a395ff6d..872d73cdf 100644 --- a/website/docs/api-schedules.md +++ b/website/docs/api-schedules.md @@ -75,9 +75,9 @@ schedule = Schedule( ### Schedule.\_\_call\_\_ {#call tag="method"} -Call the schedule function, returning the value for the given step. The -`step` positional argument is always required. Some schedules may require -additional keyword arguments. +Call the schedule function, returning the value for the given step. The `step` +positional argument is always required. Some schedules may require additional +keyword arguments. ```python ### Example @@ -94,6 +94,27 @@ assert schedule(1000) == 0.1 | `**kwargs` | | Optional arguments passed to the schedule. | | **RETURNS** | Any | The schedule value for the step. | +### Schedule.to_generator {#to_generator tag="method"} + +Turn the schedule into a generator by passing monotonically increasing step +count into the schedule. 
+ +```python +### Example +from thinc.api import constant + +g = constant(0.1).to_generator() +assert next(g) == 0.1 +assert next(g) == 0.1 +``` + +| Argument | Type | Description | +| ----------- | ------------------------------------ | ------------------------------------------------------------------------------- | +| `start` | int | The initial schedule step. Defaults to `0`. | +| `step_size` | int | The amount to increase the step with for each generated value. Defaults to `1`. | +| `**kwargs` | | Optional arguments passed to the schedule. | +| **RETURNS** | Generator[OutT, None, None] | The generator. | + ## constant {#constant tag="function"} Yield a constant rate. From bbe8f537cfad778f2b6f3753a9a4d7a4b9e0c933 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Thu, 12 Jan 2023 17:36:17 +0100 Subject: [PATCH 13/30] Set version to v9.0.0.dev2 --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index bb8e99dad..b8ed8d6e7 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev1" +__version__ = "9.0.0.dev2" __release__ = True From f576d1e2b3e8e5c9223bb21ed8f3321727fbe5b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 19 Jan 2023 11:14:17 +0100 Subject: [PATCH 14/30] Add plateau.v1 schedule (#842) * Add plateau.v1 schedule This schedule yields values from the wrapped schedule, exponentially scaled by the number of times optimization has plateaued. * Fix anchor * Remove stagnant wording in favor of plateaus * Type annotation: last_score is Optional Also set a default value, to that the schedule does not fail when the last_score argument is not provided. * Update docs to clarify that passing last_score is not mandatory * Document plateau arguments --- thinc/api.py | 3 +- thinc/schedules.py | 107 +++++++++++++++++++++++++++++++++- thinc/tests/test_schedules.py | 19 ++++++ website/docs/api-schedules.md | 44 ++++++++++++++ 4 files changed, 171 insertions(+), 2 deletions(-) diff --git a/thinc/api.py b/thinc/api.py index 3d904fe29..b296875b6 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -10,6 +10,7 @@ from .optimizers import Adam, RAdam, SGD, Optimizer from .schedules import Schedule, cyclic_triangular, warmup_linear, constant from .schedules import constant_then, decaying, slanted_triangular, compounding +from .schedules import plateau from .types import Ragged, Padded, ArgsKwargs, Unserializable from .util import fix_random_seed, is_cupy_array, set_active_gpu from .util import prefer_gpu, require_gpu, require_cpu @@ -67,7 +68,7 @@ "Adam", "RAdam", "SGD", "Optimizer", # .schedules "Schedule", "cyclic_triangular", "warmup_linear", "constant", "constant_then", - "decaying", "slanted_triangular", "compounding", + "decaying", "slanted_triangular", "compounding", "plateau", # .types "Ragged", "Padded", "ArgsKwargs", "Unserializable", # .util diff --git a/thinc/schedules.py b/thinc/schedules.py index 37a3cc04c..49e43a0c8 100644 --- a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,5 +1,7 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Any, Callable, Dict, Generator, Generic, TypeVar +from typing import Any, Callable, Dict, Generator, Generic, Tuple, TypeVar +from typing import Optional +from dataclasses import dataclass import itertools import numpy @@ -155,6 +157,109 @@ def _clip(value: float, start: float, stop: float) -> float: return max(value, stop) if (start > stop) else min(value, stop) 
+@registry.schedules("plateau.v1") +def plateau( + max_patience: int, scale: float, schedule: Schedule[float] +) -> Schedule[float]: + + """Yields values from the wrapped schedule, exponentially scaled by the + number of times optimization has plateaued. The caller must pass model + evaluation scores through the last_score argument for the scaling to be + adjusted. The last evaluation score is passed through the last_score argument + as a tuple (last_score_step, last_score). This tuple indicates when a model + was last evaluated (last_score_step) and with what score (last_score). + + max_patience (int): the number of evaluations without improvement when + we consider the model to have plateaued. + scale (float): scaling of the inner schedule (scale**n_plateaus * inner). + schedule (Schedule[float]): the schedule to wrap. + """ + + return Schedule( + "plateau", + _plateau_schedule, + attrs={ + "scale": scale, + "max_patience": max_patience, + "schedule": schedule, + "state": _PlateauState( + best_score=None, last_score_step=None, patience=0, n_plateaus=0 + ), + }, + ) + + +def _plateau_schedule( + schedule: Schedule, + step: int, + *, + last_score: Optional[Tuple[int, float]] = None, + **kwargs, +) -> float: + inner_schedule: Schedule[float] = schedule.attrs["schedule"] + max_patience: int = schedule.attrs["max_patience"] + scale: float = schedule.attrs["scale"] + state: _PlateauState = schedule.attrs["state"] + + if last_score is None: + return (scale**state.n_plateaus) * inner_schedule( + step=step, last_score=last_score, **kwargs + ) + + last_score_step, last_score_ = last_score + + if ( + state.best_score is None + or state.last_score_step is None + or last_score_ > state.best_score + ): + state.best_score = last_score_ + state.patience = 0 + elif last_score_step < state.last_score_step: + raise ValueError( + f"Expected score with step >= {state.last_score_step}, was: {last_score_step}" + ) + elif last_score_step > state.last_score_step: + # If the score didn't improve and we are not seeing the last + # score again, we may be at a plateau, so increase patience. + state.patience += 1 + + # If we are at the maximum patience, we consider the optimization + # to have reached a plateau. + if state.patience == max_patience: + state.n_plateaus += 1 + state.patience = 0 + + state.last_score_step = last_score_step + + return (scale**state.n_plateaus) * inner_schedule( + step=step, last_score=last_score, **kwargs + ) + + +@dataclass +class _PlateauState: + """Plateau schedule state. + + best_score (Optional[float]): the best score so far, or None when no + score has been observed. + last_score_step (Optional[int]): the step of the last score that was + observed. + patience (int): the number of scores so far which do not improve over + the best score (reset after reaching the maximum patience). + n_plateaus (int): the number of times the maximum patience has been + reached. 
+ """ + + best_score: Optional[float] + last_score_step: Optional[int] + patience: int + n_plateaus: int + + # @dataclass(slots=True) is only supported in Python >= 3.10 + __slots__ = ["best_score", "last_score_step", "patience", "n_plateaus"] + + @registry.schedules("slanted_triangular.v1") def slanted_triangular( max_rate: float, diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index 710d304de..c404fe128 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,7 +1,9 @@ from itertools import islice +import pytest from thinc.api import decaying, compounding, slanted_triangular, constant_then from thinc.api import constant, warmup_linear, cyclic_triangular from thinc.optimizers import KeyT +from thinc.schedules import plateau def test_decaying_rate(): @@ -77,6 +79,23 @@ def test_cyclic_triangular(): assert rates(step=i, key=(0, "")) == expected[i] +def test_plateau(): + schedule = plateau(2, 0.5, constant(1.0)) + assert schedule(step=0, last_score=None) == 1.0 + assert schedule(step=1, last_score=(1, 1.0)) == 1.0 # patience == 0 + assert schedule(step=2, last_score=(2, 1.0)) == 1.0 # patience == 1 + assert schedule(step=3, last_score=None) == 1.0 # patience == 1 + assert schedule(step=4, last_score=(4, 1.0)) == 0.5 # patience == 2, reset + assert schedule(step=5, last_score=(4, 1.0)) == 0.5 # patience == 0 + assert schedule(step=6, last_score=(6, 0.9)) == 0.5 # patience == 1 + assert schedule(step=7, last_score=(7, 2.0)) == 0.5 # patience == 0 + assert schedule(step=8, last_score=(8, 1.0)) == 0.5 # patience == 1 + assert schedule(step=9, last_score=(9, 2.0)) == 0.25 # patience == 2, reset + + with pytest.raises(ValueError, match=r"Expected score with step"): + schedule(step=1, last_score=(1, 1.0)) == 1.0 + + def test_to_generator(): rates = warmup_linear(1.0, 2, 10) expected = [0.0, 0.5, 1.0, 0.875, 0.75, 0.625, 0.5, 0.375, 0.25, 0.125, 0.0] diff --git a/website/docs/api-schedules.md b/website/docs/api-schedules.md index 872d73cdf..c3837055b 100644 --- a/website/docs/api-schedules.md +++ b/website/docs/api-schedules.md @@ -375,3 +375,47 @@ period = 1000 | `max_lr` | float | | `period` | int | | **YIELDS** | float | + +## plateau {#plateau tag="function" new="9"} + +Yields values from the wrapped schedule, exponentially scaled by the number of +times optimization has plateaued. The caller must pass model evaluation scores +through the `last_score` argument for the scaling to be adjusted. The last +evaluation score is passed through the `last_score` argument as a tuple +(`last_score_step`, `last_score`). This tuple indicates when a model was last +evaluated (`last_score_step`) and with what score (`last_score`). 
+
+
+```python
+### {small="true"}
+from thinc.api import constant, plateau
+
+schedule = plateau(2, 0.5, constant(1.0))
+assert schedule(step=0, last_score=(0, 1.0)) == 1.0
+assert schedule(step=1, last_score=(1, 1.0)) == 1.0
+assert schedule(step=2, last_score=(2, 1.0)) == 0.5
+assert schedule(step=3, last_score=(3, 1.0)) == 0.5
+assert schedule(step=4, last_score=(4, 1.0)) == 0.25
+```
+
+```ini
+### config {small="true"}
+[learn_rate]
+@schedules = "plateau.v1"
+scale = 0.5
+max_patience = 2
+
+[learn_rate.schedule]
+@schedules = "constant.v1"
+rate = 1.0
+```
+
+
+
+| Argument       | Type            | Description                                                                            |
+| -------------- | --------------- | -------------------------------------------------------------------------------------- |
+| `max_patience` | int             | Number of evaluations without an improvement to consider the model to have plateaued.  |
+| `scale`        | float           | Scaling of the inner schedule after plateauing.                                         |
+| `schedule`     | Schedule[float] | The schedule to wrap.                                                                   |
+| **RETURNS**    | Schedule[float] | The plateau schedule.                                                                   |

From fc24e8a7cf1d56ae069f824b1b8fb9f87ebef1a3 Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Wed, 1 Feb 2023 13:54:15 +0100
Subject: [PATCH 15/30] Smooth one hot fix (#830)

* fix valid label smoothing parameter
* remove print
* fix typo
* ensure number of classes larger than one
---
 thinc/tests/test_util.py | 22 ++++++++++++++++++++++
 thinc/util.py            | 22 +++++++++++++++++-----
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py
index 8d2d0058d..f525a5133 100644
--- a/thinc/tests/test_util.py
+++ b/thinc/tests/test_util.py
@@ -5,8 +5,10 @@
 from thinc.util import get_array_module, is_numpy_array, to_categorical
 from thinc.util import is_cupy_array
 from thinc.util import convert_recursive
+from thinc.util import smooth_one_hot
 from thinc.types import ArgsKwargs
+
 from . import strategies
 
 ALL_XP = [numpy]
@@ -145,6 +147,26 @@ def test_to_categorical(label_smoothing):
         to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.88)
 
 
+@given(
+    n_classes=strategies.lengths(lo=2, hi=100),
+    n_samples=strategies.lengths(lo=1, hi=100),
+    label_smoothing=strategies.floats(min_value=0.0, max_value=1.0)
+)
+def test_smooth_one_hot(n_samples, n_classes, label_smoothing):
+    one_hot = numpy.zeros((n_samples, n_classes))
+    labels = numpy.random.randint(0, n_classes, (n_samples,))
+    one_hot[numpy.arange(n_samples), labels] = 1
+    max_smooth = (n_classes - 1) / n_classes
+    if label_smoothing >= max_smooth:
+        with pytest.raises(ValueError, match=r"label_smoothing parameter has to be less than"):
+            smooth_one_hot(one_hot, label_smoothing)
+    else:
+        smoothed = smooth_one_hot(one_hot, label_smoothing)
+        assert numpy.all(numpy.argmax(smoothed, axis=1) == labels)
+        assert smoothed.shape == one_hot.shape
+        assert numpy.allclose(smoothed.sum(1), 1.0)
+
+
 def test_convert_recursive():
     is_match = lambda obj: obj == "foo"
     convert_item = lambda obj: obj.upper()
diff --git a/thinc/util.py b/thinc/util.py
index 059f2c235..08ad6c3d7 100644
--- a/thinc/util.py
+++ b/thinc/util.py
@@ -257,14 +257,25 @@ def smooth_one_hot(X: Floats2d, label_smoothing: float) -> Floats2d:
     """
     Apply label-smoothing to one-hot array.
""" - if not 0.0 <= label_smoothing < 0.5: + n_classes = X.shape[1] + max_smooth = (n_classes - 1) / n_classes + if label_smoothing < 0.0: + raise ValueError( + "Label-smoothing parameter has to be greater than or equal to 0" + ) + if not n_classes > 1: raise ValueError( - "label_smoothing should be greater or " - "equal to 0.0 and less than 0.5, " - f"but {label_smoothing} was provided." + "n_classes should be greater than 1 when label smoothing is enabled," + f"but {n_classes} was provided." + ) + if label_smoothing >= max_smooth: + raise ValueError( + f"For {n_classes} classes " + "label_smoothing parameter has to be less than " + f"{max_smooth}, but found {label_smoothing}." ) X[X == 1] = 1 - label_smoothing - X[X == 0] = label_smoothing / (X.shape[1] - 1) + X[X == 0] = label_smoothing / (n_classes - 1) return X @@ -631,6 +642,7 @@ def check_consistency(self, arr: ArrayXd): "require_gpu", "copy_array", "to_categorical", + "smooth_one_hot", "get_width", "xp2torch", "torch2xp", From bf0e2762c674973d56f09bf666d08ab7d84e2bef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 22 Mar 2023 15:29:48 +0100 Subject: [PATCH 16/30] Set version to v9.0.0.dev3 (#868) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index b8ed8d6e7..502500b04 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev2" +__version__ = "9.0.0.dev3" __release__ = True From 816ea330f1a47e476e1dab75d771c28a8837699b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 8 Jan 2024 16:50:15 +0100 Subject: [PATCH 17/30] Temporarily revert new loss implementations (#916) * Revert "Cross entropy fix (#647)" This reverts commit c8ac07fe734aaee43d8197bbf5c9a370f692766b. 
* Cherry pick MPS Torch bug to get CI unstuck --- .github/workflows/tests.yml | 6 +- examples/mnist.py | 5 +- thinc/legacy/__init__.py | 8 - thinc/legacy/loss.py | 285 ----------------- thinc/loss.py | 503 ++++++++++------------------- thinc/tests/test_loss.py | 611 +++++++----------------------------- thinc/tests/test_util.py | 21 -- thinc/util.py | 36 +-- 8 files changed, 284 insertions(+), 1191 deletions(-) delete mode 100644 thinc/legacy/__init__.py delete mode 100644 thinc/legacy/loss.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 90ea34aa2..8c868d876 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -101,7 +101,11 @@ jobs: run: | pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" pip install "mxnet; sys_platform != 'win32'" - pip install "torch!=1.13.0" --extra-index-url https://download.pytorch.org/whl/cpu + pip install "torch!=1.13.0; sys_platform!='darwin'" --extra-index-url https://download.pytorch.org/whl/cpu + # there is a bug related to MPS devices in github macos runners that + # will be fixed in torch v2.1.1 + # https://github.com/pytorch/pytorch/pull/111576 + pip install "torch>=2.1.1; sys_platform=='darwin'" --extra-index-url https://download.pytorch.org/whl/cpu pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" pip install "numpy<1.24.0" pip install -r requirements.txt diff --git a/examples/mnist.py b/examples/mnist.py index 790bcc640..971f4645b 100644 --- a/examples/mnist.py +++ b/examples/mnist.py @@ -4,7 +4,6 @@ """ # pip install thinc ml_datasets typer from thinc.api import Model, chain, Relu, Softmax, Adam -from thinc.api import CategoricalCrossentropy import ml_datasets from wasabi import msg from tqdm import tqdm @@ -22,7 +21,6 @@ def main( ) # Load the data (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() - loss_func = CategoricalCrossentropy() # Set any missing shapes for the model. 
model.initialize(X=train_X[:5], Y=train_Y[:5]) train_data = model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True) @@ -32,8 +30,7 @@ def main( for i in range(n_iter): for X, Y in tqdm(train_data, leave=False): Yh, backprop = model.begin_update(X) - grad, loss = loss_func(Yh, Y) - backprop(grad) + backprop(Yh - Y) model.finish_update(optimizer) # Evaluate and print progress correct = 0 diff --git a/thinc/legacy/__init__.py b/thinc/legacy/__init__.py deleted file mode 100644 index ced5121ba..000000000 --- a/thinc/legacy/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from .loss import LegacyCategoricalCrossentropy -from .loss import LegacySequenceCategoricalCrossentropy - - -__all__ = [ - "LegacyCategoricalCrossentropy", - "LegacySequenceCategoricalCrossentropy" -] diff --git a/thinc/legacy/loss.py b/thinc/legacy/loss.py deleted file mode 100644 index ab9871625..000000000 --- a/thinc/legacy/loss.py +++ /dev/null @@ -1,285 +0,0 @@ -from typing import Optional, Sequence, Dict, Union, Tuple -from typing import cast, List -from ..types import Floats2d, Ints1d -from ..config import registry -from ..util import to_categorical, get_array_module -from ..loss import IntsOrFloatsOrStrs, Loss -from ..loss import _make_mask, _make_mask_by_value - - -TruthsT = Union[List[Optional[str]], List[int], Ints1d, Floats2d] - - -class LegacyCategoricalCrossentropy(Loss): - names: Optional[Sequence[str]] - missing_value: Optional[Union[str, int]] - _name_to_i: Dict[str, int] - - def __init__( - self, - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, - ): - self.normalize = normalize - self.names = names - self.missing_value = missing_value - self.neg_prefix = neg_prefix - self.label_smoothing = label_smoothing - if names is not None: - self._name_to_i = {name: i for i, name in enumerate(names)} - else: - self._name_to_i = {} - - def convert_truths( - self, truths: TruthsT, guesses: Floats2d - ) -> Tuple[Floats2d, Floats2d]: - xp = get_array_module(guesses) - missing = [] - negatives_mask = None - if self.names: - negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") - missing_value = self.missing_value - # Convert list of ints or list of strings - if isinstance(truths, list): - if len(truths): - if isinstance(truths[0], int): - for i, value in enumerate(truths): - if not isinstance(value, int): - raise ValueError( - "All values in the truths list have to " - "have the same type. The first value was " - f"detected to be integer, but found {type(value)}." - ) - if value == missing_value: - missing.append(i) - else: - truths = cast(List[Optional[str]], truths) - if self.names is None: - msg = ( - "Cannot calculate loss from list of strings without names. " - "You can pass the names as a keyword argument when you " - "create the loss object, " - "e.g. CategoricalCrossentropy(names=['dog', 'cat'])" - ) - raise ValueError(msg) - for i, value in enumerate(truths): - if not (isinstance(value, str) or value == missing_value): - raise ValueError( - "All values in the truths list have to " - "have the same type. The first value was " - f"detected to be string, but found {type(value)}." 
- ) - if value == missing_value: - truths[i] = self.names[0] - missing.append(i) - elif ( - value - and self.neg_prefix - and value.startswith(self.neg_prefix) - ): - neg_value = value[len(self.neg_prefix) :] - truths[i] = neg_value - neg_index = self._name_to_i[neg_value] - negatives_mask[i] = 0 # type: ignore - negatives_mask[i][neg_index] = -1 # type: ignore - # In the loop above, we have ensured that `truths` doesn't - # contain `None` (anymore). However, mypy can't infer this - # and doesn't like the shadowing. - truths_str = cast(List[str], truths) - truths = [self._name_to_i[name] for name in truths_str] - truths = xp.asarray(truths, dtype="i") - mask = _make_mask(guesses, missing) - else: - mask = _make_mask_by_value(truths, guesses, missing_value) - truths = cast(Union[Ints1d, Floats2d], truths) - if truths.ndim != guesses.ndim: - # transform categorical values to one-hot encoding - truths_2d = to_categorical( - truths, - n_classes=guesses.shape[-1], - label_smoothing=self.label_smoothing, - ) - else: - if self.label_smoothing: - raise ValueError( - "Label smoothing is only applied, when truths have type " - "List[str], List[int] or Ints1d, but it seems like Floats2d " - "was provided." - ) - truths_2d = cast(Floats2d, truths) - # Transform negative annotations to a 0 for the negated value - # + mask all other values for that row - if negatives_mask is not None: - truths_2d *= negatives_mask - truths_2d[truths_2d == -1] = 0 - negatives_mask[negatives_mask == -1] = 1 - mask *= negatives_mask - return cast(Floats2d, truths_2d), mask - - def __call__(self, guesses: Floats2d, truths: TruthsT) -> Tuple[Floats2d, float]: - d_truth = self.get_grad(guesses, truths) - return (d_truth, self._get_loss_from_grad(d_truth)) - - def get_grad(self, guesses: Floats2d, truths: TruthsT) -> Floats2d: - target, mask = self.convert_truths(truths, guesses) - xp = get_array_module(target) - if guesses.shape != target.shape: # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." - raise ValueError(err) - elif xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." - raise ValueError(err) - elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover - err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
- raise ValueError(err) - difference = guesses - target - difference *= mask - if self.normalize: - difference = difference / guesses.shape[0] - return difference - - def get_loss(self, guesses: Floats2d, truths: TruthsT) -> float: - d_truth = self.get_grad(guesses, truths) - return self._get_loss_from_grad(d_truth) - - def _get_loss_from_grad(self, d_truth: Floats2d) -> float: - # TODO: Add overload for axis=None case to sum - return (d_truth**2).sum() # type: ignore - - -class LegacySequenceCategoricalCrossentropy(Loss): - def __init__( - self, - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, - ): - self.cc = LegacyCategoricalCrossentropy( - normalize=False, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) - self.normalize = normalize - - def __call__( - self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] - ) -> Tuple[List[Floats2d], float]: - grads = self.get_grad(guesses, truths) - loss = self._get_loss_from_grad(grads) - return grads, loss - - def get_grad( - self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT] - ) -> List[Floats2d]: - if len(guesses) != len(truths): # pragma: no cover - err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" - raise ValueError(err) - n = len(guesses) - d_scores = [] - for yh, y in zip(guesses, truths): - d_yh = self.cc.get_grad(yh, y) - if self.normalize: - d_yh /= n - d_scores.append(d_yh) - return d_scores - - def get_loss(self, guesses: Sequence[Floats2d], truths: Sequence[TruthsT]) -> float: - return self._get_loss_from_grad(self.get_grad(guesses, truths)) - - def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: - loss = 0.0 - for grad in grads: - loss += self.cc._get_loss_from_grad(grad) # type: ignore - return loss - - -@registry.losses("CategoricalCrossentropy.v1") -def configure_CategoricalCrossentropy_v1( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, -) -> LegacyCategoricalCrossentropy: - return LegacyCategoricalCrossentropy( - normalize=normalize, names=names, missing_value=missing_value - ) - - -@registry.losses("CategoricalCrossentropy.v2") -def configure_CategoricalCrossentropy_v2( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, -) -> LegacyCategoricalCrossentropy: - return LegacyCategoricalCrossentropy( - normalize=normalize, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - ) - - -@registry.losses("CategoricalCrossentropy.v3") -def configure_CategoricalCrossentropy_v3( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, -) -> LegacyCategoricalCrossentropy: - return LegacyCategoricalCrossentropy( - normalize=normalize, - names=names, - missing_value=missing_value, - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) - - -@registry.losses("SequenceCategoricalCrossentropy.v1") -def configure_SequenceCategoricalCrossentropy_v1( - *, normalize: bool = True, names: Optional[Sequence[str]] = None -) -> LegacySequenceCategoricalCrossentropy: - return LegacySequenceCategoricalCrossentropy(normalize=normalize, 
names=names) - - -@registry.losses("SequenceCategoricalCrossentropy.v2") -def configure_SequenceCategoricalCrossentropy_v2( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - neg_prefix: Optional[str] = None, -) -> LegacySequenceCategoricalCrossentropy: - return LegacySequenceCategoricalCrossentropy( - normalize=normalize, names=names, neg_prefix=neg_prefix - ) - - -@registry.losses("SequenceCategoricalCrossentropy.v3") -def configure_SequenceCategoricalCrossentropy_v3( - *, - normalize: bool = True, - names: Optional[Sequence[str]] = None, - missing_value: Optional[Union[str, int]] = None, - neg_prefix: Optional[str] = None, - label_smoothing: float = 0.0, -) -> LegacySequenceCategoricalCrossentropy: - return LegacySequenceCategoricalCrossentropy( - normalize=normalize, - names=names, - neg_prefix=neg_prefix, - missing_value=missing_value, - label_smoothing=label_smoothing, - ) diff --git a/thinc/loss.py b/thinc/loss.py index e194516de..e8edb194d 100644 --- a/thinc/loss.py +++ b/thinc/loss.py @@ -2,19 +2,17 @@ from typing import Dict from abc import abstractmethod -from .types import Floats2d, Ints1d, Ragged, ArrayXd -from .util import get_array_module, to_categorical, smooth_one_hot -from .util import is_xp_array +from .types import Floats2d, Ints1d +from .util import get_array_module, to_categorical from .config import registry + LossT = TypeVar("LossT") GradT = TypeVar("GradT") GuessT = TypeVar("GuessT") TruthT = TypeVar("TruthT") -FloatsOrRaggedT = TypeVar("FloatsOrRaggedT", Floats2d, Ragged) IntsOrFloats = Union[Ints1d, Floats2d] IntsOrFloatsOrStrs = Union[Ints1d, Floats2d, Sequence[int], Sequence[str]] -Categories1d = Union[Ints1d, Sequence[int], Sequence[str]] class Loss(Generic[GuessT, TruthT, GradT, LossT]): # pragma: no cover @@ -39,118 +37,7 @@ def get_loss(self, guesses: GuessT, truths: TruthT) -> LossT: ... -class CategoricalCrossentropyBase(Loss): - normalize: bool - - def _validate_input(self, guesses: FloatsOrRaggedT, target: Floats2d) -> None: - guesses_f2d = _to_array(guesses) - xp = get_array_module(target) - if not xp.allclose(guesses_f2d.sum(axis=1), 1.0): - raise ValueError( - "Cannot calculate CategoricalCrossentropy if " - "some rows of 'guesses' are not " - "valid categorical distributions (do not sum to 1)." - ) - elif guesses_f2d.shape != target.shape: # pragma: no cover - raise ValueError( - "Cannot calculate CategoricalCrossentropy loss " - f"with mismatching shapes: {guesses_f2d.shape} vs {target.shape}." - ) - elif xp.any(guesses_f2d > 1) or xp.any(guesses_f2d < 0): # pragma: no cover - raise ValueError( - "Cannot calculate CategoricalCrossentropy loss " - "with guesses outside the [0,1] interval." - ) - elif xp.any(target > 1) or xp.any(target < 0): # pragma: no cover - raise ValueError( - "Cannot calculate CategoricalCrossentropy loss " - "with truth values outside the [0,1] interval." - ) - - def _get_grad( - self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d - ) -> FloatsOrRaggedT: - difference = _to_array(guesses) - target - difference *= mask - if self.normalize: - # FIXME: normalized by the number of sequences, also support normalizing - # by the number of instances. 
- difference /= _normalization_length(guesses) - - return _array_like(difference, guesses) - - def _get_loss( - self, guesses: FloatsOrRaggedT, target: Floats2d, mask: Floats2d - ) -> float: - guesses_f2d = _to_array(guesses) - xp = get_array_module(guesses_f2d) - logprobs = xp.log(guesses_f2d + 1e-9) - logprobs *= mask - if self.normalize: - return -(target * logprobs).sum() / _normalization_length(guesses) - else: - return -(target * logprobs).sum() - - -class CategoricalCrossentropy(CategoricalCrossentropyBase): - missing_value: Optional[Union[str, int]] - - def __init__( - self, - *, - normalize: bool = True, - missing_value: Optional[int] = None, - label_smoothing: float = 0.0, - ): - self.normalize = normalize - self.missing_value = missing_value - self.label_smoothing = label_smoothing - - def __call__( - self, guesses: FloatsOrRaggedT, truths: Floats2d - ) -> Tuple[FloatsOrRaggedT, float]: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - d_truth = self._get_grad(guesses, target, mask) - loss = self._get_loss(guesses, target, mask) - - return d_truth, loss - - def convert_truths( - self, truths: Floats2d, guesses: FloatsOrRaggedT - ) -> Tuple[Floats2d, Floats2d]: - if truths.ndim != 2: - raise ValueError(f"'truths' have to have 2 axes, but found {truths.ndim}") - guesses_2d = _to_array(guesses) - missing_value = self.missing_value - xp = get_array_module(guesses_2d) - mask = _make_mask_by_value(truths, guesses_2d, missing_value) - if not xp.allclose(truths.sum(axis=1), 1.0): - raise ValueError( - "Cannot calculate CategoricalCrossentropy. " - "All rows of 'truths' have to be a " - "valid categorical distribution (sum to 1)." - ) - if self.label_smoothing: - # Validate that array is binary, ergo one-hot at this point - if ((truths == 0) | (truths == 1)).all(): - truths = smooth_one_hot(truths, self.label_smoothing) - else: - raise ValueError("Can only apply label-smoothing to one-hot target.") - return truths, mask - - def get_grad(self, guesses: FloatsOrRaggedT, truths: Floats2d) -> FloatsOrRaggedT: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_grad(guesses, target, mask) - - def get_loss(self, guesses: Floats2d, truths: Floats2d) -> float: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_loss(guesses, target, mask) - - -class SparseCategoricalCrossentropy(CategoricalCrossentropyBase): +class CategoricalCrossentropy(Loss): names: Optional[Sequence[str]] missing_value: Optional[Union[str, int]] _name_to_i: Dict[str, int] @@ -174,174 +61,142 @@ def __init__( else: self._name_to_i = {} - def __call__( - self, guesses: Floats2d, truths: Union[Sequence[int], Sequence[str]] - ) -> Tuple[Floats2d, float]: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - d_truth = self._get_grad(guesses, target, mask) - loss = self._get_loss(guesses, target, mask) - return (d_truth, loss) - - def _convert_ints( - self, guesses: Floats2d, truths: Sequence[int] - ) -> Tuple[Floats2d, Floats2d]: - """ - Convert Sequence[int] into a Floats2d one-hot array. - """ - missing_value = self.missing_value - if missing_value is not None and not isinstance(missing_value, int): - raise ValueError( - "'truths' provided in Sequence[int] format, but " - f"'missing_value' was set to be {self.missing_value} " - f", which has type {type(self.missing_value)}." 
- ) - missing = [] - for i, value in enumerate(truths): - if not isinstance(value, int): - raise ValueError( - "The first value of `truths` was of type " - f"integer, but found {type(value)} during iteration." - ) - if value == missing_value: - missing.append(i) - xp = get_array_module(guesses) - # FIXME: convert using ops? - xp_truths = cast(Ints1d, xp.asarray(truths, dtype="i")) - truths_2d = to_categorical( - xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing - ) - mask = _make_mask(guesses, missing) - return cast(Floats2d, truths_2d), mask - - def _convert_strs( - self, guesses: Floats2d, truths: Sequence[str] - ) -> Tuple[Floats2d, Floats2d]: - """ - Convert Sequence[int] into a Floats2d one-hot array. - """ - - missing_value = self.missing_value - if self.names is None: - raise ValueError( - "Cannot calculate loss from Sequence[str] without names. " - "You can pass the names as a keyword argument when you " - "create the loss object" - ) - elif missing_value is not None and not isinstance(missing_value, str): - raise ValueError( - "'truths' provided in Sequence[str] format, but " - f"'missing_value' was set to be {self.missing_value} " - f", which has type {type(self.missing_value)}." - ) + def convert_truths(self, truths, guesses: Floats2d) -> Tuple[Floats2d, Floats2d]: xp = get_array_module(guesses) missing = [] - negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") - truths_int = [] - for i, value in enumerate(truths): - if not isinstance(value, str): - raise ValueError( - "The first value of the 'truths' was of type " - f"string, but found {type(value)} during iteration." - ) - # missing value - if value == missing_value: - label_i = self._name_to_i[self.names[0]] - missing.append(i) - # negative labels - elif self.neg_prefix and value.startswith(self.neg_prefix): - label_i = self._name_to_i[value[len(self.neg_prefix) :]] - negatives_mask[i] = 0 # type: ignore - negatives_mask[i][label_i] = -1 # type: ignore - # nothing special - else: - label_i = self._name_to_i[value] - truths_int.append(label_i) - xp_truths = cast(Ints1d, xp.asarray(truths_int, dtype="i")) - truths_2d = to_categorical( - xp_truths, n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing - ) - mask = _make_mask(guesses, missing) - truths_2d *= negatives_mask - truths_2d[truths_2d == -1] = 0 - negatives_mask[negatives_mask == -1] = 1 - mask *= negatives_mask - return cast(Floats2d, truths_2d), mask - - def convert_truths( - self, truths: Categories1d, guesses: Floats2d - ) -> Tuple[Floats2d, Floats2d]: - guesses_f2d = _to_array(guesses) - - if is_xp_array(truths): - _check_ints1d(cast(ArrayXd, truths)) - xp_truths = cast(Ints1d, truths) - truths_2d = to_categorical( - xp_truths, + negatives_mask = None + if self.names: + negatives_mask = xp.ones((len(truths), len(self.names)), dtype="f") + missing_value = self.missing_value + # Convert list of ints or list of strings + if isinstance(truths, list): + truths = list(truths) + if len(truths): + if isinstance(truths[0], int): + for i, value in enumerate(truths): + if value == missing_value: + missing.append(i) + else: + if self.names is None: + msg = ( + "Cannot calculate loss from list of strings without names. " + "You can pass the names as a keyword argument when you " + "create the loss object, " + "e.g. 
CategoricalCrossentropy(names=['dog', 'cat'])" + ) + raise ValueError(msg) + for i, value in enumerate(truths): + if value == missing_value: + truths[i] = self.names[0] + missing.append(i) + elif ( + value + and self.neg_prefix + and value.startswith(self.neg_prefix) + ): + truths[i] = value[len(self.neg_prefix) :] + neg_index = self._name_to_i[truths[i]] + negatives_mask[i] = 0 # type: ignore + negatives_mask[i][neg_index] = -1 # type: ignore + truths = [self._name_to_i[name] for name in truths] + truths = xp.asarray(truths, dtype="i") + mask = _make_mask(guesses, missing) + else: + mask = _make_mask_by_value(truths, guesses, missing_value) + if truths.ndim != guesses.ndim: + # transform categorical values to one-hot encoding + truths = to_categorical( + cast(Ints1d, truths), + n_classes=guesses.shape[-1], label_smoothing=self.label_smoothing, - n_classes=guesses_f2d.shape[1], ) - mask = _make_mask_by_value(truths_2d, guesses_f2d, self.missing_value) - elif isinstance(truths, Sequence): - if isinstance(truths[0], int): - truths_2d, mask = self._convert_ints( - guesses_f2d, cast(Sequence[int], truths) - ) - elif isinstance(truths[0], str): - truths_2d, mask = self._convert_strs( - guesses_f2d, cast(Sequence[str], truths) - ) - else: + else: + if self.label_smoothing: raise ValueError( - "When truths to SparseCategoricalCrossentropy is provided " - "in Sequence format, elements need to be " - "of type str or int, but first element " - f"was found to be {type(truths[0])}." + "Label smoothing is only applied, when truths have type " + "List[str], List[int] or Ints1d, but it seems like Floats2d " + "was provided." ) - else: - raise ValueError( - "Truths have to be provided either as 1D " - "numpy/cupy integer array or as Sequence[int] or " - "Sequence[str], but truths has different type." - ) + # Transform negative annotations to a 0 for the negated value + # + mask all other values for that row + if negatives_mask is not None: + truths *= negatives_mask + truths[truths == -1] = 0 + negatives_mask[negatives_mask == -1] = 1 + mask *= negatives_mask + return truths, mask - return cast(Floats2d, truths_2d), mask + def __call__( + self, guesses: Floats2d, truths: IntsOrFloatsOrStrs + ) -> Tuple[Floats2d, float]: + d_truth = self.get_grad(guesses, truths) + return (d_truth, self._get_loss_from_grad(d_truth)) - def get_grad(self, guesses: Floats2d, truths: Categories1d) -> Floats2d: + def get_grad(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> Floats2d: target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_grad(guesses, target, mask) + xp = get_array_module(target) + if guesses.shape != target.shape: # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss: mismatched shapes: {guesses.shape} vs {target.shape}." + raise ValueError(err) + if xp.any(guesses > 1) or xp.any(guesses < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with guesses outside the [0,1] interval." + raise ValueError(err) + if xp.any(target > 1) or xp.any(target < 0): # pragma: no cover + err = f"Cannot calculate CategoricalCrossentropy loss with truth values outside the [0,1] interval." 
+ raise ValueError(err) + difference = guesses - target + difference *= mask + if self.normalize: + difference = difference / guesses.shape[0] + return difference - def get_loss(self, guesses: Floats2d, truths: Categories1d) -> float: - target, mask = self.convert_truths(truths, guesses) - self._validate_input(guesses, target) - return self._get_loss(guesses, target, mask) + def get_loss(self, guesses: Floats2d, truths: IntsOrFloatsOrStrs) -> float: + d_truth = self.get_grad(guesses, truths) + return self._get_loss_from_grad(d_truth) + + def _get_loss_from_grad(self, d_truth: Floats2d) -> float: + # TODO: Add overload for axis=None case to sum + return (d_truth**2).sum() # type: ignore -@registry.losses("CategoricalCrossentropy.v4") -def configure_CategoricalCrossentropy_v4( +@registry.losses("CategoricalCrossentropy.v1") +def configure_CategoricalCrossentropy_v1( *, normalize: bool = True, - missing_value: Optional[int] = None, - label_smoothing: float = 0.0, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, +) -> CategoricalCrossentropy: + return CategoricalCrossentropy( + normalize=normalize, names=names, missing_value=missing_value + ) + + +@registry.losses("CategoricalCrossentropy.v2") +def configure_CategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, ) -> CategoricalCrossentropy: return CategoricalCrossentropy( normalize=normalize, + names=names, missing_value=missing_value, - label_smoothing=label_smoothing, + neg_prefix=neg_prefix, ) -@registry.losses("SparseCategoricalCrossentropy.v4") -def configure_SparseCategoricalCrossentropy_v4( +@registry.losses("CategoricalCrossentropy.v3") +def configure_CategoricalCrossentropy_v3( *, normalize: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, -) -> SparseCategoricalCrossentropy: - return SparseCategoricalCrossentropy( +) -> CategoricalCrossentropy: + return CategoricalCrossentropy( normalize=normalize, names=names, missing_value=missing_value, @@ -354,44 +209,38 @@ class SequenceCategoricalCrossentropy(Loss): def __init__( self, *, - cross_entropy: Union[CategoricalCrossentropy, SparseCategoricalCrossentropy], normalize: bool = True, + names: Optional[Sequence[str]] = None, + missing_value: Optional[Union[str, int]] = None, + neg_prefix: Optional[str] = None, + label_smoothing: float = 0.0, ): - self.cc = cross_entropy + self.cc = CategoricalCrossentropy( + normalize=False, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, + ) self.normalize = normalize def __call__( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> Tuple[List[Floats2d], float]: - self._validate_input(guesses, truths) - n = len(guesses) - d_scores = [] - loss = 0.0 - for yh, y in zip(guesses, truths): - d_yh, l = self.cc(yh, y) # type: ignore - if self.normalize: - d_yh /= n - d_scores.append(d_yh) - loss += l - return d_scores, loss - - def _validate_input( - self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] - ): - if len(guesses) != len(truths): # pragma: no cover - raise ValueError( - "Cannot calculate SequenceCategoricalCrossentropy loss: " - "guesses and truths must be same length!" 
- ) + grads = self.get_grad(guesses, truths) + loss = self._get_loss_from_grad(grads) + return grads, loss def get_grad( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> List[Floats2d]: - self._validate_input(guesses, truths) + err = "Cannot calculate SequenceCategoricalCrossentropy loss: guesses and truths must be same length" + if len(guesses) != len(truths): # pragma: no cover + raise ValueError(err) n = len(guesses) d_scores = [] for yh, y in zip(guesses, truths): - d_yh = self.cc.get_grad(yh, y) # type: ignore + d_yh = self.cc.get_grad(yh, y) if self.normalize: d_yh /= n d_scores.append(d_yh) @@ -400,42 +249,49 @@ def get_grad( def get_loss( self, guesses: Sequence[Floats2d], truths: Sequence[IntsOrFloatsOrStrs] ) -> float: - self._validate_input(guesses, truths) + return self._get_loss_from_grad(self.get_grad(guesses, truths)) + + def _get_loss_from_grad(self, grads: Sequence[Floats2d]) -> float: loss = 0.0 - for guess, truth in zip(guesses, truths): - loss += self.cc.get_loss(guess, truth) # type: ignore + for grad in grads: + loss += self.cc._get_loss_from_grad(grad) return loss -@registry.losses("SequenceCategoricalCrossentropy.v4") -def configure_SequenceCategoricalCrossentropy_v4( +@registry.losses("SequenceCategoricalCrossentropy.v1") +def configure_SequenceCategoricalCrossentropy_v1( + *, normalize: bool = True, names: Optional[Sequence[str]] = None +) -> SequenceCategoricalCrossentropy: + return SequenceCategoricalCrossentropy(normalize=normalize, names=names) + + +@registry.losses("SequenceCategoricalCrossentropy.v2") +def configure_SequenceCategoricalCrossentropy_v2( + *, + normalize: bool = True, + names: Optional[Sequence[str]] = None, + neg_prefix: Optional[str] = None, +) -> SequenceCategoricalCrossentropy: + return SequenceCategoricalCrossentropy( + normalize=normalize, names=names, neg_prefix=neg_prefix + ) + + +@registry.losses("SequenceCategoricalCrossentropy.v3") +def configure_SequenceCategoricalCrossentropy_v3( *, normalize: bool = True, - sparse: bool = True, names: Optional[Sequence[str]] = None, missing_value: Optional[Union[str, int]] = None, neg_prefix: Optional[str] = None, label_smoothing: float = 0.0, ) -> SequenceCategoricalCrossentropy: - if names is None and neg_prefix is None and not sparse: - cross_entropy: Union[ - CategoricalCrossentropy, SparseCategoricalCrossentropy - ] = CategoricalCrossentropy( - normalize=False, - missing_value=cast(Optional[int], missing_value), - label_smoothing=label_smoothing, - ) - else: - cross_entropy = SparseCategoricalCrossentropy( - normalize=False, - names=names, - missing_value=cast(Optional[Union[str, int]], missing_value), - neg_prefix=neg_prefix, - label_smoothing=label_smoothing, - ) return SequenceCategoricalCrossentropy( - cross_entropy=cross_entropy, normalize=normalize, + names=names, + missing_value=missing_value, + neg_prefix=neg_prefix, + label_smoothing=label_smoothing, ) @@ -566,43 +422,6 @@ def _make_mask_by_value(truths, guesses, missing_value) -> Floats2d: return mask -def _array_like(a: Floats2d, like: FloatsOrRaggedT) -> FloatsOrRaggedT: - if isinstance(like, Ragged): - return Ragged(a, lengths=like.lengths) - else: - return a - - -def _to_array(guesses: FloatsOrRaggedT) -> Floats2d: - if isinstance(guesses, Ragged): - return cast(Floats2d, guesses.data.astype("float32")) - else: - return guesses - - -def _normalization_length(guesses: FloatsOrRaggedT) -> int: - if isinstance(guesses, Ragged): - return len(guesses.lengths) - else: - return guesses.shape[0] - - -def 
_check_ints1d(arr: ArrayXd): - """ - Check whether array is 1D and has type integer. - """ - if arr.ndim != 1: - raise ValueError( - "SparseCategoricalCrossentropy only accepts 1D arrays, but " - f"array with shape {arr.shape} was given." - ) - if arr.dtype.kind != "i": # type: ignore - raise ValueError( - "SparseCategoricalCrossentropy only accepts integer arrays, but " - f"array with {arr.dtype} was given." - ) - - __all__ = [ "SequenceCategoricalCrossentropy", "CategoricalCrossentropy", diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 2cb49e466..75206d240 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -1,384 +1,108 @@ import pytest import numpy -from functools import partial -from thinc.api import CategoricalCrossentropy -from thinc.api import L2Distance, CosineDistance, softmax_activation -from thinc.api import Ragged +from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy +from thinc.api import L2Distance, CosineDistance from thinc import registry -from thinc.util import has_torch, to_categorical -from hypothesis import given, settings -from hypothesis.strategies import integers, floats -from thinc.legacy import loss - -ALL_XP = [numpy] -try: - import cupy - - ALL_XP.append(cupy) -except ImportError: - pass - - -softmax_func = partial(softmax_activation(), is_train=False) -MAX_EXAMPLES = 50 # some simple arrays scores0 = numpy.zeros((3, 3), dtype="f") labels0 = numpy.asarray([0, 1, 1], dtype="i") # a few more diverse ones to test realistic values -guesses1 = numpy.asarray( - [[0.1, 0.5, 0.4], [0.4, 0.3, 0.3], [0, 1, 0], [0.1, 0.05, 0.85]], dtype="f" -) -guesses1_legacy = numpy.asarray( - [[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]], dtype="f" -) +guesses1 = numpy.asarray([[0.1, 0.5, 0.6], [0.4, 0.6, 0.3], [1, 1, 1], [0, 0, 0]]) labels1 = numpy.asarray([2, 1, 0, 2]) -labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype="f") +labels1_full = numpy.asarray([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1]]) labels1_strings = ["C", "B", "A", "C"] -d_guesses1 = numpy.array( - [ - [0.025, 0.125, -0.15], - [0.1, -0.175, 0.075], - [-0.25, 0.25, 0.0], - [0.025, 0.0125, -0.0375], - ], - dtype="f", -) -d_guesses1_seq = numpy.array( - [ - [0.05, 0.25, -0.3], - [0.2, -0.35, 0.15], - [-0.5, 0.5, 0.0], - [0.05, 0.025, -0.075], - ], - dtype="f", -) -d_guesses1_0_missing = numpy.array( - [ - [0.025, 0.125, -0.15], - [0.1, -0.175, 0.075], - [0.0, 0.0, 0.0], - [0.025, 0.0125, -0.0375], - ], - dtype="f", -) -d_guesses1_sum = numpy.array( - [ - [0.1, 0.5, -0.6], - [0.4, -0.7, 0.3], - [-1.0, 1.0, 0.0], - [0.1, 0.05, -0.15], - ], - dtype="f", -) -loss1 = 5.75151207 -loss1_seq = 11.50302410 -loss1_0_missing = 0.57069561 -guesses2 = numpy.asarray([[0.2, 0.3, 0.5]]) -guesses2_legacy = numpy.asarray([[0.2, 0.3, 0.0]]) + +guesses2 = numpy.asarray([[0.2, 0.3, 0.0]]) labels2 = numpy.asarray([1]) labels2_strings = ["B"] -d_guesses2_sum = numpy.asarray([[0.2, -0.7, 0.5]]) -sequence_loss = 24.210021096627 -eps = 1e-6 - - -ce_factory = registry.get("losses", "CategoricalCrossentropy.v4") - -sparse_ce_factory = registry.get("losses", "SparseCategoricalCrossentropy.v4") -seq_ce_factory = registry.get("losses", "SequenceCategoricalCrossentropy.v4") +eps = 0.0001 -def _get_legacy_cross_entropy(version: int, **kwargs): - return registry.get("losses", f"CategoricalCrossentropy.v{version}")(**kwargs) - - -def _get_legacy_seq_cross_entropy(version: int, **kwargs): - return registry.get("losses", 
f"SequenceCategoricalCrossentropy.v{version}")( - **kwargs - ) - - -def test_cross_entropy_types_shapes(): - sparse_cross_entropy = ce_factory() - cross_entropy = ce_factory() - sparse_seq_cross_entropy = seq_ce_factory() - seq_cross_entropy = seq_ce_factory(sparse=False) - d_scores_sparse = sparse_cross_entropy.get_grad(guesses1, labels1_full) - d_scores = cross_entropy.get_grad(guesses1, labels1_full) - assert d_scores_sparse.dtype == "float32" - assert d_scores.dtype == "float32" - assert d_scores_sparse.shape == guesses1.shape - assert d_scores.shape == guesses1.shape - d_scores_sparse = sparse_seq_cross_entropy.get_grad([guesses1], [labels1]) - d_scores = seq_cross_entropy.get_grad([guesses1], [labels1_full]) - assert d_scores_sparse[0].dtype == "float32" - assert d_scores[0].dtype == "float32" - assert d_scores_sparse[0].shape == guesses1.shape - assert d_scores[0].shape == guesses1.shape - assert sparse_seq_cross_entropy.get_grad([], []) == [] - assert seq_cross_entropy.get_grad([], []) == [] - d_scores_ragged = cross_entropy.get_grad( - Ragged(numpy.array(guesses1), lengths=[3, 1]), labels1_full - ) - assert isinstance(d_scores_ragged, Ragged) - assert d_scores_ragged.dataXd.dtype == "float32" - assert d_scores_ragged.dataXd.shape == guesses1.shape - - -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_cross_entropy_types_shapes(version): - cross_entropy = _get_legacy_cross_entropy(version) - seq_cross_entropy = _get_legacy_seq_cross_entropy(version) - d_scores = cross_entropy.get_grad(scores0, labels0) +def test_loss(): + d_scores = CategoricalCrossentropy().get_grad(scores0, labels0) assert d_scores.dtype == "float32" assert d_scores.shape == scores0.shape - d_scores = seq_cross_entropy.get_grad([scores0], [labels0]) + d_scores = SequenceCategoricalCrossentropy().get_grad([scores0], [labels0]) assert d_scores[0].dtype == "float32" assert d_scores[0].shape == scores0.shape - assert seq_cross_entropy.get_grad([], []) == [] - - -@pytest.mark.skipif(not has_torch, reason="needs PyTorch") -@pytest.mark.parametrize("xp", ALL_XP) -@settings(max_examples=MAX_EXAMPLES, deadline=None) -@given( - n_samples=integers(min_value=1, max_value=100), - n_classes=integers(min_value=1, max_value=100), - low=floats(min_value=-20, max_value=10), - offset=floats(min_value=1, max_value=10), -) -def test_compare_cross_entropy_to_torch(xp, n_samples, n_classes, low, offset): - import torch - - sparse_loss_sum = sparse_ce_factory(normalize=False) - sparse_loss_mean = sparse_ce_factory() - loss_sum = ce_factory(normalize=False) - loss_mean = ce_factory() - torch_loss_sum = torch.nn.CrossEntropyLoss(reduction="sum") - torch_loss_mean = torch.nn.CrossEntropyLoss() - logits = xp.random.uniform(low, low + offset, (n_samples, n_classes)) - labels = xp.random.randint(0, n_classes, n_samples) - labels_full = to_categorical(labels, n_classes=n_classes) - torch_logits = torch.tensor(logits, requires_grad=True) - torch_labels = torch.tensor(labels, dtype=torch.long) - probs, _ = softmax_func(logits) - d_sum_sparse, l_sum_sparse = sparse_loss_sum(probs, labels) - d_sum, l_sum = loss_sum(probs, labels_full) - torch_l_sum = torch_loss_sum(torch_logits, torch_labels) - torch_l_sum.backward() - torch_d_sum = torch_logits.grad - torch_logits = torch.tensor(logits, requires_grad=True) - d_mean_sparse, l_mean_sparse = sparse_loss_mean(probs, labels) - d_mean, l_mean = loss_mean(probs, labels_full) - torch_l_mean = torch_loss_mean(torch_logits, torch_labels) - torch_l_mean.backward() - torch_d_mean = 
torch_logits.grad - assert xp.isclose(float(l_sum), float(torch_l_sum), atol=1e-06) - assert xp.allclose(d_sum, torch_d_sum.numpy()) - assert xp.isclose(float(l_mean), float(torch_l_mean)) - assert xp.allclose(d_mean, torch_d_mean.numpy()) - assert xp.isclose(float(l_sum_sparse), float(torch_l_sum), atol=1e-06) - assert xp.allclose(d_sum_sparse, torch_d_sum.numpy()) - assert xp.isclose(float(l_mean_sparse), float(torch_l_mean)) - assert xp.allclose(d_mean_sparse, torch_d_mean.numpy()) - - -@pytest.mark.parametrize("dist", [CosineDistance(ignore_zeros=True), L2Distance()]) -@pytest.mark.parametrize("vect", [scores0, guesses1, guesses2]) -def test_equal_distance(dist, vect): - assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) - assert dist.get_loss(vect, vect) == pytest.approx(0, abs=eps) - - -@pytest.mark.parametrize("version", [1, 2, 3]) -@pytest.mark.parametrize("vect", [scores0, guesses1_legacy, guesses2_legacy]) -def test_equal_legacy_cross_entropy(vect, version): - cross_entropy = _get_legacy_cross_entropy(version) - assert int(cross_entropy.get_grad(vect, vect)[0][0]) == pytest.approx(0, abs=eps) - assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) - assert cross_entropy.get_loss(vect, vect) == pytest.approx(0, abs=eps) - - -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_cross_entropy_absent_labels(version): - cross_entropy = _get_legacy_cross_entropy(version, names=["cat", "dog", "rat"]) - assert cross_entropy.get_loss(scores0, [None, None, None]) == pytest.approx( - 0, abs=eps - ) + assert SequenceCategoricalCrossentropy().get_grad([], []) == [] @pytest.mark.parametrize( - "guesses, labels, grad, grad_seq, loss, loss_seq", - [ - (guesses1, labels1_full, d_guesses1, d_guesses1_seq, loss1, loss1_seq), - ], -) -def test_categorical_crossentropy(guesses, labels, grad, grad_seq, loss, loss_seq): - cross_entropy = ce_factory() - d_scores = cross_entropy.get_grad(guesses, labels) - loss_val = cross_entropy.get_loss(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) - assert numpy.isclose(loss_val, loss) - - # Test with Ragged inputs - d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) - loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) - assert d_scores_ragged.dataXd.shape == guesses.shape - assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) - assert numpy.isclose(loss_ragged, loss_seq) - - -@pytest.mark.parametrize( - "guesses, labels, grad, grad_seq, loss, loss_seq", - [ - (guesses1, labels1, d_guesses1, d_guesses1_seq, loss1, loss1_seq), - ], + "dist", [CategoricalCrossentropy(), CosineDistance(ignore_zeros=True), L2Distance()] ) -def test_sparse_categorical_crossentropy( - guesses, labels, grad, grad_seq, loss, loss_seq -): - cross_entropy = sparse_ce_factory() - d_scores = cross_entropy.get_grad(guesses, labels) - loss_val = cross_entropy.get_loss(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) - assert numpy.isclose(loss_val, loss) - - # Test with Ragged inputs - d_scores_ragged = cross_entropy.get_grad(Ragged(guesses, lengths=[3, 1]), labels) - loss_ragged = cross_entropy.get_loss(Ragged(guesses, lengths=[3, 1]), labels) - assert d_scores_ragged.dataXd.shape == guesses.shape - assert numpy.allclose(d_scores_ragged.dataXd, grad_seq) - assert numpy.isclose(loss_ragged, loss_seq) +@pytest.mark.parametrize("vect", [scores0, guesses1, guesses2]) +def test_equality(dist, 
vect): + assert int(dist.get_grad(vect, vect)[0][0]) == pytest.approx(0, eps) + assert dist.get_loss(vect, vect) == pytest.approx(0, eps) @pytest.mark.parametrize( - "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] + "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_crossentropy(guesses, labels, version): - cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) - d_scores = cross_entropy_normalize.get_grad(guesses, labels) +def test_categorical_crossentropy(guesses, labels): + d_scores = CategoricalCrossentropy(normalize=True).get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, abs=eps) - assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) + assert d_scores[1][0] == pytest.approx(0.1, eps) + assert d_scores[1][1] == pytest.approx(-0.1, eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores[2][0] == pytest.approx(0, abs=eps) - assert d_scores[2][1] == pytest.approx(0.25, abs=eps) - assert d_scores[2][2] == pytest.approx(0.25, abs=eps) + assert d_scores[2][0] == pytest.approx(0, eps) + assert d_scores[2][1] == pytest.approx(0.25, eps) + assert d_scores[2][2] == pytest.approx(0.25, eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, abs=eps) - assert d_scores[3][1] == pytest.approx(0, abs=eps) - assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) + assert d_scores[3][0] == pytest.approx(0, eps) + assert d_scores[3][1] == pytest.approx(0, eps) + assert d_scores[3][2] == pytest.approx(-0.25, eps) - loss = cross_entropy_normalize.get_loss(guesses, labels) - assert loss == pytest.approx(0.239375, abs=eps) + loss = CategoricalCrossentropy(normalize=True).get_loss(guesses, labels) + assert loss == pytest.approx(0.239375, eps) def test_crossentropy_incorrect_scores_targets(): labels = numpy.asarray([2]) - labels_full = numpy.asarray([[0.0, 0.0, 1.0]]) - cross_entropy = ce_factory() - sparse_cross_entropy = sparse_ce_factory() guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy.get_grad(guesses_neg, labels_full) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - sparse_cross_entropy.get_grad(guesses_neg, labels) - - guesses_dont_sum_one = numpy.asarray([[0.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy.get_grad(guesses_dont_sum_one, labels_full) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - sparse_cross_entropy.get_grad(guesses_dont_sum_one, labels) + CategoricalCrossentropy(normalize=True).get_grad(guesses_neg, labels) guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy.get_grad(guesses_larger_than_one, labels_full) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - sparse_cross_entropy.get_grad(guesses_larger_than_one, labels) - - guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) - targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy.get_grad(guesses_ok, targets_neg) - - targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) 
- with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy.get_grad(guesses_ok, targets_larger_than_one) - - targets_dont_sum_one = numpy.asarray([[0.9, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy.get_grad(guesses_ok, targets_dont_sum_one) - - -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_cross_entropy_incorrect_scores_targets(version): - labels = numpy.asarray([2]) - cross_entropy_normalize = _get_legacy_cross_entropy(version, normalize=True) - guesses_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy_normalize.get_grad(guesses_neg, labels) - - guesses_larger_than_one = numpy.asarray([[1.1, 0.5, 0.6]]) - with pytest.raises(ValueError, match=r"Cannot calculate.*guesses"): - cross_entropy_normalize.get_grad(guesses_larger_than_one, labels) + CategoricalCrossentropy(normalize=True).get_grad( + guesses_larger_than_one, labels + ) guesses_ok = numpy.asarray([[0.1, 0.4, 0.5]]) targets_neg = numpy.asarray([[-0.1, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy_normalize.get_grad(guesses_ok, targets_neg) + CategoricalCrossentropy(normalize=True).get_grad(guesses_ok, targets_neg) targets_larger_than_one = numpy.asarray([[2.0, 0.5, 0.6]]) with pytest.raises(ValueError, match=r"Cannot calculate.*truth"): - cross_entropy_normalize.get_grad(guesses_ok, targets_larger_than_one) - - -@pytest.mark.parametrize( - "guesses, labels, grad, missing_value", - [ - (guesses1, [2, 1, 0, 2], d_guesses1_0_missing, 0), - (guesses1, labels1, d_guesses1_0_missing, 0), - (guesses1, labels1_strings, d_guesses1_0_missing, "A"), - ], -) -def test_sparse_crossentropy_missing(guesses, labels, grad, missing_value): - if missing_value == "A": - names = ["A", "B", "C"] - else: - names = None - sparse_cross_entropy = sparse_ce_factory(missing_value=missing_value, names=names) - d_scores = sparse_cross_entropy.get_grad(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) - loss = sparse_cross_entropy.get_loss(guesses, labels) - assert numpy.isclose(loss, loss1_0_missing) + CategoricalCrossentropy(normalize=True).get_grad( + guesses_ok, targets_larger_than_one + ) @pytest.mark.parametrize( "guesses, labels", - [(guesses1_legacy, [2, 1, 0, 2])], + [(guesses1, [2, 1, 0, 2])], ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_crossentropy_int_list_missing(guesses, labels, version): - cross_entropy_normalize_missing = _get_legacy_cross_entropy( - version, normalize=True, missing_value=0 +def test_categorical_crossentropy_int_list_missing(guesses, labels): + d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( + guesses, labels ) - d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, abs=eps) - assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) + assert d_scores[1][0] == pytest.approx(0.1, eps) + assert d_scores[1][1] == pytest.approx(-0.1, eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -386,46 +110,28 @@ def test_legacy_categorical_crossentropy_int_list_missing(guesses, labels, versi assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, abs=eps) - assert d_scores[3][1] == pytest.approx(0, abs=eps) - assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - - loss = cross_entropy_normalize_missing.get_loss(guesses, labels) - assert loss == pytest.approx(0.114375, abs=eps) - - -@pytest.mark.parametrize( - "guesses, labels, grad", - [ - (guesses1, labels1_full, d_guesses1_0_missing), - ], -) -def test_categorical_crossentropy_missing(guesses, labels, grad): - cross_entropy = ce_factory(missing_value=0) - d_scores = cross_entropy.get_grad(guesses, labels) - assert d_scores.shape == guesses.shape - assert numpy.allclose(d_scores, grad) + assert d_scores[3][0] == pytest.approx(0, eps) + assert d_scores[3][1] == pytest.approx(0, eps) + assert d_scores[3][2] == pytest.approx(-0.25, eps) loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( guesses, labels ) - assert numpy.isclose(loss, loss1_0_missing) + assert loss == pytest.approx(0.114375, eps) @pytest.mark.parametrize( - "guesses, labels", [(guesses1_legacy, labels1), (guesses1_legacy, labels1_full)] + "guesses, labels", [(guesses1, labels1), (guesses1, labels1_full)] ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_categorical_crossentropy_missing(guesses, labels, version): - cross_entropy_normalize_missing = _get_legacy_cross_entropy( - version, normalize=True, missing_value=0 +def test_categorical_crossentropy_missing(guesses, labels): + d_scores = CategoricalCrossentropy(normalize=True, missing_value=0).get_grad( + guesses, labels ) - d_scores = cross_entropy_normalize_missing.get_grad(guesses, labels) assert d_scores.shape == guesses.shape # The normalization divides the difference (e.g. 
0.4) by the number of vectors (4) - assert d_scores[1][0] == pytest.approx(0.1, abs=eps) - assert d_scores[1][1] == pytest.approx(-0.1, abs=eps) + assert d_scores[1][0] == pytest.approx(0.1, eps) + assert d_scores[1][1] == pytest.approx(-0.1, eps) # Label 0 is masked, because it represents the missing value assert d_scores[2][0] == 0.0 @@ -433,179 +139,95 @@ def test_legacy_categorical_crossentropy_missing(guesses, labels, version): assert d_scores[2][2] == 0.0 # The fourth vector predicted no labels but should have predicted the last one - assert d_scores[3][0] == pytest.approx(0, abs=eps) - assert d_scores[3][1] == pytest.approx(0, abs=eps) - assert d_scores[3][2] == pytest.approx(-0.25, abs=eps) - - loss = cross_entropy_normalize_missing.get_loss(guesses, labels) - assert loss == pytest.approx(0.114375, abs=eps) - - -@pytest.mark.parametrize( - "guesses, labels, names, grad, loss", - [ - ( - [guesses1, guesses2], - [labels1, labels2], - [], - [d_guesses1_sum, d_guesses2_sum], - sequence_loss, - ), - ( - [guesses1, guesses2], - [labels1_strings, labels2_strings], - ["A", "B", "C"], - [d_guesses1_sum, d_guesses2_sum], - sequence_loss, - ), - ], -) -def test_sequence_sparse_crossentropy(guesses, labels, names, grad, loss): - sparse_seq_cross_entropy_sum = seq_ce_factory(names=names, normalize=False) - sparse_seq_cross_entropy = seq_ce_factory(names=names, normalize=True) - d_scores = sparse_seq_cross_entropy_sum.get_grad(guesses, labels) - assert numpy.allclose(d_scores[0], grad[0]) - assert numpy.allclose(d_scores[1], grad[1]) - # The normalization divides the difference (e.g. 0.4) by the number of seqs - d_scores = sparse_seq_cross_entropy.get_grad(guesses, labels) - assert numpy.allclose(d_scores[0], grad[0] / 2.0) - assert numpy.allclose(d_scores[1], grad[1] / 2.0) - loss_val = sparse_seq_cross_entropy.get_loss(guesses, labels) - assert numpy.isclose(loss_val, loss) - d_scores, loss_val = sparse_seq_cross_entropy_sum(guesses, labels) - assert numpy.isclose(loss_val, loss) - assert numpy.allclose(d_scores[0], grad[0]) - assert numpy.allclose(d_scores[1], grad[1]) + assert d_scores[3][0] == pytest.approx(0, eps) + assert d_scores[3][1] == pytest.approx(0, eps) + assert d_scores[3][2] == pytest.approx(-0.25, eps) - -@pytest.mark.parametrize( - "guesses, labels, grad, loss", - [([guesses1], [labels1_full], [d_guesses1_sum], [23.00604829563447])], -) -def test_sequence_crossentropy(guesses, labels, grad, loss): - seq_cross_entropy = seq_ce_factory(sparse=False, normalize=False) - d_scores = seq_cross_entropy.get_grad(guesses, labels) - assert numpy.allclose(d_scores[0], grad[0]) - # The normalization divides the difference (e.g. 
0.4) by the number of seqs - loss_val = seq_cross_entropy.get_loss(guesses, labels) - assert numpy.isclose(loss_val, loss) - d_scores, loss_val = seq_cross_entropy(guesses, labels) - assert numpy.isclose(loss_val, loss) - assert numpy.allclose(d_scores[0], grad[0]) + loss = CategoricalCrossentropy(normalize=True, missing_value=0).get_loss( + guesses, labels + ) + assert loss == pytest.approx(0.114375, eps) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1_legacy, guesses2_legacy], [labels1, labels2], []), - ([guesses1_legacy, guesses2_legacy], [labels1_full, labels2], []), - ( - [guesses1_legacy, guesses2_legacy], - [labels1_strings, labels2_strings], - ["A", "B", "C"], - ), + ([guesses1, guesses2], [labels1, labels2], []), + ([guesses1, guesses2], [labels1_full, labels2], []), + ([guesses1, guesses2], [labels1_strings, labels2_strings], ["A", "B", "C"]), ], ) -@pytest.mark.parametrize("version", [1, 2, 3]) -def test_legacy_sequence_categorical_crossentropy(guesses, labels, names, version): - seq_cross_entropy_names = _get_legacy_seq_cross_entropy( - version, normalize=False, names=names - ) - seq_cross_entropy_names_normalize = _get_legacy_seq_cross_entropy( - version, normalize=True, names=names +def test_sequence_categorical_crossentropy(guesses, labels, names): + d_scores = SequenceCategoricalCrossentropy(normalize=False, names=names).get_grad( + guesses, labels ) - d_scores = seq_cross_entropy_names.get_grad(guesses, labels) d_scores1 = d_scores[0] d_scores2 = d_scores[1] assert d_scores1.shape == guesses1.shape assert d_scores2.shape == guesses2.shape - assert d_scores1[1][0] == pytest.approx(0.4, abs=eps) - assert d_scores1[1][1] == pytest.approx(-0.4, abs=eps) + assert d_scores1[1][0] == pytest.approx(0.4, eps) + assert d_scores1[1][1] == pytest.approx(-0.4, eps) # The normalization divides the difference (e.g. 
0.4) by the number of seqs - d_scores = seq_cross_entropy_names_normalize.get_grad(guesses, labels) + d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad( + guesses, labels + ) d_scores1 = d_scores[0] d_scores2 = d_scores[1] - assert d_scores1[1][0] == pytest.approx(0.2, abs=eps) - assert d_scores1[1][1] == pytest.approx(-0.2, abs=eps) + assert d_scores1[1][0] == pytest.approx(0.2, eps) + assert d_scores1[1][1] == pytest.approx(-0.2, eps) # The third vector predicted all labels, but only the first one was correct - assert d_scores1[2][0] == pytest.approx(0, abs=eps) - assert d_scores1[2][1] == pytest.approx(0.5, abs=eps) - assert d_scores1[2][2] == pytest.approx(0.5, abs=eps) + assert d_scores1[2][0] == pytest.approx(0, eps) + assert d_scores1[2][1] == pytest.approx(0.5, eps) + assert d_scores1[2][2] == pytest.approx(0.5, eps) # The fourth vector predicted no labels but should have predicted the last one - assert d_scores1[3][0] == pytest.approx(0, abs=eps) - assert d_scores1[3][1] == pytest.approx(0, abs=eps) - assert d_scores1[3][2] == pytest.approx(-0.5, abs=eps) + assert d_scores1[3][0] == pytest.approx(0, eps) + assert d_scores1[3][1] == pytest.approx(0, eps) + assert d_scores1[3][2] == pytest.approx(-0.5, eps) # Test the second batch - assert d_scores2[0][0] == pytest.approx(0.1, abs=eps) - assert d_scores2[0][1] == pytest.approx(-0.35, abs=eps) - - loss = seq_cross_entropy_names_normalize.get_loss(guesses, labels) - assert loss == pytest.approx(1.09, abs=eps) + assert d_scores2[0][0] == pytest.approx(0.1, eps) + assert d_scores2[0][1] == pytest.approx(-0.35, eps) - -@pytest.mark.parametrize( - "guesses, labels, names, grad", - [ - ( - [guesses1], - [["A", "!A", "", "!C"]], - ["A", "B", "C"], - numpy.array( - [ - [-0.9, 0.5, 0.4], # First is correct - [0.4, 0.0, 0.0], # Not first one - [0.0, 0.0, 0.0], # Missing - [0.0, 0.0, 0.85], # Not last one - ] - ), - ) - ], -) -def test_sequence_crossentropy_missing_negative(guesses, labels, names, grad): - sparse_seq_ce = seq_ce_factory( - names=names, normalize=False, neg_prefix="!", missing_value="" + loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss( + guesses, labels ) - d_scores = sparse_seq_ce.get_grad(guesses, labels) - assert numpy.allclose(d_scores, grad) + assert loss == pytest.approx(1.09, eps) @pytest.mark.parametrize( "guesses, labels, names", [ - ([guesses1_legacy], [["A", "!A", "", "!C"]], ["A", "B", "C"]), + ([guesses1], [["A", "!A", "", "!C"]], ["A", "B", "C"]), ], ) -@pytest.mark.parametrize("version", [3]) -def test_legacy_sequence_categorical_missing_negative(guesses, labels, names, version): - seq_cross_entropy = _get_legacy_seq_cross_entropy( - version, normalize=False, names=names, neg_prefix="!", missing_value="" - ) - d_scores = seq_cross_entropy.get_grad(guesses, labels) +def test_sequence_categorical_missing_negative(guesses, labels, names): + d_scores = SequenceCategoricalCrossentropy( + normalize=False, names=names, neg_prefix="!", missing_value="" + ).get_grad(guesses, labels) d_scores0 = d_scores[0] # [0.1, 0.5, 0.6] should be A - assert d_scores0[0][0] == pytest.approx(-0.9, abs=eps) - assert d_scores0[0][1] == pytest.approx(0.5, abs=eps) - assert d_scores0[0][2] == pytest.approx(0.6, abs=eps) + assert d_scores0[0][0] == pytest.approx(-0.9, eps) + assert d_scores0[0][1] == pytest.approx(0.5, eps) + assert d_scores0[0][2] == pytest.approx(0.6, eps) # [0.4, 0.6, 0.3] should NOT be A - assert d_scores0[1][0] == pytest.approx(0.4, abs=eps) - assert 
d_scores0[1][1] == pytest.approx(0.0, abs=eps) - assert d_scores0[1][2] == pytest.approx(0.0, abs=eps) + assert d_scores0[1][0] == pytest.approx(0.4, eps) + assert d_scores0[1][1] == pytest.approx(0.0, eps) + assert d_scores0[1][2] == pytest.approx(0.0, eps) # [1, 1, 1] has missing gold label - assert d_scores0[2][0] == pytest.approx(0.0, abs=eps) - assert d_scores0[2][1] == pytest.approx(0.0, abs=eps) - assert d_scores0[2][2] == pytest.approx(0.0, abs=eps) + assert d_scores0[2][0] == pytest.approx(0.0, eps) + assert d_scores0[2][1] == pytest.approx(0.0, eps) + assert d_scores0[2][2] == pytest.approx(0.0, eps) # [0.0, 0.0, 0.0] should NOT be C - assert d_scores0[3][0] == pytest.approx(0.0, abs=eps) - assert d_scores0[3][1] == pytest.approx(0.0, abs=eps) - assert d_scores0[3][2] == pytest.approx(0.0, abs=eps) + assert d_scores0[3][0] == pytest.approx(0.0, eps) + assert d_scores0[3][1] == pytest.approx(0.0, eps) + assert d_scores0[3][2] == pytest.approx(0.0, eps) def test_L2(): @@ -619,10 +241,10 @@ def test_L2(): ) loss_not_normalized = L2Distance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(20, abs=eps) + assert loss_not_normalized == pytest.approx(20, eps) loss_normalized = L2Distance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(5, abs=eps) + assert loss_normalized == pytest.approx(5, eps) def test_cosine_orthogonal(): @@ -638,10 +260,10 @@ def test_cosine_orthogonal(): assert d_vecs[1][1] > 0 loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(2, abs=eps) + assert loss_not_normalized == pytest.approx(2, eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(1, abs=eps) + assert loss_normalized == pytest.approx(1, eps) def test_cosine_equal(): @@ -654,10 +276,10 @@ def test_cosine_equal(): numpy.testing.assert_allclose(d_vec1, numpy.zeros(d_vec1.shape), rtol=eps, atol=eps) loss_not_normalized = CosineDistance(normalize=False).get_loss(vec1, vec2) - assert loss_not_normalized == pytest.approx(0, abs=eps) + assert loss_not_normalized == pytest.approx(0, eps) loss_normalized = CosineDistance(normalize=True).get_loss(vec1, vec2) - assert loss_normalized == pytest.approx(0, abs=eps) + assert loss_normalized == pytest.approx(0, eps) def test_cosine_unmatched(): @@ -670,26 +292,19 @@ def test_cosine_unmatched(): @pytest.mark.parametrize( "name,kwargs,args", [ - ("CategoricalCrossentropy.v1", {}, (guesses1, labels1)), - ("SequenceCategoricalCrossentropy.v1", {}, ([guesses1], [labels1])), - ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (guesses1, labels1)), - ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (guesses1, labels1)), - ("SparseCategoricalCrossentropy.v4", {"neg_prefix": "!"}, (guesses1, labels1)), - ("CategoricalCrossentropy.v4", {}, (guesses1, labels1_full)), + ("CategoricalCrossentropy.v1", {}, (scores0, labels0)), + ("SequenceCategoricalCrossentropy.v1", {}, ([scores0], [labels0])), + ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (scores0, labels0)), + ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (scores0, labels0)), ( "SequenceCategoricalCrossentropy.v2", {"neg_prefix": "!"}, - ([guesses1], [labels1]), + ([scores0], [labels0]), ), ( "SequenceCategoricalCrossentropy.v3", {"neg_prefix": "!"}, - ([guesses1], [labels1]), - ), - ( - "SequenceCategoricalCrossentropy.v4", - {"neg_prefix": "!"}, - ([guesses1], [labels1]), + ([scores0], [labels0]), ), ("L2Distance.v1", 
{}, (scores0, scores0)), ( diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py index f525a5133..133efbe60 100644 --- a/thinc/tests/test_util.py +++ b/thinc/tests/test_util.py @@ -5,7 +5,6 @@ from thinc.util import get_array_module, is_numpy_array, to_categorical from thinc.util import is_cupy_array from thinc.util import convert_recursive -from thinc.util import smooth_one_hot from thinc.types import ArgsKwargs @@ -147,26 +146,6 @@ def test_to_categorical(label_smoothing): to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.88) -@given( - n_classes=strategies.lengths(lo=2, hi=100), - n_samples=strategies.lengths(lo=1, hi=100), - label_smoothing=strategies.floats(min_value=0.0, max_value=1.0) -) -def test_smooth_one_hot(n_samples, n_classes, label_smoothing): - one_hot = numpy.zeros((n_samples, n_classes)) - labels = numpy.random.randint(0, n_classes, (n_samples,)) - one_hot[numpy.arange(n_samples), labels] = 1 - max_smooth = (n_classes - 1) / n_classes - if label_smoothing >= max_smooth: - with pytest.raises(ValueError, match=r"label_smoothing parameter has to be less than"): - smooth_one_hot(one_hot, label_smoothing) - else: - smoothed = smooth_one_hot(one_hot, label_smoothing) - assert numpy.all(numpy.argmax(smoothed, axis=1) == labels) - assert smoothed.shape == one_hot.shape - assert numpy.allclose(smoothed.sum(1), 1.0) - - def test_convert_recursive(): is_match = lambda obj: obj == "foo" convert_item = lambda obj: obj.upper() diff --git a/thinc/util.py b/thinc/util.py index c7212818f..9afec29ba 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,7 +1,5 @@ from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar -from typing import List, Mapping -from typing import TYPE_CHECKING - +from typing import List, Mapping, TYPE_CHECKING import numpy import platform import random @@ -15,17 +13,18 @@ import contextlib from contextvars import ContextVar from dataclasses import dataclass + from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu from .compat import has_torch_mps from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack - -from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd, Floats2d # noqa: E402 +from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 from . import types # noqa: E402 if TYPE_CHECKING: from .api import Ops + DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) @@ -261,32 +260,6 @@ def to_categorical( return label_distr[Y] -def smooth_one_hot(X: Floats2d, label_smoothing: float) -> Floats2d: - """ - Apply label-smoothing to one-hot array. - """ - n_classes = X.shape[1] - max_smooth = (n_classes - 1) / n_classes - if label_smoothing < 0.0: - raise ValueError( - "Label-smoothing parameter has to be greater than or equal to 0" - ) - if not n_classes > 1: - raise ValueError( - "n_classes should be greater than 1 when label smoothing is enabled," - f"but {n_classes} was provided." - ) - if label_smoothing >= max_smooth: - raise ValueError( - f"For {n_classes} classes " - "label_smoothing parameter has to be less than " - f"{max_smooth}, but found {label_smoothing}." 
- ) - X[X == 1] = 1 - label_smoothing - X[X == 0] = label_smoothing / (n_classes - 1) - return X - - def get_width( X: Union[ArrayXd, Ragged, Padded, Sequence[ArrayXd]], *, dim: int = -1 ) -> int: @@ -650,7 +623,6 @@ def check_consistency(self, arr: ArrayXd): "require_gpu", "copy_array", "to_categorical", - "smooth_one_hot", "get_width", "xp2torch", "torch2xp", From 95f894f3c4b5e6df5cf26fae280d3864e3f25423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 9 Jan 2024 09:19:28 +0100 Subject: [PATCH 18/30] isort --- thinc/__init__.py | 1 - thinc/api.py | 210 ++++++++++++++---- thinc/backends/__init__.py | 27 ++- thinc/backends/_cupy_allocators.py | 2 +- thinc/backends/_custom_kernels.py | 11 +- thinc/backends/_param_server.py | 3 +- thinc/backends/cblas.pxd | 1 - thinc/backends/cupy_ops.py | 21 +- thinc/backends/mps_ops.py | 4 +- thinc/backends/numpy_ops.pyx | 24 +- thinc/backends/ops.py | 55 ++++- thinc/compat.py | 4 +- thinc/config.py | 3 +- thinc/initializers.py | 1 + thinc/layers/__init__.py | 94 ++++---- thinc/layers/add.py | 5 +- thinc/layers/array_getitem.py | 6 +- thinc/layers/bidirectional.py | 5 +- thinc/layers/cauchysimilarity.py | 5 +- thinc/layers/chain.py | 7 +- thinc/layers/clipped_linear.py | 10 +- thinc/layers/clone.py | 9 +- thinc/layers/concatenate.py | 20 +- thinc/layers/dish.py | 10 +- thinc/layers/dropout.py | 7 +- thinc/layers/embed.py | 11 +- thinc/layers/expand_window.py | 5 +- thinc/layers/gelu.py | 10 +- thinc/layers/hard_swish.py | 10 +- thinc/layers/hard_swish_mobilenet.py | 10 +- thinc/layers/hashembed.py | 11 +- thinc/layers/layernorm.py | 7 +- thinc/layers/linear.py | 7 +- thinc/layers/list2array.py | 5 +- thinc/layers/list2padded.py | 7 +- thinc/layers/list2ragged.py | 7 +- thinc/layers/logistic.py | 5 +- thinc/layers/lstm.py | 13 +- thinc/layers/map_list.py | 4 +- thinc/layers/maxout.py | 7 +- thinc/layers/mish.py | 9 +- thinc/layers/multisoftmax.py | 7 +- thinc/layers/mxnetwrapper.py | 7 +- thinc/layers/noop.py | 5 +- thinc/layers/padded2list.py | 7 +- thinc/layers/parametricattention.py | 5 +- thinc/layers/premap_ids.pyx | 8 +- thinc/layers/pytorchwrapper.py | 16 +- thinc/layers/ragged2list.py | 7 +- thinc/layers/reduce_first.py | 5 +- thinc/layers/reduce_last.py | 4 +- thinc/layers/reduce_max.py | 7 +- thinc/layers/reduce_mean.py | 7 +- thinc/layers/reduce_sum.py | 3 +- thinc/layers/relu.py | 11 +- thinc/layers/remap_ids.py | 8 +- thinc/layers/residual.py | 6 +- thinc/layers/resizable.py | 2 +- thinc/layers/siamese.py | 5 +- thinc/layers/sigmoid.py | 7 +- thinc/layers/sigmoid_activation.py | 4 +- thinc/layers/softmax.py | 9 +- thinc/layers/softmax_activation.py | 5 +- thinc/layers/sparselinear.pyx | 15 +- thinc/layers/strings2arrays.py | 6 +- thinc/layers/swish.py | 10 +- thinc/layers/tensorflowwrapper.py | 14 +- thinc/layers/torchscriptwrapper.py | 7 +- thinc/layers/tuplify.py | 4 +- thinc/layers/uniqued.py | 8 +- thinc/layers/with_array.py | 7 +- thinc/layers/with_array2d.py | 5 +- thinc/layers/with_cpu.py | 5 +- thinc/layers/with_debug.py | 2 +- thinc/layers/with_flatten.py | 4 +- thinc/layers/with_flatten_v2.py | 5 +- thinc/layers/with_getitem.py | 5 +- thinc/layers/with_list.py | 6 +- thinc/layers/with_nvtx_range.py | 3 +- thinc/layers/with_padded.py | 7 +- thinc/layers/with_ragged.py | 7 +- thinc/layers/with_reshape.py | 7 +- thinc/layers/with_signpost_interval.py | 3 +- thinc/loss.py | 17 +- thinc/model.py | 37 ++- thinc/mypy.py | 13 +- thinc/optimizers.py | 9 +- thinc/schedules.py | 6 +- thinc/shims/__init__.py | 7 +- 
thinc/shims/mxnet.py | 14 +- thinc/shims/pytorch.py | 17 +- thinc/shims/shim.py | 6 +- thinc/shims/tensorflow.py | 9 +- thinc/shims/torchscript.py | 3 +- thinc/tests/backends/test_mem.py | 3 +- thinc/tests/backends/test_ops.py | 29 ++- thinc/tests/conftest.py | 5 +- thinc/tests/layers/test_basic_tagger.py | 17 +- thinc/tests/layers/test_combinators.py | 16 +- thinc/tests/layers/test_feed_forward.py | 8 +- thinc/tests/layers/test_hash_embed.py | 1 + thinc/tests/layers/test_layers_api.py | 13 +- thinc/tests/layers/test_linear.py | 7 +- thinc/tests/layers/test_lstm.py | 9 +- thinc/tests/layers/test_mappers.py | 3 +- thinc/tests/layers/test_mnist.py | 16 +- thinc/tests/layers/test_mxnet_wrapper.py | 15 +- thinc/tests/layers/test_pytorch_wrapper.py | 38 +++- thinc/tests/layers/test_reduce.py | 3 +- thinc/tests/layers/test_resizable.py | 8 +- thinc/tests/layers/test_shim.py | 2 + thinc/tests/layers/test_softmax.py | 2 +- thinc/tests/layers/test_sparse_linear.py | 4 +- thinc/tests/layers/test_tensorflow_wrapper.py | 16 +- thinc/tests/layers/test_torchscriptwrapper.py | 9 +- thinc/tests/layers/test_transforms.py | 3 +- thinc/tests/layers/test_uniqued.py | 9 +- thinc/tests/layers/test_with_debug.py | 3 +- thinc/tests/layers/test_with_flatten.py | 1 + thinc/tests/layers/test_with_transforms.py | 19 +- thinc/tests/model/test_model.py | 29 ++- thinc/tests/model/test_validation.py | 13 +- thinc/tests/mypy/modules/fail_no_plugin.py | 2 +- thinc/tests/mypy/modules/fail_plugin.py | 2 +- thinc/tests/mypy/modules/success_no_plugin.py | 2 +- thinc/tests/mypy/modules/success_plugin.py | 2 +- thinc/tests/mypy/test_mypy.py | 2 +- thinc/tests/regression/issue519/program.py | 2 +- thinc/tests/regression/test_issue208.py | 2 +- thinc/tests/shims/test_pytorch_grad_scaler.py | 4 +- thinc/tests/strategies.py | 5 +- thinc/tests/test_config.py | 17 +- thinc/tests/test_import__all__.py | 4 +- thinc/tests/test_indexing.py | 5 +- thinc/tests/test_initializers.py | 12 +- thinc/tests/test_loss.py | 11 +- thinc/tests/test_optimizers.py | 6 +- thinc/tests/test_schedules.py | 13 +- thinc/tests/test_serialize.py | 13 +- thinc/tests/test_types.py | 15 +- thinc/tests/test_util.py | 16 +- thinc/tests/util.py | 8 +- thinc/types.py | 31 ++- thinc/util.py | 53 +++-- 144 files changed, 1006 insertions(+), 608 deletions(-) diff --git a/thinc/__init__.py b/thinc/__init__.py index dfa821c4f..8f4a8a5a5 100644 --- a/thinc/__init__.py +++ b/thinc/__init__.py @@ -4,7 +4,6 @@ from .about import __version__ from .config import registry - # fmt: off __all__ = [ "registry", diff --git a/thinc/api.py b/thinc/api.py index 74633addd..74a654622 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -1,53 +1,165 @@ -from .config import Config, registry, ConfigValidationError -from .initializers import normal_init, uniform_init, glorot_uniform_init, zero_init -from .initializers import configure_normal_init -from .loss import CategoricalCrossentropy, L2Distance, CosineDistance -from .loss import SequenceCategoricalCrossentropy -from .model import Model, serialize_attr, deserialize_attr -from .model import set_dropout_rate, change_attr_values, wrap_model_recursive -from .shims import Shim, PyTorchGradScaler, PyTorchShim, TensorFlowShim, keras_model_fns -from .shims import MXNetShim, TorchScriptShim, maybe_handshake_model -from .optimizers import Adam, RAdam, SGD, Optimizer -from .schedules import Schedule, cyclic_triangular, warmup_linear, constant -from .schedules import constant_then, decaying, slanted_triangular, compounding -from .schedules import plateau 
-from .types import Ragged, Padded, ArgsKwargs, Unserializable -from .util import fix_random_seed, is_cupy_array, set_active_gpu -from .util import prefer_gpu, require_gpu, require_cpu -from .util import DataValidationError, data_validation -from .util import to_categorical, get_width, get_array_module, to_numpy -from .util import torch2xp, xp2torch, tensorflow2xp, xp2tensorflow, mxnet2xp, xp2mxnet -from .util import get_torch_default_device +from .backends import ( + CupyOps, + MPSOps, + NumpyOps, + Ops, + get_current_ops, + get_ops, + set_current_ops, + set_gpu_allocator, + use_ops, + use_pytorch_for_gpu_memory, + use_tensorflow_for_gpu_memory, +) from .compat import has_cupy -from .backends import get_ops, set_current_ops, get_current_ops, use_ops -from .backends import Ops, CupyOps, MPSOps, NumpyOps, set_gpu_allocator -from .backends import use_pytorch_for_gpu_memory, use_tensorflow_for_gpu_memory - -from .layers import Dropout, Embed, expand_window, HashEmbed, LayerNorm, Linear -from .layers import Maxout, Mish, MultiSoftmax, Relu, softmax_activation, Softmax, LSTM -from .layers import CauchySimilarity, ParametricAttention, Logistic -from .layers import resizable, sigmoid_activation, Sigmoid, SparseLinear -from .layers import SparseLinear_v2, ClippedLinear, ReluK, HardTanh, HardSigmoid -from .layers import Dish, HardSwish, HardSwishMobilenet, Swish, Gelu -from .layers import PyTorchWrapper, PyTorchRNNWrapper, PyTorchLSTM -from .layers import TensorFlowWrapper, keras_subclass, MXNetWrapper -from .layers import PyTorchWrapper_v2, Softmax_v2, PyTorchWrapper_v3 -from .layers import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper - -from .layers import add, bidirectional, chain, clone, concatenate, noop -from .layers import residual, uniqued, siamese, list2ragged, ragged2list -from .layers import map_list -from .layers import with_array, with_array2d -from .layers import with_padded, with_list, with_ragged, with_flatten -from .layers import with_reshape, with_getitem, strings2arrays, list2array -from .layers import list2ragged, ragged2list, list2padded, padded2list -from .layers import remap_ids, remap_ids_v2, premap_ids -from .layers import array_getitem, with_cpu, with_debug, with_nvtx_range -from .layers import with_signpost_interval -from .layers import tuplify, with_flatten_v2 - -from .layers import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum - +from .config import Config, ConfigValidationError, registry +from .initializers import ( + configure_normal_init, + glorot_uniform_init, + normal_init, + uniform_init, + zero_init, +) +from .layers import ( + LSTM, + CauchySimilarity, + ClippedLinear, + Dish, + Dropout, + Embed, + Gelu, + HardSigmoid, + HardSwish, + HardSwishMobilenet, + HardTanh, + HashEmbed, + LayerNorm, + Linear, + Logistic, + Maxout, + Mish, + MultiSoftmax, + MXNetWrapper, + ParametricAttention, + PyTorchLSTM, + PyTorchRNNWrapper, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, + Relu, + ReluK, + Sigmoid, + Softmax, + Softmax_v2, + SparseLinear, + SparseLinear_v2, + Swish, + TensorFlowWrapper, + TorchScriptWrapper_v1, + add, + array_getitem, + bidirectional, + chain, + clone, + concatenate, + expand_window, + keras_subclass, + list2array, + list2padded, + list2ragged, + map_list, + noop, + padded2list, + premap_ids, + pytorch_to_torchscript_wrapper, + ragged2list, + reduce_first, + reduce_last, + reduce_max, + reduce_mean, + reduce_sum, + remap_ids, + remap_ids_v2, + residual, + resizable, + siamese, + sigmoid_activation, + 
softmax_activation, + strings2arrays, + tuplify, + uniqued, + with_array, + with_array2d, + with_cpu, + with_debug, + with_flatten, + with_flatten_v2, + with_getitem, + with_list, + with_nvtx_range, + with_padded, + with_ragged, + with_reshape, + with_signpost_interval, +) +from .loss import ( + CategoricalCrossentropy, + CosineDistance, + L2Distance, + SequenceCategoricalCrossentropy, +) +from .model import ( + Model, + change_attr_values, + deserialize_attr, + serialize_attr, + set_dropout_rate, + wrap_model_recursive, +) +from .optimizers import SGD, Adam, Optimizer, RAdam +from .schedules import ( + Schedule, + compounding, + constant, + constant_then, + cyclic_triangular, + decaying, + plateau, + slanted_triangular, + warmup_linear, +) +from .shims import ( + MXNetShim, + PyTorchGradScaler, + PyTorchShim, + Shim, + TensorFlowShim, + TorchScriptShim, + keras_model_fns, + maybe_handshake_model, +) +from .types import ArgsKwargs, Padded, Ragged, Unserializable +from .util import ( + DataValidationError, + data_validation, + fix_random_seed, + get_array_module, + get_torch_default_device, + get_width, + is_cupy_array, + mxnet2xp, + prefer_gpu, + require_cpu, + require_gpu, + set_active_gpu, + tensorflow2xp, + to_categorical, + to_numpy, + torch2xp, + xp2mxnet, + xp2tensorflow, + xp2torch, +) # fmt: off __all__ = [ diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py index c21620126..8973c8836 100644 --- a/thinc/backends/__init__.py +++ b/thinc/backends/__init__.py @@ -1,20 +1,23 @@ import contextlib -from typing import Type, Dict, Any, Callable, Optional, cast - -from contextvars import ContextVar import threading +from contextvars import ContextVar +from typing import Any, Callable, Dict, Optional, Type, cast -from .ops import Ops -from .cupy_ops import CupyOps -from .numpy_ops import NumpyOps -from .mps_ops import MPSOps -from ._cupy_allocators import cupy_tensorflow_allocator, cupy_pytorch_allocator -from ._param_server import ParamServer -from ..util import assert_tensorflow_installed, assert_pytorch_installed -from ..util import get_torch_default_device, is_cupy_array, require_cpu from .. 
import registry from ..compat import cupy, has_cupy - +from ..util import ( + assert_pytorch_installed, + assert_tensorflow_installed, + get_torch_default_device, + is_cupy_array, + require_cpu, +) +from ._cupy_allocators import cupy_pytorch_allocator, cupy_tensorflow_allocator +from ._param_server import ParamServer +from .cupy_ops import CupyOps +from .mps_ops import MPSOps +from .numpy_ops import NumpyOps +from .ops import Ops context_ops: ContextVar[Optional[Ops]] = ContextVar("context_ops", default=None) context_pools: ContextVar[dict] = ContextVar("context_pools", default={}) diff --git a/thinc/backends/_cupy_allocators.py b/thinc/backends/_cupy_allocators.py index f2b6faee9..77c958e36 100644 --- a/thinc/backends/_cupy_allocators.py +++ b/thinc/backends/_cupy_allocators.py @@ -1,8 +1,8 @@ from typing import cast +from ..compat import cupy, tensorflow, torch from ..types import ArrayXd from ..util import get_torch_default_device, tensorflow2xp -from ..compat import torch, cupy, tensorflow def cupy_tensorflow_allocator(size_in_bytes: int): diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index 0b868e6d6..fa837017d 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -1,12 +1,13 @@ -from typing import Callable, Optional, Tuple -from functools import reduce -import numpy import operator import re -from pathlib import Path from collections import defaultdict -from ..compat import cupy, has_cupy_gpu +from functools import reduce +from pathlib import Path +from typing import Callable, Optional, Tuple +import numpy + +from ..compat import cupy, has_cupy_gpu PWD = Path(__file__).parent KERNELS_SRC = (PWD / "_custom_kernels.cu").read_text(encoding="utf8") diff --git a/thinc/backends/_param_server.py b/thinc/backends/_param_server.py index 4ce374a4e..db7b5a505 100644 --- a/thinc/backends/_param_server.py +++ b/thinc/backends/_param_server.py @@ -1,9 +1,8 @@ -from typing import Dict, Tuple, Optional, Any +from typing import Any, Dict, Optional, Tuple from ..types import FloatsXd from ..util import get_array_module - KeyT = Tuple[int, str] diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd index a789ef4a3..c608d8702 100644 --- a/thinc/backends/cblas.pxd +++ b/thinc/backends/cblas.pxd @@ -1,6 +1,5 @@ from libcpp.memory cimport shared_ptr - ctypedef void (*sgemm_ptr)(bint transA, bint transB, int M, int N, int K, float alpha, const float* A, int lda, const float* B, int ldb, float beta, float* C, int ldc) nogil diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 506276380..366faf70a 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -1,13 +1,20 @@ import numpy + from .. import registry -from .ops import Ops -from .numpy_ops import NumpyOps -from . import _custom_kernels -from ..types import DeviceTypes -from ..util import torch2xp, tensorflow2xp, mxnet2xp -from ..util import is_cupy_array -from ..util import is_torch_cuda_array, is_tensorflow_gpu_array, is_mxnet_gpu_array from ..compat import cupy, cupyx +from ..types import DeviceTypes +from ..util import ( + is_cupy_array, + is_mxnet_gpu_array, + is_tensorflow_gpu_array, + is_torch_cuda_array, + mxnet2xp, + tensorflow2xp, + torch2xp, +) +from . 
import _custom_kernels +from .numpy_ops import NumpyOps +from .ops import Ops @registry.ops("CupyOps") diff --git a/thinc/backends/mps_ops.py b/thinc/backends/mps_ops.py index 8ebbd4e4b..c6ba71f11 100644 --- a/thinc/backends/mps_ops.py +++ b/thinc/backends/mps_ops.py @@ -1,8 +1,10 @@ from typing import TYPE_CHECKING + import numpy from .. import registry -from . import NumpyOps, Ops +from .numpy_ops import NumpyOps +from .ops import Ops if TYPE_CHECKING: # Type checking does not work with dynamic base classes, since MyPy cannot diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 45d3d9093..87c6b9d01 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -1,27 +1,29 @@ # cython: cdivision=True # cython: infer_types=True # cython: profile=True -from typing import Optional from collections.abc import Sized +from typing import Optional + import numpy cimport cython -from libc.string cimport memcpy, memset -from libc.stdlib cimport calloc, malloc, free -from libc.stdint cimport uint32_t, uint64_t -from libc.string cimport memcpy -from libc.math cimport isnan +cimport numpy as np from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap +from libc.math cimport isnan +from libc.stdint cimport uint32_t, uint64_t +from libc.stdlib cimport calloc, free, malloc +from libc.string cimport memcpy, memset from murmurhash.mrmr cimport hash64 -cimport numpy as np +from preshed.maps cimport PreshMap from .. import registry +from ..types import ArrayXd, DeviceTypes, DTypes, Shape from ..util import copy_array, get_array_module -from ..types import DeviceTypes, DTypes, Shape, ArrayXd -from .cblas cimport CBlas, daxpy, saxpy, sgemm, dgemm, sscal -from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights + +from .cblas cimport CBlas, daxpy, dgemm, saxpy, sgemm, sscal + from ..compat import has_blis +from .ops import Ops, _split_weights, _transpose_weights, _untranspose_unsplit_weights cdef extern from "math.h": diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 8bb770023..01bb2f852 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -1,18 +1,53 @@ +import itertools import math +from typing import ( + Any, + Iterator, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, + overload, +) -from typing import Optional, List, Tuple, Sequence, Type, Union, cast, TypeVar -from typing import Iterator, overload, Any import numpy -import itertools -from ..types import Xp, Shape, DTypes, DTypesInt, DTypesFloat, List2d, ArrayXd -from ..types import Floats1d, Floats2d, Floats3d, Floats4d -from ..types import Array1d, Array2d, Array3d, Array4d, ListXd -from ..types import FloatsXd, Ints1d, Ints2d, Ints3d, Ints4d, IntsXd, _Floats -from ..types import FloatsXdT -from ..types import DeviceTypes, Generator, Padded, Batchable, SizedGenerator +from ..types import ( + Array1d, + Array2d, + Array3d, + Array4d, + ArrayXd, + Batchable, + DeviceTypes, + DTypes, + DTypesFloat, + DTypesInt, + Floats1d, + Floats2d, + Floats3d, + Floats4d, + FloatsXd, + FloatsXdT, + Generator, + Ints1d, + Ints2d, + Ints3d, + Ints4d, + IntsXd, + List2d, + ListXd, + Padded, + Shape, + SizedGenerator, + Xp, + _Floats, +) from ..util import get_array_module, is_xp_array, to_numpy - from .cblas import CBlas ArrayT = TypeVar("ArrayT", bound=ArrayXd) diff --git a/thinc/compat.py b/thinc/compat.py index 6d8b139fe..7e79cdaf9 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -27,8 +27,8 @@ try: # pragma: no cover - import 
torch.utils.dlpack import torch + import torch.utils.dlpack has_torch = True has_torch_cuda_gpu = torch.cuda.device_count() != 0 @@ -51,8 +51,8 @@ torch_version = Version("0.0.0") try: # pragma: no cover - import tensorflow.experimental.dlpack import tensorflow + import tensorflow.experimental.dlpack has_tensorflow = True has_tensorflow_gpu = len(tensorflow.config.get_visible_devices("GPU")) > 0 diff --git a/thinc/config.py b/thinc/config.py index e5452819b..434c96085 100644 --- a/thinc/config.py +++ b/thinc/config.py @@ -1,6 +1,7 @@ import catalogue import confection -from confection import Config, ConfigValidationError, Promise, VARIABLE_RE +from confection import VARIABLE_RE, Config, ConfigValidationError, Promise + from .types import Decorator diff --git a/thinc/initializers.py b/thinc/initializers.py index 1333911a3..feb02889d 100644 --- a/thinc/initializers.py +++ b/thinc/initializers.py @@ -1,4 +1,5 @@ from typing import Callable, cast + import numpy from .backends import Ops diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 4b73a2dce..032af5fde 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -1,48 +1,48 @@ # Weights layers +# Combinators +from .add import add + +# Array manipulation +from .array_getitem import array_getitem +from .bidirectional import bidirectional from .cauchysimilarity import CauchySimilarity +from .chain import chain +from .clipped_linear import ClippedLinear, HardSigmoid, HardTanh, ReluK +from .clone import clone +from .concatenate import concatenate from .dish import Dish from .dropout import Dropout from .embed import Embed from .expand_window import expand_window +from .gelu import Gelu +from .hard_swish import HardSwish +from .hard_swish_mobilenet import HardSwishMobilenet from .hashembed import HashEmbed from .layernorm import LayerNorm from .linear import Linear -from .lstm import LSTM, PyTorchLSTM + +# Data-type transfers +from .list2array import list2array +from .list2padded import list2padded +from .list2ragged import list2ragged from .logistic import Logistic +from .lstm import LSTM, PyTorchLSTM +from .map_list import map_list from .maxout import Maxout from .mish import Mish from .multisoftmax import MultiSoftmax -from .parametricattention import ParametricAttention -from .pytorchwrapper import PyTorchWrapper, PyTorchWrapper_v2, PyTorchWrapper_v3 -from .pytorchwrapper import PyTorchRNNWrapper -from .relu import Relu -from .clipped_linear import ClippedLinear, ReluK, HardSigmoid, HardTanh -from .hard_swish import HardSwish -from .hard_swish_mobilenet import HardSwishMobilenet -from .swish import Swish -from .gelu import Gelu -from .resizable import resizable -from .sigmoid_activation import sigmoid_activation -from .sigmoid import Sigmoid -from .softmax_activation import softmax_activation -from .softmax import Softmax, Softmax_v2 -from .sparselinear import SparseLinear, SparseLinear_v2 -from .tensorflowwrapper import TensorFlowWrapper, keras_subclass -from .torchscriptwrapper import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper from .mxnetwrapper import MXNetWrapper - -# Combinators -from .add import add -from .bidirectional import bidirectional -from .chain import chain -from .clone import clone -from .concatenate import concatenate -from .map_list import map_list from .noop import noop -from .residual import residual -from .uniqued import uniqued -from .siamese import siamese -from .tuplify import tuplify +from .padded2list import padded2list +from .parametricattention import ParametricAttention 
+from .premap_ids import premap_ids +from .pytorchwrapper import ( + PyTorchRNNWrapper, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, +) +from .ragged2list import ragged2list # Pooling from .reduce_first import reduce_first @@ -50,34 +50,36 @@ from .reduce_max import reduce_max from .reduce_mean import reduce_mean from .reduce_sum import reduce_sum - -# Array manipulation -from .array_getitem import array_getitem - -# Data-type transfers -from .list2array import list2array -from .list2ragged import list2ragged -from .list2padded import list2padded -from .ragged2list import ragged2list -from .padded2list import padded2list +from .relu import Relu from .remap_ids import remap_ids, remap_ids_v2 -from .premap_ids import premap_ids +from .residual import residual +from .resizable import resizable +from .siamese import siamese +from .sigmoid import Sigmoid +from .sigmoid_activation import sigmoid_activation +from .softmax import Softmax, Softmax_v2 +from .softmax_activation import softmax_activation +from .sparselinear import SparseLinear, SparseLinear_v2 from .strings2arrays import strings2arrays +from .swish import Swish +from .tensorflowwrapper import TensorFlowWrapper, keras_subclass +from .torchscriptwrapper import TorchScriptWrapper_v1, pytorch_to_torchscript_wrapper +from .tuplify import tuplify +from .uniqued import uniqued from .with_array import with_array from .with_array2d import with_array2d from .with_cpu import with_cpu +from .with_debug import with_debug from .with_flatten import with_flatten from .with_flatten_v2 import with_flatten_v2 -from .with_padded import with_padded +from .with_getitem import with_getitem from .with_list import with_list +from .with_nvtx_range import with_nvtx_range +from .with_padded import with_padded from .with_ragged import with_ragged from .with_reshape import with_reshape -from .with_getitem import with_getitem -from .with_debug import with_debug -from .with_nvtx_range import with_nvtx_range from .with_signpost_interval import with_signpost_interval - # fmt: off __all__ = [ "CauchySimilarity", diff --git a/thinc/layers/add.py b/thinc/layers/add.py index 60b1f46b9..a3aa1af17 100644 --- a/thinc/layers/add.py +++ b/thinc/layers/add.py @@ -1,11 +1,10 @@ -from typing import Any, Tuple, Callable, Optional, TypeVar, Dict +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry +from ..model import Model from ..types import ArrayXd, XY_XY_OutT from ..util import get_width - InT = TypeVar("InT", bound=Any) OutT = TypeVar("OutT", bound=ArrayXd) diff --git a/thinc/layers/array_getitem.py b/thinc/layers/array_getitem.py index 17ffcb7ee..219b4ea1c 100644 --- a/thinc/layers/array_getitem.py +++ b/thinc/layers/array_getitem.py @@ -1,7 +1,7 @@ -from typing import Union, Sequence, Tuple, TypeVar -from ..types import ArrayXd, FloatsXd, IntsXd -from ..model import Model +from typing import Sequence, Tuple, TypeVar, Union +from ..model import Model +from ..types import ArrayXd, FloatsXd, IntsXd AxisIndex = Union[int, slice, Sequence[int]] Index = Union[AxisIndex, Tuple[AxisIndex, ...]] diff --git a/thinc/layers/bidirectional.py b/thinc/layers/bidirectional.py index 1ff73f013..8cea04e30 100644 --- a/thinc/layers/bidirectional.py +++ b/thinc/layers/bidirectional.py @@ -1,11 +1,10 @@ -from typing import Optional, Tuple, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..backends import Ops -from ..model import Model from ..config import registry +from ..model import 
Model from ..types import Padded - InT = Padded OutT = Padded diff --git a/thinc/layers/cauchysimilarity.py b/thinc/layers/cauchysimilarity.py index 25af8d9df..57e5932ec 100644 --- a/thinc/layers/cauchysimilarity.py +++ b/thinc/layers/cauchysimilarity.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats1d, Floats2d from ..util import get_width - InT = Tuple[Floats2d, Floats2d] OutT = Floats1d diff --git a/thinc/layers/chain.py b/thinc/layers/chain.py index 258ee0902..a7e3ee7da 100644 --- a/thinc/layers/chain.py +++ b/thinc/layers/chain.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, Any, Dict, List, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..util import get_width +from ..model import Model from ..types import XY_YZ_OutT - +from ..util import get_width InT = TypeVar("InT") MidT = TypeVar("MidT") diff --git a/thinc/layers/clipped_linear.py b/thinc/layers/clipped_linear.py index 34bb8ade8..efe295fa6 100644 --- a/thinc/layers/clipped_linear.py +++ b/thinc/layers/clipped_linear.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import glorot_uniform_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import glorot_uniform_init, zero_init +from .layernorm import LayerNorm @registry.layers("ClippedLinear.v1") diff --git a/thinc/layers/clone.py b/thinc/layers/clone.py index 8b433407d..1758f5fe7 100644 --- a/thinc/layers/clone.py +++ b/thinc/layers/clone.py @@ -1,10 +1,9 @@ -from typing import TypeVar, cast, List +from typing import List, TypeVar, cast -from .noop import noop -from .chain import chain -from ..model import Model from ..config import registry - +from ..model import Model +from .chain import chain +from .noop import noop InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/concatenate.py b/thinc/layers/concatenate.py index 4cce96954..e810cefc3 100644 --- a/thinc/layers/concatenate.py +++ b/thinc/layers/concatenate.py @@ -1,14 +1,22 @@ -from typing import Any, List, Tuple, Callable, Optional -from typing import TypeVar, cast, Dict, Union, Sequence +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) from ..backends import NumpyOps -from ..model import Model from ..config import registry -from ..types import Array2d, Ragged +from ..model import Model +from ..types import Array2d, Ragged, XY_XY_OutT from ..util import get_width from .noop import noop -from ..types import XY_XY_OutT - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/dish.py b/thinc/layers/dish.py index 1092638e7..dc871ad24 100644 --- a/thinc/layers/dish.py +++ b/thinc/layers/dish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d 
+from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Dish.v1") diff --git a/thinc/layers/dropout.py b/thinc/layers/dropout.py index f4fa29445..7db35261a 100644 --- a/thinc/layers/dropout.py +++ b/thinc/layers/dropout.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, List, TypeVar, cast, Union, Sequence +from typing import Callable, List, Sequence, Tuple, TypeVar, Union, cast -from ..model import Model from ..config import registry -from ..types import ArrayXd, Ragged, Padded - +from ..model import Model +from ..types import ArrayXd, Padded, Ragged InT = TypeVar("InT", bound=Union[ArrayXd, Sequence[ArrayXd], Ragged, Padded]) diff --git a/thinc/layers/embed.py b/thinc/layers/embed.py index 703baf475..9d8d34e4a 100644 --- a/thinc/layers/embed.py +++ b/thinc/layers/embed.py @@ -1,13 +1,12 @@ -from typing import Dict, Callable, Tuple, Optional, Union, cast, TypeVar +from typing import Callable, Dict, Optional, Tuple, TypeVar, Union, cast -from .chain import chain -from .array_getitem import ints_getitem -from ..model import Model from ..config import registry -from ..types import Ints1d, Ints2d, Floats1d, Floats2d from ..initializers import uniform_init +from ..model import Model +from ..types import Floats1d, Floats2d, Ints1d, Ints2d from ..util import get_width, partial - +from .array_getitem import ints_getitem +from .chain import chain InT = TypeVar("InT", bound=Union[Ints1d, Ints2d]) OutT = Floats2d diff --git a/thinc/layers/expand_window.py b/thinc/layers/expand_window.py index 1075a49a2..193b82d39 100644 --- a/thinc/layers/expand_window.py +++ b/thinc/layers/expand_window.py @@ -1,10 +1,9 @@ -from typing import Tuple, TypeVar, Callable, Union, cast +from typing import Callable, Tuple, TypeVar, Union, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d, Ragged - InT = TypeVar("InT", Floats2d, Ragged) diff --git a/thinc/layers/gelu.py b/thinc/layers/gelu.py index 686b1f0d8..f51ee4545 100644 --- a/thinc/layers/gelu.py +++ b/thinc/layers/gelu.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Gelu.v1") diff --git a/thinc/layers/hard_swish.py b/thinc/layers/hard_swish.py index 773314a38..2fc135e41 100644 --- a/thinc/layers/hard_swish.py +++ b/thinc/layers/hard_swish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width 
-from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("HardSwish.v1") diff --git a/thinc/layers/hard_swish_mobilenet.py b/thinc/layers/hard_swish_mobilenet.py index 9f5f3fb9f..400622497 100644 --- a/thinc/layers/hard_swish_mobilenet.py +++ b/thinc/layers/hard_swish_mobilenet.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from ..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("HardSwishMobilenet.v1") diff --git a/thinc/layers/hashembed.py b/thinc/layers/hashembed.py index 8c85fdb02..7ecd9b26a 100644 --- a/thinc/layers/hashembed.py +++ b/thinc/layers/hashembed.py @@ -1,13 +1,12 @@ -from typing import Callable, Dict, Tuple, Optional, Any, Union, cast, TypeVar +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, Union, cast -from .chain import chain -from .array_getitem import ints_getitem -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d, Ints2d, Ints1d from ..initializers import uniform_init +from ..model import Model +from ..types import Floats1d, Floats2d, Ints1d, Ints2d from ..util import partial - +from .array_getitem import ints_getitem +from .chain import chain InT = TypeVar("InT", bound=Union[Ints1d, Ints2d]) OutT = Floats2d diff --git a/thinc/layers/layernorm.py b/thinc/layers/layernorm.py index 684489c54..2090ed9a8 100644 --- a/thinc/layers/layernorm.py +++ b/thinc/layers/layernorm.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model +from ..backends import Ops from ..config import registry +from ..model import Model from ..types import Floats2d -from ..backends import Ops from ..util import get_width - InT = Floats2d diff --git a/thinc/layers/linear.py b/thinc/layers/linear.py index bbf7b7874..ef24ec044 100644 --- a/thinc/layers/linear.py +++ b/thinc/layers/linear.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d from ..initializers import glorot_uniform_init, zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/list2array.py b/thinc/layers/list2array.py index a52d6e6c6..a31d5d80d 100644 --- a/thinc/layers/list2array.py +++ b/thinc/layers/list2array.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, TypeVar, List +from typing import Callable, List, Tuple, TypeVar from ..backends import NumpyOps -from ..model import Model from ..config import registry +from ..model import Model from ..types import Array2d - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/list2padded.py b/thinc/layers/list2padded.py index 2a02f90e0..e98e88a5c 100644 --- a/thinc/layers/list2padded.py +++ b/thinc/layers/list2padded.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, 
TypeVar, cast -from ..types import Padded, List2d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import List2d, Padded InT = TypeVar("InT", bound=List2d) OutT = Padded diff --git a/thinc/layers/list2ragged.py b/thinc/layers/list2ragged.py index a63237dfe..25ad7bed3 100644 --- a/thinc/layers/list2ragged.py +++ b/thinc/layers/list2ragged.py @@ -1,9 +1,8 @@ -from typing import Tuple, List, Callable, cast, TypeVar +from typing import Callable, List, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import ListXd, ArrayXd, Ragged - +from ..model import Model +from ..types import ArrayXd, ListXd, Ragged InT = TypeVar("InT", bound=ListXd) OutT = Ragged diff --git a/thinc/layers/logistic.py b/thinc/layers/logistic.py index cda0c7dd5..43d45a330 100644 --- a/thinc/layers/logistic.py +++ b/thinc/layers/logistic.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable +from typing import Callable, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/lstm.py b/thinc/layers/lstm.py index 266fee6e3..c817cd4db 100644 --- a/thinc/layers/lstm.py +++ b/thinc/layers/lstm.py @@ -1,13 +1,13 @@ -from typing import Optional, Tuple, Callable, cast from functools import partial +from typing import Callable, Optional, Tuple, cast -from ..model import Model +from ..backends import Ops from ..config import registry -from ..util import get_width +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats1d, Floats2d, Floats4d, Padded, Ragged +from ..util import get_width from .noop import noop -from ..initializers import glorot_uniform_init, zero_init -from ..backends import Ops @registry.layers("LSTM.v1") @@ -45,8 +45,9 @@ def PyTorchLSTM( nO: int, nI: int, *, bi: bool = False, depth: int = 1, dropout: float = 0.0 ) -> Model[Padded, Padded]: import torch.nn - from .with_padded import with_padded + from .pytorchwrapper import PyTorchRNNWrapper + from .with_padded import with_padded if depth == 0: return noop() # type: ignore[misc] diff --git a/thinc/layers/map_list.py b/thinc/layers/map_list.py index b05a934b1..aaadf0b55 100644 --- a/thinc/layers/map_list.py +++ b/thinc/layers/map_list.py @@ -1,6 +1,6 @@ -from typing import Callable, TypeVar, List, Tuple, Optional -from ..model import Model +from typing import Callable, List, Optional, Tuple, TypeVar +from ..model import Model InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/maxout.py b/thinc/layers/maxout.py index 72788a5c7..ff0e52037 100644 --- a/thinc/layers/maxout.py +++ b/thinc/layers/maxout.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats2d from ..util import get_width, partial +from .chain import chain from .dropout import Dropout from .layernorm import LayerNorm -from .chain import chain - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/mish.py b/thinc/layers/mish.py index ab7a2a76c..32542b963 100644 --- a/thinc/layers/mish.py +++ b/thinc/layers/mish.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model -from ..initializers 
import glorot_uniform_init, zero_init from ..config import registry +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model from ..types import Floats1d, Floats2d from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout - +from .layernorm import LayerNorm InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/multisoftmax.py b/thinc/layers/multisoftmax.py index cf55ecc37..d07b684f4 100644 --- a/thinc/layers/multisoftmax.py +++ b/thinc/layers/multisoftmax.py @@ -1,11 +1,10 @@ -from typing import Optional, Tuple, Callable, cast +from typing import Callable, Optional, Tuple, cast -from ..types import Floats2d, Floats1d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/mxnetwrapper.py b/thinc/layers/mxnetwrapper.py index 642d01f38..2303871fb 100644 --- a/thinc/layers/mxnetwrapper.py +++ b/thinc/layers/mxnetwrapper.py @@ -1,11 +1,10 @@ -from typing import Callable, Tuple, Optional, Any, Type +from typing import Any, Callable, Optional, Tuple, Type +from ..config import registry from ..model import Model from ..shims import MXNetShim -from ..config import registry -from ..util import is_xp_array, is_mxnet_array -from ..util import mxnet2xp, xp2mxnet, convert_recursive from ..types import ArgsKwargs +from ..util import convert_recursive, is_mxnet_array, is_xp_array, mxnet2xp, xp2mxnet @registry.layers("MXNetWrapper.v1") diff --git a/thinc/layers/noop.py b/thinc/layers/noop.py index d1c83d1cd..2e855b875 100644 --- a/thinc/layers/noop.py +++ b/thinc/layers/noop.py @@ -1,8 +1,7 @@ -from typing import Tuple, Callable, TypeVar +from typing import Callable, Tuple, TypeVar -from ..model import Model from ..config import registry - +from ..model import Model InOutT = TypeVar("InOutT") diff --git a/thinc/layers/padded2list.py b/thinc/layers/padded2list.py index 8f1bee7e8..a4d374e6b 100644 --- a/thinc/layers/padded2list.py +++ b/thinc/layers/padded2list.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, TypeVar, cast -from ..types import Padded, List2d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import List2d, Padded InT = Padded OutT = TypeVar("OutT", bound=List2d) diff --git a/thinc/layers/parametricattention.py b/thinc/layers/parametricattention.py index d54a2f19e..a03906f51 100644 --- a/thinc/layers/parametricattention.py +++ b/thinc/layers/parametricattention.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Ragged from ..util import get_width - InT = Ragged OutT = Ragged diff --git a/thinc/layers/premap_ids.pyx b/thinc/layers/premap_ids.pyx index 74bc8dc6a..17acafa8e 100644 --- a/thinc/layers/premap_ids.pyx +++ b/thinc/layers/premap_ids.pyx @@ -1,13 +1,15 @@ # cython: binding=True, infer_types=True import numpy + from preshed.maps cimport PreshMap -from typing import Dict, Union, Optional, cast, Callable, Tuple, Mapping -from ..types import Ints1d, Ints2d + +from typing import Callable, Dict, Mapping, Optional, Tuple, Union, cast + from ..config import registry from ..model import Model +from ..types import Ints1d, Ints2d from ..util import to_numpy - InT = 
Union[Ints1d, Ints2d] OutT = Ints2d diff --git a/thinc/layers/pytorchwrapper.py b/thinc/layers/pytorchwrapper.py index a1b0c462a..39c8b95c1 100644 --- a/thinc/layers/pytorchwrapper.py +++ b/thinc/layers/pytorchwrapper.py @@ -1,12 +1,18 @@ -from typing import Callable, Dict, Tuple, Optional, Any, cast +from typing import Any, Callable, Dict, Optional, Tuple, cast from ..compat import torch +from ..config import registry from ..model import Model from ..shims import PyTorchGradScaler, PyTorchShim -from ..config import registry -from ..util import is_xp_array, is_torch_array, partial -from ..util import xp2torch, torch2xp, convert_recursive -from ..types import Floats3d, ArgsKwargs, Padded +from ..types import ArgsKwargs, Floats3d, Padded +from ..util import ( + convert_recursive, + is_torch_array, + is_xp_array, + partial, + torch2xp, + xp2torch, +) @registry.layers("PyTorchRNNWrapper.v1") diff --git a/thinc/layers/ragged2list.py b/thinc/layers/ragged2list.py index 35af28f2f..3d8463f11 100644 --- a/thinc/layers/ragged2list.py +++ b/thinc/layers/ragged2list.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, TypeVar, cast +from typing import Callable, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import Ragged, ListXd - +from ..model import Model +from ..types import ListXd, Ragged InT = Ragged OutT = TypeVar("OutT", bound=ListXd) diff --git a/thinc/layers/reduce_first.py b/thinc/layers/reduce_first.py index ab72cb5e3..ede42c5d0 100644 --- a/thinc/layers/reduce_first.py +++ b/thinc/layers/reduce_first.py @@ -1,11 +1,10 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Ragged, Floats2d +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_last.py b/thinc/layers/reduce_last.py index b8194ec2b..d2de6a877 100644 --- a/thinc/layers/reduce_last.py +++ b/thinc/layers/reduce_last.py @@ -1,8 +1,8 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Ragged, Floats2d +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo InT = Ragged diff --git a/thinc/layers/reduce_max.py b/thinc/layers/reduce_max.py index ebafb5172..e6f033e48 100644 --- a/thinc/layers/reduce_max.py +++ b/thinc/layers/reduce_max.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, cast +from typing import Callable, Tuple, cast -from ..types import Floats2d, Ragged -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_mean.py b/thinc/layers/reduce_mean.py index f37ae8253..f1bd04898 100644 --- a/thinc/layers/reduce_mean.py +++ b/thinc/layers/reduce_mean.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, cast +from typing import Callable, Tuple, cast -from ..types import Floats2d, Ragged -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/reduce_sum.py b/thinc/layers/reduce_sum.py index e93a362d8..62ade00f6 100644 --- a/thinc/layers/reduce_sum.py +++ b/thinc/layers/reduce_sum.py @@ -1,11 +1,10 @@ from typing import Callable, Tuple, cast -from ..model import Model from ..config import 
registry +from ..model import Model from ..types import Floats2d, Ragged from ..util import ArrayInfo - InT = Ragged OutT = Floats2d diff --git a/thinc/layers/relu.py b/thinc/layers/relu.py index d1d3ebf74..488a1eff7 100644 --- a/thinc/layers/relu.py +++ b/thinc/layers/relu.py @@ -1,14 +1,13 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model -from ..initializers import glorot_uniform_init, zero_init from ..config import registry -from ..types import Floats2d, Floats1d +from ..initializers import glorot_uniform_init, zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout - +from .layernorm import LayerNorm InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/remap_ids.py b/thinc/layers/remap_ids.py index 265b24a9d..3801b703f 100644 --- a/thinc/layers/remap_ids.py +++ b/thinc/layers/remap_ids.py @@ -1,12 +1,10 @@ -from typing import Tuple, Callable, Sequence, cast -from typing import Dict, Union, Optional, Hashable, Any +from typing import Any, Callable, Dict, Hashable, Optional, Sequence, Tuple, Union, cast -from ..model import Model from ..config import registry -from ..types import Ints1d, Ints2d, DTypes +from ..model import Model +from ..types import DTypes, Ints1d, Ints2d from ..util import is_xp_array, to_numpy - InT = Union[Sequence[Hashable], Ints1d, Ints2d] OutT = Ints2d diff --git a/thinc/layers/residual.py b/thinc/layers/residual.py index 3793ee1d5..f213e9bf5 100644 --- a/thinc/layers/residual.py +++ b/thinc/layers/residual.py @@ -1,8 +1,8 @@ -from typing import Tuple, Callable, Optional, List, TypeVar +from typing import Callable, List, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry -from ..types import Floats1d, Floats2d, Floats3d, Floats4d, FloatsXd, Ragged, Padded +from ..model import Model +from ..types import Floats1d, Floats2d, Floats3d, Floats4d, FloatsXd, Padded, Ragged # fmt: off InT = TypeVar( diff --git a/thinc/layers/resizable.py b/thinc/layers/resizable.py index 2dd4dde1a..606d50dae 100644 --- a/thinc/layers/resizable.py +++ b/thinc/layers/resizable.py @@ -1,7 +1,7 @@ from typing import Callable, Optional, TypeVar -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d InT = TypeVar("InT") diff --git a/thinc/layers/siamese.py b/thinc/layers/siamese.py index 82bafacbb..33579a4de 100644 --- a/thinc/layers/siamese.py +++ b/thinc/layers/siamese.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar +from typing import Callable, Optional, Tuple, TypeVar +from ..config import registry from ..model import Model from ..types import ArrayXd -from ..config import registry from ..util import get_width - LayerT = TypeVar("LayerT") SimT = TypeVar("SimT") InT = Tuple[LayerT, LayerT] diff --git a/thinc/layers/sigmoid.py b/thinc/layers/sigmoid.py index d8933b66e..157047e37 100644 --- a/thinc/layers/sigmoid.py +++ b/thinc/layers/sigmoid.py @@ -1,12 +1,11 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats2d, Floats1d from ..initializers import zero_init +from ..model import Model +from ..types import Floats1d, Floats2d from ..util import get_width, partial - InT = Floats2d OutT = Floats2d diff --git 
a/thinc/layers/sigmoid_activation.py b/thinc/layers/sigmoid_activation.py index b87261075..37e188ab8 100644 --- a/thinc/layers/sigmoid_activation.py +++ b/thinc/layers/sigmoid_activation.py @@ -1,7 +1,7 @@ -from typing import TypeVar, Tuple, Callable, cast +from typing import Callable, Tuple, TypeVar, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import FloatsXdT diff --git a/thinc/layers/softmax.py b/thinc/layers/softmax.py index 9d766f1db..8b7301af0 100644 --- a/thinc/layers/softmax.py +++ b/thinc/layers/softmax.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, cast +from typing import Callable, Optional, Tuple, cast -from ..model import Model from ..config import registry -from ..types import Floats2d, Floats1d from ..initializers import zero_init -from ..util import get_width, partial, ArrayInfo - +from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import ArrayInfo, get_width, partial InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/softmax_activation.py b/thinc/layers/softmax_activation.py index 858320143..974ed2c8c 100644 --- a/thinc/layers/softmax_activation.py +++ b/thinc/layers/softmax_activation.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable +from typing import Callable, Tuple -from ..model import Model from ..config import registry +from ..model import Model from ..types import Floats2d - InT = Floats2d OutT = Floats2d diff --git a/thinc/layers/sparselinear.pyx b/thinc/layers/sparselinear.pyx index b9a982f4b..a1be75ccc 100644 --- a/thinc/layers/sparselinear.pyx +++ b/thinc/layers/sparselinear.pyx @@ -1,16 +1,15 @@ # cython: infer_types=True, cdivision=True, bounds_check=False, wraparound=False -cimport numpy as np -from libc.stdint cimport uint64_t, int32_t, uint32_t cimport cython +cimport numpy as np +from libc.stdint cimport int32_t, uint32_t, uint64_t -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple -from ..types import ArrayXd -from ..model import Model +from ..backends import CupyOps, NumpyOps from ..config import registry -from ..util import get_width, is_cupy_array, is_numpy_array, get_array_module -from ..backends import NumpyOps, CupyOps - +from ..model import Model +from ..types import ArrayXd +from ..util import get_array_module, get_width, is_cupy_array, is_numpy_array InT = Tuple[ArrayXd, ArrayXd, ArrayXd] OutT = ArrayXd diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py index 469b1636d..91a6b1a31 100644 --- a/thinc/layers/strings2arrays.py +++ b/thinc/layers/strings2arrays.py @@ -1,11 +1,11 @@ -from typing import Tuple, List, Callable, Sequence +from typing import Callable, List, Sequence, Tuple + from murmurhash import hash_unicode -from ..model import Model from ..config import registry +from ..model import Model from ..types import Ints2d - InT = Sequence[Sequence[str]] OutT = List[Ints2d] diff --git a/thinc/layers/swish.py b/thinc/layers/swish.py index 4f3fe49d5..5cf8be50f 100644 --- a/thinc/layers/swish.py +++ b/thinc/layers/swish.py @@ -1,13 +1,13 @@ -from typing import Tuple, Optional, Callable, cast +from typing import Callable, Optional, Tuple, cast from ..config import registry +from ..initializers import he_normal_init, zero_init from ..model import Model +from ..types import Floats1d, Floats2d +from ..util import get_width, partial from .chain import chain -from .layernorm import LayerNorm from .dropout import Dropout -from ..types import Floats1d, Floats2d -from 
..util import partial, get_width -from ..initializers import he_normal_init, zero_init +from .layernorm import LayerNorm @registry.layers("Swish.v1") diff --git a/thinc/layers/tensorflowwrapper.py b/thinc/layers/tensorflowwrapper.py index 7e166ea50..a77e0b3af 100644 --- a/thinc/layers/tensorflowwrapper.py +++ b/thinc/layers/tensorflowwrapper.py @@ -2,12 +2,18 @@ import srsly +from ..compat import tensorflow as tf from ..model import Model from ..shims import TensorFlowShim, keras_model_fns, maybe_handshake_model -from ..util import xp2tensorflow, tensorflow2xp, assert_tensorflow_installed -from ..util import is_tensorflow_array, convert_recursive, is_xp_array -from ..types import ArrayXd, ArgsKwargs -from ..compat import tensorflow as tf +from ..types import ArgsKwargs, ArrayXd +from ..util import ( + assert_tensorflow_installed, + convert_recursive, + is_tensorflow_array, + is_xp_array, + tensorflow2xp, + xp2tensorflow, +) InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/layers/torchscriptwrapper.py b/thinc/layers/torchscriptwrapper.py index a74db9225..a3a8e1ac0 100644 --- a/thinc/layers/torchscriptwrapper.py +++ b/thinc/layers/torchscriptwrapper.py @@ -3,8 +3,11 @@ from ..compat import torch from ..model import Model from ..shims import PyTorchGradScaler, PyTorchShim, TorchScriptShim -from .pytorchwrapper import forward, convert_pytorch_default_inputs -from .pytorchwrapper import convert_pytorch_default_outputs +from .pytorchwrapper import ( + convert_pytorch_default_inputs, + convert_pytorch_default_outputs, + forward, +) def TorchScriptWrapper_v1( diff --git a/thinc/layers/tuplify.py b/thinc/layers/tuplify.py index 99b4d7589..35dfdc66f 100644 --- a/thinc/layers/tuplify.py +++ b/thinc/layers/tuplify.py @@ -1,7 +1,7 @@ -from typing import Optional, Tuple, Any, TypeVar +from typing import Any, Optional, Tuple, TypeVar -from ..model import Model from ..config import registry +from ..model import Model InT = TypeVar("InT") OutT = Tuple diff --git a/thinc/layers/uniqued.py b/thinc/layers/uniqued.py index 582b31093..26f2cdf16 100644 --- a/thinc/layers/uniqued.py +++ b/thinc/layers/uniqued.py @@ -1,10 +1,10 @@ -from typing import Tuple, Callable, Optional +from typing import Callable, Optional, Tuple + import numpy -from ..model import Model from ..config import registry -from ..types import Ints2d, Floats2d - +from ..model import Model +from ..types import Floats2d, Ints2d InT = Ints2d OutT = Floats2d diff --git a/thinc/layers/with_array.py b/thinc/layers/with_array.py index 2511b3c17..31b9fa494 100644 --- a/thinc/layers/with_array.py +++ b/thinc/layers/with_array.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, Union, cast +from typing import Callable, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..model import Model from ..config import registry -from ..types import Padded, Ragged, ArrayXd, Array3d, ListXd - +from ..model import Model +from ..types import Array3d, ArrayXd, ListXd, Padded, Ragged NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_array2d.py b/thinc/layers/with_array2d.py index 740593a26..98eba8b96 100644 --- a/thinc/layers/with_array2d.py +++ b/thinc/layers/with_array2d.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..model import Model from ..config import registry +from ..model import Model from ..types import Array2d, Floats2d, List2d, 
Padded, Ragged - NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_cpu.py b/thinc/layers/with_cpu.py index 3fc7645a8..39e5965f2 100644 --- a/thinc/layers/with_cpu.py +++ b/thinc/layers/with_cpu.py @@ -1,10 +1,11 @@ -from typing import Tuple, Callable, Any +from typing import Any, Callable, Tuple import numpy + from thinc.backends import Ops -from ..model import Model from ..config import registry +from ..model import Model @registry.layers("with_cpu.v1") diff --git a/thinc/layers/with_debug.py b/thinc/layers/with_debug.py index 91505c9f6..21790e468 100644 --- a/thinc/layers/with_debug.py +++ b/thinc/layers/with_debug.py @@ -1,4 +1,4 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..model import Model diff --git a/thinc/layers/with_flatten.py b/thinc/layers/with_flatten.py index 5cf8a85cf..9658a788f 100644 --- a/thinc/layers/with_flatten.py +++ b/thinc/layers/with_flatten.py @@ -1,7 +1,7 @@ -from typing import Tuple, Callable, Sequence, Any, cast, TypeVar, Optional, List +from typing import Any, Callable, List, Optional, Sequence, Tuple, TypeVar, cast -from ..model import Model from ..config import registry +from ..model import Model from ..types import ArrayXd, ListXd ItemT = TypeVar("ItemT") diff --git a/thinc/layers/with_flatten_v2.py b/thinc/layers/with_flatten_v2.py index 4dd75e0d1..95549994f 100644 --- a/thinc/layers/with_flatten_v2.py +++ b/thinc/layers/with_flatten_v2.py @@ -1,8 +1,7 @@ -from typing import Tuple, Callable, Sequence, Any, cast, TypeVar, Optional, List +from typing import Any, Callable, List, Optional, Sequence, Tuple, TypeVar, cast -from ..model import Model from ..config import registry - +from ..model import Model InItemT = TypeVar("InItemT") OutItemT = TypeVar("OutItemT") diff --git a/thinc/layers/with_getitem.py b/thinc/layers/with_getitem.py index 9f6b93459..fb6a3cccf 100644 --- a/thinc/layers/with_getitem.py +++ b/thinc/layers/with_getitem.py @@ -1,8 +1,7 @@ -from typing import Callable, Optional, Tuple, Any +from typing import Any, Callable, Optional, Tuple -from ..model import Model from ..config import registry - +from ..model import Model InT = Tuple[Any, ...] OutT = Tuple[Any, ...] 
diff --git a/thinc/layers/with_list.py b/thinc/layers/with_list.py index 9f86c24dc..5331758a5 100644 --- a/thinc/layers/with_list.py +++ b/thinc/layers/with_list.py @@ -1,8 +1,8 @@ -from typing import Tuple, Callable, List, Optional, TypeVar, Union, cast +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast -from ..types import Padded, Ragged, Array2d, List2d, Floats2d, Ints2d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Array2d, Floats2d, Ints2d, List2d, Padded, Ragged SeqT = TypeVar("SeqT", Padded, Ragged, List2d, List[Floats2d], List[Ints2d]) diff --git a/thinc/layers/with_nvtx_range.py b/thinc/layers/with_nvtx_range.py index bf270abce..480f82a7c 100644 --- a/thinc/layers/with_nvtx_range.py +++ b/thinc/layers/with_nvtx_range.py @@ -1,9 +1,8 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..model import Model from ..util import use_nvtx_range - _ModelT = TypeVar("_ModelT", bound=Model) diff --git a/thinc/layers/with_padded.py b/thinc/layers/with_padded.py index 379df1bef..b92c6308a 100644 --- a/thinc/layers/with_padded.py +++ b/thinc/layers/with_padded.py @@ -1,11 +1,10 @@ -from typing import Tuple, Callable, Optional, TypeVar, Union, cast, List +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast -from ..types import Padded, Ragged, Floats3d, Ints1d, List2d, Array2d -from ..model import Model from ..config import registry +from ..model import Model +from ..types import Array2d, Floats3d, Ints1d, List2d, Padded, Ragged from ..util import is_xp_array - PaddedData = Tuple[Floats3d, Ints1d, Ints1d, Ints1d] SeqT = TypeVar("SeqT", bound=Union[Padded, Ragged, List2d, Floats3d, PaddedData]) diff --git a/thinc/layers/with_ragged.py b/thinc/layers/with_ragged.py index cbff6f59d..6cf45d9e8 100644 --- a/thinc/layers/with_ragged.py +++ b/thinc/layers/with_ragged.py @@ -1,10 +1,9 @@ -from typing import Tuple, Callable, Optional, TypeVar, cast, List, Union +from typing import Callable, List, Optional, Tuple, TypeVar, Union, cast from ..backends import NumpyOps -from ..types import Padded, Ragged, Array2d, ListXd, List2d, Ints1d -from ..model import Model from ..config import registry - +from ..model import Model +from ..types import Array2d, Ints1d, List2d, ListXd, Padded, Ragged NUMPY_OPS = NumpyOps() diff --git a/thinc/layers/with_reshape.py b/thinc/layers/with_reshape.py index 5bd3e9025..b40ada757 100644 --- a/thinc/layers/with_reshape.py +++ b/thinc/layers/with_reshape.py @@ -1,9 +1,8 @@ -from typing import Tuple, Callable, Optional, cast, TypeVar, List +from typing import Callable, List, Optional, Tuple, TypeVar, cast -from ..model import Model from ..config import registry -from ..types import Array3d, Array2d - +from ..model import Model +from ..types import Array2d, Array3d InT = TypeVar("InT", bound=Array3d) OutT = TypeVar("OutT", bound=Array2d) diff --git a/thinc/layers/with_signpost_interval.py b/thinc/layers/with_signpost_interval.py index 9a468d896..58f5d4165 100644 --- a/thinc/layers/with_signpost_interval.py +++ b/thinc/layers/with_signpost_interval.py @@ -1,9 +1,8 @@ -from typing import Optional, Callable, Any, Tuple, TypeVar +from typing import Any, Callable, Optional, Tuple, TypeVar from ..compat import has_os_signpost, os_signpost from ..model import Model - _ModelT = TypeVar("_ModelT", bound=Model) diff --git a/thinc/loss.py b/thinc/loss.py index e8edb194d..756dac4c3 100644 --- a/thinc/loss.py +++ 
b/thinc/loss.py @@ -1,11 +1,20 @@ -from typing import Tuple, Sequence, cast, TypeVar, Generic, Any, Union, Optional, List -from typing import Dict from abc import abstractmethod +from typing import ( + Any, + Dict, + Generic, + List, + Optional, + Sequence, + Tuple, + TypeVar, + Union, + cast, +) +from .config import registry from .types import Floats2d, Ints1d from .util import get_array_module, to_categorical -from .config import registry - LossT = TypeVar("LossT") GradT = TypeVar("GradT") diff --git a/thinc/model.py b/thinc/model.py index e094d5294..ba49215c1 100644 --- a/thinc/model.py +++ b/thinc/model.py @@ -1,20 +1,39 @@ -from typing import Dict, List, Callable, Optional, Any, Union, Iterable, Set, cast -from typing import Generic, Sequence, Tuple, TypeVar, Iterator import contextlib -from contextvars import ContextVar -import srsly -from pathlib import Path import copy import functools import threading +from contextvars import ContextVar +from pathlib import Path +from typing import ( + Any, + Callable, + Dict, + Generic, + Iterable, + Iterator, + List, + Optional, + Sequence, + Set, + Tuple, + TypeVar, + Union, + cast, +) + +import srsly -from .backends import ParamServer, Ops, NumpyOps, CupyOps, get_current_ops +from .backends import CupyOps, NumpyOps, Ops, ParamServer, get_current_ops from .optimizers import Optimizer # noqa: F401 from .shims import Shim -from .util import convert_recursive, is_xp_array, DATA_VALIDATION -from .util import partial, validate_fwd_input_output from .types import FloatsXd - +from .util import ( + DATA_VALIDATION, + convert_recursive, + is_xp_array, + partial, + validate_fwd_input_output, +) InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/mypy.py b/thinc/mypy.py index e02f6d5be..73c6e72f6 100644 --- a/thinc/mypy.py +++ b/thinc/mypy.py @@ -1,13 +1,14 @@ -from typing import Dict, List import itertools -from mypy.errors import Errors +from typing import Dict, List + +from mypy.checker import TypeChecker from mypy.errorcodes import ErrorCode +from mypy.errors import Errors +from mypy.nodes import CallExpr, Decorator, Expression, FuncDef, MypyFile, NameExpr from mypy.options import Options -from mypy.plugin import FunctionContext, Plugin, CheckerPluginInterface -from mypy.types import Instance, Type, CallableType, TypeVarType -from mypy.nodes import Expression, CallExpr, NameExpr, FuncDef, Decorator, MypyFile -from mypy.checker import TypeChecker +from mypy.plugin import CheckerPluginInterface, FunctionContext, Plugin from mypy.subtypes import is_subtype +from mypy.types import CallableType, Instance, Type, TypeVarType thinc_model_fullname = "thinc.model.Model" chained_out_fullname = "thinc.types.XY_YZ_OutT" diff --git a/thinc/optimizers.py b/thinc/optimizers.py index b0636fd87..071ad4e85 100644 --- a/thinc/optimizers.py +++ b/thinc/optimizers.py @@ -1,14 +1,13 @@ -from typing import Any, Dict, Optional, Union, Tuple, List, cast -from collections import defaultdict import itertools import math +from collections import defaultdict from types import GeneratorType +from typing import Any, Dict, List, Optional, Tuple, Union, cast from .backends import get_array_ops -from .types import Generator, FloatsXd from .config import registry -from .schedules import constant, Schedule - +from .schedules import Schedule, constant +from .types import FloatsXd, Generator KeyT = Tuple[int, str] ScheduleT = Union[float, List[float], Generator, Schedule] diff --git a/thinc/schedules.py b/thinc/schedules.py index 49e43a0c8..2f99a536a 100644 --- 
a/thinc/schedules.py +++ b/thinc/schedules.py @@ -1,8 +1,8 @@ """Generators that provide different rates, schedules, decays or series.""" -from typing import Any, Callable, Dict, Generator, Generic, Tuple, TypeVar -from typing import Optional -from dataclasses import dataclass import itertools +from dataclasses import dataclass +from typing import Any, Callable, Dict, Generator, Generic, Optional, Tuple, TypeVar + import numpy from .config import registry diff --git a/thinc/shims/__init__.py b/thinc/shims/__init__.py index 9cd8bd030..fb246c9f2 100644 --- a/thinc/shims/__init__.py +++ b/thinc/shims/__init__.py @@ -1,10 +1,9 @@ -from .shim import Shim +from .mxnet import MXNetShim from .pytorch import PyTorchShim from .pytorch_grad_scaler import PyTorchGradScaler -from .tensorflow import keras_model_fns, TensorFlowShim, maybe_handshake_model +from .shim import Shim +from .tensorflow import TensorFlowShim, keras_model_fns, maybe_handshake_model from .torchscript import TorchScriptShim -from .mxnet import MXNetShim - # fmt: off __all__ = [ diff --git a/thinc/shims/mxnet.py b/thinc/shims/mxnet.py index 3962a2ef5..2dd36a62f 100644 --- a/thinc/shims/mxnet.py +++ b/thinc/shims/mxnet.py @@ -1,13 +1,19 @@ +import copy from typing import Any, cast + import srsly -import copy -from ..util import mxnet2xp, convert_recursive, make_tempfile, xp2mxnet -from ..util import get_array_module +from ..compat import mxnet as mx from ..optimizers import Optimizer from ..types import ArgsKwargs, FloatsXd +from ..util import ( + convert_recursive, + get_array_module, + make_tempfile, + mxnet2xp, + xp2mxnet, +) from .shim import Shim -from ..compat import mxnet as mx class MXNetShim(Shim): diff --git a/thinc/shims/pytorch.py b/thinc/shims/pytorch.py index 9582c8616..505669867 100644 --- a/thinc/shims/pytorch.py +++ b/thinc/shims/pytorch.py @@ -1,16 +1,21 @@ -from typing import Any, Dict, Optional, cast, Callable import contextlib -from io import BytesIO import itertools +from io import BytesIO +from typing import Any, Callable, Dict, Optional, cast + import srsly -from ..util import torch2xp, xp2torch, convert_recursive, iterate_recursive -from ..util import get_torch_default_device +from ..backends import CupyOps, context_pools, get_current_ops, set_gpu_allocator from ..compat import torch -from ..backends import get_current_ops, context_pools, CupyOps -from ..backends import set_gpu_allocator from ..optimizers import Optimizer from ..types import ArgsKwargs, FloatsXd +from ..util import ( + convert_recursive, + get_torch_default_device, + iterate_recursive, + torch2xp, + xp2torch, +) from .pytorch_grad_scaler import PyTorchGradScaler from .shim import Shim diff --git a/thinc/shims/shim.py b/thinc/shims/shim.py index 0c246e8d4..ef88408a3 100644 --- a/thinc/shims/shim.py +++ b/thinc/shims/shim.py @@ -1,8 +1,8 @@ -from typing import Any, Optional, Tuple, Callable, Dict, Union -import copy import contextlib -from pathlib import Path +import copy import threading +from pathlib import Path +from typing import Any, Callable, Dict, Optional, Tuple, Union class Shim: # pragma: no cover diff --git a/thinc/shims/tensorflow.py b/thinc/shims/tensorflow.py index d630d86f9..bcaae3aac 100644 --- a/thinc/shims/tensorflow.py +++ b/thinc/shims/tensorflow.py @@ -1,17 +1,18 @@ -from typing import Any, Dict, List, Optional -import catalogue import contextlib import copy from io import BytesIO +from typing import Any, Dict, List, Optional + +import catalogue import numpy from ..backends import Ops, get_current_ops +from ..compat import 
cupy, h5py +from ..compat import tensorflow as tf from ..optimizers import Optimizer from ..types import ArgsKwargs, ArrayXd from ..util import get_array_module from .shim import Shim -from ..compat import tensorflow as tf -from ..compat import cupy, h5py keras_model_fns = catalogue.create("thinc", "keras", entry_points=True) diff --git a/thinc/shims/torchscript.py b/thinc/shims/torchscript.py index 675718cd1..6c05c8a9b 100644 --- a/thinc/shims/torchscript.py +++ b/thinc/shims/torchscript.py @@ -1,5 +1,6 @@ -from typing import Any, Optional from io import BytesIO +from typing import Any, Optional + import srsly from ..compat import torch diff --git a/thinc/tests/backends/test_mem.py b/thinc/tests/backends/test_mem.py index cb26e24e0..bf867726d 100644 --- a/thinc/tests/backends/test_mem.py +++ b/thinc/tests/backends/test_mem.py @@ -1,6 +1,7 @@ -from thinc.backends._param_server import ParamServer import numpy +from thinc.backends._param_server import ParamServer + def test_param_server_init(): array = numpy.zeros((5,), dtype="f") diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 83dd582ea..3cec4b6fa 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -1,26 +1,32 @@ +import inspect +import platform from typing import Tuple, cast -import pytest import numpy -import platform +import pytest from hypothesis import given, settings from hypothesis.strategies import composite, integers from numpy.testing import assert_allclose from packaging.version import Version -from thinc.api import NumpyOps, CupyOps, Ops, get_ops -from thinc.api import get_current_ops, use_ops -from thinc.util import torch2xp, xp2torch + +from thinc.api import ( + LSTM, + CupyOps, + NumpyOps, + Ops, + fix_random_seed, + get_current_ops, + get_ops, + use_ops, +) +from thinc.backends._custom_kernels import KERNELS, KERNELS_LIST, compile_mmh from thinc.compat import has_cupy_gpu, has_torch, torch_version -from thinc.api import fix_random_seed -from thinc.api import LSTM from thinc.types import Floats2d -from thinc.backends._custom_kernels import KERNELS_LIST, KERNELS, compile_mmh -import inspect +from thinc.util import torch2xp, xp2torch from .. 
import strategies from ..strategies import arrays_BI, ndarrays_of_shape - MAX_EXAMPLES = 10 VANILLA_OPS = Ops(numpy) # type:ignore @@ -37,9 +43,10 @@ def create_pytorch_funcs(): - import torch import math + import torch + def torch_relu(x): return torch.nn.functional.relu(x) diff --git a/thinc/tests/conftest.py b/thinc/tests/conftest.py index 19b5137d3..026f3eb06 100644 --- a/thinc/tests/conftest.py +++ b/thinc/tests/conftest.py @@ -52,9 +52,10 @@ def getopt(opt): @pytest.fixture() def pathy_fixture(): pytest.importorskip("pathy") - import tempfile import shutil - from pathy import use_fs, Pathy + import tempfile + + from pathy import Pathy, use_fs temp_folder = tempfile.mkdtemp(prefix="thinc-pathy") use_fs(temp_folder) diff --git a/thinc/tests/layers/test_basic_tagger.py b/thinc/tests/layers/test_basic_tagger.py index 3046c1b04..855a6d6ad 100644 --- a/thinc/tests/layers/test_basic_tagger.py +++ b/thinc/tests/layers/test_basic_tagger.py @@ -1,7 +1,18 @@ -import pytest import random -from thinc.api import Model, Relu, Softmax, HashEmbed, expand_window -from thinc.api import chain, with_array, Adam, strings2arrays + +import pytest + +from thinc.api import ( + Adam, + HashEmbed, + Model, + Relu, + Softmax, + chain, + expand_window, + strings2arrays, + with_array, +) @pytest.fixture(scope="module") diff --git a/thinc/tests/layers/test_combinators.py b/thinc/tests/layers/test_combinators.py index ea5583108..c7b4fbe9f 100644 --- a/thinc/tests/layers/test_combinators.py +++ b/thinc/tests/layers/test_combinators.py @@ -1,8 +1,18 @@ -import pytest import numpy +import pytest from numpy.testing import assert_allclose -from thinc.api import clone, concatenate, noop, add, map_list -from thinc.api import Linear, Dropout, Model, NumpyOps + +from thinc.api import ( + Dropout, + Linear, + Model, + NumpyOps, + add, + clone, + concatenate, + map_list, + noop, +) from thinc.layers import chain, tuplify diff --git a/thinc/tests/layers/test_feed_forward.py b/thinc/tests/layers/test_feed_forward.py index b18a0fc0b..a808bb445 100644 --- a/thinc/tests/layers/test_feed_forward.py +++ b/thinc/tests/layers/test_feed_forward.py @@ -1,8 +1,10 @@ -import pytest -import numpy from functools import partial + +import numpy +import pytest from numpy.testing import assert_allclose -from thinc.api import chain, Linear, Relu, NumpyOps + +from thinc.api import Linear, NumpyOps, Relu, chain @pytest.fixture(params=[1, 2, 9]) diff --git a/thinc/tests/layers/test_hash_embed.py b/thinc/tests/layers/test_hash_embed.py index 8df50a03f..5b79539fa 100644 --- a/thinc/tests/layers/test_hash_embed.py +++ b/thinc/tests/layers/test_hash_embed.py @@ -1,4 +1,5 @@ import numpy + from thinc.api import HashEmbed diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 761cad880..0ef559d96 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -1,14 +1,15 @@ from typing import List, Optional -from numpy.testing import assert_almost_equal -from thinc.api import registry, with_padded, Dropout, NumpyOps, Model -from thinc.backends import NumpyOps -from thinc.util import data_validation, get_width -from thinc.types import Ragged, Padded, Array2d, Floats2d, FloatsXd, Shape -from thinc.compat import has_torch import numpy import pytest import srsly +from numpy.testing import assert_almost_equal + +from thinc.api import Dropout, Model, NumpyOps, registry, with_padded +from thinc.backends import NumpyOps +from thinc.compat import has_torch +from thinc.types import Array2d, 
Floats2d, FloatsXd, Padded, Ragged, Shape +from thinc.util import data_validation, get_width OPS = NumpyOps() diff --git a/thinc/tests/layers/test_linear.py b/thinc/tests/layers/test_linear.py index 2362b556b..345669d87 100644 --- a/thinc/tests/layers/test_linear.py +++ b/thinc/tests/layers/test_linear.py @@ -1,9 +1,10 @@ +import numpy import pytest -from mock import MagicMock from hypothesis import given, settings -import numpy +from mock import MagicMock from numpy.testing import assert_allclose -from thinc.api import Linear, chain, Dropout, SGD + +from thinc.api import SGD, Dropout, Linear, chain from ..strategies import arrays_OI_O_BI from ..util import get_model, get_shape diff --git a/thinc/tests/layers/test_lstm.py b/thinc/tests/layers/test_lstm.py index 208ffb58b..44c90ed4c 100644 --- a/thinc/tests/layers/test_lstm.py +++ b/thinc/tests/layers/test_lstm.py @@ -1,10 +1,11 @@ -import numpy import timeit -from thinc.api import NumpyOps, LSTM, PyTorchLSTM, with_padded, fix_random_seed -from thinc.api import Ops -from thinc.compat import has_torch + +import numpy import pytest +from thinc.api import LSTM, NumpyOps, Ops, PyTorchLSTM, fix_random_seed, with_padded +from thinc.compat import has_torch + @pytest.fixture(params=[1, 6]) def nI(request): diff --git a/thinc/tests/layers/test_mappers.py b/thinc/tests/layers/test_mappers.py index e890dd086..85e984bc4 100644 --- a/thinc/tests/layers/test_mappers.py +++ b/thinc/tests/layers/test_mappers.py @@ -1,5 +1,6 @@ -import pytest import numpy +import pytest + from thinc.layers import premap_ids, remap_ids, remap_ids_v2 diff --git a/thinc/tests/layers/test_mnist.py b/thinc/tests/layers/test_mnist.py index 321de3a0f..060007cfd 100644 --- a/thinc/tests/layers/test_mnist.py +++ b/thinc/tests/layers/test_mnist.py @@ -1,8 +1,16 @@ import pytest -from thinc.api import Relu, Softmax, chain, clone, Adam -from thinc.api import PyTorchWrapper, TensorFlowWrapper -from thinc.api import get_current_ops -from thinc.compat import has_torch, has_tensorflow + +from thinc.api import ( + Adam, + PyTorchWrapper, + Relu, + Softmax, + TensorFlowWrapper, + chain, + clone, + get_current_ops, +) +from thinc.compat import has_tensorflow, has_torch @pytest.fixture(scope="module") diff --git a/thinc/tests/layers/test_mxnet_wrapper.py b/thinc/tests/layers/test_mxnet_wrapper.py index b954a8ec5..8ddf5dfce 100644 --- a/thinc/tests/layers/test_mxnet_wrapper.py +++ b/thinc/tests/layers/test_mxnet_wrapper.py @@ -2,10 +2,19 @@ import numpy import pytest -from thinc.api import Adam, ArgsKwargs, Model, Ops, MXNetWrapper -from thinc.api import get_current_ops, mxnet2xp, xp2mxnet -from thinc.types import Array2d, Array1d, IntsXd + +from thinc.api import ( + Adam, + ArgsKwargs, + Model, + MXNetWrapper, + Ops, + get_current_ops, + mxnet2xp, + xp2mxnet, +) from thinc.compat import has_cupy_gpu, has_mxnet +from thinc.types import Array1d, Array2d, IntsXd from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py index f4f83cb60..aa40d9044 100644 --- a/thinc/tests/layers/test_pytorch_wrapper.py +++ b/thinc/tests/layers/test_pytorch_wrapper.py @@ -1,20 +1,34 @@ -from thinc.api import Linear, SGD, PyTorchWrapper, PyTorchWrapper_v2, PyTorchWrapper_v3 -from thinc.api import xp2torch, torch2xp, ArgsKwargs, use_ops -from thinc.api import chain, get_current_ops, Relu -from thinc.api import CupyOps, MPSOps, NumpyOps +import numpy +import pytest + +from thinc.api 
import ( + SGD, + ArgsKwargs, + CupyOps, + Linear, + MPSOps, + NumpyOps, + PyTorchWrapper, + PyTorchWrapper_v2, + PyTorchWrapper_v3, + Relu, + chain, + get_current_ops, + torch2xp, + use_ops, + xp2torch, +) from thinc.backends import context_pools +from thinc.compat import has_cupy_gpu, has_torch, has_torch_amp, has_torch_mps_gpu from thinc.layers.pytorchwrapper import PyTorchWrapper_v3 +from thinc.shims.pytorch import ( + default_deserialize_torch_model, + default_serialize_torch_model, +) from thinc.shims.pytorch_grad_scaler import PyTorchGradScaler -from thinc.shims.pytorch import default_deserialize_torch_model -from thinc.shims.pytorch import default_serialize_torch_model -from thinc.compat import has_torch, has_torch_amp -from thinc.compat import has_cupy_gpu, has_torch_mps_gpu -import numpy -import pytest from thinc.util import get_torch_default_device -from ..util import make_tempdir, check_input_converters - +from ..util import check_input_converters, make_tempdir XP_OPS = [NumpyOps()] if has_cupy_gpu: diff --git a/thinc/tests/layers/test_reduce.py b/thinc/tests/layers/test_reduce.py index d26065c4a..608561e13 100644 --- a/thinc/tests/layers/test_reduce.py +++ b/thinc/tests/layers/test_reduce.py @@ -1,5 +1,6 @@ -import pytest import numpy +import pytest + from thinc.api import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum from thinc.types import Ragged diff --git a/thinc/tests/layers/test_resizable.py b/thinc/tests/layers/test_resizable.py index dfb6c67fd..ffa256de5 100644 --- a/thinc/tests/layers/test_resizable.py +++ b/thinc/tests/layers/test_resizable.py @@ -1,7 +1,9 @@ -import pytest from functools import partial -from thinc.api import resizable, Linear -from thinc.layers.resizable import resize_model, resize_linear_weighted + +import pytest + +from thinc.api import Linear, resizable +from thinc.layers.resizable import resize_linear_weighted, resize_model @pytest.fixture diff --git a/thinc/tests/layers/test_shim.py b/thinc/tests/layers/test_shim.py index bacde5cf6..dcb43ab1e 100644 --- a/thinc/tests/layers/test_shim.py +++ b/thinc/tests/layers/test_shim.py @@ -1,5 +1,7 @@ from typing import List + from thinc.shims.shim import Shim + from ..util import make_tempdir diff --git a/thinc/tests/layers/test_softmax.py b/thinc/tests/layers/test_softmax.py index 69072b558..95e2f41c7 100644 --- a/thinc/tests/layers/test_softmax.py +++ b/thinc/tests/layers/test_softmax.py @@ -1,8 +1,8 @@ from typing import Tuple, cast import numpy -from numpy.testing import assert_allclose import pytest +from numpy.testing import assert_allclose from thinc.api import Model, NumpyOps, Softmax_v2 from thinc.types import Floats2d, Ints1d diff --git a/thinc/tests/layers/test_sparse_linear.py b/thinc/tests/layers/test_sparse_linear.py index 87c5a3a75..cce0d1023 100644 --- a/thinc/tests/layers/test_sparse_linear.py +++ b/thinc/tests/layers/test_sparse_linear.py @@ -1,7 +1,9 @@ import math + import numpy import pytest -from thinc.api import SGD, to_categorical, SparseLinear, SparseLinear_v2 + +from thinc.api import SGD, SparseLinear, SparseLinear_v2, to_categorical @pytest.fixture diff --git a/thinc/tests/layers/test_tensorflow_wrapper.py b/thinc/tests/layers/test_tensorflow_wrapper.py index c1b85da3b..4741f6dc3 100644 --- a/thinc/tests/layers/test_tensorflow_wrapper.py +++ b/thinc/tests/layers/test_tensorflow_wrapper.py @@ -1,9 +1,19 @@ import numpy import pytest -from thinc.api import Adam, ArgsKwargs, Linear, Model, TensorFlowWrapper -from thinc.api import get_current_ops, keras_subclass, 
tensorflow2xp, xp2tensorflow -from thinc.util import to_categorical + +from thinc.api import ( + Adam, + ArgsKwargs, + Linear, + Model, + TensorFlowWrapper, + get_current_ops, + keras_subclass, + tensorflow2xp, + xp2tensorflow, +) from thinc.compat import has_cupy_gpu, has_tensorflow +from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/layers/test_torchscriptwrapper.py b/thinc/tests/layers/test_torchscriptwrapper.py index 37ff9ef08..b37afa3c3 100644 --- a/thinc/tests/layers/test_torchscriptwrapper.py +++ b/thinc/tests/layers/test_torchscriptwrapper.py @@ -1,8 +1,11 @@ -import pytest import numpy +import pytest -from thinc.api import PyTorchWrapper_v2, TorchScriptWrapper_v1 -from thinc.api import pytorch_to_torchscript_wrapper +from thinc.api import ( + PyTorchWrapper_v2, + TorchScriptWrapper_v1, + pytorch_to_torchscript_wrapper, +) from thinc.compat import has_torch, torch diff --git a/thinc/tests/layers/test_transforms.py b/thinc/tests/layers/test_transforms.py index 8de5341d7..3a9a110f1 100644 --- a/thinc/tests/layers/test_transforms.py +++ b/thinc/tests/layers/test_transforms.py @@ -1,7 +1,8 @@ -from thinc.api import strings2arrays, NumpyOps, Ragged, registry import numpy import pytest +from thinc.api import NumpyOps, Ragged, registry, strings2arrays + from ..util import get_data_checker diff --git a/thinc/tests/layers/test_uniqued.py b/thinc/tests/layers/test_uniqued.py index 9cb207ca5..685da1deb 100644 --- a/thinc/tests/layers/test_uniqued.py +++ b/thinc/tests/layers/test_uniqued.py @@ -1,10 +1,11 @@ -import pytest import numpy +import pytest +from hypothesis import given, settings +from hypothesis.strategies import composite, integers, lists +from numpy.testing import assert_allclose + from thinc.layers import Embed from thinc.layers.uniqued import uniqued -from numpy.testing import assert_allclose -from hypothesis import given, settings -from hypothesis.strategies import integers, lists, composite ROWS = 10 diff --git a/thinc/tests/layers/test_with_debug.py b/thinc/tests/layers/test_with_debug.py index 679c1f21e..3f65a3ac3 100644 --- a/thinc/tests/layers/test_with_debug.py +++ b/thinc/tests/layers/test_with_debug.py @@ -1,5 +1,6 @@ from mock import MagicMock -from thinc.api import with_debug, Linear + +from thinc.api import Linear, with_debug def test_with_debug(): diff --git a/thinc/tests/layers/test_with_flatten.py b/thinc/tests/layers/test_with_flatten.py index 1ff622026..86d18eb67 100644 --- a/thinc/tests/layers/test_with_flatten.py +++ b/thinc/tests/layers/test_with_flatten.py @@ -1,4 +1,5 @@ from typing import List + from thinc.api import Model, with_flatten_v2 INPUT = [[1, 2, 3], [4, 5], [], [6, 7, 8]] diff --git a/thinc/tests/layers/test_with_transforms.py b/thinc/tests/layers/test_with_transforms.py index c23db1463..82cdaed36 100644 --- a/thinc/tests/layers/test_with_transforms.py +++ b/thinc/tests/layers/test_with_transforms.py @@ -1,11 +1,20 @@ -import pytest import numpy import numpy.testing -from thinc.api import NumpyOps, Model, Linear, noop -from thinc.api import with_array2d, with_array, with_padded, with_list -from thinc.api import with_ragged, with_getitem -from thinc.types import Padded, Ragged +import pytest +from thinc.api import ( + Linear, + Model, + NumpyOps, + noop, + with_array, + with_array2d, + with_getitem, + with_list, + with_padded, + with_ragged, +) +from thinc.types import Padded, Ragged from ..util import get_data_checker diff --git a/thinc/tests/model/test_model.py 
b/thinc/tests/model/test_model.py index 733b3329f..f93b46c8c 100644 --- a/thinc/tests/model/test_model.py +++ b/thinc/tests/model/test_model.py @@ -1,13 +1,28 @@ -from collections import Counter -import pytest import threading import time -from thinc.api import Adam, CupyOps, Dropout, Linear, Model, Relu -from thinc.api import Shim, Softmax, chain, change_attr_values -from thinc.api import concatenate, set_dropout_rate -from thinc.api import use_ops, with_debug, wrap_model_recursive -from thinc.compat import has_cupy_gpu +from collections import Counter + import numpy +import pytest + +from thinc.api import ( + Adam, + CupyOps, + Dropout, + Linear, + Model, + Relu, + Shim, + Softmax, + chain, + change_attr_values, + concatenate, + set_dropout_rate, + use_ops, + with_debug, + wrap_model_recursive, +) +from thinc.compat import has_cupy_gpu from ..util import make_tempdir diff --git a/thinc/tests/model/test_validation.py b/thinc/tests/model/test_validation.py index adecdd6d5..c58efd015 100644 --- a/thinc/tests/model/test_validation.py +++ b/thinc/tests/model/test_validation.py @@ -1,6 +1,15 @@ import pytest -from thinc.api import chain, Relu, reduce_max, Softmax, with_ragged -from thinc.api import ParametricAttention, list2ragged, reduce_sum + +from thinc.api import ( + ParametricAttention, + Relu, + Softmax, + chain, + list2ragged, + reduce_max, + reduce_sum, + with_ragged, +) from thinc.util import DataValidationError, data_validation diff --git a/thinc/tests/mypy/modules/fail_no_plugin.py b/thinc/tests/mypy/modules/fail_no_plugin.py index 807fd672b..f53e33ef3 100644 --- a/thinc/tests/mypy/modules/fail_no_plugin.py +++ b/thinc/tests/mypy/modules/fail_no_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add +from thinc.api import Relu, Softmax, add, chain, reduce_max bad_model = chain(Relu(10), reduce_max(), Softmax()) diff --git a/thinc/tests/mypy/modules/fail_plugin.py b/thinc/tests/mypy/modules/fail_plugin.py index b14fcecf0..6f23c82b1 100644 --- a/thinc/tests/mypy/modules/fail_plugin.py +++ b/thinc/tests/mypy/modules/fail_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add, concatenate +from thinc.api import Relu, Softmax, add, chain, concatenate, reduce_max bad_model = chain(Relu(10), reduce_max(), Softmax()) diff --git a/thinc/tests/mypy/modules/success_no_plugin.py b/thinc/tests/mypy/modules/success_no_plugin.py index b17cff053..058573e5b 100644 --- a/thinc/tests/mypy/modules/success_no_plugin.py +++ b/thinc/tests/mypy/modules/success_no_plugin.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Relu, reduce_max, Softmax, add +from thinc.api import Relu, Softmax, add, chain, reduce_max good_model = chain(Relu(10), Relu(10), Softmax()) reveal_type(good_model) diff --git a/thinc/tests/mypy/modules/success_plugin.py b/thinc/tests/mypy/modules/success_plugin.py index 85879a88a..3214bdcb7 100644 --- a/thinc/tests/mypy/modules/success_plugin.py +++ b/thinc/tests/mypy/modules/success_plugin.py @@ -1,6 +1,6 @@ from typing import Any, TypeVar -from thinc.api import chain, Relu, reduce_max, Softmax, add, Model +from thinc.api import Model, Relu, Softmax, add, chain, reduce_max good_model = chain(Relu(10), Relu(10), Softmax()) reveal_type(good_model) diff --git a/thinc/tests/mypy/test_mypy.py b/thinc/tests/mypy/test_mypy.py index 2f2976882..f144128f4 100644 --- a/thinc/tests/mypy/test_mypy.py +++ b/thinc/tests/mypy/test_mypy.py @@ -1,8 +1,8 @@ import os import re -from pathlib import Path import shutil import sys +from pathlib 
import Path import pytest diff --git a/thinc/tests/regression/issue519/program.py b/thinc/tests/regression/issue519/program.py index b3e6dc9ba..bce5f3234 100644 --- a/thinc/tests/regression/issue519/program.py +++ b/thinc/tests/regression/issue519/program.py @@ -1,4 +1,4 @@ -from thinc.api import chain, concatenate, Relu, Softmax +from thinc.api import Relu, Softmax, chain, concatenate from thinc.model import Model from thinc.types import Floats2d diff --git a/thinc/tests/regression/test_issue208.py b/thinc/tests/regression/test_issue208.py index 25d7280f1..0c574d6d1 100644 --- a/thinc/tests/regression/test_issue208.py +++ b/thinc/tests/regression/test_issue208.py @@ -1,4 +1,4 @@ -from thinc.api import chain, Linear +from thinc.api import Linear, chain def test_issue208(): diff --git a/thinc/tests/shims/test_pytorch_grad_scaler.py b/thinc/tests/shims/test_pytorch_grad_scaler.py index 2ab0fa738..d4ac10fec 100644 --- a/thinc/tests/shims/test_pytorch_grad_scaler.py +++ b/thinc/tests/shims/test_pytorch_grad_scaler.py @@ -1,10 +1,10 @@ import pytest - from hypothesis import given, settings from hypothesis.strategies import lists, one_of, tuples + +from thinc.api import PyTorchGradScaler from thinc.compat import has_torch, has_torch_amp, has_torch_cuda_gpu, torch from thinc.util import is_torch_array -from thinc.api import PyTorchGradScaler from ..strategies import ndarrays diff --git a/thinc/tests/strategies.py b/thinc/tests/strategies.py index 322728cd9..bc12975aa 100644 --- a/thinc/tests/strategies.py +++ b/thinc/tests/strategies.py @@ -1,7 +1,8 @@ import numpy -from hypothesis.strategies import just, tuples, integers, floats from hypothesis.extra.numpy import arrays -from thinc.api import NumpyOps, Linear +from hypothesis.strategies import floats, integers, just, tuples + +from thinc.api import Linear, NumpyOps def get_ops(): diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index e028937da..254fcf078 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -1,20 +1,21 @@ -import pytest -from typing import Iterable, Union, Optional, List, Callable, Dict, Any +import inspect +import pickle from types import GeneratorType -from pydantic import BaseModel, StrictBool, StrictFloat, PositiveInt, constr +from typing import Any, Callable, Dict, Iterable, List, Optional, Union + import catalogue +import numpy +import pytest +from pydantic import BaseModel, PositiveInt, StrictBool, StrictFloat, constr + import thinc.config +from thinc.api import Config, Model, NumpyOps, RAdam from thinc.config import ConfigValidationError from thinc.types import Generator, Ragged -from thinc.api import Config, RAdam, Model, NumpyOps from thinc.util import partial -import numpy -import inspect -import pickle from .util import make_tempdir - EXAMPLE_CONFIG = """ [optimizer] @optimizers = "Adam.v1" diff --git a/thinc/tests/test_import__all__.py b/thinc/tests/test_import__all__.py index 226783ec2..fb0a08a20 100644 --- a/thinc/tests/test_import__all__.py +++ b/thinc/tests/test_import__all__.py @@ -1,9 +1,9 @@ import ast +import importlib from collections import namedtuple -from typing import Tuple, List +from typing import List, Tuple import pytest -import importlib _Import = namedtuple("_Import", ["module", "name", "alias"]) diff --git a/thinc/tests/test_indexing.py b/thinc/tests/test_indexing.py index 98fbc4437..2703e5dfa 100644 --- a/thinc/tests/test_indexing.py +++ b/thinc/tests/test_indexing.py @@ -1,7 +1,8 @@ -import pytest import numpy +import pytest from numpy.testing import 
assert_allclose -from thinc.types import Ragged, Pairs + +from thinc.types import Pairs, Ragged @pytest.fixture diff --git a/thinc/tests/test_initializers.py b/thinc/tests/test_initializers.py index 4f7c8f2cc..628398be0 100644 --- a/thinc/tests/test_initializers.py +++ b/thinc/tests/test_initializers.py @@ -1,8 +1,14 @@ +import numpy import pytest -from thinc.api import glorot_uniform_init, zero_init, uniform_init, normal_init -from thinc.api import NumpyOps + from thinc import registry -import numpy +from thinc.api import ( + NumpyOps, + glorot_uniform_init, + normal_init, + uniform_init, + zero_init, +) @pytest.mark.parametrize( diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 75206d240..fc100dd3a 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -1,8 +1,13 @@ -import pytest import numpy -from thinc.api import CategoricalCrossentropy, SequenceCategoricalCrossentropy -from thinc.api import L2Distance, CosineDistance +import pytest + from thinc import registry +from thinc.api import ( + CategoricalCrossentropy, + CosineDistance, + L2Distance, + SequenceCategoricalCrossentropy, +) # some simple arrays scores0 = numpy.zeros((3, 3), dtype="f") diff --git a/thinc/tests/test_optimizers.py b/thinc/tests/test_optimizers.py index 0fab737f9..57b5a27ff 100644 --- a/thinc/tests/test_optimizers.py +++ b/thinc/tests/test_optimizers.py @@ -1,8 +1,8 @@ -import pytest -from thinc.api import registry, Optimizer -from thinc.optimizers import KeyT, _wrap_generator import numpy +import pytest +from thinc.api import Optimizer, registry +from thinc.optimizers import KeyT, _wrap_generator STUB_KEY: KeyT = (0, "") diff --git a/thinc/tests/test_schedules.py b/thinc/tests/test_schedules.py index c404fe128..693dcfcc7 100644 --- a/thinc/tests/test_schedules.py +++ b/thinc/tests/test_schedules.py @@ -1,7 +1,16 @@ from itertools import islice + import pytest -from thinc.api import decaying, compounding, slanted_triangular, constant_then -from thinc.api import constant, warmup_linear, cyclic_triangular + +from thinc.api import ( + compounding, + constant, + constant_then, + cyclic_triangular, + decaying, + slanted_triangular, + warmup_linear, +) from thinc.optimizers import KeyT from thinc.schedules import plateau diff --git a/thinc/tests/test_serialize.py b/thinc/tests/test_serialize.py index b89fc2d94..a457cd237 100644 --- a/thinc/tests/test_serialize.py +++ b/thinc/tests/test_serialize.py @@ -1,7 +1,16 @@ import pytest import srsly -from thinc.api import with_array, Linear, Maxout, chain, Model, Shim -from thinc.api import serialize_attr, deserialize_attr + +from thinc.api import ( + Linear, + Maxout, + Model, + Shim, + chain, + deserialize_attr, + serialize_attr, + with_array, +) @pytest.fixture diff --git a/thinc/tests/test_types.py b/thinc/tests/test_types.py index 249ce2b80..ebfbb6fb6 100644 --- a/thinc/tests/test_types.py +++ b/thinc/tests/test_types.py @@ -1,8 +1,17 @@ import numpy -from pydantic import create_model, ValidationError -from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d -from thinc.types import Ints1d, Ints2d, Ints3d, Ints4d import pytest +from pydantic import ValidationError, create_model + +from thinc.types import ( + Floats1d, + Floats2d, + Floats3d, + Floats4d, + Ints1d, + Ints2d, + Ints3d, + Ints4d, +) @pytest.mark.parametrize( diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py index 133efbe60..77f6a7b86 100644 --- a/thinc/tests/test_util.py +++ b/thinc/tests/test_util.py @@ -1,12 +1,16 @@ -import pytest import numpy 
+import pytest from hypothesis import given -from thinc.api import get_width, Ragged, Padded -from thinc.util import get_array_module, is_numpy_array, to_categorical -from thinc.util import is_cupy_array -from thinc.util import convert_recursive -from thinc.types import ArgsKwargs +from thinc.api import Padded, Ragged, get_width +from thinc.types import ArgsKwargs +from thinc.util import ( + convert_recursive, + get_array_module, + is_cupy_array, + is_numpy_array, + to_categorical, +) from . import strategies diff --git a/thinc/tests/util.py b/thinc/tests/util.py index 7440a4b6e..defb9a2f6 100644 --- a/thinc/tests/util.py +++ b/thinc/tests/util.py @@ -1,10 +1,12 @@ import contextlib -from pathlib import Path -import tempfile import shutil -from thinc.api import Linear, Ragged, Padded, ArgsKwargs +import tempfile +from pathlib import Path + import numpy import pytest + +from thinc.api import ArgsKwargs, Linear, Padded, Ragged from thinc.util import has_cupy, is_cupy_array, is_numpy_array diff --git a/thinc/types.py b/thinc/types.py index c7e6a00f6..9a9487cb4 100644 --- a/thinc/types.py +++ b/thinc/types.py @@ -1,11 +1,28 @@ -from typing import Union, Tuple, Sized, Container, Any, TypeVar, Callable -from typing import Iterable, Iterator, Sequence, Dict, Generic, cast -from typing import Optional, List, overload +import sys from abc import abstractmethod from dataclasses import dataclass +from typing import ( + Any, + Callable, + Container, + Dict, + Generic, + Iterable, + Iterator, + List, + Optional, + Sequence, + Sized, + Tuple, + TypeVar, + Union, + cast, + overload, +) + import numpy -import sys -from .compat import has_cupy, cupy + +from .compat import cupy, has_cupy if has_cupy: get_array_module = cupy.get_array_module @@ -14,9 +31,9 @@ # Use typing_extensions for Python versions < 3.8 if sys.version_info < (3, 8): - from typing_extensions import Protocol, Literal + from typing_extensions import Literal, Protocol else: - from typing import Protocol, Literal # noqa: F401 + from typing import Literal, Protocol # noqa: F401 # fmt: off diff --git a/thinc/util.py b/thinc/util.py index 9afec29ba..6f47f38df 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -1,25 +1,48 @@ -from typing import Any, Union, Sequence, cast, Dict, Optional, Callable, TypeVar -from typing import List, Mapping, TYPE_CHECKING -import numpy -import platform -import random +import contextlib import functools -from wasabi import table -from pydantic import create_model, ValidationError import inspect import os +import platform +import random import tempfile import threading -import contextlib from contextvars import ContextVar from dataclasses import dataclass +from typing import ( + TYPE_CHECKING, + Any, + Callable, + Dict, + List, + Mapping, + Optional, + Sequence, + TypeVar, + Union, + cast, +) + +import numpy +from pydantic import ValidationError, create_model +from wasabi import table -from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow -from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu -from .compat import has_torch_mps -from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack -from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 from . 
import types  # noqa: E402
+from .compat import (
+    cupy,
+    cupy_from_dlpack,
+    has_cupy,
+    has_cupy_gpu,
+    has_gpu,
+    has_mxnet,
+    has_tensorflow,
+    has_torch,
+    has_torch_cuda_gpu,
+    has_torch_mps,
+)
+from .compat import mxnet as mx
+from .compat import tensorflow as tf
+from .compat import torch
+from .types import ArgsKwargs, ArrayXd, FloatsXd, IntsXd, Padded, Ragged  # noqa: E402
 
 if TYPE_CHECKING:
     from .api import Ops
@@ -173,7 +196,7 @@ def set_active_gpu(gpu_id: int) -> "cupy.cuda.Device":  # pragma: no cover
 
 def require_cpu() -> bool:  # pragma: no cover
     """Use CPU through best available backend."""
-    from .backends import set_current_ops, get_ops
+    from .backends import get_ops, set_current_ops
 
     ops = get_ops("cpu")
     set_current_ops(ops)
@@ -189,7 +212,7 @@ def prefer_gpu(gpu_id: int = 0) -> bool:  # pragma: no cover
 
 
 def require_gpu(gpu_id: int = 0) -> bool:  # pragma: no cover
-    from .backends import set_current_ops, CupyOps, MPSOps
+    from .backends import CupyOps, MPSOps, set_current_ops
 
     if platform.system() == "Darwin" and not has_torch_mps:
         if has_torch:

From d34f536ea6cb2df6ad2f72e1e5b7511aafe3c66d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?=
Date: Tue, 9 Jan 2024 10:24:20 +0100
Subject: [PATCH 19/30] strings2arrays: make work again for sequences of
 unequal length

PR #897 fixed the dtypes in strings2arrays, but it also broke
strings2arrays for batches with sequences of unequal lengths.
---
 thinc/layers/strings2arrays.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py
index 91a6b1a31..eba2c983d 100644
--- a/thinc/layers/strings2arrays.py
+++ b/thinc/layers/strings2arrays.py
@@ -1,3 +1,4 @@
+from ctypes import c_uint64
 from typing import Callable, List, Sequence, Tuple
 
 from murmurhash import hash_unicode
@@ -17,8 +18,10 @@ def strings2arrays() -> Model[InT, OutT]:
 
 
 def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]:
-    hashes = [[hash_unicode(word) for word in X] for X in Xs]
-    hash_arrays = [model.ops.asarray2i(h, dtype="uint64") for h in hashes]
+    # Cast 32-bit (signed) integer to 64-bit unsigned, since such casting
+    # is deprecated in NumPy.
+    hashes = [[c_uint64(hash_unicode(word)).value for word in X] for X in Xs]
+    hash_arrays = [model.ops.asarray1i(h, dtype="uint64") for h in hashes]
     arrays = [model.ops.reshape2i(array, -1, 1) for array in hash_arrays]
 
     def backprop(dX: OutT) -> InT:

From 5c46b82a47781ebff91c953c42c2361a3cdd4f15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?=
Date: Tue, 9 Jan 2024 11:05:46 +0100
Subject: [PATCH 20/30] Fix thread-local storage usage and make it typecheck

The way we used thread-local storage before did not typecheck, since we
assigned to `Thread`. Thread-local storage can be a global variable; the
state of this object will be different for each thread.
---
 thinc/backends/__init__.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py
index 8973c8836..eb954370f 100644
--- a/thinc/backends/__init__.py
+++ b/thinc/backends/__init__.py
@@ -26,6 +26,9 @@
 # notebook might not have preserved contextvars across cells.
 _GLOBAL_STATE = {"ops": None}
 
+# Thread-local state.
+_LOCAL_STATE = threading.local()
+
 
 def set_gpu_allocator(allocator: str) -> None:  # pragma: no cover
     """Route GPU memory allocation via PyTorch or tensorflow.
@@ -152,22 +155,14 @@ def contextvars_eq_thread_ops() -> bool: return False -def _get_thread_state(): +def _get_thread_state() -> threading.local: """Get a thread-specific state variable that inherits from a global state when it's created.""" - thread: threading.Thread = threading.current_thread() - if not hasattr(thread, "__local"): - thread.__local = _create_thread_local(_GLOBAL_STATE) - return thread.__local - - -def _create_thread_local( - attrs: Dict[str, Any], local_class: Type[threading.local] = threading.local -): - obj = local_class() - for name, value in attrs.items(): - setattr(obj, name, value) - return obj + if not hasattr(_LOCAL_STATE, "initialized") or not _LOCAL_STATE.initialized: + for name, value in _GLOBAL_STATE.items(): + setattr(_LOCAL_STATE, name, value) + _LOCAL_STATE.initialized = True + return _LOCAL_STATE __all__ = [ From 09e955586fa4f84308508257642cae17e61bad1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Tue, 9 Jan 2024 11:07:34 +0100 Subject: [PATCH 21/30] Fixup imports that lead to type checking issues --- thinc/tests/test_types.py | 1 - thinc/util.py | 1 - 2 files changed, 2 deletions(-) diff --git a/thinc/tests/test_types.py b/thinc/tests/test_types.py index 6bdf4ea23..bf2740bbb 100644 --- a/thinc/tests/test_types.py +++ b/thinc/tests/test_types.py @@ -1,6 +1,5 @@ import numpy import pytest -from pydantic import ValidationError, create_model from thinc.types import ( Floats1d, diff --git a/thinc/util.py b/thinc/util.py index c6b4bcbc5..529faf875 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -32,7 +32,6 @@ from pydantic import ValidationError, create_model # type: ignore import numpy -from pydantic import ValidationError, create_model from wasabi import table from . import types # noqa: E402 From 6c314d27f350e0bc2a9206ad33b10b3ccf1d1282 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 16 Jan 2024 11:34:48 +0100 Subject: [PATCH 22/30] Fix strings2array (#918) * remove slow marker from basic tagger test * fix strings2array * isort --- thinc/layers/strings2arrays.py | 7 ++++--- thinc/tests/layers/test_basic_tagger.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/thinc/layers/strings2arrays.py b/thinc/layers/strings2arrays.py index ed40b1e88..eba2c983d 100644 --- a/thinc/layers/strings2arrays.py +++ b/thinc/layers/strings2arrays.py @@ -1,3 +1,4 @@ +from ctypes import c_uint64 from typing import Callable, List, Sequence, Tuple from murmurhash import hash_unicode @@ -17,9 +18,9 @@ def strings2arrays() -> Model[InT, OutT]: def forward(model: Model[InT, OutT], Xs: InT, is_train: bool) -> Tuple[OutT, Callable]: - hashes = model.ops.asarray2i( - [[hash_unicode(word) for word in X] for X in Xs], dtype="int32" - ) + # Cast 32-bit (signed) integer to 64-bit unsigned, since such casting + # is deprecated in NumPy. 
+ hashes = [[c_uint64(hash_unicode(word)).value for word in X] for X in Xs] hash_arrays = [model.ops.asarray1i(h, dtype="uint64") for h in hashes] arrays = [model.ops.reshape2i(array, -1, 1) for array in hash_arrays] diff --git a/thinc/tests/layers/test_basic_tagger.py b/thinc/tests/layers/test_basic_tagger.py index 855a6d6ad..3bc772940 100644 --- a/thinc/tests/layers/test_basic_tagger.py +++ b/thinc/tests/layers/test_basic_tagger.py @@ -60,7 +60,6 @@ def get_shuffled_batches(Xs, Ys, batch_size): yield list(batch_X), list(batch_Y) -@pytest.mark.slow @pytest.mark.parametrize( ("depth", "width", "vector_width", "nb_epoch"), [(2, 32, 16, 5)] ) From 40d41487ed5f8270e974b16ec8e2edc097faef22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 16 Jan 2024 12:12:43 +0100 Subject: [PATCH 23/30] Set version to v9.0.0.dev4 (#919) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 502500b04..19a87d71d 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev3" +__version__ = "9.0.0.dev4" __release__ = True From 307a4f83465c132f0a4d6d6af83c029d9eaea3bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 7 Feb 2024 15:37:29 +0100 Subject: [PATCH 24/30] Fix `cupy.cublas` import (#921) * Fix `cupy.cublas` import Reported in #920. * Update mypy to work with recent Torch versions * CI: Do not run MyPy on Python 3.6/3.7. --- .github/workflows/tests.yml | 4 +++- requirements.txt | 2 +- thinc/backends/cupy_ops.py | 4 ++-- thinc/compat.py | 3 +++ thinc/shims/torchscript.py | 2 +- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 035be0baf..1ed106d59 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -87,7 +87,9 @@ jobs: - name: Run mypy run: python -m mypy thinc --no-implicit-reexport - if: matrix.python_version != '3.6' + if: | + matrix.python_version != '3.6' && + matrix.python_version != '3.7' - name: Delete source directory run: rm -rf thinc diff --git a/requirements.txt b/requirements.txt index b7682e738..3e3c9901e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -25,7 +25,7 @@ pytest-cov>=2.7.0,<5.0.0 coverage>=5.0.0,<8.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -mypy>=1.0.0,<1.1.0; python_version >= "3.7" +mypy>=1.5.0,<1.6.0; platform_machine != "aarch64" and python_version >= "3.8" types-mock>=0.1.1 types-contextvars>=0.1.2; python_version < "3.7" types-dataclasses>=0.1.3; python_version < "3.7" diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 1e1e5b92b..472b6c542 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -1,7 +1,7 @@ import numpy from .. import registry -from ..compat import cupy, cupyx +from ..compat import cublas, cupy, cupyx from ..types import DeviceTypes from ..util import ( is_cupy_array, @@ -257,7 +257,7 @@ def clip_gradient(self, gradient, threshold): # implementation. 
def frobenius_norm(X): X_vec = X.reshape(-1) - return cupy.cublas.nrm2(X_vec) + return cublas.nrm2(X_vec) grad_norm = cupy.maximum(frobenius_norm(gradient), 1e-12) gradient *= cupy.minimum(threshold, grad_norm) / grad_norm diff --git a/thinc/compat.py b/thinc/compat.py index 5d600796a..c7b47cbe6 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -4,9 +4,11 @@ try: # pragma: no cover import cupy + import cupy.cublas import cupyx has_cupy = True + cublas = cupy.cublas cupy_version = Version(cupy.__version__) try: cupy.cuda.runtime.getDeviceCount() @@ -20,6 +22,7 @@ else: cupy_from_dlpack = cupy.fromDlpack except (ImportError, AttributeError): + cublas = None cupy = None cupyx = None cupy_version = Version("0.0.0") diff --git a/thinc/shims/torchscript.py b/thinc/shims/torchscript.py index 6c05c8a9b..9d413f93a 100644 --- a/thinc/shims/torchscript.py +++ b/thinc/shims/torchscript.py @@ -30,7 +30,7 @@ class TorchScriptShim(PyTorchShim): def __init__( self, - model: Optional["torch.ScriptModule"], + model: Optional["torch.jit.ScriptModule"], config=None, optimizer: Any = None, mixed_precision: bool = False, From 3aae298d32adc167ac57e0791a5a8c0544b1e8a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 7 Feb 2024 16:14:19 +0100 Subject: [PATCH 25/30] Set version to v8.2.3 (#922) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 394a8253e..e7455c55b 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.2.2" +__version__ = "8.2.3" __release__ = True From ec68d7d558783a40ccb7a4f4627070f9aa4fb195 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 8 Apr 2024 15:56:35 +0200 Subject: [PATCH 26/30] Set version to 9.0.0.dev5 (#925) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 19a87d71d..ebf1604dc 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev4" +__version__ = "9.0.0.dev5" __release__ = True From c998bf2a6d304a0539b987e289245f5311820fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 16 Apr 2024 12:07:19 +0200 Subject: [PATCH 27/30] Merge `thinc-apple-ops` into Thinc (#927) This change adds `AppleOps` to Thinc, to ensure that the AMX unit is always used on Apple Silicon Macs. Before this change, a user would get much worse performance if they forgot to install `thinc-apple-ops`. The `apple_ops` and `_accelerate` modules are built conditionally. When detecting the best CPU implementation, we rely on a `try...except` import to determine whether Apple ops are available. Even though x86_64 Macs do not have an AMX unit, Accelerate is competitive with BLIS, so it does not hurt to enable Apple ops on all Macs. 
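For illustration only (this sketch is not part of the patch), the fallback behaviour described above can be checked after installation; the expected output assumes the registry names used in this patch (`"apple"` for `AppleOps`, `"numpy"` for `NumpyOps`):

```python
# Sketch: check which CPU ops implementation Thinc picked. On a Mac this is
# expected to print "apple" (AppleOps, backed by Accelerate and the AMX unit);
# on other platforms it falls back to "numpy" (NumpyOps).
from thinc.api import get_ops

ops = get_ops("cpu")
print(ops.name)
```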
--- .github/workflows/tests.yml | 11 --- setup.py | 16 +++- thinc/api.py | 7 +- thinc/backends/__init__.py | 10 ++- thinc/backends/_accelerate.pxd | 40 ++++++++++ thinc/backends/_accelerate.pyx | 75 ++++++++++++++++++ thinc/backends/apple_ops.pyx | 39 +++++++++ thinc/backends/mps_ops.py | 7 +- thinc/compat.py | 4 + thinc/tests/backends/_apple_blas/__init__.py | 0 thinc/tests/backends/_apple_blas/test_gemm.py | 79 +++++++++++++++++++ thinc/tests/backends/test_mps_ops.py | 11 +++ thinc/tests/backends/test_ops.py | 2 +- 13 files changed, 279 insertions(+), 22 deletions(-) create mode 100644 thinc/backends/_accelerate.pxd create mode 100644 thinc/backends/_accelerate.pyx create mode 100644 thinc/backends/apple_ops.pyx create mode 100644 thinc/tests/backends/_apple_blas/__init__.py create mode 100644 thinc/tests/backends/_apple_blas/test_gemm.py create mode 100644 thinc/tests/backends/test_mps_ops.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1ed106d59..cd569bafa 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -152,14 +152,3 @@ jobs: - name: Run tests with extras run: python -m pytest --pyargs thinc --cov=thinc --cov-report=term -p thinc.tests.enable_tensorflow -p thinc.tests.enable_mxnet - - - name: Run tests for thinc-apple-ops - run: | - pip uninstall -y tensorflow - pip install thinc-apple-ops - python -m pytest --pyargs thinc_apple_ops - if: matrix.os == 'macos-latest' && matrix.python_version == '3.10' - - - name: Run tests with thinc-apple-ops - run: python -m pytest --pyargs thinc - if: matrix.os == 'macos-latest' && matrix.python_version == '3.10' diff --git a/setup.py b/setup.py index 231f7298b..e380c815c 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +import platform import sys from setuptools.command.build_ext import build_ext from sysconfig import get_path @@ -13,6 +14,8 @@ # http://docs.cython.org/en/latest/src/userguide/source_files_and_compilation.html#compiler-options Options.docstrings = True +ACCELERATE = "thinc.backends._accelerate" +APPLE_OPS = ["thinc.backends.apple_ops", ACCELERATE] PACKAGES = find_packages() MOD_NAMES = [ @@ -20,7 +23,7 @@ "thinc.backends.numpy_ops", "thinc.layers.sparselinear", "thinc.layers.premap_ids", -] +] + (APPLE_OPS if platform.system() == "Darwin" else []) COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], "other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function", "-std=c++11"], @@ -78,7 +81,16 @@ def setup_package(): ext_modules = [] for name in MOD_NAMES: mod_path = name.replace(".", "/") + ".pyx" - ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs) + if name == ACCELERATE: + ext = Extension( + name, + [mod_path], + language="c++", + include_dirs=include_dirs, + libraries=["blas"], + ) + else: + ext = Extension(name, [mod_path], language="c++", include_dirs=include_dirs) ext_modules.append(ext) print("Cythonizing sources") ext_modules = cythonize( diff --git a/thinc/api.py b/thinc/api.py index 0c4d0a0e1..798ef6f08 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -162,6 +162,11 @@ xp2torch, ) +try: + from .backends import AppleOps +except ImportError: + AppleOps = None + # fmt: off __all__ = [ # .config @@ -198,7 +203,7 @@ "has_cupy", # .backends "get_ops", "set_current_ops", "get_current_ops", "use_ops", - "Ops", "CupyOps", "MPSOps", "NumpyOps", "set_gpu_allocator", + "Ops", "AppleOps", "CupyOps", "MPSOps", "NumpyOps", "set_gpu_allocator", "use_pytorch_for_gpu_memory", "use_tensorflow_for_gpu_memory", # .layers "Dropout", 
"Embed", "expand_window", "HashEmbed", "LayerNorm", "Linear", diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py index eb954370f..5d33c2c34 100644 --- a/thinc/backends/__init__.py +++ b/thinc/backends/__init__.py @@ -19,6 +19,11 @@ from .numpy_ops import NumpyOps from .ops import Ops +try: + from .apple_ops import AppleOps +except ImportError: + AppleOps = None + context_ops: ContextVar[Optional[Ops]] = ContextVar("context_ops", default=None) context_pools: ContextVar[dict] = ContextVar("context_pools", default={}) @@ -83,10 +88,6 @@ def use_tensorflow_for_gpu_memory() -> None: # pragma: no cover def _import_extra_cpu_backends(): - try: - from thinc_apple_ops import AppleOps - except ImportError: - pass try: from thinc_bigendian_ops import BigEndianOps except ImportError: @@ -171,6 +172,7 @@ def _get_thread_state() -> threading.local: "use_ops", "ParamServer", "Ops", + "AppleOps", "CupyOps", "MPSOps", "NumpyOps", diff --git a/thinc/backends/_accelerate.pxd b/thinc/backends/_accelerate.pxd new file mode 100644 index 000000000..8bc0ce233 --- /dev/null +++ b/thinc/backends/_accelerate.pxd @@ -0,0 +1,40 @@ +cdef extern from "Accelerate/Accelerate.h": + enum CBLAS_ORDER: CblasRowMajor, CblasColMajor + enum CBLAS_TRANSPOSE: CblasNoTrans, CblasTrans, CblasConjTrans + enum CBLAS_UPLO: CblasUpper, CblasLower + enum CBLAS_DIAG: CblasNonUnit, CblasUnit + enum CBLAS_SIDE: CblasLeft, CblasRight + + # BLAS level 1 routines + + void cblas_sswap(int M, float *x, int incX, float *y, int incY) nogil + void cblas_sscal(int N, float alpha, float *x, int incX) nogil + void cblas_scopy(int N, float *x, int incX, float *y, int incY) nogil + void cblas_saxpy(int N, float alpha, float *x, int incX, float *y, int incY ) nogil + float cblas_sdot(int N, float *x, int incX, float *y, int incY ) nogil + float cblas_snrm2(int N, float *x, int incX) nogil + float cblas_sasum(int N, float *x, int incX) nogil + int cblas_isamax(int N, float *x, int incX) nogil + + # BLAS level 2 routines + void cblas_sgemv(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, int M, int N, + float alpha, float *A, int lda, float *x, int incX, + float beta, float *y, int incY) nogil + + void cblas_sger(CBLAS_ORDER Order, int M, int N, float alpha, float *x, + int incX, float *y, int incY, float *A, int lda) nogil + + # BLAS level 3 routines + void cblas_sgemm(CBLAS_ORDER Order, CBLAS_TRANSPOSE TransA, + CBLAS_TRANSPOSE TransB, int M, int N, int K, + float alpha, float *A, int lda, float *B, int ldb, + float beta, float *C, int ldc) nogil + + +cdef void sgemm(bint TransA, bint TransB, int M, int N, int K, + float alpha, const float* A, int lda, const float *B, + int ldb, float beta, float* C, int ldc) nogil + + +cdef void saxpy(int N, float alpha, const float* X, int incX, + float *Y, int incY) nogil diff --git a/thinc/backends/_accelerate.pyx b/thinc/backends/_accelerate.pyx new file mode 100644 index 000000000..094cb9443 --- /dev/null +++ b/thinc/backends/_accelerate.pyx @@ -0,0 +1,75 @@ +cimport numpy as np +from libc.stdint cimport uintptr_t + +import numpy + + +cpdef np.ndarray gemm(float[:, ::1] A, float[:, ::1] B, + bint trans1=False, bint trans2=False, + np.ndarray out=None): + cdef int nM = A.shape[0] if not trans1 else A.shape[1] + cdef int nK = A.shape[1] if not trans1 else A.shape[0] + cdef int nK_b = B.shape[0] if not trans2 else B.shape[1] + cdef int nN = B.shape[1] if not trans2 else B.shape[0] + + cdef float[:, ::1] C = out + + if out is None: + out = numpy.empty((nM, nN), dtype="f") + C = out + else: + if C.shape[0] 
!= nM or C.shape[1] != nN: + msg = "Shape mismatch for output matrix, was: (%d, %d), expected (%d, %d)" + raise ValueError(msg % (C.shape[0], C.shape[1], nM, nN)) + + + if nK != nK_b: + msg = "Shape mismatch for gemm: (%d, %d), (%d, %d)" + raise ValueError(msg % (nM, nK, nK_b, nN)) + + if nM == 0 or nK == 0 or nN == 0: + return out + + cblas_sgemm( + CblasRowMajor, + CblasTrans if trans1 else CblasNoTrans, + CblasTrans if trans2 else CblasNoTrans, + nM, + nN, + nK, + 1.0, + &A[0, 0], + A.shape[1], + &B[0, 0], + B.shape[1], + 0.0, + &C[0, 0], + C.shape[1] + ) + return out + + +cdef void sgemm(bint TransA, bint TransB, int M, int N, int K, + float alpha, const float* A, int lda, const float *B, + int ldb, float beta, float* C, int ldc) nogil: + cblas_sgemm( + CblasRowMajor, + CblasTrans if TransA else CblasNoTrans, + CblasTrans if TransB else CblasNoTrans, + M, + N, + K, + alpha, + A, + lda, + B, + ldb, + beta, + C, + ldc + ) + + +cdef void saxpy(int N, float alpha, const float* X, int incX, + float *Y, int incY) nogil: + cblas_saxpy(N, alpha, X, incX, Y, incY) diff --git a/thinc/backends/apple_ops.pyx b/thinc/backends/apple_ops.pyx new file mode 100644 index 000000000..95a710c0d --- /dev/null +++ b/thinc/backends/apple_ops.pyx @@ -0,0 +1,39 @@ +from typing import Optional + +import numpy + +from ._accelerate import gemm + +from ._accelerate cimport saxpy, sgemm +from .cblas cimport CBlas, set_saxpy, set_sgemm + +from .. import registry +from ..types import Floats2d +from .numpy_ops import NumpyOps + + +@registry.ops("AppleOps") +class AppleOps(NumpyOps): + """Thinc Ops class that calls into Apple's native libraries for some + operations. Other operations fall back to numpy.""" + name = "apple" + xp = numpy + + def cblas(self) -> CBlas: + cdef CBlas cblas = CBlas() + set_saxpy(cblas, saxpy) + set_sgemm(cblas, sgemm) + return cblas + + def gemm( + self, + x: Floats2d, + y: Floats2d, + out: Optional[Floats2d] = None, + trans1: bool = False, + trans2: bool = False, + ) -> Floats2d: + """Perform General Matrix Multiplication (GeMM) and optionally store + the result in the specified output variable. + """ + return gemm(x, y, out=out, trans1=trans1, trans2=trans2) diff --git a/thinc/backends/mps_ops.py b/thinc/backends/mps_ops.py index c6ba71f11..fb242f0f1 100644 --- a/thinc/backends/mps_ops.py +++ b/thinc/backends/mps_ops.py @@ -3,6 +3,7 @@ import numpy from .. import registry +from ..compat import has_apple_ops from .numpy_ops import NumpyOps from .ops import Ops @@ -12,11 +13,11 @@ # during type checking. _Ops = Ops else: - try: - from thinc_apple_ops import AppleOps + if has_apple_ops: + from .apple_ops import AppleOps _Ops = AppleOps - except ImportError: + else: _Ops = NumpyOps diff --git a/thinc/compat.py b/thinc/compat.py index 7275bfc6e..2ec91de48 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -1,3 +1,4 @@ +import platform import warnings from packaging.version import Version @@ -119,6 +120,9 @@ def enable_mxnet(): has_blis = False +# AppleOps is available unconditionally on macOS. 
+has_apple_ops = platform.system() == "Darwin" + has_gpu = has_cupy_gpu or has_torch_mps_gpu __all__ = [ diff --git a/thinc/tests/backends/_apple_blas/__init__.py b/thinc/tests/backends/_apple_blas/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/thinc/tests/backends/_apple_blas/test_gemm.py b/thinc/tests/backends/_apple_blas/test_gemm.py new file mode 100644 index 000000000..10e662110 --- /dev/null +++ b/thinc/tests/backends/_apple_blas/test_gemm.py @@ -0,0 +1,79 @@ +import numpy +import pytest + +from thinc.compat import has_apple_ops + +try: + import thinc.backends._accelerate as accelerate +except: + pass + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +def test_basic_sgemm(): + A = numpy.random.randn(5, 4).astype("f") + B = numpy.random.randn(4, 7).astype("f") + C = accelerate.gemm(A, B) + assert C.shape == (A.shape[0], B.shape[1]) + + C_out = numpy.empty((5, 7), dtype="f") + accelerate.gemm(A, B, out=C_out) + + numpy.testing.assert_allclose(C, C_out) + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +def test_incorrect_output_size(): + A = numpy.ndarray((5, 4), dtype="f") + B = numpy.ndarray((4, 7), dtype="f") + + with pytest.raises(ValueError, match=r"Shape mismatch for output matrix"): + accelerate.gemm(A, B, out=numpy.ndarray((3, 7), dtype="f")) + + with pytest.raises(ValueError, match=r"Shape mismatch for output matrix"): + accelerate.gemm(A, B, out=numpy.ndarray((5, 3), dtype="f")) + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +@pytest.mark.parametrize( + "A_shape,B_shape,transA,transB", + [ + [(0, 0), (0, 0), False, False], + [(0, 0), (0, 0), True, False], + [(0, 0), (0, 0), False, True], + [(0, 0), (0, 0), True, True], + [(0, 5), (5, 0), False, False], + [(5, 0), (5, 0), False, True], + [(5, 0), (5, 0), True, False], + ], +) +def test_zero_size(A_shape, B_shape, transA, transB): + A = numpy.ndarray(A_shape, dtype="f") + B = numpy.ndarray(B_shape, dtype="f") + if not transA and not transB: + C = numpy.dot(A, B) + elif transA: + C = numpy.dot(A.T, B) + elif transB: + C = numpy.dot(A, B.T) + else: + C = numpy.dot(A.T, B.T) + C_ = accelerate.gemm(A, B, trans1=transA, trans2=transB) + assert C.shape == C_.shape + + +@pytest.mark.skipif(not has_apple_ops, reason="Apple ops not available") +@pytest.mark.parametrize( + "A_shape,B_shape,transA,transB", + [ + [(4, 5), (4, 5), False, False], + [(5, 4), (4, 5), True, False], + [(4, 5), (5, 4), False, True], + [(5, 4), (5, 4), True, True], + ], +) +def test_incorrect_shapes(A_shape, B_shape, transA, transB): + A = numpy.ndarray(A_shape, dtype="f") + B = numpy.ndarray(B_shape, dtype="f") + with pytest.raises(ValueError, match=r"Shape mismatch"): + accelerate.gemm(A, B, trans1=transA, trans2=transB) diff --git a/thinc/tests/backends/test_mps_ops.py b/thinc/tests/backends/test_mps_ops.py new file mode 100644 index 000000000..1bd5838b1 --- /dev/null +++ b/thinc/tests/backends/test_mps_ops.py @@ -0,0 +1,11 @@ +from thinc.api import NumpyOps, get_ops +from thinc.compat import has_apple_ops + + +def test_mps_ops_inherits_apple_ops(): + ops = get_ops("mps") + assert isinstance(ops, NumpyOps) + if has_apple_ops: + # We can't import AppleOps directly, because its' not + # available on non-Darwin systems. 
+ assert "AppleOps" in [base.__name__ for base in type(ops).__bases__] diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 9f03c0438..7cf4a935d 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -1403,7 +1403,7 @@ def test_get_ops(): # If Apple ops are available, "cpu" should return AppleOps or # NumpyOps otherwise. try: - from thinc_apple_ops import AppleOps + from thinc.backends.apple_ops import AppleOps assert isinstance(get_ops("cpu"), AppleOps) except ImportError: From 2a0b9c1e41ef29052bfc422076f65a2e522a850d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 17 Apr 2024 10:59:21 +0200 Subject: [PATCH 28/30] Set version to 9.0.0.dev6 (#928) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index ebf1604dc..1d2afbabb 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev5" +__version__ = "9.0.0.dev6" __release__ = True From ccae25849587dda3dbdb4e6cbc8836cba506220e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 18 Apr 2024 09:48:26 +0200 Subject: [PATCH 29/30] Document `AppleOps` and `MPSOps` (#929) * Document AppleOps and MPSOps * Reformat Ops table - Sort alphabetically. - Note that `AppleOps` is new in 9.0. * Missing comma --- website/docs/api-backends.md | 22 ++++++++-------- website/docs/api-model.md | 50 ++++++++++++++++++------------------ 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/website/docs/api-backends.md b/website/docs/api-backends.md index fc69a775d..853fada3b 100644 --- a/website/docs/api-backends.md +++ b/website/docs/api-backends.md @@ -17,16 +17,18 @@ specialized versions can be called for different backends. You can also create your own `Ops` subclasses with specialized routines for your layers, and use the [`set_current_ops`](#set_current_ops) function to change the default. -| Backend | CPU | GPU | TPU | Description | -| ---------- | :----------------: | :----------------: | :---------------: | ----------------------------------------------------------------------------------------------------- | -| `NumpyOps` | | | | Execute via `numpy`, [`blis`](https://github.com/explosion/cython-blis) (optional) and custom Cython. | -| `CupyOps` | | | | Execute via [`cupy`](https://cupy.chainer.org/) and custom CUDA. | +| Backend | CPU | GPU | TPU | Description | +| ---------- | :----------------: | :----------------: | :---------------: | ----------------------------------------------------------------------------------------------------------- | +| `AppleOps` | | | | Use AMX matrix multiplication units on Apple Silicon Macs. Added in Thinc 9.0. | +| `CupyOps` | | | | Execute via [`cupy`](https://cupy.chainer.org/) and custom CUDA. | +| `MPSOps` | | | | Use the GPU on Apple Silicon Macs for PyTorch models, use AMX matrix multiplication units for Thinc Models. | +| `NumpyOps` | | | | Execute via `numpy`, [`blis`](https://github.com/explosion/cython-blis) (optional) and custom Cython. | ## Ops {#ops tag="class"} -The `Ops` class is typically not used directly but via `NumpyOps` or `CupyOps`, -which are subclasses of `Ops` and implement a **more efficient subset of the -methods**. You also have access to the ops via the +The `Ops` class is typically not used directly but via `NumpyOps`, `AppleOps`, +`CupyOps` or `MPSOps`, which are subclasses of `Ops` and implement a **more +efficient subset of the methods**. 
You also have access to the ops via the [`Model.ops`](/docs/api-model#attributes) attribute. The documented methods below list which backends provide optimized and more efficient versions (indicated by ), and which use the default implementation. @@ -56,7 +58,7 @@ use_ops(blis_ops) | Name | Type | Description | | ------------- | ------------ | ---------------------------------------------------------------------------------------- | -| `name` | str | **Class attribute:** Backend name, `"numpy"` or `"cupy"`. | +| `name` | str | **Class attribute:** Backend name, `"numpy"`, `"apple"`, `"cupy"` or `"mps"`. | | `xp` | Xp | **Class attribute:** `numpy` or `cupy`. | | `device_type` | str | The device type to use, if available for the given backend: `"cpu"`, `"gpu"` or `"tpu"`. | | `device_id` | int | The device ID to use, if available for the given backend. | @@ -1553,7 +1555,7 @@ numpy_ops = get_ops("numpy") | Argument | Type | Description | | ----------- | ------------ | ----------------------------------------------------- | -| `ops` | str | `"numpy"` or `"cupy"`. | +| `ops` | str | `"numpy"`, `"apple"`, `"cupy"` or `"mps"`. | | `**kwargs` | | Optional arguments passed to [`Ops.__init__`](#init). | | **RETURNS** | Ops | The backend object. | @@ -1572,7 +1574,7 @@ with use_ops("cupy"): | Argument | Type | Description | | ---------- | ------------ | ----------------------------------------------------- | -| `ops` | str | `"numpy"` or `"cupy"`. | +| `ops` | str | `"numpy"`, `"apple"`, `"cupy"` or `"mps"`. | | `**kwargs` | | Optional arguments passed to [`Ops.__init__`](#init). | ### get_current_ops {#get_current_ops tag="function"} diff --git a/website/docs/api-model.md b/website/docs/api-model.md index 597f67ec9..193fd1acb 100644 --- a/website/docs/api-model.md +++ b/website/docs/api-model.md @@ -84,19 +84,19 @@ model = Model( ) ``` -| Argument | Type | Description | -| -------------- | ------------------------------------------- | --------------------------------------------------------------------------------------- | -| `name` | str | The name of the layer type. | -| `forward` | Callable | Function to compute the forward result and the backpropagation callback. | -| _keyword-only_ | | | -| `init` | Callable | Function to define the initialization logic. | -| `dims` | Dict[str, Optional[int]] | Dictionary describing the model's dimensions. Map unknown dimensions to `None`. | -| `params` | Dict[str, Optional[FloatsXd]] | Dictionary with the model's parameters. Set currently unavailable parameters to `None`. | -| `refs` | Dict[str, Optional[Model]] | Dictionary mapping specific nodes (sublayers) of the network to a name. | -| `attrs` | Dict[str, Any] | Dictionary of non-parameter attributes. | -| `layers` | List[Model] | List of child layers. | -| `shims` | List[Shim] | List of interfaces for external models. | -| `ops` | Optional[Union[NumpyOps, CupyOps]] | An `Ops` instance, which provides mathematical and memory operations. | +| Argument | Type | Description | +| -------------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------- | +| `name` | str | The name of the layer type. | +| `forward` | Callable | Function to compute the forward result and the backpropagation callback. | +| _keyword-only_ | | | +| `init` | Callable | Function to define the initialization logic. | +| `dims` | Dict[str, Optional[int]] | Dictionary describing the model's dimensions. Map unknown dimensions to `None`. 
| +| `params` | Dict[str, Optional[FloatsXd]] | Dictionary with the model's parameters. Set currently unavailable parameters to `None`. | +| `refs` | Dict[str, Optional[Model]] | Dictionary mapping specific nodes (sublayers) of the network to a name. | +| `attrs` | Dict[str, Any] | Dictionary of non-parameter attributes. | +| `layers` | List[Model] | List of child layers. | +| `shims` | List[Shim] | List of interfaces for external models. | +| `ops` | Optional[Union[NumpyOps, AppleOps, CupyOps, MPSOps]] | An `Ops` instance, which provides mathematical and memory operations. | ### Model.define_operators {#define_operators tag="classmethod,contextmanager"} @@ -260,17 +260,17 @@ for node in model.walk(): The `walk` method supports three iteration orders through the `order` argument: -* `"bfs"`: breadth-first. Iteration order of the example above: - *1 - 2 - 4 - 3 - 5* -* `"dfs_pre"`: depth-first preorder, outputs a node before its children. - Iteration order of the example above: *1 - 2 - 3 - 4 - 5* -* `"dfs_post"`: depth-first postorder, outputs children before a node itself. - Iteration order of the example above: *3 - 2 - 5 - 4 - 1* +- `"bfs"`: breadth-first. Iteration order of the example above: _1 - 2 - 4 - 3 - + 5_ +- `"dfs_pre"`: depth-first preorder, outputs a node before its children. + Iteration order of the example above: _1 - 2 - 3 - 4 - 5_ +- `"dfs_post"`: depth-first postorder, outputs children before a node itself. + Iteration order of the example above: _3 - 2 - 5 - 4 - 1_ -| Argument | Type | Description | -|-------------|--------------------------|--------------------------------------------------------------------------------------------------------------------------------------------| +| Argument | Type | Description | +| ----------- | ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | | `order` | str | Node iteration order. `"bfs"` (breadth-first), `"dfs_pre"` (depth-first preorder), `"dfs_post"` (depth-first postorder) Default: `"bfs"`. | -| **RETURNS** | Iterable[Model] | The layers of the model. | +| **RETURNS** | Iterable[Model] | The layers of the model. | ### Model.remove_node {#remove_node tag="method"} @@ -329,9 +329,9 @@ assert model.get_dim("nI") == 16 Retrieve the value of a dimension of the given name, or `None` if the dimension is either unregistered or the value is currently unset. -| Argument | Type | Description | -| ----------- | --------------------- | --------------------------------------- | -| `name` | str | The name of the dimension, e.g. `"nO"`. | +| Argument | Type | Description | +| ----------- | ---------------------- | --------------------------------------- | +| `name` | str | The name of the dimension, e.g. `"nO"`. | | **RETURNS** | Optional[int] | The size of the dimension, or `None`. | ### Model.set_dim {#set_dim tag="method"} From 5be631e9623434ec2b4a3dc5989ce1cb13062ebe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 18 Apr 2024 10:22:55 +0200 Subject: [PATCH 30/30] Set version to 9.0.0 (#930) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 1d2afbabb..d2a73d579 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "9.0.0.dev6" +__version__ = "9.0.0" __release__ = True
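For illustration only (this sketch is not part of the patch series), the backend names documented in `api-backends.md` can be exercised as follows; the behaviour on non-macOS systems is an assumption based on the `MPSOps` fallback to `NumpyOps` introduced earlier in this series:

```python
# Sketch: select the "mps" backend documented above. On Apple Silicon, MPSOps
# reuses AppleOps (Accelerate/AMX) for CPU-side matrix multiplication;
# elsewhere it behaves like NumpyOps.
from thinc.api import get_current_ops, get_ops, set_current_ops

mps_ops = get_ops("mps")
set_current_ops(mps_ops)
print(type(get_current_ops()).__name__)  # "MPSOps"
```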