Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Table lock: tests run with tables that are read-only by default #5381

Merged
merged 5 commits into from
Oct 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions Orange/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,13 @@ def __init__(self, cat_model, cat_features, domain):
self.cat_model = cat_model
self.cat_features = cat_features

def __call__(self, data, ret=Model.Value):
    """
    Delegate to the parent ``__call__``, unlocking ``data.X`` for tables.

    If `data` is a `Table`, the parent call is made inside the
    `data.force_unlocked(data.X)` context; any other kind of input is
    handed to the parent unchanged. The result of the parent call is
    returned as is.
    """
    if not isinstance(data, Table):
        return super().__call__(data, ret)
    # NOTE(review): presumably the parent call needs a writable X even when
    # the table is locked -- confirm against Table.force_unlocked.
    with data.force_unlocked(data.X):
        return super().__call__(data, ret)

def predict(self, X):
if self.cat_features:
X = X.astype(str)
Expand Down Expand Up @@ -824,17 +831,18 @@ def __call__(self, data, progress_callback=None):
return m

def fit_storage(self, data: Table):
domain, X, Y, W = data.domain, data.X, data.Y.reshape(-1), None
if self.supports_weights and data.has_weights():
W = data.W.reshape(-1)
# pylint: disable=not-callable
clf = self.__wraps__(**self.params)
cat_features = [i for i, attr in enumerate(domain.attributes)
if attr.is_discrete]
if cat_features:
X = X.astype(str)
cat_model = clf.fit(X, Y, cat_features=cat_features, sample_weight=W)
return self.__returns__(cat_model, cat_features, domain)
with data.force_unlocked(data.X):
markotoplak marked this conversation as resolved.
Show resolved Hide resolved
domain, X, Y, W = data.domain, data.X, data.Y.reshape(-1), None
if self.supports_weights and data.has_weights():
W = data.W.reshape(-1)
# pylint: disable=not-callable
clf = self.__wraps__(**self.params)
cat_features = [i for i, attr in enumerate(domain.attributes)
if attr.is_discrete]
if cat_features:
X = X.astype(str)
cat_model = clf.fit(X, Y, cat_features=cat_features, sample_weight=W)
return self.__returns__(cat_model, cat_features, domain)

def __getattr__(self, item):
try:
Expand Down
4 changes: 4 additions & 0 deletions Orange/canvas/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from Orange.data import Table

# Switch table locking on for the tests in this package, but only when
# Table.LOCKING is still None, so that an explicitly chosen value is kept.
if Table.LOCKING is None:
    Table.LOCKING = True
51 changes: 32 additions & 19 deletions Orange/classification/_tree_scorers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ cdef extern from "numpy/npy_math.h":
cpdef enum:
NULL_BRANCH = -1

def contingency(double[:] x, int nx, double[:] y, int ny):
def contingency(const double[:] x, int nx, const double[:] y, int ny):
cdef:
np.ndarray[np.uint32_t, ndim=2] cont = np.zeros((ny, nx), dtype=np.uint32)
int n = len(x), yi, xi
Expand All @@ -28,7 +28,8 @@ def contingency(double[:] x, int nx, double[:] y, int ny):
cont[yi, xi] += 1
return cont

def find_threshold_entropy(double[:] x, double[:] y, np.intp_t[:] idx,
def find_threshold_entropy(const double[:] x, const double[:] y,
const np.intp_t[:] idx,
int n_classes, int min_leaf):
"""
Find the threshold for continuous attribute values that maximizes
Expand Down Expand Up @@ -89,8 +90,9 @@ def find_threshold_entropy(double[:] x, double[:] y, np.intp_t[:] idx,
return (class_entro - best_entro) / N / log(2), x[idx[best_idx]]


def find_binarization_entropy(double[:, :] cont, double[:] class_distr,
double[:] val_distr, int min_leaf):
def find_binarization_entropy(const double[:, :] cont,
const double[:] class_distr,
const double[:] val_distr, int min_leaf):
"""
Find the split of discrete values into two groups that optimizes information
gain.
Expand Down Expand Up @@ -187,7 +189,9 @@ def find_binarization_entropy(double[:, :] cont, double[:] class_distr,
return (class_entro - best_entro) / N / log(2), best_mapping


def find_threshold_MSE(double[:] x, double[:] y, np.intp_t[:] idx, int min_leaf):
def find_threshold_MSE(const double[:] x,
const double[:] y,
const np.intp_t[:] idx, int min_leaf):
"""
Find the threshold for continuous attribute values that minimizes MSE.

Expand Down Expand Up @@ -232,7 +236,8 @@ def find_threshold_MSE(double[:] x, double[:] y, np.intp_t[:] idx, int min_leaf)
return (best_inter - (sum * sum) / N) / N, x[idx[best_idx]]


def find_binarization_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
def find_binarization_MSE(const double[:] x,
const double[:] y, int n_values, int min_leaf):
"""
Find the split of discrete values into two groups that minimizes the MSE.

Expand Down Expand Up @@ -315,7 +320,9 @@ def find_binarization_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
return (best_inter - start_inter) / x.shape[0], best_mapping


def compute_grouped_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
def compute_grouped_MSE(const double[:] x,
const double[:] y,
int n_values, int min_leaf):
"""
Compute the MSE decrease of the given split into groups.

Expand Down Expand Up @@ -371,8 +378,10 @@ def compute_grouped_MSE(double[:] x, double[:] y, int n_values, int min_leaf):
return (inter - sum * sum / n) / x.shape[0]


def compute_predictions(double[:, :] X, int[:] code,
double[:, :] values, double[:] thresholds):
def compute_predictions(const double[:, :] X,
const int[:] code,
const double[:, :] values,
const double[:] thresholds):
"""
Return the values (distributions, means and variances) stored in the nodes
to which the tree classifies the rows in X.
Expand Down Expand Up @@ -419,8 +428,10 @@ def compute_predictions(double[:, :] X, int[:] code,
return np.asarray(predictions)


def compute_predictions_csr(X, int[:] code,
double[:, :] values, double[:] thresholds):
def compute_predictions_csr(X,
const int[:] code,
const double[:, :] values,
const double[:] thresholds):
"""
Same as compute_predictions except for sparse data
"""
Expand All @@ -431,9 +442,9 @@ def compute_predictions_csr(X, int[:] code,
double[: ,:] predictions = np.empty(
(X.shape[0], values.shape[1]), dtype=np.float64)

double[:] data = X.data
np.int32_t[:] indptr = X.indptr
np.int32_t[:] indices = X.indices
const double[:] data = X.data
const np.int32_t[:] indptr = X.indptr
const np.int32_t[:] indices = X.indices
int ind, attr, n_rows

n_rows = X.shape[0]
Expand Down Expand Up @@ -463,8 +474,10 @@ def compute_predictions_csr(X, int[:] code,
predictions[i, j] = values[node_idx, j]
return np.asarray(predictions)

def compute_predictions_csc(X, int[:] code,
double[:, :] values, double[:] thresholds):
def compute_predictions_csc(X,
const int[:] code,
const double[:, :] values,
const double[:] thresholds):
"""
Same as compute_predictions except for sparse data
"""
Expand All @@ -475,9 +488,9 @@ def compute_predictions_csc(X, int[:] code,
double[: ,:] predictions = np.empty(
(X.shape[0], values.shape[1]), dtype=np.float64)

double[:] data = X.data
np.int32_t[:] indptr = X.indptr
np.int32_t[:] indices = X.indices
const double[:] data = X.data
const np.int32_t[:] indptr = X.indptr
const np.int32_t[:] indices = X.indices
int ind, attr, n_rows

n_rows = X.shape[0]
Expand Down
25 changes: 25 additions & 0 deletions Orange/classification/tests/test_catgb_cls.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,31 @@ def test_retain_x(self):
np.testing.assert_array_equal(data.X, X)
self.assertEqual(data.X.dtype, X.dtype)

def test_doesnt_modify_data(self):
    """Fitting and predicting with CatGB must leave the table's arrays unchanged."""
    # CatGB works on a force-unlocked table, so (try to) make sure that it
    # does not actually write into it.
    table = Table("iris")
    # Seed X with zeros and NaNs (single cells and whole columns) and put
    # a NaN into Y before taking the reference copies.
    with table.unlocked():
        table[0, 0] = 0
        table[1, 0] = np.nan
        table[:, 1] = 0
        table[:, 2] = np.nan
        table.Y[0] = np.nan
    x_before = table.X.copy()
    y_before = table.Y.copy()
    learner = CatGBClassifier()
    fitted = learner(table)
    fitted(table)
    np.testing.assert_equal(table.X, x_before)
    np.testing.assert_equal(table.Y, y_before)

    # Same check on the sparse version of the table; only the stored
    # values of X are compared here.
    with table.unlocked():
        table = table.to_sparse()
    x_before = table.X.copy()
    learner = CatGBClassifier()
    fitted = learner(table)
    fitted(table)
    np.testing.assert_equal(table.X.data, x_before.data)


if __name__ == "__main__":
unittest.main()
2 changes: 1 addition & 1 deletion Orange/classification/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def _score_disc():
cont_entr = np.sum(cont * np.log(cont))
score = (class_entr - attr_entr + cont_entr) / n / np.log(2)
score *= n / len(data) # punishment for missing values
branches = col_x
branches = col_x.copy()
branches[np.isnan(branches)] = -1
if score == 0:
return REJECT_ATTRIBUTE
Expand Down
8 changes: 6 additions & 2 deletions Orange/data/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,12 @@ def __init__(self, domain, data=None, id=None):
self._weight = 1
elif isinstance(data, Instance) and data.domain == domain:
self._x = np.array(data._x)
self._y = np.array(data._y)
self._y = np.atleast_1d(np.array(data._y))
self._metas = np.array(data._metas)
self._weight = data._weight
else:
self._x, self._y, self._metas = domain.convert(data)
self._y = np.atleast_1d(self._y)
self._weight = 1

if id is not None:
Expand Down Expand Up @@ -116,7 +117,10 @@ def __getitem__(self, key):
if 0 <= idx < len(self._domain.attributes):
value = self._x[idx]
elif idx >= len(self._domain.attributes):
value = self._y[idx - len(self.domain.attributes)]
if self._y.ndim == 0:
value = self._y
else:
value = self._y[idx - len(self.domain.attributes)]
else:
value = self._metas[-1 - idx]
var = self._domain[idx]
Expand Down
4 changes: 4 additions & 0 deletions Orange/data/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ def to_cat(s, _):
for var, col, expr in zip(Avars, Acols, Aexpr)]).T
XYM.append(A)

# Let the tables share memory with the pandas frame
if XYM[1] is not None and XYM[1].ndim == 2 and XYM[1].shape[1] == 1:
XYM[1] = XYM[1][:, 0]

return XYM, Domain(attrs, class_vars, metas)


Expand Down
3 changes: 2 additions & 1 deletion Orange/data/sql/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,7 +556,8 @@ def _filter_values(self, f):
return t2

@classmethod
def from_table(cls, domain, source, row_indices=...):
def from_table(cls, domain, source, row_indices=..., *, copy=False):
# pylint: disable=unused-argument
assert row_indices is ...

table = source.copy()
Expand Down
Loading