Skip to content

Commit

Permalink
OWDiscretize: Reduce interface, add nicer binnings
Browse files Browse the repository at this point in the history
  • Loading branch information
janezd committed Mar 30, 2022
1 parent 93cf05f commit 94c1bda
Show file tree
Hide file tree
Showing 2 changed files with 737 additions and 617 deletions.
155 changes: 133 additions & 22 deletions Orange/preprocess/discretize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import numpy as np
import scipy.sparse as sp

from Orange.data import DiscreteVariable, Domain
from Orange.data import DiscreteVariable, Domain, TimeVariable
from Orange.data.sql.table import SqlTable
from Orange.statistics import distribution, contingency, util as ut
from Orange.statistics.basic_stats import BasicStats
Expand Down Expand Up @@ -58,13 +58,17 @@ def _fmt_interval(low, high, formatter):
return f"{formatter(low)} - {formatter(high)}"

@classmethod
def create_discretized_var(cls, var, points):
def fmt(val):
sval = var.str_val(val)
# For decimal numbers, remove trailing 0's and . if no decimals left
if re.match(r"^\d+\.\d+", sval):
return sval.rstrip("0").rstrip(".")
return sval
def create_discretized_var(cls, var, points, ndigits=None):
if ndigits is None:
def fmt(val):
sval = var.str_val(val)
# For decimal numbers, remove trailing 0's and . if no decimals left
if re.match(r"^\d+\.\d+", sval):
return sval.rstrip("0").rstrip(".")
return sval
else:
def fmt(val):
return f"{val:.{ndigits}f}"

lpoints = list(points)
if lpoints:
Expand Down Expand Up @@ -186,7 +190,122 @@ def _split_eq_width(self, min, max):
if np.isnan(min) or np.isnan(max) or min == max:
return []
dif = (max - min) / self.n
return [min + (i + 1) * dif for i in range(self.n - 1)]
return [min + i * dif for i in range(1, self.n)]


class TooManyIntervals(ValueError):
    """Raised when a discretization would produce too many intervals."""


class FixedWidth(Discretization):
    """Discretization into intervals of a fixed, user-given width.

    Thresholds are placed at integer multiples of `width` that lie above
    the smallest and up to the largest value of the attribute.

    .. attribute:: width

        Width of each interval.

    .. attribute:: digits

        Passed as `ndigits` to `Discretizer.create_discretized_var`;
        `None` (default) keeps that method's default label formatting.
    """
    def __init__(self, width, digits=None):
        super().__init__()
        self.width = width
        self.digits = digits

    def __call__(self, data, attribute):
        """Return a discretized variable for `attribute` computed on `data`.

        Raises:
            TooManyIntervals: if the value range would require 101 or more
                thresholds.
        """
        values = data[:, attribute]
        values = values.X if values.X.size else values.Y

        points = []
        if values.size:
            low, high = ut.nanmin(values), ut.nanmax(values)
            # Same guard as in `_split_eq_width`: with NaN extremes (no
            # defined values) there is nothing to split, and `int(nan)`
            # below would raise an unrelated ValueError.
            if not (np.isnan(low) or np.isnan(high)):
                # Index of the first multiple of `width` above `low` ...
                first = int(1 + np.floor(low / self.width))
                # ... and one past the last multiple not exceeding `high`
                stop = int(1 + np.floor(high / self.width))
                if stop - first - 1 >= 100:
                    raise TooManyIntervals
                points = [i * self.width for i in range(first, stop)]

        return Discretizer.create_discretized_var(
            data.domain[attribute], points, ndigits=self.digits)

class FixedTimeWidth(Discretization):
    """Discretization of a time attribute into intervals of fixed duration.

    .. attribute:: width

        Number of time units in each interval.

    .. attribute:: unit

        Index of the time unit:
        0=year, 1=month, 2=day, 3=hour, 4=minute, 5=second.
        For weeks, use day with a width of 7.
    """
    def __init__(self, width, unit):
        # unit: 0=year, 1=month, 2=day, 3=hour, 4=minute, 5=second
        # for week, use day with a width of 7
        super().__init__()
        self.width = width
        self.unit = unit

    def __call__(self, data, attribute):
        """Return a discrete variable with time-based bins for `attribute`.

        Raises:
            TooManyIntervals: if `_time_range` returns `None` for the
                observed range (it is called with a limit of 100).
        """
        # strftime pattern for threshold labels, chosen by the unit
        fmt = ["%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%b %d %H:%M",
               "%H:%M:%S"][self.unit]
        # Resolve through the domain so that `attribute` may be anything the
        # domain accepts as an index, as in the sibling discretizations
        var = data.domain[attribute]

        values = data[:, attribute]
        values = values.X if values.X.size else values.Y

        times = []
        if values.size:
            mn, mx = ut.nanmin(values), ut.nanmax(values)
            # With NaN extremes (no defined values) there is no range to
            # split; converting NaN to a timestamp would fail
            if not (np.isnan(mn) or np.isnan(mx)):
                start = utc_from_timestamp(mn).timetuple()
                end = utc_from_timestamp(mx).timetuple()
                times = _time_range(start, end, self.unit, self.width, 0, 100)
                if times is None:
                    raise TooManyIntervals
                # Pad each tuple to the nine fields struct_time requires;
                # the outermost boundaries are not used as thresholds
                times = [time.struct_time(t + (0, 0, 0)) for t in times][1:-1]

        points = np.array([calendar.timegm(t) for t in times])
        labels = [time.strftime(fmt, t) for t in times]
        if labels:
            labels = _simplified_labels(labels)
            labels = [f"< {labels[0]}"] + [
                f"{low} - {high}" for low, high in zip(labels, labels[1:])
            ] + [f"≥ {labels[-1]}"]
        return DiscreteVariable(name=var.name, values=labels,
                                compute_value=Discretizer(var, points),
                                sparse=var.sparse)

class Binning(Discretization):
    """Discretization with nice thresholds

    This class creates different decimal or time binnings and picks the one
    in which the number of intervals is closest to the desired number.
    The difference is measured as proportion; e.g. having 30 % fewer
    intervals is the same difference as having 30 % too many.

    .. attribute:: n

        Desired number of bins (default: 4).
    """
    def __init__(self, n=4):
        self.n = n

    def __call__(self, data, attribute):
        """Return a discrete variable for `attribute` with nice thresholds.

        If there is no data, no candidate binning, or the chosen binning has
        no interior thresholds, the result is a variable created from an
        empty list of thresholds.
        """
        def no_discretization():
            return Discretizer.create_discretized_var(
                data.domain[attribute], [])

        values = data[:, attribute]
        values = values.X if values.X.size else values.Y
        if not values.size:
            return no_discretization()

        var = data.domain[attribute]
        if isinstance(var, TimeVariable):
            binnings = time_binnings(values)
        else:
            binnings = decimal_binnings(values)
        if not binnings:
            return no_discretization()

        # If self.n is 2, require two intervals (one threshold, excluding top
        # and bottom), else require at least three intervals
        # ... unless this is the only option, in which case we use it
        min_inner_thresholds = 1 + (self.n != 2)
        binning = min(
            (candidate for candidate in binnings
             if len(candidate.thresholds) - 2 >= min_inner_thresholds),
            key=lambda candidate:
            abs(1 - len(candidate.short_labels) / self.n),
            default=binnings[-1])

        blabels = binning.labels[1:-1]
        # The fallback binning is not subject to the filter above, so it may
        # have no interior thresholds; indexing blabels[0] would then fail.
        # len() is used because the container type of labels is not visible
        # here.
        if len(blabels) == 0:
            return no_discretization()

        labels = [f"< {blabels[0]}"] + [
            f"{lab1} - {lab2}" for lab1, lab2 in zip(blabels, blabels[1:])
        ] + [f"≥ {blabels[-1]}"]

        discretizer = Discretizer(var, list(binning.thresholds[1:-1]))
        dvar = DiscreteVariable(name=var.name, values=labels,
                                compute_value=discretizer,
                                sparse=var.sparse)
        dvar.source_variable = var
        return dvar


class BinDefinition(NamedTuple):
Expand Down Expand Up @@ -234,7 +353,7 @@ def decimal_binnings(
data, *, min_width=0, min_bins=2, max_bins=50,
min_unique=5, add_unique=0,
factors=(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20),
label_fmt="%g"):
label_fmt="%g") -> List[BinDefinition]:
"""
Find a set of nice splits of data into bins
Expand Down Expand Up @@ -283,22 +402,13 @@ def decimal_binnings(
or a function for formatting thresholds (e.g. var.str_val)
Returns:
bin_boundaries (list of np.ndarray): a list of bin boundaries,
including the top boundary of the last interval, hence the list
size equals the number bins + 1. These array match the `bin`
argument of `numpy.histogram`.
This is returned if `return_defs` is left `True`.
bin_definition (list of BinDefinition):
`BinDefinition` is a named tuple containing the beginning of the
first bin (`start`), number of bins (`nbins`) and their widths
(`width`). The last value can also be a `nd.array` with `nbins + 1`
elements, which describes bins of unequal width and is used for
binnings that match the unique values in the data (see `min_unique`
and `add_unique`).
This is returned if `return_defs` is `False`.
"""
bins = []

Expand Down Expand Up @@ -329,7 +439,8 @@ def decimal_binnings(
return bins


def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0
) -> List[BinDefinition]:
"""
Find a set of nice splits of time variable data into bins
Expand All @@ -355,7 +466,7 @@ def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
number of unique values
Returns:
bin_boundaries (list): a list of possible binning.
bin_boundaries (list of BinDefinition): a list of possible binnings.
Each element of `bin_boundaries` is a tuple consisting of a label
describing the bin size (e.g. `2 weeks`) and a list of thresholds.
Thresholds are given as pairs
Expand Down Expand Up @@ -448,7 +559,7 @@ def _simplified_labels(labels):
to_remove = "42"
while True:
firsts = {f for f, *_ in (lab.split() for lab in labels)}
if len(firsts) > 1:
if len(firsts) != 1: # can be 0 if there are no labels
break
to_remove = firsts.pop()
flen = len(to_remove)
Expand Down
Loading

0 comments on commit 94c1bda

Please sign in to comment.