OWDiscretize: Reduce interface, add nicer binnings

biolab · Mar 30, 2022 · 94c1bda · 94c1bda
1 parent 93cf05f
commit 94c1bda
Show file tree

Hide file tree

Showing 2 changed files with 737 additions and 617 deletions.
diff --git a/Orange/preprocess/discretize.py b/Orange/preprocess/discretize.py
@@ -8,7 +8,7 @@
 import numpy as np
 import scipy.sparse as sp
 
-from Orange.data import DiscreteVariable, Domain
+from Orange.data import DiscreteVariable, Domain, TimeVariable
 from Orange.data.sql.table import SqlTable
 from Orange.statistics import distribution, contingency, util as ut
 from Orange.statistics.basic_stats import BasicStats
@@ -58,13 +58,17 @@ def _fmt_interval(low, high, formatter):
         return f"{formatter(low)} - {formatter(high)}"
 
     @classmethod
-    def create_discretized_var(cls, var, points):
-        def fmt(val):
-            sval = var.str_val(val)
-            # For decimal numbers, remove trailing 0's and . if no decimals left
-            if re.match(r"^\d+\.\d+", sval):
-                return sval.rstrip("0").rstrip(".")
-            return sval
+    def create_discretized_var(cls, var, points, ndigits=None):
+        if ndigits is None:
+            def fmt(val):
+                sval = var.str_val(val)
+                # For decimal numbers, remove trailing 0's and . if no decimals left
+                if re.match(r"^\d+\.\d+", sval):
+                    return sval.rstrip("0").rstrip(".")
+                return sval
+        else:
+            def fmt(val):
+                return f"{val:.{ndigits}f}"
 
         lpoints = list(points)
         if lpoints:
@@ -186,7 +190,122 @@ def _split_eq_width(self, min, max):
         if np.isnan(min) or np.isnan(max) or min == max:
             return []
         dif = (max - min) / self.n
-        return [min + (i + 1) * dif for i in range(self.n - 1)]
+        return [min + i * dif for i in range(1, self.n)]
+
+
+class TooManyIntervals(ValueError):
+    pass
+
+
+class FixedWidth(Discretization):
+    def __init__(self, width, digits=None):
+        super().__init__()
+        self.width = width
+        self.digits = digits
+
+    def __call__(self, data, attribute):
+        values = data[:, attribute]
+        values = values.X if values.X.size else values.Y
+        if values.size:
+            min, max = ut.nanmin(values), ut.nanmax(values)
+            minf = int(1 + np.floor(min / self.width))
+            maxf = int(1 + np.floor(max / self.width))
+            if maxf - minf - 1 >= 100:
+                raise TooManyIntervals
+            points = [i * self.width for i in range(minf, maxf)]
+        else:
+            points = []
+
+        return Discretizer.create_discretized_var(
+                    data.domain[attribute], points, ndigits=self.digits)
+
+class FixedTimeWidth(Discretization):
+    def __init__(self, width, unit):
+        # unit: 0=year, 1=month, 2=day, 3=hour, 4=minute, 5=second
+        # for week, use day with a width of 7
+        super().__init__()
+        self.width = width
+        self.unit = unit
+
+    def __call__(self, data, attribute):
+        fmt = ["%Y", "%y %b", "%y %b %d", "%y %b %d %H:%M", "%b %d %H:%M",
+               "%H:%M:%S"][self.unit]
+        values = data[:, attribute]
+        values = values.X if values.X.size else values.Y
+        if not values.size:
+            times = []
+        else:
+            mn, mx = ut.nanmin(values), ut.nanmax(values)
+            mn = utc_from_timestamp(mn).timetuple()
+            mx = utc_from_timestamp(mx).timetuple()
+            times = _time_range(mn, mx, self.unit, self.width, 0, 100)
+            if times is None:
+                raise TooManyIntervals
+        times = [time.struct_time(t + (0, 0, 0)) for t in times][1:-1]
+        points = np.array([calendar.timegm(t) for t in times])
+        values = [time.strftime(fmt, t) for t in times]
+        if values:
+            values = _simplified_labels(values)
+            values = [f"< {values[0]}"] + [
+                      f"{low} - {high}" for low, high in zip(values, values[1:])
+            ] + [f"≥ {values[-1]}"]
+        return DiscreteVariable(name=attribute.name, values=values,
+                                compute_value=Discretizer(attribute, points),
+                                sparse=attribute.sparse)
+
+class Binning(Discretization):
+    """Discretization with nice thresholds
+
+    This class creates different decimal or time binnings and picks the one
+    in which the number of intervals is closest to the desired number.
+    The difference is measured as proportion; e.g. having 30 % fewer intervals
+    is the same difference as having 30 % too many.
+
+    .. attribute:: n
+
+        Desired number of bins (default: 4).
+    """
+    def __init__(self, n=4):
+        self.n = n
+
+    def __call__(self, data, attribute):
+        def no_discretization():
+            return Discretizer.create_discretized_var(
+                data.domain[attribute], [])
+
+        values = data[:, attribute]
+        values = values.X if values.X.size else values.Y
+        if not values.size:
+            return no_discretization()
+
+        var = data.domain[attribute]
+        if isinstance(var, TimeVariable):
+            binnings = time_binnings(values)
+        else:
+            binnings = decimal_binnings(values)
+        if not binnings:
+            return no_discretization()
+
+        # If self.n is 2, require at least two intervals (i.e. one threshold,
+        # excluding top and bottom), else require at least three intervals
+        # ... unless no binning qualifies, in which case we use the last one
+        binning = min(
+            (binning for binning in binnings
+             if len(binning.thresholds) - 2 >= 1 + (self.n != 2)),
+            key=lambda binning: abs(1 - len(binning.short_labels) / self.n),
+            default=binnings[-1])
+
+        blabels = binning.labels[1:-1]
+        labels = [f"< {blabels[0]}"] + [
+            f"{lab1} - {lab2}" for lab1, lab2 in zip(blabels, blabels[1:])
+        ] + [f"≥ {blabels[-1]}"]
+
+        discretizer = Discretizer(var, list(binning.thresholds[1:-1]))
+        dvar = DiscreteVariable(name=var.name, values=labels,
+                                compute_value=discretizer,
+                                sparse=var.sparse)
+        dvar.source_variable = var
+        return dvar
 
 
 class BinDefinition(NamedTuple):
@@ -234,7 +353,7 @@ def decimal_binnings(
         data, *, min_width=0, min_bins=2, max_bins=50,
         min_unique=5, add_unique=0,
         factors=(0.01, 0.02, 0.025, 0.05, 0.1, 0.2, 0.25, 0.5, 1, 2, 5, 10, 20),
-        label_fmt="%g"):
+        label_fmt="%g") -> List[BinDefinition]:
     """
     Find a set of nice splits of data into bins
 
@@ -283,22 +402,13 @@ def decimal_binnings(
             or a function for formatting thresholds (e.g. var.str_val)
 
     Returns:
-        bin_boundaries (list of np.ndarray): a list of bin boundaries,
-            including the top boundary of the last interval, hence the list
-            size equals the number bins + 1. These array match the `bin`
-            argument of `numpy.histogram`.
-
-            This is returned if `return_defs` is left `True`.
-
         bin_definition (list of BinDefinition):
             `BinDefinition` is a named tuple containing the beginning of the
             first bin (`start`), number of bins (`nbins`) and their widths
             (`width`). The last value can also be a `nd.array` with `nbins + 1`
             elements, which describes bins of unequal width and is used for
             binnings that match the unique values in the data (see `min_unique`
             and `add_unique`).
-
-            This is returned if `return_defs` is `False`.
     """
     bins = []
 
@@ -329,7 +439,8 @@ def decimal_binnings(
     return bins
 
 
-def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
+def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0
+                  ) -> List[BinDefinition]:
     """
     Find a set of nice splits of time variable data into bins
 
@@ -355,7 +466,7 @@ def time_binnings(data, *, min_bins=2, max_bins=50, min_unique=5, add_unique=0):
             number of unique values
 
     Returns:
-        bin_boundaries (list): a list of possible binning.
+        bin_boundaries (list of BinDefinition): a list of possible binnings.
             Each element of `bin_boundaries` is a tuple consisting of a label
             describing the bin size (e.g. `2 weeks`) and a list of thresholds.
             Thresholds are given as pairs
@@ -448,7 +559,7 @@ def _simplified_labels(labels):
     to_remove = "42"
     while True:
         firsts = {f for f, *_ in (lab.split() for lab in labels)}
-        if len(firsts) > 1:
+        if len(firsts) != 1:  # can be 0 if there are no labels
             break
         to_remove = firsts.pop()
         flen = len(to_remove)