Skip to content

Commit

Permalink
Edit Domain: enable transformation to time variable with format selec…
Browse files Browse the repository at this point in the history
…tion
  • Loading branch information
PrimozGodec committed Feb 18, 2022
1 parent 12bb88f commit 5d7c550
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 93 deletions.
121 changes: 66 additions & 55 deletions Orange/widgets/data/oweditdomain.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,12 @@ class Unlink(_DataType, namedtuple("Unlink", [])):
"""Unlink variable from its source, that is, remove compute_value"""


Transform = Union[Rename, CategoriesMapping, Annotate, Unlink]
TransformTypes = (Rename, CategoriesMapping, Annotate, Unlink)
class StrpTime(_DataType, namedtuple("StrpTime", ["label", "formats", "have_date", "have_time"])):
    """
    Interpret a variable's string values as time using a chosen format.

    Fields:
      label     -- text identifying the format; the time editor uses it as
                   the combo-box item text and to find the item again.
      formats   -- iterable of format strings tried in order when parsing,
                   or None to let the parser detect the format.
      have_date -- passed to the resulting TimeVariable as ``have_date``;
                   when false, parsed values are reduced to time of day.
      have_time -- passed to the resulting TimeVariable as ``have_time``.
    """


Transform = Union[Rename, CategoriesMapping, Annotate, Unlink, StrpTime]
TransformTypes = (Rename, CategoriesMapping, Annotate, Unlink, StrpTime)

CategoricalTransformTypes = (CategoriesMapping, Unlink)

Expand Down Expand Up @@ -1519,8 +1523,37 @@ class ContinuousVariableEditor(VariableEditor):


class TimeVariableEditor(VariableEditor):
    """
    Variable editor for time variables with a time-format selector.

    The format combo box is only meaningful when another variable type is
    being reinterpreted as time; when the source variable is already a time
    variable the combo box is disabled and no StrpTime transform is emitted.
    """

    def __init__(self, parent=None, **kwargs):
        super().__init__(parent, **kwargs)
        # presumably the form layout built by VariableEditor -- TODO confirm
        form = self.layout().itemAt(0)

        self.format_cb = QComboBox()
        # First entry lets the parser guess; the rest come from TimeVariable.
        for item, data in [("Detect automatically", (None, 1, 1))] + list(
            Orange.data.TimeVariable.ADDITIONAL_FORMATS.items()
        ):
            self.format_cb.addItem(item, StrpTime(item, *data))
        self.format_cb.currentIndexChanged.connect(self.variable_changed)
        form.insertRow(2, "Format:", self.format_cb)

    def _source_is_time(self):
        """Return True when the parent editor's variable is already Time."""
        parent = self.parent()
        return parent is not None and isinstance(parent.var, Time)

    def set_data(self, var, transform=()):
        """
        Show `var` and select the format stored in a StrpTime transform.

        A StrpTime whose label is not among the combo-box entries is ignored
        so the current (valid) selection is kept.
        """
        super().set_data(var, transform)
        if self._source_is_time():
            # when transforming from time to time disable format selection combo
            self.format_cb.setEnabled(False)
        else:
            # select the format from StrpTime transform
            for tr in transform:
                if isinstance(tr, StrpTime):
                    index = self.format_cb.findText(tr.label)
                    # findText returns -1 for an unknown label; selecting -1
                    # would clear the combo and make get_data emit None.
                    if index >= 0:
                        self.format_cb.setCurrentIndex(index)
            self.format_cb.setEnabled(True)

    def get_data(self):
        """
        Return `(var, transforms)`; the selected StrpTime is put first
        unless the source variable is already a time variable.
        """
        var, tr = super().get_data()
        if var is not None and not self._source_is_time():
            # do not add StrpTime when transforming from time to time
            tr.insert(0, self.format_cb.currentData())
        return var, tr


def variable_icon(var):
Expand Down Expand Up @@ -2581,14 +2614,17 @@ def apply_transform_time(var, trs):
def apply_transform_string(var, trs):
    # type: (Orange.data.StringVariable, List[Transform]) -> Orange.data.Variable
    """
    Build a new variable from the string variable `var` and transforms `trs`.

    Rename and Annotate set the name and attributes of the result.  A
    StrpTime transform makes the result a TimeVariable whose values are
    re-parsed from the strings; without one the result is a StringVariable
    that passes the values through unchanged.  When several transforms of
    the same kind are given, the last one takes effect.
    """
    new_name = var.name
    new_annotations = var.attributes
    strp = None
    for tr in trs:
        if isinstance(tr, Rename):
            new_name = tr.name
        elif isinstance(tr, Annotate):
            new_annotations = _parse_attributes(tr.annotations)
        elif isinstance(tr, StrpTime):
            strp = tr

    if strp is None:
        variable = Orange.data.StringVariable(
            name=new_name, compute_value=Identity(var)
        )
    else:
        variable = Orange.data.TimeVariable(
            name=new_name,
            have_date=strp.have_date,
            have_time=strp.have_time,
            compute_value=ReparseTimeTransform(var, tr=strp),
        )
    variable.attributes.update(new_annotations)
    return variable

Expand Down Expand Up @@ -2649,21 +2685,6 @@ def mapper(arr, out=None, dtype=dtype, **kwargs):
return mapper


def time_parse(values: Sequence[str], name="__"):
tvar = Orange.data.TimeVariable(name)
parse_time = ftry(tvar.parse, ValueError, np.nan)
_values = [parse_time(v) for v in values]
if np.all(np.isnan(_values)):
# try parsing it with pandas (like in transform)
dti = pd.to_datetime(values, errors="coerce")
_values = datetime_to_epoch(dti)
date_only = getattr(dti, "_is_dates_only", False)
if np.all(dti != pd.NaT):
tvar.have_date = True
tvar.have_time = not date_only
return tvar, _values


as_string = np.frompyfunc(str, 1, 1)
parse_float = ftry(float, ValueError, float("nan"))

Expand Down Expand Up @@ -2710,24 +2731,16 @@ def apply_reinterpret_d(var, tr, data):
# type: (Orange.data.DiscreteVariable, ReinterpretTransform, ndarray) -> Orange.data.Variable
if isinstance(tr, AsCategorical):
return var
elif isinstance(tr, AsString):
elif isinstance(tr, (AsString, AsTime)):
# TimeVar will be interpreted by StrpTime later
f = Lookup(var, np.array(var.values, dtype=object), unknown="")
rvar = Orange.data.StringVariable(
name=var.name, compute_value=f
)
rvar = Orange.data.StringVariable(name=var.name, compute_value=f)
elif isinstance(tr, AsContinuous):
f = Lookup(var, np.array(list(map(parse_float, var.values))),
unknown=np.nan)
rvar = Orange.data.ContinuousVariable(
name=var.name, compute_value=f, sparse=var.sparse
)
elif isinstance(tr, AsTime):
_tvar, values = time_parse(var.values)
f = Lookup(var, np.array(values), unknown=np.nan)
rvar = Orange.data.TimeVariable(
name=var.name, have_date=_tvar.have_date,
have_time=_tvar.have_time, compute_value=f,
)
else:
assert False
return copy_attributes(rvar, var)
Expand All @@ -2753,14 +2766,11 @@ def apply_reinterpret_c(var, tr, data: MArray):
elif isinstance(tr, AsContinuous):
return var
elif isinstance(tr, AsString):
# TimeVar will be interpreted by StrpTime later
tstr = ToStringTransform(var)
rvar = Orange.data.StringVariable(
name=var.name, compute_value=tstr
)
rvar = Orange.data.StringVariable(name=var.name, compute_value=tstr)
elif isinstance(tr, AsTime):
rvar = Orange.data.TimeVariable(
name=var.name, compute_value=Identity(var)
)
rvar = Orange.data.TimeVariable(name=var.name, compute_value=Identity(var))
else:
assert False
return copy_attributes(rvar, var)
Expand All @@ -2783,14 +2793,9 @@ def apply_reinterpret_s(var: Orange.data.StringVariable, tr, data: MArray):
rvar = Orange.data.ContinuousVariable(
var.name, compute_value=ToContinuousTransform(var)
)
elif isinstance(tr, AsString):
elif isinstance(tr, (AsString, AsTime)):
# TimeVar will be interpreted by StrpTime later
return var
elif isinstance(tr, AsTime):
tvar, _ = time_parse(np.unique(data.data[~data.mask]))
rvar = Orange.data.TimeVariable(
name=var.name, have_date=tvar.have_date, have_time=tvar.have_time,
compute_value=ReparseTimeTransform(var)
)
else:
assert False
return copy_attributes(rvar, var)
Expand Down Expand Up @@ -2822,6 +2827,7 @@ def apply_reinterpret_t(var: Orange.data.TimeVariable, tr, data):
else:
assert False
return copy_attributes(rvar, var)
# TODO: disable the format dropdown when the variable is already time


def orange_isna(variable: Orange.data.Variable, data: ndarray) -> ndarray:
Expand Down Expand Up @@ -2867,23 +2873,28 @@ def transform(self, c):
raise TypeError


def datetime_to_epoch(dti: pd.DatetimeIndex, only_time: bool = False) -> np.ndarray:
    """
    Convert datetimes to seconds as a float array.

    Parameters
    ----------
    dti
        Datetimes to convert.
    only_time
        When false (default), return seconds since 1970-01-01 00:00:00.
        When true, discard the date part and return seconds since midnight
        of each value's own day.

    Returns
    -------
    Array of seconds; missing datetimes (NaT) become NaN.
    """
    # Subtracting each value's own midnight leaves just the time of day.
    origin = dti.normalize() if only_time else pd.Timestamp("1970-01-01")
    return ((dti - origin) / pd.Timedelta("1s")).values


class ReparseTimeTransform(Transformation):
    """
    Re-parse the column's string repr as datetime.

    `tr` is the StrpTime transform describing how to parse: `tr.formats`
    lists the formats to try in order (None means let pandas detect the
    format) and `tr.have_date` tells whether the result keeps the date part.
    """
    def __init__(self, variable, tr):
        super().__init__(variable)
        self.tr = tr

    def transform(self, c):
        """
        Return the column parsed to seconds as a float array.

        The first format that parses at least one value is used; values it
        cannot parse become NaN.  If no format parses anything, the result
        is an all-NaN array of the column's length.
        """
        c = column_str_repr(self.variable, c)
        # if self.tr.formats is None the "detect automatically" option is selected
        formats = self.tr.formats if self.tr.formats is not None else [None]
        for f in formats:
            d = pd.to_datetime(c, errors="coerce", format=f)
            if pd.notnull(d).any():
                return datetime_to_epoch(d, only_time=not self.tr.have_date)
        # Keep the return type consistent with the successful path (an array
        # with one value per input row) instead of a bare scalar NaN.
        return np.full(len(c), np.nan)


class LookupMappingTransform(Transformation):
Expand Down
89 changes: 51 additions & 38 deletions Orange/widgets/data/tests/test_oweditdomain.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
# pylint: disable=all
import pickle
import unittest
from itertools import product
from functools import partial
from itertools import product, chain
from unittest import TestCase
from unittest.mock import Mock, patch

import numpy as np
from numpy.testing import assert_array_equal
import pandas as pd

from AnyQt.QtCore import QItemSelectionModel, Qt, QItemSelection, QPoint
from AnyQt.QtGui import QPalette, QColor, QHelpEvent
Expand All @@ -33,8 +33,8 @@
table_column_data, ReinterpretVariableEditor, CategoricalVector,
VariableEditDelegate, TransformRole,
RealVector, TimeVector, StringVector, make_dict_mapper, DictMissingConst,
LookupMappingTransform, as_float_or_nan, column_str_repr, time_parse,
GroupItemsDialog, VariableListModel
LookupMappingTransform, as_float_or_nan, column_str_repr,
GroupItemsDialog, VariableListModel, StrpTime
)
from Orange.widgets.data.owcolor import OWColor, ColorRole
from Orange.widgets.tests.base import WidgetTest, GuiTest
Expand Down Expand Up @@ -589,8 +589,9 @@ def test_time_editor(self):
),
]
ReinterpretTransforms = {
Categorical: AsCategorical, Real: AsContinuous, Time: AsTime,
String: AsString
Categorical: [AsCategorical], Real: [AsContinuous],
Time: [AsTime, partial(StrpTime, 'Detect automatically', None, 1, 1)],
String: [AsString]
}

def test_reinterpret_editor(self):
Expand All @@ -603,13 +604,13 @@ def test_reinterpret_editor(self):
self.assertEqual(w.get_data(), (data.vtype, [Rename("Z")]))

for vec, tr in product(self.DataVectors, self.ReinterpretTransforms.values()):
w.set_data(vec, [tr()])
w.set_data(vec, [t() for t in tr])
v, tr_ = w.get_data()
self.assertEqual(v, vec.vtype)
if not tr_:
self.assertEqual(tr, self.ReinterpretTransforms[type(v)])
else:
self.assertEqual(tr_, [tr()])
self.assertListEqual(tr_, [t() for t in tr])

def test_reinterpret_editor_simulate(self):
w = ReinterpretVariableEditor()
Expand All @@ -619,7 +620,9 @@ def cb():
var, tr = w.get_data()
type_ = tc.currentData()
if type_ is not type(var):
self.assertEqual(tr, [self.ReinterpretTransforms[type_](), Rename("Z")])
self.assertEqual(
tr, [t() for t in self.ReinterpretTransforms[type_]] + [Rename("Z")]
)
else:
self.assertEqual(tr, [Rename("Z")])

Expand Down Expand Up @@ -912,34 +915,58 @@ def test_as_continuous(self):
)

def test_as_time(self):
table = self.data
domain = table.domain
# this test only test type of format that can be string, continuous and discrete
# correctness of time formats is already tested in TimeVariable module
d = TimeVariable("_").parse_exact_iso
times = (
["07.02.2022", "18.04.2021"], # date only
["07.02.2022 01:02:03", "18.04.2021 01:02:03"], # datetime
["010203", "010203"], # time
["02-07", "04-18"],
)
formats = ["25.11.2021", "25.11.2021 00:00:00", "000000", "11-25"]
expected = [
[d("2022-02-07"), d("2021-04-18")],
[d("2022-02-07 01:02:03"), d("2021-04-18 01:02:03")],
[d("01:02:03"), d("01:02:03")],
[d("1900-02-07"), d("1900-04-18")],
]
variables = [StringVariable(f"s{i}") for i in range(len(times))]
variables += [DiscreteVariable(f"d{i}", values=t) for i, t in enumerate(times)]
domain = Domain([], metas=variables)
metas = [t for t in times] + [list(range(len(x))) for x in times]
table = Table(domain, np.empty((len(times[0]), 0)), metas=np.array(metas).transpose())

tr = AsTime()
dtr = []
for v in domain.variables:
vtr = apply_reinterpret(v, tr, table_column_data(table, v))
for v, f in zip(domain.metas, chain(formats, formats)):
strp = StrpTime(f, *TimeVariable.ADDITIONAL_FORMATS[f])
vtr = apply_transform_var(
apply_reinterpret(v, tr, table_column_data(table, v)), [strp]
)
dtr.append(vtr)

ttable = table.transform(Domain(dtr))
ttable = table.transform(Domain([], metas=dtr))
assert_array_equal(
ttable.X,
np.array([
[np.nan, np.nan, 0.25, 180],
[np.nan, np.nan, 1.25, 360],
[np.nan, np.nan, 0.20, 720],
[np.nan, np.nan, 0.00, 000],
], dtype=float)
ttable.metas,
np.array(list(chain(expected, expected)), dtype=float).transpose()
)

def test_reinterpret_string(self):
table = self.data_str
domain = table.domain
tvars = []
for v in domain.metas:
for i, tr in enumerate([AsContinuous(), AsCategorical(), AsTime(), AsString()]):
tr = apply_reinterpret(v, tr, table_column_data(table, v)).renamed(f'{v.name}_{i}')
tvars.append(tr)
for i, tr in enumerate(
[AsContinuous(), AsCategorical(), AsTime(), AsString()]
):
vtr = apply_reinterpret(v, tr, table_column_data(table, v)).renamed(
f"{v.name}_{i}"
)
if isinstance(tr, AsTime):
strp = StrpTime("Detect automatically", None, 1, 1)
vtr = apply_transform_var(vtr, [strp])
tvars.append(vtr)
tdomain = Domain([], metas=tvars)
ttable = table.transform(tdomain)
assert_array_nanequal(
Expand Down Expand Up @@ -1039,19 +1066,6 @@ def test_column_str_repr(self):
d = column_str_repr(v, np.array([0., np.nan, 1.0]))
assert_array_equal(d, ["00:00:00", "?", "00:00:01"])

def test_time_parse(self):
"""parsing additional datetimes by pandas"""
date = ["1/22/20", "1/23/20", "1/24/20"]
# we use a private method; check that it still exists
assert hasattr(pd.DatetimeIndex, '_is_dates_only')

tval, values = time_parse(date)

self.assertTrue(tval.have_date)
self.assertFalse(tval.have_time)
self.assertListEqual(list(values),
[1579651200.0, 1579737600.0, 1579824000.0])


class TestLookupMappingTransform(TestCase):
def setUp(self) -> None:
Expand Down Expand Up @@ -1220,4 +1234,3 @@ def _test_correctness():

if __name__ == '__main__':
unittest.main()

0 comments on commit 5d7c550

Please sign in to comment.