Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Datetime format selection #5819

Merged
merged 2 commits into from
Mar 11, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 112 additions & 0 deletions Orange/data/tests/test_variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from io import StringIO

import numpy as np
import pandas as pd
import scipy.sparse as sp

from Orange.data import Variable, ContinuousVariable, DiscreteVariable, \
Expand Down Expand Up @@ -698,6 +699,117 @@ def varcls_modified(self, name):
var.have_time = 1
return var

def test_additional_formats(self):
    """Check that sample strings parse to the expected datetime.

    Every sample is parsed with each format registered under its
    ``TimeVariable.ADDITIONAL_FORMATS`` key; at least one of those
    formats must yield the expected value.
    """
    # Each case pairs the datetime that all of its samples must parse to
    # with a mapping {ADDITIONAL_FORMATS key: samples written in that style}.
    cases = (
        # dates only
        (
            datetime(2022, 2, 7),
            {
                "2021-11-25": ("2022-02-07",),
                "25.11.2021": (
                    "07.02.2022",
                    "07. 02. 2022",
                    "7.2.2022",
                    "7. 2. 2022",
                ),
                "25.11.21": ("07.02.22", "07. 02. 22", "7.2.22", "7. 2. 22"),
                "11/25/2021": ("02/07/2022", "2/7/2022"),
                "11/25/21": ("02/07/22", "2/7/22"),
                "20211125": ("20220207",),
            },
        ),
        # dates with times given up to seconds or fractions of a second
        (
            datetime(2022, 2, 7, 10, 11, 12),
            {
                "2021-11-25 00:00:00": (
                    "2022-02-07 10:11:12",
                    "2022-02-07 10:11:12.00",
                ),
                "25.11.2021 00:00:00": (
                    "07.02.2022 10:11:12",
                    "07. 02. 2022 10:11:12",
                    "7.2.2022 10:11:12",
                    "7. 2. 2022 10:11:12",
                    "07.02.2022 10:11:12.00",
                    "07. 02. 2022 10:11:12.00",
                    "7.2.2022 10:11:12.00",
                    "7. 2. 2022 10:11:12.00",
                ),
                "25.11.21 00:00:00": (
                    "07.02.22 10:11:12",
                    "07. 02. 22 10:11:12",
                    "7.2.22 10:11:12",
                    "7. 2. 22 10:11:12",
                    "07.02.22 10:11:12.00",
                    "07. 02. 22 10:11:12.00",
                    "7.2.22 10:11:12.00",
                    "7. 2. 22 10:11:12.00",
                ),
                "11/25/2021 00:00:00": (
                    "02/07/2022 10:11:12",
                    "2/7/2022 10:11:12",
                    "02/07/2022 10:11:12.00",
                    "2/7/2022 10:11:12.00",
                ),
                "11/25/21 00:00:00": (
                    "02/07/22 10:11:12",
                    "2/7/22 10:11:12",
                    "02/07/22 10:11:12.00",
                    "2/7/22 10:11:12.00",
                ),
                "20211125000000": ("20220207101112", "20220207101112.00"),
            },
        ),
        # dates with times given without seconds
        (
            datetime(2022, 2, 7, 10, 11, 0),
            {
                "2021-11-25 00:00:00": ("2022-02-07 10:11",),
                "25.11.2021 00:00:00": (
                    "07.02.2022 10:11",
                    "07. 02. 2022 10:11",
                    "7.2.2022 10:11",
                    "7. 2. 2022 10:11",
                ),
                "25.11.21 00:00:00": (
                    "07.02.22 10:11",
                    "07. 02. 22 10:11",
                    "7.2.22 10:11",
                    "7. 2. 22 10:11",
                ),
                "11/25/2021 00:00:00": ("02/07/2022 10:11", "2/7/2022 10:11"),
                "11/25/21 00:00:00": ("02/07/22 10:11", "2/7/22 10:11"),
                "20211125000000": ("202202071011",),
            },
        ),
        # times only; the expected values carry the date 1900-01-01
        (
            datetime(1900, 1, 1, 10, 11, 12),
            {
                "00:00:00": ("10:11:12", "10:11:12.00"),
                "000000": ("101112", "101112.00"),
            },
        ),
        # times only, without seconds
        (
            datetime(1900, 1, 1, 10, 11, 0),
            {
                "00:00:00": ("10:11",),
            },
        ),
        # year only (the sample is an int, not a string)
        (
            datetime(2022, 1, 1),
            {
                "2021": (2022,),
            },
        ),
        # month and day only; the expected value carries the year 1900
        (
            datetime(1900, 2, 7),
            {
                "11-25": ("02-07",),
                "25.11.": ("07.02.", "07. 02.", "7.2.", "7. 2."),
                "11/25": ("02/07", "2/7"),
            },
        ),
    )

    for expected, samples_by_key in cases:
        for key, samples in samples_by_key.items():
            formats = TimeVariable.ADDITIONAL_FORMATS[key][0]
            for sample in samples:
                # errors="coerce" turns a failed parse into a null value
                # instead of raising, so every format yields one entry.
                parsed = [
                    pd.to_datetime(sample, format=fmt, errors="coerce")
                    for fmt in formats
                ]
                hits = [value == expected for value in parsed]
                # at least one registered format must give the expected value
                self.assertTrue(any(hits))
                # at least one result is either the expected value or null
                # NOTE(review): because this uses any(), it already holds
                # whenever the assertion above passes. The original comment
                # ("only nan or expected") suggests all() was intended --
                # confirm before tightening, since a compact format may
                # parse a shorter sample to a different time.
                self.assertTrue(
                    any(
                        hit or pd.isnull(value)
                        for hit, value in zip(hits, parsed)
                    )
                )


PickleContinuousVariable = create_pickling_tests(
"PickleContinuousVariable",
Expand Down
80 changes: 76 additions & 4 deletions Orange/data/variable.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,7 @@ class TimeVariable(ContinuousVariable):

If time is specified without a date, Unix epoch is assumed.

If time is specified wihout an UTC offset, localtime is assumed.
If time is specified without an UTC offset, localtime is assumed.
"""
_all_vars = {}
TYPE_HEADERS = ('time', 't')
Expand Down Expand Up @@ -923,15 +923,86 @@ class TimeVariable(ContinuousVariable):
r'\d{1,4}(-?\d{2,3})?'
r')$')

# Additional datetime formats, keyed by an example string that illustrates
# the style. Each value is a 3-tuple:
#   (tuple of strptime-style format strings for that style, int, int)
# NOTE(review): the two ints look like have_date / have_time flags
# (1, 0 on date-only entries; 0, 1 on time-only; 1, 1 on date+time) --
# confirm against the code that consumes this table.
ADDITIONAL_FORMATS = {
"2021-11-25": (("%Y-%m-%d",), 1, 0),
"25.11.2021": (("%d.%m.%Y", "%d. %m. %Y"), 1, 0),
"25.11.21": (("%d.%m.%y", "%d. %m. %y"), 1, 0),
"11/25/2021": (("%m/%d/%Y",), 1, 0),
"11/25/21": (("%m/%d/%y",), 1, 0),
"20211125": (("%Y%m%d",), 1, 0),
# Listing every time precision as its own entry (up to minutes, up to
# seconds, up to fractional seconds) would create too many options, so all
# three variants are grouped under the single "00:00:00" style.
"2021-11-25 00:00:00": (
(
"%Y-%m-%d %H:%M",
"%Y-%m-%d %H:%M:%S",
"%Y-%m-%d %H:%M:%S.%f",
),
1,
1,
),
"25.11.2021 00:00:00": (
(
"%d.%m.%Y %H:%M",
"%d. %m. %Y %H:%M",
"%d.%m.%Y %H:%M:%S",
"%d. %m. %Y %H:%M:%S",
"%d.%m.%Y %H:%M:%S.%f",
"%d. %m. %Y %H:%M:%S.%f",
),
1,
1,
),
"25.11.21 00:00:00": (
(
"%d.%m.%y %H:%M",
"%d. %m. %y %H:%M",
"%d.%m.%y %H:%M:%S",
"%d. %m. %y %H:%M:%S",
"%d.%m.%y %H:%M:%S.%f",
"%d. %m. %y %H:%M:%S.%f",
),
1,
1,
),
"11/25/2021 00:00:00": (
(
"%m/%d/%Y %H:%M",
"%m/%d/%Y %H:%M:%S",
"%m/%d/%Y %H:%M:%S.%f",
),
1,
1,
),
"11/25/21 00:00:00": (
(
"%m/%d/%y %H:%M",
"%m/%d/%y %H:%M:%S",
"%m/%d/%y %H:%M:%S.%f",
),
1,
1,
),
"20211125000000": (("%Y%m%d%H%M", "%Y%m%d%H%M%S", "%Y%m%d%H%M%S.%f"), 1, 1),
"00:00:00": (("%H:%M", "%H:%M:%S", "%H:%M:%S.%f"), 0, 1),
"000000": (("%H%M", "%H%M%S", "%H%M%S.%f"), 0, 1),
"2021": (("%Y",), 1, 0),
"11-25": (("%m-%d",), 1, 0),
"25.11.": (("%d.%m.", "%d. %m."), 1, 0),
"11/25": (("%m/%d",), 1, 0),
"1125": (("%m%d",), 1, 0),
}

class InvalidDateTimeFormatError(ValueError):
    """ValueError whose message reports an unsupported datetime string.

    Parameters
    ----------
    date_string : str
        The offending input; it is quoted verbatim in the error message.
    """

    def __init__(self, date_string):
        # A single f-string replaces the earlier str.format() call; the
        # resulting message text is unchanged.
        super().__init__(
            f"Invalid datetime format '{date_string}'. Only ISO 8601 supported."
        )

_matches_iso_format = re.compile(REGEX).match

# If parsed datetime values provide an offset or timezone, it is used for display.
# If parsed datetime values provide an offset or timezone, it is used for display.
# If not all values have the same offset, +0000 (=UTC) timezone is used
_timezone = None

Expand Down Expand Up @@ -1011,6 +1082,7 @@ def parse(self, datestr):
"""
if datestr in MISSING_VALUES:
return Unknown

datestr = datestr.strip().rstrip('Z')
datestr = self._tzre_sub(datestr)

Expand Down
Loading