Skip to content

Commit

Permalink
Merge pull request #91 from Zeutschler/dev
Browse files Browse the repository at this point in the history
datespan package added
  • Loading branch information
Zeutschler authored Sep 23, 2024
2 parents 22d10e6 + 5e49353 commit 7701dbd
Show file tree
Hide file tree
Showing 10 changed files with 1,304 additions and 365 deletions.
297 changes: 146 additions & 151 deletions .idea/workspace.xml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cubedpandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from cubedpandas.schema.schema import Schema
from cubedpandas.settings import CachingStrategy

__version__ = "0.2.33"
__version__ = "0.2.34"
__author__ = "Thomas Zeutschler"
__copyright__ = "(C) 2024 Thomas Zeutschler"

Expand Down
122 changes: 45 additions & 77 deletions cubedpandas/context/context_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import numpy as np
import pandas as pd
from datespan import DateSpanSet
from datespan import DateSpanSet, DateSpan

from cubedpandas.context.enums import ContextFunction
from cubedpandas.context.context import Context
Expand Down Expand Up @@ -210,78 +210,47 @@ def resolve(parent: Context, address, dynamic_attribute: bool = False,

# special case for datetime dimensions!
if dimension is not None and pd.api.types.is_datetime64_any_dtype(dimension.dtype):
# maybe the address is a datetime token like "today", "last week", "next month", etc.
its_a_valid_date = False
# As arbitrary date expressions can be used, we use the datespan package to resolve them.
dss: DateSpanSet | None = None

if isinstance(address, datetime.datetime):
from_date = to_date = address
its_a_valid_date = True
elif isinstance(address, str):
# NEW Implementation using datespan package (https://github.com/Zeutschler/datespan):
try:
dss = DateSpanSet(address)
its_a_valid_date = True
except ValueError as e:
ValueError(e)
its_a_valid_date = False

# OLD Implementation:
# # We need to parse the date token, either it's a date string, e.g. "2021-01-01"
# # or a date token, e.g. "today", "yesterday", "last week", "next month", etc.
# its_a_valid_date, from_date, to_date = parse_standard_date_token(address)
# if not its_a_valid_date:
# from_date, to_date = resolve_datetime(address)
# its_a_valid_date = (from_date, to_date) != (None, None)

elif isinstance(address, slice):
# We might have a date range, e.g. "2021-01-01":"2021-12-31" or "last year":"today"
from_date = address.start
to_date = address.stop

if isinstance(from_date, datetime.datetime):
its_a_valid_date = True
elif isinstance(from_date, str):
its_a_valid_date, from_date, result_not_of_interest = parse_standard_date_token(from_date)
if not its_a_valid_date:
fd, result_not_of_interest = resolve_datetime(from_date)
its_a_valid_date = (fd, result_not_of_interest) != (None, None)
if its_a_valid_date:
from_date = fd
else:
raise ValueError(f"Invalid date token '{from_date}' in address '{address}'.")

if isinstance(to_date, datetime.datetime):
its_a_valid_date = True
elif isinstance(to_date, str):
its_a_valid_date, result_not_of_interest, to_date = parse_standard_date_token(to_date)
if not its_a_valid_date:
result_not_of_interest, td = resolve_datetime(to_date)
its_a_valid_date = (result_not_of_interest, td) != (None, None)
if its_a_valid_date:
to_date = td
else:
raise ValueError(f"Invalid date token '{to_date}' in address '{address}'.")

if its_a_valid_date:
parent_row_mask = parent._get_row_mask(before_dimension=dimension)
if dss is not None:
filter_func = dss.to_df_lambda()
filter_func_Source = dss.to_df_lambda(return_source_code=True)
if parent_row_mask is not None:
series = cube.df[dimension.column][parent_row_mask]
bool_mask = filter_func(series)
else:
series = cube.df[dimension.column]
bool_mask = filter_func(series)
new_row_mask = cube.df[bool_mask].index.to_numpy()
exists = len(new_row_mask) > 0
try:
if isinstance(address, slice): # e.g. "2021-01-01":"2021-12-31"
# the 'step' attribute of a slice object will be ignored
start, stop = address.start, address.stop
if start is None and stop is None:
raise ValueError(f"Invalid date range slice '{address}'. "
f"Both start and stop of slice are None.")
if start: # from start to datetime.max
start_date = DateSpanSet(start).start
stop_date = DateSpan.MAX_DATE
elif stop: # from datetime.min to stop
start_date = DateSpan.MIN_DATE
stop_date = DateSpanSet(stop).end
else: # from start to stop
start_date = DateSpanSet(start).start
stop_date = DateSpanSet(stop).end

dss = DateSpanSet(DateSpan(start_date, stop_date))
else:
exists, new_row_mask, member_mask = dimension._check_exists_and_resolve_member(
(from_date, to_date), parent_row_mask, member_mask, skip_checks=True,
evaluate_as_range=True)

# let's create a member context, independent of the result
dss = DateSpanSet(address)
except Exception as e:
raise ValueError(f"Invalid date token '{address}' in address '{address}'. {e}")

# Filter using the datespan package
parent_row_mask = parent._get_row_mask(before_dimension=dimension)
filter_func = dss.to_df_lambda()
# filter_func_Source = dss.to_df_lambda(return_source_code=True) # for debugging only
# if "year=9999" in filter_func_Source:
# pass
if parent_row_mask is not None:
series = cube.df[dimension.column][parent_row_mask]
bool_mask = filter_func(series)
else:
series = cube.df[dimension.column]
bool_mask = filter_func(series)
new_row_mask = cube.df[bool_mask].index.to_numpy()
if len(new_row_mask) > 0:
# some records were found
from cubedpandas.schema.member import Member, MemberSet
member = Member(dim, address)
members = MemberSet(dimension=dim, address=address, row_mask=new_row_mask,
Expand All @@ -291,13 +260,12 @@ def resolve(parent: Context, address, dynamic_attribute: bool = False,
row_mask=new_row_mask, member_mask=member_mask,
measure=measure, dimension=dim,
members=members, resolve=False)
if not exists:
# If no records were found, we will return a context with an empty row mask
from cubedpandas.context.member_not_found_context import MemberNotFoundContext
resolved_context = MemberNotFoundContext(cube=cube, parent=parent, address=address,
dimension=dim)

return resolved_context
else:
# no records were found, we will return a context with an empty row mask
from cubedpandas.context.member_not_found_context import MemberNotFoundContext
resolved_context = MemberNotFoundContext(cube=cube, parent=parent, address=address,
dimension=dim)
return resolved_context


if not dynamic_attribute:
Expand Down
7 changes: 4 additions & 3 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ value = df.loc[(df['make'] == 'Audi') &
can turn into this equivalent CubedPandas code:

```python
# CubedPandas: calculate the total revenue of all hybrid Audi cars in September 2024
value = df.cubed.make.Audi.engine.hybrid.date.september_2024.price.sum
# or even shorter
# ...the same with CubedPandas:
value = df.cubed.make.Audi.engine.hybrid.date.september_2024.price

# ...or even shorter
value = df.cubed.Audi.hybrid.sep_2024.price
```

Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
numpy>=1.26.1
pandas>=1.5.2
python-dateutil>=2.8.2
datespan>=0.2.4
datespan>=0.2.8

# for future use
matplotlib
Expand All @@ -11,7 +11,7 @@ scikit-learn


# for server sample
fastapi>=0.88.0
fastapi>=0.109.1
uvicorn>=0.20.0


Expand Down
Loading

0 comments on commit 7701dbd

Please sign in to comment.