Source code for pipeline.src.helpers.dates

from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import List, Union

import pandas as pd


@dataclass
class Period:
    """Time range delimited by a `start` and an `end` datetime."""

    # Beginning of the time range.
    start: datetime
    # End of the time range.
    end: datetime
def make_periods(
    start_datetime_utc: datetime,
    end_datetime_utc: datetime,
    period_duration: timedelta,
    overlap: Union[None, timedelta] = None,
) -> List[Period]:
    """
    Returns a list of `Period` of duration `period_duration` covering the time
    range from `start_datetime_utc` to `end_datetime_utc`. If `overlap` is
    specified, the `Period` returned will overlap by the amount specified,
    otherwise the end of one period will coincide with the start of the next
    one.

    The last `Period` always ends on `end_datetime_utc` and may therefore be
    shorter than `period_duration`. If `period_duration` is longer than (or
    equal to) the time between `start_datetime_utc` and `end_datetime_utc`,
    returns a list with a single `Period` starting on `start_datetime_utc`
    and ending on `end_datetime_utc`.

    This is useful to break a long time range into smaller periods for
    processing time series data that would take up too much memory to handle
    in one piece.

    Args:
        start_datetime_utc (datetime): start of the period to cover
        end_datetime_utc (datetime): end of the period to cover
        period_duration (timedelta): duration of the individual periods
          returned
        overlap (Union[None, timedelta]): overlap between successive periods,
          if specified. Defaults to `None`.

    Returns:
        List[Period]: periods in chronological order.

    Raises:
        ValueError: if `period_duration` is not strictly greater than
          `overlap`, or if `end_datetime_utc` is before `start_datetime_utc`.
    """
    if not overlap:
        overlap = timedelta(0)

    # Explicit raises instead of `assert`: assertions are removed when Python
    # runs with -O, which would let an invalid `overlap` loop forever.
    if not period_duration > overlap:
        raise ValueError("'period_duration' cannot be shorter than 'overlap'.")

    if not end_datetime_utc >= start_datetime_utc:
        raise ValueError(
            "'end_datetime_utc' cannot be before than 'start_datetime_utc'."
        )

    # Distance between the starts of two successive periods. Strictly positive
    # thanks to the validation above, so the loop below always terminates.
    step = period_duration - overlap

    # Built iteratively so that the number of periods is not bounded by the
    # interpreter's recursion limit.
    periods = []
    period_start = start_datetime_utc
    while end_datetime_utc - period_start > period_duration:
        periods.append(
            Period(start=period_start, end=period_start + period_duration)
        )
        period_start = period_start + step
        # Can only trigger with a negative `overlap` (i.e. gaps between
        # periods) that makes the next start jump past the end of the range.
        if not end_datetime_utc >= period_start:
            raise ValueError(
                "'end_datetime_utc' cannot be before than 'start_datetime_utc'."
            )

    # Final (possibly shorter) period, ending exactly on `end_datetime_utc`.
    periods.append(Period(start=period_start, end=end_datetime_utc))
    return periods
def get_datetime_intervals(
    s: pd.Series, unit: str = None, how: str = "backward"
) -> pd.Series:
    """
    Compute the time intervals between successive values of a datetime Series.

    The result has the same index as the input. The position that has no
    neighbour to compare with (the first one for 'backward', the last one for
    'forward') holds a missing value.

    Args:
        s (Series): pandas Series with datetime dtype
        unit (Union[str, None]):
          - if `None` (or empty), values are returned as pandas `Timedelta`
          - otherwise must be one of 's', 'min' or 'h', in which case values
            are returned as floats expressed in that unit.

          Defaults to `None`.
        how (str): 'backward' computes, at each position, the interval since
          the previous value; 'forward' computes the interval until the next
          value. Defaults to 'backward'.

    Returns:
        pd.Series: Series of time intervals between the values of the input
        Series

    Raises:
        ValueError: if `how` or `unit` is not one of the supported values.
    """
    if how not in ("backward", "forward"):
        raise ValueError(f"how expects 'backward' or 'forward', got '{how}'")
    shift = 1 if how == "backward" else -1

    # Multiplying by `shift` flips the sign in the 'forward' case so that the
    # result is always "later value minus earlier value".
    current_values = s.values
    neighbour_values = s.shift(shift).values
    deltas = shift * (current_values - neighbour_values)
    intervals = pd.Series(data=deltas, index=s.index)

    if not unit:
        return intervals

    seconds = intervals.map(lambda delta: delta.total_seconds())
    if unit == "s":
        return seconds
    if unit == "min":
        return seconds / 60
    if unit == "h":
        return seconds / 3600
    raise ValueError(f"unit must be None, 'h', 'min' or 's', got '{unit}'.")
def _replace_year_clamped(date: datetime, year: int) -> datetime:
    """Return `date` moved to `year`, mapping Feb 29 to Feb 28 if needed."""
    try:
        return date.replace(year=year)
    except ValueError:
        # `replace` rejects Feb 29 when the target year is not a leap year.
        if date.month == 2 and date.day == 29:
            return date.replace(year=year, day=28)
        raise


def is_in_validity_period(
    validity_start_date: Union[None, datetime],
    validity_end_date: Union[None, datetime],
    repeat_each_year: bool,
    sample_date: datetime,
) -> bool:
    """
    Check if a sample_date falls within a validity period.

    When `repeat_each_year` is True and both bounds are set, the sample date
    is transposed to the year of the validity start and to the year of the
    validity end (never to a year later than its own) and is considered valid
    if either transposed date falls within the period. Testing both years
    handles periods that span a year boundary. A sample dated Feb 29 is
    treated as Feb 28 when the target year is not a leap year. Sample dates
    earlier than the first occurrence of the period are never valid.

    Args:
        validity_start_date: Start of validity period (a missing value, as
            detected by `pd.isna`, means no start constraint)
        validity_end_date: End of validity period (a missing value, as
            detected by `pd.isna`, means no end constraint)
        repeat_each_year: If True, the validity period repeats annually.
            Ignored when either bound is missing.
        sample_date: Date to check against the validity period

    Returns:
        True if sample_date is within the validity period, False otherwise
    """
    if pd.isna(validity_start_date) and pd.isna(validity_end_date):
        return True

    if pd.isna(validity_start_date):
        return sample_date <= validity_end_date

    if pd.isna(validity_end_date):
        return sample_date >= validity_start_date

    if not repeat_each_year:
        return validity_start_date <= sample_date <= validity_end_date

    validity_duration = validity_end_date - validity_start_date
    one_year = timedelta(days=365)

    # A period repeated yearly that lasts a year or more covers every date
    # from its start onwards.
    if validity_duration >= one_year:
        return sample_date >= validity_start_date

    # `min` keeps the sample in its own year when it predates the period, so
    # that dates before the first occurrence are not matched.
    sample_in_start_year = _replace_year_clamped(
        sample_date, min(validity_start_date.year, sample_date.year)
    )
    if validity_start_date <= sample_in_start_year <= validity_end_date:
        return True

    sample_in_end_year = _replace_year_clamped(
        sample_date, min(validity_end_date.year, sample_date.year)
    )
    return validity_start_date <= sample_in_end_year <= validity_end_date