Source code for pipeline.src.helpers.dates

from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import List, Union

import pandas as pd


@dataclass
class Period:
    """
    Time range delimited by a `start` and an `end` datetime.
    """

    # Beginning of the time range.
    start: datetime
    # End of the time range.
    end: datetime





[docs]
def make_periods(
    start_datetime_utc: datetime,
    end_datetime_utc: datetime,
    period_duration: timedelta,
    overlap: Union[None, timedelta] = None,
) -> List[Period]:
    """
    Returns a list of `Period` of duration `period_duration` covering the time range
    from `start_datetime_utc` to `end_datetime_utc`.

    If `overlap` is specified, the `Period` returned will overlap by the amount
    specified, otherwise the end of one period will coincide with the start of the
    next one.

    The last `Period` always ends on `end_datetime_utc`, so it may be shorter than
    `period_duration`.

    If `period_duration` is longer than or equal to the time between
    `start_datetime_utc` and `end_datetime_utc`, returns a list with a single
    `Period` starting on `start_datetime_utc` and ending on `end_datetime_utc`.

    This is useful to break a long time range into smaller periods for processing time
    series data that would take up too much memory to handle in one piece.

    Args:
        start_datetime_utc (datetime): start of the period to cover
        end_datetime_utc (datetime): end of the period to cover
        period_duration (timedelta): duration of the individual
          periods returned
        overlap (Union[None, timedelta]): overlap between successive
          periods, if specified. Defaults to `None`.

    Returns:
        List[Period]: periods in chronological order of their start.

    Raises:
        ValueError: if `period_duration` is not strictly greater than `overlap`,
          or if `end_datetime_utc` is before `start_datetime_utc` (or before the
          start of a later period, which can only happen with a negative
          `overlap`).
    """

    if not overlap:
        overlap = timedelta(0)

    # Explicit checks rather than `assert`: assertions are removed when Python runs
    # with -O, which would silently disable the validation.
    if period_duration <= overlap:
        raise ValueError("'period_duration' cannot be shorter than 'overlap'.")

    if end_datetime_utc < start_datetime_utc:
        raise ValueError(
            "'end_datetime_utc' cannot be before than 'start_datetime_utc'."
        )

    # Strictly positive thanks to the check above, so the loop always terminates.
    step = period_duration - overlap

    # Built iteratively: one recursion level per period would hit the interpreter's
    # recursion limit on long ranges, and prepending to a list is quadratic.
    periods = []
    period_start = start_datetime_utc
    while end_datetime_utc - period_start > period_duration:
        periods.append(
            Period(start=period_start, end=period_start + period_duration)
        )
        period_start = period_start + step

        # Only reachable with a negative `overlap` (gaps between periods), when the
        # next start jumps past the end of the range.
        if end_datetime_utc < period_start:
            raise ValueError(
                "'end_datetime_utc' cannot be before than 'start_datetime_utc'."
            )

    # What remains fits in a single (possibly shorter) period.
    periods.append(Period(start=period_start, end=end_datetime_utc))
    return periods




[docs]
def get_datetime_intervals(
    s: pd.Series, unit: str = None, how: str = "backward"
) -> pd.Series:
    """
    Compute the time elapsed between consecutive values of a datetime Series.

    The result has the same index as `s`. The position that has no neighbour to
    compare with (the first one for 'backward', the last one for 'forward') holds a
    missing value.

    Args:
        s (Series): pandas Series with datetime dtype
        unit (Union[str, None]):

          - if `None`, intervals are returned as pandas `Timedelta`
          - otherwise must be one of 's', 'min' or 'h', and intervals are returned
            as floats expressed in that unit.

          Defaults to `None`.
        how (str): 'backward' measures the interval between each position and the
          previous one, 'forward' the interval between each position and the next
          one.
          Defaults to 'backward'

    Returns:
        pd.Series: Series of time intervals between the values of the input Series

    Raises:
        ValueError: if `how` or `unit` is not one of the accepted values.
    """

    if how not in ("backward", "forward"):
        raise ValueError(f"how expects 'backward' or 'forward', got '{how}'")

    # +1 compares with the previous row, -1 with the next one. Multiplying the
    # difference by the same sign keeps the interval oriented "later minus earlier".
    step = 1 if how == "backward" else -1

    current = s.values
    neighbour = s.shift(step).values
    intervals = pd.Series(data=step * (current - neighbour), index=s.index)

    if not unit:
        return intervals

    seconds = intervals.map(lambda delta: delta.total_seconds())

    if unit == "s":
        return seconds
    if unit == "min":
        return seconds / 60
    if unit == "h":
        return seconds / 3600

    raise ValueError(f"unit must be None, 'h', 'min' or 's', got '{unit}'.")




[docs]
def _replace_year_clamped(moment: datetime, year: int) -> datetime:
    """
    Return `moment` moved to `year`, mapping February 29th to February 28th when
    `year` is not a leap year.
    """
    try:
        return moment.replace(year=year)
    except ValueError:
        # February 29th has no counterpart in a non-leap year; anything else is a
        # genuine error and must not be hidden.
        if (moment.month, moment.day) != (2, 29):
            raise
        return moment.replace(year=year, day=28)


def is_in_validity_period(
    validity_start_date: Union[datetime, None],
    validity_end_date: Union[datetime, None],
    repeat_each_year: bool,
    sample_date: datetime,
) -> bool:
    """
    Check if a sample_date falls within a validity period.

    When `repeat_each_year` is True and both bounds are set, `sample_date` is
    compared to the period after being moved to the year of the period start and
    to the year of the period end (the latter handles periods that span a new
    year). Dates from a year earlier than the bounds are not moved, so the period
    only repeats forward in time. A period lasting 365 days or more is treated as
    open-ended once started. A `sample_date` on February 29th is treated as
    February 28th when moved to a non-leap year.

    Args:
        validity_start_date: Start of validity period (None means no start constraint)
        validity_end_date: End of validity period (None means no end constraint)
        repeat_each_year: If True, the validity period repeats annually. Only taken
          into account when both bounds are set.
        sample_date: Date to check against the validity period

    Returns:
        True if sample_date is within the validity period, False otherwise
    """
    has_no_start = pd.isna(validity_start_date)
    has_no_end = pd.isna(validity_end_date)

    if has_no_start and has_no_end:
        return True

    if has_no_start:
        return sample_date <= validity_end_date

    if has_no_end:
        return sample_date >= validity_start_date

    if not repeat_each_year:
        return validity_start_date <= sample_date <= validity_end_date

    validity_duration = validity_end_date - validity_start_date
    if validity_duration >= timedelta(days=365):
        # The period covers the whole year: once started it never ends.
        return sample_date >= validity_start_date

    # `min` leaves dates from years before the bounds untouched, so they are
    # compared as-is against the period.
    candidates = (
        _replace_year_clamped(
            sample_date, min(validity_start_date.year, sample_date.year)
        ),
        _replace_year_clamped(
            sample_date, min(validity_end_date.year, sample_date.year)
        ),
    )
    return any(
        validity_start_date <= candidate <= validity_end_date
        for candidate in candidates
    )