Source code for pylimer_tools.utils.data_utility

import warnings

import pandas as pd



[docs]
def get_tail(data, percentage=0.2, min_n=25, max_percentage=0.5):
    """
    Extract the last few entries of a list, DataFrame or Series.

    :param data: The list, DataFrame, or Series to extract the last few entries from
    :type data: list or pd.DataFrame or pd.Series
    :param percentage: The fraction of entries to extract (default: 0.2)
    :type percentage: float
    :param min_n: The minimum number of entries to extract (default: 25)
    :type min_n: int
    :param max_percentage: The largest fraction of the data that `min_n`
        is allowed to claim (default: 0.5)
    :type max_percentage: float
    :return: A subset of the input data containing the last entries according to the specified criteria
    :rtype: Same type as input data
    :raises AssertionError: if `percentage` or `max_percentage` exceeds 1

    The number of returned entries is computed as
    ``int(min(max(min(min_n, max_percentage * n), percentage * n), n))``
    with ``n = len(data)``. In words: ideally `percentage` of the entries,
    but at least `min_n` of them, where that lower bound is itself capped at
    `max_percentage` of the data; the result never exceeds the input length.
    The value is truncated towards zero, so very short inputs may yield an
    empty result (e.g. a single-element input with the default arguments).
    """
    assert percentage <= 1
    assert max_percentage <= 1
    n_total = len(data)
    # Lower bound: `min_n` entries, but never more than `max_percentage` of the data.
    floor_n = min(min_n, max_percentage * n_total)
    tail_n = int(min(max(floor_n, percentage * n_total), n_total))
    if isinstance(data, (pd.DataFrame, pd.Series)):
        return data.tail(tail_n)
    if tail_n == 0:
        # `data[-0:]` is `data[0:]`, i.e. the *whole* sequence; return an empty
        # slice of the same type instead, matching what `.tail(0)` does.
        return data[:0]
    return data[-tail_n:]




[docs]
def unify_data_stepsizes(
    data: pd.DataFrame,
    key: str,
    step_size: int = None,
    max_expected_step_size: int = 100,
) -> pd.DataFrame:
    """
    Get a DataFrame where all data points have the same step between the values in column given by `key`

    :param data: The DataFrame to unify the step-size for
    :type data: pd.DataFrame
    :param key: The column name indicating the column containing the step-nr
    :type key: str
    :param step_size: The step size to use for filtering (if None, computed automatically)
    :type step_size: int, optional
    :param max_expected_step_size: Used to get a warning if the computed step-size is larger
    :type max_expected_step_size: int, default=100
    :return: A DataFrame with a consistent step-size
    :rtype: pd.DataFrame

    Only rows whose `key` value is an exact multiple of the step size are kept.
    If `step_size` is None, the largest difference between consecutive sorted
    `key` values is used. When no step size can be inferred that way (fewer
    than two rows, or all `key` values identical), an unfiltered copy of
    `data` is returned instead of silently dropping every row.

    NOTE: this function is rather unstable, as it has a few assumptions:
    - steps are modulo stepsize. Breaks e.g. with steps start with 1 and go up by step_size.
    - ideal step-size is max step difference. Breaks e.g. if there is one big gap
    """
    if step_size is None:
        step_size = data[key].sort_values().diff().max()
        # `diff()` of fewer than two values has no valid entries (max is NaN),
        # and identical values give a step of 0; `% step_size` would then
        # match nothing and discard all data.
        if pd.isna(step_size) or step_size == 0:
            return data.copy()
    if step_size > max_expected_step_size:
        warnings.warn(
            "Step size {} unexpectedly large, with max expected {}".format(
                step_size, max_expected_step_size
            ),
            # Attribute the warning to the caller rather than to this helper.
            stacklevel=2,
        )
    on_grid = (data[key] % step_size) == 0
    return data[on_grid]