Source code for pylimer_tools.utils.optimize_dataframe

"""
Utility functions to reduce the memory usage of a pandas DataFrame.
Particularly useful when dealing with large datasets, e.g. output from long LAMMPS simulation runs.

Heavily inspired by the following sources:
- https://medium.com/bigdatarepublic/advanced-pandas-optimize-speed-and-memory-a654b53be6c2
- https://stackoverflow.com/questions/57531388/how-can-i-reduce-the-memory-of-a-pandas-dataframe
"""

import gc
from typing import List

import numpy as np
import pandas as pd



[docs]
def reduce_mem_usage(
    df, obj_to_category=False, subset=None, inplace=True, print_stats=False
):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :param obj_to_category: convert non-datetime related objects to category dtype
    :type obj_to_category: bool
    :param subset: subset of columns to analyse
    :type subset: List[str] or None
    :param inplace: whether to modify the dataframe in place
    :type inplace: bool
    :param print_stats: whether to print memory usage statistics
    :type print_stats: bool

    :return: dataset with the column dtypes adjusted
    :rtype: pd.DataFrame
    """
    start_mem = df.memory_usage().sum() / 1024**2
    gc.collect()
    if print_stats:
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    if not inplace:
        df = df.copy()

    cols = subset if subset is not None else df.columns.tolist()

    for col in cols:
        col_type = df[col].dtype

        if (
            col_type != object
            and col_type.name != "category"
            and "datetime" not in col_type.name
        ):
            c_min = df[col].min()
            c_max = df[col].max()

            # test if column can be converted to an integer
            treat_as_int = str(col_type)[
                :3] == "int" or str(col_type)[
                :4] == "uint"

            if treat_as_int:
                if c_min > np.iinfo(
                        np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    df[col] = df[col].astype(np.uint8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif (
                    c_min > np.iinfo(
                        np.uint16).min and c_max < np.iinfo(
                        np.uint16).max
                ):
                    df[col] = df[col].astype(np.uint16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif (
                    c_min > np.iinfo(
                        np.uint32).min and c_max < np.iinfo(
                        np.uint32).max
                ):
                    df[col] = df[col].astype(np.uint32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
                elif (
                    c_min > np.iinfo(
                        np.uint64).min and c_max < np.iinfo(
                        np.uint64).max
                ):
                    df[col] = df[col].astype(np.uint64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        elif "datetime" not in col_type.name and obj_to_category:
            df[col] = df[col].astype("category")
    gc.collect()
    end_mem = df.memory_usage().sum() / 1024**2
    if print_stats:
        print("Memory usage after optimization is: {:.3f} MB".format(end_mem))
        print("Decreased by {:.1f}%".format(
            100 * (start_mem - end_mem) / start_mem))

    return df




[docs]
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Optimize the floating point type entries.

    :param df: dataframe to reduce
    :type df: pd.DataFrame

    :return: dataset with the column dtypes adjusted
    :rtype: pd.DataFrame
    """
    floats = df.select_dtypes(include=["float64"]).columns.tolist()
    df[floats] = df[floats].apply(pd.to_numeric, downcast="float")
    return df




[docs]
def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """
    Optimize the integer point type entries.

    :param df: dataframe to reduce
    :type df: pd.DataFrame

    :return: dataset with the column dtypes adjusted
    :rtype: pd.DataFrame
    """
    ints = df.select_dtypes(include=["int64"]).columns.tolist()
    df[ints] = df[ints].apply(pd.to_numeric, downcast="integer")
    return df




[docs]
def optimize_objects(df: pd.DataFrame,
                     datetime_features: List[str]) -> pd.DataFrame:
    """
    Optimize object type entries.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :param datetime_features: list of column names that contain datetime data
    :type datetime_features: List[str]

    :return: dataset with the column dtypes adjusted
    :rtype: pd.DataFrame
    """
    for col in df.select_dtypes(include=["object"]):
        if col not in datetime_features:
            num_unique_values = len(df[col].unique())
            num_total_values = len(df[col])
            if float(num_unique_values) / num_total_values < 0.5:
                df[col] = df[col].astype("category")
        else:
            df[col] = pd.to_datetime(df[col])
    return df




[docs]
def optimize(df: pd.DataFrame, datetime_features: List[str] = []):
    """
    Optimize all types of all columns in a dataframe.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :param datetime_features: list of column names that contain datetime data
    :type datetime_features: List[str]

    :return: dataset with the column dtypes adjusted
    :rtype: pd.DataFrame
    """
    return optimize_floats(optimize_ints(
        optimize_objects(df, datetime_features)))