Source code for pylimer_tools.utils.optimize_dataframe

"""
Utility functions to reduce the memory usage of a pandas DataFrame.
Particularly useful when dealing with large datasets, e.g. output from long LAMMPS simulation runs.

Heavily inspired by the following sources:
- https://medium.com/bigdatarepublic/advanced-pandas-optimize-speed-and-memory-a654b53be6c2
- https://stackoverflow.com/questions/57531388/how-can-i-reduce-the-memory-of-a-pandas-dataframe
"""

import gc
from typing import List

import numpy as np
import pandas as pd


[docs] def reduce_mem_usage( df, obj_to_category=False, subset=None, inplace=True, print_stats=False ): """ Iterate through all the columns of a dataframe and modify the data type to reduce memory usage. :param df: dataframe to reduce :type df: pd.DataFrame :param obj_to_category: convert non-datetime related objects to category dtype :type obj_to_category: bool :param subset: subset of columns to analyse :type subset: List[str] or None :param inplace: whether to modify the dataframe in place :type inplace: bool :param print_stats: whether to print memory usage statistics :type print_stats: bool :return: dataset with the column dtypes adjusted :rtype: pd.DataFrame """ start_mem = df.memory_usage().sum() / 1024**2 gc.collect() if print_stats: print("Memory usage of dataframe is {:.2f} MB".format(start_mem)) if not inplace: df = df.copy() cols = subset if subset is not None else df.columns.tolist() for col in cols: col_type = df[col].dtype if ( col_type != object and col_type.name != "category" and "datetime" not in col_type.name ): c_min = df[col].min() c_max = df[col].max() # test if column can be converted to an integer treat_as_int = str(col_type)[ :3] == "int" or str(col_type)[ :4] == "uint" if treat_as_int: if c_min > np.iinfo( np.int8).min and c_max < np.iinfo(np.int8).max: df[col] = df[col].astype(np.int8) elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max: df[col] = df[col].astype(np.uint8) elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max: df[col] = df[col].astype(np.int16) elif ( c_min > np.iinfo( np.uint16).min and c_max < np.iinfo( np.uint16).max ): df[col] = df[col].astype(np.uint16) elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max: df[col] = df[col].astype(np.int32) elif ( c_min > np.iinfo( np.uint32).min and c_max < np.iinfo( np.uint32).max ): df[col] = df[col].astype(np.uint32) elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max: df[col] = df[col].astype(np.int64) elif ( c_min > np.iinfo( np.uint64).min and c_max < np.iinfo( np.uint64).max ): df[col] = df[col].astype(np.uint64) else: if ( c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max ): df[col] = df[col].astype(np.float16) elif ( c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max ): df[col] = df[col].astype(np.float32) else: df[col] = df[col].astype(np.float64) elif "datetime" not in col_type.name and obj_to_category: df[col] = df[col].astype("category") gc.collect() end_mem = df.memory_usage().sum() / 1024**2 if print_stats: print("Memory usage after optimization is: {:.3f} MB".format(end_mem)) print("Decreased by {:.1f}%".format( 100 * (start_mem - end_mem) / start_mem)) return df
[docs] def optimize_floats(df: pd.DataFrame) -> pd.DataFrame: """ Optimize the floating point type entries. :param df: dataframe to reduce :type df: pd.DataFrame :return: dataset with the column dtypes adjusted :rtype: pd.DataFrame """ floats = df.select_dtypes(include=["float64"]).columns.tolist() df[floats] = df[floats].apply(pd.to_numeric, downcast="float") return df
[docs] def optimize_ints(df: pd.DataFrame) -> pd.DataFrame: """ Optimize the integer point type entries. :param df: dataframe to reduce :type df: pd.DataFrame :return: dataset with the column dtypes adjusted :rtype: pd.DataFrame """ ints = df.select_dtypes(include=["int64"]).columns.tolist() df[ints] = df[ints].apply(pd.to_numeric, downcast="integer") return df
[docs] def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame: """ Optimize object type entries. :param df: dataframe to reduce :type df: pd.DataFrame :param datetime_features: list of column names that contain datetime data :type datetime_features: List[str] :return: dataset with the column dtypes adjusted :rtype: pd.DataFrame """ for col in df.select_dtypes(include=["object"]): if col not in datetime_features: num_unique_values = len(df[col].unique()) num_total_values = len(df[col]) if float(num_unique_values) / num_total_values < 0.5: df[col] = df[col].astype("category") else: df[col] = pd.to_datetime(df[col]) return df
[docs] def optimize(df: pd.DataFrame, datetime_features: List[str] = []): """ Optimize all types of all columns in a dataframe. :param df: dataframe to reduce :type df: pd.DataFrame :param datetime_features: list of column names that contain datetime data :type datetime_features: List[str] :return: dataset with the column dtypes adjusted :rtype: pd.DataFrame """ return optimize_floats(optimize_ints( optimize_objects(df, datetime_features)))