Source code for pylimer_tools.utils.optimize_dataframe
"""Utility functions to reduce the memory usage of a pandas DataFrame.Particularly useful when dealing with large datasets, e.g. output from long LAMMPS simulation runs.Heavily inspired by the following sources:- https://medium.com/bigdatarepublic/advanced-pandas-optimize-speed-and-memory-a654b53be6c2- https://stackoverflow.com/questions/57531388/how-can-i-reduce-the-memory-of-a-pandas-dataframe"""importgcfromtypingimportListimportnumpyasnpimportpandasaspd
def reduce_mem_usage(
    df, obj_to_category=False, subset=None, inplace=True, print_stats=False
):
    """
    Iterate through all the columns of a dataframe and modify the data type
    to reduce memory usage.

    Integer columns are cast to the narrowest integer dtype whose range
    contains the column's minimum and maximum (bounds inclusive); at equal
    width the signed dtype is preferred over the unsigned one.
    Other numeric columns are cast to ``float16``, ``float32`` or ``float64``
    based on their value range only. NOTE: this is lossy, since a value that
    fits the *range* of ``float16``/``float32`` may still lose precision.
    Boolean columns are left untouched, as they already use one byte per value.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :param obj_to_category: convert non-datetime related objects to category dtype
    :type obj_to_category: bool
    :param subset: subset of columns to analyse
    :type subset: List[str] or None
    :param inplace: whether to modify the dataframe in place
    :type inplace: bool
    :param print_stats: whether to print memory usage statistics
    :type print_stats: bool
    :return: dataset with the column dtypes adjusted
        (the very same object as ``df`` when ``inplace`` is true)
    :rtype: pd.DataFrame
    """
    start_mem = df.memory_usage().sum() / 1024**2
    gc.collect()
    if print_stats:
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    if not inplace:
        df = df.copy()

    # Candidate dtypes, narrowest first; signed is tried before unsigned.
    int_candidates = (
        np.int8,
        np.uint8,
        np.int16,
        np.uint16,
        np.int32,
        np.uint32,
        np.int64,
        np.uint64,
    )
    float_candidates = (np.float16, np.float32)

    cols = subset if subset is not None else df.columns.tolist()
    for col in cols:
        col_type = df[col].dtype
        is_datetime = "datetime" in col_type.name
        is_object_like = col_type == object or col_type.name == "category"

        if is_datetime:
            continue
        if is_object_like:
            if obj_to_category:
                df[col] = df[col].astype("category")
            continue
        if pd.api.types.is_bool_dtype(col_type):
            # A bool column would otherwise take the float branch below and
            # become float16: a different type at twice the size.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type).startswith(("int", "uint")):
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values of a dtype are
                # representable in that dtype.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. an empty column whose min/max are NaN):
            # leave the column as it is.
        else:
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    gc.collect()
    end_mem = df.memory_usage().sum() / 1024**2
    if print_stats:
        print("Memory usage after optimization is: {:.3f} MB".format(end_mem))
        print(
            "Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem)
        )

    return df
def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast the ``float64`` columns of a dataframe.

    Every column of dtype ``float64`` is passed through
    ``pd.to_numeric(..., downcast="float")`` and written back into ``df``;
    all other columns are left as they are. The dataframe is modified in place.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :return: the same dataframe, with the float column dtypes adjusted
    :rtype: pd.DataFrame
    """
    float64_frame = df.select_dtypes(include=["float64"])
    target_cols = float64_frame.columns.tolist()
    downcasted = df[target_cols].apply(pd.to_numeric, downcast="float")
    df[target_cols] = downcasted
    return df
def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """
    Downcast the ``int64`` columns of a dataframe.

    Every column of dtype ``int64`` is passed through
    ``pd.to_numeric(..., downcast="integer")`` and written back into ``df``;
    all other columns are left as they are. The dataframe is modified in place.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :return: the same dataframe, with the integer column dtypes adjusted
    :rtype: pd.DataFrame
    """
    int64_frame = df.select_dtypes(include=["int64"])
    target_cols = int64_frame.columns.tolist()
    downcasted = df[target_cols].apply(pd.to_numeric, downcast="integer")
    df[target_cols] = downcasted
    return df
def optimize_objects(df: pd.DataFrame, datetime_features: List[str]) -> pd.DataFrame:
    """
    Optimize object type entries.

    Object columns listed in ``datetime_features`` are converted with
    ``pd.to_datetime``. Every other object column is converted to the
    ``category`` dtype when fewer than half of its values are distinct.
    The dataframe is modified in place.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :param datetime_features: list of column names that contain datetime data
    :type datetime_features: List[str]
    :return: the same dataframe, with the object column dtypes adjusted
    :rtype: pd.DataFrame
    """
    num_total_values = len(df)

    # select_dtypes returns a new frame, so assigning into ``df`` while
    # iterating over its column labels is safe.
    for col in df.select_dtypes(include=["object"]):
        if col in datetime_features:
            df[col] = pd.to_datetime(df[col])
            continue

        if num_total_values == 0:
            # No rows: the unique/total ratio is undefined (it used to raise
            # ZeroDivisionError) and there is nothing to gain from converting.
            continue

        num_unique_values = len(df[col].unique())
        if num_unique_values / num_total_values < 0.5:
            df[col] = df[col].astype("category")

    return df
def optimize(df: pd.DataFrame, datetime_features: List[str] = None) -> pd.DataFrame:
    """
    Optimize all types of all columns in a dataframe.

    Applies, in this order, :func:`optimize_objects`, :func:`optimize_ints`
    and :func:`optimize_floats` to the dataframe.

    :param df: dataframe to reduce
    :type df: pd.DataFrame
    :param datetime_features: list of column names that contain datetime data;
        ``None`` (the default) is treated as an empty list
    :type datetime_features: List[str] or None
    :return: dataset with the column dtypes adjusted
    :rtype: pd.DataFrame
    """
    # A ``None`` sentinel replaces the former ``[]`` default, so no list
    # object is shared between calls.
    if datetime_features is None:
        datetime_features = []

    df = optimize_objects(df, datetime_features)
    df = optimize_ints(df)
    return optimize_floats(df)