# Source code for pylimer_tools.io.read_pylimer_tools_output_file

"""
This module provides a few functions to read output from pylimer_tools_cpp's simulators.
"""

import pandas as pd

from pylimer_tools.utils.cache_utility import do_cache, load_cache



# [docs]
def read_avg_file(filename: str) -> pd.DataFrame:
    """
    Read an averages-output file from one of the simulators shipped with pylimer_tools.

    This function parses the output file format used by pylimer_tools_cpp simulators,
    handling multiple data sections and converting them to a pandas DataFrame.
    Each section starts with a header line (optionally prefixed with ``#``) whose
    first token equals the first token of the file's very first line.
    The function also caches results to improve performance on subsequent reads.

    :param filename: Path to the averages file to read
    :type filename: str
    :return: DataFrame containing the parsed averages data, grouped by OutputStep
    :rtype: pd.DataFrame
    :raises ValueError: If the file has no header line or contains no section
                        that could be parsed.

    :note: The function automatically filters out lines containing "-nan" values,
           null characters, or fewer than 3 columns.
    :note: The returned DataFrame is grouped by OutputStep, keeping only the last
           entry for each step.
    :note: Columns that cannot be converted to a numeric dtype as a whole are
           left unchanged.
    """
    cache = load_cache(filename, "my-avg")
    if cache is not None:
        return cache

    data_frames = []
    with open(filename, "r") as f:
        header = f.readline().removeprefix("#").strip().split()
        if not header:
            # Without a header token, section starts cannot be detected and
            # the columns cannot be named.
            raise ValueError(
                f"Averages file '{filename}' is empty or has no header line")
        # Every repeated header line starts with this token.
        section_marker = header[0]
        rows = []
        for line in f:
            # Drop corrupt or truncated lines (e.g. from an interrupted write).
            # The column count is deliberately taken on the raw line, i.e.
            # before a leading "#" is removed.
            if "-nan" in line or "\x00" in line or len(line.split()) < 3:
                continue
            stripped_line = line.removeprefix("#").strip()
            if stripped_line.startswith(section_marker):
                # A new header closes the current section and opens the next.
                data_frames.append(pd.DataFrame(rows, columns=header))
                header = stripped_line.split()
                rows = []
            elif stripped_line != "":
                rows.append(stripped_line.split())
    if rows:
        data_frames.append(pd.DataFrame(rows, columns=header))
    if not data_frames:
        raise ValueError(
            f"No data rows found in averages file '{filename}'")

    df = pd.concat(data_frames, ignore_index=True)
    # All values were read as strings; convert each column that is numeric.
    result = df.apply(_to_numeric_if_possible)
    # A restarted simulation may emit the same step more than once;
    # the most recently written row wins.
    result = result.groupby("OutputStep", as_index=False).last()
    assert not result["OutputStep"].duplicated().any()
    do_cache(result, filename, "my-avg")
    return result


def _to_numeric_if_possible(column: pd.Series) -> pd.Series:
    """
    Convert a column to a numeric dtype, returning it unchanged if that fails.

    Explicit replacement for the deprecated ``pd.to_numeric(..., errors="ignore")``.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column