# Source code for rwskit.pandas

"""Utilities for working with pandas."""

# Python Modules
from __future__ import annotations

import logging

# 3rd Party Modules
import numpy as np
import pandas as pd

from icontract import require

# Project Modules
from rwskit.collections_ import get_first_non_null_value
from rwskit.numpy import get_dtype


# Module-level logger, named after this module so that its records can be
# filtered and configured through the standard logging hierarchy.
log = logging.getLogger(__name__)



def _expand_list(df: pd.DataFrame, column_name: str, string_fill: str) -> pd.DataFrame:
    """Replace one list-valued column with a scalar column per list position.

    The column is removed from ``df`` and columns named
    ``{column_name}__0`` ... ``{column_name}__{N-1}`` are assigned to the
    frame, where ``N`` is the length of the longest list in the column.
    ``df`` is modified in place and is also returned.

    When the lists hold strings, ``None`` entries and padding cells are set
    to ``string_fill``. When the lists have unequal lengths and hold numbers
    or booleans, the data is stored as ``np.float64`` and padded with
    ``np.nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame holding the column to expand.
    column_name : str
        The name of the column whose values are lists.
    string_fill : str
        The value substituted for ``None`` entries and used as padding when
        the lists hold strings.

    Returns
    -------
    df : pandas.DataFrame
        The same frame, without the list column and with the new columns.
    """
    series = df.pop(column_name)
    lengths = series.map(len)
    shortest, longest = lengths.min(), lengths.max()

    expanded_names = [f"{column_name}__{position}" for position in range(longest)]

    # The element type is taken from the first non-null value as reported by
    # the project helpers (their exact lookup rules are defined elsewhere).
    element_dtype = get_dtype(get_first_non_null_value(series))
    holds_strings = np.issubdtype(element_dtype, np.str_)

    if holds_strings:
        # `longest` counts list items; the unicode dtype needs the widest
        # *string*, which must also be able to hold the fill value.
        widest = max(len(text) if text else 0 for row in series for text in row)
        element_dtype = f"U{max(widest, len(string_fill))}"

    def _normalize(items):
        # Inside a string list `None` must be swapped for the fill value,
        # otherwise numpy would store it as the literal text 'None'.
        return [
            string_fill if (holds_strings and item is None) else item
            for item in items
        ]

    if shortest != longest:
        # Ragged lists: start from an array that is already fully padded and
        # overwrite the leading cells of every row.
        if np.issubdtype(element_dtype, np.number) or element_dtype == np.bool_:
            # Only a float array can represent the NaN padding.
            element_dtype = np.float64

        padding = string_fill if holds_strings else np.nan
        data = np.full(
            (len(df.index), longest), fill_value=padding, dtype=element_dtype
        )

        for row_number, row in enumerate(series):
            cleaned = _normalize(row)
            data[row_number, : len(cleaned)] = cleaned
    else:
        # Every list has the same length, so the rows already form a
        # rectangular array once `None` strings have been replaced.
        data = np.array(
            [_normalize(row) for row in series.to_list()], dtype=element_dtype
        )

    df[expanded_names] = pd.DataFrame(
        data, index=series.index, columns=expanded_names
    )

    return df


@require(
    lambda string_fill: string_fill is not None,
    "The 'string_fill' value cannot be None.",
)
def flatten_data_frame(
    df: pd.DataFrame, string_fill: str = "[UNK]", in_place: bool = False
) -> pd.DataFrame:
    """Converts columns containing lists into (new) individual columns in the ``DataFrame``.

    If one or more columns in a DataFrame consist of lists, this method will
    remove the original column and replace it with ``N`` columns, where
    ``N`` is the maximum length of the lists in the original column. The new
    columns are named ``<column>__0`` ... ``<column>__<N-1>``.

    A column is treated as a list column when the value in its **first row**
    is a ``list``; the remaining rows are not inspected.

    The new columns are added at the right end of the frame, after all
    columns that were not expanded, so the original column order is not
    preserved.

    If the lists are of unequal length, the shorter lists are padded on the
    right. Lists of strings will be padded using the given ``string_fill``
    value. All others will be padded with ``np.nan``. Note, most numpy types
    will convert ``np.nan`` into an appropriate missing value for that type.
    For example, when used to fill ``np.datetime64`` objects, the resulting
    object will be ``np.datetime64('NaT')``.

    If the lists are numeric (including boolean) and they do not have equal
    lengths, the new columns will have ``dtype=np.float64`` regardless of
    the original dtype.

    A DataFrame without rows is returned unchanged (as a copy unless
    ``in_place`` is set), because there are no values to inspect.

    .. note::
        Nested lists within a column are not supported and will not be
        flattened.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame to flatten.
    string_fill : str, default = '[UNK]'
        Use this value to pad string lists. All other data types will use
        ``np.nan``. Must not be ``None``.
    in_place : bool, default = False
        Whether to modify the DataFrame in place or return a copy.

    Returns
    -------
    df : pandas.DataFrame
        The modified DataFrame

    Raises
    ------
    icontract.ViolationError
        Raised by the ``require`` precondition if ``string_fill`` is
        ``None``.

    Examples
    --------

    .. code-block:: python

        >>> input_df = pd.DataFrame({
        ...     "A": [["1"], ["2", "3"]],
        ...     "B": [["4", "5"], ["6", "7", "8"]],
        ...     "C": [[1], [2, 3]],
        ...     "D": [True, False]
        ... })
        >>> print(input_df)
                A          B       C      D
        0     [1]     [4, 5]     [1]   True
        1  [2, 3]  [6, 7, 8]  [2, 3]  False

        >>> flatten_data_frame(input_df)
               D A__0   A__1 B__0 B__1   B__2  C__0  C__1
        0   True    1  [UNK]    4    5  [UNK]   1.0   NaN
        1  False    2      3    6    7      8   2.0   3.0

    """

    if not in_place:
        df = df.copy()

    # Without rows there is no first cell to inspect and ``.iat[0]`` would
    # raise an IndexError, so there is nothing to flatten.
    if len(df.index) == 0:
        return df

    # Iterate over a snapshot of the column labels: expanding a column
    # removes it from the frame and adds new ones while we loop.
    for c in list(df.columns):
        if isinstance(df[c].iat[0], list):
            df = _expand_list(df, c, string_fill)

    return df