Skip to content

DataFrame Operations

Tools for cleaning and transforming pandas DataFrames.


xpytools.xtool.df.normalize_column_names.normalize_column_names

Python
normalize_column_names(df: DataFrame, inplace: bool = True) -> Optional[pdDataFrame]

Normalize and sanitize DataFrame column names.

Rules applied: - Lowercase all names - Replace spaces, hyphens, and slashes with underscores - Remove parentheses, brackets, braces, and special characters - Preserve .digits suffixes (e.g., score.1 -> score_1) - Ensure unique column names (adds numeric suffixes if needed) - Converts : and . to _ (except trailing digits)

Parameters:

Name Type Description Default

df

Any

DataFrame whose columns to clean.

required

inplace

bool

Modify the DataFrame in place. If False, return a copy.

True

Returns:

Type Description
DataFrame | None

The modified DataFrame (or copy) with cleaned columns. Returns None if input is not a valid DataFrame.

Examples:

Python Console Session
>>> import pandas as pd
>>> df = pd.DataFrame(columns=["User ID", "Email-Address", "Score.1", "Score.1"])
>>> normalize_column_names(df)
>>> list(df.columns)
['user_id', 'email_address', 'score_1', 'score_1_1']
Source code in xpytools/xtool/df/normalize_column_names.py
Python
@requireModules(["pandas"], exc_raise=True)
def normalize_column_names(df: "pdDataFrame", inplace: bool = True) -> Optional["pdDataFrame"]:
    """
    Normalize and sanitize DataFrame column names.

    Rules applied:
    - Lowercase all names
    - Replace spaces, hyphens, and slashes with underscores
    - Remove parentheses, brackets, braces, and special characters
    - Preserve `.digits` suffixes (e.g., `score.1` -> `score_1`)
    - Ensure unique column names (adds numeric suffixes if needed)
    - Converts `:` and `.` to `_` (except trailing digits)

    Parameters
    ----------
    df : Any
        DataFrame whose columns to clean.
    inplace : bool, default=True
        Modify the DataFrame in place. If False, return a copy.

    Returns
    -------
    pandas.DataFrame | None
        The modified DataFrame (or copy) with cleaned columns.
        Returns None if input is not a valid DataFrame.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame(columns=["User ID", "Email-Address", "Score.1", "Score.1"])
    >>> normalize_column_names(df)
    >>> list(df.columns)
    ['user_id', 'email_address', 'score_1', 'score_1_1']
    """
    try:
        import pandas as pd
    except ImportError:
        return None

    if not is_df(df):
        return None

    target_df = df if inplace else df.copy()

    original_cols = list(target_df.columns)
    cleaned_cols: list[str] = []
    seen: set[str] = set()

    for col in original_cols:
        if not isinstance(col, str):
            col = str(col)

        new_col = col.lower()

        # Preserve ".digits" suffix
        match = re.search(r'\.(\d+)$', new_col)
        end_digits = match.group(0) if match else ""
        if end_digits:
            new_col = new_col[:-len(end_digits)]

        # Replace and clean
        new_col = new_col.replace(':', '_')
        new_col = new_col.replace('.', '_')
        new_col = re.sub(r'[ /\\\-]+', '_', new_col)
        new_col = re.sub(r'[(){}\[\]]+', '', new_col)
        new_col = re.sub(r'[^0-9a-zA-Z_]+', '', new_col)
        new_col = new_col.strip('_')

        # Re-add numeric suffix if it existed
        if end_digits:
            new_col = f"{new_col}_{end_digits.strip('.')}"

        # Ensure uniqueness
        base = new_col
        suffix = 1
        while new_col in seen:
            new_col = f"{base}_{suffix}"
            suffix += 1

        seen.add(new_col)
        cleaned_cols.append(new_col)

    target_df.columns = cleaned_cols
    return target_df

xpytools.xtool.df.lookup.lookup

Python
lookup(df: 'pdDataFrame', filter_col: str, filter_val: Any, target_col: str, index: int = 0, safe: bool = False) -> Optional[Any]

Safely get a value from a DataFrame filtered by condition.

Example

lookup(df, "user_id", 123, "email") 'user@example.com'

Source code in xpytools/xtool/df/lookup.py
Python
@requireModules(["pandas"], exc_raise=True)
def lookup(
        df: "pdDataFrame",
        filter_col: str,
        filter_val: Any,
        target_col: str,
        index: int = 0,
        safe: bool = False
        ) -> Optional[Any]:
    """
    Safely get a value from a DataFrame filtered by condition.

    Example
    -------
    >>> lookup(df, "user_id", 123, "email")
    'user@example.com'
    """
    try:
        _check_df(df)
        subset = df.loc[df[filter_col] == filter_val, target_col]
        if subset.empty:
            return None
        return subset.iloc[index]
    except Exception:
        if safe:
            return None
        else:
            raise

xpytools.xtool.df.merge_fill.merge_fill

Python
merge_fill(left: 'pdDataFrame', right: 'pdDataFrame', on: str, how: str = 'left', prefer_right: bool = True, fill_only_if_none: bool = True) -> Optional['pdDataFrame']

Merge two DataFrames without creating duplicate columns.

This performs a normal merge, but instead of generating col_x and col_y, values from right fill in None cells in left (if fill_only_if_none=True).

Parameters:

Name Type Description Default

left

DataFrame
required

right

DataFrame
required

on

str

Column to merge on.

required

how

str

Merge type ('left', 'inner', 'outer', etc.)

'left'

prefer_right

bool

If True, values from right overwrite missing ones in left.

True

fill_only_if_none

bool

If True, only fill where left[col] is None/NaN.

True

Returns:

Type Description
DataFrame | None
Source code in xpytools/xtool/df/merge_fill.py
Python
@requireModules(["pandas"], exc_raise=True)
def merge_fill(
        left: "pdDataFrame",
        right: "pdDataFrame",
        on: str,
        how: str = "left",
        prefer_right: bool = True,
        fill_only_if_none: bool = True,
        ) -> Optional["pdDataFrame"]:
    """
    Merge two DataFrames *without creating duplicate columns*.

    This performs a normal merge, but instead of generating
    `col_x` and `col_y`, values from `right` fill in `None` cells
    in `left` (if `fill_only_if_none=True`).

    Parameters
    ----------
    left, right : DataFrame
    on : str
        Column to merge on.
    how : str
        Merge type ('left', 'inner', 'outer', etc.)
    prefer_right : bool
        If True, values from right overwrite missing ones in left.
    fill_only_if_none : bool
        If True, only fill where left[col] is None/NaN.

    Returns
    -------
    DataFrame | None
    """
    _mergetype = strChoice('left', 'right', 'outer', 'inner')

    _check_df(left, 'Left')

    _check_df(right, 'Right')

    try:
        import pandas as pd
        how = _mergetype(how)
        merged = pd.merge(left, right, on=on, how=how, suffixes=("", "_right"))
        for col in right.columns:
            if col == on or f"{col}_right" not in merged.columns:
                continue

            if prefer_right:
                if fill_only_if_none:
                    merged[col] = merged[col].combine_first(merged[f"{col}_right"])
                else:
                    merged[col] = merged[f"{col}_right"]

            merged.drop(columns=[f"{col}_right"], inplace=True, errors="ignore")

        return merged
    except Exception:
        raise

xpytools.xtool.df.replace_none_like.replace_none_like

Python
replace_none_like(df: 'pdDataFrame', force: bool = False) -> Optional['pdDataFrame']

Replace all None-like representations (NaN, '', 'null', etc.) in a DataFrame with proper Python None using as_none().

Parameters:

Name Type Description Default

df

DataFrame

Input DataFrame.

required

force

bool

When True, performs a second pass to replace all np.nan / pd.NA values with literal None and converts affected columns to object dtype. Useful before serializing or inserting into Postgres.

False

Returns:

Type Description
DataFrame | None

Cleaned DataFrame or None if not a valid DataFrame.

Source code in xpytools/xtool/df/replace_none_like.py
Python
@requireModules(["pandas"], exc_raise=True)
def replace_none_like(
        df: "pdDataFrame",
        force: bool = False
        ) -> Optional["pdDataFrame"]:
    """
    Replace all None-like representations (NaN, '', 'null', etc.)
    in a DataFrame with proper Python None using `as_none()`.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame.
    force : bool, default=False
        When True, performs a second pass to replace *all* np.nan / pd.NA
        values with literal None and converts affected columns to object dtype.
        Useful before serializing or inserting into Postgres.

    Returns
    -------
    DataFrame | None
        Cleaned DataFrame or None if not a valid DataFrame.
    """
    import pandas as pd

    if force:
        df = df.astype(object)

    if not is_df(df) or is_empty(df):
        raise ValueError("Invalid DataFrame")

    # first pass – textual / exotic normalization
    cleaned = df.map(as_none)
    cleaned = cleaned.replace(pd.NA, None)

    if force:
        # this is the correct modern idiom
        try:
            import numpy as np
            cleaned = cleaned.replace(np.nan, None)
        except ImportError:
            cleaned = cleaned.reaplce(float('nan'), None)

    return cleaned