DataFrame Operations¶
Tools for cleaning and transforming pandas DataFrames.
xpytools.xtool.df.normalize_column_names.normalize_column_names
¶
Normalize and sanitize DataFrame column names.
Rules applied:
- Lowercase all names
- Replace spaces, hyphens, and slashes with underscores
- Remove parentheses, brackets, braces, and special characters
- Preserve .digits suffixes (e.g., score.1 -> score_1)
- Ensure unique column names (adds numeric suffixes if needed)
- Converts : and . to _ (except trailing digits)
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
|
Any
|
DataFrame whose columns to clean. |
required |
|
bool
|
Modify the DataFrame in place. If False, return a copy. |
True
|
Returns:
| Type | Description |
|---|---|
DataFrame | None
|
The modified DataFrame (or copy) with cleaned columns. Returns None if input is not a valid DataFrame. |
Examples:
>>> import pandas as pd
>>> df = pd.DataFrame(columns=["User ID", "Email-Address", "Score.1", "Score.1"])
>>> normalize_column_names(df)
>>> list(df.columns)
['user_id', 'email_address', 'score_1', 'score_1_1']
Source code in xpytools/xtool/df/normalize_column_names.py
@requireModules(["pandas"], exc_raise=True)
def normalize_column_names(df: "pdDataFrame", inplace: bool = True) -> Optional["pdDataFrame"]:
"""
Normalize and sanitize DataFrame column names.
Rules applied:
- Lowercase all names
- Replace spaces, hyphens, and slashes with underscores
- Remove parentheses, brackets, braces, and special characters
- Preserve `.digits` suffixes (e.g., `score.1` -> `score_1`)
- Ensure unique column names (adds numeric suffixes if needed)
- Converts `:` and `.` to `_` (except trailing digits)
Parameters
----------
df : Any
DataFrame whose columns to clean.
inplace : bool, default=True
Modify the DataFrame in place. If False, return a copy.
Returns
-------
pandas.DataFrame | None
The modified DataFrame (or copy) with cleaned columns.
Returns None if input is not a valid DataFrame.
Examples
--------
>>> import pandas as pd
>>> df = pd.DataFrame(columns=["User ID", "Email-Address", "Score.1", "Score.1"])
>>> normalize_column_names(df)
>>> list(df.columns)
['user_id', 'email_address', 'score_1', 'score_1_1']
"""
try:
import pandas as pd
except ImportError:
return None
if not is_df(df):
return None
target_df = df if inplace else df.copy()
original_cols = list(target_df.columns)
cleaned_cols: list[str] = []
seen: set[str] = set()
for col in original_cols:
if not isinstance(col, str):
col = str(col)
new_col = col.lower()
# Preserve ".digits" suffix
match = re.search(r'\.(\d+)$', new_col)
end_digits = match.group(0) if match else ""
if end_digits:
new_col = new_col[:-len(end_digits)]
# Replace and clean
new_col = new_col.replace(':', '_')
new_col = new_col.replace('.', '_')
new_col = re.sub(r'[ /\\\-]+', '_', new_col)
new_col = re.sub(r'[(){}\[\]]+', '', new_col)
new_col = re.sub(r'[^0-9a-zA-Z_]+', '', new_col)
new_col = new_col.strip('_')
# Re-add numeric suffix if it existed
if end_digits:
new_col = f"{new_col}_{end_digits.strip('.')}"
# Ensure uniqueness
base = new_col
suffix = 1
while new_col in seen:
new_col = f"{base}_{suffix}"
suffix += 1
seen.add(new_col)
cleaned_cols.append(new_col)
target_df.columns = cleaned_cols
return target_df
xpytools.xtool.df.lookup.lookup
¶
lookup(df: 'pdDataFrame', filter_col: str, filter_val: Any, target_col: str, index: int = 0, safe: bool = False) -> Optional[Any]
Safely get a value from a DataFrame filtered by condition.
Example
lookup(df, "user_id", 123, "email") 'user@example.com'
Source code in xpytools/xtool/df/lookup.py
@requireModules(["pandas"], exc_raise=True)
def lookup(
df: "pdDataFrame",
filter_col: str,
filter_val: Any,
target_col: str,
index: int = 0,
safe: bool = False
) -> Optional[Any]:
"""
Safely get a value from a DataFrame filtered by condition.
Example
-------
>>> lookup(df, "user_id", 123, "email")
'user@example.com'
"""
try:
_check_df(df)
subset = df.loc[df[filter_col] == filter_val, target_col]
if subset.empty:
return None
return subset.iloc[index]
except Exception:
if safe:
return None
else:
raise
xpytools.xtool.df.merge_fill.merge_fill
¶
merge_fill(left: 'pdDataFrame', right: 'pdDataFrame', on: str, how: str = 'left', prefer_right: bool = True, fill_only_if_none: bool = True) -> Optional['pdDataFrame']
Merge two DataFrames without creating duplicate columns.
This performs a normal merge, but instead of generating
col_x and col_y, values from right fill in None cells
in left (if fill_only_if_none=True).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
|
DataFrame
|
|
required |
|
DataFrame
|
|
required |
|
str
|
Column to merge on. |
required |
|
str
|
Merge type ('left', 'inner', 'outer', etc.) |
'left'
|
|
bool
|
If True, values from right overwrite missing ones in left. |
True
|
|
bool
|
If True, only fill where left[col] is None/NaN. |
True
|
Returns:
| Type | Description |
|---|---|
DataFrame | None
|
|
Source code in xpytools/xtool/df/merge_fill.py
@requireModules(["pandas"], exc_raise=True)
def merge_fill(
left: "pdDataFrame",
right: "pdDataFrame",
on: str,
how: str = "left",
prefer_right: bool = True,
fill_only_if_none: bool = True,
) -> Optional["pdDataFrame"]:
"""
Merge two DataFrames *without creating duplicate columns*.
This performs a normal merge, but instead of generating
`col_x` and `col_y`, values from `right` fill in `None` cells
in `left` (if `fill_only_if_none=True`).
Parameters
----------
left, right : DataFrame
on : str
Column to merge on.
how : str
Merge type ('left', 'inner', 'outer', etc.)
prefer_right : bool
If True, values from right overwrite missing ones in left.
fill_only_if_none : bool
If True, only fill where left[col] is None/NaN.
Returns
-------
DataFrame | None
"""
_mergetype = strChoice('left', 'right', 'outer', 'inner')
_check_df(left, 'Left')
_check_df(right, 'Right')
try:
import pandas as pd
how = _mergetype(how)
merged = pd.merge(left, right, on=on, how=how, suffixes=("", "_right"))
for col in right.columns:
if col == on or f"{col}_right" not in merged.columns:
continue
if prefer_right:
if fill_only_if_none:
merged[col] = merged[col].combine_first(merged[f"{col}_right"])
else:
merged[col] = merged[f"{col}_right"]
merged.drop(columns=[f"{col}_right"], inplace=True, errors="ignore")
return merged
except Exception:
raise
xpytools.xtool.df.replace_none_like.replace_none_like
¶
Replace all None-like representations (NaN, '', 'null', etc.)
in a DataFrame with proper Python None using as_none().
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
|
DataFrame
|
Input DataFrame. |
required |
|
bool
|
When True, performs a second pass to replace all np.nan / pd.NA values with literal None and converts affected columns to object dtype. Useful before serializing or inserting into Postgres. |
False
|
Returns:
| Type | Description |
|---|---|
DataFrame | None
|
Cleaned DataFrame or None if not a valid DataFrame. |
Source code in xpytools/xtool/df/replace_none_like.py
@requireModules(["pandas"], exc_raise=True)
def replace_none_like(
df: "pdDataFrame",
force: bool = False
) -> Optional["pdDataFrame"]:
"""
Replace all None-like representations (NaN, '', 'null', etc.)
in a DataFrame with proper Python None using `as_none()`.
Parameters
----------
df : DataFrame
Input DataFrame.
force : bool, default=False
When True, performs a second pass to replace *all* np.nan / pd.NA
values with literal None and converts affected columns to object dtype.
Useful before serializing or inserting into Postgres.
Returns
-------
DataFrame | None
Cleaned DataFrame or None if not a valid DataFrame.
"""
import pandas as pd
if force:
df = df.astype(object)
if not is_df(df) or is_empty(df):
raise ValueError("Invalid DataFrame")
# first pass – textual / exotic normalization
cleaned = df.map(as_none)
cleaned = cleaned.replace(pd.NA, None)
if force:
# this is the correct modern idiom
try:
import numpy as np
cleaned = cleaned.replace(np.nan, None)
except ImportError:
cleaned = cleaned.reaplce(float('nan'), None)
return cleaned