Text Processing¶

String manipulation, cleaning, and formatting utilities.

xpytools.xtool.txt.clean.clean ¶

Python

clean(text: Any, *, lowercase: bool = False) -> str | None

Clean and normalize text, with or without the optional cleantext dependency.

• Uses cleantext.clean() if available. • Otherwise falls back to Unicode (NFKC) normalization, HTML-tag stripping, removal of non-ASCII characters, and basic cleanup.

Parameters:

Name	Type	Description	Default
`text` ¶	`Any`	Input txt (converted to str).	required
`lowercase` ¶	`bool`	Whether to lowercase output.	`False`

Returns:

Type	Description
`str \| None`	Cleaned string, or None if the text is empty/None.

Source code in xpytools/xtool/txt/clean.py

Python

@requireModules(["cleantext"], exc_raise=False)
def clean(text: Any, *, lowercase: bool = False) -> str | None:
    """
    Clean and normalize text with or without the `cleantext` dependency.

    • Uses `cleantext.clean()` if it can be imported and runs without error.
    • Otherwise falls back to NFKC normalization, HTML-tag stripping,
      removal of every character outside tab / LF / CR / printable ASCII,
      and whitespace trimming.

    Parameters
    ----------
    text : Any
        Input text (converted to str).
    lowercase : bool, default=False
        Whether to lowercase output.

    Returns
    -------
    str | None
        Cleaned string, or None if the input is None or the cleaned
        result is empty (on both the `cleantext` and the fallback path).
    """
    if text is None:
        return None
    text = str(text)

    try:
        # Aliased so the import does not shadow this function's own name.
        from cleantext import clean as _cleantext_clean  # noqa
        cleaned = _cleantext_clean(
            text,
            fix_unicode=True,
            to_ascii=False,
            lower=lowercase,
            no_line_breaks=False,
            no_urls=True,
            no_emails=True,
            no_phone_numbers=True,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=True,
            no_punct=False,
            lang="en",
        )
    except Exception:
        # Deliberate best-effort: a missing cleantext install or any error
        # raised by it drops through to the simple built-in cleaner.
        cleaned = unicodedata.normalize("NFKC", text)
        cleaned = re.sub(r"<[^>]+>", "", cleaned)  # strip HTML tags
        # Keep only tab, LF, CR and printable ASCII; everything else goes.
        cleaned = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", "", cleaned)
        cleaned = cleaned.strip()
        if lowercase:
            cleaned = cleaned.lower()

    # Both paths honour the documented contract: an empty result is None.
    return cleaned or None

xpytools.xtool.txt.strip_html.strip_html ¶

Python

strip_html(text: str) -> str

Remove HTML tags and decode common HTML entities.

Parameters:

Name	Type	Description	Default
`text` ¶	`str`		required

Returns:

Type	Description
`str`	Text with all HTML tags removed.

Source code in xpytools/xtool/txt/strip_html.py

Python

def strip_html(text: str) -> str:
    """
    Remove HTML tags and decode common HTML entities.

    Tags are removed with the module-level ``_TAG_RE`` pattern, then the
    entities ``&nbsp;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&amp;`` are
    replaced by their literal characters and the result is trimmed.

    Parameters
    ----------
    text : str
        Input string; any falsy value yields an empty string.

    Returns
    -------
    str
        Text with all HTML tags removed and common entities decoded.
    """
    if not text:
        return ""
    # Remove tags
    text = _TAG_RE.sub("", text)
    # Decode common entities. "&amp;" must be handled last: decoding it
    # first would turn escaped text such as "&amp;lt;" into "&lt;" and then
    # wrongly decode it a second time into "<".
    replacements = (
        ("&nbsp;", " "),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    )
    for entity, literal in replacements:
        text = text.replace(entity, literal)
    return text.strip()

xpytools.xtool.txt.strip_ascii.strip_ascii ¶

Python

strip_ascii(text: str, keep_basic_symbols: bool = True) -> str

Remove non-ASCII characters (optionally keeping punctuation and spaces).

Parameters:

Name	Type	Description	Default
`text` ¶	`str`	Input string.	required
`keep_basic_symbols` ¶	`bool`	If False, removes everything outside [A-Za-z0-9 ].	`True`

Returns:

Type	Description
`str`	ASCII-only text.

Source code in xpytools/xtool/txt/strip_ascii.py

Python

def strip_ascii(text: str, keep_basic_symbols: bool = True) -> str:
    """
    Remove non-ASCII characters (optionally keeping punctuation and symbols).

    Parameters
    ----------
    text : str
        Input string. Non-string values are converted with ``str()``;
        falsy non-strings (None, 0, empty containers) become "".
    keep_basic_symbols : bool, default=True
        If True, every ASCII character is kept.
        If False, removes everything outside [A-Za-z0-9 ].

    Returns
    -------
    str
        ASCII-only text.
    """
    if not isinstance(text, str):
        # Coerce, then fall through to the stripping below so that the
        # string form of a non-str input is also guaranteed ASCII-only.
        text = str(text or "")
    if keep_basic_symbols:
        # Encoding with "ignore" silently drops every non-ASCII code point.
        return text.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^A-Za-z0-9 ]+", "", text)

xpytools.xtool.txt.truncate.truncate ¶

Python

truncate(text: str, limit: int = 120, suffix: str = '…') -> str

Truncate text safely and append an ellipsis if needed.

Parameters:

Name	Type	Description	Default
`text` ¶	`str`	Input string.	required
`limit` ¶	`int`	Maximum length before truncation.	`120`
`suffix` ¶	`str`	Suffix to indicate truncation.	`"…"`

Returns:

Type	Description
`str`	Possibly truncated string.

Source code in xpytools/xtool/txt/truncate.py

Python

def truncate(text: str, limit: int = 120, suffix: str = "…") -> str:
    """
    Shorten text to at most `limit` characters, marking the cut with a suffix.

    Parameters
    ----------
    text : str
        Input string (non-strings are converted with ``str()``; None gives "").
    limit : int, default=120
        Maximum length before truncation.
    suffix : str, default="…"
        Marker appended after the cut. It is added on top of the kept
        characters, so a truncated result may be longer than `limit`.

    Returns
    -------
    str
        The unchanged string when it fits, otherwise its first `limit`
        characters with trailing whitespace removed, followed by `suffix`.
    """
    if text is None:
        return ""

    as_str = str(text)
    if len(as_str) <= limit:
        return as_str

    # Trim whitespace left dangling at the cut point before adding the marker.
    head = as_str[:limit]
    return head.rstrip() + suffix

xpytools.xtool.txt.pad.pad ¶

Python

pad(text: str, width: int = 20, align: str = 'left', fillchar: str = ' ', truncate: bool = True) -> str

Pad (and optionally truncate) a string to a fixed width.

Parameters:

Name	Type	Description	Default
`text` ¶	`str`	Input string to pad.	required
`width` ¶	`int`	Desired total width of the output.	`20`
`align` ¶	`('left', 'right', 'center')`	Alignment direction within the padded area.	`"left"`
`fillchar` ¶	`str`	Character used for padding. Must be a single character.	`" "`
`truncate` ¶	`bool`	Whether to truncate strings longer than the target width.	`True`

Returns:

Type	Description
`str`	Padded (and possibly truncated) string.

Source code in xpytools/xtool/txt/pad.py

Python

def pad(
        text: str,
        width: int = 20,
        align: str = "left",
        fillchar: str = " ",
        truncate: bool = True,
        ) -> str:
    """
    Fit a string into a fixed-width field, padding and optionally cutting it.

    Parameters
    ----------
    text : str
        Input string to pad. Non-strings are converted with ``str()``;
        falsy non-strings become "".
    width : int, default=20
        Desired total width of the output.
    align : {"left", "right", "center"}, default="left"
        Alignment direction within the padded area. Any value other than
        "right" or "center" is treated as "left".
    fillchar : str, default=" "
        Character used for padding. Must be a single character.
    truncate : bool, default=True
        Whether to truncate strings longer than the target width.

    Returns
    -------
    str
        Padded (and possibly truncated) string.

    Raises
    ------
    ValueError
        If `fillchar` is empty or longer than one character.
    """
    value = text if isinstance(text, str) else str(text or "")

    if not (fillchar and len(fillchar) == 1):
        raise ValueError("fillchar must be a single character")

    if truncate and len(value) > width:
        value = value[:width]

    # Left alignment is the default; only the two other known modes override it.
    justify = value.ljust
    if align == "right":
        justify = value.rjust
    elif align == "center":
        justify = value.center
    return justify(width, fillchar)

xpytools.xtool.txt.split_lines.split_lines ¶

Python

split_lines(text: str, width: int = 80) -> List[str]

Split text into fixed-width lines without breaking words.

Parameters:

Name	Type	Description	Default
`text` ¶	`str`	Input string.	required
`width` ¶	`int`	Maximum width per line.	`80`

Returns:

Type	Description
`list[str]`	List of wrapped lines.

Source code in xpytools/xtool/txt/split_lines.py

Python

def split_lines(text: str, width: int = 80) -> List[str]:
    """
    Wrap text into lines of at most `width` characters, keeping words whole.

    Parameters
    ----------
    text : str
        Input string. Leading and trailing whitespace is stripped before
        wrapping; a falsy value yields an empty list.
    width : int, default=80
        Maximum width per line. A single word longer than `width` is kept
        intact on its own line rather than being split.

    Returns
    -------
    list[str]
        List of wrapped lines.
    """
    if text:
        trimmed = text.strip()
        return wrap(
            trimmed,
            width=width,
            break_long_words=False,
            replace_whitespace=True,
        )
    return []

Text Processing¶

xpytools.xtool.txt.clean.clean ¶

`text` ¶

`lowercase` ¶

xpytools.xtool.txt.strip_html.strip_html ¶

`text` ¶

xpytools.xtool.txt.strip_ascii.strip_ascii ¶

`text` ¶

`keep_basic_symbols` ¶

xpytools.xtool.txt.truncate.truncate ¶

`text` ¶

`limit` ¶

`suffix` ¶

xpytools.xtool.txt.pad.pad ¶

`text` ¶

`width` ¶

`align` ¶

`fillchar` ¶

`truncate` ¶

xpytools.xtool.txt.split_lines.split_lines ¶

`text` ¶

`width` ¶

Text Processing¶

xpytools.xtool.txt.clean.clean ¶

text ¶

lowercase ¶

xpytools.xtool.txt.strip_html.strip_html ¶

text ¶

xpytools.xtool.txt.strip_ascii.strip_ascii ¶

text ¶

keep_basic_symbols ¶

xpytools.xtool.txt.truncate.truncate ¶

text ¶

limit ¶

suffix ¶

xpytools.xtool.txt.pad.pad ¶

text ¶

width ¶

align ¶

fillchar ¶

truncate ¶

xpytools.xtool.txt.split_lines.split_lines ¶

text ¶

width ¶

`text` ¶

`lowercase` ¶

`text` ¶

`text` ¶

`keep_basic_symbols` ¶

`text` ¶

`limit` ¶

`suffix` ¶

`text` ¶

`width` ¶

`align` ¶

`fillchar` ¶

`truncate` ¶

`text` ¶

`width` ¶