Skip to content

Text Processing

String manipulation, cleaning, and formatting utilities.


xpytools.xtool.txt.clean.clean

Python
clean(text: Any, *, lowercase: bool = False) -> str | None

Clean and normalize txt with or without cleantext dependency.

• Uses cleantext.clean() if available.
• Otherwise falls back to Unicode normalization, HTML tag stripping, removal of non-ASCII characters, and whitespace trimming.

Parameters:

Name Type Description Default

text

Any

Input txt (converted to str).

required

lowercase

bool

Whether to lowercase output.

False

Returns:

Type Description
str | None

Cleaned string, or None if txt is empty/None.

Source code in xpytools/xtool/txt/clean.py
Python
@requireModules(["cleantext"], exc_raise=False)
def clean(text: Any, *, lowercase: bool = False) -> str | None:
    """
    Clean and normalize txt with or without `cleantext` dependency.

    • Uses `cleantext.clean()` if available.
    • Falls back to Unicode NFKC normalization, HTML tag stripping, removal of
      every character outside printable ASCII (tab, LF and CR are kept), and
      whitespace trimming.

    Both paths return None when nothing is left after cleaning, so callers
    see the same "empty" value regardless of which path ran.

    Parameters
    ----------
    text : Any
        Input txt (converted to str).
    lowercase : bool, default=False
        Whether to lowercase output.

    Returns
    -------
    str | None
        Cleaned string, or None if txt is None or empty after cleaning.
    """
    if text is None:
        return None
    text = str(text)

    try:
        # Aliased so the import does not shadow this function's own name.
        from cleantext import clean as _cleantext_clean  # noqa
        cleaned = _cleantext_clean(
                text,
                fix_unicode=True,
                to_ascii=False,
                lower=lowercase,
                no_line_breaks=False,
                no_urls=True,
                no_emails=True,
                no_phone_numbers=True,
                no_numbers=False,
                no_digits=False,
                no_currency_symbols=True,
                no_punct=False,
                lang="en",
                )
    except Exception:
        # Deliberate best-effort: a missing package or any failure inside
        # cleantext drops through to the simple built-in cleaner.
        cleaned = unicodedata.normalize("NFKC", text)
        cleaned = re.sub(r"<[^>]+>", "", cleaned)  # strip HTML tags
        # Keep only tab, LF, CR and printable ASCII; everything else is dropped.
        cleaned = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", "", cleaned)
        cleaned = cleaned.strip()
        if lowercase:
            cleaned = cleaned.lower()

    # Normalize "nothing left" to None on both paths.
    return cleaned or None

xpytools.xtool.txt.strip_html.strip_html

Python
strip_html(text: str) -> str

Remove HTML tags and decode common HTML entities.

Parameters:

Name Type Description Default

text

str
required

Returns:

Type Description
str

Text with all HTML tags removed.

Source code in xpytools/xtool/txt/strip_html.py
Python
def strip_html(text: str) -> str:
    """
    Remove HTML tags and decode a few common HTML entities.

    Matches of the module-level ``_TAG_RE`` pattern are deleted, then
    ``&nbsp;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&amp;`` are replaced by
    the characters they stand for. Other entities are left untouched.

    Parameters
    ----------
    text : str
        Input string; any falsy value (None, "") yields "".

    Returns
    -------
    str
        Text with all HTML tags removed, entities decoded, and surrounding
        whitespace stripped.
    """
    if not text:
        return ""
    # Remove tags
    text = _TAG_RE.sub("", text)
    # Decode common entities. "&amp;" must go last: decoding it first would
    # turn an escaped entity such as "&amp;lt;" into "&lt;" and then into "<",
    # i.e. decode the same text twice.
    replacements = (
        ("&nbsp;", " "),
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&quot;", '"'),
        ("&amp;", "&"),
    )
    for entity, char in replacements:
        text = text.replace(entity, char)
    return text.strip()

xpytools.xtool.txt.strip_ascii.strip_ascii

Python
strip_ascii(text: str, keep_basic_symbols: bool = True) -> str

Remove non-ASCII characters (optionally keeping punctuation and spaces).

Parameters:

Name Type Description Default

text

str

Input string.

required

keep_basic_symbols

bool

If False, removes everything outside [A-Za-z0-9 ].

True

Returns:

Type Description
str

ASCII-only txt.

Source code in xpytools/xtool/txt/strip_ascii.py
Python
def strip_ascii(text: str, keep_basic_symbols: bool = True) -> str:
    """
    Drop non-ASCII characters from a string.

    Parameters
    ----------
    text : str
        Input string. A non-string value is not filtered: it is returned as
        ``str(text)``, or as "" when the value is falsy.
    keep_basic_symbols : bool, default=True
        If True, every ASCII character is kept. If False, only letters,
        digits and spaces ([A-Za-z0-9 ]) survive.

    Returns
    -------
    str
        ASCII-only txt.
    """
    if isinstance(text, str):
        if not keep_basic_symbols:
            # Strict mode: letters, digits and the space character only.
            return re.sub(r"[^A-Za-z0-9 ]+", "", text)
        # Encoding with "ignore" silently discards anything outside ASCII.
        ascii_bytes = text.encode("ascii", "ignore")
        return ascii_bytes.decode("ascii")
    # Non-string input is only stringified (falsy values collapse to "").
    return str(text or "")

xpytools.xtool.txt.truncate.truncate

Python
truncate(text: str, limit: int = 120, suffix: str = '…') -> str

Truncate txt safely and append ellipsis if needed.

Parameters:

Name Type Description Default

text

str

Input string.

required

limit

int

Maximum length of the text before truncation; the suffix is appended after the cut.

120

suffix

str

Suffix to indicate truncation.

"…"

Returns:

Type Description
str

Possibly truncated string.

Source code in xpytools/xtool/txt/truncate.py
Python
def truncate(text: str, limit: int = 120, suffix: str = "…") -> str:
    """
    Shorten txt to at most `limit` characters, marking the cut with `suffix`.

    Parameters
    ----------
    text : str
        Input string. None yields ""; any other value is converted with str().
    limit : int, default=120
        Number of leading characters kept when the txt is too long.
    suffix : str, default="…"
        Marker appended after the kept part; it is added on top of `limit`.

    Returns
    -------
    str
        The txt unchanged when it fits within `limit`, otherwise its first
        `limit` characters with trailing whitespace removed, plus `suffix`.
    """
    if text is None:
        return ""

    rendered = str(text)
    if len(rendered) <= limit:
        return rendered

    # Trim trailing whitespace so the suffix attaches directly to the text.
    head = rendered[:limit].rstrip()
    return head + suffix

xpytools.xtool.txt.pad.pad

Python
pad(text: str, width: int = 20, align: str = 'left', fillchar: str = ' ', truncate: bool = True) -> str

Pad (and optionally truncate) a string to a fixed width.

Parameters:

Name Type Description Default

text

str

Input string to pad.

required

width

int

Desired total width of the output.

20

align

('left', 'right', 'center')

Alignment direction within the padded area.

"left"

fillchar

str

Character used for padding. Must be a single character.

" "

truncate

bool

Whether to truncate strings longer than the target width.

True

Returns:

Type Description
str

Padded (and possibly truncated) string.

Source code in xpytools/xtool/txt/pad.py
Python
def pad(
        text: str,
        width: int = 20,
        align: str = "left",
        fillchar: str = " ",
        truncate: bool = True,
        ) -> str:
    """
    Pad (and optionally truncate) a string to a fixed width.

    Parameters
    ----------
    text : str
        Input string to pad. Non-string values are converted with str();
        falsy non-string values become "".
    width : int, default=20
        Desired total width of the output. A width of zero or less produces
        "" when truncation is enabled.
    align : {"left", "right", "center"}, default="left"
        Alignment direction within the padded area. Any other value is
        treated as "left".
    fillchar : str, default=" "
        Character used for padding. Must be a single character.
    truncate : bool, default=True
        Whether to truncate strings longer than the target width.

    Returns
    -------
    str
        Padded (and possibly truncated) string.

    Raises
    ------
    ValueError
        If `fillchar` is empty or longer than one character.
    """
    if not isinstance(text, str):
        text = str(text or "")

    if not fillchar or len(fillchar) != 1:
        raise ValueError("fillchar must be a single character")

    if truncate and len(text) > width:
        # Clamp at zero: a negative bound would be read as an offset from the
        # end of the string and keep most of the text instead of cutting it.
        text = text[:max(width, 0)]

    if align == "right":
        return text.rjust(width, fillchar)
    if align == "center":
        return text.center(width, fillchar)
    # "left" and any unrecognized alignment.
    return text.ljust(width, fillchar)

xpytools.xtool.txt.split_lines.split_lines

Python
split_lines(text: str, width: int = 80) -> List[str]

Split txt into fixed-width lines without breaking words.

Parameters:

Name Type Description Default

text

str

Input string.

required

width

int

Maximum width per line.

80

Returns:

Type Description
list[str]

List of wrapped lines.

Source code in xpytools/xtool/txt/split_lines.py
Python
def split_lines(text: str, width: int = 80) -> List[str]:
    """
    Wrap txt into lines of at most `width` characters, keeping words whole.

    Parameters
    ----------
    text : str
        Input string. Any falsy value (None, "") yields an empty list.
    width : int, default=80
        Maximum width per line. A single word longer than this is kept
        intact on its own line rather than being split.

    Returns
    -------
    list[str]
        List of wrapped lines.
    """
    if text:
        trimmed = text.strip()
        # break_long_words=False lets over-long words exceed the width;
        # replace_whitespace=True turns each whitespace character into a space.
        return wrap(
            trimmed,
            width=width,
            break_long_words=False,
            replace_whitespace=True,
        )
    return []