Text Processing¶
String manipulation, cleaning, and formatting utilities.
xpytools.xtool.txt.clean.clean
¶
Clean and normalize txt with or without cleantext dependency.
• Uses cleantext.clean() if available.
• Falls back to a Unicode-safe normalization, HTML/ASCII stripping, and basic cleanup.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `text` | Any | Input txt (converted to str). | required |
| `lowercase` | bool | Whether to lowercase output. | False |
Returns:
| Type | Description |
|---|---|
| str \| None | Cleaned string, or None if txt is empty/None. |
Source code in xpytools/xtool/txt/clean.py
@requireModules(["cleantext"], exc_raise=False)
def clean(text: Any, *, lowercase: bool = False) -> str | None:
    """
    Clean and normalize text, with or without the optional `cleantext` package.

    • Uses `cleantext.clean()` when it can be imported and runs without error.
    • Otherwise falls back to NFKC normalization, HTML-tag stripping and
      removal of every character outside tab, LF, CR and printable ASCII.

    Parameters
    ----------
    text : Any
        Input text (converted to str).
    lowercase : bool, default=False
        Whether to lowercase output.

    Returns
    -------
    str | None
        Cleaned string, or None if the input is None or the cleaned result
        is empty (on either code path).
    """
    # NOTE(review): what `requireModules(..., exc_raise=False)` does when
    # `cleantext` is missing is not visible here — confirm the fallback below
    # is still reachable in that case.
    if text is None:
        return None
    text = str(text)
    try:
        # Aliased so the import does not shadow this function's own name.
        from cleantext import clean as _cleantext_clean  # noqa

        cleaned = _cleantext_clean(
            text,
            fix_unicode=True,
            to_ascii=False,
            lower=lowercase,
            no_line_breaks=False,
            no_urls=True,
            no_emails=True,
            no_phone_numbers=True,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=True,
            no_punct=False,
            lang="en",
        )
    except Exception:
        # Deliberate best-effort: any import or runtime failure of cleantext
        # drops to the simple built-in cleaner instead of propagating.
        cleaned = unicodedata.normalize("NFKC", text)
        cleaned = re.sub(r"<[^>]+>", "", cleaned)  # strip HTML tags
        # Keep only tab, LF, CR and printable ASCII (0x20-0x7E); this removes
        # ALL non-ASCII characters, not just control characters.
        cleaned = re.sub(r"[^\x09\x0A\x0D\x20-\x7E]", "", cleaned)
        cleaned = cleaned.strip()
        if lowercase:
            cleaned = cleaned.lower()
    # Both paths normalize an empty result to None, as documented.
    return cleaned or None
xpytools.xtool.txt.strip_html.strip_html
¶
Remove HTML tags and entities.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `text` | str | | required |
Returns:
| Type | Description |
|---|---|
| str | Text with all HTML tags removed. |
Source code in xpytools/xtool/txt/strip_html.py
def strip_html(text: str) -> str:
    """
    Remove HTML tags and decode a handful of common entities.

    Parameters
    ----------
    text : str
        Input string; any falsy value (``""``, ``None``) yields ``""``.

    Returns
    -------
    str
        Text with tags removed, the entities ``&nbsp;``, ``&amp;``, ``&lt;``,
        ``&gt;`` and ``&quot;`` decoded, and surrounding whitespace stripped.
    """
    if not text:
        return ""
    # Remove tags (_TAG_RE is a module-level pattern defined outside this block).
    text = _TAG_RE.sub("", text)
    # Decode common entities. The entity names must be spelled out literally:
    # with the decoded characters in their place each replace() is a no-op.
    text = text.replace("&nbsp;", " ").replace("&amp;", "&")
    text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&quot;", '"')
    return text.strip()
xpytools.xtool.txt.strip_ascii.strip_ascii
¶
Remove non-ASCII characters (optionally keeping punctuation and spaces).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `text` | str | Input string. | required |
| `keep_basic_symbols` | bool | If False, removes everything outside [A-Za-z0-9 ]. | True |
Returns:
| Type | Description |
|---|---|
| str | ASCII-only txt. |
Source code in xpytools/xtool/txt/strip_ascii.py
def strip_ascii(text: str, keep_basic_symbols: bool = True) -> str:
    """
    Remove non-ASCII characters (optionally keeping punctuation and spaces).

    Parameters
    ----------
    text : str
        Input string. Non-string values are converted with ``str()`` (falsy
        ones become ``""``) and then filtered like any other input.
    keep_basic_symbols : bool, default=True
        If True, every ASCII character is kept. If False, removes everything
        outside [A-Za-z0-9 ].

    Returns
    -------
    str
        ASCII-only text.
    """
    if not isinstance(text, str):
        # Coerce, then fall through to the filters below so the result is
        # ASCII-only for non-string input as well.
        text = str(text or "")
    if keep_basic_symbols:
        # Round-trip through ASCII, silently dropping anything unencodable.
        return text.encode("ascii", "ignore").decode("ascii")
    return re.sub(r"[^A-Za-z0-9 ]+", "", text)
xpytools.xtool.txt.truncate.truncate
¶
Truncate txt safely and append ellipsis if needed.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `text` | str | Input string. | required |
| `limit` | int | Maximum length before truncation. | 120 |
| `suffix` | str | Suffix to indicate truncation. | "…" |
Returns:
| Type | Description |
|---|---|
| str | Possibly truncated string. |
Source code in xpytools/xtool/txt/truncate.py
def truncate(text: str, limit: int = 120, suffix: str = "…") -> str:
    """
    Shorten text that exceeds ``limit`` characters and mark the cut.

    Parameters
    ----------
    text : str
        Input string; ``None`` yields ``""`` and other values go through
        ``str()``.
    limit : int, default=120
        Maximum length before truncation.
    suffix : str, default="…"
        Marker appended after the cut.

    Returns
    -------
    str
        The input unchanged when it fits; otherwise its first ``limit``
        characters with trailing whitespace removed, followed by ``suffix``.
    """
    if text is None:
        return ""
    value = str(text)
    if len(value) > limit:
        # Trim trailing whitespace so the suffix sits directly after a word.
        head = value[:limit].rstrip()
        return head + suffix
    return value
xpytools.xtool.txt.pad.pad
¶
pad(text: str, width: int = 20, align: str = 'left', fillchar: str = ' ', truncate: bool = True) -> str
Pad (and optionally truncate) a string to a fixed width.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `text` | str | Input string to pad. | required |
| `width` | int | Desired total width of the output. | 20 |
| `align` | ('left', 'right', 'center') | Alignment direction within the padded area. | "left" |
| `fillchar` | str | Character used for padding. Must be a single character. | " " |
| `truncate` | bool | Whether to truncate strings longer than the target width. | True |
Returns:
| Type | Description |
|---|---|
| str | Padded (and possibly truncated) string. |
Source code in xpytools/xtool/txt/pad.py
def pad(
    text: str,
    width: int = 20,
    align: str = "left",
    fillchar: str = " ",
    truncate: bool = True,
) -> str:
    """
    Fit a string into a fixed-width field, padding and optionally cutting it.

    Parameters
    ----------
    text : str
        Input string to pad. Non-string values are converted with ``str()``
        (falsy ones become ``""``).
    width : int, default=20
        Desired total width of the output.
    align : {"left", "right", "center"}, default="left"
        Where the text sits inside the field. Any other value behaves like
        ``"left"``.
    fillchar : str, default=" "
        Character used for padding. Must be a single character.
    truncate : bool, default=True
        Whether to cut strings longer than ``width`` down to ``width``.

    Returns
    -------
    str
        Padded (and possibly truncated) string.

    Raises
    ------
    ValueError
        If ``fillchar`` is empty or longer than one character.
    """
    if not isinstance(text, str):
        text = str(text or "")

    if not (fillchar and len(fillchar) == 1):
        raise ValueError("fillchar must be a single character")

    needs_cut = truncate and len(text) > width
    if needs_cut:
        text = text[:width]

    # Pick the justification method once; anything unrecognised pads on the
    # right, i.e. left-aligns the text.
    if align == "right":
        justify = text.rjust
    elif align == "center":
        justify = text.center
    else:
        justify = text.ljust
    return justify(width, fillchar)
xpytools.xtool.txt.split_lines.split_lines
¶
Split txt into fixed-width lines without breaking words.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `text` | str | Input string. | required |
| `width` | int | Maximum width per line. | 80 |
Returns:
| Type | Description |
|---|---|
| list[str] | List of wrapped lines. |
Source code in xpytools/xtool/txt/split_lines.py
def split_lines(text: str, width: int = 80) -> List[str]:
    """
    Wrap text into lines of at most ``width`` characters, keeping words whole.

    Parameters
    ----------
    text : str
        Input string; a falsy value yields an empty list.
    width : int, default=80
        Maximum width per line. A single word longer than this is kept
        intact on its own line rather than being split.

    Returns
    -------
    list[str]
        List of wrapped lines.
    """
    if text:
        trimmed = text.strip()
        # Whitespace characters (including newlines) are turned into spaces
        # before wrapping; over-long words are never broken.
        return wrap(
            trimmed,
            width=width,
            replace_whitespace=True,
            break_long_words=False,
        )
    return []