123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197 |
- from __future__ import annotations
- import re
- import unicodedata
- from collections.abc import Iterable
- from html.entities import name2codepoint
- try:
- import unidecode
- except ImportError:
- import text_unidecode as unidecode
- __all__ = ['slugify', 'smart_truncate']
# Named character references such as "&amp;"; group 1 is the entity name.
CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
# Decimal character references such as "&#65;"; group 1 is the digits.
DECIMAL_PATTERN = re.compile(r'&#(\d+);')
# Hexadecimal character references such as "&#x41;"; group 1 is the hex digits.
# HTML accepts both "x" and "X" as the prefix, so both are matched.
HEX_PATTERN = re.compile(r'&#[xX]([\da-fA-F]+);')
# One or more consecutive apostrophes.
QUOTE_PATTERN = re.compile(r'[\']+')
# Runs of anything that is not an ASCII letter, an ASCII digit or a dash.
DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
# Runs of non-word characters and underscores (Unicode-aware for str patterns).
DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
# Two or more consecutive dashes.
DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
# A comma sitting between two digits, e.g. the commas in "1,000,000".
NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
DEFAULT_SEPARATOR = '-'
def smart_truncate(
    string: str,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = " ",
    save_order: bool = False,
) -> str:
    """
    Truncate a string.

    Leading and trailing separator characters are always stripped first.

    :param string (str): string for modification
    :param max_length (int): output string length; zero or a negative value
        means "no limit" and the stripped string is returned unchanged
    :param word_boundary (bool): if True, cut only between words so that no
        word is split; the result may be shorter than max_length
    :param separator (str): separator between words
    :param save_order (bool): if True, stop at the first word that does not
        fit, so the output keeps the input word order; if False, later
        (shorter) words that still fit are appended as well
    :return (str): the truncated string, without leading or trailing
        separator characters
    """
    string = string.strip(separator)

    # Nothing to do when there is no limit or the string already fits.
    if max_length <= 0 or len(string) <= max_length:
        return string

    if not word_boundary:
        return string[:max_length].strip(separator)

    if separator not in string:
        return string[:max_length]

    kept = []
    # Length of the output built so far, counting one separator after every
    # kept word (the final one is dropped when the words are joined).
    used = 0
    for word in string.split(separator):
        if not word:
            # empty piece produced by consecutive separators
            continue
        next_len = used + len(word)
        if next_len < max_length:
            kept.append(word)
            used = next_len + len(separator)
        elif next_len == max_length:
            kept.append(word)
            break
        elif save_order:
            break

    if not kept:  # pragma: no cover
        # Not even one whole word fits: fall back to a hard cut.
        return string[:max_length].strip(separator)

    return separator.join(kept).strip(separator)
def _numeric_reference_to_char(match: re.Match[str], base: int) -> str:
    """
    Convert one matched numeric character reference to its character.

    :param match: match whose group 1 holds the digits of the reference
    :param base (int): numeric base of the digits (10 or 16)
    :return (str): the referenced character, or the original reference text
        when the number is not a valid code point
    """
    try:
        return chr(int(match.group(1), base))
    except (ValueError, OverflowError):
        # Out-of-range or oversized number: leave this reference untouched
        # instead of abandoning the conversion of every other reference.
        return match.group(0)


def slugify(
    text: str,
    entities: bool = True,
    decimal: bool = True,
    hexadecimal: bool = True,
    max_length: int = 0,
    word_boundary: bool = False,
    separator: str = DEFAULT_SEPARATOR,
    save_order: bool = False,
    stopwords: Iterable[str] = (),
    regex_pattern: re.Pattern[str] | str | None = None,
    lowercase: bool = True,
    replacements: Iterable[Iterable[str]] = (),
    allow_unicode: bool = False,
) -> str:
    """
    Make a slug from the given text.

    :param text (str): initial text
    :param entities (bool): converts html entities to unicode
    :param decimal (bool): converts html decimal to unicode
    :param hexadecimal (bool): converts html hexadecimal to unicode
    :param max_length (int): output string length
    :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
    :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
    :param separator (str): separator between words
    :param stopwords (iterable): words to discount
    :param regex_pattern (str): regex pattern for disallowed characters
    :param lowercase (bool): activate case sensitivity by setting it to False
    :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']];
        applied once to the raw text and once more to the finished slug
    :param allow_unicode (bool): allow unicode characters
    :return (str): the slug
    """
    # ensure text is unicode before any str operation touches it
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # The rules are applied twice (here and near the end), so materialize
    # them once; a one-shot iterable would be empty on the second pass.
    rules = [tuple(rule) for rule in replacements] if replacements else []

    # user-specific replacements
    for old, new in rules:
        text = text.replace(old, new)

    # replace quotes with dashes - pre-process
    text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)

    # normalize text, convert to unicode if required
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)
        text = unidecode.unidecode(text)

    # ensure text is still in unicode
    if not isinstance(text, str):
        text = str(text, 'utf-8', 'ignore')

    # character entity reference
    if entities:
        text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)

    # decimal character reference
    if decimal:
        text = DECIMAL_PATTERN.sub(lambda m: _numeric_reference_to_char(m, 10), text)

    # hexadecimal character reference
    if hexadecimal:
        text = HEX_PATTERN.sub(lambda m: _numeric_reference_to_char(m, 16), text)

    # re normalize text
    if allow_unicode:
        text = unicodedata.normalize('NFKC', text)
    else:
        text = unicodedata.normalize('NFKD', text)

    # make the text lowercase (optional)
    if lowercase:
        text = text.lower()

    # remove generated quotes -- post-process
    text = QUOTE_PATTERN.sub('', text)

    # cleanup numbers
    text = NUMBERS_PATTERN.sub('', text)

    # replace all other unwanted characters
    if allow_unicode:
        pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
    else:
        pattern = regex_pattern or DISALLOWED_CHARS_PATTERN

    text = re.sub(pattern, DEFAULT_SEPARATOR, text)

    # remove redundant
    text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)

    # remove stopwords
    if stopwords:
        # Build a set once: membership is tested for every word, and a
        # one-shot iterable would be exhausted after the first test.
        if lowercase:
            blocked = {s.lower() for s in stopwords}
        else:
            blocked = set(stopwords)
        words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in blocked]
        text = DEFAULT_SEPARATOR.join(words)

    # finalize user-specific replacements
    for old, new in rules:
        text = text.replace(old, new)

    # smart truncate if requested
    if max_length > 0:
        text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)

    if separator != DEFAULT_SEPARATOR:
        text = text.replace(DEFAULT_SEPARATOR, separator)

    return text
|