# slugify.py
from __future__ import annotations

import re
import unicodedata
from collections.abc import Iterable
from html.entities import name2codepoint

try:
    import unidecode
except ImportError:
    import text_unidecode as unidecode

  10. __all__ = ['slugify', 'smart_truncate']
  11. CHAR_ENTITY_PATTERN = re.compile(r'&(%s);' % '|'.join(name2codepoint))
  12. DECIMAL_PATTERN = re.compile(r'&#(\d+);')
  13. HEX_PATTERN = re.compile(r'&#x([\da-fA-F]+);')
  14. QUOTE_PATTERN = re.compile(r'[\']+')
  15. DISALLOWED_CHARS_PATTERN = re.compile(r'[^-a-zA-Z0-9]+')
  16. DISALLOWED_UNICODE_CHARS_PATTERN = re.compile(r'[\W_]+')
  17. DUPLICATE_DASH_PATTERN = re.compile(r'-{2,}')
  18. NUMBERS_PATTERN = re.compile(r'(?<=\d),(?=\d)')
  19. DEFAULT_SEPARATOR = '-'
  20. def smart_truncate(
  21. string: str,
  22. max_length: int = 0,
  23. word_boundary: bool = False,
  24. separator: str = " ",
  25. save_order: bool = False,
  26. ) -> str:
  27. """
  28. Truncate a string.
  29. :param string (str): string for modification
  30. :param max_length (int): output string length
  31. :param word_boundary (bool):
  32. :param save_order (bool): if True then word order of output string is like input string
  33. :param separator (str): separator between words
  34. :return:
  35. """
  36. string = string.strip(separator)
  37. if not max_length:
  38. return string
  39. if len(string) < max_length:
  40. return string
  41. if not word_boundary:
  42. return string[:max_length].strip(separator)
  43. if separator not in string:
  44. return string[:max_length]
  45. truncated = ''
  46. for word in string.split(separator):
  47. if word:
  48. next_len = len(truncated) + len(word)
  49. if next_len < max_length:
  50. truncated += '{}{}'.format(word, separator)
  51. elif next_len == max_length:
  52. truncated += '{}'.format(word)
  53. break
  54. else:
  55. if save_order:
  56. break
  57. if not truncated: # pragma: no cover
  58. truncated = string[:max_length]
  59. return truncated.strip(separator)
  60. def slugify(
  61. text: str,
  62. entities: bool = True,
  63. decimal: bool = True,
  64. hexadecimal: bool = True,
  65. max_length: int = 0,
  66. word_boundary: bool = False,
  67. separator: str = DEFAULT_SEPARATOR,
  68. save_order: bool = False,
  69. stopwords: Iterable[str] = (),
  70. regex_pattern: re.Pattern[str] | str | None = None,
  71. lowercase: bool = True,
  72. replacements: Iterable[Iterable[str]] = (),
  73. allow_unicode: bool = False,
  74. ) -> str:
  75. """
  76. Make a slug from the given text.
  77. :param text (str): initial text
  78. :param entities (bool): converts html entities to unicode
  79. :param decimal (bool): converts html decimal to unicode
  80. :param hexadecimal (bool): converts html hexadecimal to unicode
  81. :param max_length (int): output string length
  82. :param word_boundary (bool): truncates to complete word even if length ends up shorter than max_length
  83. :param save_order (bool): if parameter is True and max_length > 0 return whole words in the initial order
  84. :param separator (str): separator between words
  85. :param stopwords (iterable): words to discount
  86. :param regex_pattern (str): regex pattern for disallowed characters
  87. :param lowercase (bool): activate case sensitivity by setting it to False
  88. :param replacements (iterable): list of replacement rules e.g. [['|', 'or'], ['%', 'percent']]
  89. :param allow_unicode (bool): allow unicode characters
  90. :return (str):
  91. """
  92. # user-specific replacements
  93. if replacements:
  94. for old, new in replacements:
  95. text = text.replace(old, new)
  96. # ensure text is unicode
  97. if not isinstance(text, str):
  98. text = str(text, 'utf-8', 'ignore')
  99. # replace quotes with dashes - pre-process
  100. text = QUOTE_PATTERN.sub(DEFAULT_SEPARATOR, text)
  101. # normalize text, convert to unicode if required
  102. if allow_unicode:
  103. text = unicodedata.normalize('NFKC', text)
  104. else:
  105. text = unicodedata.normalize('NFKD', text)
  106. text = unidecode.unidecode(text)
  107. # ensure text is still in unicode
  108. if not isinstance(text, str):
  109. text = str(text, 'utf-8', 'ignore')
  110. # character entity reference
  111. if entities:
  112. text = CHAR_ENTITY_PATTERN.sub(lambda m: chr(name2codepoint[m.group(1)]), text)
  113. # decimal character reference
  114. if decimal:
  115. try:
  116. text = DECIMAL_PATTERN.sub(lambda m: chr(int(m.group(1))), text)
  117. except Exception:
  118. pass
  119. # hexadecimal character reference
  120. if hexadecimal:
  121. try:
  122. text = HEX_PATTERN.sub(lambda m: chr(int(m.group(1), 16)), text)
  123. except Exception:
  124. pass
  125. # re normalize text
  126. if allow_unicode:
  127. text = unicodedata.normalize('NFKC', text)
  128. else:
  129. text = unicodedata.normalize('NFKD', text)
  130. # make the text lowercase (optional)
  131. if lowercase:
  132. text = text.lower()
  133. # remove generated quotes -- post-process
  134. text = QUOTE_PATTERN.sub('', text)
  135. # cleanup numbers
  136. text = NUMBERS_PATTERN.sub('', text)
  137. # replace all other unwanted characters
  138. if allow_unicode:
  139. pattern = regex_pattern or DISALLOWED_UNICODE_CHARS_PATTERN
  140. else:
  141. pattern = regex_pattern or DISALLOWED_CHARS_PATTERN
  142. text = re.sub(pattern, DEFAULT_SEPARATOR, text)
  143. # remove redundant
  144. text = DUPLICATE_DASH_PATTERN.sub(DEFAULT_SEPARATOR, text).strip(DEFAULT_SEPARATOR)
  145. # remove stopwords
  146. if stopwords:
  147. if lowercase:
  148. stopwords_lower = [s.lower() for s in stopwords]
  149. words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords_lower]
  150. else:
  151. words = [w for w in text.split(DEFAULT_SEPARATOR) if w not in stopwords]
  152. text = DEFAULT_SEPARATOR.join(words)
  153. # finalize user-specific replacements
  154. if replacements:
  155. for old, new in replacements:
  156. text = text.replace(old, new)
  157. # smart truncate if requested
  158. if max_length > 0:
  159. text = smart_truncate(text, max_length, word_boundary, DEFAULT_SEPARATOR, save_order)
  160. if separator != DEFAULT_SEPARATOR:
  161. text = text.replace(DEFAULT_SEPARATOR, separator)
  162. return text