# md.py — mess-detection plugins (source recovered from a line-numbered viewer dump).
from __future__ import annotations

from functools import lru_cache
from logging import getLogger

from .constant import (
    COMMON_SAFE_ASCII_CHARACTERS,
    TRACE,
    UNICODE_SECONDARY_RANGE_KEYWORD,
)
from .utils import (
    is_accentuated,
    is_arabic,
    is_arabic_isolated_form,
    is_case_variable,
    is_cjk,
    is_emoticon,
    is_hangul,
    is_hiragana,
    is_katakana,
    is_latin,
    is_punctuation,
    is_separator,
    is_symbol,
    is_thai,
    is_unprintable,
    remove_accent,
    unicode_range,
)
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover
  56. class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
  57. def __init__(self) -> None:
  58. self._punctuation_count: int = 0
  59. self._symbol_count: int = 0
  60. self._character_count: int = 0
  61. self._last_printable_char: str | None = None
  62. self._frenzy_symbol_in_word: bool = False
  63. def eligible(self, character: str) -> bool:
  64. return character.isprintable()
  65. def feed(self, character: str) -> None:
  66. self._character_count += 1
  67. if (
  68. character != self._last_printable_char
  69. and character not in COMMON_SAFE_ASCII_CHARACTERS
  70. ):
  71. if is_punctuation(character):
  72. self._punctuation_count += 1
  73. elif (
  74. character.isdigit() is False
  75. and is_symbol(character)
  76. and is_emoticon(character) is False
  77. ):
  78. self._symbol_count += 2
  79. self._last_printable_char = character
  80. def reset(self) -> None: # Abstract
  81. self._punctuation_count = 0
  82. self._character_count = 0
  83. self._symbol_count = 0
  84. @property
  85. def ratio(self) -> float:
  86. if self._character_count == 0:
  87. return 0.0
  88. ratio_of_punctuation: float = (
  89. self._punctuation_count + self._symbol_count
  90. ) / self._character_count
  91. return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
  92. class TooManyAccentuatedPlugin(MessDetectorPlugin):
  93. def __init__(self) -> None:
  94. self._character_count: int = 0
  95. self._accentuated_count: int = 0
  96. def eligible(self, character: str) -> bool:
  97. return character.isalpha()
  98. def feed(self, character: str) -> None:
  99. self._character_count += 1
  100. if is_accentuated(character):
  101. self._accentuated_count += 1
  102. def reset(self) -> None: # Abstract
  103. self._character_count = 0
  104. self._accentuated_count = 0
  105. @property
  106. def ratio(self) -> float:
  107. if self._character_count < 8:
  108. return 0.0
  109. ratio_of_accentuation: float = self._accentuated_count / self._character_count
  110. return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
  111. class UnprintablePlugin(MessDetectorPlugin):
  112. def __init__(self) -> None:
  113. self._unprintable_count: int = 0
  114. self._character_count: int = 0
  115. def eligible(self, character: str) -> bool:
  116. return True
  117. def feed(self, character: str) -> None:
  118. if is_unprintable(character):
  119. self._unprintable_count += 1
  120. self._character_count += 1
  121. def reset(self) -> None: # Abstract
  122. self._unprintable_count = 0
  123. @property
  124. def ratio(self) -> float:
  125. if self._character_count == 0:
  126. return 0.0
  127. return (self._unprintable_count * 8) / self._character_count
  128. class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
  129. def __init__(self) -> None:
  130. self._successive_count: int = 0
  131. self._character_count: int = 0
  132. self._last_latin_character: str | None = None
  133. def eligible(self, character: str) -> bool:
  134. return character.isalpha() and is_latin(character)
  135. def feed(self, character: str) -> None:
  136. self._character_count += 1
  137. if (
  138. self._last_latin_character is not None
  139. and is_accentuated(character)
  140. and is_accentuated(self._last_latin_character)
  141. ):
  142. if character.isupper() and self._last_latin_character.isupper():
  143. self._successive_count += 1
  144. # Worse if its the same char duplicated with different accent.
  145. if remove_accent(character) == remove_accent(self._last_latin_character):
  146. self._successive_count += 1
  147. self._last_latin_character = character
  148. def reset(self) -> None: # Abstract
  149. self._successive_count = 0
  150. self._character_count = 0
  151. self._last_latin_character = None
  152. @property
  153. def ratio(self) -> float:
  154. if self._character_count == 0:
  155. return 0.0
  156. return (self._successive_count * 2) / self._character_count
  157. class SuspiciousRange(MessDetectorPlugin):
  158. def __init__(self) -> None:
  159. self._suspicious_successive_range_count: int = 0
  160. self._character_count: int = 0
  161. self._last_printable_seen: str | None = None
  162. def eligible(self, character: str) -> bool:
  163. return character.isprintable()
  164. def feed(self, character: str) -> None:
  165. self._character_count += 1
  166. if (
  167. character.isspace()
  168. or is_punctuation(character)
  169. or character in COMMON_SAFE_ASCII_CHARACTERS
  170. ):
  171. self._last_printable_seen = None
  172. return
  173. if self._last_printable_seen is None:
  174. self._last_printable_seen = character
  175. return
  176. unicode_range_a: str | None = unicode_range(self._last_printable_seen)
  177. unicode_range_b: str | None = unicode_range(character)
  178. if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
  179. self._suspicious_successive_range_count += 1
  180. self._last_printable_seen = character
  181. def reset(self) -> None: # Abstract
  182. self._character_count = 0
  183. self._suspicious_successive_range_count = 0
  184. self._last_printable_seen = None
  185. @property
  186. def ratio(self) -> float:
  187. if self._character_count <= 13:
  188. return 0.0
  189. ratio_of_suspicious_range_usage: float = (
  190. self._suspicious_successive_range_count * 2
  191. ) / self._character_count
  192. return ratio_of_suspicious_range_usage
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Detect implausible "words" (runs of alpha characters): words that are
    mostly accentuated, words embedding a single CJK/Hangul/Kana/Thai glyph,
    very long words under foreign-character watch, and words carrying
    unexpected symbols.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        # Set while reading the current word; consumed when the word ends.
        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Accumulates the characters of the word currently being read.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0
        self._buffer_glyph_count: int = 0

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Arm the "foreign long word" watch on the first non-Latin (or
            # accentuated) letter that is not CJK/Hangul/Kana/Thai.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            if (
                is_cjk(character)
                or is_hangul(character)
                or is_katakana(character)
                or is_hiragana(character)
                or is_thai(character)
            ):
                self._buffer_glyph_count += 1
            return
        if not self._buffer:
            return
        if (
            character.isspace() or is_punctuation(character) or is_separator(character)
        ) and self._buffer:
            # A word just ended: judge the buffered word.
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                # Half or more of the letters accentuated: suspicious.
                if self._buffer_accent_count / buffer_length >= 0.5:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                elif (
                    is_accentuated(self._buffer[-1])
                    and self._buffer[-1].isupper()
                    and all(_.isupper() for _ in self._buffer) is False
                ):
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True
                # Exactly one CJK-style glyph mixed into the word: suspicious.
                elif self._buffer_glyph_count == 1:
                    self._is_current_word_bad = True
                    self._foreign_long_count += 1
            if buffer_length >= 24 and self._foreign_long_watch:
                # Long watched words are tolerated only if they look camelCased
                # (a minority of uppercase letters spread through the word).
                camel_case_dst = [
                    i
                    for c, i in zip(self._buffer, range(0, buffer_length))
                    if c.isupper()
                ]
                probable_camel_cased: bool = False

                if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
                    probable_camel_cased = True

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += len(self._buffer)
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
            self._buffer_glyph_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # An unexpected symbol inside a word taints the whole word.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # Abstract
        self._buffer = ""
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        # Require a minimal corpus unless a long foreign word was already seen.
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
  295. class CjkInvalidStopPlugin(MessDetectorPlugin):
  296. """
  297. GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
  298. can be easily detected. Searching for the overuse of '丅' and '丄'.
  299. """
  300. def __init__(self) -> None:
  301. self._wrong_stop_count: int = 0
  302. self._cjk_character_count: int = 0
  303. def eligible(self, character: str) -> bool:
  304. return True
  305. def feed(self, character: str) -> None:
  306. if character in {"丅", "丄"}:
  307. self._wrong_stop_count += 1
  308. return
  309. if is_cjk(character):
  310. self._cjk_character_count += 1
  311. def reset(self) -> None: # Abstract
  312. self._wrong_stop_count = 0
  313. self._cjk_character_count = 0
  314. @property
  315. def ratio(self) -> float:
  316. if self._cjk_character_count < 16:
  317. return 0.0
  318. return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Detect unusual upper/lower case alternation inside a chunk (e.g. "aAbBcC"),
    which rarely occurs in legitimate text.
    """

    def __init__(self) -> None:
        # True when one case flip has been seen and a second consecutive flip
        # would confirm an alternation.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: str | None = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # End of chunk: commit the running count only for short chunks
            # that contained at least one non-ASCII character.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    # Second consecutive case flip: count the pair.
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # Abstract
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
  378. class ArabicIsolatedFormPlugin(MessDetectorPlugin):
  379. def __init__(self) -> None:
  380. self._character_count: int = 0
  381. self._isolated_form_count: int = 0
  382. def reset(self) -> None: # Abstract
  383. self._character_count = 0
  384. self._isolated_form_count = 0
  385. def eligible(self, character: str) -> bool:
  386. return is_arabic(character)
  387. def feed(self, character: str) -> None:
  388. self._character_count += 1
  389. if is_arabic_isolated_form(character):
  390. self._isolated_form_count += 1
  391. @property
  392. def ratio(self) -> float:
  393. if self._character_count < 8:
  394. return 0.0
  395. isolated_form_usage: float = self._isolated_form_count / self._character_count
  396. return isolated_form_usage
  397. @lru_cache(maxsize=1024)
  398. def is_suspiciously_successive_range(
  399. unicode_range_a: str | None, unicode_range_b: str | None
  400. ) -> bool:
  401. """
  402. Determine if two Unicode range seen next to each other can be considered as suspicious.
  403. """
  404. if unicode_range_a is None or unicode_range_b is None:
  405. return True
  406. if unicode_range_a == unicode_range_b:
  407. return False
  408. if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
  409. return False
  410. if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
  411. return False
  412. # Latin characters can be accompanied with a combining diacritical mark
  413. # eg. Vietnamese.
  414. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
  415. "Combining" in unicode_range_a or "Combining" in unicode_range_b
  416. ):
  417. return False
  418. keywords_range_a, keywords_range_b = (
  419. unicode_range_a.split(" "),
  420. unicode_range_b.split(" "),
  421. )
  422. for el in keywords_range_a:
  423. if el in UNICODE_SECONDARY_RANGE_KEYWORD:
  424. continue
  425. if el in keywords_range_b:
  426. return False
  427. # Japanese Exception
  428. range_a_jp_chars, range_b_jp_chars = (
  429. unicode_range_a
  430. in (
  431. "Hiragana",
  432. "Katakana",
  433. ),
  434. unicode_range_b in ("Hiragana", "Katakana"),
  435. )
  436. if (range_a_jp_chars or range_b_jp_chars) and (
  437. "CJK" in unicode_range_a or "CJK" in unicode_range_b
  438. ):
  439. return False
  440. if range_a_jp_chars and range_b_jp_chars:
  441. return False
  442. if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
  443. if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
  444. return False
  445. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  446. return False
  447. # Chinese/Japanese use dedicated range for punctuation and/or separators.
  448. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
  449. unicode_range_a in ["Katakana", "Hiragana"]
  450. and unicode_range_b in ["Katakana", "Hiragana"]
  451. ):
  452. if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
  453. return False
  454. if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
  455. return False
  456. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  457. return False
  458. return True
  459. @lru_cache(maxsize=2048)
  460. def mess_ratio(
  461. decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
  462. ) -> float:
  463. """
  464. Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
  465. """
  466. detectors: list[MessDetectorPlugin] = [
  467. md_class() for md_class in MessDetectorPlugin.__subclasses__()
  468. ]
  469. length: int = len(decoded_sequence) + 1
  470. mean_mess_ratio: float = 0.0
  471. if length < 512:
  472. intermediary_mean_mess_ratio_calc: int = 32
  473. elif length <= 1024:
  474. intermediary_mean_mess_ratio_calc = 64
  475. else:
  476. intermediary_mean_mess_ratio_calc = 128
  477. for character, index in zip(decoded_sequence + "\n", range(length)):
  478. for detector in detectors:
  479. if detector.eligible(character):
  480. detector.feed(character)
  481. if (
  482. index > 0 and index % intermediary_mean_mess_ratio_calc == 0
  483. ) or index == length - 1:
  484. mean_mess_ratio = sum(dt.ratio for dt in detectors)
  485. if mean_mess_ratio >= maximum_threshold:
  486. break
  487. if debug:
  488. logger = getLogger("charset_normalizer")
  489. logger.log(
  490. TRACE,
  491. "Mess-detector extended-analysis start. "
  492. f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
  493. f"maximum_threshold={maximum_threshold}",
  494. )
  495. if len(decoded_sequence) > 16:
  496. logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
  497. logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
  498. for dt in detectors:
  499. logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
  500. return round(mean_mess_ratio, 3)