"""Regular expressions that match HTML elements.

The module builds its patterns bottom-up from small string fragments
(attribute name, attribute value, attribute, ...) and exposes two compiled
patterns, both anchored at the start of the input:

* ``HTML_TAG_RE`` -- an open tag, closing tag, comment, processing
  instruction, declaration or CDATA section.
* ``HTML_OPEN_CLOSE_TAG_RE`` -- an open or closing tag only.

The uncompiled source of the latter is available as
``HTML_OPEN_CLOSE_TAG_STR``.
"""
import re

# Attribute name: a letter, "_" or ":" followed by letters, digits or any
# of ":", ".", "_", "-".
attr_name = "[a-zA-Z_:][a-zA-Z0-9:._-]*"

# Unquoted attribute value: one or more characters, excluding quotes, "=",
# "<", ">", backtick and everything in the range \x00-\x20.
unquoted = r"""[^"'=<>`\x00-\x20]+"""
# Quoted attribute values: anything except the delimiting quote character.
single_quoted = "'[^']*'"
double_quoted = '"[^"]*"'

attr_value = "(?:" + "|".join((unquoted, single_quoted, double_quoted)) + ")"

# Attribute: mandatory leading whitespace, a name, and an optional
# "= value" part (whitespace allowed around the "=").
attribute = r"(?:\s+" + attr_name + r"(?:\s*=\s*" + attr_value + ")?)"

# Open tag: "<", tag name, any number of attributes, optional whitespace,
# optional "/" and ">".
open_tag = r"<[A-Za-z][A-Za-z0-9\-]*" + attribute + r"*\s*\/?>"

# Closing tag: "</", tag name, optional whitespace, ">".
close_tag = r"<\/[A-Za-z][A-Za-z0-9\-]*\s*>"

# Comment: either the empty comment "<!---->", or "<!--" text "-->" where
# the text is non-empty, does not start with ">" or "->", does not contain
# "--" and does not end with "-".
comment = "<!---->|<!--(?:-?[^>-])(?:-?[^-])*-->"

# Processing instruction: "<?" ... "?>", shortest match, newlines allowed.
processing = r"<[?][\s\S]*?[?]>"

# Declaration: "<!", one or more uppercase ASCII letters, whitespace, then
# anything up to the first ">".
declaration = r"<![A-Z]+\s+[^>]*>"

# CDATA section: "<![CDATA[" ... "]]>", shortest match, newlines allowed.
cdata = r"<!\[CDATA\[[\s\S]*?\]\]>"

# Any of the constructs above, anchored at the start of the input.
HTML_TAG_RE = re.compile(
    "^(?:"
    + "|".join((open_tag, close_tag, comment, processing, declaration, cdata))
    + ")"
)

# Open or closing tags only, anchored at the start of the input.
HTML_OPEN_CLOSE_TAG_STR = "^(?:" + open_tag + "|" + close_tag + ")"
HTML_OPEN_CLOSE_TAG_RE = re.compile(HTML_OPEN_CLOSE_TAG_STR)