parser.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. """Parser for configuration files (normally ``*.cfg/*.ini``)
  2. A configuration file consists of sections, led by a "[section]" header,
  3. and followed by "name: value" entries, with continuations and such in
  4. the style of RFC 822.
  5. The basic idea of **ConfigUpdater** is that a configuration file consists of
  6. three kinds of building blocks: sections, comments and spaces for separation.
  7. A section itself consists of three kinds of blocks: options, comments and
  8. spaces. This gives us the corresponding data structures to describe a
  9. configuration file.
  10. A general block object contains the lines which were parsed and make up
  11. the block. If a block object was not changed then during writing the same
  12. lines that were parsed will be used to express the block. In case a block,
  13. e.g. an option, was changed, it is marked as `updated` and its values will
  14. be transformed into a corresponding string during an update of a
  15. configuration file.
  16. .. note::
  17. ConfigUpdater is based on Python's ConfigParser source code, especially regarding the
  18. ``parser`` module.
  19. The main parsing rules and algorithm are preserved, however ConfigUpdater implements
  20. its own modified version of the abstract syntax tree to support retaining comments
  21. and whitespace in an attempt to provide format-preserving document manipulation.
  22. The copyright and license of the original ConfigParser code is included as an
  23. attachment to ConfigUpdater's own license, at the root of the source code repository;
  24. see the file LICENSE for details.
  25. """
  26. import io
  27. import os
  28. import re
  29. import sys
  30. from configparser import (
  31. DuplicateOptionError,
  32. DuplicateSectionError,
  33. MissingSectionHeaderError,
  34. NoOptionError,
  35. NoSectionError,
  36. ParsingError,
  37. )
  38. from types import MappingProxyType as ReadOnlyMapping
  39. from typing import Callable, Optional, Tuple, Type, TypeVar, Union, cast, overload
  40. if sys.version_info[:2] >= (3, 9): # pragma: no cover
  41. from collections.abc import Iterable, Mapping
  42. List = list
  43. Dict = dict
  44. else: # pragma: no cover
  45. from typing import Iterable, List, Dict, Mapping
  46. from .block import Comment, Space
  47. from .document import Document
  48. from .option import Option
  49. from .section import Section
# Public names of this module: the configparser exceptions imported above are
# re-exported next to the parser itself and its internal-error type.
__all__ = [
    "NoSectionError",
    "DuplicateOptionError",
    "DuplicateSectionError",
    "NoOptionError",
    "ParsingError",
    "MissingSectionHeaderError",
    "InconsistentStateError",
    "Parser",
]

# Type variables used in the signatures below:
# E is constrained to exceptions, D to ``Document`` (sub)classes.
T = TypeVar("T")
E = TypeVar("E", bound=Exception)
D = TypeVar("D", bound=Document)

# ``bytes`` is only part of the accepted path types on Python >= 3.7
if sys.version_info[:2] >= (3, 7):  # pragma: no cover
    PathLike = Union[str, bytes, os.PathLike]
else:  # pragma: no cover
    PathLike = Union[str, os.PathLike]

# The three kinds of building blocks named in the module docstring
# (forward references, hence the quoted names)
ConfigContent = Union["Section", "Comment", "Space"]
  68. class InconsistentStateError(Exception): # pragma: no cover (not expected to happen)
  69. """Internal parser error, some of the parsing algorithm assumptions was violated,
  70. and the internal state machine ended up in an unpredicted state.
  71. """
  72. def __init__(self, msg, fpname="<???>", lineno: int = -1, line: str = "???"):
  73. super().__init__(msg)
  74. self.args = (msg, fpname, lineno, line)
  75. def __str__(self):
  76. (msg, fpname, lineno, line) = self.args
  77. return f"{msg}\n{fpname}({lineno}): {line!r}"
  78. class Parser:
  79. """Parser for updating configuration files.
  80. ConfigUpdater's parser follows ConfigParser with some differences:
  81. * inline comments are treated as part of a key's value,
  82. * only a single config file can be updated at a time,
  83. * the original case of sections and keys are kept,
  84. * control over the position of a new section/key.
  85. Following features are **deliberately not** implemented:
  86. * interpolation of values,
  87. * propagation of parameters from the default section,
  88. * conversions of values,
  89. * passing key/value-pairs with ``default`` argument,
  90. * non-strict mode allowing duplicate sections and keys.
  91. """
  92. # Regular expressions for parsing section headers and options
  93. _SECT_TMPL: str = r"""
  94. \[ # [
  95. (?P<header>.+) # very permissive!
  96. \] # ]
  97. (?P<raw_comment>.*) # match any suffix
  98. """
  99. _OPT_TMPL: str = r"""
  100. (?P<option>.*?) # very permissive!
  101. \s*(?P<vi>{delim})\s* # any number of space/tab,
  102. # followed by any of the
  103. # allowed delimiters,
  104. # followed by any space/tab
  105. (?P<value>.*)$ # everything up to eol
  106. """
  107. _OPT_NV_TMPL: str = r"""
  108. (?P<option>.*?) # very permissive!
  109. \s*(?: # any number of space/tab,
  110. (?P<vi>{delim})\s* # optionally followed by
  111. # any of the allowed
  112. # delimiters, followed by any
  113. # space/tab
  114. (?P<value>.*))?$ # everything up to eol
  115. """
  116. # Compiled regular expression for matching sections
  117. SECTCRE = re.compile(_SECT_TMPL, re.VERBOSE)
  118. # Compiled regular expression for matching options with typical separators
  119. OPTCRE = re.compile(_OPT_TMPL.format(delim="=|:"), re.VERBOSE)
  120. # Compiled regular expression for matching options with optional values
  121. # delimited using typical separators
  122. OPTCRE_NV = re.compile(_OPT_NV_TMPL.format(delim="=|:"), re.VERBOSE)
  123. # Compiled regular expression for matching leading whitespace in a line
  124. NONSPACECRE = re.compile(r"\S")
  125. def __init__(
  126. self,
  127. allow_no_value=False,
  128. *,
  129. delimiters: Tuple[str, ...] = ("=", ":"),
  130. comment_prefixes: Tuple[str, ...] = ("#", ";"),
  131. inline_comment_prefixes: Optional[Tuple[str, ...]] = None,
  132. strict: bool = True,
  133. empty_lines_in_values: bool = True,
  134. space_around_delimiters: bool = True,
  135. optionxform: Callable[[str], str] = str,
  136. ):
  137. """Constructor of the Parser
  138. Args:
  139. allow_no_value (bool): allow keys without a value, default False
  140. delimiters (tuple): delimiters for key/value pairs, default =, :
  141. comment_prefixes (tuple): prefix of comments, default # and ;
  142. inline_comment_prefixes (tuple): prefix of inline comment,
  143. default None
  144. strict (bool): each section must be unique as well as every key
  145. within a section, default True
  146. empty_lines_in_values (bool): each empty line marks the end of an option.
  147. Otherwise, internal empty lines of a multiline option are kept as part
  148. of the value, default: True.
  149. space_around_delimiters (bool): add a space before and after the
  150. delimiter, default True
  151. """
  152. self._document: Document # bind later
  153. self._optionxform_fn = optionxform
  154. self._lineno = -1
  155. self._fpname = "<???>"
  156. self._filename: Optional[str] = None
  157. self._space_around_delimiters: bool = space_around_delimiters
  158. self._dict = dict # no reason to let the user change this
  159. # keeping _sections to keep code aligned with ConfigParser but
  160. # _document takes the actual role instead. Only use self._document!
  161. self._sections: Dict[str, Dict[str, List[str]]] = self._dict()
  162. self._delimiters: Tuple[str, ...] = tuple(delimiters)
  163. if delimiters == ("=", ":"):
  164. self._optcre = self.OPTCRE_NV if allow_no_value else self.OPTCRE
  165. else:
  166. d = "|".join(re.escape(d) for d in delimiters)
  167. if allow_no_value:
  168. self._optcre = re.compile(self._OPT_NV_TMPL.format(delim=d), re.VERBOSE)
  169. else:
  170. self._optcre = re.compile(self._OPT_TMPL.format(delim=d), re.VERBOSE)
  171. self._comment_prefixes: Tuple[str, ...] = tuple(comment_prefixes or ())
  172. self._inline_comment_prefixes: Tuple[str, ...] = tuple(
  173. inline_comment_prefixes or ()
  174. )
  175. self._strict = strict
  176. self._allow_no_value = allow_no_value
  177. self._empty_lines_in_values = empty_lines_in_values
  178. def _get_args(self) -> dict:
  179. args = (
  180. "allow_no_value",
  181. "delimiters",
  182. "comment_prefixes",
  183. "inline_comment_prefixes",
  184. "strict",
  185. "empty_lines_in_values",
  186. "space_around_delimiters",
  187. )
  188. return {attr: getattr(self, f"_{attr}") for attr in args}
  189. def __repr__(self) -> str:
  190. return f"<{self.__class__.__name__}: {self._get_args()!r}>"
  191. @property
  192. def syntax_options(self) -> Mapping:
  193. return ReadOnlyMapping(self._get_args())
  194. @overload
  195. def read(self, filename: PathLike, encoding: Optional[str] = None) -> Document:
  196. ...
  197. @overload
  198. def read(self, filename: PathLike, encoding: str, into: D) -> D:
  199. ...
  200. @overload
  201. def read(self, filename: PathLike, *, into: D, encoding: Optional[str] = None) -> D:
  202. ...
  203. def read(self, filename, encoding=None, into=None):
  204. """Read and parse a filename.
  205. Args:
  206. filename (str): path to file
  207. encoding (Optional[str]): encoding of file, default None
  208. into (Optional[Document]): object to be populated with the parsed config
  209. """
  210. document = Document() if into is None else into
  211. with open(filename, encoding=encoding) as fp:
  212. self._read(fp, str(filename), document)
  213. self._filename = os.path.abspath(filename)
  214. return document
  215. @overload
  216. def read_file(self, f: Iterable[str], source: Optional[str]) -> Document:
  217. ...
  218. @overload
  219. def read_file(self, f: Iterable[str], source: Optional[str], into: D) -> D:
  220. ...
  221. @overload
  222. def read_file(
  223. self, f: Iterable[str], *, into: D, source: Optional[str] = None
  224. ) -> D:
  225. ...
  226. def read_file(self, f, source=None, into=None):
  227. """Like read() but the argument must be a file-like object.
  228. The ``f`` argument must be iterable, returning one line at a time.
  229. Optional second argument is the ``source`` specifying the name of the
  230. file being read. If not given, it is taken from f.name. If ``f`` has no
  231. ``name`` attribute, ``<???>`` is used.
  232. Args:
  233. f: file like object
  234. source (Optional[str]): reference name for file object, default None
  235. into (Optional[Document]): object to be populated with the parsed config
  236. """
  237. if isinstance(f, str):
  238. raise RuntimeError("f must be a file-like object, not string!")
  239. document = Document() if into is None else into
  240. if source is None:
  241. try:
  242. source = cast(str, cast(io.FileIO, f).name)
  243. except AttributeError:
  244. source = "<???>"
  245. self._read(f, source, document)
  246. return document
  247. @overload
  248. def read_string(self, string: str, source: str = "<string>") -> Document:
  249. ...
  250. @overload
  251. def read_string(self, string: str, source: str, into: D) -> D:
  252. ...
  253. @overload
  254. def read_string(self, string: str, *, into: D, source: str = "<string>") -> D:
  255. ...
  256. def read_string(self, string, source="<string>", into=None):
  257. """Read configuration from a given string.
  258. Args:
  259. string (str): string containing a configuration
  260. source (str): reference name for file object, default '<string>'
  261. into (Optional[Document]): object to be populated with the parsed config
  262. """
  263. sfile = io.StringIO(string)
  264. return self.read_file(sfile, source, into)
  265. def optionxform(self, string: str) -> str:
  266. fn = self._optionxform_fn
  267. return fn(string)
  268. @property
  269. def _last_block(self):
  270. return self._document.last_block
  271. def _update_curr_block(
  272. self, block_type: Type[Union[Comment, Space]]
  273. ) -> Union[Comment, Space]:
  274. if isinstance(self._last_block, block_type):
  275. return self._last_block
  276. else:
  277. new_block = block_type(container=self._document)
  278. self._document.append(new_block)
  279. return new_block
  280. def _add_comment(self, line: str):
  281. if isinstance(self._last_block, Section):
  282. self._last_block.add_comment(line)
  283. else:
  284. self._update_curr_block(Comment).add_line(line)
  285. def _add_section(self, sectname: str, raw_comment: str, line: str):
  286. new_section = Section(
  287. sectname, container=self._document, raw_comment=raw_comment
  288. )
  289. new_section.add_line(line)
  290. self._document.append(new_section)
  291. def _add_option(self, key: str, vi: str, value: Optional[str], line: str):
  292. if not isinstance(self._last_block, Section): # pragma: no cover
  293. msg = f"{self._last_block!r} should be Section"
  294. raise InconsistentStateError(msg, self._fpname, self._lineno, line)
  295. entry = Option(
  296. key,
  297. value=None,
  298. delimiter=vi,
  299. container=self._last_block,
  300. space_around_delimiters=self._space_around_delimiters,
  301. line=line,
  302. )
  303. # Initially add the value as further lines might follow
  304. entry.add_value(value)
  305. self._last_block.add_option(entry)
  306. def _add_option_line(self, line: str):
  307. last_section = self._last_block
  308. if not isinstance(last_section, Section): # pragma: no cover
  309. msg = f"{last_section!r} should be Section"
  310. raise InconsistentStateError(msg, self._fpname, self._lineno, line)
  311. # if empty_lines_in_values is true, we later will merge options and whitespace
  312. # (in the _check_values_with_blank_lines function called at the end).
  313. # This allows option values to have empty new lines inside them
  314. # So for now we can add parts of option values to Space nodes, than we check if
  315. # that is an error or not.
  316. last_option = last_section.last_block
  317. # handle special case of unindented comment in multi-line value
  318. if isinstance(last_option, Comment):
  319. last_option, comment = (
  320. cast(Option, last_option.previous_block),
  321. last_option.detach(),
  322. )
  323. # move lines from comment to last option to keep it.
  324. for comment_line in comment.lines:
  325. last_option.add_line(comment_line)
  326. if not isinstance(last_option, (Option, Space)): # pragma: no cover
  327. msg = f"{last_option!r} should be Option or Space"
  328. raise InconsistentStateError(msg, self._fpname, self._lineno, line)
  329. last_option.add_line(line)
  330. def _add_space(self, line: str):
  331. if isinstance(self._last_block, Section):
  332. self._last_block.add_space(line)
  333. else:
  334. self._update_curr_block(Space).add_line(line)
  335. def _read(self, fp: Iterable[str], fpname: str, into: Document):
  336. """Parse a sectioned configuration file.
  337. Each section in a configuration file contains a header, indicated by
  338. a name in square brackets (`[]`), plus key/value options, indicated by
  339. `name` and `value` delimited with a specific substring (`=` or `:` by
  340. default).
  341. Values can span multiple lines, as long as they are indented deeper
  342. than the first line of the value. Depending on the parser's mode, blank
  343. lines may be treated as parts of multiline values or ignored.
  344. Configuration files may include comments, prefixed by specific
  345. characters (`#` and `;` by default). Comments may appear on their own
  346. in an otherwise empty line or may be entered in lines holding values or
  347. section names.
  348. Note: This method was borrowed from ConfigParser and we keep this
  349. mess here as close as possible to the original messod (pardon
  350. this german pun) for consistency reasons and later upgrades.
  351. """
  352. self._document = into
  353. elements_added: set = set()
  354. cursect: Optional[Dict[str, List[str]]] = None # None or dict
  355. sectname: Optional[str] = None
  356. optname: Optional[str] = None
  357. lineno = 0
  358. indent_level = 0
  359. e: Optional[Exception] = None # None, or an exception
  360. self._fpname = fpname
  361. for lineno, line in enumerate(fp, start=1):
  362. self._lineno = lineno
  363. comment_start: Optional[int] = sys.maxsize
  364. # strip inline comments
  365. inline_prefixes = {p: -1 for p in self._inline_comment_prefixes}
  366. while comment_start == sys.maxsize and inline_prefixes:
  367. next_prefixes = {}
  368. for prefix, index in inline_prefixes.items():
  369. index = line.find(prefix, index + 1)
  370. if index == -1:
  371. continue
  372. next_prefixes[prefix] = index
  373. if index == 0 or (index > 0 and line[index - 1].isspace()):
  374. comment_start = min(comment_start, index)
  375. inline_prefixes = next_prefixes
  376. # strip full line comments
  377. for prefix in self._comment_prefixes:
  378. # configparser would do line.strip() here,
  379. # we do rstrip() to allow comments in multi-line options
  380. if line.rstrip().startswith(prefix):
  381. comment_start = 0
  382. self._add_comment(line) # HOOK
  383. break
  384. if comment_start == sys.maxsize:
  385. comment_start = None
  386. value = line[:comment_start].strip()
  387. if not value:
  388. if self._empty_lines_in_values:
  389. # add empty line to the value, but only if there was no
  390. # comment on the line
  391. if (
  392. comment_start is None
  393. and cursect is not None
  394. and optname
  395. and cursect[optname] is not None
  396. ):
  397. cursect[optname].append("") # newlines added at join
  398. if line.strip():
  399. self._add_option_line(line) # HOOK
  400. else:
  401. # empty line marks end of value
  402. indent_level = sys.maxsize
  403. if comment_start is None:
  404. self._add_space(line)
  405. continue
  406. # continuation line?
  407. first_nonspace = self.NONSPACECRE.search(line)
  408. cur_indent_level = first_nonspace.start() if first_nonspace else 0
  409. if cursect is not None and optname and cur_indent_level > indent_level:
  410. cursect[optname].append(value)
  411. self._add_option_line(line) # HOOK
  412. # a section header or option header?
  413. else:
  414. indent_level = cur_indent_level
  415. # is it a section header?
  416. mo = self.SECTCRE.match(value)
  417. if mo:
  418. sectname = mo.group("header")
  419. if sectname in self._sections:
  420. if self._strict and sectname in elements_added:
  421. raise DuplicateSectionError(sectname, fpname, lineno)
  422. cursect = self._sections[sectname]
  423. elements_added.add(sectname)
  424. else:
  425. cursect = self._dict()
  426. self._sections[sectname] = cursect
  427. elements_added.add(sectname)
  428. # So sections can't start with a continuation line
  429. optname = None
  430. self._add_section(sectname, mo.group("raw_comment"), line) # HOOK
  431. # no section header in the file?
  432. elif cursect is None:
  433. raise MissingSectionHeaderError(fpname, lineno, line)
  434. # an option line?
  435. else:
  436. mo = self._optcre.match(value)
  437. if mo:
  438. optname, vi, optval = mo.group("option", "vi", "value")
  439. if not optname:
  440. e = self._handle_error(e, fpname, lineno, line)
  441. # optname = self.optionxform(optname.rstrip())
  442. # keep original case of key
  443. optname = optname.rstrip()
  444. if sectname is None: # pragma: no cover
  445. msg = f"Could not find the section name for {optname}"
  446. raise InconsistentStateError(msg, fpname, lineno, line)
  447. if self._strict and (sectname, optname) in elements_added:
  448. args = (sectname, optname, fpname, lineno)
  449. raise DuplicateOptionError(*args)
  450. elements_added.add((sectname, optname))
  451. # This check is fine because the OPTCRE cannot
  452. # match if it would set optval to None
  453. if optval is not None:
  454. optval = optval.strip()
  455. cursect[optname] = [optval]
  456. else:
  457. # valueless option handling
  458. cursect[optname] = [] # None in Configparser
  459. self._add_option(optname, vi, optval, line) # HOOK
  460. # handle indented comment
  461. elif (
  462. first_nonspace is not None
  463. and first_nonspace.group(0) in self._comment_prefixes
  464. ):
  465. self._add_comment(line) # HOOK
  466. else:
  467. # a non-fatal parsing error occurred. set up the
  468. # exception but keep going. the exception will be
  469. # raised at the end of the file and will contain a
  470. # list of all bogus lines
  471. e = self._handle_error(e, fpname, lineno, line)
  472. # if any parsing errors occurred, raise an exception
  473. if e:
  474. raise e
  475. # if empty_lines_in_values is true, we have to eliminate spurious newlines
  476. if self._empty_lines_in_values:
  477. self._check_values_with_blank_lines()
  478. def _handle_error(
  479. self, exc: Optional[E], fpname: str, lineno: int, line: str
  480. ) -> Union[ParsingError, E]:
  481. e = exc or ParsingError(fpname)
  482. if hasattr(e, "append"):
  483. e.append(lineno, repr(line))
  484. # ^ the typechecker cannot handle hasattr
  485. return e
  486. def _check_values_with_blank_lines(self):
  487. for section in self._document.section_blocks():
  488. for option in section.option_blocks():
  489. next_block = option.next_block
  490. if isinstance(next_block, Space):
  491. # check if space is part of a multi-line value with blank lines
  492. if "".join(next_block.lines).strip():
  493. self._merge_option_with_space(option, next_block)
  494. def _merge_option_with_space(self, option: Option, space: Space):
  495. last_val_idx = max(i for i, line in enumerate(space.lines) if line.strip())
  496. value_lines = space.lines[: last_val_idx + 1]
  497. merge_vals = "".join(line.lstrip(" ") for line in value_lines)
  498. option._values.append(merge_vals)
  499. option._multiline_value_joined = False
  500. option.lines.extend(space.lines[: last_val_idx + 1])
  501. del space.lines[: last_val_idx + 1]