12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744 |
- """
- babel.messages.pofile
- ~~~~~~~~~~~~~~~~~~~~~
- Reading and writing of files in the ``gettext`` PO (portable object)
- format.
- :copyright: (c) 2013-2025 by the Babel Team.
- :license: BSD, see LICENSE for more details.
- """
- from __future__ import annotations
- import os
- import re
- from collections.abc import Iterable
- from typing import TYPE_CHECKING, Literal
- from babel.core import Locale
- from babel.messages.catalog import Catalog, Message
- from babel.util import TextWrapper, _cmp
- if TYPE_CHECKING:
- from typing import IO, AnyStr
- from _typeshed import SupportsWrite
def unescape(string: str) -> str:
    r"""Undo the quoting performed by `escape`.

    The first and last character of `string` (the enclosing double quotes)
    are dropped, then the backslash sequences ``\n``, ``\t``, ``\r``,
    ``\\`` and ``\"`` are replaced by the characters they denote.

    >>> print(unescape('"Say:\\n \\"hello, world!\\"\\n"'))
    Say:
     "hello, world!"
    <BLANKLINE>

    :param string: the string to unescape
    """
    control_chars = {'n': '\n', 't': '\t', 'r': '\r'}

    def _decode(match):
        char = match.group(1)
        # A backslash or a double quote simply stands for itself.
        return control_chars.get(char, char)

    unquoted = string[1:-1]
    return re.sub(r'\\([\\trn"])', _decode, unquoted)
def denormalize(string: str) -> str:
    r"""Turn the output of `normalize` back into the plain string.

    A single-line value is simply unescaped.  A multi-line value is split
    into its physical lines, the leading empty ``""`` line (if present) is
    discarded, and the unescaped pieces are concatenated.

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"hello, world!\"\n"'''))
    Say:
     "hello, world!"
    <BLANKLINE>

    >>> print(denormalize(r'''""
    ... "Say:\n"
    ... " \"Lorem ipsum dolor sit "
    ... "amet, consectetur adipisicing"
    ... " elit, \"\n"'''))
    Say:
     "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    <BLANKLINE>

    :param string: the string to denormalize
    """
    if '\n' not in string:
        return unescape(string)

    pieces = string.splitlines()
    if string.startswith('""'):
        # Skip the empty first line that introduces a multi-line string.
        pieces = pieces[1:]
    return ''.join(unescape(piece) for piece in pieces)
- def _extract_locations(line: str) -> list[str]:
- """Extract locations from location comments.
- Locations are extracted while properly handling First Strong
- Isolate (U+2068) and Pop Directional Isolate (U+2069), used by
- gettext to enclose filenames with spaces and tabs in their names.
- """
- if "\u2068" not in line and "\u2069" not in line:
- return line.lstrip().split()
- locations = []
- location = ""
- in_filename = False
- for c in line:
- if c == "\u2068":
- if in_filename:
- raise ValueError("location comment contains more First Strong Isolate "
- "characters, than Pop Directional Isolate characters")
- in_filename = True
- continue
- elif c == "\u2069":
- if not in_filename:
- raise ValueError("location comment contains more Pop Directional Isolate "
- "characters, than First Strong Isolate characters")
- in_filename = False
- continue
- elif c == " ":
- if in_filename:
- location += c
- elif location:
- locations.append(location)
- location = ""
- else:
- location += c
- else:
- if location:
- if in_filename:
- raise ValueError("location comment contains more First Strong Isolate "
- "characters, than Pop Directional Isolate characters")
- locations.append(location)
- return locations
class PoFileError(Exception):
    """Exception thrown by PoParser when an invalid po file is encountered.

    The offending catalog, line text and line number are kept on the
    instance as ``catalog``, ``line`` and ``lineno``.
    """

    def __init__(self, message: str, catalog: Catalog, line: str, lineno: int) -> None:
        # Record the context first, then build the human readable message.
        self.lineno = lineno
        self.line = line
        self.catalog = catalog
        super().__init__(f'{message} on {lineno}')
- class _NormalizedString:
- def __init__(self, *args: str) -> None:
- self._strs: list[str] = []
- for arg in args:
- self.append(arg)
- def append(self, s: str) -> None:
- self._strs.append(s.strip())
- def denormalize(self) -> str:
- return ''.join(map(unescape, self._strs))
- def __bool__(self) -> bool:
- return bool(self._strs)
- def __repr__(self) -> str:
- return os.linesep.join(self._strs)
- def __cmp__(self, other: object) -> int:
- if not other:
- return 1
- return _cmp(str(self), str(other))
- def __gt__(self, other: object) -> bool:
- return self.__cmp__(other) > 0
- def __lt__(self, other: object) -> bool:
- return self.__cmp__(other) < 0
- def __ge__(self, other: object) -> bool:
- return self.__cmp__(other) >= 0
- def __le__(self, other: object) -> bool:
- return self.__cmp__(other) <= 0
- def __eq__(self, other: object) -> bool:
- return self.__cmp__(other) == 0
- def __ne__(self, other: object) -> bool:
- return self.__cmp__(other) != 0
class PoFileParser:
    """Support class to read messages from a ``gettext`` PO (portable object)
    file and add them to a `Catalog`.

    The parser is line oriented: `parse` feeds every line to the matching
    ``_process_*`` method, which accumulates the pieces of the current
    message until the next message starts, at which point the finished
    message is stored in the catalog.

    See `read_po` for simple cases.
    """

    # Keywords that may introduce a message line.
    _keywords = [
        'msgid',
        'msgstr',
        'msgctxt',
        'msgid_plural',
    ]

    def __init__(self, catalog: Catalog, ignore_obsolete: bool = False, abort_invalid: bool = False) -> None:
        """
        :param catalog: the catalog that receives the parsed messages
        :param ignore_obsolete: do not store obsolete (``#~``) messages
        :param abort_invalid: raise `PoFileError` on invalid input instead
                              of printing a warning
        """
        self.catalog = catalog
        self.ignore_obsolete = ignore_obsolete
        self.counter = 0  # number of messages processed so far
        self.offset = 0  # line index of the current message's msgid
        self.abort_invalid = abort_invalid
        self._reset_message_state()

    def _reset_message_state(self) -> None:
        """Forget everything collected for the message being read."""
        self.messages = []
        self.translations = []
        self.locations = []
        self.flags = []
        self.user_comments = []
        self.auto_comments = []
        self.context = None
        self.obsolete = False
        self.in_msgid = False
        self.in_msgstr = False
        self.in_msgctxt = False

    def _add_message(self) -> None:
        """
        Add a message to the catalog based on the current parser state and
        clear the state ready to process the next message.
        """
        self.translations.sort()
        if len(self.messages) > 1:
            msgid = tuple(m.denormalize() for m in self.messages)
        else:
            msgid = self.messages[0].denormalize()
        if isinstance(msgid, (list, tuple)):
            # Plural message: one slot per plural form of the catalog.
            string = ['' for _ in range(self.catalog.num_plurals)]
            for idx, translation in self.translations:
                if idx >= self.catalog.num_plurals:
                    self._invalid_pofile("", self.offset, "msg has more translations than num_plurals of catalog")
                    continue
                string[idx] = translation.denormalize()
            string = tuple(string)
        else:
            string = self.translations[0][1].denormalize()
        msgctxt = self.context.denormalize() if self.context else None
        message = Message(msgid, string, list(self.locations), set(self.flags),
                          self.auto_comments, self.user_comments, lineno=self.offset + 1,
                          context=msgctxt)
        if self.obsolete:
            if not self.ignore_obsolete:
                self.catalog.obsolete[self.catalog._key_for(msgid, msgctxt)] = message
        else:
            self.catalog[msgid] = message
        self.counter += 1
        self._reset_message_state()

    def _finish_current_message(self) -> None:
        """Store the message collected so far, if there is one.

        A message without any ``msgstr`` is reported as invalid and stored
        with an empty translation.
        """
        if self.messages:
            if not self.translations:
                self._invalid_pofile("", self.offset, f"missing msgstr for msgid '{self.messages[0].denormalize()}'")
                self.translations.append([0, _NormalizedString("")])
            self._add_message()

    def _process_message_line(self, lineno, line, obsolete=False) -> None:
        """Dispatch a non-comment line to the keyword or continuation handler."""
        if line.startswith('"'):
            self._process_string_continuation_line(line, lineno)
        else:
            self._process_keyword_line(lineno, line, obsolete)

    def _process_keyword_line(self, lineno, line, obsolete=False) -> None:
        """Handle a line that starts with one of the PO keywords.

        :param lineno: zero-based index of the line in the input
        :param line: the stripped line (without a ``#~`` marker)
        :param obsolete: whether the line belongs to an obsolete message
        """
        for keyword in self._keywords:
            if not line.startswith(keyword):
                continue
            rest = line[len(keyword):]
            if not rest:
                # The line consists of the bare keyword only.  No other
                # keyword can match it, so report once and give up on it.
                self._invalid_pofile(line, lineno, "Keyword must be followed by a string")
                return
            if rest[0] in (' ', '['):
                arg = rest
                break
        else:
            self._invalid_pofile(line, lineno, "Start of line didn't match any expected keyword.")
            return

        if keyword in ['msgid', 'msgctxt']:
            self._finish_current_message()

        self.obsolete = obsolete

        # The line that has the msgid is stored as the offset of the msg
        # should this be the msgctxt if it has one?
        if keyword == 'msgid':
            self.offset = lineno

        if keyword in ['msgid', 'msgid_plural']:
            self.in_msgctxt = False
            self.in_msgid = True
            self.messages.append(_NormalizedString(arg))

        elif keyword == 'msgstr':
            self.in_msgid = False
            if arg.startswith('['):
                index_text, bracket, msg = arg[1:].partition(']')
                try:
                    idx = int(index_text) if bracket else None
                except ValueError:
                    idx = None
                if idx is None:
                    # Malformed plural index: leave every string context so
                    # that continuation lines are not attached to an
                    # unrelated string, then report through the regular
                    # invalid-file channel.
                    self.in_msgstr = False
                    self.in_msgctxt = False
                    self._invalid_pofile(line, lineno, "Invalid plural index in msgstr")
                    return
                self.in_msgstr = True
                self.translations.append([idx, _NormalizedString(msg)])
            else:
                self.in_msgstr = True
                self.translations.append([0, _NormalizedString(arg)])

        elif keyword == 'msgctxt':
            self.in_msgctxt = True
            self.context = _NormalizedString(arg)

    def _process_string_continuation_line(self, line, lineno) -> None:
        """Append a quoted continuation line to the string being read."""
        if self.in_msgid:
            s = self.messages[-1]
        elif self.in_msgstr:
            s = self.translations[-1][1]
        elif self.in_msgctxt:
            s = self.context
        else:
            self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
            return
        s.append(line)

    def _process_comment(self, line) -> None:
        """Handle a ``#`` comment line.

        A comment always ends the message collected so far; its content is
        recorded for the message that follows.

        :raises ValueError: if a location comment has unbalanced isolate
                            characters (propagated from `_extract_locations`)
        """
        self._finish_current_message()

        if line[1:].startswith(':'):
            for location in _extract_locations(line[2:]):
                pos = location.rfind(':')
                if pos >= 0:
                    try:
                        lineno = int(location[pos + 1:])
                    except ValueError:
                        # Text after the last colon is not a line number;
                        # such a location is skipped.
                        continue
                    self.locations.append((location[:pos], lineno))
                else:
                    self.locations.append((location, None))
        elif line[1:].startswith(','):
            for flag in line[2:].lstrip().split(','):
                self.flags.append(flag.strip())
        elif line[1:].startswith('.'):
            # These are called auto-comments
            comment = line[2:].strip()
            if comment:  # Just check that we're not adding empty comments
                self.auto_comments.append(comment)
        else:
            # These are called user comments
            self.user_comments.append(line[1:].strip())

    def parse(self, fileobj: IO[AnyStr] | Iterable[AnyStr]) -> None:
        """
        Reads from the file-like object `fileobj` and adds any po file
        units found in it to the `Catalog` supplied to the constructor.
        """
        for lineno, line in enumerate(fileobj):
            line = line.strip()
            if not isinstance(line, str):
                line = line.decode(self.catalog.charset)
            if not line:
                continue
            if line.startswith('#'):
                if line[1:].startswith('~'):
                    # Obsolete message: parse what follows the ``#~`` marker.
                    self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
                else:
                    try:
                        self._process_comment(line)
                    except ValueError as exc:
                        self._invalid_pofile(line, lineno, str(exc))
            else:
                self._process_message_line(lineno, line)

        self._finish_current_message()

        # No actual messages found, but there was some info in comments, from which
        # we'll construct an empty header message
        if not self.counter and (self.flags or self.user_comments or self.auto_comments):
            self.messages.append(_NormalizedString('""'))
            self.translations.append([0, _NormalizedString('""')])
            self._add_message()

    def _invalid_pofile(self, line, lineno, msg) -> None:
        """Report invalid input.

        Raises `PoFileError` when ``abort_invalid`` is set, otherwise prints
        a warning mentioning the one-based line number.
        """
        assert isinstance(line, str)
        if self.abort_invalid:
            raise PoFileError(msg, self.catalog, line, lineno)
        print("WARNING:", msg)
        print(f"WARNING: Problem on line {lineno + 1}: {line!r}")
def read_po(
    fileobj: IO[AnyStr] | Iterable[AnyStr],
    locale: Locale | str | None = None,
    domain: str | None = None,
    ignore_obsolete: bool = False,
    charset: str | None = None,
    abort_invalid: bool = False,
) -> Catalog:
    """Parse a ``gettext`` PO (portable object) file into a new `Catalog`.

    The input may be a file-like object or any iterable of lines.

    >>> from datetime import datetime
    >>> from io import StringIO
    >>> buf = StringIO('''
    ... #: main.py:1
    ... #, fuzzy, python-format
    ... msgid "foo %(name)s"
    ... msgstr "quux %(name)s"
    ...
    ... # A user comment
    ... #. An auto comment
    ... #: main.py:3
    ... msgid "bar"
    ... msgid_plural "baz"
    ... msgstr[0] "bar"
    ... msgstr[1] "baaz"
    ... ''')
    >>> catalog = read_po(buf)
    >>> catalog.revision_date = datetime(2007, 4, 1)

    >>> for message in catalog:
    ...     if message.id:
    ...         print((message.id, message.string))
    ...         print(' ', (message.locations, sorted(list(message.flags))))
    ...         print(' ', (message.user_comments, message.auto_comments))
    ('foo %(name)s', 'quux %(name)s')
      ([('main.py', 1)], ['fuzzy', 'python-format'])
      ([], [])
    (('bar', 'baz'), ('bar', 'baaz'))
      ([('main.py', 3)], [])
      (['A user comment'], ['An auto comment'])

    .. versionadded:: 1.0
       Added support for explicit charset argument.

    :param fileobj: the file-like object (or iterable of lines) to read the PO file from
    :param locale: the locale identifier or `Locale` object, or `None`
                   if the catalog is not bound to a locale (which basically
                   means it's a template)
    :param domain: the message domain
    :param ignore_obsolete: whether to ignore obsolete messages in the input
    :param charset: the character set of the catalog.
    :param abort_invalid: abort read if po file is invalid
    :return: the catalog filled with the messages read from `fileobj`
    """
    result = Catalog(locale=locale, domain=domain, charset=charset)
    PoFileParser(result, ignore_obsolete, abort_invalid=abort_invalid).parse(fileobj)
    return result
# Splits a line into chunks at which `normalize` is allowed to wrap it.  The
# whole pattern is one capturing group, so ``WORD_SEP.split`` keeps the
# separators themselves in the resulting list.
_WORD_SEP_ALTERNATIVES = (
    r'\s+',  # any whitespace
    r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])',  # hyphenated words
    r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)',  # em-dash
)
WORD_SEP = re.compile('(' + '|'.join(_WORD_SEP_ALTERNATIVES) + ')')
def escape(string: str) -> str:
    r"""Quote `string` for use as a double-quoted string in a ``PO`` file.

    Backslashes, tabs, carriage returns, newlines and double quotes are
    replaced by their backslash sequences and the result is wrapped in
    double quotes.

    >>> escape('''Say:
    ...  "hello, world!"
    ... ''')
    '"Say:\\n \\"hello, world!\\"\\n"'

    :param string: the string to escape
    """
    # Each special character maps to exactly one replacement, so a single
    # translation pass gives the same result as replacing them in turn.
    replacements = str.maketrans({
        '\\': '\\\\',
        '\t': '\\t',
        '\r': '\\r',
        '\n': '\\n',
        '"': '\\"',
    })
    return f'"{string.translate(replacements)}"'
def normalize(string: str, prefix: str = '', width: int = 76) -> str:
    r"""Render `string` the way it is written to a ``.po`` file.

    A value that fits on one line is simply escaped.  Otherwise the result
    starts with an empty ``""`` line followed by one escaped line per
    (possibly wrapped) line of the input.

    >>> print(normalize('''Say:
    ...  "hello, world!"
    ... ''', width=None))
    ""
    "Say:\n"
    " \"hello, world!\"\n"

    >>> print(normalize('''Say:
    ...  "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
    ... ''', width=32))
    ""
    "Say:\n"
    " \"Lorem ipsum dolor sit "
    "amet, consectetur adipisicing"
    " elit, \"\n"

    :param string: the string to normalize
    :param prefix: a string that should be prepended to every line
    :param width: the maximum line width; use `None`, 0, or a negative number
                  to completely disable line wrapping
    """
    if not (width and width > 0):
        lines = string.splitlines(True)
    else:
        prefix_len = len(prefix)
        lines = []
        for raw_line in string.splitlines(True):
            if len(escape(raw_line)) + prefix_len <= width:
                lines.append(raw_line)
                continue

            # The line is too long: cut it at word separators.  The chunks
            # are reversed so the next one can be taken from the end.
            pending = WORD_SEP.split(raw_line)
            pending.reverse()
            while pending:
                pieces = []
                used = 2  # the two enclosing quote characters
                while pending:
                    cost = len(escape(pending[-1])) - 2 + prefix_len
                    if used + cost >= width:
                        if not pieces:
                            # handle long chunks by putting them on a
                            # separate line
                            pieces.append(pending.pop())
                        break
                    pieces.append(pending.pop())
                    used += cost
                lines.append(''.join(pieces))

    if len(lines) <= 1:
        return escape(string)

    # Remove empty trailing line
    if lines and not lines[-1]:
        del lines[-1]
        lines[-1] += '\n'

    escaped_lines = [prefix + escape(entry) for entry in lines]
    return '""\n' + '\n'.join(escaped_lines)
- def _enclose_filename_if_necessary(filename: str) -> str:
- """Enclose filenames which include white spaces or tabs.
- Do the same as gettext and enclose filenames which contain white
- spaces or tabs with First Strong Isolate (U+2068) and Pop
- Directional Isolate (U+2069).
- """
- if " " not in filename and "\t" not in filename:
- return filename
- if not filename.startswith("\u2068"):
- filename = "\u2068" + filename
- if not filename.endswith("\u2069"):
- filename += "\u2069"
- return filename
def write_po(
    fileobj: SupportsWrite[bytes],
    catalog: Catalog,
    width: int = 76,
    no_location: bool = False,
    omit_header: bool = False,
    sort_output: bool = False,
    sort_by_file: bool = False,
    ignore_obsolete: bool = False,
    include_previous: bool = False,
    include_lineno: bool = True,
) -> None:
    r"""Write a ``gettext`` PO (portable object) template file for a given
    message catalog to the provided file-like object.

    The text produced by `generate_po` is encoded with the catalog's
    charset (unencodable characters are written as backslash escapes) and
    written to `fileobj` piece by piece.

    >>> catalog = Catalog()
    >>> catalog.add(u'foo %(name)s', locations=[('main.py', 1)],
    ...             flags=('fuzzy',))
    <Message...>
    >>> catalog.add((u'bar', u'baz'), locations=[('main.py', 3)])
    <Message...>
    >>> from io import BytesIO
    >>> buf = BytesIO()
    >>> write_po(buf, catalog, omit_header=True)
    >>> print(buf.getvalue().decode("utf8"))
    #: main.py:1
    #, fuzzy, python-format
    msgid "foo %(name)s"
    msgstr ""
    <BLANKLINE>
    #: main.py:3
    msgid "bar"
    msgid_plural "baz"
    msgstr[0] ""
    msgstr[1] ""
    <BLANKLINE>
    <BLANKLINE>

    :param fileobj: the file-like object to write to
    :param catalog: the `Catalog` instance
    :param width: the maximum line width for the generated output; use `None`,
                  0, or a negative number to completely disable line wrapping
    :param no_location: do not emit a location comment for every message
    :param omit_header: do not include the ``msgid ""`` entry at the top of the
                        output
    :param sort_output: whether to sort the messages in the output by msgid
    :param sort_by_file: whether to sort the messages in the output by their
                         locations
    :param ignore_obsolete: whether to ignore obsolete messages and not include
                            them in the output; by default they are included as
                            comments
    :param include_previous: include the old msgid as a comment when
                             updating the catalog
    :param include_lineno: include line number in the location comment
    """
    # Sorting by message takes precedence over sorting by location.
    if sort_output:
        sort_by = "message"
    elif sort_by_file:
        sort_by = "location"
    else:
        sort_by = None

    chunks = generate_po(
        catalog,
        ignore_obsolete=ignore_obsolete,
        include_lineno=include_lineno,
        include_previous=include_previous,
        no_location=no_location,
        omit_header=omit_header,
        sort_by=sort_by,
        width=width,
    )
    for chunk in chunks:
        if isinstance(chunk, str):
            chunk = chunk.encode(catalog.charset, 'backslashreplace')
        fileobj.write(chunk)
def generate_po(
    catalog: Catalog,
    *,
    ignore_obsolete: bool = False,
    include_lineno: bool = True,
    include_previous: bool = False,
    no_location: bool = False,
    omit_header: bool = False,
    sort_by: Literal["message", "location"] | None = None,
    width: int = 76,
) -> Iterable[str]:
    r"""Yield text strings representing a ``gettext`` PO (portable object) file.

    The pieces are yielded in file order: for every message its comments,
    locations, flags and previous ids, then the message itself and an empty
    line; obsolete messages follow unless `ignore_obsolete` is set.

    See `write_po()` for a more detailed description.
    """
    wrapping = bool(width and width > 0)

    # xgettext always wraps comments even if --no-wrap is passed;
    # provide the same behaviour
    comment_wrapper = TextWrapper(width=width if wrapping else 76, break_long_words=False)
    header_wrapper = TextWrapper(width=width, subsequent_indent="# ", break_long_words=False)

    def _format_comment(comment, prefix=''):
        for wrapped_line in comment_wrapper.wrap(comment):
            yield f"#{prefix} {wrapped_line.strip()}\n"

    def _format_message(message, prefix=''):
        if message.context:
            yield f"{prefix}msgctxt {normalize(message.context, prefix=prefix, width=width)}\n"

        if not isinstance(message.id, (list, tuple)):
            yield f"{prefix}msgid {normalize(message.id, prefix=prefix, width=width)}\n"
            yield f"{prefix}msgstr {normalize(message.string or '', prefix=prefix, width=width)}\n"
            return

        yield f"{prefix}msgid {normalize(message.id[0], prefix=prefix, width=width)}\n"
        yield f"{prefix}msgid_plural {normalize(message.id[1], prefix=prefix, width=width)}\n"
        for idx in range(catalog.num_plurals):
            try:
                string = message.string[idx]
            except IndexError:
                # Fewer translations than plural forms: emit an empty one.
                string = ''
            yield f"{prefix}msgstr[{idx:d}] {normalize(string, prefix=prefix, width=width)}\n"

    def _location_sort_key(entry):
        # sort locations by filename and lineno.
        # if there's no <int> as lineno, use `-1`.
        lineno = entry[1]
        return (entry[0], lineno if isinstance(lineno, int) and lineno else -1)

    for message in _sort_messages(catalog, sort_by=sort_by):
        if not message.id:  # This is the header "message"
            if omit_header:
                continue
            comment_header = catalog.header_comment
            if wrapping:
                wrapped_header = []
                for header_line in comment_header.splitlines():
                    wrapped_header.extend(header_wrapper.wrap(header_line))
                comment_header = '\n'.join(wrapped_header)
            yield f"{comment_header}\n"

        for comment in message.user_comments:
            yield from _format_comment(comment)
        for comment in message.auto_comments:
            yield from _format_comment(comment, prefix='.')

        if not no_location:
            # if no sorting possible, leave unsorted.
            # (see issue #606)
            try:
                locations = sorted(message.locations, key=_location_sort_key)
            except TypeError:  # e.g. "TypeError: unorderable types: NoneType() < int()"
                locations = message.locations

            rendered = []
            for filename, lineno in locations:
                location = _enclose_filename_if_necessary(filename.replace(os.sep, '/'))
                if lineno and include_lineno:
                    location = f"{location}:{lineno:d}"
                rendered.append(location)
            # Drop duplicates while keeping the first occurrence of each.
            unique_locations = dict.fromkeys(rendered)
            yield from _format_comment(' '.join(unique_locations), prefix=':')

        if message.flags:
            flag_text = ''.join(f", {flag}" for flag in sorted(message.flags))
            yield f"#{flag_text}\n"

        if message.previous_id and include_previous:
            yield from _format_comment(
                f'msgid {normalize(message.previous_id[0], width=width)}',
                prefix='|',
            )
            if len(message.previous_id) > 1:
                previous_plural = normalize(message.previous_id[1], width=width)
                yield from _format_comment(f'msgid_plural {previous_plural}', prefix='|')

        yield from _format_message(message)
        yield '\n'

    if ignore_obsolete:
        return

    for message in _sort_messages(catalog.obsolete.values(), sort_by=sort_by):
        for comment in message.user_comments:
            yield from _format_comment(comment)
        yield from _format_message(message, prefix='#~ ')
        yield '\n'
- def _sort_messages(messages: Iterable[Message], sort_by: Literal["message", "location"] | None) -> list[Message]:
- """
- Sort the given message iterable by the given criteria.
- Always returns a list.
- :param messages: An iterable of Messages.
- :param sort_by: Sort by which criteria? Options are `message` and `location`.
- :return: list[Message]
- """
- messages = list(messages)
- if sort_by == "message":
- messages.sort()
- elif sort_by == "location":
- messages.sort(key=lambda m: m.locations)
- return messages
|