imap.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391
  1. #
  2. # Licensed to the Apache Software Foundation (ASF) under one
  3. # or more contributor license agreements. See the NOTICE file
  4. # distributed with this work for additional information
  5. # regarding copyright ownership. The ASF licenses this file
  6. # to you under the Apache License, Version 2.0 (the
  7. # "License"); you may not use this file except in compliance
  8. # with the License. You may obtain a copy of the License at
  9. #
  10. # http://www.apache.org/licenses/LICENSE-2.0
  11. #
  12. # Unless required by applicable law or agreed to in writing,
  13. # software distributed under the License is distributed on an
  14. # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  15. # KIND, either express or implied. See the License for the
  16. # specific language governing permissions and limitations
  17. # under the License.
  18. """
  19. This module provides everything to search mail for a specific attachment and download it.
  20. It uses the imaplib library that is already integrated in python 3.
  21. """
  22. from __future__ import annotations
  23. import email
  24. import imaplib
  25. import os
  26. import re
  27. import ssl
  28. from collections.abc import Iterable
  29. from typing import TYPE_CHECKING, Any
  30. from airflow.exceptions import AirflowException
  31. from airflow.hooks.base import BaseHook
  32. from airflow.utils.log.logging_mixin import LoggingMixin
  33. if TYPE_CHECKING:
  34. from airflow.models.connection import Connection
  35. class ImapHook(BaseHook):
  36. """
  37. This hook connects to a mail server by using the imap protocol.
  38. .. note:: Please call this Hook as context manager via `with`
  39. to automatically open and close the connection to the mail server.
  40. :param imap_conn_id: The :ref:`imap connection id <howto/connection:imap>`
  41. that contains the information used to authenticate the client.
  42. """
  43. conn_name_attr = "imap_conn_id"
  44. default_conn_name = "imap_default"
  45. conn_type = "imap"
  46. hook_name = "IMAP"
  47. def __init__(self, imap_conn_id: str = default_conn_name) -> None:
  48. super().__init__()
  49. self.imap_conn_id = imap_conn_id
  50. self.mail_client: imaplib.IMAP4_SSL | imaplib.IMAP4 | None = None
  51. def __enter__(self) -> ImapHook:
  52. return self.get_conn()
  53. def __exit__(self, exc_type, exc_val, exc_tb):
  54. self.mail_client.logout()
  55. def get_conn(self) -> ImapHook:
  56. """
  57. Login to the mail server.
  58. .. note:: Please call this Hook as context manager via `with`
  59. to automatically open and close the connection to the mail server.
  60. :return: an authorized ImapHook object.
  61. """
  62. if not self.mail_client:
  63. conn = self.get_connection(self.imap_conn_id)
  64. self.mail_client = self._build_client(conn)
  65. self.mail_client.login(conn.login, conn.password)
  66. return self
  67. def _build_client(self, conn: Connection) -> imaplib.IMAP4_SSL | imaplib.IMAP4:
  68. mail_client: imaplib.IMAP4_SSL | imaplib.IMAP4
  69. use_ssl = conn.extra_dejson.get("use_ssl", True)
  70. if use_ssl:
  71. from airflow.configuration import conf
  72. extra_ssl_context = conn.extra_dejson.get("ssl_context", None)
  73. if extra_ssl_context:
  74. ssl_context_string = extra_ssl_context
  75. else:
  76. ssl_context_string = conf.get("imap", "SSL_CONTEXT", fallback=None)
  77. if ssl_context_string is None:
  78. ssl_context_string = conf.get("email", "SSL_CONTEXT", fallback=None)
  79. if ssl_context_string is None:
  80. ssl_context_string = "default"
  81. if ssl_context_string == "default":
  82. ssl_context = ssl.create_default_context()
  83. elif ssl_context_string == "none":
  84. ssl_context = None
  85. else:
  86. raise RuntimeError(
  87. f"The email.ssl_context configuration variable must "
  88. f"be set to 'default' or 'none' and is '{ssl_context_string}'."
  89. )
  90. if conn.port:
  91. mail_client = imaplib.IMAP4_SSL(conn.host, conn.port, ssl_context=ssl_context)
  92. else:
  93. mail_client = imaplib.IMAP4_SSL(conn.host, ssl_context=ssl_context)
  94. else:
  95. if conn.port:
  96. mail_client = imaplib.IMAP4(conn.host, conn.port)
  97. else:
  98. mail_client = imaplib.IMAP4(conn.host)
  99. return mail_client
  100. def has_mail_attachment(
  101. self, name: str, *, check_regex: bool = False, mail_folder: str = "INBOX", mail_filter: str = "All"
  102. ) -> bool:
  103. """
  104. Check the mail folder for mails containing attachments with the given name.
  105. :param name: The name of the attachment that will be searched for.
  106. :param check_regex: Checks the name for a regular expression.
  107. :param mail_folder: The mail folder where to look at.
  108. :param mail_filter: If set other than 'All' only specific mails will be checked.
  109. See :py:meth:`imaplib.IMAP4.search` for details.
  110. :returns: True if there is an attachment with the given name and False if not.
  111. """
  112. mail_attachments = self._retrieve_mails_attachments_by_name(
  113. name, check_regex, True, mail_folder, mail_filter
  114. )
  115. return bool(mail_attachments)
  116. def retrieve_mail_attachments(
  117. self,
  118. name: str,
  119. *,
  120. check_regex: bool = False,
  121. latest_only: bool = False,
  122. mail_folder: str = "INBOX",
  123. mail_filter: str = "All",
  124. not_found_mode: str = "raise",
  125. ) -> list[tuple]:
  126. """
  127. Retrieve mail's attachments in the mail folder by its name.
  128. :param name: The name of the attachment that will be downloaded.
  129. :param check_regex: Checks the name for a regular expression.
  130. :param latest_only: If set to True it will only retrieve the first matched attachment.
  131. :param mail_folder: The mail folder where to look at.
  132. :param mail_filter: If set other than 'All' only specific mails will be checked.
  133. See :py:meth:`imaplib.IMAP4.search` for details.
  134. :param not_found_mode: Specify what should happen if no attachment has been found.
  135. Supported values are 'raise', 'warn' and 'ignore'.
  136. If it is set to 'raise' it will raise an exception,
  137. if set to 'warn' it will only print a warning and
  138. if set to 'ignore' it won't notify you at all.
  139. :returns: a list of tuple each containing the attachment filename and its payload.
  140. """
  141. mail_attachments = self._retrieve_mails_attachments_by_name(
  142. name, check_regex, latest_only, mail_folder, mail_filter
  143. )
  144. if not mail_attachments:
  145. self._handle_not_found_mode(not_found_mode)
  146. return mail_attachments
  147. def download_mail_attachments(
  148. self,
  149. name: str,
  150. local_output_directory: str,
  151. *,
  152. check_regex: bool = False,
  153. latest_only: bool = False,
  154. mail_folder: str = "INBOX",
  155. mail_filter: str = "All",
  156. not_found_mode: str = "raise",
  157. ) -> None:
  158. """
  159. Download mail's attachments in the mail folder by its name to the local directory.
  160. :param name: The name of the attachment that will be downloaded.
  161. :param local_output_directory: The output directory on the local machine
  162. where the files will be downloaded to.
  163. :param check_regex: Checks the name for a regular expression.
  164. :param latest_only: If set to True it will only download the first matched attachment.
  165. :param mail_folder: The mail folder where to look at.
  166. :param mail_filter: If set other than 'All' only specific mails will be checked.
  167. See :py:meth:`imaplib.IMAP4.search` for details.
  168. :param not_found_mode: Specify what should happen if no attachment has been found.
  169. Supported values are 'raise', 'warn' and 'ignore'.
  170. If it is set to 'raise' it will raise an exception,
  171. if set to 'warn' it will only print a warning and
  172. if set to 'ignore' it won't notify you at all.
  173. """
  174. mail_attachments = self._retrieve_mails_attachments_by_name(
  175. name, check_regex, latest_only, mail_folder, mail_filter
  176. )
  177. if not mail_attachments:
  178. self._handle_not_found_mode(not_found_mode)
  179. self._create_files(mail_attachments, local_output_directory)
  180. def _handle_not_found_mode(self, not_found_mode: str) -> None:
  181. if not_found_mode not in ("raise", "warn", "ignore"):
  182. self.log.error('Invalid "not_found_mode" %s', not_found_mode)
  183. elif not_found_mode == "raise":
  184. raise AirflowException("No mail attachments found!")
  185. elif not_found_mode == "warn":
  186. self.log.warning("No mail attachments found!")
  187. def _retrieve_mails_attachments_by_name(
  188. self, name: str, check_regex: bool, latest_only: bool, mail_folder: str, mail_filter: str
  189. ) -> list:
  190. if not self.mail_client:
  191. raise RuntimeError("The 'mail_client' should be initialized before!")
  192. all_matching_attachments = []
  193. self.mail_client.select(mail_folder)
  194. for mail_id in self._list_mail_ids_desc(mail_filter):
  195. response_mail_body = self._fetch_mail_body(mail_id)
  196. matching_attachments = self._check_mail_body(response_mail_body, name, check_regex, latest_only)
  197. if matching_attachments:
  198. all_matching_attachments.extend(matching_attachments)
  199. if latest_only:
  200. break
  201. self.mail_client.close()
  202. return all_matching_attachments
  203. def _list_mail_ids_desc(self, mail_filter: str) -> Iterable[str]:
  204. if not self.mail_client:
  205. raise RuntimeError("The 'mail_client' should be initialized before!")
  206. _, data = self.mail_client.search(None, mail_filter)
  207. mail_ids = data[0].split()
  208. return reversed(mail_ids)
  209. def _fetch_mail_body(self, mail_id: str) -> str:
  210. if not self.mail_client:
  211. raise RuntimeError("The 'mail_client' should be initialized before!")
  212. _, data = self.mail_client.fetch(mail_id, "(RFC822)")
  213. mail_body = data[0][1] # type: ignore # The mail body is always in this specific location
  214. mail_body_str = mail_body.decode("utf-8") # type: ignore
  215. return mail_body_str
  216. def _check_mail_body(
  217. self, response_mail_body: str, name: str, check_regex: bool, latest_only: bool
  218. ) -> list[tuple[Any, Any]]:
  219. mail = Mail(response_mail_body)
  220. if mail.has_attachments():
  221. return mail.get_attachments_by_name(name, check_regex, find_first=latest_only)
  222. return []
  223. def _create_files(self, mail_attachments: list, local_output_directory: str) -> None:
  224. for name, payload in mail_attachments:
  225. if self._is_symlink(name):
  226. self.log.error("Can not create file because it is a symlink!")
  227. elif self._is_escaping_current_directory(name):
  228. self.log.error("Can not create file because it is escaping the current directory!")
  229. else:
  230. self._create_file(name, payload, local_output_directory)
  231. def _is_symlink(self, name: str) -> bool:
  232. # IMPORTANT NOTE: os.path.islink is not working for windows symlinks
  233. # See: https://stackoverflow.com/a/11068434
  234. return os.path.islink(name)
  235. def _is_escaping_current_directory(self, name: str) -> bool:
  236. return "../" in name
  237. def _correct_path(self, name: str, local_output_directory: str) -> str:
  238. return (
  239. local_output_directory + name
  240. if local_output_directory.endswith("/")
  241. else local_output_directory + "/" + name
  242. )
  243. def _create_file(self, name: str, payload: Any, local_output_directory: str) -> None:
  244. file_path = self._correct_path(name, local_output_directory)
  245. with open(file_path, "wb") as file:
  246. file.write(payload)
  247. class Mail(LoggingMixin):
  248. """
  249. This class simplifies working with mails returned by the imaplib client.
  250. :param mail_body: The mail body of a mail received from imaplib client.
  251. """
  252. def __init__(self, mail_body: str) -> None:
  253. super().__init__()
  254. self.mail = email.message_from_string(mail_body)
  255. def has_attachments(self) -> bool:
  256. """
  257. Check the mail for a attachments.
  258. :returns: True if it has attachments and False if not.
  259. """
  260. return self.mail.get_content_maintype() == "multipart"
  261. def get_attachments_by_name(
  262. self, name: str, check_regex: bool, find_first: bool = False
  263. ) -> list[tuple[Any, Any]]:
  264. """
  265. Get all attachments by name for the mail.
  266. :param name: The name of the attachment to look for.
  267. :param check_regex: Checks the name for a regular expression.
  268. :param find_first: If set to True it will only find the first match and then quit.
  269. :returns: a list of tuples each containing name and payload
  270. where the attachments name matches the given name.
  271. """
  272. attachments = []
  273. for attachment in self._iterate_attachments():
  274. found_attachment = (
  275. attachment.has_matching_name(name) if check_regex else attachment.has_equal_name(name)
  276. )
  277. if found_attachment:
  278. file_name, file_payload = attachment.get_file()
  279. self.log.info("Found attachment: %s", file_name)
  280. attachments.append((file_name, file_payload))
  281. if find_first:
  282. break
  283. return attachments
  284. def _iterate_attachments(self) -> Iterable[MailPart]:
  285. for part in self.mail.walk():
  286. mail_part = MailPart(part)
  287. if mail_part.is_attachment():
  288. yield mail_part
  289. class MailPart:
  290. """
  291. This class is a wrapper for a Mail object's part and gives it more features.
  292. :param part: The mail part in a Mail object.
  293. """
  294. def __init__(self, part: Any) -> None:
  295. self.part = part
  296. def is_attachment(self) -> bool:
  297. """
  298. Check if the part is a valid mail attachment.
  299. :returns: True if it is an attachment and False if not.
  300. """
  301. return self.part.get_content_maintype() != "multipart" and self.part.get("Content-Disposition")
  302. def has_matching_name(self, name: str) -> tuple[Any, Any] | None:
  303. """
  304. Check if the given name matches the part's name.
  305. :param name: The name to look for.
  306. :returns: True if it matches the name (including regular expression).
  307. """
  308. return re.match(name, self.part.get_filename()) # type: ignore
  309. def has_equal_name(self, name: str) -> bool:
  310. """
  311. Check if the given name is equal to the part's name.
  312. :param name: The name to look for.
  313. :returns: True if it is equal to the given name.
  314. """
  315. return self.part.get_filename() == name
  316. def get_file(self) -> tuple:
  317. """
  318. Get the file including name and payload.
  319. :returns: the part's name and payload.
  320. """
  321. return self.part.get_filename(), self.part.get_payload(decode=True)