12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- #
- # Licensed to the Apache Software Foundation (ASF) under one
- # or more contributor license agreements. See the NOTICE file
- # distributed with this work for additional information
- # regarding copyright ownership. The ASF licenses this file
- # to you under the Apache License, Version 2.0 (the
- # "License"); you may not use this file except in compliance
- # with the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing,
- # software distributed under the License is distributed on an
- # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- # KIND, either express or implied. See the License for the
- # specific language governing permissions and limitations
- # under the License.
- from __future__ import annotations
- from typing import TYPE_CHECKING
- from airflow.utils.module_loading import qualname
# Classes handled by this module, listed by fully qualified name. They are
# given as strings so that pandas is loaded lazily, for performance reasons.
serializers = ["pandas.core.frame.DataFrame"]
# Deliberately the very same list object as ``serializers``.
deserializers = serializers

if TYPE_CHECKING:
    import pandas as pd

    from airflow.serialization.serde import U

# Version stamped on payloads by serialize(); deserialize() rejects anything newer.
__version__ = 1
def serialize(o: object) -> tuple[U, str, int, bool]:
    """Encode a pandas DataFrame as a hex string of snappy-compressed parquet.

    :param o: the object to serialize
    :return: a ``(data, classname, version, is_serialized)`` tuple; when ``o``
        is not a ``pandas.DataFrame`` the result is ``("", "", 0, False)``
    """
    # third-party imports are kept local so that importing this module stays cheap
    import pandas as pd
    import pyarrow as pa
    from pyarrow import parquet as pq

    if isinstance(o, pd.DataFrame):
        # for now the parquet output *always* goes to an in-memory sink,
        # until there is a generic backend that manages sinks
        sink = pa.BufferOutputStream()
        pq.write_table(pa.Table.from_pandas(o), sink, compression="snappy")
        payload = sink.getvalue().hex().decode("utf-8")
        return payload, qualname(o), __version__, True

    return "", "", 0, False
def deserialize(classname: str, version: int, data: object) -> pd.DataFrame:
    """Rebuild a pandas DataFrame from hex-encoded parquet data.

    :param classname: qualified name of the serialized class; only used in
        error messages here
    :param version: version recorded alongside the payload
    :param data: hex string holding the parquet bytes
    :return: the DataFrame read back from the parquet data
    :raises TypeError: if ``version`` is greater than this module's
        ``__version__``, or if ``data`` is not a ``str``
    """
    from io import BytesIO

    if version > __version__:
        raise TypeError(f"serialized {version} of {classname} > {__version__}")

    # pyarrow is loaded lazily, and only after the version has been accepted
    from pyarrow import parquet as pq

    if not isinstance(data, str):
        raise TypeError(f"serialized {classname} has wrong data type {type(data)}")

    raw = bytes.fromhex(data)
    with BytesIO(raw) as stream:
        return pq.read_table(stream).to_pandas()
|