8#include <sparrow/record_batch.hpp>
40 const sparrow::record_batch& record_batch,
41 const std::vector<record_batch_block>& record_batch_blocks,
103 template <writable_stream TStream>
125 void write(
const sparrow::record_batch& rb);
146 template <std::ranges::input_range R>
147 requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
151 if (std::ranges::empty(record_batches))
158 throw std::runtime_error(
"Cannot write to a file serializer that has been ended");
171 const auto reserve_function = [&record_batches, &compressed_buffers_cache,
this]()
173 return std::accumulate(
174 record_batches.begin(),
175 record_batches.end(),
177 [&compressed_buffers_cache,
this](
size_t acc,
const sparrow::record_batch& rb)
179 return acc + calculate_record_batch_message_size(rb, m_compression, compressed_buffers_cache);
195 for (
const auto& rb : record_batches)
199 throw std::invalid_argument(
"Record batch schema does not match file serializer schema");
203 const int64_t offset =
static_cast<int64_t
>(
m_stream.size());
252 template <std::ranges::input_range R>
253 requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
256 write(record_batches);
Type-erased wrapper for any stream-like object.
A class for serializing Apache Arrow record batches to an output stream.
void end()
Finalizes the serialization process by writing end-of-stream marker.
A class for serializing Apache Arrow record batches to the IPC file format.
std::vector< record_batch_block > m_record_batch_blocks
stream_file_serializer & operator<<(stream_file_serializer &(*manip)(stream_file_serializer &))
std::vector< sparrow::data_type > m_dtypes
std::optional< CompressionType > m_compression
void write(const R &record_batches)
Writes a collection of record batches to the file.
std::optional< sparrow::record_batch > m_first_record_batch
~stream_file_serializer()
Destructor for the stream_file_serializer.
stream_file_serializer & operator<<(const sparrow::record_batch &rb)
void write(const sparrow::record_batch &rb)
Writes a single record batch to the file.
stream_file_serializer(TStream &stream, std::optional< CompressionType > compression=std::nullopt)
Constructs a stream_file_serializer object with a reference to a stream.
void end()
Finalizes the file serialization by writing footer and trailing magic bytes.
stream_file_serializer & operator<<(const R &record_batches)
any_output_stream m_stream
SPARROW_IPC_API serialized_record_batch_info serialize_record_batch(const sparrow::record_batch &record_batch, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
Serializes a record batch into a binary format following the Arrow IPC specification.
SPARROW_IPC_API void serialize_schema_message(const sparrow::record_batch &record_batch, any_output_stream &stream)
Serializes a schema message for a record batch into a byte buffer.
SPARROW_IPC_API std::vector< sparrow::record_batch > deserialize_file(std::span< const uint8_t > data)
Deserializes Arrow IPC file format into a vector of record batches.
constexpr std::array< std::uint8_t, 8 > arrow_file_header_magic
Magic bytes with padding for file header (8 bytes total for alignment)
SPARROW_IPC_API size_t write_footer(const sparrow::record_batch &record_batch, const std::vector< record_batch_block > &record_batch_blocks, any_output_stream &stream)
Writes the Arrow IPC file footer.
stream_file_serializer & end_file(stream_file_serializer &serializer)
SPARROW_IPC_API std::size_t calculate_schema_message_size(const sparrow::record_batch &record_batch)
Calculates the total serialized size of a schema message.
SPARROW_IPC_API std::vector< sparrow::data_type > get_column_dtypes(const sparrow::record_batch &rb)
Represents a block entry in the Arrow IPC file footer.
int64_t body_length
Length of the record batch body (data buffers)
int32_t metadata_length
Length of the metadata (FlatBuffer message)
int64_t offset
Offset from the start of the file to the record batch message.