sparrow-ipc 0.2.0
Loading...
Searching...
No Matches
sparrow_ipc Namespace Reference

Namespaces

namespace  details
 
namespace  utils
 

Classes

class  any_output_stream
 Type-erased wrapper for any stream-like object. More...
 
class  arrow_array_private_data
 
class  chunk_serializer
 A serializer that writes record batches to chunked memory streams. More...
 
class  chunked_memory_output_stream
 An output stream that writes data into separate memory chunks. More...
 
class  CompressionCache
 
class  deserializer
 
class  encapsulated_message
 
class  memory_output_stream
 An output stream that writes data to a contiguous memory buffer. More...
 
class  non_owning_arrow_schema_private_data
 
struct  record_batch_block
 Represents a block entry in the Arrow IPC file footer. More...
 
struct  serialized_record_batch_info
 Information about a serialized record batch block. More...
 
class  serializer
 A class for serializing Apache Arrow record batches to an output stream. More...
 
class  stream_file_serializer
 A class for serializing Apache Arrow record batches to the IPC file format. More...
 

Concepts

concept  writable_stream
 Concept for stream-like types that support write operations.
 
concept  ArrowPrivateData
 

Enumerations

enum class  CompressionType : std::uint8_t { LZ4_FRAME , ZSTD }
 

Functions

SPARROW_IPC_API void release_arrow_array_children_and_dictionary (ArrowArray *array)
 
template<ArrowPrivateData T>
void arrow_array_release (ArrowArray *array)
 
template<ArrowPrivateData T, typename Arg>
void fill_arrow_array (ArrowArray &array, int64_t length, int64_t null_count, int64_t offset, size_t children_count, ArrowArray **children, ArrowArray *dictionary, Arg &&private_data_arg)
 
template<ArrowPrivateData T, typename Arg>
ArrowArray make_arrow_array (int64_t length, int64_t null_count, int64_t offset, size_t children_count, ArrowArray **children, ArrowArray *dictionary, Arg &&private_data_arg)
 
template<class T>
requires std::same_as<T, ArrowArray> || std::same_as<T, ArrowSchema>
void release_common_non_owning_arrow (T &t)
 Release the children and dictionary of an ArrowArray or ArrowSchema.
 
SPARROW_IPC_API void release_non_owning_arrow_schema (ArrowSchema *schema)
 
template<sparrow::input_metadata_container M = std::vector<sparrow::metadata_pair>>
void fill_non_owning_arrow_schema (ArrowSchema &schema, std::string_view format, const char *name, std::optional< M > metadata, std::optional< std::unordered_set< sparrow::ArrowFlag > > flags, size_t children_count, ArrowSchema **children, ArrowSchema *dictionary)
 
template<sparrow::input_metadata_container M = std::vector<sparrow::metadata_pair>>
ArrowSchema make_non_owning_arrow_schema (std::string_view format, const char *name, std::optional< M > metadata, std::optional< std::unordered_set< sparrow::ArrowFlag > > flags, size_t children_count, ArrowSchema **children, ArrowSchema *dictionary)
 
SPARROW_IPC_API std::span< const std::uint8_t > compress (const CompressionType compression_type, const std::span< const std::uint8_t > &data, CompressionCache &cache)
 
SPARROW_IPC_API size_t get_compressed_size (const CompressionType compression_type, const std::span< const std::uint8_t > &data, CompressionCache &cache)
 
SPARROW_IPC_API std::variant< std::vector< std::uint8_t >, std::span< const std::uint8_t > > decompress (const CompressionType compression_type, std::span< const std::uint8_t > data)
 
SPARROW_IPC_API std::vector< sparrow::record_batch > deserialize_stream (std::span< const uint8_t > data)
 Deserializes an Arrow IPC stream from binary data into a vector of record batches.
 
sparrow::fixed_width_binary_array deserialize_non_owning_fixedwidthbinary (const org::apache::arrow::flatbuf::RecordBatch &record_batch, std::span< const uint8_t > body, std::string_view name, const std::optional< std::vector< sparrow::metadata_pair > > &metadata, bool nullable, size_t &buffer_index, int32_t byte_width)
 
template<typename T>
sparrow::primitive_array< T > deserialize_non_owning_primitive_array (const org::apache::arrow::flatbuf::RecordBatch &record_batch, std::span< const uint8_t > body, std::string_view name, const std::optional< std::vector< sparrow::metadata_pair > > &metadata, bool nullable, size_t &buffer_index)
 
template<typename T>
T deserialize_non_owning_variable_size_binary (const org::apache::arrow::flatbuf::RecordBatch &record_batch, std::span< const uint8_t > body, std::string_view name, const std::optional< std::vector< sparrow::metadata_pair > > &metadata, bool nullable, size_t &buffer_index)
 
std::pair< encapsulated_message, std::span< const uint8_t > > extract_encapsulated_message (std::span< const uint8_t > buf_ptr)
 
std::pair< org::apache::arrow::flatbuf::Type, flatbuffers::Offset< void > > get_flatbuffer_decimal_type (flatbuffers::FlatBufferBuilder &builder, std::string_view format_str, const int32_t bitWidth)
 
std::pair< org::apache::arrow::flatbuf::Type, flatbuffers::Offset< void > > get_flatbuffer_type (flatbuffers::FlatBufferBuilder &builder, std::string_view format_str)
 
flatbuffers::Offset< flatbuffers::Vector< flatbuffers::Offset< org::apache::arrow::flatbuf::KeyValue > > > create_metadata (flatbuffers::FlatBufferBuilder &builder, const ArrowSchema &arrow_schema)
 Creates a FlatBuffers vector of KeyValue pairs from ArrowSchema metadata.
 
::flatbuffers::Offset< org::apache::arrow::flatbuf::Field > create_field (flatbuffers::FlatBufferBuilder &builder, const ArrowSchema &arrow_schema, std::optional< std::string_view > name_override=std::nullopt)
 Creates a FlatBuffer Field object from an ArrowSchema.
 
::flatbuffers::Offset< ::flatbuffers::Vector<::flatbuffers::Offset< org::apache::arrow::flatbuf::Field > > > create_children (flatbuffers::FlatBufferBuilder &builder, const sparrow::record_batch &record_batch)
 Creates a FlatBuffers vector of Field objects from a record batch.
 
::flatbuffers::Offset< ::flatbuffers::Vector<::flatbuffers::Offset< org::apache::arrow::flatbuf::Field > > > create_children (flatbuffers::FlatBufferBuilder &builder, const ArrowSchema &arrow_schema)
 Creates a FlatBuffers vector of Field objects from an ArrowSchema's children.
 
flatbuffers::FlatBufferBuilder get_schema_message_builder (const sparrow::record_batch &record_batch)
 Creates a FlatBuffer builder containing a serialized Arrow schema message.
 
void fill_fieldnodes (const sparrow::arrow_proxy &arrow_proxy, std::vector< org::apache::arrow::flatbuf::FieldNode > &nodes)
 Recursively fills a vector of FieldNode objects from an arrow_proxy and its children.
 
std::vector< org::apache::arrow::flatbuf::FieldNode > create_fieldnodes (const sparrow::record_batch &record_batch)
 Creates a vector of Apache Arrow FieldNode objects from a record batch.
 
void fill_buffers (const sparrow::arrow_proxy &arrow_proxy, std::vector< org::apache::arrow::flatbuf::Buffer > &flatbuf_buffers, int64_t &offset)
 Recursively fills a vector of FlatBuffer Buffer objects with buffer information from an Arrow proxy.
 
std::vector< org::apache::arrow::flatbuf::Buffer > get_buffers (const sparrow::record_batch &record_batch)
 Extracts buffer information from a record batch for serialization.
 
void fill_compressed_buffers (const sparrow::arrow_proxy &arrow_proxy, std::vector< org::apache::arrow::flatbuf::Buffer > &flatbuf_compressed_buffers, int64_t &offset, const CompressionType compression_type, CompressionCache &cache)
 Recursively populates a vector with compressed buffer metadata from an Arrow proxy.
 
std::vector< org::apache::arrow::flatbuf::Buffer > get_compressed_buffers (const sparrow::record_batch &record_batch, const CompressionType compression_type, CompressionCache &cache)
 Retrieves metadata describing the layout of compressed buffers within a record batch.
 
int64_t calculate_body_size (const sparrow::arrow_proxy &arrow_proxy, std::optional< CompressionType > compression=std::nullopt, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
 Calculates the total aligned size in bytes of all buffers in an Arrow array structure.
 
int64_t calculate_body_size (const sparrow::record_batch &record_batch, std::optional< CompressionType > compression=std::nullopt, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
 Calculates the total body size of a record batch by summing the body sizes of all its columns.
 
flatbuffers::FlatBufferBuilder get_record_batch_message_builder (const sparrow::record_batch &record_batch, std::optional< CompressionType > compression=std::nullopt, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
 Creates a FlatBuffer message containing a serialized Apache Arrow RecordBatch.
 
SPARROW_IPC_API const org::apache::arrow::flatbuf::Footer * get_footer_from_file_data (std::span< const uint8_t > file_data)
 
template<std::ranges::input_range R>
bool is_continuation (const R &buf)
 
template<std::ranges::input_range R>
bool is_end_of_stream (const R &buf)
 
template<std::ranges::input_range R>
bool is_arrow_file_magic (const R &buf)
 
std::vector< sparrow::metadata_pair > to_sparrow_metadata (const ::flatbuffers::Vector<::flatbuffers::Offset< org::apache::arrow::flatbuf::KeyValue > > &metadata)
 Converts FlatBuffers metadata to Sparrow metadata format.
 
template<std::ranges::input_range R>
requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
void serialize_record_batches_to_ipc_stream (const R &record_batches, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
 Serializes a collection of record batches into a binary format.
 
SPARROW_IPC_API serialized_record_batch_info serialize_record_batch (const sparrow::record_batch &record_batch, any_output_stream &stream, std::optional< CompressionType > compression, std::optional< std::reference_wrapper< CompressionCache > > cache)
 Serializes a record batch into a binary format following the Arrow IPC specification.
 
SPARROW_IPC_API void serialize_schema_message (const sparrow::record_batch &record_batch, any_output_stream &stream)
 Serializes a schema message for a record batch into a byte buffer.
 
SPARROW_IPC_API std::size_t calculate_schema_message_size (const sparrow::record_batch &record_batch)
 Calculates the total serialized size of a schema message.
 
SPARROW_IPC_API std::size_t calculate_record_batch_message_size (const sparrow::record_batch &record_batch, std::optional< CompressionType > compression=std::nullopt, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
 Calculates the total serialized size of a record batch message.
 
template<std::ranges::input_range R>
requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
std::size_t calculate_total_serialized_size (const R &record_batches, std::optional< CompressionType > compression=std::nullopt, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
 Calculates the total serialized size for a collection of record batches.
 
SPARROW_IPC_API void fill_body (const sparrow::arrow_proxy &arrow_proxy, any_output_stream &stream, std::optional< CompressionType > compression=std::nullopt, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
 Writes the serialized body data from an arrow proxy and its children to the output stream.
 
SPARROW_IPC_API void generate_body (const sparrow::record_batch &record_batch, any_output_stream &stream, std::optional< CompressionType > compression=std::nullopt, std::optional< std::reference_wrapper< CompressionCache > > cache=std::nullopt)
 Generates a serialized body from a record batch.
 
SPARROW_IPC_API std::vector< sparrow::data_type > get_column_dtypes (const sparrow::record_batch &rb)
 
serializer & end_stream (serializer &serializer)
 
SPARROW_IPC_API size_t write_footer (const sparrow::record_batch &record_batch, const std::vector< record_batch_block > &record_batch_blocks, any_output_stream &stream)
 Writes the Arrow IPC file footer.
 
SPARROW_IPC_API std::vector< sparrow::record_batch > deserialize_file (std::span< const uint8_t > data)
 Deserializes Arrow IPC file format into a vector of record batches.
 
stream_file_serializer & end_file (stream_file_serializer &serializer)
 

Variables

constexpr int SPARROW_IPC_VERSION_MAJOR = 0
 
constexpr int SPARROW_IPC_VERSION_MINOR = 2
 
constexpr int SPARROW_IPC_VERSION_PATCH = 0
 
constexpr int SPARROW_IPC_BINARY_CURRENT = 2
 
constexpr int SPARROW_IPC_BINARY_REVISION = 0
 
constexpr int SPARROW_IPC_BINARY_AGE = 1
 
constexpr std::array< std::uint8_t, 4 > continuation = {0xFF, 0xFF, 0xFF, 0xFF}
 Continuation value defined in the Arrow IPC specification: https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format.
 
constexpr std::array< std::uint8_t, 8 > end_of_stream = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00}
 End-of-stream marker defined in the Arrow IPC specification: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format.
 
constexpr std::array< std::uint8_t, 6 > arrow_file_magic = {'A', 'R', 'R', 'O', 'W', '1'}
 Magic bytes for the Arrow file format, as defined in the Arrow IPC specification (https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format). The magic string is "ARROW1" (6 bytes) followed by 2 padding bytes to reach 8-byte alignment.
 
constexpr std::size_t arrow_file_magic_size = arrow_file_magic.size()
 
constexpr std::array< std::uint8_t, 8 > arrow_file_header_magic = {'A', 'R', 'R', 'O', 'W', '1', 0x00, 0x00}
 Magic bytes with padding for file header (8 bytes total for alignment)
 

Enumeration Type Documentation

◆ CompressionType

enum class sparrow_ipc::CompressionType : std::uint8_t
strong
Enumerator
LZ4_FRAME 
ZSTD 

Definition at line 14 of file compression.hpp.

Function Documentation

◆ arrow_array_release()

template<ArrowPrivateData T>
void sparrow_ipc::arrow_array_release ( ArrowArray * array)

Definition at line 16 of file arrow_array.hpp.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ calculate_body_size() [1/2]

int64_t sparrow_ipc::calculate_body_size ( const sparrow::arrow_proxy & arrow_proxy,
std::optional< CompressionType > compression = std::nullopt,
std::optional< std::reference_wrapper< CompressionCache > > cache = std::nullopt )
nodiscard

Calculates the total aligned size in bytes of all buffers in an Arrow array structure.

This function recursively computes the total size needed for all buffers in an Arrow array structure, including buffers from child arrays. Each buffer size is aligned to 8-byte boundaries as required by the Arrow format.

Parameters
arrow_proxy — The Arrow array proxy containing buffers and child arrays.
compression — Optional: The compression type to use when serializing.
cache — Optional: A cache to store and retrieve compressed buffer sizes, avoiding recompression. If compression is given, cache should be set as well.
Returns
int64_t The total aligned size in bytes of all buffers in the array hierarchy.
Exceptions
std::invalid_argument — if compression is given but not cache.

◆ calculate_body_size() [2/2]

int64_t sparrow_ipc::calculate_body_size ( const sparrow::record_batch & record_batch,
std::optional< CompressionType > compression = std::nullopt,
std::optional< std::reference_wrapper< CompressionCache > > cache = std::nullopt )
nodiscard

Calculates the total body size of a record batch by summing the body sizes of all its columns.

This function iterates through all columns in the given record batch and accumulates the body size of each column's underlying Arrow array proxy. The body size represents the total memory required for the serialized data content of the record batch.

Parameters
record_batch — The sparrow record batch containing columns to calculate size for.
compression — Optional: The compression type to use when serializing. If not provided, sizes are for uncompressed buffers.
cache — Optional: A cache to store and retrieve compressed buffer sizes, avoiding recompression. If compression is given, cache should be set as well.
Returns
int64_t The total body size in bytes of all columns in the record batch.

◆ calculate_record_batch_message_size()

SPARROW_IPC_API std::size_t sparrow_ipc::calculate_record_batch_message_size ( const sparrow::record_batch & record_batch,
std::optional< CompressionType > compression = std::nullopt,
std::optional< std::reference_wrapper< CompressionCache > > cache = std::nullopt )
nodiscard

Calculates the total serialized size of a record batch message.

This function computes the complete size that would be produced by serialize_record_batch(), including:

  • Continuation bytes (4 bytes)
  • Message length prefix (4 bytes)
  • FlatBuffer record batch metadata
  • Padding to 8-byte alignment after metadata
  • Body data with 8-byte alignment between buffers
Parameters
record_batch — The record batch to be measured.
compression — Optional: The compression type to use when serializing.
cache — Optional: A cache to store and retrieve compressed buffer sizes, avoiding recompression. If compression is given, cache should be set as well.
Returns
The total size in bytes that the serialized record batch would occupy.
Here is the caller graph for this function:

◆ calculate_schema_message_size()

SPARROW_IPC_API std::size_t sparrow_ipc::calculate_schema_message_size ( const sparrow::record_batch & record_batch)
nodiscard

Calculates the total serialized size of a schema message.

This function computes the complete size that would be produced by serialize_schema_message(), including:

  • Continuation bytes (4 bytes)
  • Message length prefix (4 bytes)
  • FlatBuffer schema message data
  • Padding to 8-byte alignment
Parameters
record_batch — The record batch containing the schema to be measured
Returns
The total size in bytes that the serialized schema message would occupy
Here is the caller graph for this function:

◆ calculate_total_serialized_size()

template<std::ranges::input_range R>
requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
std::size_t sparrow_ipc::calculate_total_serialized_size ( const R & record_batches,
std::optional< CompressionType > compression = std::nullopt,
std::optional< std::reference_wrapper< CompressionCache > > cache = std::nullopt )
nodiscard

Calculates the total serialized size for a collection of record batches.

This function computes the complete size that would be produced by serializing a schema message followed by all record batch messages in the collection.

Template Parameters
R — Range type containing sparrow::record_batch objects.
Parameters
record_batches — Collection of record batches to be measured.
compression — Optional: The compression type to use when serializing.
cache — Optional: A cache to store and retrieve compressed buffer sizes, avoiding recompression. If compression is given, cache should be set as well.
Returns
The total size in bytes for the complete serialized output.
Exceptions
std::invalid_argument — if record batches have inconsistent schemas.

Definition at line 82 of file serialize_utils.hpp.

Here is the call graph for this function:

◆ compress()

SPARROW_IPC_API std::span< const std::uint8_t > sparrow_ipc::compress ( const CompressionType compression_type,
const std::span< const std::uint8_t > & data,
CompressionCache & cache )
nodiscard
Here is the call graph for this function:
Here is the caller graph for this function:

◆ create_children() [1/2]

::flatbuffers::Offset< ::flatbuffers::Vector<::flatbuffers::Offset< org::apache::arrow::flatbuf::Field > > > sparrow_ipc::create_children ( flatbuffers::FlatBufferBuilder & builder,
const ArrowSchema & arrow_schema )
nodiscard

Creates a FlatBuffers vector of Field objects from an ArrowSchema's children.

This function iterates through all children of the given ArrowSchema and converts each child to a FlatBuffers Field object. The resulting fields are collected into a FlatBuffers vector.

Parameters
builder — Reference to the FlatBufferBuilder used for creating FlatBuffers objects
arrow_schema — The ArrowSchema containing the children to convert
Returns
A FlatBuffers offset to a vector of Field objects, or 0 if no children exist
Exceptions
std::invalid_argument — If any child pointer in the ArrowSchema is null
Note
The function reserves space for all children upfront for performance optimization
Returns 0 (null offset) when the schema has no children, otherwise returns a valid vector offset

◆ create_children() [2/2]

::flatbuffers::Offset< ::flatbuffers::Vector<::flatbuffers::Offset< org::apache::arrow::flatbuf::Field > > > sparrow_ipc::create_children ( flatbuffers::FlatBufferBuilder & builder,
const sparrow::record_batch & record_batch )
nodiscard

Creates a FlatBuffers vector of Field objects from a record batch.

This function extracts column information from a record batch and converts each column into a FlatBuffers Field object. It uses both the column's Arrow schema and the record batch's column names to create properly named fields. The resulting fields are collected into a FlatBuffers vector.

Parameters
builder — Reference to the FlatBuffers builder used for creating the vector
record_batch — The record batch containing columns and their associated names
Returns
FlatBuffers offset to a vector of Field objects, or 0 if the record batch has no columns
Note
The function reserves space in the children vector based on the column count for performance optimization
Each field is created using the column name from record_batch.names() rather than from the Arrow schema, ensuring consistency with the record batch structure
This function properly handles the case where Arrow schemas may not have names by using the record batch's explicit column names via the name_override parameter

◆ create_field()

::flatbuffers::Offset< org::apache::arrow::flatbuf::Field > sparrow_ipc::create_field ( flatbuffers::FlatBufferBuilder & builder,
const ArrowSchema & arrow_schema,
std::optional< std::string_view > name_override = std::nullopt )
nodiscard

Creates a FlatBuffer Field object from an ArrowSchema.

This function converts an ArrowSchema structure into a FlatBuffer Field representation suitable for Apache Arrow IPC serialization. It handles the creation of all necessary components including field name, type information, metadata, children, and nullable flag.

Parameters
builder — Reference to the FlatBufferBuilder used for creating FlatBuffer objects
arrow_schema — The ArrowSchema structure containing the field definition to convert
name_override — Optional field name to use instead of the name from arrow_schema. If provided, this name will be used regardless of arrow_schema.name. If not provided, falls back to arrow_schema.name (or empty if null)
Returns
A FlatBuffer offset to the created Field object that can be used in further FlatBuffer construction operations
Note
Dictionary encoding is not currently supported (TODO item)
The function checks the NULLABLE flag from the ArrowSchema flags to determine nullability
The name_override parameter is useful when serializing record batches where column names are stored separately from the array schemas

◆ create_fieldnodes()

std::vector< org::apache::arrow::flatbuf::FieldNode > sparrow_ipc::create_fieldnodes ( const sparrow::record_batch & record_batch)
nodiscard

Creates a vector of Apache Arrow FieldNode objects from a record batch.

This function iterates through all columns in the provided record batch and generates corresponding FieldNode flatbuffer objects. Each column's arrow proxy is used to populate the field nodes vector through the fill_fieldnodes function.

Parameters
record_batch — The sparrow record batch containing columns to process
Returns
std::vector<org::apache::arrow::flatbuf::FieldNode> Vector of FieldNode objects representing the structure and metadata of each column

◆ create_metadata()

flatbuffers::Offset< flatbuffers::Vector< flatbuffers::Offset< org::apache::arrow::flatbuf::KeyValue > > > sparrow_ipc::create_metadata ( flatbuffers::FlatBufferBuilder & builder,
const ArrowSchema & arrow_schema )
nodiscard

Creates a FlatBuffers vector of KeyValue pairs from ArrowSchema metadata.

This function converts metadata from an ArrowSchema into a FlatBuffers representation suitable for serialization. It processes key-value pairs from the schema's metadata and creates corresponding FlatBuffers KeyValue objects.

Parameters
builder — Reference to the FlatBufferBuilder used for creating FlatBuffers objects
arrow_schema — The ArrowSchema containing metadata to be serialized
Returns
A FlatBuffers offset to a vector of KeyValue pairs. Returns 0 if the schema has no metadata (metadata is nullptr).
Note
The function reserves memory for the vector based on the metadata size for optimal performance.

◆ decompress()

SPARROW_IPC_API std::variant< std::vector< std::uint8_t >, std::span< const std::uint8_t > > sparrow_ipc::decompress ( const CompressionType compression_type,
std::span< const std::uint8_t > data )
nodiscard
Here is the call graph for this function:
Here is the caller graph for this function:

◆ deserialize_file()

SPARROW_IPC_API std::vector< sparrow::record_batch > sparrow_ipc::deserialize_file ( std::span< const uint8_t > data)
nodiscard

Deserializes Arrow IPC file format into a vector of record batches.

Reads an Arrow IPC file format which consists of:

  1. Magic bytes "ARROW1" with padding (8 bytes)
  2. Stream format data (schema + record batches)
  3. Footer containing metadata
  4. Footer size (int32)
  5. Trailing magic bytes "ARROW1" (6 bytes)
Parameters
data — A span of bytes containing the serialized Arrow IPC file data
Returns
std::vector<sparrow::record_batch> A vector containing all deserialized record batches
Exceptions
std::runtime_error — If:
  • The file magic bytes are incorrect
  • The footer is missing or invalid
  • Record batch deserialization fails
Note
The function validates the file structure including magic bytes at both start and end
Examples
/home/runner/work/sparrow-ipc/sparrow-ipc/include/sparrow_ipc/stream_file_serializer.hpp.

◆ deserialize_non_owning_fixedwidthbinary()

sparrow::fixed_width_binary_array sparrow_ipc::deserialize_non_owning_fixedwidthbinary ( const org::apache::arrow::flatbuf::RecordBatch & record_batch,
std::span< const uint8_t > body,
std::string_view name,
const std::optional< std::vector< sparrow::metadata_pair > > & metadata,
bool nullable,
size_t & buffer_index,
int32_t byte_width )
nodiscard

◆ deserialize_non_owning_primitive_array()

template<typename T>
sparrow::primitive_array< T > sparrow_ipc::deserialize_non_owning_primitive_array ( const org::apache::arrow::flatbuf::RecordBatch & record_batch,
std::span< const uint8_t > body,
std::string_view name,
const std::optional< std::vector< sparrow::metadata_pair > > & metadata,
bool nullable,
size_t & buffer_index )
nodiscard

Definition at line 18 of file deserialize_primitive_array.hpp.

Here is the call graph for this function:

◆ deserialize_non_owning_variable_size_binary()

template<typename T>
T sparrow_ipc::deserialize_non_owning_variable_size_binary ( const org::apache::arrow::flatbuf::RecordBatch & record_batch,
std::span< const uint8_t > body,
std::string_view name,
const std::optional< std::vector< sparrow::metadata_pair > > & metadata,
bool nullable,
size_t & buffer_index )
nodiscard

Definition at line 17 of file deserialize_variable_size_binary_array.hpp.

Here is the call graph for this function:

◆ deserialize_stream()

SPARROW_IPC_API std::vector< sparrow::record_batch > sparrow_ipc::deserialize_stream ( std::span< const uint8_t > data)
nodiscard

Deserializes an Arrow IPC stream from binary data into a vector of record batches.

This function processes an Arrow IPC stream format, extracting schema information and record batch data. It handles encapsulated messages sequentially, first expecting a Schema message followed by one or more RecordBatch messages.

Parameters
data — A span of bytes containing the serialized Arrow IPC stream data
Returns
std::vector<sparrow::record_batch> A vector containing all deserialized record batches
Exceptions
std::runtime_error — If:
  • A RecordBatch message is encountered before a Schema message
  • A RecordBatch message header is missing or invalid
  • Unsupported message types are encountered (Tensor, DictionaryBatch, SparseTensor)
  • An unknown message header type is encountered
Note
The function processes messages until an end-of-stream marker is detected
Here is the caller graph for this function:

◆ end_file()

stream_file_serializer & sparrow_ipc::end_file ( stream_file_serializer & serializer)
inline
Examples
/home/runner/work/sparrow-ipc/sparrow-ipc/include/sparrow_ipc/stream_file_serializer.hpp.

Definition at line 320 of file stream_file_serializer.hpp.

Here is the call graph for this function:

◆ end_stream()

serializer & sparrow_ipc::end_stream ( serializer & serializer)
inline
Examples
/home/runner/work/sparrow-ipc/sparrow-ipc/include/sparrow_ipc/serializer.hpp.

Definition at line 223 of file serializer.hpp.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ extract_encapsulated_message()

std::pair< encapsulated_message, std::span< const uint8_t > > sparrow_ipc::extract_encapsulated_message ( std::span< const uint8_t > buf_ptr)
nodiscard

◆ fill_arrow_array()

template<ArrowPrivateData T, typename Arg>
void sparrow_ipc::fill_arrow_array ( ArrowArray & array,
int64_t length,
int64_t null_count,
int64_t offset,
size_t children_count,
ArrowArray ** children,
ArrowArray * dictionary,
Arg && private_data_arg )

Definition at line 32 of file arrow_array.hpp.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ fill_body()

SPARROW_IPC_API void sparrow_ipc::fill_body ( const sparrow::arrow_proxy & arrow_proxy,
any_output_stream & stream,
std::optional< CompressionType > compression = std::nullopt,
std::optional< std::reference_wrapper< CompressionCache > > cache = std::nullopt )

Writes the serialized body data from an arrow proxy and its children to the output stream.

This function recursively processes an arrow proxy by:

  1. Iterating through all buffers in the proxy and writing their data to the output stream
  2. Adding padding bytes (zeros) after each buffer to align data to 8-byte boundaries
  3. Recursively processing all child proxies in the same manner

The function ensures proper memory alignment by padding each buffer's data to the next 8-byte boundary, which is typically required for efficient memory access and Arrow format compliance.

Parameters
arrow_proxy — The arrow proxy containing buffers and potential child proxies to serialize.
stream — The output stream where the serialized body data will be written.
compression — Optional: The compression type to use when serializing.
cache — Optional: A cache for compressed buffers to avoid recompression if compression is enabled. If compression is given, cache should be set as well.
Exceptions
std::invalid_argument — if compression is given but not cache.

◆ fill_buffers()

void sparrow_ipc::fill_buffers ( const sparrow::arrow_proxy & arrow_proxy,
std::vector< org::apache::arrow::flatbuf::Buffer > & flatbuf_buffers,
int64_t & offset )

Recursively fills a vector of FlatBuffer Buffer objects with buffer information from an Arrow proxy.

This function traverses an Arrow proxy structure and creates FlatBuffer Buffer entries for each buffer found in the proxy and its children. The buffers are processed in a depth-first manner, first handling the buffers of the current proxy, then recursively processing all child proxies.

Parameters
arrow_proxy — The Arrow proxy object containing buffers and potential child proxies to process
flatbuf_buffers — Vector of FlatBuffer Buffer objects to be populated with buffer information
offset — Reference to the current byte offset, updated as buffers are processed and aligned to 8-byte boundaries
Note
The offset is automatically aligned to 8-byte boundaries using utils::align_to_8() for each buffer
This function modifies both the flatbuf_buffers vector and the offset parameter

◆ fill_compressed_buffers()

void sparrow_ipc::fill_compressed_buffers ( const sparrow::arrow_proxy & arrow_proxy,
std::vector< org::apache::arrow::flatbuf::Buffer > & flatbuf_compressed_buffers,
int64_t & offset,
const CompressionType compression_type,
CompressionCache & cache )

Recursively populates a vector with compressed buffer metadata from an Arrow proxy.

This function traverses the Arrow proxy and its children, compressing each buffer and recording its metadata (offset and size) in the provided vector. The offset is updated to ensure proper alignment for each subsequent buffer.

Parameters
arrow_proxy: The Arrow proxy containing the buffers to be compressed.
flatbuf_compressed_buffers: A vector to store the resulting compressed buffer metadata.
offset: The current offset in the buffer layout, which will be updated by the function.
compression_type: The compression algorithm to use.
cache: A cache to store compressed buffers and avoid recompression.

◆ fill_fieldnodes()

void sparrow_ipc::fill_fieldnodes ( const sparrow::arrow_proxy & arrow_proxy,
std::vector< org::apache::arrow::flatbuf::FieldNode > & nodes )

Recursively fills a vector of FieldNode objects from an arrow_proxy and its children.

This function creates FieldNode objects containing length and null count information from the given arrow_proxy and recursively processes all its children, appending them to the provided nodes vector in depth-first order.

Parameters
arrow_proxy: The arrow proxy object containing array metadata (length, null_count) and potential child arrays.
nodes: Reference to a vector that will be populated with FieldNode objects. Each FieldNode contains the length and null count of the corresponding array.
Note
The function reserves space in the nodes vector to optimize memory allocation when processing children arrays.
The traversal order is depth-first, with parent nodes added before their children.

◆ fill_non_owning_arrow_schema()

template<sparrow::input_metadata_container M = std::vector<sparrow::metadata_pair>>
void sparrow_ipc::fill_non_owning_arrow_schema ( ArrowSchema & schema,
std::string_view format,
const char * name,
std::optional< M > metadata,
std::optional< std::unordered_set< sparrow::ArrowFlag > > flags,
size_t children_count,
ArrowSchema ** children,
ArrowSchema * dictionary )

Definition at line 18 of file arrow_schema.hpp.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ generate_body()

SPARROW_IPC_API void sparrow_ipc::generate_body ( const sparrow::record_batch & record_batch,
any_output_stream & stream,
std::optional< CompressionType > compression = std::nullopt,
std::optional< std::reference_wrapper< CompressionCache > > cache = std::nullopt )

Generates a serialized body from a record batch.

This function iterates through all columns in the provided record batch, extracts their Arrow proxy representations, and writes their serialized buffers to the output stream, where they form the body of the serialized data.

Parameters
record_batch: The record batch containing columns to be serialized.
stream: The output stream where the serialized body will be written.
compression: (Optional) The compression type to use when serializing.
cache: (Optional) A cache for compressed buffers to avoid recompression if compression is enabled. If compression is given, cache should be set as well.

◆ get_buffers()

std::vector< org::apache::arrow::flatbuf::Buffer > sparrow_ipc::get_buffers ( const sparrow::record_batch & record_batch)
nodiscard

Extracts buffer information from a record batch for serialization.

This function iterates through all columns in the provided record batch and collects their buffer information into a vector of Arrow FlatBuffer Buffer objects. The buffers are processed sequentially with cumulative offset tracking.

Parameters
record_batch: The sparrow record batch containing columns to extract buffers from.
Returns
std::vector<org::apache::arrow::flatbuf::Buffer> A vector containing all buffer descriptors from the record batch columns, with properly calculated offsets
Note
This function relies on the fill_buffers helper function to process individual column buffers and maintain offset consistency across all buffers.

◆ get_column_dtypes()

SPARROW_IPC_API std::vector< sparrow::data_type > sparrow_ipc::get_column_dtypes ( const sparrow::record_batch & rb)
Examples
/home/runner/work/sparrow-ipc/sparrow-ipc/include/sparrow_ipc/chunk_memory_serializer.hpp.
Here is the caller graph for this function:

◆ get_compressed_buffers()

std::vector< org::apache::arrow::flatbuf::Buffer > sparrow_ipc::get_compressed_buffers ( const sparrow::record_batch & record_batch,
const CompressionType compression_type,
CompressionCache & cache )
nodiscard

Retrieves metadata describing the layout of compressed buffers within a record batch.

This function processes a record batch to determine the metadata (offset and size) for each of its buffers, assuming they are compressed using the specified algorithm. This metadata accounts for each compressed buffer being prefixed by its 8-byte uncompressed size and padded to ensure 8-byte alignment.

Parameters
record_batch: The record batch whose buffers' compressed metadata is to be retrieved.
compression_type: The compression algorithm that would be applied (e.g., LZ4_FRAME, ZSTD).
cache: A cache to store compressed buffers and avoid recompression.
Returns
A vector of FlatBuffer Buffer objects, each describing the offset and size of a corresponding compressed buffer within a larger message body.

◆ get_compressed_size()

SPARROW_IPC_API size_t sparrow_ipc::get_compressed_size ( const CompressionType compression_type,
const std::span< const std::uint8_t > & data,
CompressionCache & cache )
nodiscard
Here is the call graph for this function:
Here is the caller graph for this function:

◆ get_flatbuffer_decimal_type()

std::pair< org::apache::arrow::flatbuf::Type, flatbuffers::Offset< void > > sparrow_ipc::get_flatbuffer_decimal_type ( flatbuffers::FlatBufferBuilder & builder,
std::string_view format_str,
const int32_t bitWidth )
nodiscard

◆ get_flatbuffer_type()

std::pair< org::apache::arrow::flatbuf::Type, flatbuffers::Offset< void > > sparrow_ipc::get_flatbuffer_type ( flatbuffers::FlatBufferBuilder & builder,
std::string_view format_str )
nodiscard

◆ get_footer_from_file_data()

SPARROW_IPC_API const org::apache::arrow::flatbuf::Footer * sparrow_ipc::get_footer_from_file_data ( std::span< const uint8_t > file_data)
nodiscard

◆ get_record_batch_message_builder()

flatbuffers::FlatBufferBuilder sparrow_ipc::get_record_batch_message_builder ( const sparrow::record_batch & record_batch,
std::optional< CompressionType > compression = std::nullopt,
std::optional< std::reference_wrapper< CompressionCache > > cache = std::nullopt )
nodiscard

Creates a FlatBuffer message containing a serialized Apache Arrow RecordBatch.

This function builds a complete Arrow IPC message by serializing a record batch along with its metadata (field nodes and buffer information) into a FlatBuffer format that conforms to the Arrow IPC specification.

Parameters
record_batch: The source record batch containing the data to be serialized.
compression: (Optional) The compression algorithm to be used for the message body.
cache: (Optional) A cache for compressed buffers to avoid recompression if compression is enabled. If compression is given, cache should be set as well.
Returns
A FlatBufferBuilder containing the complete serialized message ready for transmission or storage. The builder is finished and ready to be accessed via GetBufferPointer() and GetSize().
Exceptions
std::invalid_argument: if compression is given but not cache.
Note
The returned message uses Arrow IPC format version V5.
Variadic buffer counts are not currently implemented (set to 0).

◆ get_schema_message_builder()

flatbuffers::FlatBufferBuilder sparrow_ipc::get_schema_message_builder ( const sparrow::record_batch & record_batch)
nodiscard

Creates a FlatBuffer builder containing a serialized Arrow schema message.

This function constructs an Arrow IPC schema message from a record batch by:

  1. Creating field definitions from the record batch columns
  2. Building a Schema flatbuffer with little-endian byte order
  3. Wrapping the schema in a Message with metadata version V5
  4. Finalizing the buffer for serialization
Parameters
record_batch: The source record batch containing column definitions.
Returns
flatbuffers::FlatBufferBuilder A completed FlatBuffer containing the schema message, ready for Arrow IPC serialization
Note
The schema message has zero body length as it contains only metadata
Currently uses little-endian byte order (marked as TODO for configurability)

◆ is_arrow_file_magic()

template<std::ranges::input_range R>
bool sparrow_ipc::is_arrow_file_magic ( const R & buf)
nodiscard

Definition at line 49 of file magic_values.hpp.

◆ is_continuation()

template<std::ranges::input_range R>
bool sparrow_ipc::is_continuation ( const R & buf)
nodiscard

Definition at line 37 of file magic_values.hpp.

◆ is_end_of_stream()

template<std::ranges::input_range R>
bool sparrow_ipc::is_end_of_stream ( const R & buf)
nodiscard

Definition at line 43 of file magic_values.hpp.

◆ make_arrow_array()

template<ArrowPrivateData T, typename Arg>
ArrowArray sparrow_ipc::make_arrow_array ( int64_t length,
int64_t null_count,
int64_t offset,
size_t children_count,
ArrowArray ** children,
ArrowArray * dictionary,
Arg && private_data_arg )
nodiscard

Definition at line 63 of file arrow_array.hpp.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ make_non_owning_arrow_schema()

template<sparrow::input_metadata_container M = std::vector<sparrow::metadata_pair>>
ArrowSchema sparrow_ipc::make_non_owning_arrow_schema ( std::string_view format,
const char * name,
std::optional< M > metadata,
std::optional< std::unordered_set< sparrow::ArrowFlag > > flags,
size_t children_count,
ArrowSchema ** children,
ArrowSchema * dictionary )
nodiscard

Definition at line 57 of file arrow_schema.hpp.

Here is the call graph for this function:
Here is the caller graph for this function:

◆ release_arrow_array_children_and_dictionary()

SPARROW_IPC_API void sparrow_ipc::release_arrow_array_children_and_dictionary ( ArrowArray * array)
Here is the caller graph for this function:

◆ release_common_non_owning_arrow()

template<class T>
requires std::same_as<T, ArrowArray> || std::same_as<T, ArrowSchema>
void sparrow_ipc::release_common_non_owning_arrow ( T & t)

Release the children and dictionary of an ArrowArray or ArrowSchema.

Template Parameters
T: ArrowArray or ArrowSchema.
Parameters
t: The ArrowArray or ArrowSchema to release.

Definition at line 20 of file arrow_array_schema_common_release.hpp.

◆ release_non_owning_arrow_schema()

SPARROW_IPC_API void sparrow_ipc::release_non_owning_arrow_schema ( ArrowSchema * schema)
Here is the caller graph for this function:

◆ serialize_record_batch()

SPARROW_IPC_API serialized_record_batch_info sparrow_ipc::serialize_record_batch ( const sparrow::record_batch & record_batch,
any_output_stream & stream,
std::optional< CompressionType > compression,
std::optional< std::reference_wrapper< CompressionCache > > cache )

Serializes a record batch into a binary format following the Arrow IPC specification.

This function writes a sparrow record batch to the output stream in a serialized form that includes:

  • A continuation marker
  • The record batch message length (4 bytes)
  • The flatbuffer-encoded record batch metadata
  • Padding to align to 8-byte boundaries
  • The record batch body containing the actual data buffers
Parameters
record_batch: The sparrow record batch to serialize.
stream: The output stream where the serialized record batch will be written.
compression: (Optional) The compression type to use when serializing.
cache: (Optional) A cache to store and retrieve compressed buffers, avoiding recompression.
Note
If compression is given, cache should be set as well.
The output follows Arrow IPC message format with proper alignment and includes both metadata and data portions of the record batch
Examples
/home/runner/work/sparrow-ipc/sparrow-ipc/include/sparrow_ipc/chunk_memory_serializer.hpp.
Here is the caller graph for this function:

◆ serialize_record_batches_to_ipc_stream()

template<std::ranges::input_range R>
requires std::same_as<std::ranges::range_value_t<R>, sparrow::record_batch>
void sparrow_ipc::serialize_record_batches_to_ipc_stream ( const R & record_batches,
any_output_stream & stream,
std::optional< CompressionType > compression,
std::optional< std::reference_wrapper< CompressionCache > > cache )

Serializes a collection of record batches into a binary format.

This function takes a collection of record batches and serializes them into a single binary representation following the Arrow IPC format. The serialization includes:

  • Schema message (derived from the first record batch)
  • All record batch data
  • End-of-stream marker
Template Parameters
R: Container type that holds record batches (must support empty(), operator[], begin(), end()).
Parameters
record_batches: Collection of record batches to serialize. All batches must have identical schemas.
stream: The output stream where the serialized data will be written.
compression: (Optional) The compression type to use when serializing.
cache: (Optional) A cache to store and retrieve compressed buffers, avoiding recompression. If compression is given, cache should be set as well.
Exceptions
std::invalid_argument: If record batches have inconsistent schemas or if the collection contains batches that cannot be serialized together.
Precondition
All record batches in the collection must have the same schema
The container R must not be empty when consistency checking is required

Definition at line 50 of file serialize.hpp.

Here is the call graph for this function:

◆ serialize_schema_message()

SPARROW_IPC_API void sparrow_ipc::serialize_schema_message ( const sparrow::record_batch & record_batch,
any_output_stream & stream )

Serializes a schema message for a record batch into a byte buffer.

Serializes a record batch schema into a binary message format.

This function creates a serialized schema message following the Arrow IPC format. The resulting buffer contains:

  1. Continuation bytes at the beginning
  2. A 4-byte length prefix indicating the size of the schema message
  3. The actual FlatBuffer schema message bytes
  4. Padding bytes to align the total size to 8-byte boundaries
Parameters
record_batch: The record batch containing the schema to serialize.
stream: The output stream where the serialized schema message will be written.

This function creates a serialized schema message by combining continuation bytes, a length prefix, the flatbuffer schema data, and padding to ensure 8-byte alignment. The resulting format follows the Arrow IPC specification for schema messages.

Parameters
record_batch: The record batch containing the schema to be serialized.
stream: The output stream where the serialized schema message will be written.
Examples
/home/runner/work/sparrow-ipc/sparrow-ipc/include/sparrow_ipc/chunk_memory_serializer.hpp.
Here is the caller graph for this function:

◆ to_sparrow_metadata()

std::vector< sparrow::metadata_pair > sparrow_ipc::to_sparrow_metadata ( const ::flatbuffers::Vector<::flatbuffers::Offset< org::apache::arrow::flatbuf::KeyValue > > & metadata)

Converts FlatBuffers metadata to Sparrow metadata format.

This function takes a FlatBuffers vector containing key-value pairs from Apache Arrow format and converts them into a vector of Sparrow metadata pairs. Each key-value pair from the FlatBuffers structure is extracted and stored as a sparrow::metadata_pair.

Parameters
metadata: A FlatBuffers vector containing KeyValue pairs from Apache Arrow format.
Returns
std::vector<sparrow::metadata_pair> A vector of Sparrow metadata pairs containing the converted key-value data
Note
The function reserves space in the output vector to match the input size for optimal memory allocation performance.

◆ write_footer()

SPARROW_IPC_API size_t sparrow_ipc::write_footer ( const sparrow::record_batch & record_batch,
const std::vector< record_batch_block > & record_batch_blocks,
any_output_stream & stream )

Writes the Arrow IPC file footer.

Parameters
record_batch: A record batch containing the schema for the footer.
record_batch_blocks: Vector of block information for each record batch.
stream: The output stream to write the footer to.
Returns
The size of the footer in bytes
Examples
/home/runner/work/sparrow-ipc/sparrow-ipc/include/sparrow_ipc/stream_file_serializer.hpp.

Variable Documentation

◆ arrow_file_header_magic

std::array<std::uint8_t, 8> sparrow_ipc::arrow_file_header_magic = {'A', 'R', 'R', 'O', 'W', '1', 0x00, 0x00}
inline constexpr

Magic bytes with padding for file header (8 bytes total for alignment)

Definition at line 34 of file magic_values.hpp.

◆ arrow_file_magic

std::array<std::uint8_t, 6> sparrow_ipc::arrow_file_magic = {'A', 'R', 'R', 'O', 'W', '1'}
inline constexpr

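Magic bytes for the Arrow file format, as defined in the Arrow IPC specification: https://arrow.apache.org/docs/format/Columnar.html#ipc-file-format. This array holds only the 6-byte magic string "ARROW1"; in the file header the magic string is followed by 2 padding bytes to reach 8-byte alignment (see arrow_file_header_magic for the padded 8-byte form).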

Definition at line 28 of file magic_values.hpp.

◆ arrow_file_magic_size

std::size_t sparrow_ipc::arrow_file_magic_size = arrow_file_magic.size()
inline constexpr

Definition at line 29 of file magic_values.hpp.

◆ continuation

std::array<std::uint8_t, 4> sparrow_ipc::continuation = {0xFF, 0xFF, 0xFF, 0xFF}
inline constexpr

Continuation value defined in the Arrow IPC specification: https://arrow.apache.org/docs/format/Columnar.html#encapsulated-message-format.

Definition at line 15 of file magic_values.hpp.

◆ end_of_stream

std::array<std::uint8_t, 8> sparrow_ipc::end_of_stream = {0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00}
inline constexpr

End-of-stream marker defined in the Arrow IPC specification: https://arrow.apache.org/docs/format/Columnar.html#ipc-streaming-format.

Definition at line 21 of file magic_values.hpp.

◆ SPARROW_IPC_BINARY_AGE

int sparrow_ipc::SPARROW_IPC_BINARY_AGE = 1
constexpr

Definition at line 11 of file sparrow_ipc_version.hpp.

◆ SPARROW_IPC_BINARY_CURRENT

int sparrow_ipc::SPARROW_IPC_BINARY_CURRENT = 2
constexpr

Definition at line 9 of file sparrow_ipc_version.hpp.

◆ SPARROW_IPC_BINARY_REVISION

int sparrow_ipc::SPARROW_IPC_BINARY_REVISION = 0
constexpr

Definition at line 10 of file sparrow_ipc_version.hpp.

◆ SPARROW_IPC_VERSION_MAJOR

int sparrow_ipc::SPARROW_IPC_VERSION_MAJOR = 0
constexpr

Definition at line 5 of file sparrow_ipc_version.hpp.

◆ SPARROW_IPC_VERSION_MINOR

int sparrow_ipc::SPARROW_IPC_VERSION_MINOR = 2
constexpr

Definition at line 6 of file sparrow_ipc_version.hpp.

◆ SPARROW_IPC_VERSION_PATCH

int sparrow_ipc::SPARROW_IPC_VERSION_PATCH = 0
constexpr

Definition at line 7 of file sparrow_ipc_version.hpp.