Ruby red-arrow

From wikinotes

A library for serializing/deserializing and interacting with various columnar data formats (e.g. Parquet).

Documentation

api docs https://www.rubydoc.info/gems/red-arrow/11.0.0
github https://github.com/apache/arrow/tree/main/ruby/red-arrow

Usage

# instantiation
table = Arrow::Table.new(
  "name" => ["luke", "leia", "anakin"],
  "age" => [5, 5, 25]
)
table = Arrow::Table.load("foo.parquet", format: :parquet)

# serialize
buf = Arrow::ResizableBuffer.new(1024)
table.save(buf, format: :parquet)
buf.data.to_s  # << serialized table

# deserialize
buf = Arrow::Buffer.new("..serialized-parquet..")
table = Arrow::Table.load(buf, format: :parquet)
table.n_rows                   # number of rows
table.column_names             # names of columns

table.slice(1, 4)              # new table w/ 4 rows starting at offset 1
table.group("name")            # group by a value
table.slice(1, 1).raw_records  # json-esque data

table.select_columns("name", "age")  # only return these columns
table
  .select_columns(..)
  .each_column { |col| col.each_with_index { |value, _index| puts value } }  # each row value for selected column

sub-tables are defined as structs

# Array of Structs [{year: 2020, month: 6, day: 21}, {year: 2021, month: 7, day: 25}]
datatype = Arrow::StructDataType.new(
  year: { type: :int32 }, 
  month: { type: :int32 }, 
  day: { type: :int32 },
)
values = Arrow::StructArray.new(datatype, [[2020, 6, 21], [2021, 7, 25]])  # one [year, month, day] entry per struct
table = Arrow::Table.new(date: values)

# Struct {foo: 1, bar: "hi"}
Arrow::StructScalar.new(
  Arrow::StructDataType.new(foo: Arrow::Int64DataType.new, bar: Arrow::StringDataType.new),
  [Arrow::Int64Scalar.new(1), Arrow::StringScalar.new("hi")]
)

# Array [1, 2]
Arrow::Int64Array.new([1, 2])

# Array of Lists [[1, 2], [3, 4]]
Arrow::ListArray.new(
  Arrow::ListDataType.new(field: { name: :element, type: :int64}), 
  [[1, 2], [3, 4]]
)

type info

Arrow::DataType.resolve "a"  # error lists all available datatypes

# print schema for target column
table
  .select_columns("target_column")
  .each_column { |col| puts col.data_type }

Misc

to categorize

# Create a table-schema, with a struct
struct_type = Arrow::StructDataType.new(bar: Arrow::StringDataType.new)
schema = Arrow::Schema.new([Arrow::Field.new("foo", struct_type)])
struct = Arrow::StructScalar.new(struct_type, [Arrow::StringScalar.new("foo")])

data = schema.fields.first.data_type.build_array([struct.value])
Arrow::Table.new(schema, [data])

# Create a table-schema with 2x columns
schema = Arrow::Schema.new([
  Arrow::Field.new("x", Arrow::Int64DataType.new), 
  Arrow::Field.new("y", Arrow::StringDataType.new)
])
Arrow::Table.new(schema, [[1, "x"], [2, "y"], [3, "z"]])