Ruby red-arrow
From wikinotes
A library for serializing/deserializing and interacting with various columnar data formats (e.g. Parquet).
Documentation
API docs: https://www.rubydoc.info/gems/red-arrow/11.0.0 | GitHub: https://github.com/apache/arrow/tree/main/ruby/red-arrow
Usage
# instantiation
# Build a table in memory from column name => values pairs ...
table = Arrow::Table.new(
  "name" => ["luke", "leia", "anakin"],
  "age" => [5, 5, 25]
)
# ... or load one from a file on disk.
table = Arrow::Table.load("foo.parquet", format: :parquet)

# serialize
buf = Arrow::ResizableBuffer.new(1024)
table.save(buf, format: :parquet)
buf.data.to_s # << serialized table

# deserialize
buf = Arrow::Buffer.new("..serialized-parquet..")
table = Arrow::Table.load(buf, format: :parquet)

table.n_rows                        # number of rows
table.column_names                  # names of columns
table.slice(1, 4)                   # new table w/ rows between 1-4
table.group("name")                 # group by a value
table.slice(1, 1).raw_records       # json-esque data
table.select_columns("name", "age") # only return these columns

# each row value for selected column
table
  .select_columns("name", "age")
  .each_column { |col| col.each_with_index { |row| puts row } }

# sub-tables are defined as structs
# Array of Structs [{year: 2020, month: 6, day: 21}, {year: 2021, month: 7, day: 25}]
datatype = Arrow::StructDataType.new(
  year: { type: :int32 },
  month: { type: :int32 },
  day: { type: :int32 },
)
# Records are given row-wise: one inner array per struct, in field order.
values = Arrow::StructArray.new(datatype, [[2020, 6, 21], [2021, 7, 25]])
table = Arrow::Table.new(date: values)

# Struct {foo: 1, bar: "hi"}
Arrow::StructScalar.new(
  Arrow::StructDataType.new(foo: Arrow::Int64DataType.new, bar: Arrow::StringDataType.new),
  [Arrow::Int64Scalar.new(1), Arrow::StringScalar.new("hi")]
)

# Array [1, 2]
Arrow::Int64Array.new([1, 2])

# Array of Lists [[1, 2], [3, 4]]
Arrow::ListArray.new(
  Arrow::ListDataType.new(field: { name: :element, type: :int64 }),
  [[1, 2], [3, 4]]
)

# type info
# Resolving a bogus type name raises; the error lists all available datatypes
Arrow::DataType.resolve("a")

# print schema for target column
table
  .select_columns("target_column")
  .each_column { |col| puts col.data_type }
Misc
to categorize
# Create a table-schema, with a struct:
# a single "foo" column whose type is a struct with one string field, "bar".
struct_type = Arrow::StructDataType.new(bar: Arrow::StringDataType.new)
foo_field = Arrow::Field.new("foo", struct_type)
schema = Arrow::Schema.new([foo_field])

# One struct value to store in the column.
bar_value = Arrow::StringScalar.new("foo")
struct = Arrow::StructScalar.new(struct_type, [bar_value])

# Build the column array using the data type read back from the schema's first field.
foo_type = schema.fields.first.data_type
data = foo_type.build_array([struct.value])

Arrow::Table.new(schema, [data])
# Create a table-schema with 2x columns: int64 "x" and string "y".
fields = [
  Arrow::Field.new("x", Arrow::Int64DataType.new),
  Arrow::Field.new("y", Arrow::StringDataType.new),
]
schema = Arrow::Schema.new(fields)

# Each inner array is one record, with values in schema field order.
records = [[1, "x"], [2, "y"], [3, "z"]]
Arrow::Table.new(schema, records)