Ruby red-arrow: Difference between revisions
From wikinotes
(→Usage) |
(→Usage) |
||
Line 48: | Line 48: | ||
sub-tables are defined as structs | sub-tables are defined as structs | ||
<source lang="ruby"> | <source lang="ruby"> | ||
datatype = Arrow::StructDataType.new( | datatype = Arrow::StructDataType.new( | ||
year: { type: :int32 }, | |||
values = Arrow::StructArray.new(datatype, [[ | month: { type: :int32 }, | ||
day: { type: :int32 }, | |||
) | |||
values = Arrow::StructArray.new(datatype, [[2020, 2021], [6, 7], [21, 25]]) | |||
table = Arrow::Table.new(date: values) | |||
</source> | </source> | ||
</blockquote><!-- Usage --> | </blockquote><!-- Usage --> |
Revision as of 20:15, 6 April 2023
A library for serializing/deserializing and interacting with various columnar data formats (e.g. Parquet).
Documentation
api docs https://www.rubydoc.info/gems/red-arrow/11.0.0 github https://github.com/apache/arrow/tree/main/ruby/red-arrow
Usage
# instantiation table = Arrow::Table.new( "name" => ["luke", "leia", "anakin"], "age" => [5, 5, 25] ) table = Arrow::Table.load("foo.parquet", format: :parquet) # serialize buf = Arrow::ResizableBuffer.new(1024) table.save(buf, format: :parquet) buf.data.to_s # << serialized table # deserialize buf = Arrow::Buffer.new("..serialized-parquet..") table = Arrow::Table.load(buf, format: :parquet)
table.n_rows # number of rows table.column_names # names of columns table.slice(1, 4) # new table w/ rows between 1-4 table.group("name") # group by a value table.slice(1, 1).raw_records # json-esque data table.select_columns("name", "age") # only return these columns table .select_columns(..) .each_column { |col| col.each_with_index { |row| puts row } } # each row value for selected column
sub-tables are defined as structs
datatype = Arrow::StructDataType.new( year: { type: :int32 }, month: { type: :int32 }, day: { type: :int32 }, ) values = Arrow::StructArray.new(datatype, [[2020, 2021], [6, 7], [21, 25]]) table = Arrow::Table.new(date: values)