diff --git a/README.md b/README.md index 5e29d2c..a03c782 100644 --- a/README.md +++ b/README.md @@ -24,22 +24,30 @@ With these simple definitions, powerful workflows are enabled: * A package providing data cleansing, manipulation, visualization, or analysis can automatically handle any number of decoupled input table types * A tabular file format can have automatic integration with in-memory structures and translation to other file formats + +# Tables Interface + So how does one go about satisfying the Tables.jl interface functions? It mainly depends on what you've already defined and the natural access patterns of your table: -First: +## First: + * `Tables.istable(::Type{<:MyTable}) = true`: this provides an explicit affirmation that your type implements the Tables interface -To support `Rows`: +## To support `Rows`: + * Define `Tables.rowaccess(::Type{<:MyTable}) = true`: this signals to other types that `MyTable` supports valid `Row`-iteration * Define `Tables.rows(x::MyTable)`: return a `Row`-iterator object (perhaps the table itself if already defined) * Define `Tables.schema(Tables.rows(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason -To support `Columns`: +## To support `Columns`: + * Define `Tables.columnaccess(::Type{<:MyTable}) = true`: this signals to other types that `MyTable` supports returning a valid `Columns` object * Define `Tables.columns(x::MyTable)`: return a `Columns`, property-accessible object (perhaps the table itself if it naturally supports property-access to columns) * Define `Tables.schema(Tables.columns(x::MyTable))` to either return a `Tables.Schema` object, or `nothing` if the schema is unknown or non-inferrable for some reason -The final question is how `MyTable` can be a "sink" for any other table type. The answer is quite simple: use the interface functions! +## Sinks (transferring data from one table to another) + +Another question is how `MyTable` can be a "sink" for any other table type. The answer is quite simple: use the interface functions! * Define a function or constructor that takes, at a minimum, a single, untyped argument and then calls `Tables.rows` or `Tables.columns` on that argument to construct an instance of `MyTable` @@ -76,3 +84,35 @@ end ``` Obviously every table type is different, but via a combination of `Tables.rows` and `Tables.columns` each table type should be able to construct an instance of itself. + +## Functions that input and output tables: + +For functions that input a table, perform some calculation, and output a new table, we need a way of constructing the preferred output table given the input. For this purpose, `Tables.materializer(table)` returns the preferred sink function for a table (`Tables.columntable`, which creates a named tuple of AbstractVectors, is the default). + +Note that an in-memory table with a properly defined "sink" function can reconstruct itself with the following: + +```julia +materializer(table)(columns(table)) + +materializer(table)(rows(table)) +``` + +For example, we may want to select a subset of columns from a column-access table. One way we could implement it is with the following: + +```julia +function select(table, cols::Symbol...) + Tables.istable(table) || throw(ArgumentError("select requires a table input")) + nt = Tables.columntable(table) # columntable(t) creates a NamedTuple of AbstractVectors + newcols = NamedTuple{cols}(nt) + Tables.materializer(table)(newcols) +end + +# Example of selecting columns from a columntable +tbl = (x=1:100, y=rand(100), z=randn(100)) +select(tbl, :x) +select(tbl, :x, :z) + +tbl = [(x=1, y="a", z=1.0), (x=2, y="b", z=2.0)] +select(tbl, :z, :x) +``` + diff --git a/src/Tables.jl b/src/Tables.jl index 435abf6..ea7f8cf 100644 --- a/src/Tables.jl +++ b/src/Tables.jl @@ -123,6 +123,7 @@ rowaccess(::Type{T}) where {T} = false columnaccess(x::T) where {T} = columnaccess(T) columnaccess(::Type{T}) where {T} = false schema(x) = nothing +materializer(x) = columntable # Schema implementation """ diff --git a/src/namedtuples.jl b/src/namedtuples.jl index 45ecebb..8eddfd9 100644 --- a/src/namedtuples.jl +++ b/src/namedtuples.jl @@ -12,6 +12,7 @@ rowaccess(::Type{<:RowTable}) = true # a Vector of NamedTuple iterates `Row`s itself rows(x::RowTable) = x schema(x::Vector{NamedTuple{names, types}}) where {names, types} = Schema(names, types) +materializer(x::RowTable) = rowtable # struct to transform `Row`s into NamedTuples struct NamedTupleIterator{S, T} @@ -75,6 +76,7 @@ columnaccess(::Type{<:ColumnTable}) = true # a NamedTuple of AbstractVectors is itself a `Columns` object columns(x::ColumnTable) = x schema(x::T) where {T <: ColumnTable} = Schema(names(T), _types(T)) +materializer(x::ColumnTable) = columntable _eltype(::Type{A}) where {A <: AbstractVector{T}} where {T} = T Base.@pure function _types(::Type{NT}) where {NT <: NamedTuple{names, T}} where {names, T <: NTuple{N, AbstractVector{S} where S}} where {N} diff --git a/test/runtests.jl b/test/runtests.jl index e478327..234eb6b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -108,6 +108,27 @@ end @test isequal(rt, nti2) end +@testset "Materializer" begin + rt = [(a=1, b=4.0, c="7"), (a=2, b=5.0, c="8"), (a=3, b=6.0, c="9")] + nt = (a=[1,2,3], b=[4.0, 5.0, 6.0], c=["7", "8", "9"]) + + @test nt == Tables.materializer(nt)(Tables.columns(nt)) + @test nt == Tables.materializer(nt)(Tables.columns(rt)) + @test nt == Tables.materializer(nt)(rt) + @test rt == Tables.materializer(rt)(nt) + + function select(table, cols::Symbol...) + Tables.istable(table) || throw(ArgumentError("select requires a table input")) + nt = Tables.columntable(table) # columntable(t) creates a NamedTuple of AbstractVectors + newcols = NamedTuple{cols}(nt) + Tables.materializer(table)(newcols) + end + + @test select(nt, :a, :b, :c) == nt + @test select(nt, :c, :a) == NamedTuple{(:c, :a)}(nt) + @test select(rt, :a) == [(a=1,), (a=2,), (a=3,)] +end + import Base: == struct GenericRow a::Int