Skip to content

Commit c210d56

Browse files
committed
added statistics methods to DataFrame including #describe, #cov, #corr, #mean, #std etc. Also monkey patched Matrix to include #elementwise_division
1 parent db52927 commit c210d56

File tree

8 files changed

+118
-25
lines changed

8 files changed

+118
-25
lines changed

History.txt

+6-1
Original file line numberDiff line numberDiff line change
@@ -69,4 +69,9 @@
6969
* Convert Daru::Vector to horizontal or vertical Ruby Matrix with #to_matrix.
7070
* Added shortcut to DataFrame to allow access of vectors by using only #[] instead of calling #vector or *[vector_names, :vector]*.
7171
* Added DSL for Vector and DataFrame plotting with nyaplot. Can now grab the underlying Nyaplot::Plot and Nyaplot::Diagram object for performing different operations. Only need to supply parameters for the initial creation of the diagram.
72-
* Added #pivot\_table to DataFrame for reducing and aggregating data to generate a quick summary.
72+
* Added #pivot\_table to DataFrame for reducing and aggregating data to generate a quick summary.
73+
* Added #shape to DataFrame for getting the number of rows and columns in a DataFrame.
74+
* Added statistics methods #mean, #std, #max, #min, #count, #product, #sum to DataFrame.
75+
* Added #describe to DataFrame for producing several summary statistics of the numeric vectors in one shot.
76+
* Monkey patched Ruby Matrix to include #elementwise_division.
77+
* Added #covariance to calculate the covariance between the numeric vectors of a DataFrame and #correlation to calculate their correlation.

lib/daru/dataframe.rb

+15
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,21 @@ def filter_vectors &block
393393
df
394394
end
395395

396+
# Return the number of rows and columns of the DataFrame in an Array.
#
# The first element is the size of @index (rows) and the second is the
# size of @vectors (columns).
def shape
  [@index.size, @vectors.size]
end
400+
401+
# The number of rows, i.e. the first element of #shape.
def rows
  shape[0]
end
405+
406+
# The number of vectors, i.e. the second element of #shape.
def cols
  shape[1]
end
410+
396411
# Check if a vector is present
397412
def has_vector? vector
398413
!!@vectors[*vector]

lib/daru/maths/statistics/dataframe.rb

+66-12
Original file line numberDiff line numberDiff line change
@@ -4,53 +4,107 @@ module Statistics
44
module DataFrame
  # Calculate mean of numeric vectors.
  def mean
    compute_stats :mean
  end

  # Calculate sample standard deviation of numeric vectors.
  def std
    compute_stats :std
  end

  # Calculate sum of numeric vectors
  def sum
    compute_stats :sum
  end

  # Count the number of non-nil values in each vector.
  def count
    compute_stats :count
  end

  # Calculate the maximum value of each numeric vector.
  def max
    compute_stats :max
  end

  # Calculate the minimum value of each numeric vector.
  def min
    compute_stats :min
  end

  # Compute the product of each numeric vector.
  def product
    compute_stats :product
  end

  # Create a summary of count, mean, standard deviation, min and max of
  # each numeric vector in the dataframe in one shot.
  #
  # == Arguments
  #
  # +methods+ - An array with aggregation methods specified as symbols to
  # be applied to numeric vectors. Default is [:count, :mean, :std, :min,
  # :max]. Methods will be applied in the specified order.
  #
  # Returns a Daru::DataFrame with one vector per numeric vector and the
  # method names as its index.
  def describe methods=nil
    methods ||= [:count, :mean, :std, :min, :max]

    description_hash = numeric_vectors.each_with_object({}) do |vec, hash|
      hash[vec] = methods.map { |m| self[vec].send(m) }
    end

    Daru::DataFrame.new(description_hash, index: methods)
  end

  # Calculate variance-covariance between the numeric vectors.
  #
  # == Arguments
  #
  # +for_sample_data+ - If set to false, will calculate the population
  # covariance (denominator N), otherwise calculates the sample covariance
  # matrix (denominator N - 1). Default to true.
  def covariance for_sample_data=true
    cov_arry =
      # NOTE(review): this asks whether the NMatrix *class* responds to :cov,
      # while the call below is made on an instance - confirm that is intended.
      if defined?(NMatrix) && NMatrix.respond_to?(:cov)
        to_nmatrix.cov(for_sample_data).to_a
      else
        df_as_matrix = to_matrix
        # Divide by Floats: Matrix#/ applies the element's own #/, so Integer
        # data divided by an Integer count would truncate the column means
        # and the resulting covariances.
        denominator  = (for_sample_data ? rows - 1 : rows).to_f
        ones         = Matrix.column_vector [1]*rows
        # ones * ones' * X repeats the column sums of X on every row.
        column_means = (ones * ones.transpose * df_as_matrix) / rows.to_f
        deviation_scores = df_as_matrix - column_means
        ((deviation_scores.transpose * deviation_scores) / denominator).to_a
      end

    Daru::DataFrame.rows(cov_arry, index: numeric_vectors, order: numeric_vectors)
  end

  alias :cov :covariance

  # Calculate the correlation between the numeric vectors.
  #
  # Without NMatrix support this divides every covariance by the product of
  # the two corresponding standard deviations.
  def correlation
    corr_arry =
      # NOTE(review): class-level respond_to? check, instance-level call -
      # same caveat as in #covariance.
      if defined?(NMatrix) && NMatrix.respond_to?(:corr)
        to_nmatrix.corr.to_a
      else
        standard_deviation = std.to_matrix
        # transpose * matrix is used as the table of pairwise std products;
        # presumably std.to_matrix is a single-row matrix - TODO confirm.
        std_products = standard_deviation.transpose * standard_deviation
        cov.to_matrix.elementwise_division(std_products).to_a
      end

    Daru::DataFrame.rows(corr_arry, index: numeric_vectors, order: numeric_vectors)
  end

  alias :corr :correlation

 private

  # Send +method+ to every numeric vector and collect the results in a
  # Daru::Vector built from a Hash keyed by vector name.
  def compute_stats method
    Daru::Vector.new(
      numeric_vectors.each_with_object({}) do |vec, hash|
        hash[vec] = self[vec].send(method)
      end
    )
  end
end
55109
end
56110
end

lib/daru/maths/statistics/vector.rb

+5-4
Original file line numberDiff line numberDiff line change
@@ -97,17 +97,17 @@ def coefficient_of_variation
9797
end
9898

9999
# Retrieves the number of cases which comply with a condition. If a block
# is given, retrieves the number of elements for which the block returns a
# truthy value. If a value is given, retrieves the frequency of that value.
# If neither is given, returns the size of the Vector minus the number of
# recorded nil positions, i.e. the number of non-nil elements.
#
# NOTE: a falsy +value+ (nil or false) is treated as "no value given", so
# the frequency of nil or false cannot be queried through this method.
def count value=false
  if block_given?
    @data.inject(0) { |memo, val| (yield val) ? memo + 1 : memo }
  elsif value
    val = frequencies[value]
    # A value that never occurs has no frequency entry.
    val.nil? ? 0 : val
  else
    size - @nil_positions.size
  end
end
113113

@@ -175,6 +175,7 @@ def percentile percent
175175

176176
alias :sdp :standard_deviation_population
177177
alias :sds :standard_deviation_sample
178+
alias :std :sds
178179
alias :adp :average_deviation_population
179180
alias :cov :coefficient_of_variation
180181
alias :variance :variance_sample

lib/daru/monkeys.rb

+8
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,12 @@ class Numeric
5050
def square
5151
self * self
5252
end
53+
end
54+
55+
class Matrix
  # Divide each element of this matrix by the element at the same position
  # in +other+.
  #
  # == Arguments
  #
  # +other+ - A Matrix with the same number of rows and columns as self.
  #
  # Division uses each element's own #/ operator, so Integer operands divide
  # with Integer (truncating) semantics.
  #
  # Raises Matrix::ErrDimensionMismatch when the dimensions differ, instead
  # of pairing elements by flattened position.
  def elementwise_division other
    unless row_count == other.row_count && column_count == other.column_count
      raise ErrDimensionMismatch
    end

    # Index +other+ directly rather than re-flattening it for every element.
    self.class.build(row_count, column_count) do |row, col|
      self[row, col] / other[row, col]
    end
  end
end

spec/dataframe_spec.rb

+1
Original file line numberDiff line numberDiff line change
@@ -1369,6 +1369,7 @@
13691369

13701370
context "#shape" do
13711371
it "returns an array containing number of rows and columns" do
1372+
expect(@data_frame.shape).to eq([5,3])
13721373
end
13731374
end
13741375
end if mri?

spec/math/statistics/dataframe_spec.rb

+7-7
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,13 @@
1414

1515
context "#mean" do
1616
it "calculates mean of single level numeric only vectors and returns values in a Vector" do
17-
expect(@df.mean).to eq(Daru::Vector.new([3.66, 7.33, 36.66],
17+
expect(@df.mean.round(2)).to eq(Daru::Vector.new([3.67, 7.33, 36.67],
1818
index: [:d, :e, :f]
1919
))
2020
end
2121

2222
it "calculates mean of multi level numeric only vectors and returns values in a DataFrame" do
23-
pending "next release"
23+
# TODO - pending
2424
end
2525
end
2626

@@ -32,7 +32,7 @@
3232

3333
context "#sum" do
3434
it "calculates sum of single level numeric only vectors and returns values in a Vector" do
35-
# TODO
35+
# TODO - write tests
3636
end
3737
end
3838

@@ -62,10 +62,10 @@
6262

6363
context "#describe" do
6464
it "generates mean, std, max, min and count of numeric vectors in one shot" do
65-
expect(@df.describe).to eq(Daru::DataFrame.new({
66-
d: [9.00, 3.66 ,2.00 , 1.00, 7.00],
65+
expect(@df.describe.round(2)).to eq(Daru::DataFrame.new({
66+
d: [9.00, 3.67 ,2.00 , 1.00, 7.00],
6767
e: [9.00, 7.33 ,4.00 , 2.00, 14.00],
68-
f: [9.00, 36.66,20.00,10.00, 70.00]
68+
f: [9.00, 36.67,20.00,10.00, 70.00]
6969
}, index: [:count, :mean, :std, :min, :max]
7070
))
7171
end
@@ -82,7 +82,7 @@
8282
end
8383
end
8484

85-
context "#corr" do
85+
context "#corr", focus: true do
8686
it "calculates the correlation between the numeric vectors of DataFrame" do
8787
expect(@df.corr).to eq(Daru::DataFrame.new({
8888
d: [1,1,1],

spec/monkeys_spec.rb

+10-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
require 'spec_helper.rb'
22

33
describe "Monkeys" do
  # No examples for the Array patches are defined in this context yet.
  context Array do
  end

  context Matrix do
    # Dividing a matrix by an identical matrix must yield all ones.
    it "performs elementwise division" do
      left  = Matrix[[3,6,9],[4,8,12],[2,4,6]]
      right = Matrix[[3,6,9],[4,8,12],[2,4,6]]

      expect(left.elementwise_division(right)).to eq(Matrix[[1,1,1],[1,1,1],[1,1,1]])
    end
  end
end

0 commit comments

Comments
 (0)