ruby-spark/lib/spark/mllib/regression/ridge.rb at master · ondra-m/ruby-spark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
##
# RidgeRegressionModel
#
# Model produced by RidgeRegressionWithSGD.train. The class body is empty, so all
# behaviour (including the +predict+ used in the examples below) is inherited from
# Spark::Mllib::RegressionModel.
#
# Train a regression model with L2-regularization using Stochastic Gradient Descent.
# This solves the L2-regularized least squares regression formulation
#   f(weights) = 1/(2n) ||A weights - y||^2 + regParam/2 ||weights||^2
# Here the data matrix A has n rows, and the input RDD holds the set of rows of A, each with
# its corresponding right hand side label y.
# See also the documentation for the precise formulation.
#
# == Examples:
#
#   Spark::Mllib.import
#
#   # Training data given as dense feature arrays
#   data = [
#       LabeledPoint.new(0.0, [0.0]),
#       LabeledPoint.new(1.0, [1.0]),
#       LabeledPoint.new(3.0, [2.0]),
#       LabeledPoint.new(2.0, [3.0])
#   ]
#   lrm = RidgeRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
#   lrm.predict([0.0]) - 0 < 0.5
#   # => true
#
#   lrm.predict([1.0]) - 1 < 0.5
#   # => true
#
#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
#   # => true
#
#   # The same training data given as sparse vectors
#   data = [
#       LabeledPoint.new(0.0, SparseVector.new(1, {0 => 0.0})),
#       LabeledPoint.new(1.0, SparseVector.new(1, {0 => 1.0})),
#       LabeledPoint.new(3.0, SparseVector.new(1, {0 => 2.0})),
#       LabeledPoint.new(2.0, SparseVector.new(1, {0 => 3.0}))
#   ]
#   lrm = RidgeRegressionWithSGD.train($sc.parallelize(data), initial_weights: [1.0])
#
#   lrm.predict([0.0]) - 0 < 0.5
#   # => true
#
#   lrm.predict(SparseVector.new(1, {0 => 1.0})) - 1 < 0.5
#   # => true
#
class Spark::Mllib::RidgeRegressionModel < Spark::Mllib::RegressionModel
end

module Spark
  module Mllib
    ##
    # RidgeRegressionWithSGD
    #
    # Trainer for RidgeRegressionModel. The actual training is delegated to the
    # JVM method +trainRidgeModelWithSGD+ through the Java bridge (Spark.jb).
    #
    class RidgeRegressionWithSGD < RegressionMethodBase

      # Default value for every option accepted by +train+.
      # NOTE(review): presumably merged into the caller's options by
      # RegressionMethodBase.train -- that code is not visible here, confirm there.
      # Frozen so the shared defaults cannot be modified by accident.
      DEFAULT_OPTIONS = {
        iterations: 100,
        step: 1.0,
        reg_param: 0.01,
        mini_batch_fraction: 1.0,
        initial_weights: nil,
        intercept: false,
        validate: true,
        convergence_tol: 0.001
      }.freeze

      # Train a ridge regression model on the given data.
      #
      # == Parameters:
      # rdd::
      #   The training data (RDD instance).
      #
      # iterations::
      #   The number of iterations (default: 100).
      #
      # step::
      #   The step parameter used in SGD (default: 1.0).
      #
      # reg_param::
      #   The regularizer parameter (default: 0.01).
      #
      # mini_batch_fraction::
      #   Fraction of data to be used for each SGD iteration (default: 1.0).
      #
      # initial_weights::
      #   The initial weights (default: nil).
      #
      # intercept::
      #   Boolean parameter which indicates the use
      #   or not of the augmented representation for
      #   training data (i.e. whether bias features
      #   are activated or not).
      #   (default: false)
      #
      # validate::
      #   Boolean parameter which indicates if the
      #   algorithm should validate data before training.
      #   (default: true)
      #
      # convergence_tol::
      #   A condition which decides iteration termination.
      #   (default: 0.001)
      #
      # == Returns:
      # A RidgeRegressionModel built from the weights and intercept
      # returned by the JVM call.
      #
      def self.train(rdd, options={})
        # Bare +super+ forwards +rdd+ and this very +options+ hash to the parent.
        # Every option is read from +options+ below, so this method relies on the
        # parent having prepared that hash in place.
        # NOTE(review): confirm in RegressionMethodBase.train that defaults are
        # filled in there.
        super

        # Numeric options are coerced before crossing the bridge so that e.g. an
        # Integer given for a floating-point option is still sent as a Float.
        weights, intercept = Spark.jb.call(RubyMLLibAPI.new, 'trainRidgeModelWithSGD', rdd,
                                           options[:iterations].to_i,
                                           options[:step].to_f,
                                           options[:reg_param].to_f,
                                           options[:mini_batch_fraction].to_f,
                                           options[:initial_weights],
                                           options[:intercept],
                                           options[:validate],
                                           options[:convergence_tol].to_f)

        RidgeRegressionModel.new(weights, intercept)
      end

    end
  end
end