ruby-spark/lib/spark/sql/data_frame_writer.rb at 1187e929334757c66498e72b72e02094ed29a194 · ondra-m/ruby-spark · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
module Spark
  module SQL
    # Fluent wrapper around the Java-side writer obtained from +df.jdf.write+.
    #
    # Configuration methods (+format+, +option+, +options+, +partition_by+,
    # +mode+) forward to the Java writer and return +self+ so calls can be
    # chained. Terminal methods (+save+, +json+, +parquet+, +orc+,
    # +insert_into+, +save_as_table+) return whatever the Java writer returns.
    class DataFrameWriter

      attr_reader :sql_context, :jwriter

      # == Parameters:
      # df:: Object responding to +sql_context+ and +jdf+; +df.jdf.write+
      #      supplies the Java writer that every method here delegates to.
      #
      def initialize(df)
        @sql_context = df.sql_context
        @jwriter = df.jdf.write
      end

      # Specifies the output data source format.
      # Parameter is name of the data source, e.g. 'json', 'parquet'.
      # Returns +self+.
      def format(source)
        jwriter.format(source)
        self
      end

      # Adds an output option for the underlying data source.
      #
      # Both key and value are converted with +to_s+ before being handed to
      # the Java writer, so Symbol keys (e.g. from a Ruby hash literal) and
      # non-string values are accepted. Returns +self+.
      def option(key, value)
        jwriter.option(key.to_s, value.to_s)
        self
      end

      # Adds several output options for the underlying data source.
      #
      # == Parameters:
      # options:: Enumerable of key/value pairs (typically a Hash); each pair
      #           is applied through #option.
      #
      # Returns +self+.
      def options(options)
        options.each { |key, value| option(key, value) }
        self
      end

      # Saves the content through the underlying Java writer.
      #
      # == Parameters:
      # path:: Optional string for file-system backed data sources. When nil,
      #        the Java writer's +save+ is called without arguments.
      # new_format:: Optional data source format, applied via #format first.
      # new_options:: Optional key/value options, applied via #options first.
      #
      # Returns the result of the Java +save+ call.
      #
      def save(path=nil, new_format=nil, new_options=nil)
        format(new_format) if new_format
        options(new_options) if new_options

        # TODO - jwrite returns nil, probably should catch exception and return true/false
        return jwriter.save if path.nil?

        jwriter.save(path)
      end

      # Saves DataFrame as a JSON file (one object per line)
      #
      # == Parameters:
      # path:: string, path to the JSON dataset
      #
      # == Example:
      #   Spark::SQL::DataFrameWriter.new(df).json('output.json')
      #
      def json(path)
        # The fully-qualified data source name is used because calling
        # jwriter.json(path) directly raised
        # "ClassNotFoundException: Failed to load class for data source: json".
        save(path, 'org.apache.spark.sql.execution.datasources.json')
      end

      # Saves DataFrame using the parquet data source.
      #
      # == Parameters:
      # path:: string, path to the parquet dataset
      #
      def parquet(path)
        # The fully-qualified data source name is used because calling
        # jwriter.parquet(path) directly raised
        # "ClassNotFoundException: Failed to load class for data source: parquet".
        save(path, 'org.apache.spark.sql.execution.datasources.parquet')
      end

      # Saves DataFrame using the orc data source.
      #
      # == Parameters:
      # path:: string, path to the orc dataset
      #
      def orc(path)
        # Fully-qualified name, presumably for the same class-loading reason
        # as #json and #parquet (the original note here only mentioned json).
        save(path, 'org.apache.spark.sql.execution.datasources.orc')
      end

      # Forwards to the Java writer's +insertInto+ with the given table name
      # and returns its result.
      def insert_into(table_name)
        jwriter.insertInto(table_name)
      end

      # Forwards to the Java writer's +saveAsTable+ with the given name and
      # returns its result.
      def save_as_table(name)
        jwriter.saveAsTable(name)
      end

      # Forwards +columns+ unchanged to the Java writer's +partitionBy+.
      # Returns +self+, like the other configuration methods, so the call can
      # be chained with #save, #json, etc.
      def partition_by(columns)
        jwriter.partitionBy(columns)
        self
      end

      # Forwards the save mode +name+ to the Java writer's +mode+
      # (e.g. 'append' or 'overwrite' in Spark - confirm against the Spark
      # version in use). Returns +self+.
      def mode(name)
        jwriter.mode(name)
        self
      end

    end
  end
end