-
Notifications
You must be signed in to change notification settings - Fork 28
Expand file tree
/
Copy pathdata_frame_writer.rb
More file actions
101 lines (83 loc) · 2.61 KB
/
data_frame_writer.rb
File metadata and controls
101 lines (83 loc) · 2.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
module Spark
  module SQL
    # Fluent wrapper around the writer object obtained from +df.jdf.write+.
    #
    # Configuration methods (#format, #option, #options, #mode and
    # #partition_by) return +self+ so that calls can be chained. The remaining
    # methods (#save, #json, #parquet, #orc, #insert_into, #save_as_table)
    # delegate the actual write to the underlying writer and return whatever
    # it returns.
    class DataFrameWriter
      # Fully-qualified data source names. The short names ('json', 'parquet')
      # were observed to fail with
      # "ClassNotFoundException: Failed to load class for data source: ...",
      # so the full package names are passed to #format instead.
      JSON_SOURCE    = 'org.apache.spark.sql.execution.datasources.json'.freeze
      PARQUET_SOURCE = 'org.apache.spark.sql.execution.datasources.parquet'.freeze
      ORC_SOURCE     = 'org.apache.spark.sql.execution.datasources.orc'.freeze

      attr_reader :sql_context, :jwriter

      # == Parameters:
      # df:: Object responding to +sql_context+ and +jdf+; the writer is taken
      #      from +df.jdf.write+.
      #
      def initialize(df)
        @sql_context = df.sql_context
        @jwriter = df.jdf.write
      end

      # Specifies the output data source format.
      #
      # == Parameters:
      # source:: Name of the data source, e.g. 'json', 'parquet'.
      #
      # Returns +self+ for chaining.
      def format(source)
        jwriter.format(source)
        self
      end

      # Adds an output option for the underlying data source.
      # Both key and value are converted with +to_s+ before being handed to
      # the underlying writer, so symbols and non-string values are accepted.
      #
      # Returns +self+ for chaining.
      def option(key, value)
        jwriter.option(key.to_s, value.to_s)
        self
      end

      # Adds several output options for the underlying data source.
      #
      # == Parameters:
      # options:: Enumerable of key/value pairs (typically a Hash); each pair
      #           is applied through #option.
      #
      # Returns +self+ for chaining.
      def options(options)
        options.each { |key, value| option(key, value) }
        self
      end

      # Saves the content of the DataFrame to a data source.
      #
      # == Parameters:
      # path:: Optional string for file-system backed data sources.
      # new_format:: Optional string; when given it is applied via #format
      #              before saving.
      # new_options:: Optional key/value pairs; when given they are applied
      #               via #options before saving.
      #
      # Returns the result of the underlying +save+ call.
      def save(path = nil, new_format = nil, new_options = nil)
        format(new_format) if new_format
        options(new_options) if new_options

        # TODO: jwriter.save returns nil; probably should catch exceptions and
        # return true/false instead.
        if path.nil?
          jwriter.save
        else
          jwriter.save(path)
        end
      end

      # Saves the DataFrame as a JSON file (one object per line).
      #
      # == Parameters:
      # path:: string, path to the JSON dataset
      #
      # == Example:
      #   df.write.json('output.json')   # assuming +df.write+ yields this writer
      #
      def json(path)
        save(path, JSON_SOURCE)
      end

      # Saves the DataFrame in Parquet format.
      #
      # == Parameters:
      # path:: string, path to the Parquet dataset
      #
      def parquet(path)
        save(path, PARQUET_SOURCE)
      end

      # Saves the DataFrame in ORC format.
      #
      # == Parameters:
      # path:: string, path to the ORC dataset
      #
      def orc(path)
        save(path, ORC_SOURCE)
      end

      # Inserts the content of the DataFrame into the given table by
      # delegating to the underlying writer's +insertInto+.
      #
      # == Parameters:
      # table_name:: string, name of the target table
      #
      def insert_into(table_name)
        jwriter.insertInto(table_name)
      end

      # Saves the content of the DataFrame as the named table by delegating to
      # the underlying writer's +saveAsTable+.
      #
      # == Parameters:
      # name:: string, name of the table
      #
      def save_as_table(name)
        jwriter.saveAsTable(name)
      end

      # Partitions the output by the given columns.
      #
      # == Parameters:
      # columns:: Column name(s), passed unchanged to the underlying writer's
      #           +partitionBy+.
      #
      # Returns +self+ for chaining, like the other configuration methods.
      def partition_by(columns)
        jwriter.partitionBy(columns)
        self
      end

      # Specifies the behaviour when data or the table already exists.
      #
      # == Parameters:
      # name:: Save mode name, passed unchanged to the underlying writer's
      #        +mode+ (e.g. 'append' or 'overwrite' in the Spark API).
      #
      # Returns +self+ for chaining.
      def mode(name)
        jwriter.mode(name)
        self
      end
    end
  end
end