-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy paths3_to_gcs.rb
More file actions
executable file
·202 lines (166 loc) · 5.55 KB
/
s3_to_gcs.rb
File metadata and controls
executable file
·202 lines (166 loc) · 5.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#!/usr/bin/env ruby
require 'aws-sdk'
require 'digest'
require 'fileutils'
require 'google/cloud/storage'
require 'logger'
require 'optparse'
require 'pathname'
require 'rainbow'
#require 'smarter_csv'
ALWAYS_UPDATE = /(head|dev|snapshot|nightly)|\bphp-\d+\.\d+\.tar/
def logger
@logger ||= Logger.new(
$stderr,
level: Logger::WARN,
formatter: proc do |severity, time, progname, msg|
case severity
when "UNKOWN", "FATAL", "ERROR"
c = :red
when /WARN/
c = :yellow
when /INFO/
c = :blue
when /DEBUG/
c = :default
end
Logger::Formatter::Format % [severity[0..0], time.strftime(@datetime_format || "%Y-%m-%dT%H:%M:%S.%6N "), $$, Rainbow(severity).color(c), progname, msg]
end
)
end
def options
@options
end
@options = {
s3_region: 'us-east-1',
}
parser = OptionParser.new do |opts|
opts.banner = "Usage: #{$0} [options]"
opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
options[:verbose] = v
end
opts.on("--s3-region=MANDATORY", "S3 region") do |s3_region|
options[:s3_region] = s3_region
end
opts.on("--gcs-region=MANDATORY", "GCS region") do |gcs_region|
options[:gcs_region] = gcs_region
end
opts.on("--gcs-creds-json=MANDATORY", "JSON file containing GCS credentials, as downloaded from GCP") do |json_file|
options[:gcs_creds_json] = json_file
end
opts.on("--gcs-project-id=MANDATORY", "GCS project ID") do |proj_id|
options[:gcs_project_id] = proj_id
end
opts.on("--s3-bucket=MANDATORY", "Bucket to copy from (on S3)") do |bucket|
options[:s3_bucket] = bucket
end
opts.on("--s3-prefix=MANDATORY", "Bucket prefix for S3") do |prefix|
options[:s3_prefix] = prefix
end
opts.on("--gcs-bucket=MANDATORY", "Bucket to copy to (on GCS)") do |bucket|
options[:gcs_bucket] = bucket
end
opts.on("--gcs-prefix=MANDATORY", "Bucket prefix for GCS") do |prefix|
options[:gcs_prefix] = prefix
end
opts.on("--log-level=MANDATORY", "Log level") do |level|
options[:log_level] = level
end
end
parser.parse!
logger.level = options[:log_level] if options[:log_level]
logger.level = Logger::DEBUG if options[:verbose]
Google::Apis.logger = logger
logger.debug options.inspect
def s3
@s3 ||= Aws::S3::Resource.new(
region: options[:s3_region] || 'us-east-1'
)
end
def gcs
@gcs ||= Google::Cloud::Storage.new(
project_id: options[:gcs_project_id],
credentials: options[:gcs_creds_json]
)
end
def main
s3_bucket = s3.bucket(options[:s3_bucket])
gcs_bucket = gcs.bucket(options[:gcs_bucket])
s3_bucket.objects.each do |obj_summary|
obj_key = obj_summary.key
unless obj_summary.size > 0
logger.info "Skipping blank file #{obj_key}"
next
end
unless obj_key.start_with?(options[:s3_prefix])
logger.info "Skipping #{obj_key} because it does not match prefix #{options[:s3_prefix]}"
next
end
pn = Pathname.new(obj_key)
gcs_obj_key = obj_key.sub(options[:s3_prefix], options[:gcs_prefix])
if obj_key.end_with?(".sha256sum.txt.sha256sum.txt")
logger.info "Removing #{obj_key}"
obj_summary.delete
gcs_bucket.file(gcs_obj_key).delete
next
end
checksums_match_p = false
begin
checksum_obj_key = obj_key
gcs_checksum_obj_key = gcs_obj_key
if !obj_key.end_with?(".sha256sum.txt")
checksum_obj_key = obj_key + ".sha256sum.txt"
gcs_checksum_obj_key = gcs_obj_key + ".sha256sum.txt"
end
s3_obj_checksum = s3.client.get_object(
bucket: s3_bucket.name,
key: checksum_obj_key
).body.string
gcs_obj_checksum = (gcs_obj_checksum_obj = gcs_bucket.find_file(gcs_checksum_obj_key)) && gcs_obj_checksum_obj.download.string
if checksums_match_p = (s3_obj_checksum == gcs_obj_checksum)
logger.info "Skipping #{obj_key} because checksums match"
end
rescue Aws::S3::Errors::ServiceError => s3err
logger.warn(obj_key + " " + s3err.message)
rescue Google::Cloud::Error => gcerr
logger.warn(obj_key + " " + gcerr.message)
end
next if checksums_match_p
logger.info "Processing #{obj_key}"
logger.info "Downloading #{obj_key}"
local_file = File.basename(pn)
if !File.exist?(local_file)
unless obj_summary.download_file(local_file)
logger.warn "Failed to download #{obj_key}"
next
end
# generate and upload sha256sum file
if !local_file.end_with?(".sha256sum.txt")
begin
s3.client.get_object(bucket: s3_bucket.name, key: obj_key + ".sha256sum.txt")
rescue Aws::S3::Errors::NoSuchKey => no_such_key
logger.warn(obj_key + ".sha256sum.txt does not exist")
end
`sha256sum #{local_file} > #{local_file}.sha256sum.txt`
logger.debug "Generated sha256 checksum file: #{File.read(local_file + ".sha256sum.txt")}"
logger.info "Uploading #{local_file + ".sha256sum.txt"} to S3"
s3_bucket.put_object(
acl: "public-read",
body: File.read(local_file + ".sha256sum.txt"),
key: obj_key + ".sha256sum.txt"
)
logger.info "Uploading #{local_file + ".sha256sum.txt"} to GCS"
gcs_bucket.create_file(local_file + ".sha256sum.txt", gcs_obj_key + ".sha256sum.txt")
end
end
logger.info "Downloaded #{obj_key}"
# Upload to GCS
logger.debug "local_file: #{local_file}"
logger.debug "gcs_obj_key: #{gcs_obj_key}"
logger.info "Uploading #{gcs_obj_key}"
if gcs_bucket.create_file(local_file, gcs_obj_key)
logger.info "Uploaded #{gcs_obj_key}"
FileUtils.rm_f(local_file)
end
end
end
main