Skip to content

Commit 4a61752

Browse files
author
tbchrist
committed
Merge pull request #19 from tumblr/rc90
Rc90
2 parents 315a3c7 + 42994d4 commit 4a61752

33 files changed

+1393
-116
lines changed

README.rdoc

+2-1
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ If you have a question that isn't covered here, please feel free to email the au
7575
* <b>Dallas Marlow</b>: Developer
7676
* <b>Bob Patterson Jr</b>: Developer
7777
* <b>Tom Christ</b>: Developer
78+
* <b>Tyler Neely</b>: Developer
7879

7980
Special thanks to <b>Tim Ellis</b> for testing and bug reports.
8081

@@ -92,4 +93,4 @@ Unless required by applicable law or agreed to in writing, software
9293
distributed under the License is distributed on an "AS IS" BASIS,
9394
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9495
See the License for the specific language governing permissions and
95-
limitations under the License.
96+
limitations under the License.

bin/jetpants

+94-13
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# module in a Thor command processor, providing a command-line interface to
55
# common Jetpants functionality.
66

7-
%w[thor pry highline/import colored].each {|g| require g}
7+
%w[thor pry pry-rescue highline/import colored].each {|g| require g}
88

99
module Jetpants
1010

@@ -222,15 +222,15 @@ module Jetpants
222222
end
223223

224224
source.start_mysql if ! source.running?
225-
error "source (#{source}) is not a standby slave" unless source.is_standby?
225+
error "source (#{source}) is not a standby or backup slave" unless (source.is_standby? || source.for_backups?)
226226

227227
targets.each do |t|
228228
error "target #{t} already has a master; please clear out node (including in asset tracker) before proceeding" if t.master
229229
error "target #{t} is running a different version of MySQL than source #{source}! Cannot proceed with clone operation." if t.version_cmp(source) != 0
230230
end
231231

232232
source.enslave_siblings!(targets)
233-
targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master}
233+
targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master(21600)}
234234
source.pool.sync_configuration
235235
puts "Cloning complete."
236236
Jetpants.topology.write_config
@@ -264,7 +264,12 @@ module Jetpants
264264
puts "This task turns an active slave into a standby slave."
265265
node = ask_node('Please enter node IP: ', options[:node])
266266
describe node
267-
raise "Node is not an active slave" unless node.role == :active_slave
267+
if node.running?
268+
raise "Node is not an active slave" unless node.role == :active_slave
269+
else
270+
inform "Unable to connect to node #{node} to pull slave"
271+
error "Unable to pull slave" unless agree "Please confirm that #{node} is offline [yes/no]: "
272+
end
268273
node.pool.mark_slave_standby(node)
269274
Jetpants.topology.write_config
270275
end
@@ -391,6 +396,8 @@ module Jetpants
391396
# (ie, if app configuration is a static file that needs to be deployed to webs.)
392397
desc 'shard_split_child_reads', 'shard split step 2 of 4: move reads to child shards'
393398
def shard_split_child_reads
399+
s = ask_shard_being_split
400+
s.move_reads_to_children
394401
Jetpants.topology.write_config
395402
end
396403
def self.after_shard_split_child_reads
@@ -453,8 +460,9 @@ module Jetpants
453460
# because the new last shard can't be created yet (chicken-and-egg problem -- master
454461
# must exist before we create the pool). The assumption is the hardware spec
455462
# of the new last shard and previous last shard will be the same.
456-
raise "Not enough total spare machines!" unless Jetpants.topology.count_spares(like: last_shard_master) >= Jetpants.standby_slaves_per_pool + 1
457-
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: last_shard_master) >= Jetpants.standby_slaves_per_pool
463+
raise "Not enough total spare machines!" unless Jetpants.topology.count_spares(like: last_shard_master) >= last_shard.slaves_layout[:standby_slave] + last_shard.slaves_layout[:backup_slave] + 1
464+
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: last_shard_master) >= last_shard.slaves_layout[:standby_slave]
465+
raise "Not enough backup_slave role spare machines!" unless Jetpants.topology.count_spares(role: :backup_slave) >= last_shard.slaves_layout[:backup_slave]
458466
raise "Cannot find a spare master-role machine!" unless Jetpants.topology.count_spares(role: :master, like: last_shard_master) >= 1
459467

460468
# In asset tracker, remove the last shard pool and replace it with a new pool. The new pool
@@ -470,15 +478,26 @@ module Jetpants
470478
# NOT to actually set the pool of the returned object.)
471479
new_last_shard_master = Jetpants.topology.claim_spare(role: :master, like: last_shard_master)
472480
new_last_shard_master.disable_read_only! if new_last_shard_master.running?
473-
if Jetpants.standby_slaves_per_pool > 0
481+
if last_shard.slaves_layout[:standby_slave] > 0
474482
# Verify spare count again, now that we can actually supply the new master as the :like context
475-
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: new_last_shard_master) >= Jetpants.standby_slaves_per_pool
476-
new_last_shard_slaves = Jetpants.topology.claim_spares(Jetpants.standby_slaves_per_pool, role: :standby_slave, like: new_last_shard_master)
483+
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: new_last_shard_master) >= last_shard.slaves_layout[:standby_slave]
484+
new_last_shard_slaves = Jetpants.topology.claim_spares(last_shard.slaves_layout[:standby_slave], role: :standby_slave, like: new_last_shard_master)
477485
new_last_shard_slaves.each do |x|
478486
x.change_master_to new_last_shard_master
479487
x.resume_replication
480488
end
481489
end
490+
491+
# Set up backup slaves
492+
if last_shard.slaves_layout[:backup_slave] > 0
493+
raise "Not enough backup_slave role spare machines!" unless Jetpants.topology.count_spares(role: :backup_slave) >= last_shard.slaves_layout[:backup_slave]
494+
new_last_shard_backup_slaves = Jetpants.topology.claim_spares(last_shard.slaves_layout[:backup_slave], role: :backup_slave)
495+
new_last_shard_backup_slaves.each do |x|
496+
x.change_master_to new_last_shard_master
497+
x.resume_replication
498+
end
499+
end
500+
482501
new_last_shard = Shard.new(cutover_id, 'INFINITY', new_last_shard_master)
483502
new_last_shard.sync_configuration
484503
Jetpants.topology.pools << new_last_shard
@@ -498,7 +517,53 @@ module Jetpants
498517
'Deploy the configuration to all machines.',
499518
)
500519
end
501-
520+
521+
desc 'rebalance_backup_slaves', 'Add backup slaves to pools which contain too few (does not destroy existing slaves)'
522+
def rebalance_backup_slaves
523+
spares_available = Jetpants.topology.count_spares(role: :backup_slave)
524+
525+
raise "No backup slaves available" if spares_available == 0
526+
527+
# find shards that need a backup slave
528+
need_backup_shards = Jetpants.topology.shards.reject { |shard| shard.backup_slaves.count >= shard.slaves_layout[:backup_slave] }
529+
530+
possible_iterations = spares_available < need_backup_shards.count ? spares_available : need_backup_shards.count
531+
532+
spares = Jetpants.topology.claim_spares(possible_iterations, role: :backup_slave)
533+
534+
need_backup_shards.first(possible_iterations)
535+
536+
# loop through and place a backup slave per pool that is without
537+
need_backup_shards.limited_concurrent_map(5) do |shard|
538+
539+
if spares.count > 0
540+
541+
output "Considering shard #{shard}"
542+
543+
source = shard.standby_slaves.last
544+
source.master.probe if source.master # fail early if there are any replication issues in this pool
545+
546+
547+
targets = [spares.pop]
548+
549+
source.start_mysql if ! source.running?
550+
error "Source (#{source}) is not a standby slave" unless source.is_standby?
551+
552+
targets.each do |t|
553+
error "Target #{t} already has a master; please clear out node (including in asset tracker) before proceeding" if t.master
554+
error "Target #{t} is running a different version of MySQL than source #{source}! Cannot proceed with clone operation." if t.version_cmp(source) != 0
555+
error "Target #{t} already has a pool!" if t.pool
556+
end
557+
558+
source.enslave_siblings!(targets)
559+
targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master(21600)}
560+
source.pool.sync_configuration
561+
output "Rebalance complete for #{shard}"
562+
563+
end
564+
565+
end
566+
end
502567

503568
no_tasks do
504569
def is_ip? address
@@ -516,7 +581,7 @@ module Jetpants
516581
def describe node
517582
puts "Node #{node} (#{node.hostname}:#{node.port}) has role #{node.role} in pool #{node.pool(true)}.".green
518583
end
519-
584+
520585
def ask_node(prompt, supplied_node=false)
521586
node = supplied_node || ask(prompt)
522587
error "Node (#{node}) does not appear to be an IP address." unless is_ip? node
@@ -540,7 +605,7 @@ module Jetpants
540605
s = Jetpants.topology.shard shard_min, shard_max
541606
raise "Shard not found" unless s
542607
end
543-
raise "Shard isn't in expected state" unless s.state == :deprecated
608+
raise "Shard does not have children" if (s.children.nil? or s.children.count ==0)
544609
s
545610
end
546611
end
@@ -558,4 +623,20 @@ end
558623
# We load jetpants last so that plugins can monkeypatch Jetpants::CommandSuite if desired.
559624
require 'jetpants'
560625

561-
Jetpants::CommandSuite.start
626+
def with_debug
627+
Pry::rescue do
628+
begin
629+
yield if block_given?
630+
rescue => e
631+
Pry.config.prompt = proc {|_, _, _| "# debug > "}
632+
puts "Entering debugging session (debug_exceptions is enabled). Resume with ctrl+d."
633+
Pry::rescued e
634+
end
635+
end
636+
end
637+
638+
if Jetpants.debug_exceptions
639+
with_debug { Jetpants::CommandSuite.start }
640+
else
641+
Jetpants::CommandSuite.start
642+
end

etc/jetpants.yaml.sample

+7
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@ mysql_grant_privs:
2121
- LOCK TABLES
2222
- EXECUTE
2323

24+
# Specific schemata may be optionally ignored while cloning. If this is not
25+
# overridden it will default to information_schema and performance_schema.
26+
mysql_clone_ignore:
27+
- some_temporary_schema
28+
- information_schema
29+
- performance_schema
30+
2431
# Define what directory to put exported files in. If you have /var/lib/mysql
2532
# on an SSD RAID mount and the rest of your OS on a rotational disk mount,
2633
# it's fine to put the export_location on the rotational disk, which probably

jetpants.gemspec

+4-2
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@ require 'rake'
22

33
Gem::Specification.new do |s|
44
s.name = "jetpants"
5-
s.version = "0.8.3"
5+
s.version = "0.9.0"
66

77
s.homepage = 'https://github.com/tumblr/jetpants/'
88
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
99
s.required_ruby_version = '>= 1.9.2'
1010
s.authors = ["Evan Elias", "Dallas Marlow", "Bob Patterson Jr.", "Tom Christ"]
11-
s.date = "2013-05-17"
11+
s.date = "2014-06-09"
1212
1313
s.files = FileList['Gemfile', 'README.rdoc', 'doc/*.rdoc', 'lib/**/*.rb', 'bin/**', 'scripts/*.rb', 'plugins/**/*.rb', 'etc/jetpants.yaml.sample'].to_a
1414
s.require_paths = ["lib"]
@@ -24,8 +24,10 @@ Gem::Specification.new do |s|
2424
s.add_runtime_dependency 'sequel', '~> 3.36'
2525
s.add_runtime_dependency 'net-ssh', '~> 2.3'
2626
s.add_runtime_dependency 'pry', '~> 0.9.8'
27+
s.add_runtime_dependency 'pry-rescue', '~> 1.4.0'
2728
s.add_runtime_dependency 'thor', '~> 0.15'
2829
s.add_runtime_dependency 'highline', '~> 1.6.12'
2930
s.add_runtime_dependency 'colored', '~> 1.2'
3031
s.add_runtime_dependency 'collins_client', '~> 0.2.7'
32+
s.add_runtime_dependency 'bloom-filter', '~> 0.2.0'
3133
end

lib/jetpants.rb

+4
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ module Jetpants
2323
@config = {
2424
'max_concurrency' => 20, # max threads/conns per database
2525
'standby_slaves_per_pool' => 2, # number of standby slaves in every pool
26+
'backup_slaves_per_pool' => 1, # number of backup slaves in every pool
2627
'mysql_schema' => 'test', # database name
2728
'mysql_app_user' => 'appuser', # mysql user for application
2829
'mysql_app_password' => '', # mysql password for application
@@ -31,6 +32,7 @@ module Jetpants
3132
'mysql_root_password' => false, # mysql root password. omit if specified in /root/.my.cnf instead.
3233
'mysql_grant_ips' => ['192.168.%'], # mysql user manipulations are applied to these IPs
3334
'mysql_grant_privs' => ['ALL'], # mysql user manipulations grant this set of privileges by default
35+
'mysql_clone_ignore' => ['information_schema', 'performance_schema'], # these schemata will be ignored during cloning
3436
'export_location' => '/tmp', # directory to use for data dumping
3537
'verify_replication' => true, # raise exception if the 2 repl threads are in different states, or if actual repl topology differs from Jetpants' understanding of it
3638
'plugins' => {}, # hash of plugin name => arbitrary plugin data (usually a nested hash of settings)
@@ -39,6 +41,8 @@ module Jetpants
3941
'compress_with' => false, # command line to use for compression in large file transfers
4042
'decompress_with' => false, # command line to use for decompression in large file transfers
4143
'private_interface' => 'bond0', # network interface corresponding to private IP
44+
'output_caller_info' => false, # includes calling file, line and method in output calls
45+
'debug_exceptions' => false, # open a pry session when an uncaught exception is thrown
4246
}
4347

4448
config_paths = ["/etc/jetpants.yaml", "~/.jetpants.yml", "~/.jetpants.yaml"]

lib/jetpants/db.rb

+4-13
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ module Jetpants
1212
# functional lines.
1313
class DB
1414
include CallbackHandler
15+
include Output
1516

1617
# IP address (as a string) of the MySQL instance
1718
attr_reader :ip
@@ -70,6 +71,9 @@ def initialize(ip, port=3306)
7071
# This is ephemeral, only known to Jetpants if you previously called
7172
# DB#start_mysql or DB#restart_mysql in this process
7273
@options = []
74+
75+
# Mutex used to lock probing state
76+
@probe_mutex = Mutex.new
7377
end
7478

7579
###### Host methods ########################################################
@@ -97,19 +101,6 @@ def same_host_as?(db)
97101

98102
###### Misc methods ########################################################
99103

100-
# Displays the provided output, along with information about the current time,
101-
# self, and optionally a Jetpants::Table name.
102-
def output(str, table=nil)
103-
str = str.to_s.strip
104-
str = nil if str && str.length == 0
105-
str ||= "Completed (no output)"
106-
output = Time.now.strftime("%H:%M:%S") + " [#{self}] "
107-
output << table.name << ': ' if table
108-
output << str
109-
print output + "\n"
110-
output
111-
end
112-
113104
# DB objects are sorted as strings, ie, by calling to_s
114105
def <=> other
115106
to_s <=> other.to_s

lib/jetpants/db/import_export.rb

+13-4
Original file line numberDiff line numberDiff line change
@@ -325,13 +325,22 @@ def clone_to!(*targets)
325325
destinations[t] = t.mysql_directory
326326
raise "Over 100 MB of existing MySQL data on target #{t}, aborting copy!" if t.data_set_size > 100000000
327327
end
328-
[self, targets].flatten.concurrent_each {|t| t.stop_query_killer; t.stop_mysql}
329-
targets.concurrent_each {|t| t.ssh_cmd "rm -rf #{t.mysql_directory}/ib_logfile*"}
330328

331329
# Construct the list of files and dirs to copy. We include ib_lru_dump if present
332330
# (ie, if using Percona Server with innodb_buffer_pool_restore_at_startup enabled)
333331
# since this will greatly improve warm-up time of the cloned nodes
334-
files = ['ibdata1', 'mysql', 'test', app_schema]
332+
databases = mysql_root_cmd("SHOW DATABASES").split("\n").select { |row|
333+
row.include?('Database:')
334+
}.map{ |line|
335+
line.split(":").last.strip
336+
}.reject { |s|
337+
Jetpants.mysql_clone_ignore.include? s
338+
}
339+
340+
[self, targets].flatten.concurrent_each {|t| t.stop_query_killer; t.stop_mysql}
341+
targets.concurrent_each {|t| t.ssh_cmd "rm -rf #{t.mysql_directory}/ib_logfile*"}
342+
343+
files = (databases + ['ibdata1', app_schema]).uniq
335344
files << 'ib_lru_dump' if ssh_cmd("test -f #{mysql_directory}/ib_lru_dump 2>/dev/null; echo $?").chomp.to_i == 0
336345

337346
fast_copy_chain(mysql_directory,
@@ -343,4 +352,4 @@ def clone_to!(*targets)
343352
end
344353

345354
end
346-
end
355+
end

lib/jetpants/db/replication.rb

+2-2
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ def enslave_siblings!(targets)
164164
t.enable_read_only!
165165
end
166166
resume_replication # should already have happened from the clone_to! restart anyway, but just to be explicit
167-
catch_up_to_master
167+
catch_up_to_master 21600
168168
enable_monitoring
169169
end
170170

@@ -209,7 +209,7 @@ def seconds_behind_master
209209
# returns true if slave lag is zero 3 times in a row. Gives up if this does
210210
# not occur within the timeout period (six hours by default). If a large amount of slave lag is
211211
# reported, this method will automatically reduce its polling frequency.
212-
def catch_up_to_master(timeout=3600, threshold=3, poll_frequency=5)
212+
def catch_up_to_master(timeout=21600, threshold=3, poll_frequency=5)
213213
raise "This instance is not a slave" unless master
214214
resume_replication if @repl_paused
215215

lib/jetpants/db/server.rb

+11
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,16 @@ def enable_monitoring(*services)
108108
def disable_monitoring(*services)
109109
end
110110

111+
# No built-in effect. Use when performing actions which will cause the
112+
# server to go offline or become unresponsive, as an escalated disable_monitoring.
113+
# Plugins can override and/or implement before/after hooks
114+
def set_downtime(hours)
115+
end
116+
117+
# No built-in effect. Use after completing actions which caused the
118+
# server to go offline or become unresponsive, as an escalated enable_monitoring.
119+
# Plugins can override and/or implement before/after hooks
120+
def cancel_downtime
121+
end
111122
end
112123
end

0 commit comments

Comments
 (0)