Skip to content

Commit

Permalink
Merge pull request #19 from tumblr/rc90
Browse files Browse the repository at this point in the history
Rc90
  • Loading branch information
tbchrist committed Sep 26, 2014
2 parents 315a3c7 + 42994d4 commit 4a61752
Show file tree
Hide file tree
Showing 33 changed files with 1,393 additions and 116 deletions.
3 changes: 2 additions & 1 deletion README.rdoc
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ If you have a question that isn't covered here, please feel free to email the au
* <b>Dallas Marlow</b>: Developer
* <b>Bob Patterson Jr</b>: Developer
* <b>Tom Christ</b>: Developer
* <b>Tyler Neely</b>: Developer

Special thanks to <b>Tim Ellis</b> for testing and bug reports.

Expand All @@ -92,4 +93,4 @@ Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
limitations under the License.
107 changes: 94 additions & 13 deletions bin/jetpants
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# module in a Thor command processor, providing a command-line interface to
# common Jetpants functionality.

%w[thor pry highline/import colored].each {|g| require g}
%w[thor pry pry-rescue highline/import colored].each {|g| require g}

module Jetpants

Expand Down Expand Up @@ -222,15 +222,15 @@ module Jetpants
end

source.start_mysql if ! source.running?
error "source (#{source}) is not a standby slave" unless source.is_standby?
error "source (#{source}) is not a standby or backup slave" unless (source.is_standby? || source.for_backups?)

targets.each do |t|
error "target #{t} already has a master; please clear out node (including in asset tracker) before proceeding" if t.master
error "target #{t} is running a different version of MySQL than source #{source}! Cannot proceed with clone operation." if t.version_cmp(source) != 0
end

source.enslave_siblings!(targets)
targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master}
targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master(21600)}
source.pool.sync_configuration
puts "Cloning complete."
Jetpants.topology.write_config
Expand Down Expand Up @@ -264,7 +264,12 @@ module Jetpants
puts "This task turns an active slave into a standby slave."
node = ask_node('Please enter node IP: ', options[:node])
describe node
raise "Node is not an active slave" unless node.role == :active_slave
if node.running?
raise "Node is not an active slave" unless node.role == :active_slave
else
inform "Unable to connect to node #{node} to pull slave"
error "Unable to pull slave" unless agree "Please confirm that #{node} is offline [yes/no]: "
end
node.pool.mark_slave_standby(node)
Jetpants.topology.write_config
end
Expand Down Expand Up @@ -391,6 +396,8 @@ module Jetpants
# (ie, if app configuration is a static file that needs to be deployed to webs.)
desc 'shard_split_child_reads', 'shard split step 2 of 4: move reads to child shards'
def shard_split_child_reads
# Look up the shard being split via the ask_shard_being_split helper
s = ask_shard_being_split
# Move reads over to the child shards of that shard
s.move_reads_to_children
# Regenerate the topology configuration so the change is written out
Jetpants.topology.write_config
end
def self.after_shard_split_child_reads
Expand Down Expand Up @@ -453,8 +460,9 @@ module Jetpants
# because the new last shard can't be created yet (chicken-and-egg problem -- master
# must exist before we create the pool). The assumption is the hardware spec
# of the new last shard and previous last shard will be the same.
raise "Not enough total spare machines!" unless Jetpants.topology.count_spares(like: last_shard_master) >= Jetpants.standby_slaves_per_pool + 1
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: last_shard_master) >= Jetpants.standby_slaves_per_pool
raise "Not enough total spare machines!" unless Jetpants.topology.count_spares(like: last_shard_master) >= last_shard.slaves_layout[:standby_slave] + last_shard.slaves_layout[:backup_slave] + 1
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: last_shard_master) >= last_shard.slaves_layout[:standby_slave]
raise "Not enough backup_slave role spare machines!" unless Jetpants.topology.count_spares(role: :backup_slave) >= last_shard.slaves_layout[:backup_slave]
raise "Cannot find a spare master-role machine!" unless Jetpants.topology.count_spares(role: :master, like: last_shard_master) >= 1

# In asset tracker, remove the last shard pool and replace it with a new pool. The new pool
Expand All @@ -470,15 +478,26 @@ module Jetpants
# NOT to actually set the pool of the returned object.)
new_last_shard_master = Jetpants.topology.claim_spare(role: :master, like: last_shard_master)
new_last_shard_master.disable_read_only! if new_last_shard_master.running?
if Jetpants.standby_slaves_per_pool > 0
if last_shard.slaves_layout[:standby_slave] > 0
# Verify spare count again, now that we can actually supply the new master as the :like context
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: new_last_shard_master) >= Jetpants.standby_slaves_per_pool
new_last_shard_slaves = Jetpants.topology.claim_spares(Jetpants.standby_slaves_per_pool, role: :standby_slave, like: new_last_shard_master)
raise "Not enough standby_slave role spare machines!" unless Jetpants.topology.count_spares(role: :standby_slave, like: new_last_shard_master) >= last_shard.slaves_layout[:standby_slave]
new_last_shard_slaves = Jetpants.topology.claim_spares(last_shard.slaves_layout[:standby_slave], role: :standby_slave, like: new_last_shard_master)
new_last_shard_slaves.each do |x|
x.change_master_to new_last_shard_master
x.resume_replication
end
end

# Set up backup slaves
if last_shard.slaves_layout[:backup_slave] > 0
raise "Not enough backup_slave role spare machines!" unless Jetpants.topology.count_spares(role: :backup_slave) >= last_shard.slaves_layout[:backup_slave]
new_last_shard_backup_slaves = Jetpants.topology.claim_spares(last_shard.slaves_layout[:backup_slave], role: :backup_slave)
new_last_shard_backup_slaves.each do |x|
x.change_master_to new_last_shard_master
x.resume_replication
end
end

new_last_shard = Shard.new(cutover_id, 'INFINITY', new_last_shard_master)
new_last_shard.sync_configuration
Jetpants.topology.pools << new_last_shard
Expand All @@ -498,7 +517,53 @@ module Jetpants
'Deploy the configuration to all machines.',
)
end


desc 'rebalance_backup_slaves', 'Add backup slaves to pools which contain too few (does not destroy existing slaves)'
# Adds one backup slave to each shard that currently has fewer backup slaves
# than its slaves_layout[:backup_slave] calls for. Existing slaves are left
# untouched. Each new backup slave is built from the shard's last standby
# slave via enslave_siblings!.
#
# Raises if no backup_slave spares are available. Calls error if a shard has
# no standby slave to clone from, if the chosen source is not a standby, or
# if a claimed spare already has a master, a pool, or a different MySQL version.
def rebalance_backup_slaves
  spares_available = Jetpants.topology.count_spares(role: :backup_slave)

  raise "No backup slaves available" if spares_available == 0

  # find shards that need a backup slave
  need_backup_shards = Jetpants.topology.shards.reject { |shard| shard.backup_slaves.count >= shard.slaves_layout[:backup_slave] }

  # Claim only as many spares as can actually be placed, and trim the shard
  # list to the same length so every shard we visit has a spare waiting.
  possible_iterations = [spares_available, need_backup_shards.count].min
  spares = Jetpants.topology.claim_spares(possible_iterations, role: :backup_slave)
  need_backup_shards = need_backup_shards.first(possible_iterations)

  # Pair each shard with its spare up front, so the concurrent workers never
  # mutate a shared array of spares.
  need_backup_shards.zip(spares).limited_concurrent_map(5) do |shard, spare|
    # Skip if fewer spares were handed out than requested
    next unless spare

    output "Considering shard #{shard}"

    source = shard.standby_slaves.last
    error "Shard #{shard} has no standby slave to clone from" unless source
    source.master.probe if source.master # fail early if there are any replication issues in this pool

    targets = [spare]

    source.start_mysql if ! source.running?
    error "Source (#{source}) is not a standby slave" unless source.is_standby?

    targets.each do |t|
      error "Target #{t} already has a master; please clear out node (including in asset tracker) before proceeding" if t.master
      error "Target #{t} is running a different version of MySQL than source #{source}! Cannot proceed with clone operation." if t.version_cmp(source) != 0
      error "Target #{t} already has a pool!" if t.pool
    end

    source.enslave_siblings!(targets)
    # Allow up to 21600 seconds for the new slave to catch up
    targets.concurrent_each {|t| t.resume_replication; t.catch_up_to_master(21600)}
    source.pool.sync_configuration
    output "Rebalance complete for #{shard}"
  end
end

no_tasks do
def is_ip? address
Expand All @@ -516,7 +581,7 @@ module Jetpants
# Prints a one-line summary of the given node (hostname, port, role and pool),
# colored green.
def describe node
puts "Node #{node} (#{node.hostname}:#{node.port}) has role #{node.role} in pool #{node.pool(true)}.".green
end

def ask_node(prompt, supplied_node=false)
node = supplied_node || ask(prompt)
error "Node (#{node}) does not appear to be an IP address." unless is_ip? node
Expand All @@ -540,7 +605,7 @@ module Jetpants
s = Jetpants.topology.shard shard_min, shard_max
raise "Shard not found" unless s
end
raise "Shard isn't in expected state" unless s.state == :deprecated
raise "Shard does not have children" if (s.children.nil? or s.children.count ==0)
s
end
end
Expand All @@ -558,4 +623,20 @@ end
# We load jetpants last so that plugins can monkeypatch Jetpants::CommandSuite if desired.
require 'jetpants'

Jetpants::CommandSuite.start
# Runs the supplied block (if any) inside Pry.rescue. If the block raises a
# StandardError, the Pry prompt is switched to "# debug > ", a notice is
# printed, and the exception is handed to Pry.rescued.
def with_debug
  Pry.rescue do
    begin
      yield if block_given?
    rescue => failure
      Pry.config.prompt = proc { |_, _, _| "# debug > " }
      puts "Entering debugging session (debug_exceptions is enabled). Resume with ctrl+d."
      Pry.rescued(failure)
    end
  end
end

# Start the command suite. When the debug_exceptions setting is enabled, run it
# inside with_debug so that an exception raised by a command is passed to
# with_debug's rescue handler; otherwise start it directly.
if Jetpants.debug_exceptions
with_debug { Jetpants::CommandSuite.start }
else
Jetpants::CommandSuite.start
end
7 changes: 7 additions & 0 deletions etc/jetpants.yaml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ mysql_grant_privs:
- LOCK TABLES
- EXECUTE

# Specific schemata may be optionally ignored while cloning. If this is not
# overridden it will default to information_schema and performance_schema.
mysql_clone_ignore:
- some_temporary_schema
- information_schema
- performance_schema

# Define what directory to put exported files in. If you have /var/lib/mysql
# on an SSD RAID mount and the rest of your OS on a rotational disk mount,
# it's fine to put the export_location on the rotational disk, which probably
Expand Down
6 changes: 4 additions & 2 deletions jetpants.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@ require 'rake'

Gem::Specification.new do |s|
s.name = "jetpants"
s.version = "0.8.3"
s.version = "0.9.0"

s.homepage = 'https://github.com/tumblr/jetpants/'
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.required_ruby_version = '>= 1.9.2'
s.authors = ["Evan Elias", "Dallas Marlow", "Bob Patterson Jr.", "Tom Christ"]
s.date = "2013-05-17"
s.date = "2014-06-09"
s.email = ["[email protected]", "[email protected]", "[email protected]", "[email protected]"]
s.files = FileList['Gemfile', 'README.rdoc', 'doc/*.rdoc', 'lib/**/*.rb', 'bin/**', 'scripts/*.rb', 'plugins/**/*.rb', 'etc/jetpants.yaml.sample'].to_a
s.require_paths = ["lib"]
Expand All @@ -24,8 +24,10 @@ Gem::Specification.new do |s|
s.add_runtime_dependency 'sequel', '~> 3.36'
s.add_runtime_dependency 'net-ssh', '~> 2.3'
s.add_runtime_dependency 'pry', '~> 0.9.8'
s.add_runtime_dependency 'pry-rescue', '~> 1.4.0'
s.add_runtime_dependency 'thor', '~> 0.15'
s.add_runtime_dependency 'highline', '~> 1.6.12'
s.add_runtime_dependency 'colored', '~> 1.2'
s.add_runtime_dependency 'collins_client', '~> 0.2.7'
s.add_runtime_dependency 'bloom-filter', '~> 0.2.0'
end
4 changes: 4 additions & 0 deletions lib/jetpants.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ module Jetpants
@config = {
'max_concurrency' => 20, # max threads/conns per database
'standby_slaves_per_pool' => 2, # number of standby slaves in every pool
'backup_slaves_per_pool' => 1, # number of backup slaves in every pool
'mysql_schema' => 'test', # database name
'mysql_app_user' => 'appuser', # mysql user for application
'mysql_app_password' => '', # mysql password for application
Expand All @@ -31,6 +32,7 @@ module Jetpants
'mysql_root_password' => false, # mysql root password. omit if specified in /root/.my.cnf instead.
'mysql_grant_ips' => ['192.168.%'], # mysql user manipulations are applied to these IPs
'mysql_grant_privs' => ['ALL'], # mysql user manipulations grant this set of privileges by default
'mysql_clone_ignore' => ['information_schema', 'performance_schema'], # these schemata will be ignored during cloning
'export_location' => '/tmp', # directory to use for data dumping
'verify_replication' => true, # raise exception if the 2 repl threads are in different states, or if actual repl topology differs from Jetpants' understanding of it
'plugins' => {}, # hash of plugin name => arbitrary plugin data (usually a nested hash of settings)
Expand All @@ -39,6 +41,8 @@ module Jetpants
'compress_with' => false, # command line to use for compression in large file transfers
'decompress_with' => false, # command line to use for decompression in large file transfers
'private_interface' => 'bond0', # network interface corresponding to private IP
'output_caller_info' => false, # includes calling file, line and method in output calls
'debug_exceptions' => false, # open a pry session when an uncaught exception is thrown
}

config_paths = ["/etc/jetpants.yaml", "~/.jetpants.yml", "~/.jetpants.yaml"]
Expand Down
17 changes: 4 additions & 13 deletions lib/jetpants/db.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ module Jetpants
# functional lines.
class DB
include CallbackHandler
include Output

# IP address (as a string) of the MySQL instance
attr_reader :ip
Expand Down Expand Up @@ -70,6 +71,9 @@ def initialize(ip, port=3306)
# This is ephemeral, only known to Jetpants if you previously called
# DB#start_mysql or DB#restart_mysql in this process
@options = []

# Mutex used to lock probing state
@probe_mutex = Mutex.new
end

###### Host methods ########################################################
Expand Down Expand Up @@ -97,19 +101,6 @@ def same_host_as?(db)

###### Misc methods ########################################################

# Prints a timestamped line describing str for this object and returns that
# line without its trailing newline. The line has the form
# "HH:MM:SS [self] message", with "name: " inserted before the message when a
# table is supplied. Blank or nil input is reported as "Completed (no output)".
def output(str, table=nil)
  message = str.to_s.strip
  message = "Completed (no output)" if message.empty?

  line = "#{Time.now.strftime("%H:%M:%S")} [#{self}] "
  line << table.name << ': ' if table
  line << message

  print "#{line}\n"
  line
end

# DB objects are sorted as strings, ie, by calling to_s
def <=> other
to_s <=> other.to_s
Expand Down
17 changes: 13 additions & 4 deletions lib/jetpants/db/import_export.rb
Original file line number Diff line number Diff line change
Expand Up @@ -325,13 +325,22 @@ def clone_to!(*targets)
destinations[t] = t.mysql_directory
raise "Over 100 MB of existing MySQL data on target #{t}, aborting copy!" if t.data_set_size > 100000000
end
[self, targets].flatten.concurrent_each {|t| t.stop_query_killer; t.stop_mysql}
targets.concurrent_each {|t| t.ssh_cmd "rm -rf #{t.mysql_directory}/ib_logfile*"}

# Construct the list of files and dirs to copy. We include ib_lru_dump if present
# (ie, if using Percona Server with innodb_buffer_pool_restore_at_startup enabled)
# since this will greatly improve warm-up time of the cloned nodes
files = ['ibdata1', 'mysql', 'test', app_schema]
databases = mysql_root_cmd("SHOW DATABASES").split("\n").select { |row|
row.include?('Database:')
}.map{ |line|
line.split(":").last.strip
}.reject { |s|
Jetpants.mysql_clone_ignore.include? s
}

[self, targets].flatten.concurrent_each {|t| t.stop_query_killer; t.stop_mysql}
targets.concurrent_each {|t| t.ssh_cmd "rm -rf #{t.mysql_directory}/ib_logfile*"}

files = (databases + ['ibdata1', app_schema]).uniq
files << 'ib_lru_dump' if ssh_cmd("test -f #{mysql_directory}/ib_lru_dump 2>/dev/null; echo $?").chomp.to_i == 0

fast_copy_chain(mysql_directory,
Expand All @@ -343,4 +352,4 @@ def clone_to!(*targets)
end

end
end
end
4 changes: 2 additions & 2 deletions lib/jetpants/db/replication.rb
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def enslave_siblings!(targets)
t.enable_read_only!
end
resume_replication # should already have happened from the clone_to! restart anyway, but just to be explicit
catch_up_to_master
catch_up_to_master 21600
enable_monitoring
end

Expand Down Expand Up @@ -209,7 +209,7 @@ def seconds_behind_master
# returns true if slave lag is zero 3 times in a row. Gives up if this does
# not occur within the timeout period (21600 seconds, i.e. six hours, by default). If a large amount of slave lag is
# reported, this method will automatically reduce its polling frequency.
def catch_up_to_master(timeout=3600, threshold=3, poll_frequency=5)
def catch_up_to_master(timeout=21600, threshold=3, poll_frequency=5)
raise "This instance is not a slave" unless master
resume_replication if @repl_paused

Expand Down
11 changes: 11 additions & 0 deletions lib/jetpants/db/server.rb
Original file line number Diff line number Diff line change
Expand Up @@ -108,5 +108,16 @@ def enable_monitoring(*services)
def disable_monitoring(*services)
end

# No built-in effect. Use when performing actions which will cause the
# server to go offline or become unresponsive, as an escalated enable_monitoring
# Plugins can override and/or implement before/after hooks
# NOTE(review): "escalated enable_monitoring" looks swapped with the wording on
# cancel_downtime; starting a downtime presumably escalates disable_monitoring -- confirm.
# hours:: presumably the length of the downtime window in hours; ignored by this
#         empty default implementation.
def set_downtime(hours)
end

# No built-in effect. Use when performing actions which will cause the
# server to go offline or become unresponsive, as an escalated disable_monitoring
# Plugins can override and/or implement before/after hooks
# NOTE(review): this text appears copied from set_downtime; cancelling a downtime
# presumably applies once the server is back and escalates enable_monitoring -- confirm.
# Takes no arguments; this default implementation is empty.
def cancel_downtime
end
end
end
Loading

1 comment on commit 4a61752

@mschenck
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Please sign in to comment.