Skip to content

Retrieve default disk type from Azure API #716

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions src/bosh_azure_cpi/lib/cloud/azure/cloud.rb
Original file line number Diff line number Diff line change
Expand Up @@ -302,8 +302,7 @@ def calculate_vm_cloud_properties(desired_instance_size)
raise "Missing VM cloud properties: #{missing_keys.join(', ')}"
end

available_vm_sizes = @azure_client.list_available_virtual_machine_sizes_by_location(location)
instance_types = @instance_type_mapper.map(desired_instance_size, available_vm_sizes)
instance_types = @instance_type_mapper.map(desired_instance_size, location)
{
'instance_types' => instance_types,
'ephemeral_disk' => {
Expand Down Expand Up @@ -354,7 +353,7 @@ def create_disk(size, cloud_properties, vm_cid = nil)
location = vm[:location]
instance_type = vm[:vm_size]
zone = vm[:zone]
default_storage_account_type = get_storage_account_type_by_instance_type(instance_type)
default_storage_account_type = @disk_manager2.get_default_storage_account_type(instance_type, location)
end
storage_account_type = cloud_properties.fetch('storage_account_type', default_storage_account_type)
caching = cloud_properties.fetch('caching', 'None')
Expand Down Expand Up @@ -844,7 +843,7 @@ def _init_azure
@stemcell_manager2 = Bosh::AzureCloud::StemcellManager2.new(_azure_config, @blob_manager, @meta_store, @storage_account_manager, @azure_client)
@light_stemcell_manager = Bosh::AzureCloud::LightStemcellManager.new(@blob_manager, @storage_account_manager, @azure_client)
@vm_manager = Bosh::AzureCloud::VMManager.new(_azure_config, @disk_manager, @disk_manager2, @azure_client, @storage_account_manager, @stemcell_manager, @stemcell_manager2, @light_stemcell_manager)
@instance_type_mapper = Bosh::AzureCloud::InstanceTypeMapper.new
@instance_type_mapper = Bosh::AzureCloud::InstanceTypeMapper.new(@azure_client)
rescue Net::OpenTimeout => e
cloud_error("Please make sure the CPI has proper network access to Azure. #{e.inspect}") # TODO: Will it throw the error when initializing the client and manager
end
Expand Down
28 changes: 28 additions & 0 deletions src/bosh_azure_cpi/lib/cloud/azure/disk/disk_manager2.rb
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,34 @@ def migrate_to_zone(disk_id, disk, zone)
end
end

# Chooses the default storage account type for a disk attached to a VM.
#
# @param [String] instance_type - The VM size name, forwarded to `supports_premium_storage?`.
# @param [String] location      - The Azure location, forwarded to `supports_premium_storage?`.
# @return STORAGE_ACCOUNT_TYPE_PREMIUM_LRS when `supports_premium_storage?` is truthy,
#         otherwise STORAGE_ACCOUNT_TYPE_STANDARD_LRS.
def get_default_storage_account_type(instance_type, location)
  return STORAGE_ACCOUNT_TYPE_PREMIUM_LRS if supports_premium_storage?(instance_type, location)

  STORAGE_ACCOUNT_TYPE_STANDARD_LRS
end

# Reports whether a VM size offers premium storage in a location, based on the
# `PremiumIO` capability of the matching entry returned by `@azure_client.list_vm_skus`.
#
# Successful lookups (positive or negative) are memoized per "size-location" key in
# `@premium_storage_cache`. A lookup that raises is logged and answered with `false`
# but is NOT memoized, so a transient API failure cannot pin a size to Standard
# storage for the lifetime of this object.
#
# @param [String] instance_type - The VM size name; compared case-insensitively.
# @param [String] location      - The location passed to `list_vm_skus`.
# @return [Boolean] true only when a SKU with the same name reports PremiumIO as "True".
def supports_premium_storage?(instance_type, location)
  @premium_storage_cache ||= {}

  instance_type_downcase = instance_type.to_s.downcase
  cache_key = "#{instance_type_downcase}-#{location}"
  return @premium_storage_cache[cache_key] if @premium_storage_cache.key?(cache_key)

  begin
    premium = @azure_client.list_vm_skus(location).any? do |sku|
      # `to_s` keeps a SKU without a name or without the capability from raising.
      sku[:name].to_s.downcase == instance_type_downcase &&
        sku.dig(:capabilities, :PremiumIO).to_s.downcase == 'true'
    end
  rescue StandardError => e
    @logger.error("Error determining premium storage support for '#{instance_type}' in location '#{location}': #{e.message}. Defaulting to Standard storage.")
    # Deliberately not cached: the next call should retry the lookup.
    return false
  end

  @premium_storage_cache[cache_key] = premium
end

private

def _get_disk(resource_group_name, disk_name)
Expand Down
110 changes: 100 additions & 10 deletions src/bosh_azure_cpi/lib/cloud/azure/restapi/azure_client.rb
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ class AzureClient
# Please add the key into this list if you want to redact its value in request body.
CREDENTIAL_KEYWORD_LIST = %w[adminPassword client_secret customData].freeze

# Parent directory of the on-disk cache.
CACHE_DIR = '/var/vcap/sys/run/azure_cpi'.freeze
# Name of the sub-directory under CACHE_DIR.
CACHE_SUBDIR = 'cache'.freeze
# Maximum age of a cache entry, in seconds (one day).
CACHE_EXPIRY_SECONDS = 60 * 60 * 24
# Size threshold used when truncating text.
TRUNCATION_LIMIT = 10_000

def initialize(azure_config, logger)
@logger = logger

Expand Down Expand Up @@ -2063,15 +2068,14 @@ def update_tags_of_storage_account(name, tags)
end

# Returns the maximum platform fault domain count for availability sets in a location.
#
# The value is read from the `MaximumPlatformFaultDomainCount` capability of the SKU
# named 'Aligned' among the availability-set SKUs returned by `list_resource_skus`.
# This is the only lookup performed; no separate request is issued from this method.
#
# @param [String] location - The location whose availability-set SKUs are inspected.
# @return [Integer] The capability value converted with `to_i`.
# @raise [RuntimeError] When no 'Aligned' SKU is returned or the capability has no value.
def get_max_fault_domains_for_location(location)
  aligned_sku = list_resource_skus(location, REST_API_AVAILABILITY_SETS).find { |sku| sku[:name] == 'Aligned' }
  max_fault_domains = aligned_sku&.dig(:capabilities, :MaximumPlatformFaultDomainCount)

  # A missing value would otherwise become 0 through `nil.to_i`; fail loudly instead.
  raise "Unable to get maximum fault domains for location '#{location}'" if max_fault_domains.nil?

  max_fault_domains.to_i
end

# Create or Update a Compute Gallery Image Definition
Expand Down Expand Up @@ -2218,6 +2222,91 @@ def get_gallery_image_version_by_tags(gallery_name, tags)
parse_gallery_image(image_version)
end

# List available Resource SKUs by Location and resource type. The list is updated at least once a day.
#
# Results are cached as JSON files under CACHE_DIR/CACHE_SUBDIR, one file per
# location/resource-type pair, and reused while younger than CACHE_EXPIRY_SECONDS.
# Empty results are never written to or served from the cache, so a failed or empty
# API response is retried on the next call instead of being remembered for a day.
# The cache is only written when CACHE_DIR already exists.
#
# @param [String] location - The location to list the resource SKUs.
# @param [String] resource_type - The resource type to filter the SKUs.
# @return [Array] The list of available Resource SKUs. Each entry is a Hash with the keys
#                 :name, :resource_type, :location, :tier, :size, :family, :restrictions
#                 and :capabilities (a Hash of capability name => value).
#
# @See https://learn.microsoft.com/en-us/rest/api/compute/resource-skus/list?view=rest-compute-2024-11-04&tabs=HTTP
#
def list_resource_skus(location = nil, resource_type = nil)
  cache_key = "skus_#{location || 'all'}_#{resource_type || 'all'}.json"
  full_cache_dir = File.join(CACHE_DIR, CACHE_SUBDIR)
  cache_file = File.join(full_cache_dir, cache_key)

  if File.exist?(cache_file) && (Time.now - File.mtime(cache_file)) < CACHE_EXPIRY_SECONDS
    @logger.debug("list_resource_skus - Reading from cache file: #{cache_file}")
    begin
      cached_data = JSON.parse(File.read(cache_file), symbolize_names: true)
      # Only a non-empty list is a usable cache entry; anything else triggers a fresh fetch.
      return cached_data if cached_data.is_a?(Array) && !cached_data.empty?

      @logger.debug("list_resource_skus - Cache file #{cache_file} holds no SKUs. Fetching fresh data.")
    rescue JSON::ParserError => e
      @logger.warn("list_resource_skus - Failed to parse cache file #{cache_file}: #{e.message}. Fetching fresh data.")
    rescue StandardError => e
      @logger.warn("list_resource_skus - Failed to read cache file #{cache_file}: #{e.message}. Fetching fresh data.")
    end
  else
    @logger.debug("list_resource_skus - Cache miss or expired for key: #{cache_key}. Fetching from API.")
  end

  url = "/subscriptions/#{uri_escape(@azure_config.subscription_id)}/providers/#{REST_API_PROVIDER_COMPUTE}/skus"
  params = {}
  params['$filter'] = "location eq '#{location}'" if location
  result = get_resource_by_id(url, params)
  resource_skus = []

  unless result.nil? || result['value'].nil?
    result['value'].each do |sku|
      next if resource_type && sku['resourceType'] != resource_type

      vm_sku = {
        name: sku['name'],
        resource_type: sku['resourceType'],
        location: sku['locations']&.first,
        tier: sku['tier'],
        size: sku['size'],
        family: sku['family'],
        restrictions: sku['restrictions'],
        capabilities: (sku['capabilities'] || []).map { |c| [c['name'].to_sym, c['value']] }.to_h
      }
      # Locations are compared case-insensitively.
      next if location && vm_sku[:location]&.downcase != location.downcase

      resource_skus << vm_sku
    end
  end

  if resource_skus.empty?
    @logger.debug("list_resource_skus - No SKUs found for key: #{cache_key}. Skipping cache write.")
    return resource_skus
  end

  unless Dir.exist?(CACHE_DIR)
    @logger.debug("list_resource_skus - Cache parent directory #{CACHE_DIR} does not exist. Skipping cache write.")
    return resource_skus
  end

  begin
    FileUtils.mkdir_p(full_cache_dir)
    File.open(cache_file, File::RDWR | File::CREAT) do |f|
      # Exclusive lock so concurrent writers do not interleave their output.
      f.flock(File::LOCK_EX)
      begin
        f.truncate(0)
        f.write(JSON.pretty_generate(resource_skus))
        @logger.debug("list_resource_skus - Wrote data to cache file: #{cache_file}")
      ensure
        f.flock(File::LOCK_UN)
      end
    end
  rescue StandardError => e
    # Caching is best effort; the freshly fetched list is still returned.
    @logger.warn("list_resource_skus - Failed to write cache file #{cache_file}: #{e.message}")
  end

  resource_skus
end

# List the Resource SKUs of the virtual machine resource type for a location.
# Thin wrapper that delegates to `list_resource_skus` with REST_API_VIRTUAL_MACHINES.
#
# @param [String] location - The location to list the resource SKUs.
# @return [Array] Whatever `list_resource_skus` returns for that location and resource type.
def list_vm_skus(location)
  vm_resource_type = REST_API_VIRTUAL_MACHINES
  list_resource_skus(location, vm_resource_type)
end

private

# @return [Hash]
Expand Down Expand Up @@ -2653,12 +2742,13 @@ def http_get_response(uri, request, retry_after)

status_code = response.code.to_i
response_body = response.body
truncated_body = response_body.to_s.length > TRUNCATION_LIMIT ? response_body.to_s[0..TRUNCATION_LIMIT] + '... [truncated]' : response_body.to_s
message = "http_get_response - #{status_code}\n"
message += get_http_common_headers(response)
message += if filter_credential_in_logs(uri)
'response.body cannot be logged because it may contain credentials.'
else
"response.body: #{redact_credentials_in_response_body(response_body)}"
"response.body: #{redact_credentials_in_response_body(truncated_body)}"
end
@logger.debug(message)

Expand Down
18 changes: 0 additions & 18 deletions src/bosh_azure_cpi/lib/cloud/azure/utils/helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -574,24 +574,6 @@ def flock(lock_name, mode)
end
end

# Maps a VM size name to a default storage account type.
#
# @param [String] instance_type - The VM size name, forwarded to `support_premium_storage?`.
# @return STORAGE_ACCOUNT_TYPE_PREMIUM_LRS when `support_premium_storage?` is truthy,
#         otherwise STORAGE_ACCOUNT_TYPE_STANDARD_LRS.
def get_storage_account_type_by_instance_type(instance_type)
  if support_premium_storage?(instance_type)
    STORAGE_ACCOUNT_TYPE_PREMIUM_LRS
  else
    STORAGE_ACCOUNT_TYPE_STANDARD_LRS
  end
end

# Decides from the VM size name alone whether the size is treated as premium-storage capable.
# The lower-cased name must match one of the family patterns below at position 0.
#
# @param [String] instance_type - The VM size name, e.g. "Standard_DS1_v2".
# @return [Boolean] true when the name starts with one of the known premium-capable families.
def support_premium_storage?(instance_type)
  size_name = instance_type.downcase
  premium_patterns = [
    /^standard_ds/, # including DS and DSv2, e.g. Standard_DS1, Standard_DS1_v2
    /^standard_d(\d)+s_v3/,
    /^standard_gs/,
    /^standard_b(\d)+s/,
    /^standard_b(\d)+ms/,
    /^standard_f(\d)+s/,
    /^standard_e(\d)+s_v3/,
    /^standard_e(\d)+is_v3/,
    /^standard_l(\d)+s/
  ]
  # NOTE: `=~` yields nil when nothing matches, so the index is compared with `== 0`
  # instead of `zero?`. Auto-correcting the `Style/NumericPredicate` offense was reported
  # to break specs, hence the cop stays disabled on this line.
  premium_patterns.any? { |pattern| (size_name =~ pattern) == 0 } # rubocop:disable Style/NumericPredicate
end

# Tells whether a set of tags contains every pair listed in STEMCELL_STORAGE_ACCOUNT_TAGS.
#
# @param [Hash] tags - The tags to inspect; converted with `to_a`, so nil counts as no tags.
# @return [Boolean] true when no required pair is missing from `tags`.
def is_stemcell_storage_account?(tags)
  required_pairs = STEMCELL_STORAGE_ACCOUNT_TAGS.to_a
  missing_pairs = required_pairs - tags.to_a
  missing_pairs.empty?
end
Expand Down
141 changes: 67 additions & 74 deletions src/bosh_azure_cpi/lib/cloud/azure/vms/instance_type_mapper.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,95 +4,88 @@ module Bosh::AzureCloud
# Maps a desired instance size (minimum CPU count and RAM) to the names of the VM sizes
# offered in a location, ordered from most to least preferred. The candidate sizes come
# from `azure_client.list_vm_skus` rather than from a hard-coded list.
class InstanceTypeMapper
  include Helpers

  # Preference score per VM family, keyed by the first letter of the family part of the
  # size name. Higher scores sort earlier among sizes with equal premium support, cores and memory.
  SERIES_PREFERENCE = {
    'D' => 5, # General purpose - balanced
    'F' => 4, # Compute optimized
    'E' => 4, # Memory optimized
    'B' => 3, # Burstable VMs
    'L' => 3, # Storage optimized
    'A' => 2, # Basic general purpose
    'H' => 1, # High performance compute
    'M' => 1, # Ultra memory optimized
    'N' => 1  # GPU
  }.freeze

  # @param azure_client - Object responding to `list_vm_skus(location)`.
  def initialize(azure_client)
    @azure_client = azure_client
    @logger = Bosh::Clouds::Config.logger
    @sku_cache = {}
  end

  # Finds the VM sizes in a location that satisfy the desired instance size.
  #
  # Sizes that carry restrictions or lack the vCPUs/MemoryGB capabilities are ignored.
  # The remaining sizes are ordered by: premium storage support first, then fewest cores,
  # then least memory, then highest series score, then newest generation.
  #
  # @param [Hash] desired_instance_size - Reads the 'cpu' (core count) and 'ram' (MB) keys.
  # @param [String] location - The location whose VM SKUs are considered.
  # @return [Array<String>] VM size names, most preferred first.
  # @raise Calls `cloud_error` when no size meets the request.
  def map(desired_instance_size, location)
    @logger.debug("Finding VM size with minimum #{desired_instance_size['cpu']} CPU, #{desired_instance_size['ram']} MB RAM")

    prepared_skus = get_vm_skus(location).map { |sku| prepare_sku(sku) }.compact

    possible_vm_sizes = prepared_skus.select do |sku|
      sku[:cores] >= desired_instance_size['cpu'] &&
        sku[:memory_mb] >= desired_instance_size['ram']
    end

    cloud_error("Unable to meet desired instance size: #{desired_instance_size['cpu']} CPU, #{desired_instance_size['ram']} MB RAM") if possible_vm_sizes.empty?

    closest_matched_vm_sizes = possible_vm_sizes.sort_by do |sku|
      [-sku[:premium_io], sku[:cores], sku[:memory_mb], -sku[:series_score], -sku[:generation]]
    end.map { |sku| sku[:name] }

    @logger.debug("Selected VM sizes (in order): #{closest_matched_vm_sizes.join(', ')}")
    closest_matched_vm_sizes
  end

  private

  # Converts one SKU hash into the record used for filtering and sorting, or returns nil
  # when the SKU is restricted or does not report both vCPUs and MemoryGB.
  def prepare_sku(sku)
    capabilities = sku[:capabilities]
    # `Array()` tolerates a :restrictions key whose value is nil.
    has_restrictions = !Array(sku[:restrictions]).empty?
    has_required_capabilities = capabilities.respond_to?(:key?) &&
                                capabilities.key?(:vCPUs) &&
                                capabilities.key?(:MemoryGB)
    return nil unless has_required_capabilities && !has_restrictions

    {
      original_sku: sku,
      name: sku[:name],
      cores: capabilities[:vCPUs].to_i,
      memory_mb: (capabilities[:MemoryGB].to_f * 1024).to_i,
      premium_io: capabilities[:PremiumIO] == 'True' ? 1 : 0,
      generation: extract_generation(sku[:name]),
      series_score: get_series_score(sku[:name])
    }
  end

  # Returns the VM SKUs for a location, memoized per location in `@sku_cache`.
  # A failed lookup is logged, answered with an empty list and not memoized.
  def get_vm_skus(location)
    cache_key = "skus-#{location}"
    unless @sku_cache[cache_key]
      begin
        @logger.debug("Fetching VM SKU information for location: #{location}")
        @sku_cache[cache_key] = @azure_client.list_vm_skus(location)
      rescue StandardError => e
        @logger.warn("Failed to fetch VM SKU information: #{e.message}")
        return []
      end
    end

    @sku_cache[cache_key]
  end

  # Reads the number that follows "_v" in a size name (e.g. 3 for "Standard_D2s_v3").
  def extract_generation(vm_name)
    version = vm_name[/_v(\d+)/i, 1]
    version ? version.to_i : 1 # Default to gen 1 for non-versioned VMs
  end

  # Looks up the preference score of a size by the first letter of its family, so that
  # multi-letter families such as DS, GS, NC or HB score like their base series.
  def get_series_score(vm_name)
    family_letter = vm_name[/\Astandard_([a-z])[a-z]*\d/i, 1]
    return 0 unless family_letter # Default score for unknown series

    SERIES_PREFERENCE.fetch(family_letter.upcase, 0)
  end
end
end
Loading