Move to incidents subfolder
This commit is contained in:
328
equinix/api-team/incidents/salvage-license-costs.org
Normal file
328
equinix/api-team/incidents/salvage-license-costs.org
Normal file
@@ -0,0 +1,328 @@
|
||||
#+TITLE: Incident 2590
|
||||
#+AUTHOR: Adam Mohammed
|
||||
#+DATE: May 2, 2024
|
||||
|
||||
|
||||
* Starting out
|
||||
|
||||
There are CPUs missing their ProcessorComponent information.
|
||||
|
||||
** Get a list of affected hardware
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Collect every Hardware::Server that has no component of type "ProcessorComponent".
# find_each loads the records in batches, so it replaces the manual
# find_in_batches + each nesting without changing which records are visited.
affected_servers = []
Hardware::Server.find_each do |server|
  affected_servers << server if server.components.none? { |c| c.type == "ProcessorComponent" }
end
|
||||
#+END_SRC
|
||||
#+RESULTS:
|
||||
#+BEGIN_EXAMPLE
|
||||
1685 total affected
|
||||
#+END_EXAMPLE
|
||||
|
||||
** Classify the affected hardware by class and plan
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Tally the affected servers by the value of their `class`.
affected_server_types = affected_servers.each_with_object(Hash.new(0)) do |server, counts|
  counts[server.class] += 1
end
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_EXAMPLE ruby
|
||||
irb(main):269:0> affected_server_types
|
||||
=> {"Hardware::StorageAppliance"=>170, "Hardware::Open19Node"=>195, "Hardware::Server"=>1319, "Hardware::StorageServer"=>1}
|
||||
#+END_EXAMPLE
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Tally the affected servers by plan slug; hardware without a plan is skipped.
# The trailing `nil` keeps the console from echoing the result.
affected_plan_types = affected_servers.each_with_object(Hash.new(0)) do |server, counts|
  plan = server.plan
  counts[plan.slug.to_s] += 1 if plan.present?
end; nil
|
||||
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_EXAMPLE ruby
|
||||
{"storage.custom"=>102,
|
||||
"m3.large.x86"=>329,
|
||||
"c3.small.x86"=>120,
|
||||
"m3.small.x86"=>143,
|
||||
"n2.xlarge.x86"=>23,
|
||||
"c2.medium.x86"=>124,
|
||||
"c3.medium.x86"=>396,
|
||||
"netapp.storage"=>16,
|
||||
"m2.xlarge.x86"=>31,
|
||||
"nvidia3.a100.medium"=>1,
|
||||
"t3.small.x86"=>13,
|
||||
"n3.xlarge.x86"=>155,
|
||||
"w3amd.75xx24c.512.8160"=>102,
|
||||
"s3.xlarge.x86"=>29,
|
||||
"appliance.dell.hci.vxrail.opt-m.x86"=>12,
|
||||
"m3.large.opt-c2"=>3,
|
||||
"nvidia3.a30.medium"=>11,
|
||||
"purestorage"=>6,
|
||||
"a3.large.opt-s4a5n1.x86"=>17,
|
||||
"nvidia3.a30.large"=>3,
|
||||
"n3.xlarge.opt-m4"=>4,
|
||||
"storage.dell"=>14,
|
||||
"nvidia3.a40.medium"=>9,
|
||||
"w3amd.7402p.256.8160"=>1,
|
||||
"a4.lg"=>5,
|
||||
"a3.large.x86"=>1,
|
||||
"x.large.arm"=>1,
|
||||
"w3amd.75xx24c.256.4320"=>1,
|
||||
"npi.testing"=>1,
|
||||
"m3.large.opt-c2m4"=>1,
|
||||
"a3.large.opt-s4a1"=>1,
|
||||
"w3amd.75xx24c.256.8160"=>1,
|
||||
"c3.large.arm64"=>2}
|
||||
#+END_EXAMPLE
|
||||
|
||||
|
||||
** What hardware is missing plan information
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Affected hardware that has no plan attached.
# The trailing `nil` keeps the console from echoing the result.
missing_plan = affected_servers.reject { |server| server.plan.present? }; nil
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_EXAMPLE ruby
|
||||
irb(main):289:0> missing_plan.pluck(:id, :type, :state)
|
||||
=>
|
||||
[["2556229f-3da0-4056-96dc-ce820af30ba3", "Hardware::Server", "enrolled"],
|
||||
["4ca367f4-33c2-494f-8227-bed6c0d8bd8d", "Hardware::Server", "enrolled"],
|
||||
["8504ffdf-24d7-453f-9a49-94a7cba3f9ae", "Hardware::StorageAppliance", "enrolled"],
|
||||
["8b383a51-2a45-4d02-aafa-f31b159e31b6", "Hardware::Server", "enrolled"],
|
||||
["a20a6442-7185-4c49-bfcf-5359fe22cd9f", "Hardware::StorageAppliance", "enrolled"],
|
||||
["e2ff6fec-a70a-42e6-afb1-93f57c6a30f1", "Hardware::Server", "enrolled"],
|
||||
["f9670617-0cde-4db6-94de-d7ec495881e7", "Hardware::StorageAppliance", "enrolled"]]
|
||||
#+END_EXAMPLE
|
||||
|
||||
|
||||
I think it's safe to not worry about these because customers can't deploy them yet.
|
||||
|
||||
** What hardware plan versions don't have the required CPU information?
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Check whether the hardware's plan version specs describe CPU cores and count.
#
# Looks at the first entry of plan_version.specs["cpus"] and returns true only
# when that entry has both a "cores" and a "count" key. Returns false (instead
# of raising) when the plan version, its specs, the "cpus" list, or the first
# entry of that list is missing.
def valid_cpu_data?(hardware)
  cpus = hardware.plan_version&.specs&.dig("cpus")
  first_cpu = cpus.first if cpus.respond_to?(:first)
  # A nil or non-hash first entry cannot carry the required keys.
  return false unless first_cpu.respond_to?(:key?)

  %w[cores count].all? { |key| first_cpu.key?(key) }
end
|
||||
|
||||
# Tally, by plan version slug, the affected servers for which valid_cpu_data?
# is false. Hardware without a plan version is skipped.
# The trailing `nil` keeps the console from echoing the result.
affected_plan_versions = affected_servers.each_with_object(Hash.new(0)) do |server, counts|
  next unless server.plan_version.present?
  next if valid_cpu_data?(server)

  counts[server.plan_version.slug] += 1
end; nil
|
||||
#+END_SRC
|
||||
|
||||
|
||||
|
||||
** These are the ones that are currently not being billed properly
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
broken_billables = [
|
||||
"39b7f377-af6d-437b-a99b-10d9d4fd7b53",
|
||||
"d2deb4c8-446f-4679-a7f5-60edf7745e23",
|
||||
"e9c50e27-9f74-477b-9210-0e277537a336",
|
||||
"88a6bf4a-b63e-4c7e-8c20-5d5949ba62f9",
|
||||
"5b205c53-af64-421e-b2b6-39f5923d4f3f",
|
||||
"604c38d9-1f8c-4600-bb29-a0d5e1aa504a",
|
||||
"d4914c80-c657-4ff2-86a1-8f41d90af0a9",
|
||||
"f6f087f3-3e7c-457f-8943-a6864a8a0b97",
|
||||
"88d2e8ee-6ec1-450b-9982-63d8220a1011",
|
||||
"a47f38f9-c2ac-46ba-bb16-68e659b89183",
|
||||
"e47a3d2e-13a0-444a-8164-ebe54fbc43b1",
|
||||
"840ce4fd-a300-4a7b-96a3-140e0bf988b4",
|
||||
"68e0feb1-8146-4b08-a591-15806a0f61a0",
|
||||
"0e1ef1c6-2de7-40b3-91e0-44474f32fafb",
|
||||
"161d4f10-4362-4028-b237-b7649f87eb09"
|
||||
]
|
||||
#+END_SRC
|
||||
|
||||
|
||||
** Do these pieces of hardware have the information I need to fix the data?
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Check whether the hardware's plan version specs describe CPU cores and count.
#
# Looks at the first entry of plan_version.specs["cpus"] and returns true only
# when that entry has both a "cores" and a "count" key. Returns false (instead
# of raising) when the plan version, its specs, the "cpus" list, or the first
# entry of that list is missing.
def my_valid_cpu_data?(hardware)
  cpus = hardware.plan_version&.specs&.dig("cpus")
  first_cpu = cpus.first if cpus.respond_to?(:first)
  # A nil or non-hash first entry cannot carry the required keys.
  return false unless first_cpu.respond_to?(:key?)

  %w[cores count].all? { |key| first_cpu.key?(key) }
end
|
||||
|
||||
# Hardware from broken_hardware that has a plan version and passes my_valid_cpu_data?.
# The trailing `nil` keeps the console from echoing the result.
can_be_fixed = broken_hardware.select do |hw|
  hw.plan_version.present? && my_valid_cpu_data?(hw)
end; nil
|
||||
#+END_SRC
|
||||
|
||||
** Actually fix the components
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Build a ProcessorComponent for the hardware with id +h_id+.
#
# cpu_data is read with the string keys "name", "manufacturer", "model",
# "speed" and "cores". index is only used for the placeholder serial
# ("CPU<index>"). The component is returned as built; this method does not
# call save on it.
def create_processor_component(h_id, cpu_data, index)
  ProcessorComponent.new.tap do |component|
    component.name = cpu_data["name"]
    component.type = ProcessorComponent.to_s
    component.vendor = cpu_data["manufacturer"]
    component.model = cpu_data["model"]
    component.serial = "CPU#{index}"
    component.firmware_version = "N/A"
    component.data = { "clock" => cpu_data["speed"], "cores" => cpu_data["cores"] }
    component.hardware_id = h_id
  end
end
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# NOTE(review): unfinished draft. The loop below is never closed and the
# create_processor_component call on the last line is missing its arguments,
# so this snippet does not parse as written.
cant_fix = []
|
||||
finished = []
|
||||
broken_hardware.each_with_index do |h, i|
|
||||
# NOTE(review): this indexes plan_version["cpus"] directly, while the other
# snippets in this file read plan_version.specs["cpus"]. Confirm which is intended.
unless h.plan_version.present? && h.plan_version["cpus"].present?
|
||||
cant_fix << h
|
||||
next
|
||||
end
|
||||
|
||||
cpu_data = h.plan_version["cpus"][0]
|
||||
# NOTE(review): core_count is assigned the hardware itself; the expression looks incomplete.
core_count = h
|
||||
c = create_processor_component(h.id, ,
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
"04af7a5f-6330-4095-b525-ea8a596db035"
|
||||
"111fc3d1-7002-4c22-9d29-e2539c610bb1"
|
||||
"15a4071c-ddd9-4fc5-b9b9-35d5831a9de3"
|
||||
"19798268-39ca-454e-a7de-cab1a9cae4a5"
|
||||
"1df18ad3-3189-4b87-9654-7d9b062d553d"
|
||||
"20388df4-c645-445c-8563-114213c85604"
|
||||
"2cafd1cc-a6ba-4caf-849d-969ac22eddca"
|
||||
"2cc2596e-8045-49ea-8274-5b84e27a643c"
|
||||
"2d4941a3-f0ce-454c-b9dc-6f5bf3381519"
|
||||
"2e13125c-9794-4392-ab7c-0dbb10b3b4f7"
|
||||
"2e24c7dc-a219-45c2-ae79-1aa0eb367d56"
|
||||
"2ffa9123-6466-49a3-ac81-84a7e0dcb437"
|
||||
"35d423fa-e119-4c9b-8eed-9193a4037b18"
|
||||
"39888ace-88cb-49f8-8eef-f1ec14c36d2c"
|
||||
"4470e1bc-0c1e-47ac-99e0-8f23cc075228"
|
||||
"56c91002-4e8e-4ab8-b653-d8fb459ad186"
|
||||
"59daefde-f2c2-42c2-8bc9-90d5a00e98e9"
|
||||
"5bf121bf-1b11-429b-9f73-11206e9f438c"
|
||||
"5f81d1f6-9c7d-41b0-bb02-a4cb5b31b1ab"
|
||||
"613d4464-8c0b-44a8-8bcc-9ece50b17ce5"
|
||||
"62e344ed-2fe1-4778-92e8-0dd386cf0590"
|
||||
"630cf74d-d689-496c-b29f-5f094c4455d5"
|
||||
"649fa2b1-675c-4433-9256-e7632092ab8a"
|
||||
"66f1ef27-3310-40c3-8d06-6c889ddc1e15"
|
||||
"6ac54a10-c47d-446d-8ef5-d4131bdc746c"
|
||||
"6c7e5828-68fe-4114-a8e8-1e3ce9747de0"
|
||||
"773240e6-7f9b-472f-847e-0a9f914e4493"
|
||||
"77f9ba1e-bcd4-46c9-963a-b861fb573ab2"
|
||||
"7fe941fd-4533-411e-93ad-832632910cf2"
|
||||
"858a0e53-56ec-4b77-b852-8371f3ead1bd"
|
||||
"85b1ab1b-664b-4d0f-855e-30ccf7f16f50"
|
||||
"921d04e8-b7b8-4e13-a9f3-f55302d970c1"
|
||||
"9430eb5e-fbe2-48b0-b180-d94347a5f296"
|
||||
"a172606f-4d90-41b8-a1f1-0cd1b20aaa7f"
|
||||
"a8f5d150-0f5b-4a92-9583-0e70473a9b8b"
|
||||
"a96d685d-b16f-4852-84dd-dd3304b37471"
|
||||
"aa1f836f-5808-452a-a5bc-884acd3bcd90"
|
||||
"abc678fd-d92b-4fc9-ad46-bc6316c170c6"
|
||||
"afce0857-1016-4638-91bf-f67ee9ade423"
|
||||
"b37bae11-645f-45cf-b55e-20604b5f3030"
|
||||
"d263768d-c460-4bdd-81fa-c04fe80122cc"
|
||||
"d4849d7e-8b68-4f14-97f6-0682c20d4706"
|
||||
"d634a3b5-98ef-4eca-8fe3-3bc4903170c9"
|
||||
"dff6b6b3-d46e-47c8-8c85-e85f2566893b"
|
||||
"e248f2f0-b1dc-4e6e-b025-687ea375fe2d"
|
||||
"e9e17d57-f8dd-4f8a-b31b-6e33c8e25078"
|
||||
"fa97834e-d71e-4d8f-8fc0-2e8988a05a28"
|
||||
"fb6c21a5-8640-4e1e-af18-2790f3a79873"
|
||||
#+END_SRC
|
||||
|
||||
|
||||
1. LicenseActivationID
|
||||
2. Licensable (an Instance Model)
|
||||
3. PlanVersion
|
||||
4. CPU count and CPU cores
|
||||
5. Update License.data["cores"] = cpu_count * cpu_cores
|
||||
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# Recompute the core count stored on a license activation's license.
#
# Sets license.data["cores"] to count * cores, both taken from the first entry
# of the licensable's plan_version.specs["cpus"]. This doesn't save the
# things: the license is only changed in memory.
#
# Returns the modified license on success, or a String naming what was
# missing ("missing instance", "missing plan_version", "missing cpu_data",
# "missing license").
def fix_core_count_prime(license_activation)
  instance = license_activation.licensable
  return "missing instance" unless instance

  plan_version = instance.plan_version
  return "missing plan_version" unless plan_version

  # dig yields nil rather than raising when specs or the "cpus" list is absent,
  # so the guard below is reached in those cases too.
  cpu_data = plan_version.specs&.dig("cpus", 0)
  return "missing cpu_data" unless cpu_data && cpu_data["cores"] && cpu_data["count"]

  license = license_activation.license
  return "missing license" unless license

  license.data["cores"] = cpu_data["count"].to_i * cpu_data["cores"].to_i
  license
end
|
||||
|
||||
|
||||
# Compute the corrected license for every id in broken_license_activations
# from the third entry onward. Each element of res is either a License or a
# String describing why that activation could not be processed.
res = broken_license_activations[2..].map do |la_id|
  # find_by returns nil for an unknown id (find would raise instead), which
  # makes the guard below reachable.
  la = LicenseActivation.with_deleted.find_by(id: la_id)
  # `next` makes the message this element's value; `return` would try to leave
  # the enclosing scope instead of continuing the map.
  next "couldn't find la #{la_id}" unless la

  fix_core_count_prime(la)
end

# when I wanted to get the successful ones
res.select { |item| item.is_a?(License) }

# when I wanted to see what broke
res.reject { |item| item.is_a?(License) }
|
||||
|
||||
#+END_SRC
|
||||
|
||||
|
||||
** Are there any Windows licenses remaining with 0 cores that aren't erroring yet?
|
||||
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
# License activations whose licensee product slug contains "windows", with the
# license and licensee product eager-loaded.
activations = LicenseActivation.eager_load(:license).eager_load(:licensee_product).where("licensee_products.slug LIKE '%windows%'").all

# Keep only the activations whose license data reports 0 cores.
missing_cores = activations.select { |la| la.license.data["cores"] == 0 }

# Run fix_core_count_prime over each of them and collect what it returns.
fixed_licenses = missing_cores.map { |la| fix_core_count_prime(la) }
|
||||
|
||||
|
||||
#+END_SRC
|
||||
|
||||
#+BEGIN_SRC ruby
|
||||
irb(main):066:0> LicenseActivation.eager_load(:license).eager_load(:licensee_product).where("licensee_products.slug LIKE '%windows%'").where("(licenses.data->>'cores')::integer = 0").count
|
||||
=> 0
|
||||
irb(main):067:0> LicenseActivation.eager_load(:license).eager_load(:licensee_product).where("licensee_products.slug LIKE '%windows%'").where("(licenses.data->>'cores')::integer > 0").count
|
||||
=> 538
|
||||
#+END_SRC
|
||||
Reference in New Issue
Block a user