diff --git a/2023.org b/2023.org
new file mode 100644
index 0000000..79069b8
--- /dev/null
+++ b/2023.org
@@ -0,0 +1,63 @@
+#+TITLE: Year in review
+#+AUTHOR: Adam Mohammed
+
+
+* January
+- Set up environments for platform to test auth0 changes against portal
+- Created a golang library to make it easier to build algolia indexes
+  in our applications. Used by bouncer and quantum to provide nice searchable
+  interfaces on our frontends.
+- Implemented the initial OIDC endpoints for identity-api in LBaaS
+
+* February
+- Wrote helm charts for identity-api
+- Bootstrapped initial identity-api deployment
+- Discussed token format for identity-api
+- Added algolia indexing to quantum resources
+
+* March
+- Drafted plan for upgrading the monolith from Rails 5 to Rails 6 and Ruby 2 to Ruby 3.
+- Implemented extra o11y where we needed it for the upgrade
+- Used a gradual rollout strategy to build confidence
+- Upgraded CRDB and documented the process
+
+* April
+- Added testing to exoskeleton, some gin tooling we use for Go services
+
+* May
+- Started work on the ResourceOwnerDirectory
+- Maintenance on exoskeleton
+
+* June
+- More ROD work
+- Ruby 3 upgrade
+- Added service-to-service clients for coupon
+- Testing LBaaS with decuddle
+- Added events to the API
+
+* July
+- Deployed Resource Owner Directory
+
+* August
+- Prepared for LBaaS launch
+
+* September
+- Implemented queue scheduler
+
+
+* Talks:
+- Session Scheduler
+- Static analysis on Ruby
+- API auth discussion on using identity-api
+- API monitoring by thinking about what we actually deliver
+- Deep dive into caching issues from #_incent-1564
+- Recorded deployment and monitoring of API
+- Monitoring strategy for the API Rails/Ruby upgrades
+- CRDB performance troubleshooting
+
+* Docs:
diff --git a/fleeting/open-stack-bm.org b/fleeting/open-stack-bm.org
index b34a6f2..6e55099 100644
--- a/fleeting/open-stack-bm.org
+++ b/fleeting/open-stack-bm.org
@@ -9,3 +9,5 @@ provision/deploy/clean
 The "conductor" calls out to driver for vendor specific implementations
 of the above functions. A python ramdisk is used to provide control on
 the target machine.
+
+ref: https://docs.openstack.org/ironic/2023.1/install/get_started.html
diff --git a/lbaas-testing.org b/lbaas-testing.org
new file mode 100644
index 0000000..9e04be7
--- /dev/null
+++ b/lbaas-testing.org
@@ -0,0 +1,181 @@
+#+TITLE: LBaaS Testing
+#+AUTHOR: Adam Mohammed
+#+DATE: August 30, 2023
+
+* API Testing
+:PROPERTIES:
+:header-args:shell: :session *bash3*
+:header-args: :results output verbatim
+:END:
+
+#+begin_src shell
+  PS1="> "
+  export PAPI_KEY="my-user-api-key"
+  export PROJECT_ID=7c0d4b1d-4f21-4657-96d4-afe6236e361e
+#+end_src
+
+
+First, let's exchange our user's API key for an infratographer JWT.
+#+begin_src shell
+  export INFRA_TOK=$(curl -s -X POST -H"authorization: Bearer $PAPI_KEY" https://iam.metalctrl.io/api-keys/exchange | jq -M -r '.access_token' )
+#+end_src
+
+#+RESULTS:
+
+
+If all went well, you should see a JSON object containing the =loadbalancers= key from this block.
+#+begin_src shell
+  curl -s -H"Authorization: Bearer $INFRA_TOK" https://lb.metalctrl.io/v1/projects/${PROJECT_ID}/loadbalancers | jq -M
+#+end_src
+
+#+RESULTS:
+#+begin_example
+{
+  "loadbalancers": [
+    {
+      "created_at": "2023-08-30T18:26:19.534351Z",
+      "id": "loadbal-9OhCaBNHUXo_f-gC7YKzW",
+      "ips": [],
+      "name": "test-graphql",
+      "ports": [
+        {
+          "id": "loadprt-8fN2XRnwY8C0SGs_T-zhp",
+          "name": "public-http",
+          "number": 8080
+        }
+      ],
+      "updated_at": "2023-08-30T18:26:19.534351Z"
+    },
+    {
+      "created_at": "2023-08-30T19:55:42.944273Z",
+      "id": "loadbal-pLdVJLcAa3UdbPEmGWwvB",
+      "ips": [],
+      "name": "test-graphql",
+      "ports": [
+        {
+          "id": "loadprt-N8xRozMbxZwtG2yAPk7Wx",
+          "name": "public-http",
+          "number": 8080
+        }
+      ],
+      "updated_at": "2023-08-30T19:55:42.944273Z"
+    }
+  ]
+}
+#+end_example
+
+
+** Creating an LB
+
+Here we'll create an empty LB with our newly exchanged token.
+#+begin_src shell
+  curl -s \
+    -H"Authorization: Bearer $INFRA_TOK" \
+    -H"content-type: application/json" \
+    -d '{"name": "test-graphql", "location_id": "metlloc-da", "provider_id":"loadpvd-gOB_-byp5ebFo7A3LHv2B"}' \
+    https://lb.metalctrl.io/v1/projects/${PROJECT_ID}/loadbalancers | jq -M
+#+end_src
+
+#+RESULTS:
+:
+: > > > {
+:   "errors": null,
+:   "id": "loadbal-ygZi9cUywLk5_oAoLGMxh"
+: }
+
+
+All we have is an ID now, but eventually we should get an IP back.
+#+begin_src shell
+  RES=$(curl -s \
+    -H"Authorization: Bearer $INFRA_TOK" \
+    https://lb.metalctrl.io/v1/projects/${PROJECT_ID}/loadbalancers | tee )
+  export LOADBALANCER_ID=$(echo $RES | jq -r '.loadbalancers | sort_by(.created_at) | reverse | .[0].id' )
+  echo $LOADBALANCER_ID
+#+end_src
+
+#+RESULTS:
+:
+: > > > loadbal-ygZi9cUywLk5_oAoLGMxh
+
+
+** Create the backends
+
+The load balancer requires a pool with an associated origin.
+
+#+begin_src shell
+  export POOL_ID=$(curl -s -H"Authorization: Bearer $INFRA_TOK" \
+    -H"content-type: application/json" \
+    -d '{"name": "pool9", "protocol": "tcp"}' \
+    https://lb.metalctrl.io/v1/projects/${PROJECT_ID}/loadbalancers/pools | jq -r '.id')
+  echo $POOL_ID
+#+end_src
+
+#+RESULTS:
+:
+: > > > loadpol-hC_UY3Woqjfyfw1Tzr5R2
+
+
+Let's create an origin that points to =icanhazip.com= so we can see how we're proxying.
+
+#+begin_src shell
+  export TARGET_IP=$(dig +short icanhazip.com | head -1)
+  data=$(jq -M -c -n --arg port_id $POOL_ID --arg target_ip "$TARGET_IP" '{"name": "icanhazip9", "target": $target_ip, "port_id": $port_id, "port_number": 80, "active": true}' | tee )
+  curl -s \
+    -H"Authorization: Bearer $INFRA_TOK" \
+    -H"content-type: application/json" \
+    -d "$data" \
+    https://lb.metalctrl.io/v1/loadbalancers/pools/${POOL_ID}/origins | jq -M
+#+end_src
+
+#+RESULTS:
+:
+: > > > > > {
+:   "errors": null,
+:   "id": "loadogn-zfbMfqtFKeQ75Tul52h4Q"
+: }
+
+
+Next, expose the pool on the load balancer by creating a port.
+
+#+begin_src shell
+  curl -s \
+    -H"Authorization: Bearer $INFRA_TOK" \
+    -H"content-type: application/json" \
+    -d "$(jq -n -M -c --arg pool_id $POOL_ID '{"name": "public-http", "number": 8080, "pool_ids": [$pool_id]}')" \
+    https://lb.metalctrl.io/v1/loadbalancers/${LOADBALANCER_ID}/ports | jq -M
+#+end_src
+
+#+RESULTS:
+:
+: > > > {
+:   "errors": null,
+:   "id": "loadprt-IVrZB1sLUfKqdnDULd6Ix"
+: }
+
+** Let's try out the LB now
+
+#+begin_src shell
+  curl -s \
+    -H"Authorization: Bearer $INFRA_TOK" \
+    -H"content-type: application/json" \
+    https://lb.metalctrl.io/v1/loadbalancers/${LOADBALANCER_ID} | jq -M
+
+#+end_src
+
+#+RESULTS:
+#+begin_example
+
+> > {
+  "created_at": "2023-08-30T20:10:59.389392Z",
+  "id": "loadbal-ygZi9cUywLk5_oAoLGMxh",
+  "ips": [],
+  "name": "test-graphql",
+  "ports": [
+    {
+      "id": "loadprt-IVrZB1sLUfKqdnDULd6Ix",
+      "name": "public-http",
+      "number": 8080
+    }
+  ],
+  "provider": null,
+  "updated_at": "2023-08-30T20:10:59.389392Z"
+}
+#+end_example
diff --git a/literature-notes/OpenStackBareMetal.org b/literature-notes/OpenStackBareMetal.org
new file mode 100644
index 0000000..e61d341
--- /dev/null
+++ b/literature-notes/OpenStackBareMetal.org
@@ -0,0 +1,10 @@
+#+TITLE: OpenStack: BareMetal
+
+OpenStack has a collection of services named "Ironic" that helps
+manage bare metal servers. The architecture is broken down by levels
+of abstraction. The highest abstraction is the API, which makes
+controlling different hardware seem the same. Below the API is a basic
+driver layer, which calls out to plugins to perform actions specific
+to a vendor/hardware type.
+
+ref: https://docs.openstack.org/ironic/2023.1/install/get_started.html
diff --git a/session_scheduler.org b/session_scheduler.org
new file mode 100644
index 0000000..09050ca
--- /dev/null
+++ b/session_scheduler.org
@@ -0,0 +1,166 @@
+#+TITLE: Session Scheduler
+
+* Overview
+
+For some API requests, the time it would take to serve the request is
+too long for a typical HTTP call. We use ActiveJob from Rails to
+handle these types of background jobs. Typically, instead of servicing
+the whole request before responding back to the client, we'll just
+create a new job and then immediately return.
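+
+As a rough sketch (the controller and job names here are hypothetical,
+not our real ones), that enqueue-and-return pattern looks like this:
+
+#+begin_src ruby
+  class PortVlanAssignmentsController < ApplicationController
+    def create
+      # Validate and record the request, but don't do the slow work here.
+      assignment = PortVlanAssignment.create!(assignment_params)
+
+      # Enqueue the job and return immediately; a worker picks it up later.
+      AssignVlanJob.perform_later(assignment)
+
+      # 202 Accepted: the request was taken, but the work isn't done yet.
+      head :accepted
+    end
+  end
+#+end_src
+
+The client gets an immediate response, and the slow work happens in
+the job.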
+
+Sometimes we have jobs that need to be processed in a specific order,
+and this is where the session scheduler comes in. It manages a number
+of queues for workloads, and assigns each job to a queue dynamically.
+
+This document talks about what kinds of problems the scheduler is
+meant for, how it is implemented, and how you can use it.
+
+* Ordering problems
+
+Often in these background jobs, there are ordering constraints between
+the jobs. In some networking APIs, for example, things must happen in
+a certain order to achieve the desired state.
+
+The simplest example of this is assigning and unassigning a VLAN on a
+port. You can quickly make these calls to the API in succession, but
+it may take some time for the actual state of the switch to be
+updated. If these jobs are processed in parallel, the order in which
+they finish determines the final state of the port.
+
+If the unassign finishes first, then the final state the user will see
+is that the port is assigned to the VLAN. Otherwise, it'll end up in
+the state without a VLAN assigned.
+
+The best we can do here is assume that we received the requests in the
+order the customer wanted the operations to occur. So, if the assign
+came in first, we must finish that job before processing the unassign.
+
+Our API workers that serve the background jobs currently fetch and
+process jobs as fast as they can, with no respect to ordering. When
+ordering is not important, this method works to process jobs quickly.
+
+With our networking example though, it leads to behavior that's hard
+to predict on the customer's end.
+
+* Constraints
+
+We have a few constraints for a solution to the ordering problem,
+using the VLANs as an example:
+- Order of jobs must be respected within a project, but total ordering
+  is not important (e.g. Project A's tasks don't need to be ordered
+  with respect to Project B's tasks)
+- Dynamically spinning up consumers and queues isn't the most fun
+  thing in Ruby, but having access to the monolith data is required at
+  this point in time.
+- We need a way to map an arbitrary number of projects down to a fixed
+  set of consumers.
+- Although total ordering doesn't matter, we do want to be somewhat
+  fair.
+
+
+Let's clarify some terms:
+
+- Total ordering - All events occur in a specific order (A1 -> B1 ->
+  A2 -> C1 -> B2 -> C2 -> B3)
+- Partial ordering - Some events must occur before others, but the
+  combinations are otherwise free (e.g. A1 must occur before A2, which
+  must occur before A3, but [A1,A2,A3] has no relation to B1).
+- Correctness - Job ordering constraints are honored.
+- Fairness - If there are jobs A1, A2...An and jobs B1, B2...Bn, both
+  sets are able to get serviced in some reasonable amount of time.
+
+
+* Session scheduler
+
+
+** Queueing and Processing Jobs In Order
+
+For some requests in the Metal API, we aren't able to fully service
+the request within the span of an HTTP request/response. Some things
+might take several seconds to minutes to complete. We rely on Rails
+ActiveJob to handle these as background jobs. ActiveJob lets us
+specify a queue name, which until now has been a static name such as
+"network".
+
+The API runs a number of workers that are listening on these queues
+with multiple threads, so we can pick up and service the jobs quickly.
+
+This breaks down when we require some jobs to be processed serially or
+in a specific order. This is where the =Worker::SessionScheduler=
+comes in. This scheduler dynamically assigns the queue name for a job
+so that it is processed in order with other related jobs.
+
+
+A typical Rails job looks something like this:
+
+#+begin_src ruby
+  class MyJob < ApplicationJob #1
+    queue_as :network #2
+
+    def perform #3
+      # do stuff
+    end
+  end
+#+end_src
+
+
+1. The name of the job is =MyJob=
+2. The queue that the job will wait in before getting picked up
+3. =perform= is the work the consumer will do when it picks up the job
+
+Typically, we'll queue a job to be performed later within the span of
+an HTTP request by calling something like =MyJob.perform_later=. This
+puts the job on the =network= queue, and the next available worker
+will pull the job off of the queue and process it.
+
+In the case where we need jobs to be processed in a certain order, it
+might look like this:
+
+#+begin_src ruby
+  class MyJob < ApplicationJob
+    queue_as do #2
+      project = self.arguments.first
+      Worker::SessionScheduler.call(session_key: project.id)
+    end
+
+    def perform(project)
+      # do stuff
+    end
+  end
+#+end_src
+
+Now, instead of the queue (=#2= above) being a static name, it's
+dynamically assigned based on what the scheduler decides.
+
+The scheduler uses the "session key" to see if there are any other
+jobs queued with the same key; if there are, the job is sent to the
+same queue.
+
+If there aren't, the job is sent to the queue with the fewest jobs
+waiting to be processed, and any subsequent requests with the same
+"session key" will follow.
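+
+In pseudocode, the assignment rule looks roughly like this (a
+self-contained sketch, not the real implementation; the real scheduler
+tracks sessions and queue depths in a shared store rather than
+in-process memory, and also cleans sessions up):
+
+#+begin_src ruby
+  module Worker
+    class SessionScheduler
+      QUEUES = %w[session_queue_1 session_queue_2 session_queue_3].freeze
+
+      # In-memory stand-ins for whatever shared store tracks active
+      # sessions and the number of jobs waiting on each queue.
+      @sessions = {}          # session_key => assigned queue name
+      @depths   = Hash.new(0) # queue name => jobs waiting
+
+      class << self
+        def call(session_key:)
+          # Same session key => same queue, so related jobs stay ordered.
+          # New session key => the queue with the fewest waiting jobs.
+          queue = @sessions[session_key] ||= QUEUES.min_by { |q| @depths[q] }
+          @depths[queue] += 1
+          queue
+        end
+      end
+    end
+  end
+#+end_src
+
+Pinning a session key to one queue preserves correctness, and sending
+new sessions to the least-loaded queue gives us the fairness described
+above.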
+
+Just putting jobs in the same queue isn't enough though, because if we
+process the jobs from a queue in parallel, we can still end up with
+jobs completing out of order. We have queues designated to serve this
+purpose of processing things in order. We're currently leveraging a
+RabbitMQ queue feature (single active consumer) that guarantees only
+one consumer is ever getting the jobs to process. We also rely on that
+consumer being configured with a single thread, so we don't process
+jobs out of order.
+
+This can be used for any set of jobs that needs to be ordered, though
+currently we're just using it for Port VLAN management. If you do
+decide to use this, you need to make sure that all the related jobs
+share some attribute, so you can use it as your "session key" when
+calling into the scheduling service.
+
+The scheduler takes care of the details of managing the queues: once
+all the jobs for a session are completed, that session is removed, and
+the next time the same key comes in it gets reallocated to the best
+worker. This allows us to rebalance the queues over time, so we
+prevent customers from having longer wait times despite us doing
+things serially.
diff --git a/slipbox/A.org b/slipbox/A.org
new file mode 100644
index 0000000..4140c88
--- /dev/null
+++ b/slipbox/A.org
@@ -0,0 +1,8 @@
+#+TITLE: NanoMetal
+#+AUTHOR: Adam Mohammed
+
+NanoMetal intends to be an easily deployable suite of services that
+helps you manage your hardware as if it were in a datacenter
+controlled by Equinix Metal. This means that you can use the existing
+APIs to manage your machines, provisioning and deprovisioning on the
+fly, with easy out-of-band access for when things go wrong.
diff --git a/slipbox/B.org b/slipbox/B.org
new file mode 100644
index 0000000..0c72843
--- /dev/null
+++ b/slipbox/B.org
@@ -0,0 +1,11 @@
+#+TITLE: Levels of abstraction to manage hardware differences
+
+OpenStack and Equinix Metal both use multiple layers of abstraction to
+make managing bare metal servers look simple. On the surface you get
+access to basic operations: provision/deprovision, rescue, power,
+boot. Each of these ends up having to do specific things depending on
+the hardware vendor or OS.
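+
+As a sketch of the idea (hypothetical names, not Ironic's or Metal's
+actual interfaces), the API layer programs against one interface and
+each vendor supplies its own implementation:
+
+#+begin_src ruby
+  # The surface every caller sees: the same basic operations everywhere.
+  class BareMetalDriver
+    def provision(node)    = raise NotImplementedError
+    def deprovision(node)  = raise NotImplementedError
+    def power(node, state) = raise NotImplementedError
+    def boot(node, device) = raise NotImplementedError
+  end
+
+  # Vendor-specific subclasses hide the differences below the surface.
+  class IpmiDriver < BareMetalDriver
+    def power(node, state)
+      # e.g. shell out to ipmitool for this vendor's BMC
+    end
+  end
+#+end_src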
+
+TAGS: BareMetal Management
+REF: [[file:~/org-notes/literature-notes/OpenStackBareMetal.org][OpenStack_Ironic]]
diff --git a/slipbox/C.org b/slipbox/C.org
new file mode 100644
index 0000000..8b4df96
--- /dev/null
+++ b/slipbox/C.org
@@ -0,0 +1,10 @@
+#+TITLE: RAMDisk to provide initial machine control
+
+OpenStack's ironic-conductor is the RPC service that performs actions
+on bare metal. In order for it to provide this functionality, the
+conductor relies on drivers, which handle the hardware-specific
+implementation details, and an on-machine agent. The on-machine agent
+is loaded as a ramdisk and gives the conductor the ability to perform
+a set of actions on the machine.
+
+TAGS: BareMetal Provisioning