Skip to content

Commit b954038

Browse files
Ensure loop closes properly and provider update
* Ensure the pool loop exits properly when the pool is not yet in a running state. * Use ListInstances() when cleaning orphaned runners. This ensures We only run one API call per pool to list instances, instead of running a GetInstance() for each individual instance we are checking. Signed-off-by: Gabriel Adrian Samfira <[email protected]>
1 parent bf844a4 commit b954038

File tree

4 files changed

+113
-50
lines changed

4 files changed

+113
-50
lines changed

contrib/providers.d/azure/garm-external-provider

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ function CreateInstance() {
226226
OS_VERSION=$(echo "${IMAGE_URN}" | cut -d ':' -f3)
227227
ARCH="amd64"
228228

229-
TAGS="garm-controller-id=${GARM_CONTROLLER_ID} garm-pool-id=${GARM_POOL_ID}"
229+
TAGS="garm_controller_id=${GARM_CONTROLLER_ID} garm_pool_id=${GARM_POOL_ID} os_type=${OS_TYPE} os_name=${OS_NAME} os_version=${OS_VERSION} os_arch=${ARCH}"
230230

231231
set +e
232232

@@ -298,6 +298,37 @@ function StopServer() {
298298
az vm deallocate -g "${instance_id}" -n "${instance_id}" -o none --only-show-errors
299299
}
300300

301+
function GetInstance() {
302+
local instance_id="${GARM_INSTANCE_ID}"
303+
info=$(az vm show -d -n $instance_id -g $instance_id -o json --only-show-errors 2>&1)
304+
echo $info | jq -r '
305+
{
306+
provider_id: .name,
307+
name: .name,
308+
os_type: .tags.os_type,
309+
os_name: .tags.os_name,
310+
os_version: .tags.os_version,
311+
os_arch: .tags.os_arch,
312+
pool_id: .tags.garm_pool_id,
313+
status: {"VM starting": "pending_create", "VM running": "running", "VM stopping": "stopped", "VM stopped": "stopped", "VM deallocating": "stopped", "VM deallocated": "stopped"}[.powerState]
314+
}'
315+
}
316+
317+
function ListInstances() {
318+
INSTANCES=$(az vm list --query "[?tags.garm_pool_id == '${GARM_POOL_ID}']" -o json --only-show-errors 2>&1)
319+
echo $info | jq -r '
320+
.[] | {
321+
provider_id: .name,
322+
name: .name,
323+
os_type: .tags.os_type,
324+
os_name: .tags.os_name,
325+
os_version: .tags.os_version,
326+
os_arch: .tags.os_arch,
327+
pool_id: .tags.garm_pool_id,
328+
status: {"Creating": "pending_create", "Migrating": "pending_create", "Failed": "error", "Succeeded": "running", "Deleting": "pending_delete"}[.provisioningState]
329+
}'
330+
}
331+
301332
# Login to Azure
302333
checkValNotNull "${AZURE_SUBSCRIPTION_ID}" "AZURE_SUBSCRIPTION_ID"
303334
checkValNotNull "${AZURE_TENANT_ID}" "AZURE_TENANT_ID"
@@ -317,12 +348,10 @@ case "$GARM_COMMAND" in
317348
DeleteInstance
318349
;;
319350
"GetInstance")
320-
echo "GetInstance not implemented"
321-
exit 1
351+
GetInstance
322352
;;
323353
"ListInstances")
324-
echo "ListInstances not implemented"
325-
exit 1
354+
ListInstances
326355
;;
327356
"StartInstance")
328357
StartInstance

contrib/providers.d/openstack/garm-external-provider

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ function CreateInstance() {
279279
OS_TYPE=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_type')
280280
checkValNotNull "${OS_TYPE}" "os_type" || return $?
281281
DISTRO=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_distro')
282-
checkValNotNull "${OS_TYPE}" "os_distro" || return $?
282+
checkValNotNull "${DISTRO}" "os_distro" || return $?
283283
VERSION=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.os_version')
284284
checkValNotNull "${VERSION}" "os_version" || return $?
285285
ARCH=$(echo "${IMAGE_DETAILS}" | jq -c -r '.properties.architecture')
@@ -306,7 +306,8 @@ function CreateInstance() {
306306
set +e
307307

308308
TAGS="--tag garm-controller-id=${GARM_CONTROLLER_ID} --tag garm-pool-id=${GARM_POOL_ID}"
309-
SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}")
309+
PROPERTIES="--property os_type=${OS_TYPE} --property os_name=${DISTRO} --property os_version=${VERSION} --property os_arch=${GH_ARCH} --property pool_id=${GARM_POOL_ID}"
310+
SRV_DETAILS=$(openstack server create --os-compute-api-version 2.52 ${SOURCE_ARGS} ${TAGS} ${PROPERTIES} --flavor "${FLAVOR}" --user-data="${CC_FILE}" --network="${NET}" "${INSTANCE_NAME}")
310311
if [ $? -ne 0 ];then
311312
openstack volume delete "${INSTANCE_NAME}" || true
312313
exit 1
@@ -394,6 +395,25 @@ function StopServer() {
394395
openstack server stop "${instance_id}"
395396
}
396397

398+
function ListInstances() {
399+
INSTANCES=$(openstack server list --os-compute-api-version 2.52 --tags garm-pool-id=${GARM_POOL_ID} --long -f json)
400+
echo ${INSTANCES} | jq -r '
401+
.[] | .Properties * {
402+
provider_id: .ID,
403+
name: .Name,
404+
status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.Status]
405+
}'
406+
}
407+
408+
function GetInstance() {
409+
INSTANCE=$(openstack server show --os-compute-api-version 2.52 ${GARM_INSTANCE_ID} -f json)
410+
echo ${INSTANCES} | jq -r '.properties * {
411+
provider_id: .id,
412+
name: .name,
413+
status: {"ACTIVE": "running", "SHUTOFF": "stopped", "BUILD": "pending_create", "ERROR": "error", "DELETING": "pending_delete"}[.status]
414+
}'
415+
}
416+
397417
case "$GARM_COMMAND" in
398418
"CreateInstance")
399419
CreateInstance
@@ -402,12 +422,10 @@ case "$GARM_COMMAND" in
402422
DeleteInstance
403423
;;
404424
"GetInstance")
405-
echo "GetInstance not implemented"
406-
exit 1
425+
GetInstance
407426
;;
408427
"ListInstances")
409-
echo "ListInstances not implemented"
410-
exit 1
428+
ListInstances
411429
;;
412430
"StartInstance")
413431
StartInstance

doc/external_provider.md

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ Then you can easily parse it. If you're using ```bash```, you can use the amazin
215215

216216
You will have to parse the bootstrap params, verify that the requested image exists, gather operating system information, CPU architecture information and using that information, you will need to select the appropriate tools for the arch/OS combination you are deploying.
217217

218-
Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder.
218+
Refer to the OpenStack or Azure providers available in the [providers.d](../contrib/providers.d/) folder. Of particular interest are the [cloudconfig folders](../contrib/providers.d/openstack/cloudconfig/), where the instance user data templates are stored. These templates are used to generate the needed automation for the instances to download the github runner agent, send back status updates (including the final github runner agent ID), and download the github runner registration token from garm.
219219

220220
### CreateInstance outputs
221221

@@ -259,8 +259,6 @@ If the target instance does not exist in the provider, this command is expected
259259

260260
## GetInstance
261261

262-
NOTE: This operation is currently not use by ```garm```, but should be implemented.
263-
264262
The ```GetInstance``` command will return details about the instance, as seen by the provider.
265263

266264
Available environment variables:
@@ -275,8 +273,6 @@ On failure, this command is expected to return a non-zero exit code.
275273

276274
## ListInstances
277275

278-
NOTE: This operation is currently not use by ```garm```, but should be implemented.
279-
280276
The ```ListInstances``` command will print to standard output, a json that is unserializable into an **array** of ```Instance{}```.
281277

282278
Available environment variables:
@@ -293,9 +289,7 @@ On failure, a non-zero exit code is expected.
293289

294290
## RemoveAllInstances
295291

296-
NOTE: This operation is currently not use by ```garm```, but should be implemented.
297-
298-
The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```.
292+
The ```RemoveAllInstances``` operation will remove all resources created in a cloud that have been tagged with the ```GARM_CONTROLLER_ID```. External providers should tag all resources they create with the garm controller ID. That tag can then be used to identify all resources when attempting to delete all instances.
299293

300294
Available environment variables:
301295

runner/pool/pool.go

Lines changed: 53 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,21 @@ func (r *basePoolManager) reapTimedOutRunners(runners []*github.Runner) error {
178178
return nil
179179
}
180180

181+
func instanceInList(instanceName string, instances []params.Instance) bool {
182+
for _, val := range instances {
183+
if val.Name == instanceName {
184+
return true
185+
}
186+
}
187+
return false
188+
}
189+
181190
// cleanupOrphanedGithubRunners will forcefully remove any github runners that appear
182191
// as offline and for which we no longer have a local instance.
183192
// This may happen if someone manually deletes the instance in the provider. We need to
184193
// first remove the instance from github, and then from our database.
185194
func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner) error {
195+
poolInstanceCache := map[string][]params.Instance{}
186196
for _, runner := range runners {
187197
if !r.isManagedRunner(labelsFromRunner(runner)) {
188198
log.Printf("runner %s is not managed by a pool belonging to %s", *runner.Name, r.helper.String())
@@ -238,12 +248,17 @@ func (r *basePoolManager) cleanupOrphanedGithubRunners(runners []*github.Runner)
238248
return fmt.Errorf("unknown provider %s for pool %s", pool.ProviderName, pool.ID)
239249
}
240250

241-
// Check if the instance is still on the provider.
242-
_, err = provider.GetInstance(r.ctx, dbInstance.Name)
243-
if err != nil {
244-
if !errors.Is(err, runnerErrors.ErrNotFound) {
245-
return errors.Wrap(err, "fetching instance from provider")
251+
var poolInstances []params.Instance
252+
poolInstances, ok = poolInstanceCache[pool.ID]
253+
if !ok {
254+
poolInstances, err = provider.ListInstances(r.ctx, pool.ID)
255+
if err != nil {
256+
return errors.Wrapf(err, "fetching instances for pool %s", pool.ID)
246257
}
258+
poolInstanceCache[pool.ID] = poolInstances
259+
}
260+
261+
if !instanceInList(dbInstance.Name, poolInstances) {
247262
// The runner instance is no longer on the provider, and it appears offline in github.
248263
// It should be safe to force remove it.
249264
log.Printf("Runner instance for %s is no longer on the provider, removing from github", dbInstance.Name)
@@ -470,36 +485,43 @@ func (r *basePoolManager) loop() {
470485
return
471486
}
472487
default:
473-
log.Printf("attempting to start pool manager for %s", r.helper.String())
474-
tools, err := r.helper.FetchTools()
475-
var failureReason string
476-
if err != nil {
477-
failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err)
478-
r.setPoolRunningState(false, failureReason)
479-
log.Print(failureReason)
480-
if errors.Is(err, runnerErrors.ErrUnauthorized) {
481-
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
482-
} else {
483-
r.waitForTimeoutOrCanceled(60 * time.Second)
488+
select {
489+
case <-r.ctx.Done():
490+
// daemon is shutting down.
491+
return
492+
case <-r.quit:
493+
// this worker was stopped.
494+
return
495+
default:
496+
log.Printf("attempting to start pool manager for %s", r.helper.String())
497+
tools, err := r.helper.FetchTools()
498+
var failureReason string
499+
if err != nil {
500+
failureReason = fmt.Sprintf("failed to fetch tools from github for %s: %q", r.helper.String(), err)
501+
r.setPoolRunningState(false, failureReason)
502+
log.Print(failureReason)
503+
if errors.Is(err, runnerErrors.ErrUnauthorized) {
504+
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
505+
} else {
506+
r.waitForTimeoutOrCanceled(60 * time.Second)
507+
}
508+
continue
484509
}
485-
continue
486-
}
487-
r.mux.Lock()
488-
r.tools = tools
489-
r.mux.Unlock()
510+
r.mux.Lock()
511+
r.tools = tools
512+
r.mux.Unlock()
490513

491-
if err := r.runnerCleanup(); err != nil {
492-
failureReason = fmt.Sprintf("failed to clean runners for %s: %q", r.helper.String(), err)
493-
r.setPoolRunningState(false, failureReason)
494-
log.Print(failureReason)
495-
if errors.Is(err, runnerErrors.ErrUnauthorized) {
496-
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
497-
} else {
498-
r.waitForTimeoutOrCanceled(60 * time.Second)
514+
if err := r.runnerCleanup(); err != nil {
515+
failureReason = fmt.Sprintf("failed to clean runners for %s: %q", r.helper.String(), err)
516+
log.Print(failureReason)
517+
if errors.Is(err, runnerErrors.ErrUnauthorized) {
518+
r.setPoolRunningState(false, failureReason)
519+
r.waitForTimeoutOrCanceled(common.UnauthorizedBackoffTimer)
520+
}
521+
continue
499522
}
500-
continue
523+
r.setPoolRunningState(true, "")
501524
}
502-
r.setPoolRunningState(true, "")
503525
}
504526
}
505527
}

0 commit comments

Comments
 (0)