Skip to content

Commit 9dfed59

Browse files
authored
Safely handle CHAP rotation for stale portals during iSCSI self-healing
This commit changes iSCSI self-healing to never logout of stale non-CHAP portals. For stale CHAP portals, logouts are now only performed when the the portal is accessible.
1 parent 237e675 commit 9dfed59

File tree

10 files changed

+675
-101
lines changed

10 files changed

+675
-101
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
- **Kubernetes:** Fixed backend config credentials to support all available AWS ARN partitions (Issue [#913](https://github.com/NetApp/trident/issues/913)).
1313
- **Kubernetes:** Added option to disable the auto configurator reconciliation in the Trident operator (Issue [#924](https://github.com/NetApp/trident/issues/924)).
1414
- **Kubernetes:** Added securityContext for csi-resizer container (Issue [#976](https://github.com/NetApp/trident/issues/976)).
15+
- **Kubernetes:** Fixed issue where iSCSI self-healing may unsafely log out of stale iSCSI sessions when a portal is inaccessible (Issue [#961](https://github.com/NetApp/trident/issues/961)).
1516
- Fixed Zonal Flex pools for GCNV driver.
1617

1718
**Enhancements:**

frontend/csi/node_server.go

+92-6
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ const (
5050
iSCSINodeUnstageMaxDuration = 15 * time.Second
5151
iSCSILoginTimeout = 10 * time.Second
5252
iSCSISelfHealingLockContext = "ISCSISelfHealingThread"
53+
iSCSISelfHealingTimeout = 90 * time.Second
5354
fcpNodeUnstageMaxDuration = 15 * time.Second
5455
nvmeSelfHealingLockContext = "NVMeSelfHealingThread"
5556
defaultNodeReconciliationPeriod = 1 * time.Minute
@@ -1845,6 +1846,17 @@ func (p *Plugin) ensureAttachISCSIVolume(
18451846
return mpathSize, nil
18461847
}
18471848

1849+
// getChapInfoFromController attempts to gather CHAP info from the controller in a timebound manner.
1850+
func (p *Plugin) getChapInfoFromController(
1851+
ctx context.Context, volumeID, nodeName string,
1852+
) (*models.IscsiChapInfo, error) {
1853+
chapInfo, err := p.restClient.GetChap(ctx, volumeID, nodeName)
1854+
if err != nil {
1855+
return nil, fmt.Errorf("failed to get CHAP info from CSI controller for volume: '%s'; %w", volumeID, err)
1856+
}
1857+
return chapInfo, nil
1858+
}
1859+
18481860
func (p *Plugin) updateChapInfoFromController(
18491861
ctx context.Context, req *csi.NodeStageVolumeRequest, publishInfo *models.VolumePublishInfo,
18501862
) error {
@@ -2480,11 +2492,17 @@ func (p *Plugin) selfHealingRectifySession(ctx context.Context, portal string, a
24802492

24812493
switch action {
24822494
case models.LogoutLoginScan:
2495+
// Do not log out if the portal is unresponsive as subsequent logins are likely to fail.
2496+
if isAccessible, err := p.iscsi.IsPortalAccessible(ctx, portal); !isAccessible {
2497+
Logc(ctx).WithError(err).Warnf("Cannot safely log out of unresponsive portal '%s'.", portal)
2498+
return fmt.Errorf("cannot safely log out of unresponsive portal '%s'", portal)
2499+
}
2500+
24832501
if err = p.iscsi.Logout(ctx, publishInfo.IscsiTargetIQN, portal); err != nil {
24842502
return fmt.Errorf("error while logging out of target %s", publishInfo.IscsiTargetIQN)
2485-
} else {
2486-
Logc(ctx).Debug("Logout is successful.")
24872503
}
2504+
Logc(ctx).Debug("Logout is successful.")
2505+
24882506
// Logout is successful, fallthrough to perform login.
24892507
fallthrough
24902508
case models.LoginScan:
@@ -2508,12 +2526,17 @@ func (p *Plugin) selfHealingRectifySession(ctx context.Context, portal string, a
25082526
}
25092527

25102528
if publishedCHAPCredentials != publishInfo.IscsiChapInfo {
2529+
fields := LogFields{
2530+
"portal": portal,
2531+
"CHAPInUse": true,
2532+
}
2533+
2534+
// Update the CHAP info in the portal. This should never fail.
25112535
updateErr := publishedISCSISessions.UpdateCHAPForPortal(portal, publishInfo.IscsiChapInfo)
25122536
if updateErr != nil {
2513-
Logc(ctx).Warn("Failed to update published CHAP information.")
2537+
Logc(ctx).WithFields(fields).Warn("Failed to update published CHAP information.")
25142538
}
2515-
2516-
Logc(ctx).Debug("Updated published CHAP information.")
2539+
Logc(ctx).WithFields(fields).Debug("Updated published CHAP information after successful login.")
25172540
}
25182541

25192542
Logc(ctx).Debug("Login to target is successful.")
@@ -2566,6 +2589,64 @@ func (p *Plugin) deprecatedIgroupInUse(ctx context.Context) bool {
25662589
return false
25672590
}
25682591

2592+
// updateCHAPInfoForSessions provides a best attempt to populate up-to-date CHAP credentials within iSCSI self-healing's
2593+
// published sessions. It tracks CHAP credentials by unique IQN to reduce the number of calls to the controller.
2594+
func (p *Plugin) updateCHAPInfoForSessions(
2595+
ctx context.Context, publishedSessions, currentSessions *models.ISCSISessions,
2596+
) error {
2597+
if publishedSessions == nil || currentSessions == nil {
2598+
return nil
2599+
}
2600+
2601+
// Timebox this operation to prevent it from running indefinitely and blocking everything in self-healing and
2602+
// other operations in the node server.
2603+
cancelCtx, cancel := context.WithTimeout(ctx, iSCSISelfHealingTimeout/3)
2604+
defer cancel()
2605+
2606+
// Store the CHAP credentials we've found for certain IQNs.
2607+
// IQNs should be unique between SVMs and CHAP credentials are scoped at the SVM level.
2608+
iqnToCHAP := make(map[string]*models.IscsiChapInfo, 0)
2609+
var errs error
2610+
2611+
for portal, publishedData := range publishedSessions.Info {
2612+
// If the session isn't stale on the host, then we can ignore this portal for now.
2613+
data, ok := currentSessions.Info[portal]
2614+
if ok && !p.iscsi.IsSessionStale(cancelCtx, data.PortalInfo.SessionNumber) {
2615+
continue
2616+
} else if !publishedData.PortalInfo.CHAPInUse() || !publishedData.PortalInfo.HasTargetIQN() {
2617+
continue
2618+
}
2619+
2620+
chapInfo, ok := iqnToCHAP[publishedData.PortalInfo.ISCSITargetIQN]
2621+
if !ok {
2622+
volumeID, err := publishedSessions.VolumeIDForPortal(portal)
2623+
if err != nil {
2624+
errs = errors.Join(errs, fmt.Errorf("failed to get volume ID for portal: '%s'; %w", portal, err))
2625+
continue
2626+
}
2627+
2628+
chapInfo, err = p.getChapInfoFromController(cancelCtx, volumeID, p.nodeName)
2629+
if err != nil {
2630+
errs = errors.Join(errs, fmt.Errorf("failed to get CHAP info for portal: '%s'; %w", portal, err))
2631+
continue
2632+
}
2633+
2634+
// Store what we've learned for future iterations.
2635+
iqnToCHAP[publishedData.PortalInfo.ISCSITargetIQN] = chapInfo
2636+
}
2637+
2638+
publishedData.PortalInfo.UpdateCHAPCredentials(*chapInfo)
2639+
Logc(cancelCtx).WithField("portal", portal).Debug("Updated CHAP info for portal.")
2640+
}
2641+
2642+
if errs != nil {
2643+
Logc(cancelCtx).WithError(errs).Error("Failed to get updated CHAP info for portal(s).")
2644+
return errs
2645+
}
2646+
2647+
return nil
2648+
}
2649+
25692650
// performISCSISelfHealing inspects the desired state of the iSCSI sessions with the current state and accordingly
25702651
// identifies candidate sessions that require remediation. This function is invoked periodically.
25712652
func (p *Plugin) performISCSISelfHealing(ctx context.Context) {
@@ -2580,7 +2661,7 @@ func (p *Plugin) performISCSISelfHealing(ctx context.Context) {
25802661
}()
25812662

25822663
// After this time self-healing may be stopped
2583-
stopSelfHealingAt := time.Now().Add(60 * time.Second)
2664+
stopSelfHealingAt := time.Now().Add(iSCSISelfHealingTimeout)
25842665

25852666
// If there are not iSCSI volumes expected on the host skip self-healing
25862667
if publishedISCSISessions.IsEmpty() {
@@ -2610,6 +2691,11 @@ func (p *Plugin) performISCSISelfHealing(ctx context.Context) {
26102691
Logc(ctx).Debug("No iSCSI sessions LUN mappings found.")
26112692
}
26122693

2694+
// Update CHAP info for published sessions.
2695+
if err := p.updateCHAPInfoForSessions(ctx, &publishedISCSISessions, &currentISCSISessions); err != nil {
2696+
Logc(ctx).WithError(err).Error("Failed to update CHAP credentials for published iSCSI sessions.")
2697+
}
2698+
26132699
Logc(ctx).Debugf("Published iSCSI Sessions: %v", publishedISCSISessions)
26142700
Logc(ctx).Debugf("Current iSCSI Sessions: %v", currentISCSISessions)
26152701

frontend/csi/node_server_test.go

+208
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,91 @@ func TestNodeStageISCSIVolume(t *testing.T) {
606606
}
607607
}
608608

609+
func TestGetChapInfoFromController(t *testing.T) {
610+
type args struct {
611+
ctx context.Context
612+
volume string
613+
node string
614+
}
615+
616+
type data struct {
617+
chapInfo *models.IscsiChapInfo
618+
}
619+
620+
type assertions struct {
621+
Error assert.ErrorAssertionFunc
622+
Empty assert.ValueAssertionFunc
623+
Equal assert.ComparisonAssertionFunc
624+
}
625+
626+
tt := map[string]struct {
627+
args args
628+
data data
629+
assert assertions
630+
mocks func(mockAPI *mockControllerAPI.MockTridentController, args args, data data)
631+
}{
632+
"Successfully gets CHAP credentials": {
633+
args: args{
634+
ctx: context.Background(),
635+
volume: "foo",
636+
node: "bar",
637+
},
638+
data: data{
639+
chapInfo: &models.IscsiChapInfo{
640+
UseCHAP: true,
641+
IscsiUsername: "user",
642+
IscsiInitiatorSecret: "pass",
643+
IscsiTargetUsername: "user2",
644+
IscsiTargetSecret: "pass2",
645+
},
646+
},
647+
assert: assertions{
648+
Error: assert.NoError,
649+
Empty: assert.NotEmpty,
650+
Equal: assert.Equal,
651+
},
652+
mocks: func(mockAPI *mockControllerAPI.MockTridentController, args args, data data) {
653+
mockAPI.EXPECT().GetChap(args.ctx, args.volume, args.node).Return(data.chapInfo, nil)
654+
},
655+
},
656+
"Fails to get CHAP credentials": {
657+
args: args{
658+
ctx: context.Background(),
659+
volume: "foo",
660+
node: "bar",
661+
},
662+
data: data{
663+
chapInfo: nil,
664+
},
665+
assert: assertions{
666+
Error: assert.Error,
667+
Empty: assert.Empty,
668+
Equal: assert.Equal,
669+
},
670+
mocks: func(mockAPI *mockControllerAPI.MockTridentController, args args, data data) {
671+
mockAPI.EXPECT().GetChap(args.ctx, args.volume, args.node).Return(data.chapInfo, errors.New("api error"))
672+
},
673+
},
674+
}
675+
676+
for name, test := range tt {
677+
t.Run(name, func(t *testing.T) {
678+
mockCtrl := gomock.NewController(t)
679+
mockAPI := mockControllerAPI.NewMockTridentController(mockCtrl)
680+
test.mocks(mockAPI, test.args, test.data)
681+
682+
plugin := &Plugin{
683+
restClient: mockAPI,
684+
}
685+
686+
info, err := plugin.getChapInfoFromController(test.args.ctx, test.args.volume, test.args.node)
687+
test.assert.Error(t, err)
688+
test.assert.Empty(t, info)
689+
test.assert.Equal(t, info, test.data.chapInfo)
690+
})
691+
}
692+
}
693+
609694
func TestUpdateChapInfoFromController_Success(t *testing.T) {
610695
testCtx := context.Background()
611696
volumeName := "foo"
@@ -665,6 +750,129 @@ func TestUpdateChapInfoFromController_Error(t *testing.T) {
665750
assert.EqualValues(t, models.IscsiChapInfo{}, testPublishInfo.IscsiAccessInfo.IscsiChapInfo)
666751
}
667752

753+
func TestUpdateChapInfoForSessions(t *testing.T) {
754+
// Populate sessions with only the state that matters. The rest of the fields are not relevant for this test.
755+
publishedSessions := &models.ISCSISessions{}
756+
currentSessions := &models.ISCSISessions{}
757+
758+
// CHAP portals
759+
chapPortals := []string{"0.0.0.0", "4.4.4.4", "5.5.5.5"}
760+
761+
// Unique CHAP portal
762+
uniqueChapPortal := "9.9.9.9"
763+
uniqueIQN := "iqn.9999-99.com.netapp:target"
764+
765+
// non-CHAP portals
766+
nonChapPortals := []string{"1.1.1.1", "2.2.2.2", "3.3.3.3"}
767+
768+
sessionID := "0"
769+
770+
// Shared CHAP credentials
771+
sharedCHAPInfo := models.IscsiChapInfo{
772+
UseCHAP: true,
773+
IscsiUsername: "user",
774+
IscsiInitiatorSecret: "pass",
775+
IscsiTargetUsername: "user2",
776+
IscsiTargetSecret: "pass2",
777+
}
778+
779+
// Add CHAP sessions to both maps.
780+
for _, portal := range chapPortals {
781+
err := publishedSessions.AddPortal(portal, models.PortalInfo{
782+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
783+
Credentials: sharedCHAPInfo,
784+
})
785+
assert.NoError(t, err)
786+
787+
err = currentSessions.AddPortal(portal, models.PortalInfo{
788+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
789+
SessionNumber: sessionID,
790+
})
791+
assert.NoError(t, err)
792+
}
793+
794+
// Add a CHAP session with a unique IQN.
795+
err := publishedSessions.AddPortal(uniqueChapPortal, models.PortalInfo{
796+
// Use a different IQN here to prove we cache returned values from the REST API.
797+
ISCSITargetIQN: uniqueIQN,
798+
Credentials: sharedCHAPInfo,
799+
})
800+
assert.NoError(t, err)
801+
err = currentSessions.AddPortal(uniqueChapPortal, models.PortalInfo{
802+
ISCSITargetIQN: uniqueIQN,
803+
SessionNumber: sessionID,
804+
})
805+
assert.NoError(t, err)
806+
807+
// Add non-CHAP session
808+
for _, portal := range nonChapPortals {
809+
err := publishedSessions.AddPortal(portal, models.PortalInfo{
810+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
811+
Credentials: models.IscsiChapInfo{},
812+
})
813+
assert.NoError(t, err)
814+
815+
err = currentSessions.AddPortal(portal, models.PortalInfo{
816+
ISCSITargetIQN: "iqn.2020-01.com.netapp:target",
817+
Credentials: models.IscsiChapInfo{},
818+
SessionNumber: sessionID,
819+
})
820+
assert.NoError(t, err)
821+
}
822+
823+
// Populate a single of LUN and volume in each session.
824+
volume := "foo"
825+
node := "bar"
826+
for _, sessionData := range publishedSessions.Info {
827+
sessionData.LUNs.Info[0] = volume
828+
}
829+
830+
// Create a mock controller client that will return the expected CHAP info.
831+
mockCtrl := gomock.NewController(t)
832+
mockISCSI := mock_iscsi.NewMockISCSI(mockCtrl)
833+
mockClient := mockControllerAPI.NewMockTridentController(mockCtrl)
834+
835+
plugin := &Plugin{
836+
nodeName: node,
837+
iscsi: mockISCSI,
838+
restClient: mockClient,
839+
}
840+
841+
// Expect calls on the mock client for all sessions that use CHAP.
842+
freshCHAPInfo := &models.IscsiChapInfo{
843+
UseCHAP: true,
844+
IscsiUsername: "user2",
845+
IscsiInitiatorSecret: "pass2",
846+
IscsiTargetUsername: "user",
847+
IscsiTargetSecret: "pass",
848+
}
849+
850+
// Mock calls to the iSCSI client
851+
count := len(currentSessions.Info)
852+
mockISCSI.EXPECT().IsSessionStale(gomock.Any(), sessionID).Return(true).Times(count)
853+
854+
// Mock API calls
855+
count = len(chapPortals) - (len(chapPortals) - 1) + 1 // Expect one more call for the unique CHAP portal.
856+
mockClient.EXPECT().GetChap(
857+
gomock.Any(), volume, node,
858+
).Return(freshCHAPInfo, nil).Times(count)
859+
860+
err = plugin.updateCHAPInfoForSessions(ctx, publishedSessions, currentSessions)
861+
assert.NoError(t, err)
862+
863+
// Verify that the CHAP info was updated in all sessions that use CHAP.
864+
for _, portal := range chapPortals {
865+
chapInfoInSession := publishedSessions.Info[portal].PortalInfo.Credentials
866+
assert.EqualValues(t, *freshCHAPInfo, chapInfoInSession)
867+
}
868+
869+
// Verify that the non-CHAP portals were not updated.
870+
for _, portal := range nonChapPortals {
871+
chapInfoInSession := publishedSessions.Info[portal].PortalInfo.Credentials
872+
assert.EqualValues(t, models.IscsiChapInfo{}, chapInfoInSession)
873+
}
874+
}
875+
668876
type PortalAction struct {
669877
Portal string
670878
Action models.ISCSIAction

0 commit comments

Comments
 (0)