From e4fc26ee4976da16e5c692b88684e2a8ee34587d Mon Sep 17 00:00:00 2001 From: Harsh Rawat Date: Wed, 24 Jun 2026 17:31:20 +0530 Subject: [PATCH] Add LCOW live migration support across the controller stacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement end-to-end live migration support for LCOW pods so a running guest and its workloads can be moved between hosts. Every controller in the stack — VM, pod, Linux container, process, network, and SCSI/Plan9/VPCI devices — gains a save/import lifecycle plus the destination-side patch and resume plumbing needed to rebind local resources and bring the workload back online. Migration lifecycle: - Source: Save serializes a controller's state into a self-describing protobuf envelope and freezes the controller (StateSourceMigrating) so no mutating ops race the transfer; Resume rolls the freeze back, and a finalize Stop or VM teardown terminates it. - Destination: Import rehydrates a controller into a migrating state, Patch repoints saved resources (layer VHDs, process IO/bundle, network namespace) at the destination host, and Resume binds the live VM, guest, and devices and republishes events so containerd treats the task as locally running. AbortMigrated discards an imported-but-never-resumed controller and emits synthetic exits so Delete can proceed. VM controller: - Add the source/destination migrating states and the full HCS migration lifecycle: InitializeLiveMigrationOnSource, StartLiveMigrationOnSource, StartLiveMigrationTransfer, FinalizeLiveMigration, plus StartWithMigrationOptions on the destination. - Exchange the opaque compatibility blob, retain the final HCS document so the destination can recreate an identical VM, and recover the GCS port/bridge-id allocator floors so reissued ids cannot collide. - Make SCSI initialization lazy (built on first use from the HCS document) and handle never-started/destination teardown paths, including the already-stopped HCS error. 
Controller-specific changes: - SCSI controller switches to an RWMutex and rejects all ops while migrating; ReserveForRootfs now carries the full disk config. - Process, network, container, and VM state machines document and enforce the new migrating states and transitions. - Pod gains a migrating guard, AbortMigrated fan-out, and routes new containers through lazy SCSI init. Includes accompanying unit tests for the new save/import/patch/resume paths across all controllers. Signed-off-by: Harsh Rawat --- internal/controller/device/plan9/save.go | 20 + internal/controller/device/plan9/save_test.go | 33 + internal/controller/device/scsi/controller.go | 26 +- .../controller/device/scsi/controller_test.go | 120 ++++ internal/controller/device/scsi/disk/save.go | 84 +++ .../controller/device/scsi/disk/save_test.go | 130 ++++ internal/controller/device/scsi/doc.go | 10 +- .../controller/device/scsi/mount/save_lcow.go | 58 ++ .../device/scsi/mount/save_lcow_test.go | 105 +++ .../controller/device/scsi/mount/save_wcow.go | 15 + internal/controller/device/scsi/save.go | 247 +++++++ internal/controller/device/scsi/save_test.go | 215 ++++++ internal/controller/device/vpci/save.go | 20 + .../controller/device/vpci/save_lcow_test.go | 34 + .../controller/linuxcontainer/container.go | 67 +- .../linuxcontainer/container_test.go | 95 ++- internal/controller/linuxcontainer/doc.go | 33 + .../controller/linuxcontainer/document.go | 16 + .../linuxcontainer/document_test.go | 73 ++ .../linuxcontainer/mocks/mock_types.go | 29 + internal/controller/linuxcontainer/save.go | 440 ++++++++++++ .../controller/linuxcontainer/save_test.go | 642 ++++++++++++++++++ internal/controller/linuxcontainer/state.go | 43 +- internal/controller/linuxcontainer/types.go | 2 + internal/controller/network/doc.go | 30 +- internal/controller/network/network.go | 17 +- internal/controller/network/network_lcow.go | 19 +- .../controller/network/network_lcow_test.go | 33 +- 
internal/controller/network/network_test.go | 35 + internal/controller/network/network_wcow.go | 7 +- .../controller/network/network_wcow_test.go | 8 +- internal/controller/network/save.go | 196 ++++++ internal/controller/network/save_test.go | 256 +++++++ internal/controller/network/state.go | 40 +- internal/controller/pod/doc.go | 7 + internal/controller/pod/mocks/mock_types.go | 80 ++- internal/controller/pod/pod_lcow.go | 36 +- internal/controller/pod/pod_lcow_test.go | 18 +- internal/controller/pod/save_lcow.go | 251 +++++++ internal/controller/pod/save_lcow_test.go | 478 +++++++++++++ internal/controller/pod/types_lcow.go | 24 +- internal/controller/process/doc.go | 70 +- internal/controller/process/process.go | 22 +- internal/controller/process/process_test.go | 28 +- internal/controller/process/save.go | 241 +++++++ internal/controller/process/save_test.go | 442 ++++++++++++ internal/controller/process/state.go | 43 +- internal/controller/vm/doc.go | 48 +- internal/controller/vm/save_lcow.go | 296 ++++++++ internal/controller/vm/state.go | 89 ++- internal/controller/vm/types.go | 7 + internal/controller/vm/vm.go | 122 +++- internal/controller/vm/vm_devices.go | 37 +- internal/controller/vm/vm_lcow.go | 8 + internal/controller/vm/vm_migration.go | 210 ++++++ internal/hcs/errors.go | 6 +- internal/hcs/errors_test.go | 53 ++ internal/logfields/fields.go | 28 +- internal/protocol/guestresource/parse.go | 4 +- internal/uvm/network.go | 2 +- 60 files changed, 5650 insertions(+), 198 deletions(-) create mode 100644 internal/controller/device/plan9/save.go create mode 100644 internal/controller/device/plan9/save_test.go create mode 100644 internal/controller/device/scsi/disk/save.go create mode 100644 internal/controller/device/scsi/disk/save_test.go create mode 100644 internal/controller/device/scsi/mount/save_lcow.go create mode 100644 internal/controller/device/scsi/mount/save_lcow_test.go create mode 100644 internal/controller/device/scsi/mount/save_wcow.go 
create mode 100644 internal/controller/device/scsi/save.go create mode 100644 internal/controller/device/scsi/save_test.go create mode 100644 internal/controller/device/vpci/save.go create mode 100644 internal/controller/device/vpci/save_lcow_test.go create mode 100644 internal/controller/linuxcontainer/save.go create mode 100644 internal/controller/linuxcontainer/save_test.go create mode 100644 internal/controller/network/save.go create mode 100644 internal/controller/network/save_test.go create mode 100644 internal/controller/pod/save_lcow.go create mode 100644 internal/controller/pod/save_lcow_test.go create mode 100644 internal/controller/process/save.go create mode 100644 internal/controller/process/save_test.go create mode 100644 internal/controller/vm/save_lcow.go create mode 100644 internal/controller/vm/vm_migration.go diff --git a/internal/controller/device/plan9/save.go b/internal/controller/device/plan9/save.go new file mode 100644 index 0000000000..1610be454c --- /dev/null +++ b/internal/controller/device/plan9/save.go @@ -0,0 +1,20 @@ +//go:build windows && lcow + +package plan9 + +import ( + "fmt" +) + +// Save is not yet supported for the Plan9 sub-controller; any tracked state +// indicates a live-migration scenario the controller cannot represent. 
+func (c *Controller) Save() error { + c.mu.Lock() + defer c.mu.Unlock() + + if len(c.sharesByHostPath) > 0 || len(c.reservations) > 0 { + return fmt.Errorf("plan9 controller save not supported: %d shares, %d reservations", len(c.sharesByHostPath), len(c.reservations)) + } + + return nil +} diff --git a/internal/controller/device/plan9/save_test.go b/internal/controller/device/plan9/save_test.go new file mode 100644 index 0000000000..2738d8d975 --- /dev/null +++ b/internal/controller/device/plan9/save_test.go @@ -0,0 +1,33 @@ +//go:build windows && lcow + +package plan9 + +import ( + "testing" + + "github.com/Microsoft/go-winio/pkg/guid" + + "github.com/Microsoft/hcsshim/internal/controller/device/plan9/share" +) + +func TestSave_EmptyOK(t *testing.T) { + c := &Controller{ + reservations: map[guid.GUID]*reservation{}, + sharesByHostPath: map[string]*share.Share{}, + } + + if err := c.Save(); err != nil { + t.Fatalf("Save on empty controller: %v", err) + } +} + +func TestSave_NonEmptyErrors(t *testing.T) { + c := &Controller{ + reservations: map[guid.GUID]*reservation{{}: {hostPath: "/h"}}, + sharesByHostPath: map[string]*share.Share{}, + } + + if err := c.Save(); err == nil { + t.Fatal("expected Save to error when reservations are present") + } +} diff --git a/internal/controller/device/scsi/controller.go b/internal/controller/device/scsi/controller.go index a89f4acea6..bddc8f9659 100644 --- a/internal/controller/device/scsi/controller.go +++ b/internal/controller/device/scsi/controller.go @@ -34,7 +34,7 @@ import ( // it succeeds to release the reservation and all resources. type Controller struct { // mu serializes all public operations on the Controller. - mu sync.Mutex + mu sync.RWMutex // vm is the host-side interface for adding and removing SCSI disks. // Immutable after construction. 
@@ -58,6 +58,10 @@ type Controller struct { // ControllerID = index / numLUNsPerController // LUN = index % numLUNsPerController controllerSlots []*disk.Disk + + // isMigrating rejects all public ops while set: true once a snapshot has + // been taken or imported, until migration is resumed. Guarded by mu. + isMigrating bool } // New creates a new [Controller] for the given number of SCSI controllers and @@ -78,10 +82,14 @@ func New(numControllers int, vm VMSCSIOps, guest GuestSCSIOps) *Controller { // once per controller and lun location, and must be called before any calls to // Reserve() to ensure the rootfs reservation is not evicted by a dynamic // reservation. -func (c *Controller) ReserveForRootfs(ctx context.Context, controller, lun uint) error { +func (c *Controller) ReserveForRootfs(ctx context.Context, controller, lun uint, cfg disk.Config) error { c.mu.Lock() defer c.mu.Unlock() + if c.isMigrating { + return fmt.Errorf("SCSI controller is migrating; call Resume first") + } + slot := int(controller*numLUNsPerController + lun) if slot >= len(c.controllerSlots) { return fmt.Errorf("invalid controller %d or lun %d", controller, lun) @@ -89,7 +97,7 @@ func (c *Controller) ReserveForRootfs(ctx context.Context, controller, lun uint) if c.controllerSlots[slot] != nil { return fmt.Errorf("slot for controller %d and lun %d is already reserved", controller, lun) } - c.controllerSlots[slot] = disk.NewReserved(controller, lun, disk.Config{}) + c.controllerSlots[slot] = disk.NewReserved(controller, lun, cfg) return nil } @@ -103,6 +111,10 @@ func (c *Controller) Reserve(ctx context.Context, diskConfig disk.Config, mountC c.mu.Lock() defer c.mu.Unlock() + if c.isMigrating { + return guid.GUID{}, fmt.Errorf("SCSI controller is migrating; call Resume first") + } + ctx, _ = log.WithContext(ctx, logrus.WithFields(logrus.Fields{ logfields.HostPath: diskConfig.HostPath, logfields.Partition: mountConfig.Partition, @@ -178,6 +190,10 @@ func (c *Controller) MapToGuest(ctx 
context.Context, id guid.GUID) (string, erro c.mu.Lock() defer c.mu.Unlock() + if c.isMigrating { + return "", fmt.Errorf("SCSI controller is migrating; call Resume first") + } + r, ok := c.reservations[id] if !ok { return "", fmt.Errorf("reservation %s not found", id) @@ -212,6 +228,10 @@ func (c *Controller) UnmapFromGuest(ctx context.Context, id guid.GUID) error { c.mu.Lock() defer c.mu.Unlock() + if c.isMigrating { + return fmt.Errorf("SCSI controller is migrating; call Resume first") + } + ctx, _ = log.WithContext(ctx, logrus.WithField("reservation", id.String())) r, ok := c.reservations[id] diff --git a/internal/controller/device/scsi/controller_test.go b/internal/controller/device/scsi/controller_test.go index 85b8fb92de..ac4be92f30 100644 --- a/internal/controller/device/scsi/controller_test.go +++ b/internal/controller/device/scsi/controller_test.go @@ -6,6 +6,7 @@ import ( "context" "errors" "fmt" + "strings" "testing" "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" @@ -70,6 +71,17 @@ func mappedController(t *testing.T) (*Controller, guid.GUID) { return c, id } +func attachmentsContainPath(att map[string]hcsschema.Scsi, path string) bool { + for _, s := range att { + for _, a := range s.Attachments { + if a.Path == path { + return true + } + } + } + return false +} + // --- Tests: New --- func TestNew(t *testing.T) { @@ -397,3 +409,111 @@ func TestUnmapFromGuest_RetryAfterDetachFailure(t *testing.T) { t.Fatalf("re-reserve after retry: %v", err) } } + +// --- Tests: ReserveForRootfs --- + +func TestReserveForRootfs_Success(t *testing.T) { + c := newController(&mockVMOps{}, newMockGuestOps()) + cfg := defaultDiskConfig() + if err := c.ReserveForRootfs(context.Background(), 0, 0, cfg); err != nil { + t.Fatalf("unexpected error: %v", err) + } + // The reserved rootfs disk surfaces in the VM topology with its config. 
+ if !attachmentsContainPath(c.HCSAttachments(), cfg.HostPath) { + t.Errorf("expected rootfs path %q in HCS attachments", cfg.HostPath) + } +} + +func TestReserveForRootfs_InvalidLocation(t *testing.T) { + c := newController(&mockVMOps{}, newMockGuestOps()) + // Controller index beyond the single configured controller. + if err := c.ReserveForRootfs(context.Background(), 1, 0, defaultDiskConfig()); err == nil { + t.Fatal("expected error for out-of-range location") + } +} + +func TestReserveForRootfs_AlreadyReserved(t *testing.T) { + c := newController(&mockVMOps{}, newMockGuestOps()) + if err := c.ReserveForRootfs(context.Background(), 0, 0, defaultDiskConfig()); err != nil { + t.Fatalf("first reserve: %v", err) + } + if err := c.ReserveForRootfs(context.Background(), 0, 0, defaultDiskConfig()); err == nil { + t.Fatal("expected error reserving an occupied location") + } +} + +// --- Tests: migration guard --- + +func TestPublicOps_RejectedWhileMigrating(t *testing.T) { + ctx := t.Context() + ops := []struct { + name string + call func(*Controller) error + }{ + {"ReserveForRootfs", func(c *Controller) error { + return c.ReserveForRootfs(ctx, 0, 0, defaultDiskConfig()) + }}, + {"Reserve", func(c *Controller) error { + _, err := c.Reserve(ctx, defaultDiskConfig(), defaultMountConfig()) + return err + }}, + {"MapToGuest", func(c *Controller) error { + _, err := c.MapToGuest(ctx, guid.GUID{}) + return err + }}, + {"UnmapFromGuest", func(c *Controller) error { + return c.UnmapFromGuest(ctx, guid.GUID{}) + }}, + } + for _, op := range ops { + t.Run(op.name, func(t *testing.T) { + // Saving the source blocks further operations until migration resumes. 
+ src := New(1, &mockVMOps{}, newMockGuestOps()) + env, err := src.Save(ctx) + if err != nil { + t.Fatalf("Save: %v", err) + } + if err := op.call(src); err == nil || !strings.Contains(err.Error(), "migrating") { + t.Fatalf("source: expected migrating error, got %v", err) + } + + // A freshly imported controller is also mid-migration until resumed. + c, err := Import(ctx, env) + if err != nil { + t.Fatalf("Import: %v", err) + } + if err := op.call(c); err == nil || !strings.Contains(err.Error(), "migrating") { + t.Fatalf("imported: expected migrating error, got %v", err) + } + }) + } +} + +func TestResume_LiftsMigrationGuard(t *testing.T) { + ctx := t.Context() + // Snapshot a controller holding a reservation, then import it. + src := newController(&mockVMOps{}, newMockGuestOps()) + if _, err := src.Reserve(ctx, defaultDiskConfig(), defaultMountConfig()); err != nil { + t.Fatalf("setup Reserve: %v", err) + } + env, err := src.Save(ctx) + if err != nil { + t.Fatalf("Save: %v", err) + } + c, err := Import(ctx, env) + if err != nil { + t.Fatalf("Import: %v", err) + } + + // Rejected while migrating. + if _, err := c.Reserve(ctx, defaultDiskConfig(), defaultMountConfig()); err == nil { + t.Fatal("expected migrating error before Resume") + } + + // Resuming binds live interfaces and lifts the guard. 
+ c.Resume(ctx, &mockVMOps{}, newMockGuestOps()) + dc := disk.Config{HostPath: `C:\other.vhdx`, Type: disk.TypeVirtualDisk} + if _, err := c.Reserve(ctx, dc, defaultMountConfig()); err != nil { + t.Fatalf("Reserve after Resume: %v", err) + } +} diff --git a/internal/controller/device/scsi/disk/save.go b/internal/controller/device/scsi/disk/save.go new file mode 100644 index 0000000000..55b6f80a28 --- /dev/null +++ b/internal/controller/device/scsi/disk/save.go @@ -0,0 +1,84 @@ +//go:build windows && (lcow || wcow) + +package disk + +import ( + "fmt" + + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/mount" + scsisave "github.com/Microsoft/hcsshim/internal/controller/device/scsi/save" +) + +// Save returns a migration snapshot of the disk and its mounts. It fails unless +// the disk is attached or reserved and every mount can be saved. +func (d *Disk) Save() (*scsisave.DiskState, error) { + if d.state != StateAttached && d.state != StateReserved { + return nil, fmt.Errorf("scsi disk controller=%d lun=%d in state %s; want %s", d.controller, d.lun, d.state, StateAttached) + } + + out := &scsisave.DiskState{ + Config: &scsisave.DiskConfig{ + HostPath: d.config.HostPath, + ReadOnly: d.config.ReadOnly, + Type: string(d.config.Type), + EvdType: d.config.EVDType, + }, + } + + if len(d.mounts) > 0 { + out.Mounts = make(map[uint64]*scsisave.MountState, len(d.mounts)) + + // Snapshot every mount; abort if any cannot be saved. + for partition, m := range d.mounts { + ms, err := m.Save() + if err != nil { + return nil, err + } + out.Mounts[partition] = ms + } + } + return out, nil +} + +// Import reconstructs a disk and its mounts from a migration snapshot at the +// given controller and lun. It returns nil if the snapshot is nil. +func Import(state *scsisave.DiskState, controller, lun uint) *Disk { + if state == nil { + return nil + } + + // Rebuild the host-side config from the snapshot, if present. 
+ cfg := Config{} + if c := state.GetConfig(); c != nil { + cfg = Config{ + HostPath: c.GetHostPath(), + ReadOnly: c.GetReadOnly(), + Type: Type(c.GetType()), + EVDType: c.GetEvdType(), + } + } + + // An imported disk is assumed to be live on the SCSI bus. + d := &Disk{ + controller: controller, + lun: lun, + config: cfg, + state: StateAttached, + mounts: make(map[uint64]*mount.Mount, len(state.GetMounts())), + } + + // Reconstruct each partition mount, skipping any that fail to import. + for partition, ms := range state.GetMounts() { + m := mount.Import(ms, controller, lun, partition) + if m == nil { + continue + } + d.mounts[partition] = m + } + return d +} + +// UpdateHostPath rewrites the host-side path of the disk image. +func (d *Disk) UpdateHostPath(p string) { + d.config.HostPath = p +} diff --git a/internal/controller/device/scsi/disk/save_test.go b/internal/controller/device/scsi/disk/save_test.go new file mode 100644 index 0000000000..92ebb7e807 --- /dev/null +++ b/internal/controller/device/scsi/disk/save_test.go @@ -0,0 +1,130 @@ +//go:build windows && (lcow || wcow) + +package disk + +import ( + "context" + "testing" + + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/mount" + scsisave "github.com/Microsoft/hcsshim/internal/controller/device/scsi/save" +) + +func TestSave_ErrorWhenDetached(t *testing.T) { + d := attachedDisk(t) + if err := d.DetachFromVM(context.Background(), &mockVMSCSIRemover{}, newDefaultEjector()); err != nil { + t.Fatalf("setup DetachFromVM: %v", err) + } + if _, err := d.Save(); err == nil { + t.Fatal("expected error saving a detached disk") + } +} + +func TestSave_ReservedDisk_RoundTripsConfig(t *testing.T) { + cfg := Config{ + HostPath: `C:\test\disk.vhdx`, + ReadOnly: true, + Type: TypePassThru, + EVDType: "evd", + } + state, err := NewReserved(0, 0, cfg).Save() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + + // The snapshot exposes the host-side config the caller supplied. 
+ c := state.GetConfig() + if c.GetHostPath() != cfg.HostPath || c.GetReadOnly() != cfg.ReadOnly || + c.GetType() != string(cfg.Type) || c.GetEvdType() != cfg.EVDType { + t.Errorf("unexpected config snapshot: %+v", c) + } + if len(state.GetMounts()) != 0 { + t.Errorf("expected no mounts, got %d", len(state.GetMounts())) + } +} + +func TestSave_IncludesReservedMounts(t *testing.T) { + d := attachedDisk(t) + if _, err := d.ReservePartition(context.Background(), mount.Config{Partition: 1}); err != nil { + t.Fatalf("ReservePartition: %v", err) + } + state, err := d.Save() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if _, ok := state.GetMounts()[1]; !ok { + t.Errorf("expected snapshot to include partition 1, got %v", state.GetMounts()) + } +} + +func TestImport_NilReturnsNil(t *testing.T) { + if d := Import(nil, 0, 0); d != nil { + t.Errorf("expected nil disk, got %+v", d) + } +} + +func TestImport_NilConfig_UsesDefaults(t *testing.T) { + // A snapshot without config still yields a usable, live disk. + d := Import(&scsisave.DiskState{}, 0, 0) + if d == nil { + t.Fatal("expected non-nil disk") + } + if d.State() != StateAttached { + t.Errorf("expected state %d, got %d", StateAttached, d.State()) + } + if d.HostPath() != "" { + t.Errorf("expected empty host path, got %q", d.HostPath()) + } +} + +func TestImport_SkipsNilMountEntry(t *testing.T) { + // A malformed (nil) mount entry must not break import. 
+ d := Import(&scsisave.DiskState{Mounts: map[uint64]*scsisave.MountState{1: nil}}, 0, 0) + if d == nil { + t.Fatal("expected non-nil disk") + } + if d.State() != StateAttached { + t.Errorf("expected state %d, got %d", StateAttached, d.State()) + } +} + +func TestUpdateHostPath(t *testing.T) { + d := NewReserved(0, 0, defaultConfig()) + const newPath = `C:\new\path.vhdx` + d.UpdateHostPath(newPath) + if d.HostPath() != newPath { + t.Errorf("expected host path %q, got %q", newPath, d.HostPath()) + } +} + +func TestSaveImport_RoundTrip(t *testing.T) { + state := &scsisave.DiskState{ + Config: &scsisave.DiskConfig{ + HostPath: `C:\test\disk.vhdx`, + ReadOnly: true, + Type: string(TypeVirtualDisk), + EvdType: "evd", + }, + Mounts: map[uint64]*scsisave.MountState{2: {}}, + } + + d := Import(state, 3, 4) + if d == nil { + t.Fatal("expected non-nil disk") + } + // An imported disk is live, so its config is queryable and it can be re-saved. + if d.State() != StateAttached { + t.Errorf("expected state %d, got %d", StateAttached, d.State()) + } + if d.HostPath() != state.GetConfig().GetHostPath() { + t.Errorf("expected host path %q, got %q", state.GetConfig().GetHostPath(), d.HostPath()) + } + + out, err := d.Save() + if err != nil { + t.Fatalf("re-save: %v", err) + } + if _, ok := out.GetMounts()[2]; !ok { + t.Errorf("expected re-saved snapshot to include partition 2, got %v", out.GetMounts()) + } +} diff --git a/internal/controller/device/scsi/doc.go b/internal/controller/device/scsi/doc.go index 6756f0176f..1cf18bc761 100644 --- a/internal/controller/device/scsi/doc.go +++ b/internal/controller/device/scsi/doc.go @@ -19,7 +19,7 @@ // // # Usage // -// c := scsi.New(numControllers, vmOps, linuxGuestOps, windowsGuestOps) +// c := scsi.New(numControllers, vmOps, guestOps) // // // Reserve a slot (no I/O yet): // id, err := c.Reserve(ctx, diskConfig, mountConfig) @@ -37,6 +37,14 @@ // partway through teardown, calling it again with the same reservation ID // resumes from 
where the previous attempt stopped. // +// # Migration +// +// Taking a snapshot blocks all operations until migration is resumed, so the +// live state cannot diverge from the captured snapshot while it is handed off. +// A controller reconstructed from a snapshot on the destination is likewise +// blocked; resuming binds the live host and guest interfaces and lifts the +// block on both source and destination. +// // # Layered Design // // The [Controller] delegates all disk-level state to [disk.Disk] and all diff --git a/internal/controller/device/scsi/mount/save_lcow.go b/internal/controller/device/scsi/mount/save_lcow.go new file mode 100644 index 0000000000..9385cebbba --- /dev/null +++ b/internal/controller/device/scsi/mount/save_lcow.go @@ -0,0 +1,58 @@ +//go:build windows && lcow + +package mount + +import ( + "fmt" + + scsisave "github.com/Microsoft/hcsshim/internal/controller/device/scsi/save" +) + +// Save returns a migration snapshot of the mount. It fails unless the mount +// is mounted or reserved. +func (m *Mount) Save() (*scsisave.MountState, error) { + if m.state != StateMounted && m.state != StateReserved { + return nil, fmt.Errorf("scsi mount controller=%d lun=%d partition=%d in state %s; want %s", m.controller, m.lun, m.config.Partition, m.state, StateMounted) + } + return &scsisave.MountState{ + Config: &scsisave.MountConfig{ + ReadOnly: m.config.ReadOnly, + Encrypted: m.config.Encrypted, + // Clone the slice so the snapshot does not alias live config. + Options: append([]string(nil), m.config.Options...), + EnsureFilesystem: m.config.EnsureFilesystem, + Filesystem: m.config.Filesystem, + BlockDev: m.config.BlockDev, + }, + RefCount: uint32(m.refCount), + GuestPath: m.guestPath, + }, nil +} + +// Import reconstructs a mount from a migration snapshot at the given controller, +// lun, and partition. It returns nil if the snapshot is nil. 
+func Import(state *scsisave.MountState, controller, lun uint, partition uint64) *Mount { + if state == nil { + return nil + } + + // Rebuild the mount config from the snapshot, if present. + cfg := Config{Partition: partition} + if c := state.GetConfig(); c != nil { + cfg.ReadOnly = c.GetReadOnly() + cfg.Encrypted = c.GetEncrypted() + cfg.Options = append([]string(nil), c.GetOptions()...) + cfg.EnsureFilesystem = c.GetEnsureFilesystem() + cfg.Filesystem = c.GetFilesystem() + cfg.BlockDev = c.GetBlockDev() + } + // An imported mount is assumed to be live in the guest. + return &Mount{ + controller: controller, + lun: lun, + config: cfg, + state: StateMounted, + refCount: int(state.GetRefCount()), + guestPath: state.GetGuestPath(), + } +} diff --git a/internal/controller/device/scsi/mount/save_lcow_test.go b/internal/controller/device/scsi/mount/save_lcow_test.go new file mode 100644 index 0000000000..f41f684ee7 --- /dev/null +++ b/internal/controller/device/scsi/mount/save_lcow_test.go @@ -0,0 +1,105 @@ +//go:build windows && lcow + +package mount + +import ( + "context" + "testing" + + scsisave "github.com/Microsoft/hcsshim/internal/controller/device/scsi/save" +) + +func TestSave_ErrorWhenUnmounted(t *testing.T) { + m := mountedMount(t) + if err := m.UnmountFromGuest(context.Background(), newDefaultUnmounter()); err != nil { + t.Fatalf("setup UnmountFromGuest: %v", err) + } + if _, err := m.Save(); err == nil { + t.Fatal("expected error saving an unmounted mount") + } +} + +func TestSave_Mounted_RoundTripsConfig(t *testing.T) { + m := NewReserved(2, 3, Config{ + Partition: 1, + ReadOnly: true, + Encrypted: true, + Options: []string{"noatime"}, + Filesystem: "ext4", + }) + if _, err := m.MountToGuest(context.Background(), newDefaultMounter()); err != nil { + t.Fatalf("setup MountToGuest: %v", err) + } + + state, err := m.Save() + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + // The snapshot exposes the guest path and config the caller will restore 
from. + if state.GetGuestPath() != m.GuestPath() { + t.Errorf("expected guest path %q, got %q", m.GuestPath(), state.GetGuestPath()) + } + c := state.GetConfig() + if !c.GetReadOnly() || !c.GetEncrypted() || c.GetFilesystem() != "ext4" { + t.Errorf("unexpected config snapshot: %+v", c) + } +} + +func TestSave_Reserved(t *testing.T) { + // A reserved (not-yet-mounted) mount can still be snapshotted. + if _, err := NewReserved(0, 0, defaultConfig()).Save(); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestImport_NilReturnsNil(t *testing.T) { + if m := Import(nil, 0, 0, 0); m != nil { + t.Errorf("expected nil mount, got %+v", m) + } +} + +func TestImport_NilConfig_UsesDefaults(t *testing.T) { + // A snapshot without config still yields a live mount at the given partition. + m := Import(&scsisave.MountState{GuestPath: "/run/mounts/scsi/0_0_7"}, 0, 0, 7) + if m == nil { + t.Fatal("expected non-nil mount") + } + if m.State() != StateMounted { + t.Errorf("expected state %d, got %d", StateMounted, m.State()) + } + if m.GuestPath() != "/run/mounts/scsi/0_0_7" { + t.Errorf("unexpected guest path %q", m.GuestPath()) + } +} + +func TestSaveImport_RoundTrip(t *testing.T) { + state := &scsisave.MountState{ + Config: &scsisave.MountConfig{ + ReadOnly: true, + Options: []string{"ro"}, + Filesystem: "ext4", + }, + RefCount: 2, + GuestPath: "/run/mounts/scsi/0_0_5", + } + + m := Import(state, 0, 0, 5) + if m == nil { + t.Fatal("expected non-nil mount") + } + // An imported mount is live, exposing its guest path from the mounted state. 
+ if m.State() != StateMounted { + t.Errorf("expected state %d, got %d", StateMounted, m.State()) + } + if m.GuestPath() != state.GetGuestPath() { + t.Errorf("expected guest path %q, got %q", state.GetGuestPath(), m.GuestPath()) + } + + out, err := m.Save() + if err != nil { + t.Fatalf("re-save: %v", err) + } + if out.GetRefCount() != state.GetRefCount() { + t.Errorf("expected ref count %d, got %d", state.GetRefCount(), out.GetRefCount()) + } +} diff --git a/internal/controller/device/scsi/mount/save_wcow.go b/internal/controller/device/scsi/mount/save_wcow.go new file mode 100644 index 0000000000..25fac7ed08 --- /dev/null +++ b/internal/controller/device/scsi/mount/save_wcow.go @@ -0,0 +1,15 @@ +//go:build windows && wcow + +package mount + +import scsisave "github.com/Microsoft/hcsshim/internal/controller/device/scsi/save" + +// Save is a WCOW no-op stub as of now. +func (m *Mount) Save() (*scsisave.MountState, error) { + return &scsisave.MountState{}, nil +} + +// Import is a WCOW no-op stub as of now. 
+func Import(_ *scsisave.MountState, _, _ uint, _ uint64) *Mount { + return &Mount{} +} diff --git a/internal/controller/device/scsi/save.go b/internal/controller/device/scsi/save.go new file mode 100644 index 0000000000..0c5484138e --- /dev/null +++ b/internal/controller/device/scsi/save.go @@ -0,0 +1,247 @@ +//go:build windows && (lcow || wcow) + +package scsi + +import ( + "context" + "fmt" + "strconv" + + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" + scsisave "github.com/Microsoft/hcsshim/internal/controller/device/scsi/save" + hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" + "github.com/sirupsen/logrus" + + "github.com/Microsoft/go-winio/pkg/guid" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" +) + +// Save returns a serialized snapshot of the controller's current state, +// suitable for transferring to another host during live migration. After it +// returns, all operations are rejected until migration is resumed. +func (c *Controller) Save(ctx context.Context) (*anypb.Any, error) { + c.mu.Lock() + defer c.mu.Unlock() + + // Capture the topology, attached disks, and outstanding reservations. + state := &scsisave.Payload{ + SchemaVersion: scsisave.SchemaVersion, + NumControllers: uint32(len(c.controllerSlots) / numLUNsPerController), + Disks: make(map[uint32]*scsisave.DiskState, len(c.disksByPath)), + Reservations: make(map[string]*scsisave.Reservation, len(c.reservations)), + } + + // Save every occupied slot. + for slot, d := range c.controllerSlots { + if d == nil { + continue + } + ds, err := d.Save() + if err != nil { + return nil, err + } + state.Disks[uint32(slot)] = ds + } + + // Save all the reservations. 
+ for id, r := range c.reservations { + state.Reservations[id.String()] = &scsisave.Reservation{ + Slot: uint32(r.controllerSlot), + Partition: r.partition, + } + } + + payload, err := proto.Marshal(state) + if err != nil { + return nil, fmt.Errorf("marshal scsi saved state: %w", err) + } + + // Block all further operations until migration is resumed. + c.isMigrating = true + + log.G(ctx).Debug("saved scsi controller state") + return &anypb.Any{TypeUrl: scsisave.TypeURL, Value: payload}, nil +} + +// Import reconstructs a controller from a snapshot produced by [Controller.Save]. +// The result cannot serve disk operations until [Controller.Resume] supplies the +// live host and guest interfaces. +func Import(ctx context.Context, env *anypb.Any) (*Controller, error) { + if env == nil { + return nil, fmt.Errorf("scsi saved-state envelope is nil") + } + + // Reject payloads that did not come from a compatible Save. + if env.GetTypeUrl() != scsisave.TypeURL { + return nil, fmt.Errorf("unsupported scsi saved-state type %q", env.GetTypeUrl()) + } + + // Unmarshal the payload. + state := &scsisave.Payload{} + if err := proto.Unmarshal(env.GetValue(), state); err != nil { + return nil, fmt.Errorf("unmarshal scsi saved state: %w", err) + } + + // Reject payloads written by an incompatible shim version. + if v := state.GetSchemaVersion(); v != scsisave.SchemaVersion { + return nil, fmt.Errorf("unsupported scsi saved-state schema version %d (want %d)", v, scsisave.SchemaVersion) + } + + // Create a new controller. + numCtrls := int(state.GetNumControllers()) + c := &Controller{ + reservations: make(map[guid.GUID]*reservation, len(state.GetReservations())), + disksByPath: make(map[string]int, len(state.GetDisks())), + controllerSlots: make([]*disk.Disk, numCtrls*numLUNsPerController), + isMigrating: true, + } + + // Place each saved disk back at its original slot. 
+ for slot, ds := range state.GetDisks() { + idx := int(slot) + if idx >= len(c.controllerSlots) { + return nil, fmt.Errorf("invalid controller slot: %d", slot) + } + + // Derive the controller and LUN from the slot index and rebuild the disk. + controller, lun := uint(idx/numLUNsPerController), uint(idx%numLUNsPerController) + d := disk.Import(ds, controller, lun) + if d == nil { + return nil, fmt.Errorf("failed to import disk at controller=%d lun=%d", controller, lun) + } + + // Store the disk and index it by host path for later lookups. + c.controllerSlots[idx] = d + if hp := ds.GetConfig().GetHostPath(); hp != "" { + c.disksByPath[hp] = idx + } + } + + // Rehydrate all the reservations. + for idStr, r := range state.GetReservations() { + // Skip any reservation whose ID cannot be parsed. + id, err := guid.FromString(idStr) + if err != nil { + return nil, fmt.Errorf("invalid reservation id %q: %w", idStr, err) + } + + c.reservations[id] = &reservation{ + controllerSlot: int(r.GetSlot()), + partition: r.GetPartition(), + } + } + + log.G(ctx).Debug("imported scsi controller state") + return c, nil +} + +// Resume binds the live host and guest interfaces to an imported controller, +// enabling normal disk operations. It must be called on the destination +// before any reserve, attach, or mount calls. +func (c *Controller) Resume(ctx context.Context, vm VMSCSIOps, guest GuestSCSIOps) { + c.mu.Lock() + defer c.mu.Unlock() + + c.vm = vm + c.guest = guest + c.isMigrating = false + + log.G(ctx).Debug("resumed scsi controller") +} + +// Disks returns the configuration of every disk currently attached to the +// controller. +func (c *Controller) Disks() []disk.Config { + c.mu.RLock() + defer c.mu.RUnlock() + + // Collect the config of each indexed disk. 
+ configs := make([]disk.Config, 0, len(c.disksByPath)) + for _, slot := range c.disksByPath { + if d := c.controllerSlots[slot]; d != nil { + configs = append(configs, d.Config()) + } + } + + return configs +} + +// HCSAttachments returns every attached disk described in HCS schema form, ready +// to be handed to HCS when constructing or resuming a VM. +func (c *Controller) HCSAttachments() map[string]hcsschema.Scsi { + c.mu.RLock() + defer c.mu.RUnlock() + + // Group attachments by their controller GUID. + out := map[string]hcsschema.Scsi{} + for idx, d := range c.controllerSlots { + if d == nil { + continue + } + + // Map the slot index to its controller GUID. + ctrlIdx := idx / numLUNsPerController + gid := guestrequest.ScsiControllerGuids[ctrlIdx] + s, ok := out[gid] + if !ok { + s = hcsschema.Scsi{Attachments: map[string]hcsschema.Attachment{}} + } + + // Record the disk at its LUN within the controller. + s.Attachments[strconv.FormatUint(uint64(idx%numLUNsPerController), 10)] = hcsschema.Attachment{ + Path: d.Config().HostPath, + Type_: string(d.Config().Type), + ReadOnly: d.Config().ReadOnly, + ExtensibleVirtualDiskType: d.Config().EVDType, + } + out[gid] = s + } + + return out +} + +// UpdateDiskHostPath points the disk backing the given reservation at a new +// host path. It is only valid between [Import] and [Controller.Resume], so the +// destination's disk locations can be corrected before the VM resumes. +func (c *Controller) UpdateDiskHostPath(ctx context.Context, reservationID guid.GUID, newPath string) error { + c.mu.Lock() + defer c.mu.Unlock() + + if !c.isMigrating { + return fmt.Errorf("UpdateDiskHostPath is only valid while migrating") + } + + // Find the reservation. + r, ok := c.reservations[reservationID] + if !ok { + return fmt.Errorf("reservation %s not found", reservationID) + } + + // Find the requested disk. 
+ d := c.controllerSlots[r.controllerSlot] + if d == nil { + return fmt.Errorf("disk for reservation %s not found", reservationID) + } + + // Update old path to new path. + oldPath := d.HostPath() + if oldPath == newPath { + return nil + } + if slot, ok := c.disksByPath[oldPath]; ok && slot == r.controllerSlot { + delete(c.disksByPath, oldPath) + } + + d.UpdateHostPath(newPath) + c.disksByPath[newPath] = r.controllerSlot + + log.G(ctx).WithFields(logrus.Fields{ + "OldPath": oldPath, + "NewPath": newPath, + }).Debug("updated disk host path") + + return nil +} diff --git a/internal/controller/device/scsi/save_test.go b/internal/controller/device/scsi/save_test.go new file mode 100644 index 0000000000..97bc672ebb --- /dev/null +++ b/internal/controller/device/scsi/save_test.go @@ -0,0 +1,215 @@ +//go:build windows && (lcow || wcow) + +package scsi + +import ( + "context" + "strings" + "testing" + + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" + scsisave "github.com/Microsoft/hcsshim/internal/controller/device/scsi/save" + "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" + + "github.com/Microsoft/go-winio/pkg/guid" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" +) + +// --- Helpers --- + +func mustGUID(t *testing.T) guid.GUID { + t.Helper() + g, err := guid.NewV4() + if err != nil { + t.Fatalf("NewV4: %v", err) + } + return g +} + +// wrapImport imports a hand-built payload wrapped with the SCSI save type url. +func wrapImport(t *testing.T, p *scsisave.Payload) *Controller { + t.Helper() + b, err := proto.Marshal(p) + if err != nil { + t.Fatalf("marshal: %v", err) + } + c, err := Import(t.Context(), &anypb.Any{TypeUrl: scsisave.TypeURL, Value: b}) + if err != nil { + t.Fatalf("Import: %v", err) + } + return c +} + +// importedReserved snapshots a controller holding a single reservation and +// imports it, returning the migrating controller and that reservation's ID. 
+func importedReserved(t *testing.T) (*Controller, guid.GUID) { + t.Helper() + src, id := reservedController(t) + env, err := src.Save(t.Context()) + if err != nil { + t.Fatalf("Save: %v", err) + } + c, err := Import(t.Context(), env) + if err != nil { + t.Fatalf("Import: %v", err) + } + return c, id +} + +// --- Tests: Save + Import round trip --- + +func TestSave_RoundTrip(t *testing.T) { + src, id := mappedController(t) + + env, err := src.Save(t.Context()) + if err != nil { + t.Fatalf("Save: %v", err) + } + // A caller sees a payload tagged with the SCSI save type. + if env.GetTypeUrl() != scsisave.TypeURL { + t.Fatalf("unexpected type url %q", env.GetTypeUrl()) + } + + dst, err := Import(t.Context(), env) + if err != nil { + t.Fatalf("Import: %v", err) + } + + // The attached disk and its host path survive the round trip. + cfg := defaultDiskConfig() + if disks := dst.Disks(); len(disks) != 1 || disks[0].HostPath != cfg.HostPath { + t.Fatalf("unexpected disks after import: %+v", disks) + } + if !attachmentsContainPath(dst.HCSAttachments(), cfg.HostPath) { + t.Errorf("expected %q in HCS attachments", cfg.HostPath) + } + + // The reservation survives, so its disk path can be corrected before resume. 
+ const newPath = `C:\migrated\disk.vhdx` + if err := dst.UpdateDiskHostPath(t.Context(), id, newPath); err != nil { + t.Fatalf("UpdateDiskHostPath: %v", err) + } + if disks := dst.Disks(); len(disks) != 1 || disks[0].HostPath != newPath { + t.Fatalf("expected host path %q after update, got %+v", newPath, disks) + } +} + +func TestSave_EmptyRoundTrip(t *testing.T) { + env, err := New(2, &mockVMOps{}, newMockGuestOps()).Save(t.Context()) + if err != nil { + t.Fatalf("Save: %v", err) + } + dst, err := Import(t.Context(), env) + if err != nil { + t.Fatalf("Import: %v", err) + } + if d := dst.Disks(); len(d) != 0 { + t.Errorf("expected no disks, got %+v", d) + } + if a := dst.HCSAttachments(); len(a) != 0 { + t.Errorf("expected no attachments, got %+v", a) + } +} + +// --- Tests: Import errors --- + +func TestImport_Errors(t *testing.T) { + wrap := func(p *scsisave.Payload) *anypb.Any { + b, err := proto.Marshal(p) + if err != nil { + t.Fatalf("marshal: %v", err) + } + return &anypb.Any{TypeUrl: scsisave.TypeURL, Value: b} + } + + tests := []struct { + name string + env *anypb.Any + wantErr string + }{ + {"nil envelope", nil, "nil"}, + {"wrong type url", &anypb.Any{TypeUrl: "bogus"}, "unsupported scsi saved-state type"}, + {"corrupt payload", &anypb.Any{TypeUrl: scsisave.TypeURL, Value: []byte{0xff}}, "unmarshal"}, + {"schema mismatch", wrap(&scsisave.Payload{SchemaVersion: scsisave.SchemaVersion + 1}), "schema version"}, + {"slot out of range", wrap(&scsisave.Payload{ + SchemaVersion: scsisave.SchemaVersion, + NumControllers: 1, + Disks: map[uint32]*scsisave.DiskState{numLUNsPerController: {}}, + }), "invalid controller slot"}, + {"bad reservation id", wrap(&scsisave.Payload{ + SchemaVersion: scsisave.SchemaVersion, + NumControllers: 1, + Reservations: map[string]*scsisave.Reservation{"not-a-guid": {}}, + }), "invalid reservation id"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if _, err := Import(t.Context(), tt.env); err == nil || 
!strings.Contains(err.Error(), tt.wantErr) { + t.Fatalf("expected error containing %q, got %v", tt.wantErr, err) + } + }) + } +} + +// --- Tests: HCSAttachments schema mapping --- + +func TestHCSAttachments(t *testing.T) { + c := New(1, &mockVMOps{}, newMockGuestOps()) + cfg := disk.Config{HostPath: `C:\rootfs.vhdx`, ReadOnly: true, Type: disk.TypeVirtualDisk} + if err := c.ReserveForRootfs(context.Background(), 0, 5, cfg); err != nil { + t.Fatalf("ReserveForRootfs: %v", err) + } + + // The disk surfaces under its controller GUID, keyed by LUN, with its config. + s, ok := c.HCSAttachments()[guestrequest.ScsiControllerGuids[0]] + if !ok { + t.Fatalf("expected attachment under controller 0 guid, got %+v", c.HCSAttachments()) + } + a, ok := s.Attachments["5"] + if !ok { + t.Fatalf("expected attachment at lun 5, got %+v", s.Attachments) + } + if a.Path != cfg.HostPath || a.Type_ != string(cfg.Type) || !a.ReadOnly { + t.Errorf("unexpected attachment: %+v", a) + } +} + +// --- Tests: UpdateDiskHostPath --- + +func TestUpdateDiskHostPath(t *testing.T) { + resID := mustGUID(t) + // Migrating controller whose lone reservation points at an empty slot. 
+ noDisk := wrapImport(t, &scsisave.Payload{ + SchemaVersion: scsisave.SchemaVersion, + NumControllers: 1, + Reservations: map[string]*scsisave.Reservation{resID.String(): {Slot: 5}}, + }) + + t.Run("reservation not found", func(t *testing.T) { + if err := noDisk.UpdateDiskHostPath(t.Context(), mustGUID(t), `C:\x.vhdx`); err == nil || + !strings.Contains(err.Error(), "not found") { + t.Fatalf("expected not-found error, got %v", err) + } + }) + t.Run("disk not found", func(t *testing.T) { + if err := noDisk.UpdateDiskHostPath(t.Context(), resID, `C:\x.vhdx`); err == nil || + !strings.Contains(err.Error(), "disk for reservation") { + t.Fatalf("expected disk-not-found error, got %v", err) + } + }) + t.Run("same path is a no-op", func(t *testing.T) { + c, id := importedReserved(t) + if err := c.UpdateDiskHostPath(t.Context(), id, defaultDiskConfig().HostPath); err != nil { + t.Fatalf("unexpected error: %v", err) + } + }) + t.Run("rejected after resume", func(t *testing.T) { + c, id := importedReserved(t) + c.Resume(t.Context(), &mockVMOps{}, newMockGuestOps()) + if err := c.UpdateDiskHostPath(t.Context(), id, `C:\x.vhdx`); err == nil || + !strings.Contains(err.Error(), "migrating") { + t.Fatalf("expected migrating error, got %v", err) + } + }) +} diff --git a/internal/controller/device/vpci/save.go b/internal/controller/device/vpci/save.go new file mode 100644 index 0000000000..11ad361dad --- /dev/null +++ b/internal/controller/device/vpci/save.go @@ -0,0 +1,20 @@ +//go:build windows && (lcow || wcow) + +package vpci + +import ( + "fmt" +) + +// Save is not yet supported for the VPCI sub-controller; any tracked state +// indicates a live-migration scenario the controller cannot represent. 
+func (c *Controller) Save() error { + c.mu.Lock() + defer c.mu.Unlock() + + if len(c.devices) > 0 { + return fmt.Errorf("vpci controller save not supported: %d devices", len(c.devices)) + } + + return nil +} diff --git a/internal/controller/device/vpci/save_lcow_test.go b/internal/controller/device/vpci/save_lcow_test.go new file mode 100644 index 0000000000..3b5e28991c --- /dev/null +++ b/internal/controller/device/vpci/save_lcow_test.go @@ -0,0 +1,34 @@ +//go:build windows && (lcow || wcow) + +package vpci + +import ( + "testing" + + "github.com/Microsoft/go-winio/pkg/guid" +) + +func TestSave_EmptyOK(t *testing.T) { + c := &Controller{ + devices: map[guid.GUID]*deviceInfo{}, + deviceToGUID: map[Device]guid.GUID{}, + } + + if err := c.Save(); err != nil { + t.Fatalf("Save on empty controller: %v", err) + } +} + +func TestSave_NonEmptyErrors(t *testing.T) { + g := guid.GUID{} + dev := Device{DeviceInstanceID: "PCI\\VEN_X"} + + c := &Controller{ + devices: map[guid.GUID]*deviceInfo{g: {device: dev, vmBusGUID: g, state: StateReady, refCount: 1}}, + deviceToGUID: map[Device]guid.GUID{dev: g}, + } + + if err := c.Save(); err == nil { + t.Fatal("expected Save to error when devices are present") + } +} diff --git a/internal/controller/linuxcontainer/container.go b/internal/controller/linuxcontainer/container.go index 03320485f6..d28ff65a51 100644 --- a/internal/controller/linuxcontainer/container.go +++ b/internal/controller/linuxcontainer/container.go @@ -21,6 +21,7 @@ import ( "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" "github.com/Microsoft/hcsshim/internal/signals" + "github.com/Microsoft/hcsshim/internal/vm/guestmanager" "github.com/Microsoft/hcsshim/internal/vm/vmutils" "github.com/Microsoft/go-winio/pkg/guid" @@ -225,7 +226,9 @@ func (c *Controller) closeContainer() { // state went with it) or the VM itself has exited (so host-side state went // with it). 
Either way, teardown can move on instead of retrying. func isResourceAlreadyReleased(err error) bool { - return errors.Is(err, gcs.ErrBridgeClosed) || vmutils.IsVMNotAvailableError(err) + return errors.Is(err, gcs.ErrBridgeClosed) || + errors.Is(err, guestmanager.ErrGuestConnectionUnavailable) || + vmutils.IsVMNotAvailableError(err) } // releaseResources undoes each allocation in reverse order. @@ -299,16 +302,19 @@ func (c *Controller) releaseResources(ctx context.Context) error { // After layer overlay has been removed, we can safely delete the // bundle path inside the UVM for the container. Therefore, delete - // the guest-side container state if supported. - if !c.isContainerStateDeleted && c.guest.Capabilities().IsDeleteContainerStateSupported() { - // GCS bridge evicts the container from its host-state map even if the inner Delete fails, - // so retries will always return not-found. - if err := c.guest.DeleteContainerState(ctx, c.gcsContainerID); err != nil && !isResourceAlreadyReleased(err) { - return fmt.Errorf("delete container state: %w", err) - } + // the guest-side container state if supported. Short-circuit on + // isContainerStateDeleted so retries don't re-issue Capabilities. + if !c.isContainerStateDeleted { + if caps := c.guest.Capabilities(); caps != nil && caps.IsDeleteContainerStateSupported() { + // GCS bridge evicts the container from its host-state map even if the inner Delete fails, + // so retries will always return not-found. + if err := c.guest.DeleteContainerState(ctx, c.gcsContainerID); err != nil && !isResourceAlreadyReleased(err) { + return fmt.Errorf("delete container state: %w", err) + } - // Set isContainerStateDeleted to true so that we do not retry this post successful delete. - c.isContainerStateDeleted = true + // Set isContainerStateDeleted to true so that we do not retry this post successful delete. 
+ c.isContainerStateDeleted = true + } } return nil @@ -542,20 +548,31 @@ func (c *Controller) KillProcess(ctx context.Context, execID string, signal uint return fmt.Errorf("cannot signal all for non-empty exec %q: %w", execID, errdefs.ErrFailedPrecondition) } - signalsSupported := c.guest.Capabilities().IsSignalProcessSupported() - signalOptions, err := signals.ValidateLCOW(int(signal), signalsSupported) - if err != nil { - return fmt.Errorf("validate signal %d for container %s: %w", signal, c.containerID, err) - } - c.mu.Lock() defer c.mu.Unlock() - // The container must have been created for any process to exist. - if c.state == StateNotCreated { + // The container must have been created (and not be mid-migration on either + // the source or destination) for any process to exist or for c.guest to be + // safe to dereference. + if c.state == StateNotCreated || c.state == StateDestinationMigrating || c.state == StateSourceMigrating { return fmt.Errorf("container %s is in state %s; cannot kill: %w", c.containerID, c.state, errdefs.ErrFailedPrecondition) } + // Already terminal (e.g. aborted destination-migrated container): no-op + // rather than dereference c.guest, which may be nil. + if c.state == StateStopped || c.state == StateInvalid { + return nil + } + + signalsSupported := false + if caps := c.guest.Capabilities(); caps != nil { + signalsSupported = caps.IsSignalProcessSupported() + } + signalOptions, err := signals.ValidateLCOW(int(signal), signalsSupported) + if err != nil { + return fmt.Errorf("validate signal %d for container %s: %w", signal, c.containerID, err) + } + // When "all" is requested, deliver the signal to every additional exec // on a best-effort basis. Errors are logged but do not prevent the // target process from being signaled. @@ -602,8 +619,9 @@ func (c *Controller) DeleteProcess(ctx context.Context, execID string) (*task.St c.mu.Lock() defer c.mu.Unlock() - // The container must have been created for any process to exist. 
- if c.state == StateNotCreated { + // The container must have been created (and not be mid-migration on either + // the source or destination) for any process to exist. + if c.state == StateNotCreated || c.state == StateDestinationMigrating || c.state == StateSourceMigrating { return nil, fmt.Errorf("container %s is in state %s; cannot delete process: %w", c.containerID, c.state, errdefs.ErrFailedPrecondition) } @@ -626,11 +644,12 @@ func (c *Controller) DeleteProcess(ctx context.Context, execID string) (*task.St // For containers that were created but never started, handleInitProcessExit // was never launched, so closeContainer was never called. Perform full // teardown now. closeContainer is retriable. - if err = c.releaseResources(ctx); err != nil { - return nil, fmt.Errorf("releasing resources for container %s: %w", c.containerID, err) + if c.guest != nil { + if err = c.releaseResources(ctx); err != nil { + return nil, fmt.Errorf("releasing resources for container %s: %w", c.containerID, err) + } + c.closeContainer() } - // Close container handle after the resources are released. - c.closeContainer() } // Remove the process entry only after all fallible operations have diff --git a/internal/controller/linuxcontainer/container_test.go b/internal/controller/linuxcontainer/container_test.go index 08169890e9..8350161788 100644 --- a/internal/controller/linuxcontainer/container_test.go +++ b/internal/controller/linuxcontainer/container_test.go @@ -18,6 +18,7 @@ import ( hcs "github.com/Microsoft/hcsshim/internal/hcs/v2" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" "github.com/Microsoft/hcsshim/internal/signals" + "github.com/Microsoft/hcsshim/internal/vm/guestmanager" "github.com/Microsoft/go-winio/pkg/guid" "github.com/containerd/errdefs" @@ -356,20 +357,33 @@ func TestKillProcess_InvalidSignal(t *testing.T) { // when the container has not been created yet. 
func TestKillProcess_NotCreatedState(t *testing.T) { t.Parallel() - c, _, _, _, guestCtrl := newContainerTestController(t) + c, _, _, _, _ := newContainerTestController(t) c.state = StateNotCreated - // SIGTERM (15) with no signal support returns nil signal options. - guestCtrl.EXPECT(). - Capabilities(). - Return(&gcs.LCOWGuestDefinedCapabilities{}) - err := c.KillProcess(t.Context(), "", 15, false) if !errors.Is(err, errdefs.ErrFailedPrecondition) { t.Errorf("KillProcess() error = %v, want ErrFailedPrecondition", err) } } +// TestKillProcess_FrozenWhileMigrating verifies that a container frozen mid +// migration (source or destination) rejects kills. +func TestKillProcess_FrozenWhileMigrating(t *testing.T) { + t.Parallel() + for _, state := range []State{StateSourceMigrating, StateDestinationMigrating} { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + c, _, _, _, _ := newContainerTestController(t) + c.state = state + + err := c.KillProcess(t.Context(), "", 15, false) + if !errors.Is(err, errdefs.ErrFailedPrecondition) { + t.Errorf("KillProcess() error = %v, want ErrFailedPrecondition", err) + } + }) + } +} + // TestKillProcess_ProcessNotFound verifies that KillProcess returns ErrNotFound // when the target exec ID does not exist. func TestKillProcess_ProcessNotFound(t *testing.T) { @@ -402,6 +416,24 @@ func TestDeleteProcess_NotCreatedState(t *testing.T) { } } +// TestDeleteProcess_FrozenWhileMigrating verifies that a container frozen mid +// migration (source or destination) rejects process deletion. 
+func TestDeleteProcess_FrozenWhileMigrating(t *testing.T) { + t.Parallel() + for _, state := range []State{StateSourceMigrating, StateDestinationMigrating} { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + c, _, _, _, _ := newContainerTestController(t) + c.state = state + + _, err := c.DeleteProcess(t.Context(), "exec-1") + if !errors.Is(err, errdefs.ErrFailedPrecondition) { + t.Errorf("DeleteProcess() error = %v, want ErrFailedPrecondition", err) + } + }) + } +} + // TestDeleteProcess_ProcessNotFound verifies that DeleteProcess returns // ErrNotFound when the target exec ID does not exist. func TestDeleteProcess_ProcessNotFound(t *testing.T) { @@ -621,6 +653,7 @@ func TestReleaseResources_StopsOnFirstError(t *testing.T) { }{ {name: "RealError_StopsChain", scsiErr: errUnmapSCSI, wantStops: true}, {name: "BridgeClosed_Continues", scsiErr: fmt.Errorf("transport gone: %w", gcs.ErrBridgeClosed)}, + {name: "GuestConnectionUnavailable_Continues", scsiErr: fmt.Errorf("guest RPC: %w", guestmanager.ErrGuestConnectionUnavailable)}, {name: "ComputeSystemDoesNotExist_Continues", scsiErr: fmt.Errorf("hcs::System::Modify: %w", hcs.ErrComputeSystemDoesNotExist)}, {name: "VmcomputeAlreadyStopped_Continues", scsiErr: fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeAlreadyStopped)}, {name: "VmcomputeOperationInvalidState_Continues", scsiErr: fmt.Errorf("hcs::System::Modify: %w", hcs.ErrVmcomputeOperationInvalidState)}, @@ -908,6 +941,7 @@ func TestReleaseResources_DeleteContainerState_ToleratesAlreadyGone(t *testing.T err error }{ {name: "BridgeClosed", err: fmt.Errorf("transport gone: %w", gcs.ErrBridgeClosed)}, + {name: "GuestConnectionUnavailable", err: fmt.Errorf("guest RPC failure: %w", guestmanager.ErrGuestConnectionUnavailable)}, {name: "ComputeSystemDoesNotExist", err: fmt.Errorf("guest RPC failure: %w", hcs.ErrComputeSystemDoesNotExist)}, {name: "VmcomputeAlreadyStopped", err: fmt.Errorf("guest RPC failure: %w", hcs.ErrVmcomputeAlreadyStopped)}, {name: 
"VmcomputeOperationInvalidState", err: fmt.Errorf("guest RPC failure: %w", hcs.ErrVmcomputeOperationInvalidState)}, @@ -1024,35 +1058,56 @@ func TestStats_WrongState(t *testing.T) { // --- KillProcess (additional state and flow tests) --- // TestKillProcess_AllowedInPostCreatedStates verifies that KillProcess does -// not reject containers in StateCreated or StateStopped on container-level -// state grounds. Errors that surface from the underlying process controller -// (which here is in StateNotCreated) are tolerated; the test only asserts -// that the container's own "cannot kill" precondition does not fire. +// not reject containers in StateCreated on container-level state grounds. +// Errors that surface from the underlying process controller (which here is +// in StateNotCreated) are tolerated; the test only asserts that the +// container's own "cannot kill" precondition does not fire. func TestKillProcess_AllowedInPostCreatedStates(t *testing.T) { + t.Parallel() + c, _, _, _, guestCtrl := newContainerTestController(t) + c.state = StateCreated + c.processes[""] = process.New(testContainerID, "", nil, 0) + + guestCtrl.EXPECT(). + Capabilities(). + Return(&gcs.LCOWGuestDefinedCapabilities{}) + + // SIGTERM (15) with no signal support returns nil options. + err := c.KillProcess(t.Context(), "", 15, false) + if err != nil && strings.Contains(err.Error(), "cannot kill") { + t.Errorf("KillProcess should not reject %s containers, got: %v", c.state, err) + } +} + +// TestKillProcess_TerminalStatesAreNoOp verifies that KillProcess short-circuits +// when the container has already reached a terminal state (StateStopped or +// StateInvalid). In these states, the underlying processes are gone and +// c.guest may be nil (e.g. for a container imported during live migration +// and aborted on the destination-stop path without ever calling Resume). +// Kill must be a no-op so CRI's stop flow does not fail on an already-dead +// workload — and must not dereference c.guest. 
+func TestKillProcess_TerminalStatesAreNoOp(t *testing.T) { t.Parallel() tests := []struct { name string state State }{ - {name: "created", state: StateCreated}, {name: "stopped", state: StateStopped}, + {name: "invalid", state: StateInvalid}, } for _, tc := range tests { t.Run(tc.name, func(t *testing.T) { t.Parallel() - c, _, _, _, guestCtrl := newContainerTestController(t) + c, _, _, _, _ := newContainerTestController(t) c.state = tc.state c.processes[""] = process.New(testContainerID, "", nil, 0) + // Simulate the post-AbortMigrated / aborted-import condition + // where c.guest is nil. KillProcess must NOT dereference it. + c.guest = nil - guestCtrl.EXPECT(). - Capabilities(). - Return(&gcs.LCOWGuestDefinedCapabilities{}) - - // SIGTERM (15) with no signal support returns nil options. - err := c.KillProcess(t.Context(), "", 15, false) - if err != nil && strings.Contains(err.Error(), "cannot kill") { - t.Errorf("KillProcess should not reject %s containers, got: %v", tc.state, err) + if err := c.KillProcess(t.Context(), "", 15, false); err != nil { + t.Errorf("KillProcess(%s) should be a no-op, got: %v", tc.state, err) } }) } diff --git a/internal/controller/linuxcontainer/doc.go b/internal/controller/linuxcontainer/doc.go index fc36d43939..b8c4d8aa00 100644 --- a/internal/controller/linuxcontainer/doc.go +++ b/internal/controller/linuxcontainer/doc.go @@ -6,6 +6,14 @@ // It coordinates host-side resource allocation (SCSI layers, Plan9 shares, vPCI devices), // guest-side container creation via the GCS (Guest Compute Service), and process management. 
// +// Live-migration entry points are provided on both sides: the source freezes a +// running container via [Controller.Save] (state snapshot), while the +// destination rehydrates it via [Import] (state-only rehydration), +// [Controller.Patch] (repoints the imported state at the destination host's disks +// and IO), and [Controller.Resume] (binds the live VM, guest, and devices once the +// destination VM is running). [Controller.Resume] is also the source's rollback, +// lifting the freeze back to the running state. +// // # Lifecycle // // A container follows the state machine below. @@ -29,6 +37,23 @@ // │ StateStopped │ // └──────────────┘ // +// Live migration adds two branches. The destination rehydrates a container via +// [Import] into [StateDestinationMigrating], rejoining the live states only after +// [Controller.Resume] binds the live dependencies, or being discarded via +// [Controller.AbortMigrated]. The source freezes a running container via +// [Controller.Save] into [StateSourceMigrating]; [Controller.Resume] rolls it back +// to running, or its init-process exit on source VM teardown stops it: +// +// destination source +// ┌───────────────────────────┐ ┌──────────────────────┐ +// │ StateDestinationMigrating │ │ StateSourceMigrating │ +// └───┬───────────────────┬───┘ └───┬──────────────┬───┘ +// │ Resume │ AbortMigrated │ Resume │ init exit +// ▼ ▼ ▼ ▼ +// ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +// │ StateRunning │ │ StateStopped │ │ StateRunning │ │ StateStopped │ +// └──────────────┘ └──────────────┘ └──────────────┘ └──────────────┘ +// // State descriptions: // // - [StateNotCreated]: initial state; no resources have been allocated. @@ -42,6 +67,14 @@ // mid-way; host-side resources are released. If the failure occurred after the // GCS container was successfully created, guest-side state may still require // cleanup via [Controller.DeleteProcess]. 
+// - [StateDestinationMigrating]: initial state for [Import] on the destination; +// the live VM, guest, and device controllers are not yet bound. [Controller.Resume] +// binds them and moves to [StateRunning], while [Controller.AbortMigrated] discards +// the import and moves to [StateStopped]. +// - [StateSourceMigrating]: a running container frozen by [Controller.Save] on the +// source while a migration is in flight. [Controller.Resume] rolls it back to +// [StateRunning]; if the source VM is torn down, the init process exit moves it to +// [StateStopped]. // // # Resource Allocation // diff --git a/internal/controller/linuxcontainer/document.go b/internal/controller/linuxcontainer/document.go index b89c1a112b..3f3c26fe70 100644 --- a/internal/controller/linuxcontainer/document.go +++ b/internal/controller/linuxcontainer/document.go @@ -74,6 +74,10 @@ func (c *Controller) generateContainerDocument( } linuxSpec.Root.Path = c.layers.rootfsPath + // Rewrite sandbox-id to the GCS-known pod ID, which differs from the + // spec's pod ID after live migration. No-op pre-migration. + rewriteSandboxIDAnnotation(linuxSpec, c.gcsPodID) + return &vmHostedContainerSettingsV2{ SchemaVersion: schemaversion.SchemaV21(), OCIBundlePath: ospath.Join("linux", guestpath.LCOWV2RootPrefixInVM, c.gcsPodID, c.gcsContainerID), @@ -82,6 +86,18 @@ func (c *Controller) generateContainerDocument( }, nil } +// rewriteSandboxIDAnnotation sets [annotations.KubernetesSandboxID] to gcsPodID +// if the annotation is present. +func rewriteSandboxIDAnnotation(spec *specs.Spec, gcsPodID string) { + if spec == nil || spec.Annotations == nil { + return + } + if _, ok := spec.Annotations[annotations.KubernetesSandboxID]; !ok { + return + } + spec.Annotations[annotations.KubernetesSandboxID] = gcsPodID +} + // sanitizeSpec deep-copies the OCI spec and strips fields unsupported by the GCS. 
func sanitizeSpec(ctx context.Context, origSpec *specs.Spec) (*specs.Spec, error) { // Deep copy via JSON round-trip so mutations do not affect the caller. diff --git a/internal/controller/linuxcontainer/document_test.go b/internal/controller/linuxcontainer/document_test.go index 21c85cde53..809ef00f54 100644 --- a/internal/controller/linuxcontainer/document_test.go +++ b/internal/controller/linuxcontainer/document_test.go @@ -301,3 +301,76 @@ func TestGenerateContainerDocument_NilLinux(t *testing.T) { t.Fatal("expected error for nil Linux section") } } + +// TestRewriteSandboxIDAnnotation verifies the post-migration sandbox-id +// translation: rewrite when present, no-op otherwise. +func TestRewriteSandboxIDAnnotation(t *testing.T) { + t.Parallel() + + const ( + sourcePodID = "source-pod" + destinationPodID = "destination-pod" + ) + + tests := []struct { + name string + annotations map[string]string + wantPresent bool + wantValue string + }{ + { + name: "workload-container-sandbox-id-rewritten-to-gcs-pod-id", + annotations: map[string]string{ + annotations.KubernetesContainerType: "container", + annotations.KubernetesSandboxID: destinationPodID, + }, + wantPresent: true, + wantValue: sourcePodID, + }, + { + name: "no-sandbox-id-annotation-leaves-map-unchanged", + annotations: map[string]string{ + annotations.KubernetesContainerType: "sandbox", + }, + wantPresent: false, + }, + { + name: "nil-annotations-is-a-noop", + annotations: nil, + wantPresent: false, + }, + { + name: "non-migrated-pod-rewrite-is-idempotent", + annotations: map[string]string{ + annotations.KubernetesSandboxID: sourcePodID, + }, + wantPresent: true, + wantValue: sourcePodID, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + spec := &specs.Spec{Annotations: tt.annotations} + + rewriteSandboxIDAnnotation(spec, sourcePodID) + + got, present := spec.Annotations[annotations.KubernetesSandboxID] + if present != tt.wantPresent { + 
t.Fatalf("KubernetesSandboxID present = %v, want %v (annotations=%v)", + present, tt.wantPresent, spec.Annotations) + } + if present && got != tt.wantValue { + t.Errorf("KubernetesSandboxID = %q, want %q", got, tt.wantValue) + } + }) + } +} + +// TestRewriteSandboxIDAnnotation_NilSpec ensures the helper does not panic +// on a nil spec. +func TestRewriteSandboxIDAnnotation_NilSpec(t *testing.T) { + t.Parallel() + rewriteSandboxIDAnnotation(nil, "pod") +} diff --git a/internal/controller/linuxcontainer/mocks/mock_types.go b/internal/controller/linuxcontainer/mocks/mock_types.go index f73c117939..8ac8f156fa 100644 --- a/internal/controller/linuxcontainer/mocks/mock_types.go +++ b/internal/controller/linuxcontainer/mocks/mock_types.go @@ -107,6 +107,21 @@ func (mr *MockguestMockRecorder) DeleteContainerState(ctx, cid any) *gomock.Call return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteContainerState", reflect.TypeOf((*Mockguest)(nil).DeleteContainerState), ctx, cid) } +// OpenContainer mocks base method. +func (m *Mockguest) OpenContainer(ctx context.Context, cid string) (*gcs.Container, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "OpenContainer", ctx, cid) + ret0, _ := ret[0].(*gcs.Container) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// OpenContainer indicates an expected call of OpenContainer. +func (mr *MockguestMockRecorder) OpenContainer(ctx, cid any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "OpenContainer", reflect.TypeOf((*Mockguest)(nil).OpenContainer), ctx, cid) +} + // RemoveCombinedLayers mocks base method. 
func (m *Mockguest) RemoveCombinedLayers(ctx context.Context, settings guestresource.LCOWCombinedLayers) error { m.ctrl.T.Helper() @@ -189,6 +204,20 @@ func (mr *MockscsiControllerMockRecorder) UnmapFromGuest(ctx, reservation any) * return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UnmapFromGuest", reflect.TypeOf((*MockscsiController)(nil).UnmapFromGuest), ctx, reservation) } +// UpdateDiskHostPath mocks base method. +func (m *MockscsiController) UpdateDiskHostPath(ctx context.Context, reservationID guid.GUID, newPath string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "UpdateDiskHostPath", ctx, reservationID, newPath) + ret0, _ := ret[0].(error) + return ret0 +} + +// UpdateDiskHostPath indicates an expected call of UpdateDiskHostPath. +func (mr *MockscsiControllerMockRecorder) UpdateDiskHostPath(ctx, reservationID, newPath any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "UpdateDiskHostPath", reflect.TypeOf((*MockscsiController)(nil).UpdateDiskHostPath), ctx, reservationID, newPath) +} + // Mockplan9Controller is a mock of plan9Controller interface. 
type Mockplan9Controller struct { ctrl *gomock.Controller diff --git a/internal/controller/linuxcontainer/save.go b/internal/controller/linuxcontainer/save.go new file mode 100644 index 0000000000..ee62055c93 --- /dev/null +++ b/internal/controller/linuxcontainer/save.go @@ -0,0 +1,440 @@ +//go:build windows && lcow + +package linuxcontainer + +import ( + "context" + "fmt" + + lcsave "github.com/Microsoft/hcsshim/internal/controller/linuxcontainer/save" + "github.com/Microsoft/hcsshim/internal/controller/process" + "github.com/Microsoft/hcsshim/internal/layers" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + + "github.com/Microsoft/go-winio/pkg/guid" + eventstypes "github.com/containerd/containerd/api/events" + "github.com/containerd/containerd/api/runtime/task/v3" + "github.com/containerd/errdefs" + "github.com/sirupsen/logrus" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + "google.golang.org/protobuf/types/known/durationpb" +) + +// Save serializes a running container's current state into a portable +// envelope that can be handed to a migration destination. It succeeds only +// when the container is running, the single stable state a live migration +// can be performed from. On success the source is frozen until it is resumed. +func (c *Controller) Save(ctx context.Context) (*anypb.Any, error) { + c.mu.Lock() + defer c.mu.Unlock() + + // Only a running container is in a stable, migratable state. + if c.state != StateRunning { + return nil, fmt.Errorf("container %q in state %s; want %s", c.containerID, c.state, StateRunning) + } + + // Capture the container's scalar bookkeeping into the snapshot. 
+ state := &lcsave.Payload{ + SchemaVersion: lcsave.SchemaVersion, + ContainerID: c.containerID, + GcsContainerID: c.gcsContainerID, + IoRetryTimeout: durationpb.New(c.ioRetryTimeout), + ScsiReservationIds: guidsToStrings(c.scsiResources), + } + + // Record the rootfs layer reservations so the destination can re-create + // the same read-only and scratch disks. + if c.layers != nil { + ls := &lcsave.Layers{ + LayersCombined: c.layers.layersCombined, + RootfsPath: c.layers.rootfsPath, + Scratch: &lcsave.LayerReservation{ + ReservationID: c.layers.scratch.id.String(), + GuestPath: c.layers.scratch.guestPath, + }, + } + + ls.RoLayers = make([]*lcsave.LayerReservation, 0, len(c.layers.roLayers)) + for _, r := range c.layers.roLayers { + ls.RoLayers = append(ls.RoLayers, &lcsave.LayerReservation{ + ReservationID: r.id.String(), + GuestPath: r.guestPath, + }) + } + state.Layers = ls + } + + // Live migration only supports a container whose sole process is the + // init process; reject a missing init process or any additional execs. + if _, ok := c.processes[""]; !ok || len(c.processes) > 1 { + return nil, fmt.Errorf("container %q must have only the init process for live migration; has %d processes", c.containerID, len(c.processes)) + } + + // Snapshot the init process as an opaque payload the process controller + // owns end to end. + state.Processes = make(map[string]*anypb.Any, len(c.processes)) + for execID, p := range c.processes { + ps, err := p.Save(ctx) + if err != nil { + return nil, fmt.Errorf("save process %q/%q: %w", c.containerID, execID, err) + } + state.Processes[execID] = ps + } + + // Marshal and wrap the snapshot in a self-describing envelope. + payload, err := proto.Marshal(state) + if err != nil { + return nil, fmt.Errorf("marshal container saved state for %q: %w", c.containerID, err) + } + + // Freeze the source until the migration is resumed or its VM is torn down. 
+ c.state = StateSourceMigrating + + log.G(ctx).WithField(logfields.SourceContainerID, c.containerID).Debug("container controller saved state") + + return &anypb.Any{TypeUrl: lcsave.TypeURL, Value: payload}, nil +} + +// guidsToStrings encodes reservation GUIDs for the wire payload. +func guidsToStrings(in []guid.GUID) []string { + if len(in) == 0 { + return nil + } + out := make([]string, len(in)) + for i, g := range in { + out[i] = g.String() + } + return out +} + +// Import reconstructs a container from an envelope produced by +// [Controller.Save]. The returned container carries the saved state but is +// not yet bound to a running VM, guest, or device controllers, so +// operational calls are rejected until [Controller.Resume]. Child processes +// are imported too but must each be resumed individually by the caller. +func Import(ctx context.Context, env *anypb.Any) (*Controller, error) { + // Reject an empty or mistyped envelope before touching its bytes. + if env == nil { + return nil, fmt.Errorf("container saved-state envelope is nil") + } + + if env.GetTypeUrl() != lcsave.TypeURL { + return nil, fmt.Errorf("unsupported container saved-state type %q", env.GetTypeUrl()) + } + + // Decode and reject any payload this build cannot interpret. + state := &lcsave.Payload{} + if err := proto.Unmarshal(env.GetValue(), state); err != nil { + return nil, fmt.Errorf("unmarshal container saved state: %w", err) + } + + if v := state.GetSchemaVersion(); v != lcsave.SchemaVersion { + return nil, fmt.Errorf("unsupported container saved-state schema version %d (want %d)", v, lcsave.SchemaVersion) + } + + // Rehydrate into the destination-migrating state: state is restored but no + // live VM/guest/device interfaces are bound, so operational calls are + // rejected until Resume. 
+ c := &Controller{ + containerID: state.GetContainerID(), + gcsContainerID: state.GetGcsContainerID(), + state: StateDestinationMigrating, + ioRetryTimeout: state.GetIoRetryTimeout().AsDuration(), + plan9Resources: []guid.GUID{}, + devices: []guid.GUID{}, + processes: make(map[string]*process.Controller), + terminatedCh: make(chan struct{}), + } + + scsiIDs, err := stringsToGuids(state.GetScsiReservationIds()) + if err != nil { + return nil, fmt.Errorf("decode scsi reservation ids: %w", err) + } + c.scsiResources = scsiIDs + + // Rebuild the rootfs layer reservations captured at save time. + if l := state.GetLayers(); l != nil { + ls := &scsiLayers{ + layersCombined: l.GetLayersCombined(), + rootfsPath: l.GetRootfsPath(), + } + + if sc := l.GetScratch(); sc != nil { + id, err := guid.FromString(sc.GetReservationID()) + if err != nil { + return nil, fmt.Errorf("decode scratch reservation id: %w", err) + } + ls.scratch = scsiReservation{id: id, guestPath: sc.GetGuestPath()} + } + + for _, ro := range l.GetRoLayers() { + id, err := guid.FromString(ro.GetReservationID()) + if err != nil { + return nil, fmt.Errorf("decode ro layer reservation id: %w", err) + } + ls.roLayers = append(ls.roLayers, scsiReservation{id: id, guestPath: ro.GetGuestPath()}) + } + + c.layers = ls + } + + // Import each saved process into its own migrating controller. + // The caller resumes them individually. + for execID, ps := range state.GetProcesses() { + p, err := process.Import(ctx, ps, c.containerID) + if err != nil { + return nil, fmt.Errorf("import process %q/%q: %w", c.containerID, execID, err) + } + + c.processes[execID] = p + } + + log.G(ctx).WithField(logfields.SourceContainerID, c.containerID).Debug("container controller imported state") + + return c, nil +} + +// Resume brings a migrating container back to the running state. 
On the +// destination it binds the live VM, guest, and device controllers, reattaches +// the init process along with its IO, begins watching for the process to exit, +// and republishes a TaskCreate event so containerd treats the migrated task as +// running locally. On the source it simply lifts the freeze applied by Save, +// since the live bindings and running processes are still intact. +func (c *Controller) Resume( + ctx context.Context, + vmID string, + gcsPodID string, + guestMgr guest, + scsiCtrl scsiController, + plan9Ctrl plan9Controller, + vpci vPCIController, + events chan interface{}, +) error { + c.mu.Lock() + defer c.mu.Unlock() + + // Source rollback: bindings and running processes are intact, so just lift + // the freeze that Save applied. + if c.state == StateSourceMigrating { + c.state = StateRunning + return nil + } + + // Reopen the guest-side container that survived the move inside the UVM. + gcsContainer, err := guestMgr.OpenContainer(ctx, c.gcsContainerID) + if err != nil { + return fmt.Errorf("open gcs container %q: %w", c.gcsContainerID, err) + } + + initProc, ok := c.processes[""] + if !ok { + _ = gcsContainer.Close() + return fmt.Errorf("init process missing in container %q", c.containerID) + } + + // Reattach the init process to its live guest counterpart and rewire its + // IO. events is nil here because the container, not the process, owns + // publishing the init process's TaskExit, which handleInitProcessExit + // (started below) does after teardown. 
+ if err := initProc.Resume(ctx, gcsContainer, nil); err != nil { + _ = gcsContainer.Close() + return fmt.Errorf("resume init process in container %q: %w", c.gcsContainerID, err) + } + + c.vmID = vmID + c.gcsPodID = gcsPodID + c.guest = guestMgr + c.scsi = scsiCtrl + c.plan9 = plan9Ctrl + c.vpci = vpci + c.container = gcsContainer + c.state = StateRunning + + // Watch the init process so its exit tears the container down and + // publishes TaskExit, exactly as a freshly started container would. + go c.handleInitProcessExit(ctx, initProc, events) + + // Announce the migrated task to containerd using the bundle/IO/pid that + // Patch seeded from the destination's create request. + if events != nil { + status := initProc.Status(true) + + log.G(ctx).WithFields(logrus.Fields{ + logfields.ContainerID: c.containerID, + "pid": status.Pid, + }).Info("container.Resume: republishing TaskCreate") + + events <- &eventstypes.TaskCreate{ + ContainerID: c.containerID, + Bundle: status.Bundle, + IO: &eventstypes.TaskIO{ + Stdin: status.Stdin, + Stdout: status.Stdout, + Stderr: status.Stderr, + Terminal: status.Terminal, + }, + Pid: status.Pid, + } + } + + log.G(ctx).WithField(logfields.DestinationContainerID, c.containerID).Debug("container controller resumed state") + return nil +} + +// stringsToGuids decodes wire-format reservation GUIDs back into GUIDs. +func stringsToGuids(in []string) ([]guid.GUID, error) { + if len(in) == 0 { + return nil, nil + } + + out := make([]guid.GUID, 0, len(in)) + for _, s := range in { + g, err := guid.FromString(s) + if err != nil { + return nil, fmt.Errorf("parse guid %q: %w", s, err) + } + out = append(out, g) + } + + return out, nil +} + +// ContainerID returns the containerd-visible identifier for this container. 
+func (c *Controller) ContainerID() string { + c.mu.RLock() + defer c.mu.RUnlock() + + return c.containerID +} + +// Patch updates an imported container to match the destination host's create +// request, readying it for [Controller.Resume]. It repoints every layer +// reservation at the destination's local VHD paths, reopens the init +// process's IO and bundle, and finally adopts the destination container ID. +// It is valid only while the container is migrating, and only for a container +// whose sole process is the init process. +func (c *Controller) Patch(ctx context.Context, scsiCtrl scsiController, request *task.CreateTaskRequest) error { + c.mu.Lock() + defer c.mu.Unlock() + + if c.state != StateDestinationMigrating { + return fmt.Errorf("container %s is in state %s; cannot patch: %w", c.containerID, c.state, errdefs.ErrFailedPrecondition) + } + + // Repoint every saved layer reservation at the destination host's VHDs + // so future SCSI operations resolve to local disks. + if c.layers != nil { + if scsiCtrl == nil { + return fmt.Errorf("scsi controller is required to patch container %q layers", c.containerID) + } + + lcowLayers, err := layers.ParseLCOWLayers(request.Rootfs, nil) + if err != nil { + return fmt.Errorf("parse destination lcow layers: %w", err) + } + + if got, want := len(lcowLayers.Layers), len(c.layers.roLayers); got != want { + return fmt.Errorf("ro layer count mismatch: got %d, want %d", got, want) + } + + for i, ro := range c.layers.roLayers { + // Resolve to the canonical (volume-prefixed) path, exactly as + // allocateLayers did at create time, so the SCSI controller keys + // this disk the same way and dedupes later Reserve calls for it. + hp, err := resolvePath(lcowLayers.Layers[i].VHDPath) + if err != nil { + return fmt.Errorf("resolve ro layer %d host path: %w", i, err) + } + + // Update the disk path to local VHD on this destination host. 
+ if err := scsiCtrl.UpdateDiskHostPath(ctx, ro.id, hp); err != nil { + return fmt.Errorf("patch ro layer %d: %w", i, err) + } + } + + scratchHP, err := resolvePath(lcowLayers.ScratchVHDPath) + if err != nil { + return fmt.Errorf("resolve scratch host path: %w", err) + } + + // Update the disk path to local VHD on this destination host. + if err := scsiCtrl.UpdateDiskHostPath(ctx, c.layers.scratch.id, scratchHP); err != nil { + return fmt.Errorf("patch scratch layer: %w", err) + } + } + + // Re-establish IO for the init process from the destination's request. + // Live migration carries only the init process: execs are rejected at the + // source, and the destination's CreateTaskRequest describes the init + // process alone, so it is the only process we can patch here. + initProc, ok := c.processes[""] + if !ok { + return fmt.Errorf("init process missing in container %q", c.containerID) + } + + if len(c.processes) > 1 { + return fmt.Errorf("container %q has %d processes; live migration only supports the init process", c.containerID, len(c.processes)) + } + + if err := initProc.Patch(ctx, request.ID, &process.CreateOptions{ + Bundle: request.Bundle, + Terminal: request.Terminal, + Stdin: request.Stdin, + Stdout: request.Stdout, + Stderr: request.Stderr, + }); err != nil { + return fmt.Errorf("patch init process in container %q: %w", c.containerID, err) + } + + log.G(ctx).WithFields(logrus.Fields{ + logfields.SourceContainerID: c.containerID, + logfields.DestinationContainerID: request.ID, + }).Debug("patched container resource paths") + + // Adopt the destination's container ID last so a partial failure above + // leaves the controller addressable for a retry. + c.containerID = request.ID + + return nil +} + +// AbortMigrated discards an imported-but-never-resumed container: it drains +// each imported process, marks the container stopped, and publishes a +// synthetic TaskExit so containerd will accept a Delete. 
It is a no-op once +// the container has left the migrating state. +func (c *Controller) AbortMigrated(ctx context.Context, events chan interface{}) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.state != StateDestinationMigrating { + return + } + + log.G(ctx).WithField(logfields.DestinationContainerID, c.containerID).Debug("aborting migrated container") + + // Tear down each imported process and the container handle before + // reporting the container as exited. + for _, proc := range c.processes { + proc.AbortMigrated(ctx) + } + + c.state = StateStopped + c.closeContainer() + + // Emit a synthetic exit for the init process so containerd unblocks Delete. + initProc := c.processes[""] + if events == nil || initProc == nil { + return + } + + status := initProc.Status(true) + events <- &eventstypes.TaskExit{ + ContainerID: c.containerID, + ID: status.ExecID, + Pid: status.Pid, + ExitStatus: status.ExitStatus, + ExitedAt: status.ExitedAt, + } +} diff --git a/internal/controller/linuxcontainer/save_test.go b/internal/controller/linuxcontainer/save_test.go new file mode 100644 index 0000000000..7d4495f958 --- /dev/null +++ b/internal/controller/linuxcontainer/save_test.go @@ -0,0 +1,642 @@ +//go:build windows && lcow + +package linuxcontainer + +import ( + "errors" + "testing" + "time" + + "github.com/Microsoft/hcsshim/internal/controller/linuxcontainer/mocks" + lcsave "github.com/Microsoft/hcsshim/internal/controller/linuxcontainer/save" + "github.com/Microsoft/hcsshim/internal/controller/process" + procsave "github.com/Microsoft/hcsshim/internal/controller/process/save" + + "github.com/Microsoft/go-winio/pkg/guid" + eventstypes "github.com/containerd/containerd/api/events" + "github.com/containerd/containerd/api/runtime/task/v3" + containerdtypes "github.com/containerd/containerd/api/types" + "github.com/containerd/errdefs" + "go.uber.org/mock/gomock" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + 
"google.golang.org/protobuf/types/known/durationpb" +) + +var errOpenContainer = errors.New("open container failed") + +// buildProcessEnvelope wraps a minimal, valid process payload for the given +// exec ID so the container importer can reconstruct a migrating process. +func buildProcessEnvelope(t *testing.T, execID string) *anypb.Any { + t.Helper() + value, err := proto.Marshal(&procsave.Payload{ + SchemaVersion: procsave.SchemaVersion, + ExecID: execID, + IoRetryTimeout: durationpb.New(time.Second), + }) + if err != nil { + t.Fatalf("marshal process payload = %v", err) + } + return &anypb.Any{TypeUrl: procsave.TypeURL, Value: value} +} + +// importedInitProcess returns an init process controller restored into the +// migrating state, ready to be patched, resumed, or aborted. +func importedInitProcess(t *testing.T) *process.Controller { + t.Helper() + p, err := process.Import(t.Context(), buildProcessEnvelope(t, ""), testContainerID) + if err != nil { + t.Fatalf("import init process = %v", err) + } + return p +} + +// patchedInitProcess returns a migrating init process whose IO has been opened, +// mirroring an imported-and-patched-but-never-resumed process. Empty IO paths +// avoid real named-pipe connections. +func patchedInitProcess(t *testing.T) *process.Controller { + t.Helper() + p := importedInitProcess(t) + if err := p.Patch(t.Context(), testContainerID, &process.CreateOptions{}); err != nil { + t.Fatalf("patch init process = %v", err) + } + return p +} + +// baseContainerPayload returns a fully valid container payload that individual +// tests mutate to exercise specific decode failures. 
+func baseContainerPayload(t *testing.T) *lcsave.Payload { + t.Helper() + scsiGUID, _ := guid.NewV4() + scratchGUID, _ := guid.NewV4() + roGUID, _ := guid.NewV4() + return &lcsave.Payload{ + SchemaVersion: lcsave.SchemaVersion, + ContainerID: "src-ctr", + GcsContainerID: "gcs-ctr", + IoRetryTimeout: durationpb.New(2 * time.Second), + ScsiReservationIds: []string{scsiGUID.String()}, + Layers: &lcsave.Layers{ + LayersCombined: true, + RootfsPath: "/rootfs", + Scratch: &lcsave.LayerReservation{ReservationID: scratchGUID.String(), GuestPath: "/dev/scratch"}, + RoLayers: []*lcsave.LayerReservation{{ReservationID: roGUID.String(), GuestPath: "/dev/ro0"}}, + }, + Processes: map[string]*anypb.Any{"": buildProcessEnvelope(t, "")}, + } +} + +// containerEnvelope marshals a payload into the self-describing wire envelope. +func containerEnvelope(t *testing.T, p *lcsave.Payload) *anypb.Any { + t.Helper() + value, err := proto.Marshal(p) + if err != nil { + t.Fatalf("marshal container payload = %v", err) + } + return &anypb.Any{TypeUrl: lcsave.TypeURL, Value: value} +} + +// --- Save --- + +// TestSave_WrongState verifies that only a running container can be saved. +func TestSave_WrongState(t *testing.T) { + t.Parallel() + invalidStates := []State{StateNotCreated, StateCreated, StateStopped, StateInvalid, StateDestinationMigrating, StateSourceMigrating} + + for _, state := range invalidStates { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + c, _, _, _, _ := newContainerTestController(t) + c.state = state + + if _, err := c.Save(t.Context()); err == nil { + t.Errorf("Save() = nil; want error for state %s", state) + } + }) + } +} + +// TestSave_ProcessConstraints verifies that Save rejects a running container +// unless its sole process is the init process. 
+func TestSave_ProcessConstraints(t *testing.T) { + t.Parallel() + tests := []struct { + name string + seedProc func(c *Controller) + }{ + {name: "no init process", seedProc: func(c *Controller) { + c.processes["exec-1"] = process.New(testContainerID, "exec-1", nil, 0) + }}, + {name: "extra exec process", seedProc: func(c *Controller) { + c.processes[""] = process.New(testContainerID, "", nil, 0) + c.processes["exec-1"] = process.New(testContainerID, "exec-1", nil, 0) + }}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + c, _, _, _, _ := newContainerTestController(t) + c.state = StateRunning + tc.seedProc(c) + + if _, err := c.Save(t.Context()); err == nil { + t.Error("Save() = nil; want error for invalid process set") + } + }) + } +} + +// TestSave_ProcessSaveFails verifies that a failure to save the init process +// (here, because it is not running) surfaces from Save. +func TestSave_ProcessSaveFails(t *testing.T) { + t.Parallel() + c, _, _, _, _ := newContainerTestController(t) + c.state = StateRunning + // process.New starts in StateNotCreated, so its Save fails. + c.processes[""] = process.New(testContainerID, "", nil, 0) + + if _, err := c.Save(t.Context()); err == nil { + t.Error("Save() = nil; want error when init process cannot be saved") + } +} + +// --- Import --- + +// TestImport_InvalidEnvelope verifies that Import rejects malformed or +// incompatible envelopes. 
+func TestImport_InvalidEnvelope(t *testing.T) { + t.Parallel() + badVersion := containerEnvelope(t, &lcsave.Payload{SchemaVersion: lcsave.SchemaVersion + 1}) + + tests := []struct { + name string + env *anypb.Any + }{ + {name: "nil envelope", env: nil}, + {name: "wrong type url", env: &anypb.Any{TypeUrl: "type.microsoft.com/other"}}, + {name: "undecodable value", env: &anypb.Any{TypeUrl: lcsave.TypeURL, Value: []byte{0x08, 0xff}}}, + {name: "schema version mismatch", env: badVersion}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + if _, err := Import(t.Context(), tc.env); err == nil { + t.Error("Import() = nil; want error") + } + }) + } +} + +// TestImport_InvalidGUIDs verifies that Import rejects any reservation GUID it +// cannot decode, whether in the scsi list, scratch, or a read-only layer. +func TestImport_InvalidGUIDs(t *testing.T) { + t.Parallel() + tests := []struct { + name string + mutate func(p *lcsave.Payload) + }{ + {name: "bad scsi id", mutate: func(p *lcsave.Payload) { p.ScsiReservationIds = []string{"not-a-guid"} }}, + {name: "bad scratch id", mutate: func(p *lcsave.Payload) { p.Layers.Scratch.ReservationID = "not-a-guid" }}, + {name: "bad ro layer id", mutate: func(p *lcsave.Payload) { p.Layers.RoLayers[0].ReservationID = "not-a-guid" }}, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + p := baseContainerPayload(t) + tc.mutate(p) + + if _, err := Import(t.Context(), containerEnvelope(t, p)); err == nil { + t.Error("Import() = nil; want error for invalid GUID") + } + }) + } +} + +// TestImport_ProcessImportFails verifies that a bad embedded process envelope +// fails the whole container import. 
+func TestImport_ProcessImportFails(t *testing.T) { + t.Parallel() + p := baseContainerPayload(t) + p.Processes[""] = &anypb.Any{TypeUrl: "type.microsoft.com/other"} + + if _, err := Import(t.Context(), containerEnvelope(t, p)); err == nil { + t.Error("Import() = nil; want error for unimportable process") + } +} + +// TestImport_Succeeds verifies that Import reconstructs a migrating container +// carrying every saved field, including layers and the init process. +func TestImport_Succeeds(t *testing.T) { + t.Parallel() + p := baseContainerPayload(t) + + c, err := Import(t.Context(), containerEnvelope(t, p)) + if err != nil { + t.Fatalf("Import() = %v; want nil", err) + } + if c.state != StateDestinationMigrating { + t.Errorf("state = %s; want StateDestinationMigrating", c.state) + } + if c.containerID != p.ContainerID { + t.Errorf("containerID = %q; want %q", c.containerID, p.ContainerID) + } + if c.gcsContainerID != p.GcsContainerID { + t.Errorf("gcsContainerID = %q; want %q", c.gcsContainerID, p.GcsContainerID) + } + if c.ioRetryTimeout != 2*time.Second { + t.Errorf("ioRetryTimeout = %s; want 2s", c.ioRetryTimeout) + } + if len(c.scsiResources) != 1 || c.scsiResources[0].String() != p.ScsiReservationIds[0] { + t.Errorf("scsiResources = %v; want %v", c.scsiResources, p.ScsiReservationIds) + } + if c.layers == nil { + t.Fatal("layers must be restored") + } + if !c.layers.layersCombined || c.layers.rootfsPath != "/rootfs" { + t.Errorf("layers = %+v; want combined with rootfs /rootfs", c.layers) + } + if c.layers.scratch.id.String() != p.Layers.Scratch.ReservationID || c.layers.scratch.guestPath != "/dev/scratch" { + t.Errorf("scratch = %+v; want %s", c.layers.scratch, p.Layers.Scratch.ReservationID) + } + if len(c.layers.roLayers) != 1 || c.layers.roLayers[0].id.String() != p.Layers.RoLayers[0].ReservationID { + t.Errorf("roLayers = %+v; want %s", c.layers.roLayers, p.Layers.RoLayers[0].ReservationID) + } + if _, ok := c.processes[""]; !ok { + t.Error("init process 
must be imported") + } + if c.terminatedCh == nil { + t.Error("terminatedCh must be non-nil after Import") + } +} + +// TestImport_NoLayers verifies that a payload without layers imports cleanly. +func TestImport_NoLayers(t *testing.T) { + t.Parallel() + p := baseContainerPayload(t) + p.Layers = nil + + c, err := Import(t.Context(), containerEnvelope(t, p)) + if err != nil { + t.Fatalf("Import() = %v; want nil", err) + } + if c.layers != nil { + t.Errorf("layers = %+v; want nil", c.layers) + } +} + +// --- Patch --- + +// TestPatch_WrongState verifies that Patch only operates on a destination-migrating container. +func TestPatch_WrongState(t *testing.T) { + t.Parallel() + invalidStates := []State{StateNotCreated, StateCreated, StateRunning, StateStopped, StateInvalid, StateSourceMigrating} + + for _, state := range invalidStates { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + c, scsiCtrl, _, _, _ := newContainerTestController(t) + c.state = state + + err := c.Patch(t.Context(), scsiCtrl, &task.CreateTaskRequest{ID: "dest"}) + if !errors.Is(err, errdefs.ErrFailedPrecondition) { + t.Errorf("Patch() = %v; want ErrFailedPrecondition", err) + } + }) + } +} + +// TestPatch_LayerErrors verifies the layer-repointing failure modes: a missing +// scsi controller, unparsable rootfs, and a read-only layer count mismatch. 
+func TestPatch_LayerErrors(t *testing.T) { + roGUID, _ := guid.NewV4() + scratchGUID, _ := guid.NewV4() + + tests := []struct { + name string + scsiNil bool + layers *scsiLayers + rootfs []*containerdtypes.Mount + stubPath bool + }{ + { + name: "nil scsi controller", + scsiNil: true, + layers: &scsiLayers{scratch: scsiReservation{id: scratchGUID}}, + }, + { + name: "unparsable rootfs", + layers: &scsiLayers{scratch: scsiReservation{id: scratchGUID}}, + rootfs: nil, + }, + { + name: "ro layer count mismatch", + layers: &scsiLayers{roLayers: []scsiReservation{{id: roGUID}, {id: roGUID}}, scratch: scsiReservation{id: scratchGUID}}, + rootfs: []*containerdtypes.Mount{{Type: "lcow-layer", Source: `C:\scratch`, Options: []string{`parentLayerPaths=["C:\\layers\\base"]`}}}, + stubPath: true, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + if tc.stubPath { + stubResolvePath(t) + } + c, scsiCtrl, _, _, _ := newContainerTestController(t) + c.state = StateDestinationMigrating + c.layers = tc.layers + + var sc scsiController = scsiCtrl + if tc.scsiNil { + sc = nil + } + + err := c.Patch(t.Context(), sc, &task.CreateTaskRequest{ID: "dest", Rootfs: tc.rootfs}) + if err == nil { + t.Error("Patch() = nil; want error") + } + }) + } +} + +// TestPatch_UpdateDiskHostPathFails verifies that a SCSI disk repoint failure +// surfaces from Patch, whether it occurs on a read-only or the scratch layer. +// Not parallel: stubs the package-level resolvePath. 
+func TestPatch_UpdateDiskHostPathFails(t *testing.T) {
+	stubResolvePath(t)
+
+	roID, _ := guid.NewV4()
+	scratchID, _ := guid.NewV4()
+	wantErr := errors.New("update disk host path failed")
+	rootfs := []*containerdtypes.Mount{{
+		Type:    "lcow-layer",
+		Source:  `C:\scratch`,
+		Options: []string{`parentLayerPaths=["C:\\layers\\base"]`},
+	}}
+
+	// repoint describes one expected UpdateDiskHostPath call and its result.
+	type repoint struct {
+		id  guid.GUID
+		err error
+	}
+
+	cases := []struct {
+		name  string
+		calls []repoint // expectations, registered in this order
+	}{
+		{
+			name:  "ro layer fails",
+			calls: []repoint{{id: roID, err: wantErr}},
+		},
+		{
+			name: "scratch layer fails",
+			calls: []repoint{
+				{id: roID, err: nil},
+				{id: scratchID, err: wantErr},
+			},
+		},
+	}
+
+	for _, tt := range cases {
+		t.Run(tt.name, func(t *testing.T) {
+			ctr, mockSCSI, _, _, _ := newContainerTestController(t)
+			ctr.state = StateDestinationMigrating
+			ctr.layers = &scsiLayers{
+				roLayers: []scsiReservation{{id: roID}},
+				scratch:  scsiReservation{id: scratchID},
+			}
+			for _, call := range tt.calls {
+				mockSCSI.EXPECT().UpdateDiskHostPath(gomock.Any(), call.id, gomock.Any()).Return(call.err)
+			}
+
+			req := &task.CreateTaskRequest{ID: "dest", Rootfs: rootfs}
+			if err := ctr.Patch(t.Context(), mockSCSI, req); !errors.Is(err, wantErr) {
+				t.Errorf("Patch() = %v; want %v", err, wantErr)
+			}
+		})
+	}
+}
+
+// TestPatch_ProcessConstraints verifies that, with no layers to repoint, Patch
+// still requires exactly the init process.
+func TestPatch_ProcessConstraints(t *testing.T) {
+	t.Parallel()
+	tests := []struct {
+		name string
+		// seedProc populates c.processes for the case. It receives the
+		// subtest's own *testing.T so that any failure raised by a helper is
+		// attributed to, and FailNow runs on the goroutine of, the subtest
+		// that triggered it rather than the parent test.
+		seedProc func(t *testing.T, c *Controller)
+	}{
+		{name: "no init process", seedProc: func(*testing.T, *Controller) {}},
+		{name: "extra exec process", seedProc: func(t *testing.T, c *Controller) {
+			c.processes[""] = importedInitProcess(t)
+			c.processes["exec-1"] = process.New(testContainerID, "exec-1", nil, 0)
+		}},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			t.Parallel()
+			c, scsiCtrl, _, _, _ := newContainerTestController(t)
+			c.state = StateDestinationMigrating
+			tc.seedProc(t, c)
+
+			// c.layers is not set and the request carries no rootfs, so only
+			// the process set is under test here.
+			err := c.Patch(t.Context(), scsiCtrl, &task.CreateTaskRequest{ID: "dest"})
+			if err == nil {
+				t.Error("Patch() = nil; want error for invalid process set")
+			}
+		})
+	}
+}
+
+// TestPatch_Succeeds verifies that Patch repoints the layers, re-opens init IO,
+// and adopts the destination container ID. Empty IO paths avoid real pipes.
+// Not parallel: stubs the package-level resolvePath.
+func TestPatch_Succeeds(t *testing.T) {
+	stubResolvePath(t)
+	c, scsiCtrl, _, _, _ := newContainerTestController(t)
+	c.state = StateDestinationMigrating
+
+	roGUID, _ := guid.NewV4()
+	scratchGUID, _ := guid.NewV4()
+	c.layers = &scsiLayers{
+		roLayers: []scsiReservation{{id: roGUID}},
+		scratch:  scsiReservation{id: scratchGUID},
+	}
+	c.processes[""] = importedInitProcess(t)
+
+	// One repoint is expected per saved reservation: the read-only layer and
+	// the scratch layer.
+	scsiCtrl.EXPECT().UpdateDiskHostPath(gomock.Any(), roGUID, gomock.Any()).Return(nil)
+	scsiCtrl.EXPECT().UpdateDiskHostPath(gomock.Any(), scratchGUID, gomock.Any()).Return(nil)
+
+	const destID = "dest-ctr-9999"
+	rootfs := []*containerdtypes.Mount{{Type: "lcow-layer", Source: `C:\scratch`, Options: []string{`parentLayerPaths=["C:\\layers\\base"]`}}}
+
+	if err := c.Patch(t.Context(), scsiCtrl, &task.CreateTaskRequest{ID: destID, Bundle: "/bundle", Rootfs: rootfs}); err != nil {
+		t.Fatalf("Patch() = %v; want nil", err)
+	}
+	if c.containerID != destID {
+		t.Errorf("containerID = %q; want %q", c.containerID, destID)
+	}
+	// Patch must not advance the state machine.
+	if c.state != StateDestinationMigrating {
+		t.Errorf("state = %s; want StateDestinationMigrating", c.state)
+	}
+}
+
+// --- Resume ---
+
+// TestResume_OpenContainerFails verifies that Resume surfaces a guest
+// OpenContainer failure before touching the init process.
+func TestResume_OpenContainerFails(t *testing.T) {
+	t.Parallel()
+	c, scsiCtrl, plan9Ctrl, vpciCtrl, guestCtrl := newContainerTestController(t)
+	c.state = StateDestinationMigrating
+
+	guestCtrl.EXPECT().
+		OpenContainer(gomock.Any(), c.gcsContainerID).
+		Return(nil, errOpenContainer)
+
+	err := c.Resume(t.Context(), testVMID, testPodID, guestCtrl, scsiCtrl, plan9Ctrl, vpciCtrl, nil)
+	if !errors.Is(err, errOpenContainer) {
+		t.Errorf("Resume() = %v; want %v", err, errOpenContainer)
+	}
+}
+
+// TestResume_SourceRollback verifies that resuming a source-migrating container
+// lifts the freeze and returns it to running without touching the guest.
+func TestResume_SourceRollback(t *testing.T) {
+	t.Parallel()
+	c, scsiCtrl, plan9Ctrl, vpciCtrl, guestCtrl := newContainerTestController(t)
+	c.state = StateSourceMigrating
+
+	// No guest call is expected: the source keeps its live bindings.
+	if err := c.Resume(t.Context(), testVMID, testPodID, guestCtrl, scsiCtrl, plan9Ctrl, vpciCtrl, nil); err != nil {
+		t.Fatalf("Resume() = %v; want nil", err)
+	}
+	if c.state != StateRunning {
+		t.Errorf("state = %s; want StateRunning", c.state)
+	}
+}
+
+// --- AbortMigrated ---
+
+// TestAbortMigrated_NoOp verifies that AbortMigrated leaves a non-migrating
+// container untouched and publishes nothing.
+func TestAbortMigrated_NoOp(t *testing.T) {
+	t.Parallel()
+
+	// Every state other than StateDestinationMigrating.
+	for _, initial := range []State{
+		StateNotCreated,
+		StateCreated,
+		StateRunning,
+		StateStopped,
+		StateInvalid,
+		StateSourceMigrating,
+	} {
+		t.Run(initial.String(), func(t *testing.T) {
+			t.Parallel()
+			ctr, _, _, _, _ := newContainerTestController(t)
+			ctr.state = initial
+
+			// Capacity 1 lets the non-blocking receive below observe any
+			// event sent during the call.
+			sink := make(chan any, 1)
+			ctr.AbortMigrated(t.Context(), sink)
+
+			if got := ctr.state; got != initial {
+				t.Errorf("state = %s; want unchanged %s", got, initial)
+			}
+			select {
+			case <-sink:
+				t.Error("AbortMigrated published an event for a non-migrating container")
+			default:
+			}
+		})
+	}
+}
+
+// TestAbortMigrated_Succeeds verifies that AbortMigrated drains the init
+// process, marks the container stopped, closes waiters, and publishes a
+// synthetic TaskExit when an events channel is supplied.
+func TestAbortMigrated_Succeeds(t *testing.T) {
+	t.Parallel()
+	ctr, _, _, _, _ := newContainerTestController(t)
+	ctr.state = StateDestinationMigrating
+	ctr.processes[""] = patchedInitProcess(t)
+
+	sink := make(chan any, 1)
+	ctr.AbortMigrated(t.Context(), sink)
+
+	if got := ctr.state; got != StateStopped {
+		t.Errorf("state = %s; want StateStopped", got)
+	}
+
+	// A receive that is immediately ready is taken to mean the channel was
+	// closed.
+	select {
+	case <-ctr.terminatedCh:
+	default:
+		t.Error("terminatedCh should be closed after AbortMigrated")
+	}
+
+	// Exactly one event is inspected, and it must be a TaskExit.
+	select {
+	case published := <-sink:
+		if _, isExit := published.(*eventstypes.TaskExit); !isExit {
+			t.Errorf("event = %T; want *eventstypes.TaskExit", published)
+		}
+	default:
+		t.Error("AbortMigrated should publish a TaskExit event")
+	}
+}
+
+// TestAbortMigrated_NoEventChannel verifies that AbortMigrated still tears the
+// container down when no events channel is provided.
+func TestAbortMigrated_NoEventChannel(t *testing.T) {
+	t.Parallel()
+	ctr, _, _, _, _ := newContainerTestController(t)
+	ctr.state = StateDestinationMigrating
+	ctr.processes[""] = importedInitProcess(t)
+
+	// A nil events channel is passed; only the state transition is asserted.
+	ctr.AbortMigrated(t.Context(), nil)
+
+	if got := ctr.state; got != StateStopped {
+		t.Errorf("state = %s; want StateStopped", got)
+	}
+}
+
+// TestAbortMigrated_NoInitProcess verifies that AbortMigrated tears the
+// container down but publishes nothing when there is no init process.
+func TestAbortMigrated_NoInitProcess(t *testing.T) {
+	t.Parallel()
+	ctr, _, _, _, _ := newContainerTestController(t)
+	ctr.state = StateDestinationMigrating
+
+	// No entry is added to ctr.processes for this case.
+	sink := make(chan any, 1)
+	ctr.AbortMigrated(t.Context(), sink)
+
+	if got := ctr.state; got != StateStopped {
+		t.Errorf("state = %s; want StateStopped", got)
+	}
+	select {
+	case <-sink:
+		t.Error("AbortMigrated should not publish an event without an init process")
+	default:
+	}
+}
+
+// --- ContainerID ---
+
+// TestContainerID verifies that ContainerID returns the current identifier.
+func TestContainerID(t *testing.T) {
+	t.Parallel()
+	ctr, _, _, _, _ := newContainerTestController(t)
+
+	got := ctr.ContainerID()
+	if got != testContainerID {
+		t.Errorf("ContainerID() = %q; want %q", got, testContainerID)
+	}
+}
+
+// --- guidsToStrings / stringsToGuids ---
+
+// TestGUIDRoundTrip verifies the GUID encode/decode helpers, including nil
+// handling and an undecodable string.
+func TestGUIDRoundTrip(t *testing.T) { + t.Parallel() + + if got := guidsToStrings(nil); got != nil { + t.Errorf("guidsToStrings(nil) = %v; want nil", got) + } + if got, err := stringsToGuids(nil); err != nil || got != nil { + t.Errorf("stringsToGuids(nil) = (%v, %v); want (nil, nil)", got, err) + } + if _, err := stringsToGuids([]string{"not-a-guid"}); err == nil { + t.Error("stringsToGuids(invalid) = nil; want error") + } + + g1, _ := guid.NewV4() + g2, _ := guid.NewV4() + encoded := guidsToStrings([]guid.GUID{g1, g2}) + decoded, err := stringsToGuids(encoded) + if err != nil { + t.Fatalf("stringsToGuids() = %v; want nil", err) + } + if len(decoded) != 2 || decoded[0] != g1 || decoded[1] != g2 { + t.Errorf("round trip = %v; want [%v %v]", decoded, g1, g2) + } +} diff --git a/internal/controller/linuxcontainer/state.go b/internal/controller/linuxcontainer/state.go index 6f6ed963eb..149d5b5cf8 100644 --- a/internal/controller/linuxcontainer/state.go +++ b/internal/controller/linuxcontainer/state.go @@ -8,17 +8,30 @@ package linuxcontainer // // StateNotCreated → StateCreated → StateRunning → StateStopped // +// Live migration adds two branches. On the destination, the controller is +// rehydrated via [Import] directly into [StateDestinationMigrating] and rejoins +// the table above once [Controller.Resume] binds the live VM/guest dependencies +// (→ [StateRunning]) or [Controller.AbortMigrated] discards it (→ [StateStopped]). +// On the source, [Controller.Save] freezes a running container into +// [StateSourceMigrating]; [Controller.Resume] rolls it back to [StateRunning], or +// its init process exit on source VM teardown moves it to [StateStopped]. 
+// // Full state-transition table: // -// Current State │ Trigger │ Next State -// ─────────────────┼──────────────────────────────────────────────────┼──────────────── -// StateNotCreated │ Create succeeds │ StateCreated -// StateNotCreated │ Create fails during resource allocation or later │ StateInvalid -// StateCreated │ Start succeeds │ StateRunning -// StateCreated │ Start fails │ StateInvalid -// StateRunning │ init process exits │ StateStopped -// StateStopped │ (terminal — no further transitions) │ — -// StateInvalid │ (terminal — no further transitions) │ — +// Current State │ Trigger │ Next State +// ──────────────────────────┼──────────────────────────────────────────────────┼────────────────────── +// StateNotCreated │ Create succeeds │ StateCreated +// StateNotCreated │ Create fails during resource allocation or later │ StateInvalid +// StateCreated │ Start succeeds │ StateRunning +// StateCreated │ Start fails │ StateInvalid +// StateRunning │ init process exits │ StateStopped +// StateRunning │ Save freezes the source │ StateSourceMigrating +// StateStopped │ (terminal — no further transitions) │ — +// StateInvalid │ (terminal — no further transitions) │ — +// StateDestinationMigrating │ Resume binds the live VM, guest, and devices │ StateRunning +// StateDestinationMigrating │ AbortMigrated discards the import │ StateStopped +// StateSourceMigrating │ Resume rolls back the migration │ StateRunning +// StateSourceMigrating │ init process exits (source VM torn down) │ StateStopped type State int32 const ( @@ -38,6 +51,14 @@ const ( // StateInvalid indicates the container entered an unrecoverable failure // during Create or Start. StateInvalid + + // StateDestinationMigrating indicates a container rehydrated from a snapshot + // on the destination, awaiting Resume (→ StateRunning) or AbortMigrated (→ StateStopped). 
+ StateDestinationMigrating + + // StateSourceMigrating indicates a running container frozen by Save on the + // source, awaiting Resume (→ StateRunning) or its VM teardown (→ StateStopped). + StateSourceMigrating ) // String returns a human-readable representation of the container State. @@ -53,6 +74,10 @@ func (s State) String() string { return "Stopped" case StateInvalid: return "Invalid" + case StateDestinationMigrating: + return "DestinationMigrating" + case StateSourceMigrating: + return "SourceMigrating" default: return "Unknown" } diff --git a/internal/controller/linuxcontainer/types.go b/internal/controller/linuxcontainer/types.go index d49a959ecc..e8a834ff3c 100644 --- a/internal/controller/linuxcontainer/types.go +++ b/internal/controller/linuxcontainer/types.go @@ -25,6 +25,7 @@ type CreateOpts struct { type guest interface { Capabilities() gcs.GuestDefinedCapabilities CreateContainer(ctx context.Context, cid string, config interface{}) (*gcs.Container, error) + OpenContainer(ctx context.Context, cid string) (*gcs.Container, error) DeleteContainerState(ctx context.Context, cid string) error AddCombinedLayers(ctx context.Context, settings guestresource.LCOWCombinedLayers) error @@ -36,6 +37,7 @@ type scsiController interface { Reserve(ctx context.Context, diskConfig disk.Config, mountConfig scsiMount.Config) (guid.GUID, error) UnmapFromGuest(ctx context.Context, reservation guid.GUID) error MapToGuest(ctx context.Context, id guid.GUID) (string, error) + UpdateDiskHostPath(ctx context.Context, reservationID guid.GUID, newPath string) error } // plan9Controller abstracts host-side Plan9 share reservation and guest mapping. diff --git a/internal/controller/network/doc.go b/internal/controller/network/doc.go index d94de04236..4ca718cc01 100644 --- a/internal/controller/network/doc.go +++ b/internal/controller/network/doc.go @@ -4,11 +4,15 @@ // running inside a Utility VM (UVM). 
// // It handles attaching an HCN namespace and its endpoints to the guest VM, -// and tearing them down on pod removal. +// and tearing them down on pod removal. Live-migration entry points are +// provided on both sides: the source freezes a configured network via +// [Controller.Save], while the destination rehydrates it via [Import] +// (state-only rehydration) and [Controller.Resume] (binds host/guest interfaces +// once the destination VM is running). // // # Lifecycle // -// A network follows the state machine below. +// A network controller created via [New] follows the live-creation path: // // ┌────────────────────┐ // │ StateNotConfigured │ @@ -24,6 +28,19 @@ // │ StateTornDown │ // └─────────────────────────────────────┘ // +// Live migration adds two branches. The destination rehydrates a controller via +// [Import] into [StateDestinationMigrating]; the source freezes a configured +// network via [Controller.Save] into [StateSourceMigrating]. Either returns to +// [StateConfigured] via [Controller.Resume], or to [StateTornDown] via +// [Controller.Teardown]: +// +// ┌───────────────────────────┐ ┌─────────────────┐ +// │ StateDestinationMigrating │── Resume ──▶ │ StateConfigured │ +// │ or │ └─────────────────┘ +// │ StateSourceMigrating │ ┌─────────────────┐ +// └───────────────────────────┘── Teardown ─▶ │ StateTornDown │ +// └─────────────────┘ +// // State descriptions: // // - [StateNotConfigured]: initial state; no namespace or NICs have been configured. @@ -31,7 +48,14 @@ // and all endpoints are wired up inside the guest. // - [StateInvalid]: entered when [Controller.Setup] fails mid-way; best-effort // cleanup should be performed via [Controller.Teardown]. -// - [StateTornDown]: terminal state reached after [Controller.Teardown] completes. +// - [StateTornDown]: terminal state reached after [Controller.Teardown] +// completes. 
+// - [StateDestinationMigrating]: initial state for [Import] on the destination; +// host/guest interfaces are not yet bound. [Controller.Resume] binds them and +// moves to [StateConfigured]; [Controller.Teardown] aborts to [StateTornDown]. +// - [StateSourceMigrating]: a configured network frozen by [Controller.Save] on the +// source while a migration is in flight. [Controller.Resume] rolls it back to +// [StateConfigured]; [Controller.Teardown] tears it down to [StateTornDown]. // // # Platform Variants // diff --git a/internal/controller/network/network.go b/internal/controller/network/network.go index 990add5f02..58e030b872 100644 --- a/internal/controller/network/network.go +++ b/internal/controller/network/network.go @@ -18,11 +18,15 @@ import ( ) type Controller struct { - mu sync.Mutex + mu sync.RWMutex // namespaceID is the HCN namespace ID in use after a successful Setup. namespaceID string + // migratedNamespaceID is the HCN namespace ID on the LM destination + // which is used for rehydrating the guest namespace. + migratedNamespaceID string + // vmEndpoints maps nicID (ID within UVM) -> HCN endpoint. vmEndpoints map[string]*hcn.HostComputeEndpoint @@ -128,7 +132,7 @@ func (c *Controller) Setup(ctx context.Context) (err error) { // add the nicID and endpointID to the context for trace. 
nicCtx, _ := log.WithContext(ctx, logrus.WithFields(logrus.Fields{"vm_nic_id": nicGUID.String(), "hns_endpoint_id": endpoint.Id})) - if err = c.addEndpointToGuestNamespace(nicCtx, nicGUID.String(), endpoint, c.policyBasedRouting); err != nil { + if err = c.addEndpointToGuestNamespace(nicCtx, endpoint.HostComputeNamespace, nicGUID.String(), endpoint, c.policyBasedRouting); err != nil { return fmt.Errorf("add endpoint %s to guest: %w", endpoint.Name, err) } } @@ -164,6 +168,15 @@ func (c *Controller) Teardown(ctx context.Context) error { return nil } + if c.netState == StateDestinationMigrating || c.netState == StateSourceMigrating { + // Migration finalized: the guest-side NICs are not ours to remove — + // never wired on the destination, or gone with the torn-down source VM. + // Drop the bindings and mark torn down. + c.vmEndpoints = make(map[string]*hcn.HostComputeEndpoint) + c.netState = StateTornDown + return nil + } + // Remove all endpoints from the guest. // Use a continue-on-error strategy: attempt every NIC regardless of individual // failures, then collect all errors. diff --git a/internal/controller/network/network_lcow.go b/internal/controller/network/network_lcow.go index 4452f7539a..88df88c638 100644 --- a/internal/controller/network/network_lcow.go +++ b/internal/controller/network/network_lcow.go @@ -12,6 +12,7 @@ import ( hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" + "github.com/Microsoft/hcsshim/internal/vm/guestmanager" "github.com/Microsoft/hcsshim/internal/vm/vmutils" ) @@ -39,7 +40,7 @@ func (c *Controller) removeNetNSInsideGuest(_ context.Context, _ string) error { // addEndpointToGuestNamespace hot-adds an HCN endpoint to the UVM and, // configures it inside the LCOW guest. 
-func (c *Controller) addEndpointToGuestNamespace(ctx context.Context, nicID string, endpoint *hcn.HostComputeEndpoint, isPolicyBasedRoutingSupported bool) error { +func (c *Controller) addEndpointToGuestNamespace(ctx context.Context, netnsID string, nicID string, endpoint *hcn.HostComputeEndpoint, isPolicyBasedRoutingSupported bool) error { log.G(ctx).Info("adding endpoint to guest namespace") // 1. Host-side hot-add. @@ -57,7 +58,7 @@ func (c *Controller) addEndpointToGuestNamespace(ctx context.Context, nicID stri // 2. Guest-side add. if c.isNamespaceSupportedByGuest { - lcowAdapter, err := guestresource.BuildLCOWNetworkAdapter(nicID, endpoint, isPolicyBasedRoutingSupported) + lcowAdapter, err := guestresource.BuildLCOWNetworkAdapter(netnsID, nicID, endpoint, isPolicyBasedRoutingSupported) if err != nil { return fmt.Errorf("build LCOW network adapter for endpoint %s: %w", endpoint.Id, err) } @@ -84,7 +85,7 @@ func (c *Controller) removeEndpointFromGuestNamespace(ctx context.Context, nicID if err := c.guestNetwork.RemoveNetworkInterface(ctx, &guestresource.LCOWNetworkAdapter{ NamespaceID: c.namespaceID, ID: nicID, - }); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) { + }); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) && !errors.Is(err, guestmanager.ErrGuestConnectionUnavailable) { return fmt.Errorf("remove NIC %s from guest: %w", nicID, err) } @@ -92,11 +93,13 @@ func (c *Controller) removeEndpointFromGuestNamespace(ctx context.Context, nicID } // 2. Host-side removal. 
- if err := c.vmNetwork.RemoveNIC(ctx, nicID, &hcsschema.NetworkAdapter{ - EndpointId: endpoint.Id, - MacAddress: endpoint.MacAddress, - }); err != nil && !vmutils.IsVMNotAvailableError(err) { - return fmt.Errorf("remove NIC %s from host (endpoint %s): %w", nicID, endpoint.Id, err) + if endpoint != nil { + if err := c.vmNetwork.RemoveNIC(ctx, nicID, &hcsschema.NetworkAdapter{ + EndpointId: endpoint.Id, + MacAddress: endpoint.MacAddress, + }); err != nil && !vmutils.IsVMNotAvailableError(err) { + return fmt.Errorf("remove NIC %s from host (endpoint %s): %w", nicID, endpoint.Id, err) + } } log.G(ctx).Debug("removed NIC from host") diff --git a/internal/controller/network/network_lcow_test.go b/internal/controller/network/network_lcow_test.go index c52bae178a..cf6b51d5e5 100644 --- a/internal/controller/network/network_lcow_test.go +++ b/internal/controller/network/network_lcow_test.go @@ -17,6 +17,7 @@ import ( hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" hcs "github.com/Microsoft/hcsshim/internal/hcs/v2" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" + "github.com/Microsoft/hcsshim/internal/vm/guestmanager" ) var ( @@ -79,7 +80,7 @@ func TestLCOW_AddEndpoint_Success_NamespaceSupport(t *testing.T) { c, vm, guest := newLCOWController(t, ctrl, true) ep := newLCOWEndpoint("eth0") - expectedAdapter, err := guestresource.BuildLCOWNetworkAdapter("nic-1", ep, false) + expectedAdapter, err := guestresource.BuildLCOWNetworkAdapter(ep.HostComputeNamespace, "nic-1", ep, false) if err != nil { t.Fatalf("failed to build expected adapter: %v", err) } @@ -92,7 +93,7 @@ func TestLCOW_AddEndpoint_Success_NamespaceSupport(t *testing.T) { guest.EXPECT().AddNetworkInterface(gomock.Any(), expectedAdapter).Return(nil), ) - if err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false); err != nil { + if err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false); err != nil { 
t.Fatalf("unexpected error: %v", err) } if got, ok := c.vmEndpoints["nic-1"]; !ok || got != ep { @@ -116,7 +117,7 @@ func TestLCOW_AddEndpoint_Success_NoNamespaceSupport(t *testing.T) { // guest.AddNetworkInterface is intentionally not expected — gomock will // fail the test if the controller calls it without namespace support. - if err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false); err != nil { + if err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false); err != nil { t.Fatalf("unexpected error: %v", err) } if _, ok := c.vmEndpoints["nic-1"]; !ok { @@ -137,7 +138,7 @@ func TestLCOW_AddEndpoint_HostFails_NotTracked(t *testing.T) { vm.EXPECT().AddNIC(gomock.Any(), "nic-1", gomock.Any()).Return(errLCOWHostAdd) // guest.AddNetworkInterface must not be called when host add fails. - err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false) + err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false) if !errors.Is(err, errLCOWHostAdd) { t.Fatalf("expected host add error to wrap, got: %v", err) } @@ -217,6 +218,28 @@ func TestLCOW_RemoveEndpoint_BridgeClosed_HostStillCalled(t *testing.T) { } } +// TestLCOW_RemoveEndpoint_GuestConnectionUnavailable_HostStillCalled mirrors +// the bridge-closed case for [guestmanager.ErrGuestConnectionUnavailable]. +func TestLCOW_RemoveEndpoint_GuestConnectionUnavailable_HostStillCalled(t *testing.T) { + ctrl := gomock.NewController(t) + c, vm, guest := newLCOWController(t, ctrl, true) + + ep := newLCOWEndpoint("eth0") + + gomock.InOrder( + guest.EXPECT().RemoveNetworkInterface(gomock.Any(), gomock.Any()). 
+ Return(fmt.Errorf("guest RPC: %w", guestmanager.ErrGuestConnectionUnavailable)), + vm.EXPECT().RemoveNIC(gomock.Any(), "nic-1", &hcsschema.NetworkAdapter{ + EndpointId: ep.Id, + MacAddress: ep.MacAddress, + }).Return(nil), + ) + + if err := c.removeEndpointFromGuestNamespace(context.Background(), "nic-1", ep); err != nil { + t.Fatalf("unexpected error: %v", err) + } +} + // TestLCOW_RemoveEndpoint_NoNamespaceSupport_HostOnly verifies that when the // guest never received the namespace, the controller skips the guest-side // removal and only hot-removes the NIC from the host. @@ -404,7 +427,7 @@ func TestLCOW_AddEndpoint_HostOK_GuestFails_TeardownUnwindsHost(t *testing.T) { guest.EXPECT().AddNetworkInterface(gomock.Any(), gomock.Any()).Return(errLCOWGuestAdd), ) - if err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false); !errors.Is(err, errLCOWGuestAdd) { + if err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false); !errors.Is(err, errLCOWGuestAdd) { t.Fatalf("expected guest add error to wrap, got: %v", err) } if _, ok := c.vmEndpoints["nic-1"]; !ok { diff --git a/internal/controller/network/network_test.go b/internal/controller/network/network_test.go index 02c99fea05..94e9a87b71 100644 --- a/internal/controller/network/network_test.go +++ b/internal/controller/network/network_test.go @@ -9,6 +9,7 @@ import ( "go.uber.org/mock/gomock" + "github.com/Microsoft/hcsshim/hcn" "github.com/Microsoft/hcsshim/internal/controller/network/mocks" "github.com/Microsoft/hcsshim/internal/gcs" "github.com/Microsoft/hcsshim/internal/guest/prot" @@ -193,3 +194,37 @@ func TestTeardown_NoOpFromTornDown(t *testing.T) { t.Errorf("expected state to remain TornDown, got %s", c.netState) } } + +// TestTeardown_FromMigratingDropsBindings verifies that tearing down a migrating +// controller (destination abort or source termination) drops its endpoint +// bindings without touching the guest and moves to TornDown. 
+func TestTeardown_FromMigratingDropsBindings(t *testing.T) { + for _, st := range []State{StateDestinationMigrating, StateSourceMigrating} { + t.Run(st.String(), func(t *testing.T) { + ctrl := gomock.NewController(t) + vm := mocks.NewMockvmNetworkManager(ctrl) + guest := mocks.NewMockguestNetwork(ctrl) + c := New( + &Options{NetworkNamespace: "ns-1"}, + vm, + guest, + newCapsProvider(t, ctrl, true), + ) + c.netState = st + c.vmEndpoints = map[string]*hcn.HostComputeEndpoint{ + "nic-1": {Id: "ep-1", Name: "eth0"}, + } + + // No EXPECT() on vm or guest — no guest-side removal must occur. + if err := c.Teardown(context.Background()); err != nil { + t.Fatalf("expected nil from Teardown in %s, got: %v", st, err) + } + if c.netState != StateTornDown { + t.Errorf("expected state TornDown, got %s", c.netState) + } + if len(c.vmEndpoints) != 0 { + t.Errorf("expected endpoint bindings to be dropped, got %d", len(c.vmEndpoints)) + } + }) + } +} diff --git a/internal/controller/network/network_wcow.go b/internal/controller/network/network_wcow.go index 55a9a8ef69..193ee547be 100644 --- a/internal/controller/network/network_wcow.go +++ b/internal/controller/network/network_wcow.go @@ -13,6 +13,7 @@ import ( "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" + "github.com/Microsoft/hcsshim/internal/vm/guestmanager" "github.com/Microsoft/hcsshim/internal/vm/vmutils" "github.com/sirupsen/logrus" @@ -61,7 +62,7 @@ func (c *Controller) removeNetNSInsideGuest(ctx context.Context, namespaceID str // If the GCS bridge is already closed (e.g. the guest agent crashed), the // namespace will be torn down with the VM, so treat that as success and let // teardown continue. 
- if err = c.guestNetwork.RemoveNetworkNamespace(ctx, hcnNamespace); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) { + if err = c.guestNetwork.RemoveNetworkNamespace(ctx, hcnNamespace); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) && !errors.Is(err, guestmanager.ErrGuestConnectionUnavailable) { return fmt.Errorf("remove network namespace %s from guest: %w", namespaceID, err) } } @@ -71,7 +72,7 @@ func (c *Controller) removeNetNSInsideGuest(ctx context.Context, namespaceID str // addEndpointToGuestNamespace wires an HCN endpoint into the WCOW guest in three steps: // pre-add (guest notification), host-side hot-add, and guest-side finalisation. -func (c *Controller) addEndpointToGuestNamespace(ctx context.Context, nicID string, endpoint *hcn.HostComputeEndpoint, _ bool) error { +func (c *Controller) addEndpointToGuestNamespace(ctx context.Context, _ string, nicID string, endpoint *hcn.HostComputeEndpoint, _ bool) error { log.G(ctx).Info("adding network endpoint to guest namespace") // 1. Guest pre-add: informs WCOW guest that a NIC is about to arrive. 
@@ -125,7 +126,7 @@ func (c *Controller) removeEndpointFromGuestNamespace(ctx context.Context, nicID nicID, guestrequest.RequestTypeRemove, nil, - ); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) { + ); err != nil && !errors.Is(err, gcs.ErrBridgeClosed) && !errors.Is(err, guestmanager.ErrGuestConnectionUnavailable) { return fmt.Errorf("remove NIC %s from guest (endpoint %s): %w", nicID, endpoint.Id, err) } diff --git a/internal/controller/network/network_wcow_test.go b/internal/controller/network/network_wcow_test.go index acc07bfe9d..e4fadabe24 100644 --- a/internal/controller/network/network_wcow_test.go +++ b/internal/controller/network/network_wcow_test.go @@ -94,7 +94,7 @@ func TestWCOW_AddEndpoint_3PhaseSequence_Success(t *testing.T) { ).Return(nil), ) - if err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false); err != nil { + if err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false); err != nil { t.Fatalf("unexpected error: %v", err) } if got, ok := c.vmEndpoints["nic-1"]; !ok || got != ep { @@ -117,7 +117,7 @@ func TestWCOW_AddEndpoint_PreAddFails_NotTracked(t *testing.T) { ).Return(errWCOWGuestPreAdd) // No vm.AddNIC, no second guest.AddNetworkInterface — gomock fails if either is called. 
- err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false) + err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false) if !errors.Is(err, errWCOWGuestPreAdd) { t.Fatalf("expected pre-add error to wrap, got: %v", err) } @@ -142,7 +142,7 @@ func TestWCOW_AddEndpoint_HostFails_NotTracked(t *testing.T) { vm.EXPECT().AddNIC(gomock.Any(), "nic-1", gomock.Any()).Return(errWCOWHostAdd), ) - err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false) + err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false) if !errors.Is(err, errWCOWHostAdd) { t.Fatalf("expected host add error to wrap, got: %v", err) } @@ -373,7 +373,7 @@ func TestWCOW_AddEndpoint_FinalAddFails_TeardownUnwindsHost(t *testing.T) { ).Return(errWCOWGuestAdd), ) - if err := c.addEndpointToGuestNamespace(context.Background(), "nic-1", ep, false); !errors.Is(err, errWCOWGuestAdd) { + if err := c.addEndpointToGuestNamespace(context.Background(), ep.HostComputeNamespace, "nic-1", ep, false); !errors.Is(err, errWCOWGuestAdd) { t.Fatalf("expected guest add error to wrap, got: %v", err) } if _, ok := c.vmEndpoints["nic-1"]; !ok { diff --git a/internal/controller/network/save.go b/internal/controller/network/save.go new file mode 100644 index 0000000000..0c249f490a --- /dev/null +++ b/internal/controller/network/save.go @@ -0,0 +1,196 @@ +//go:build windows && (lcow || wcow) + +package network + +import ( + "context" + "fmt" + + "github.com/Microsoft/go-winio/pkg/guid" + "github.com/Microsoft/hcsshim/hcn" + netsave "github.com/Microsoft/hcsshim/internal/controller/network/save" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + "github.com/Microsoft/hcsshim/internal/vm/guestmanager" + "github.com/Microsoft/hcsshim/internal/vm/vmmanager" + "github.com/sirupsen/logrus" + "google.golang.org/protobuf/proto" + 
"google.golang.org/protobuf/types/known/anypb" +) + +// Save serializes the controller's current network state into a portable +// envelope that can be handed to a migration destination. It succeeds only +// when the network is fully configured, and on success freezes the source +// until it is resumed or torn down. +func (c *Controller) Save(ctx context.Context) (*anypb.Any, error) { + c.mu.Lock() + defer c.mu.Unlock() + + // Only a fully configured network is in a stable, migratable state. + if c.netState != StateConfigured { + return nil, fmt.Errorf("network controller in state %s; want %s", c.netState, StateConfigured) + } + + // Capture the scalar configuration into the snapshot. + state := &netsave.Payload{ + SchemaVersion: netsave.SchemaVersion, + NamespaceID: c.namespaceID, + PolicyBasedRouting: c.policyBasedRouting, + IsNamespaceSupportedByGuest: c.isNamespaceSupportedByGuest, + VmEndpoints: make(map[string]*netsave.EndpointBinding, len(c.vmEndpoints)), + } + + // Copy each bound endpoint so the destination can re-create the NICs. + for nicID, ep := range c.vmEndpoints { + if ep == nil { + return nil, fmt.Errorf("nil endpoint bound to NIC %s", nicID) + } + state.VmEndpoints[nicID] = &netsave.EndpointBinding{ + EndpointID: ep.Id, + MacAddress: ep.MacAddress, + EndpointName: ep.Name, + } + } + + // Marshal and wrap the snapshot in a self-describing envelope. + payload, err := proto.Marshal(state) + if err != nil { + return nil, fmt.Errorf("marshal network saved state: %w", err) + } + + // Freeze the source until the migration is resumed or torn down. + c.netState = StateSourceMigrating + + log.G(ctx).WithField(logfields.GuestNetworkNamespaceID, c.namespaceID).Info("network controller saved state for migration") + + return &anypb.Any{TypeUrl: netsave.TypeURL, Value: payload}, nil +} + +// Import reconstructs a controller from an envelope produced by [Controller.Save]. 
+// The returned controller carries the saved state but is not yet bound to a +// running VM, so operational calls are rejected until [Controller.Resume]. +func Import(ctx context.Context, env *anypb.Any) (*Controller, error) { + // Reject an empty or mistyped envelope before touching its bytes. + if env == nil { + return nil, fmt.Errorf("network saved-state envelope is nil") + } + + if env.GetTypeUrl() != netsave.TypeURL { + return nil, fmt.Errorf("unsupported network saved-state type %q", env.GetTypeUrl()) + } + + // Decode and reject any payload this build cannot interpret. + state := &netsave.Payload{} + if err := proto.Unmarshal(env.GetValue(), state); err != nil { + return nil, fmt.Errorf("unmarshal network saved state: %w", err) + } + + if v := state.GetSchemaVersion(); v != netsave.SchemaVersion { + return nil, fmt.Errorf("unsupported network saved-state schema version %d (want %d)", v, netsave.SchemaVersion) + } + + // Rehydrate into the destination-migrating state: state is restored but no + // live host/guest interfaces are bound, so operational calls are rejected + // until Resume. + c := &Controller{ + vmEndpoints: make(map[string]*hcn.HostComputeEndpoint), + netState: StateDestinationMigrating, + } + + // Restore the scalar configuration. + c.namespaceID = state.GetNamespaceID() + c.policyBasedRouting = state.GetPolicyBasedRouting() + c.isNamespaceSupportedByGuest = state.GetIsNamespaceSupportedByGuest() + + // Rebuild the endpoint bindings captured at save time. 
+ for nicID, b := range state.GetVmEndpoints() { + if nicID == "" || b == nil { + return nil, fmt.Errorf("invalid endpoint binding for NIC %q in saved state", nicID) + } + c.vmEndpoints[nicID] = &hcn.HostComputeEndpoint{ + Id: b.GetEndpointID(), + MacAddress: b.GetMacAddress(), + Name: b.GetEndpointName(), + } + } + + log.G(ctx).WithField(logfields.GuestNetworkNamespaceID, c.namespaceID).Info("network controller imported") + + return c, nil +} + +// Patch records the destination-side namespace ID that a later +// [Controller.ResetAfterMigration] uses to rebind endpoints on the new host. +func (c *Controller) Patch(ctx context.Context, networkNamespaceID string) { + c.mu.Lock() + defer c.mu.Unlock() + + c.migratedNamespaceID = networkNamespaceID + + log.G(ctx).WithFields(logrus.Fields{ + logfields.GuestNetworkNamespaceID: c.namespaceID, + logfields.MigratedNamespaceID: c.migratedNamespaceID, + }).Debug("network controller patched with migrated namespace ID") +} + +// Resume returns a migrating controller to the configured, operational state: +// the destination binds the live VM and guest, the source lifts Save's freeze. +// It is a no-op in any other state, so a torn-down network is never revived. +func (c *Controller) Resume(ctx context.Context, vm *vmmanager.UtilityVM, guest *guestmanager.Guest) { + c.mu.Lock() + defer c.mu.Unlock() + if c.netState != StateDestinationMigrating && c.netState != StateSourceMigrating { + log.G(ctx).WithField("state", c.netState.String()).Warn("network controller is not migrating; ignoring resume") + return + } + // The guest manager provides both guest-side network ops and capability checks. + c.vmNetwork, c.guestNetwork, c.capsProvider = vm, guest, guest + c.netState = StateConfigured + log.G(ctx).WithField(logfields.GuestNetworkNamespaceID, c.namespaceID).Debug("network controller resumed") +} + +// ResetAfterMigration swaps the endpoints carried over from the source for the +// ones present in the destination namespace, leaving the network operational +// on the new host. +func (c *Controller) ResetAfterMigration(ctx context.Context) error { + c.mu.Lock() + defer c.mu.Unlock() + + // Drop the stale source NICs inherited from the saved state. 
+ for nicID, ep := range c.vmEndpoints { + if err := c.removeEndpointFromGuestNamespace(ctx, nicID, nil); err != nil { + return fmt.Errorf("reset stale source NIC %s (endpoint %s): %w", nicID, ep.Id, err) + } + delete(c.vmEndpoints, nicID) + } + + // Look up the destination namespace and its endpoints. + hcnNamespace, err := hcn.GetNamespaceByID(c.migratedNamespaceID) + if err != nil { + return fmt.Errorf("get destination namespace %s: %w", c.migratedNamespaceID, err) + } + + endpoints, err := c.fetchEndpointsInNamespace(ctx, hcnNamespace) + if err != nil { + return fmt.Errorf("fetch endpoints in destination namespace %s: %w", c.migratedNamespaceID, err) + } + + // Add each destination endpoint to the guest under a fresh NIC ID. + for _, endpoint := range endpoints { + nicGUID, err := guid.NewV4() + if err != nil { + return fmt.Errorf("generate NIC GUID: %w", err) + } + if err := c.addEndpointToGuestNamespace(ctx, c.namespaceID, nicGUID.String(), endpoint, c.policyBasedRouting); err != nil { + return fmt.Errorf("add destination endpoint %s to guest: %w", endpoint.Name, err) + } + } + + c.netState = StateConfigured + c.migratedNamespaceID = "" + + log.G(ctx).WithField(logfields.GuestNetworkNamespaceID, c.namespaceID). 
+ Info("network reset for migration: rebound destination endpoints") + + return nil +} diff --git a/internal/controller/network/save_test.go b/internal/controller/network/save_test.go new file mode 100644 index 0000000000..8a950d5a5a --- /dev/null +++ b/internal/controller/network/save_test.go @@ -0,0 +1,256 @@ +//go:build windows && (lcow || wcow) + +package network + +import ( + "strings" + "testing" + + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + + "github.com/Microsoft/hcsshim/hcn" + netsave "github.com/Microsoft/hcsshim/internal/controller/network/save" + "github.com/Microsoft/hcsshim/internal/vm/guestmanager" + "github.com/Microsoft/hcsshim/internal/vm/vmmanager" +) + +// mustEnvelope marshals a payload and wraps it in an envelope with the +// well-known type URL, matching what Save emits. +func mustEnvelope(t *testing.T, p *netsave.Payload) *anypb.Any { + t.Helper() + b, err := proto.Marshal(p) + if err != nil { + t.Fatalf("marshal payload: %v", err) + } + return &anypb.Any{TypeUrl: netsave.TypeURL, Value: b} +} + +// configuredController returns a fully-configured controller carrying the +// supplied endpoint bindings, ready to be saved. +func configuredController(eps map[string]*hcn.HostComputeEndpoint) *Controller { + return &Controller{ + namespaceID: "ns-1", + policyBasedRouting: true, + isNamespaceSupportedByGuest: true, + vmEndpoints: eps, + netState: StateConfigured, + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Save +// ───────────────────────────────────────────────────────────────────────────── + +// TestSave_RejectsUnstableState verifies that a snapshot is only produced from +// a fully-configured network; every other state yields an error and no payload. 
+func TestSave_RejectsUnstableState(t *testing.T) { + for _, st := range []State{StateNotConfigured, StateInvalid, StateTornDown, StateDestinationMigrating, StateSourceMigrating} { + t.Run(st.String(), func(t *testing.T) { + c := configuredController(map[string]*hcn.HostComputeEndpoint{}) + c.netState = st + + env, err := c.Save(t.Context()) + if err == nil { + t.Fatalf("expected error saving from state %s, got nil", st) + } + if env != nil { + t.Errorf("expected nil envelope on failure, got %+v", env) + } + if !strings.Contains(err.Error(), st.String()) { + t.Errorf("expected error to mention state %s, got: %v", st, err) + } + }) + } +} + +// TestSave_NilEndpointBinding verifies that a configured network holding a nil +// endpoint cannot be saved, since the destination could not re-create the NIC. +func TestSave_NilEndpointBinding(t *testing.T) { + c := configuredController(map[string]*hcn.HostComputeEndpoint{"nic-1": nil}) + + env, err := c.Save(t.Context()) + if err == nil { + t.Fatal("expected error saving nil endpoint, got nil") + } + if env != nil { + t.Errorf("expected nil envelope on failure, got %+v", env) + } +} + +// TestSave_Success verifies the produced envelope is self-describing and that +// its decoded payload reproduces the controller's scalar config and every +// endpoint binding. 
+func TestSave_Success(t *testing.T) { + c := configuredController(map[string]*hcn.HostComputeEndpoint{ + "nic-1": {Id: "ep-1", MacAddress: "aa:bb:cc:dd:ee:01", Name: "eth0"}, + "nic-2": {Id: "ep-2", MacAddress: "aa:bb:cc:dd:ee:02", Name: "eth1"}, + }) + + env, err := c.Save(t.Context()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if env.GetTypeUrl() != netsave.TypeURL { + t.Errorf("expected type URL %q, got %q", netsave.TypeURL, env.GetTypeUrl()) + } + + got := &netsave.Payload{} + if err := proto.Unmarshal(env.GetValue(), got); err != nil { + t.Fatalf("unmarshal saved payload: %v", err) + } + if got.GetSchemaVersion() != netsave.SchemaVersion { + t.Errorf("expected schema version %d, got %d", netsave.SchemaVersion, got.GetSchemaVersion()) + } + if got.GetNamespaceID() != "ns-1" || !got.GetPolicyBasedRouting() || !got.GetIsNamespaceSupportedByGuest() { + t.Errorf("scalar config not preserved: %+v", got) + } + if len(got.GetVmEndpoints()) != 2 { + t.Fatalf("expected 2 endpoint bindings, got %d", len(got.GetVmEndpoints())) + } + b1 := got.GetVmEndpoints()["nic-1"] + if b1.GetEndpointID() != "ep-1" || b1.GetMacAddress() != "aa:bb:cc:dd:ee:01" || b1.GetEndpointName() != "eth0" { + t.Errorf("nic-1 binding not preserved: %+v", b1) + } + + // A successful save freezes the source until it is resumed or torn down. + if c.netState != StateSourceMigrating { + t.Errorf("expected state SourceMigrating after Save, got %s", c.netState) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Import +// ───────────────────────────────────────────────────────────────────────────── + +// TestImport_Rejects verifies that a destination refuses any envelope it cannot +// safely interpret: a missing envelope, a foreign type, undecodable bytes, an +// unknown schema version, or a binding missing its NIC key. 
+func TestImport_Rejects(t *testing.T) { + cases := []struct { + name string + env *anypb.Any + }{ + {"NilEnvelope", nil}, + {"WrongTypeURL", &anypb.Any{TypeUrl: "type.microsoft.com/bogus", Value: nil}}, + {"CorruptPayload", &anypb.Any{TypeUrl: netsave.TypeURL, Value: []byte{0x08}}}, + {"SchemaMismatch", mustEnvelope(t, &netsave.Payload{SchemaVersion: netsave.SchemaVersion + 1})}, + {"EmptyNICKey", mustEnvelope(t, &netsave.Payload{ + SchemaVersion: netsave.SchemaVersion, + VmEndpoints: map[string]*netsave.EndpointBinding{"": {EndpointID: "ep-1"}}, + })}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + c, err := Import(t.Context(), tc.env) + if err == nil { + t.Fatal("expected error, got nil") + } + if c != nil { + t.Errorf("expected nil controller on failure, got %+v", c) + } + }) + } +} + +// TestImport_Success verifies a valid envelope rehydrates into a migrating +// controller (not yet operational) with all scalar config and bindings restored. +func TestImport_Success(t *testing.T) { + env := mustEnvelope(t, &netsave.Payload{ + SchemaVersion: netsave.SchemaVersion, + NamespaceID: "ns-1", + PolicyBasedRouting: true, + IsNamespaceSupportedByGuest: true, + VmEndpoints: map[string]*netsave.EndpointBinding{ + "nic-1": {EndpointID: "ep-1", MacAddress: "aa:bb:cc:dd:ee:01", EndpointName: "eth0"}, + }, + }) + + c, err := Import(t.Context(), env) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if c.netState != StateDestinationMigrating { + t.Errorf("expected state DestinationMigrating, got %s", c.netState) + } + if c.namespaceID != "ns-1" || !c.policyBasedRouting || !c.isNamespaceSupportedByGuest { + t.Errorf("scalar config not restored: %+v", c) + } + ep, ok := c.vmEndpoints["nic-1"] + if !ok || ep.Id != "ep-1" || ep.MacAddress != "aa:bb:cc:dd:ee:01" || ep.Name != "eth0" { + t.Errorf("nic-1 binding not restored: %+v (present=%v)", ep, ok) + } +} + +// TestSaveImport_RoundTrip verifies that the destination reconstructs exactly +// 
what the source saved, leaving the rehydrated controller non-operational +// until it is resumed. +func TestSaveImport_RoundTrip(t *testing.T) { + src := configuredController(map[string]*hcn.HostComputeEndpoint{ + "nic-1": {Id: "ep-1", MacAddress: "aa:bb:cc:dd:ee:01", Name: "eth0"}, + "nic-2": {Id: "ep-2", MacAddress: "aa:bb:cc:dd:ee:02", Name: "eth1"}, + }) + + env, err := src.Save(t.Context()) + if err != nil { + t.Fatalf("save: %v", err) + } + dst, err := Import(t.Context(), env) + if err != nil { + t.Fatalf("import: %v", err) + } + + if dst.netState != StateDestinationMigrating { + t.Errorf("expected state DestinationMigrating after import, got %s", dst.netState) + } + if dst.namespaceID != src.namespaceID || + dst.policyBasedRouting != src.policyBasedRouting || + dst.isNamespaceSupportedByGuest != src.isNamespaceSupportedByGuest { + t.Errorf("scalar config drifted across round-trip: src=%+v dst=%+v", src, dst) + } + if len(dst.vmEndpoints) != len(src.vmEndpoints) { + t.Fatalf("expected %d endpoints, got %d", len(src.vmEndpoints), len(dst.vmEndpoints)) + } + for nicID, want := range src.vmEndpoints { + got, ok := dst.vmEndpoints[nicID] + if !ok || got.Id != want.Id || got.MacAddress != want.MacAddress || got.Name != want.Name { + t.Errorf("binding %s drifted: want %+v got %+v (present=%v)", nicID, want, got, ok) + } + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Patch / Resume +// ───────────────────────────────────────────────────────────────────────────── + +// TestPatch_RecordsDestinationNamespace verifies the destination namespace +// supplied for rebinding is retained for the later migration reset. 
+func TestPatch_RecordsDestinationNamespace(t *testing.T) { + c := &Controller{netState: StateDestinationMigrating} + + c.Patch(t.Context(), "dst-ns") + + if c.migratedNamespaceID != "dst-ns" { + t.Errorf("expected migrated namespace dst-ns, got %q", c.migratedNamespaceID) + } +} + +// TestResume_TransitionsToConfigured verifies that resuming returns a migrating +// controller to the operational state — binding the live VM/guest on the +// destination and rolling the snapshot back on the source. +func TestResume_TransitionsToConfigured(t *testing.T) { + for _, st := range []State{StateDestinationMigrating, StateSourceMigrating} { + t.Run(st.String(), func(t *testing.T) { + c := &Controller{ + netState: st, + vmEndpoints: map[string]*hcn.HostComputeEndpoint{}, + } + + c.Resume(t.Context(), (*vmmanager.UtilityVM)(nil), (*guestmanager.Guest)(nil)) + + if c.netState != StateConfigured { + t.Errorf("expected state Configured after resume, got %s", c.netState) + } + }) + } +} diff --git a/internal/controller/network/state.go b/internal/controller/network/state.go index ce41a7faaf..b28605210f 100644 --- a/internal/controller/network/state.go +++ b/internal/controller/network/state.go @@ -4,7 +4,7 @@ package network // State represents the current lifecycle state of the network for a pod. // -// The normal progression is: +// The normal (live-creation) progression is: // // StateNotConfigured → StateConfigured → StateTornDown // @@ -12,15 +12,27 @@ package network // transitions to [StateInvalid] instead. // A network in [StateInvalid] can only be cleaned up via [Controller.Teardown]. // +// Live migration adds two branches. On the destination, [Import] rehydrates the +// controller into [StateDestinationMigrating] until [Controller.Resume] binds the +// live host/guest interfaces (→ [StateConfigured]) or [Controller.Teardown] aborts +// it (→ [StateTornDown]). 
On the source, [Controller.Save] freezes a configured +// network into [StateSourceMigrating]; [Controller.Resume] rolls it back +// (→ [StateConfigured]) or [Controller.Teardown] tears it down (→ [StateTornDown]). +// // Full state-transition table: // -// Current State │ Trigger │ Next State -// ────────────────────┼──────────────────┼────────────────── -// StateNotConfigured │ Setup succeeds │ StateConfigured -// StateNotConfigured │ Setup fails │ StateInvalid -// StateConfigured │ Teardown called │ StateTornDown -// StateInvalid │ Teardown called │ StateTornDown -// StateTornDown │ (terminal) │ — +// Current State │ Trigger │ Next State +// ──────────────────────────┼──────────────────┼────────────────────── +// StateNotConfigured │ Setup succeeds │ StateConfigured +// StateNotConfigured │ Setup fails │ StateInvalid +// StateConfigured │ Save freezes src │ StateSourceMigrating +// StateConfigured │ Teardown called │ StateTornDown +// StateInvalid │ Teardown called │ StateTornDown +// StateDestinationMigrating │ Resume called │ StateConfigured +// StateDestinationMigrating │ Teardown called │ StateTornDown +// StateSourceMigrating │ Resume called │ StateConfigured +// StateSourceMigrating │ Teardown called │ StateTornDown +// StateTornDown │ (terminal) │ — type State int32 const ( @@ -47,6 +59,14 @@ const ( // (regardless of whether Setup previously succeeded or failed). // No further calls to Setup or Teardown are permitted. StateTornDown + + // StateDestinationMigrating indicates a controller rehydrated from a snapshot + // on the destination, awaiting Resume (→ StateConfigured) or Teardown (→ StateTornDown). + StateDestinationMigrating + + // StateSourceMigrating indicates a configured network frozen by Save on the + // source, awaiting Resume (→ StateConfigured) or Teardown (→ StateTornDown). + StateSourceMigrating ) // String returns a human-readable string representation of the network State. 
@@ -60,6 +80,10 @@ func (s State) String() string { return "Invalid" case StateTornDown: return "TornDown" + case StateDestinationMigrating: + return "DestinationMigrating" + case StateSourceMigrating: + return "SourceMigrating" default: return "Unknown" } diff --git a/internal/controller/pod/doc.go b/internal/controller/pod/doc.go index 770f2d6d25..fc1b4812a9 100644 --- a/internal/controller/pod/doc.go +++ b/internal/controller/pod/doc.go @@ -10,4 +10,11 @@ // the [network.Controller]. // - Creating, retrieving, listing, and deleting container controllers // within the pod. +// +// # Migration +// +// Taking a snapshot blocks the pod's operations until migration is resumed, +// so its live state cannot diverge from the captured snapshot during handoff. +// A pod reconstructed from a snapshot on the destination is likewise blocked +// until resumed. package pod diff --git a/internal/controller/pod/mocks/mock_types.go b/internal/controller/pod/mocks/mock_types.go index 5059ad096f..31c181e909 100644 --- a/internal/controller/pod/mocks/mock_types.go +++ b/internal/controller/pod/mocks/mock_types.go @@ -20,7 +20,9 @@ import ( vpci "github.com/Microsoft/hcsshim/internal/controller/device/vpci" network "github.com/Microsoft/hcsshim/internal/controller/network" guestmanager "github.com/Microsoft/hcsshim/internal/vm/guestmanager" + vmmanager "github.com/Microsoft/hcsshim/internal/vm/vmmanager" gomock "go.uber.org/mock/gomock" + anypb "google.golang.org/protobuf/types/known/anypb" ) // MockvmController is a mock of vmController interface. @@ -104,17 +106,32 @@ func (mr *MockvmControllerMockRecorder) RuntimeID() *gomock.Call { } // SCSIController mocks base method. 
-func (m *MockvmController) SCSIController() *scsi.Controller { +func (m *MockvmController) SCSIController(ctx context.Context) (*scsi.Controller, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "SCSIController") + ret := m.ctrl.Call(m, "SCSIController", ctx) ret0, _ := ret[0].(*scsi.Controller) - return ret0 + ret1, _ := ret[1].(error) + return ret0, ret1 } // SCSIController indicates an expected call of SCSIController. -func (mr *MockvmControllerMockRecorder) SCSIController() *gomock.Call { +func (mr *MockvmControllerMockRecorder) SCSIController(ctx any) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SCSIController", reflect.TypeOf((*MockvmController)(nil).SCSIController)) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SCSIController", reflect.TypeOf((*MockvmController)(nil).SCSIController), ctx) +} + +// VM mocks base method. +func (m *MockvmController) VM() *vmmanager.UtilityVM { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "VM") + ret0, _ := ret[0].(*vmmanager.UtilityVM) + return ret0 +} + +// VM indicates an expected call of VM. +func (mr *MockvmControllerMockRecorder) VM() *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "VM", reflect.TypeOf((*MockvmController)(nil).VM)) } // VPCIController mocks base method. @@ -155,6 +172,59 @@ func (m *MocknetworkController) EXPECT() *MocknetworkControllerMockRecorder { return m.recorder } +// Patch mocks base method. +func (m *MocknetworkController) Patch(ctx context.Context, networkNamespaceID string) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Patch", ctx, networkNamespaceID) +} + +// Patch indicates an expected call of Patch. 
+func (mr *MocknetworkControllerMockRecorder) Patch(ctx, networkNamespaceID any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Patch", reflect.TypeOf((*MocknetworkController)(nil).Patch), ctx, networkNamespaceID) +} + +// ResetAfterMigration mocks base method. +func (m *MocknetworkController) ResetAfterMigration(ctx context.Context) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "ResetAfterMigration", ctx) + ret0, _ := ret[0].(error) + return ret0 +} + +// ResetAfterMigration indicates an expected call of ResetAfterMigration. +func (mr *MocknetworkControllerMockRecorder) ResetAfterMigration(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "ResetAfterMigration", reflect.TypeOf((*MocknetworkController)(nil).ResetAfterMigration), ctx) +} + +// Resume mocks base method. +func (m *MocknetworkController) Resume(ctx context.Context, vm *vmmanager.UtilityVM, guest *guestmanager.Guest) { + m.ctrl.T.Helper() + m.ctrl.Call(m, "Resume", ctx, vm, guest) +} + +// Resume indicates an expected call of Resume. +func (mr *MocknetworkControllerMockRecorder) Resume(ctx, vm, guest any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Resume", reflect.TypeOf((*MocknetworkController)(nil).Resume), ctx, vm, guest) +} + +// Save mocks base method. +func (m *MocknetworkController) Save(ctx context.Context) (*anypb.Any, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "Save", ctx) + ret0, _ := ret[0].(*anypb.Any) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// Save indicates an expected call of Save. +func (mr *MocknetworkControllerMockRecorder) Save(ctx any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "Save", reflect.TypeOf((*MocknetworkController)(nil).Save), ctx) +} + // Setup mocks base method. 
func (m *MocknetworkController) Setup(ctx context.Context) error { m.ctrl.T.Helper() diff --git a/internal/controller/pod/pod_lcow.go b/internal/controller/pod/pod_lcow.go index d0b7a1840c..a5848bbb9c 100644 --- a/internal/controller/pod/pod_lcow.go +++ b/internal/controller/pod/pod_lcow.go @@ -30,6 +30,10 @@ type Controller struct { // containers maps containerID → [linuxcontainer.Controller] for every // live container in this pod. Access must be guarded by mu. containers map[string]*linuxcontainer.Controller + + // isMigrating rejects operations while set: true once a snapshot has been + // taken or imported, until migration is resumed. Guarded by mu. + isMigrating bool } // New creates a ready-to-use [Controller] for the given pod. @@ -49,6 +53,14 @@ func New( } } +// PodID returns the pod's containerd-facing identifier. +func (c *Controller) PodID() string { + c.mu.RLock() + defer c.mu.RUnlock() + + return c.podID +} + // SetupNetwork performs network setup for the pod. func (c *Controller) SetupNetwork(ctx context.Context) error { if err := c.network.Setup(ctx); err != nil { @@ -84,17 +96,27 @@ func (c *Controller) NewContainer(ctx context.Context, containerID string) (*lin c.mu.Lock() defer c.mu.Unlock() + // The VM is not bound until Resume, so reject new containers while inert. + if c.isMigrating { + return nil, fmt.Errorf("pod %q is migrating; call Resume first", c.podID) + } + // Ensure we don't create a duplicate container controller. 
if _, ok := c.containers[containerID]; ok { return nil, fmt.Errorf("container %q already exists in pod %q", containerID, c.podID) } + scsiCtrl, err := c.vm.SCSIController(ctx) + if err != nil { + return nil, fmt.Errorf("get SCSI controller: %w", err) + } + containerCtrl := linuxcontainer.New( c.vm.RuntimeID(), c.gcsPodID, containerID, c.vm.Guest(), - c.vm.SCSIController(), + scsiCtrl, c.vm.Plan9Controller(), c.vm.VPCIController(), ) @@ -128,3 +150,15 @@ func (c *Controller) DeleteContainer(ctx context.Context, containerID string) er delete(c.containers, containerID) return nil } + +// AbortMigrated marks every container in the pod as stopped and reports their +// exit, so that containerd no longer sees them as UNKNOWN and can delete them. +// This is primarily used on destination during finalize stop. +func (c *Controller) AbortMigrated(ctx context.Context, events chan interface{}) { + c.mu.Lock() + defer c.mu.Unlock() + + for _, ctr := range c.containers { + ctr.AbortMigrated(ctx, events) + } +} diff --git a/internal/controller/pod/pod_lcow_test.go b/internal/controller/pod/pod_lcow_test.go index 5e28a6ad8a..58a6c8122a 100644 --- a/internal/controller/pod/pod_lcow_test.go +++ b/internal/controller/pod/pod_lcow_test.go @@ -42,7 +42,7 @@ func newSetup(t *testing.T) (*mocks.MockvmController, *mocks.MocknetworkControll func expectVMCallsForNewContainer(vm *mocks.MockvmController) { vm.EXPECT().RuntimeID().Return("vm-runtime-1") vm.EXPECT().Guest().Return(nil) - vm.EXPECT().SCSIController().Return(nil) + vm.EXPECT().SCSIController(gomock.Any()).Return(nil, nil) vm.EXPECT().Plan9Controller().Return(nil) vm.EXPECT().VPCIController().Return(nil) } @@ -193,6 +193,22 @@ func TestNewContainer(t *testing.T) { }) } +// TestNewContainer_WhileMigrating verifies that an imported, not-yet-resumed +// pod rejects NewContainer with a clear error instead of touching its nil VM. 
+func TestNewContainer_WhileMigrating(t *testing.T) { + c := &Controller{ + podID: testPodID, + containers: make(map[string]*linuxcontainer.Controller), + isMigrating: true, + } + if _, err := c.NewContainer(t.Context(), "container-1"); err == nil { + t.Fatal("expected error creating a container on a migrating pod") + } + if len(c.containers) != 0 { + t.Error("expected no container to be registered while migrating") + } +} + // TestListContainers verifies snapshots of the live container map. func TestListContainers(t *testing.T) { t.Run("empty", func(t *testing.T) { diff --git a/internal/controller/pod/save_lcow.go b/internal/controller/pod/save_lcow.go new file mode 100644 index 0000000000..a3ebd2fc3d --- /dev/null +++ b/internal/controller/pod/save_lcow.go @@ -0,0 +1,251 @@ +//go:build windows && lcow + +package pod + +import ( + "context" + "fmt" + "sort" + + "github.com/Microsoft/hcsshim/internal/controller/device/scsi" + "github.com/Microsoft/hcsshim/internal/controller/linuxcontainer" + "github.com/Microsoft/hcsshim/internal/controller/network" + podsave "github.com/Microsoft/hcsshim/internal/controller/pod/save" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + "github.com/containerd/containerd/api/runtime/task/v3" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" +) + +// Save returns a serialized snapshot of the pod — its identifiers plus the +// network and per-container state — wrapped in an [anypb.Any] for the caller +// to ship to a migration destination. After it returns, all operations are +// rejected until migration is resumed. +func (c *Controller) Save(ctx context.Context) (*anypb.Any, error) { + c.mu.Lock() + defer c.mu.Unlock() + + // Snapshot containers in a fixed order so the same pod always serializes + // to the same bytes, which keeps snapshot diffs and tests stable. 
+ ids := make([]string, 0, len(c.containers)) + for id := range c.containers { + ids = append(ids, id) + } + sort.Strings(ids) + + // Serialize each container into its own opaque envelope. + containers := make([]*anypb.Any, 0, len(ids)) + for _, id := range ids { + cs, err := c.containers[id].Save(ctx) + if err != nil { + return nil, fmt.Errorf("save container %q: %w", id, err) + } + containers = append(containers, cs) + } + + // Assemble the pod-level payload with its identifiers and children. + state := &podsave.Payload{ + SchemaVersion: podsave.SchemaVersion, + PodID: c.podID, + GcsPodID: c.gcsPodID, + Containers: containers, + } + + // Fold in the network snapshot. + if c.network != nil { + ns, err := c.network.Save(ctx) + if err != nil { + return nil, fmt.Errorf("save network controller: %w", err) + } + state.Network = ns + } + + // Marshal the assembled payload into the typed migration envelope. + payload, err := proto.Marshal(state) + if err != nil { + return nil, fmt.Errorf("marshal pod saved state for %q: %w", c.podID, err) + } + + // Block all further operations until migration is resumed. + c.isMigrating = true + + log.G(ctx).WithField(logfields.SourcePodID, c.podID).Debug("saved pod state") + + return &anypb.Any{TypeUrl: podsave.TypeURL, Value: payload}, nil +} + +// Import reconstructs a pod [Controller] from a [Controller.Save] envelope. +// The returned controller is inert: its network and containers are restored +// but not bound to a live VM, so it does nothing until [Controller.Resume]. +func Import(ctx context.Context, env *anypb.Any) (*Controller, error) { + // Reject an empty or mistyped envelope before touching its bytes. + if env == nil { + return nil, fmt.Errorf("pod saved-state envelope is nil") + } + + if env.GetTypeUrl() != podsave.TypeURL { + return nil, fmt.Errorf("unsupported pod saved-state type %q", env.GetTypeUrl()) + } + + // Decode and reject any payload this build cannot interpret. 
+ state := &podsave.Payload{} + if err := proto.Unmarshal(env.GetValue(), state); err != nil { + return nil, fmt.Errorf("unmarshal pod saved state: %w", err) + } + + if v := state.GetSchemaVersion(); v != podsave.SchemaVersion { + return nil, fmt.Errorf("unsupported pod saved-state schema version %d (want %d)", v, podsave.SchemaVersion) + } + + // Restore the network controller in its own inert state. + netCtrl, err := network.Import(ctx, state.GetNetwork()) + if err != nil { + return nil, fmt.Errorf("import network controller: %w", err) + } + + // Rebuild the pod shell with its identifiers and the restored network. + c := &Controller{ + podID: state.GetPodID(), + gcsPodID: state.GetGcsPodID(), + containers: make(map[string]*linuxcontainer.Controller, len(state.GetContainers())), + network: netCtrl, + isMigrating: true, + } + + // Rehydrate each container and re-key by its own restored ID. + for _, cAny := range state.GetContainers() { + ctr, err := linuxcontainer.Import(ctx, cAny) + if err != nil { + return nil, fmt.Errorf("import container in pod %q: %w", c.podID, err) + } + c.containers[ctr.ContainerID()] = ctr + } + + log.G(ctx).WithField(logfields.SourcePodID, c.podID).Debug("imported pod state") + + return c, nil +} + +// Resume brings an imported pod back online on the destination host: it binds +// the live VM, reattaches the network, and resumes every container so their +// tasks become visible to containerd again. +// +// Must be called only after the VM's GCS bridge is up, since each container +// reopens its guest container over that bridge. Each container republishes a +// TaskCreate on events upon resume. +func (c *Controller) Resume(ctx context.Context, vm vmController, events chan interface{}) error { + c.mu.Lock() + defer c.mu.Unlock() + + // Bind the live VM and re-wire the network to it. + c.vm = vm + c.isMigrating = false + c.network.Resume(ctx, vm.VM(), vm.Guest()) + + // Fetch the SCSI controller shared by all containers in this VM. 
+	scsiCtrl, err := vm.SCSIController(ctx)
+	if err != nil {
+		return fmt.Errorf("get SCSI controller: %w", err)
+	}
+
+	// Resume each container against the live guest and device controllers.
+	for _, ctr := range c.containers {
+		if err := ctr.Resume(
+			ctx,
+			vm.RuntimeID(),
+			c.gcsPodID,
+			vm.Guest(),
+			scsiCtrl,
+			vm.Plan9Controller(),
+			vm.VPCIController(),
+			events,
+		); err != nil {
+			return fmt.Errorf("resume container %q: %w", ctr.ContainerID(), err)
+		}
+	}
+
+	// Swap the source-side NIC bindings for the destination namespace's now
+	// that containers are live here.
+	if err := c.network.ResetAfterMigration(ctx); err != nil {
+		return fmt.Errorf("reset network for migration: %w", err)
+	}
+
+	log.G(ctx).WithField(logfields.DestinationPodID, c.podID).Debug("resumed pod")
+
+	return nil
+}
+
+// Patch retargets a migrated container at the task containerd created on this
+// destination host: new container ID, IO, and disk paths from request. For the
+// sandbox container it also adopts the new pod ID and records the destination
+// network namespace for a later attach. Every pod-level input is validated
+// before the container is patched; errors from the container are wrapped.
+func (c *Controller) Patch(
+	ctx context.Context,
+	sourceContainerID string,
+	isSandbox bool,
+	scsiCtrl *scsi.Controller,
+	request *task.CreateTaskRequest,
+	spec specs.Spec,
+) error {
+	// A destination request with a container ID is required before we mutate
+	// any state.
+	if request == nil || request.ID == "" {
+		return fmt.Errorf("invalid create task request: %+v", request)
+	}
+
+	// Lock before logging: podID is guarded by mu and is rewritten below.
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	log.G(ctx).WithFields(logrus.Fields{
+		logfields.SourcePodID:      c.podID,
+		logfields.DestinationPodID: request.ID,
+		"IsSandbox":                isSandbox,
+		"Spec":                     log.Format(ctx, spec),
+	}).Debug("patching pod")
+
+	// Resolve the container by its source-side ID.
+	ctr, ok := c.containers[sourceContainerID]
+	if !ok {
+		return fmt.Errorf("source container %q not found in pod %q", sourceContainerID, c.podID)
+	}
+
+	// If the ID is changing, reject the rename when a different container
+	// already occupies the destination ID.
+	if _, exists := c.containers[request.ID]; exists && sourceContainerID != request.ID {
+		return fmt.Errorf("destination container %q already exists in pod %q", request.ID, c.podID)
+	}
+
+	// Check the sandbox namespace up front so a bad spec leaves the pod untouched.
+	if isSandbox && (spec.Windows == nil || spec.Windows.Network == nil || spec.Windows.Network.NetworkNamespace == "") {
+		return fmt.Errorf("windows network namespace is required for sandbox container")
+	}
+
+	// Retarget the container's identity and resource paths to the destination.
+	if err := ctr.Patch(ctx, scsiCtrl, request); err != nil {
+		return fmt.Errorf("patch source container %q: %w", sourceContainerID, err)
+	}
+
+	// Re-key the container under its new ID once the patch succeeds.
+	if sourceContainerID != request.ID {
+		delete(c.containers, sourceContainerID)
+		c.containers[request.ID] = ctr
+	}
+
+	// A sandbox container also carries the pod identity and network namespace.
+	if isSandbox {
+		// Adopt the destination pod ID.
+		c.podID = request.ID
+
+		// Hand the destination namespace to the network controller for later attach.
+		c.network.Patch(ctx, spec.Windows.Network.NetworkNamespace)
+	}
+
+	log.G(ctx).WithField(logfields.DestinationPodID, c.podID).Debug("patched migrated pod")
+	return nil
+}
diff --git a/internal/controller/pod/save_lcow_test.go b/internal/controller/pod/save_lcow_test.go
new file mode 100644
index 0000000000..bdb7bbaf99
--- /dev/null
+++ b/internal/controller/pod/save_lcow_test.go
@@ -0,0 +1,478 @@
+//go:build windows && lcow
+
+package pod
+
+import (
+	"testing"
+
+	"github.com/containerd/containerd/api/runtime/task/v3"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"go.uber.org/mock/gomock"
+	"google.golang.org/protobuf/proto"
+	"google.golang.org/protobuf/types/known/anypb"
+
+	"github.com/Microsoft/hcsshim/internal/controller/linuxcontainer"
+	lcsave "github.com/Microsoft/hcsshim/internal/controller/linuxcontainer/save"
+	netsave "github.com/Microsoft/hcsshim/internal/controller/network/save"
+	"github.com/Microsoft/hcsshim/internal/controller/pod/mocks"
+	podsave "github.com/Microsoft/hcsshim/internal/controller/pod/save"
+	procsave "github.com/Microsoft/hcsshim/internal/controller/process/save"
+)
+
+// migratingContainer restores a real container into StateMigrating with a
+// single init process whose IO paths are empty, so Patch needs no live guest.
+func migratingContainer(t *testing.T, id string) *linuxcontainer.Controller {
+	t.Helper()
+	proc := mustAny(t, procsave.TypeURL, &procsave.Payload{SchemaVersion: procsave.SchemaVersion})
+	env := mustAny(t, lcsave.TypeURL, &lcsave.Payload{
+		SchemaVersion:  lcsave.SchemaVersion,
+		ContainerID:    id,
+		GcsContainerID: id,
+		Processes:      map[string]*anypb.Any{"": proc},
+	})
+	ctr, err := linuxcontainer.Import(t.Context(), env)
+	if err != nil {
+		t.Fatalf("import migrating container %q: %v", id, err)
+	}
+	return ctr
+}
+
+// mustAny marshals a message and wraps it in the given typed envelope.
+func mustAny(t *testing.T, typeURL string, m proto.Message) *anypb.Any { + t.Helper() + b, err := proto.Marshal(m) + if err != nil { + t.Fatalf("marshal %s: %v", typeURL, err) + } + return &anypb.Any{TypeUrl: typeURL, Value: b} +} + +// containerEnvelope builds a minimal, importable container envelope. +func containerEnvelope(t *testing.T, id string) *anypb.Any { + t.Helper() + return mustAny(t, lcsave.TypeURL, &lcsave.Payload{SchemaVersion: lcsave.SchemaVersion, ContainerID: id}) +} + +// newNetMock returns a network controller mock wired to a fresh gomock controller. +func newNetMock(t *testing.T) *mocks.MocknetworkController { + t.Helper() + return mocks.NewMocknetworkController(gomock.NewController(t)) +} + +// TestSave covers the snapshot envelope a caller receives, plus the error +// paths surfaced by the network and container children. +func TestSave(t *testing.T) { + tests := []struct { + name string + build func(t *testing.T) *Controller + wantErr bool + // hasNet asserts whether the serialized payload carries a network envelope. 
+ hasNet bool + }{ + { + name: "no containers, nil network", + build: func(t *testing.T) *Controller { + t.Helper() + return &Controller{podID: testPodID, gcsPodID: testPodID, containers: map[string]*linuxcontainer.Controller{}} + }, + }, + { + name: "no containers, with network", + build: func(t *testing.T) *Controller { + t.Helper() + net := newNetMock(t) + net.EXPECT().Save(gomock.Any()).Return(mustAny(t, netsave.TypeURL, &netsave.Payload{SchemaVersion: netsave.SchemaVersion}), nil) + return &Controller{podID: testPodID, gcsPodID: testPodID, network: net, containers: map[string]*linuxcontainer.Controller{}} + }, + hasNet: true, + }, + { + name: "network save fails", + build: func(t *testing.T) *Controller { + t.Helper() + net := newNetMock(t) + net.EXPECT().Save(gomock.Any()).Return(nil, errTest) + return &Controller{podID: testPodID, gcsPodID: testPodID, network: net, containers: map[string]*linuxcontainer.Controller{}} + }, + wantErr: true, + }, + { + name: "container not running", + build: func(t *testing.T) *Controller { + t.Helper() + // A freshly created container is not StateRunning, so its Save fails. 
+ ctr := linuxcontainer.New("vm-1", testPodID, "container-1", nil, nil, nil, nil) + return &Controller{podID: testPodID, gcsPodID: testPodID, containers: map[string]*linuxcontainer.Controller{"container-1": ctr}} + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := tt.build(t) + env, err := c.Save(t.Context()) + if tt.wantErr { + if err == nil { + t.Fatal("expected error, got nil") + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if env.GetTypeUrl() != podsave.TypeURL { + t.Errorf("type url = %q, want %q", env.GetTypeUrl(), podsave.TypeURL) + } + state := &podsave.Payload{} + if err := proto.Unmarshal(env.GetValue(), state); err != nil { + t.Fatalf("unmarshal payload: %v", err) + } + if (state.GetNetwork() != nil) != tt.hasNet { + t.Errorf("network present = %v, want %v", state.GetNetwork() != nil, tt.hasNet) + } + }) + } +} + +// TestSaveBlocksSourceUntilResume verifies that taking a snapshot marks the +// source pod as migrating so its operations are rejected until Resume. +func TestSaveBlocksSourceUntilResume(t *testing.T) { + src := &Controller{podID: testPodID, gcsPodID: testPodID, containers: map[string]*linuxcontainer.Controller{}} + + if _, err := src.Save(t.Context()); err != nil { + t.Fatalf("Save: %v", err) + } + if !src.isMigrating { + t.Fatal("expected source pod to be migrating after Save") + } + // A blocked operation now fails until Resume rebinds the live VM. + if _, err := src.NewContainer(t.Context(), "container-1"); err == nil { + t.Fatal("expected NewContainer to fail on a migrating source pod") + } +} + +// TestImport covers envelope validation and the fields a caller can observe +// on the reconstructed controller. 
+func TestImport(t *testing.T) { + tests := []struct { + name string + env func(t *testing.T) *anypb.Any + wantErr bool + check func(t *testing.T, c *Controller) + }{ + { + name: "nil envelope", + env: func(t *testing.T) *anypb.Any { t.Helper(); return nil }, + wantErr: true, + }, + { + name: "wrong type url", + env: func(t *testing.T) *anypb.Any { t.Helper(); return &anypb.Any{TypeUrl: "type.microsoft.com/bogus"} }, + wantErr: true, + }, + { + name: "corrupt payload", + env: func(t *testing.T) *anypb.Any { + t.Helper() + return &anypb.Any{TypeUrl: podsave.TypeURL, Value: []byte{0xff}} + }, + wantErr: true, + }, + { + name: "unsupported schema version", + env: func(t *testing.T) *anypb.Any { + t.Helper() + return mustAny(t, podsave.TypeURL, &podsave.Payload{SchemaVersion: podsave.SchemaVersion + 1, PodID: testPodID}) + }, + wantErr: true, + }, + { + name: "network import fails", + env: func(t *testing.T) *anypb.Any { + t.Helper() + return mustAny(t, podsave.TypeURL, &podsave.Payload{ + SchemaVersion: podsave.SchemaVersion, + PodID: testPodID, + Network: &anypb.Any{TypeUrl: "type.microsoft.com/bogus"}, + }) + }, + wantErr: true, + }, + { + name: "container import fails", + env: func(t *testing.T) *anypb.Any { + t.Helper() + return mustAny(t, podsave.TypeURL, &podsave.Payload{ + SchemaVersion: podsave.SchemaVersion, + PodID: testPodID, + Containers: []*anypb.Any{{TypeUrl: "type.microsoft.com/bogus"}}, + }) + }, + wantErr: true, + }, + { + name: "valid, no containers", + env: func(t *testing.T) *anypb.Any { + t.Helper() + return mustAny(t, podsave.TypeURL, &podsave.Payload{ + SchemaVersion: podsave.SchemaVersion, + PodID: testPodID, + GcsPodID: testPodID, + Network: mustAny(t, netsave.TypeURL, &netsave.Payload{SchemaVersion: netsave.SchemaVersion}), + }) + }, + check: func(t *testing.T, c *Controller) { + t.Helper() + if c.podID != testPodID || c.gcsPodID != testPodID { + t.Errorf("ids = (%q, %q), want (%q, %q)", c.podID, c.gcsPodID, testPodID, testPodID) + } + if 
c.network == nil { + t.Error("expected non-nil network controller") + } + if len(c.containers) != 0 { + t.Errorf("expected no containers, got %d", len(c.containers)) + } + if !c.isMigrating { + t.Error("expected imported pod to be migrating until Resume") + } + }, + }, + { + name: "valid, with container", + env: func(t *testing.T) *anypb.Any { + t.Helper() + return mustAny(t, podsave.TypeURL, &podsave.Payload{ + SchemaVersion: podsave.SchemaVersion, + PodID: testPodID, + GcsPodID: testPodID, + Network: mustAny(t, netsave.TypeURL, &netsave.Payload{SchemaVersion: netsave.SchemaVersion}), + Containers: []*anypb.Any{containerEnvelope(t, "container-1")}, + }) + }, + check: func(t *testing.T, c *Controller) { + t.Helper() + if _, ok := c.containers["container-1"]; !ok { + t.Error("expected container-1 to be re-keyed by its restored ID") + } + }, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c, err := Import(t.Context(), tt.env(t)) + if tt.wantErr { + if err == nil { + t.Fatal("expected error, got nil") + } + return + } + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + tt.check(t, c) + }) + } +} + +// TestSaveImportRoundTrip verifies that a saved pod reconstructs to an +// equivalent controller a caller can observe. 
+func TestSaveImportRoundTrip(t *testing.T) { + net := newNetMock(t) + net.EXPECT().Save(gomock.Any()).Return(mustAny(t, netsave.TypeURL, &netsave.Payload{SchemaVersion: netsave.SchemaVersion}), nil) + src := &Controller{podID: testPodID, gcsPodID: testPodID, network: net, containers: map[string]*linuxcontainer.Controller{}} + + env, err := src.Save(t.Context()) + if err != nil { + t.Fatalf("Save: %v", err) + } + + got, err := Import(t.Context(), env) + if err != nil { + t.Fatalf("Import: %v", err) + } + if got.podID != testPodID || got.gcsPodID != testPodID { + t.Errorf("ids = (%q, %q), want (%q, %q)", got.podID, got.gcsPodID, testPodID, testPodID) + } + if got.network == nil { + t.Error("expected non-nil network controller after round trip") + } +} + +// TestResume covers binding a live VM and re-wiring the network for an +// imported pod with no containers. +func TestResume(t *testing.T) { + tests := []struct { + name string + scsiErr error + resetErr error + wantErr bool + }{ + {name: "happy path"}, + {name: "scsi controller fails", scsiErr: errTest, wantErr: true}, + {name: "network reset fails", resetErr: errTest, wantErr: true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mc := gomock.NewController(t) + vm := mocks.NewMockvmController(mc) + net := mocks.NewMocknetworkController(mc) + + vm.EXPECT().VM().Return(nil) + vm.EXPECT().Guest().Return(nil) + net.EXPECT().Resume(gomock.Any(), gomock.Any(), gomock.Any()) + vm.EXPECT().SCSIController(gomock.Any()).Return(nil, tt.scsiErr) + // ResetAfterMigration is only reached when SCSI lookup succeeds. 
+ if tt.scsiErr == nil { + net.EXPECT().ResetAfterMigration(gomock.Any()).Return(tt.resetErr) + } + + c := &Controller{podID: testPodID, gcsPodID: testPodID, network: net, containers: map[string]*linuxcontainer.Controller{}, isMigrating: true} + err := c.Resume(t.Context(), vm, nil) + if (err != nil) != tt.wantErr { + t.Fatalf("Resume() error = %v, wantErr %v", err, tt.wantErr) + } + // Resume clears the migrating guard so normal ops are allowed again. + if c.isMigrating { + t.Error("expected isMigrating to be cleared after Resume") + } + }) + } +} + +// TestPatch covers request validation, container lookup, ID-collision +// rejection, delegation errors, and the successful retarget (including the +// sandbox identity/namespace adoption) a caller can trigger. +func TestPatch(t *testing.T) { + tests := []struct { + name string + build func(t *testing.T) *Controller + sourceID string + isSandbox bool + request *task.CreateTaskRequest + spec specs.Spec + wantErr bool + check func(t *testing.T, c *Controller) + }{ + { + name: "nil request", + build: func(t *testing.T) *Controller { + t.Helper() + return &Controller{podID: testPodID, containers: map[string]*linuxcontainer.Controller{}} + }, + request: nil, + wantErr: true, + }, + { + name: "empty request id", + build: func(t *testing.T) *Controller { + t.Helper() + return &Controller{podID: testPodID, containers: map[string]*linuxcontainer.Controller{}} + }, + request: &task.CreateTaskRequest{ID: ""}, + wantErr: true, + }, + { + name: "source container not found", + build: func(t *testing.T) *Controller { + t.Helper() + return &Controller{podID: testPodID, containers: map[string]*linuxcontainer.Controller{}} + }, + sourceID: "missing", + request: &task.CreateTaskRequest{ID: "dst"}, + wantErr: true, + }, + { + name: "destination id already exists", + build: func(t *testing.T) *Controller { + t.Helper() + return &Controller{podID: testPodID, containers: map[string]*linuxcontainer.Controller{ + "src": 
linuxcontainer.New("vm-1", testPodID, "src", nil, nil, nil, nil), + "dst": linuxcontainer.New("vm-1", testPodID, "dst", nil, nil, nil, nil), + }} + }, + sourceID: "src", + request: &task.CreateTaskRequest{ID: "dst"}, + wantErr: true, + }, + { + name: "delegated container patch fails", + build: func(t *testing.T) *Controller { + t.Helper() + // A non-migrating container rejects Patch, surfacing as a wrapped error. + return &Controller{podID: testPodID, containers: map[string]*linuxcontainer.Controller{ + "src": linuxcontainer.New("vm-1", testPodID, "src", nil, nil, nil, nil), + }} + }, + sourceID: "src", + request: &task.CreateTaskRequest{ID: "src"}, + wantErr: true, + }, + { + name: "workload container retargeted and re-keyed", + build: func(t *testing.T) *Controller { + t.Helper() + return &Controller{podID: testPodID, containers: map[string]*linuxcontainer.Controller{"src": migratingContainer(t, "src")}} + }, + sourceID: "src", + request: &task.CreateTaskRequest{ID: "dst"}, + check: func(t *testing.T, c *Controller) { + t.Helper() + if _, ok := c.containers["dst"]; !ok { + t.Error("expected container to be re-keyed to dst") + } + if _, ok := c.containers["src"]; ok { + t.Error("expected old src key to be removed") + } + if c.podID != testPodID { + t.Errorf("podID = %q, want unchanged %q for a workload container", c.podID, testPodID) + } + }, + }, + { + name: "sandbox adopts pod id and namespace", + build: func(t *testing.T) *Controller { + t.Helper() + net := newNetMock(t) + net.EXPECT().Patch(gomock.Any(), "ns-dst") + return &Controller{podID: testPodID, network: net, containers: map[string]*linuxcontainer.Controller{"src": migratingContainer(t, "src")}} + }, + sourceID: "src", + isSandbox: true, + request: &task.CreateTaskRequest{ID: "sbx-dst"}, + spec: specs.Spec{Windows: &specs.Windows{Network: &specs.WindowsNetwork{NetworkNamespace: "ns-dst"}}}, + check: func(t *testing.T, c *Controller) { + t.Helper() + if c.podID != "sbx-dst" { + t.Errorf("podID = %q, want 
%q", c.podID, "sbx-dst") + } + if _, ok := c.containers["sbx-dst"]; !ok { + t.Error("expected sandbox container to be re-keyed to sbx-dst") + } + }, + }, + { + name: "sandbox without namespace fails", + build: func(t *testing.T) *Controller { + t.Helper() + return &Controller{podID: testPodID, containers: map[string]*linuxcontainer.Controller{"src": migratingContainer(t, "src")}} + }, + sourceID: "src", + isSandbox: true, + request: &task.CreateTaskRequest{ID: "sbx-dst"}, + spec: specs.Spec{}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := tt.build(t) + err := c.Patch(t.Context(), tt.sourceID, tt.isSandbox, nil, tt.request, tt.spec) + if (err != nil) != tt.wantErr { + t.Fatalf("Patch() error = %v, wantErr %v", err, tt.wantErr) + } + if err == nil && tt.check != nil { + tt.check(t, c) + } + }) + } +} diff --git a/internal/controller/pod/types_lcow.go b/internal/controller/pod/types_lcow.go index a2455839ba..66ba7cf5eb 100644 --- a/internal/controller/pod/types_lcow.go +++ b/internal/controller/pod/types_lcow.go @@ -10,6 +10,9 @@ import ( "github.com/Microsoft/hcsshim/internal/controller/device/vpci" "github.com/Microsoft/hcsshim/internal/controller/network" "github.com/Microsoft/hcsshim/internal/vm/guestmanager" + "github.com/Microsoft/hcsshim/internal/vm/vmmanager" + + "google.golang.org/protobuf/types/known/anypb" ) // vmController exposes the subset of the VM manager that the pod controller @@ -19,11 +22,14 @@ type vmController interface { // RuntimeID returns the unique runtime identifier for the VM. RuntimeID() string + // VM returns the vm manager used for UVM host side operations. + VM() *vmmanager.UtilityVM + // Guest returns the guest manager used for guest-side operations. Guest() *guestmanager.Guest // SCSIController returns the SCSI device controller for the VM. 
- SCSIController() *scsi.Controller + SCSIController(ctx context.Context) (*scsi.Controller, error) // VPCIController returns the vPCI device controller for the VM. VPCIController() *vpci.Controller @@ -43,4 +49,20 @@ type networkController interface { // Teardown performs network teardown for the pod. Teardown(ctx context.Context) error + + // Save returns the network controller's migration payload as an + // [anypb.Any] envelope owned by the network controller package. + Save(ctx context.Context) (*anypb.Any, error) + + // Patch supplies the destination host's network namespace ID, which is used + // later to attach endpoints when migration completes. + Patch(ctx context.Context, networkNamespaceID string) + + // Resume binds live host/guest dependencies during destination-side + // migration rehydration. + Resume(ctx context.Context, vm *vmmanager.UtilityVM, guest *guestmanager.Guest) + + // ResetAfterMigration detaches the stale source endpoints and wires up the + // destination namespace's endpoints in the guest. + ResetAfterMigration(ctx context.Context) error } diff --git a/internal/controller/process/doc.go b/internal/controller/process/doc.go index 6ac9c8816e..dbd845f1e7 100644 --- a/internal/controller/process/doc.go +++ b/internal/controller/process/doc.go @@ -1,31 +1,50 @@ //go:build windows && (lcow || wcow) // Package process provides a controller for managing individual process -// (exec) instances within a container. It handles the full lifecycle from -// creation through exit, including IO plumbing, signal delivery, and exit +// (init or exec) instances within a container. It handles the full lifecycle +// from creation through exit, including IO plumbing, signal delivery, and exit // status reporting. 
// // # Lifecycle // -// [Controller] drives a single process through a linear state machine: -// -// ┌───────────────────┐ -// │ StateNotCreated │ -// └────────┬──────────┘ -// │ Create -// ▼ -// ┌───────────────────┐ -// │ StateCreated │── Start fails / Kill / Delete──┐ -// └────────┬──────────┘ │ -// │ Start ok │ -// ▼ │ -// ┌───────────────────┐ │ -// │ StateRunning │──── process exits / Kill ──────┤ -// └───────────────────┘ │ -// ▼ -// ┌───────────────────┐ -// │ StateTerminated │ -// └───────────────────┘ +// A controller created via [New] follows the live-creation path: +// +// ┌───────────────────┐ +// │ StateNotCreated │ +// └────────┬──────────┘ +// │ Create +// ▼ +// ┌───────────────────┐ +// │ StateCreated │── Start fails / Kill / Delete──┐ +// └────────┬──────────┘ │ +// │ Start ok │ +// ▼ │ +// ┌───────────────────┐ │ +// │ StateRunning │──── process exits / Kill ──────┤ +// └───────────────────┘ │ +// ▼ +// ┌───────────────────┐ +// │ StateTerminated │ +// └───────────────────┘ +// +// Live migration adds two branches. The destination rehydrates a process via +// [Import] into StateDestinationMigrating; [Controller.Patch] rebinds its IO +// without leaving that state, and [Controller.Resume] reattaches the live +// process (→ StateRunning) or [Controller.AbortMigrated] tears it down +// (→ StateTerminated). 
The source freezes a running process via [Controller.Save] +// into StateSourceMigrating; [Controller.Resume] rolls it back (→ StateRunning), +// or its exit on source VM teardown terminates it (→ StateTerminated): +// +// destination source +// Patch (rebinds IO, stays) +// ┌──────────────┐ +// ▼ │ +// ┌───────────────────────────┐ ┌──────────────────────┐ +// │ StateDestinationMigrating │ │ StateSourceMigrating │ +// └───┬───────────────────┬───┘ └───┬──────────────┬───┘ +// │ Resume │ Abort │ Resume │ exit +// ▼ ▼ ▼ ▼ +// StateRunning StateTerminated StateRunning StateTerminated // // - [Controller.Create] sets up upstream IO connections and stores the // process spec. The controller transitions from StateNotCreated to @@ -34,7 +53,7 @@ // and spawns a background goroutine to monitor exit. The controller // transitions from StateCreated to StateRunning. // - [Controller.Kill] delivers a signal to a running process or -// terminates a created-but-not-started process. +// terminates a process that has not yet started running. // - [Controller.Delete] prepares the process for removal from the // container's process table. For a created-but-never-started process, // it transitions to StateTerminated and releases its IO resources. @@ -47,7 +66,8 @@ // // When a process is started, a background goroutine waits for the process // to exit, records the exit code and timestamp, drains all IO copies, and -// publishes a TaskExit event via the caller-supplied channel. The -// exitedCh channel is closed once all cleanup is complete, unblocking any -// [Controller.Wait] callers. +// publishes a TaskExit event when the caller supplies an events channel (an +// init process passes none, since its exit is reported by its owning +// container). The exitedCh channel is closed once all cleanup is complete, +// unblocking any [Controller.Wait] callers. 
package process diff --git a/internal/controller/process/process.go b/internal/controller/process/process.go index a3cc52e4b8..6885895d14 100644 --- a/internal/controller/process/process.go +++ b/internal/controller/process/process.go @@ -69,6 +69,14 @@ type Controller struct { // exitedCh is closed when the process has exited and all cleanup is done. exitedCh chan struct{} + + // vsock ports restored from a migrated process, used to reattach the + // stdio relay on resume. + stdinPort, stdoutPort, stderrPort uint32 + + // Wait request id carried over from a migrated process, reused on resume + // so no duplicate wait is issued. Zero if absent. + waitCallID int64 } // New creates a [Controller] for a process in the given container. @@ -225,12 +233,15 @@ func (c *Controller) Status(isDetailed bool) *task.StateResponse { if isDetailed && c.state != StateNotCreated { resp.Bundle = c.bundle - resp.Stdin = c.upstreamIO.StdinPath() - resp.Stdout = c.upstreamIO.StdoutPath() - resp.Stderr = c.upstreamIO.StderrPath() - resp.Terminal = c.upstreamIO.Terminal() resp.ExitStatus = c.exitCode resp.ExitedAt = timestamppb.New(c.exitedAt) + // upstreamIO is nil for an imported process not yet patched. + if c.upstreamIO != nil { + resp.Stdin = c.upstreamIO.StdinPath() + resp.Stdout = c.upstreamIO.StdoutPath() + resp.Stderr = c.upstreamIO.StderrPath() + resp.Terminal = c.upstreamIO.Terminal() + } } return resp @@ -267,7 +278,8 @@ func (c *Controller) CloseIO(ctx context.Context) { c.mu.RLock() defer c.mu.RUnlock() - if c.state == StateNotCreated { + // upstreamIO is nil before create or for an unpatched imported process. 
+ if c.state == StateNotCreated || c.upstreamIO == nil { return } diff --git a/internal/controller/process/process_test.go b/internal/controller/process/process_test.go index cff0dcbe8a..58cede86a7 100644 --- a/internal/controller/process/process_test.go +++ b/internal/controller/process/process_test.go @@ -211,13 +211,15 @@ func TestStart_Succeeds(t *testing.T) { t.Errorf("state after exit = %s; want StateTerminated", controller.State()) } - // Verify that a TaskExit event was published. + // Verify that a TaskExit event was published. Block (with a timeout) so the + // handleProcessExit goroutine finishes publishing — which happens after its + // final Status mock calls — before the gomock controller runs Finish. select { case event := <-events: if event == nil { t.Error("received nil event; want TaskExit") } - default: + case <-time.After(time.Second): t.Error("expected a TaskExit event in events channel; got none") } } @@ -537,6 +539,8 @@ func TestCloseIO(t *testing.T) { {"Created", StateCreated, true}, {"Running", StateRunning, true}, {"Terminated", StateTerminated, true}, + // Imported but not yet patched: upstreamIO is nil and CloseIO must not panic. + {"MigratingUnpatched", StateDestinationMigrating, false}, } for _, testCase := range tests { @@ -715,6 +719,26 @@ func TestStatus_Created_Detailed(t *testing.T) { } } +// TestStatus_MigratingUnpatched_Detailed verifies that detailed Status on an +// imported-but-not-yet-patched process (nil upstreamIO) does not panic and +// returns bundle/exit fields while leaving IO paths empty. +func TestStatus_MigratingUnpatched_Detailed(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = StateDestinationMigrating + controller.bundle = "/test/bundle" + // upstreamIO intentionally nil. 
+ + status := controller.Status(true) + + if status.Bundle != "/test/bundle" { + t.Errorf("Bundle = %q; want /test/bundle", status.Bundle) + } + if status.Stdin != "" || status.Stdout != "" || status.Stderr != "" || status.Terminal { + t.Errorf("IO fields should be empty for unpatched process, got (%q,%q,%q,%v)", status.Stdin, status.Stdout, status.Stderr, status.Terminal) + } +} + // TestStatus_Running verifies that Status reflects RUNNING state and the stored PID. // Status(false) does not access upstreamIO so no IO mock expectations are needed. func TestStatus_Running(t *testing.T) { diff --git a/internal/controller/process/save.go b/internal/controller/process/save.go new file mode 100644 index 0000000000..b9c06f01f0 --- /dev/null +++ b/internal/controller/process/save.go @@ -0,0 +1,252 @@ +//go:build windows && (lcow || wcow) + +package process + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/Microsoft/hcsshim/internal/cmd" + procsave "github.com/Microsoft/hcsshim/internal/controller/process/save" + "github.com/Microsoft/hcsshim/internal/gcs" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + + "github.com/containerd/errdefs" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + "google.golang.org/protobuf/types/known/durationpb" +) + +// Save captures a running process as a portable payload that a destination +// shim can later restore. It is only valid while the process is running, and on +// success freezes the source until it is resumed or terminated. +func (c *Controller) Save(ctx context.Context) (*anypb.Any, error) { + c.mu.Lock() + defer c.mu.Unlock() + + // Only a live process has the IO ports and wait id needed to restore it. 
+ if c.state != StateRunning { + return nil, fmt.Errorf("process %q in container %q in state %s; want %s", c.execID, c.containerID, c.state, StateRunning) + } + + // Capture the host-independent identity of the process. + state := &procsave.Payload{ + SchemaVersion: procsave.SchemaVersion, + ExecID: c.execID, + Pid: int32(c.processID), + Bundle: c.bundle, + IoRetryTimeout: durationpb.New(c.ioRetryTimeout), + } + + // A running process contributes the live IO ports and wait id needed to + // reattach on the destination. + if c.process != nil { + ms := c.process.MigrationState() + state.StdinPort, state.StdoutPort, state.StderrPort = ms.StdinPort, ms.StdoutPort, ms.StderrPort + state.WaitCallID = ms.WaitCallID + } + + // Exec processes carry their OCI spec; init processes leave it unset. + if c.processSpec != nil { + raw, err := json.Marshal(c.processSpec) + if err != nil { + return nil, fmt.Errorf("marshal process spec for %q/%q: %w", c.containerID, c.execID, err) + } + state.OciProcessSpecJson = raw + } + + // Wrap the encoded payload so the destination can identify and version it. + payload, err := proto.Marshal(state) + if err != nil { + return nil, fmt.Errorf("marshal process saved state for %q/%q: %w", c.containerID, c.execID, err) + } + + // Freeze the source until the migration is resumed or terminated. + c.state = StateSourceMigrating + + log.G(ctx).WithFields(logrus.Fields{ + logfields.SourceContainerID: c.containerID, + logfields.ProcessID: c.processID, + }).Debug("saved process state") + + return &anypb.Any{TypeUrl: procsave.TypeURL, Value: payload}, nil +} + +// Import reconstructs a process from a payload produced by [Controller.Save]. +// The result is inert: it holds no live IO or process handle, and operational +// calls are rejected until it has been patched and resumed. 
+func Import(ctx context.Context, env *anypb.Any, containerID string) (*Controller, error) { + if env == nil { + return nil, fmt.Errorf("process saved-state envelope is nil") + } + + // Refuse envelopes that were not produced by this save format. + if env.GetTypeUrl() != procsave.TypeURL { + return nil, fmt.Errorf("unsupported process saved-state type %q", env.GetTypeUrl()) + } + + state := &procsave.Payload{} + if err := proto.Unmarshal(env.GetValue(), state); err != nil { + return nil, fmt.Errorf("unmarshal process saved state: %w", err) + } + + // Reject payloads written by an incompatible shim version. + if v := state.GetSchemaVersion(); v != procsave.SchemaVersion { + return nil, fmt.Errorf("unsupported process saved-state schema version %d (want %d)", v, procsave.SchemaVersion) + } + + // Rebuild the controller in the destination-migrating state, holding the + // saved IO ports and wait id until resume rebinds them to a live process. + c := &Controller{ + containerID: containerID, + execID: state.GetExecID(), + ioRetryTimeout: state.GetIoRetryTimeout().AsDuration(), + state: StateDestinationMigrating, + processID: int(state.GetPid()), + bundle: state.GetBundle(), + exitedCh: make(chan struct{}), + stdinPort: state.GetStdinPort(), + stdoutPort: state.GetStdoutPort(), + stderrPort: state.GetStderrPort(), + waitCallID: state.GetWaitCallID(), + } + + // Restore the exec spec when present; absence marks an init process. + if raw := state.GetOciProcessSpecJson(); len(raw) > 0 { + spec := &specs.Process{} + if err := json.Unmarshal(raw, spec); err != nil { + return nil, fmt.Errorf("unmarshal process spec for %q/%q: %w", c.containerID, c.execID, err) + } + c.processSpec = spec + } + + log.G(ctx).WithFields(logrus.Fields{ + logfields.SourceContainerID: c.containerID, + logfields.ProcessID: c.processID, + }).Debug("imported process state") + + return c, nil +} + +// Patch rebinds an imported process to its destination container and opens +// fresh IO ahead of resume. 
It is valid only on an imported, not-yet-resumed +// process. +func (c *Controller) Patch(ctx context.Context, containerID string, opts *CreateOptions) error { + if opts == nil { + return fmt.Errorf("patch options are required: %w", errdefs.ErrInvalidArgument) + } + if containerID == "" { + return fmt.Errorf("destination container id is required: %w", errdefs.ErrInvalidArgument) + } + + c.mu.Lock() + defer c.mu.Unlock() + + if c.state != StateDestinationMigrating { + return fmt.Errorf("process %q in container %s is in state %s; cannot patch: %w", c.execID, c.containerID, c.state, errdefs.ErrFailedPrecondition) + } + + // Reject a terminal/stderr combination that a fresh create would refuse. + if opts.Terminal && opts.Stderr != "" { + return fmt.Errorf("process %q in container %s has terminal enabled but stderr is not empty: %w", c.execID, containerID, errdefs.ErrFailedPrecondition) + } + + // Open IO against the destination first so a failure leaves the process + // retryable with its old state intact. + upstreamIO, err := cmd.NewUpstreamIO(ctx, containerID, opts.Stdout, opts.Stderr, opts.Stdin, opts.Terminal, c.ioRetryTimeout) + if err != nil { + return fmt.Errorf("create upstream io for process %q in container %s: %w", c.execID, containerID, err) + } + + // Adopt the destination identity now that IO is secured. + oldContainerID := c.containerID + c.containerID = containerID + c.bundle = opts.Bundle + c.upstreamIO = upstreamIO + + log.G(ctx).WithFields(logrus.Fields{ + logfields.SourceContainerID: oldContainerID, + logfields.DestinationContainerID: containerID, + logfields.ProcessID: c.processID, + }).Debug("patched migrated process IO") + + return nil +} + +// Resume returns a migrating process to the running state. On the destination +// it reattaches the patched process to its live guest counterpart, wires up the +// stdio relay, and begins watching for exit. 
On the source it simply lifts the +// freeze that Save applied, since the live process and IO are still intact. +// Pass events=nil for an init process, whose exit is reported by its owning +// container instead. +// The destination path requires a prior successful [Controller.Patch] and a +// non-nil gcsContainer; otherwise Resume fails and leaves the state unchanged. +func (c *Controller) Resume(ctx context.Context, gcsContainer *gcs.Container, events chan interface{}) error { + c.mu.Lock() + defer c.mu.Unlock() + + // Source rollback: the live process and IO are intact, so just lift the + // freeze that Save applied. + if c.state == StateSourceMigrating { + c.state = StateRunning + return nil + } + + if c.state != StateDestinationMigrating { + return fmt.Errorf("process %q in container %q is in state %s; cannot resume: %w", c.execID, c.containerID, c.state, errdefs.ErrFailedPrecondition) + } + + // Patch opens the upstream IO that is attached below; an imported process + // that was never patched has none, so fail instead of dereferencing nil. + if c.upstreamIO == nil { + return fmt.Errorf("process %q in container %q has not been patched; cannot resume: %w", c.execID, c.containerID, errdefs.ErrFailedPrecondition) + } + if gcsContainer == nil { + return fmt.Errorf("guest container is required to resume process %q in container %q: %w", c.execID, c.containerID, errdefs.ErrInvalidArgument) + } + + // Reopen the live process on its preserved IO ports and wait id. + gcsProc, err := gcsContainer.OpenProcessWithIO(ctx, uint32(c.processID), c.stdinPort, c.stdoutPort, c.stderrPort, c.waitCallID) + if err != nil { + return fmt.Errorf("open gcs process pid %d in container %q: %w", c.processID, c.containerID, err) + } + + // Detach from the caller's context so a canceled RPC does not kill the + // restored process while IO is being attached. 
+ execCmd, err := cmd.Attach(context.WithoutCancel(ctx), gcsProc, c.upstreamIO.Stdin(), c.upstreamIO.Stdout(), c.upstreamIO.Stderr()) + if err != nil { + _ = gcsProc.Close() + return fmt.Errorf("attach process IO pid %d in container %q: %w", c.processID, c.containerID, err) + } + + c.hostingSystem = gcsContainer + c.process = gcsProc + c.state = StateRunning + // Ports are single-use; clear them now that IO is reattached. + c.stdinPort, c.stdoutPort, c.stderrPort = 0, 0, 0 + + // Watch for exit in the background, mirroring a freshly started process. 
+ go c.handleProcessExit(ctx, execCmd, events) + + log.G(ctx).WithField(logfields.ProcessID, c.processID).Debug("resumed migrated process on destination") + return nil +} + +// AbortMigrated terminates an imported, not-yet-resumed process so it can be +// deleted. It is a no-op once the process has been resumed or otherwise left +// the migrating state. +func (c *Controller) AbortMigrated(ctx context.Context) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.state != StateDestinationMigrating { + return + } + + log.G(ctx).WithField(logfields.ProcessID, c.processID).Debug("aborting migrated process") + c.abortInternal(ctx, 137) +} diff --git a/internal/controller/process/save_test.go b/internal/controller/process/save_test.go new file mode 100644 index 0000000000..6eef63c8ad --- /dev/null +++ b/internal/controller/process/save_test.go @@ -0,0 +1,459 @@ +//go:build windows && (lcow || wcow) + +package process + +import ( + "encoding/json" + "errors" + "reflect" + "testing" + "time" + + "github.com/containerd/errdefs" + "github.com/opencontainers/runtime-spec/specs-go" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + "google.golang.org/protobuf/types/known/durationpb" + + "github.com/Microsoft/hcsshim/internal/controller/process/mocks" + procsave "github.com/Microsoft/hcsshim/internal/controller/process/save" + "github.com/Microsoft/hcsshim/internal/cow" +) + +const ( + testBundle = "/test/bundle" + testStdinPort = uint32(101) + testStdoutPort = uint32(102) + testStderrPort = uint32(103) + testWaitCallID = int64(99) +) + +// TestResume_Unpatched verifies that resuming an imported process that was +// never patched is rejected instead of dereferencing its nil upstreamIO. +func TestResume_Unpatched(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = StateDestinationMigrating + controller.upstreamIO = nil // Patch was never called. 
+ + err := controller.Resume(t.Context(), nil, nil) + if !errors.Is(err, errdefs.ErrFailedPrecondition) { + t.Errorf("Resume() = %v; want ErrFailedPrecondition", err) + } + if controller.state != StateDestinationMigrating { + t.Errorf("state = %s; want StateDestinationMigrating", controller.state) + } +} + +// TestSave_WrongState verifies that only a running process can be saved. 
+func TestSave_WrongState(t *testing.T) { + t.Parallel() + invalidStates := []State{StateNotCreated, StateCreated, StateTerminated, StateDestinationMigrating, StateSourceMigrating} + + for _, state := range invalidStates { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = state + + if _, err := controller.Save(t.Context()); err == nil { + t.Errorf("Save() = nil; want error for state %s", state) + } + }) + } +} + +// TestSave_Succeeds verifies that a running process is serialized into a +// payload carrying its identity, live IO ports, and (for an exec) its spec. +func TestSave_Succeeds(t *testing.T) { + t.Parallel() + tests := []struct { + name string + spec *specs.Process + }{ + {name: "exec with spec", spec: &specs.Process{Args: []string{"/bin/sh"}}}, + {name: "init without spec", spec: nil}, + } + + for _, testCase := range tests { + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + mockCtrl, _, _, controller := newSetup(t) + mockProc := mocks.NewMockProcess(mockCtrl) + controller.state = StateRunning + controller.process = mockProc + controller.processID = testPID + controller.bundle = testBundle + controller.processSpec = testCase.spec + + mockProc.EXPECT().MigrationState().Return(cow.MigrationState{ + StdinPort: testStdinPort, + StdoutPort: testStdoutPort, + StderrPort: testStderrPort, + WaitCallID: testWaitCallID, + }) + + env, err := controller.Save(t.Context()) + if err != nil { + t.Fatalf("Save() = %v; want nil", err) + } + if env.GetTypeUrl() != procsave.TypeURL { + t.Errorf("TypeUrl = %q; want %q", env.GetTypeUrl(), procsave.TypeURL) + } + + // Decode the payload and verify the serialized fields. 
+ got := &procsave.Payload{} + if err := proto.Unmarshal(env.GetValue(), got); err != nil { + t.Fatalf("Unmarshal payload = %v; want nil", err) + } + if got.GetSchemaVersion() != procsave.SchemaVersion { + t.Errorf("SchemaVersion = %d; want %d", got.GetSchemaVersion(), procsave.SchemaVersion) + } + if got.GetExecID() != testExecID { + t.Errorf("ExecID = %q; want %q", got.GetExecID(), testExecID) + } + if got.GetPid() != int32(testPID) { + t.Errorf("Pid = %d; want %d", got.GetPid(), testPID) + } + if got.GetBundle() != testBundle { + t.Errorf("Bundle = %q; want %q", got.GetBundle(), testBundle) + } + if got.GetStdinPort() != testStdinPort || got.GetStdoutPort() != testStdoutPort || got.GetStderrPort() != testStderrPort { + t.Errorf("ports = (%d,%d,%d); want (%d,%d,%d)", got.GetStdinPort(), got.GetStdoutPort(), got.GetStderrPort(), testStdinPort, testStdoutPort, testStderrPort) + } + if got.GetWaitCallID() != testWaitCallID { + t.Errorf("WaitCallID = %d; want %d", got.GetWaitCallID(), testWaitCallID) + } + // The spec is present only for an exec process. + if (len(got.GetOciProcessSpecJson()) > 0) != (testCase.spec != nil) { + t.Errorf("spec present = %v; want %v", len(got.GetOciProcessSpecJson()) > 0, testCase.spec != nil) + } + // A successful save freezes the source until it is resumed or terminated. + if controller.state != StateSourceMigrating { + t.Errorf("state = %s; want StateSourceMigrating", controller.state) + } + }) + } +} + +// TestImport_InvalidEnvelope verifies that Import rejects malformed or +// incompatible envelopes. +func TestImport_InvalidEnvelope(t *testing.T) { + t.Parallel() + + // A payload stamped with an unsupported schema version. 
+ badVersion, err := proto.Marshal(&procsave.Payload{SchemaVersion: procsave.SchemaVersion + 1}) + if err != nil { + t.Fatalf("marshal bad-version payload = %v", err) + } + + tests := []struct { + name string + env *anypb.Any + }{ + {name: "nil envelope", env: nil}, + {name: "wrong type url", env: &anypb.Any{TypeUrl: "type.microsoft.com/other", Value: nil}}, + {name: "undecodable value", env: &anypb.Any{TypeUrl: procsave.TypeURL, Value: []byte{0x08, 0xff}}}, + {name: "schema version mismatch", env: &anypb.Any{TypeUrl: procsave.TypeURL, Value: badVersion}}, + } + + for _, testCase := range tests { + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + if _, err := Import(t.Context(), testCase.env, testContainerID); err == nil { + t.Errorf("Import() = nil; want error") + } + }) + } +} + +// TestImport_Succeeds verifies that Import reconstructs the controller in the +// migrating state with the saved fields, restoring the spec only for an exec. +func TestImport_Succeeds(t *testing.T) { + t.Parallel() + tests := []struct { + name string + spec *specs.Process + }{ + {name: "exec with spec", spec: &specs.Process{Args: []string{"/bin/sh"}}}, + {name: "init without spec", spec: nil}, + } + + for _, testCase := range tests { + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + env := buildEnvelope(t, testCase.spec) + + controller, err := Import(t.Context(), env, testContainerID) + if err != nil { + t.Fatalf("Import() = %v; want nil", err) + } + if controller.state != StateDestinationMigrating { + t.Errorf("state = %s; want StateDestinationMigrating", controller.state) + } + if controller.containerID != testContainerID { + t.Errorf("containerID = %q; want %q", controller.containerID, testContainerID) + } + if controller.execID != testExecID { + t.Errorf("execID = %q; want %q", controller.execID, testExecID) + } + if controller.processID != testPID { + t.Errorf("processID = %d; want %d", controller.processID, testPID) + } + if controller.bundle != testBundle { + 
t.Errorf("bundle = %q; want %q", controller.bundle, testBundle) + } + if controller.stdinPort != testStdinPort || controller.stdoutPort != testStdoutPort || controller.stderrPort != testStderrPort { + t.Errorf("ports = (%d,%d,%d); want (%d,%d,%d)", controller.stdinPort, controller.stdoutPort, controller.stderrPort, testStdinPort, testStdoutPort, testStderrPort) + } + if controller.waitCallID != testWaitCallID { + t.Errorf("waitCallID = %d; want %d", controller.waitCallID, testWaitCallID) + } + if controller.exitedCh == nil { + t.Error("exitedCh must be non-nil after Import") + } + if !reflect.DeepEqual(controller.processSpec, testCase.spec) { + t.Errorf("processSpec = %+v; want %+v", controller.processSpec, testCase.spec) + } + }) + } +} + +// TestPatch_InvalidArgs verifies that Patch rejects missing options or an +// empty destination container id. +func TestPatch_InvalidArgs(t *testing.T) { + t.Parallel() + tests := []struct { + name string + opts *CreateOptions + containerID string + }{ + {name: "nil options", opts: nil, containerID: testContainerID}, + {name: "empty container id", opts: &CreateOptions{}, containerID: ""}, + } + + for _, testCase := range tests { + t.Run(testCase.name, func(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = StateDestinationMigrating + + err := controller.Patch(t.Context(), testCase.containerID, testCase.opts) + if !errors.Is(err, errdefs.ErrInvalidArgument) { + t.Errorf("Patch() = %v; want ErrInvalidArgument", err) + } + }) + } +} + +// TestPatch_WrongState verifies that Patch only operates on a destination-migrating process. 
+func TestPatch_WrongState(t *testing.T) { + t.Parallel() + invalidStates := []State{StateNotCreated, StateCreated, StateRunning, StateTerminated, StateSourceMigrating} + + for _, state := range invalidStates { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = state + + err := controller.Patch(t.Context(), testContainerID, &CreateOptions{}) + if !errors.Is(err, errdefs.ErrFailedPrecondition) { + t.Errorf("Patch() = %v; want ErrFailedPrecondition", err) + } + }) + } +} + +// TestPatch_TerminalWithStderr verifies that Patch rejects the terminal+stderr +// combination a fresh create would also refuse. +func TestPatch_TerminalWithStderr(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = StateDestinationMigrating + + err := controller.Patch(t.Context(), testContainerID, &CreateOptions{ + Terminal: true, + Stderr: `\\.\pipe\some-stderr`, + }) + if !errors.Is(err, errdefs.ErrFailedPrecondition) { + t.Errorf("Patch(terminal+stderr) = %v; want ErrFailedPrecondition", err) + } +} + +// TestPatch_Succeeds verifies that Patch adopts the destination container and +// opens fresh IO while leaving the process in the migrating state. Empty IO +// paths are used so no real named-pipe connections are attempted. 
+func TestPatch_Succeeds(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = StateDestinationMigrating + + const destContainerID = "dest-container-9999" + opts := &CreateOptions{Bundle: testBundle} + + if err := controller.Patch(t.Context(), destContainerID, opts); err != nil { + t.Fatalf("Patch() = %v; want nil", err) + } + if controller.state != StateDestinationMigrating { + t.Errorf("state = %s; want StateDestinationMigrating", controller.state) + } + if controller.containerID != destContainerID { + t.Errorf("containerID = %q; want %q", controller.containerID, destContainerID) + } + if controller.bundle != testBundle { + t.Errorf("bundle = %q; want %q", controller.bundle, testBundle) + } + if controller.upstreamIO == nil { + t.Error("upstreamIO must be non-nil after Patch") + } +} + +// TestResume_WrongState verifies that Resume only operates on a migrating +// process and rejects other states before touching the host. +func TestResume_WrongState(t *testing.T) { + t.Parallel() + invalidStates := []State{StateNotCreated, StateCreated, StateRunning, StateTerminated} + + for _, state := range invalidStates { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = state + + err := controller.Resume(t.Context(), nil, nil) + if !errors.Is(err, errdefs.ErrFailedPrecondition) { + t.Errorf("Resume() = %v; want ErrFailedPrecondition", err) + } + }) + } +} + +// TestResume_SourceRollback verifies that resuming a source-migrating process +// lifts the freeze and returns it to running without touching the host. +func TestResume_SourceRollback(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = StateSourceMigrating + + // nil host/events are unused: the live process and IO stay intact. 
+ if err := controller.Resume(t.Context(), nil, nil); err != nil { + t.Fatalf("Resume() = %v; want nil", err) + } + if controller.state != StateRunning { + t.Errorf("state = %s; want StateRunning", controller.state) + } +} + +// TestAbortMigrated_NoOp verifies that AbortMigrated leaves a non-migrating +// process untouched. +func TestAbortMigrated_NoOp(t *testing.T) { + t.Parallel() + otherStates := []State{StateNotCreated, StateCreated, StateRunning, StateTerminated, StateSourceMigrating} + + for _, state := range otherStates { + t.Run(state.String(), func(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = state + + controller.AbortMigrated(t.Context()) + if controller.state != state { + t.Errorf("state = %s; want unchanged %s", controller.state, state) + } + }) + } +} + +// TestAbortMigrated_Succeeds verifies that AbortMigrated terminates a migrating +// process, recording exit code 137 and unblocking waiters. +func TestAbortMigrated_Succeeds(t *testing.T) { + t.Parallel() + _, _, _, controller := newSetup(t) + controller.state = StateDestinationMigrating + // upstreamIO intentionally nil — abort must tolerate it. + + controller.AbortMigrated(t.Context()) + + if controller.state != StateTerminated { + t.Errorf("state = %s; want StateTerminated", controller.state) + } + if controller.exitCode != 137 { + t.Errorf("exitCode = %d; want 137", controller.exitCode) + } + select { + case <-controller.exitedCh: + default: + t.Error("exitedCh should be closed after AbortMigrated") + } +} + +// TestSaveImport_RoundTrip verifies that a payload produced by Save restores an +// equivalent process via Import. 
+func TestSaveImport_RoundTrip(t *testing.T) { + t.Parallel() + mockCtrl, _, _, src := newSetup(t) + mockProc := mocks.NewMockProcess(mockCtrl) + src.state = StateRunning + src.process = mockProc + src.processID = testPID + src.bundle = testBundle + src.processSpec = &specs.Process{Args: []string{"/bin/sh"}} + + mockProc.EXPECT().MigrationState().Return(cow.MigrationState{ + StdinPort: testStdinPort, + StdoutPort: testStdoutPort, + StderrPort: testStderrPort, + WaitCallID: testWaitCallID, + }) + + env, err := src.Save(t.Context()) + if err != nil { + t.Fatalf("Save() = %v; want nil", err) + } + + dst, err := Import(t.Context(), env, testContainerID) + if err != nil { + t.Fatalf("Import() = %v; want nil", err) + } + + if dst.execID != src.execID || dst.processID != src.processID || dst.bundle != src.bundle { + t.Errorf("restored identity mismatch: got (%q,%d,%q); want (%q,%d,%q)", dst.execID, dst.processID, dst.bundle, src.execID, src.processID, src.bundle) + } + if dst.stdinPort != testStdinPort || dst.stdoutPort != testStdoutPort || dst.stderrPort != testStderrPort || dst.waitCallID != testWaitCallID { + t.Errorf("restored migration state mismatch: ports=(%d,%d,%d) wait=%d", dst.stdinPort, dst.stdoutPort, dst.stderrPort, dst.waitCallID) + } + if !reflect.DeepEqual(dst.processSpec, src.processSpec) { + t.Errorf("restored spec = %+v; want %+v", dst.processSpec, src.processSpec) + } +} + +// buildEnvelope marshals a payload with the standard test fields and the given +// spec into an envelope Import can consume. 
+func buildEnvelope(t *testing.T, spec *specs.Process) *anypb.Any { + t.Helper() + payload := &procsave.Payload{ + SchemaVersion: procsave.SchemaVersion, + ExecID: testExecID, + Pid: int32(testPID), + Bundle: testBundle, + IoRetryTimeout: durationpb.New(time.Second), + StdinPort: testStdinPort, + StdoutPort: testStdoutPort, + StderrPort: testStderrPort, + WaitCallID: testWaitCallID, + } + if spec != nil { + raw, err := json.Marshal(spec) + if err != nil { + t.Fatalf("marshal spec = %v", err) + } + payload.OciProcessSpecJson = raw + } + + value, err := proto.Marshal(payload) + if err != nil { + t.Fatalf("marshal payload = %v", err) + } + return &anypb.Any{TypeUrl: procsave.TypeURL, Value: value} +} diff --git a/internal/controller/process/state.go b/internal/controller/process/state.go index 3e991b716c..48630f4c30 100644 --- a/internal/controller/process/state.go +++ b/internal/controller/process/state.go @@ -12,21 +12,32 @@ import ( // // StateNotCreated → StateCreated → StateRunning → StateTerminated // +// Live migration adds two branches. On the destination, a process is restored +// directly into StateDestinationMigrating and rejoins the progression once +// resumed (StateRunning) or aborted (StateTerminated). On the source, Save +// freezes a running process into StateSourceMigrating; resume rolls it back to +// StateRunning, or its exit (when the source VM is torn down) terminates it. 
+// // Full state-transition table: // -// Current State │ Trigger │ Next State -// ─────────────────┼──────────────────────────────────────┼──────────────── -// StateNotCreated │ Create succeeds │ StateCreated -// StateCreated │ Start succeeds │ StateRunning -// StateCreated │ Start fails / Kill / Delete │ StateTerminated -// StateRunning │ process exits │ StateTerminated -// StateRunning │ Kill succeeds (signal or terminate) │ StateTerminated -// StateTerminated │ (terminal — no further transitions) │ — +// Current State │ Trigger │ Next State +// ──────────────────────────┼──────────────────────────────┼───────────────────── +// StateNotCreated │ create succeeds │ StateCreated +// StateNotCreated │ kill │ StateTerminated +// StateCreated │ start succeeds │ StateRunning +// StateCreated │ start fails / kill / delete │ StateTerminated +// StateRunning │ process exits (incl. kill) │ StateTerminated +// StateRunning │ Save freezes the source │ StateSourceMigrating +// StateDestinationMigrating │ resume succeeds │ StateRunning +// StateDestinationMigrating │ migration aborted │ StateTerminated +// StateSourceMigrating │ resume rolls back │ StateRunning +// StateSourceMigrating │ process exits (VM torn down) │ StateTerminated +// StateTerminated │ terminal — no transitions │ — type State int32 const ( // StateNotCreated indicates the process has not been created yet. - // This is the initial state set by [New]. + // This is the initial state for a newly constructed process. StateNotCreated State = iota // StateCreated indicates the process has been created but not started. @@ -39,6 +50,14 @@ const ( // StateTerminated indicates the process has exited and all cleanup is done. // This is a terminal state — no further transitions are possible. StateTerminated + + // StateDestinationMigrating indicates a process restored from a snapshot on + // the destination, awaiting resume (→ StateRunning) or abort (→ StateTerminated). 
+ StateDestinationMigrating + + // StateSourceMigrating indicates a running process frozen by Save on the + // source, awaiting resume (→ StateRunning) or its exit (→ StateTerminated). + StateSourceMigrating ) // String returns a human-readable representation of the State. @@ -52,6 +71,10 @@ func (s State) String() string { return "Running" case StateTerminated: return "Terminated" + case StateDestinationMigrating: + return "DestinationMigrating" + case StateSourceMigrating: + return "SourceMigrating" default: return "Unknown" } @@ -67,7 +90,7 @@ func (s State) ContainerdStatus() containerdtypes.Status { case StateTerminated: return containerdtypes.Status_STOPPED default: - // StateNotCreated has no direct containerd equivalent. + // StateNotCreated and the migrating states have no direct containerd equivalent. return containerdtypes.Status_UNKNOWN } } diff --git a/internal/controller/vm/doc.go b/internal/controller/vm/doc.go index 764eaa3e27..dc13a15417 100644 --- a/internal/controller/vm/doc.go +++ b/internal/controller/vm/doc.go @@ -7,6 +7,12 @@ // creation, startup, stats collection, and termination — with the [Controller] // as the primary implementation. // +// Live-migration entry points are provided on both sides: the source captures a +// running VM via [Controller.Save] (state snapshot), while the destination +// rehydrates it via [Controller.Import] (state-only rehydration), recreates the +// VM, and rebinds its disks via [Controller.Patch] before resuming. +// [Controller.Resume] returns either side to [StateRunning]. +// // # Lifecycle // // A VM follows the state machine below. @@ -31,6 +37,33 @@ // └─►│ StateTerminated │ // └─────────────────────────────────────────────────┘ // +// Live migration adds side-specific paths. The source toggles a running VM into +// [StateSourceMigrating]; the destination walks a dedicated path — +// [Controller.Import] → [Controller.CreateVM] → [Controller.StartWithMigrationOptions]. 
+// From either migrating state the resumed side returns to [StateRunning] via +// [Controller.Resume], while the stopped side reaches [StateTerminated] via a +// finalize Stop or a teardown ([Controller.TerminateVM]). The forward flow stops +// the source and resumes the destination; the reverse flow resumes the source and +// stops the destination. +// +// source destination +// ┌──────────────────────┐ ┌───────────────────────────┐ +// │ StateSourceMigrating │ │ StateMigratingImported │ +// └───┬──────────────┬───┘ └─────────────┬─────────────┘ +// │ Resume │ Finalize(Stop)/Terminate │ CreateVM +// ▼ ▼ ▼ +// StateRunning StateTerminated ┌───────────────────────────┐ +// │ StateMigratingCreated │ +// └─────────────┬─────────────┘ +// │ StartWithMigrationOptions +// ▼ +// ┌───────────────────────────┐ +// │ StateDestinationMigrating │ +// └──────┬─────────────┬──────┘ +// Resume │ │ Finalize(Stop)/Terminate +// ▼ ▼ +// StateRunning StateTerminated +// // State descriptions: // // - [StateNotCreated]: initial state after [New] is called. @@ -41,8 +74,21 @@ // [Controller.TerminateVM] completes successfully. // - [StateInvalid]: error state entered when [Controller.StartVM] fails after the underlying // HCS VM has already started, or when [Controller.TerminateVM] fails during uvm.Close -// (from either [StateCreated] or [StateRunning]). +// (from [StateCreated], [StateRunning], or [StateMigratingCreated]). // A VM in this state can only be cleaned up by calling [Controller.TerminateVM]. +// - [StateSourceMigrating]: the running source VM has begun an outgoing migration; +// only live-migration calls and [Controller.Save] are permitted. [Controller.Resume] +// rolls it back to [StateRunning]; a finalize Stop (forward flow) or +// [Controller.TerminateVM] terminates it to [StateTerminated]. 
+// - [StateMigratingImported]: the destination has been rehydrated from a snapshot via +// [Controller.Import] but the VM does not exist yet; [Controller.CreateVM] is the next step. +// - [StateMigratingCreated]: the destination VM has been created from the snapshot but not +// started; disks are rebound via [Controller.Patch] and +// [Controller.StartWithMigrationOptions] advances it to [StateDestinationMigrating]. +// - [StateDestinationMigrating]: the destination VM is running against the migration +// transport awaiting the source's state; [Controller.Resume] reaches [StateRunning], +// while a finalize Stop (reverse flow) or [Controller.TerminateVM] terminates it to +// [StateTerminated]. // // # Platform Variants // diff --git a/internal/controller/vm/save_lcow.go b/internal/controller/vm/save_lcow.go new file mode 100644 index 0000000000..7c8779d477 --- /dev/null +++ b/internal/controller/vm/save_lcow.go @@ -0,0 +1,296 @@ +//go:build windows && lcow + +package vm + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/Microsoft/hcsshim/internal/builder/vm/lcow" + "github.com/Microsoft/hcsshim/internal/controller/device/scsi" + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" + vmsave "github.com/Microsoft/hcsshim/internal/controller/vm/save" + "github.com/Microsoft/hcsshim/internal/gcs/prot" + hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/log" + "github.com/Microsoft/hcsshim/internal/logfields" + "github.com/Microsoft/hcsshim/internal/wclayer" + + "github.com/Microsoft/go-winio" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" +) + +// Save captures the migrating VM's state into a serialized snapshot that the +// destination host consumes to recreate an equivalent VM. 
+func (c *Controller) Save(ctx context.Context) (*anypb.Any, error) {
+	// CompatibilityInfo takes its own read lock; fetch it before acquiring
+	// ours to avoid recursive RLock acquisition.
+	compatInfo, err := c.CompatibilityInfo(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("get compatibility info: %w", err)
+	}
+
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	// Save is only valid once the source has begun migrating.
+	if c.vmState != StateSourceMigrating {
+		return nil, fmt.Errorf("cannot save VM: VM is in state %s", c.vmState)
+	}
+
+	// Seed the payload with the VM identity, creation options, and compat blob.
+	state := &vmsave.Payload{
+		SchemaVersion:  vmsave.SchemaVersion,
+		VmID:           c.vmID,
+		SandboxOptions: sandboxOptionsToProto(c.sandboxOptions),
+		CompatInfo:     compatInfo,
+	}
+
+	// Ship the final HCS ComputeSystem document so the destination can
+	// recreate an identical VM. We encode it as JSON because the schema is
+	// owned by hcsschema (not protobuf) and JSON is the canonical wire
+	// format HCS itself consumes.
+	if c.hcsDocument != nil {
+		docBytes, err := json.Marshal(c.hcsDocument)
+		if err != nil {
+			return nil, fmt.Errorf("marshal hcs document: %w", err)
+		}
+
+		state.HcsDocument = docBytes
+	}
+
+	// The SCSI controller is only present once it has been built; when it is
+	// nil the payload simply carries no SCSI section.
+	if c.scsiController != nil {
+		s, err := c.scsiController.Save(ctx)
+		if err != nil {
+			return nil, fmt.Errorf("save scsi controller: %w", err)
+		}
+
+		state.Scsi = s
+	}
+
+	// VPCI and Plan9 carry no transferable state today; Save fails if any
+	// is present so unsupported topologies surface instead of silently dropping.
+	if c.vpciController != nil {
+		if err := c.vpciController.Save(); err != nil {
+			return nil, fmt.Errorf("save vpci controller: %w", err)
+		}
+	}
+
+	if c.plan9Controller != nil {
+		if err := c.plan9Controller.Save(); err != nil {
+			return nil, fmt.Errorf("save plan9 controller: %w", err)
+		}
+	}
+
+	// Capture the GCS port and bridge-id allocator floors so the destination
+	// resumes its allocators above ids the guest still has outstanding.
+	// A zero value is left unset in the payload.
+	if p := c.guest.NextPort(); p != 0 {
+		state.GcsNextPort = p
+	}
+
+	if id := c.guest.BridgeNextID(); id != 0 {
+		state.BridgeNextID = id
+	}
+
+	payload, err := proto.Marshal(state)
+	if err != nil {
+		return nil, fmt.Errorf("marshal vm saved state: %w", err)
+	}
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("saved VM migration state")
+	return &anypb.Any{TypeUrl: vmsave.TypeURL, Value: payload}, nil
+}
+
+// Import rebuilds a controller's static state from a snapshot produced by
+// Save. The controller comes back inert in the migrating state and performs no
+// live work until Resume supplies the running VM.
+//
+// Import is all-or-nothing with respect to the controller: every step that can
+// fail runs before any field is written, so on error the controller stays in
+// StateNotCreated and Import may be retried with a corrected envelope.
+func (c *Controller) Import(ctx context.Context, env *anypb.Any) error {
+	if env == nil {
+		return fmt.Errorf("vm saved-state envelope is nil")
+	}
+
+	// Reject envelopes that did not originate from a compatible Save.
+	if env.GetTypeUrl() != vmsave.TypeURL {
+		return fmt.Errorf("unsupported vm saved-state type %q", env.GetTypeUrl())
+	}
+
+	state := &vmsave.Payload{}
+	if err := proto.Unmarshal(env.GetValue(), state); err != nil {
+		return fmt.Errorf("unmarshal vm saved state: %w", err)
+	}
+
+	// Reject payloads written by an incompatible shim version.
+	if v := state.GetSchemaVersion(); v != vmsave.SchemaVersion {
+		return fmt.Errorf("unsupported vm saved-state schema version %d (want %d)", v, vmsave.SchemaVersion)
+	}
+
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// We can import a new VM only on a freshly created controller.
+	if c.vmState != StateNotCreated {
+		return fmt.Errorf("unsupported vm state during Import %q", c.vmState)
+	}
+
+	// Decode the HCS document into a local first. [Controller.CreateVM]
+	// (called next on the destination with MigrationOptions populated) reuses
+	// it verbatim, so a document that fails to decode must abort the import
+	// before the controller is marked as migrating.
+	var hcsDocument *hcsschema.ComputeSystem
+	if raw := state.GetHcsDocument(); len(raw) > 0 {
+		doc := &hcsschema.ComputeSystem{}
+		if err := json.Unmarshal(raw, doc); err != nil {
+			return fmt.Errorf("unmarshal hcs document: %w", err)
+		}
+
+		hcsDocument = doc
+	}
+
+	// Import the SCSI sub-controller. This is the last step that can fail, so
+	// everything below it is an unconditional commit.
+	if scsiEnv := state.GetScsi(); scsiEnv != nil {
+		s, err := scsi.Import(ctx, scsiEnv)
+		if err != nil {
+			return fmt.Errorf("import scsi controller: %w", err)
+		}
+		c.scsiController = s
+	}
+
+	// Restore the VM identity, allocator floors, and compat blob, then mark
+	// the controller migrating so only migration APIs are permitted.
+	c.vmID = state.GetVmID()
+	c.sandboxOptions = sandboxOptionsFromProto(state.GetSandboxOptions())
+	if c.sandboxOptions != nil {
+		c.isPhysicallyBacked = c.sandboxOptions.FullyPhysicallyBacked
+	}
+	c.nextGuestPort = state.GetGcsNextPort()
+	c.nextBridgeID = state.GetBridgeNextID()
+	c.compatInfo = state.GetCompatInfo()
+	if hcsDocument != nil {
+		c.hcsDocument = hcsDocument
+	}
+	c.vmState = StateMigratingImported
+
+	log.G(ctx).Debug("imported VM migration state")
+	return nil
+}
+
+// Patch grants the migrated VM filesystem access to its backing disk paths on
+// the destination host, readying it for [Controller.Resume]. Run after the
+// disk locations have been rewritten to their destination-local paths.
+func (c *Controller) Patch(ctx context.Context) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Patch is accepted both before and after the destination VM is created.
+	if c.vmState != StateMigratingImported && c.vmState != StateMigratingCreated {
+		return fmt.Errorf("cannot patch VM: VM is in state %s", c.vmState)
+	}
+
+	if c.scsiController == nil {
+		return fmt.Errorf("cannot patch VM: SCSI controller is nil")
+	}
+
+	// Grant access only for disk types whose host paths the VM must reach.
+	for _, cfg := range c.scsiController.Disks() {
+		if cfg.Type != disk.TypeVirtualDisk && cfg.Type != disk.TypePassThru {
+			continue
+		}
+		if err := wclayer.GrantVmAccess(ctx, c.vmID, cfg.HostPath); err != nil {
+			return fmt.Errorf("grant vm access to %s: %w", cfg.HostPath, err)
+		}
+	}
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("patched VM disk access for migration")
+	return nil
+}
+
+// Resume reactivates a migrated VM and returns it to the running state. The
+// source side rebuilds its guest bridge to recover outstanding RPCs; the
+// destination side reuses the connection already armed at start.
+func (c *Controller) Resume(ctx context.Context, rebuildBridge bool) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Resume returns either migration side to the running state.
+	if c.vmState != StateSourceMigrating && c.vmState != StateDestinationMigrating {
+		return fmt.Errorf("cannot resume from migration: VM is in state %s", c.vmState)
+	}
+
+	switch {
+	case rebuildBridge:
+		// Source rollback: re-arm the listener and swap the bridge transport
+		// onto the fresh hvsock so outstanding RPCs (e.g. WaitForProcess) survive.
+		if err := c.guest.PrepareConnection(winio.VsockServiceID(prot.LinuxGcsVsockPort)); err != nil {
+			return fmt.Errorf("prepare guest connection on resume: %w", err)
+		}
+		if err := c.guest.ResumeConnection(ctx); err != nil {
+			return fmt.Errorf("resume guest connection: %w", err)
+		}
+	default:
+		// Destination: reuse the connection already armed at start.
+		if err := c.guest.CreateConnection(ctx, false); err != nil {
+			return fmt.Errorf("resume guest connection: %w", err)
+		}
+	}
+
+	// Clear migrating flag only now that the new transport is in place.
+	c.guest.SetMigrating(false)
+
+	// Lift the GCS port and bridge-id allocators above the values the guest
+	// still has outstanding so newly issued ids cannot collide.
+	if c.nextGuestPort != 0 {
+		c.guest.SetNextPort(c.nextGuestPort)
+	}
+
+	if c.nextBridgeID != 0 {
+		// Seed before sub-controller Resume so pre-registered ids stay below new ones.
+		c.guest.SeedBridgeNextID(c.nextBridgeID)
+	}
+
+	// Sub-controller Resume: required on destination, no-op on source.
+	if c.scsiController != nil {
+		c.scsiController.Resume(ctx, c.uvm, c.guest)
+	}
+
+	c.vmState = StateRunning
+
+	if c.sandboxOptions != nil {
+		c.sandboxOptions.LiveMigrationSupportEnabled = true
+	}
+
+	// Destination never ran setupLoggingListener; close so [Controller.Wait]
+	// does not block. Already closed on source — receive falls through.
+	select {
+	case <-c.logOutputDone:
+	default:
+		close(c.logOutputDone)
+	}
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("resumed VM from migration")
+	return nil
+}
+
+// sandboxOptionsToProto converts the in-memory sandbox options into their
+// wire form for inclusion in a migration payload. A nil input yields a nil
+// message so the payload field stays unset.
+func sandboxOptionsToProto(o *lcow.SandboxOptions) *vmsave.SandboxOptions {
+	if o == nil {
+		return nil
+	}
+	return &vmsave.SandboxOptions{
+		NoWritableFileShares:    o.NoWritableFileShares,
+		EnableScratchEncryption: o.EnableScratchEncryption,
+		PolicyBasedRouting:      o.PolicyBasedRouting,
+		Architecture:            o.Architecture,
+		FullyPhysicallyBacked:   o.FullyPhysicallyBacked,
+	}
+}
+
+// sandboxOptionsFromProto reconstructs the in-memory sandbox options from a
+// migration payload's wire form.
+func sandboxOptionsFromProto(p *vmsave.SandboxOptions) *lcow.SandboxOptions {
+	// An unset wire message maps back to "no sandbox options".
+	if p == nil {
+		return nil
+	}
+
+	// Copy field by field through the nil-safe protobuf getters.
+	opts := new(lcow.SandboxOptions)
+	opts.NoWritableFileShares = p.GetNoWritableFileShares()
+	opts.EnableScratchEncryption = p.GetEnableScratchEncryption()
+	opts.PolicyBasedRouting = p.GetPolicyBasedRouting()
+	opts.Architecture = p.GetArchitecture()
+	opts.FullyPhysicallyBacked = p.GetFullyPhysicallyBacked()
+	return opts
+}
diff --git a/internal/controller/vm/state.go b/internal/controller/vm/state.go
index 5ea9360a3f..21f65272e2 100644
--- a/internal/controller/vm/state.go
+++ b/internal/controller/vm/state.go
@@ -12,19 +12,49 @@ package vm
 // [Controller.TerminateVM], the VM transitions to [StateInvalid] instead.
 // A VM in [StateInvalid] can only be cleaned up via [Controller.TerminateVM].
 //
+// Live migration has two side-specific migrating states. The source toggles a
+// running VM into [StateSourceMigrating]; the destination walks a dedicated
+// path: [Controller.Import] → [StateMigratingImported], [Controller.CreateVM] →
+// [StateMigratingCreated], [Controller.StartWithMigrationOptions] →
+// [StateDestinationMigrating]. From either migrating state the resumed side
+// returns to [StateRunning] via [Controller.Resume], while the stopped side
+// reaches [StateTerminated] via a finalize Stop or a teardown
+// ([Controller.TerminateVM]). The forward flow stops the source and resumes the
+// destination; the reverse flow resumes the source and stops the destination.
+// // Full state-transition table: // -// Current State │ Trigger │ Next State -// ─────────────────┼────────────────────────────────────┼───────────────── -// StateNotCreated │ CreateVM succeeds │ StateCreated -// StateCreated │ StartVM succeeds │ StateRunning -// StateCreated │ TerminateVM succeeds │ StateTerminated -// StateCreated │ StartVM fails │ StateInvalid -// StateCreated │ TerminateVM fails │ StateInvalid -// StateRunning │ VM exits or TerminateVM succeeds │ StateTerminated -// StateRunning │ TerminateVM fails (uvm.Close) │ StateInvalid -// StateInvalid │ TerminateVM called │ StateTerminated -// StateTerminated │ (terminal — no further transitions)│ — +// Current State │ Trigger │ Next State +// ──────────────────────────┼────────────────────────────────────┼─────────────────────── +// StateNotCreated │ CreateVM succeeds │ StateCreated +// StateCreated │ StartVM succeeds │ StateRunning +// StateCreated │ TerminateVM succeeds │ StateTerminated +// StateCreated │ StartVM fails │ StateInvalid +// StateCreated │ TerminateVM fails │ StateInvalid +// StateRunning │ VM exits or TerminateVM succeeds │ StateTerminated +// StateRunning │ TerminateVM fails (uvm.Close) │ StateInvalid +// StateRunning │ InitializeLiveMigrationOnSource │ StateSourceMigrating +// StateSourceMigrating │ StartLiveMigrationOnSource │ StateSourceMigrating +// StateSourceMigrating │ StartLiveMigrationTransfer │ StateSourceMigrating +// StateSourceMigrating │ Save │ StateSourceMigrating +// StateSourceMigrating │ FinalizeLiveMigration (Resume) │ StateSourceMigrating +// StateSourceMigrating │ FinalizeLiveMigration (Stop) │ StateTerminated +// StateSourceMigrating │ Resume │ StateRunning +// StateSourceMigrating │ TerminateVM (abort) │ StateTerminated +// StateNotCreated │ Import (destination) │ StateMigratingImported +// StateMigratingImported │ CreateVM (destination) │ StateMigratingCreated +// StateMigratingCreated │ Patch │ StateMigratingCreated +// StateMigratingCreated │ 
StartWithMigrationOptions │ StateDestinationMigrating +// StateMigratingImported │ TerminateVM │ StateTerminated +// StateMigratingCreated │ TerminateVM succeeds │ StateTerminated +// StateMigratingCreated │ TerminateVM fails (uvm.Close) │ StateInvalid +// StateDestinationMigrating │ StartLiveMigrationTransfer │ StateDestinationMigrating +// StateDestinationMigrating │ FinalizeLiveMigration (Resume) │ StateDestinationMigrating +// StateDestinationMigrating │ FinalizeLiveMigration (Stop) │ StateTerminated +// StateDestinationMigrating │ Resume │ StateRunning +// StateDestinationMigrating │ TerminateVM (abort) │ StateTerminated +// StateInvalid │ TerminateVM called │ StateTerminated +// StateTerminated │ (terminal — no further transitions)│ — type State int32 const ( @@ -45,6 +75,7 @@ const ( // Valid transitions: // - StateRunning → StateTerminated (VM exits naturally or [Controller.TerminateVM] succeeds) // - StateRunning → StateInvalid ([Controller.TerminateVM] fails during uvm.Close) + // - StateRunning → StateSourceMigrating ([Controller.InitializeLiveMigrationOnSource] succeeds) StateRunning // StateTerminated indicates the VM has exited or been successfully terminated. @@ -54,9 +85,35 @@ const ( // StateInvalid indicates that an unrecoverable error has occurred. // The VM transitions to this state when: // - [Controller.StartVM] fails after the underlying HCS VM has already started, or - // - [Controller.TerminateVM] fails during uvm.Close (from either [StateCreated] or [StateRunning]). + // - [Controller.TerminateVM] fails during uvm.Close (from [StateCreated], + // [StateRunning], or [StateMigratingCreated]). // A VM in this state can only be cleaned up by calling [Controller.TerminateVM]. StateInvalid + + // StateSourceMigrating indicates this VM is the source of an in-progress live + // migration. Entered from [StateRunning] via [Controller.InitializeLiveMigrationOnSource]. 
+ // Only live-migration APIs and [Controller.Save] are permitted; [Controller.Resume] + // returns it to [StateRunning], while a finalize Stop (forward flow) or + // [Controller.TerminateVM] terminates it. + StateSourceMigrating + + // StateMigratingImported indicates the destination controller has been + // rehydrated from a snapshot via [Controller.Import] but the VM does not + // exist yet. Only [Controller.CreateVM] (or [Controller.TerminateVM]) is + // permitted next. + StateMigratingImported + + // StateMigratingCreated indicates the destination VM has been created from + // an imported snapshot but not yet started. [Controller.Patch] is valid + // only in this state; [Controller.StartWithMigrationOptions] advances it to + // [StateDestinationMigrating], and [Controller.TerminateVM] tears it down. + StateMigratingCreated + + // StateDestinationMigrating indicates this VM is the destination of an + // in-progress live migration. Entered via [Controller.StartWithMigrationOptions]. + // Only live-migration APIs are permitted; [Controller.Resume] reaches [StateRunning], + // while a finalize Stop (reverse flow) or [Controller.TerminateVM] terminates it. + StateDestinationMigrating ) // String returns a human-readable string representation of the VM State. 
@@ -72,6 +129,14 @@ func (s State) String() string { return "Terminated" case StateInvalid: return "Invalid" + case StateSourceMigrating: + return "SourceMigrating" + case StateMigratingImported: + return "MigratingImported" + case StateMigratingCreated: + return "MigratingCreated" + case StateDestinationMigrating: + return "DestinationMigrating" default: return "Unknown" } diff --git a/internal/controller/vm/types.go b/internal/controller/vm/types.go index 9312bedaae..dae0eaa903 100644 --- a/internal/controller/vm/types.go +++ b/internal/controller/vm/types.go @@ -6,6 +6,7 @@ import ( "time" runhcsoptions "github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options" + hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/vm/guestmanager" vmsandbox "github.com/Microsoft/hcsshim/sandbox-spec/vm/v2" @@ -28,6 +29,12 @@ type CreateOptions struct { // SandboxSpec specifies the sandbox specification from CRI. SandboxSpec *vmsandbox.Spec + + // MigrationOptions, when non-nil, marks this CreateVM as the + // destination side of a live migration: the imported HCS document is + // reused and these options are stamped into VirtualMachine.MigrationOptions. + // Requires the controller to be in [StateMigratingImported]. 
+ MigrationOptions *hcsschema.MigrationInitializeOptions } // StartOptions contains the configuration needed to start a VM and establish diff --git a/internal/controller/vm/vm.go b/internal/controller/vm/vm.go index 518e2d00c1..4f2b91d27c 100644 --- a/internal/controller/vm/vm.go +++ b/internal/controller/vm/vm.go @@ -14,6 +14,7 @@ import ( "github.com/Microsoft/hcsshim/internal/cmd" "github.com/Microsoft/hcsshim/internal/controller/device/scsi" "github.com/Microsoft/hcsshim/internal/controller/device/vpci" + "github.com/Microsoft/hcsshim/internal/hcs" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" @@ -65,6 +66,15 @@ type Controller struct { // vpciController manages virtual PCI device assignments for this VM. vpciController *vpci.Controller + // hcsDocument is the final HCS document used to create this VM, + // retained for lazy SCSI controller construction and for shipping to + // the destination during live migration. + hcsDocument *hcsschema.ComputeSystem + + // compatInfo is the opaque VM compatibility blob rehydrated from + // a migration snapshot on the destination side. + compatInfo []byte + // platformControllers embeds platform-specific sub-controllers (e.g., Plan9 for LCOW). platformControllers //nolint:unused,nolintlint // embedded for cross-platform compatibility; empty on WCOW } @@ -83,6 +93,12 @@ func (c *Controller) Guest() *guestmanager.Guest { return c.guest } +// VM returns the vm manager instance for this VM. +// The vm manager provides access to the VM host side operations. +func (c *Controller) VM() *vmmanager.UtilityVM { + return c.uvm +} + // State returns the current VM state. func (c *Controller) State() State { c.mu.RLock() @@ -103,24 +119,56 @@ func (c *Controller) RuntimeID() string { return c.uvm.RuntimeID().String() } -// CreateVM creates the VM using the HCS document and initializes device state. 
+// CreateVM creates the VM from either a freshly built HCS document (cold boot) +// or the document imported on the migration destination. func (c *Controller) CreateVM(ctx context.Context, opts *CreateOptions) error { ctx, _ = log.WithContext(ctx, logrus.WithField(logfields.Operation, "CreateVM")) c.mu.Lock() defer c.mu.Unlock() - // In case of duplicate CreateVM call for the same controller, we want to fail. - if c.vmState != StateNotCreated { + // Pick the HCS document we hand to vmmanager based on the controller's + // current state: + // - StateNotCreated: cold-boot path; build a fresh document. + // - StateMigratingImported: destination side of a live migration; reuse + // the document rehydrated by Import and stamp opts.MigrationOptions + // onto it. + // Any other state is invalid for CreateVM. + var hcsDocument *hcsschema.ComputeSystem + // Cold boot lands in StateCreated; the destination migration path lands in + // StateMigratingCreated. + nextState := StateCreated + switch c.vmState { + case StateNotCreated: + doc, err := c.buildHCSConfig(ctx, opts) + if err != nil { + return fmt.Errorf("failed to build VM config: %w", err) + } + hcsDocument = doc + case StateMigratingImported: + nextState = StateMigratingCreated + if c.hcsDocument == nil { + return fmt.Errorf("cannot create VM in state %s: no imported HCS document available", c.vmState) + } + if c.hcsDocument.VirtualMachine == nil { + return fmt.Errorf("cannot create VM in state %s: imported HCS document has no VirtualMachine", c.vmState) + } + hcsDocument = c.hcsDocument + hcsDocument.VirtualMachine.MigrationOptions = opts.MigrationOptions + if c.compatInfo != nil { + hcsDocument.VirtualMachine.MigrationOptions.CompatibilityData = &hcsschema.CompatibilityInfo{ + Data: c.compatInfo, + } + } + // SCSI controller is the source of truth for the destination + // topology (rootfs + hot-added, path-patched); use it verbatim. 
+ if c.scsiController != nil { + hcsDocument.VirtualMachine.Devices.Scsi = c.scsiController.HCSAttachments() + } + default: return fmt.Errorf("cannot create VM: VM is in incorrect state %s", c.vmState) } - // Build the HCS document and sandbox options from the platform-specific builder. - hcsDocument, err := c.buildHCSConfig(ctx, opts) - if err != nil { - return fmt.Errorf("failed to build VM config: %w", err) - } - // Create the VM via vmmanager. uvm, err := vmmanager.Create(ctx, opts.ID, hcsDocument) if err != nil { @@ -130,19 +178,17 @@ func (c *Controller) CreateVM(ctx context.Context, opts *CreateOptions) error { // Set the Controller parameters after successful creation. c.vmID = opts.ID c.uvm = uvm + // Retain the final HCS document for lazy SCSI init and migration save. + c.hcsDocument = hcsDocument // Initialize the GuestManager for managing guest interactions. // We will create the guest connection via GuestManager during StartVM. c.guest = guestmanager.New(ctx, uvm) - // Eager initialize the SCSI controller as opposed to all other controllers. - // This is because we always use SCSI for attaching scratch VHDs. - c.scsiController, err = newSCSIController(ctx, hcsDocument, c.uvm, c.guest) - if err != nil { - return fmt.Errorf("failed to initialize SCSI controller: %w", err) - } - - c.vmState = StateCreated + // Cold-boot lands in StateCreated; the destination-side migration path + // lands in StateMigratingCreated, from which Patch and + // StartWithMigrationOptions drive the controller forward. 
+ c.vmState = nextState return nil } @@ -411,7 +457,7 @@ func (c *Controller) DumpStacks(ctx context.Context) (string, error) { return "", fmt.Errorf("cannot dump stacks: VM is in incorrect state %s", c.vmState) } - if c.guest.Capabilities().IsDumpStacksSupported() { + if caps := c.guest.Capabilities(); caps != nil && caps.IsDumpStacksSupported() { return c.guest.DumpStacks(ctx) } @@ -429,6 +475,12 @@ func (c *Controller) Wait(ctx context.Context) error { c.mu.RUnlock() return fmt.Errorf("cannot wait on VM: VM is in incorrect state %s", c.vmState) } + + // Destination terminated before CreateVM: nothing to wait on. + if c.uvm == nil { + c.mu.RUnlock() + return nil + } c.mu.RUnlock() // Wait for the utility VM to exit. @@ -530,10 +582,22 @@ func (c *Controller) TerminateVM(ctx context.Context) (err error) { return nil } + // Destination migration after Import but before CreateVM: no HCS handle yet. + if c.uvm == nil { + c.vmState = StateTerminated + return nil + } + // Best effort attempt to clean up the open vmmem handle. _ = windows.Close(c.vmmemProcess) - // Terminate the utility VM. This will also cause the Wait() call in the background goroutine to unblock. - _ = c.uvm.Terminate(ctx) + + // Skip HCS Terminate for a never-started VM (cold-created or destination + // migration-created). The HCS document sets + // ShouldTerminateOnLastHandleClosed, so uvm.Close below is sufficient. + if c.vmState != StateCreated && c.vmState != StateMigratingCreated { + // Terminate the utility VM. This will also cause the Wait() call in the background goroutine to unblock. 
+ _ = c.uvm.Terminate(ctx) + } if err := c.guest.CloseConnection(); err != nil { log.G(ctx).Errorf("close guest connection failed: %s", err) @@ -558,7 +622,7 @@ func (c *Controller) StartTime() (startTime time.Time) { c.mu.RLock() defer c.mu.RUnlock() - if c.vmState == StateRunning || c.vmState == StateTerminated { + if (c.vmState == StateRunning || c.vmState == StateTerminated) && c.uvm != nil { return c.uvm.StartedTime() } @@ -576,8 +640,16 @@ func (c *Controller) ExitStatus() (*ExitStatus, error) { return nil, fmt.Errorf("cannot get exit status: VM is in incorrect state %s", c.vmState) } - return &ExitStatus{ - StoppedTime: c.uvm.StoppedTime(), - Err: c.uvm.ExitError(), - }, nil + // Destination terminated before CreateVM: no uvm to query. + if c.uvm == nil { + return &ExitStatus{}, nil + } + + // Close-before-Terminate (never-started VM) surfaces ErrAlreadyClosed; treat as clean exit. + err := c.uvm.ExitError() + if errors.Is(err, hcs.ErrAlreadyClosed) { + err = nil + } + + return &ExitStatus{StoppedTime: c.uvm.StoppedTime(), Err: err}, nil } diff --git a/internal/controller/vm/vm_devices.go b/internal/controller/vm/vm_devices.go index e85044a956..c7948a6f66 100644 --- a/internal/controller/vm/vm_devices.go +++ b/internal/controller/vm/vm_devices.go @@ -8,14 +8,35 @@ import ( "strconv" "github.com/Microsoft/hcsshim/internal/controller/device/scsi" + "github.com/Microsoft/hcsshim/internal/controller/device/scsi/disk" "github.com/Microsoft/hcsshim/internal/controller/device/vpci" hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2" + "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" ) -// SCSIController returns the singleton SCSI device controller for this VM. -func (c *Controller) SCSIController() *scsi.Controller { - return c.scsiController +// SCSIController returns the SCSI device controller for this VM, lazily +// initializing it from the VM's HCS document on first use. 
+func (c *Controller) SCSIController(ctx context.Context) (*scsi.Controller, error) { + c.mu.Lock() + defer c.mu.Unlock() + + if c.scsiController != nil { + return c.scsiController, nil + } + + if c.hcsDocument == nil { + return nil, fmt.Errorf("cannot initialize SCSI controller: VM has no HCS document") + } + + s, err := newSCSIController(ctx, c.hcsDocument, c.uvm, c.guest) + if err != nil { + return nil, fmt.Errorf("failed to initialize SCSI controller: %w", err) + } + c.scsiController = s + + log.G(ctx).Debug("lazily initialized SCSI controller from HCS document") + return c.scsiController, nil } // VPCIController returns the singleton vPCI device controller for this VM. @@ -58,13 +79,19 @@ func newSCSIController( } // Found the controller GUID in the document. - for lunStr := range c.Attachments { + for lunStr, att := range c.Attachments { lun, err := strconv.ParseUint(lunStr, 10, 32) if err != nil { continue } - if err := ctrl.ReserveForRootfs(ctx, uint(ctrlIdx), uint(lun)); err != nil { + cfg := disk.Config{ + HostPath: att.Path, + ReadOnly: att.ReadOnly, + Type: disk.Type(att.Type_), + EVDType: att.ExtensibleVirtualDiskType, + } + if err := ctrl.ReserveForRootfs(ctx, uint(ctrlIdx), uint(lun), cfg); err != nil { return nil, fmt.Errorf("reserve SCSI slot (controller=%d, lun=%d): %w", ctrlIdx, lun, err) } } diff --git a/internal/controller/vm/vm_lcow.go b/internal/controller/vm/vm_lcow.go index 9df0846dd6..3c548df04c 100644 --- a/internal/controller/vm/vm_lcow.go +++ b/internal/controller/vm/vm_lcow.go @@ -29,6 +29,14 @@ type platformControllers struct { // sandboxOptions contains parsed, shim-level configuration for the sandbox. sandboxOptions *lcow.SandboxOptions + + // nextGuestPort is the GCS IO port-allocator floor restored from a + // migration snapshot; consumed by [Controller.Resume]. + nextGuestPort uint32 + + // nextBridgeID is the GCS bridge request-id floor restored from a + // migration snapshot; consumed by [Controller.Resume]. 
+ nextBridgeID int64 } // SandboxOptions returns the sandbox options stored during CreateVM.
diff --git a/internal/controller/vm/vm_migration.go b/internal/controller/vm/vm_migration.go
new file mode 100644
index 0000000000..b623bcbbc6
--- /dev/null
+++ b/internal/controller/vm/vm_migration.go
@@ -0,0 +1,218 @@
+//go:build windows && (lcow || wcow)
+
+package vm
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+
+	"github.com/Microsoft/go-winio"
+
+	"github.com/Microsoft/hcsshim/internal/gcs/prot"
+	hcsschema "github.com/Microsoft/hcsshim/internal/hcs/schema2"
+	hcs "github.com/Microsoft/hcsshim/internal/hcs/v2"
+	"github.com/Microsoft/hcsshim/internal/log"
+	"github.com/Microsoft/hcsshim/internal/logfields"
+)
+
+// compatibilityInfoProperty is the HCS property name used to retrieve the
+// VM's opaque migration-compatibility blob via PropertiesV3.
+const compatibilityInfoProperty = "CompatibilityInfo"
+
+// InitializeLiveMigrationOnSource prepares the running source VM for an
+// outgoing live migration. Once it succeeds the VM accepts only live-migration
+// calls until the migration completes or is rolled back.
+func (c *Controller) InitializeLiveMigrationOnSource(ctx context.Context, options *hcsschema.MigrationInitializeOptions) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Only a running VM can begin a migration.
+	if c.vmState != StateRunning {
+		return fmt.Errorf("cannot initialize live migration on source: VM is in state %s", c.vmState)
+	}
+
+	// Hand the initialize request to the HCS for the UVM.
+	if err := c.uvm.InitializeLiveMigrationOnSource(ctx, options); err != nil {
+		return fmt.Errorf("failed to initialize live migration on source: %w", err)
+	}
+
+	// From here on only live-migration APIs are permitted.
+	c.vmState = StateSourceMigrating
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("initialized live migration on source")
+
+	return nil
+}
+
+// CompatibilityInfo returns the opaque, source-emitted blob that the destination
+// hands back to HCS when starting the target VM, letting the platform confirm
+// the two hosts can interchange live-migration state. Available while the VM is
+// running or migrating.
+func (c *Controller) CompatibilityInfo(ctx context.Context) ([]byte, error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	// The blob is read from a live source VM, including once it has begun migrating.
+	if c.vmState != StateRunning && c.vmState != StateSourceMigrating {
+		return nil, fmt.Errorf("cannot query compatibility info: VM is in state %s", c.vmState)
+	}
+
+	// Ask the HCS for the compatibility property.
+	props, err := c.uvm.PropertiesV3(ctx, &hcsschema.PropertyQuery{
+		Queries: map[string]interface{}{compatibilityInfoProperty: nil},
+	})
+	if err != nil {
+		return nil, fmt.Errorf("query compatibility info: %w", err)
+	}
+
+	// Pull the raw blob out of the property response.
+	resp, ok := props.PropertyResponses[compatibilityInfoProperty]
+	if !ok || len(resp.Response) == 0 {
+		return nil, fmt.Errorf("compatibility info not present in property response")
+	}
+
+	// Decode the opaque payload and return its bytes to the caller.
+	var info hcsschema.CompatibilityInfo
+	if err := json.Unmarshal(resp.Response, &info); err != nil {
+		return nil, fmt.Errorf("decode compatibility info: %w", err)
+	}
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("queried compatibility info")
+	return info.Data, nil
+}
+
+// MigrationNotifications returns the VM's live-migration event channel. The
+// channel lives for the VM's lifetime, so callers can subscribe any time after
+// the VM is created and will not miss early events.
+func (c *Controller) MigrationNotifications() (<-chan hcsschema.OperationSystemMigrationNotificationInfo, error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+
+	// Notifications are valid from creation through the migration window,
+	// on both the source and destination sides.
+	if c.vmState != StateCreated && c.vmState != StateRunning &&
+		c.vmState != StateSourceMigrating && c.vmState != StateMigratingCreated &&
+		c.vmState != StateDestinationMigrating {
+		return nil, fmt.Errorf("cannot query migration notifications: VM is in state %s", c.vmState)
+	}
+
+	return c.uvm.MigrationNotifications(), nil
+}
+
+// StartWithMigrationOptions starts the VM as the destination of a live
+// migration over the supplied transport socket. On return the VM is migrating
+// and awaiting the source's state transfer.
+func (c *Controller) StartWithMigrationOptions(ctx context.Context, config *hcs.MigrationConfig) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Destination start is only valid on a created migrating VM.
+	if c.vmState != StateMigratingCreated {
+		return fmt.Errorf("cannot start with migration options: VM is in state %s", c.vmState)
+	}
+
+	// Arm the host-side GCS listener before start so the guest's dial cannot race it.
+	if err := c.guest.PrepareConnection(winio.VsockServiceID(prot.LinuxGcsVsockPort)); err != nil {
+		return fmt.Errorf("prepare destination gcs connection: %w", err)
+	}
+
+	// Start the destination VM against the migration socket.
+	if err := c.uvm.StartWithMigrationOptions(ctx, config); err != nil {
+		// NOTE(review): the GCS listener armed above is not released on this
+		// path; confirm that a retry or TerminateVM cleans it up.
+		return fmt.Errorf("failed to start with migration options: %w", err)
+	}
+
+	// Watch for VM exit in the background.
+	go c.waitForVMExit(ctx)
+	c.vmState = StateDestinationMigrating
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("started destination VM with migration options")
+	return nil
+}
+
+// StartLiveMigrationOnSource begins the source side of the migration over the
+// supplied transport socket. The memory-transfer phase is driven separately via
+// [Controller.StartLiveMigrationTransfer].
+func (c *Controller) StartLiveMigrationOnSource(ctx context.Context, config *hcs.MigrationConfig) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Source start is only valid once the migration has been initialized.
+	if c.vmState != StateSourceMigrating {
+		return fmt.Errorf("cannot start live migration on source: VM is in state %s", c.vmState)
+	}
+
+	// Tolerate the blackout-induced hvsock drop; cleared on the Stop path of [Controller.FinalizeLiveMigration].
+	c.guest.SetMigrating(true)
+	if err := c.uvm.StartLiveMigrationOnSource(ctx, config); err != nil {
+		// Roll the flag back if the host rejects the start.
+		c.guest.SetMigrating(false)
+		return fmt.Errorf("failed to start live migration on source: %w", err)
+	}
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("started live migration on source")
+	return nil
+}
+
+// StartLiveMigrationTransfer drives the memory-transfer phase of an in-progress
+// migration. Progress is reported through [Controller.MigrationNotifications].
+func (c *Controller) StartLiveMigrationTransfer(ctx context.Context, options *hcsschema.MigrationTransferOptions) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Transfer is only valid mid-migration, on either source or destination.
+	if c.vmState != StateSourceMigrating && c.vmState != StateDestinationMigrating {
+		return fmt.Errorf("cannot start live migration transfer: VM is in state %s", c.vmState)
+	}
+
+	if err := c.uvm.StartLiveMigrationTransfer(ctx, options); err != nil {
+		return fmt.Errorf("failed to start live migration transfer: %w", err)
+	}
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("started live migration memory transfer")
+	return nil
+}
+
+// FinalizeLiveMigration completes the migration. A Stop finalize tears down the
+// stopped side (the source in the forward flow, the destination in the reverse);
+// a Resume finalize returns control to the caller, who must then call [Controller.Resume].
+func (c *Controller) FinalizeLiveMigration(ctx context.Context, options *hcsschema.MigrationFinalizedOptions) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	// Finalize is only valid mid-migration, on either source or destination.
+	if c.vmState != StateSourceMigrating && c.vmState != StateDestinationMigrating {
+		return fmt.Errorf("cannot finalize live migration: VM is in state %s", c.vmState)
+	}
+
+	if err := c.uvm.FinalizeLiveMigration(ctx, options); err != nil {
+		return fmt.Errorf("failed to finalize live migration: %w", err)
+	}
+
+	// On a finalize Stop, drain the stopped side's VM (source in the forward
+	// flow, destination in the reverse) to termination.
+	if options != nil && options.FinalizedOperation == hcsschema.MigrationFinalOperationStop {
+		// Source stop: lift the Save-time freeze so the defunct containers'
+		// scratch-layer unmap on delete is no longer rejected.
+		if options.Origin == hcsschema.MigrationOriginSource && c.scsiController != nil {
+			c.scsiController.Resume(ctx, c.uvm, c.guest)
+		}
+
+		c.guest.SetMigrating(false)
+		// Drain the VM to exit. The state moves to terminated either way, but a
+		// wait failure is logged rather than silently dropped.
+		// NOTE(review): c.mu is held for the whole wait; confirm no exit-path
+		// code needs this lock.
+		if err := c.uvm.Wait(ctx); err != nil {
+			log.G(ctx).WithError(err).WithField(logfields.UVMID, c.vmID).Warn("wait for VM exit after finalize stop failed")
+		}
+		c.vmState = StateTerminated
+
+		log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("finalized live migration: VM terminated")
+		return nil
+	}
+
+	log.G(ctx).WithField(logfields.UVMID, c.vmID).Debug("finalized live migration")
+	return nil
+}
diff --git a/internal/hcs/errors.go b/internal/hcs/errors.go index 3e10f5c7e0..ee225b1f79 100644 --- a/internal/hcs/errors.go +++ b/internal/hcs/errors.go @@ -59,6 +59,10 @@ var ( // ErrVmcomputeAlreadyStopped is an error encountered when a shutdown or terminate request is made on a stopped container ErrVmcomputeAlreadyStopped = syscall.Errno(0xc0370110) + // ErrVmcomputeSystemAlreadyStopped is returned when an operation targets a compute system that is no longer running + // (e.g. modifying a UVM during migration teardown after it has been stopped). + ErrVmcomputeSystemAlreadyStopped = syscall.Errno(0x80370110) + // ErrVmcomputeOperationPending is an error encountered when the operation is being completed asynchronously ErrVmcomputeOperationPending = syscall.Errno(0xC0370103) @@ -300,7 +304,7 @@ func IsTimeout(err error) bool { // already exited, or does not exist. Both IsAlreadyStopped and IsNotExist // will currently return true when the error is ErrElementNotFound.
func IsAlreadyStopped(err error) bool { - return IsAny(err, ErrVmcomputeAlreadyStopped, ErrProcessAlreadyStopped, ErrElementNotFound) + return IsAny(err, ErrVmcomputeAlreadyStopped, ErrVmcomputeSystemAlreadyStopped, ErrProcessAlreadyStopped, ErrElementNotFound) } // IsNotSupported returns a boolean indicating whether the error is caused by diff --git a/internal/hcs/errors_test.go b/internal/hcs/errors_test.go index 421163eac6..94354b67b8 100644 --- a/internal/hcs/errors_test.go +++ b/internal/hcs/errors_test.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "net" + "syscall" "testing" ) @@ -150,3 +151,55 @@ func TestHcsErrorUnwrapNet(t *testing.T) { }) } } + +func TestIsAlreadyStopped(t *testing.T) { + for _, tc := range []struct { + name string + err error + want bool + }{ + { + name: "vmcompute already stopped (0xc0370110)", + err: ErrVmcomputeAlreadyStopped, + want: true, + }, + { + // Compute system reported as no longer running (e.g. UVM stopped during migration teardown). + name: "system already stopped (0x80370110)", + err: ErrVmcomputeSystemAlreadyStopped, + want: true, + }, + { + name: "process already stopped", + err: ErrProcessAlreadyStopped, + want: true, + }, + { + name: "element not found", + err: ErrElementNotFound, + want: true, + }, + { + name: "system not running wrapped in SystemError", + err: &SystemError{ + ID: "uvm-test", + HcsError: HcsError{ + Op: "hcs::System::Modify", + Err: ErrVmcomputeSystemAlreadyStopped, + }, + }, + want: true, + }, + { + name: "unrelated error", + err: syscall.Errno(0x5), + want: false, + }, + } { + t.Run(tc.name, func(t *testing.T) { + if got := IsAlreadyStopped(tc.err); got != tc.want { + t.Errorf("IsAlreadyStopped(%v) = %t, want %t", tc.err, got, tc.want) + } + }) + } +} diff --git a/internal/logfields/fields.go b/internal/logfields/fields.go index 445202554d..78eeca8a21 100644 --- a/internal/logfields/fields.go +++ b/internal/logfields/fields.go @@ -7,17 +7,23 @@ const ( Namespace = "namespace" Operation = "operation" - 
ID = "id" - ContainerID = "cid" - GCSContainerID = "gcs_container_id" - ExecID = "eid" - NamespaceID = "namespace-id" - PodID = "pod-id" - ProcessID = "pid" - SandboxID = "sandbox-id" - TaskID = "tid" - UVMID = "uvm-id" - VirtualSandboxID = "virtual-sandbox-id" + ID = "id" + SourceContainerID = "src_cid" + DestinationContainerID = "dst_cid" + ContainerID = "cid" + GCSContainerID = "gcs_container_id" + ExecID = "eid" + GuestNetworkNamespaceID = "guest_netns_id" + MigratedNamespaceID = "migrated_netns_id" + NamespaceID = "namespace-id" + SourcePodID = "src_pod_id" + DestinationPodID = "dst_pod_id" + PodID = "pod-id" + ProcessID = "pid" + SandboxID = "sandbox-id" + TaskID = "tid" + UVMID = "uvm-id" + VirtualSandboxID = "virtual-sandbox-id" // networking and IO diff --git a/internal/protocol/guestresource/parse.go b/internal/protocol/guestresource/parse.go index 20d1ed5880..b93ff93be3 100644 --- a/internal/protocol/guestresource/parse.go +++ b/internal/protocol/guestresource/parse.go @@ -14,9 +14,9 @@ import ( // BuildLCOWNetworkAdapter converts an HCN endpoint into the [LCOWNetworkAdapter] // payload that the GCS expects. 
-func BuildLCOWNetworkAdapter(nicID string, endpoint *hcn.HostComputeEndpoint, policyBasedRouting bool) (*LCOWNetworkAdapter, error) { +func BuildLCOWNetworkAdapter(netnsID string, nicID string, endpoint *hcn.HostComputeEndpoint, policyBasedRouting bool) (*LCOWNetworkAdapter, error) { req := &LCOWNetworkAdapter{ - NamespaceID: endpoint.HostComputeNamespace, + NamespaceID: netnsID, ID: nicID, MacAddress: endpoint.MacAddress, IPConfigs: make([]LCOWIPConfig, 0, len(endpoint.IpConfigurations)), diff --git a/internal/uvm/network.go b/internal/uvm/network.go index c8d5139f17..1ba32bba96 100644 --- a/internal/uvm/network.go +++ b/internal/uvm/network.go @@ -683,7 +683,7 @@ func (uvm *UtilityVM) addNIC(ctx context.Context, id string, endpoint *hcn.HostC nil), } } else { - s, err := guestresource.BuildLCOWNetworkAdapter(id, endpoint, uvm.policyBasedRouting) + s, err := guestresource.BuildLCOWNetworkAdapter(endpoint.HostComputeNamespace, id, endpoint, uvm.policyBasedRouting) if err != nil { return err }