65 changes: 50 additions & 15 deletions cmd/root.go
@@ -14,6 +14,7 @@ import (
"github.com/spf13/cobra"

"github.com/aifoundry-org/oxide-controller/pkg/cluster"
oxidepkg "github.com/aifoundry-org/oxide-controller/pkg/oxide"
"github.com/aifoundry-org/oxide-controller/pkg/server"
"github.com/aifoundry-org/oxide-controller/pkg/util"
)
@@ -26,7 +27,8 @@ const (
func rootCmd() (*cobra.Command, error) {
var (
oxideAPIURL string
tokenFilePath string
oxideToken string
oxideProfile string
clusterProject string
controlPlanePrefix string
workerPrefix string
@@ -106,21 +108,53 @@ func rootCmd() (*cobra.Command, error) {
return fmt.Errorf("failed to load kubeconfig at %s: %w", kubeconfigPath, err)
}

logentry.Debugf("Loading Oxide token from %s", tokenFilePath)
b, err := util.LoadFile(tokenFilePath)
oxideToken := strings.TrimSuffix(string(b), "\n")
if err != nil {
return fmt.Errorf("failed to load oxide token at %s: %w", tokenFilePath, err)
// scenarios for oxide:
// 1- oxideAPIURL and oxideToken are provided, use them
// 2- oxideProfile is provided, use it to get oxideAPIURL and oxideToken
// 3- oxideProfile is not provided, try the default profile; if it succeeds, use its values
// 4- oxideProfile is not provided and the default profile fails, look for credentials in-cluster
switch {
case oxideProfile != "" && (oxideAPIURL != "" || oxideToken != ""):
// cannot provide both --oxide-profile and --oxide-token/--oxide-api-url
return fmt.Errorf("cannot provide both --oxide-profile and --oxide-token/--oxide-api-url")
case oxideAPIURL != "" && oxideToken != "":
// use the provided oxideAPIURL and oxideToken
logentry.Debugf("Using oxide API URL %s and token ****", oxideAPIURL)
case oxideProfile != "":
// read the profile
logentry.Debugf("Loading oxide profile %s", oxideProfile)
profileHost, profileToken, err := oxidepkg.GetProfile(&oxide.Config{Profile: oxideProfile})
if err != nil {
return fmt.Errorf("failed to load oxide profile %s: %w", oxideProfile, err)
}
oxideAPIURL = profileHost
oxideToken = profileToken
default:
// nothing provided, try to load the default
logentry.Debugf("Loading oxide default profile")
profileHost, profileToken, err := oxidepkg.GetProfile(&oxide.Config{UseDefaultProfile: true})
if err != nil {
// TODO: this should fall back to trying it in-cluster
return fmt.Errorf("failed to load oxide default profile: %w", err)
}
oxideAPIURL = profileHost
oxideToken = profileToken
}

cfg := oxide.Config{
oxideConfig := &oxide.Config{
Host: oxideAPIURL,
Token: string(oxideToken),
}
logentry.Debugf("Creating Oxide API client with config: %+v", cfg)
oxideClient, err := oxide.NewClient(&cfg)
if err != nil {
return fmt.Errorf("failed to create Oxide API client: %v", err)

if strings.HasPrefix(oxideToken, "file:") {
tokenFilePath := strings.TrimPrefix(oxideToken, "file:")
oxideToken = ""
logentry.Debugf("Loading oxide token from %s", tokenFilePath)
b, err := os.ReadFile(tokenFilePath)
if err != nil {
return fmt.Errorf("failed to load oxide token at %s: %w", tokenFilePath, err)
}
oxideToken = strings.TrimSuffix(string(b), "\n")
}

// if tailscaleAuthKey starts with file: then it is a path
@@ -161,7 +195,7 @@ func rootCmd() (*cobra.Command, error) {

ctx := context.Background()

c := cluster.New(logentry, oxideClient, clusterProject,
c := cluster.New(logentry, oxideConfig, clusterProject,
controlPlanePrefix, workerPrefix, int(controlPlaneCount), int(workerCount),
cluster.NodeSpec{Image: cluster.Image{Name: controlPlaneImageName, Source: controlPlaneImageSource, Blocksize: controlPlaneImageBlocksize}, MemoryGB: int(controlPlaneMemory), CPUCount: int(controlPlaneCPU), ExternalIP: controlPlaneExternalIP, RootDiskSize: int(controlPlaneRootDiskSizeGB * cluster.GB), ExtraDiskSize: int(controlPlaneExtraDiskSizeGB * cluster.GB), TailscaleAuthKey: tailscaleAuthKey, TailscaleTag: tailscaleTag},
cluster.NodeSpec{Image: cluster.Image{Name: workerImageName, Source: workerImageSource, Blocksize: workerImageBlocksize}, MemoryGB: int(workerMemory), CPUCount: int(workerCPU), ExternalIP: workerExternalIP, RootDiskSize: int(workerRootDiskSizeGB * cluster.GB), ExtraDiskSize: int(workerExtraDiskSizeGB * cluster.GB), TailscaleAuthKey: tailscaleAuthKey, TailscaleTag: tailscaleTag},
@@ -277,7 +311,7 @@ func rootCmd() (*cobra.Command, error) {
// serve REST endpoints
defer wg.Done()
logentry.Infof("Starting server on address %s", address)
s := server.New(address, logentry, oxideClient, c, controlPlaneSecret, clusterProject, controlPlanePrefix, workerImageName, int(workerMemory), int(workerCPU))
s := server.New(address, logentry, c, controlPlaneSecret, clusterProject, controlPlanePrefix, workerImageName, int(workerMemory), int(workerCPU))
errCh <- s.Serve()
}()

@@ -293,8 +327,9 @@ func rootCmd() (*cobra.Command, error) {
}

// Define CLI flags
cmd.Flags().StringVar(&oxideAPIURL, "oxide-api-url", "https://oxide-api.example.com", "Oxide API base URL")
cmd.Flags().StringVar(&tokenFilePath, "token-file", "/data/oxide_token", "Path to Oxide API token file")
cmd.Flags().StringVar(&oxideAPIURL, "oxide-api-url", "", "Oxide API base URL; if not provided, will read from Kubernetes secret if available, or from ~/.config/oxide, or fall back to the default URL")
cmd.Flags().StringVar(&oxideToken, "oxide-token", "", "Oxide API token; if it starts with 'file:' the token will be read from that file; if none provided, will read from Kubernetes secret if available, or from ~/.config/oxide. Must not provide both --oxide-profile and --oxide-token")
cmd.Flags().StringVar(&oxideProfile, "oxide-profile", "", "Oxide profile to use; if none provided, will use the default profile. Must not provide both --oxide-profile and --oxide-token")
cmd.Flags().StringVar(&clusterProject, "cluster-project", "ainekko-cluster", "Oxide project name for Kubernetes cluster")
cmd.Flags().StringVar(&controlPlanePrefix, "control-plane-prefix", "ainekko-control-plane-", "Prefix for control plane instances")
cmd.Flags().StringVar(&workerPrefix, "worker-prefix", "ainekko-worker-", "Prefix for worker instances")
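Reviewer note: the cmd/root.go changes above replace the old --token-file flag with three cooperating flags (--oxide-api-url, --oxide-token, --oxide-profile), where the token may be indirected through a file by prefixing it with "file:", e.g. --oxide-token file:/data/oxide_token. The sketch below is illustrative only and not part of this diff; the helper name is hypothetical, and it merely restates the resolution order implemented in the switch above, assuming the same imports as cmd/root.go ("fmt", "os", "strings", the oxide SDK, and oxidepkg).

// resolveOxideCredentials is a hypothetical helper mirroring cmd/root.go:
// explicit URL+token win, then a named profile, then the default profile,
// and a "file:" token is dereferenced last.
func resolveOxideCredentials(profile, apiURL, token string) (string, string, error) {
	switch {
	case profile != "" && (apiURL != "" || token != ""):
		return "", "", fmt.Errorf("cannot provide both --oxide-profile and --oxide-token/--oxide-api-url")
	case apiURL != "" && token != "":
		// use the values exactly as given
	case profile != "":
		host, tok, err := oxidepkg.GetProfile(&oxide.Config{Profile: profile})
		if err != nil {
			return "", "", fmt.Errorf("failed to load oxide profile %s: %w", profile, err)
		}
		apiURL, token = host, tok
	default:
		host, tok, err := oxidepkg.GetProfile(&oxide.Config{UseDefaultProfile: true})
		if err != nil {
			return "", "", fmt.Errorf("failed to load oxide default profile: %w", err)
		}
		apiURL, token = host, tok
	}
	if strings.HasPrefix(token, "file:") {
		// dereference the file: indirection, trimming a trailing newline
		b, err := os.ReadFile(strings.TrimPrefix(token, "file:"))
		if err != nil {
			return "", "", fmt.Errorf("failed to load oxide token: %w", err)
		}
		token = strings.TrimSuffix(string(b), "\n")
	}
	return apiURL, token, nil
}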
13 changes: 9 additions & 4 deletions pkg/cluster/cluster.go
@@ -20,7 +20,7 @@ import (

type Cluster struct {
logger *log.Entry
client *oxide.Client
oxideConfig *oxide.Config
projectID string
controlPlanePrefix string
workerPrefix string
@@ -43,10 +43,10 @@
}

// New creates a new Cluster instance
func New(logger *log.Entry, client *oxide.Client, projectID string, controlPlanePrefix, workerPrefix string, controlPlaneCount, workerCount int, controlPlaneSpec, workerSpec NodeSpec, imageParallelism int, namespace, secretName string, kubeconfig, pubkey []byte, clusterInitWait time.Duration, kubeconfigOverwrite bool, tailscaleAPIKey, tailscaleTailnet, OCIimage string) *Cluster {
func New(logger *log.Entry, oxideConfig *oxide.Config, projectID string, controlPlanePrefix, workerPrefix string, controlPlaneCount, workerCount int, controlPlaneSpec, workerSpec NodeSpec, imageParallelism int, namespace, secretName string, kubeconfig, pubkey []byte, clusterInitWait time.Duration, kubeconfigOverwrite bool, tailscaleAPIKey, tailscaleTailnet, OCIimage string) *Cluster {
c := &Cluster{
logger: logger.WithField("component", "cluster"),
client: client,
oxideConfig: oxideConfig,
projectID: projectID,
controlPlanePrefix: controlPlanePrefix,
workerPrefix: workerPrefix,
@@ -71,7 +71,10 @@ func New(logger *log.Entry, client *oxide.Client, projectID string, controlPlane
// ensureClusterExists checks if a k3s cluster exists, and creates one if needed
func (c *Cluster) ensureClusterExists(ctx context.Context) (newKubeconfig []byte, err error) {
// local vars just for convenience
client := c.client
client, err := oxide.NewClient(c.oxideConfig)
if err != nil {
return nil, fmt.Errorf("failed to create Oxide API client: %v", err)
}
projectID := c.projectID
controlPlanePrefix := c.controlPlanePrefix
controlPlaneCount := c.controlPlaneCount
@@ -290,6 +293,8 @@ func (c *Cluster) ensureClusterExists(ctx context.Context) (newKubeconfig []byte
secrets[secretKeySystemSSHPublic] = pub
secrets[secretKeySystemSSHPrivate] = priv
secrets[secretKeyJoinToken] = joinToken
secrets[secretKeyOxideToken] = []byte(c.oxideConfig.Token)
secrets[secretKeyOxideURL] = []byte(c.oxideConfig.Host)

// save the user ssh public key to the secrets map
if c.userPubkey != nil {
2 changes: 2 additions & 0 deletions pkg/cluster/const.go
@@ -12,6 +12,8 @@
secretKeySystemSSHPublic = "system-ssh-public-key"
secretKeySystemSSHPrivate = "system-ssh-private-key"
secretKeyWorkerCount = "worker-count"
secretKeyOxideToken = "oxide-token"
secretKeyOxideURL = "oxide-url"
maximumChunkSize = 512 * KB

devModeOCIImage = "dev"
9 changes: 7 additions & 2 deletions pkg/cluster/exec.go
@@ -5,6 +5,7 @@ import (
"fmt"

"github.com/aifoundry-org/oxide-controller/pkg/util"
"github.com/oxidecomputer/oxide.go/oxide"
)

// Execute executes the core functionality, which includes:
@@ -14,8 +15,12 @@ import (
// 4. Ensuring the worker nodes exist as desired
// 5. Returning the new kubeconfig, if any
func (c *Cluster) Execute(ctx context.Context) (newKubeconfig []byte, err error) {
client, err := oxide.NewClient(c.oxideConfig)
if err != nil {
return nil, fmt.Errorf("failed to create Oxide API client: %v", err)
}

projectID, err := ensureProjectExists(ctx, c.logger, c.client, c.projectID)
projectID, err := ensureProjectExists(ctx, c.logger, client, c.projectID)
if err != nil {
return nil, fmt.Errorf("project verification failed: %v", err)
}
@@ -24,7 +29,7 @@ func (c *Cluster) Execute(ctx context.Context) (newKubeconfig []byte, err error)
c.logger.Infof("Using project ID: %s", c.projectID)
}

images, err := ensureImagesExist(ctx, c.logger, c.client, c.projectID, c.imageParallelism, c.controlPlaneSpec.Image, c.workerSpec.Image)
images, err := ensureImagesExist(ctx, c.logger, client, c.projectID, c.imageParallelism, c.controlPlaneSpec.Image, c.workerSpec.Image)
if err != nil {
return nil, fmt.Errorf("image verification failed: %v", err)
}
8 changes: 6 additions & 2 deletions pkg/cluster/ip.go
@@ -34,16 +34,20 @@ func GetControlPlaneIP(ctx context.Context, logger *log.Entry, client *oxide.Cli
}

func (c *Cluster) ensureControlPlaneIP(ctx context.Context, controlPlanePrefix string) (*oxide.FloatingIp, error) {
client, err := oxide.NewClient(c.oxideConfig)
if err != nil {
return nil, fmt.Errorf("failed to create Oxide API client: %v", err)
}
var controlPlaneIP *oxide.FloatingIp
c.logger.Debugf("getting control plane IP for prefix %s", controlPlanePrefix)
controlPlaneIP, err := GetControlPlaneIP(ctx, c.logger, c.client, c.projectID, controlPlanePrefix)
controlPlaneIP, err = GetControlPlaneIP(ctx, c.logger, client, c.projectID, controlPlanePrefix)
if err != nil {
return nil, fmt.Errorf("failed to get control plane IP: %w", err)
}
// what if we did not find one?
if controlPlaneIP == nil {
c.logger.Infof("Control plane floating IP not found. Creating one...")
fip, err := c.client.FloatingIpCreate(ctx, oxide.FloatingIpCreateParams{
fip, err := client.FloatingIpCreate(ctx, oxide.FloatingIpCreateParams{
Project: oxide.NameOrId(c.projectID),
Body: &oxide.FloatingIpCreate{
Name: oxide.Name(fmt.Sprintf("%s-floating-ip", controlPlanePrefix)),
15 changes: 11 additions & 4 deletions pkg/cluster/node.go
@@ -192,12 +192,15 @@ func GenerateCloudConfig(nodeType string, initCluster bool, controlPlaneIP, join

// createControlPlaneNodes creates new control plane nodes
func (c *Cluster) CreateControlPlaneNodes(ctx context.Context, initCluster bool, count, start int, additionalPubKeys []string) ([]oxide.Instance, error) {
client, err := oxide.NewClient(c.oxideConfig)
if err != nil {
return nil, fmt.Errorf("failed to create Oxide API client: %v", err)
}
var controlPlaneNodes []oxide.Instance
c.logger.Debugf("Creating %d control plane nodes with prefix %s", count, c.controlPlanePrefix)

var joinToken string
var pubkey []byte
var err error

if !initCluster {
joinToken, err = c.GetJoinToken(ctx)
@@ -228,7 +231,7 @@
}

for i := start; i < start+count; i++ {
instance, err := CreateInstance(ctx, c.client, c.projectID, fmt.Sprintf("%s%d", c.controlPlanePrefix, i), c.controlPlaneSpec, cloudConfig)
instance, err := CreateInstance(ctx, client, c.projectID, fmt.Sprintf("%s%d", c.controlPlanePrefix, i), c.controlPlaneSpec, cloudConfig)
if err != nil {
return nil, fmt.Errorf("failed to create control plane node: %w", err)
}
@@ -240,6 +243,10 @@

// EnsureWorkerNodes ensures the count of worker nodes matches what it should be
func (c *Cluster) EnsureWorkerNodes(ctx context.Context) ([]oxide.Instance, error) {
client, err := oxide.NewClient(c.oxideConfig)
if err != nil {
return nil, fmt.Errorf("failed to create Oxide API client: %v", err)
}
// try to get the worker count from the cluster
// if it fails, we will use the default value
// if it succeeds, we will use that value
@@ -258,7 +265,7 @@ func (c *Cluster) EnsureWorkerNodes(ctx context.Context) ([]oxide.Instance, erro
c.logger.Debugf("Ensuring %d worker nodes", count)
var nodes []oxide.Instance
// first check how many worker nodes we have, by asking the cluster
_, workers, err := getNodesOxide(ctx, c.logger, c.client, c.projectID, c.controlPlanePrefix, c.workerPrefix)
_, workers, err := getNodesOxide(ctx, c.logger, client, c.projectID, c.controlPlanePrefix, c.workerPrefix)
if err != nil {
return nil, fmt.Errorf("failed to get nodes: %w", err)
}
@@ -293,7 +300,7 @@ func (c *Cluster) EnsureWorkerNodes(ctx context.Context) ([]oxide.Instance, erro

for i := actualCount; i < int(count); i++ {
workerName := fmt.Sprintf("%s%d", c.workerPrefix, time.Now().Unix())
instance, err := CreateInstance(ctx, c.client, c.projectID, workerName, c.workerSpec, cloudConfig)
instance, err := CreateInstance(ctx, client, c.projectID, workerName, c.workerSpec, cloudConfig)
if err != nil {
return nil, fmt.Errorf("failed to create worker node: %w", err)
}
100 changes: 100 additions & 0 deletions pkg/oxide/profile.go
@@ -0,0 +1,100 @@
package oxide

import (
"errors"
"fmt"
"os"
"path/filepath"

"github.com/oxidecomputer/oxide.go/oxide"
"github.com/pelletier/go-toml"
)

// everything in this file is a duplicate of the code in oxide.go, but
// all of that is private, and we need to be able to get the actual token and host
// to store in the cluster secret, so we had to duplicate it here.
// When that becomes public, or if they create service accounts on oxide,
// we can remove this code.

const (
defaultConfigDir string = ".config/oxide"
configFile string = "config.toml"
credentialsFile string = "credentials.toml"
)

func GetProfile(cfg *oxide.Config) (string, string, error) {
configDir := cfg.ConfigDir
if configDir == "" {
homeDir, err := os.UserHomeDir()
if err != nil {
return "", "", fmt.Errorf("unable to find user's home directory: %w", err)
}
configDir = filepath.Join(homeDir, defaultConfigDir)
}

profile := cfg.Profile

// Use explicitly configured profile over default when both are set.
if cfg.UseDefaultProfile && profile == "" {
configPath := filepath.Join(configDir, configFile)

var err error
profile, err = defaultProfile(configPath)
if err != nil {
return "", "", fmt.Errorf("failed to get default profile from %q: %w", configPath, err)
}
}

credentialsPath := filepath.Join(configDir, credentialsFile)
host, token, err := parseCredentialsFile(credentialsPath, profile)
if err != nil {
return "", "", fmt.Errorf("failed to get credentials for profile %q from %q: %w", profile, credentialsPath, err)
}

return host, token, nil
}

// defaultProfile returns the default profile from config.toml, if present.
func defaultProfile(configPath string) (string, error) {
configFile, err := toml.LoadFile(configPath)
if err != nil {
return "", fmt.Errorf("failed to open config: %w", err)
}

if profileName := configFile.Get("default-profile"); profileName != nil {
return profileName.(string), nil
}

return "", errors.New("no default profile set")
}

// parseCredentialsFile parses a credentials.toml and returns the host and token
// associated with the requested profile.
func parseCredentialsFile(credentialsPath, profileName string) (string, string, error) {
if profileName == "" {
return "", "", errors.New("no profile name provided")
}

credentialsFile, err := toml.LoadFile(credentialsPath)
if err != nil {
return "", "", fmt.Errorf("failed to open %q: %v", credentialsPath, err)
}

profile, ok := credentialsFile.Get("profile." + profileName).(*toml.Tree)
if !ok {
return "", "", errors.New("profile not found")
}

var hostTokenErr error
token, ok := profile.Get("token").(string)
if !ok {
hostTokenErr = errors.New("token not found")
}

host, ok := profile.Get("host").(string)
if !ok {
hostTokenErr = errors.Join(hostTokenErr, errors.New("host not found"))
}

return host, token, hostTokenErr
}
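For context, a minimal usage sketch of the exported GetProfile from a consuming package (illustrative only, not part of this diff; the function name and empty-name convention are placeholders). It resolves the host and token from ~/.config/oxide and builds an SDK client the same way cmd/root.go and pkg/cluster now do.

// connectFromProfile is a hypothetical example: resolve credentials via
// GetProfile (an explicit profile, or the default one when name is empty)
// and construct an oxide SDK client from them.
func connectFromProfile(name string) (*oxide.Client, error) {
	cfg := &oxide.Config{Profile: name}
	if name == "" {
		cfg = &oxide.Config{UseDefaultProfile: true}
	}
	host, token, err := oxidepkg.GetProfile(cfg)
	if err != nil {
		return nil, fmt.Errorf("failed to resolve oxide credentials: %w", err)
	}
	return oxide.NewClient(&oxide.Config{Host: host, Token: token})
}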