diff --git a/docs/addons/index.md b/docs/addons/index.md index 829beec44..46ea5acd1 100644 --- a/docs/addons/index.md +++ b/docs/addons/index.md @@ -59,6 +59,7 @@ The framework currently supports the following add-ons. | [`MetricsServerAddOn`](./metrics-server.md) | Adds metrics server (pre-req for HPA and other monitoring tools). | ✅ | ✅ | | [`NewRelicAddOn`](./newrelic.md) | Adds [New Relic](https://newrelic.com/) and [Pixie](https://pixielabs.ai/) observability for Amazon EKS. | ✅ | | [`NginxAddOn`](./nginx.md) | Adds NGINX ingress controller | ✅ | ✅ | | +| [`NeuronPluginAddOn`](./neuron-plugin-addon.md) | Adds the AWS Neuron device plugin for Inferentia and Trainium instances | ✅ | | | [`OpaGatekeeperAddOn`](./opa-gatekeeper.md) | Adds OPA Gatekeeper | ✅ | ✅ | | [`ParalusAddOn`](./paralus.md) | Adds [Paralus](https://paralus.io/) | ✅ | ✅ | | [`PixieAddOn`](./pixie.md) | Adds [Pixie](https://px.dev) to the EKS Cluster. Pixie provides auto-telemetry for requests, metrics, application profiles, and more. | ✅ | diff --git a/docs/addons/neuron-plugin-addon.md b/docs/addons/neuron-plugin-addon.md new file mode 100644 index 000000000..445170c4b --- /dev/null +++ b/docs/addons/neuron-plugin-addon.md @@ -0,0 +1,52 @@ +# Neuron Device Plugin Addon + +[AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/) is the SDK used to run deep learning workloads on AWS Inferentia and AWS Trainium based instances. This addon will install the Neuron Device Plugin necessary to run the instances on Amazon EKS (and Blueprints). Note that you **must** use *inf1, inf2, trn1,* or *trn1n* instances. 
+ +## Usage + +#### **`index.ts`** +```typescript +import 'source-map-support/register'; +import * as cdk from 'aws-cdk-lib'; +import * as blueprints from '@aws-quickstart/eks-blueprints'; + +const app = new cdk.App(); + +const addOn = new blueprints.addons.NeuronPluginAddOn(); + +const clusterProvider = new blueprints.GenericClusterProvider({ + version: KubernetesVersion.V1_27, + managedNodeGroups: [ + inferentiaNodeGroup() + ] +}); + +function inferentiaNodeGroup(): blueprints.ManagedNodeGroup { + return { + id: "mng1", + instanceTypes: [new ec2.InstanceType('inf1.2xlarge')], + desiredSize: 1, + maxSize: 2, + nodeGroupSubnets: { subnetType: ec2.SubnetType.PRIVATE_WITH_EGRESS }, + }; +} + +const blueprint = blueprints.EksBlueprint.builder() + .clusterProvider(clusterProvider) + .addOns(addOn) + .build(app, 'my-stack-name'); +``` + +Once deployed, you can see the plugin daemonset in the `kube-system` namespace. + +```sh +$ kubectl get daemonset neuron-device-plugin-daemonset -n kube-system + +NAME DESIRED CURRENT READY UP-TO-DATE AVAILABLE NODE SELECTOR AGE +neuron-device-plugin-daemonset 1 1 1 1 1 24m 20m +``` + +## Functionality + +1. Deploys the plugin daemonset in `kube-system` namespace by default. +2. Provides a plugin for the blueprint to leverage the Inferentia or Trainium instances to use the Neuron SDK. \ No newline at end of file diff --git a/examples/blueprint-construct/index.ts b/examples/blueprint-construct/index.ts index 1f58cc24a..cb6f813ed 100644 --- a/examples/blueprint-construct/index.ts +++ b/examples/blueprint-construct/index.ts @@ -228,6 +228,7 @@ export default class BlueprintConstruct { }), new blueprints.ExternalsSecretsAddOn(), new blueprints.EksPodIdentityAgentAddOn(), + new blueprints.NeuronPluginAddOn(), ]; // Instantiated to for helm version check. 
@@ -248,7 +249,8 @@ export default class BlueprintConstruct { addGenericNodeGroup(), addCustomNodeGroup(), addWindowsNodeGroup(), // commented out to check the impact on e2e - addGpuNodeGroup() + addGpuNodeGroup(), + addInferentiaNodeGroup(), ] }); @@ -413,4 +415,23 @@ function addGpuNodeGroup(): blueprints.ManagedNodeGroup { }; } +function addInferentiaNodeGroup(): blueprints.ManagedNodeGroup { + + return { + id: "mng4-inferentia", + instanceTypes: [new ec2.InstanceType('inf1.2xlarge')], + desiredSize: 1, + minSize: 1, + nodeRole: blueprints.getNamedResource("node-role") as iam.Role, + diskSize: 50, + tags: { + "Name": "Mng4", + "Type": "Managed-InferentiaNode-Group", + "LaunchTemplate": "Inferentia", + "kubernetes.io/cluster/blueprint-construct-dev": "owned" + } + }; +} + + diff --git a/lib/addons/index.ts b/lib/addons/index.ts index 1d1d81768..a3749d501 100644 --- a/lib/addons/index.ts +++ b/lib/addons/index.ts @@ -54,6 +54,7 @@ export * from './emr-on-eks'; export * from './aws-batch-on-eks'; export * from './upbound-universal-crossplane'; export * from './apache-airflow'; +export * from './neuron'; export * from './eks-pod-identity-agent'; export class Constants { diff --git a/lib/addons/neuron/index.ts b/lib/addons/neuron/index.ts new file mode 100644 index 000000000..faf8d3eb7 --- /dev/null +++ b/lib/addons/neuron/index.ts @@ -0,0 +1,39 @@ +import { Construct } from "constructs"; + +import { ClusterAddOn, ClusterInfo } from "../../spi"; +import { KubectlProvider, ManifestDeployment } from "../helm-addon/kubectl-provider"; +import { loadExternalYaml } from "../../utils/yaml-utils"; + +const PLUGIN_URL = "https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/k8s-neuron-device-plugin.yml"; +const RBAC_URL = "https://raw.githubusercontent.com/aws-neuron/aws-neuron-sdk/master/src/k8/k8s-neuron-device-plugin-rbac.yml"; + +export class NeuronPluginAddOn implements ClusterAddOn { + deploy(clusterInfo: ClusterInfo): Promise<Construct> { + const 
kubectlProvider = new KubectlProvider(clusterInfo); + + // Read in YAML docs + const rbac = loadExternalYaml(RBAC_URL); + const rbacManifest: ManifestDeployment = { + name: "neuron-rbac-manifest", + namespace: "", + manifest: rbac, + values: {} + }; + + const plugin = loadExternalYaml(PLUGIN_URL); + const pluginManifest: ManifestDeployment = { + name: "neuron-plugin-manifest", + namespace: "kube-system", + manifest: plugin, + values: {} + }; + + const rbacStatement = kubectlProvider.addManifest(rbacManifest); + const pluginStatement = kubectlProvider.addManifest(pluginManifest); + + // Plugin dependency on the RBAC manifest + pluginStatement.node.addDependency(rbacStatement); + + return Promise.resolve(pluginStatement); + } +} \ No newline at end of file diff --git a/lib/utils/yaml-utils.ts b/lib/utils/yaml-utils.ts index 48435ecd1..dcc5afbb2 100644 --- a/lib/utils/yaml-utils.ts +++ b/lib/utils/yaml-utils.ts @@ -25,6 +25,11 @@ export function applyYamlFromDir(dir: string, cluster: eks.ICluster, namespaceMa }); } +/** + * Reads the YAML document from a local path. 
+ * @param path YAML document path + * @returns YAML document string + */ export function readYamlDocument(path: string): string { try { const doc = fs.readFileSync(path, 'utf8'); @@ -35,11 +40,33 @@ export function readYamlDocument(path: string): string { } } +/** + * Reads the YAML document from a local path and parses it as + * multiple YAML documents separated by `---` as expected in a Kubernetes manifest file + * @param path YAML document path + * @returns a list of parsed YAML documents (a file starting with `---` yields an extra leading entry for the empty text before it) + */ +export function loadMultiResourceYaml(path: string): any { + const doc = readYamlDocument(path); + return doc.split("---").map((e: any) => loadYaml(e)); +} +/** + * Parses the string document into a single YAML document + * @param document document + * @returns yaml document + */ export function loadYaml(document: string): any { return yaml.load(document); } +/** + * Reads the YAML document from a URL and parses + * multiple YAML documents separated by `---` as expected in a Kubernetes manifest file. Note: The file from the URL is + * not validated, so user must ensure the URL contains a valid manifest. 
+ * @param url YAML document URL + * @returns a list of parsed YAML documents + */ export function loadExternalYaml(url: string): any { /* eslint-disable */ const request = require('sync-request'); // moved away from import as it is causing open handles that prevents jest from completion @@ -47,6 +74,11 @@ export function loadExternalYaml(url: string): any { return yaml.loadAll(response.getBody().toString()); } +/** + * Serializes object as a YAML document + * @param document document + * @returns yaml document + */ export function serializeYaml(document: any): string { return yaml.dump(document); } \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index 8f5138dc9..0ef312b15 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -70,6 +70,7 @@ nav: - Metrics Server: 'addons/metrics-server.md' - New Relic: 'addons/newrelic.md' - Nginx: 'addons/nginx.md' + - Neuron: 'addons/neuron-plugin-addon.md' - OPA Gatekeeper: 'addons/opa-gatekeeper.md' - Paralus: 'addons/paralus.md' - Pixie: 'addons/pixie.md' diff --git a/test/utils/multi-yaml-test.yaml b/test/utils/multi-yaml-test.yaml new file mode 100644 index 000000000..aa71248ef --- /dev/null +++ b/test/utils/multi-yaml-test.yaml @@ -0,0 +1,6 @@ +--- +kind: ClusterRole +--- +kind: Deployment +--- +kind: Pod diff --git a/test/utils/yaml-test.yaml b/test/utils/yaml-test.yaml new file mode 100644 index 000000000..1bef97f3b --- /dev/null +++ b/test/utils/yaml-test.yaml @@ -0,0 +1 @@ +apiVersion: apps/v1 \ No newline at end of file diff --git a/test/utils/yaml-utils.test.ts b/test/utils/yaml-utils.test.ts new file mode 100644 index 000000000..b1011407e --- /dev/null +++ b/test/utils/yaml-utils.test.ts @@ -0,0 +1,65 @@ +import * as yaml from "../../lib/utils/yaml-utils"; + +describe('Unit tests for yaml utils', () => { + + test("The YAML Document file is read correctly", () => { + const doc = yaml.readYamlDocument(__dirname +'/yaml-test.yaml'); + + expect(doc).toBe("apiVersion: apps/v1"); + }); + + test("The YAML Document 
file is serialized correctly", () => { + const sample = {"apiVersion":"apps/v1","resource":"Deployment"}; + + const serialized = yaml.serializeYaml(sample); + + expect(serialized.length).toBe(41); + }); + + test("The YAML Document with multiple resources is read correctly", () => { + const doc = yaml.loadMultiResourceYaml(__dirname +'/multi-yaml-test.yaml'); + + const firstPart = { "kind": "ClusterRole" }; + const secondPart = { "kind": "Deployment" }; + const lastPart = { "kind": "Pod" }; + + expect(doc.length).toBe(4); + expect(doc[1]).toStrictEqual(firstPart); + expect(doc[2]).toStrictEqual(secondPart); + expect(doc[3]).toStrictEqual(lastPart); + }); + + test("External YAML Document is read correctly", () => { + const doc = yaml.loadExternalYaml('https://raw.githubusercontent.com/kubernetes/examples/master/guestbook/legacy/frontend-controller.yaml'); + const part = { + apiVersion: "v1", + kind: "ReplicationController", + metadata: {name: "frontend"}, + spec: { + replicas: 3, + template: { + metadata: { + labels: {app: "guestbook", tier: "frontend"} + }, + spec: { + containers: [{ + name: "php-redis", + image: "gcr.io/google_samples/gb-frontend:v4", + resources: { + requests: { + cpu: "100m", + memory: "100Mi" + } + }, + env: [{name: "GET_HOSTS_FROM", value: "dns"}], + ports:[{containerPort: 80}] + }] + } + } + } + }; + + expect(doc.length).toBe(1); + expect(doc[0]).toStrictEqual(part); + }); +});