From 9f9f9150f86449567dc07eb0221283de3107e473 Mon Sep 17 00:00:00 2001 From: Niharika Dutta Date: Tue, 17 Nov 2020 14:31:00 -0800 Subject: [PATCH] Update latest Spark .NET notebook (#62) Updating latest Spark .NET notebook to reflect capabilities from v1.0. --- ...ers Guide to DotNET for Apache Spark.ipynb | 1808 ++++++++++------- 1 file changed, 1100 insertions(+), 708 deletions(-) diff --git a/Notebooks/Spark.NET C#/Hitchhikers Guide to DotNET for Apache Spark.ipynb b/Notebooks/Spark.NET C#/Hitchhikers Guide to DotNET for Apache Spark.ipynb index c199cd0..2e3bb95 100644 --- a/Notebooks/Spark.NET C#/Hitchhikers Guide to DotNET for Apache Spark.ipynb +++ b/Notebooks/Spark.NET C#/Hitchhikers Guide to DotNET for Apache Spark.ipynb @@ -1,708 +1,1100 @@ -{ - "metadata": { - "description": "This tutorial gives you a tour of .NET and what how you can interact with Apache Spark through .NET.", - "saveOutput": true, - "language_info": { - "name": "csharp" - } - }, - "nbformat": 4, - "nbformat_minor": 2, - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Hitchhiker's Guide to .NET for Apache Spark\n", - "\n", - "Welcome to the .NET for Apache Spark tutorial! We are glad to have you here. Before we begin, let us cover answers to a few quick questions:\n", - "\n", - " - #### What is .NET for Apache Spark?\n", - " .NET for Apache Spark provides high performance APIs for using Apache Spark from C# and F#. With these .NET APIs, you can access the most popular Dataframe and SparkSQL aspects of Apache Spark, for working with structured data, and Spark Structured Streaming, for working with streaming data.\n", - "\n", - " .NET for Apache Spark is compliant with .NET Standard - a formal specification of .NET APIs that are common across .NET implementations. This means you can use .NET for Apache Spark anywhere you write .NET code allowing you to reuse all the knowledge, skills, code, and libraries you already have as a .NET developer.\n", - "\n", - " - #### Where can I find more on .NET for Apache Spark?\n", - " https://github.com/dotnet/spark\n", - "\n", - " - #### I did not know there was a REPL for C#!?\n", - " Great question! :) We collaborated with the .NET team and they built one for us! https://github.com/dotnet/interactive \n", - "\n", - "Whew! Now that we have covered some basic information, let's begin! \n", - "\n", - "Since the .NET REPL is something very new, let us start by exploring what you can do with the REPL. " - ], - "attachments": {} - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Basic Capabilities of the C# REPL" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 4, - "outputs": [], - "metadata": {}, - "source": [ - "// Simple assignments should just work \n", - "var x = 1 + 25;" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 5, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 5, - "data": { - "text/plain": "26\n256" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// You can either use traditional approach to printing a variable...\n", - "Console.WriteLine(x);\n", - "\n", - "// ... or just type it and execute a cell\n", - "256" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 6, - "outputs": [ - { - "output_type": "error", - "status": "error", - "execution_count": 6, - "data": null, - "ename": "Error", - "evalue": "System.Exception: watzzz\n at Submission#11.<>d__0.MoveNext()\n--- End of stack trace from previous location where exception was thrown ---\n at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)", - "traceback": [ - "Error : System.Exception: watzzz\n at Submission#11.<>d__0.MoveNext()\n--- End of stack trace from previous location where exception was thrown ---\n at Microsoft.CodeAnalysis.Scripting.ScriptExecutionState.RunSubmissionsAsync[TResult](ImmutableArray`1 precedingExecutors, Func`2 currentExecutor, StrongBox`1 exceptionHolderOpt, Func`2 catchExceptionOpt, CancellationToken cancellationToken)" - ] - } - ], - "metadata": {}, - "source": [ - "// You could write code that throws exceptions and they bubble up to the notebook\n", - "throw new Exception(\"watzzz\");" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 7, - "outputs": [ - { - "output_type": "error", - "status": "error", - "execution_count": 7, - "data": null, - "ename": "Error", - "evalue": "(2,14): error CS1002: ; expected", - "traceback": [ - "Error : (2,14): error CS1002: ; expected" - ] - } - ], - "metadata": {}, - "source": [ - "// Whatever code is deemed invalid by the C# Compiler, is invalid here too \n", - "var z = 12345" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 8, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 8, - "data": { - "text/html": "
indexvalue
01
12
23
34
45
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// You can even play with built-in libraries/functions\n", - "Enumerable.Range(1, 5)" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 9, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 9, - "data": { - "text/html": "
StartEnd
{ Index: Value: 1, IsFromEnd: False }{ Index: Value: 4, IsFromEnd: False }
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// And now for some C# 8.0 features. If you haven't read it,\n", - "// here's the link: \n", - "// https://docs.microsoft.com/en-us/dotnet/csharp/whats-new/csharp-8\n", - "1..4" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 10, - "outputs": [], - "metadata": {}, - "source": [ - "// We can even do pattern matching!\n", - "public static string RockPaperScissors(string first, string second)\n", - " => (first, second) switch\n", - " {\n", - " (\"rock\", \"paper\") => \"rock is covered by paper. Paper wins.\", // <-- Next cell prints this out\n", - " (\"rock\", \"scissors\") => \"rock breaks scissors. Rock wins.\",\n", - " (\"paper\", \"rock\") => \"paper covers rock. Paper wins.\",\n", - " (\"paper\", \"scissors\") => \"paper is cut by scissors. Scissors wins.\",\n", - " (\"scissors\", \"rock\") => \"scissors is broken by rock. Rock wins.\",\n", - " (\"scissors\", \"paper\") => \"scissors cuts paper. Scissors wins.\",\n", - " (_, _) => \"tie\"\n", - " };" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 11, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 11, - "data": { - "text/plain": "rock is covered by paper. Paper wins." - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "RockPaperScissors(\"rock\", \"paper\")" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 12, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 12, - "data": { - "text/html": "

Our Incredibly Declarative Example

Can you believe we wrote this in C#?

What will you create next?

" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// Now, for the fun part! You can render HTML\n", - "display(\n", - " div(\n", - " h1(\"Our Incredibly Declarative Example\"),\n", - " p(\"Can you believe we wrote this \", b(\"in C#\"), \"?\"),\n", - " img[src:\"https://media.giphy.com/media/xUPGcguWZHRC2HyBRS/giphy.gif\"],\n", - " p(\"What will \", b(\"you\"), \" create next?\")\n", - " )\n", - ");" - ], - "attachments": {} - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Looking at data through Spark.NET\n", - "" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 38, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 38, - "data": { - "text/plain": "+---+--------+\n| id| name|\n+---+--------+\n| 0| Michael|\n| 1| Elva|\n| 2| Terry|\n| 3| Steve|\n| 4| Brigit|\n| 5|Niharika|\n| 6| Rahul|\n| 7| Tomas|\n| 8| Euan|\n| 9| Lev|\n| 10| Saveen|\n+---+--------+" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// Let us use some sample data. In this cell, we create this data \n", - "// from *scratch* but you can also load it from your storage container. \n", - "// For instance, \n", - "// var df = spark.Read().Json(\"wasbs://@.blob.core.windows.net/people.json\");\n", - "\n", - "using Microsoft.Spark.Sql;\n", - "using Microsoft.Spark.Sql.Types;\n", - "using static Microsoft.Spark.Sql.Functions;\n", - "\n", - "var schema = new StructType(new List()\n", - " {\n", - " new StructField(\"id\", new IntegerType()),\n", - " new StructField(\"name\", new StringType())\n", - " });\n", - "\n", - "var data = new List();\n", - "data.Add(new GenericRow(new object[] { 0, \"Michael\" }));\n", - "data.Add(new GenericRow(new object[] { 1, \"Elva\" }));\n", - "data.Add(new GenericRow(new object[] { 2, \"Terry\" }));\n", - "data.Add(new GenericRow(new object[] { 3, \"Steve\" }));\n", - "data.Add(new GenericRow(new object[] { 4, \"Brigit\" }));\n", - "data.Add(new GenericRow(new object[] { 5, \"Niharika\"}));\n", - "data.Add(new GenericRow(new object[] { 6, \"Rahul\" }));\n", - "data.Add(new GenericRow(new object[] { 7, \"Tomas\" }));\n", - "data.Add(new GenericRow(new object[] { 8, \"Euan\" }));\n", - "data.Add(new GenericRow(new object[] { 9, \"Lev\" }));\n", - "data.Add(new GenericRow(new object[] { 10, \"Saveen\" }));\n", - "\n", - "var df = spark.CreateDataFrame(data, schema);\n", - "df.Show();" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 39, - "outputs": [], - "metadata": {}, - "source": [ - "// Wait, that rendering is old-school plain! Let's spice things up a bit!\n", - "// What we're doing here is to define a specific formatter that is tied to \n", - "// Microsoft.Spark.Sql.DataFrame and registering it. When we then invoke\n", - "// display() and pass a DataFrame, the formatter is invoked, which then\n", - "// generates the necessary HTML\n", - "\n", - "Microsoft.DotNet.Interactive.Formatting.Formatter.Register((df, writer) =>\n", - "{\n", - " var headers = new List();\n", - " var columnNames = df.Columns();\n", - " headers.Add(th(i(\"index\")));\n", - " headers.AddRange(columnNames.Select(c => th(c)));\n", - "\n", - " var rows = new List>();\n", - " var currentRow = 0;\n", - " var dfRows = df.Take(Math.Min(20, (int)df.Count()));\n", - " foreach (Row dfRow in dfRows)\n", - " {\n", - " var cells = new List();\n", - " cells.Add(td(currentRow));\n", - "\n", - " foreach (string columnName in columnNames)\n", - " {\n", - " cells.Add(td(dfRow.Get(columnName)));\n", - " }\n", - "\n", - " rows.Add(cells);\n", - " ++currentRow;\n", - " }\n", - "\n", - " var t = table[@border: \"0.1\"](\n", - " thead[@style: \"background-color: blue; color: white\"](headers),\n", - " tbody[@style: \"color: red\"](rows.Select(r => tr(r))));\n", - "\n", - " writer.Write(t);\n", - "}, \"text/html\");" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 40, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 40, - "data": { - "text/html": "
+---+--------+\n| id|    name|\n+---+--------+\n|  0| Michael|\n|  1|    Elva|\n|  2|   Terry|\n|  3|   Steve|\n|  4|  Brigit|\n|  5|Niharika|\n|  6|   Rahul|\n|  7|   Tomas|\n|  8|    Euan|\n|  9|     Lev|\n| 10|  Saveen|\n+---+--------+\n\n
indexidname
00Michael
11Elva
22Terry
33Steve
44Brigit
55Niharika
66Rahul
77Tomas
88Euan
99Lev
1010Saveen
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// Now, let's try rendering the Spark's DataFrame in two ways...\n", - "\n", - "// ... a regular way ...\n", - "df.Show();\n", - "\n", - "// ... and just typing df (so it invokes the formatter we just defined)\n", - "df" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 41, - "outputs": [], - "metadata": {}, - "source": [ - "// Let us now try something more advanced like, defining C# classes on-the-fly...\n", - "public static class A {\n", - " public static readonly string s = \"The person named \";\n", - "}" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 45, - "outputs": [], - "metadata": {}, - "source": [ - "// ... and just for illustration, let's define one more simple class\n", - "public static class B {\n", - " private static Random _r = new Random();\n", - " private static List _moods = new List{ \"happy\",\"funny\",\"awesome\",\"cool\"};\n", - "\n", - " public static string GetMood() {\n", - " return _moods[_r.Next(_moods.Count)];\n", - " }\n", - "}" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 50, - "outputs": [], - "metadata": {}, - "source": [ - "// Let us now define a Spark User-defined Function (UDF) that utilizes\n", - "// the classes we just defined above. If you do not recognize the syntax\n", - "// below, here's some relevant documentation:\n", - "// https://docs.microsoft.com/en-us/dotnet/api/system.func-2?view=netframework-4.8\n", - "// https://github.com/dotnet/spark/blob/master/examples/Microsoft.Spark.CSharp.Examples/Sql/Basic.cs\n", - "//\n", - "// Note: If you change the class definition above, and execute the cell,\n", - "// you should re-execute this cell (i.e., the cell that defines the UDF)\n", - "var udf = Udf(str => $\"{A.s} - {str} - is {B.GetMood()}!\");" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 51, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 51, - "data": { - "text/html": "
indexSystem.String <<Initialize>>b__0_0(System.String)(name)
0The person named - Michael - is happy!
1The person named - Elva - is happy!
2The person named - Terry - is funny!
3The person named - Steve - is awesome!
4The person named - Brigit - is cool!
5The person named - Niharika - is happy!
6The person named - Rahul - is cool!
7The person named - Tomas - is cool!
8The person named - Euan - is cool!
9The person named - Lev - is funny!
10The person named - Saveen - is awesome!
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// Let's use the UDF on our Spark DataFrame\n", - "display(\n", - " df\n", - " .Select(\n", - " udf((Microsoft.Spark.Sql.Column)df[\"name\"])));" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 52, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 52, - "data": { - "text/html": "
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// Tables are not that interesting, right? :) Let's do some visualizations now.\n", - "// Let us start with something simple to illustrate the idea. We highly encourage\n", - "// you to look at https://fslab.org/XPlot/ to understand how you can use XPlot's\n", - "// full capabilities. While the examples are in F#, it is fairly straightforward\n", - "// to rewrite in C#.\n", - "\n", - "using XPlot.Plotly;\n", - "\n", - "var lineChart = Chart.Line(new List { 1, 2, 3, 4, 5, 6, 10, 44 });\n", - "lineChart.WithTitle(\"My awesome chart\");\n", - "lineChart.WithXTitle(\"X axis\");\n", - "lineChart.WithYTitle(\"Y axis\");\n", - "display(lineChart);" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 54, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 54, - "data": { - "text/html": "
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// Good! Now let us try to visualize the Spark DataFrame we have.\n", - "// Now is a good time to refresh your concept of a Spark DataFrame\n", - "// https://spark.apache.org/docs/latest/sql-programming-guide.html\n", - "// Remember that a Spark DataFrame is a distributed representation \n", - "// of your dataset (yes, even if your data is a few KB). Since we\n", - "// are using a visualization library, we need to first 'collect'\n", - "// (notice how we are using df.Collect().ToArray() below)\n", - "// all the data that is distributed on your cluster, and shape it\n", - "// appropriately for XPlot.\n", - "//\n", - "// Note: Visualizations are good for smaller datasets (typically, \n", - "// a few 10s of thousands of data points coming to KBs), so if you are\n", - "// trying to visualize GBs of data, it is usually a good idea to\n", - "// summarize your data appropriately using Spark.NET's APIs. For\n", - "// a list of summarization APIs, see here:\n", - "// https://docs.microsoft.com/en-us/dotnet/api/microsoft.spark.sql.functions?view=spark-dotnet\n", - "\n", - "var names = new List();\n", - "var ids = new List();\n", - "\n", - "foreach (Row row in df.Collect().ToArray())\n", - "{\n", - " names.Add(row.GetAs(\"name\"));\n", - " int? id = row.GetAs(\"id\");\n", - " ids.Add( id ?? 0);\n", - "}\n", - "var bar = new Graph.Bar\n", - "{\n", - " name = \"bar chart\",\n", - " x = names,\n", - " y = ids\n", - "};\n", - "\n", - "var chart = Chart.Plot(new[] {bar});\n", - "display(chart);" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 75, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 75, - "data": { - "text/plain": "+--------------------+\n| number|\n+--------------------+\n| 0.9696137248396938|\n|0.032666774947506735|\n| 0.9982835552647168|\n| 0.623668246261621|\n| 0.886883255507277|\n| 0.6830984720415895|\n| 0.208581972964379|\n| 0.11822117544627803|\n| 0.41178633617786053|\n| 0.39932316653398947|\n| 0.07784702818740487|\n| 0.717773464842594|\n| 0.21077453215176917|\n|0.008377783935692992|\n| 0.16170730356206525|\n| 0.46337959564448317|\n| 0.7769349020798387|\n| 0.15755635414158756|\n| 0.9235577191801545|\n| 0.9462093380029357|\n+--------------------+\nonly showing top 20 rows" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// As a final step, let us now plot a histogram of a random dataset\n", - "\n", - "using XPlot.Plotly;\n", - "\n", - "var schema = new StructType(new List()\n", - " {\n", - " new StructField(\"number\", new DoubleType())\n", - " });\n", - "\n", - "Random random = new Random();\n", - "\n", - "var data = new List();\n", - "for(int i = 0; i <=100; i++) {\n", - " data.Add(new GenericRow(new object[] { random.NextDouble() }));\n", - "}\n", - "\n", - "var histogramDf = spark.CreateDataFrame(data, schema);\n", - "histogramDf.Show()" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 76, - "outputs": [], - "metadata": {}, - "source": [ - "// Time to use LINQ (or Language Integrated Query) :)\n", - "// For those that are not familiar with LINQ, you can read more about it\n", - "// here: https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/linq/\n", - "\n", - "using System.Linq;\n", - "\n", - "// Let us take the histogramDf we loaded through Spark and sample some data points\n", - "// for the histogram. We will then use LINQ to shape the data for our next \n", - "// steps (visualization!)\n", - "var sample1 = \n", - " histogramDf.Sample(0.5, true).Collect().ToArray() // <---- Spark APIs\n", - " .Select(x => x.GetAs(\"number\")); // <---- LINQ APIs\n", - " \n", - "// Let us create two more sample sets we can use for plotting\n", - "var sample2 = histogramDf.Sample(0.3, false).Collect().ToArray().Select(x => x.GetAs(\"number\"));\n", - "var sample3 = histogramDf.Sample(0.6, true).Collect().ToArray().Select(x => x.GetAs(\"number\"));" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 77, - "outputs": [], - "metadata": {}, - "source": [ - "// Let us plot the histograms now!\n", - "var hist1 = new Graph.Histogram{x = sample1, opacity = 0.75};\n", - "var hist2 = new Graph.Histogram{x = sample2, opacity = 0.75};\n", - "var hist3 = new Graph.Histogram{x = sample3, opacity = 0.75};" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 78, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 78, - "data": { - "text/html": "
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "Chart.Plot(new[] {hist1})" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 79, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 79, - "data": { - "text/html": "
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "Chart.Plot(new[] {hist2})" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 80, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 80, - "data": { - "text/html": "
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "Chart.Plot(new[] {hist3})" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 81, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 81, - "data": { - "text/html": "
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// but wait, that's three different graphs and it's impossible to read them\n", - "// altogether! Let's try an overlay histogram, shall we?\n", - "var layout = new XPlot.Plotly.Layout.Layout{barmode=\"overlay\", title=\"Overlaid Histogram\"};\n", - "var histogram = Chart.Plot(new[] {hist1, hist2, hist3});\n", - "histogram.WithLayout(layout);\n", - "histogram" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": 86, - "outputs": [ - { - "output_type": "execute_result", - "execution_count": 86, - "data": { - "text/html": "
" - }, - "metadata": {} - } - ], - "metadata": {}, - "source": [ - "// And for the final touches\n", - "using static XPlot.Plotly.Graph;\n", - "\n", - "layout.title = \"Overlaid Histogram with cool colors!\";\n", - "hist1.marker = new Marker {color = \"#D65108)\"};\n", - "hist2.marker = new Marker {color = \"#ffff00\"}; \n", - "hist3.marker = new Marker {color = \"#462255\"};\n", - "\n", - "histogram" - ], - "attachments": {} - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "metadata": {}, - "source": [], - "attachments": {} - } - ] -} +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Hitchhiker's Guide to .NET for Apache Spark\n", + "\n", + "Welcome to the .NET for Apache Spark tutorial! We are glad to have you here. Before we begin, let us cover answers to a few quick questions:\n", + "\n", + " - #### What is .NET for Apache Spark?\n", + " .NET for Apache Spark provides high performance APIs for using Apache Spark from C# and F#. With these .NET APIs, you can access the most popular Dataframe and SparkSQL aspects of Apache Spark, for working with structured data, and Spark Structured Streaming, for working with streaming data.\n", + "\n", + " .NET for Apache Spark is compliant with .NET Standard - a formal specification of .NET APIs that are common across .NET implementations. This means you can use .NET for Apache Spark anywhere you write .NET code allowing you to reuse all the knowledge, skills, code, and libraries you already have as a .NET developer.\n", + "\n", + " - #### Where can I find more on .NET for Apache Spark?\n", + " https://github.com/dotnet/spark\n", + "\n", + " - #### I did not know there was a REPL for C#!?\n", + " Great question! :) We collaborated with the .NET team and they built one for us! https://github.com/dotnet/interactive \n", + "\n", + "Whew! Now that we have covered some basic information, let's begin! \n", + "\n", + "Since the .NET REPL is something very new, let us start by exploring what you can do with the REPL. " + ], + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "# Basic Capabilities of the C# REPL" + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Simple assignments should just work \n", + "var x = 1 + 25;" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// You can either use traditional approach to printing a variable...\n", + "Console.WriteLine(x);\n", + "\n", + "// ... or just type it and execute a cell\n", + "256" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// You can even play with built-in libraries/functions\n", + "Enumerable.Range(1, 5)" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// And now for some C# 8.0 features. If you haven't read it,\n", + "// here's the link: \n", + "// https://docs.microsoft.com/en-us/dotnet/csharp/whats-new/csharp-8\n", + "1..4" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// We can even do pattern matching!\n", + "public static string RockPaperScissors(string first, string second)\n", + " => (first, second) switch\n", + " {\n", + " (\"rock\", \"paper\") => \"rock is covered by paper. Paper wins.\", // <-- Next cell prints this out\n", + " (\"rock\", \"scissors\") => \"rock breaks scissors. Rock wins.\",\n", + " (\"paper\", \"rock\") => \"paper covers rock. Paper wins.\",\n", + " (\"paper\", \"scissors\") => \"paper is cut by scissors. Scissors wins.\",\n", + " (\"scissors\", \"rock\") => \"scissors is broken by rock. Rock wins.\",\n", + " (\"scissors\", \"paper\") => \"scissors cuts paper. Scissors wins.\",\n", + " (_, _) => \"tie\"\n", + " };" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "RockPaperScissors(\"rock\", \"paper\")" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Now, for the fun part! You can render HTML\n", + "display(\n", + " div(\n", + " h1(\"Our Incredibly Declarative Example\"),\n", + " p(\"Can you believe we wrote this \", b(\"in C#\"), \"?\"),\n", + " img[src:\"https://media.giphy.com/media/xUPGcguWZHRC2HyBRS/giphy.gif\"],\n", + " p(\"What will \", b(\"you\"), \" create next?\")\n", + " )\n", + ");" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "# Looking at data through Spark.NET\n" + ], + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Let us use some sample data. In this cell, we create this data \n", + "// from *scratch* but you can also load it from your storage container. \n", + "// For instance, \n", + "// var df = spark.Read().Json(\"wasbs://@.blob.core.windows.net/people.json\");\n", + "\n", + "using Microsoft.Spark.Sql;\n", + "using Microsoft.Spark.Sql.Types;\n", + "using static Microsoft.Spark.Sql.Functions;\n", + "\n", + "var schema = new StructType(new List()\n", + " {\n", + " new StructField(\"id\", new IntegerType()),\n", + " new StructField(\"name\", new StringType())\n", + " });\n", + "\n", + "var data = new List();\n", + "data.Add(new GenericRow(new object[] { 0, \"Michael\" }));\n", + "data.Add(new GenericRow(new object[] { 1, \"Elva\" }));\n", + "data.Add(new GenericRow(new object[] { 2, \"Terry\" }));\n", + "data.Add(new GenericRow(new object[] { 3, \"Steve\" }));\n", + "data.Add(new GenericRow(new object[] { 4, \"Brigit\" }));\n", + "data.Add(new GenericRow(new object[] { 5, \"Niharika\"}));\n", + "data.Add(new GenericRow(new object[] { 6, \"Rahul\" }));\n", + "data.Add(new GenericRow(new object[] { 7, \"Tomas\" }));\n", + "data.Add(new GenericRow(new object[] { 8, \"Euan\" }));\n", + "data.Add(new GenericRow(new object[] { 9, \"Lev\" }));\n", + "data.Add(new GenericRow(new object[] { 10, \"Saveen\" }));\n", + "\n", + "var df = spark.CreateDataFrame(data, schema);\n", + "df.Show();" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Wait, that rendering is old-school plain! Let's spice things up a bit!\n", + "// What we're doing here is to define a specific formatter that is tied to \n", + "// Microsoft.Spark.Sql.DataFrame and registering it. When we then invoke\n", + "// display() and pass a DataFrame, the formatter is invoked, which then\n", + "// generates the necessary HTML\n", + "\n", + "Microsoft.DotNet.Interactive.Formatting.Formatter.Register((df, writer) =>\n", + "{\n", + " var headers = new List();\n", + " var columnNames = df.Columns();\n", + " headers.Add(th(i(\"index\")));\n", + " headers.AddRange(columnNames.Select(c => th(c)));\n", + "\n", + " var rows = new List>();\n", + " var currentRow = 0;\n", + " var dfRows = df.Take(Math.Min(20, (int)df.Count()));\n", + " foreach (Row dfRow in dfRows)\n", + " {\n", + " var cells = new List();\n", + " cells.Add(td(currentRow));\n", + "\n", + " foreach (string columnName in columnNames)\n", + " {\n", + " cells.Add(td(dfRow.Get(columnName)));\n", + " }\n", + "\n", + " rows.Add(cells);\n", + " ++currentRow;\n", + " }\n", + "\n", + " var t = table[@border: \"0.1\"](\n", + " thead[@style: \"background-color: blue; color: white\"](headers),\n", + " tbody[@style: \"color: red\"](rows.Select(r => tr(r))));\n", + "\n", + " writer.Write(t);\n", + "}, \"text/html\");" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Now, let's try rendering the Spark's DataFrame in two ways...\n", + "\n", + "// ... a regular way ...\n", + "df.Show();\n", + "\n", + "// Using dotnet-interactive's display method (so it invokes the formatter we just defined)\n", + "display(df);\n" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// ... and just typing df (equivalent to \"display(df);\")\r\n", + "df" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Let us now try something more advanced like, defining C# classes on-the-fly...\n", + "public static class A {\n", + " public static readonly string s = \"The person named \";\n", + "}" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// ... and just for illustration, let's define one more simple class\n", + "public static class B {\n", + " private static Random _r = new Random();\n", + " private static List _moods = new List{ \"happy\",\"funny\",\"awesome\",\"cool\"};\n", + "\n", + " public static string GetMood() {\n", + " return _moods[_r.Next(_moods.Count)];\n", + " }\n", + "}" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Let us now define a Spark User-defined Function (UDF) that utilizes\n", + "// the classes we just defined above. If you do not recognize the syntax\n", + "// below, here's some relevant documentation:\n", + "// https://docs.microsoft.com/en-us/dotnet/api/system.func-2?view=netcore-3.1\n", + "// https://github.com/dotnet/spark/blob/master/examples/Microsoft.Spark.CSharp.Examples/Sql/Batch/Basic.cs\n", + "//\n", + "// Note: If you change the class definition above, and execute the cell,\n", + "// you should re-execute this cell (i.e., the cell that defines the UDF)\n", + "var udf = Udf(str => $\"{A.s} - {str} - is {B.GetMood()}!\");" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Let's use the UDF on our Spark DataFrame\n", + "display(\n", + " df\n", + " .Select(\n", + " udf((Microsoft.Spark.Sql.Column)df[\"name\"])));" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Tables are not that interesting, right? :) Let's do some visualizations now.\r\n", + "// Let us start with something simple to illustrate the idea. We highly encourage\r\n", + "// you to look at https://fslab.org/XPlot/ to understand how you can use XPlot's\r\n", + "// full capabilities. While the examples are in F#, it is fairly straightforward\r\n", + "// to rewrite in C#.\r\n", + "\r\n", + "using XPlot.Plotly;\r\n", + "\r\n", + "var lineChart = Chart.Line(new List { 1, 2, 3, 4, 5, 6, 10, 44 });\r\n", + "lineChart.WithTitle(\"My awesome chart\");\r\n", + "lineChart.WithXTitle(\"X axis\");\r\n", + "lineChart.WithYTitle(\"Y axis\");\r\n", + "lineChart" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Good! Now let us try to visualize the Spark DataFrame we have.\n", + "// Now is a good time to refresh your concept of a Spark DataFrame\n", + "// https://spark.apache.org/docs/latest/sql-programming-guide.html\n", + "// Remember that a Spark DataFrame is a distributed representation \n", + "// of your dataset (yes, even if your data is a few KB). Since we\n", + "// are using a visualization library, we need to first 'collect'\n", + "// (notice how we are using df.Collect().ToArray() below)\n", + "// all the data that is distributed on your cluster, and shape it\n", + "// appropriately for XPlot.\n", + "//\n", + "// Note: Visualizations are good for smaller datasets (typically, \n", + "// a few 10s of thousands of data points coming to KBs), so if you are\n", + "// trying to visualize GBs of data, it is usually a good idea to\n", + "// summarize your data appropriately using Spark.NET's APIs. For\n", + "// a list of summarization APIs, see here:\n", + "// https://docs.microsoft.com/en-us/dotnet/api/microsoft.spark.sql.functions?view=spark-dotnet\n", + "\n", + "var names = new List();\n", + "var ids = new List();\n", + "\n", + "foreach (Row row in df.Collect().ToArray())\n", + "{\n", + " names.Add(row.GetAs(\"name\"));\n", + " int? id = row.GetAs(\"id\");\n", + " ids.Add( id ?? 0);\n", + "}\n", + "var bar = new Graph.Bar\n", + "{\n", + " name = \"bar chart\",\n", + " x = names,\n", + " y = ids\n", + "};\n", + "\n", + "var chart = Chart.Plot(new[] {bar});\n", + "display(chart);" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// As a final step, let us now plot a histogram of a random dataset\n", + "\n", + "using XPlot.Plotly;\n", + "\n", + "var schema = new StructType(new List()\n", + " {\n", + " new StructField(\"number\", new DoubleType())\n", + " });\n", + "\n", + "Random random = new Random();\n", + "\n", + "var data = new List();\n", + "for(int i = 0; i <=100; i++) {\n", + " data.Add(new GenericRow(new object[] { random.NextDouble() }));\n", + "}\n", + "\n", + "var histogramDf = spark.CreateDataFrame(data, schema);\n", + "histogramDf.Show()" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Time to use LINQ (or Language Integrated Query) :)\n", + "// For those that are not familiar with LINQ, you can read more about it\n", + "// here: https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/linq/\n", + "\n", + "using System.Linq;\n", + "\n", + "// Let us take the histogramDf we loaded through Spark and sample some data points\n", + "// for the histogram. We will then use LINQ to shape the data for our next \n", + "// steps (visualization!)\n", + "var sample1 = \n", + " histogramDf.Sample(0.5, true).Collect().ToArray() // <---- Spark APIs\n", + " .Select(x => x.GetAs(\"number\")); // <---- LINQ APIs\n", + " \n", + "// Let us create two more sample sets we can use for plotting\n", + "var sample2 = histogramDf.Sample(0.3, false).Collect().ToArray().Select(x => x.GetAs(\"number\"));\n", + "var sample3 = histogramDf.Sample(0.6, true).Collect().ToArray().Select(x => x.GetAs(\"number\"));" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Let us plot the histograms now!\n", + "var hist1 = new Graph.Histogram{x = sample1, opacity = 0.75};\n", + "var hist2 = new Graph.Histogram{x = sample2, opacity = 0.75};\n", + "var hist3 = new Graph.Histogram{x = sample3, opacity = 0.75};" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "Chart.Plot(new[] {hist1})" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "Chart.Plot(new[] {hist2})" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "Chart.Plot(new[] {hist3})" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// but wait, that's three different graphs and it's impossible to read them\n", + "// altogether! Let's try an overlay histogram, shall we?\n", + "var layout = new XPlot.Plotly.Layout.Layout{barmode=\"overlay\", title=\"Overlaid Histogram\"};\n", + "var histogram = Chart.Plot(new[] {hist1, hist2, hist3});\n", + "histogram.WithLayout(layout);\n", + "histogram" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// And for the final touches\n", + "using static XPlot.Plotly.Graph;\n", + "\n", + "layout.title = \"Overlaid Histogram with cool colors!\";\n", + "hist1.marker = new Marker {color = \"#D65108)\"};\n", + "hist2.marker = new Marker {color = \"#ffff00\"}; \n", + "hist3.marker = new Marker {color = \"#462255\"};\n", + "\n", + "histogram" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "markdown", + "source": [ + "# VectorUdfs using Apache Arrow\n", + "Spark .NET supports constructing Arrow-backed VectorUdfs by directly using the [Apache Arrow](https://github.com/apache/arrow) library or by using the [Microsoft DataFrame](https://devblogs.microsoft.com/dotnet/an-introduction-to-dataframe/) library." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// Let's construct a VectorUdf by directly using Arrow.\r\n", + "using Apache.Arrow;\r\n", + "using static Microsoft.Spark.Sql.ArrowFunctions;\r\n", + "using Column = Microsoft.Spark.Sql.Column;\r\n", + "\r\n", + "// Helper method to construct an ArrowArray from a string[].\r\n", + "public static IArrowArray ToStringArrowArray(string[] array)\r\n", + "{\r\n", + " var valueOffsets = new ArrowBuffer.Builder();\r\n", + " var valueBuffer = new ArrowBuffer.Builder();\r\n", + " int offset = 0;\r\n", + "\r\n", + " foreach (string str in array)\r\n", + " {\r\n", + " byte[] bytes = Encoding.UTF8.GetBytes(str);\r\n", + " valueOffsets.Append(offset);\r\n", + " valueBuffer.Append(bytes);\r\n", + " offset += bytes.Length;\r\n", + " }\r\n", + "\r\n", + " valueOffsets.Append(offset);\r\n", + " return new StringArray(\r\n", + " new ArrayData(\r\n", + " Apache.Arrow.Types.StringType.Default,\r\n", + " valueOffsets.Length - 1,\r\n", + " 0,\r\n", + " 0,\r\n", + " new[] { ArrowBuffer.Empty, valueOffsets.Build(), valueBuffer.Build() }));\r\n", + "}\r\n", + "\r\n", + "Func arrowUdf =\r\n", + " (ids, names) => (StringArray)ToStringArrowArray(\r\n", + " Enumerable.Range(0, names.Length)\r\n", + " .Select(i => $\"id: {ids.GetValue(i)}, name: {names.GetString(i)}\")\r\n", + " .ToArray());\r\n", + "\r\n", + "Func vectorUdf1 = VectorUdf(arrowUdf);" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "df.Select(vectorUdf1(df[\"id\"], df[\"name\"]))" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Now let's construct a VectorUdf by using Microsoft Dataframe\r\n", + "using Microsoft.Data.Analysis;\r\n", + "using static Microsoft.Spark.Sql.DataFrameFunctions;\r\n", + "\r\n", + "Func msftDfFunc =\r\n", + " (ids, names) =>\r\n", + " {\r\n", + " long i = 0;\r\n", + " return names.Apply(cur => $\"id: {ids[i++]}, name: {cur}\");\r\n", + " };\r\n", + "\r\n", + "Func vectorUdf2 = VectorUdf(msftDfFunc);" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "df.Select(vectorUdf2(df[\"id\"], df[\"name\"]))" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "markdown", + "source": [ + "# Running custom Nugets as UDFs inside Spark\r\n", + "In .NET for Spark, it is very easy to install a library from Nuget and use in UDFs in Spark." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// Use #r to install new packages into the current session\r\n", + "\r\n", + "// Installs latest version\r\n", + "#r \"nuget: MathNet.Numerics\"\r\n", + "\r\n", + "// Installs specified version\r\n", + "#r \"nuget: NumSharp,0.20.5\"" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// Let's construct some Udfs that have a dependency on the installed packages.\r\n", + "using MathNet.Numerics.LinearAlgebra;\r\n", + "using MathNet.Numerics.LinearAlgebra.Double;\r\n", + "using NumSharp;\r\n", + "\r\n", + "var mathNetUdf = Udf(str => {\r\n", + " Matrix matrix = DenseMatrix.OfArray(new double[,] {\r\n", + " {1,1,1,1},\r\n", + " {1,2,3,4},\r\n", + " {4,3,2,1}});\r\n", + "\r\n", + " return $\"{matrix[0, 0]} - {str} - {matrix[1, 1]}!\";\r\n", + "});\r\n", + "\r\n", + "var numSharpUdf = Udf(str => {\r\n", + " var nd = np.arange(12);\r\n", + "\r\n", + " return $\"{nd[1].ToString()} - {str} - {nd[5].ToString()}!\";\r\n", + "});" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// UDFs are run on the Microsoft.Spark.Worker process. The package assemblies\r\n", + "// defined as a Udf depedency are shipped to the Worker so they are available\r\n", + "// at the time of execution.\r\n", + "df.Select(mathNetUdf(df[\"name\"])).Show();\r\n", + "\r\n", + "df.Select(numSharpUdf(df[\"name\"])).Show();\r\n", + "\r\n", + "// We can also chain udfs.\r\n", + "df.Select(mathNetUdf(numSharpUdf(df[\"name\"])))" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "markdown", + "source": [ + "# Synapse Spark Utility Methods\r\n", + "[Microsoft.Spark.Extensions.Azure.Synapse.Analytics.Notebook.MSSparkUtils](dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FNotebook%2FMSSparkUtils)" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// Utility for obtaining credentials (tokens and keys) for Synapse resources.\r\n", + "// Credentials methods https://dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FNotebook%2FMSSparkUtils%2FCredentials.cs\r\n", + "using Microsoft.Spark.Extensions.Azure.Synapse.Analytics.Notebook.MSSparkUtils;\r\n", + "\r\n", + "// Note that the help message is the help message returned by the Scala implementation. This needs to be updated and addressed in a future version.\r\n", + "Console.WriteLine($\"Help:\\n{Credentials.Help()}\");" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Utility for obtaining environment metadata for Synapse.\r\n", + "// Env methods https://dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FNotebook%2FMSSparkUtils%2FEnv.cs\r\n", + "Console.WriteLine($\"UserName: {Env.GetUserName()}\");\r\n", + "Console.WriteLine($\"UserId: {Env.GetUserId()}\");\r\n", + "Console.WriteLine($\"WorkspaceName: {Env.GetWorkspaceName()}\");\r\n", + "Console.WriteLine($\"PoolName: {Env.GetPoolName()}\");\r\n", + "Console.WriteLine($\"ClusterId: {Env.GetClusterId()}\");\r\n", + "\r\n", + "// Note that the help message is the help message returned by the Scala implementation. This needs to be updated and addressed in a future version.\r\n", + "Console.WriteLine($\"Help:\\n{Env.Help()}\");" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Utility for filesystem operations in Synapse notebook\r\n", + "// FS methods https://dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FNotebook%2FMSSparkUtils%2FFS.cs\r\n", + "// FileInfo methods https://dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FNotebook%2FMSSparkUtils%2FFileInfo.cs\r\n", + "\r\n", + "// Note that the help message is the help message returned by the Scala implementation. This needs to be updated and addressed in a future version.\r\n", + "FS.Help(\"\");" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Utility for notebook operations (e.g, chaining Synapse notebooks together)\r\n", + "// Notebook methods https://dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FNotebook%2FMSSparkUtils%2FNotebook.cs\r\n", + "\r\n", + "// Note that the help message is the help message returned by the Scala implementation. This needs to be updated and addressed in a future version.\r\n", + "Notebook.Help(\"\");" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "markdown", + "source": [ + "# [Microsoft.Spark.Extensions.Azure.Synapse.Analytics.Notebook.Visualization](https://dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FNotebook%2FVisualization%2FFunctions.cs)" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "using Microsoft.Spark.Extensions.Azure.Synapse.Analytics.Notebook.Visualization;\r\n", + "// Construct an specific html fragment to synapse notebook front-end for rendering\r\n", + "// based on user-input html content.\r\n", + "DisplayHTML(\"

Hello World

\");" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "markdown", + "source": [ + "# [TokenLibrary](https://dev.azure.com/dnceng/internal/_git/dotnet-spark-extensions?path=%2Fsrc%2FMicrosoft.Spark.Extensions.Azure.Synapse.Analytics%2FUtils%2FTokenLibrary.cs)\r\n", + "\r\n", + "[Synapse Analytics TokenLibrary Official Docs](https://docs.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-secure-credentials-with-tokenlibrary)" + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "using Microsoft.Spark.Extensions.Azure.Synapse.Analytics.Utils;\r\n", + "\r\n", + "// Note that the help message is the help message returned by the Scala implementation. This needs to be updated and addressed in a future version.\r\n", + "// TODO: Methodname needs to be uppercase.\r\n", + "Console.WriteLine($\"Help:\\n{TokenLibrary.help()}\");" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "markdown", + "source": [ + "# Miscellaneous Helpers\n", + "Learn about some internal functions offered by using .NET for Spark." + ], + "metadata": { + "nteract": { + "transient": { + "deleting": false + } + } + } + }, + { + "cell_type": "code", + "source": [ + "// Curious about the version of Spark .NET currently installed?\r\n", + "// Let's use the following method to find out!\r\n", + "using Microsoft.Spark.Experimental.Sql;\r\n", + "spark.GetAssemblyInfo()" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Current version of the dotnet-interactive REPL.\r\n", + "#!about" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// We can even run powershell core commands\r\n", + "#!pwsh\r\n", + "cat /etc/hosts" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// We can also run F# code\r\n", + "#!fsharp\r\n", + "open System\r\n", + "printfn \"Hello World from F#!\"" + ], + "outputs": [], + "execution_count": null, + "metadata": { + "jupyter": { + "source_hidden": false, + "outputs_hidden": false + }, + "nteract": { + "transient": { + "deleting": false + } + }, + "collapsed": true + } + }, + { + "cell_type": "code", + "source": [ + "// Whatever code is deemed invalid by the C# Compiler, is invalid here too \n", + "var z = 12345" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + }, + { + "cell_type": "code", + "source": [ + "// You could write code that throws exceptions and they bubble up to the notebook\n", + "throw new Exception(\"watzzz\");" + ], + "outputs": [], + "execution_count": null, + "metadata": {} + } + ], + "metadata": { + "sessionOptions": { + "driverMemory": "56g", + "driverCores": 8, + "executorMemory": "56g", + "executorCores": 8, + "numExecutors": 2, + "keepAliveTimeout": 30, + "conf": { + "spark.dynamicAllocation.minExecutors": "2", + "spark.dynamicAllocation.maxExecutors": "2" + } + }, + "saveOutput": true, + "kernelspec": { + "name": "synapse_pyspark", + "display_name": "Synapse PySpark" + }, + "language_info": { + "name": "csharp" + }, + "a365ComputeOptions": { + "nodeSize": "Small", + "auth": { + "authResource": "https://dev.azuresynapse.net", + "type": "AAD" + }, + "name": "bugbash", + "nodeCount": 3, + "endpoint": "https://niduttacanaryws.dev.azuresynapse.net/livyApi/versions/2019-11-01-preview/sparkPools/bugbash", + "automaticScaleJobs": false, + "type": "Spark", + "id": "/subscriptions/00baffc3-c40a-4478-a424-4bdc9566c82a/resourceGroups/niduttarg/providers/Microsoft.Synapse/workspaces/niduttacanaryws/bigDataPools/bugbash", + "sparkVersion": "2.4" + }, + "microsoft": { + "language": "csharp" + }, + "nteract": { + "version": "nteract-front-end@1.0.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file