From c570eb064f98e745ba59c5a728e72d0e8a93ad05 Mon Sep 17 00:00:00 2001 From: "Dmitry A. Grechka" Date: Fri, 17 Jun 2016 00:31:16 +0300 Subject: [PATCH] Initial data drop Downloads a series of files. Merges them into a single NetCDF file --- App.config | 6 ++ AssemblyInfo.fs | 41 ++++++++++++++ BulkReanalysisDownload.fsproj | 78 ++++++++++++++++++++++++++ BulkReanalysisDownload.sln | 22 ++++++++ Program.fs | 102 ++++++++++++++++++++++++++++++++++ 5 files changed, 249 insertions(+) create mode 100644 App.config create mode 100644 AssemblyInfo.fs create mode 100644 BulkReanalysisDownload.fsproj create mode 100644 BulkReanalysisDownload.sln create mode 100644 Program.fs diff --git a/App.config b/App.config new file mode 100644 index 0000000..d740e88 --- /dev/null +++ b/App.config @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/AssemblyInfo.fs b/AssemblyInfo.fs new file mode 100644 index 0000000..172f413 --- /dev/null +++ b/AssemblyInfo.fs @@ -0,0 +1,41 @@ +namespace MergeReanalysis.AssemblyInfo + +open System.Reflection +open System.Runtime.CompilerServices +open System.Runtime.InteropServices + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[] +[] +[] +[] +[] +[] +[] +[] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [] +[] +[] + +do + () \ No newline at end of file diff --git a/BulkReanalysisDownload.fsproj b/BulkReanalysisDownload.fsproj new file mode 100644 index 0000000..b47e92d --- /dev/null +++ b/BulkReanalysisDownload.fsproj @@ -0,0 +1,78 @@ + + + + + Debug + AnyCPU + 2.0 + 420c7da4-b854-4476-a9af-8fb4e192b543 + Exe + MergeReanalysis + MergeReanalysis + v4.5.2 + true + 4.4.0.0 + BulkReanalysisDownload + + + true + full + false + false + bin\Debug\ + DEBUG;TRACE + 3 + AnyCPU + bin\Debug\MergeReanalysis.XML + true + + + pdbonly + true + true + bin\Release\ + TRACE + 3 + AnyCPU + bin\Release\MergeReanalysis.XML + true + + + + + + True + + + + + + + + + + + + 11 + + + + + $(MSBuildExtensionsPath32)\..\Microsoft SDKs\F#\3.0\Framework\v4.0\Microsoft.FSharp.Targets + + + + + $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\FSharp\Microsoft.FSharp.Targets + + + + + + \ No newline at end of file diff --git a/BulkReanalysisDownload.sln b/BulkReanalysisDownload.sln new file mode 100644 index 0000000..6b228b4 --- /dev/null +++ b/BulkReanalysisDownload.sln @@ -0,0 +1,22 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.25123.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "BulkReanalysisDownload", "BulkReanalysisDownload.fsproj", "{420C7DA4-B854-4476-A9AF-8FB4E192B543}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + 
GlobalSection(ProjectConfigurationPlatforms) = postSolution + {420C7DA4-B854-4476-A9AF-8FB4E192B543}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {420C7DA4-B854-4476-A9AF-8FB4E192B543}.Debug|Any CPU.Build.0 = Debug|Any CPU + {420C7DA4-B854-4476-A9AF-8FB4E192B543}.Release|Any CPU.ActiveCfg = Release|Any CPU + {420C7DA4-B854-4476-A9AF-8FB4E192B543}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal
diff --git a/Program.fs b/Program.fs
new file mode 100644
index 0000000..7ebfc7d
--- /dev/null
+++ b/Program.fs
@@ -0,0 +1,120 @@
+open System.IO
+open System.Net
+open System.Text
+
+type DataSet = Microsoft.Research.Science.Data.DataSet
+type Variable = Microsoft.Research.Science.Data.Variable
+
+/// Credentials for the anonymous FTP login.
+let creds = NetworkCredential("anonymous","reanslysis@merger.com")
+
+/// Size in bytes of the buffer used to copy a download to disk.
+/// The earlier 256 MiB buffer was allocated anew for every file; 1 MiB keeps memory use modest.
+let bufferSize = 1024*1024
+
+/// Downloads varName.year.nc from the NCEP reanalysis surface FTP folder into a
+/// randomly named file (a relative path, so it is created in the current directory).
+/// Returns Some file name on success and None when the request fails; the caller
+/// treats None as the end of the series.
+let getSurfaceVariableFile varName year =
+    let addr = sprintf "ftp://ftp.cdc.noaa.gov/Datasets/ncep.reanalysis/surface/%s.%d.nc" varName year
+    // Chosen before the request starts so that a partial download can be removed on failure
+    let filename = Path.GetRandomFileName()
+    try
+        let request = WebRequest.Create(addr) :?> FtpWebRequest
+        request.Method <- WebRequestMethods.Ftp.DownloadFile
+        request.Credentials <- creds :> ICredentials
+        // UseBinary is left at its default (true), which is what a NetCDF download needs
+        request.UsePassive <- true
+        request.KeepAlive <- true
+        printf "Downloading %s..." addr
+        // 'use' releases the response and both streams when this block is left, also on failure
+        use response = request.GetResponse() :?> FtpWebResponse
+        use rs = response.GetResponseStream()
+        use ws = new FileStream(filename,FileMode.Create)
+        rs.CopyTo(ws, bufferSize)
+        printfn "Done (%s)" response.StatusDescription
+        Some(filename)
+    with
+    | :? WebException as e ->
+        // The file stream is already closed here, so a partially written file can be deleted
+        if File.Exists filename then File.Delete filename
+        match e.Response with
+        | :? FtpWebResponse as failed when failed.StatusCode = FtpStatusCode.ActionNotTakenFileUnavailable ->
+            printfn "Not found"
+        | _ ->
+            // Anything else (network error, login failure, ...) is not a missing file
+            printfn "Failed (%s)" e.Message
+        // Release the response attached to the exception, if there is one
+        match e.Response with
+        | null -> ()
+        | failed -> failed.Close()
+        None
+
+[<EntryPoint>]
+let main argv =
+    let startYear = 2015
+    let varName = "air"
+    let layerName = "sig995"
+
+    let varFileName = sprintf "%s.%s" varName layerName
+
+    // Target NetCDF file that accumulates the contents of all downloaded files
+    let datasetURL = sprintf "msds:nc?file=%s.nc&openMode=create" varFileName
+    use dataSet = DataSet.Open(datasetURL)
+
+    // Variables copied once, from the first file of the series
+    let variablesToBulkCopy = ["lat"; "lon"]
+
+    // Appends the contents of one downloaded file to the target data set, then deletes the file.
+    // NOTE(review): the AddVariable calls below carry no type arguments; they may have been
+    // lost when this patch text was extracted - compare with the original commit.
+    let folder (dataSet:DataSet) file =
+        try
+            // 'use' closes the source before the finally block deletes the file, also on failure
+            use sourceDs = DataSet.Open(sprintf "msds:nc?file=%s&openMode=readOnly" file)
+            let sourceVar,sourceTimeVar = sourceDs.Variables.[varName],sourceDs.Variables.["time"]
+            let sourceData,timeData = sourceVar.GetData(),sourceTimeVar.GetData()
+            let targetVar,targetTimeVar =
+                if dataSet.Variables.Contains(varName) then
+                    dataSet.Variables.[varName],dataSet.Variables.["time"]
+                else
+                    //first file in a series
+                    for bulkCopyVar in variablesToBulkCopy do
+                        let sourceV = sourceDs.Variables.[bulkCopyVar]
+                        let v = dataSet.AddVariable(bulkCopyVar,sourceV.GetData(),sourceV.Dimensions.AsNamesArray())
+                        //metadata for 1D variables
+                        for key in sourceV.Metadata.AsDictionary().Keys do
+                            v.Metadata.[key] <- sourceV.Metadata.[key]
+
+                    //global metadata
+                    for key in sourceDs.Metadata.AsDictionary().Keys do
+                        dataSet.Metadata.[key] <- sourceDs.Metadata.[key]
+
+                    //placeholders for incremental updates
+                    let targetV = dataSet.AddVariable(varName,sourceVar.Dimensions.AsNamesArray()) :> Variable
+                    let targetTimeV = dataSet.AddVariable("time",[|"time"|]) :> Variable
+
+                    //target var metadata
+                    for key in sourceVar.Metadata.AsDictionary().Keys do
+                        targetV.Metadata.[key] <- sourceVar.Metadata.[key]
+                    //time var metadata
+                    for key in sourceTimeVar.Metadata.AsDictionary().Keys do
+                        targetTimeV.Metadata.[key] <- sourceTimeVar.Metadata.[key]
+                    targetV,targetTimeV
+            targetVar.Append(sourceData,"time")
+            targetTimeVar.Append(timeData)
+            dataSet.Commit()
+        finally
+            File.Delete file
+        dataSet
+
+    // The sequence is lazy: each year is downloaded, merged and deleted before the next
+    // download starts, and the series ends at the first year that cannot be downloaded.
+    Seq.initInfinite (fun i -> startYear+i)
+    |> Seq.map (getSurfaceVariableFile varFileName)
+    |> Seq.takeWhile Option.isSome
+    |> Seq.choose id
+    |> Seq.fold folder dataSet
+    |> ignore
+    0 // return an integer exit code